From 26a029d407be480d791972afb5975cf62c9360a6 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Fri, 19 Apr 2024 02:47:55 +0200 Subject: Adding upstream version 124.0.1. Signed-off-by: Daniel Baumann --- third_party/aom/av1/encoder/allintra_vis.c | 1055 +++ third_party/aom/av1/encoder/allintra_vis.h | 46 + third_party/aom/av1/encoder/aq_complexity.c | 175 + third_party/aom/av1/encoder/aq_complexity.h | 37 + third_party/aom/av1/encoder/aq_cyclicrefresh.c | 657 ++ third_party/aom/av1/encoder/aq_cyclicrefresh.h | 332 + third_party/aom/av1/encoder/aq_variance.c | 220 + third_party/aom/av1/encoder/aq_variance.h | 35 + .../aom/av1/encoder/arm/crc32/hash_arm_crc32.c | 61 + .../aom/av1/encoder/arm/neon/av1_error_neon.c | 95 + .../aom/av1/encoder/arm/neon/av1_error_sve.c | 109 + .../aom/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c | 3090 ++++++++ .../encoder/arm/neon/av1_highbd_quantize_neon.c | 146 + .../aom/av1/encoder/arm/neon/av1_k_means_neon.c | 115 + .../encoder/arm/neon/av1_temporal_denoiser_neon.c | 360 + third_party/aom/av1/encoder/arm/neon/cnn_neon.c | 1144 +++ .../aom/av1/encoder/arm/neon/encodetxb_neon.c | 646 ++ .../av1/encoder/arm/neon/highbd_fwd_txfm_neon.c | 2619 +++++++ .../aom/av1/encoder/arm/neon/highbd_pickrst_neon.c | 1207 +++ .../aom/av1/encoder/arm/neon/highbd_rdopt_neon.c | 49 + .../encoder/arm/neon/highbd_temporal_filter_neon.c | 562 ++ .../av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c | 73 + third_party/aom/av1/encoder/arm/neon/ml_neon.c | 339 + .../aom/av1/encoder/arm/neon/pickrst_neon.c | 1217 +++ .../aom/av1/encoder/arm/neon/pickrst_neon.h | 188 + .../aom/av1/encoder/arm/neon/quantize_neon.c | 928 +++ third_party/aom/av1/encoder/arm/neon/rdopt_neon.c | 459 ++ .../aom/av1/encoder/arm/neon/reconinter_enc_neon.c | 288 + third_party/aom/av1/encoder/arm/neon/shift_neon.h | 49 + .../av1/encoder/arm/neon/temporal_filter_neon.c | 548 ++ .../arm/neon/temporal_filter_neon_dotprod.c | 299 + third_party/aom/av1/encoder/arm/neon/txfm_neon.h | 26 + .../aom/av1/encoder/arm/neon/wedge_utils_neon.c | 131 + third_party/aom/av1/encoder/av1_fwd_txfm1d.c | 1885 +++++ third_party/aom/av1/encoder/av1_fwd_txfm1d.h | 49 + third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h | 19 + third_party/aom/av1/encoder/av1_fwd_txfm2d.c | 423 ++ .../aom/av1/encoder/av1_ml_partition_models.h | 179 + third_party/aom/av1/encoder/av1_noise_estimate.c | 296 + third_party/aom/av1/encoder/av1_noise_estimate.h | 50 + third_party/aom/av1/encoder/av1_quantize.c | 917 +++ third_party/aom/av1/encoder/av1_quantize.h | 224 + .../aom/av1/encoder/av1_temporal_denoiser.c | 805 ++ .../aom/av1/encoder/av1_temporal_denoiser.h | 134 + third_party/aom/av1/encoder/bitstream.c | 4248 +++++++++++ third_party/aom/av1/encoder/bitstream.h | 137 + third_party/aom/av1/encoder/block.h | 1515 ++++ third_party/aom/av1/encoder/blockiness.c | 140 + third_party/aom/av1/encoder/cnn.c | 1189 +++ third_party/aom/av1/encoder/cnn.h | 191 + third_party/aom/av1/encoder/compound_type.c | 1678 +++++ third_party/aom/av1/encoder/compound_type.h | 52 + third_party/aom/av1/encoder/context_tree.c | 311 + third_party/aom/av1/encoder/context_tree.h | 142 + third_party/aom/av1/encoder/cost.c | 46 + third_party/aom/av1/encoder/cost.h | 51 + third_party/aom/av1/encoder/deltaq4_model.c | 7776 ++++++++++++++++++++ third_party/aom/av1/encoder/dwt.c | 146 + third_party/aom/av1/encoder/dwt.h | 27 + third_party/aom/av1/encoder/enc_enums.h | 268 + third_party/aom/av1/encoder/encode_strategy.c | 1767 +++++ third_party/aom/av1/encoder/encode_strategy.h | 138 + 
third_party/aom/av1/encoder/encodeframe.c | 2408 ++++++ third_party/aom/av1/encoder/encodeframe.h | 55 + third_party/aom/av1/encoder/encodeframe_utils.c | 1775 +++++ third_party/aom/av1/encoder/encodeframe_utils.h | 595 ++ third_party/aom/av1/encoder/encodemb.c | 866 +++ third_party/aom/av1/encoder/encodemb.h | 180 + third_party/aom/av1/encoder/encodemv.c | 345 + third_party/aom/av1/encoder/encodemv.h | 110 + third_party/aom/av1/encoder/encoder.c | 5409 ++++++++++++++ third_party/aom/av1/encoder/encoder.h | 4512 ++++++++++++ third_party/aom/av1/encoder/encoder_alloc.h | 531 ++ third_party/aom/av1/encoder/encoder_utils.c | 1503 ++++ third_party/aom/av1/encoder/encoder_utils.h | 1141 +++ third_party/aom/av1/encoder/encodetxb.c | 886 +++ third_party/aom/av1/encoder/encodetxb.h | 276 + third_party/aom/av1/encoder/ethread.c | 3469 +++++++++ third_party/aom/av1/encoder/ethread.h | 133 + third_party/aom/av1/encoder/extend.c | 163 + third_party/aom/av1/encoder/extend.h | 29 + third_party/aom/av1/encoder/external_partition.c | 98 + third_party/aom/av1/encoder/external_partition.h | 58 + third_party/aom/av1/encoder/firstpass.c | 1600 ++++ third_party/aom/av1/encoder/firstpass.h | 603 ++ third_party/aom/av1/encoder/global_motion.c | 575 ++ third_party/aom/av1/encoder/global_motion.h | 157 + third_party/aom/av1/encoder/global_motion_facade.c | 450 ++ third_party/aom/av1/encoder/global_motion_facade.h | 58 + third_party/aom/av1/encoder/gop_structure.c | 867 +++ third_party/aom/av1/encoder/gop_structure.h | 95 + third_party/aom/av1/encoder/grain_test_vectors.h | 781 ++ third_party/aom/av1/encoder/hash.c | 126 + third_party/aom/av1/encoder/hash.h | 53 + third_party/aom/av1/encoder/hash_motion.c | 503 ++ third_party/aom/av1/encoder/hash_motion.h | 103 + third_party/aom/av1/encoder/hybrid_fwd_txfm.c | 370 + third_party/aom/av1/encoder/hybrid_fwd_txfm.h | 40 + third_party/aom/av1/encoder/interp_search.c | 801 ++ third_party/aom/av1/encoder/interp_search.h | 205 + third_party/aom/av1/encoder/intra_mode_search.c | 1739 +++++ third_party/aom/av1/encoder/intra_mode_search.h | 329 + .../aom/av1/encoder/intra_mode_search_utils.h | 690 ++ third_party/aom/av1/encoder/k_means_template.h | 151 + third_party/aom/av1/encoder/level.c | 1397 ++++ third_party/aom/av1/encoder/level.h | 221 + third_party/aom/av1/encoder/lookahead.c | 222 + third_party/aom/av1/encoder/lookahead.h | 138 + third_party/aom/av1/encoder/mcomp.c | 3998 ++++++++++ third_party/aom/av1/encoder/mcomp.h | 398 + third_party/aom/av1/encoder/mcomp_structs.h | 109 + third_party/aom/av1/encoder/misc_model_weights.h | 696 ++ third_party/aom/av1/encoder/ml.c | 171 + third_party/aom/av1/encoder/ml.h | 85 + .../aom/av1/encoder/mode_prune_model_weights.h | 185 + third_party/aom/av1/encoder/model_rd.h | 270 + third_party/aom/av1/encoder/motion_search_facade.c | 1071 +++ third_party/aom/av1/encoder/motion_search_facade.h | 145 + third_party/aom/av1/encoder/mv_prec.c | 429 ++ third_party/aom/av1/encoder/mv_prec.h | 52 + third_party/aom/av1/encoder/nonrd_opt.c | 933 +++ third_party/aom/av1/encoder/nonrd_opt.h | 575 ++ third_party/aom/av1/encoder/nonrd_pickmode.c | 3537 +++++++++ third_party/aom/av1/encoder/optical_flow.c | 1113 +++ third_party/aom/av1/encoder/optical_flow.h | 76 + third_party/aom/av1/encoder/palette.c | 975 +++ third_party/aom/av1/encoder/palette.h | 215 + .../aom/av1/encoder/partition_cnn_weights.h | 2139 ++++++ .../aom/av1/encoder/partition_model_weights.h | 5646 ++++++++++++++ third_party/aom/av1/encoder/partition_search.c | 6263 ++++++++++++++++ 
third_party/aom/av1/encoder/partition_search.h | 81 + third_party/aom/av1/encoder/partition_strategy.c | 2573 +++++++ third_party/aom/av1/encoder/partition_strategy.h | 265 + third_party/aom/av1/encoder/pass2_strategy.c | 4488 +++++++++++ third_party/aom/av1/encoder/pass2_strategy.h | 149 + third_party/aom/av1/encoder/pickcdef.c | 958 +++ third_party/aom/av1/encoder/pickcdef.h | 261 + third_party/aom/av1/encoder/picklpf.c | 339 + third_party/aom/av1/encoder/picklpf.h | 165 + third_party/aom/av1/encoder/pickrst.c | 2217 ++++++ third_party/aom/av1/encoder/pickrst.h | 126 + third_party/aom/av1/encoder/pustats.h | 198 + third_party/aom/av1/encoder/random.h | 85 + third_party/aom/av1/encoder/ratectrl.c | 3587 +++++++++ third_party/aom/av1/encoder/ratectrl.h | 864 +++ third_party/aom/av1/encoder/rc_utils.h | 469 ++ third_party/aom/av1/encoder/rd.c | 1580 ++++ third_party/aom/av1/encoder/rd.h | 390 + third_party/aom/av1/encoder/rdopt.c | 6598 +++++++++++++++++ third_party/aom/av1/encoder/rdopt.h | 327 + third_party/aom/av1/encoder/rdopt_data_defs.h | 294 + third_party/aom/av1/encoder/rdopt_utils.h | 797 ++ third_party/aom/av1/encoder/reconinter_enc.c | 701 ++ third_party/aom/av1/encoder/reconinter_enc.h | 94 + third_party/aom/av1/encoder/saliency_map.c | 1414 ++++ third_party/aom/av1/encoder/saliency_map.h | 28 + third_party/aom/av1/encoder/segmentation.c | 54 + third_party/aom/av1/encoder/segmentation.h | 38 + third_party/aom/av1/encoder/sorting_network.h | 140 + third_party/aom/av1/encoder/sparse_linear_solver.c | 472 ++ third_party/aom/av1/encoder/sparse_linear_solver.h | 67 + third_party/aom/av1/encoder/speed_features.c | 2715 +++++++ third_party/aom/av1/encoder/speed_features.h | 2025 +++++ third_party/aom/av1/encoder/superres_scale.c | 423 ++ third_party/aom/av1/encoder/superres_scale.h | 28 + third_party/aom/av1/encoder/svc_layercontext.c | 701 ++ third_party/aom/av1/encoder/svc_layercontext.h | 325 + third_party/aom/av1/encoder/temporal_filter.c | 1520 ++++ third_party/aom/av1/encoder/temporal_filter.h | 458 ++ third_party/aom/av1/encoder/thirdpass.c | 877 +++ third_party/aom/av1/encoder/thirdpass.h | 197 + third_party/aom/av1/encoder/tokenize.c | 396 + third_party/aom/av1/encoder/tokenize.h | 159 + third_party/aom/av1/encoder/tpl_model.c | 2511 +++++++ third_party/aom/av1/encoder/tpl_model.h | 794 ++ third_party/aom/av1/encoder/tune_butteraugli.c | 313 + third_party/aom/av1/encoder/tune_butteraugli.h | 45 + third_party/aom/av1/encoder/tune_vmaf.c | 1112 +++ third_party/aom/av1/encoder/tune_vmaf.h | 63 + .../aom/av1/encoder/tx_prune_model_weights.h | 3422 +++++++++ third_party/aom/av1/encoder/tx_search.c | 3830 ++++++++++ third_party/aom/av1/encoder/tx_search.h | 226 + third_party/aom/av1/encoder/txb_rdopt.c | 659 ++ third_party/aom/av1/encoder/txb_rdopt.h | 160 + third_party/aom/av1/encoder/txb_rdopt_utils.h | 236 + third_party/aom/av1/encoder/var_based_part.c | 1914 +++++ third_party/aom/av1/encoder/var_based_part.h | 104 + third_party/aom/av1/encoder/wedge_utils.c | 125 + .../aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c | 1409 ++++ .../aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c | 3010 ++++++++ .../aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c | 336 + .../aom/av1/encoder/x86/av1_fwd_txfm_avx2.h | 96 + .../aom/av1/encoder/x86/av1_fwd_txfm_sse2.c | 2673 +++++++ .../aom/av1/encoder/x86/av1_fwd_txfm_sse2.h | 253 + .../aom/av1/encoder/x86/av1_highbd_quantize_avx2.c | 137 + .../aom/av1/encoder/x86/av1_highbd_quantize_sse4.c | 195 + third_party/aom/av1/encoder/x86/av1_k_means_avx2.c | 132 + 
third_party/aom/av1/encoder/x86/av1_k_means_sse2.c | 124 + .../aom/av1/encoder/x86/av1_quantize_avx2.c | 414 ++ .../aom/av1/encoder/x86/av1_quantize_sse2.c | 289 + .../av1/encoder/x86/av1_quantize_ssse3_x86_64.asm | 204 + .../aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm | 222 + .../av1/encoder/x86/av1_temporal_denoiser_sse2.c | 328 + third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h | 144 + third_party/aom/av1/encoder/x86/cnn_avx2.c | 532 ++ third_party/aom/av1/encoder/x86/dct_sse2.asm | 82 + third_party/aom/av1/encoder/x86/encodetxb_avx2.c | 122 + third_party/aom/av1/encoder/x86/encodetxb_sse2.c | 505 ++ third_party/aom/av1/encoder/x86/encodetxb_sse4.c | 84 + .../aom/av1/encoder/x86/error_intrin_avx2.c | 210 + .../aom/av1/encoder/x86/error_intrin_sse2.c | 75 + third_party/aom/av1/encoder/x86/error_sse2.asm | 88 + third_party/aom/av1/encoder/x86/hash_sse42.c | 53 + .../encoder/x86/highbd_block_error_intrin_avx2.c | 64 + .../encoder/x86/highbd_block_error_intrin_sse2.c | 74 + .../aom/av1/encoder/x86/highbd_fwd_txfm_avx2.c | 3132 ++++++++ .../aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c | 2629 +++++++ .../av1/encoder/x86/highbd_temporal_filter_avx2.c | 466 ++ .../av1/encoder/x86/highbd_temporal_filter_sse2.c | 341 + third_party/aom/av1/encoder/x86/ml_avx2.c | 240 + third_party/aom/av1/encoder/x86/ml_sse3.c | 336 + third_party/aom/av1/encoder/x86/ml_sse3.h | 29 + third_party/aom/av1/encoder/x86/pickrst_avx2.c | 2348 ++++++ third_party/aom/av1/encoder/x86/pickrst_sse4.c | 1483 ++++ third_party/aom/av1/encoder/x86/rdopt_avx2.c | 254 + third_party/aom/av1/encoder/x86/rdopt_sse4.c | 272 + .../aom/av1/encoder/x86/reconinter_enc_sse2.c | 347 + .../aom/av1/encoder/x86/reconinter_enc_ssse3.c | 67 + .../aom/av1/encoder/x86/temporal_filter_avx2.c | 647 ++ .../aom/av1/encoder/x86/temporal_filter_sse2.c | 320 + third_party/aom/av1/encoder/x86/wedge_utils_avx2.c | 215 + third_party/aom/av1/encoder/x86/wedge_utils_sse2.c | 254 + 232 files changed, 188397 insertions(+) create mode 100644 third_party/aom/av1/encoder/allintra_vis.c create mode 100644 third_party/aom/av1/encoder/allintra_vis.h create mode 100644 third_party/aom/av1/encoder/aq_complexity.c create mode 100644 third_party/aom/av1/encoder/aq_complexity.h create mode 100644 third_party/aom/av1/encoder/aq_cyclicrefresh.c create mode 100644 third_party/aom/av1/encoder/aq_cyclicrefresh.h create mode 100644 third_party/aom/av1/encoder/aq_variance.c create mode 100644 third_party/aom/av1/encoder/aq_variance.h create mode 100644 third_party/aom/av1/encoder/arm/crc32/hash_arm_crc32.c create mode 100644 third_party/aom/av1/encoder/arm/neon/av1_error_neon.c create mode 100644 third_party/aom/av1/encoder/arm/neon/av1_error_sve.c create mode 100644 third_party/aom/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c create mode 100644 third_party/aom/av1/encoder/arm/neon/av1_highbd_quantize_neon.c create mode 100644 third_party/aom/av1/encoder/arm/neon/av1_k_means_neon.c create mode 100644 third_party/aom/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c create mode 100644 third_party/aom/av1/encoder/arm/neon/cnn_neon.c create mode 100644 third_party/aom/av1/encoder/arm/neon/encodetxb_neon.c create mode 100644 third_party/aom/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c create mode 100644 third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c create mode 100644 third_party/aom/av1/encoder/arm/neon/highbd_rdopt_neon.c create mode 100644 third_party/aom/av1/encoder/arm/neon/highbd_temporal_filter_neon.c create mode 100644 
third_party/aom/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c create mode 100644 third_party/aom/av1/encoder/arm/neon/ml_neon.c create mode 100644 third_party/aom/av1/encoder/arm/neon/pickrst_neon.c create mode 100644 third_party/aom/av1/encoder/arm/neon/pickrst_neon.h create mode 100644 third_party/aom/av1/encoder/arm/neon/quantize_neon.c create mode 100644 third_party/aom/av1/encoder/arm/neon/rdopt_neon.c create mode 100644 third_party/aom/av1/encoder/arm/neon/reconinter_enc_neon.c create mode 100644 third_party/aom/av1/encoder/arm/neon/shift_neon.h create mode 100644 third_party/aom/av1/encoder/arm/neon/temporal_filter_neon.c create mode 100644 third_party/aom/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c create mode 100644 third_party/aom/av1/encoder/arm/neon/txfm_neon.h create mode 100644 third_party/aom/av1/encoder/arm/neon/wedge_utils_neon.c create mode 100644 third_party/aom/av1/encoder/av1_fwd_txfm1d.c create mode 100644 third_party/aom/av1/encoder/av1_fwd_txfm1d.h create mode 100644 third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h create mode 100644 third_party/aom/av1/encoder/av1_fwd_txfm2d.c create mode 100644 third_party/aom/av1/encoder/av1_ml_partition_models.h create mode 100644 third_party/aom/av1/encoder/av1_noise_estimate.c create mode 100644 third_party/aom/av1/encoder/av1_noise_estimate.h create mode 100644 third_party/aom/av1/encoder/av1_quantize.c create mode 100644 third_party/aom/av1/encoder/av1_quantize.h create mode 100644 third_party/aom/av1/encoder/av1_temporal_denoiser.c create mode 100644 third_party/aom/av1/encoder/av1_temporal_denoiser.h create mode 100644 third_party/aom/av1/encoder/bitstream.c create mode 100644 third_party/aom/av1/encoder/bitstream.h create mode 100644 third_party/aom/av1/encoder/block.h create mode 100644 third_party/aom/av1/encoder/blockiness.c create mode 100644 third_party/aom/av1/encoder/cnn.c create mode 100644 third_party/aom/av1/encoder/cnn.h create mode 100644 third_party/aom/av1/encoder/compound_type.c create mode 100644 third_party/aom/av1/encoder/compound_type.h create mode 100644 third_party/aom/av1/encoder/context_tree.c create mode 100644 third_party/aom/av1/encoder/context_tree.h create mode 100644 third_party/aom/av1/encoder/cost.c create mode 100644 third_party/aom/av1/encoder/cost.h create mode 100644 third_party/aom/av1/encoder/deltaq4_model.c create mode 100644 third_party/aom/av1/encoder/dwt.c create mode 100644 third_party/aom/av1/encoder/dwt.h create mode 100644 third_party/aom/av1/encoder/enc_enums.h create mode 100644 third_party/aom/av1/encoder/encode_strategy.c create mode 100644 third_party/aom/av1/encoder/encode_strategy.h create mode 100644 third_party/aom/av1/encoder/encodeframe.c create mode 100644 third_party/aom/av1/encoder/encodeframe.h create mode 100644 third_party/aom/av1/encoder/encodeframe_utils.c create mode 100644 third_party/aom/av1/encoder/encodeframe_utils.h create mode 100644 third_party/aom/av1/encoder/encodemb.c create mode 100644 third_party/aom/av1/encoder/encodemb.h create mode 100644 third_party/aom/av1/encoder/encodemv.c create mode 100644 third_party/aom/av1/encoder/encodemv.h create mode 100644 third_party/aom/av1/encoder/encoder.c create mode 100644 third_party/aom/av1/encoder/encoder.h create mode 100644 third_party/aom/av1/encoder/encoder_alloc.h create mode 100644 third_party/aom/av1/encoder/encoder_utils.c create mode 100644 third_party/aom/av1/encoder/encoder_utils.h create mode 100644 third_party/aom/av1/encoder/encodetxb.c create mode 100644 
third_party/aom/av1/encoder/encodetxb.h create mode 100644 third_party/aom/av1/encoder/ethread.c create mode 100644 third_party/aom/av1/encoder/ethread.h create mode 100644 third_party/aom/av1/encoder/extend.c create mode 100644 third_party/aom/av1/encoder/extend.h create mode 100644 third_party/aom/av1/encoder/external_partition.c create mode 100644 third_party/aom/av1/encoder/external_partition.h create mode 100644 third_party/aom/av1/encoder/firstpass.c create mode 100644 third_party/aom/av1/encoder/firstpass.h create mode 100644 third_party/aom/av1/encoder/global_motion.c create mode 100644 third_party/aom/av1/encoder/global_motion.h create mode 100644 third_party/aom/av1/encoder/global_motion_facade.c create mode 100644 third_party/aom/av1/encoder/global_motion_facade.h create mode 100644 third_party/aom/av1/encoder/gop_structure.c create mode 100644 third_party/aom/av1/encoder/gop_structure.h create mode 100644 third_party/aom/av1/encoder/grain_test_vectors.h create mode 100644 third_party/aom/av1/encoder/hash.c create mode 100644 third_party/aom/av1/encoder/hash.h create mode 100644 third_party/aom/av1/encoder/hash_motion.c create mode 100644 third_party/aom/av1/encoder/hash_motion.h create mode 100644 third_party/aom/av1/encoder/hybrid_fwd_txfm.c create mode 100644 third_party/aom/av1/encoder/hybrid_fwd_txfm.h create mode 100644 third_party/aom/av1/encoder/interp_search.c create mode 100644 third_party/aom/av1/encoder/interp_search.h create mode 100644 third_party/aom/av1/encoder/intra_mode_search.c create mode 100644 third_party/aom/av1/encoder/intra_mode_search.h create mode 100644 third_party/aom/av1/encoder/intra_mode_search_utils.h create mode 100644 third_party/aom/av1/encoder/k_means_template.h create mode 100644 third_party/aom/av1/encoder/level.c create mode 100644 third_party/aom/av1/encoder/level.h create mode 100644 third_party/aom/av1/encoder/lookahead.c create mode 100644 third_party/aom/av1/encoder/lookahead.h create mode 100644 third_party/aom/av1/encoder/mcomp.c create mode 100644 third_party/aom/av1/encoder/mcomp.h create mode 100644 third_party/aom/av1/encoder/mcomp_structs.h create mode 100644 third_party/aom/av1/encoder/misc_model_weights.h create mode 100644 third_party/aom/av1/encoder/ml.c create mode 100644 third_party/aom/av1/encoder/ml.h create mode 100644 third_party/aom/av1/encoder/mode_prune_model_weights.h create mode 100644 third_party/aom/av1/encoder/model_rd.h create mode 100644 third_party/aom/av1/encoder/motion_search_facade.c create mode 100644 third_party/aom/av1/encoder/motion_search_facade.h create mode 100644 third_party/aom/av1/encoder/mv_prec.c create mode 100644 third_party/aom/av1/encoder/mv_prec.h create mode 100644 third_party/aom/av1/encoder/nonrd_opt.c create mode 100644 third_party/aom/av1/encoder/nonrd_opt.h create mode 100644 third_party/aom/av1/encoder/nonrd_pickmode.c create mode 100644 third_party/aom/av1/encoder/optical_flow.c create mode 100644 third_party/aom/av1/encoder/optical_flow.h create mode 100644 third_party/aom/av1/encoder/palette.c create mode 100644 third_party/aom/av1/encoder/palette.h create mode 100644 third_party/aom/av1/encoder/partition_cnn_weights.h create mode 100644 third_party/aom/av1/encoder/partition_model_weights.h create mode 100644 third_party/aom/av1/encoder/partition_search.c create mode 100644 third_party/aom/av1/encoder/partition_search.h create mode 100644 third_party/aom/av1/encoder/partition_strategy.c create mode 100644 third_party/aom/av1/encoder/partition_strategy.h create mode 100644 
third_party/aom/av1/encoder/pass2_strategy.c create mode 100644 third_party/aom/av1/encoder/pass2_strategy.h create mode 100644 third_party/aom/av1/encoder/pickcdef.c create mode 100644 third_party/aom/av1/encoder/pickcdef.h create mode 100644 third_party/aom/av1/encoder/picklpf.c create mode 100644 third_party/aom/av1/encoder/picklpf.h create mode 100644 third_party/aom/av1/encoder/pickrst.c create mode 100644 third_party/aom/av1/encoder/pickrst.h create mode 100644 third_party/aom/av1/encoder/pustats.h create mode 100644 third_party/aom/av1/encoder/random.h create mode 100644 third_party/aom/av1/encoder/ratectrl.c create mode 100644 third_party/aom/av1/encoder/ratectrl.h create mode 100644 third_party/aom/av1/encoder/rc_utils.h create mode 100644 third_party/aom/av1/encoder/rd.c create mode 100644 third_party/aom/av1/encoder/rd.h create mode 100644 third_party/aom/av1/encoder/rdopt.c create mode 100644 third_party/aom/av1/encoder/rdopt.h create mode 100644 third_party/aom/av1/encoder/rdopt_data_defs.h create mode 100644 third_party/aom/av1/encoder/rdopt_utils.h create mode 100644 third_party/aom/av1/encoder/reconinter_enc.c create mode 100644 third_party/aom/av1/encoder/reconinter_enc.h create mode 100644 third_party/aom/av1/encoder/saliency_map.c create mode 100644 third_party/aom/av1/encoder/saliency_map.h create mode 100644 third_party/aom/av1/encoder/segmentation.c create mode 100644 third_party/aom/av1/encoder/segmentation.h create mode 100644 third_party/aom/av1/encoder/sorting_network.h create mode 100644 third_party/aom/av1/encoder/sparse_linear_solver.c create mode 100644 third_party/aom/av1/encoder/sparse_linear_solver.h create mode 100644 third_party/aom/av1/encoder/speed_features.c create mode 100644 third_party/aom/av1/encoder/speed_features.h create mode 100644 third_party/aom/av1/encoder/superres_scale.c create mode 100644 third_party/aom/av1/encoder/superres_scale.h create mode 100644 third_party/aom/av1/encoder/svc_layercontext.c create mode 100644 third_party/aom/av1/encoder/svc_layercontext.h create mode 100644 third_party/aom/av1/encoder/temporal_filter.c create mode 100644 third_party/aom/av1/encoder/temporal_filter.h create mode 100644 third_party/aom/av1/encoder/thirdpass.c create mode 100644 third_party/aom/av1/encoder/thirdpass.h create mode 100644 third_party/aom/av1/encoder/tokenize.c create mode 100644 third_party/aom/av1/encoder/tokenize.h create mode 100644 third_party/aom/av1/encoder/tpl_model.c create mode 100644 third_party/aom/av1/encoder/tpl_model.h create mode 100644 third_party/aom/av1/encoder/tune_butteraugli.c create mode 100644 third_party/aom/av1/encoder/tune_butteraugli.h create mode 100644 third_party/aom/av1/encoder/tune_vmaf.c create mode 100644 third_party/aom/av1/encoder/tune_vmaf.h create mode 100644 third_party/aom/av1/encoder/tx_prune_model_weights.h create mode 100644 third_party/aom/av1/encoder/tx_search.c create mode 100644 third_party/aom/av1/encoder/tx_search.h create mode 100644 third_party/aom/av1/encoder/txb_rdopt.c create mode 100644 third_party/aom/av1/encoder/txb_rdopt.h create mode 100644 third_party/aom/av1/encoder/txb_rdopt_utils.h create mode 100644 third_party/aom/av1/encoder/var_based_part.c create mode 100644 third_party/aom/av1/encoder/var_based_part.h create mode 100644 third_party/aom/av1/encoder/wedge_utils.c create mode 100644 third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c create mode 100644 third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c create mode 100644 
third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c create mode 100644 third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h create mode 100644 third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c create mode 100644 third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h create mode 100644 third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c create mode 100644 third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c create mode 100644 third_party/aom/av1/encoder/x86/av1_k_means_avx2.c create mode 100644 third_party/aom/av1/encoder/x86/av1_k_means_sse2.c create mode 100644 third_party/aom/av1/encoder/x86/av1_quantize_avx2.c create mode 100644 third_party/aom/av1/encoder/x86/av1_quantize_sse2.c create mode 100644 third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm create mode 100644 third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm create mode 100644 third_party/aom/av1/encoder/x86/av1_temporal_denoiser_sse2.c create mode 100644 third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h create mode 100644 third_party/aom/av1/encoder/x86/cnn_avx2.c create mode 100644 third_party/aom/av1/encoder/x86/dct_sse2.asm create mode 100644 third_party/aom/av1/encoder/x86/encodetxb_avx2.c create mode 100644 third_party/aom/av1/encoder/x86/encodetxb_sse2.c create mode 100644 third_party/aom/av1/encoder/x86/encodetxb_sse4.c create mode 100644 third_party/aom/av1/encoder/x86/error_intrin_avx2.c create mode 100644 third_party/aom/av1/encoder/x86/error_intrin_sse2.c create mode 100644 third_party/aom/av1/encoder/x86/error_sse2.asm create mode 100644 third_party/aom/av1/encoder/x86/hash_sse42.c create mode 100644 third_party/aom/av1/encoder/x86/highbd_block_error_intrin_avx2.c create mode 100644 third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c create mode 100644 third_party/aom/av1/encoder/x86/highbd_fwd_txfm_avx2.c create mode 100644 third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c create mode 100644 third_party/aom/av1/encoder/x86/highbd_temporal_filter_avx2.c create mode 100644 third_party/aom/av1/encoder/x86/highbd_temporal_filter_sse2.c create mode 100644 third_party/aom/av1/encoder/x86/ml_avx2.c create mode 100644 third_party/aom/av1/encoder/x86/ml_sse3.c create mode 100644 third_party/aom/av1/encoder/x86/ml_sse3.h create mode 100644 third_party/aom/av1/encoder/x86/pickrst_avx2.c create mode 100644 third_party/aom/av1/encoder/x86/pickrst_sse4.c create mode 100644 third_party/aom/av1/encoder/x86/rdopt_avx2.c create mode 100644 third_party/aom/av1/encoder/x86/rdopt_sse4.c create mode 100644 third_party/aom/av1/encoder/x86/reconinter_enc_sse2.c create mode 100644 third_party/aom/av1/encoder/x86/reconinter_enc_ssse3.c create mode 100644 third_party/aom/av1/encoder/x86/temporal_filter_avx2.c create mode 100644 third_party/aom/av1/encoder/x86/temporal_filter_sse2.c create mode 100644 third_party/aom/av1/encoder/x86/wedge_utils_avx2.c create mode 100644 third_party/aom/av1/encoder/x86/wedge_utils_sse2.c diff --git a/third_party/aom/av1/encoder/allintra_vis.c b/third_party/aom/av1/encoder/allintra_vis.c new file mode 100644 index 0000000000..8dcef5fc85 --- /dev/null +++ b/third_party/aom/av1/encoder/allintra_vis.c @@ -0,0 +1,1055 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0.
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <math.h> + +#include "config/aom_config.h" + +#if CONFIG_TFLITE +#include "tensorflow/lite/c/c_api.h" +#include "av1/encoder/deltaq4_model.c" +#endif + +#include "av1/common/common_data.h" +#include "av1/common/enums.h" +#include "av1/common/idct.h" +#include "av1/common/reconinter.h" +#include "av1/encoder/allintra_vis.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/ethread.h" +#include "av1/encoder/hybrid_fwd_txfm.h" +#include "av1/encoder/model_rd.h" +#include "av1/encoder/rdopt_utils.h" + +#define MB_WIENER_PRED_BLOCK_SIZE BLOCK_128X128 +#define MB_WIENER_PRED_BUF_STRIDE 128 + +void av1_alloc_mb_wiener_var_pred_buf(AV1_COMMON *cm, ThreadData *td) { + const int is_high_bitdepth = is_cur_buf_hbd(&td->mb.e_mbd); + assert(MB_WIENER_PRED_BLOCK_SIZE < BLOCK_SIZES_ALL); + const int buf_width = block_size_wide[MB_WIENER_PRED_BLOCK_SIZE]; + const int buf_height = block_size_high[MB_WIENER_PRED_BLOCK_SIZE]; + assert(buf_width == MB_WIENER_PRED_BUF_STRIDE); + const size_t buf_size = + (buf_width * buf_height * sizeof(*td->wiener_tmp_pred_buf)) + << is_high_bitdepth; + CHECK_MEM_ERROR(cm, td->wiener_tmp_pred_buf, aom_memalign(32, buf_size)); +} + +void av1_dealloc_mb_wiener_var_pred_buf(ThreadData *td) { + aom_free(td->wiener_tmp_pred_buf); + td->wiener_tmp_pred_buf = NULL; +} + +void av1_init_mb_wiener_var_buffer(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + + // This block size is also used to determine the number of workers in + // multi-threading. If it is changed, one needs to change it accordingly in + // "compute_num_ai_workers()".
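+  // Note: weber_bsize fixes the 8x8 granularity at which the per-block
+  // Weber statistics (satd, distortion, variances, max_scale) used
+  // throughout this file are collected and indexed.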
+ cpi->weber_bsize = BLOCK_8X8; + + if (cpi->oxcf.enable_rate_guide_deltaq) { + if (cpi->mb_weber_stats && cpi->prep_rate_estimates && + cpi->ext_rate_distribution) + return; + } else { + if (cpi->mb_weber_stats) return; + } + + CHECK_MEM_ERROR(cm, cpi->mb_weber_stats, + aom_calloc(cpi->frame_info.mi_rows * cpi->frame_info.mi_cols, + sizeof(*cpi->mb_weber_stats))); + + if (cpi->oxcf.enable_rate_guide_deltaq) { + CHECK_MEM_ERROR( + cm, cpi->prep_rate_estimates, + aom_calloc(cpi->frame_info.mi_rows * cpi->frame_info.mi_cols, + sizeof(*cpi->prep_rate_estimates))); + + CHECK_MEM_ERROR( + cm, cpi->ext_rate_distribution, + aom_calloc(cpi->frame_info.mi_rows * cpi->frame_info.mi_cols, + sizeof(*cpi->ext_rate_distribution))); + } +} + +static int64_t get_satd(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col) { + AV1_COMMON *const cm = &cpi->common; + const int mi_wide = mi_size_wide[bsize]; + const int mi_high = mi_size_high[bsize]; + + const int mi_step = mi_size_wide[cpi->weber_bsize]; + int mb_stride = cpi->frame_info.mi_cols; + int mb_count = 0; + int64_t satd = 0; + + for (int row = mi_row; row < mi_row + mi_high; row += mi_step) { + for (int col = mi_col; col < mi_col + mi_wide; col += mi_step) { + if (row >= cm->mi_params.mi_rows || col >= cm->mi_params.mi_cols) + continue; + + satd += cpi->mb_weber_stats[(row / mi_step) * mb_stride + (col / mi_step)] + .satd; + ++mb_count; + } + } + + if (mb_count) satd = (int)(satd / mb_count); + satd = AOMMAX(1, satd); + + return (int)satd; +} + +static int64_t get_sse(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col) { + AV1_COMMON *const cm = &cpi->common; + const int mi_wide = mi_size_wide[bsize]; + const int mi_high = mi_size_high[bsize]; + + const int mi_step = mi_size_wide[cpi->weber_bsize]; + int mb_stride = cpi->frame_info.mi_cols; + int mb_count = 0; + int64_t distortion = 0; + + for (int row = mi_row; row < mi_row + mi_high; row += mi_step) { + for (int col = mi_col; col < mi_col + mi_wide; col += mi_step) { + if (row >= cm->mi_params.mi_rows || col >= cm->mi_params.mi_cols) + continue; + + distortion += + cpi->mb_weber_stats[(row / mi_step) * mb_stride + (col / mi_step)] + .distortion; + ++mb_count; + } + } + + if (mb_count) distortion = (int)(distortion / mb_count); + distortion = AOMMAX(1, distortion); + + return (int)distortion; +} + +static double get_max_scale(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col) { + AV1_COMMON *const cm = &cpi->common; + const int mi_wide = mi_size_wide[bsize]; + const int mi_high = mi_size_high[bsize]; + const int mi_step = mi_size_wide[cpi->weber_bsize]; + int mb_stride = cpi->frame_info.mi_cols; + double min_max_scale = 10.0; + + for (int row = mi_row; row < mi_row + mi_high; row += mi_step) { + for (int col = mi_col; col < mi_col + mi_wide; col += mi_step) { + if (row >= cm->mi_params.mi_rows || col >= cm->mi_params.mi_cols) + continue; + WeberStats *weber_stats = + &cpi->mb_weber_stats[(row / mi_step) * mb_stride + (col / mi_step)]; + if (weber_stats->max_scale < 1.0) continue; + if (weber_stats->max_scale < min_max_scale) + min_max_scale = weber_stats->max_scale; + } + } + return min_max_scale; +} + +static int get_window_wiener_var(AV1_COMP *const cpi, BLOCK_SIZE bsize, + int mi_row, int mi_col) { + AV1_COMMON *const cm = &cpi->common; + const int mi_wide = mi_size_wide[bsize]; + const int mi_high = mi_size_high[bsize]; + + const int mi_step = mi_size_wide[cpi->weber_bsize]; + int sb_wiener_var = 0; + int mb_stride = cpi->frame_info.mi_cols; + int mb_count 
= 0; + double base_num = 1; + double base_den = 1; + double base_reg = 1; + + for (int row = mi_row; row < mi_row + mi_high; row += mi_step) { + for (int col = mi_col; col < mi_col + mi_wide; col += mi_step) { + if (row >= cm->mi_params.mi_rows || col >= cm->mi_params.mi_cols) + continue; + + WeberStats *weber_stats = + &cpi->mb_weber_stats[(row / mi_step) * mb_stride + (col / mi_step)]; + + base_num += ((double)weber_stats->distortion) * + sqrt((double)weber_stats->src_variance) * + weber_stats->rec_pix_max; + + base_den += fabs( + weber_stats->rec_pix_max * sqrt((double)weber_stats->src_variance) - + weber_stats->src_pix_max * sqrt((double)weber_stats->rec_variance)); + + base_reg += sqrt((double)weber_stats->distortion) * + sqrt((double)weber_stats->src_pix_max) * 0.1; + ++mb_count; + } + } + + sb_wiener_var = + (int)(((base_num + base_reg) / (base_den + base_reg)) / mb_count); + sb_wiener_var = AOMMAX(1, sb_wiener_var); + + return (int)sb_wiener_var; +} + +static int get_var_perceptual_ai(AV1_COMP *const cpi, BLOCK_SIZE bsize, + int mi_row, int mi_col) { + AV1_COMMON *const cm = &cpi->common; + const int mi_wide = mi_size_wide[bsize]; + const int mi_high = mi_size_high[bsize]; + + int sb_wiener_var = get_window_wiener_var(cpi, bsize, mi_row, mi_col); + + if (mi_row >= (mi_high / 2)) { + sb_wiener_var = + AOMMIN(sb_wiener_var, + get_window_wiener_var(cpi, bsize, mi_row - mi_high / 2, mi_col)); + } + if (mi_row <= (cm->mi_params.mi_rows - mi_high - (mi_high / 2))) { + sb_wiener_var = + AOMMIN(sb_wiener_var, + get_window_wiener_var(cpi, bsize, mi_row + mi_high / 2, mi_col)); + } + if (mi_col >= (mi_wide / 2)) { + sb_wiener_var = + AOMMIN(sb_wiener_var, + get_window_wiener_var(cpi, bsize, mi_row, mi_col - mi_wide / 2)); + } + if (mi_col <= (cm->mi_params.mi_cols - mi_wide - (mi_wide / 2))) { + sb_wiener_var = + AOMMIN(sb_wiener_var, + get_window_wiener_var(cpi, bsize, mi_row, mi_col + mi_wide / 2)); + } + + return sb_wiener_var; +} + +static int rate_estimator(const tran_low_t *qcoeff, int eob, TX_SIZE tx_size) { + const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT]; + + assert((1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]) >= eob); + int rate_cost = 1; + + for (int idx = 0; idx < eob; ++idx) { + int abs_level = abs(qcoeff[scan_order->scan[idx]]); + rate_cost += (int)(log1p(abs_level) / log(2.0)) + 1 + (abs_level > 0); + } + + return (rate_cost << AV1_PROB_COST_SHIFT); +} + +void av1_calc_mb_wiener_var_row(AV1_COMP *const cpi, MACROBLOCK *x, + MACROBLOCKD *xd, const int mi_row, + int16_t *src_diff, tran_low_t *coeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + double *sum_rec_distortion, + double *sum_est_rate, uint8_t *pred_buffer) { + AV1_COMMON *const cm = &cpi->common; + uint8_t *buffer = cpi->source->y_buffer; + int buf_stride = cpi->source->y_stride; + MB_MODE_INFO mbmi; + memset(&mbmi, 0, sizeof(mbmi)); + MB_MODE_INFO *mbmi_ptr = &mbmi; + xd->mi = &mbmi_ptr; + const BLOCK_SIZE bsize = cpi->weber_bsize; + const TX_SIZE tx_size = max_txsize_lookup[bsize]; + const int block_size = tx_size_wide[tx_size]; + const int coeff_count = block_size * block_size; + const int mb_step = mi_size_wide[bsize]; + const BitDepthInfo bd_info = get_bit_depth_info(xd); + const MultiThreadInfo *const mt_info = &cpi->mt_info; + const AV1EncAllIntraMultiThreadInfo *const intra_mt = &mt_info->intra_mt; + AV1EncRowMultiThreadSync *const intra_row_mt_sync = + &cpi->ppi->intra_row_mt_sync; + const int mi_cols = cm->mi_params.mi_cols; + const int mt_thread_id = mi_row / mb_step; + // 
TODO(chengchen): test different unit step size + const int mt_unit_step = mi_size_wide[MB_WIENER_MT_UNIT_SIZE]; + const int mt_unit_cols = (mi_cols + (mt_unit_step >> 1)) / mt_unit_step; + int mt_unit_col = 0; + const int is_high_bitdepth = is_cur_buf_hbd(xd); + + uint8_t *dst_buffer = pred_buffer; + const int dst_buffer_stride = MB_WIENER_PRED_BUF_STRIDE; + + if (is_high_bitdepth) { + uint16_t *pred_buffer_16 = (uint16_t *)pred_buffer; + dst_buffer = CONVERT_TO_BYTEPTR(pred_buffer_16); + } + + for (int mi_col = 0; mi_col < mi_cols; mi_col += mb_step) { + if (mi_col % mt_unit_step == 0) { + intra_mt->intra_sync_read_ptr(intra_row_mt_sync, mt_thread_id, + mt_unit_col); +#if CONFIG_MULTITHREAD + const int num_workers = + AOMMIN(mt_info->num_mod_workers[MOD_AI], mt_info->num_workers); + if (num_workers > 1) { + const AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt; + pthread_mutex_lock(enc_row_mt->mutex_); + const bool exit = enc_row_mt->mb_wiener_mt_exit; + pthread_mutex_unlock(enc_row_mt->mutex_); + // Stop further processing in case any worker has encountered an error. + if (exit) break; + } +#endif + } + + PREDICTION_MODE best_mode = DC_PRED; + int best_intra_cost = INT_MAX; + const int mi_width = mi_size_wide[bsize]; + const int mi_height = mi_size_high[bsize]; + set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd, + mi_row, mi_col); + set_mi_row_col(xd, &xd->tile, mi_row, mi_height, mi_col, mi_width, + AOMMIN(mi_row + mi_height, cm->mi_params.mi_rows), + AOMMIN(mi_col + mi_width, cm->mi_params.mi_cols)); + set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize], + av1_num_planes(cm)); + xd->mi[0]->bsize = bsize; + xd->mi[0]->motion_mode = SIMPLE_TRANSLATION; + // Set above and left mbmi to NULL as they are not available in the + // preprocessing stage. + // They are used to determine intra edge filter types in intra prediction. + if (xd->up_available) { + xd->above_mbmi = NULL; + } + if (xd->left_available) { + xd->left_mbmi = NULL; + } + uint8_t *mb_buffer = + buffer + mi_row * MI_SIZE * buf_stride + mi_col * MI_SIZE; + for (PREDICTION_MODE mode = INTRA_MODE_START; mode < INTRA_MODE_END; + ++mode) { + // TODO(chengchen): Here we use the src instead of the reconstructed frame + // as the intra predictor, so that the single-threaded and multi-threaded + // versions match. Ideally we want to use the reconstructed frame.
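+      // Note: this loop searches every luma intra mode by transform-domain
+      // SATD; the winning mode is re-applied after the loop to collect the
+      // Weber statistics for this block.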
+ av1_predict_intra_block( + xd, cm->seq_params->sb_size, cm->seq_params->enable_intra_edge_filter, + block_size, block_size, tx_size, mode, 0, 0, FILTER_INTRA_MODES, + mb_buffer, buf_stride, dst_buffer, dst_buffer_stride, 0, 0, 0); + av1_subtract_block(bd_info, block_size, block_size, src_diff, block_size, + mb_buffer, buf_stride, dst_buffer, dst_buffer_stride); + av1_quick_txfm(0, tx_size, bd_info, src_diff, block_size, coeff); + int intra_cost = aom_satd(coeff, coeff_count); + if (intra_cost < best_intra_cost) { + best_intra_cost = intra_cost; + best_mode = mode; + } + } + + av1_predict_intra_block( + xd, cm->seq_params->sb_size, cm->seq_params->enable_intra_edge_filter, + block_size, block_size, tx_size, best_mode, 0, 0, FILTER_INTRA_MODES, + mb_buffer, buf_stride, dst_buffer, dst_buffer_stride, 0, 0, 0); + av1_subtract_block(bd_info, block_size, block_size, src_diff, block_size, + mb_buffer, buf_stride, dst_buffer, dst_buffer_stride); + av1_quick_txfm(0, tx_size, bd_info, src_diff, block_size, coeff); + + const struct macroblock_plane *const p = &x->plane[0]; + uint16_t eob; + const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT]; + QUANT_PARAM quant_param; + int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]; + av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_FP, 0, &quant_param); +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + av1_highbd_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, &eob, + scan_order, &quant_param); + } else { + av1_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, &eob, + scan_order, &quant_param); + } +#else + av1_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, &eob, scan_order, + &quant_param); +#endif // CONFIG_AV1_HIGHBITDEPTH + + if (cpi->oxcf.enable_rate_guide_deltaq) { + const int rate_cost = rate_estimator(qcoeff, eob, tx_size); + cpi->prep_rate_estimates[(mi_row / mb_step) * cpi->frame_info.mi_cols + + (mi_col / mb_step)] = rate_cost; + } + + av1_inverse_transform_block(xd, dqcoeff, 0, DCT_DCT, tx_size, dst_buffer, + dst_buffer_stride, eob, 0); + WeberStats *weber_stats = + &cpi->mb_weber_stats[(mi_row / mb_step) * cpi->frame_info.mi_cols + + (mi_col / mb_step)]; + + weber_stats->rec_pix_max = 1; + weber_stats->rec_variance = 0; + weber_stats->src_pix_max = 1; + weber_stats->src_variance = 0; + weber_stats->distortion = 0; + + int64_t src_mean = 0; + int64_t rec_mean = 0; + int64_t dist_mean = 0; + + for (int pix_row = 0; pix_row < block_size; ++pix_row) { + for (int pix_col = 0; pix_col < block_size; ++pix_col) { + int src_pix, rec_pix; +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + uint16_t *src = CONVERT_TO_SHORTPTR(mb_buffer); + uint16_t *rec = CONVERT_TO_SHORTPTR(dst_buffer); + src_pix = src[pix_row * buf_stride + pix_col]; + rec_pix = rec[pix_row * dst_buffer_stride + pix_col]; + } else { + src_pix = mb_buffer[pix_row * buf_stride + pix_col]; + rec_pix = dst_buffer[pix_row * dst_buffer_stride + pix_col]; + } +#else + src_pix = mb_buffer[pix_row * buf_stride + pix_col]; + rec_pix = dst_buffer[pix_row * dst_buffer_stride + pix_col]; +#endif + src_mean += src_pix; + rec_mean += rec_pix; + dist_mean += src_pix - rec_pix; + weber_stats->src_variance += src_pix * src_pix; + weber_stats->rec_variance += rec_pix * rec_pix; + weber_stats->src_pix_max = AOMMAX(weber_stats->src_pix_max, src_pix); + weber_stats->rec_pix_max = AOMMAX(weber_stats->rec_pix_max, rec_pix); + weber_stats->distortion += (src_pix - rec_pix) * (src_pix - rec_pix); + } + } + + if 
(cpi->oxcf.intra_mode_cfg.auto_intra_tools_off) { + *sum_rec_distortion += weber_stats->distortion; + int est_block_rate = 0; + int64_t est_block_dist = 0; + model_rd_sse_fn[MODELRD_LEGACY](cpi, x, bsize, 0, weber_stats->distortion, + pix_num, &est_block_rate, + &est_block_dist); + *sum_est_rate += est_block_rate; + } + + weber_stats->src_variance -= (src_mean * src_mean) / pix_num; + weber_stats->rec_variance -= (rec_mean * rec_mean) / pix_num; + weber_stats->distortion -= (dist_mean * dist_mean) / pix_num; + weber_stats->satd = best_intra_cost; + + qcoeff[0] = 0; + int max_scale = 0; + for (int idx = 1; idx < coeff_count; ++idx) { + const int abs_qcoeff = abs(qcoeff[idx]); + max_scale = AOMMAX(max_scale, abs_qcoeff); + } + weber_stats->max_scale = max_scale; + + if ((mi_col + mb_step) % mt_unit_step == 0 || + (mi_col + mb_step) >= mi_cols) { + intra_mt->intra_sync_write_ptr(intra_row_mt_sync, mt_thread_id, + mt_unit_col, mt_unit_cols); + ++mt_unit_col; + } + } + // Set the pointer to null since mbmi is only allocated inside this function. + xd->mi = NULL; +} + +static void calc_mb_wiener_var(AV1_COMP *const cpi, double *sum_rec_distortion, + double *sum_est_rate) { + MACROBLOCK *x = &cpi->td.mb; + MACROBLOCKD *xd = &x->e_mbd; + const BLOCK_SIZE bsize = cpi->weber_bsize; + const int mb_step = mi_size_wide[bsize]; + DECLARE_ALIGNED(32, int16_t, src_diff[32 * 32]); + DECLARE_ALIGNED(32, tran_low_t, coeff[32 * 32]); + DECLARE_ALIGNED(32, tran_low_t, qcoeff[32 * 32]); + DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]); + for (int mi_row = 0; mi_row < cpi->frame_info.mi_rows; mi_row += mb_step) { + av1_calc_mb_wiener_var_row(cpi, x, xd, mi_row, src_diff, coeff, qcoeff, + dqcoeff, sum_rec_distortion, sum_est_rate, + cpi->td.wiener_tmp_pred_buf); + } +} + +static int64_t estimate_wiener_var_norm(AV1_COMP *const cpi, + const BLOCK_SIZE norm_block_size) { + const AV1_COMMON *const cm = &cpi->common; + int64_t norm_factor = 1; + assert(norm_block_size >= BLOCK_16X16 && norm_block_size <= BLOCK_128X128); + const int norm_step = mi_size_wide[norm_block_size]; + double sb_wiener_log = 0; + double sb_count = 0; + for (int mi_row = 0; mi_row < cm->mi_params.mi_rows; mi_row += norm_step) { + for (int mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += norm_step) { + const int sb_wiener_var = + get_var_perceptual_ai(cpi, norm_block_size, mi_row, mi_col); + const int64_t satd = get_satd(cpi, norm_block_size, mi_row, mi_col); + const int64_t sse = get_sse(cpi, norm_block_size, mi_row, mi_col); + const double scaled_satd = (double)satd / sqrt((double)sse); + sb_wiener_log += scaled_satd * log(sb_wiener_var); + sb_count += scaled_satd; + } + } + if (sb_count > 0) norm_factor = (int64_t)(exp(sb_wiener_log / sb_count)); + norm_factor = AOMMAX(1, norm_factor); + + return norm_factor; +} + +static void automatic_intra_tools_off(AV1_COMP *cpi, + const double sum_rec_distortion, + const double sum_est_rate) { + if (!cpi->oxcf.intra_mode_cfg.auto_intra_tools_off) return; + + // Thresholds + const int high_quality_qindex = 128; + const double high_quality_bpp = 2.0; + const double high_quality_dist_per_pix = 4.0; + + AV1_COMMON *const cm = &cpi->common; + const int qindex = cm->quant_params.base_qindex; + const double dist_per_pix = + (double)sum_rec_distortion / (cm->width * cm->height); + // The estimated bpp is not accurate, so it is divided by an empirical constant 100.
+ const double estimate_bpp = sum_est_rate / (cm->width * cm->height * 100); + + if (qindex < high_quality_qindex && estimate_bpp > high_quality_bpp && + dist_per_pix < high_quality_dist_per_pix) { + cpi->oxcf.intra_mode_cfg.enable_smooth_intra = 0; + cpi->oxcf.intra_mode_cfg.enable_paeth_intra = 0; + cpi->oxcf.intra_mode_cfg.enable_cfl_intra = 0; + cpi->oxcf.intra_mode_cfg.enable_diagonal_intra = 0; + } +} + +static void ext_rate_guided_quantization(AV1_COMP *cpi) { + // Calculation uses 8x8. + const int mb_step = mi_size_wide[cpi->weber_bsize]; + // Accumulate to 16x16, step size is in the unit of mi. + const int block_step = 4; + + const char *filename = cpi->oxcf.rate_distribution_info; + FILE *pfile = fopen(filename, "r"); + if (pfile == NULL) { + assert(pfile != NULL); + return; + } + + double ext_rate_sum = 0.0; + for (int row = 0; row < cpi->frame_info.mi_rows; row += block_step) { + for (int col = 0; col < cpi->frame_info.mi_cols; col += block_step) { + float val; + const int fields_converted = fscanf(pfile, "%f", &val); + if (fields_converted != 1) { + assert(fields_converted == 1); + fclose(pfile); + return; + } + ext_rate_sum += val; + cpi->ext_rate_distribution[(row / mb_step) * cpi->frame_info.mi_cols + + (col / mb_step)] = val; + } + } + fclose(pfile); + + int uniform_rate_sum = 0; + for (int row = 0; row < cpi->frame_info.mi_rows; row += block_step) { + for (int col = 0; col < cpi->frame_info.mi_cols; col += block_step) { + int rate_sum = 0; + for (int r = 0; r < block_step; r += mb_step) { + for (int c = 0; c < block_step; c += mb_step) { + const int mi_row = row + r; + const int mi_col = col + c; + rate_sum += cpi->prep_rate_estimates[(mi_row / mb_step) * + cpi->frame_info.mi_cols + + (mi_col / mb_step)]; + } + } + uniform_rate_sum += rate_sum; + } + } + + const double scale = uniform_rate_sum / ext_rate_sum; + cpi->ext_rate_scale = scale; +} + +void av1_set_mb_wiener_variance(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const SequenceHeader *const seq_params = cm->seq_params; + if (aom_realloc_frame_buffer( + &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x, + seq_params->subsampling_y, seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL, + NULL, cpi->image_pyramid_levels, 0)) + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + av1_alloc_mb_wiener_var_pred_buf(&cpi->common, &cpi->td); + cpi->norm_wiener_variance = 0; + + MACROBLOCK *x = &cpi->td.mb; + MACROBLOCKD *xd = &x->e_mbd; + // xd->mi needs to be set up since it is used in av1_frame_init_quantizer. + MB_MODE_INFO mbmi; + memset(&mbmi, 0, sizeof(mbmi)); + MB_MODE_INFO *mbmi_ptr = &mbmi; + xd->mi = &mbmi_ptr; + cm->quant_params.base_qindex = cpi->oxcf.rc_cfg.cq_level; + av1_frame_init_quantizer(cpi); + + double sum_rec_distortion = 0.0; + double sum_est_rate = 0.0; + + MultiThreadInfo *const mt_info = &cpi->mt_info; + const int num_workers = + AOMMIN(mt_info->num_mod_workers[MOD_AI], mt_info->num_workers); + AV1EncAllIntraMultiThreadInfo *const intra_mt = &mt_info->intra_mt; + intra_mt->intra_sync_read_ptr = av1_row_mt_sync_read_dummy; + intra_mt->intra_sync_write_ptr = av1_row_mt_sync_write_dummy; + // Calculate differential contrast for each block for the entire image. + // TODO(chengchen): properly accumulate the distortion and rate in + // av1_calc_mb_wiener_var_mt(). Until then, call calc_mb_wiener_var() if + // auto_intra_tools_off is true.
+ if (num_workers > 1 && !cpi->oxcf.intra_mode_cfg.auto_intra_tools_off) { + intra_mt->intra_sync_read_ptr = av1_row_mt_sync_read; + intra_mt->intra_sync_write_ptr = av1_row_mt_sync_write; + av1_calc_mb_wiener_var_mt(cpi, num_workers, &sum_rec_distortion, + &sum_est_rate); + } else { + calc_mb_wiener_var(cpi, &sum_rec_distortion, &sum_est_rate); + } + + // Determine whether to turn off several intra coding tools. + automatic_intra_tools_off(cpi, sum_rec_distortion, sum_est_rate); + + // Read external rate distribution and use it to guide delta quantization + if (cpi->oxcf.enable_rate_guide_deltaq) ext_rate_guided_quantization(cpi); + + const BLOCK_SIZE norm_block_size = cm->seq_params->sb_size; + cpi->norm_wiener_variance = estimate_wiener_var_norm(cpi, norm_block_size); + const int norm_step = mi_size_wide[norm_block_size]; + + double sb_wiener_log = 0; + double sb_count = 0; + for (int its_cnt = 0; its_cnt < 2; ++its_cnt) { + sb_wiener_log = 0; + sb_count = 0; + for (int mi_row = 0; mi_row < cm->mi_params.mi_rows; mi_row += norm_step) { + for (int mi_col = 0; mi_col < cm->mi_params.mi_cols; + mi_col += norm_step) { + int sb_wiener_var = + get_var_perceptual_ai(cpi, norm_block_size, mi_row, mi_col); + + double beta = (double)cpi->norm_wiener_variance / sb_wiener_var; + double min_max_scale = AOMMAX( + 1.0, get_max_scale(cpi, cm->seq_params->sb_size, mi_row, mi_col)); + + beta = AOMMIN(beta, 4); + beta = AOMMAX(beta, 0.25); + + if (beta < 1 / min_max_scale) continue; + + sb_wiener_var = (int)(cpi->norm_wiener_variance / beta); + + int64_t satd = get_satd(cpi, norm_block_size, mi_row, mi_col); + int64_t sse = get_sse(cpi, norm_block_size, mi_row, mi_col); + double scaled_satd = (double)satd / sqrt((double)sse); + sb_wiener_log += scaled_satd * log(sb_wiener_var); + sb_count += scaled_satd; + } + } + + if (sb_count > 0) + cpi->norm_wiener_variance = (int64_t)(exp(sb_wiener_log / sb_count)); + cpi->norm_wiener_variance = AOMMAX(1, cpi->norm_wiener_variance); + } + + // Set the pointer to null since mbmi is only allocated inside this function. + xd->mi = NULL; + aom_free_frame_buffer(&cm->cur_frame->buf); + av1_dealloc_mb_wiener_var_pred_buf(&cpi->td); +} + +static int get_rate_guided_quantizer(AV1_COMP *const cpi, BLOCK_SIZE bsize, + int mi_row, int mi_col) { + // Calculation uses 8x8. 
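+  // Note: this function compares the externally supplied rate map
+  // (sb_rate_hific, scaled by ext_rate_scale) against the uniform-quantizer
+  // rate estimates gathered during preprocessing, turns the relative rate
+  // difference into a scale factor (pow(2, rate_diff), then squared), and
+  // maps that scale to a clamped delta-q offset.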
+ const int mb_step = mi_size_wide[cpi->weber_bsize]; + // Accumulate to 16x16 + const int block_step = mi_size_wide[BLOCK_16X16]; + double sb_rate_hific = 0.0; + double sb_rate_uniform = 0.0; + for (int row = mi_row; row < mi_row + mi_size_high[bsize]; + row += block_step) { + for (int col = mi_col; col < mi_col + mi_size_wide[bsize]; + col += block_step) { + sb_rate_hific += + cpi->ext_rate_distribution[(row / mb_step) * cpi->frame_info.mi_cols + + (col / mb_step)]; + + for (int r = 0; r < block_step; r += mb_step) { + for (int c = 0; c < block_step; c += mb_step) { + const int this_row = row + r; + const int this_col = col + c; + sb_rate_uniform += + cpi->prep_rate_estimates[(this_row / mb_step) * + cpi->frame_info.mi_cols + + (this_col / mb_step)]; + } + } + } + } + sb_rate_hific *= cpi->ext_rate_scale; + + const double weight = 1.0; + const double rate_diff = + weight * (sb_rate_hific - sb_rate_uniform) / sb_rate_uniform; + double scale = pow(2, rate_diff); + + scale = scale * scale; + double min_max_scale = AOMMAX(1.0, get_max_scale(cpi, bsize, mi_row, mi_col)); + scale = 1.0 / AOMMIN(1.0 / scale, min_max_scale); + + AV1_COMMON *const cm = &cpi->common; + const int base_qindex = cm->quant_params.base_qindex; + int offset = + av1_get_deltaq_offset(cm->seq_params->bit_depth, base_qindex, scale); + const DeltaQInfo *const delta_q_info = &cm->delta_q_info; + const int max_offset = delta_q_info->delta_q_res * 10; + offset = AOMMIN(offset, max_offset - 1); + offset = AOMMAX(offset, -max_offset + 1); + int qindex = cm->quant_params.base_qindex + offset; + qindex = AOMMIN(qindex, MAXQ); + qindex = AOMMAX(qindex, MINQ); + if (base_qindex > MINQ) qindex = AOMMAX(qindex, MINQ + 1); + + return qindex; +} + +int av1_get_sbq_perceptual_ai(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col) { + if (cpi->oxcf.enable_rate_guide_deltaq) { + return get_rate_guided_quantizer(cpi, bsize, mi_row, mi_col); + } + + AV1_COMMON *const cm = &cpi->common; + const int base_qindex = cm->quant_params.base_qindex; + int sb_wiener_var = get_var_perceptual_ai(cpi, bsize, mi_row, mi_col); + int offset = 0; + double beta = (double)cpi->norm_wiener_variance / sb_wiener_var; + double min_max_scale = AOMMAX(1.0, get_max_scale(cpi, bsize, mi_row, mi_col)); + beta = 1.0 / AOMMIN(1.0 / beta, min_max_scale); + + // Cap beta such that the delta q value is not too far away from the base q. + beta = AOMMIN(beta, 4); + beta = AOMMAX(beta, 0.25); + offset = av1_get_deltaq_offset(cm->seq_params->bit_depth, base_qindex, beta); + const DeltaQInfo *const delta_q_info = &cm->delta_q_info; + offset = AOMMIN(offset, delta_q_info->delta_q_res * 20 - 1); + offset = AOMMAX(offset, -delta_q_info->delta_q_res * 20 + 1); + int qindex = cm->quant_params.base_qindex + offset; + qindex = AOMMIN(qindex, MAXQ); + qindex = AOMMAX(qindex, MINQ); + if (base_qindex > MINQ) qindex = AOMMAX(qindex, MINQ + 1); + + return qindex; +} + +void av1_init_mb_ur_var_buffer(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + + if (cpi->mb_delta_q) return; + + CHECK_MEM_ERROR(cm, cpi->mb_delta_q, + aom_calloc(cpi->frame_info.mb_rows * cpi->frame_info.mb_cols, + sizeof(*cpi->mb_delta_q))); +} + +#if CONFIG_TFLITE +static int model_predict(BLOCK_SIZE block_size, int num_cols, int num_rows, + int bit_depth, uint8_t *y_buffer, int y_stride, + float *predicts0, float *predicts1) { + // Create the model and interpreter options.
+ TfLiteModel *model = + TfLiteModelCreate(av1_deltaq4_model_file, av1_deltaq4_model_fsize); + if (model == NULL) return 1; + + TfLiteInterpreterOptions *options = TfLiteInterpreterOptionsCreate(); + if (options == NULL) { + TfLiteModelDelete(model); + return 1; + } + TfLiteInterpreterOptionsSetNumThreads(options, 2); + + // Create the interpreter. + TfLiteInterpreter *interpreter = TfLiteInterpreterCreate(model, options); + if (interpreter == NULL) { + TfLiteInterpreterOptionsDelete(options); + TfLiteModelDelete(model); + return 1; + } + + // Allocate tensors and populate the input tensor data. + TfLiteInterpreterAllocateTensors(interpreter); + TfLiteTensor *input_tensor = TfLiteInterpreterGetInputTensor(interpreter, 0); + if (input_tensor == NULL) { + TfLiteInterpreterDelete(interpreter); + TfLiteInterpreterOptionsDelete(options); + TfLiteModelDelete(model); + return 1; + } + + size_t input_size = TfLiteTensorByteSize(input_tensor); + float *input_data = aom_calloc(input_size, 1); + if (input_data == NULL) { + TfLiteInterpreterDelete(interpreter); + TfLiteInterpreterOptionsDelete(options); + TfLiteModelDelete(model); + return 1; + } + + const int num_mi_w = mi_size_wide[block_size]; + const int num_mi_h = mi_size_high[block_size]; + for (int row = 0; row < num_rows; ++row) { + for (int col = 0; col < num_cols; ++col) { + const int row_offset = (row * num_mi_h) << 2; + const int col_offset = (col * num_mi_w) << 2; + + uint8_t *buf = y_buffer + row_offset * y_stride + col_offset; + int r = row_offset, pos = 0; + const float base = (float)((1 << bit_depth) - 1); + while (r < row_offset + (num_mi_h << 2)) { + for (int c = 0; c < (num_mi_w << 2); ++c) { + input_data[pos++] = bit_depth > 8 + ? (float)*CONVERT_TO_SHORTPTR(buf + c) / base + : (float)*(buf + c) / base; + } + buf += y_stride; + ++r; + } + TfLiteTensorCopyFromBuffer(input_tensor, input_data, input_size); + + // Execute inference. + if (TfLiteInterpreterInvoke(interpreter) != kTfLiteOk) { + aom_free(input_data); + TfLiteInterpreterDelete(interpreter); + TfLiteInterpreterOptionsDelete(options); + TfLiteModelDelete(model); + return 1; + } + + // Extract the output tensor data. + const TfLiteTensor *output_tensor = + TfLiteInterpreterGetOutputTensor(interpreter, 0); + if (output_tensor == NULL) { + aom_free(input_data); + TfLiteInterpreterDelete(interpreter); + TfLiteInterpreterOptionsDelete(options); + TfLiteModelDelete(model); + return 1; + } + + size_t output_size = TfLiteTensorByteSize(output_tensor); + float output_data[2]; + + TfLiteTensorCopyToBuffer(output_tensor, output_data, output_size); + predicts0[row * num_cols + col] = output_data[0]; + predicts1[row * num_cols + col] = output_data[1]; + } + } + + // Dispose of the model and interpreter objects. + TfLiteInterpreterDelete(interpreter); + TfLiteInterpreterOptionsDelete(options); + TfLiteModelDelete(model); + aom_free(input_data); + return 0; +} + +void av1_set_mb_ur_variance(AV1_COMP *cpi) { + const AV1_COMMON *cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + uint8_t *y_buffer = cpi->source->y_buffer; + const int y_stride = cpi->source->y_stride; + const int block_size = cpi->common.seq_params->sb_size; + const uint32_t bit_depth = cpi->td.mb.e_mbd.bd; + + const int num_mi_w = mi_size_wide[block_size]; + const int num_mi_h = mi_size_high[block_size]; + const int num_cols = (mi_params->mi_cols + num_mi_w - 1) / num_mi_w; + const int num_rows = (mi_params->mi_rows + num_mi_h - 1) / num_mi_h; + + // TODO(sdeng): fit a better model_1; disable it at this time.
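+  // Note: the model emits two per-superblock predictions; only the first
+  // (mb_delta_q0) drives the final delta-q map below, while the second is
+  // computed and freed without further use (see the TODO above).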
+  float *mb_delta_q0, *mb_delta_q1, delta_q_avg0 = 0.0f;
+  CHECK_MEM_ERROR(cm, mb_delta_q0,
+                  aom_calloc(num_rows * num_cols, sizeof(float)));
+  CHECK_MEM_ERROR(cm, mb_delta_q1,
+                  aom_calloc(num_rows * num_cols, sizeof(float)));
+
+  if (model_predict(block_size, num_cols, num_rows, bit_depth, y_buffer,
+                    y_stride, mb_delta_q0, mb_delta_q1)) {
+    aom_internal_error(cm->error, AOM_CODEC_ERROR,
+                       "Failed to call TFLite functions.");
+  }
+
+  // Loop through each SB block.
+  for (int row = 0; row < num_rows; ++row) {
+    for (int col = 0; col < num_cols; ++col) {
+      const int index = row * num_cols + col;
+      delta_q_avg0 += mb_delta_q0[index];
+    }
+  }
+
+  delta_q_avg0 /= (float)(num_rows * num_cols);
+
+  float scaling_factor;
+  const float cq_level = (float)cpi->oxcf.rc_cfg.cq_level / (float)MAXQ;
+  if (cq_level < delta_q_avg0) {
+    scaling_factor = cq_level / delta_q_avg0;
+  } else {
+    scaling_factor = 1.0f - (cq_level - delta_q_avg0) / (1.0f - delta_q_avg0);
+  }
+
+  for (int row = 0; row < num_rows; ++row) {
+    for (int col = 0; col < num_cols; ++col) {
+      const int index = row * num_cols + col;
+      cpi->mb_delta_q[index] =
+          RINT((float)cpi->oxcf.q_cfg.deltaq_strength / 100.0f * (float)MAXQ *
+               scaling_factor * (mb_delta_q0[index] - delta_q_avg0));
+    }
+  }
+
+  aom_free(mb_delta_q0);
+  aom_free(mb_delta_q1);
+}
+#else  // !CONFIG_TFLITE
+void av1_set_mb_ur_variance(AV1_COMP *cpi) {
+  const AV1_COMMON *cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+  uint8_t *y_buffer = cpi->source->y_buffer;
+  const int y_stride = cpi->source->y_stride;
+  const int block_size = cpi->common.seq_params->sb_size;
+
+  const int num_mi_w = mi_size_wide[block_size];
+  const int num_mi_h = mi_size_high[block_size];
+  const int num_cols = (mi_params->mi_cols + num_mi_w - 1) / num_mi_w;
+  const int num_rows = (mi_params->mi_rows + num_mi_h - 1) / num_mi_h;
+
+  int *mb_delta_q[2];
+  CHECK_MEM_ERROR(cm, mb_delta_q[0],
+                  aom_calloc(num_rows * num_cols, sizeof(*mb_delta_q[0])));
+  CHECK_MEM_ERROR(cm, mb_delta_q[1],
+                  aom_calloc(num_rows * num_cols, sizeof(*mb_delta_q[1])));
+
+  // Approximates the model change between the current version (Sept 2021) and
+  // the baseline (July 2021).
+  const double model_change[] = { 3.0, 3.0 };
+  // The following parameters are fitted from user labeled data.
+  const double a[] = { -24.50 * 4.0, -17.20 * 4.0 };
+  const double b[] = { 0.004898, 0.003093 };
+  const double c[] = { (29.932 + model_change[0]) * 4.0,
+                       (42.100 + model_change[1]) * 4.0 };
+  int delta_q_avg[2] = { 0, 0 };
+  // Loop through each SB block.
+  for (int row = 0; row < num_rows; ++row) {
+    for (int col = 0; col < num_cols; ++col) {
+      double var = 0.0, num_of_var = 0.0;
+      const int index = row * num_cols + col;
+
+      // Loop through each 8x8 block.
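+      // Each mi unit spans 4 luma pixels, so stepping mi_row/mi_col by 2
+      // visits one 8x8 luma block per iteration.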
+ for (int mi_row = row * num_mi_h; + mi_row < mi_params->mi_rows && mi_row < (row + 1) * num_mi_h; + mi_row += 2) { + for (int mi_col = col * num_mi_w; + mi_col < mi_params->mi_cols && mi_col < (col + 1) * num_mi_w; + mi_col += 2) { + struct buf_2d buf; + const int row_offset_y = mi_row << 2; + const int col_offset_y = mi_col << 2; + + buf.buf = y_buffer + row_offset_y * y_stride + col_offset_y; + buf.stride = y_stride; + + unsigned int block_variance; + block_variance = av1_get_perpixel_variance_facade( + cpi, xd, &buf, BLOCK_8X8, AOM_PLANE_Y); + + block_variance = AOMMAX(block_variance, 1); + var += log((double)block_variance); + num_of_var += 1.0; + } + } + var = exp(var / num_of_var); + mb_delta_q[0][index] = RINT(a[0] * exp(-b[0] * var) + c[0]); + mb_delta_q[1][index] = RINT(a[1] * exp(-b[1] * var) + c[1]); + delta_q_avg[0] += mb_delta_q[0][index]; + delta_q_avg[1] += mb_delta_q[1][index]; + } + } + + delta_q_avg[0] = RINT((double)delta_q_avg[0] / (num_rows * num_cols)); + delta_q_avg[1] = RINT((double)delta_q_avg[1] / (num_rows * num_cols)); + + int model_idx; + double scaling_factor; + const int cq_level = cpi->oxcf.rc_cfg.cq_level; + if (cq_level < delta_q_avg[0]) { + model_idx = 0; + scaling_factor = (double)cq_level / delta_q_avg[0]; + } else if (cq_level < delta_q_avg[1]) { + model_idx = 2; + scaling_factor = + (double)(cq_level - delta_q_avg[0]) / (delta_q_avg[1] - delta_q_avg[0]); + } else { + model_idx = 1; + scaling_factor = (double)(MAXQ - cq_level) / (MAXQ - delta_q_avg[1]); + } + + const double new_delta_q_avg = + delta_q_avg[0] + scaling_factor * (delta_q_avg[1] - delta_q_avg[0]); + for (int row = 0; row < num_rows; ++row) { + for (int col = 0; col < num_cols; ++col) { + const int index = row * num_cols + col; + if (model_idx == 2) { + const double delta_q = + mb_delta_q[0][index] + + scaling_factor * (mb_delta_q[1][index] - mb_delta_q[0][index]); + cpi->mb_delta_q[index] = RINT((double)cpi->oxcf.q_cfg.deltaq_strength / + 100.0 * (delta_q - new_delta_q_avg)); + } else { + cpi->mb_delta_q[index] = RINT( + (double)cpi->oxcf.q_cfg.deltaq_strength / 100.0 * scaling_factor * + (mb_delta_q[model_idx][index] - delta_q_avg[model_idx])); + } + } + } + + aom_free(mb_delta_q[0]); + aom_free(mb_delta_q[1]); +} +#endif + +int av1_get_sbq_user_rating_based(AV1_COMP *const cpi, int mi_row, int mi_col) { + const BLOCK_SIZE bsize = cpi->common.seq_params->sb_size; + const CommonModeInfoParams *const mi_params = &cpi->common.mi_params; + AV1_COMMON *const cm = &cpi->common; + const int base_qindex = cm->quant_params.base_qindex; + if (base_qindex == MINQ || base_qindex == MAXQ) return base_qindex; + + const int num_mi_w = mi_size_wide[bsize]; + const int num_mi_h = mi_size_high[bsize]; + const int num_cols = (mi_params->mi_cols + num_mi_w - 1) / num_mi_w; + const int index = (mi_row / num_mi_h) * num_cols + (mi_col / num_mi_w); + const int delta_q = cpi->mb_delta_q[index]; + + int qindex = base_qindex + delta_q; + qindex = AOMMIN(qindex, MAXQ); + qindex = AOMMAX(qindex, MINQ + 1); + + return qindex; +} diff --git a/third_party/aom/av1/encoder/allintra_vis.h b/third_party/aom/av1/encoder/allintra_vis.h new file mode 100644 index 0000000000..0d34ce0841 --- /dev/null +++ b/third_party/aom/av1/encoder/allintra_vis.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_ALLINTRA_VIS_H_ +#define AOM_AV1_ENCODER_ALLINTRA_VIS_H_ + +#include "config/aom_dsp_rtcd.h" + +#include "av1/common/enums.h" +#include "av1/common/reconintra.h" + +#include "av1/encoder/block.h" +#include "av1/encoder/encoder.h" + +#define MB_WIENER_MT_UNIT_SIZE BLOCK_64X64 + +void av1_init_mb_wiener_var_buffer(AV1_COMP *cpi); + +void av1_calc_mb_wiener_var_row(AV1_COMP *const cpi, MACROBLOCK *x, + MACROBLOCKD *xd, const int mi_row, + int16_t *src_diff, tran_low_t *coeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + double *sum_rec_distortion, + double *sum_est_rate, uint8_t *pred_buffer); + +void av1_set_mb_wiener_variance(AV1_COMP *cpi); + +int av1_get_sbq_perceptual_ai(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col); + +// User rating based mode +void av1_init_mb_ur_var_buffer(AV1_COMP *cpi); + +void av1_set_mb_ur_variance(AV1_COMP *cpi); + +int av1_get_sbq_user_rating_based(AV1_COMP *const cpi, int mi_row, int mi_col); + +#endif // AOM_AV1_ENCODER_ALLINTRA_VIS_H_ diff --git a/third_party/aom/av1/encoder/aq_complexity.c b/third_party/aom/av1/encoder/aq_complexity.c new file mode 100644 index 0000000000..4cf6bd572d --- /dev/null +++ b/third_party/aom/av1/encoder/aq_complexity.c @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "av1/encoder/aq_complexity.h"
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/common/seg_common.h"
+#include "av1/encoder/segmentation.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+#define AQ_C_SEGMENTS 5
+#define DEFAULT_AQ2_SEG 3  // Neutral Q segment
+#define AQ_C_STRENGTHS 3
+static const double aq_c_q_adj_factor[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = {
+  { 1.75, 1.25, 1.05, 1.00, 0.90 },
+  { 2.00, 1.50, 1.15, 1.00, 0.85 },
+  { 2.50, 1.75, 1.25, 1.00, 0.80 }
+};
+static const double aq_c_transitions[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = {
+  { 0.15, 0.30, 0.55, 2.00, 100.0 },
+  { 0.20, 0.40, 0.65, 2.00, 100.0 },
+  { 0.25, 0.50, 0.75, 2.00, 100.0 }
+};
+static const double aq_c_var_thresholds[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = {
+  { -4.0, -3.0, -2.0, 100.00, 100.0 },
+  { -3.5, -2.5, -1.5, 100.00, 100.0 },
+  { -3.0, -2.0, -1.0, 100.00, 100.0 }
+};
+
+static int get_aq_c_strength(int q_index, aom_bit_depth_t bit_depth) {
+  // Approximate base quantizer (truncated to int).
+  const int base_quant = av1_ac_quant_QTX(q_index, 0, bit_depth) / 4;
+  return (base_quant > 10) + (base_quant > 25);
+}
+
+static bool is_frame_aq_enabled(const AV1_COMP *const cpi) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+
+  return frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
+         refresh_frame->alt_ref_frame ||
+         (refresh_frame->golden_frame && !cpi->rc.is_src_frame_alt_ref);
+}
+
+// Segmentation only makes sense if the target bits per SB is above a
+// threshold. Below this the overheads will usually outweigh any benefit.
+static bool is_sb_aq_enabled(const AV1_COMP *const cpi) {
+  return cpi->rc.sb64_target_rate >= 256;
+}
+
+void av1_setup_in_frame_q_adj(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int base_qindex = cm->quant_params.base_qindex;
+  struct segmentation *const seg = &cm->seg;
+  const int resolution_change =
+      cm->prev_frame && (cm->width != cm->prev_frame->width ||
+                         cm->height != cm->prev_frame->height);
+
+  // Make SURE use of floating point in this function is safe.
+
+  if (resolution_change) {
+    memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
+    av1_clearall_segfeatures(seg);
+    av1_disable_segmentation(seg);
+    return;
+  }
+
+  if (is_frame_aq_enabled(cpi)) {
+    int segment;
+    const int aq_strength =
+        get_aq_c_strength(base_qindex, cm->seq_params->bit_depth);
+
+    // Clear down the segment map.
+    memset(cpi->enc_seg.map, DEFAULT_AQ2_SEG,
+           cm->mi_params.mi_rows * cm->mi_params.mi_cols);
+
+    av1_clearall_segfeatures(seg);
+
+    if (!is_sb_aq_enabled(cpi)) {
+      av1_disable_segmentation(seg);
+      return;
+    }
+
+    av1_enable_segmentation(seg);
+
+    // Default segment "Q" feature is disabled so it defaults to the baseline Q.
+    av1_disable_segfeature(seg, DEFAULT_AQ2_SEG, SEG_LVL_ALT_Q);
+
+    // Use some of the segments for in frame Q adjustment.
+    for (segment = 0; segment < AQ_C_SEGMENTS; ++segment) {
+      int qindex_delta;
+
+      if (segment == DEFAULT_AQ2_SEG) continue;
+
+      qindex_delta = av1_compute_qdelta_by_rate(
+          cpi, cm->current_frame.frame_type, base_qindex,
+          aq_c_q_adj_factor[aq_strength][segment]);
+
+      // For AQ complexity mode, we don't allow Q0 in a segment if the base
+      // Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment
+      // Q delta is sometimes applied without going back around the rd loop.
+      // This could lead to an illegal combination of partition size and q.
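+      // Hence clamp the delta so the effective segment qindex stays above 0.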
+      if ((base_qindex != 0) && ((base_qindex + qindex_delta) == 0)) {
+        qindex_delta = -base_qindex + 1;
+      }
+      if ((base_qindex + qindex_delta) > 0) {
+        av1_enable_segfeature(seg, segment, SEG_LVL_ALT_Q);
+        av1_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta);
+      }
+    }
+  }
+}
+
+#define DEFAULT_LV_THRESH 10.0
+#define MIN_DEFAULT_LV_THRESH 8.0
+// Select a segment for the current block.
+// The choice of segment for a block depends on the ratio of the projected
+// bits for the block vs a target average and its spatial complexity.
+void av1_caq_select_segment(const AV1_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs,
+                            int mi_row, int mi_col, int projected_rate) {
+  if ((!is_frame_aq_enabled(cpi)) || (!is_sb_aq_enabled(cpi))) return;
+  const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+
+  const int mi_offset = mi_row * cm->mi_params.mi_cols + mi_col;
+  const int xmis = AOMMIN(cm->mi_params.mi_cols - mi_col, mi_size_wide[bs]);
+  const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, mi_size_high[bs]);
+  int i;
+  unsigned char segment;
+
+  // Rate depends on fraction of a SB64 in frame (xmis * ymis / bw * bh).
+  // It is converted to bits << AV1_PROB_COST_SHIFT units.
+  const int64_t num = (int64_t)(cpi->rc.sb64_target_rate * xmis * ymis)
+                      << AV1_PROB_COST_SHIFT;
+  const int denom = cm->seq_params->mib_size * cm->seq_params->mib_size;
+  const int target_rate = (int)(num / denom);
+  double logvar;
+  double low_var_thresh;
+  const int aq_strength = get_aq_c_strength(cm->quant_params.base_qindex,
+                                            cm->seq_params->bit_depth);
+
+  low_var_thresh =
+      (is_stat_consumption_stage_twopass(cpi))
+          ? AOMMAX(exp(cpi->twopass_frame.mb_av_energy), MIN_DEFAULT_LV_THRESH)
+          : DEFAULT_LV_THRESH;
+
+  av1_setup_src_planes(mb, cpi->source, mi_row, mi_col, num_planes, bs);
+  logvar = av1_log_block_var(cpi, mb, bs);
+
+  segment = AQ_C_SEGMENTS - 1;  // Just in case no break out below.
+  for (i = 0; i < AQ_C_SEGMENTS; ++i) {
+    // Test rate against a threshold value and variance against a threshold.
+    // Increasing segment number (higher variance and complexity) = higher Q.
+    if ((projected_rate < target_rate * aq_c_transitions[aq_strength][i]) &&
+        (logvar < (low_var_thresh + aq_c_var_thresholds[aq_strength][i]))) {
+      segment = i;
+      break;
+    }
+  }
+
+  // Fill in the entries in the segment map corresponding to this SB64.
+  const int mi_stride = cm->mi_params.mi_cols;
+  set_segment_id(cpi->enc_seg.map, mi_offset, xmis, ymis, mi_stride, segment);
+}
diff --git a/third_party/aom/av1/encoder/aq_complexity.h b/third_party/aom/av1/encoder/aq_complexity.h
new file mode 100644
index 0000000000..3421d74c93
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_complexity.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AQ_COMPLEXITY_H_
+#define AOM_AV1_ENCODER_AQ_COMPLEXITY_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/common/enums.h"
+
+struct AV1_COMP;
+struct macroblock;
+
+// Select a segment for the current block.
+void av1_caq_select_segment(const struct AV1_COMP *cpi, struct macroblock *,
+                            BLOCK_SIZE bs, int mi_row, int mi_col,
+                            int projected_rate);
+
+// This function sets up a set of segments with delta Q values around
+// the baseline frame quantizer.
+void av1_setup_in_frame_q_adj(struct AV1_COMP *cpi);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_AQ_COMPLEXITY_H_
diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.c b/third_party/aom/av1/encoder/aq_cyclicrefresh.c
new file mode 100644
index 0000000000..f48ff11e51
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.c
@@ -0,0 +1,657 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "av1/common/pred_common.h"
+#include "av1/common/seg_common.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/tokenize.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
+  CYCLIC_REFRESH *const cr = aom_calloc(1, sizeof(*cr));
+  if (cr == NULL) return NULL;
+
+  cr->map = aom_calloc(mi_rows * mi_cols, sizeof(*cr->map));
+  cr->counter_encode_maxq_scene_change = 0;
+  cr->percent_refresh_adjustment = 5;
+  cr->rate_ratio_qdelta_adjustment = 0.25;
+  if (cr->map == NULL) {
+    av1_cyclic_refresh_free(cr);
+    return NULL;
+  }
+  return cr;
+}
+
+void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr) {
+  if (cr != NULL) {
+    aom_free(cr->map);
+    aom_free(cr);
+  }
+}
+
+// Check if this coding block, of size bsize, should be considered for refresh
+// (lower-qp coding). Decision can be based on various factors, such as
+// size of the coding block (i.e., below min_block size rejected), coding
+// mode, and rate/distortion.
+static int candidate_refresh_aq(const CYCLIC_REFRESH *cr,
+                                const MB_MODE_INFO *mbmi, int64_t rate,
+                                int64_t dist, BLOCK_SIZE bsize,
+                                int noise_level) {
+  MV mv = mbmi->mv[0].as_mv;
+  int is_compound = has_second_ref(mbmi);
+  // Reject the block for lower-qp coding for non-compound mode if
+  // projected distortion is above the threshold, and any of the following
+  // is true:
+  // 1) mode uses large mv
+  // 2) mode is an intra-mode
+  // Otherwise accept for refresh.
+  if (!is_compound && dist > cr->thresh_dist_sb &&
+      (mv.row > cr->motion_thresh || mv.row < -cr->motion_thresh ||
+       mv.col > cr->motion_thresh || mv.col < -cr->motion_thresh ||
+       !is_inter_block(mbmi)))
+    return CR_SEGMENT_ID_BASE;
+  else if ((is_compound && noise_level < kMedium) ||
+           (bsize >= BLOCK_16X16 && rate < cr->thresh_rate_sb &&
+            is_inter_block(mbmi) && mbmi->mv[0].as_int == 0 &&
+            cr->rate_boost_fac > 10))
+    // More aggressive delta-q for bigger blocks with zero motion.
+    return CR_SEGMENT_ID_BOOST2;
+  else
+    return CR_SEGMENT_ID_BOOST1;
+}
+
+// Compute delta-q for the segment.
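+// The returned delta is negative (towards lower q), with its magnitude capped
+// at max_qdelta_perc percent of the input q.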
+static int compute_deltaq(const AV1_COMP *cpi, int q, double rate_factor) { + const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + int deltaq = av1_compute_qdelta_by_rate( + cpi, cpi->common.current_frame.frame_type, q, rate_factor); + if ((-deltaq) > cr->max_qdelta_perc * q / 100) { + deltaq = -cr->max_qdelta_perc * q / 100; + } + return deltaq; +} + +int av1_cyclic_refresh_estimate_bits_at_q(const AV1_COMP *cpi, + double correction_factor) { + const AV1_COMMON *const cm = &cpi->common; + const int base_qindex = cm->quant_params.base_qindex; + const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + const int mbs = cm->mi_params.MBs; + const int num4x4bl = mbs << 4; + // Weight for non-base segments: use actual number of blocks refreshed in + // previous/just encoded frame. Note number of blocks here is in 4x4 units. + double weight_segment1 = (double)cr->actual_num_seg1_blocks / num4x4bl; + double weight_segment2 = (double)cr->actual_num_seg2_blocks / num4x4bl; + if (cpi->rc.rtc_external_ratectrl) { + weight_segment1 = (double)(cr->percent_refresh * cm->mi_params.mi_rows * + cm->mi_params.mi_cols / 100) / + num4x4bl; + weight_segment2 = 0; + } + // Take segment weighted average for estimated bits. + const int estimated_bits = + (int)((1.0 - weight_segment1 - weight_segment2) * + av1_estimate_bits_at_q(cpi, base_qindex, correction_factor) + + weight_segment1 * + av1_estimate_bits_at_q(cpi, base_qindex + cr->qindex_delta[1], + correction_factor) + + weight_segment2 * + av1_estimate_bits_at_q(cpi, base_qindex + cr->qindex_delta[2], + correction_factor)); + return estimated_bits; +} + +int av1_cyclic_refresh_rc_bits_per_mb(const AV1_COMP *cpi, int i, + double correction_factor) { + const AV1_COMMON *const cm = &cpi->common; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + int bits_per_mb; + int num4x4bl = cm->mi_params.MBs << 4; + // Weight for segment prior to encoding: take the average of the target + // number for the frame to be encoded and the actual from the previous frame. + double weight_segment = + (double)((cr->target_num_seg_blocks + cr->actual_num_seg1_blocks + + cr->actual_num_seg2_blocks) >> + 1) / + num4x4bl; + if (cpi->rc.rtc_external_ratectrl) { + weight_segment = (double)((cr->target_num_seg_blocks + + cr->percent_refresh * cm->mi_params.mi_rows * + cm->mi_params.mi_cols / 100) >> + 1) / + num4x4bl; + } + // Compute delta-q corresponding to qindex i. + int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta); + const int accurate_estimate = cpi->sf.hl_sf.accurate_bit_estimate; + // Take segment weighted average for bits per mb. 
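+  // i.e. bits_per_mb = (1 - w) * R(q_i) + w * R(q_i + deltaq), where w is the
+  // estimated fraction of blocks in the boosted segment.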
+  bits_per_mb =
+      (int)((1.0 - weight_segment) *
+                av1_rc_bits_per_mb(cpi, cm->current_frame.frame_type, i,
+                                   correction_factor, accurate_estimate) +
+            weight_segment * av1_rc_bits_per_mb(
+                                 cpi, cm->current_frame.frame_type, i + deltaq,
+                                 correction_factor, accurate_estimate));
+  return bits_per_mb;
+}
+
+void av1_cyclic_reset_segment_skip(const AV1_COMP *cpi, MACROBLOCK *const x,
+                                   int mi_row, int mi_col, BLOCK_SIZE bsize,
+                                   RUN_TYPE dry_run) {
+  int cdf_num;
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int prev_segment_id = mbmi->segment_id;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  const int bw = mi_size_wide[bsize];
+  const int bh = mi_size_high[bsize];
+  const int xmis = AOMMIN(cm->mi_params.mi_cols - mi_col, bw);
+  const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, bh);
+
+  assert(cm->seg.enabled);
+
+  if (!cr->skip_over4x4) {
+    mbmi->segment_id =
+        av1_get_spatial_seg_pred(cm, xd, &cdf_num, cr->skip_over4x4);
+    if (prev_segment_id != mbmi->segment_id) {
+      const int block_index = mi_row * cm->mi_params.mi_cols + mi_col;
+      const int mi_stride = cm->mi_params.mi_cols;
+      const uint8_t segment_id = mbmi->segment_id;
+      for (int mi_y = 0; mi_y < ymis; mi_y++) {
+        const int map_offset = block_index + mi_y * mi_stride;
+        memset(&cr->map[map_offset], 0, xmis);
+        memset(&cpi->enc_seg.map[map_offset], segment_id, xmis);
+        memset(&cm->cur_frame->seg_map[map_offset], segment_id, xmis);
+      }
+    }
+  }
+  if (!dry_run) {
+    if (cyclic_refresh_segment_id(prev_segment_id) == CR_SEGMENT_ID_BOOST1)
+      x->actual_num_seg1_blocks -= xmis * ymis;
+    else if (cyclic_refresh_segment_id(prev_segment_id) ==
+             CR_SEGMENT_ID_BOOST2)
+      x->actual_num_seg2_blocks -= xmis * ymis;
+  }
+}
+
+void av1_cyclic_refresh_update_segment(const AV1_COMP *cpi, MACROBLOCK *const x,
+                                       int mi_row, int mi_col, BLOCK_SIZE bsize,
+                                       int64_t rate, int64_t dist, int skip,
+                                       RUN_TYPE dry_run) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  const int bw = mi_size_wide[bsize];
+  const int bh = mi_size_high[bsize];
+  const int xmis = AOMMIN(cm->mi_params.mi_cols - mi_col, bw);
+  const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, bh);
+  const int block_index = mi_row * cm->mi_params.mi_cols + mi_col;
+  int noise_level = 0;
+  if (cpi->noise_estimate.enabled) noise_level = cpi->noise_estimate.level;
+  const int refresh_this_block =
+      candidate_refresh_aq(cr, mbmi, rate, dist, bsize, noise_level);
+  int sh = cpi->cyclic_refresh->skip_over4x4 ? 2 : 1;
+  // Default is to not update the refresh map.
+  int new_map_value = cr->map[block_index];
+
+  // If this block is labeled for refresh, check if we should reset the
+  // segment_id.
+  if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) {
+    mbmi->segment_id = refresh_this_block;
+    // Reset segment_id if it will be skipped.
+    if (skip) mbmi->segment_id = CR_SEGMENT_ID_BASE;
+  }
+  const uint8_t segment_id = mbmi->segment_id;
+
+  // Update the cyclic refresh map, to be used for setting segmentation map
+  // for the next frame. If the block will be refreshed this frame, mark it
+  // as clean. The magnitude of the negative value influences how long we
+  // wait before considering it for refresh again.
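+  // Refresh map encoding: 1 = not a refresh candidate, 0 = candidate awaiting
+  // refresh, negative = recently refreshed (counted back up towards 0 in
+  // cyclic_refresh_update_map()).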
+ if (cyclic_refresh_segment_id_boosted(segment_id)) { + new_map_value = -cr->time_for_refresh; + } else if (refresh_this_block) { + // Else if it is accepted as candidate for refresh, and has not already + // been refreshed (marked as 1) then mark it as a candidate for cleanup + // for future time (marked as 0), otherwise don't update it. + if (cr->map[block_index] == 1) new_map_value = 0; + } else { + // Leave it marked as block that is not candidate for refresh. + new_map_value = 1; + } + + // Update entries in the cyclic refresh map with new_map_value, and + // copy mbmi->segment_id into global segmentation map. + const int mi_stride = cm->mi_params.mi_cols; + for (int mi_y = 0; mi_y < ymis; mi_y += sh) { + const int map_offset = block_index + mi_y * mi_stride; + memset(&cr->map[map_offset], new_map_value, xmis); + memset(&cpi->enc_seg.map[map_offset], segment_id, xmis); + memset(&cm->cur_frame->seg_map[map_offset], segment_id, xmis); + } + + // Accumulate cyclic refresh update counters. + if (!dry_run) { + if (cyclic_refresh_segment_id(segment_id) == CR_SEGMENT_ID_BOOST1) + x->actual_num_seg1_blocks += xmis * ymis; + else if (cyclic_refresh_segment_id(segment_id) == CR_SEGMENT_ID_BOOST2) + x->actual_num_seg2_blocks += xmis * ymis; + } +} + +// Initializes counters used for cyclic refresh. +void av1_init_cyclic_refresh_counters(MACROBLOCK *const x) { + x->actual_num_seg1_blocks = 0; + x->actual_num_seg2_blocks = 0; +} + +// Accumulate cyclic refresh counters. +void av1_accumulate_cyclic_refresh_counters( + CYCLIC_REFRESH *const cyclic_refresh, const MACROBLOCK *const x) { + cyclic_refresh->actual_num_seg1_blocks += x->actual_num_seg1_blocks; + cyclic_refresh->actual_num_seg2_blocks += x->actual_num_seg2_blocks; +} + +void av1_cyclic_refresh_set_golden_update(AV1_COMP *const cpi) { + RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + // Set minimum gf_interval for GF update to a multiple of the refresh period, + // with some max limit. Depending on past encoding stats, GF flag may be + // reset and update may not occur until next baseline_gf_interval. + const int gf_length_mult[2] = { 8, 4 }; + if (cr->percent_refresh > 0) + p_rc->baseline_gf_interval = + AOMMIN(gf_length_mult[cpi->sf.rt_sf.gf_length_lvl] * + (100 / cr->percent_refresh), + MAX_GF_INTERVAL_RT); + else + p_rc->baseline_gf_interval = FIXED_GF_INTERVAL_RT; + if (rc->avg_frame_low_motion && rc->avg_frame_low_motion < 40) + p_rc->baseline_gf_interval = 16; +} + +// Update the segmentation map, and related quantities: cyclic refresh map, +// refresh sb_index, and target number of blocks to be refreshed. +// The map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or to +// 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock. +// Blocks labeled as BOOST1 may later get set to BOOST2 (during the +// encoding of the superblock). 
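+// The sweep resumes from cr->sb_index each frame, so consecutive frames
+// refresh successive regions of the frame.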
+static void cyclic_refresh_update_map(AV1_COMP *const cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  unsigned char *const seg_map = cpi->enc_seg.map;
+  int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame;
+  int xmis, ymis, x, y;
+  uint64_t sb_sad = 0;
+  uint64_t thresh_sad_low = 0;
+  uint64_t thresh_sad = INT64_MAX;
+  const int mi_rows = mi_params->mi_rows, mi_cols = mi_params->mi_cols;
+  const int mi_stride = mi_cols;
+  memset(seg_map, CR_SEGMENT_ID_BASE, mi_rows * mi_cols);
+  sb_cols = (mi_cols + cm->seq_params->mib_size - 1) / cm->seq_params->mib_size;
+  sb_rows = (mi_rows + cm->seq_params->mib_size - 1) / cm->seq_params->mib_size;
+  sbs_in_frame = sb_cols * sb_rows;
+  // Number of target blocks to get the q delta (segment 1).
+  block_count = cr->percent_refresh * mi_rows * mi_cols / 100;
+  // Set the segmentation map: cycle through the superblocks, starting at
+  // cr->sb_index, and stopping when either block_count blocks have been found
+  // to be refreshed, or we have passed through whole frame.
+  if (cr->sb_index >= sbs_in_frame) cr->sb_index = 0;
+  assert(cr->sb_index < sbs_in_frame);
+  i = cr->sb_index;
+  cr->last_sb_index = cr->sb_index;
+  cr->target_num_seg_blocks = 0;
+  do {
+    int sum_map = 0;
+    // Get the mi_row/mi_col corresponding to superblock index i.
+    int sb_row_index = (i / sb_cols);
+    int sb_col_index = i - sb_row_index * sb_cols;
+    int mi_row = sb_row_index * cm->seq_params->mib_size;
+    int mi_col = sb_col_index * cm->seq_params->mib_size;
+    assert(mi_row >= 0 && mi_row < mi_rows);
+    assert(mi_col >= 0 && mi_col < mi_cols);
+    bl_index = mi_row * mi_stride + mi_col;
+    // Loop through all MI blocks in superblock and update map.
+    xmis = AOMMIN(mi_cols - mi_col, cm->seq_params->mib_size);
+    ymis = AOMMIN(mi_rows - mi_row, cm->seq_params->mib_size);
+    if (cr->use_block_sad_scene_det && cpi->rc.frames_since_key > 30 &&
+        cr->counter_encode_maxq_scene_change > 30 &&
+        cpi->src_sad_blk_64x64 != NULL &&
+        cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) {
+      sb_sad = cpi->src_sad_blk_64x64[sb_col_index + sb_cols * sb_row_index];
+      int scale = (cm->width * cm->height < 640 * 360) ? 6 : 8;
+      int scale_low = 2;
+      thresh_sad = (scale * 64 * 64);
+      thresh_sad_low = (scale_low * 64 * 64);
+      // For temporal layers: the base temporal layer (temporal_layer_id = 0)
+      // has larger frame separation (2 or 4 frames apart), so use larger sad
+      // thresholds to compensate for larger frame sad. The larger thresholds
+      // also increase the amount of refresh, which is needed for the base
+      // temporal layer.
+      if (cpi->svc.number_temporal_layers > 1 &&
+          cpi->svc.temporal_layer_id == 0) {
+        thresh_sad <<= 4;
+        thresh_sad_low <<= 2;
+      }
+    }
+    // cr_map only needed at 8x8 blocks.
+    for (y = 0; y < ymis; y += 2) {
+      for (x = 0; x < xmis; x += 2) {
+        const int bl_index2 = bl_index + y * mi_stride + x;
+        // If the block is a candidate for clean up then mark it
+        // for possible boost/refresh (segment 1). The segment id may get
+        // reset to 0 later if block gets coded anything other than low motion.
+        // If the block_sad (sb_sad) is very low label it for refresh anyway.
+        if (cr->map[bl_index2] == 0 || sb_sad < thresh_sad_low) {
+          sum_map += 4;
+        } else if (cr->map[bl_index2] < 0) {
+          cr->map[bl_index2]++;
+        }
+      }
+    }
+    // Enforce constant segment over superblock.
+    // If segment is at least half of superblock, set to 1.
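+    // (Each qualifying 8x8 block adds 4 to sum_map, one per 4x4 mi unit.)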
+    // Enforce that block sad (sb_sad) is not too high.
+    if (sum_map >= (xmis * ymis) >> 1 && sb_sad < thresh_sad) {
+      set_segment_id(seg_map, bl_index, xmis, ymis, mi_stride,
+                     CR_SEGMENT_ID_BOOST1);
+      cr->target_num_seg_blocks += xmis * ymis;
+    }
+    i++;
+    if (i == sbs_in_frame) {
+      i = 0;
+    }
+  } while (cr->target_num_seg_blocks < block_count && i != cr->sb_index);
+  cr->sb_index = i;
+  if (cr->target_num_seg_blocks == 0) {
+    // Disable segmentation, seg_map is already set to 0 above.
+    av1_disable_segmentation(&cm->seg);
+  }
+}
+
+static int is_scene_change_detected(AV1_COMP *const cpi) {
+  return cpi->rc.high_source_sad;
+}
+
+// Set cyclic refresh parameters.
+void av1_cyclic_refresh_update_parameters(AV1_COMP *const cpi) {
+  // TODO(marpan): Parameters need to be tuned.
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const AV1_COMMON *const cm = &cpi->common;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  SVC *const svc = &cpi->svc;
+  const int qp_thresh = AOMMAX(16, rc->best_quality + 4);
+  const int qp_max_thresh = 118 * MAXQ >> 7;
+  const int scene_change_detected = is_scene_change_detected(cpi);
+  const int is_screen_content =
+      (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN);
+
+  // A scene change or key frame marks the start of a cyclic refresh cycle.
+  const int frames_since_scene_change =
+      (cpi->ppi->use_svc || !is_screen_content)
+          ? cpi->rc.frames_since_key
+          : AOMMIN(cpi->rc.frames_since_key,
+                   cr->counter_encode_maxq_scene_change);
+
+  // Cases to reset the cyclic refresh adjustment parameters.
+  if (frame_is_intra_only(cm) || scene_change_detected ||
+      cpi->ppi->rtc_ref.bias_recovery_frame) {
+    // Reset adaptive elements for intra only frames and scene changes.
+    cr->percent_refresh_adjustment = 5;
+    cr->rate_ratio_qdelta_adjustment = 0.25;
+  }
+
+  // Although this segment feature for RTC is only used for
+  // blocks >= 8X8, for more efficient coding of the seg map
+  // cur_frame->seg_map needs to be set at 4x4 along with the
+  // function av1_cyclic_reset_segment_skip(). Skipping over
+  // 4x4 will therefore have small bdrate loss (~0.2%), so
+  // we use it only for speed > 9 for now.
+  // Also if loop-filter deltas are applied via segment, then
+  // we need to set cr->skip_over4x4 = 1.
+  cr->skip_over4x4 = (cpi->oxcf.speed > 9) ? 1 : 0;
+
+  // Should we enable cyclic refresh on this frame.
+  cr->apply_cyclic_refresh = 1;
+  if (frame_is_intra_only(cm) || is_lossless_requested(&cpi->oxcf.rc_cfg) ||
+      scene_change_detected || svc->temporal_layer_id > 0 ||
+      svc->prev_number_spatial_layers != svc->number_spatial_layers ||
+      p_rc->avg_frame_qindex[INTER_FRAME] < qp_thresh ||
+      (svc->number_spatial_layers > 1 &&
+       svc->layer_context[svc->temporal_layer_id].is_key_frame) ||
+      (frames_since_scene_change > 20 &&
+       p_rc->avg_frame_qindex[INTER_FRAME] > qp_max_thresh) ||
+      (rc->avg_frame_low_motion && rc->avg_frame_low_motion < 30 &&
+       frames_since_scene_change > 40) ||
+      cpi->ppi->rtc_ref.bias_recovery_frame) {
+    cr->apply_cyclic_refresh = 0;
+    return;
+  }
+
+  // Increase the amount of refresh for #temporal_layers > 2
+  if (svc->number_temporal_layers > 2)
+    cr->percent_refresh = 15;
+  else
+    cr->percent_refresh = 10 + cr->percent_refresh_adjustment;
+
+  cr->max_qdelta_perc = 60;
+  cr->time_for_refresh = 0;
+  cr->use_block_sad_scene_det =
+      (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN &&
+       cm->seq_params->sb_size == BLOCK_64X64)
+          ? 1
+          : 0;
+  cr->motion_thresh = 32;
+  cr->rate_boost_fac =
+      (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) ? 10 : 15;
+
+  // Use larger delta-qp (increase rate_ratio_qdelta) for first few
+  // refresh cycles after a key frame (svc) or scene change (non svc).
+  // For non svc screen content, after a scene change gradually reduce
+  // this boost and suppress it further if either of the previous two
+  // frames overshot.
+  if (cr->percent_refresh > 0) {
+    if (cpi->ppi->use_svc || !is_screen_content) {
+      if (frames_since_scene_change <
+          ((4 * svc->number_temporal_layers) * (100 / cr->percent_refresh))) {
+        cr->rate_ratio_qdelta = 3.0 + cr->rate_ratio_qdelta_adjustment;
+      } else {
+        cr->rate_ratio_qdelta = 2.25 + cr->rate_ratio_qdelta_adjustment;
+      }
+    } else {
+      double distance_from_sc_factor =
+          AOMMIN(0.75, (int)(frames_since_scene_change / 10) * 0.1);
+      cr->rate_ratio_qdelta =
+          3.0 + cr->rate_ratio_qdelta_adjustment - distance_from_sc_factor;
+      if ((frames_since_scene_change < 10) &&
+          ((cpi->rc.rc_1_frame < 0) || (cpi->rc.rc_2_frame < 0))) {
+        cr->rate_ratio_qdelta -= 0.25;
+      }
+    }
+  } else {
+    cr->rate_ratio_qdelta = 2.25 + cr->rate_ratio_qdelta_adjustment;
+  }
+  // Adjust some parameters for low resolutions.
+  if (cm->width * cm->height <= 352 * 288) {
+    if (cpi->svc.number_temporal_layers > 1) {
+      cr->motion_thresh = 32;
+      cr->rate_boost_fac = 13;
+    } else {
+      if (rc->avg_frame_bandwidth < 3000) {
+        cr->motion_thresh = 16;
+        cr->rate_boost_fac = 13;
+      } else {
+        cr->max_qdelta_perc = 50;
+        cr->rate_ratio_qdelta = AOMMAX(cr->rate_ratio_qdelta, 2.0);
+      }
+    }
+  }
+  if (cpi->oxcf.rc_cfg.mode == AOM_VBR) {
+    // To be adjusted for VBR mode, e.g., based on gf period and boost.
+    // For now use smaller qp-delta (than CBR), no second boosted seg, and
+    // turn-off (no refresh) on golden refresh (since it's already boosted).
+    cr->percent_refresh = 10;
+    cr->rate_ratio_qdelta = 1.5;
+    cr->rate_boost_fac = 10;
+    if (cpi->refresh_frame.golden_frame) {
+      cr->percent_refresh = 0;
+      cr->rate_ratio_qdelta = 1.0;
+    }
+  }
+  if (rc->rtc_external_ratectrl) {
+    cr->actual_num_seg1_blocks = cr->percent_refresh * cm->mi_params.mi_rows *
+                                 cm->mi_params.mi_cols / 100;
+    cr->actual_num_seg2_blocks = 0;
+  }
+}
+
+// Setup cyclic background refresh: set delta q and segmentation map.
+void av1_cyclic_refresh_setup(AV1_COMP *const cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  struct segmentation *const seg = &cm->seg;
+  const int scene_change_detected = is_scene_change_detected(cpi);
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100));
+  const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6);
+  const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+
+  // Set resolution_change flag: for svc only set it when the
+  // number of spatial layers has not changed.
+  const int resolution_change =
+      cm->prev_frame &&
+      (cm->width != cm->prev_frame->width ||
+       cm->height != cm->prev_frame->height) &&
+      cpi->svc.prev_number_spatial_layers == cpi->svc.number_spatial_layers;
+
+  if (resolution_change) av1_cyclic_refresh_reset_resize(cpi);
+  if (!cr->apply_cyclic_refresh) {
+    // Set segmentation map to 0 and disable.
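+    // On intra-only frames and scene changes the cycle state below is also
+    // reset, so the next refresh cycle restarts from the top of the frame.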
+    unsigned char *const seg_map = cpi->enc_seg.map;
+    memset(seg_map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
+    av1_disable_segmentation(&cm->seg);
+    if (frame_is_intra_only(cm) || scene_change_detected ||
+        cpi->ppi->rtc_ref.bias_recovery_frame) {
+      cr->sb_index = 0;
+      cr->last_sb_index = 0;
+      cr->counter_encode_maxq_scene_change = 0;
+      cr->actual_num_seg1_blocks = 0;
+      cr->actual_num_seg2_blocks = 0;
+    }
+    return;
+  } else {
+    cr->counter_encode_maxq_scene_change++;
+    const double q = av1_convert_qindex_to_q(cm->quant_params.base_qindex,
+                                             cm->seq_params->bit_depth);
+    // Set rate threshold to some multiple (currently 4x) of the target
+    // rate (target is given by sb64_target_rate and scaled by 256).
+    cr->thresh_rate_sb = ((int64_t)(rc->sb64_target_rate) << 8) << 2;
+    // Distortion threshold, quadratic in Q, scale factor to be adjusted.
+    // q will not exceed 457, so (q * q) is within 32bit; see:
+    // av1_convert_qindex_to_q(), av1_ac_quant(), ac_qlookup*[].
+    cr->thresh_dist_sb = ((int64_t)(q * q)) << 2;
+    // For low-resolution or lower speeds, the rate/dist thresholds need to be
+    // tuned/updated.
+    if (cpi->oxcf.speed <= 7 || (cm->width * cm->height < 640 * 360)) {
+      cr->thresh_dist_sb = 0;
+      cr->thresh_rate_sb = INT64_MAX;
+    }
+    // Set up segmentation.
+    // Clear down the segment map.
+    av1_enable_segmentation(&cm->seg);
+    av1_clearall_segfeatures(seg);
+
+    // Note: setting temporal_update has no effect, as the seg-map coding
+    // method (temporal or spatial) is determined in
+    // av1_choose_segmap_coding_method(), based on the coding cost of each
+    // method. When error_resilient mode is on, the last_frame_seg_map is set
+    // to 0, so if temporal coding is used, it is relative to 0 previous map.
+    // seg->temporal_update = 0;
+
+    // Segment BASE "Q" feature is disabled so it defaults to the baseline Q.
+    av1_disable_segfeature(seg, CR_SEGMENT_ID_BASE, SEG_LVL_ALT_Q);
+    // Use segment BOOST1 for in-frame Q adjustment.
+    av1_enable_segfeature(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q);
+    // Use segment BOOST2 for more aggressive in-frame Q adjustment.
+    av1_enable_segfeature(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q);
+
+    // Set the q delta for segment BOOST1.
+    const CommonQuantParams *const quant_params = &cm->quant_params;
+    int qindex_delta =
+        compute_deltaq(cpi, quant_params->base_qindex, cr->rate_ratio_qdelta);
+    cr->qindex_delta[1] = qindex_delta;
+
+    // Compute rd-mult for segment BOOST1.
+    const int qindex2 = clamp(
+        quant_params->base_qindex + quant_params->y_dc_delta_q + qindex_delta,
+        0, MAXQ);
+    cr->rdmult = av1_compute_rd_mult(
+        qindex2, cm->seq_params->bit_depth,
+        cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth,
+        boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets,
+        is_stat_consumption_stage(cpi));
+
+    av1_set_segdata(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q, qindex_delta);
+
+    // Set a more aggressive (higher) q delta for segment BOOST2.
+    qindex_delta = compute_deltaq(
+        cpi, quant_params->base_qindex,
+        AOMMIN(CR_MAX_RATE_TARGET_RATIO,
+               0.1 * cr->rate_boost_fac * cr->rate_ratio_qdelta));
+    cr->qindex_delta[2] = qindex_delta;
+    av1_set_segdata(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q, qindex_delta);
+
+    // Update the segmentation and refresh map.
+    cyclic_refresh_update_map(cpi);
+  }
+}
+
+int av1_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr) {
+  return cr->rdmult;
+}
+
+void av1_cyclic_refresh_reset_resize(AV1_COMP *const cpi) {
+  const AV1_COMMON *const cm = &cpi->common;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  memset(cr->map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
+  cr->sb_index = 0;
+  cr->last_sb_index = 0;
+  cpi->refresh_frame.golden_frame = true;
+  cr->apply_cyclic_refresh = 0;
+  cr->counter_encode_maxq_scene_change = 0;
+  cr->percent_refresh_adjustment = 5;
+  cr->rate_ratio_qdelta_adjustment = 0.25;
+}
+
+int av1_cyclic_refresh_disable_lf_cdef(AV1_COMP *const cpi) {
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  const int qindex = cpi->common.quant_params.base_qindex;
+  if (cpi->rc.frames_since_key > 30 && cr->percent_refresh > 0 &&
+      cr->counter_encode_maxq_scene_change > 300 / cr->percent_refresh &&
+      cpi->rc.frame_source_sad < 1000 &&
+      qindex < 7 * (cpi->rc.worst_quality >> 3))
+    return 1;
+  // More aggressive skip.
+  else if (cpi->sf.rt_sf.skip_lf_screen > 1 && !cpi->rc.high_source_sad &&
+           cpi->rc.frame_source_sad < 50000 && qindex < cpi->rc.worst_quality)
+    return 1;
+  return 0;
+}
diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.h b/third_party/aom/av1/encoder/aq_cyclicrefresh.h
new file mode 100644
index 0000000000..10974f018b
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.h
@@ -0,0 +1,332 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_
+#define AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_
+
+#include "av1/common/blockd.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/tokenize.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// The segment ids used in cyclic refresh: from base (no boost) to increasing
+// boost (higher delta-qp).
+#define CR_SEGMENT_ID_BASE 0
+#define CR_SEGMENT_ID_BOOST1 1
+#define CR_SEGMENT_ID_BOOST2 2
+
+// Maximum rate target ratio for setting segment delta-qp.
+#define CR_MAX_RATE_TARGET_RATIO 4.0
+
+/*!
+ * \brief The structure of CYCLIC_REFRESH.
+ * \ingroup cyclic_refresh
+ */
+struct CYCLIC_REFRESH {
+  /*!
+   * Percentage of blocks per frame that are targeted as candidates
+   * for cyclic refresh.
+   */
+  int percent_refresh;
+
+  /*!
+   * Active adjustment delta for cyclic refresh for rate control.
+   */
+  int percent_refresh_adjustment;
+
+  /*!
+   * Maximum q-delta as percentage of base q.
+   */
+  int max_qdelta_perc;
+  /*!
+   * Superblock starting index for cycling through the frame.
+   */
+  int sb_index;
+  /*!
+   * Superblock index where the refresh cycle stopped in the last frame.
+   */
+  int last_sb_index;
+  /*!
+   * Controls how long block will need to wait to be refreshed again, in
+   * excess of the cycle time, i.e., in the case of all zero motion, block
+   * will be refreshed every (100/percent_refresh + time_for_refresh) frames.
+   */
+  int time_for_refresh;
+  /*!
+   * Target number of (4x4) blocks that are set for delta-q.
+   */
+  int target_num_seg_blocks;
+  /*!
+   * Actual number of (4x4) blocks that were applied delta-q,
+   * for segment 1.
+   */
+  int actual_num_seg1_blocks;
+  /*!
+   * Actual number of (4x4) blocks that were applied delta-q,
+   * for segment 2.
+   */
+  int actual_num_seg2_blocks;
+  /*!
+   * RD mult. parameters for segment 1.
+   */
+  int rdmult;
+  /*!
+   * Cyclic refresh map.
+   */
+  int8_t *map;
+  /*!
+   * Threshold applied to the projected rate of the coding block,
+   * when deciding whether block should be refreshed.
+   */
+  int64_t thresh_rate_sb;
+  /*!
+   * Threshold applied to the projected distortion of the coding block,
+   * when deciding whether block should be refreshed.
+   */
+  int64_t thresh_dist_sb;
+  /*!
+   * Threshold applied to the motion vector (in units of 1/8 pel) of the
+   * coding block, when deciding whether block should be refreshed.
+   */
+  int16_t motion_thresh;
+  /*!
+   * Rate target ratio to set q delta.
+   */
+  double rate_ratio_qdelta;
+
+  /*!
+   * Active adjustment of qdelta rate ratio for enhanced rate control
+   */
+  double rate_ratio_qdelta_adjustment;
+
+  /*!
+   * Boost factor for rate target ratio, for segment CR_SEGMENT_ID_BOOST2.
+   */
+  int rate_boost_fac;
+
+  /*!\cond */
+  int qindex_delta[3];
+  int apply_cyclic_refresh;
+  int skip_over4x4;
+  int counter_encode_maxq_scene_change;
+  int use_block_sad_scene_det;
+  /*!\endcond */
+};
+
+struct AV1_COMP;
+
+typedef struct CYCLIC_REFRESH CYCLIC_REFRESH;
+
+CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols);
+
+void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr);
+
+/*!\brief Estimate the bits, incorporating the delta-q from the segments.
+ *
+ * For the just encoded frame, estimate the bits, incorporating the delta-q
+ * from non-base segment(s). Note this function is called in the postencode
+ * (called from rc_update_rate_correction_factors()).
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]   cpi                 Top level encoder structure
+ * \param[in]   correction_factor   rate correction factor
+ *
+ * \return Return the estimated bits at given q.
+ */
+int av1_cyclic_refresh_estimate_bits_at_q(const struct AV1_COMP *cpi,
+                                          double correction_factor);
+
+/*!\brief Estimate the bits per mb, for given q = i and delta-q.
+ *
+ * Prior to encoding the frame, estimate the bits per mb, for a given q = i and
+ * a corresponding delta-q (for segment 1). This function is called in the
+ * rc_regulate_q() to set the base qp index. Note: the segment map is set to
+ * either 0/CR_SEGMENT_ID_BASE (no refresh) or to 1/CR_SEGMENT_ID_BOOST1
+ * (refresh) for each superblock, prior to encoding.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]   cpi                 Top level encoder structure
+ * \param[in]   i                   q index
+ * \param[in]   correction_factor   rate correction factor
+ *
+ * \return Return the estimated bits for q = i and delta-q (segment 1).
+ */
+int av1_cyclic_refresh_rc_bits_per_mb(const struct AV1_COMP *cpi, int i,
+                                      double correction_factor);
+
+/*!\brief Update segment_id for blocks that are skipped.
+ *
+ * After encoding a given prediction block, of size bsize at (mi_row, mi_col),
+ * check if we should reset the segment_id based on skip_txfm,
+ * and update the cyclic_refresh map and segmentation counters.
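+ * (These counters are the per-segment 4x4 block tallies that feed the bit
+ * estimates in av1_cyclic_refresh_estimate_bits_at_q().)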
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]   cpi       Top level encoder structure
+ * \param[in]   x         Pointer to MACROBLOCK structure
+ * \param[in]   mi_row    Row coordinate of the block in a step size of MI_SIZE
+ * \param[in]   mi_col    Col coordinate of the block in a step size of MI_SIZE
+ * \param[in]   bsize     Block size
+ * \param[in]   dry_run   A code indicating whether it is part of the final
+ *                        pass for reconstructing the superblock
+ *
+ * \remark Update the \c mbmi->segment_id, the \c cpi->cyclic_refresh and
+ * the \c cm->cpi->enc_seg.map.
+ */
+
+void av1_cyclic_reset_segment_skip(const struct AV1_COMP *cpi,
+                                   MACROBLOCK *const x, int mi_row, int mi_col,
+                                   BLOCK_SIZE bsize, RUN_TYPE dry_run);
+
+/*!\brief Update segment_id for block based on mode selected.
+ *
+ * Prior to coding a given prediction block, of size bsize at (mi_row, mi_col),
+ * check if we should reset the segment_id (based on mode/motion/skip selected
+ * for that block) and update the cyclic_refresh map and segmentation map.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]   cpi       Top level encoder structure
+ * \param[in]   x         Pointer to MACROBLOCK structure
+ * \param[in]   mi_row    Row coordinate of the block in a step size of MI_SIZE
+ * \param[in]   mi_col    Col coordinate of the block in a step size of MI_SIZE
+ * \param[in]   bsize     Block size
+ * \param[in]   rate      Projected block rate from pickmode
+ * \param[in]   dist      Projected block dist from pickmode
+ * \param[in]   skip      Skip flag set from pickmode
+ * \param[in]   dry_run   A code indicating whether it is part of the final
+ *                        pass for reconstructing the superblock
+ *
+ * \remark Update the \c mbmi->segment_id, the \c cpi->cyclic_refresh and
+ * the \c cm->cpi->enc_seg.map.
+ */
+void av1_cyclic_refresh_update_segment(const struct AV1_COMP *cpi,
+                                       MACROBLOCK *const x, int mi_row,
+                                       int mi_col, BLOCK_SIZE bsize,
+                                       int64_t rate, int64_t dist, int skip,
+                                       RUN_TYPE dry_run);
+
+/*!\brief Initialize counters used for cyclic refresh.
+ *
+ * Initializes cyclic refresh counters actual_num_seg1_blocks and
+ * actual_num_seg2_blocks.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]   x         Pointer to MACROBLOCK structure
+ *
+ * \remark Update the \c x->actual_num_seg1_blocks and the
+ * \c x->actual_num_seg2_blocks.
+ */
+void av1_init_cyclic_refresh_counters(MACROBLOCK *const x);
+
+/*!\brief Accumulate cyclic refresh counters.
+ *
+ * Accumulates cyclic refresh counters actual_num_seg1_blocks and
+ * actual_num_seg2_blocks from the MACROBLOCK structure into the
+ * CYCLIC_REFRESH structure.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]   cyclic_refresh   Pointer to CYCLIC_REFRESH structure
+ * \param[in]   x                Pointer to MACROBLOCK structure
+ *
+ * \remark Update the \c cyclic_refresh->actual_num_seg1_blocks and the
+ * \c cyclic_refresh->actual_num_seg2_blocks.
+ */
+void av1_accumulate_cyclic_refresh_counters(
+    CYCLIC_REFRESH *const cyclic_refresh, const MACROBLOCK *const x);
+
+/*!\brief Set golden frame update interval based on cyclic refresh.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]   cpi       Top level encoder structure
+ *
+ * \remark Returns the interval in \c cpi->rc.baseline_gf_interval.
+ */
+void av1_cyclic_refresh_set_golden_update(struct AV1_COMP *const cpi);
+
+/*!\brief Set the global/frame level parameters for cyclic refresh.
+ *
+ * First call to the cyclic refresh, before encoding the frame.
+ * Sets the flag on whether cyclic refresh should be applied, sets + * the amount/percent of refresh, and the amount of boost applied to + * the two segments (set by rate_ratio_qdelta and rate_boost_fac). + * + * \ingroup cyclic_refresh + * \callgraph + * \callergraph + * + * \param[in] cpi Top level encoder structure + * + * \remark Updates the \c cpi->cyclic_refresh with the settings. + */ +void av1_cyclic_refresh_update_parameters(struct AV1_COMP *const cpi); + +/*!\brief Setup the cyclic background refresh. + * + * Set the delta q for the segment(s), and set the segmentation map. + * + * \ingroup cyclic_refresh + * \callgraph + * \callergraph + * + * \param[in] cpi Top level encoder structure + * + * \remark Updates the \c cpi->cyclic_refresh with the cyclic refresh + * parameters and the \c cm->seg with the segmentation data. + */ +void av1_cyclic_refresh_setup(struct AV1_COMP *const cpi); + +int av1_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr); + +void av1_cyclic_refresh_reset_resize(struct AV1_COMP *const cpi); + +int av1_cyclic_refresh_disable_lf_cdef(struct AV1_COMP *const cpi); + +static INLINE int cyclic_refresh_segment_id_boosted(int segment_id) { + return segment_id == CR_SEGMENT_ID_BOOST1 || + segment_id == CR_SEGMENT_ID_BOOST2; +} + +static INLINE int cyclic_refresh_segment_id(int segment_id) { + if (segment_id == CR_SEGMENT_ID_BOOST1) + return CR_SEGMENT_ID_BOOST1; + else if (segment_id == CR_SEGMENT_ID_BOOST2) + return CR_SEGMENT_ID_BOOST2; + else + return CR_SEGMENT_ID_BASE; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_ diff --git a/third_party/aom/av1/encoder/aq_variance.c b/third_party/aom/av1/encoder/aq_variance.c new file mode 100644 index 0000000000..086928a118 --- /dev/null +++ b/third_party/aom/av1/encoder/aq_variance.c @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <math.h>
+
+#include "aom_ports/mem.h"
+
+#include "av1/encoder/aq_variance.h"
+#include "av1/common/seg_common.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/dwt.h"
+
+static const double rate_ratio[MAX_SEGMENTS] = { 2.2, 1.7, 1.3, 1.0,
+                                                 0.9, .8,  .7,  .6 };
+
+static const double deltaq_rate_ratio[MAX_SEGMENTS] = { 2.5,  2.0, 1.5, 1.0,
+                                                        0.75, 1.0, 1.0, 1.0 };
+#define ENERGY_MIN (-4)
+#define ENERGY_MAX (1)
+#define ENERGY_SPAN (ENERGY_MAX - ENERGY_MIN + 1)
+#define ENERGY_IN_BOUNDS(energy) \
+  assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX)
+
+DECLARE_ALIGNED(16, static const uint8_t, av1_all_zeros[MAX_SB_SIZE]) = { 0 };
+
+DECLARE_ALIGNED(16, static const uint16_t,
+                av1_highbd_all_zeros[MAX_SB_SIZE]) = { 0 };
+
+static const int segment_id[ENERGY_SPAN] = { 0, 1, 1, 2, 3, 4 };
+
+#define SEGMENT_ID(i) segment_id[(i)-ENERGY_MIN]
+
+void av1_vaq_frame_setup(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+  const int base_qindex = cm->quant_params.base_qindex;
+  struct segmentation *seg = &cm->seg;
+  int i;
+
+  int resolution_change =
+      cm->prev_frame && (cm->width != cm->prev_frame->width ||
+                         cm->height != cm->prev_frame->height);
+  int avg_energy = (int)(cpi->twopass_frame.mb_av_energy - 2);
+  double avg_ratio;
+  if (avg_energy > 7) avg_energy = 7;
+  if (avg_energy < 0) avg_energy = 0;
+  avg_ratio = rate_ratio[avg_energy];
+
+  if (resolution_change) {
+    memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
+    av1_clearall_segfeatures(seg);
+    av1_disable_segmentation(seg);
+    return;
+  }
+  if (frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
+      refresh_frame->alt_ref_frame ||
+      (refresh_frame->golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
+    cpi->vaq_refresh = 1;
+
+    av1_enable_segmentation(seg);
+    av1_clearall_segfeatures(seg);
+
+    for (i = 0; i < MAX_SEGMENTS; ++i) {
+      // Set up avg segment id to be 1.0 and adjust the other segments around
+      // it.
+      int qindex_delta =
+          av1_compute_qdelta_by_rate(cpi, cm->current_frame.frame_type,
+                                     base_qindex, rate_ratio[i] / avg_ratio);
+
+      // We don't allow qindex 0 in a segment if the base value is not 0.
+      // Q index 0 (lossless) implies 4x4 encoding only and in AQ mode a segment
+      // Q delta is sometimes applied without going back around the rd loop.
+      // This could lead to an illegal combination of partition size and q.
+      if ((base_qindex != 0) && ((base_qindex + qindex_delta) == 0)) {
+        qindex_delta = -base_qindex + 1;
+      }
+
+      av1_set_segdata(seg, i, SEG_LVL_ALT_Q, qindex_delta);
+      av1_enable_segfeature(seg, i, SEG_LVL_ALT_Q);
+    }
+  }
+}
+
+int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
+  // This function returns a score for the block's local variance, calculated
+  // as the sum of the log of the 4x4 variances of each subblock of the
+  // current block (x, bs), scaled by 32 / number of pixels in the block size.
+  // It is used for segmentation to avoid situations in which a large block
+  // with a gentle gradient gets marked high variance even though each
+  // subblock has a low variance. This allows us to assign the same segment
+  // number for the same sorts of area regardless of how the partitioning goes.
+
+  MACROBLOCKD *xd = &x->e_mbd;
+  double var = 0;
+  unsigned int sse;
+  int i, j;
+
+  int right_overflow =
+      (xd->mb_to_right_edge < 0) ? ((-xd->mb_to_right_edge) >> 3) : 0;
+
+int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
+  // This function returns a score for the block's local variance, calculated
+  // as the average of the log of the 4x4 variances of each subblock of the
+  // current block (x, bs).
+  // This is used for segmentation to avoid situations in which a large block
+  // with a gentle gradient gets marked as high variance even though each
+  // subblock has a low variance. This allows us to assign the same segment
+  // number for the same sorts of area regardless of how the partitioning
+  // goes.
+
+  MACROBLOCKD *xd = &x->e_mbd;
+  double var = 0;
+  unsigned int sse;
+  int i, j;
+
+  int right_overflow =
+      (xd->mb_to_right_edge < 0) ? ((-xd->mb_to_right_edge) >> 3) : 0;
+  int bottom_overflow =
+      (xd->mb_to_bottom_edge < 0) ? ((-xd->mb_to_bottom_edge) >> 3) : 0;
+
+  const int bw = MI_SIZE * mi_size_wide[bs] - right_overflow;
+  const int bh = MI_SIZE * mi_size_high[bs] - bottom_overflow;
+
+  for (i = 0; i < bh; i += 4) {
+    for (j = 0; j < bw; j += 4) {
+      if (is_cur_buf_hbd(xd)) {
+        var += log1p(cpi->ppi->fn_ptr[BLOCK_4X4].vf(
+                         x->plane[0].src.buf + i * x->plane[0].src.stride + j,
+                         x->plane[0].src.stride,
+                         CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, &sse) /
+                     16.0);
+      } else {
+        var += log1p(cpi->ppi->fn_ptr[BLOCK_4X4].vf(
+                         x->plane[0].src.buf + i * x->plane[0].src.stride + j,
+                         x->plane[0].src.stride, av1_all_zeros, 0, &sse) /
+                     16.0);
+      }
+    }
+  }
+  // Use the average of the 4x4 log variances. For 8-bit input the range is
+  // 0 - 9.704121561.
+  var /= (bw / 4 * bh / 4);
+  if (var > 7) var = 7;
+
+  return (int)(var);
+}
+
+int av1_log_block_avg(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs,
+                      int mi_row, int mi_col) {
+  // This function returns the average of the luma block.
+  unsigned int sum, avg, num_pix;
+  int r, c;
+  const int pic_w = cpi->common.width;
+  const int pic_h = cpi->common.height;
+  const int bw = MI_SIZE * mi_size_wide[bs];
+  const int bh = MI_SIZE * mi_size_high[bs];
+  const uint16_t *x16 = CONVERT_TO_SHORTPTR(x->plane[0].src.buf);
+
+  sum = 0;
+  num_pix = 0;
+  avg = 0;
+  int row = mi_row << MI_SIZE_LOG2;
+  int col = mi_col << MI_SIZE_LOG2;
+  for (r = row; (r < (row + bh)) && (r < pic_h); r++) {
+    for (c = col; (c < (col + bw)) && (c < pic_w); c++) {
+      sum += *(x16 + r * x->plane[0].src.stride + c);
+      num_pix++;
+    }
+  }
+  if (num_pix != 0) {
+    avg = sum / num_pix;
+  }
+  return avg;
+}
+
+#define DEFAULT_E_MIDPOINT 10.0
+
+static unsigned int haar_ac_energy(MACROBLOCK *x, BLOCK_SIZE bs) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  int stride = x->plane[0].src.stride;
+  uint8_t *buf = x->plane[0].src.buf;
+  const int num_8x8_cols = block_size_wide[bs] / 8;
+  const int num_8x8_rows = block_size_high[bs] / 8;
+  const int hbd = is_cur_buf_hbd(xd);
+
+  int64_t var = av1_haar_ac_sad_mxn_uint8_input(buf, stride, hbd, num_8x8_rows,
+                                                num_8x8_cols);
+
+  return (unsigned int)((uint64_t)var * 256) >> num_pels_log2_lookup[bs];
+}
+
+static double log_block_wavelet_energy(MACROBLOCK *x, BLOCK_SIZE bs) {
+  unsigned int haar_sad = haar_ac_energy(x, bs);
+  return log1p(haar_sad);
+}
+
+int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x,
+                                   BLOCK_SIZE bs) {
+  double energy, energy_midpoint;
+  energy_midpoint = (is_stat_consumption_stage_twopass(cpi))
+                        ?
cpi->twopass_frame.frame_avg_haar_energy + : DEFAULT_E_MIDPOINT; + energy = log_block_wavelet_energy(x, bs) - energy_midpoint; + return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX); +} + +int av1_compute_q_from_energy_level_deltaq_mode(const AV1_COMP *const cpi, + int block_var_level) { + int rate_level; + const AV1_COMMON *const cm = &cpi->common; + + if (DELTA_Q_PERCEPTUAL_MODULATION == 1) { + ENERGY_IN_BOUNDS(block_var_level); + rate_level = SEGMENT_ID(block_var_level); + } else { + rate_level = block_var_level; + } + const int base_qindex = cm->quant_params.base_qindex; + int qindex_delta = + av1_compute_qdelta_by_rate(cpi, cm->current_frame.frame_type, base_qindex, + deltaq_rate_ratio[rate_level]); + + if ((base_qindex != 0) && ((base_qindex + qindex_delta) == 0)) { + qindex_delta = -base_qindex + 1; + } + return base_qindex + qindex_delta; +} diff --git a/third_party/aom/av1/encoder/aq_variance.h b/third_party/aom/av1/encoder/aq_variance.h new file mode 100644 index 0000000000..aa0535ad72 --- /dev/null +++ b/third_party/aom/av1/encoder/aq_variance.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_AQ_VARIANCE_H_ +#define AOM_AV1_ENCODER_AQ_VARIANCE_H_ + +#include "av1/encoder/encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_vaq_frame_setup(AV1_COMP *cpi); + +int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs); +int av1_log_block_avg(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs, + int mi_row, int mi_col); +int av1_compute_q_from_energy_level_deltaq_mode(const AV1_COMP *const cpi, + int block_var_level); +int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bs); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_AQ_VARIANCE_H_ diff --git a/third_party/aom/av1/encoder/arm/crc32/hash_arm_crc32.c b/third_party/aom/av1/encoder/arm/crc32/hash_arm_crc32.c new file mode 100644 index 0000000000..91fc1e00a5 --- /dev/null +++ b/third_party/aom/av1/encoder/arm/crc32/hash_arm_crc32.c @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2022, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#if defined(_MSC_VER) && !defined(__clang__)
+#include <intrin.h>
+#else
+#include <arm_acle.h>
+#endif
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "config/aom_config.h"
+
+#define CRC_LOOP(op, crc, type, buf, len) \
+  while ((len) >= sizeof(type)) {         \
+    (crc) = op((crc), *(type *)(buf));    \
+    (len) -= sizeof(type);                \
+    buf += sizeof(type);                  \
+  }
+
+#define CRC_SINGLE(op, crc, type, buf, len) \
+  if ((len) >= sizeof(type)) {              \
+    (crc) = op((crc), *(type *)(buf));      \
+    (len) -= sizeof(type);                  \
+    buf += sizeof(type);                    \
+  }
+
+/* Return 32-bit CRC for the input buffer.
+ * Polynomial is 0x1EDC6F41.
+ */
+
+uint32_t av1_get_crc32c_value_arm_crc32(void *crc_calculator, uint8_t *p,
+                                        size_t len) {
+  (void)crc_calculator;
+  const uint8_t *buf = p;
+  uint32_t crc = 0xFFFFFFFF;
+
+#if !AOM_ARCH_AARCH64
+  // Align input to 8-byte boundary (only necessary for 32-bit builds).
+  while (len && ((uintptr_t)buf & 7)) {
+    crc = __crc32cb(crc, *buf++);
+    len--;
+  }
+#endif
+
+  CRC_LOOP(__crc32cd, crc, uint64_t, buf, len)
+  CRC_SINGLE(__crc32cw, crc, uint32_t, buf, len)
+  CRC_SINGLE(__crc32ch, crc, uint16_t, buf, len)
+  CRC_SINGLE(__crc32cb, crc, uint8_t, buf, len)
+
+  return ~crc;
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/av1_error_neon.c b/third_party/aom/av1/encoder/arm/neon/av1_error_neon.c
new file mode 100644
index 0000000000..26d06b46fe
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/av1_error_neon.c
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+int64_t av1_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+                             intptr_t block_size, int64_t *ssz) {
+  uint64x2_t err_u64 = vdupq_n_u64(0);
+  int64x2_t ssz_s64 = vdupq_n_s64(0);
+
+  assert(block_size >= 16);
+  assert((block_size % 16) == 0);
+
+  do {
+    const int16x8_t c0 = load_tran_low_to_s16q(coeff);
+    const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8);
+    const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff);
+    const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8);
+
+    const uint16x8_t diff0 = vreinterpretq_u16_s16(vabdq_s16(c0, d0));
+    const uint16x8_t diff1 = vreinterpretq_u16_s16(vabdq_s16(c1, d1));
+
+    // By operating on unsigned integers we can store up to 4 squared diff in
+    // a 32-bit element before having to widen to 64 bits.
+    uint32x4_t err = vmull_u16(vget_low_u16(diff0), vget_low_u16(diff0));
+    err = vmlal_u16(err, vget_high_u16(diff0), vget_high_u16(diff0));
+    err = vmlal_u16(err, vget_low_u16(diff1), vget_low_u16(diff1));
+    err = vmlal_u16(err, vget_high_u16(diff1), vget_high_u16(diff1));
+    err_u64 = vpadalq_u32(err_u64, err);
+
+    // We can't do the same here as we're operating on signed integers, so we
+    // can only accumulate 2 squares.
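+    // (The square of a 16-bit value can be as large as 2^30, so a signed
+    // 32-bit lane only has headroom for two such squares; the 64-bit
+    // pairwise accumulation below provides the remaining headroom.)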
+ int32x4_t ssz0 = vmull_s16(vget_low_s16(c0), vget_low_s16(c0)); + ssz0 = vmlal_s16(ssz0, vget_high_s16(c0), vget_high_s16(c0)); + ssz_s64 = vpadalq_s32(ssz_s64, ssz0); + + int32x4_t ssz1 = vmull_s16(vget_low_s16(c1), vget_low_s16(c1)); + ssz1 = vmlal_s16(ssz1, vget_high_s16(c1), vget_high_s16(c1)); + ssz_s64 = vpadalq_s32(ssz_s64, ssz1); + + coeff += 16; + dqcoeff += 16; + block_size -= 16; + } while (block_size != 0); + + *ssz = horizontal_add_s64x2(ssz_s64); + return (int64_t)horizontal_add_u64x2(err_u64); +} + +int64_t av1_block_error_lp_neon(const int16_t *coeff, const int16_t *dqcoeff, + int block_size) { + uint64x2_t err_u64 = vdupq_n_u64(0); + + assert(block_size >= 16); + assert((block_size % 16) == 0); + + do { + const int16x8_t c0 = vld1q_s16(coeff); + const int16x8_t c1 = vld1q_s16(coeff + 8); + const int16x8_t d0 = vld1q_s16(dqcoeff); + const int16x8_t d1 = vld1q_s16(dqcoeff + 8); + + const uint16x8_t diff0 = vreinterpretq_u16_s16(vabdq_s16(c0, d0)); + const uint16x8_t diff1 = vreinterpretq_u16_s16(vabdq_s16(c1, d1)); + + // By operating on unsigned integers we can store up to 4 squared diff in a + // 32-bit element before having to widen to 64 bits. + uint32x4_t err = vmull_u16(vget_low_u16(diff0), vget_low_u16(diff0)); + err = vmlal_u16(err, vget_high_u16(diff0), vget_high_u16(diff0)); + err = vmlal_u16(err, vget_low_u16(diff1), vget_low_u16(diff1)); + err = vmlal_u16(err, vget_high_u16(diff1), vget_high_u16(diff1)); + err_u64 = vpadalq_u32(err_u64, err); + + coeff += 16; + dqcoeff += 16; + block_size -= 16; + } while (block_size != 0); + + return (int64_t)horizontal_add_u64x2(err_u64); +} diff --git a/third_party/aom/av1/encoder/arm/neon/av1_error_sve.c b/third_party/aom/av1/encoder/arm/neon/av1_error_sve.c new file mode 100644 index 0000000000..63aad0b785 --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/av1_error_sve.c @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/dot_sve.h"
+#include "aom_dsp/arm/mem_neon.h"
+
+int64_t av1_block_error_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+                            intptr_t block_size, int64_t *ssz) {
+  int64x2_t error[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+  int64x2_t sqcoeff[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+
+  assert(block_size >= 16);
+  assert((block_size % 16) == 0);
+
+  do {
+    const int16x8_t c0 = load_tran_low_to_s16q(coeff);
+    const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8);
+    const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff);
+    const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8);
+
+    const int16x8_t diff0 = vsubq_s16(c0, d0);
+    const int16x8_t diff1 = vsubq_s16(c1, d1);
+
+    error[0] = aom_sdotq_s16(error[0], diff0, diff0);
+    error[1] = aom_sdotq_s16(error[1], diff1, diff1);
+    sqcoeff[0] = aom_sdotq_s16(sqcoeff[0], c0, c0);
+    sqcoeff[1] = aom_sdotq_s16(sqcoeff[1], c1, c1);
+
+    coeff += 16;
+    dqcoeff += 16;
+    block_size -= 16;
+  } while (block_size != 0);
+
+  *ssz = vaddvq_s64(vaddq_s64(sqcoeff[0], sqcoeff[1]));
+  return vaddvq_s64(vaddq_s64(error[0], error[1]));
+}
+
+int64_t av1_block_error_lp_sve(const int16_t *coeff, const int16_t *dqcoeff,
+                               int block_size) {
+  if (block_size % 32 == 0) {
+    int64x2_t error[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
+                           vdupq_n_s64(0) };
+
+    do {
+      const int16x8_t c0 = vld1q_s16(coeff);
+      const int16x8_t c1 = vld1q_s16(coeff + 8);
+      const int16x8_t c2 = vld1q_s16(coeff + 16);
+      const int16x8_t c3 = vld1q_s16(coeff + 24);
+      const int16x8_t d0 = vld1q_s16(dqcoeff);
+      const int16x8_t d1 = vld1q_s16(dqcoeff + 8);
+      const int16x8_t d2 = vld1q_s16(dqcoeff + 16);
+      const int16x8_t d3 = vld1q_s16(dqcoeff + 24);
+
+      const int16x8_t diff0 = vsubq_s16(c0, d0);
+      const int16x8_t diff1 = vsubq_s16(c1, d1);
+      const int16x8_t diff2 = vsubq_s16(c2, d2);
+      const int16x8_t diff3 = vsubq_s16(c3, d3);
+
+      error[0] = aom_sdotq_s16(error[0], diff0, diff0);
+      error[1] = aom_sdotq_s16(error[1], diff1, diff1);
+      error[2] = aom_sdotq_s16(error[2], diff2, diff2);
+      error[3] = aom_sdotq_s16(error[3], diff3, diff3);
+
+      coeff += 32;
+      dqcoeff += 32;
+      block_size -= 32;
+    } while (block_size != 0);
+
+    error[0] = vaddq_s64(error[0], error[1]);
+    error[2] = vaddq_s64(error[2], error[3]);
+    error[0] = vaddq_s64(error[0], error[2]);
+    return vaddvq_s64(error[0]);
+  }
+  assert(block_size == 16);
+
+  int64x2_t error[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+
+  do {
+    const int16x8_t c0 = vld1q_s16(coeff);
+    const int16x8_t c1 = vld1q_s16(coeff + 8);
+    const int16x8_t d0 = vld1q_s16(dqcoeff);
+    const int16x8_t d1 = vld1q_s16(dqcoeff + 8);
+
+    const int16x8_t diff0 = vsubq_s16(c0, d0);
+    const int16x8_t diff1 = vsubq_s16(c1, d1);
+
+    error[0] = aom_sdotq_s16(error[0], diff0, diff0);
+    error[1] = aom_sdotq_s16(error[1], diff1, diff1);
+
+    coeff += 16;
+    dqcoeff += 16;
+    block_size -= 16;
+  } while (block_size != 0);
+
+  return vaddvq_s64(vaddq_s64(error[0], error[1]));
+}
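+
+// Both paths above lean on aom_sdotq_s16() from aom_dsp/arm/dot_sve.h, which
+// wraps the SVE signed dot product and accumulates int16 products directly
+// into 64-bit lanes. This is why, unlike the Neon version, no intermediate
+// 32-bit accumulator or unsigned absolute-difference trick is needed here.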
diff --git a/third_party/aom/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c b/third_party/aom/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
new file mode 100644
index 0000000000..5148ee74a9
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
@@ -0,0 +1,3090 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_ports/mem.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "shift_neon.h"
+#include "txfm_neon.h"
+
+#define TXFM_COS_BIT_MAX 13
+
+// A note on butterfly helper naming:
+//
+// butterfly_[input_ty]_[acc_ty]_[input_num]_[weight_num]_[weight_neg]_neon
+// e.g. butterfly_s32_s32_x4_0231_neon
+//                |   |   |   ^ Weights are applied as indices 0, 2, 3, 1
+//                |   |   |     (see more detail below)
+//                |   |   ^ (int32)x4 input/output parameters
+//                |   ^ 32-bit accumulators internally
+//                ^ 32-bit input/output parameters
+//
+// Weights are stored as 4-tuples in Q2.13 format as (w0, 1-w0, -w0, w0-1) to
+// avoid needing separate negation instructions. This is represented in the
+// helper naming by referring to the lane index in the loaded tuple that each
+// multiply is performed with:
+//
+//        in0   in1
+//      /----------
+// out0 |  w0    w1   ==>  out0 = in0 * w0 + in1 * w1
+// out1 |  w2    w3   ==>  out1 = in0 * w2 + in1 * w3
+//
+// So for indices 0231 from the earlier example, we end up with:
+//
+//          in0       in1
+//      /------------------
+// out0 | (lane 0)  (lane 2)   ==>  out0 = in0 *  w0     + in1 * -w0
+// out1 | (lane 3)  (lane 1)   ==>  out1 = in0 * (w0-1)  + in1 * (1-w0)
+
+static AOM_FORCE_INLINE void butterfly_s32_s32_x4_0112_neon(
+    const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
+    int32x4_t *out0, int32x4_t *out1) {
+  int32x4_t w0101 = vmovl_s16(w0101_s16);
+  int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 0);
+  o0 = vmlaq_lane_s32(o0, in1, vget_low_s32(w0101), 1);
+  int32x4_t o1 = vmulq_lane_s32(in0, vget_low_s32(w0101), 1);
+  o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 0);
+  *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX);
+  *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX);
+}
+
+static AOM_FORCE_INLINE void butterfly_s32_s32_x4_0332_neon(
+    const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
+    int32x4_t *out0, int32x4_t *out1) {
+  int32x4_t w0101 = vmovl_s16(w0101_s16);
+  int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 0);
+  o0 = vmlaq_lane_s32(o0, in1, vget_high_s32(w0101), 1);
+  int32x4_t o1 = vmulq_lane_s32(in0, vget_high_s32(w0101), 1);
+  o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 0);
+  *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX);
+  *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX);
+}
+
+static AOM_FORCE_INLINE void butterfly_s32_s32_x4_1003_neon(
+    const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
+    int32x4_t *out0, int32x4_t *out1) {
+  int32x4_t w0101 = vmovl_s16(w0101_s16);
+  int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 1);
+  o0 = vmlaq_lane_s32(o0, in1, vget_low_s32(w0101), 0);
+  int32x4_t o1 = vmulq_lane_s32(in0, vget_low_s32(w0101), 0);
+  o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 1);
+  *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX);
+  *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX);
+}
+
+static AOM_FORCE_INLINE void butterfly_s32_s32_x4_1223_neon(
+    const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
+    int32x4_t *out0, int32x4_t *out1) {
+  int32x4_t
w0101 = vmovl_s16(w0101_s16); + int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 1); + o0 = vmlaq_lane_s32(o0, in1, vget_high_s32(w0101), 0); + int32x4_t o1 = vmulq_lane_s32(in0, vget_high_s32(w0101), 0); + o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 1); + *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX); + *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX); +} + +#define butterfly_s16_s32_x4_neon(wvec, lane0, lane1, lane2, lane3, in0, in1, \ + out0, out1) \ + do { \ + int32x4_t u0 = vmull_lane_s16(in0, wvec, lane0); \ + u0 = vmlal_lane_s16(u0, in1, wvec, lane1); \ + int32x4_t v0 = vmull_lane_s16(in0, wvec, lane2); \ + v0 = vmlal_lane_s16(v0, in1, wvec, lane3); \ + *out0 = vqrshrn_n_s32(u0, TXFM_COS_BIT_MAX); \ + *out1 = vqrshrn_n_s32(v0, TXFM_COS_BIT_MAX); \ + } while (0) + +static AOM_FORCE_INLINE void butterfly_s16_s32_x4_0112_neon( + const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1, + int16x4_t *out0, int16x4_t *out1) { + butterfly_s16_s32_x4_neon(w0101, 0, 1, 1, 2, in0, in1, out0, out1); +} + +static AOM_FORCE_INLINE void butterfly_s16_s32_x4_0332_neon( + const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1, + int16x4_t *out0, int16x4_t *out1) { + butterfly_s16_s32_x4_neon(w0101, 0, 3, 3, 2, in0, in1, out0, out1); +} + +static AOM_FORCE_INLINE void butterfly_s16_s32_x4_1003_neon( + const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1, + int16x4_t *out0, int16x4_t *out1) { + butterfly_s16_s32_x4_neon(w0101, 1, 0, 0, 3, in0, in1, out0, out1); +} + +static AOM_FORCE_INLINE void butterfly_s16_s32_x4_1223_neon( + const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1, + int16x4_t *out0, int16x4_t *out1) { + butterfly_s16_s32_x4_neon(w0101, 1, 2, 2, 3, in0, in1, out0, out1); +} + +#define butterfly_s16_s32_x8_neon(wvec, lane0, lane1, lane2, lane3, in0, in1, \ + out0, out1) \ + do { \ + int32x4_t u0 = vmull_lane_s16(vget_low_s16(in0), wvec, lane0); \ + u0 = vmlal_lane_s16(u0, vget_low_s16(in1), wvec, lane1); \ + int32x4_t u1 = vmull_lane_s16(vget_high_s16(in0), wvec, lane0); \ + u1 = vmlal_lane_s16(u1, vget_high_s16(in1), wvec, lane1); \ + int32x4_t v0 = vmull_lane_s16(vget_low_s16(in0), wvec, lane2); \ + v0 = vmlal_lane_s16(v0, vget_low_s16(in1), wvec, lane3); \ + int32x4_t v1 = vmull_lane_s16(vget_high_s16(in0), wvec, lane2); \ + v1 = vmlal_lane_s16(v1, vget_high_s16(in1), wvec, lane3); \ + const int16x4_t c0 = vrshrn_n_s32(u0, TXFM_COS_BIT_MAX); \ + const int16x4_t c1 = vrshrn_n_s32(u1, TXFM_COS_BIT_MAX); \ + const int16x4_t d0 = vrshrn_n_s32(v0, TXFM_COS_BIT_MAX); \ + const int16x4_t d1 = vrshrn_n_s32(v1, TXFM_COS_BIT_MAX); \ + *out0 = vcombine_s16(c0, c1); \ + *out1 = vcombine_s16(d0, d1); \ + } while (0) + +static AOM_FORCE_INLINE void butterfly_s16_s32_x8_0112_neon( + const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1, + int16x8_t *out0, int16x8_t *out1) { + butterfly_s16_s32_x8_neon(w0101, 0, 1, 1, 2, in0, in1, out0, out1); +} + +static AOM_FORCE_INLINE void butterfly_s16_s32_x8_0332_neon( + const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1, + int16x8_t *out0, int16x8_t *out1) { + butterfly_s16_s32_x8_neon(w0101, 0, 3, 3, 2, in0, in1, out0, out1); +} + +static AOM_FORCE_INLINE void butterfly_s16_s32_x8_1003_neon( + const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1, + int16x8_t *out0, int16x8_t *out1) { + butterfly_s16_s32_x8_neon(w0101, 1, 0, 0, 3, in0, in1, out0, out1); +} + +static AOM_FORCE_INLINE void butterfly_s16_s32_x8_1223_neon( + const int16x4_t w0101, const int16x8_t in0, const int16x8_t 
in1,
+    int16x8_t *out0, int16x8_t *out1) {
+  butterfly_s16_s32_x8_neon(w0101, 1, 2, 2, 3, in0, in1, out0, out1);
+}
+
+static AOM_FORCE_INLINE void flip_buf_4_neon(int16x4_t *in, int16x4_t *out,
+                                             int size) {
+  for (int i = 0; i < size; ++i) {
+    out[size - i - 1] = in[i];
+  }
+}
+
+static AOM_FORCE_INLINE void flip_buf_8_neon(int16x8_t *in, int16x8_t *out,
+                                             int size) {
+  for (int i = 0; i < size; ++i) {
+    out[size - i - 1] = in[i];
+  }
+}
+
+static AOM_FORCE_INLINE void store_buffer_interleaved_s32_x8(
+    int32_t *const out, const int32x4_t *const in1, const int32x4_t *const in2,
+    const int stride, const int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    vst1q_s32(out + stride * i, in1[i]);
+    vst1q_s32(out + stride * i + 4, in2[i]);
+  }
+}
+
+static AOM_FORCE_INLINE void load_buffer_s16_x4(const int16_t *in,
+                                                const int stride,
+                                                int16x4_t *const out,
+                                                const int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    out[i] = vld1_s16(in);
+    in += stride;
+  }
+}
+
+static AOM_FORCE_INLINE void load_buffer_s16_x8(const int16_t *in, int stride,
+                                                int16x8_t *out, int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    out[i] = vld1q_s16(in + i * stride);
+  }
+}
+
+static AOM_FORCE_INLINE void store_buffer_s16_x4(const int16x4_t *const in,
+                                                 int32_t *const out,
+                                                 const int stride,
+                                                 const int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    vst1q_s32(out + i * stride, vmovl_s16(in[i]));
+  }
+}
+
+static AOM_FORCE_INLINE void store_buffer_s16_x8(const int16x8_t *const in,
+                                                 int32_t *const out,
+                                                 const int stride,
+                                                 const int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    vst1q_s32(out + i * stride + 0, vmovl_s16(vget_low_s16(in[i])));
+    vst1q_s32(out + i * stride + 4, vmovl_s16(vget_high_s16(in[i])));
+  }
+}
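+
+// The sqrt(2) scaling used below is plain fixed-point arithmetic: NewSqrt2
+// (5793, defined in av1/common/av1_txfm.h) divided by 2^NewSqrt2Bits (4096)
+// is ~1.4142, so e.g. round_shift_sqrt2_s16_s16_4x1_neon() computes roughly
+// a * sqrt(2) with rounding, as required when normalizing rectangular
+// transforms.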
+
+// A note on naming:
+//   round_shift_[sqrt2]_s16_s32_4x1_neon(...)
+//               |       |   |   |  ^ 1 => a single vector
+//               |       |   |   |    n => an array of vectors
+//               |       |   |   ^ input/output vector element count
+//               |       |   ^ output type
+//               |       ^ input type
+//               ^ multiplicand and shift identifier
+
+static AOM_FORCE_INLINE int16x4_t
+round_shift_sqrt2_s16_s16_4x1_neon(int16x4_t a) {
+  return vqrshrn_n_s32(vmull_n_s16(a, NewSqrt2), NewSqrt2Bits);
+}
+
+static AOM_FORCE_INLINE int16x8_t
+round_shift_sqrt2_s16_s16_8x1_neon(int16x8_t a) {
+  return vcombine_s16(round_shift_sqrt2_s16_s16_4x1_neon(vget_low_s16(a)),
+                      round_shift_sqrt2_s16_s16_4x1_neon(vget_high_s16(a)));
+}
+
+static AOM_FORCE_INLINE int16x4_t
+round_shift_2sqrt2_s16_s16_4x1_neon(int16x4_t a) {
+  return vqrshrn_n_s32(vmull_n_s16(a, 2 * NewSqrt2), NewSqrt2Bits);
+}
+
+static AOM_FORCE_INLINE int16x8_t
+round_shift_2sqrt2_s16_s16_8x1_neon(int16x8_t a) {
+  return vcombine_s16(round_shift_2sqrt2_s16_s16_4x1_neon(vget_low_s16(a)),
+                      round_shift_2sqrt2_s16_s16_4x1_neon(vget_high_s16(a)));
+}
+
+static AOM_FORCE_INLINE int32x4_t
+round_shift_sqrt2_s16_s32_4x1_neon(int16x4_t a) {
+  return vrshrq_n_s32(vmull_n_s16(a, NewSqrt2), NewSqrt2Bits);
+}
+
+static AOM_FORCE_INLINE int32x4_t
+round_shift_sqrt2_s32_s32_4x1_neon(int32x4_t a) {
+  return vrshrq_n_s32(vmulq_n_s32(a, NewSqrt2), NewSqrt2Bits);
+}
+
+#define ROUND_SHIFT_SQRT_LOOP_HELPER(name, type0, type1, fn)                  \
+  static AOM_FORCE_INLINE void name(const type0 *in, type1 *out, int size) {  \
+    for (int i = 0; i < size; ++i) {                                          \
+      out[i] = fn(in[i]);                                                     \
+    }                                                                         \
+  }
+
+ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_sqrt2_s32_s32_4xn_neon, int32x4_t,
+                             int32x4_t, round_shift_sqrt2_s32_s32_4x1_neon)
+ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_sqrt2_s16_s16_4xn_neon, int16x4_t,
+                             int16x4_t, round_shift_sqrt2_s16_s16_4x1_neon)
+ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_sqrt2_s16_s16_8xn_neon, int16x8_t,
+                             int16x8_t, round_shift_sqrt2_s16_s16_8x1_neon)
+ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_2sqrt2_s16_s16_4xn_neon, int16x4_t,
+                             int16x4_t, round_shift_2sqrt2_s16_s16_4x1_neon)
+ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_2sqrt2_s16_s16_8xn_neon, int16x8_t,
+                             int16x8_t, round_shift_2sqrt2_s16_s16_8x1_neon)
+
+static AOM_FORCE_INLINE void store_rect_buffer_s16_x4(const int16x4_t *const in,
+                                                      int32_t *const out,
+                                                      const int stride,
+                                                      const int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    vst1q_s32(out + i * stride, round_shift_sqrt2_s16_s32_4x1_neon(in[i]));
+  }
+}
+
+static AOM_FORCE_INLINE void store_rect_buffer_s16_x8(const int16x8_t *const in,
+                                                      int32_t *const out,
+                                                      const int stride,
+                                                      const int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    vst1q_s32(out + i * stride + 0,
+              round_shift_sqrt2_s16_s32_4x1_neon(vget_low_s16(in[i])));
+    vst1q_s32(out + i * stride + 4,
+              round_shift_sqrt2_s16_s32_4x1_neon(vget_high_s16(in[i])));
+  }
+}
+
+static AOM_FORCE_INLINE void fadst4x4_neon(const int16x4_t *input,
+                                           int16x4_t *output, int cos_bit) {
+  int32x4_t u[6], v[6];
+  const int16x4_t sinpi = vld1_s16(sinpi_arr_q13(cos_bit));
+  const int16x4_t u01 = vqadd_s16(input[0], input[1]);
+
+  v[5] = vmull_lane_s16(input[2], sinpi, 2);
+  v[0] = vmull_lane_s16(input[1], sinpi, 1);
+  v[0] = vmlal_lane_s16(v[0], input[0], sinpi, 0);
+  v[1] = vmlal_lane_s16(v[5], input[3], sinpi, 3);
+  v[2] = vmull_lane_s16(u01, sinpi, 2);
+  v[3] = vmull_lane_s16(input[0], sinpi, 3);
+  v[3] = vmlsl_lane_s16(v[3], input[1], sinpi, 0);
+  v[4] = vmlsl_lane_s16(v[5], input[3], sinpi, 1);
+
+  u[0] = vaddq_s32(v[0], v[1]);
+  u[1] = vmlsl_lane_s16(v[2], input[3], sinpi, 2);
+  u[2] = vsubq_s32(v[3], v[4]);
+  u[3] = vsubq_s32(u[2], u[0]);
+
u[3] = vmlaq_n_s32(u[3], v[5], 3); + + output[0] = vrshrn_n_s32(u[0], TXFM_COS_BIT_MAX); + output[1] = vrshrn_n_s32(u[1], TXFM_COS_BIT_MAX); + output[2] = vrshrn_n_s32(u[2], TXFM_COS_BIT_MAX); + output[3] = vrshrn_n_s32(u[3], TXFM_COS_BIT_MAX); +} + +static AOM_FORCE_INLINE void fadst4x8_neon(const int16x4_t *input, + int16x4_t *output, int cos_bit) { + const int16_t *cospi = cospi_arr_q13(cos_bit); + + const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); + const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]); + const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]); + + const int16x4_t cospi32 = vget_low_s16(cospi32_16); + const int16x4_t cospi16 = vget_high_s16(cospi32_16); + const int16x4_t cospi4 = vget_low_s16(cospi4_12); + const int16x4_t cospi12 = vget_high_s16(cospi4_12); + const int16x4_t cospi20 = vget_low_s16(cospi20_28); + const int16x4_t cospi28 = vget_high_s16(cospi20_28); + + // stage 1-2 + int16x4_t x2[8]; + butterfly_s16_s32_x4_0332_neon(cospi32, input[4], input[3], &x2[2], &x2[3]); + butterfly_s16_s32_x4_0112_neon(cospi32, input[2], input[5], &x2[7], &x2[6]); + + // stage 3 + int16x4_t x3[8]; + x3[0] = vqadd_s16(input[0], x2[2]); + x3[1] = vqsub_s16(x2[3], input[7]); + x3[2] = vqsub_s16(input[0], x2[2]); + x3[3] = vqadd_s16(input[7], x2[3]); + x3[4] = vqsub_s16(x2[6], input[1]); + x3[5] = vqadd_s16(input[6], x2[7]); + x3[6] = vqadd_s16(input[1], x2[6]); + x3[7] = vqsub_s16(input[6], x2[7]); + + // stage 4 + int16x4_t x4[8]; + butterfly_s16_s32_x4_0112_neon(cospi16, x3[4], x3[5], &x4[4], &x4[5]); + butterfly_s16_s32_x4_0112_neon(cospi16, x3[7], x3[6], &x4[6], &x4[7]); + + // stage 5 + int16x4_t x5[8]; + x5[0] = vqadd_s16(x3[0], x4[4]); + x5[1] = vqadd_s16(x3[1], x4[5]); + x5[2] = vqadd_s16(x3[2], x4[6]); + x5[3] = vqsub_s16(x4[7], x3[3]); + x5[4] = vqsub_s16(x3[0], x4[4]); + x5[5] = vqsub_s16(x3[1], x4[5]); + x5[6] = vqsub_s16(x3[2], x4[6]); + x5[7] = vqadd_s16(x3[3], x4[7]); + + // stage 6-7 + butterfly_s16_s32_x4_0112_neon(cospi4, x5[0], x5[1], &output[7], &output[0]); + butterfly_s16_s32_x4_0112_neon(cospi20, x5[2], x5[3], &output[5], &output[2]); + butterfly_s16_s32_x4_1003_neon(cospi28, x5[4], x5[5], &output[3], &output[4]); + butterfly_s16_s32_x4_0112_neon(cospi12, x5[6], x5[7], &output[6], &output[1]); +} + +static AOM_FORCE_INLINE void fadst8x4_neon(const int16x8_t *input, + int16x8_t *output, int cos_bit) { + int32x4_t u_lo[4], u_hi[4]; + const int16x4_t sinpi = vld1_s16(sinpi_arr_q13(cos_bit)); + const int16x8_t u01 = vqaddq_s16(input[0], input[1]); + + u_lo[0] = vmull_lane_s16(vget_low_s16(input[1]), sinpi, 1); + u_hi[0] = vmull_lane_s16(vget_high_s16(input[1]), sinpi, 1); + + u_lo[0] = vmlal_lane_s16(u_lo[0], vget_low_s16(input[0]), sinpi, 0); + u_hi[0] = vmlal_lane_s16(u_hi[0], vget_high_s16(input[0]), sinpi, 0); + + u_lo[0] = vmlal_lane_s16(u_lo[0], vget_low_s16(input[3]), sinpi, 3); + u_hi[0] = vmlal_lane_s16(u_hi[0], vget_high_s16(input[3]), sinpi, 3); + + u_lo[0] = vmlal_lane_s16(u_lo[0], vget_low_s16(input[2]), sinpi, 2); + u_hi[0] = vmlal_lane_s16(u_hi[0], vget_high_s16(input[2]), sinpi, 2); + + u_lo[1] = vmull_lane_s16(vget_low_s16(u01), sinpi, 2); + u_hi[1] = vmull_lane_s16(vget_high_s16(u01), sinpi, 2); + + u_lo[2] = vmull_lane_s16(vget_low_s16(input[0]), sinpi, 3); + u_hi[2] = vmull_lane_s16(vget_high_s16(input[0]), sinpi, 3); + + u_lo[2] = vmlsl_lane_s16(u_lo[2], vget_low_s16(input[1]), sinpi, 0); + u_hi[2] = vmlsl_lane_s16(u_hi[2], vget_high_s16(input[1]), sinpi, 0); + + u_lo[2] = vmlal_lane_s16(u_lo[2], vget_low_s16(input[3]), sinpi, 1); + 
u_hi[2] = vmlal_lane_s16(u_hi[2], vget_high_s16(input[3]), sinpi, 1); + + u_lo[2] = vmlsl_lane_s16(u_lo[2], vget_low_s16(input[2]), sinpi, 2); + u_hi[2] = vmlsl_lane_s16(u_hi[2], vget_high_s16(input[2]), sinpi, 2); + + u_lo[1] = vmlsl_lane_s16(u_lo[1], vget_low_s16(input[3]), sinpi, 2); + u_hi[1] = vmlsl_lane_s16(u_hi[1], vget_high_s16(input[3]), sinpi, 2); + + u_lo[3] = vsubq_s32(u_lo[2], u_lo[0]); + u_hi[3] = vsubq_s32(u_hi[2], u_hi[0]); + + const int16x4_t sinpix3 = vmul_n_s16(sinpi, 3); + u_lo[3] = vmlal_lane_s16(u_lo[3], vget_low_s16(input[2]), sinpix3, 2); + u_hi[3] = vmlal_lane_s16(u_hi[3], vget_high_s16(input[2]), sinpix3, 2); + + output[0] = vcombine_s16(vrshrn_n_s32(u_lo[0], TXFM_COS_BIT_MAX), + vrshrn_n_s32(u_hi[0], TXFM_COS_BIT_MAX)); + output[1] = vcombine_s16(vrshrn_n_s32(u_lo[1], TXFM_COS_BIT_MAX), + vrshrn_n_s32(u_hi[1], TXFM_COS_BIT_MAX)); + output[2] = vcombine_s16(vrshrn_n_s32(u_lo[2], TXFM_COS_BIT_MAX), + vrshrn_n_s32(u_hi[2], TXFM_COS_BIT_MAX)); + output[3] = vcombine_s16(vrshrn_n_s32(u_lo[3], TXFM_COS_BIT_MAX), + vrshrn_n_s32(u_hi[3], TXFM_COS_BIT_MAX)); +} + +static AOM_FORCE_INLINE void fdct4x4_neon(const int16x4_t *input, + int16x4_t *output, int cos_bit) { + const int16_t *cospi = cospi_arr_q13(cos_bit); + const int16x4_t cospi16 = vld1_s16(&cospi[4 * 1]); + + int16x4_t in12a = vadd_s16(input[1], input[2]); + int16x4_t in12s = vsub_s16(input[1], input[2]); + int16x4_t in03a = vadd_s16(input[0], input[3]); + int16x4_t in03s = vsub_s16(input[0], input[3]); + + int32x4_t u0ad1 = vmull_n_s16(in12a, cospi[4 * 0]); + int32x4_t u0ad2 = vmull_n_s16(in03a, cospi[4 * 0]); + + int32x4_t u[4]; + u[0] = vaddq_s32(u0ad1, u0ad2); + u[1] = vsubq_s32(u0ad2, u0ad1); + u[2] = vmull_lane_s16(in12s, cospi16, 1); + u[2] = vmlal_lane_s16(u[2], in03s, cospi16, 0); + u[3] = vmull_lane_s16(in03s, cospi16, 1); + u[3] = vmlsl_lane_s16(u[3], in12s, cospi16, 0); + + output[0] = vrshrn_n_s32(u[0], TXFM_COS_BIT_MAX); + output[1] = vrshrn_n_s32(u[2], TXFM_COS_BIT_MAX); + output[2] = vrshrn_n_s32(u[1], TXFM_COS_BIT_MAX); + output[3] = vrshrn_n_s32(u[3], TXFM_COS_BIT_MAX); +} + +// Butterfly pre-processing: +// e.g. n=4: +// out[0] = in[0] + in[3] +// out[1] = in[1] + in[2] +// out[2] = in[1] - in[2] +// out[3] = in[0] - in[3] + +static AOM_FORCE_INLINE void butterfly_dct_pre_s16_x4(const int16x4_t *input, + int16x4_t *output, + int n) { + for (int i = 0; i < n / 2; ++i) { + output[i] = vqadd_s16(input[i], input[n - i - 1]); + } + for (int i = 0; i < n / 2; ++i) { + output[n / 2 + i] = vqsub_s16(input[n / 2 - i - 1], input[n / 2 + i]); + } +} + +static AOM_FORCE_INLINE void butterfly_dct_pre_s16_x8(const int16x8_t *input, + int16x8_t *output, + int n) { + for (int i = 0; i < n / 2; ++i) { + output[i] = vqaddq_s16(input[i], input[n - i - 1]); + } + for (int i = 0; i < n / 2; ++i) { + output[n / 2 + i] = vqsubq_s16(input[n / 2 - i - 1], input[n / 2 + i]); + } +} + +static AOM_FORCE_INLINE void butterfly_dct_pre_s32_x4(const int32x4_t *input, + int32x4_t *output, + int n) { + for (int i = 0; i < n / 2; ++i) { + output[i] = vqaddq_s32(input[i], input[n - i - 1]); + } + for (int i = 0; i < n / 2; ++i) { + output[n / 2 + i] = vqsubq_s32(input[n / 2 - i - 1], input[n / 2 + i]); + } +} + +// Butterfly post-processing: +// e.g. 
n=8: +// out[0] = in0[0] + in1[3]; +// out[1] = in0[1] + in1[2]; +// out[2] = in0[1] - in1[2]; +// out[3] = in0[0] - in1[3]; +// out[4] = in0[7] - in1[4]; +// out[5] = in0[6] - in1[5]; +// out[6] = in0[6] + in1[5]; +// out[7] = in0[7] + in1[4]; + +static AOM_FORCE_INLINE void butterfly_dct_post_s16_x4(const int16x4_t *in0, + const int16x4_t *in1, + int16x4_t *output, + int n) { + for (int i = 0; i < n / 4; ++i) { + output[i] = vqadd_s16(in0[i], in1[n / 2 - i - 1]); + } + for (int i = 0; i < n / 4; ++i) { + output[n / 4 + i] = vqsub_s16(in0[n / 4 - i - 1], in1[n / 4 + i]); + } + for (int i = 0; i < n / 4; ++i) { + output[n / 2 + i] = vqsub_s16(in0[n - i - 1], in1[n / 2 + i]); + } + for (int i = 0; i < n / 4; ++i) { + output[(3 * n) / 4 + i] = + vqadd_s16(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]); + } +} + +static AOM_FORCE_INLINE void butterfly_dct_post_s16_x8(const int16x8_t *in0, + const int16x8_t *in1, + int16x8_t *output, + int n) { + for (int i = 0; i < n / 4; ++i) { + output[i] = vqaddq_s16(in0[i], in1[n / 2 - i - 1]); + } + for (int i = 0; i < n / 4; ++i) { + output[n / 4 + i] = vqsubq_s16(in0[n / 4 - i - 1], in1[n / 4 + i]); + } + for (int i = 0; i < n / 4; ++i) { + output[n / 2 + i] = vqsubq_s16(in0[n - i - 1], in1[n / 2 + i]); + } + for (int i = 0; i < n / 4; ++i) { + output[(3 * n) / 4 + i] = + vqaddq_s16(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]); + } +} + +static AOM_FORCE_INLINE void butterfly_dct_post_s32_x4(const int32x4_t *in0, + const int32x4_t *in1, + int32x4_t *output, + int n) { + for (int i = 0; i < n / 4; ++i) { + output[i] = vqaddq_s32(in0[i], in1[n / 2 - i - 1]); + } + for (int i = 0; i < n / 4; ++i) { + output[n / 4 + i] = vqsubq_s32(in0[n / 4 - i - 1], in1[n / 4 + i]); + } + for (int i = 0; i < n / 4; ++i) { + output[n / 2 + i] = vqsubq_s32(in0[n - i - 1], in1[n / 2 + i]); + } + for (int i = 0; i < n / 4; ++i) { + output[(3 * n) / 4 + i] = + vqaddq_s32(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]); + } +} + +static AOM_FORCE_INLINE void fdct8x4_neon(const int16x8_t *input, + int16x8_t *output, int cos_bit) { + const int16_t *cospi = cospi_arr_q13(cos_bit); + + const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); + + const int16x4_t cospi32 = vget_low_s16(cospi32_16); + const int16x4_t cospi16 = vget_high_s16(cospi32_16); + + // stage 1 + int16x8_t x1[4]; + butterfly_dct_pre_s16_x8(input, x1, 4); + + // stage 2 + int16x8_t x2[4]; + butterfly_s16_s32_x8_0112_neon(cospi32, x1[0], x1[1], &x2[0], &x2[1]); + butterfly_s16_s32_x8_0112_neon(cospi16, x1[3], x1[2], &x2[2], &x2[3]); + + // stage 3 + output[0] = x2[0]; + output[1] = x2[2]; + output[2] = x2[1]; + output[3] = x2[3]; +} + +static AOM_FORCE_INLINE void fdct4x8_neon(const int16x4_t *input, + int16x4_t *output, int cos_bit) { + const int16_t *cospi = cospi_arr_q13(cos_bit); + + const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); + const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]); + + const int16x4_t cospi32 = vget_low_s16(cospi32_16); + const int16x4_t cospi16 = vget_high_s16(cospi32_16); + const int16x4_t cospi8 = vget_low_s16(cospi8_24); + const int16x4_t cospi24 = vget_high_s16(cospi8_24); + + // stage 1 + int16x4_t x1[8]; + butterfly_dct_pre_s16_x4(input, x1, 8); + + // stage 2 + int16x4_t x2[8]; + butterfly_dct_pre_s16_x4(x1, x2, 4); + butterfly_s16_s32_x4_0112_neon(cospi32, x1[6], x1[5], &x2[6], &x2[5]); + + // stage 3 + int16x4_t x3[8]; + butterfly_s16_s32_x4_0112_neon(cospi32, x2[0], x2[1], &output[0], &output[4]); + butterfly_s16_s32_x4_0112_neon(cospi16, x2[3], x2[2], &output[2], 
&output[6]); + butterfly_dct_post_s16_x4(x1 + 4, x2 + 4, x3 + 4, 4); + + // stage 4-5 + butterfly_s16_s32_x4_0112_neon(cospi8, x3[7], x3[4], &output[1], &output[7]); + butterfly_s16_s32_x4_1003_neon(cospi24, x3[6], x3[5], &output[5], &output[3]); +} + +static AOM_FORCE_INLINE void fdct8x8_neon(const int16x8_t *input, + int16x8_t *output, int cos_bit) { + const int16_t *cospi = cospi_arr_q13(cos_bit); + + const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); + const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]); + + const int16x4_t cospi32 = vget_low_s16(cospi32_16); + const int16x4_t cospi16 = vget_high_s16(cospi32_16); + const int16x4_t cospi8 = vget_low_s16(cospi8_24); + const int16x4_t cospi24 = vget_high_s16(cospi8_24); + + // stage 1 + int16x8_t x1[8]; + butterfly_dct_pre_s16_x8(input, x1, 8); + + // stage 2 + int16x8_t x2[8]; + butterfly_dct_pre_s16_x8(x1, x2, 4); + butterfly_s16_s32_x8_0112_neon(cospi32, x1[6], x1[5], &x2[6], &x2[5]); + + // stage 3 + int16x8_t x3[8]; + butterfly_s16_s32_x8_0112_neon(cospi32, x2[0], x2[1], &output[0], &output[4]); + butterfly_s16_s32_x8_0112_neon(cospi16, x2[3], x2[2], &output[2], &output[6]); + butterfly_dct_post_s16_x8(x1 + 4, x2 + 4, x3 + 4, 4); + + // stage 4-5 + butterfly_s16_s32_x8_0112_neon(cospi8, x3[7], x3[4], &output[1], &output[7]); + butterfly_s16_s32_x8_1003_neon(cospi24, x3[6], x3[5], &output[5], &output[3]); +} + +static AOM_FORCE_INLINE void fdct4x16_neon(const int16x4_t *input, + int16x4_t *output, int cos_bit) { + const int16_t *cospi = cospi_arr_q13(cos_bit); + + const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); + const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]); + const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]); + const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]); + + const int16x4_t cospi32 = vget_low_s16(cospi32_16); + const int16x4_t cospi16 = vget_high_s16(cospi32_16); + const int16x4_t cospi8 = vget_low_s16(cospi8_24); + const int16x4_t cospi24 = vget_high_s16(cospi8_24); + const int16x4_t cospi4 = vget_low_s16(cospi4_12); + const int16x4_t cospi12 = vget_high_s16(cospi4_12); + const int16x4_t cospi20 = vget_low_s16(cospi20_28); + const int16x4_t cospi28 = vget_high_s16(cospi20_28); + + // stage 1 + int16x4_t x1[16]; + butterfly_dct_pre_s16_x4(input, x1, 16); + + // stage 2 + int16x4_t x2[16]; + butterfly_dct_pre_s16_x4(x1, x2, 8); + butterfly_s16_s32_x4_0112_neon(cospi32, x1[13], x1[10], &x2[13], &x2[10]); + butterfly_s16_s32_x4_0112_neon(cospi32, x1[12], x1[11], &x2[12], &x2[11]); + + // stage 3 + int16x4_t x3[16]; + butterfly_dct_pre_s16_x4(x2, x3, 4); + butterfly_s16_s32_x4_0112_neon(cospi32, x2[6], x2[5], &x3[6], &x3[5]); + butterfly_dct_post_s16_x4(x1 + 8, x2 + 8, x3 + 8, 8); + + // stage 4 + int16x4_t x4[16]; + butterfly_s16_s32_x4_0112_neon(cospi32, x3[0], x3[1], &output[0], &output[8]); + butterfly_s16_s32_x4_0112_neon(cospi16, x3[3], x3[2], &output[4], + &output[12]); + butterfly_dct_post_s16_x4(x2 + 4, x3 + 4, x4 + 4, 4); + butterfly_s16_s32_x4_0112_neon(cospi16, x3[14], x3[9], &x4[14], &x4[9]); + butterfly_s16_s32_x4_1223_neon(cospi16, x3[13], x3[10], &x4[13], &x4[10]); + + // stage 5 + int16x4_t x5[16]; + butterfly_s16_s32_x4_0112_neon(cospi8, x4[7], x4[4], &output[2], &output[14]); + butterfly_s16_s32_x4_1003_neon(cospi24, x4[6], x4[5], &output[10], + &output[6]); + butterfly_dct_post_s16_x4(x3 + 8, x4 + 8, x5 + 8, 4); + butterfly_dct_post_s16_x4(x3 + 12, x4 + 12, x5 + 12, 4); + + // stage 6-7 + butterfly_s16_s32_x4_0112_neon(cospi4, x5[15], x5[8], &output[1], + &output[15]); + 
butterfly_s16_s32_x4_1003_neon(cospi28, x5[14], x5[9], &output[9], + &output[7]); + butterfly_s16_s32_x4_0112_neon(cospi20, x5[13], x5[10], &output[5], + &output[11]); + butterfly_s16_s32_x4_1003_neon(cospi12, x5[12], x5[11], &output[13], + &output[3]); +} + +static AOM_FORCE_INLINE void fdct8x16_neon(const int16x8_t *input, + int16x8_t *output, int cos_bit) { + const int16_t *cospi = cospi_arr_q13(cos_bit); + + const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); + const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]); + const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]); + const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]); + + const int16x4_t cospi32 = vget_low_s16(cospi32_16); + const int16x4_t cospi16 = vget_high_s16(cospi32_16); + const int16x4_t cospi8 = vget_low_s16(cospi8_24); + const int16x4_t cospi24 = vget_high_s16(cospi8_24); + const int16x4_t cospi4 = vget_low_s16(cospi4_12); + const int16x4_t cospi12 = vget_high_s16(cospi4_12); + const int16x4_t cospi20 = vget_low_s16(cospi20_28); + const int16x4_t cospi28 = vget_high_s16(cospi20_28); + + // stage 1 + int16x8_t x1[16]; + butterfly_dct_pre_s16_x8(input, x1, 16); + + // stage 2 + int16x8_t x2[16]; + butterfly_dct_pre_s16_x8(x1, x2, 8); + butterfly_s16_s32_x8_0112_neon(cospi32, x1[13], x1[10], &x2[13], &x2[10]); + butterfly_s16_s32_x8_0112_neon(cospi32, x1[12], x1[11], &x2[12], &x2[11]); + + // stage 3 + int16x8_t x3[16]; + butterfly_dct_pre_s16_x8(x2, x3, 4); + butterfly_s16_s32_x8_0112_neon(cospi32, x2[6], x2[5], &x3[6], &x3[5]); + butterfly_dct_post_s16_x8(x1 + 8, x2 + 8, x3 + 8, 8); + + // stage 4 + int16x8_t x4[16]; + butterfly_s16_s32_x8_0112_neon(cospi32, x3[0], x3[1], &output[0], &output[8]); + butterfly_s16_s32_x8_0112_neon(cospi16, x3[3], x3[2], &output[4], + &output[12]); + butterfly_dct_post_s16_x8(x2 + 4, x3 + 4, x4 + 4, 4); + butterfly_s16_s32_x8_0112_neon(cospi16, x3[14], x3[9], &x4[14], &x4[9]); + butterfly_s16_s32_x8_1223_neon(cospi16, x3[13], x3[10], &x4[13], &x4[10]); + + // stage 5 + int16x8_t x5[16]; + butterfly_s16_s32_x8_0112_neon(cospi8, x4[7], x4[4], &output[2], &output[14]); + butterfly_s16_s32_x8_1003_neon(cospi24, x4[6], x4[5], &output[10], + &output[6]); + butterfly_dct_post_s16_x8(x3 + 8, x4 + 8, x5 + 8, 4); + butterfly_dct_post_s16_x8(x3 + 12, x4 + 12, x5 + 12, 4); + + // stage 6-7 + butterfly_s16_s32_x8_0112_neon(cospi4, x5[15], x5[8], &output[1], + &output[15]); + butterfly_s16_s32_x8_1003_neon(cospi28, x5[14], x5[9], &output[9], + &output[7]); + butterfly_s16_s32_x8_0112_neon(cospi20, x5[13], x5[10], &output[5], + &output[11]); + butterfly_s16_s32_x8_1003_neon(cospi12, x5[12], x5[11], &output[13], + &output[3]); +} + +static AOM_FORCE_INLINE void fdct8x32_neon(const int16x8_t *input, + int16x8_t *output, int cos_bit) { + const int16_t *cospi = cospi_arr_q13(cos_bit); + + const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); + const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]); + const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]); + const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]); + const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]); + const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]); + const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]); + const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]); + + const int16x4_t cospi32 = vget_low_s16(cospi32_16); + const int16x4_t cospi16 = vget_high_s16(cospi32_16); + const int16x4_t cospi8 = vget_low_s16(cospi8_24); + const int16x4_t cospi24 = vget_high_s16(cospi8_24); + const int16x4_t cospi4 = vget_low_s16(cospi4_12); + const 
int16x4_t cospi12 = vget_high_s16(cospi4_12); + const int16x4_t cospi20 = vget_low_s16(cospi20_28); + const int16x4_t cospi28 = vget_high_s16(cospi20_28); + const int16x4_t cospi2 = vget_low_s16(cospi2_6); + const int16x4_t cospi6 = vget_high_s16(cospi2_6); + const int16x4_t cospi10 = vget_low_s16(cospi10_14); + const int16x4_t cospi14 = vget_high_s16(cospi10_14); + const int16x4_t cospi18 = vget_low_s16(cospi18_22); + const int16x4_t cospi22 = vget_high_s16(cospi18_22); + const int16x4_t cospi26 = vget_low_s16(cospi26_30); + const int16x4_t cospi30 = vget_high_s16(cospi26_30); + + // stage 1 + int16x8_t x1[32]; + butterfly_dct_pre_s16_x8(input, x1, 32); + + // stage 2 + int16x8_t x2[32]; + butterfly_dct_pre_s16_x8(x1, x2, 16); + butterfly_s16_s32_x8_0112_neon(cospi32, x1[27], x1[20], &x2[27], &x2[20]); + butterfly_s16_s32_x8_0112_neon(cospi32, x1[26], x1[21], &x2[26], &x2[21]); + butterfly_s16_s32_x8_0112_neon(cospi32, x1[25], x1[22], &x2[25], &x2[22]); + butterfly_s16_s32_x8_0112_neon(cospi32, x1[24], x1[23], &x2[24], &x2[23]); + + // stage 3 + int16x8_t x3[32]; + butterfly_dct_pre_s16_x8(x2, x3, 8); + butterfly_s16_s32_x8_0112_neon(cospi32, x2[13], x2[10], &x3[13], &x3[10]); + butterfly_s16_s32_x8_0112_neon(cospi32, x2[12], x2[11], &x3[12], &x3[11]); + butterfly_dct_post_s16_x8(x1 + 16, x2 + 16, x3 + 16, 16); + + // stage 4 + int16x8_t x4[32]; + butterfly_dct_pre_s16_x8(x3, x4, 4); + butterfly_s16_s32_x8_0112_neon(cospi32, x3[6], x3[5], &x4[6], &x4[5]); + butterfly_dct_post_s16_x8(x2 + 8, x3 + 8, x4 + 8, 8); + butterfly_s16_s32_x8_0112_neon(cospi16, x3[29], x3[18], &x4[29], &x4[18]); + butterfly_s16_s32_x8_0112_neon(cospi16, x3[28], x3[19], &x4[28], &x4[19]); + butterfly_s16_s32_x8_1223_neon(cospi16, x3[27], x3[20], &x4[27], &x4[20]); + butterfly_s16_s32_x8_1223_neon(cospi16, x3[26], x3[21], &x4[26], &x4[21]); + + // stage 5 + int16x8_t x5[32]; + butterfly_s16_s32_x8_0112_neon(cospi32, x4[0], x4[1], &output[0], + &output[16]); + butterfly_s16_s32_x8_0112_neon(cospi16, x4[3], x4[2], &output[8], + &output[24]); + butterfly_dct_post_s16_x8(x3 + 4, x4 + 4, x5 + 4, 4); + butterfly_s16_s32_x8_0112_neon(cospi16, x4[14], x4[9], &x5[14], &x5[9]); + butterfly_s16_s32_x8_1223_neon(cospi16, x4[13], x4[10], &x5[13], &x5[10]); + butterfly_dct_post_s16_x8(x3 + 16, x4 + 16, x5 + 16, 8); + butterfly_dct_post_s16_x8(x3 + 24, x4 + 24, x5 + 24, 8); + + // stage 6 + int16x8_t x6[32]; + butterfly_s16_s32_x8_0112_neon(cospi8, x5[7], x5[4], &output[4], &output[28]); + butterfly_s16_s32_x8_1003_neon(cospi24, x5[6], x5[5], &output[20], + &output[12]); + butterfly_dct_post_s16_x8(x4 + 8, x5 + 8, x6 + 8, 4); + butterfly_dct_post_s16_x8(x4 + 12, x5 + 12, x6 + 12, 4); + butterfly_s16_s32_x8_0112_neon(cospi8, x5[30], x5[17], &x6[30], &x6[17]); + butterfly_s16_s32_x8_1223_neon(cospi8, x5[29], x5[18], &x6[29], &x6[18]); + butterfly_s16_s32_x8_1003_neon(cospi24, x5[26], x5[21], &x6[26], &x6[21]); + butterfly_s16_s32_x8_0332_neon(cospi24, x5[25], x5[22], &x6[25], &x6[22]); + + // stage 7 + int16x8_t x7[32]; + butterfly_s16_s32_x8_0112_neon(cospi4, x6[15], x6[8], &output[2], + &output[30]); + butterfly_s16_s32_x8_1003_neon(cospi28, x6[14], x6[9], &output[18], + &output[14]); + butterfly_s16_s32_x8_0112_neon(cospi20, x6[13], x6[10], &output[10], + &output[22]); + butterfly_s16_s32_x8_1003_neon(cospi12, x6[12], x6[11], &output[26], + &output[6]); + butterfly_dct_post_s16_x8(x5 + 16, x6 + 16, x7 + 16, 4); + butterfly_dct_post_s16_x8(x5 + 20, x6 + 20, x7 + 20, 4); + butterfly_dct_post_s16_x8(x5 + 24, x6 + 24, x7 + 24, 4); + 
butterfly_dct_post_s16_x8(x5 + 28, x6 + 28, x7 + 28, 4); + + butterfly_s16_s32_x8_0112_neon(cospi2, x7[31], x7[16], &output[1], + &output[31]); + butterfly_s16_s32_x8_1003_neon(cospi30, x7[30], x7[17], &output[17], + &output[15]); + butterfly_s16_s32_x8_0112_neon(cospi18, x7[29], x7[18], &output[9], + &output[23]); + butterfly_s16_s32_x8_1003_neon(cospi14, x7[28], x7[19], &output[25], + &output[7]); + butterfly_s16_s32_x8_0112_neon(cospi10, x7[27], x7[20], &output[5], + &output[27]); + butterfly_s16_s32_x8_1003_neon(cospi22, x7[26], x7[21], &output[21], + &output[11]); + butterfly_s16_s32_x8_0112_neon(cospi26, x7[25], x7[22], &output[13], + &output[19]); + butterfly_s16_s32_x8_1003_neon(cospi6, x7[24], x7[23], &output[29], + &output[3]); +} + +static AOM_FORCE_INLINE void fdct8x64_neon(const int16x8_t *input, + int16x8_t *output, int cos_bit) { + const int16_t *cospi = cospi_arr_q13(cos_bit); + + const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); + const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]); + const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]); + const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]); + const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]); + const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]); + const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]); + const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]); + const int16x8_t cospi1_3 = vld1q_s16(&cospi[4 * 16]); + const int16x8_t cospi5_7 = vld1q_s16(&cospi[4 * 18]); + const int16x8_t cospi9_11 = vld1q_s16(&cospi[4 * 20]); + const int16x8_t cospi13_15 = vld1q_s16(&cospi[4 * 22]); + const int16x8_t cospi17_19 = vld1q_s16(&cospi[4 * 24]); + const int16x8_t cospi21_23 = vld1q_s16(&cospi[4 * 26]); + const int16x8_t cospi25_27 = vld1q_s16(&cospi[4 * 28]); + const int16x8_t cospi29_31 = vld1q_s16(&cospi[4 * 30]); + + const int16x4_t cospi32 = vget_low_s16(cospi32_16); + const int16x4_t cospi16 = vget_high_s16(cospi32_16); + const int16x4_t cospi8 = vget_low_s16(cospi8_24); + const int16x4_t cospi24 = vget_high_s16(cospi8_24); + const int16x4_t cospi4 = vget_low_s16(cospi4_12); + const int16x4_t cospi12 = vget_high_s16(cospi4_12); + const int16x4_t cospi20 = vget_low_s16(cospi20_28); + const int16x4_t cospi28 = vget_high_s16(cospi20_28); + const int16x4_t cospi2 = vget_low_s16(cospi2_6); + const int16x4_t cospi6 = vget_high_s16(cospi2_6); + const int16x4_t cospi10 = vget_low_s16(cospi10_14); + const int16x4_t cospi14 = vget_high_s16(cospi10_14); + const int16x4_t cospi18 = vget_low_s16(cospi18_22); + const int16x4_t cospi22 = vget_high_s16(cospi18_22); + const int16x4_t cospi26 = vget_low_s16(cospi26_30); + const int16x4_t cospi30 = vget_high_s16(cospi26_30); + const int16x4_t cospi1 = vget_low_s16(cospi1_3); + const int16x4_t cospi3 = vget_high_s16(cospi1_3); + const int16x4_t cospi5 = vget_low_s16(cospi5_7); + const int16x4_t cospi7 = vget_high_s16(cospi5_7); + const int16x4_t cospi9 = vget_low_s16(cospi9_11); + const int16x4_t cospi11 = vget_high_s16(cospi9_11); + const int16x4_t cospi13 = vget_low_s16(cospi13_15); + const int16x4_t cospi15 = vget_high_s16(cospi13_15); + const int16x4_t cospi17 = vget_low_s16(cospi17_19); + const int16x4_t cospi19 = vget_high_s16(cospi17_19); + const int16x4_t cospi21 = vget_low_s16(cospi21_23); + const int16x4_t cospi23 = vget_high_s16(cospi21_23); + const int16x4_t cospi25 = vget_low_s16(cospi25_27); + const int16x4_t cospi27 = vget_high_s16(cospi25_27); + const int16x4_t cospi29 = vget_low_s16(cospi29_31); + const int16x4_t cospi31 = vget_high_s16(cospi29_31); + + // stage 
1 + int16x8_t x1[64]; + butterfly_dct_pre_s16_x8(input, x1, 64); + + // stage 2 + int16x8_t x2[64]; + butterfly_dct_pre_s16_x8(x1, x2, 32); + butterfly_s16_s32_x8_0112_neon(cospi32, x1[55], x1[40], &x2[55], &x2[40]); + butterfly_s16_s32_x8_0112_neon(cospi32, x1[54], x1[41], &x2[54], &x2[41]); + butterfly_s16_s32_x8_0112_neon(cospi32, x1[53], x1[42], &x2[53], &x2[42]); + butterfly_s16_s32_x8_0112_neon(cospi32, x1[52], x1[43], &x2[52], &x2[43]); + butterfly_s16_s32_x8_0112_neon(cospi32, x1[51], x1[44], &x2[51], &x2[44]); + butterfly_s16_s32_x8_0112_neon(cospi32, x1[50], x1[45], &x2[50], &x2[45]); + butterfly_s16_s32_x8_0112_neon(cospi32, x1[49], x1[46], &x2[49], &x2[46]); + butterfly_s16_s32_x8_0112_neon(cospi32, x1[48], x1[47], &x2[48], &x2[47]); + + // stage 3 + int16x8_t x3[64]; + butterfly_dct_pre_s16_x8(x2, x3, 16); + x3[16] = x2[16]; + x3[17] = x2[17]; + x3[18] = x2[18]; + x3[19] = x2[19]; + butterfly_s16_s32_x8_0112_neon(cospi32, x2[27], x2[20], &x3[27], &x3[20]); + butterfly_s16_s32_x8_0112_neon(cospi32, x2[26], x2[21], &x3[26], &x3[21]); + butterfly_s16_s32_x8_0112_neon(cospi32, x2[25], x2[22], &x3[25], &x3[22]); + butterfly_s16_s32_x8_0112_neon(cospi32, x2[24], x2[23], &x3[24], &x3[23]); + x3[28] = x2[28]; + x3[29] = x2[29]; + x3[30] = x2[30]; + x3[31] = x2[31]; + butterfly_dct_post_s16_x8(x1 + 32, x2 + 32, x3 + 32, 32); + + // stage 4 + int16x8_t x4[64]; + butterfly_dct_pre_s16_x8(x3, x4, 8); + butterfly_s16_s32_x8_0112_neon(cospi32, x3[13], x3[10], &x4[13], &x4[10]); + butterfly_s16_s32_x8_0112_neon(cospi32, x3[12], x3[11], &x4[12], &x4[11]); + butterfly_dct_post_s16_x8(x3 + 16, x3 + 16, x4 + 16, 16); + butterfly_s16_s32_x8_0112_neon(cospi16, x3[59], x3[36], &x4[59], &x4[36]); + butterfly_s16_s32_x8_0112_neon(cospi16, x3[58], x3[37], &x4[58], &x4[37]); + butterfly_s16_s32_x8_0112_neon(cospi16, x3[57], x3[38], &x4[57], &x4[38]); + butterfly_s16_s32_x8_0112_neon(cospi16, x3[56], x3[39], &x4[56], &x4[39]); + butterfly_s16_s32_x8_1223_neon(cospi16, x3[55], x3[40], &x4[55], &x4[40]); + butterfly_s16_s32_x8_1223_neon(cospi16, x3[54], x3[41], &x4[54], &x4[41]); + butterfly_s16_s32_x8_1223_neon(cospi16, x3[53], x3[42], &x4[53], &x4[42]); + butterfly_s16_s32_x8_1223_neon(cospi16, x3[52], x3[43], &x4[52], &x4[43]); + + // stage 5 + int16x8_t x5[64]; + butterfly_dct_pre_s16_x8(x4, x5, 4); + butterfly_s16_s32_x8_0112_neon(cospi32, x4[6], x4[5], &x5[6], &x5[5]); + butterfly_dct_post_s16_x8(x3 + 8, x4 + 8, x5 + 8, 8); + butterfly_s16_s32_x8_0112_neon(cospi16, x4[29], x4[18], &x5[29], &x5[18]); + butterfly_s16_s32_x8_0112_neon(cospi16, x4[28], x4[19], &x5[28], &x5[19]); + butterfly_s16_s32_x8_1223_neon(cospi16, x4[27], x4[20], &x5[27], &x5[20]); + butterfly_s16_s32_x8_1223_neon(cospi16, x4[26], x4[21], &x5[26], &x5[21]); + butterfly_dct_post_s16_x8(x3 + 32, x4 + 32, x5 + 32, 16); + butterfly_dct_post_s16_x8(x3 + 48, x4 + 48, x5 + 48, 16); + + // stage 6 + int16x8_t x6[64]; + butterfly_s16_s32_x8_0112_neon(cospi32, x5[1], x5[0], &x6[0], &x6[1]); + butterfly_s16_s32_x8_0112_neon(cospi16, x5[3], x5[2], &x6[2], &x6[3]); + butterfly_dct_post_s16_x8(x4 + 4, x5 + 4, x6 + 4, 4); + butterfly_s16_s32_x8_0112_neon(cospi16, x5[14], x5[9], &x6[14], &x6[9]); + butterfly_s16_s32_x8_1223_neon(cospi16, x5[13], x5[10], &x6[13], &x6[10]); + butterfly_dct_post_s16_x8(x4 + 16, x5 + 16, x6 + 16, 8); + butterfly_dct_post_s16_x8(x4 + 24, x5 + 24, x6 + 24, 8); + butterfly_s16_s32_x8_0112_neon(cospi8, x5[61], x5[34], &x6[61], &x6[34]); + butterfly_s16_s32_x8_0112_neon(cospi8, x5[60], x5[35], &x6[60], &x6[35]); + 
butterfly_s16_s32_x8_1223_neon(cospi8, x5[59], x5[36], &x6[59], &x6[36]); + butterfly_s16_s32_x8_1223_neon(cospi8, x5[58], x5[37], &x6[58], &x6[37]); + butterfly_s16_s32_x8_1003_neon(cospi24, x5[53], x5[42], &x6[53], &x6[42]); + butterfly_s16_s32_x8_1003_neon(cospi24, x5[52], x5[43], &x6[52], &x6[43]); + butterfly_s16_s32_x8_0332_neon(cospi24, x5[51], x5[44], &x6[51], &x6[44]); + butterfly_s16_s32_x8_0332_neon(cospi24, x5[50], x5[45], &x6[50], &x6[45]); + + // stage 7 + int16x8_t x7[64]; + butterfly_s16_s32_x8_0112_neon(cospi8, x6[7], x6[4], &x7[4], &x7[7]); + butterfly_s16_s32_x8_1003_neon(cospi24, x6[6], x6[5], &x7[5], &x7[6]); + butterfly_dct_post_s16_x8(x5 + 8, x6 + 8, x7 + 8, 4); + butterfly_dct_post_s16_x8(x5 + 12, x6 + 12, x7 + 12, 4); + butterfly_s16_s32_x8_0112_neon(cospi8, x6[30], x6[17], &x7[30], &x7[17]); + butterfly_s16_s32_x8_1223_neon(cospi8, x6[29], x6[18], &x7[29], &x7[18]); + butterfly_s16_s32_x8_1003_neon(cospi24, x6[26], x6[21], &x7[26], &x7[21]); + butterfly_s16_s32_x8_0332_neon(cospi24, x6[25], x6[22], &x7[25], &x7[22]); + butterfly_dct_post_s16_x8(x5 + 32, x6 + 32, x7 + 32, 8); + butterfly_dct_post_s16_x8(x5 + 40, x6 + 40, x7 + 40, 8); + butterfly_dct_post_s16_x8(x5 + 48, x6 + 48, x7 + 48, 8); + butterfly_dct_post_s16_x8(x5 + 56, x6 + 56, x7 + 56, 8); + + // stage 8 + int16x8_t x8[64]; + butterfly_s16_s32_x8_0112_neon(cospi4, x7[15], x7[8], &x8[8], &x8[15]); + butterfly_s16_s32_x8_1003_neon(cospi28, x7[14], x7[9], &x8[9], &x8[14]); + butterfly_s16_s32_x8_0112_neon(cospi20, x7[13], x7[10], &x8[10], &x8[13]); + butterfly_s16_s32_x8_1003_neon(cospi12, x7[12], x7[11], &x8[11], &x8[12]); + butterfly_dct_post_s16_x8(x6 + 16, x7 + 16, x8 + 16, 4); + butterfly_dct_post_s16_x8(x6 + 20, x7 + 20, x8 + 20, 4); + butterfly_dct_post_s16_x8(x6 + 24, x7 + 24, x8 + 24, 4); + butterfly_dct_post_s16_x8(x6 + 28, x7 + 28, x8 + 28, 4); + butterfly_s16_s32_x8_0112_neon(cospi4, x7[62], x7[33], &x8[62], &x8[33]); + butterfly_s16_s32_x8_1223_neon(cospi4, x7[61], x7[34], &x8[61], &x8[34]); + butterfly_s16_s32_x8_1003_neon(cospi28, x7[58], x7[37], &x8[58], &x8[37]); + butterfly_s16_s32_x8_0332_neon(cospi28, x7[57], x7[38], &x8[57], &x8[38]); + butterfly_s16_s32_x8_0112_neon(cospi20, x7[54], x7[41], &x8[54], &x8[41]); + butterfly_s16_s32_x8_1223_neon(cospi20, x7[53], x7[42], &x8[53], &x8[42]); + butterfly_s16_s32_x8_1003_neon(cospi12, x7[50], x7[45], &x8[50], &x8[45]); + butterfly_s16_s32_x8_0332_neon(cospi12, x7[49], x7[46], &x8[49], &x8[46]); + + // stage 9 + int16x8_t x9[64]; + butterfly_s16_s32_x8_0112_neon(cospi2, x8[31], x8[16], &x9[16], &x9[31]); + butterfly_s16_s32_x8_1003_neon(cospi30, x8[30], x8[17], &x9[17], &x9[30]); + butterfly_s16_s32_x8_0112_neon(cospi18, x8[29], x8[18], &x9[18], &x9[29]); + butterfly_s16_s32_x8_1003_neon(cospi14, x8[28], x8[19], &x9[19], &x9[28]); + butterfly_s16_s32_x8_0112_neon(cospi10, x8[27], x8[20], &x9[20], &x9[27]); + butterfly_s16_s32_x8_1003_neon(cospi22, x8[26], x8[21], &x9[21], &x9[26]); + butterfly_s16_s32_x8_0112_neon(cospi26, x8[25], x8[22], &x9[22], &x9[25]); + butterfly_s16_s32_x8_1003_neon(cospi6, x8[24], x8[23], &x9[23], &x9[24]); + butterfly_dct_post_s16_x8(x7 + 32, x8 + 32, x9 + 32, 4); + butterfly_dct_post_s16_x8(x7 + 36, x8 + 36, x9 + 36, 4); + butterfly_dct_post_s16_x8(x7 + 40, x8 + 40, x9 + 40, 4); + butterfly_dct_post_s16_x8(x7 + 44, x8 + 44, x9 + 44, 4); + butterfly_dct_post_s16_x8(x7 + 48, x8 + 48, x9 + 48, 4); + butterfly_dct_post_s16_x8(x7 + 52, x8 + 52, x9 + 52, 4); + butterfly_dct_post_s16_x8(x7 + 56, x8 + 56, x9 + 56, 4); + 
butterfly_dct_post_s16_x8(x7 + 60, x8 + 60, x9 + 60, 4);
+
+  // stage 10
+  butterfly_s16_s32_x8_0112_neon(cospi1, x9[63], x9[32], &output[1],
+                                 &output[63]);
+  butterfly_s16_s32_x8_1003_neon(cospi31, x9[62], x9[33], &output[33],
+                                 &output[31]);
+  butterfly_s16_s32_x8_0112_neon(cospi17, x9[61], x9[34], &output[17],
+                                 &output[47]);
+  butterfly_s16_s32_x8_1003_neon(cospi15, x9[60], x9[35], &output[49],
+                                 &output[15]);
+  butterfly_s16_s32_x8_0112_neon(cospi9, x9[59], x9[36], &output[9],
+                                 &output[55]);
+  butterfly_s16_s32_x8_1003_neon(cospi23, x9[58], x9[37], &output[41],
+                                 &output[23]);
+  butterfly_s16_s32_x8_0112_neon(cospi25, x9[57], x9[38], &output[25],
+                                 &output[39]);
+  butterfly_s16_s32_x8_1003_neon(cospi7, x9[56], x9[39], &output[57],
+                                 &output[7]);
+  butterfly_s16_s32_x8_0112_neon(cospi5, x9[55], x9[40], &output[5],
+                                 &output[59]);
+  butterfly_s16_s32_x8_1003_neon(cospi27, x9[54], x9[41], &output[37],
+                                 &output[27]);
+  butterfly_s16_s32_x8_0112_neon(cospi21, x9[53], x9[42], &output[21],
+                                 &output[43]);
+  butterfly_s16_s32_x8_1003_neon(cospi11, x9[52], x9[43], &output[53],
+                                 &output[11]);
+  butterfly_s16_s32_x8_0112_neon(cospi13, x9[51], x9[44], &output[13],
+                                 &output[51]);
+  butterfly_s16_s32_x8_1003_neon(cospi19, x9[50], x9[45], &output[45],
+                                 &output[19]);
+  butterfly_s16_s32_x8_0112_neon(cospi29, x9[49], x9[46], &output[29],
+                                 &output[35]);
+  butterfly_s16_s32_x8_1003_neon(cospi3, x9[48], x9[47], &output[61],
+                                 &output[3]);
+
+  // stage 11
+  output[0] = x6[0];
+  output[2] = x9[16];
+  output[4] = x8[8];
+  output[6] = x9[24];
+  output[8] = x7[4];
+  output[10] = x9[20];
+  output[12] = x8[12];
+  output[14] = x9[28];
+  output[16] = x6[2];
+  output[18] = x9[18];
+  output[20] = x8[10];
+  output[22] = x9[26];
+  output[24] = x7[6];
+  output[26] = x9[22];
+  output[28] = x8[14];
+  output[30] = x9[30];
+  output[32] = x6[1];
+  output[34] = x9[17];
+  output[36] = x8[9];
+  output[38] = x9[25];
+  output[40] = x7[5];
+  output[42] = x9[21];
+  output[44] = x8[13];
+  output[46] = x9[29];
+  output[48] = x6[3];
+  output[50] = x9[19];
+  output[52] = x8[11];
+  output[54] = x9[27];
+  output[56] = x7[7];
+  output[58] = x9[23];
+  output[60] = x8[15];
+  output[62] = x9[31];
+}
+
+static AOM_FORCE_INLINE void fadst8x8_neon(const int16x8_t *input,
+                                           int16x8_t *output, int cos_bit) {
+  const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+
+  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+  const int16x4_t cospi28 = vget_high_s16(cospi20_28);
+
+  // stage 2
+  int16x8_t x2[8];
+  butterfly_s16_s32_x8_0332_neon(cospi32, input[4], input[3], &x2[2], &x2[3]);
+  butterfly_s16_s32_x8_0112_neon(cospi32, input[2], input[5], &x2[7], &x2[6]);
+
+  // stage 3
+  int16x8_t x3[8];
+  x3[0] = vqaddq_s16(input[0], x2[2]);
+  x3[1] = vqsubq_s16(x2[3], input[7]);
+  x3[2] = vqsubq_s16(input[0], x2[2]);
+  x3[3] = vqaddq_s16(input[7], x2[3]);
+  x3[4] = vqsubq_s16(x2[6], input[1]);
+  x3[5] = vqaddq_s16(input[6], x2[7]);
+  x3[6] = vqaddq_s16(input[1], x2[6]);
+  x3[7] = vqsubq_s16(input[6], x2[7]);
+
+  // stage 4
+  butterfly_s16_s32_x8_0112_neon(cospi16, x3[4], x3[5], &x3[4], &x3[5]);
+  butterfly_s16_s32_x8_0112_neon(cospi16, x3[7], x3[6], &x3[6], &x3[7]);
+
+  // stage 5
+  int16x8_t
x5[8]; + x5[0] = vqaddq_s16(x3[0], x3[4]); + x5[1] = vqaddq_s16(x3[1], x3[5]); + x5[2] = vqaddq_s16(x3[2], x3[6]); + x5[3] = vqsubq_s16(x3[7], x3[3]); + x5[4] = vqsubq_s16(x3[0], x3[4]); + x5[5] = vqsubq_s16(x3[1], x3[5]); + x5[6] = vqsubq_s16(x3[2], x3[6]); + x5[7] = vqaddq_s16(x3[3], x3[7]); + + // stage 6 + butterfly_s16_s32_x8_0112_neon(cospi4, x5[0], x5[1], &output[7], &output[0]); + butterfly_s16_s32_x8_0112_neon(cospi20, x5[2], x5[3], &output[5], &output[2]); + butterfly_s16_s32_x8_1003_neon(cospi28, x5[4], x5[5], &output[3], &output[4]); + butterfly_s16_s32_x8_0112_neon(cospi12, x5[6], x5[7], &output[6], &output[1]); +} + +static AOM_FORCE_INLINE void fadst4x16_neon(const int16x4_t *input, + int16x4_t *output, int cos_bit) { + const int16_t *cospi = cospi_arr_q13(cos_bit); + + const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); + const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]); + const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]); + const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]); + const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]); + const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]); + + const int16x4_t cospi32 = vget_low_s16(cospi32_16); + const int16x4_t cospi16 = vget_high_s16(cospi32_16); + const int16x4_t cospi8 = vget_low_s16(cospi8_24); + const int16x4_t cospi24 = vget_high_s16(cospi8_24); + const int16x4_t cospi2 = vget_low_s16(cospi2_6); + const int16x4_t cospi6 = vget_high_s16(cospi2_6); + const int16x4_t cospi10 = vget_low_s16(cospi10_14); + const int16x4_t cospi14 = vget_high_s16(cospi10_14); + const int16x4_t cospi18 = vget_low_s16(cospi18_22); + const int16x4_t cospi22 = vget_high_s16(cospi18_22); + const int16x4_t cospi26 = vget_low_s16(cospi26_30); + const int16x4_t cospi30 = vget_high_s16(cospi26_30); + + // stage 2 + int16x4_t x2[8]; + butterfly_s16_s32_x4_0332_neon(cospi32, input[8], input[7], &x2[0], &x2[1]); + butterfly_s16_s32_x4_0112_neon(cospi32, input[4], input[11], &x2[3], &x2[2]); + butterfly_s16_s32_x4_0112_neon(cospi32, input[6], input[9], &x2[5], &x2[4]); + butterfly_s16_s32_x4_0332_neon(cospi32, input[10], input[5], &x2[6], &x2[7]); + + // stage 3 + int16x4_t x3[16]; + x3[0] = vqadd_s16(input[0], x2[0]); + x3[1] = vqsub_s16(x2[1], input[15]); + x3[2] = vqsub_s16(input[0], x2[0]); + x3[3] = vqadd_s16(input[15], x2[1]); + x3[4] = vqsub_s16(x2[2], input[3]); + x3[5] = vqadd_s16(input[12], x2[3]); + x3[6] = vqadd_s16(input[3], x2[2]); + x3[7] = vqsub_s16(input[12], x2[3]); + x3[8] = vqsub_s16(x2[4], input[1]); + x3[9] = vqadd_s16(input[14], x2[5]); + x3[10] = vqadd_s16(input[1], x2[4]); + x3[11] = vqsub_s16(input[14], x2[5]); + x3[12] = vqadd_s16(input[2], x2[6]); + x3[13] = vqsub_s16(x2[7], input[13]); + x3[14] = vqsub_s16(input[2], x2[6]); + x3[15] = vqadd_s16(input[13], x2[7]); + + // stage 4 + butterfly_s16_s32_x4_0112_neon(cospi16, x3[4], x3[5], &x3[4], &x3[5]); + butterfly_s16_s32_x4_0112_neon(cospi16, x3[7], x3[6], &x3[6], &x3[7]); + butterfly_s16_s32_x4_0112_neon(cospi16, x3[12], x3[13], &x3[12], &x3[13]); + butterfly_s16_s32_x4_0332_neon(cospi16, x3[14], x3[15], &x3[15], &x3[14]); + + // stage 5 + int16x4_t x5[16]; + x5[0] = vqadd_s16(x3[0], x3[4]); + x5[1] = vqadd_s16(x3[1], x3[5]); + x5[2] = vqadd_s16(x3[2], x3[6]); + x5[3] = vqsub_s16(x3[7], x3[3]); + x5[4] = vqsub_s16(x3[0], x3[4]); + x5[5] = vqsub_s16(x3[1], x3[5]); + x5[6] = vqsub_s16(x3[2], x3[6]); + x5[7] = vqadd_s16(x3[3], x3[7]); + x5[8] = vqadd_s16(x3[8], x3[12]); + x5[9] = vqadd_s16(x3[9], x3[13]); + x5[10] = vqsub_s16(x3[14], x3[10]); + x5[11] = 
vqadd_s16(x3[11], x3[15]); + x5[12] = vqsub_s16(x3[8], x3[12]); + x5[13] = vqsub_s16(x3[9], x3[13]); + x5[14] = vqadd_s16(x3[10], x3[14]); + x5[15] = vqsub_s16(x3[11], x3[15]); + + // stage 6 + butterfly_s16_s32_x4_0112_neon(cospi8, x5[8], x5[9], &x5[8], &x5[9]); + butterfly_s16_s32_x4_1003_neon(cospi24, x5[10], x5[11], &x5[10], &x5[11]); + butterfly_s16_s32_x4_1003_neon(cospi8, x5[13], x5[12], &x5[13], &x5[12]); + butterfly_s16_s32_x4_1003_neon(cospi24, x5[15], x5[14], &x5[14], &x5[15]); + + // stage 7 + int16x4_t x7[16]; + x7[0] = vqadd_s16(x5[0], x5[8]); + x7[1] = vqadd_s16(x5[1], x5[9]); + x7[2] = vqadd_s16(x5[2], x5[10]); + x7[3] = vqadd_s16(x5[3], x5[11]); + x7[4] = vqadd_s16(x5[4], x5[12]); + x7[5] = vqadd_s16(x5[5], x5[13]); + x7[6] = vqadd_s16(x5[6], x5[14]); + x7[7] = vqsub_s16(x5[15], x5[7]); + x7[8] = vqsub_s16(x5[0], x5[8]); + x7[9] = vqsub_s16(x5[1], x5[9]); + x7[10] = vqsub_s16(x5[2], x5[10]); + x7[11] = vqsub_s16(x5[3], x5[11]); + x7[12] = vqsub_s16(x5[4], x5[12]); + x7[13] = vqsub_s16(x5[5], x5[13]); + x7[14] = vqsub_s16(x5[6], x5[14]); + x7[15] = vqadd_s16(x5[7], x5[15]); + + // stage 8 + butterfly_s16_s32_x4_0112_neon(cospi2, x7[0], x7[1], &output[15], &output[0]); + butterfly_s16_s32_x4_0112_neon(cospi10, x7[2], x7[3], &output[13], + &output[2]); + butterfly_s16_s32_x4_0112_neon(cospi18, x7[4], x7[5], &output[11], + &output[4]); + butterfly_s16_s32_x4_0112_neon(cospi26, x7[6], x7[7], &output[9], &output[6]); + butterfly_s16_s32_x4_1003_neon(cospi30, x7[8], x7[9], &output[7], &output[8]); + butterfly_s16_s32_x4_1003_neon(cospi22, x7[10], x7[11], &output[5], + &output[10]); + butterfly_s16_s32_x4_1003_neon(cospi14, x7[12], x7[13], &output[3], + &output[12]); + butterfly_s16_s32_x4_0112_neon(cospi6, x7[14], x7[15], &output[14], + &output[1]); +} + +static AOM_FORCE_INLINE void fadst8x16_neon(const int16x8_t *input, + int16x8_t *output, int cos_bit) { + const int16_t *cospi = cospi_arr_q13(cos_bit); + + const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); + const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]); + const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]); + const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]); + const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]); + const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]); + + const int16x4_t cospi32 = vget_low_s16(cospi32_16); + const int16x4_t cospi16 = vget_high_s16(cospi32_16); + const int16x4_t cospi8 = vget_low_s16(cospi8_24); + const int16x4_t cospi24 = vget_high_s16(cospi8_24); + const int16x4_t cospi2 = vget_low_s16(cospi2_6); + const int16x4_t cospi6 = vget_high_s16(cospi2_6); + const int16x4_t cospi10 = vget_low_s16(cospi10_14); + const int16x4_t cospi14 = vget_high_s16(cospi10_14); + const int16x4_t cospi18 = vget_low_s16(cospi18_22); + const int16x4_t cospi22 = vget_high_s16(cospi18_22); + const int16x4_t cospi26 = vget_low_s16(cospi26_30); + const int16x4_t cospi30 = vget_high_s16(cospi26_30); + + // stage 2 + int16x8_t x2[8]; + butterfly_s16_s32_x8_0332_neon(cospi32, input[8], input[7], &x2[0], &x2[1]); + butterfly_s16_s32_x8_0112_neon(cospi32, input[4], input[11], &x2[3], &x2[2]); + butterfly_s16_s32_x8_0112_neon(cospi32, input[6], input[9], &x2[5], &x2[4]); + butterfly_s16_s32_x8_0332_neon(cospi32, input[10], input[5], &x2[6], &x2[7]); + + // stage 3 + int16x8_t x3[16]; + x3[0] = vqaddq_s16(input[0], x2[0]); + x3[1] = vqsubq_s16(x2[1], input[15]); + x3[2] = vqsubq_s16(input[0], x2[0]); + x3[3] = vqaddq_s16(input[15], x2[1]); + x3[4] = vqsubq_s16(x2[2], input[3]); + x3[5] = 
vqaddq_s16(input[12], x2[3]); + x3[6] = vqaddq_s16(input[3], x2[2]); + x3[7] = vqsubq_s16(input[12], x2[3]); + x3[8] = vqsubq_s16(x2[4], input[1]); + x3[9] = vqaddq_s16(input[14], x2[5]); + x3[10] = vqaddq_s16(input[1], x2[4]); + x3[11] = vqsubq_s16(input[14], x2[5]); + x3[12] = vqaddq_s16(input[2], x2[6]); + x3[13] = vqsubq_s16(x2[7], input[13]); + x3[14] = vqsubq_s16(input[2], x2[6]); + x3[15] = vqaddq_s16(input[13], x2[7]); + + // stage 4 + butterfly_s16_s32_x8_0112_neon(cospi16, x3[4], x3[5], &x3[4], &x3[5]); + butterfly_s16_s32_x8_0112_neon(cospi16, x3[7], x3[6], &x3[6], &x3[7]); + butterfly_s16_s32_x8_0112_neon(cospi16, x3[12], x3[13], &x3[12], &x3[13]); + butterfly_s16_s32_x8_0332_neon(cospi16, x3[14], x3[15], &x3[15], &x3[14]); + + // stage 5 + int16x8_t x5[16]; + x5[0] = vqaddq_s16(x3[0], x3[4]); + x5[1] = vqaddq_s16(x3[1], x3[5]); + x5[2] = vqaddq_s16(x3[2], x3[6]); + x5[3] = vqsubq_s16(x3[7], x3[3]); + x5[4] = vqsubq_s16(x3[0], x3[4]); + x5[5] = vqsubq_s16(x3[1], x3[5]); + x5[6] = vqsubq_s16(x3[2], x3[6]); + x5[7] = vqaddq_s16(x3[3], x3[7]); + x5[8] = vqaddq_s16(x3[8], x3[12]); + x5[9] = vqaddq_s16(x3[9], x3[13]); + x5[10] = vqsubq_s16(x3[14], x3[10]); + x5[11] = vqaddq_s16(x3[11], x3[15]); + x5[12] = vqsubq_s16(x3[8], x3[12]); + x5[13] = vqsubq_s16(x3[9], x3[13]); + x5[14] = vqaddq_s16(x3[10], x3[14]); + x5[15] = vqsubq_s16(x3[11], x3[15]); + + // stage 6 + butterfly_s16_s32_x8_0112_neon(cospi8, x5[8], x5[9], &x5[8], &x5[9]); + butterfly_s16_s32_x8_1003_neon(cospi24, x5[10], x5[11], &x5[10], &x5[11]); + butterfly_s16_s32_x8_1003_neon(cospi8, x5[13], x5[12], &x5[13], &x5[12]); + butterfly_s16_s32_x8_1003_neon(cospi24, x5[15], x5[14], &x5[14], &x5[15]); + + // stage 7 + int16x8_t x7[16]; + x7[0] = vqaddq_s16(x5[0], x5[8]); + x7[1] = vqaddq_s16(x5[1], x5[9]); + x7[2] = vqaddq_s16(x5[2], x5[10]); + x7[3] = vqaddq_s16(x5[3], x5[11]); + x7[4] = vqaddq_s16(x5[4], x5[12]); + x7[5] = vqaddq_s16(x5[5], x5[13]); + x7[6] = vqaddq_s16(x5[6], x5[14]); + x7[7] = vqsubq_s16(x5[15], x5[7]); + x7[8] = vqsubq_s16(x5[0], x5[8]); + x7[9] = vqsubq_s16(x5[1], x5[9]); + x7[10] = vqsubq_s16(x5[2], x5[10]); + x7[11] = vqsubq_s16(x5[3], x5[11]); + x7[12] = vqsubq_s16(x5[4], x5[12]); + x7[13] = vqsubq_s16(x5[5], x5[13]); + x7[14] = vqsubq_s16(x5[6], x5[14]); + x7[15] = vqaddq_s16(x5[7], x5[15]); + + // stage 8 + butterfly_s16_s32_x8_0112_neon(cospi2, x7[0], x7[1], &output[15], &output[0]); + butterfly_s16_s32_x8_0112_neon(cospi10, x7[2], x7[3], &output[13], + &output[2]); + butterfly_s16_s32_x8_0112_neon(cospi18, x7[4], x7[5], &output[11], + &output[4]); + butterfly_s16_s32_x8_0112_neon(cospi26, x7[6], x7[7], &output[9], &output[6]); + butterfly_s16_s32_x8_1003_neon(cospi30, x7[8], x7[9], &output[7], &output[8]); + butterfly_s16_s32_x8_1003_neon(cospi22, x7[10], x7[11], &output[5], + &output[10]); + butterfly_s16_s32_x8_1003_neon(cospi14, x7[12], x7[13], &output[3], + &output[12]); + butterfly_s16_s32_x8_0112_neon(cospi6, x7[14], x7[15], &output[14], + &output[1]); +} + +static AOM_FORCE_INLINE void fidentity4x4_neon(const int16x4_t *const input, + int16x4_t *const output, + const int cos_bit) { + (void)cos_bit; + round_shift_sqrt2_s16_s16_4xn_neon(input, output, 4); +} + +static AOM_FORCE_INLINE void fidentity8x4_neon(const int16x8_t *const input, + int16x8_t *const output, + const int cos_bit) { + (void)cos_bit; + round_shift_sqrt2_s16_s16_8xn_neon(input, output, 4); +} + +static AOM_FORCE_INLINE void fidentity4x8_neon(const int16x4_t *input, + int16x4_t *output, int cos_bit) { + (void)cos_bit; + 
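+  // The identity transforms only apply AV1's IDTX scale factor, which grows
+  // with the 1D length N: sqrt(2) for N=4, 2 for N=8, 2*sqrt(2) for N=16 and
+  // 4 for N=32. shift_left_1 below is the exact x2 case for N=8.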
shift_left_1_s16_x4(input, output, 8); +} + +static AOM_FORCE_INLINE void fidentity8x8_neon(const int16x8_t *input, + int16x8_t *output, int cos_bit) { + (void)cos_bit; + shift_left_1_s16_x8(input, output, 8); +} + +static AOM_FORCE_INLINE void fidentity4x16_neon(const int16x4_t *input, + int16x4_t *output, + int cos_bit) { + (void)cos_bit; + round_shift_2sqrt2_s16_s16_4xn_neon(input, output, 16); +} + +static AOM_FORCE_INLINE void fidentity8x16_neon(const int16x8_t *input, + int16x8_t *output, + int cos_bit) { + (void)cos_bit; + round_shift_2sqrt2_s16_s16_8xn_neon(input, output, 16); +} + +static AOM_FORCE_INLINE void fidentity8x32_neon(const int16x8_t *input, + int16x8_t *output, + int cos_bit) { + (void)cos_bit; + shift_left_2_s16_x8(input, output, 32); +} + +#define TRANSFORM_COL(name, tw, n) \ + static void name##_col_neon(const int16_t *input, int16x##tw##_t *output, \ + int stride, int cos_bit) { \ + int16x##tw##_t buf0[n]; \ + load_buffer_s16_x##tw(input, stride, buf0, n); \ + shift_left_2_s16_x##tw(buf0, buf0, n); \ + name##_neon(buf0, output, cos_bit); \ + } + +TRANSFORM_COL(fadst4x4, 4, 4) +TRANSFORM_COL(fadst4x8, 4, 8) +TRANSFORM_COL(fadst4x16, 4, 16) +TRANSFORM_COL(fadst8x4, 8, 4) +TRANSFORM_COL(fadst8x8, 8, 8) +TRANSFORM_COL(fadst8x16, 8, 16) +TRANSFORM_COL(fdct4x4, 4, 4) +TRANSFORM_COL(fdct4x8, 4, 8) +TRANSFORM_COL(fdct4x16, 4, 16) +TRANSFORM_COL(fdct8x4, 8, 4) +TRANSFORM_COL(fdct8x8, 8, 8) +TRANSFORM_COL(fdct8x16, 8, 16) +TRANSFORM_COL(fdct8x32, 8, 32) +TRANSFORM_COL(fidentity4x4, 4, 4) +TRANSFORM_COL(fidentity4x8, 4, 8) +TRANSFORM_COL(fidentity4x16, 4, 16) +TRANSFORM_COL(fidentity8x4, 8, 4) +TRANSFORM_COL(fidentity8x8, 8, 8) +TRANSFORM_COL(fidentity8x16, 8, 16) +TRANSFORM_COL(fidentity8x32, 8, 32) + +#define TRANSFORM_ROW(name, tw, n) \ + static void name##_row_neon(const int16x##tw##_t *input, int32_t *output, \ + int stride, int cos_bit) { \ + int16x##tw##_t buf0[n]; \ + name##_neon(input, buf0, cos_bit); \ + store_buffer_s16_x##tw(buf0, output, stride, n); \ + } + +#define TRANSFORM_ROW_RECT(name, tw, n) \ + static void name##_row_rect_neon(const int16x##tw##_t *input, \ + int32_t *output, int stride, int cos_bit) { \ + int16x##tw##_t buf0[n]; \ + name##_neon(input, buf0, cos_bit); \ + store_rect_buffer_s16_x##tw(buf0, output, stride, n); \ + } + +TRANSFORM_ROW(fadst4x4, 4, 4) +TRANSFORM_ROW(fadst4x16, 4, 16) +TRANSFORM_ROW(fadst8x4, 8, 4) +TRANSFORM_ROW(fadst8x8, 8, 8) +TRANSFORM_ROW(fadst8x16, 8, 16) +TRANSFORM_ROW(fdct4x4, 4, 4) +TRANSFORM_ROW(fdct4x16, 4, 16) +TRANSFORM_ROW(fdct8x4, 8, 4) +TRANSFORM_ROW(fdct8x8, 8, 8) +TRANSFORM_ROW(fdct8x16, 8, 16) +TRANSFORM_ROW(fdct8x32, 8, 32) +TRANSFORM_ROW(fidentity4x4, 4, 4) +TRANSFORM_ROW(fidentity4x16, 4, 16) +TRANSFORM_ROW(fidentity8x4, 8, 4) +TRANSFORM_ROW(fidentity8x8, 8, 8) +TRANSFORM_ROW(fidentity8x16, 8, 16) +TRANSFORM_ROW(fidentity8x32, 8, 32) + +TRANSFORM_ROW_RECT(fadst4x8, 4, 8) +TRANSFORM_ROW_RECT(fadst8x4, 8, 4) +TRANSFORM_ROW_RECT(fadst8x8, 8, 8) +TRANSFORM_ROW_RECT(fadst8x16, 8, 16) +TRANSFORM_ROW_RECT(fdct4x8, 4, 8) +TRANSFORM_ROW_RECT(fdct8x4, 8, 4) +TRANSFORM_ROW_RECT(fdct8x8, 8, 8) +TRANSFORM_ROW_RECT(fdct8x16, 8, 16) +TRANSFORM_ROW_RECT(fdct8x32, 8, 32) +TRANSFORM_ROW_RECT(fidentity4x8, 4, 8) +TRANSFORM_ROW_RECT(fidentity8x4, 8, 4) +TRANSFORM_ROW_RECT(fidentity8x8, 8, 8) +TRANSFORM_ROW_RECT(fidentity8x16, 8, 16) +TRANSFORM_ROW_RECT(fidentity8x32, 8, 32) + +typedef void (*transform_1d_lbd_4_neon)(const int16x4_t *input, + int16x4_t *output, int cos_bit); +typedef void (*transform_1d_lbd_8_neon)(const 
int16x8_t *input, + int16x8_t *output, int cos_bit); + +typedef void (*col_transform_1d_lbd_4_neon)(const int16_t *input, + int16x4_t *output, int stride, + int cos_bit); +typedef void (*col_transform_1d_lbd_8_neon)(const int16_t *input, + int16x8_t *output, int stride, + int cos_bit); + +typedef void (*row_transform_1d_lbd_4_neon)(const int16x4_t *input, + int32_t *output, int stride, + int cos_bit); +typedef void (*row_transform_1d_lbd_8_neon)(const int16x8_t *input, + int32_t *output, int stride, + int cos_bit); + +static const col_transform_1d_lbd_4_neon col_txfm4x8_arr[TX_TYPES] = { + fdct4x8_col_neon, // DCT_DCT + fadst4x8_col_neon, // ADST_DCT + fdct4x8_col_neon, // DCT_ADST + fadst4x8_col_neon, // ADST_ADST + fadst4x8_col_neon, // FLIPADST_DCT + fdct4x8_col_neon, // DCT_FLIPADST + fadst4x8_col_neon, // FLIPADST_FLIPADST + fadst4x8_col_neon, // ADST_FLIPADST + fadst4x8_col_neon, // FLIPADST_ADST + fidentity4x8_col_neon, // IDTX + fdct4x8_col_neon, // V_DCT + fidentity4x8_col_neon, // H_DCT + fadst4x8_col_neon, // V_ADST + fidentity4x8_col_neon, // H_ADST + fadst4x8_col_neon, // V_FLIPADST + fidentity4x8_col_neon // H_FLIPADST +}; + +static const row_transform_1d_lbd_8_neon row_txfm8x4_arr[TX_TYPES] = { + fdct8x4_row_neon, // DCT_DCT + fdct8x4_row_neon, // ADST_DCT + fadst8x4_row_neon, // DCT_ADST + fadst8x4_row_neon, // ADST_ADST + fdct8x4_row_neon, // FLIPADST_DCT + fadst8x4_row_neon, // DCT_FLIPADST + fadst8x4_row_neon, // FLIPADST_FLIPADST + fadst8x4_row_neon, // ADST_FLIPADST + fadst8x4_row_neon, // FLIPADST_ADST + fidentity8x4_row_neon, // IDTX + fidentity8x4_row_neon, // V_DCT + fdct8x4_row_neon, // H_DCT + fidentity8x4_row_neon, // V_ADST + fadst8x4_row_neon, // H_ADST + fidentity8x4_row_neon, // V_FLIPADST + fadst8x4_row_neon // H_FLIPADST +}; + +static const row_transform_1d_lbd_8_neon row_rect_txfm8x4_arr[TX_TYPES] = { + fdct8x4_row_rect_neon, // DCT_DCT + fdct8x4_row_rect_neon, // ADST_DCT + fadst8x4_row_rect_neon, // DCT_ADST + fadst8x4_row_rect_neon, // ADST_ADST + fdct8x4_row_rect_neon, // FLIPADST_DCT + fadst8x4_row_rect_neon, // DCT_FLIPADST + fadst8x4_row_rect_neon, // FLIPADST_FLIPADST + fadst8x4_row_rect_neon, // ADST_FLIPADST + fadst8x4_row_rect_neon, // FLIPADST_ADST + fidentity8x4_row_rect_neon, // IDTX + fidentity8x4_row_rect_neon, // V_DCT + fdct8x4_row_rect_neon, // H_DCT + fidentity8x4_row_rect_neon, // V_ADST + fadst8x4_row_rect_neon, // H_ADST + fidentity8x4_row_rect_neon, // V_FLIPADST + fadst8x4_row_rect_neon // H_FLIPADST +}; + +static const col_transform_1d_lbd_8_neon col_txfm8x4_arr[TX_TYPES] = { + fdct8x4_col_neon, // DCT_DCT + fadst8x4_col_neon, // ADST_DCT + fdct8x4_col_neon, // DCT_ADST + fadst8x4_col_neon, // ADST_ADST + fadst8x4_col_neon, // FLIPADST_DCT + fdct8x4_col_neon, // DCT_FLIPADST + fadst8x4_col_neon, // FLIPADST_FLIPADST + fadst8x4_col_neon, // ADST_FLIPADST + fadst8x4_col_neon, // FLIPADST_ADST + fidentity8x4_col_neon, // IDTX + fdct8x4_col_neon, // V_DCT + fidentity8x4_col_neon, // H_DCT + fadst8x4_col_neon, // V_ADST + fidentity8x4_col_neon, // H_ADST + fadst8x4_col_neon, // V_FLIPADST + fidentity8x4_col_neon // H_FLIPADST +}; + +static const row_transform_1d_lbd_4_neon row_rect_txfm4x8_arr[TX_TYPES] = { + fdct4x8_row_rect_neon, // DCT_DCT + fdct4x8_row_rect_neon, // ADST_DCT + fadst4x8_row_rect_neon, // DCT_ADST + fadst4x8_row_rect_neon, // ADST_ADST + fdct4x8_row_rect_neon, // FLIPADST_DCT + fadst4x8_row_rect_neon, // DCT_FLIPADST + fadst4x8_row_rect_neon, // FLIPADST_FLIPADST + fadst4x8_row_rect_neon, // ADST_FLIPADST + 
fadst4x8_row_rect_neon, // FLIPADST_ADST + fidentity4x8_row_rect_neon, // IDTX + fidentity4x8_row_rect_neon, // V_DCT + fdct4x8_row_rect_neon, // H_DCT + fidentity4x8_row_rect_neon, // V_ADST + fadst4x8_row_rect_neon, // H_ADST + fidentity4x8_row_rect_neon, // V_FLIPADST + fadst4x8_row_rect_neon // H_FLIPADST +}; + +static const col_transform_1d_lbd_8_neon col_txfm8x8_arr[TX_TYPES] = { + fdct8x8_col_neon, // DCT_DCT + fadst8x8_col_neon, // ADST_DCT + fdct8x8_col_neon, // DCT_ADST + fadst8x8_col_neon, // ADST_ADST + fadst8x8_col_neon, // FLIPADST_DCT + fdct8x8_col_neon, // DCT_FLIPADST + fadst8x8_col_neon, // FLIPADST_FLIPADST + fadst8x8_col_neon, // ADST_FLIPADST + fadst8x8_col_neon, // FLIPADST_ADST + fidentity8x8_col_neon, // IDTX + fdct8x8_col_neon, // V_DCT + fidentity8x8_col_neon, // H_DCT + fadst8x8_col_neon, // V_ADST + fidentity8x8_col_neon, // H_ADST + fadst8x8_col_neon, // V_FLIPADST + fidentity8x8_col_neon, // H_FLIPADST +}; + +static const row_transform_1d_lbd_8_neon row_txfm8x8_arr[TX_TYPES] = { + fdct8x8_row_neon, // DCT_DCT + fdct8x8_row_neon, // ADST_DCT + fadst8x8_row_neon, // DCT_ADST + fadst8x8_row_neon, // ADST_ADST + fdct8x8_row_neon, // FLIPADST_DCT + fadst8x8_row_neon, // DCT_FLIPADST + fadst8x8_row_neon, // FLIPADST_FLIPADST + fadst8x8_row_neon, // ADST_FLIPADST + fadst8x8_row_neon, // FLIPADST_ADST + fidentity8x8_row_neon, // IDTX + fidentity8x8_row_neon, // V_DCT + fdct8x8_row_neon, // H_DCT + fidentity8x8_row_neon, // V_ADST + fadst8x8_row_neon, // H_ADST + fidentity8x8_row_neon, // V_FLIPADST + fadst8x8_row_neon // H_FLIPADST +}; + +static const row_transform_1d_lbd_8_neon row_rect_txfm8x8_arr[TX_TYPES] = { + fdct8x8_row_rect_neon, // DCT_DCT + fdct8x8_row_rect_neon, // ADST_DCT + fadst8x8_row_rect_neon, // DCT_ADST + fadst8x8_row_rect_neon, // ADST_ADST + fdct8x8_row_rect_neon, // FLIPADST_DCT + fadst8x8_row_rect_neon, // DCT_FLIPADST + fadst8x8_row_rect_neon, // FLIPADST_FLIPADST + fadst8x8_row_rect_neon, // ADST_FLIPADST + fadst8x8_row_rect_neon, // FLIPADST_ADST + fidentity8x8_row_rect_neon, // IDTX + fidentity8x8_row_rect_neon, // V_DCT + fdct8x8_row_rect_neon, // H_DCT + fidentity8x8_row_rect_neon, // V_ADST + fadst8x8_row_rect_neon, // H_ADST + fidentity8x8_row_rect_neon, // V_FLIPADST + fadst8x8_row_rect_neon // H_FLIPADST +}; + +static const col_transform_1d_lbd_4_neon col_txfm4x16_arr[TX_TYPES] = { + fdct4x16_col_neon, // DCT_DCT + fadst4x16_col_neon, // ADST_DCT + fdct4x16_col_neon, // DCT_ADST + fadst4x16_col_neon, // ADST_ADST + fadst4x16_col_neon, // FLIPADST_DCT + fdct4x16_col_neon, // DCT_FLIPADST + fadst4x16_col_neon, // FLIPADST_FLIPADST + fadst4x16_col_neon, // ADST_FLIPADST + fadst4x16_col_neon, // FLIPADST_ADST + fidentity4x16_col_neon, // IDTX + fdct4x16_col_neon, // V_DCT + fidentity4x16_col_neon, // H_DCT + fadst4x16_col_neon, // V_ADST + fidentity4x16_col_neon, // H_ADST + fadst4x16_col_neon, // V_FLIPADST + fidentity4x16_col_neon // H_FLIPADST +}; + +static const row_transform_1d_lbd_4_neon row_txfm4x16_arr[TX_TYPES] = { + fdct4x16_row_neon, // DCT_DCT + fdct4x16_row_neon, // ADST_DCT + fadst4x16_row_neon, // DCT_ADST + fadst4x16_row_neon, // ADST_ADST + fdct4x16_row_neon, // FLIPADST_DCT + fadst4x16_row_neon, // DCT_FLIPADST + fadst4x16_row_neon, // FLIPADST_FLIPADST + fadst4x16_row_neon, // ADST_FLIPADST + fadst4x16_row_neon, // FLIPADST_ADST + fidentity4x16_row_neon, // IDTX + fidentity4x16_row_neon, // V_DCT + fdct4x16_row_neon, // H_DCT + fidentity4x16_row_neon, // V_ADST + fadst4x16_row_neon, // H_ADST + fidentity4x16_row_neon, // 
V_FLIPADST + fadst4x16_row_neon // H_FLIPADST +}; + +static const col_transform_1d_lbd_8_neon col_txfm8x16_arr[TX_TYPES] = { + fdct8x16_col_neon, // DCT_DCT + fadst8x16_col_neon, // ADST_DCT + fdct8x16_col_neon, // DCT_ADST + fadst8x16_col_neon, // ADST_ADST + fadst8x16_col_neon, // FLIPADST_DCT + fdct8x16_col_neon, // DCT_FLIPADST + fadst8x16_col_neon, // FLIPADST_FLIPADST + fadst8x16_col_neon, // ADST_FLIPADST + fadst8x16_col_neon, // FLIPADST_ADST + fidentity8x16_col_neon, // IDTX + fdct8x16_col_neon, // V_DCT + fidentity8x16_col_neon, // H_DCT + fadst8x16_col_neon, // V_ADST + fidentity8x16_col_neon, // H_ADST + fadst8x16_col_neon, // V_FLIPADST + fidentity8x16_col_neon // H_FLIPADST +}; + +static const row_transform_1d_lbd_8_neon row_txfm8x16_arr[TX_TYPES] = { + fdct8x16_row_neon, // DCT_DCT + fdct8x16_row_neon, // ADST_DCT + fadst8x16_row_neon, // DCT_ADST + fadst8x16_row_neon, // ADST_ADST + fdct8x16_row_neon, // FLIPADST_DCT + fadst8x16_row_neon, // DCT_FLIPADST + fadst8x16_row_neon, // FLIPADST_FLIPADST + fadst8x16_row_neon, // ADST_FLIPADST + fadst8x16_row_neon, // FLIPADST_ADST + fidentity8x16_row_neon, // IDTX + fidentity8x16_row_neon, // V_DCT + fdct8x16_row_neon, // H_DCT + fidentity8x16_row_neon, // V_ADST + fadst8x16_row_neon, // H_ADST + fidentity8x16_row_neon, // V_FLIPADST + fadst8x16_row_neon // H_FLIPADST +}; + +static const row_transform_1d_lbd_8_neon row_rect_txfm8x16_arr[TX_TYPES] = { + fdct8x16_row_rect_neon, // DCT_DCT + fdct8x16_row_rect_neon, // ADST_DCT + fadst8x16_row_rect_neon, // DCT_ADST + fadst8x16_row_rect_neon, // ADST_ADST + fdct8x16_row_rect_neon, // FLIPADST_DCT + fadst8x16_row_rect_neon, // DCT_FLIPADST + fadst8x16_row_rect_neon, // FLIPADST_FLIPADST + fadst8x16_row_rect_neon, // ADST_FLIPADST + fadst8x16_row_rect_neon, // FLIPADST_ADST + fidentity8x16_row_rect_neon, // IDTX + fidentity8x16_row_rect_neon, // V_DCT + fdct8x16_row_rect_neon, // H_DCT + fidentity8x16_row_rect_neon, // V_ADST + fadst8x16_row_rect_neon, // H_ADST + fidentity8x16_row_rect_neon, // V_FLIPADST + fadst8x16_row_rect_neon // H_FLIPADST +}; + +static const row_transform_1d_lbd_8_neon row_txfm8x32_arr[TX_TYPES] = { + fdct8x32_row_neon, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + fidentity8x32_row_neon, // IDTX + fidentity8x32_row_neon, // V_DCT + fdct8x32_row_neon, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +static const row_transform_1d_lbd_8_neon row_rect_txfm8x32_arr[TX_TYPES] = { + fdct8x32_row_rect_neon, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + fidentity8x32_row_rect_neon, // IDTX + fidentity8x32_row_rect_neon, // V_DCT + fdct8x32_row_rect_neon, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +static const col_transform_1d_lbd_8_neon col_txfm8x32_arr[TX_TYPES] = { + fdct8x32_col_neon, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + fidentity8x32_col_neon, // IDTX + fdct8x32_col_neon, // V_DCT + fidentity8x32_col_neon, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +static 
void lowbd_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 4); + + int16x4_t buf0[4], buf1[4]; + switch (tx_type) { + case DCT_DCT: + fdct4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fdct4x4_row_neon(buf1, output, 4, 13); + break; + case ADST_DCT: + fadst4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fdct4x4_row_neon(buf1, output, 4, 13); + break; + case DCT_ADST: + fdct4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fadst4x4_row_neon(buf1, output, 4, 13); + break; + case ADST_ADST: + fadst4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fadst4x4_row_neon(buf1, output, 4, 13); + break; + case FLIPADST_DCT: + fadst4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fdct4x4_row_neon(buf1, output, 4, 13); + break; + case DCT_FLIPADST: + fdct4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + flip_buf_4_neon(buf1, buf0, 4); + fadst4x4_row_neon(buf0, output, 4, 13); + break; + case FLIPADST_FLIPADST: + fadst4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + flip_buf_4_neon(buf1, buf0, 4); + fadst4x4_row_neon(buf0, output, 4, 13); + break; + case ADST_FLIPADST: + fadst4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + flip_buf_4_neon(buf1, buf0, 4); + fadst4x4_row_neon(buf0, output, 4, 13); + break; + case FLIPADST_ADST: + fadst4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fadst4x4_row_neon(buf1, output, 4, 13); + break; + case IDTX: + fidentity4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fidentity4x4_row_neon(buf1, output, 4, 13); + break; + case V_DCT: + fdct4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fidentity4x4_row_neon(buf1, output, 4, 13); + break; + case H_DCT: + fidentity4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fdct4x4_row_neon(buf1, output, 4, 13); + break; + case V_ADST: + fadst4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fidentity4x4_row_neon(buf1, output, 4, 13); + break; + case H_ADST: + fidentity4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fadst4x4_row_neon(buf1, output, 4, 13); + break; + case V_FLIPADST: + fadst4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fidentity4x4_row_neon(buf1, output, 4, 13); + break; + case H_FLIPADST: + fidentity4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + flip_buf_4_neon(buf1, buf0, 4); + fadst4x4_row_neon(buf0, output, 4, 13); + break; + } +} + +static void lowbd_fwd_txfm2d_4x8_neon(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + int16x4_t buf0[8]; + int16x8_t buf1[8]; + const col_transform_1d_lbd_4_neon col_txfm = col_txfm4x8_arr[tx_type]; + const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x4_arr[tx_type]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 8); + col_txfm(input, buf0, stride, 13); + shift_right_1_round_s16_x4(buf0, buf0, 8); + transpose_arrays_s16_4x8(buf0, buf1); + + if (lr_flip) { + 
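+    // Horizontal FLIPADST transform types read their input mirrored
+    // left-to-right, so reverse the vector order before the row transform.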
int16x8_t buf2[8]; + flip_buf_8_neon(buf1, buf2, 4); + row_txfm(buf2, output, 8, 13); + } else { + row_txfm(buf1, output, 8, 13); + } +} + +static void lowbd_fwd_txfm2d_4x16_neon(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + int16x4_t buf0[16]; + int16x8_t buf1[16]; + const col_transform_1d_lbd_4_neon col_txfm = col_txfm4x16_arr[tx_type]; + const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x4_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 16); + col_txfm(input, buf0, stride, 13); + shift_right_1_round_s16_x4(buf0, buf0, 16); + transpose_arrays_s16_4x8(buf0, buf1); + transpose_arrays_s16_4x8(buf0 + 8, buf1 + 8); + + for (int i = 0; i < 2; i++) { + if (lr_flip) { + int16x8_t buf2[16]; + flip_buf_8_neon(buf1 + 8 * i, buf2, 4); + row_txfm(buf2, output + 8 * i, 16, 12); + } else { + int16x8_t *buf = buf1 + 8 * i; + row_txfm(buf, output + 8 * i, 16, 12); + } + } +} + +static void lowbd_fwd_txfm2d_8x4_neon(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + int16x8_t buf0[8]; + int16x4_t buf1[8]; + const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x4_arr[tx_type]; + const row_transform_1d_lbd_4_neon row_txfm = row_rect_txfm4x8_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 4); + col_txfm(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 4); + transpose_arrays_s16_8x4(buf0, buf1); + + if (lr_flip) { + int16x4_t buf2[8]; + flip_buf_4_neon(buf1, buf2, 8); + row_txfm(buf2, output, 4, 13); + } else { + row_txfm(buf1, output, 4, 13); + } +} + +static void lowbd_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 8); + + int16x8_t buf0[8], buf1[8]; + + switch (tx_type) { + case DCT_DCT: + fdct8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fdct8x8_row_neon(buf1, output, 8, 13); + break; + case ADST_DCT: + fadst8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fdct8x8_row_neon(buf1, output, 8, 13); + break; + case DCT_ADST: + fdct8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fadst8x8_row_neon(buf1, output, 8, 13); + break; + case ADST_ADST: + fadst8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fadst8x8_row_neon(buf1, output, 8, 13); + break; + case FLIPADST_DCT: + fadst8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fdct8x8_row_neon(buf1, output, 8, 13); + break; + case DCT_FLIPADST: + fdct8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + flip_buf_8_neon(buf1, buf0, 8); + fadst8x8_row_neon(buf0, output, 8, 13); + break; + case FLIPADST_FLIPADST: + fadst8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + flip_buf_8_neon(buf1, buf0, 8); + fadst8x8_row_neon(buf0, output, 8, 13); + break; + case 
ADST_FLIPADST: + fadst8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + flip_buf_8_neon(buf1, buf0, 8); + fadst8x8_row_neon(buf0, output, 8, 13); + break; + case FLIPADST_ADST: + fadst8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fadst8x8_row_neon(buf1, output, 8, 13); + break; + case IDTX: + fidentity8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fidentity8x8_row_neon(buf1, output, 8, 13); + break; + case V_DCT: + fdct8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fidentity8x8_row_neon(buf1, output, 8, 13); + break; + case H_DCT: + fidentity8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fdct8x8_row_neon(buf1, output, 8, 13); + break; + case V_ADST: + fadst8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fidentity8x8_row_neon(buf1, output, 8, 13); + break; + case H_ADST: + fidentity8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fadst8x8_row_neon(buf1, output, 8, 13); + break; + case V_FLIPADST: + fadst8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fidentity8x8_row_neon(buf1, output, 8, 13); + break; + case H_FLIPADST: + fidentity8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + flip_buf_8_neon(buf1, buf0, 8); + fadst8x8_row_neon(buf0, output, 8, 13); + break; + } +} + +static void lowbd_fwd_txfm2d_8x16_neon(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + int16x8_t buf0[16], buf1[16]; + const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type]; + const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x8_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 16); + col_txfm(input, buf0, stride, 13); + shift_right_2_round_s16_x8(buf0, buf0, 16); + transpose_arrays_s16_8x8(buf0, buf1); + transpose_arrays_s16_8x8(buf0 + 8, buf1 + 8); + + for (int i = 0; i < 2; i++) { + if (lr_flip) { + flip_buf_8_neon(buf1 + 8 * i, buf0, 8); + row_txfm(buf0, output + 8 * i, 16, 13); + } else { + int16x8_t *buf = buf1 + 8 * i; + row_txfm(buf, output + 8 * i, 16, 13); + } + } +} + +static void lowbd_fwd_txfm2d_8x32_neon(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + int16x8_t buf0[32], buf1[32]; + const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type]; + const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x8_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 32); + col_txfm(input, buf0, stride, 12); + shift_right_2_round_s16_x8(buf0, buf0, 32); + transpose_arrays_s16_8x8(buf0, buf1); + transpose_arrays_s16_8x8(buf0 + 8, buf1 + 8); + transpose_arrays_s16_8x8(buf0 + 16, buf1 + 16); + transpose_arrays_s16_8x8(buf0 + 24, buf1 + 24); + + for (int i = 0; i < 4; i++) { + if (lr_flip) { + flip_buf_8_neon(buf1 + 8 * i, buf0, 8); + 
row_txfm(buf0, output + 8 * i, 32, 12); + } else { + int16x8_t *buf = buf1 + 8 * i; + row_txfm(buf, output + 8 * i, 32, 12); + } + } +} + +static void lowbd_fwd_txfm2d_16x4_neon(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + int16x8_t buf0[16]; + int16x4_t buf1[16]; + int16x4_t buf2[16]; + const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x4_arr[tx_type]; + const row_transform_1d_lbd_4_neon row_txfm = row_txfm4x16_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 4); + for (int i = 0; i < 2; i++) { + col_txfm(input + 8 * i, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 4); + transpose_arrays_s16_8x4(buf0, buf1 + 8 * i); + } + + if (lr_flip) { + flip_buf_4_neon(buf1, buf2, 16); + row_txfm(buf2, output, 4, 13); + } else { + row_txfm(buf1, output, 4, 13); + } +} + +static void lowbd_fwd_txfm2d_16x8_neon(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + int16x8_t buf0[16], buf1[16]; + const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x8_arr[tx_type]; + const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x16_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 8); + for (int i = 0; i < 2; i++) { + col_txfm(input + 8 * i, buf0, stride, 13); + shift_right_2_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1 + 8 * i); + } + + if (lr_flip) { + flip_buf_8_neon(buf1, buf0, 16); + row_txfm(buf0, output, 8, 13); + } else { + row_txfm(buf1, output, 8, 13); + } +} + +static void lowbd_fwd_txfm2d_16x16_neon(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + int16x8_t buf0[16], buf1[32]; + const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type]; + const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x16_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 16); + for (int i = 0; i < 2; i++) { + col_txfm(input + 8 * i, buf0, stride, 13); + shift_right_2_round_s16_x8(buf0, buf0, 16); + transpose_arrays_s16_8x8(buf0, buf1 + 0 * 16 + 8 * i); + transpose_arrays_s16_8x8(buf0 + 8, buf1 + 1 * 16 + 8 * i); + } + + for (int i = 0; i < 2; i++) { + if (lr_flip) { + flip_buf_8_neon(buf1 + 16 * i, buf0, 16); + row_txfm(buf0, output + 8 * i, 16, 12); + } else { + int16x8_t *buf = buf1 + 16 * i; + row_txfm(buf, output + 8 * i, 16, 12); + } + } +} + +static void lowbd_fwd_txfm2d_16x32_neon(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + int16x8_t buf0[32], buf1[64]; + const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type]; + const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x16_arr[tx_type]; + + if (col_txfm == NULL || row_txfm == NULL) { + av1_fwd_txfm2d_16x32_c(input, output, stride, tx_type, bd); + return; + } + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 32); + for (int i = 0; i < 2; i++) { + col_txfm(input + 8 * i, buf0, stride, 12); + shift_right_4_round_s16_x8(buf0, buf0, 32); + transpose_arrays_s16_8x8(buf0 + 0 * 8, buf1 + 0 * 16 + 8 * i); + transpose_arrays_s16_8x8(buf0 + 1 * 8, buf1 + 1 * 16 + 8 * i); + transpose_arrays_s16_8x8(buf0 + 2 * 8, buf1 + 2 * 16 + 8 * i); + transpose_arrays_s16_8x8(buf0 + 3 * 8, buf1 + 3 * 
16 + 8 * i);
+  }
+
+  for (int i = 0; i < 4; i++) {
+    if (lr_flip) {
+      flip_buf_8_neon(buf1 + 16 * i, buf0, 16);
+      row_txfm(buf0, output + 8 * i, 32, 13);
+    } else {
+      int16x8_t *buf = buf1 + 16 * i;
+      row_txfm(buf, output + 8 * i, 32, 13);
+    }
+  }
+}
+
+static void lowbd_fwd_txfm2d_32x8_neon(const int16_t *input, int32_t *output,
+                                       int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int16x8_t buf0[32], buf1[32];
+  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x8_arr[tx_type];
+  const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x32_arr[tx_type];
+
+  if (col_txfm == NULL || row_txfm == NULL) {
+    // No 32-point NEON row transform exists for this tx_type (32-point ADST
+    // is not defined in AV1); fall back to the matching 32x8 C path.
+    av1_fwd_txfm2d_32x8_c(input, output, stride, tx_type, bd);
+    return;
+  }
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
+  for (int i = 0; i < 4; i++) {
+    col_txfm(input + 8 * i, buf0, stride, 13);
+    shift_right_2_round_s16_x8(buf0, buf0, 8);
+    transpose_arrays_s16_8x8(buf0, buf1 + 0 * 32 + 8 * i);
+  }
+
+  if (lr_flip) {
+    flip_buf_8_neon(buf1, buf0, 32);
+    row_txfm(buf0, output, 8, 12);
+  } else {
+    row_txfm(buf1, output, 8, 12);
+  }
+}
+
+static void lowbd_fwd_txfm2d_32x16_neon(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int16x8_t buf0[32], buf1[64];
+  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type];
+  const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x32_arr[tx_type];
+
+  if (col_txfm == NULL || row_txfm == NULL) {
+    av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd);
+    return;
+  }
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
+  for (int i = 0; i < 4; i++) {
+    col_txfm(input + 8 * i, buf0, stride, 13);
+    shift_right_4_round_s16_x8(buf0, buf0, 16);
+    transpose_arrays_s16_8x8(buf0, buf1 + 0 * 32 + 8 * i);
+    transpose_arrays_s16_8x8(buf0 + 8, buf1 + 1 * 32 + 8 * i);
+  }
+
+  for (int i = 0; i < 2; i++) {
+    if (lr_flip) {
+      flip_buf_8_neon(buf1 + 32 * i, buf0, 32);
+      row_txfm(buf0, output + 8 * i, 16, 13);
+    } else {
+      int16x8_t *buf = buf1 + 32 * i;
+      row_txfm(buf, output + 8 * i, 16, 13);
+    }
+  }
+}
+
+static void lowbd_fwd_txfm2d_32x32_neon(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int16x8_t buf0[32], buf1[128];
+  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
+  const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x32_arr[tx_type];
+
+  if (col_txfm == NULL || row_txfm == NULL) {
+    av1_fwd_txfm2d_32x32_c(input, output, stride, tx_type, bd);
+    return;
+  }
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 32);
+  for (int i = 0; i < 4; i++) {
+    col_txfm(input + 8 * i, buf0, stride, 12);
+    shift_right_4_round_s16_x8(buf0, buf0, 32);
+    transpose_arrays_s16_8x8(buf0 + 0 * 8, buf1 + 0 * 32 + 8 * i);
+    transpose_arrays_s16_8x8(buf0 + 1 * 8, buf1 + 1 * 32 + 8 * i);
+    transpose_arrays_s16_8x8(buf0 + 2 * 8, buf1 + 2 * 32 + 8 * i);
+    transpose_arrays_s16_8x8(buf0 + 3 * 8, buf1 + 3 * 32 + 8 * i);
+  }
+
+  for (int i = 0; i < 4; i++) {
+    if (lr_flip) {
+      flip_buf_8_neon(buf1 + 32 * i, buf0, 32);
+      row_txfm(buf0, output + 8 * i, 32, 12);
+    } else {
+      int16x8_t *buf = buf1 + 32 * i;
+      row_txfm(buf, output + 8 * i, 32, 12);
+    }
+  }
+}
+
+static void lowbd_fwd_txfm2d_64x16_neon(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  (void)tx_type;
+  assert(tx_type
== DCT_DCT); + int16x8_t buf0[64], buf1[128]; + const transform_1d_lbd_8_neon col_txfm = fdct8x16_neon; + const transform_1d_lbd_8_neon row_txfm = fdct8x64_neon; + + for (int i = 0; i < 8; i++) { + load_buffer_s16_x8(input + 8 * i, stride, buf0, 16); + shift_left_2_s16_x8(buf0, buf0, 16); + col_txfm(buf0, buf0, 13); + shift_right_4_round_s16_x8(buf0, buf0, 16); + for (int j = 0; j < 2; ++j) { + transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 64 + 8 * i); + } + } + + for (int i = 0; i < 2; i++) { + int16x8_t *buf = buf1 + 64 * i; + row_txfm(buf, buf, 12); + store_buffer_s16_x8(buf, output + 8 * i, 16, 32); + } + // Zero out the bottom 16x32 area. + memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output)); +} + +static void lowbd_fwd_txfm2d_16x64_neon(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + int16x8_t buf0[64], buf1[128]; + const transform_1d_lbd_8_neon col_txfm = fdct8x64_neon; + const transform_1d_lbd_8_neon row_txfm = fdct8x16_neon; + + for (int i = 0; i < 2; i++) { + load_buffer_s16_x8(input + 8 * i, stride, buf0, 64); + col_txfm(buf0, buf0, 13); + shift_right_2_round_s16_x8(buf0, buf0, 64); + for (int j = 0; j < 8; ++j) { + transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 16 + 8 * i); + } + } + + for (int i = 0; i < 4; i++) { + int16x8_t *buf = buf1 + 16 * i; + row_txfm(buf, buf, 12); + store_buffer_s16_x8(buf, output + 8 * i, 32, 16); + } +} + +static void fdct32_neon(const int32x4_t *input, int32x4_t *output, + int cos_bit) { + const int16_t *cospi = cospi_arr_q13(cos_bit); + + const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); + const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]); + const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]); + const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]); + const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]); + const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]); + const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]); + const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]); + + const int16x4_t cospi32 = vget_low_s16(cospi32_16); + const int16x4_t cospi16 = vget_high_s16(cospi32_16); + const int16x4_t cospi8 = vget_low_s16(cospi8_24); + const int16x4_t cospi24 = vget_high_s16(cospi8_24); + const int16x4_t cospi4 = vget_low_s16(cospi4_12); + const int16x4_t cospi12 = vget_high_s16(cospi4_12); + const int16x4_t cospi20 = vget_low_s16(cospi20_28); + const int16x4_t cospi28 = vget_high_s16(cospi20_28); + const int16x4_t cospi2 = vget_low_s16(cospi2_6); + const int16x4_t cospi6 = vget_high_s16(cospi2_6); + const int16x4_t cospi10 = vget_low_s16(cospi10_14); + const int16x4_t cospi14 = vget_high_s16(cospi10_14); + const int16x4_t cospi18 = vget_low_s16(cospi18_22); + const int16x4_t cospi22 = vget_high_s16(cospi18_22); + const int16x4_t cospi26 = vget_low_s16(cospi26_30); + const int16x4_t cospi30 = vget_high_s16(cospi26_30); + + int32x4_t buf0[32]; + int32x4_t buf1[32]; + + // stage 1 + butterfly_dct_pre_s32_x4(input, buf1, 32); + + // stage 2 + butterfly_dct_pre_s32_x4(buf1, buf0, 16); + buf0[16] = buf1[16]; + buf0[17] = buf1[17]; + buf0[18] = buf1[18]; + buf0[19] = buf1[19]; + butterfly_s32_s32_x4_0112_neon(cospi32, buf1[27], buf1[20], &buf0[27], + &buf0[20]); + butterfly_s32_s32_x4_0112_neon(cospi32, buf1[26], buf1[21], &buf0[26], + &buf0[21]); + butterfly_s32_s32_x4_0112_neon(cospi32, buf1[25], buf1[22], &buf0[25], + &buf0[22]); + butterfly_s32_s32_x4_0112_neon(cospi32, buf1[24], buf1[23], &buf0[24], + &buf0[23]); + buf0[28] = buf1[28]; + 
buf0[29] = buf1[29]; + buf0[30] = buf1[30]; + buf0[31] = buf1[31]; + + // stage 3 + butterfly_dct_pre_s32_x4(buf0, buf1, 8); + buf1[8] = buf0[8]; + buf1[9] = buf0[9]; + butterfly_s32_s32_x4_0112_neon(cospi32, buf0[13], buf0[10], &buf1[13], + &buf1[10]); + butterfly_s32_s32_x4_0112_neon(cospi32, buf0[12], buf0[11], &buf1[12], + &buf1[11]); + buf1[14] = buf0[14]; + buf1[15] = buf0[15]; + butterfly_dct_post_s32_x4(buf0 + 16, buf0 + 16, buf1 + 16, 16); + + // stage 4 + butterfly_dct_pre_s32_x4(buf1, buf0, 4); + buf0[4] = buf1[4]; + butterfly_s32_s32_x4_0112_neon(cospi32, buf1[6], buf1[5], &buf0[6], &buf0[5]); + buf0[7] = buf1[7]; + butterfly_dct_post_s32_x4(buf1 + 8, buf1 + 8, buf0 + 8, 8); + buf0[16] = buf1[16]; + buf0[17] = buf1[17]; + butterfly_s32_s32_x4_0112_neon(cospi16, buf1[29], buf1[18], &buf0[29], + &buf0[18]); + butterfly_s32_s32_x4_0112_neon(cospi16, buf1[28], buf1[19], &buf0[28], + &buf0[19]); + butterfly_s32_s32_x4_1223_neon(cospi16, buf1[27], buf1[20], &buf0[27], + &buf0[20]); + butterfly_s32_s32_x4_1223_neon(cospi16, buf1[26], buf1[21], &buf0[26], + &buf0[21]); + buf0[22] = buf1[22]; + buf0[23] = buf1[23]; + buf0[24] = buf1[24]; + buf0[25] = buf1[25]; + buf0[30] = buf1[30]; + buf0[31] = buf1[31]; + + // stage 5 + butterfly_s32_s32_x4_0112_neon(cospi32, buf0[0], buf0[1], &buf1[0], &buf1[1]); + butterfly_s32_s32_x4_0112_neon(cospi16, buf0[3], buf0[2], &buf1[2], &buf1[3]); + butterfly_dct_post_s32_x4(buf0 + 4, buf0 + 4, buf1 + 4, 4); + buf1[8] = buf0[8]; + butterfly_s32_s32_x4_0112_neon(cospi16, buf0[14], buf0[9], &buf1[14], + &buf1[9]); + butterfly_s32_s32_x4_1223_neon(cospi16, buf0[13], buf0[10], &buf1[13], + &buf1[10]); + buf1[11] = buf0[11]; + buf1[12] = buf0[12]; + buf1[15] = buf0[15]; + butterfly_dct_post_s32_x4(buf0 + 16, buf0 + 16, buf1 + 16, 8); + butterfly_dct_post_s32_x4(buf0 + 24, buf0 + 24, buf1 + 24, 8); + + // stage 6 + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + buf0[2] = buf1[2]; + buf0[3] = buf1[3]; + butterfly_s32_s32_x4_0112_neon(cospi8, buf1[7], buf1[4], &buf0[4], &buf0[7]); + butterfly_s32_s32_x4_1003_neon(cospi24, buf1[6], buf1[5], &buf0[5], &buf0[6]); + butterfly_dct_post_s32_x4(buf1 + 8, buf1 + 8, buf0 + 8, 4); + butterfly_dct_post_s32_x4(buf1 + 12, buf1 + 12, buf0 + 12, 4); + buf0[16] = buf1[16]; + butterfly_s32_s32_x4_0112_neon(cospi8, buf1[30], buf1[17], &buf0[30], + &buf0[17]); + butterfly_s32_s32_x4_1223_neon(cospi8, buf1[29], buf1[18], &buf0[29], + &buf0[18]); + buf0[19] = buf1[19]; + buf0[20] = buf1[20]; + butterfly_s32_s32_x4_1003_neon(cospi24, buf1[26], buf1[21], &buf0[26], + &buf0[21]); + butterfly_s32_s32_x4_0332_neon(cospi24, buf1[25], buf1[22], &buf0[25], + &buf0[22]); + buf0[23] = buf1[23]; + buf0[24] = buf1[24]; + buf0[27] = buf1[27]; + buf0[28] = buf1[28]; + buf0[31] = buf1[31]; + + // stage 7 + buf1[0] = buf0[0]; + buf1[1] = buf0[1]; + buf1[2] = buf0[2]; + buf1[3] = buf0[3]; + buf1[4] = buf0[4]; + buf1[5] = buf0[5]; + buf1[6] = buf0[6]; + buf1[7] = buf0[7]; + butterfly_s32_s32_x4_0112_neon(cospi4, buf0[15], buf0[8], &buf1[8], + &buf1[15]); + butterfly_s32_s32_x4_1003_neon(cospi28, buf0[14], buf0[9], &buf1[9], + &buf1[14]); + butterfly_s32_s32_x4_0112_neon(cospi20, buf0[13], buf0[10], &buf1[10], + &buf1[13]); + butterfly_s32_s32_x4_1003_neon(cospi12, buf0[12], buf0[11], &buf1[11], + &buf1[12]); + butterfly_dct_post_s32_x4(buf0 + 16, buf0 + 16, buf1 + 16, 4); + butterfly_dct_post_s32_x4(buf0 + 20, buf0 + 20, buf1 + 20, 4); + butterfly_dct_post_s32_x4(buf0 + 24, buf0 + 24, buf1 + 24, 4); + butterfly_dct_post_s32_x4(buf0 + 28, buf0 + 28, buf1 + 
28, 4); + + // stage 8 + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + buf0[2] = buf1[2]; + buf0[3] = buf1[3]; + buf0[4] = buf1[4]; + buf0[5] = buf1[5]; + buf0[6] = buf1[6]; + buf0[7] = buf1[7]; + buf0[8] = buf1[8]; + buf0[9] = buf1[9]; + buf0[10] = buf1[10]; + buf0[11] = buf1[11]; + buf0[12] = buf1[12]; + buf0[13] = buf1[13]; + buf0[14] = buf1[14]; + buf0[15] = buf1[15]; + butterfly_s32_s32_x4_0112_neon(cospi2, buf1[31], buf1[16], &buf0[16], + &buf0[31]); + butterfly_s32_s32_x4_1003_neon(cospi30, buf1[30], buf1[17], &buf0[17], + &buf0[30]); + butterfly_s32_s32_x4_0112_neon(cospi18, buf1[29], buf1[18], &buf0[18], + &buf0[29]); + butterfly_s32_s32_x4_1003_neon(cospi14, buf1[28], buf1[19], &buf0[19], + &buf0[28]); + butterfly_s32_s32_x4_0112_neon(cospi10, buf1[27], buf1[20], &buf0[20], + &buf0[27]); + butterfly_s32_s32_x4_1003_neon(cospi22, buf1[26], buf1[21], &buf0[21], + &buf0[26]); + butterfly_s32_s32_x4_0112_neon(cospi26, buf1[25], buf1[22], &buf0[22], + &buf0[25]); + butterfly_s32_s32_x4_1003_neon(cospi6, buf1[24], buf1[23], &buf0[23], + &buf0[24]); + + // stage 9 + output[0] = buf0[0]; + output[1] = buf0[16]; + output[2] = buf0[8]; + output[3] = buf0[24]; + output[4] = buf0[4]; + output[5] = buf0[20]; + output[6] = buf0[12]; + output[7] = buf0[28]; + output[8] = buf0[2]; + output[9] = buf0[18]; + output[10] = buf0[10]; + output[11] = buf0[26]; + output[12] = buf0[6]; + output[13] = buf0[22]; + output[14] = buf0[14]; + output[15] = buf0[30]; + output[16] = buf0[1]; + output[17] = buf0[17]; + output[18] = buf0[9]; + output[19] = buf0[25]; + output[20] = buf0[5]; + output[21] = buf0[21]; + output[22] = buf0[13]; + output[23] = buf0[29]; + output[24] = buf0[3]; + output[25] = buf0[19]; + output[26] = buf0[11]; + output[27] = buf0[27]; + output[28] = buf0[7]; + output[29] = buf0[23]; + output[30] = buf0[15]; + output[31] = buf0[31]; +} + +static void fdct64_neon(const int32x4_t *input, int32x4_t *output, + int cos_bit) { + const int16_t *cospi = cospi_arr_q13(cos_bit); + + const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); + const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]); + const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]); + const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]); + const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]); + const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]); + const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]); + const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]); + const int16x8_t cospi1_3 = vld1q_s16(&cospi[4 * 16]); + const int16x8_t cospi5_7 = vld1q_s16(&cospi[4 * 18]); + const int16x8_t cospi9_11 = vld1q_s16(&cospi[4 * 20]); + const int16x8_t cospi13_15 = vld1q_s16(&cospi[4 * 22]); + const int16x8_t cospi17_19 = vld1q_s16(&cospi[4 * 24]); + const int16x8_t cospi21_23 = vld1q_s16(&cospi[4 * 26]); + const int16x8_t cospi25_27 = vld1q_s16(&cospi[4 * 28]); + const int16x8_t cospi29_31 = vld1q_s16(&cospi[4 * 30]); + + const int16x4_t cospi32 = vget_low_s16(cospi32_16); + const int16x4_t cospi16 = vget_high_s16(cospi32_16); + const int16x4_t cospi8 = vget_low_s16(cospi8_24); + const int16x4_t cospi24 = vget_high_s16(cospi8_24); + const int16x4_t cospi4 = vget_low_s16(cospi4_12); + const int16x4_t cospi12 = vget_high_s16(cospi4_12); + const int16x4_t cospi20 = vget_low_s16(cospi20_28); + const int16x4_t cospi28 = vget_high_s16(cospi20_28); + const int16x4_t cospi2 = vget_low_s16(cospi2_6); + const int16x4_t cospi6 = vget_high_s16(cospi2_6); + const int16x4_t cospi10 = vget_low_s16(cospi10_14); + const int16x4_t cospi14 = 
vget_high_s16(cospi10_14); + const int16x4_t cospi18 = vget_low_s16(cospi18_22); + const int16x4_t cospi22 = vget_high_s16(cospi18_22); + const int16x4_t cospi26 = vget_low_s16(cospi26_30); + const int16x4_t cospi30 = vget_high_s16(cospi26_30); + const int16x4_t cospi1 = vget_low_s16(cospi1_3); + const int16x4_t cospi3 = vget_high_s16(cospi1_3); + const int16x4_t cospi5 = vget_low_s16(cospi5_7); + const int16x4_t cospi7 = vget_high_s16(cospi5_7); + const int16x4_t cospi9 = vget_low_s16(cospi9_11); + const int16x4_t cospi11 = vget_high_s16(cospi9_11); + const int16x4_t cospi13 = vget_low_s16(cospi13_15); + const int16x4_t cospi15 = vget_high_s16(cospi13_15); + const int16x4_t cospi17 = vget_low_s16(cospi17_19); + const int16x4_t cospi19 = vget_high_s16(cospi17_19); + const int16x4_t cospi21 = vget_low_s16(cospi21_23); + const int16x4_t cospi23 = vget_high_s16(cospi21_23); + const int16x4_t cospi25 = vget_low_s16(cospi25_27); + const int16x4_t cospi27 = vget_high_s16(cospi25_27); + const int16x4_t cospi29 = vget_low_s16(cospi29_31); + const int16x4_t cospi31 = vget_high_s16(cospi29_31); + + // stage 1 + int32x4_t x1[64]; + butterfly_dct_pre_s32_x4(input, x1, 64); + + // stage 2 + int32x4_t x2[64]; + butterfly_dct_pre_s32_x4(x1, x2, 32); + butterfly_s32_s32_x4_0112_neon(cospi32, x1[55], x1[40], &x2[55], &x2[40]); + butterfly_s32_s32_x4_0112_neon(cospi32, x1[54], x1[41], &x2[54], &x2[41]); + butterfly_s32_s32_x4_0112_neon(cospi32, x1[53], x1[42], &x2[53], &x2[42]); + butterfly_s32_s32_x4_0112_neon(cospi32, x1[52], x1[43], &x2[52], &x2[43]); + butterfly_s32_s32_x4_0112_neon(cospi32, x1[51], x1[44], &x2[51], &x2[44]); + butterfly_s32_s32_x4_0112_neon(cospi32, x1[50], x1[45], &x2[50], &x2[45]); + butterfly_s32_s32_x4_0112_neon(cospi32, x1[49], x1[46], &x2[49], &x2[46]); + butterfly_s32_s32_x4_0112_neon(cospi32, x1[48], x1[47], &x2[48], &x2[47]); + + // stage 3 + int32x4_t x3[64]; + butterfly_dct_pre_s32_x4(x2, x3, 16); + butterfly_s32_s32_x4_0112_neon(cospi32, x2[27], x2[20], &x3[27], &x3[20]); + butterfly_s32_s32_x4_0112_neon(cospi32, x2[26], x2[21], &x3[26], &x3[21]); + butterfly_s32_s32_x4_0112_neon(cospi32, x2[25], x2[22], &x3[25], &x3[22]); + butterfly_s32_s32_x4_0112_neon(cospi32, x2[24], x2[23], &x3[24], &x3[23]); + butterfly_dct_post_s32_x4(x1 + 32, x2 + 32, x3 + 32, 32); + + // stage 4 + int32x4_t x4[64]; + butterfly_dct_pre_s32_x4(x3, x4, 8); + butterfly_s32_s32_x4_0112_neon(cospi32, x3[13], x3[10], &x4[13], &x4[10]); + butterfly_s32_s32_x4_0112_neon(cospi32, x3[12], x3[11], &x4[12], &x4[11]); + butterfly_dct_post_s32_x4(x2 + 16, x3 + 16, x4 + 16, 16); + butterfly_s32_s32_x4_0112_neon(cospi16, x3[59], x3[36], &x4[59], &x4[36]); + butterfly_s32_s32_x4_0112_neon(cospi16, x3[58], x3[37], &x4[58], &x4[37]); + butterfly_s32_s32_x4_0112_neon(cospi16, x3[57], x3[38], &x4[57], &x4[38]); + butterfly_s32_s32_x4_0112_neon(cospi16, x3[56], x3[39], &x4[56], &x4[39]); + butterfly_s32_s32_x4_1223_neon(cospi16, x3[55], x3[40], &x4[55], &x4[40]); + butterfly_s32_s32_x4_1223_neon(cospi16, x3[54], x3[41], &x4[54], &x4[41]); + butterfly_s32_s32_x4_1223_neon(cospi16, x3[53], x3[42], &x4[53], &x4[42]); + butterfly_s32_s32_x4_1223_neon(cospi16, x3[52], x3[43], &x4[52], &x4[43]); + + // stage 5 + int32x4_t x5[64]; + butterfly_dct_pre_s32_x4(x4, x5, 4); + butterfly_s32_s32_x4_0112_neon(cospi32, x4[6], x4[5], &x5[6], &x5[5]); + butterfly_dct_post_s32_x4(x3 + 8, x4 + 8, x5 + 8, 8); + butterfly_s32_s32_x4_0112_neon(cospi16, x4[29], x4[18], &x5[29], &x5[18]); + butterfly_s32_s32_x4_0112_neon(cospi16, x4[28], 
x4[19], &x5[28], &x5[19]); + butterfly_s32_s32_x4_1223_neon(cospi16, x4[27], x4[20], &x5[27], &x5[20]); + butterfly_s32_s32_x4_1223_neon(cospi16, x4[26], x4[21], &x5[26], &x5[21]); + butterfly_dct_post_s32_x4(x3 + 32, x4 + 32, x5 + 32, 16); + butterfly_dct_post_s32_x4(x3 + 48, x4 + 48, x5 + 48, 16); + + // stage 6 + int32x4_t x6[64]; + butterfly_s32_s32_x4_0112_neon(cospi32, x5[0], x5[1], &x6[0], &x6[1]); + butterfly_s32_s32_x4_0112_neon(cospi16, x5[3], x5[2], &x6[2], &x6[3]); + butterfly_dct_post_s32_x4(x4 + 4, x5 + 4, x6 + 4, 4); + butterfly_s32_s32_x4_0112_neon(cospi16, x5[14], x5[9], &x6[14], &x6[9]); + butterfly_s32_s32_x4_1223_neon(cospi16, x5[13], x5[10], &x6[13], &x6[10]); + butterfly_dct_post_s32_x4(x4 + 16, x5 + 16, x6 + 16, 8); + butterfly_dct_post_s32_x4(x4 + 24, x5 + 24, x6 + 24, 8); + butterfly_s32_s32_x4_0112_neon(cospi8, x5[61], x5[34], &x6[61], &x6[34]); + butterfly_s32_s32_x4_0112_neon(cospi8, x5[60], x5[35], &x6[60], &x6[35]); + butterfly_s32_s32_x4_1223_neon(cospi8, x5[59], x5[36], &x6[59], &x6[36]); + butterfly_s32_s32_x4_1223_neon(cospi8, x5[58], x5[37], &x6[58], &x6[37]); + butterfly_s32_s32_x4_1003_neon(cospi24, x5[53], x5[42], &x6[53], &x6[42]); + butterfly_s32_s32_x4_1003_neon(cospi24, x5[52], x5[43], &x6[52], &x6[43]); + butterfly_s32_s32_x4_0332_neon(cospi24, x5[51], x5[44], &x6[51], &x6[44]); + butterfly_s32_s32_x4_0332_neon(cospi24, x5[50], x5[45], &x6[50], &x6[45]); + + // stage 7 + int32x4_t x7[64]; + butterfly_s32_s32_x4_0112_neon(cospi8, x6[7], x6[4], &x7[4], &x7[7]); + butterfly_s32_s32_x4_1003_neon(cospi24, x6[6], x6[5], &x7[5], &x7[6]); + butterfly_dct_post_s32_x4(x5 + 8, x6 + 8, x7 + 8, 4); + butterfly_dct_post_s32_x4(x5 + 12, x6 + 12, x7 + 12, 4); + butterfly_s32_s32_x4_0112_neon(cospi8, x6[30], x6[17], &x7[30], &x7[17]); + butterfly_s32_s32_x4_1223_neon(cospi8, x6[29], x6[18], &x7[29], &x7[18]); + butterfly_s32_s32_x4_1003_neon(cospi24, x6[26], x6[21], &x7[26], &x7[21]); + butterfly_s32_s32_x4_0332_neon(cospi24, x6[25], x6[22], &x7[25], &x7[22]); + butterfly_dct_post_s32_x4(x5 + 32, x6 + 32, x7 + 32, 8); + butterfly_dct_post_s32_x4(x5 + 40, x6 + 40, x7 + 40, 8); + butterfly_dct_post_s32_x4(x5 + 48, x6 + 48, x7 + 48, 8); + butterfly_dct_post_s32_x4(x5 + 56, x6 + 56, x7 + 56, 8); + + // stage 8 + int32x4_t x8[64]; + butterfly_s32_s32_x4_0112_neon(cospi4, x7[15], x7[8], &x8[8], &x8[15]); + butterfly_s32_s32_x4_1003_neon(cospi28, x7[14], x7[9], &x8[9], &x8[14]); + butterfly_s32_s32_x4_0112_neon(cospi20, x7[13], x7[10], &x8[10], &x8[13]); + butterfly_s32_s32_x4_1003_neon(cospi12, x7[12], x7[11], &x8[11], &x8[12]); + butterfly_dct_post_s32_x4(x6 + 16, x7 + 16, x8 + 16, 4); + butterfly_dct_post_s32_x4(x6 + 20, x7 + 20, x8 + 20, 4); + butterfly_dct_post_s32_x4(x6 + 24, x7 + 24, x8 + 24, 4); + butterfly_dct_post_s32_x4(x6 + 28, x7 + 28, x8 + 28, 4); + butterfly_s32_s32_x4_0112_neon(cospi4, x7[62], x7[33], &x8[62], &x8[33]); + butterfly_s32_s32_x4_1223_neon(cospi4, x7[61], x7[34], &x8[61], &x8[34]); + butterfly_s32_s32_x4_1003_neon(cospi28, x7[58], x7[37], &x8[58], &x8[37]); + butterfly_s32_s32_x4_0332_neon(cospi28, x7[57], x7[38], &x8[57], &x8[38]); + butterfly_s32_s32_x4_0112_neon(cospi20, x7[54], x7[41], &x8[54], &x8[41]); + butterfly_s32_s32_x4_1223_neon(cospi20, x7[53], x7[42], &x8[53], &x8[42]); + butterfly_s32_s32_x4_1003_neon(cospi12, x7[50], x7[45], &x8[50], &x8[45]); + butterfly_s32_s32_x4_0332_neon(cospi12, x7[49], x7[46], &x8[49], &x8[46]); + + // stage 9 + int32x4_t x9[64]; + butterfly_s32_s32_x4_0112_neon(cospi2, x8[31], x8[16], &x9[16], 
&x9[31]); + butterfly_s32_s32_x4_1003_neon(cospi30, x8[30], x8[17], &x9[17], &x9[30]); + butterfly_s32_s32_x4_0112_neon(cospi18, x8[29], x8[18], &x9[18], &x9[29]); + butterfly_s32_s32_x4_1003_neon(cospi14, x8[28], x8[19], &x9[19], &x9[28]); + butterfly_s32_s32_x4_0112_neon(cospi10, x8[27], x8[20], &x9[20], &x9[27]); + butterfly_s32_s32_x4_1003_neon(cospi22, x8[26], x8[21], &x9[21], &x9[26]); + butterfly_s32_s32_x4_0112_neon(cospi26, x8[25], x8[22], &x9[22], &x9[25]); + butterfly_s32_s32_x4_1003_neon(cospi6, x8[24], x8[23], &x9[23], &x9[24]); + butterfly_dct_post_s32_x4(x7 + 32, x8 + 32, x9 + 32, 4); + butterfly_dct_post_s32_x4(x7 + 36, x8 + 36, x9 + 36, 4); + butterfly_dct_post_s32_x4(x7 + 40, x8 + 40, x9 + 40, 4); + butterfly_dct_post_s32_x4(x7 + 44, x8 + 44, x9 + 44, 4); + butterfly_dct_post_s32_x4(x7 + 48, x8 + 48, x9 + 48, 4); + butterfly_dct_post_s32_x4(x7 + 52, x8 + 52, x9 + 52, 4); + butterfly_dct_post_s32_x4(x7 + 56, x8 + 56, x9 + 56, 4); + butterfly_dct_post_s32_x4(x7 + 60, x8 + 60, x9 + 60, 4); + + // stage 10 + int32x4_t x10[64]; + butterfly_s32_s32_x4_0112_neon(cospi1, x9[63], x9[32], &x10[32], &x10[63]); + butterfly_s32_s32_x4_1003_neon(cospi31, x9[62], x9[33], &x10[33], &x10[62]); + butterfly_s32_s32_x4_0112_neon(cospi17, x9[61], x9[34], &x10[34], &x10[61]); + butterfly_s32_s32_x4_1003_neon(cospi15, x9[60], x9[35], &x10[35], &x10[60]); + butterfly_s32_s32_x4_0112_neon(cospi9, x9[59], x9[36], &x10[36], &x10[59]); + butterfly_s32_s32_x4_1003_neon(cospi23, x9[58], x9[37], &x10[37], &x10[58]); + butterfly_s32_s32_x4_0112_neon(cospi25, x9[57], x9[38], &x10[38], &x10[57]); + butterfly_s32_s32_x4_1003_neon(cospi7, x9[56], x9[39], &x10[39], &x10[56]); + butterfly_s32_s32_x4_0112_neon(cospi5, x9[55], x9[40], &x10[40], &x10[55]); + butterfly_s32_s32_x4_1003_neon(cospi27, x9[54], x9[41], &x10[41], &x10[54]); + butterfly_s32_s32_x4_0112_neon(cospi21, x9[53], x9[42], &x10[42], &x10[53]); + butterfly_s32_s32_x4_1003_neon(cospi11, x9[52], x9[43], &x10[43], &x10[52]); + butterfly_s32_s32_x4_0112_neon(cospi13, x9[51], x9[44], &x10[44], &x10[51]); + butterfly_s32_s32_x4_1003_neon(cospi19, x9[50], x9[45], &x10[45], &x10[50]); + butterfly_s32_s32_x4_0112_neon(cospi29, x9[49], x9[46], &x10[46], &x10[49]); + butterfly_s32_s32_x4_1003_neon(cospi3, x9[48], x9[47], &x10[47], &x10[48]); + + // stage 11, only store into the low 32 output indices. 
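+  // The 64-point results land in output[] in bit-reversed index order
+  // (e.g. output[1] = x10[32], output[2] = x9[16]). Only the first 32
+  // outputs are stored, since the upper half of the coefficients is
+  // discarded for 64-point transforms.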
+ output[0] = x6[0]; + output[1] = x10[32]; + output[2] = x9[16]; + output[3] = x10[48]; + output[4] = x8[8]; + output[5] = x10[40]; + output[6] = x9[24]; + output[7] = x10[56]; + output[8] = x7[4]; + output[9] = x10[36]; + output[10] = x9[20]; + output[11] = x10[52]; + output[12] = x8[12]; + output[13] = x10[44]; + output[14] = x9[28]; + output[15] = x10[60]; + output[16] = x6[2]; + output[17] = x10[34]; + output[18] = x9[18]; + output[19] = x10[50]; + output[20] = x8[10]; + output[21] = x10[42]; + output[22] = x9[26]; + output[23] = x10[58]; + output[24] = x7[6]; + output[25] = x10[38]; + output[26] = x9[22]; + output[27] = x10[54]; + output[28] = x8[14]; + output[29] = x10[46]; + output[30] = x9[30]; + output[31] = x10[62]; +} + +static void lowbd_fwd_txfm2d_64x64_neon(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + int16x8_t buf0[64], buf1[512]; + const transform_1d_lbd_8_neon col_txfm = fdct8x64_neon; + + for (int i = 0; i < 8; i++) { + load_buffer_s16_x8(input + 8 * i, stride, buf0, 64); + col_txfm(buf0, buf0, 13); + shift_right_2_round_s16_x8(buf0, buf0, 64); + for (int j = 0; j < 4; ++j) { + transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 64 + 8 * i); + } + } + for (int i = 0; i < 4; i++) { + int32x4_t bufA[64]; + int32x4_t bufB[64]; + int16x8_t *buf = buf1 + 64 * i; + for (int j = 0; j < 64; ++j) { + bufA[j] = vmovl_s16(vget_low_s16(buf[j])); + bufB[j] = vmovl_s16(vget_high_s16(buf[j])); + } + fdct64_neon(bufA, bufA, 10); + fdct64_neon(bufB, bufB, 10); + shift_right_2_round_s32_x4(bufA, bufA, 32); + shift_right_2_round_s32_x4(bufB, bufB, 32); + store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32); + } +} + +static void lowbd_fwd_txfm2d_64x32_neon(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + int16x8_t buf0[64], buf1[256]; + const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type]; + + for (int i = 0; i < 8; i++) { + col_txfm(input + 8 * i, buf0, stride, 12); + shift_right_4_round_s16_x8(buf0, buf0, 32); + for (int j = 0; j < 4; ++j) { + transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 64 + 8 * i); + } + } + assert(tx_type == DCT_DCT); + for (int i = 0; i < 4; i++) { + int32x4_t bufA[64]; + int32x4_t bufB[64]; + int16x8_t *buf = buf1 + 64 * i; + for (int j = 0; j < 64; ++j) { + bufA[j] = vmovl_s16(vget_low_s16(buf[j])); + bufB[j] = vmovl_s16(vget_high_s16(buf[j])); + } + fdct64_neon(bufA, bufA, 11); + fdct64_neon(bufB, bufB, 11); + shift_right_2_round_s32_x4(bufA, bufA, 32); + shift_right_2_round_s32_x4(bufB, bufB, 32); + round_shift_sqrt2_s32_s32_4xn_neon(bufA, bufA, 32); + round_shift_sqrt2_s32_s32_4xn_neon(bufB, bufB, 32); + store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32); + } +} + +static void lowbd_fwd_txfm2d_32x64_neon(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + int16x8_t buf0[64], buf1[256]; + const transform_1d_lbd_8_neon col_txfm = fdct8x64_neon; + + for (int i = 0; i < 4; i++) { + load_buffer_s16_x8(input + 8 * i, stride, buf0, 64); + col_txfm(buf0, buf0, 13); + shift_right_2_round_s16_x8(buf0, buf0, 64); + for (int j = 0; j < 4; ++j) { + transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 32 + 8 * i); + } + } + + for (int i = 0; i < 4; i++) { + int32x4_t bufA[32]; + int32x4_t bufB[32]; + int16x8_t *buf = buf1 + 32 * i; + for (int j = 0; j < 32; ++j) { + bufA[j] = 
vmovl_s16(vget_low_s16(buf[j])); + bufB[j] = vmovl_s16(vget_high_s16(buf[j])); + } + fdct32_neon(bufA, bufA, 11); + fdct32_neon(bufB, bufB, 11); + shift_right_2_round_s32_x4(bufA, bufA, 32); + shift_right_2_round_s32_x4(bufB, bufB, 32); + round_shift_sqrt2_s32_s32_4xn_neon(bufA, bufA, 32); + round_shift_sqrt2_s32_s32_4xn_neon(bufB, bufB, 32); + store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32); + } +} + +static FwdTxfm2dFunc lowbd_fwd_txfm_func_ls[TX_SIZES_ALL] = { + lowbd_fwd_txfm2d_4x4_neon, // 4x4 transform + lowbd_fwd_txfm2d_8x8_neon, // 8x8 transform + lowbd_fwd_txfm2d_16x16_neon, // 16x16 transform + lowbd_fwd_txfm2d_32x32_neon, // 32x32 transform + lowbd_fwd_txfm2d_64x64_neon, // 64x64 transform + lowbd_fwd_txfm2d_4x8_neon, // 4x8 transform + lowbd_fwd_txfm2d_8x4_neon, // 8x4 transform + lowbd_fwd_txfm2d_8x16_neon, // 8x16 transform + lowbd_fwd_txfm2d_16x8_neon, // 16x8 transform + lowbd_fwd_txfm2d_16x32_neon, // 16x32 transform + lowbd_fwd_txfm2d_32x16_neon, // 32x16 transform + lowbd_fwd_txfm2d_32x64_neon, // 32x64 transform + lowbd_fwd_txfm2d_64x32_neon, // 64x32 transform + lowbd_fwd_txfm2d_4x16_neon, // 4x16 transform + lowbd_fwd_txfm2d_16x4_neon, // 16x4 transform + lowbd_fwd_txfm2d_8x32_neon, // 8x32 transform + lowbd_fwd_txfm2d_32x8_neon, // 32x8 transform + lowbd_fwd_txfm2d_16x64_neon, // 16x64 transform + lowbd_fwd_txfm2d_64x16_neon, // 64x16 transform +}; + +void av1_lowbd_fwd_txfm_neon(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + FwdTxfm2dFunc fwd_txfm2d_func = lowbd_fwd_txfm_func_ls[txfm_param->tx_size]; + if (txfm_param->lossless && txfm_param->tx_size == TX_4X4) { + av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param); + } else { + fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); + } +} diff --git a/third_party/aom/av1/encoder/arm/neon/av1_highbd_quantize_neon.c b/third_party/aom/av1/encoder/arm/neon/av1_highbd_quantize_neon.c new file mode 100644 index 0000000000..11d3def16b --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/av1_highbd_quantize_neon.c @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2022, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <arm_neon.h> + +#include "config/aom_config.h" + +#include "aom_dsp/arm/mem_neon.h" + +#include "av1/common/quant_common.h" +#include "av1/encoder/av1_quantize.h" + +static INLINE uint16x4_t quantize_4(const tran_low_t *coeff_ptr, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + int32x4_t v_quant_s32, + int32x4_t v_dequant_s32, + int32x4_t v_round_s32, int log_scale) { + const int32x4_t v_coeff = vld1q_s32(coeff_ptr); + const int32x4_t v_coeff_sign = + vreinterpretq_s32_u32(vcltq_s32(v_coeff, vdupq_n_s32(0))); + const int32x4_t v_log_scale = vdupq_n_s32(log_scale); + const int32x4_t v_abs_coeff = vabsq_s32(v_coeff); + // ((abs_coeff << (1 + log_scale)) >= dequant_ptr[rc01]) + const int32x4_t v_abs_coeff_scaled = + vshlq_s32(v_abs_coeff, vdupq_n_s32(1 + log_scale)); + const uint32x4_t v_mask = vcgeq_s32(v_abs_coeff_scaled, v_dequant_s32); + // const int64_t tmp = vmask ? (int64_t)abs_coeff + log_scaled_round : 0 + const int32x4_t v_tmp = vandq_s32(vaddq_s32(v_abs_coeff, v_round_s32), + vreinterpretq_s32_u32(v_mask)); + // const int abs_qcoeff = (int)((tmp * quant) >> (16 - log_scale)); + const int32x4_t v_abs_qcoeff = + vqdmulhq_s32(vshlq_s32(v_tmp, v_log_scale), v_quant_s32); + // qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + const int32x4_t v_qcoeff = + vsubq_s32(veorq_s32(v_abs_qcoeff, v_coeff_sign), v_coeff_sign); + // vshlq_s32 will shift right if shift value is negative. + const int32x4_t v_abs_dqcoeff = + vshlq_s32(vmulq_s32(v_abs_qcoeff, v_dequant_s32), vnegq_s32(v_log_scale)); + // dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + const int32x4_t v_dqcoeff = + vsubq_s32(veorq_s32(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign); + + vst1q_s32(qcoeff_ptr, v_qcoeff); + vst1q_s32(dqcoeff_ptr, v_dqcoeff); + + // Used to find eob.
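+  // Build a per-lane nonzero mask and narrow it to 16 bits so that it can
+  // be combined with the 16-bit iscan values in get_max_lane_eob().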
+ const uint32x4_t nz_qcoeff_mask = vcgtq_s32(v_abs_qcoeff, vdupq_n_s32(0)); + return vmovn_u32(nz_qcoeff_mask); +} + +static INLINE int16x8_t get_max_lane_eob(const int16_t *iscan, + int16x8_t v_eobmax, + uint16x8_t v_mask) { + const int16x8_t v_iscan = vld1q_s16(&iscan[0]); + const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, vdupq_n_s16(1)); + const int16x8_t v_nz_iscan = vbslq_s16(v_mask, v_iscan_plus1, vdupq_n_s16(0)); + return vmaxq_s16(v_eobmax, v_nz_iscan); +} + +static INLINE uint16_t get_max_eob(int16x8_t v_eobmax) { +#if AOM_ARCH_AARCH64 + return (uint16_t)vmaxvq_s16(v_eobmax); +#else + const int16x4_t v_eobmax_3210 = + vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax)); + const int64x1_t v_eobmax_xx32 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32); + const int16x4_t v_eobmax_tmp = + vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32)); + const int64x1_t v_eobmax_xxx3 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16); + const int16x4_t v_eobmax_final = + vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); + return (uint16_t)vget_lane_s16(v_eobmax_final, 0); +#endif +} + +void av1_highbd_quantize_fp_neon( + const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, int log_scale) { + (void)scan; + (void)zbin_ptr; + (void)quant_shift_ptr; + + const int16x4_t v_quant = vld1_s16(quant_ptr); + const int16x4_t v_dequant = vld1_s16(dequant_ptr); + const int16x4_t v_zero = vdup_n_s16(0); + const uint16x4_t v_round_select = vcgt_s16(vdup_n_s16(log_scale), v_zero); + const int16x4_t v_round_no_scale = vld1_s16(round_ptr); + const int16x4_t v_round_log_scale = + vqrdmulh_n_s16(v_round_no_scale, (int16_t)(1 << (15 - log_scale))); + const int16x4_t v_round = + vbsl_s16(v_round_select, v_round_log_scale, v_round_no_scale); + int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero); + int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15); + int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero); + uint16x4_t v_mask_lo, v_mask_hi; + int16x8_t v_eobmax = vdupq_n_s16(-1); + + // DC and first 3 AC + v_mask_lo = quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant_s32, + v_dequant_s32, v_round_s32, log_scale); + + // overwrite the DC constants with AC constants + v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1); + v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1); + v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1); + + // 4 more AC + v_mask_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4, + v_quant_s32, v_dequant_s32, v_round_s32, log_scale); + + // Find the max lane eob for the first 8 coeffs. + v_eobmax = + get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi)); + + count -= 8; + do { + coeff_ptr += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + iscan += 8; + v_mask_lo = quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant_s32, + v_dequant_s32, v_round_s32, log_scale); + v_mask_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4, + v_quant_s32, v_dequant_s32, v_round_s32, log_scale); + // Find the max lane eob for 8 coeffs. 
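+    // As above, eob is tracked as max(iscan[rc] + 1) over the nonzero
+    // coefficients, so an all-zero block yields *eob_ptr == 0.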
+    v_eobmax = +        get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi)); + count -= 8; + } while (count); + + *eob_ptr = get_max_eob(v_eobmax); +} diff --git a/third_party/aom/av1/encoder/arm/neon/av1_k_means_neon.c b/third_party/aom/av1/encoder/arm/neon/av1_k_means_neon.c new file mode 100644 index 0000000000..d13cc65ae0 --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/av1_k_means_neon.c @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "aom_dsp/arm/sum_neon.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +static int32x4_t k_means_multiply_add_neon(const int16x8_t a) { + const int32x4_t l = vmull_s16(vget_low_s16(a), vget_low_s16(a)); + const int32x4_t h = vmull_s16(vget_high_s16(a), vget_high_s16(a)); +#if AOM_ARCH_AARCH64 + return vpaddq_s32(l, h); +#else + const int32x2_t dl = vpadd_s32(vget_low_s32(l), vget_high_s32(l)); + const int32x2_t dh = vpadd_s32(vget_low_s32(h), vget_high_s32(h)); + return vcombine_s32(dl, dh); +#endif +} + +void av1_calc_indices_dim1_neon(const int16_t *data, const int16_t *centroids, + uint8_t *indices, int64_t *total_dist, int n, + int k) { + int64x2_t sum = vdupq_n_s64(0); + int16x8_t cents[PALETTE_MAX_SIZE]; + for (int j = 0; j < k; ++j) { + cents[j] = vdupq_n_s16(centroids[j]); + } + + for (int i = 0; i < n; i += 8) { + const int16x8_t in = vld1q_s16(data); + uint16x8_t ind = vdupq_n_u16(0); + // Compute the distance to the first centroid. + int16x8_t dist_min = vabdq_s16(in, cents[0]); + + for (int j = 1; j < k; ++j) { + // Compute the distance to the centroid. + const int16x8_t dist = vabdq_s16(in, cents[j]); + // Compare to the minimal one. + const uint16x8_t cmp = vcgtq_s16(dist_min, dist); + dist_min = vminq_s16(dist_min, dist); + const uint16x8_t ind1 = vdupq_n_u16(j); + ind = vbslq_u16(cmp, ind1, ind); + } + if (total_dist) { + // Square, convert to 32 bit and add together. + const int32x4_t l = + vmull_s16(vget_low_s16(dist_min), vget_low_s16(dist_min)); + const int32x4_t sum32_tmp = + vmlal_s16(l, vget_high_s16(dist_min), vget_high_s16(dist_min)); + // Pairwise sum, convert to 64 bit and add to sum. + sum = vpadalq_s32(sum, sum32_tmp); + } + vst1_u8(indices, vmovn_u16(ind)); + indices += 8; + data += 8; + } + if (total_dist) { + *total_dist = horizontal_add_s64x2(sum); + } +} + +void av1_calc_indices_dim2_neon(const int16_t *data, const int16_t *centroids, + uint8_t *indices, int64_t *total_dist, int n, + int k) { + int64x2_t sum = vdupq_n_s64(0); + uint32x4_t ind[2]; + int16x8_t cents[PALETTE_MAX_SIZE]; + for (int j = 0; j < k; ++j) { + const int16_t cx = centroids[2 * j], cy = centroids[2 * j + 1]; + const int16_t cxcy[8] = { cx, cy, cx, cy, cx, cy, cx, cy }; + cents[j] = vld1q_s16(cxcy); + } + + for (int i = 0; i < n; i += 8) { + for (int l = 0; l < 2; ++l) { + const int16x8_t in = vld1q_s16(data); + ind[l] = vdupq_n_u32(0); + // Compute the distance to the first centroid. + int16x8_t d1 = vsubq_s16(in, cents[0]); + int32x4_t dist_min = k_means_multiply_add_neon(d1); + + for (int j = 1; j < k; ++j) { + // Compute the distance to the centroid.
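+        // The distance here is the squared Euclidean distance: the
+        // interleaved (x, y) differences are squared and pairwise-added by
+        // k_means_multiply_add_neon(), giving one 32-bit distance per pixel.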
+        d1 = vsubq_s16(in, cents[j]); + const int32x4_t dist = k_means_multiply_add_neon(d1); + // Compare to the minimal one. + const uint32x4_t cmp = vcgtq_s32(dist_min, dist); + dist_min = vminq_s32(dist_min, dist); + const uint32x4_t ind1 = vdupq_n_u32(j); + ind[l] = vbslq_u32(cmp, ind1, ind[l]); + } + if (total_dist) { + // Pairwise sum, convert to 64 bit and add to sum. + sum = vpadalq_s32(sum, dist_min); + } + data += 8; + } + // Cast to 8 bit and store. + vst1_u8(indices, + vmovn_u16(vcombine_u16(vmovn_u32(ind[0]), vmovn_u32(ind[1])))); + indices += 8; + } + if (total_dist) { + *total_dist = horizontal_add_s64x2(sum); + } +} diff --git a/third_party/aom/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c b/third_party/aom/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c new file mode 100644 index 0000000000..18cd0ce4c0 --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c @@ -0,0 +1,360 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "aom/aom_integer.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_ports/mem.h" +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "av1/common/reconinter.h" +#include "av1/encoder/context_tree.h" +#include "av1/encoder/av1_temporal_denoiser.h" + +// Compute the sum of all pixel differences of this MB. +static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) { +#if AOM_ARCH_AARCH64 + return vaddlvq_s8(v_sum_diff_total); +#else + const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff_total); + const int32x4_t fedc_ba98_7654_3210 = vpaddlq_s16(fe_dc_ba_98_76_54_32_10); + const int64x2_t fedcba98_76543210 = vpaddlq_s32(fedc_ba98_7654_3210); + const int64x1_t x = vqadd_s64(vget_high_s64(fedcba98_76543210), + vget_low_s64(fedcba98_76543210)); + const int sum_diff = vget_lane_s32(vreinterpret_s32_s64(x), 0); + return sum_diff; +#endif +} + +// Denoise a 16x1 vector. +static INLINE int8x16_t denoiser_16x1_neon( + const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y, + const uint8x16_t v_level1_threshold, const uint8x16_t v_level2_threshold, + const uint8x16_t v_level3_threshold, const uint8x16_t v_level1_adjustment, + const uint8x16_t v_delta_level_1_and_2, + const uint8x16_t v_delta_level_2_and_3, int8x16_t v_sum_diff_total) { + const uint8x16_t v_sig = vld1q_u8(sig); + const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y); + + /* Calculate absolute difference and sign masks. */ + const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y); + + /* Figure out which level we are in. */ + const uint8x16_t v_level1_mask = vcleq_u8(v_level1_threshold, v_abs_diff); + const uint8x16_t v_level2_mask = vcleq_u8(v_level2_threshold, v_abs_diff); + const uint8x16_t v_level3_mask = vcleq_u8(v_level3_threshold, v_abs_diff); + + /* Calculate absolute adjustments for level 1, 2 and 3.
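+   * Level 1 applies the base adjustment; level 2 adds delta_level_1_and_2
+   * (1) on top of it and level 3 additionally adds delta_level_2_and_3 (2),
+   * so larger pixel differences receive larger corrections.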
*/ + const uint8x16_t v_level2_adjustment = + vandq_u8(v_level2_mask, v_delta_level_1_and_2); + const uint8x16_t v_level3_adjustment = + vandq_u8(v_level3_mask, v_delta_level_2_and_3); + const uint8x16_t v_level1and2_adjustment = + vaddq_u8(v_level1_adjustment, v_level2_adjustment); + const uint8x16_t v_level1and2and3_adjustment = + vaddq_u8(v_level1and2_adjustment, v_level3_adjustment); + + /* Figure adjustment absolute value by selecting between the absolute + * difference if in level0 or the value for level 1, 2 and 3. + */ + const uint8x16_t v_abs_adjustment = + vbslq_u8(v_level1_mask, v_level1and2and3_adjustment, v_abs_diff); + + /* Calculate positive and negative adjustments. Apply them to the signal + * and accumulate them. Adjustments are less than eight and the maximum + * sum of them (7 * 16) can fit in a signed char. + */ + const uint8x16_t v_pos_adjustment = + vandq_u8(v_diff_pos_mask, v_abs_adjustment); + const uint8x16_t v_neg_adjustment = + vandq_u8(v_diff_neg_mask, v_abs_adjustment); + + uint8x16_t v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment); + v_running_avg_y = vqsubq_u8(v_running_avg_y, v_neg_adjustment); + + /* Store results. */ + vst1q_u8(running_avg_y, v_running_avg_y); + + /* Sum all the accumulators to have the sum of all pixel differences + * for this macroblock. + */ + { + const int8x16_t v_sum_diff = + vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment), + vreinterpretq_s8_u8(v_neg_adjustment)); + v_sum_diff_total = vaddq_s8(v_sum_diff_total, v_sum_diff); + } + return v_sum_diff_total; +} + +static INLINE int8x16_t denoiser_adjust_16x1_neon( + const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y, + const uint8x16_t k_delta, int8x16_t v_sum_diff_total) { + uint8x16_t v_running_avg_y = vld1q_u8(running_avg_y); + const uint8x16_t v_sig = vld1q_u8(sig); + const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y); + + /* Calculate absolute difference and sign masks. */ + const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y); + // Clamp absolute difference to delta to get the adjustment. + const uint8x16_t v_abs_adjustment = vminq_u8(v_abs_diff, (k_delta)); + + const uint8x16_t v_pos_adjustment = + vandq_u8(v_diff_pos_mask, v_abs_adjustment); + const uint8x16_t v_neg_adjustment = + vandq_u8(v_diff_neg_mask, v_abs_adjustment); + + v_running_avg_y = vqsubq_u8(v_running_avg_y, v_pos_adjustment); + v_running_avg_y = vqaddq_u8(v_running_avg_y, v_neg_adjustment); + + /* Store results. */ + vst1q_u8(running_avg_y, v_running_avg_y); + + { + const int8x16_t v_sum_diff = + vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment), + vreinterpretq_s8_u8(v_pos_adjustment)); + v_sum_diff_total = vaddq_s8(v_sum_diff_total, v_sum_diff); + } + return v_sum_diff_total; +} + +// Denoise 8x8 and 8x16 blocks. +static int av1_denoiser_8xN_neon(const uint8_t *sig, int sig_stride, + const uint8_t *mc_running_avg_y, + int mc_avg_y_stride, uint8_t *running_avg_y, + int avg_y_stride, int increase_denoising, + BLOCK_SIZE bs, int motion_magnitude, + int width) { + int sum_diff_thresh, r, sum_diff = 0; + const int shift_inc = + (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) + ? 1 + : 0; + uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16]; + + const uint8x16_t v_level1_adjustment = vmovq_n_u8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 
4 + shift_inc : 3); + const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1); + const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2); + const uint8x16_t v_level1_threshold = vdupq_n_u8(4 + shift_inc); + const uint8x16_t v_level2_threshold = vdupq_n_u8(8); + const uint8x16_t v_level3_threshold = vdupq_n_u8(16); + + const int b_height = block_size_high[bs] >> 1; + + int8x16_t v_sum_diff_total = vdupq_n_s8(0); + + for (r = 0; r < b_height; ++r) { + memcpy(sig_buffer[r], sig, width); + memcpy(sig_buffer[r] + width, sig + sig_stride, width); + memcpy(mc_running_buffer[r], mc_running_avg_y, width); + memcpy(mc_running_buffer[r] + width, mc_running_avg_y + mc_avg_y_stride, + width); + memcpy(running_buffer[r], running_avg_y, width); + memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width); + v_sum_diff_total = denoiser_16x1_neon( + sig_buffer[r], mc_running_buffer[r], running_buffer[r], + v_level1_threshold, v_level2_threshold, v_level3_threshold, + v_level1_adjustment, v_delta_level_1_and_2, v_delta_level_2_and_3, + v_sum_diff_total); + { + const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]); + const uint8x8_t v_running_buffer_high = vget_high_u8(v_running_buffer); + const uint8x8_t v_running_buffer_low = vget_low_u8(v_running_buffer); + vst1_u8(running_avg_y, v_running_buffer_low); + vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high); + } + // Update pointers for next iteration. + sig += (sig_stride << 1); + mc_running_avg_y += (mc_avg_y_stride << 1); + running_avg_y += (avg_y_stride << 1); + } + + { + sum_diff = horizontal_add_s8x16(v_sum_diff_total); + sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising); + if (abs(sum_diff) > sum_diff_thresh) { + // Before returning to copy the block (i.e., apply no denoising), + // check if we can still apply some (weaker) temporal filtering to + // this block, which would otherwise not be denoised at all. Simplest + // is to apply an additional adjustment to running_avg_y to bring it + // closer to sig. The adjustment is capped by a maximum delta, and + // chosen such that in most cases the resulting sum_diff will be + // within the acceptable range given by sum_diff_thresh. + + // The delta is set by the excess of absolute pixel diff over the + // threshold. + const int delta = + ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1; + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + const uint8x16_t k_delta = vmovq_n_u8(delta); + running_avg_y -= avg_y_stride * (b_height << 1); + for (r = 0; r < b_height; ++r) { + v_sum_diff_total = denoiser_adjust_16x1_neon( + sig_buffer[r], mc_running_buffer[r], running_buffer[r], k_delta, + v_sum_diff_total); + { + const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]); + const uint8x8_t v_running_buffer_high = + vget_high_u8(v_running_buffer); + const uint8x8_t v_running_buffer_low = + vget_low_u8(v_running_buffer); + vst1_u8(running_avg_y, v_running_buffer_low); + vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high); + } + // Update pointers for next iteration. + running_avg_y += (avg_y_stride << 1); + } + sum_diff = horizontal_add_s8x16(v_sum_diff_total); + if (abs(sum_diff) > sum_diff_thresh) { + return COPY_BLOCK; + } + } else { + return COPY_BLOCK; + } + } + } + + return FILTER_BLOCK; +} + +// Denoise 16x16 to 128x128 blocks.
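+// The block is processed in 16-pixel-wide strips; per-strip sums are
+// accumulated in v_sum_diff_total[c][r >> 4] and flushed into sum_diff
+// every 16 rows (or after row 7 for BLOCK_16X8).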
+static int av1_denoiser_NxM_neon(const uint8_t *sig, int sig_stride, + const uint8_t *mc_running_avg_y, + int mc_avg_y_stride, uint8_t *running_avg_y, + int avg_y_stride, int increase_denoising, + BLOCK_SIZE bs, int motion_magnitude) { + const int shift_inc = + (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) + ? 1 + : 0; + const uint8x16_t v_level1_adjustment = vmovq_n_u8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3); + const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1); + const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2); + const uint8x16_t v_level1_threshold = vmovq_n_u8(4 + shift_inc); + const uint8x16_t v_level2_threshold = vdupq_n_u8(8); + const uint8x16_t v_level3_threshold = vdupq_n_u8(16); + + const int b_width = block_size_wide[bs]; + const int b_height = block_size_high[bs]; + const int b_width_shift4 = b_width >> 4; + + int8x16_t v_sum_diff_total[8][8]; + int r, c, sum_diff = 0; + + for (r = 0; r < 8; ++r) { + for (c = 0; c < b_width_shift4; ++c) { + v_sum_diff_total[c][r] = vdupq_n_s8(0); + } + } + + for (r = 0; r < b_height; ++r) { + for (c = 0; c < b_width_shift4; ++c) { + v_sum_diff_total[c][r >> 4] = denoiser_16x1_neon( + sig, mc_running_avg_y, running_avg_y, v_level1_threshold, + v_level2_threshold, v_level3_threshold, v_level1_adjustment, + v_delta_level_1_and_2, v_delta_level_2_and_3, + v_sum_diff_total[c][r >> 4]); + + // Update pointers for next iteration. + sig += 16; + mc_running_avg_y += 16; + running_avg_y += 16; + } + + if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) { + for (c = 0; c < b_width_shift4; ++c) { + sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]); + } + } + + // Update pointers for next iteration. + sig = sig - b_width + sig_stride; + mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride; + running_avg_y = running_avg_y - b_width + avg_y_stride; + } + + { + const int sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising); + if (abs(sum_diff) > sum_diff_thresh) { + const int delta = + ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1; + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + const uint8x16_t k_delta = vdupq_n_u8(delta); + sig -= sig_stride * b_height; + mc_running_avg_y -= mc_avg_y_stride * b_height; + running_avg_y -= avg_y_stride * b_height; + sum_diff = 0; + + for (r = 0; r < b_height; ++r) { + for (c = 0; c < b_width_shift4; ++c) { + v_sum_diff_total[c][r >> 4] = + denoiser_adjust_16x1_neon(sig, mc_running_avg_y, running_avg_y, + k_delta, v_sum_diff_total[c][r >> 4]); + + // Update pointers for next iteration. + sig += 16; + mc_running_avg_y += 16; + running_avg_y += 16; + } + if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) { + for (c = 0; c < b_width_shift4; ++c) { + sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]); + } + } + + sig = sig - b_width + sig_stride; + mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride; + running_avg_y = running_avg_y - b_width + avg_y_stride; + } + + if (abs(sum_diff) > sum_diff_thresh) { + return COPY_BLOCK; + } + } else { + return COPY_BLOCK; + } + } + } + return FILTER_BLOCK; +} + +int av1_denoiser_filter_neon(const uint8_t *sig, int sig_stride, + const uint8_t *mc_avg, int mc_avg_stride, + uint8_t *avg, int avg_stride, + int increase_denoising, BLOCK_SIZE bs, + int motion_magnitude) { + // Rank by frequency of the block type to have an early termination. 
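+  // Blocks at least 16 pixels wide take the NxM path; 8-pixel-wide blocks
+  // take the 8xN path, which packs two 8-pixel rows into one 16-byte
+  // vector. Any other size returns COPY_BLOCK (no denoising).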
+  if (bs == BLOCK_16X16 || bs == BLOCK_32X32 || bs == BLOCK_64X64 || + bs == BLOCK_128X128 || bs == BLOCK_128X64 || bs == BLOCK_64X128 || + bs == BLOCK_16X32 || bs == BLOCK_16X8 || bs == BLOCK_32X16 || + bs == BLOCK_32X64 || bs == BLOCK_64X32) { + return av1_denoiser_NxM_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg, + avg_stride, increase_denoising, bs, + motion_magnitude); + } else if (bs == BLOCK_8X8 || bs == BLOCK_8X16) { + return av1_denoiser_8xN_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg, + avg_stride, increase_denoising, bs, + motion_magnitude, 8); + } + return COPY_BLOCK; +} diff --git a/third_party/aom/av1/encoder/arm/neon/cnn_neon.c b/third_party/aom/av1/encoder/arm/neon/cnn_neon.c new file mode 100644 index 0000000000..8e686260d0 --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/cnn_neon.c @@ -0,0 +1,1144 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> +#include <assert.h> +#include <math.h> +#include <stdbool.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/sum_neon.h" +#include "av1/common/av1_common_int.h" +#include "av1/encoder/cnn.h" +#include "av1/encoder/partition_cnn_weights.h" + +// The CNN weights used in av1_cnn_convolve_no_maxpool_padding_valid are +// declared (av1_intra_mode_cnn_partition_cnn_layer_[01234]_kernel) in +// partition_cnn_weights.h. However, to enable linear memory access, rearrange +// the weight tables here.
+static const float weights_layer_1[] = { + 0.228403f, 0.031690f, -0.251710f, -0.046230f, 0.413294f, -0.236732f, + -0.038291f, 0.210766f, 0.427196f, -0.384319f, -0.439463f, 0.366015f, + 0.112263f, -0.144168f, -0.075017f, 0.119629f, 0.325200f, -0.678246f, + -0.370826f, -0.341362f, -0.503392f, 0.400884f, 0.465214f, -0.360847f, + 0.187100f, -0.190757f, -0.131906f, 0.121492f, -0.303556f, -0.007658f, + 0.380077f, -0.066394f, -0.016043f, -1.490730f, -0.120682f, 0.132062f, + 0.086185f, -0.042766f, -0.087069f, 0.029426f, 0.309583f, -0.029985f, + -0.297429f, -0.018139f, -0.688828f, 0.756607f, 0.706410f, -0.696826f, + -0.087793f, -0.023304f, -0.012332f, -0.018043f, -0.410268f, 0.352143f, + 0.391284f, -0.363178f, -0.295034f, 0.160246f, -0.149446f, 0.260145f, + -0.252249f, 0.190826f, 0.251206f, -0.270796f, -0.979219f, 0.884880f, + 0.962057f, -0.847601f, -0.011053f, 0.118765f, -0.028428f, -0.020138f, + 0.400274f, -0.382845f, -0.462766f, 0.390654f, 0.361223f, -0.320068f, + -0.372084f, 0.313196f, 0.241933f, -0.416614f, -0.008722f, -0.255078f, + 0.078730f, -0.381935f, -0.204577f, 0.159768f, 0.071853f, -0.126294f, + -0.036186f, -0.007900f, 0.380071f, -0.298882f, 0.387941f, -0.267350f, + -0.586802f, 0.477785f, -0.000013f, 0.197296f, -0.079154f, -0.005811f, + -0.044300f, -0.021192f, -0.020879f, -0.005265f, 0.082277f, -0.139132f, + -0.239237f, 0.440234f, -0.542342f, 0.378360f, -0.070974f, 0.272702f, + -0.278939f, -0.044948f, -0.134197f, -0.007172f, -0.353628f, -0.128091f, + 0.357458f, -0.037614f, -0.144983f, 0.220623f, -0.003394f, -0.070166f, + 0.200370f, -0.166037f, 0.224448f, -0.012990f, -0.098853f, 0.008613f, + -0.017669f, 0.070641f, 0.174530f, -0.119822f, -0.065096f, 0.118487f, + -0.024764f, -0.050466f, 0.066631f, -0.075896f, -0.062363f, 0.212604f, + -0.377322f, 0.306306f, -0.399733f, 0.238624f, 0.233571f, -0.344080f, + 0.462491f, -0.565210f, -0.035074f, -0.010459f, 0.084382f, 0.052294f, + 0.065714f, 0.013716f, 0.135036f, 0.000588f, 0.181079f, -0.566344f, + 0.395561f, -0.398509f, 0.450017f, -1.462710f, 1.138280f, -0.447774f, + 0.247936f, -0.417067f, 0.165997f, -0.458632f, -0.018527f, 0.308461f, + 0.541266f, 0.162257f, 0.601786f, -1.275840f, -0.373404f, -0.589747f, + 0.026539f, -0.219327f, 0.142972f, -0.018496f, 0.075204f, -0.775190f, + 0.237307f, -0.348252f, 0.117792f, -0.094332f, 0.363101f, -0.065025f, + 0.816662f, 0.590110f, 0.752202f, -0.308599f, 0.258337f, -0.842085f, + 0.695788f, -0.205615f, 0.093930f, -0.392536f, 0.463093f, -0.432456f, + 0.041660f, -0.827264f, 0.309128f, -0.354658f, 0.451957f, -1.406640f, + 0.773192f, -0.892943f, 0.134856f, -0.467808f, 0.306003f, -0.226560f, + 0.086865f, -0.104102f, 0.148098f, -0.082658f, 0.316655f, -1.028310f, + 0.741566f, -0.345326f, 0.052379f, -0.275613f, 0.191765f, -0.162391f, + 0.000976f, 0.093061f, 0.068649f, 0.033582f, 0.239727f, -0.647769f, + 0.218493f, -0.397120f, 0.268229f, -0.303424f, 0.185393f, -0.314189f, + 0.101728f, -0.163083f, -0.084989f, 0.136783f, -0.264346f, 0.465914f, + 0.220395f, -0.252968f, -0.326661f, 0.271483f, 0.374717f, -0.311570f, + -0.082119f, 0.020870f, 0.091975f, -0.030582f, -0.487148f, 0.198912f, + 0.024554f, -0.749363f, -0.102267f, 0.097787f, 0.141459f, -0.110706f, + 0.079467f, -0.082570f, -0.347567f, 0.341043f, -0.137871f, 0.112319f, + 0.064733f, -0.082869f, 0.269999f, -0.408184f, -0.183443f, 0.180608f, + 0.223345f, -0.357376f, -0.244593f, 0.355348f, -0.072701f, -0.034311f, + 0.096544f, 0.016407f, 0.417550f, -0.367772f, -0.484535f, 0.405977f, + 0.314243f, -0.099622f, -0.192218f, -0.012780f, 0.434551f, -0.399047f, + -0.531499f, 
0.484513f, -0.691352f, 0.872823f, 1.207720f, -1.377490f, + 0.006872f, -0.041453f, 0.007845f, 0.007463f, 0.467299f, -0.476372f, + -0.452606f, 0.452357f, 0.447332f, -0.365632f, -0.332435f, 0.300284f, + -0.290504f, 0.255410f, 0.310921f, -0.293717f, -0.616299f, 0.594207f, + 0.461347f, -0.449439f, 0.278455f, 0.285085f, -1.201340f, -0.016463f, + 0.549095f, 0.610375f, -4.608530f, -1.727390f, 0.150404f, -0.012846f, + -0.481148f, -0.182257f, 0.918796f, 0.213872f, 1.050410f, 0.681526f, + -0.458777f, -0.710395f, -2.347200f, -0.277197f, 0.213294f, 0.337551f, + -0.177710f, -0.152136f, 0.167666f, 0.308403f, -1.248500f, -0.565367f, + 0.122054f, 0.087874f, -0.476556f, -0.083548f, -0.358734f, -0.073131f, + -0.146320f, -2.241960f, 0.697639f, 0.545581f, -1.889700f, -0.267725f, + 0.433045f, 0.298224f, -0.338508f, 0.250226f, 0.405675f, 0.447201f, + -1.184690f, -0.473447f, 0.307403f, 0.711236f, -3.191560f, -1.663980f, + 0.165201f, 0.101360f, -0.624451f, -0.173269f, 0.089795f, 0.227478f, + -0.136664f, 0.007907f, 0.131079f, 0.605374f, -2.991620f, -1.723790f, + 0.082428f, 0.006781f, -0.348732f, -0.019271f, -0.032040f, -0.067078f, + -0.437166f, -0.144472f, 0.069844f, 0.194625f, -0.162284f, -0.374656f, + 0.056472f, -0.236524f, -0.114241f, -0.029161f, -0.222078f, -0.053435f, + -0.313938f, -0.555472f, 1.037550f, 0.689968f, 0.575694f, 0.065826f, + -0.659979f, -0.881351f, -0.626417f, -0.953975f, -0.576106f, -0.258708f, + 0.263004f, -0.229847f, 0.463835f, 1.390960f, -2.614480f, -1.272910f, + 0.065780f, -0.058603f, 0.015612f, 0.104703f, 0.198028f, 0.262792f, + 0.253616f, -0.079126f, -0.587381f, -0.739021f, -0.822676f, -0.795512f, + 0.193644f, 0.234643f, -0.034407f, 0.421478f, -0.572610f, -0.290714f, + -0.257803f, -0.644835f, -0.536938f, -0.375899f, -0.651077f, -0.522576f, + 0.562564f, 0.834616f, 0.513893f, 0.649689f, 0.356530f, 0.400716f, + 0.300606f, 0.290505f, 0.584608f, 0.671574f, 0.564584f, 0.419870f, + 0.062061f, 0.018263f, 0.009831f, 0.084103f, -0.128281f, -0.018818f, + -0.187244f, 0.067210f, 0.437147f, 0.442029f, 0.444939f, 0.226661f, + 0.541609f, 0.444280f, 0.302795f, 0.633026f, -0.180374f, 0.265197f, + 0.210404f, -0.118916f, -0.294013f, -0.692627f, -0.402347f, -0.356287f, + 0.387578f, 0.385496f, 0.789542f, 0.690396f, -0.203542f, -0.688546f, + 0.045319f, -0.448747f, -0.157148f, 0.152581f, 0.022360f, 0.058358f, + 0.593007f, 1.131860f, 0.289006f, 1.015560f, 0.144942f, -0.411577f, + 0.264794f, -0.085791f, 0.156996f, 0.200340f, 0.169264f, 0.267615f, + -0.361015f, -0.601842f, -0.442217f, -0.781086f, 0.112938f, 0.385305f, + 0.482454f, 0.470268f, 1.193390f, 0.589642f, 0.127638f, -0.640946f, + 0.540310f, 0.741498f, 0.686937f, 0.435879f, 0.534523f, 0.693119f, + 0.817577f, 0.783109f, 0.021681f, -0.004973f, 0.201236f, -0.086311f, + 0.028628f, 0.227871f, 0.462751f, 0.126832f, -0.389997f, -0.553965f, + -0.343953f, -0.448517f, 0.053129f, -0.115083f, 0.018138f, -0.067131f, + -0.293468f, -0.220700f, 0.074348f, -0.273153f, 0.263637f, 0.122049f, + 0.153025f, 0.076292f, 0.142320f, 0.286734f, 0.100542f, 0.308660f, + -0.759591f, -0.750938f, -0.788799f, -0.853076f, -0.588019f, -0.990063f, + -0.692327f, -0.722904f, 0.084736f, 0.151068f, 0.159606f, 0.147715f, + 1.610180f, 1.950330f, 1.765670f, 2.265110f, 0.008262f, 0.185584f, + 0.039337f, 0.164721f, 0.479446f, 0.314083f, 0.043969f, 0.291320f, + 0.003400f, -0.551190f, 0.060158f, -0.147591f, 0.089117f, 0.042994f, + 0.042802f, 0.127392f, -0.066172f, 0.078370f, 0.051408f, 0.014004f, + 0.086726f, 0.133334f, -0.046733f, 0.155100f, -0.118223f, -0.100778f, + -0.225245f, -0.460397f, 0.892644f, 
1.003770f, 0.405155f, 0.517477f, + 0.184585f, 0.279090f, -0.036477f, 0.198703f, 0.027139f, -0.055728f, + -0.022396f, -0.147319f, 2.275540f, 2.014990f, 2.296800f, 2.081730f, + -0.088713f, 0.105729f, -0.027871f, -0.095047f, 0.012429f, 0.014244f, + -0.014755f, -0.003017f, 1.332700f, 1.300040f, 1.464250f, 1.305030f, + 0.032568f, 0.118042f, 0.079632f, -0.089405f, 0.163905f, 0.146608f, + 0.026502f, 0.065307f, -0.056909f, -0.065052f, 0.069851f, -0.082958f, + 0.023419f, -0.026293f, 0.037616f, -0.048096f, -0.073701f, -0.208295f, + -0.782095f, 0.000523f, 0.374131f, 0.420946f, 0.466151f, 0.349651f, + -0.679275f, -0.745827f, -0.379918f, -0.900107f, 0.044070f, -0.347536f, + -1.224390f, 0.740113f, -0.779966f, 0.510920f, -0.968597f, -0.095630f, + 0.120805f, 0.676803f, -0.164827f, 0.172996f, -0.106720f, 0.197527f, + 0.337561f, 0.571094f, -0.279090f, -0.396697f, -0.253083f, -0.690170f, + -0.363291f, 0.516921f, 0.489391f, -0.920628f, 0.497572f, 0.483864f, + -0.125696f, -0.338123f, -0.041517f, -0.534630f, -0.388465f, -0.784554f, + 0.215227f, 0.055088f, 0.179638f, 0.086997f, 0.569313f, 0.572926f, + 0.137182f, -0.045485f, 0.118087f, 0.210383f, 0.212664f, 0.482443f, + 0.151921f, 0.307947f, -0.084656f, -0.386206f, 0.542277f, -0.207005f, + 0.073792f, -1.013240f, 0.303581f, 0.270527f, 0.265985f, 0.332702f, + 0.848609f, 0.686757f, 0.767212f, 0.316901f, -0.502460f, -0.567092f, + -0.484799f, -0.173350f, -0.426863f, 0.222375f, -0.200267f, -0.523758f, + 0.265180f, -0.175648f, -0.229754f, 0.148740f, 0.402515f, 0.028243f, + -0.366109f, 0.157232f, -0.131564f, 0.055136f, 0.211046f, -0.115542f, + 0.322379f, -0.137768f, -0.247832f, 0.070394f, 0.058530f, -0.295023f, + -0.196022f, -0.109097f, 0.261285f, -0.273585f, -0.240632f, 0.258326f, + -0.077364f, 0.071405f, -0.014766f, -0.008751f, -0.203622f, 0.177818f, + 0.116726f, -0.116735f, -0.723616f, -0.700154f, 0.145082f, -0.184949f, + -0.287076f, 0.150405f, 0.258075f, -0.157764f, -0.120909f, 0.105459f, + 0.113288f, -0.092963f, 0.328183f, -0.300115f, -0.361289f, 0.319792f, + -0.048875f, 0.135673f, 0.132539f, -0.162481f, 0.002109f, 0.065048f, + -0.135969f, 0.061558f, 1.510670f, -0.884925f, -0.827022f, 0.190311f, + -0.060088f, -0.033362f, 0.013354f, 0.002847f, 0.353479f, -0.462538f, + -0.319638f, 0.424484f, 0.199540f, -0.073843f, -0.140621f, 0.072133f, + -0.098662f, 0.070613f, 0.031150f, -0.021869f, -0.511253f, 0.503412f, + 0.565963f, -0.576146f, -1.081700f, 0.047670f, 0.266687f, 0.524804f, + -2.361150f, 0.147823f, 0.594717f, 0.956842f, -1.048220f, 0.127083f, + 0.079581f, 0.065419f, 0.176783f, 0.653953f, 0.260967f, 0.537892f, + -1.207580f, 0.245983f, -0.727067f, 0.071755f, -0.343025f, -0.173435f, + 0.215289f, 0.268578f, -1.158560f, 0.039263f, -0.132888f, 0.217132f, + -0.622195f, -0.071256f, 0.317333f, 0.157614f, -1.588250f, 0.316432f, + -0.736720f, -0.041698f, -1.959280f, 0.083451f, 0.570584f, 0.327620f, + -1.262200f, -0.026738f, 0.231198f, 0.326861f, -1.644200f, -0.143833f, + -0.079495f, 0.493026f, -2.488090f, -0.034046f, 0.165884f, 1.074260f, + -1.076980f, 0.248198f, -0.017987f, 0.421900f, -0.105860f, 0.076710f, + 0.002072f, 0.070264f, -1.734750f, 0.227145f, 0.209220f, 0.851459f, + -0.142369f, 0.066502f, 0.027816f, 0.044321f, -0.186591f, -0.100340f, + 0.115580f, 0.192252f, -0.892114f, 0.209531f, -0.308243f, 0.367968f, + -0.721770f, 0.220224f, -0.062744f, 0.133754f, 0.040416f, 0.190428f, + -0.035428f, 0.162974f, 0.116427f, 0.669393f, 0.278891f, 0.856676f, + 1.060390f, 0.936983f, 0.863355f, 0.990560f, -0.147111f, -0.217883f, + 0.355794f, -0.186530f, -0.275614f, -0.095719f, 0.167346f, 
0.359078f, + -0.079223f, -0.581596f, -0.213134f, -0.431123f, -0.516443f, -0.388628f, + -0.643821f, -0.202345f, 0.426230f, 0.516923f, 0.548131f, 0.555973f, + 0.022286f, 0.361170f, 0.980065f, 0.648400f, -0.056813f, -0.100310f, + -0.439481f, -0.166454f, 0.412449f, 0.509400f, 0.316208f, 0.470293f, + -0.827838f, -1.078380f, -1.047040f, -1.074560f, 0.274555f, -0.316736f, + 0.128818f, 0.228566f, -0.520967f, -0.731674f, -0.687887f, -0.536388f, + -0.031187f, 0.041404f, 0.047821f, 0.064397f, 0.054230f, 0.105059f, + -0.178671f, 0.176847f, -0.394797f, -0.260255f, -0.333734f, -0.162345f, + -0.444650f, -0.928438f, -0.705840f, -0.833162f, 0.306737f, 0.429699f, + 0.417298f, 0.478469f, 0.420903f, 0.676871f, 0.429677f, 0.616921f, + -0.805199f, -0.643391f, -0.304100f, 0.797599f, -0.172157f, 0.429085f, + -0.750676f, 0.149227f, -0.207898f, -0.022534f, -0.341448f, -0.247976f, + 0.095325f, -0.561120f, 0.599694f, -0.025236f, 0.292346f, -0.312001f, + 0.517478f, 0.301457f, -0.106415f, 0.226263f, -0.184163f, -0.114419f, + -0.322702f, 0.172541f, 0.445573f, 0.157213f, 0.670704f, 0.102174f, + -0.234667f, -0.293311f, 0.769852f, 0.038028f, -0.036741f, -0.228060f, + -0.253335f, 0.424054f, -0.597980f, 0.221007f, -0.114741f, -0.411557f, + -0.592201f, 0.442684f, 0.115491f, -0.106896f, -0.028110f, 0.354751f, + -0.248375f, 0.242570f, -0.155856f, 0.280528f, -0.198742f, 0.588725f, + 0.371065f, 0.078197f, 0.114706f, -0.448021f, 0.065255f, 0.133741f, + -0.227522f, -0.047339f, -0.052849f, 0.309480f, 0.597185f, 0.209182f, + 0.226108f, -0.601036f, -0.431672f, -0.172601f, -0.000174f, 0.194292f, + -0.133937f, 0.130676f, 0.059372f, 0.091381f, 0.098751f, -0.150996f, + 0.170514f, -0.085494f, 0.336576f, 0.484004f, 0.033862f, 0.277473f, + -0.231482f, -0.328385f, -0.332739f, -0.626957f, 0.510167f, 0.575861f, + 0.421494f, 0.482540f, -0.636377f, -0.864661f, -0.694180f, -0.420014f, + -0.132781f, 0.017599f, 0.003538f, 0.486934f, 0.133878f, -0.094622f, + 0.016132f, 0.010117f, 0.156680f, -0.022201f, -0.014621f, 0.228445f, + 0.190826f, 0.171580f, 0.579923f, 0.245428f, 0.322713f, 0.480101f, + 0.406320f, 0.412229f, 0.002334f, -0.022349f, 0.074571f, -0.043828f, + 0.290453f, 0.451749f, 0.530376f, 0.271879f, 0.095144f, 0.169450f, + 0.049482f, 0.114605f, -0.635634f, -0.700768f, -0.558538f, -0.537625f, + 0.190255f, -0.308237f, -0.053703f, 0.212489f, 0.056520f, -0.040019f, + 0.089822f, -0.014155f, -0.376004f, -0.448752f, -0.526717f, -0.571440f, + 0.116482f, 0.162321f, 0.147895f, 0.280527f, 0.159037f, -0.095958f, + 0.007931f, -0.086630f, 0.285625f, 0.514914f, 0.208908f, 0.519251f, + 0.309368f, 0.379777f, 0.350565f, 0.487487f, -0.541494f, -0.421836f, + -0.390001f, -0.500696f, -0.905736f, -0.150439f, -0.942304f, -0.566771f, + 0.484233f, 0.767417f, 0.410477f, 0.670196f, 0.070210f, 0.488836f, + 0.372805f, 0.197631f, 0.337892f, 0.524423f, 0.777219f, -0.260955f, + -0.112981f, -0.060088f, -0.200250f, -0.195671f, 0.007584f, 0.252096f, + 0.235511f, 0.366612f, -0.304979f, -0.211068f, -0.420683f, -0.085370f, + 0.085762f, -0.097549f, -0.802509f, -0.468079f, -0.192787f, -0.069670f, + -0.235162f, -0.077772f, -0.441671f, -0.348479f, -0.431434f, -0.108256f, + -0.133779f, 0.017032f, 0.001964f, -0.120647f, -0.187663f, -0.194985f, + -0.231742f, -0.175288f, -0.162639f, 0.245110f, 0.049951f, 0.104229f, + -0.159634f, -0.076545f, -0.022496f, -0.036532f, -0.147028f, -0.034215f, + 0.028213f, -0.059669f, -0.078259f, 0.062993f, -0.124066f, -0.137362f, + -0.129977f, -0.010532f, -0.049090f, -0.189401f, 0.495471f, 0.615778f, + 0.451437f, 0.803526f, 0.523532f, 0.841339f, 0.699528f, 
0.745129f, + 0.246264f, -0.198290f, -0.283620f, 0.189917f, -0.018306f, -0.419097f, + 0.280363f, -0.098085f, 0.138972f, -0.140867f, -0.117025f, 0.098585f, + 0.130979f, 0.268133f, -0.161731f, -0.176629f, -0.357677f, -0.126379f, + 0.553128f, -0.126821f, -0.001511f, -0.010081f, -0.031162f, 0.079203f, + -0.157731f, 0.072865f, 0.535830f, -0.529989f, -0.570075f, 0.295795f, + 0.595613f, -0.449278f, -0.669756f, 0.941452f, 0.356897f, -0.723720f, + -0.115203f, -0.134479f, 0.133048f, 0.109860f, -0.024250f, -0.049732f, + 0.020098f, 0.048356f, -0.048293f, 0.108754f, 0.062548f, -0.238315f, + 0.182700f, 0.312011f, -0.244377f, -0.118012f, 0.012276f, 0.006089f, + 0.098068f, -0.079280f, -0.423987f, -0.411931f, -0.027425f, 0.870280f, + 0.022825f, -0.024481f, -0.036320f, -0.111189f, 0.364539f, -0.244896f, + -0.373060f, 0.266345f, -0.141778f, 0.277549f, 0.059834f, -0.178242f, + -0.686222f, 0.594535f, 0.354546f, -0.272516f, 1.060730f, -1.059810f, + -0.948126f, 0.993267f, 0.116597f, -0.227574f, -0.436144f, -0.333309f, + -0.575746f, -0.828102f, 0.284561f, 0.351668f, -0.080164f, -0.762518f, + -0.511108f, -0.212855f, 0.293892f, -0.548664f, 0.072057f, 0.006748f, + 1.485110f, 0.124687f, 0.727211f, 1.557560f, -0.064383f, -0.022242f, + 0.002921f, -0.151505f, 0.270926f, 0.173632f, -0.640644f, 0.422410f, + -0.240699f, -0.361980f, -0.279864f, -0.055165f, -1.084140f, 0.231705f, + 0.366172f, -0.347698f, -0.097565f, -0.747227f, -0.243033f, 0.941545f, + -0.207460f, -0.353913f, 0.104303f, -0.403151f, 0.203177f, 0.335893f, + -0.229033f, 0.029096f, -0.409634f, -0.179599f, -0.442397f, 0.649114f, + 0.460774f, 0.170906f, -0.043857f, 0.402066f, -0.226896f, -0.199624f, + 0.016650f, 0.207894f, 0.056954f, 0.220329f, 0.374060f, 0.130361f, + -0.303960f, -0.078863f, 0.195410f, 0.729438f, 0.246818f, 0.287730f, + 0.484876f, 0.111488f, -0.168647f, -0.087878f, -0.070089f, -0.341329f, + -0.330280f, 0.259943f, -0.364205f, 0.256555f, -0.756804f, -0.086915f, + 0.777351f, 0.006136f, 0.110348f, 0.248743f, 0.209326f, -0.362741f, + -0.184416f, 0.422446f, 0.565193f, 0.310072f, -0.011212f, -0.765226f, + 0.039466f, 0.301288f, 0.172907f, -1.539450f, 0.606202f, 0.477469f, + 0.045894f, -0.222180f, -0.013192f, -0.064077f, -0.241551f, 0.192914f, + 0.028004f, -0.540538f, 0.437440f, 0.179087f, -0.753204f, -0.001374f, + 1.185930f, -0.151182f, 1.238580f, -1.389900f, 0.277954f, 0.422208f, + 0.041553f, -0.542284f, 0.139019f, -0.148580f, -0.130705f, 0.361830f, + 0.322953f, -0.092371f, 0.120180f, -0.355299f, -0.028057f, 0.128114f, + 0.250947f, -0.349926f, -0.684633f, 0.246175f, 0.186731f, -0.676313f, + 0.060535f, 0.333371f, -0.021172f, -0.421266f, -0.079650f, 0.031359f, + -0.303658f, -0.298286f, 0.119016f, 0.655585f, 0.200175f, -0.887182f, + -0.197539f, -0.318883f, -0.130250f, 0.522487f, -0.092616f, 0.405930f, + -0.281678f, 0.089728f, 0.081814f, -0.781745f, 0.348878f, 0.082274f, + -0.914136f, 1.098810f, 0.855321f, -1.078170f, -0.268018f, 0.246440f, + 0.238347f, -0.027228f, 0.074111f, -0.061197f, -0.063582f, 0.089462f, + -0.040347f, 0.117082f, 0.122772f, -0.162816f, -0.148668f, -0.342856f, + -0.495604f, -1.453630f, -0.045273f, -0.030463f, 0.043766f, 0.047978f, + 0.016910f, -0.009700f, 0.006288f, -0.042556f, 0.632896f, -0.845744f, + -0.516844f, 0.709439f, 0.486166f, -1.203050f, -0.978381f, 0.631876f, + 0.000705f, 0.123858f, -0.001187f, -0.172312f, -0.422668f, 0.241838f, + 0.437400f, -0.268186f, -0.513259f, 0.450209f, 0.542629f, -0.453810f, + -0.207119f, 0.072598f, 0.085066f, -0.018986f, -0.149512f, 0.149521f, + 0.182105f, -0.227200f, -0.363240f, 0.172670f, -0.502932f, 
0.689256f, + 0.093760f, -0.090207f, -0.066803f, 0.056759f, -0.002243f, -0.050662f, + -0.059324f, 0.152943f, -0.701150f, 0.712540f, 0.660349f, -0.654970f, + 0.351772f, -0.303383f, -0.311177f, 0.247653f, 0.013035f, 0.034648f, + -0.137832f, 0.041197f, 0.410265f, 0.345129f, 0.653338f, 0.047050f, + 0.140399f, 0.018613f, -0.012431f, -0.113632f, -0.029928f, 0.051564f, + -0.031349f, 0.151944f, -0.160340f, 0.326798f, -0.458067f, 0.636235f, + 0.243184f, 0.514072f, 2.414450f, 1.421980f, -0.001474f, -0.141389f, + -0.104817f, -0.141882f, -0.026395f, 0.053014f, 0.143885f, -0.207774f, + -0.563846f, -0.242514f, -0.436574f, -0.456796f, -0.520646f, 0.282550f, + -0.684924f, 0.061105f, -0.315884f, -0.392624f, 0.009805f, -0.256597f, + -0.146732f, 0.331039f, 0.362342f, 0.270851f, 0.067679f, -0.071331f, + -0.222423f, 0.081286f, -0.208192f, -0.193816f, -0.008201f, -0.309340f, + 0.167556f, 0.106071f, 0.172254f, -0.163790f, -0.142205f, -0.043182f, + 0.096145f, 0.145037f, -0.066015f, -0.073194f, 0.132237f, -0.088522f, + -0.044292f, -0.487128f, 0.033389f, -0.573548f, 0.185449f, 0.273593f, + 0.147503f, 0.457049f, -0.021539f, 0.090786f, 0.009147f, 0.000899f, + 0.018088f, 0.115791f, -0.079165f, 0.139388f, +}; + +static const float weights_layer_2[] = { + 0.153048f, 0.112901f, 0.136781f, 0.154580f, 0.091610f, 0.045165f, + 0.088490f, 0.116991f, -0.463766f, -0.596567f, -0.567008f, -0.630565f, + 0.141874f, 0.095726f, 0.175427f, 0.145027f, -0.969824f, -1.018190f, + -1.073300f, -1.041130f, -0.070545f, -0.123600f, -0.114967f, -0.169453f, + -0.267458f, -0.147730f, -0.161419f, -0.164894f, -0.117508f, -0.204389f, + -0.122695f, -0.163107f, -0.003903f, -0.030470f, -0.037433f, -0.059568f, + 0.138243f, 0.091019f, 0.160372f, 0.141650f, -0.544565f, -0.620004f, + -0.504503f, -0.429979f, -0.099491f, -0.096384f, -0.155265f, -0.188536f, + 0.084923f, 0.038345f, 0.066706f, 0.122083f, 0.267087f, 0.184419f, + 0.261478f, 0.255746f, -0.245894f, -0.114980f, -0.193880f, -0.227785f, + 0.087536f, 0.095712f, 0.106105f, 0.099353f, -0.059473f, -0.173247f, + -0.202386f, -0.076010f, 0.125928f, 0.100793f, 0.119638f, 0.129623f, + 0.136593f, 0.102984f, 0.156550f, 0.140558f, 0.122524f, 0.051596f, + 0.084164f, 0.123630f, 0.072542f, 0.096063f, 0.083236f, 0.087630f, + 0.025900f, 0.023738f, 0.036385f, 0.053077f, -0.029501f, 0.010544f, + -0.010026f, -0.051268f, 0.086302f, 0.109909f, 0.101385f, 0.127513f, + -0.031869f, 0.005340f, -0.056267f, -0.032955f, 0.032748f, 0.023162f, + 0.092118f, -0.001780f, -0.123612f, -0.183433f, -0.202377f, -0.317516f, + 0.129052f, 0.208112f, 0.145582f, 0.175502f, 0.018476f, 0.036349f, + 0.072417f, 0.061194f, 0.086985f, 0.117086f, 0.072465f, 0.129068f, + 0.020182f, 0.052114f, 0.017878f, 0.010478f, -0.001381f, -0.034644f, + 0.025135f, -0.037748f, 0.004973f, 0.024778f, 0.041816f, 0.032111f, + 0.080268f, 0.124998f, 0.105719f, 0.177047f, -0.072114f, -0.011864f, + -0.076846f, -0.089840f, 0.069993f, 0.089362f, 0.088035f, 0.120621f, + 0.065916f, 0.100946f, -0.006784f, -0.007751f, 0.122039f, 0.126482f, + 0.078629f, 0.140299f, 0.074034f, 0.092464f, 0.089798f, 0.108968f, + 0.075729f, 0.057128f, 0.013570f, 0.021195f, 0.068901f, 0.054022f, + 0.029781f, 0.031404f, -0.209998f, -0.208731f, -0.198310f, -0.212454f, + -0.579168f, -0.490190f, -0.607567f, -0.520541f, 0.083863f, 0.056612f, + 0.030366f, 0.061790f, -0.004874f, -0.057203f, -0.060429f, -0.049145f, + 0.080086f, 0.138602f, 0.223796f, 0.133279f, -0.495954f, -0.612093f, + -0.545393f, -0.562310f, 0.070672f, 0.037702f, 0.139013f, 0.080192f, + -0.111387f, -0.048165f, 0.074359f, -0.042125f, 0.113633f, 
0.106579f, + 0.042633f, 0.102734f, -0.068220f, 0.128423f, -0.181821f, -0.013260f, + -0.108563f, -0.138667f, -0.109304f, -0.131909f, -0.168667f, -0.126870f, + -0.132533f, -0.167096f, -0.184741f, -0.140890f, -0.125361f, -0.150632f, + 0.309013f, 0.364376f, 0.361102f, 0.271566f, 0.116552f, 0.091160f, + 0.096846f, 0.095954f, 0.046972f, 0.080489f, 0.028766f, -0.012223f, + 0.071379f, 0.041535f, -0.000668f, 0.033698f, -0.013493f, -0.027535f, + -0.025804f, -0.012267f, -0.097465f, -0.099232f, -0.208863f, -0.225201f, + -0.475608f, 0.077358f, -0.002872f, 0.163890f, -0.420298f, 0.072114f, + 0.121601f, -0.016727f, 0.573853f, -0.080196f, 0.193053f, 0.053012f, + -0.454179f, 0.058563f, 0.067265f, 0.141154f, 0.412541f, 0.086933f, + 0.030407f, -0.030413f, 0.478757f, -0.097731f, 0.277072f, -0.086393f, + 0.552604f, -0.334201f, 0.091765f, -0.270262f, -1.395060f, 0.271837f, + -0.005335f, 0.240499f, 0.175442f, -0.326329f, -0.019353f, -0.270338f, + -0.459273f, 0.096183f, 0.153046f, 0.135818f, 0.759028f, -0.177673f, + -0.099966f, 0.103363f, 0.697289f, -0.234184f, -0.048706f, -0.116099f, + -0.282575f, 0.025655f, -0.184759f, 0.040658f, -0.558267f, 0.214087f, + -0.095620f, 0.200522f, 0.278996f, 0.031959f, 0.122936f, -0.209196f, + -0.308217f, 0.092917f, 0.113269f, 0.136274f, -0.037046f, 0.017263f, + -0.194183f, 0.089133f, -0.161244f, 0.042799f, 0.030557f, 0.153545f, + -0.355048f, 0.070928f, -0.152852f, 0.102875f, -0.193649f, 0.007916f, + -0.062952f, 0.050602f, 0.073671f, 0.143045f, -5.978970f, -7.013850f, + 0.058713f, 0.076116f, 0.026445f, -0.056599f, -0.005966f, 0.032234f, + 0.006753f, -0.024528f, 0.120308f, 0.179939f, -6.624630f, -7.638680f, + 0.026359f, 0.020758f, 0.194274f, 0.051489f, -0.008491f, -0.028248f, + -0.061328f, -0.134423f, -0.103951f, -0.110877f, 0.042263f, 0.127016f, + 0.012473f, -0.008595f, 0.031357f, 0.087476f, -0.084022f, -0.015590f, + -0.313546f, 0.120072f, 0.123880f, 0.162148f, -6.596560f, -7.358830f, + 0.004797f, -0.003415f, 0.048455f, 0.026737f, -0.103702f, 0.034416f, + -0.003475f, -0.236827f, 0.005378f, 0.048413f, 0.054612f, -0.079359f, + 0.043707f, 0.001085f, 0.023380f, 0.007785f, 0.025938f, -0.052856f, + -0.033421f, 0.022643f, 0.034161f, 0.127681f, -5.019490f, -5.233580f, + -0.128630f, 0.087741f, -0.239834f, -0.377876f, 0.128082f, 0.142730f, + -0.086819f, -0.350927f, 0.089849f, 0.155776f, -6.155120f, -5.721720f, + 0.056110f, 0.008761f, 0.045579f, 0.016762f, -0.134076f, -0.101551f, + -0.096058f, -0.117146f, 0.003527f, -0.056942f, -0.005578f, 0.071287f, + 0.023776f, -0.028003f, -0.075390f, -0.191160f, -0.089672f, -0.104372f, + -0.104750f, -0.080813f, -0.249824f, -0.124479f, -0.243593f, -0.244284f, + -0.554911f, -0.549095f, -0.564693f, -0.475107f, -0.121771f, -0.143441f, + -0.171170f, -0.120920f, 0.109831f, 0.079708f, 0.327295f, 0.308907f, + -0.178785f, -0.428316f, -0.418882f, -0.366750f, -0.139296f, -0.129645f, + -0.081237f, -0.101533f, -0.006256f, -0.146756f, -0.322110f, -0.338865f, + -0.306085f, -0.319592f, -0.454803f, -0.363560f, -0.018557f, 0.006605f, + -0.131198f, -0.077708f, 0.138160f, 0.119611f, 0.271098f, 0.232168f, + 0.027812f, 0.035390f, -0.202503f, -0.091172f, -0.142020f, -0.159929f, + -0.106404f, -0.107433f, -0.381743f, -0.353222f, -0.484159f, -0.469926f, + -0.234659f, -0.315674f, -0.178327f, -0.213485f, -0.096207f, -0.190944f, + -0.118917f, -0.161288f, 0.015996f, 0.060737f, 0.051390f, 0.060876f, + 0.229289f, 0.282418f, 0.250945f, 0.197273f, 0.045131f, -0.008305f, + 0.072024f, 0.044547f, -0.050010f, 0.055504f, 0.001343f, -0.014445f, + 0.254909f, 0.309091f, 0.228249f, 0.274843f, 
0.089778f, -0.046581f, + 0.072714f, 0.126814f, -0.048931f, -0.045743f, -0.151333f, -0.004490f, + 0.179966f, 0.058150f, -0.178622f, -0.088159f, -0.074416f, -0.005821f, + -0.011799f, -0.002225f, -0.069361f, -0.098937f, -0.081575f, -0.034796f, + 0.253792f, 0.301039f, 0.219163f, 0.256027f, 0.058007f, -0.041431f, + 0.040674f, 0.009019f, -0.099670f, -0.099077f, -0.039437f, 0.017946f, + 0.060717f, 0.045796f, 0.109664f, 0.032138f, -0.071094f, 0.023697f, + 0.011335f, -0.030465f, 0.068677f, 0.039345f, -0.045078f, 0.084037f, + 0.135517f, 0.190417f, 0.175578f, 0.155286f, -0.044505f, 0.010826f, + 0.006717f, -0.134715f, 0.068022f, 0.110095f, 0.079966f, 0.034481f, + 0.185804f, 0.188273f, 0.227283f, 0.135935f, 0.033447f, 0.031571f, + -0.014766f, -0.024565f, 0.021792f, 0.017675f, -0.001333f, -0.040069f, + -0.049384f, -0.045256f, -0.014013f, -0.000107f, -0.096928f, -0.111495f, + -0.051225f, -0.060449f, 0.071446f, 0.017294f, -0.004822f, 0.006932f, + 0.020884f, 0.089425f, 0.061097f, -0.038708f, -0.184029f, -0.089541f, + -0.158035f, -0.214607f, -0.377947f, -0.318586f, -0.336977f, -0.323908f, + 0.181612f, 0.140018f, 0.233524f, 0.193366f, -0.254507f, -0.271902f, + -0.197144f, -0.119539f, 0.042162f, 0.000320f, 0.014708f, -0.014228f, + -0.081119f, -0.089326f, 0.001763f, 0.081009f, -0.142618f, -0.160650f, + -0.214597f, -0.202143f, -0.053495f, -0.012819f, -0.071468f, -0.010883f, + 0.072570f, 0.071507f, 0.091045f, 0.083155f, -0.271237f, -0.289211f, + -0.272345f, -0.299411f, 0.031697f, -0.029795f, -0.030045f, -0.013604f, + -0.106843f, -0.045212f, -0.122459f, -0.096936f, 0.059793f, 0.006157f, + 0.028092f, 0.040589f, -0.014560f, -0.008975f, -0.051404f, -0.014309f, + -0.016883f, 0.018332f, 0.040114f, 0.050348f, 0.044921f, -0.002445f, + -0.112396f, 0.014395f, 0.115160f, 0.145350f, -0.166814f, -0.121449f, + 0.155573f, -0.099446f, -0.161661f, 0.187251f, 0.004711f, 0.024318f, + -0.060871f, -0.028311f, -0.098274f, 0.322030f, -0.069242f, -0.153173f, + -0.227428f, -0.293965f, 0.228491f, 0.111413f, -1.354720f, -0.344235f, + 0.866715f, 0.872344f, 0.078789f, -0.384865f, 0.162388f, 0.109018f, + -0.191549f, -0.002638f, 0.305053f, 0.087337f, 0.066506f, -0.055810f, + -0.010984f, -0.056160f, -0.114617f, -0.058478f, 0.022059f, -0.124368f, + -0.130989f, 0.369432f, -0.248898f, -0.003955f, -0.021578f, 0.115991f, + -0.114163f, -0.065232f, 0.339857f, -0.225997f, 0.006282f, -0.125395f, + 0.235082f, -0.347785f, 0.662321f, -0.529182f, 0.153297f, -0.001326f, + -0.026725f, -0.024677f, -0.088065f, -0.116127f, 0.080896f, 0.212542f, + 0.208421f, 0.032047f, -0.211395f, 0.074997f, 0.096659f, 0.096423f, + -0.078643f, 0.106556f, -0.123860f, 0.075609f, 0.066008f, -0.097275f, + -1.000020f, -0.780154f, -0.856922f, -0.964007f, 0.083135f, -0.018922f, + -0.266214f, -0.151480f, 0.051538f, 0.017802f, 0.066774f, -0.021341f, + -0.869494f, -0.935252f, -0.895836f, -0.853871f, -0.160490f, 0.085850f, + -0.029670f, -0.056675f, 0.159989f, 0.166872f, 0.129970f, 0.194377f, + 0.153294f, 0.199593f, 0.037692f, 0.103391f, 0.029335f, -0.085324f, + -0.079326f, -0.077216f, 0.501561f, 0.366168f, 0.330196f, 0.296432f, + -0.977282f, -0.844295f, -1.014870f, -1.098990f, -0.099858f, -0.129552f, + 0.090051f, -0.013378f, 0.081330f, 0.194911f, 0.286501f, 0.177363f, + -0.148250f, -0.111700f, -0.243081f, -0.102918f, 0.161069f, -0.012655f, + -0.071722f, -0.020329f, -0.077828f, -0.041716f, 0.109247f, 0.062229f, + -0.759722f, -0.742756f, -0.563713f, -0.631187f, 0.005911f, 0.268154f, + -0.263769f, 0.087149f, -0.163623f, -0.359600f, -0.464577f, -0.369352f, + -0.515784f, -0.475822f, -0.523485f, 
-0.649813f, -0.112419f, -0.029285f, + 0.021061f, -0.041515f, 0.149133f, -0.254428f, 0.115776f, -0.061892f, + 0.103675f, -0.283363f, 0.005005f, 0.022034f, -0.178454f, 0.035836f, + -0.113702f, -0.217823f, 0.209407f, -0.296257f, 0.187976f, -0.157370f, + -0.127190f, 0.251780f, 0.055633f, 0.294111f, -0.067773f, 0.467190f, + -0.192625f, -0.071084f, -0.445284f, 0.511090f, -0.319728f, 0.267971f, + 0.494929f, -0.586727f, 0.454543f, -0.520675f, -0.085900f, 0.325989f, + -0.131006f, -0.069501f, 0.199927f, -0.218919f, 0.170055f, -0.106538f, + 0.133312f, 0.127629f, -0.561625f, 0.595666f, -0.090927f, 0.363348f, + -0.249246f, 0.063068f, -0.016458f, -0.291045f, -0.040509f, 0.017866f, + 0.304871f, -0.459214f, 0.214390f, -0.238740f, -0.456541f, 0.545848f, + -0.218026f, 0.202475f, 0.128490f, -0.036417f, 0.173885f, -0.049385f, + 0.235514f, -0.132587f, -0.015066f, 0.164638f, 0.196873f, -0.125330f, + 0.216912f, -0.109398f, 0.121602f, -0.209374f, 0.164400f, -0.123049f, + 0.195520f, -0.212932f, -0.015180f, -0.005784f, 0.049726f, -5.822150f, + 0.124536f, 0.040689f, -0.018560f, -3.155020f, 0.014690f, 0.076202f, + -0.154008f, 1.070630f, -0.071606f, 0.051026f, 0.138285f, -5.836340f, + 0.162173f, 0.085890f, -0.186166f, 0.093221f, 0.019240f, -0.017053f, + -0.090144f, 0.236254f, -0.125344f, 0.056235f, -0.089813f, -0.252281f, + -0.127406f, -0.155088f, 0.009972f, -0.066449f, 0.044222f, 0.025943f, + -0.164921f, 0.165463f, -0.001132f, -0.038386f, 0.115194f, -5.757100f, + 0.163386f, 0.061226f, 0.024626f, 0.132750f, 0.107279f, -0.001622f, + -0.107860f, -0.356009f, -0.138935f, -0.145173f, -0.061198f, -0.646138f, + 0.034279f, 0.078187f, 0.108138f, -0.490444f, 0.074719f, 0.034984f, + -0.109303f, 0.741785f, -0.066939f, 0.015558f, 0.114229f, -4.001080f, + 0.130772f, 0.044675f, -0.165162f, -0.274810f, -0.042987f, -0.048579f, + 0.156603f, -1.288370f, 0.076198f, 0.035065f, 0.032043f, -5.002520f, + 0.086900f, -0.010886f, 0.030850f, -0.782259f, 0.056211f, -0.097759f, + 0.118988f, 0.106638f, 0.091419f, 0.079920f, 0.062325f, 0.097116f, + 0.126035f, 0.122530f, -0.278299f, -0.083314f, -0.300563f, -0.197946f, + 0.081664f, 0.089925f, 0.074754f, 0.074628f, 0.102338f, 0.088845f, + 0.105841f, 0.102381f, 0.003087f, 0.061599f, 0.098326f, 0.040119f, + -0.005298f, -0.028834f, 0.059938f, -0.013668f, -0.585882f, -0.631436f, + -0.742673f, -0.736666f, 0.025071f, 0.066851f, 0.075046f, 0.091360f, + 0.099045f, 0.098261f, 0.106413f, 0.099487f, -0.016742f, -0.097334f, + -0.086152f, -0.212444f, -0.028043f, -0.007362f, 0.003914f, -0.055864f, + 0.034756f, 0.081361f, 0.080183f, 0.061319f, 0.193396f, 0.173716f, + 0.207765f, 0.231701f, -0.074565f, -0.073257f, -0.086470f, -0.083114f, + 0.081489f, 0.078477f, 0.033452f, 0.058835f, -0.069665f, -0.031691f, + -0.111255f, -0.167754f, 0.184179f, 0.174673f, 0.160288f, 0.190893f, + 0.110930f, 0.103495f, 0.098408f, 0.102918f, 0.053764f, 0.089994f, + 0.140308f, 0.124867f, 0.074176f, 0.117460f, -0.160775f, -0.144132f, + -0.099373f, -0.035913f, 0.081237f, 0.062247f, -0.166421f, 0.062125f, + 0.276479f, 0.060955f, 0.066627f, 0.455347f, 0.219953f, 0.109912f, + 0.273931f, 0.233153f, 0.102236f, 0.447606f, -0.352243f, 0.499236f, + -0.931206f, 0.248595f, 0.254047f, 0.061542f, 0.268804f, 0.309517f, + -0.084414f, -0.245828f, -0.144882f, -0.296579f, -0.091628f, -0.142202f, + -0.541764f, -0.407470f, 0.053481f, 0.238955f, 0.150188f, -0.060598f, + 0.196118f, -0.215617f, -0.086238f, -0.263420f, 0.206877f, 0.241788f, + -0.122544f, -0.448790f, 0.286917f, 0.112063f, -0.268408f, -0.041770f, + 0.089161f, 0.355811f, -0.078245f, -0.148490f, 
-0.407301f, -1.296870f, + -0.633421f, 0.124253f, 0.275402f, 0.223048f, 0.077016f, 0.160766f, + 0.115374f, 0.061053f, -0.231872f, -0.515052f, -0.278331f, -0.235912f, + -0.416372f, -0.284106f, -0.055942f, 0.110698f, -0.428288f, -0.298137f, + -0.018101f, 0.102677f, -0.019639f, 0.013479f, 0.038549f, 0.048682f, + 0.128684f, 0.116416f, 0.044852f, 0.008133f, 0.061597f, 0.083582f, + 0.014953f, 0.063716f, -0.155318f, -0.061732f, 0.084855f, 0.129505f, + 0.068249f, 0.193775f, -0.088631f, -0.446398f, -0.075710f, -0.061327f, + 0.278715f, 0.540366f, 0.618715f, 0.538374f, -0.037843f, 0.062370f, + -0.033184f, 0.119901f, -0.008641f, -0.064789f, 0.087498f, 0.043486f, + 0.247085f, 0.419992f, 0.299935f, 0.234276f, 0.089283f, 0.070357f, + 0.068888f, 0.134311f, 0.109823f, 0.072431f, 0.081676f, 0.091366f, + -1.707980f, -2.213110f, -2.149930f, -1.556870f, 0.226598f, 0.191675f, + 0.192207f, 0.159566f, -0.070194f, -0.136070f, -0.015172f, -0.204272f, + -0.162191f, -0.043313f, -0.158007f, -0.227210f, 0.040398f, 0.043014f, + 0.039439f, -0.035439f, 0.245558f, 0.439691f, 0.219659f, 0.138210f, + -0.048129f, 0.004954f, -0.102860f, -0.185376f, 0.035548f, 0.006821f, + 0.079199f, 0.032901f, 0.039218f, 0.068113f, 0.023075f, -0.037582f, + 0.225181f, 0.164562f, 0.106718f, 0.032684f, 0.013402f, 0.018797f, + 0.076606f, 0.046512f, -0.070024f, 0.099921f, -0.051231f, 0.074167f, + 0.173313f, 0.220212f, 0.142665f, 0.069809f, -0.195130f, -0.007912f, + -0.006764f, -0.063687f, 0.306374f, 0.402035f, 0.273759f, 0.449469f, + 0.114597f, 0.210745f, 0.355326f, 0.271307f, -0.109943f, -0.171912f, + -0.070726f, -0.128932f, 0.138770f, 0.164971f, 0.308516f, 0.332536f, + 0.081537f, 0.096939f, 0.054136f, 0.052226f, 0.109489f, 0.010223f, + 0.168072f, -0.106279f, 0.525568f, 0.704816f, 0.588942f, 0.473398f, + 0.149497f, 0.120835f, 0.080049f, 0.151340f, -0.182038f, -0.191091f, + -0.196505f, -0.198309f, -0.801819f, -1.441620f, -1.107780f, -1.025650f, + 0.035750f, 0.018049f, -0.029033f, -0.067255f, 0.192049f, 0.009664f, + -0.043741f, 0.051557f, 0.082815f, 0.069547f, -0.073379f, 0.010584f, + 0.192128f, 0.208586f, 0.141904f, 0.100763f, 0.046183f, 0.044776f, + -0.033611f, -0.005812f, 0.012966f, 0.030301f, 0.100665f, 0.103641f, + -0.294776f, -0.361573f, -0.420156f, -0.388743f, 0.239287f, 0.191975f, + 0.089644f, 0.117591f, 0.069563f, 0.021480f, 0.100287f, 0.174159f, + -0.013571f, 0.090960f, 0.010232f, -0.034760f, -0.077205f, 0.060632f, + -0.145527f, -0.391110f, -0.143052f, -0.236448f, -0.103902f, -0.188463f, + 0.071311f, -0.080171f, 0.021987f, 0.041767f, -0.419487f, -0.515479f, + -0.205470f, -0.732132f, 0.150901f, 0.107202f, 0.156307f, 0.143672f, + 0.474682f, 0.178137f, 0.150063f, 0.414515f, 0.559891f, 0.697019f, + 0.541231f, 0.505310f, -0.478101f, -0.444267f, -0.586539f, -0.445996f, + -0.451873f, -0.530085f, -0.447980f, -0.364955f, 0.372435f, 0.318894f, + 0.351211f, 0.193961f, 0.212295f, 0.212842f, 0.220003f, 0.243743f, + -0.388628f, -0.789620f, -0.536618f, -0.430691f, 0.247004f, 0.266489f, + 0.261033f, 0.263692f, 0.050089f, 0.048958f, 0.065207f, 0.120180f, + -0.526230f, -0.481969f, -0.422411f, -0.272292f, 0.155593f, 0.229614f, + 0.139579f, 0.171805f, -0.251924f, -0.302067f, -0.126157f, -0.346650f, + -1.195450f, -1.281100f, -0.987911f, -1.478440f, 0.285667f, 0.284802f, + 0.301887f, 0.259556f, -0.194127f, -0.090440f, -0.257959f, -0.259572f, + -0.012273f, -0.049993f, -0.099431f, 0.012506f, 0.081526f, 0.166279f, + 0.042594f, 0.185121f, 0.148830f, 0.073161f, 0.201728f, 0.125747f, + -0.295065f, -0.187585f, -0.333066f, -0.312291f, 0.253458f, 0.321585f, + 0.178844f, 
0.219944f, -0.763475f, -0.943374f, -0.816825f, -0.709901f, + -0.166132f, 0.129186f, 0.015405f, -0.065623f, -0.246006f, -0.340385f, + -0.118155f, -0.384905f, -0.233883f, -0.400666f, -0.228597f, -0.228428f, + -0.559083f, -0.377784f, -0.541458f, -0.542870f, 0.067400f, 0.122987f, + 0.180901f, 0.186004f, -0.482910f, -0.424823f, -0.477831f, -0.394719f, + 0.091558f, 0.049248f, 0.049370f, 0.160429f, 0.133641f, 0.096625f, + 0.104429f, 0.100782f, -0.238252f, -0.221459f, -0.196974f, -0.250393f, + -3.071750f, -2.418450f, -0.861410f, -1.051580f, 0.071263f, 0.118014f, + -0.028430f, -0.072073f, -0.074463f, 0.034168f, 0.044089f, -0.091109f, + -3.153840f, -2.945850f, -1.977360f, -1.498850f, -0.083429f, 0.131835f, + -0.063865f, -0.065785f, -0.069346f, -0.015520f, -0.119551f, 0.044881f, + -0.105280f, 0.127516f, 0.005255f, -0.142777f, 0.061055f, -0.117250f, + 0.020454f, 0.157879f, -0.213812f, -0.151783f, 0.028583f, 0.137759f, + -3.248250f, -3.005940f, -1.510540f, -1.475390f, 0.081874f, -0.171465f, + -0.135690f, -0.001989f, -0.227574f, -0.132799f, -0.359742f, -0.137197f, + 0.066324f, 0.039194f, -0.050857f, 0.095166f, 0.044475f, 0.011221f, + 0.054904f, 0.061414f, -0.039189f, 0.123751f, -0.017171f, -0.008494f, + -2.598220f, -2.832670f, -1.622030f, -1.201990f, 0.154313f, -0.021436f, + 0.042190f, 0.143947f, -0.090623f, 0.086853f, 0.143137f, 0.099821f, + -1.732820f, -1.429730f, -0.775125f, -0.648036f, 0.082176f, 0.079448f, + -0.040575f, 0.024511f, -0.064105f, -0.117122f, -0.190323f, -0.182589f, + -0.076430f, -0.095615f, -0.112513f, -0.101581f, 0.143037f, 0.148180f, + 0.430958f, 0.359225f, 0.001403f, -0.080541f, -0.295001f, -0.156706f, + 0.426623f, 0.475597f, 0.455210f, 0.454352f, 0.074365f, 0.099440f, + 0.066348f, -0.007078f, 0.008335f, -0.097116f, -0.133687f, -0.110535f, + 0.204145f, 0.281478f, 0.078886f, 0.112857f, -0.103620f, -0.068247f, + 0.191147f, 0.227593f, -0.011816f, -0.058755f, -0.149477f, -0.101828f, + 0.079878f, 0.304949f, 0.557555f, 0.305288f, -0.150955f, -0.118610f, + 0.052073f, 0.064707f, -0.121728f, -0.151132f, -0.193987f, -0.175046f, + 0.043655f, 0.105270f, -0.120715f, -0.040976f, 0.047776f, -0.004443f, + 0.149606f, 0.111240f, -0.047502f, -0.064146f, -0.151858f, -0.151872f, + -0.160207f, -0.113846f, -0.081585f, -0.006708f, -0.203760f, -0.068597f, + -0.179979f, -0.127779f, -0.062460f, -0.064513f, -0.121479f, -0.111122f, + -0.212384f, -0.229157f, -0.283428f, -0.184891f, +}; + +static const float weights_layer_3[] = { + -0.039388f, 0.033048f, -0.113003f, -0.011642f, 0.170478f, 0.145713f, + 0.040189f, -0.280129f, -0.049050f, -0.043788f, -0.157425f, 0.323829f, + -0.250725f, -0.166349f, 0.101650f, -0.049690f, 0.205606f, 0.281131f, + 0.623204f, 0.993452f, -0.015115f, -0.138995f, 0.009473f, 0.157673f, + -0.024687f, -0.067214f, 0.125566f, -0.317619f, 0.057002f, 0.031202f, + -0.018167f, 0.068542f, 0.011609f, -0.020233f, -0.000428f, -0.035956f, + -0.843274f, -0.800587f, -0.214917f, -0.221250f, 0.031255f, -0.077330f, + -0.074902f, -0.063979f, -0.055562f, 0.679495f, 0.146609f, 1.315330f, + -0.118399f, -0.034539f, -0.050377f, 0.172867f, -0.204607f, -0.034930f, + 0.176014f, 0.089747f, -0.003889f, 0.044980f, 0.002386f, -0.141723f, + -0.035828f, -0.204701f, 0.099813f, 0.123580f, 0.209851f, -0.110989f, + -0.043655f, -0.461118f, -0.139664f, 0.026855f, -0.081714f, 0.207623f, + 0.089942f, 0.253082f, 0.680568f, 0.811360f, -0.090528f, -0.116818f, + -0.432361f, -0.075588f, -0.269924f, -0.276810f, -0.289192f, -0.282570f, + 0.245566f, 0.267216f, 0.238622f, 0.286528f, -0.157605f, -0.200401f, + -0.138924f, -0.185006f, 
0.215203f, 0.203316f, 0.209532f, 0.293135f, + 0.928046f, 0.733323f, -0.094120f, 0.036918f, -0.126643f, -0.083371f, + -0.147530f, -0.153195f, 0.097097f, 0.101852f, 0.109160f, 0.105129f, + -0.051869f, -0.064359f, -0.073469f, -0.059591f, 0.102431f, 0.109444f, + 0.113614f, 0.105617f, 0.383311f, 0.325783f, 0.393234f, 0.382508f, + 0.194720f, 0.189672f, 0.217477f, 0.177786f, 0.326461f, 0.114789f, + 0.317061f, 0.048291f, -0.061143f, -0.134641f, -0.067895f, -0.108446f, + 0.082592f, 0.029918f, -0.006580f, 0.015533f, -0.053583f, -0.055540f, + -0.063395f, -0.023157f, -0.064955f, -0.073981f, -0.115452f, -0.086626f, + -0.036616f, 0.008454f, 0.012029f, -0.008039f, -0.207395f, -0.216419f, + -0.205363f, -0.249099f, 0.343308f, 0.413215f, -0.009918f, -0.109978f, + -0.059711f, -0.045089f, -0.029130f, -0.038483f, -0.070323f, -0.099409f, + -0.008849f, -0.063527f, 0.175963f, 0.185335f, 0.149151f, 0.199997f, + -0.027516f, -0.039812f, -0.027760f, -0.047910f, -0.007337f, 0.071065f, + 0.086225f, 0.125539f, 0.151390f, 0.215488f, 0.203450f, 0.045380f, + 0.095761f, 0.107809f, 0.103918f, 0.122383f, 0.116287f, 0.135455f, + 0.115446f, 0.155673f, -0.044648f, -0.027455f, -0.015473f, -0.026657f, + 0.089852f, 0.077459f, 0.077631f, 0.082507f, -0.102761f, -0.054669f, + -0.132223f, -0.024768f, 0.111573f, 0.060467f, 0.107883f, 0.056621f, + 0.219357f, -0.161153f, 0.074379f, -0.118743f, -0.169931f, -0.153995f, + -0.220003f, -0.200186f, 0.032318f, -0.060687f, -0.087550f, -0.038022f, + 0.026633f, -0.005534f, 0.029532f, 0.027081f, 0.011926f, 0.058412f, + 0.010631f, 0.003068f, -0.014911f, 0.063070f, 0.065271f, 0.089550f, + 0.012885f, 0.005320f, -0.037494f, -0.019849f, -0.009624f, -0.059090f, + -0.021222f, -0.088033f, -0.055261f, -0.055113f, -0.047598f, -0.055478f, + -0.023648f, -0.046827f, -0.036572f, -0.057655f, 0.104194f, 0.179800f, + 0.175751f, 0.192851f, -0.016950f, -0.073650f, -0.028592f, -0.088219f, + 0.011130f, 0.061825f, 0.025643f, 0.034183f, 0.095548f, 0.001457f, + -0.132869f, 0.032981f, -0.140178f, -0.105343f, -0.161799f, -0.161983f, + 0.177746f, 0.132903f, 0.135627f, 0.152489f, -0.012532f, -0.068747f, + -0.085849f, -0.095434f, 0.087037f, 0.139497f, 0.111899f, 0.100189f, + -0.024649f, -0.092003f, 0.020783f, -0.115807f, 0.092039f, 0.093943f, + 0.109466f, 0.049639f, -0.133727f, 0.128430f, -0.050546f, 0.190632f, + 0.123733f, 0.082305f, 0.114878f, 0.122572f, 0.201618f, 0.137588f, + 0.065582f, 0.125161f, -0.095179f, -0.120719f, -0.127126f, -0.101961f, + -0.118120f, -0.104833f, -0.179632f, -0.131764f, -0.138096f, -0.147861f, + -0.131512f, -0.153905f, -0.201816f, -0.206641f, -0.196707f, -0.160013f, + -0.212605f, -0.093998f, -0.186258f, -0.076137f, -0.065340f, -0.006969f, + -0.071383f, -0.075005f, +}; + +static const float weights_layer_4[] = { + -0.016102f, -0.022836f, 0.624049f, 0.273485f, 0.222800f, -0.290175f, + -0.518415f, 0.413484f, -0.264495f, 0.498083f, -0.450145f, -0.106419f, + 0.095103f, -0.187451f, 0.145933f, -0.371542f, -0.088871f, 0.184017f, + -0.429625f, -0.110882f, 0.292781f, 0.289588f, 0.185127f, 0.326017f, + -0.432009f, -0.342663f, -0.312206f, 0.004004f, -1.114290f, 0.028497f, + -0.264944f, -0.419611f, 0.046336f, 0.138232f, -0.869528f, 0.425557f, + -0.954838f, -0.186830f, -0.464622f, -0.757107f, -0.432686f, -0.125978f, + -0.402633f, -0.172266f, -0.041749f, -0.822238f, -0.118486f, 0.238617f, + -0.198037f, 0.146347f, 0.405257f, 0.513303f, -0.078876f, -0.300385f, + -0.010293f, -0.183962f, 0.155738f, 0.186797f, -0.086814f, 0.000179f, + 0.123467f, 0.362523f, 0.068805f, 0.371834f, 0.038122f, -0.117867f, + -0.120445f, 
-0.422322f, -0.131402f, 0.285449f, 0.038957f, 0.008844f, + -0.020197f, 0.187723f, 0.190433f, 0.146532f, -0.091068f, -0.270865f, + -0.194231f, -0.226777f, 0.013548f, 0.248351f, 0.537685f, 0.056316f, + -0.171540f, -0.003865f, 0.406439f, 0.126507f, 0.192780f, 0.149335f, + -0.149602f, 0.255202f, -0.015426f, 0.032335f, -1.791330f, -0.894602f, + -0.196641f, -0.282846f, -0.391100f, -0.040969f, 0.049934f, 0.056348f, + -0.041426f, -0.075159f, -0.658335f, -0.827270f, -0.175029f, -0.427235f, + 0.311201f, 0.560413f, 0.363408f, 0.374580f, -0.433531f, -0.180580f, + 0.142142f, 0.194768f, -0.054118f, -0.376541f, -0.366185f, -0.308782f, + -0.273143f, -0.074097f, 0.009000f, -0.182198f, -0.015616f, -0.003882f, + -0.174340f, -0.354866f, 0.527972f, 0.348355f, 0.091381f, -0.419828f, + -0.530529f, 0.159899f, -0.511867f, -0.104237f, -0.286079f, -0.659039f, + -0.266596f, -0.256557f, -0.600437f, -0.446333f, -0.229629f, 0.024931f, + -0.143716f, -0.415754f, -0.003760f, -0.107195f, -0.666165f, -0.697312f, + -0.650255f, -0.703877f, 0.243402f, 0.426710f, 0.217210f, 0.260255f, + 0.027416f, 0.163147f, 0.132188f, 0.142374f, 0.558627f, 0.065717f, + 0.382781f, -1.192240f, 0.195492f, 0.028439f, 0.278252f, -0.491806f, + 0.497701f, -0.448835f, -0.245079f, -0.014336f, -0.174907f, -0.409633f, + 0.207548f, 0.433813f, 0.459889f, 0.431728f, 0.605050f, 0.485520f, + 0.218548f, 0.437307f, 0.027023f, -0.204251f, 0.012100f, 0.150677f, + -1.097980f, 0.086866f, -1.293130f, -0.372575f, -0.876264f, -0.021818f, + 0.322864f, -0.231043f, -0.271608f, 0.132782f, -0.314895f, 0.396800f, + 0.262788f, -0.317212f, -0.666308f, 0.830742f, 0.319409f, -0.564373f, + -0.178656f, 0.306993f, 0.265634f, -0.332480f, -0.491514f, -0.186745f, + -0.063044f, -0.009321f, 0.074944f, -0.372082f, -0.029479f, 0.081548f, + 0.028172f, -0.233148f, -0.337938f, -0.087695f, 0.596556f, 0.559530f, + 0.139332f, 0.107223f, -0.190915f, 0.137401f, -0.150625f, -0.225484f, + -0.191344f, -0.232535f, 0.126510f, 0.296323f, -0.547901f, -0.653080f, + 0.358514f, 0.726289f, -0.421725f, -0.243620f, 0.236206f, 0.390823f, + -0.076560f, -0.282329f, -0.012460f, -0.428484f, 0.349469f, 0.394629f, + 0.421537f, 0.219632f, -0.117550f, -0.087894f, 0.077155f, 0.016000f, + -0.289137f, -0.092937f, -0.014518f, -0.027111f, 0.210329f, -0.159678f, + 0.013288f, -0.039268f, 0.008112f, 0.003152f, 0.030084f, -0.039859f, + 0.322028f, -0.407797f, 0.447087f, -0.381562f, 0.529297f, -0.520298f, + 0.562865f, -0.616878f, 0.689389f, 0.754262f, 0.138475f, 0.750697f, + -0.760157f, -0.383740f, 0.074219f, 0.556257f, 0.087827f, -0.511826f, + -0.305507f, -0.638214f, 0.114833f, -0.444022f, 0.526612f, -0.604984f, + -0.100415f, 0.037824f, -0.106264f, 0.337615f, 0.070743f, 0.031129f, + 0.281954f, 0.176144f, -0.032833f, -0.073902f, -0.285492f, -0.803803f, + -0.015589f, 0.186077f, -0.033351f, 0.517269f, -1.878800f, -1.685210f, + -0.416581f, 0.158476f, -0.071929f, -0.624353f, -0.122069f, -0.075065f, + 0.311816f, 0.506305f, 0.383896f, 0.259450f, -0.308232f, -0.094221f, + -0.421885f, -0.293573f, +}; + +static const float weights_layer_5[] = { + 0.131894f, 0.078431f, 0.323121f, -0.230680f, -0.684740f, 0.020895f, + 0.364983f, 0.121656f, 0.132448f, -0.731198f, 0.071148f, 0.739642f, + 0.318437f, -0.033021f, -1.037080f, 0.135335f, 0.383582f, 0.287332f, + 0.054042f, -0.825482f, 0.418533f, 0.305606f, 0.041549f, 0.432422f, + -0.826878f, -0.593536f, 0.105657f, 0.125357f, 0.408567f, -0.293338f, + 0.233905f, -0.039609f, 0.547727f, -0.435806f, 0.036160f, 0.220275f, + -0.020337f, -0.619403f, -0.455858f, 0.681455f, 0.543846f, -0.495084f, + 
0.251496f, -0.085686f, 0.091395f, -0.476696f, 0.453628f, -0.109663f, + 0.383493f, -0.456563f, -0.212935f, 0.020567f, -0.719564f, -0.377813f, + -0.737511f, 0.765965f, 0.624309f, -0.063679f, -0.055681f, -0.475969f, + -0.069902f, 0.725690f, 0.641094f, 0.439922f, -0.111544f, -0.309061f, + 0.280091f, 0.381416f, 0.481168f, 0.483543f, -0.901267f, -0.499230f, + 0.043449f, -0.372395f, 0.021216f, -0.002200f, -0.524089f, -0.071485f, + -0.273974f, -0.462654f, 0.042369f, -0.138679f, -0.330060f, 0.021886f, + -0.306075f, -0.011130f, -0.260224f, -0.288435f, -0.104039f, -0.183563f, + 0.118990f, -0.531160f, 0.339632f, -0.028374f, 0.159084f, -0.008824f, + -0.791388f, 0.245242f, 0.356510f, 0.469867f, -0.396949f, -0.476146f, + -0.168472f, 1.068400f, 0.474629f, -0.117554f, -0.142453f, -0.306604f, + 0.348525f, -0.111929f, -0.435384f, 0.019952f, -0.260185f, 0.373376f, + 0.109729f, -0.639168f, 0.033392f, -0.082573f, -0.196018f, 0.301637f, + -0.124210f, -0.202515f, -1.221920f, -0.253690f, -0.144864f, 0.287753f, + -0.161206f, -0.213246f, 0.373968f, 0.141397f, -0.248237f, 0.283090f, + -0.008977f, -0.172960f, -0.234146f, -0.720014f, -0.322451f, 0.181083f, + 0.310659f, -0.422646f, -0.719994f, -0.354339f, 0.352739f, 0.230923f, + 0.427013f, -0.660316f, 0.232140f, 0.685896f, 0.660208f, 0.225748f, + -0.918750f, -0.650790f, -0.674525f, -0.450305f, -0.152529f, 0.498480f, + 0.895092f, 0.688242f, 0.669057f, 0.612669f, 0.593484f, 0.318204f, + -0.169294f, 0.388789f, -0.529777f, -0.219706f, -0.044916f, 0.161697f, + -0.145288f, 0.196153f, -0.022212f, -0.434209f, -0.208115f, -0.117745f, + -0.279029f, -0.009506f, 0.137474f, 0.330148f, 0.439258f, 0.345879f, + -0.845131f, -0.215713f, 0.094463f, 0.638604f, 0.882254f, -0.964082f, + -0.383920f, 0.292645f, 0.266341f, 0.747473f, -0.645631f, -0.538896f, + -0.319764f, 0.521880f, 0.460091f, -0.470898f, -0.778283f, -0.061622f, + -0.142433f, 0.210520f, 0.804197f, 0.285840f, -0.138414f, -0.381846f, + -0.499991f, 0.223648f, 0.439025f, 0.321508f, -0.099560f, -0.622893f, + 0.750925f, 0.740994f, 0.140405f, 0.074631f, -0.270223f, -0.829049f, + -0.753355f, -0.258015f, 0.006285f, -0.730573f, -1.107390f, -0.538015f, + -1.005520f, -0.724115f, -0.440183f, -0.395239f, 0.508768f, 0.204620f, + -0.267331f, 0.001740f, -0.838709f, 0.659333f, 0.043739f, -0.024099f, + 0.262431f, 0.252433f, -0.265215f, 0.057289f, -0.428192f, -0.114350f, + -0.011475f, 0.463995f, 0.668833f, -0.604556f, -0.122780f, -0.441645f, + 0.145769f, 0.310450f, -1.003500f, 0.936069f, 0.516604f, -0.643386f, + -0.518571f, 0.306130f, 0.337387f, 0.583400f, -0.366025f, -0.560035f, + -0.262332f, 0.465242f, 0.964332f, -0.545410f, -0.637428f, -0.202695f, + 0.378931f, 0.834604f, 0.000970f, -0.553303f, -0.562879f, 0.221665f, + 0.395160f, 0.446281f, -0.184394f, -0.591780f, 0.170595f, 1.164390f, + 0.227068f, -0.150910f, -0.393690f, -0.131151f, 0.309956f, -0.413518f, + -0.768334f, -0.548975f, 0.245384f, -0.256904f, -0.514790f, -0.102616f, + -0.347625f, 0.420456f, 0.037804f, -0.283200f, -0.578815f, 0.319282f, + 0.674622f, -0.011791f, -0.339329f, 0.466705f, 0.563444f, 0.409660f, + 0.445784f, -0.899507f, -0.605116f, 0.622438f, 0.427385f, -0.062509f, + 0.666570f, 0.057105f, 0.357894f, -0.811016f, -0.421715f, -0.458397f, + 0.288955f, 0.005857f, 0.236331f, 0.107957f, 0.587276f, -0.375800f, + 0.323799f, -0.623363f, 0.254122f, -0.198478f, -0.098436f, -0.282531f, + 0.452453f, -0.163349f, -0.413382f, -0.448732f, -0.528770f, -0.457449f, + -0.619619f, -0.265919f, -0.042760f, 0.438730f, 0.501798f, -0.403851f, + 0.519564f, 0.817314f, 0.366203f, 0.492610f, 0.546929f, 
0.853094f, + 0.289000f, 0.453941f, -0.076152f, 0.007226f, -0.183717f, -0.506252f, + -0.599989f, -0.576006f, 0.746488f, 0.631466f, -0.475599f, -0.334991f, + -0.879614f, 0.918957f, 0.473471f, -0.043781f, -0.688234f, -0.925875f, + -0.188081f, 0.050918f, 0.116855f, 0.221413f, -0.066680f, -0.674395f, + -0.481985f, 0.247368f, 0.271129f, 0.637979f, -1.006970f, -0.855441f, + 0.144874f, 0.507424f, 1.506960f, -0.338910f, 0.398203f, 0.738000f, + 0.263193f, -0.425908f, 0.358271f, -1.072900f, -0.816209f, -0.425519f, + 0.264373f, 0.694014f, 0.036333f, 0.635532f, 0.518856f, 0.047585f, + -0.854817f, -0.138202f, 0.006811f, -0.052020f, -0.468498f, 0.489080f, + -0.105778f, 0.357038f, -0.782875f, 0.649049f, -0.562652f, -0.544392f, + -0.328526f, -0.402121f, -0.263172f, -0.668459f, -0.526702f, -0.395829f, + 0.190986f, 0.307766f, -1.001830f, -0.293051f, 0.283334f, 0.572450f, + 0.906095f, -1.144300f, 0.180989f, 0.421092f, 0.684571f, 0.527276f, + -0.122287f, 0.575067f, 0.675221f, 0.755029f, 0.094957f, 0.481403f, + 0.825155f, 0.755035f, 0.641420f, 0.034497f, 0.518783f, 0.283800f, + 0.293733f, -0.074778f, -0.268720f, 0.798921f, 0.317714f, -0.236391f, + -0.375071f, -0.414600f, 0.223413f, -0.349044f, -0.191033f, -0.391779f, + -0.596894f, -0.378608f, -0.185920f, -0.822171f, -0.754962f, -0.167706f, + 0.755378f, 0.671847f, 0.969414f, 0.793048f, 1.078610f, -0.418963f, + 0.367648f, 0.217645f, 0.294232f, 0.113027f, 0.060312f, -0.327488f, + -0.305035f, -0.243600f, -0.020588f, -0.326324f, -0.417534f, -0.425868f, + -0.404614f, -0.346750f, -0.339145f, -0.348094f, -0.527290f, -0.617825f, + -0.258342f, -0.200753f, -0.249779f, -0.321039f, -0.023117f, -0.004167f, + -0.206788f, -0.612420f, -0.646428f, -0.548969f, -0.158875f, 0.213814f, + -0.084040f, -0.217365f, -0.511895f, -0.653285f, 0.440971f, 0.455591f, + -0.123900f, 0.134097f, -0.251241f, 0.682463f, 0.740614f, 0.991212f, + 0.565984f, 0.592690f, +}; + +static INLINE float32x4_t add_f32x4_x4(const float32x4_t a[4]) { + float32x4_t sum01 = vaddq_f32(a[0], a[1]); + float32x4_t sum23 = vaddq_f32(a[2], a[3]); + return vaddq_f32(sum01, sum23); +} + +static INLINE void av1_cnn_convolve_no_maxpool_padding_valid_2x2_large_neon( + const float **input, int in_width, int in_height, int in_stride, + const float *bias, const int skip_width, const int skip_height, + const int filter_width, const int filter_height, const int in_channels, + const int out_channels, float **output, int out_stride, int start_idx, + const float *weights) { + assert(filter_height == 2 && filter_width == 2); + assert(skip_width == 2 && skip_height == 2); + assert(in_width >= 16); + const int in_size = in_height * in_width; + + do { + const float32x4_t bias_v = vdupq_n_f32(bias[0]); + const float *weight_ptr0 = weights; + const float *in_ptr0 = *input; + float *out_ptr0 = *output; + int h = 0; + + do { + const float *in_ptr1 = in_ptr0; + float *out_ptr1 = out_ptr0; + int w = 0; + + do { + const float *weight_ptr1 = weight_ptr0; + const float *in_ptr2 = in_ptr1; + int k = 0; + float32x4_t sum0[4] = { bias_v, vdupq_n_f32(0), vdupq_n_f32(0), + vdupq_n_f32(0) }; + float32x4_t sum1[4] = { bias_v, vdupq_n_f32(0), vdupq_n_f32(0), + vdupq_n_f32(0) }; + + do { + const float32x4_t weights0 = vld1q_f32(weight_ptr1); + const float32x4_t weights1 = vld1q_f32(weight_ptr1 + 4); + const float32x2_t weights0_lo = vget_low_f32(weights0); + const float32x2_t weights0_hi = vget_high_f32(weights0); + const float32x2_t weights1_lo = vget_low_f32(weights1); + const float32x2_t weights1_hi = vget_high_f32(weights1); + + const float32x4x2_t 
in0_lo_0 = vld2q_f32(in_ptr2); + const float32x4x2_t in0_hi_0 = vld2q_f32(in_ptr2 + in_stride); + const float32x4x2_t in1_lo_0 = vld2q_f32(in_ptr2 + in_size); + const float32x4x2_t in1_hi_0 = + vld2q_f32(in_ptr2 + in_size + in_stride); + + sum0[0] = vmlaq_lane_f32(sum0[0], in0_lo_0.val[0], weights0_lo, 0); + sum0[0] = vmlaq_lane_f32(sum0[0], in0_lo_0.val[1], weights0_lo, 1); + + sum0[1] = vmlaq_lane_f32(sum0[1], in0_hi_0.val[0], weights0_hi, 0); + sum0[1] = vmlaq_lane_f32(sum0[1], in0_hi_0.val[1], weights0_hi, 1); + + sum0[2] = vmlaq_lane_f32(sum0[2], in1_lo_0.val[0], weights1_lo, 0); + sum0[2] = vmlaq_lane_f32(sum0[2], in1_lo_0.val[1], weights1_lo, 1); + + sum0[3] = vmlaq_lane_f32(sum0[3], in1_hi_0.val[0], weights1_hi, 0); + sum0[3] = vmlaq_lane_f32(sum0[3], in1_hi_0.val[1], weights1_hi, 1); + + const float32x4x2_t in0_lo_1 = vld2q_f32(in_ptr2 + 8); + const float32x4x2_t in0_hi_1 = vld2q_f32(in_ptr2 + in_stride + 8); + const float32x4x2_t in1_lo_1 = vld2q_f32(in_ptr2 + in_size + 8); + const float32x4x2_t in1_hi_1 = + vld2q_f32(in_ptr2 + in_size + in_stride + 8); + + sum1[0] = vmlaq_lane_f32(sum1[0], in0_lo_1.val[0], weights0_lo, 0); + sum1[0] = vmlaq_lane_f32(sum1[0], in0_lo_1.val[1], weights0_lo, 1); + + sum1[1] = vmlaq_lane_f32(sum1[1], in0_hi_1.val[0], weights0_hi, 0); + sum1[1] = vmlaq_lane_f32(sum1[1], in0_hi_1.val[1], weights0_hi, 1); + + sum1[2] = vmlaq_lane_f32(sum1[2], in1_lo_1.val[0], weights1_lo, 0); + sum1[2] = vmlaq_lane_f32(sum1[2], in1_lo_1.val[1], weights1_lo, 1); + + sum1[3] = vmlaq_lane_f32(sum1[3], in1_hi_1.val[0], weights1_hi, 0); + sum1[3] = vmlaq_lane_f32(sum1[3], in1_hi_1.val[1], weights1_hi, 1); + + weight_ptr1 += 8; + in_ptr2 += 2 * in_size; + k += 2; + } while (k < in_channels); + + vst1q_f32(out_ptr1, add_f32x4_x4(sum0)); + vst1q_f32(out_ptr1 + 4, add_f32x4_x4(sum1)); + + out_ptr1 += 8; + in_ptr1 += 8 * skip_width; + w += 8 * skip_width; + } while (w < in_width - filter_width + 1); + + out_ptr0 += out_stride; + in_ptr0 += skip_height * in_stride; + h += skip_height; + } while (h < in_height - filter_height + 1); + + ++bias; + ++output; + weights += in_channels * filter_height * filter_width; + } while (++start_idx < out_channels); +} + +static INLINE void av1_cnn_convolve_no_maxpool_padding_valid_2x2_neon( + const float **input, int in_width, int in_height, int in_stride, + const float *bias, const int skip_width, const int skip_height, + const int filter_width, const int filter_height, const int in_channels, + const int out_channels, float **output, int out_stride, int start_idx, + const float *weights) { + assert(filter_height == 2 && filter_width == 2); + assert(skip_width == 2 && skip_height == 2); + assert(in_width == 8); + const int in_size = in_height * in_width; + do { + const float32x4_t bias_v = vdupq_n_f32(*bias); + const float *weight_ptr0 = weights; + const float *in_ptr0 = *input; + float *out_ptr0 = *output; + int h = 0; + + do { + const float *in_ptr1 = in_ptr0; + float *out_ptr1 = out_ptr0; + int w = 0; + + do { + const float *weight_ptr1 = weight_ptr0; + const float *in_ptr2 = in_ptr1; + int k = 0; + float32x4_t sum[4] = { bias_v, vdupq_n_f32(0), vdupq_n_f32(0), + vdupq_n_f32(0) }; + + do { + const float32x4_t weights0 = vld1q_f32(weight_ptr1); + const float32x4_t weights1 = vld1q_f32(weight_ptr1 + 4); + const float32x2_t weights0_lo = vget_low_f32(weights0); + const float32x2_t weights0_hi = vget_high_f32(weights0); + const float32x2_t weights1_lo = vget_low_f32(weights1); + const float32x2_t weights1_hi = vget_high_f32(weights1); + + const 
float32x4x2_t in0_lo = vld2q_f32(in_ptr2); + const float32x4x2_t in0_hi = vld2q_f32(in_ptr2 + in_stride); + const float32x4x2_t in1_lo = vld2q_f32(in_ptr2 + in_size); + const float32x4x2_t in1_hi = vld2q_f32(in_ptr2 + in_size + in_stride); + + sum[0] = vmlaq_lane_f32(sum[0], in0_lo.val[0], weights0_lo, 0); + sum[0] = vmlaq_lane_f32(sum[0], in0_lo.val[1], weights0_lo, 1); + + sum[1] = vmlaq_lane_f32(sum[1], in0_hi.val[0], weights0_hi, 0); + sum[1] = vmlaq_lane_f32(sum[1], in0_hi.val[1], weights0_hi, 1); + + sum[2] = vmlaq_lane_f32(sum[2], in1_lo.val[0], weights1_lo, 0); + sum[2] = vmlaq_lane_f32(sum[2], in1_lo.val[1], weights1_lo, 1); + + sum[3] = vmlaq_lane_f32(sum[3], in1_hi.val[0], weights1_hi, 0); + sum[3] = vmlaq_lane_f32(sum[3], in1_hi.val[1], weights1_hi, 1); + + weight_ptr1 += 8; + in_ptr2 += 2 * in_size; + k += 2; + } while (k < in_channels); + + vst1q_f32(out_ptr1, add_f32x4_x4(sum)); + + out_ptr1 += 4; + in_ptr1 += 4 * skip_width; + w += 4 * skip_width; + } while (w < in_width - filter_width + 1); + + out_ptr0 += out_stride; + in_ptr0 += skip_height * in_stride; + h += skip_height; + } while (h < in_height - filter_height + 1); + + ++bias; + ++output; + weights += in_channels * filter_height * filter_width; + } while (++start_idx < out_channels); +} + +static INLINE void av1_cnn_convolve_no_maxpool_padding_valid_5x5_neon( + const float **input, int in_width, int in_height, int in_stride, + const float *bias, const int skip_width, const int skip_height, + const int filter_width, const int filter_height, const int in_channels, + const int out_channels, float **output, int out_stride, int start_idx, + const float *weights) { + assert(filter_height == 5 && filter_width == 5); + assert(skip_width == 4 && skip_height == 4); + assert(in_width >= 16); + assert(in_channels == 1); + (void)in_channels; + + do { + const float32x4_t bias_v = vdupq_n_f32(*bias); + const float *in_ptr0 = *input; + const float *weights_ptr0 = weights; + float *out_ptr0 = *output; + int h = 0; + + do { + const float *in_ptr1 = in_ptr0; + float *out_ptr1 = out_ptr0; + int w = 0; + + do { + float32x4_t sum[2] = { bias_v, vdupq_n_f32(0) }; + + const float32x4_t weight_0_3 = vld1q_f32(weights_ptr0); + const float32x4_t weight_4_7 = vld1q_f32(weights_ptr0 + 4); + const float32x4_t weight_8_11 = vld1q_f32(weights_ptr0 + 8); + const float32x4_t weight_12_15 = vld1q_f32(weights_ptr0 + 12); + const float32x4_t weight_16_19 = vld1q_f32(weights_ptr0 + 16); + const float32x4_t weight_20_23 = vld1q_f32(weights_ptr0 + 20); + + const float32x2_t weight_0_3_lo = vget_low_f32(weight_0_3); + const float32x2_t weight_0_3_hi = vget_high_f32(weight_0_3); + const float32x2_t weight_4_7_lo = vget_low_f32(weight_4_7); + const float32x2_t weight_4_7_hi = vget_high_f32(weight_4_7); + const float32x2_t weight_8_11_lo = vget_low_f32(weight_8_11); + const float32x2_t weight_8_11_hi = vget_high_f32(weight_8_11); + const float32x2_t weight_12_15_lo = vget_low_f32(weight_12_15); + const float32x2_t weight_12_15_hi = vget_high_f32(weight_12_15); + const float32x2_t weight_16_19_lo = vget_low_f32(weight_16_19); + const float32x2_t weight_16_19_hi = vget_high_f32(weight_16_19); + const float32x2_t weight_20_23_lo = vget_low_f32(weight_20_23); + const float32x2_t weight_20_23_hi = vget_high_f32(weight_20_23); + + const float32x4x4_t in0 = vld4q_f32(in_ptr1 + 0 * in_stride); + const float32x4x4_t in1 = vld4q_f32(in_ptr1 + 1 * in_stride); + const float32x4x4_t in2 = vld4q_f32(in_ptr1 + 2 * in_stride); + const float32x4x4_t in3 = vld4q_f32(in_ptr1 + 3 
* in_stride);
+      const float32x4x4_t in4 = vld4q_f32(in_ptr1 + 4 * in_stride);
+
+      const float32x4_t in0_4 = vextq_f32(
+          in0.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 0 * in_stride)), 1);
+      const float32x4_t in1_4 = vextq_f32(
+          in1.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 1 * in_stride)), 1);
+      const float32x4_t in2_4 = vextq_f32(
+          in2.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 2 * in_stride)), 1);
+      const float32x4_t in3_4 = vextq_f32(
+          in3.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 3 * in_stride)), 1);
+      const float32x4_t in4_4 = vextq_f32(
+          in4.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 4 * in_stride)), 1);
+
+      // Kernel row 0.
+      sum[0] = vmlaq_lane_f32(sum[0], in0.val[0], weight_0_3_lo, 0);
+      sum[1] = vmlaq_lane_f32(sum[1], in0.val[1], weight_0_3_lo, 1);
+      sum[0] = vmlaq_lane_f32(sum[0], in0.val[2], weight_0_3_hi, 0);
+      sum[1] = vmlaq_lane_f32(sum[1], in0.val[3], weight_0_3_hi, 1);
+      sum[0] = vmlaq_lane_f32(sum[0], in0_4, weight_4_7_lo, 0);
+
+      // Kernel row 1.
+      sum[1] = vmlaq_lane_f32(sum[1], in1.val[0], weight_4_7_lo, 1);
+      sum[0] = vmlaq_lane_f32(sum[0], in1.val[1], weight_4_7_hi, 0);
+      sum[1] = vmlaq_lane_f32(sum[1], in1.val[2], weight_4_7_hi, 1);
+      sum[0] = vmlaq_lane_f32(sum[0], in1.val[3], weight_8_11_lo, 0);
+      sum[1] = vmlaq_lane_f32(sum[1], in1_4, weight_8_11_lo, 1);
+
+      // Kernel row 2.
+      sum[0] = vmlaq_lane_f32(sum[0], in2.val[0], weight_8_11_hi, 0);
+      sum[1] = vmlaq_lane_f32(sum[1], in2.val[1], weight_8_11_hi, 1);
+      sum[0] = vmlaq_lane_f32(sum[0], in2.val[2], weight_12_15_lo, 0);
+      sum[1] = vmlaq_lane_f32(sum[1], in2.val[3], weight_12_15_lo, 1);
+      sum[0] = vmlaq_lane_f32(sum[0], in2_4, weight_12_15_hi, 0);
+
+      // Kernel row 3.
+      sum[1] = vmlaq_lane_f32(sum[1], in3.val[0], weight_12_15_hi, 1);
+      sum[0] = vmlaq_lane_f32(sum[0], in3.val[1], weight_16_19_lo, 0);
+      sum[1] = vmlaq_lane_f32(sum[1], in3.val[2], weight_16_19_lo, 1);
+      sum[0] = vmlaq_lane_f32(sum[0], in3.val[3], weight_16_19_hi, 0);
+      sum[1] = vmlaq_lane_f32(sum[1], in3_4, weight_16_19_hi, 1);
+
+      // Kernel row 4.
+      sum[0] = vmlaq_lane_f32(sum[0], in4.val[0], weight_20_23_lo, 0);
+      sum[1] = vmlaq_lane_f32(sum[1], in4.val[1], weight_20_23_lo, 1);
+      sum[0] = vmlaq_lane_f32(sum[0], in4.val[2], weight_20_23_hi, 0);
+      sum[1] = vmlaq_lane_f32(sum[1], in4.val[3], weight_20_23_hi, 1);
+      sum[0] = vmlaq_f32(sum[0], vdupq_n_f32(*(weights_ptr0 + 24)), in4_4);
+
+      vst1q_f32(out_ptr1, vaddq_f32(sum[0], sum[1]));
+
+      out_ptr1 += 4;
+      in_ptr1 += 4 * skip_width;
+      w += 4 * skip_width;
+    } while (w < in_width - filter_width + 1);
+
+    out_ptr0 += out_stride;
+    in_ptr0 += skip_height * in_stride;
+    h += skip_height;
+  } while (h < in_height - filter_height + 1);
+
+  ++output;
+  ++bias;
+  weights += 25;
+} while (++start_idx < out_channels);
+}
+
+// Neon variant of av1_cnn_convolve_no_maxpool_padding_valid_c().
+// In the current encoder, av1_cnn_convolve() is called only for blocks of
+// size 64x64, and it uses the layer config values set by
+// av1_intra_mode_cnn_partition_cnn_config. Each layer's config parameters
+// are summarized below; the kernel selected for each layer is sketched
+// after this table.
+// Layer_Number  in_size  out_size  filter_wd  filter_ht  skip_wd  skip_ht
+//      0         64x64    16x16        5          5         4        4
+//      1         16x16     8x8         2          2         2        2
+//      2          8x8      4x4         2          2         2        2
+//      3          4x4      2x2         2          2         2        2
+//      4          2x2      1x1         2          2         2        2
+// Here,
+// filter_wd = filter_width and filter_ht = filter_height,
+// skip_wd = skip_width and skip_ht = skip_height.
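+
+// As a concrete reading of the table above: the short sketch below is
+// illustrative only and is not part of the upstream file. The helper name
+// is hypothetical and the function is unused; it simply restates which
+// kernel the dispatcher below selects for each layer, given the layer's
+// filter size, skip and input width.
+static INLINE const char *cnn_kernel_for_layer(int in_width, int filter_wd,
+                                               int skip_wd) {
+  // Layer 0 (64x64 input, 5x5 filter, skip 4) takes the 5x5 kernel.
+  if (filter_wd == 5 && skip_wd == 4 && in_width >= 16) return "5x5_neon";
+  if (filter_wd == 2 && skip_wd == 2) {
+    // Layer 1 (16x16 input) takes the wide 2x2 kernel.
+    if (in_width >= 16) return "2x2_large_neon";
+    // Layer 2 (8x8 input) takes the narrow 2x2 kernel.
+    if (in_width == 8) return "2x2_neon";
+  }
+  // Layers 3 and 4 (4x4 and 2x2 inputs) fall back to the C implementation.
+  return "av1_cnn_convolve_no_maxpool_padding_valid_c";
+}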
+void av1_cnn_convolve_no_maxpool_padding_valid_neon( + const float **input, int in_width, int in_height, int in_stride, + const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride, + int start_idx, int cstep, int channel_step) { + assert((layer_config->skip_height == 1 && layer_config->skip_width == 1) || + !layer_config->maxpool); + assert(layer_config->filter_height > 1 || layer_config->filter_width > 1); + assert(layer_config->pad == PADDING_VALID); + assert(channel_step == 1); + assert(cstep == layer_config->in_channels * layer_config->out_channels); + + if (layer_config->filter_width == 5 && layer_config->filter_height == 5 && + layer_config->skip_width == 4 && layer_config->skip_height == 4) { + av1_cnn_convolve_no_maxpool_padding_valid_5x5_neon( + input, in_width, in_height, in_stride, layer_config->bias, + layer_config->skip_width, layer_config->skip_height, + layer_config->filter_width, layer_config->filter_height, + layer_config->in_channels, layer_config->out_channels, output, + out_stride, start_idx, weights_layer_5); + } else if (layer_config->filter_width == 2 && + layer_config->filter_height == 2 && + layer_config->skip_width == 2 && layer_config->skip_height == 2) { + const float *weights = weights_layer_1; + if (layer_config->output_num == + av1_intra_mode_cnn_partition_cnn_config.layer_config[2].output_num) { + weights = weights_layer_2; + } else if ((layer_config->output_num == + av1_intra_mode_cnn_partition_cnn_config.layer_config[3] + .output_num)) { + weights = weights_layer_3; + } else if ((layer_config->output_num == + av1_intra_mode_cnn_partition_cnn_config.layer_config[4] + .output_num)) { + weights = weights_layer_4; + } + if (in_width >= 16) { + av1_cnn_convolve_no_maxpool_padding_valid_2x2_large_neon( + input, in_width, in_height, in_stride, layer_config->bias, + layer_config->skip_width, layer_config->skip_height, + layer_config->filter_width, layer_config->filter_height, + layer_config->in_channels, layer_config->out_channels, output, + out_stride, start_idx, weights); + } else if (in_width == 8) { + av1_cnn_convolve_no_maxpool_padding_valid_2x2_neon( + input, in_width, in_height, in_stride, layer_config->bias, + layer_config->skip_width, layer_config->skip_height, + layer_config->filter_width, layer_config->filter_height, + layer_config->in_channels, layer_config->out_channels, output, + out_stride, start_idx, weights); + } else { + av1_cnn_convolve_no_maxpool_padding_valid_c( + input, in_width, in_height, in_stride, layer_config, output, + out_stride, start_idx, cstep, channel_step); + } + } else { + av1_cnn_convolve_no_maxpool_padding_valid_c( + input, in_width, in_height, in_stride, layer_config, output, out_stride, + start_idx, cstep, channel_step); + } +} diff --git a/third_party/aom/av1/encoder/arm/neon/encodetxb_neon.c b/third_party/aom/av1/encoder/arm/neon/encodetxb_neon.c new file mode 100644 index 0000000000..582863a27c --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/encodetxb_neon.c @@ -0,0 +1,646 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "av1/common/txb_common.h"
+#include "av1/encoder/encodetxb.h"
+
+void av1_txb_init_levels_neon(const tran_low_t *const coeff, const int width,
+                              const int height, uint8_t *const levels) {
+  const int stride = height + TX_PAD_HOR;
+  memset(levels - TX_PAD_TOP * stride, 0,
+         sizeof(*levels) * TX_PAD_TOP * stride);
+  memset(levels + stride * width, 0,
+         sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END));
+
+  const int32x4_t zeros = vdupq_n_s32(0);
+  int i = 0;
+  uint8_t *ls = levels;
+  const tran_low_t *cf = coeff;
+  if (height == 4) {
+    do {
+      const int32x4_t coeffA = vld1q_s32(cf);
+      const int32x4_t coeffB = vld1q_s32(cf + height);
+      const int16x8_t coeffAB =
+          vcombine_s16(vqmovn_s32(coeffA), vqmovn_s32(coeffB));
+      const int16x8_t absAB = vqabsq_s16(coeffAB);
+      const int8x8_t absABs = vqmovn_s16(absAB);
+#if AOM_ARCH_AARCH64
+      const int8x16_t absAB8 =
+          vcombine_s8(absABs, vreinterpret_s8_s32(vget_low_s32(zeros)));
+      const uint8x16_t lsAB =
+          vreinterpretq_u8_s32(vzip1q_s32(vreinterpretq_s32_s8(absAB8), zeros));
+#else
+      const int32x2x2_t absAB8 =
+          vzip_s32(vreinterpret_s32_s8(absABs), vget_low_s32(zeros));
+      const uint8x16_t lsAB =
+          vreinterpretq_u8_s32(vcombine_s32(absAB8.val[0], absAB8.val[1]));
+#endif
+      vst1q_u8(ls, lsAB);
+      ls += (stride << 1);
+      cf += (height << 1);
+      i += 2;
+    } while (i < width);
+  } else if (height == 8) {
+    do {
+      const int16x8_t coeffAB = load_tran_low_to_s16q(cf);
+      const int16x8_t absAB = vqabsq_s16(coeffAB);
+      const uint8x16_t absAB8 = vreinterpretq_u8_s8(vcombine_s8(
+          vqmovn_s16(absAB), vreinterpret_s8_s32(vget_low_s32(zeros))));
+      vst1q_u8(ls, absAB8);
+      ls += stride;
+      cf += height;
+      i += 1;
+    } while (i < width);
+  } else {
+    do {
+      int j = 0;
+      do {
+        const int16x8_t coeffAB = load_tran_low_to_s16q(cf);
+        const int16x8_t coeffCD = load_tran_low_to_s16q(cf + 8);
+        const int16x8_t absAB = vqabsq_s16(coeffAB);
+        const int16x8_t absCD = vqabsq_s16(coeffCD);
+        const uint8x16_t absABCD = vreinterpretq_u8_s8(
+            vcombine_s8(vqmovn_s16(absAB), vqmovn_s16(absCD)));
+        vst1q_u8((ls + j), absABCD);
+        j += 16;
+        cf += 16;
+      } while (j < height);
+      *(int32_t *)(ls + height) = 0;
+      ls += stride;
+      i += 1;
+    } while (i < width);
+  }
+}
+
+// get_4_nz_map_contexts_2d coefficients:
+static const DECLARE_ALIGNED(16, uint8_t, c_4_po_2d[2][16]) = {
+  { 0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21 },
+  { 0, 16, 16, 16, 16, 16, 16, 16, 6, 6, 21, 21, 6, 21, 21, 21 }
+};
+
+// get_4_nz_map_contexts_ver coefficients:
+/* clang-format off */
+#define SIG_COEF_CONTEXTS_2D_X4_051010 \
+  (SIG_COEF_CONTEXTS_2D + ((SIG_COEF_CONTEXTS_2D + 5) << 8) + \
+   ((SIG_COEF_CONTEXTS_2D + 10) << 16) + ((SIG_COEF_CONTEXTS_2D + 10) << 24))
+/* clang-format on */
+
+// get_4_nz_map_contexts_hor coefficients:
+static const DECLARE_ALIGNED(16, uint8_t, c_4_po_hor[16]) = {
+  SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 0,
+  SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 0,
+  SIG_COEF_CONTEXTS_2D + 5,  SIG_COEF_CONTEXTS_2D + 5,
+  SIG_COEF_CONTEXTS_2D + 5,  SIG_COEF_CONTEXTS_2D + 5,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
+};
+
+// get_8_coeff_contexts_2d coefficients:
+// if (width == 8)
+static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_8[2][16]) = {
+  { 0, 1, 6, 6, 21, 21, 21, 21, 1,
6, 6, 21, 21, 21, 21, 21, 21 },
+  { 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21 }
+};
+// if (width < 8)
+static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_l[2][16]) = {
+  { 0, 11, 6, 6, 21, 21, 21, 21, 11, 11, 6, 21, 21, 21, 21, 21 },
+  { 11, 11, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21 }
+};
+
+// if (width > 8)
+static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_g[2][16]) = {
+  { 0, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 },
+  { 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21 }
+};
+
+// get_8_coeff_contexts_ver coefficients:
+static const DECLARE_ALIGNED(16, uint8_t, c_8_po_ver[16]) = {
+  SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 5,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 5,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
+};
+
+// get_16n_coeff_contexts_2d coefficients:
+// real_width == real_height
+static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_e[4][16]) = {
+  { 0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+  { 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+  { 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+  { 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
+};
+
+// real_width < real_height
+static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_g[3][16]) = {
+  { 0, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+  { 11, 11, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+  { 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
+};
+
+// real_width > real_height
+static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_l[3][16]) = {
+  { 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 },
+  { 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+  { 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
+};
+
+// get_16n_coeff_contexts_ver coefficients:
+static const DECLARE_ALIGNED(16, uint8_t, c_16_po_ver[16]) = {
+  SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 5,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
+};
+
+// End of coefficient table declarations.
+
+static INLINE uint8x16_t load_8bit_4x4_to_1_reg(const uint8_t *const src,
+                                                const int byte_stride) {
+#if AOM_ARCH_AARCH64
+  uint32x4_t v_data = vld1q_u32((uint32_t *)src);
+  v_data = vld1q_lane_u32((uint32_t *)(src + 1 * byte_stride), v_data, 1);
+  v_data = vld1q_lane_u32((uint32_t *)(src + 2 * byte_stride), v_data, 2);
+  v_data = vld1q_lane_u32((uint32_t *)(src + 3 * byte_stride), v_data, 3);
+
+  return vreinterpretq_u8_u32(v_data);
+#else
+  return load_unaligned_u8q(src, byte_stride);
+#endif
+}
+
+static INLINE uint8x16_t load_8bit_8x2_to_1_reg(const uint8_t *const src,
+                                                const int byte_stride) {
+#if AOM_ARCH_AARCH64
+  uint64x2_t v_data = vld1q_u64((uint64_t *)src);
+  v_data = vld1q_lane_u64((uint64_t *)(src + 1 * byte_stride), v_data, 1);
+
+  return vreinterpretq_u8_u64(v_data);
+#else
+
uint8x8_t v_data_low = vld1_u8(src); + uint8x8_t v_data_high = vld1_u8(src + byte_stride); + + return vcombine_u8(v_data_low, v_data_high); +#endif +} + +static INLINE uint8x16_t load_8bit_16x1_to_1_reg(const uint8_t *const src, + const int byte_stride) { + (void)byte_stride; + return vld1q_u8(src); +} + +static INLINE void load_levels_4x4x5(const uint8_t *const src, const int stride, + const ptrdiff_t *const offsets, + uint8x16_t *const level) { + level[0] = load_8bit_4x4_to_1_reg(&src[1], stride); + level[1] = load_8bit_4x4_to_1_reg(&src[stride], stride); + level[2] = load_8bit_4x4_to_1_reg(&src[offsets[0]], stride); + level[3] = load_8bit_4x4_to_1_reg(&src[offsets[1]], stride); + level[4] = load_8bit_4x4_to_1_reg(&src[offsets[2]], stride); +} + +static INLINE void load_levels_8x2x5(const uint8_t *const src, const int stride, + const ptrdiff_t *const offsets, + uint8x16_t *const level) { + level[0] = load_8bit_8x2_to_1_reg(&src[1], stride); + level[1] = load_8bit_8x2_to_1_reg(&src[stride], stride); + level[2] = load_8bit_8x2_to_1_reg(&src[offsets[0]], stride); + level[3] = load_8bit_8x2_to_1_reg(&src[offsets[1]], stride); + level[4] = load_8bit_8x2_to_1_reg(&src[offsets[2]], stride); +} + +static INLINE void load_levels_16x1x5(const uint8_t *const src, + const int stride, + const ptrdiff_t *const offsets, + uint8x16_t *const level) { + level[0] = load_8bit_16x1_to_1_reg(&src[1], stride); + level[1] = load_8bit_16x1_to_1_reg(&src[stride], stride); + level[2] = load_8bit_16x1_to_1_reg(&src[offsets[0]], stride); + level[3] = load_8bit_16x1_to_1_reg(&src[offsets[1]], stride); + level[4] = load_8bit_16x1_to_1_reg(&src[offsets[2]], stride); +} + +static INLINE uint8x16_t get_coeff_contexts_kernel(uint8x16_t *const level) { + const uint8x16_t const_3 = vdupq_n_u8(3); + const uint8x16_t const_4 = vdupq_n_u8(4); + uint8x16_t count; + + count = vminq_u8(level[0], const_3); + level[1] = vminq_u8(level[1], const_3); + level[2] = vminq_u8(level[2], const_3); + level[3] = vminq_u8(level[3], const_3); + level[4] = vminq_u8(level[4], const_3); + count = vaddq_u8(count, level[1]); + count = vaddq_u8(count, level[2]); + count = vaddq_u8(count, level[3]); + count = vaddq_u8(count, level[4]); + + count = vrshrq_n_u8(count, 1); + count = vminq_u8(count, const_4); + return count; +} + +static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels, + const int width, + const ptrdiff_t *const offsets, + uint8_t *const coeff_contexts) { + const int stride = 4 + TX_PAD_HOR; + const uint8x16_t pos_to_offset_large = vdupq_n_u8(21); + + uint8x16_t pos_to_offset = + (width == 4) ? 
vld1q_u8(c_4_po_2d[0]) : vld1q_u8(c_4_po_2d[1]); + + uint8x16_t count; + uint8x16_t level[5]; + uint8_t *cc = coeff_contexts; + + assert(!(width % 4)); + + int col = width; + do { + load_levels_4x4x5(levels, stride, offsets, level); + count = get_coeff_contexts_kernel(level); + count = vaddq_u8(count, pos_to_offset); + vst1q_u8(cc, count); + pos_to_offset = pos_to_offset_large; + levels += 4 * stride; + cc += 16; + col -= 4; + } while (col); + + coeff_contexts[0] = 0; +} + +static INLINE void get_4_nz_map_contexts_ver(const uint8_t *levels, + const int width, + const ptrdiff_t *const offsets, + uint8_t *coeff_contexts) { + const int stride = 4 + TX_PAD_HOR; + + const uint8x16_t pos_to_offset = + vreinterpretq_u8_u32(vdupq_n_u32(SIG_COEF_CONTEXTS_2D_X4_051010)); + + uint8x16_t count; + uint8x16_t level[5]; + + assert(!(width % 4)); + + int col = width; + do { + load_levels_4x4x5(levels, stride, offsets, level); + count = get_coeff_contexts_kernel(level); + count = vaddq_u8(count, pos_to_offset); + vst1q_u8(coeff_contexts, count); + levels += 4 * stride; + coeff_contexts += 16; + col -= 4; + } while (col); +} + +static INLINE void get_4_nz_map_contexts_hor(const uint8_t *levels, + const int width, + const ptrdiff_t *const offsets, + uint8_t *coeff_contexts) { + const int stride = 4 + TX_PAD_HOR; + const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10); + + uint8x16_t pos_to_offset = vld1q_u8(c_4_po_hor); + + uint8x16_t count; + uint8x16_t level[5]; + + assert(!(width % 4)); + + int col = width; + do { + load_levels_4x4x5(levels, stride, offsets, level); + count = get_coeff_contexts_kernel(level); + count = vaddq_u8(count, pos_to_offset); + vst1q_u8(coeff_contexts, count); + pos_to_offset = pos_to_offset_large; + levels += 4 * stride; + coeff_contexts += 16; + col -= 4; + } while (col); +} + +static INLINE void get_8_coeff_contexts_2d(const uint8_t *levels, + const int width, + const ptrdiff_t *const offsets, + uint8_t *coeff_contexts) { + const int stride = 8 + TX_PAD_HOR; + uint8_t *cc = coeff_contexts; + uint8x16_t count; + uint8x16_t level[5]; + uint8x16_t pos_to_offset[3]; + + assert(!(width % 2)); + + if (width == 8) { + pos_to_offset[0] = vld1q_u8(c_8_po_2d_8[0]); + pos_to_offset[1] = vld1q_u8(c_8_po_2d_8[1]); + } else if (width < 8) { + pos_to_offset[0] = vld1q_u8(c_8_po_2d_l[0]); + pos_to_offset[1] = vld1q_u8(c_8_po_2d_l[1]); + } else { + pos_to_offset[0] = vld1q_u8(c_8_po_2d_g[0]); + pos_to_offset[1] = vld1q_u8(c_8_po_2d_g[1]); + } + pos_to_offset[2] = vdupq_n_u8(21); + + int col = width; + do { + load_levels_8x2x5(levels, stride, offsets, level); + count = get_coeff_contexts_kernel(level); + count = vaddq_u8(count, pos_to_offset[0]); + vst1q_u8(cc, count); + pos_to_offset[0] = pos_to_offset[1]; + pos_to_offset[1] = pos_to_offset[2]; + levels += 2 * stride; + cc += 16; + col -= 2; + } while (col); + + coeff_contexts[0] = 0; +} + +static INLINE void get_8_coeff_contexts_ver(const uint8_t *levels, + const int width, + const ptrdiff_t *const offsets, + uint8_t *coeff_contexts) { + const int stride = 8 + TX_PAD_HOR; + + const uint8x16_t pos_to_offset = vld1q_u8(c_8_po_ver); + + uint8x16_t count; + uint8x16_t level[5]; + + assert(!(width % 2)); + + int col = width; + do { + load_levels_8x2x5(levels, stride, offsets, level); + count = get_coeff_contexts_kernel(level); + count = vaddq_u8(count, pos_to_offset); + vst1q_u8(coeff_contexts, count); + levels += 2 * stride; + coeff_contexts += 16; + col -= 2; + } while (col); +} + +static INLINE void 
get_8_coeff_contexts_hor(const uint8_t *levels, + const int width, + const ptrdiff_t *const offsets, + uint8_t *coeff_contexts) { + const int stride = 8 + TX_PAD_HOR; + const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10); + + uint8x16_t pos_to_offset = vcombine_u8(vdup_n_u8(SIG_COEF_CONTEXTS_2D + 0), + vdup_n_u8(SIG_COEF_CONTEXTS_2D + 5)); + + uint8x16_t count; + uint8x16_t level[5]; + + assert(!(width % 2)); + + int col = width; + do { + load_levels_8x2x5(levels, stride, offsets, level); + count = get_coeff_contexts_kernel(level); + count = vaddq_u8(count, pos_to_offset); + vst1q_u8(coeff_contexts, count); + pos_to_offset = pos_to_offset_large; + levels += 2 * stride; + coeff_contexts += 16; + col -= 2; + } while (col); +} + +static INLINE void get_16n_coeff_contexts_2d(const uint8_t *levels, + const int real_width, + const int real_height, + const int width, const int height, + const ptrdiff_t *const offsets, + uint8_t *coeff_contexts) { + const int stride = height + TX_PAD_HOR; + uint8_t *cc = coeff_contexts; + int col = width; + uint8x16_t pos_to_offset[5]; + uint8x16_t pos_to_offset_large[3]; + uint8x16_t count; + uint8x16_t level[5]; + + assert(!(height % 16)); + + pos_to_offset_large[2] = vdupq_n_u8(21); + if (real_width == real_height) { + pos_to_offset[0] = vld1q_u8(c_16_po_2d_e[0]); + pos_to_offset[1] = vld1q_u8(c_16_po_2d_e[1]); + pos_to_offset[2] = vld1q_u8(c_16_po_2d_e[2]); + pos_to_offset[3] = vld1q_u8(c_16_po_2d_e[3]); + pos_to_offset[4] = pos_to_offset_large[0] = pos_to_offset_large[1] = + pos_to_offset_large[2]; + } else if (real_width < real_height) { + pos_to_offset[0] = vld1q_u8(c_16_po_2d_g[0]); + pos_to_offset[1] = vld1q_u8(c_16_po_2d_g[1]); + pos_to_offset[2] = pos_to_offset[3] = pos_to_offset[4] = + vld1q_u8(c_16_po_2d_g[2]); + pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2]; + } else { // real_width > real_height + pos_to_offset[0] = pos_to_offset[1] = vld1q_u8(c_16_po_2d_l[0]); + pos_to_offset[2] = vld1q_u8(c_16_po_2d_l[1]); + pos_to_offset[3] = vld1q_u8(c_16_po_2d_l[2]); + pos_to_offset[4] = pos_to_offset_large[2]; + pos_to_offset_large[0] = pos_to_offset_large[1] = vdupq_n_u8(16); + } + + do { + int h = height; + + do { + load_levels_16x1x5(levels, stride, offsets, level); + count = get_coeff_contexts_kernel(level); + count = vaddq_u8(count, pos_to_offset[0]); + vst1q_u8(cc, count); + levels += 16; + cc += 16; + h -= 16; + pos_to_offset[0] = pos_to_offset_large[0]; + } while (h); + + pos_to_offset[0] = pos_to_offset[1]; + pos_to_offset[1] = pos_to_offset[2]; + pos_to_offset[2] = pos_to_offset[3]; + pos_to_offset[3] = pos_to_offset[4]; + pos_to_offset_large[0] = pos_to_offset_large[1]; + pos_to_offset_large[1] = pos_to_offset_large[2]; + levels += TX_PAD_HOR; + } while (--col); + + coeff_contexts[0] = 0; +} + +static INLINE void get_16n_coeff_contexts_ver(const uint8_t *levels, + const int width, const int height, + const ptrdiff_t *const offsets, + uint8_t *coeff_contexts) { + const int stride = height + TX_PAD_HOR; + + const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10); + + uint8x16_t count; + uint8x16_t level[5]; + + assert(!(height % 16)); + + int col = width; + do { + uint8x16_t pos_to_offset = vld1q_u8(c_16_po_ver); + + int h = height; + do { + load_levels_16x1x5(levels, stride, offsets, level); + count = get_coeff_contexts_kernel(level); + count = vaddq_u8(count, pos_to_offset); + vst1q_u8(coeff_contexts, count); + pos_to_offset = pos_to_offset_large; + levels += 16; + 
coeff_contexts += 16; + h -= 16; + } while (h); + + levels += TX_PAD_HOR; + } while (--col); +} + +static INLINE void get_16n_coeff_contexts_hor(const uint8_t *levels, + const int width, const int height, + const ptrdiff_t *const offsets, + uint8_t *coeff_contexts) { + const int stride = height + TX_PAD_HOR; + + uint8x16_t pos_to_offset[3]; + uint8x16_t count; + uint8x16_t level[5]; + + assert(!(height % 16)); + + pos_to_offset[0] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 0); + pos_to_offset[1] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 5); + pos_to_offset[2] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10); + + int col = width; + do { + int h = height; + do { + load_levels_16x1x5(levels, stride, offsets, level); + count = get_coeff_contexts_kernel(level); + count = vaddq_u8(count, pos_to_offset[0]); + vst1q_u8(coeff_contexts, count); + levels += 16; + coeff_contexts += 16; + h -= 16; + } while (h); + + pos_to_offset[0] = pos_to_offset[1]; + pos_to_offset[1] = pos_to_offset[2]; + levels += TX_PAD_HOR; + } while (--col); +} + +// Note: levels[] must be in the range [0, 127], inclusive. +void av1_get_nz_map_contexts_neon(const uint8_t *const levels, + const int16_t *const scan, const uint16_t eob, + const TX_SIZE tx_size, + const TX_CLASS tx_class, + int8_t *const coeff_contexts) { + const int last_idx = eob - 1; + if (!last_idx) { + coeff_contexts[0] = 0; + return; + } + + uint8_t *const coefficients = (uint8_t *const)coeff_contexts; + + const int real_width = tx_size_wide[tx_size]; + const int real_height = tx_size_high[tx_size]; + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); + const int stride = height + TX_PAD_HOR; + ptrdiff_t offsets[3]; + + /* coeff_contexts must be 16 byte aligned. */ + assert(!((intptr_t)coeff_contexts & 0xf)); + + if (tx_class == TX_CLASS_2D) { + offsets[0] = 0 * stride + 2; + offsets[1] = 1 * stride + 1; + offsets[2] = 2 * stride + 0; + + if (height == 4) { + get_4_nz_map_contexts_2d(levels, width, offsets, coefficients); + } else if (height == 8) { + get_8_coeff_contexts_2d(levels, width, offsets, coefficients); + } else { + get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height, + offsets, coefficients); + } + } else if (tx_class == TX_CLASS_HORIZ) { + offsets[0] = 2 * stride; + offsets[1] = 3 * stride; + offsets[2] = 4 * stride; + if (height == 4) { + get_4_nz_map_contexts_hor(levels, width, offsets, coefficients); + } else if (height == 8) { + get_8_coeff_contexts_hor(levels, width, offsets, coefficients); + } else { + get_16n_coeff_contexts_hor(levels, width, height, offsets, coefficients); + } + } else { // TX_CLASS_VERT + offsets[0] = 2; + offsets[1] = 3; + offsets[2] = 4; + if (height == 4) { + get_4_nz_map_contexts_ver(levels, width, offsets, coefficients); + } else if (height == 8) { + get_8_coeff_contexts_ver(levels, width, offsets, coefficients); + } else { + get_16n_coeff_contexts_ver(levels, width, height, offsets, coefficients); + } + } + + const int bhl = get_txb_bhl(tx_size); + const int pos = scan[last_idx]; + if (last_idx <= (width << bhl) / 8) + coeff_contexts[pos] = 1; + else if (last_idx <= (width << bhl) / 4) + coeff_contexts[pos] = 2; + else + coeff_contexts[pos] = 3; +} diff --git a/third_party/aom/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c b/third_party/aom/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c new file mode 100644 index 0000000000..aa64a38902 --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c @@ -0,0 +1,2619 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. 
All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_ports/mem.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "shift_neon.h"
+#include "txfm_neon.h"
+
+static AOM_FORCE_INLINE void transpose_arrays_s32_64x64(const int32x4_t *in,
+                                                        int32x4_t *out) {
+  // This is not quite the same as the other transposes defined in
+  // transpose_neon.h: We only write the low 64x32 sub-matrix since the rest is
+  // unused by the following row transform.
+  for (int j = 0; j < 8; ++j) {
+    for (int i = 0; i < 16; ++i) {
+      transpose_arrays_s32_4x4(in + 64 * i + 4 * j, out + 64 * j + 4 * i);
+    }
+  }
+}
+
+// A note on butterfly helper naming:
+//
+// butterfly_[weight_indices]_neon
+// e.g. butterfly_0321_neon
+//                ^ Weights are applied as indices 0, 3, 2, 1
+//                  (see more detail below)
+//
+// Weight indices are treated as an index into the 4-tuple of the weight
+// itself, plus related and negated constants: w=(w0, w1, -w0, -w1), where
+// (w0, w1) is the pair of weights loaded for the butterfly.
+// This is then represented in the helper naming by referring to the lane index
+// in the loaded tuple that each multiply is performed with:
+//
+//         in0   in1
+//      /------------
+// out0 |  w[0]  w[1]   ==>  out0 = in0 * w[0] + in1 * w[1]
+// out1 |  w[2]  w[3]   ==>  out1 = in0 * w[2] + in1 * w[3]
+//
+// So for indices 0321 from the earlier example, we end up with:
+//
+//          in0       in1
+//      /------------------
+// out0 | (lane 0) (lane 3)   ==>  out0 = in0 *  w0 + in1 * -w1
+// out1 | (lane 2) (lane 1)   ==>  out1 = in0 * -w0 + in1 *  w1
+
+#define butterfly_half_neon(wvec, lane0, lane1, in0, in1, out, v_bit)   \
+  do {                                                                  \
+    int32x2x2_t wvecs = { { wvec, vneg_s32(wvec) } };                   \
+    int32x4_t x = vmulq_lane_s32(in0, wvecs.val[lane0 / 2], lane0 % 2); \
+    x = vmlaq_lane_s32(x, in1, wvecs.val[lane1 / 2], lane1 % 2);        \
+    *out = vrshlq_s32(x, v_bit);                                        \
+  } while (false)
+
+static AOM_FORCE_INLINE void butterfly_0112_neon(
+    const int32_t *cospi, const int widx0, const int32x4_t n0,
+    const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
+    const int32x4_t v_bit) {
+  int32x2_t w01 = vld1_s32(cospi + 2 * widx0);
+  butterfly_half_neon(w01, 0, 1, n0, n1, out0, v_bit);
+  butterfly_half_neon(w01, 1, 2, n0, n1, out1, v_bit);
+}
+
+static AOM_FORCE_INLINE void butterfly_2312_neon(
+    const int32_t *cospi, const int widx0, const int32x4_t n0,
+    const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
+    const int32x4_t v_bit) {
+  int32x2_t w01 = vld1_s32(cospi + 2 * widx0);
+  butterfly_half_neon(w01, 2, 3, n0, n1, out0, v_bit);
+  butterfly_half_neon(w01, 1, 2, n0, n1, out1, v_bit);
+}
+
+static AOM_FORCE_INLINE void butterfly_0332_neon(
+    const int32_t *cospi, const int widx0, const int32x4_t n0,
+    const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
+    const int32x4_t v_bit) {
+  int32x2_t w01 = vld1_s32(cospi + 2 * widx0);
+  butterfly_half_neon(w01, 0, 3, n0, n1, out0, v_bit);
+  butterfly_half_neon(w01, 3, 2, n0, n1, out1, v_bit);
+}
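+
+// Editorial sketch (not part of the upstream sources): expanding
+// butterfly_0112_neon above gives, per 32-bit lane,
+//
+//   out0 = round_shift(n0 * w0 + n1 * w1)   // lanes 0 and 1 of the tuple
+//   out1 = round_shift(n0 * w1 - n1 * w0)   // lanes 1 and 2 of the tuple
+//
+// where (w0, w1) = (cospi[2 * widx0], cospi[2 * widx0 + 1]) and round_shift()
+// is the rounding right shift performed by vrshlq_s32 with the negated bit
+// count held in v_bit.
+
+static AOM_FORCE_INLINE void 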
butterfly_0130_neon( + const int32_t *cospi, const int widx0, const int32x4_t n0, + const int32x4_t n1, int32x4_t *out0, int32x4_t *out1, + const int32x4_t v_bit) { + int32x2_t w01 = vld1_s32(cospi + 2 * widx0); + butterfly_half_neon(w01, 0, 1, n0, n1, out0, v_bit); + butterfly_half_neon(w01, 3, 0, n0, n1, out1, v_bit); +} + +static AOM_FORCE_INLINE void butterfly_cospi32_0002_neon( + const int32_t *cospi, const int32x4_t n0, const int32x4_t n1, + int32x4_t *out0, int32x4_t *out1, const int32x4_t v_bit) { + int32x2_t w01 = vld1_s32(cospi + 2 * 32); + butterfly_half_neon(w01, 0, 0, n0, n1, out0, v_bit); + butterfly_half_neon(w01, 0, 2, n0, n1, out1, v_bit); +} + +static AOM_FORCE_INLINE void butterfly_cospi32_0222_neon( + const int32_t *cospi, const int32x4_t n0, const int32x4_t n1, + int32x4_t *out0, int32x4_t *out1, const int32x4_t v_bit) { + int32x2_t w01 = vld1_s32(cospi + 2 * 32); + butterfly_half_neon(w01, 0, 2, n0, n1, out0, v_bit); + butterfly_half_neon(w01, 2, 2, n0, n1, out1, v_bit); +} + +static AOM_FORCE_INLINE void round_rect_array_s32_neon(const int32x4_t *input, + int32x4_t *output, + const int size) { + const int32x4_t sqrt2 = vdupq_n_s32(NewSqrt2); + int i = 0; + do { + const int32x4_t r1 = vmulq_s32(input[i], sqrt2); + output[i] = vrshrq_n_s32(r1, NewSqrt2Bits); + } while (++i < size); +} + +static AOM_FORCE_INLINE void round_shift2_rect_array_s32_neon( + const int32x4_t *input, int32x4_t *output, const int size) { + const int32x4_t sqrt2 = vdupq_n_s32(NewSqrt2); + int i = 0; + do { + const int32x4_t r0 = vrshrq_n_s32(input[i], 2); + const int32x4_t r1 = vmulq_s32(r0, sqrt2); + output[i] = vrshrq_n_s32(r1, NewSqrt2Bits); + } while (++i < size); +} + +#define LOAD_BUFFER_4XH(h) \ + static AOM_FORCE_INLINE void load_buffer_4x##h( \ + const int16_t *input, int32x4_t *in, int stride, int fliplr) { \ + if (fliplr) { \ + for (int i = 0; i < (h); ++i) { \ + int16x4_t a = vld1_s16(input + i * stride); \ + a = vrev64_s16(a); \ + in[i] = vshll_n_s16(a, 2); \ + } \ + } else { \ + for (int i = 0; i < (h); ++i) { \ + int16x4_t a = vld1_s16(input + i * stride); \ + in[i] = vshll_n_s16(a, 2); \ + } \ + } \ + } + +// AArch32 does not permit the argument to vshll_n_s16 to be zero, so need to +// avoid the expression even though the compiler can prove that the code path +// is never taken if `shift == 0`. +#define shift_left_long_s16(a, shift) \ + ((shift) == 0 ? vmovl_s16(a) : vshll_n_s16((a), (shift) == 0 ? 
1 : (shift))) + +#define LOAD_BUFFER_WXH(w, h, shift) \ + static AOM_FORCE_INLINE void load_buffer_##w##x##h( \ + const int16_t *input, int32x4_t *in, int stride, int fliplr) { \ + assert(w >= 8); \ + if (fliplr) { \ + for (int i = 0; i < (h); ++i) { \ + for (int j = 0; j < (w) / 8; ++j) { \ + int16x8_t a = vld1q_s16(input + i * stride + j * 8); \ + a = vrev64q_s16(a); \ + int j2 = (w) / 8 - j - 1; \ + in[i + (h) * (2 * j2 + 0)] = \ + shift_left_long_s16(vget_high_s16(a), (shift)); \ + in[i + (h) * (2 * j2 + 1)] = \ + shift_left_long_s16(vget_low_s16(a), (shift)); \ + } \ + } \ + } else { \ + for (int i = 0; i < (h); ++i) { \ + for (int j = 0; j < (w) / 8; ++j) { \ + int16x8_t a = vld1q_s16(input + i * stride + j * 8); \ + in[i + (h) * (2 * j + 0)] = \ + shift_left_long_s16(vget_low_s16(a), (shift)); \ + in[i + (h) * (2 * j + 1)] = \ + shift_left_long_s16(vget_high_s16(a), (shift)); \ + } \ + } \ + } \ + } + +LOAD_BUFFER_4XH(4) +LOAD_BUFFER_4XH(8) +LOAD_BUFFER_4XH(16) +LOAD_BUFFER_4XH(32) +LOAD_BUFFER_WXH(8, 8, 2) +LOAD_BUFFER_WXH(16, 16, 2) +LOAD_BUFFER_WXH(32, 64, 0) +LOAD_BUFFER_WXH(64, 32, 2) +LOAD_BUFFER_WXH(64, 64, 0) + +#if !CONFIG_REALTIME_ONLY +LOAD_BUFFER_WXH(16, 64, 0) +LOAD_BUFFER_WXH(64, 16, 2) +#endif // !CONFIG_REALTIME_ONLY + +#define STORE_BUFFER_WXH(w, h) \ + static AOM_FORCE_INLINE void store_buffer_##w##x##h( \ + const int32x4_t *in, int32_t *out, int stride) { \ + for (int i = 0; i < (w); ++i) { \ + for (int j = 0; j < (h) / 4; ++j) { \ + vst1q_s32(&out[i * stride + j * 4], in[i + j * (w)]); \ + } \ + } \ + } + +STORE_BUFFER_WXH(4, 4) +STORE_BUFFER_WXH(8, 4) +STORE_BUFFER_WXH(8, 8) +STORE_BUFFER_WXH(16, 4) +STORE_BUFFER_WXH(16, 16) +STORE_BUFFER_WXH(32, 4) +STORE_BUFFER_WXH(32, 32) +STORE_BUFFER_WXH(64, 32) + +#if !CONFIG_REALTIME_ONLY +STORE_BUFFER_WXH(16, 32) +STORE_BUFFER_WXH(64, 16) +#endif // !CONFIG_REALTIME_ONLY + +static AOM_FORCE_INLINE void highbd_fdct4_x4_neon(const int32x4_t *in, + int32x4_t *out, int bit) { + const int32_t *const cospi = cospi_arr_s32(bit); + const int32x4_t cospi32 = vdupq_n_s32(cospi[2 * 32]); + const int32x2_t cospi16_48 = vld1_s32(&cospi[2 * 16]); + + const int32x4_t a0 = vaddq_s32(in[0], in[3]); + const int32x4_t a1 = vsubq_s32(in[0], in[3]); + const int32x4_t a2 = vaddq_s32(in[1], in[2]); + const int32x4_t a3 = vsubq_s32(in[1], in[2]); + + const int32x4_t b0 = vmulq_s32(a0, cospi32); + const int32x4_t b1 = vmulq_lane_s32(a1, cospi16_48, 1); + const int32x4_t b2 = vmulq_s32(a2, cospi32); + const int32x4_t b3 = vmulq_lane_s32(a3, cospi16_48, 1); + + const int32x4_t c0 = vaddq_s32(b0, b2); + const int32x4_t c1 = vsubq_s32(b0, b2); + const int32x4_t c2 = vmlaq_lane_s32(b3, a1, cospi16_48, 0); + const int32x4_t c3 = vmlsq_lane_s32(b1, a3, cospi16_48, 0); + + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t d0 = vrshlq_s32(c0, v_bit); + const int32x4_t d1 = vrshlq_s32(c1, v_bit); + const int32x4_t d2 = vrshlq_s32(c2, v_bit); + const int32x4_t d3 = vrshlq_s32(c3, v_bit); + + out[0] = d0; + out[1] = d2; + out[2] = d1; + out[3] = d3; +} + +static AOM_FORCE_INLINE void highbd_fadst4_x4_neon(const int32x4_t *in, + int32x4_t *out, int bit) { + const int32x4_t sinpi = vld1q_s32(sinpi_arr(bit) + 1); + + const int32x4_t a0 = vaddq_s32(in[0], in[1]); + const int32x4_t a1 = vmulq_lane_s32(in[0], vget_low_s32(sinpi), 0); + const int32x4_t a2 = vmulq_lane_s32(in[0], vget_high_s32(sinpi), 1); + const int32x4_t a3 = vmulq_lane_s32(in[2], vget_high_s32(sinpi), 0); + + const int32x4_t b0 = vmlaq_lane_s32(a1, in[1], vget_low_s32(sinpi), 1); + 
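+  // The remaining in[1] and in[3] products are folded in below; the
+  // in[2] * sinpi3 term (a3) joins at the final additions.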
const int32x4_t b1 = vmlsq_lane_s32(a2, in[1], vget_low_s32(sinpi), 0); + const int32x4_t b2 = vsubq_s32(a0, in[3]); + + const int32x4_t c0 = vmlaq_lane_s32(b0, in[3], vget_high_s32(sinpi), 1); + const int32x4_t c1 = vmlaq_lane_s32(b1, in[3], vget_low_s32(sinpi), 1); + const int32x4_t c2 = vmulq_lane_s32(b2, vget_high_s32(sinpi), 0); + + const int32x4_t d0 = vaddq_s32(c0, a3); + const int32x4_t d1 = vsubq_s32(c1, a3); + const int32x4_t d2 = vsubq_s32(c1, c0); + + const int32x4_t e0 = vaddq_s32(d2, a3); + + const int32x4_t v_bit = vdupq_n_s32(-bit); + out[0] = vrshlq_s32(d0, v_bit); + out[1] = vrshlq_s32(c2, v_bit); + out[2] = vrshlq_s32(d1, v_bit); + out[3] = vrshlq_s32(e0, v_bit); +} + +static AOM_FORCE_INLINE void highbd_fidentity4_x4_neon(const int32x4_t *in, + int32x4_t *out, + int bit) { + (void)bit; + int32x4_t fact = vdupq_n_s32(NewSqrt2); + + for (int i = 0; i < 4; i++) { + const int32x4_t a_low = vmulq_s32(in[i], fact); + out[i] = vrshrq_n_s32(a_low, NewSqrt2Bits); + } +} + +void av1_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *coeff, + int input_stride, TX_TYPE tx_type, int bd) { + (void)bd; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &input_stride, 4); + + // Workspace for column/row-wise transforms. + int32x4_t buf[4]; + + switch (tx_type) { + case DCT_DCT: + load_buffer_4x4(input, buf, input_stride, 0); + highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); + transpose_arrays_s32_4x4(buf, buf); + highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); + store_buffer_4x4(buf, coeff, /*stride=*/4); + break; + case ADST_DCT: + load_buffer_4x4(input, buf, input_stride, 0); + highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); + transpose_arrays_s32_4x4(buf, buf); + highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); + store_buffer_4x4(buf, coeff, /*stride=*/4); + break; + case DCT_ADST: + load_buffer_4x4(input, buf, input_stride, 0); + highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); + transpose_arrays_s32_4x4(buf, buf); + highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); + store_buffer_4x4(buf, coeff, /*stride=*/4); + break; + case ADST_ADST: + load_buffer_4x4(input, buf, input_stride, 0); + highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); + transpose_arrays_s32_4x4(buf, buf); + highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); + store_buffer_4x4(buf, coeff, /*stride=*/4); + break; + case FLIPADST_DCT: + load_buffer_4x4(input, buf, input_stride, 0); + highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); + transpose_arrays_s32_4x4(buf, buf); + highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); + store_buffer_4x4(buf, coeff, /*stride=*/4); + break; + case DCT_FLIPADST: + load_buffer_4x4(input, buf, input_stride, 1); + highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); + transpose_arrays_s32_4x4(buf, buf); + highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); + store_buffer_4x4(buf, coeff, /*stride=*/4); + break; + case FLIPADST_FLIPADST: + load_buffer_4x4(input, buf, input_stride, 1); + highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); + transpose_arrays_s32_4x4(buf, buf); + highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); + store_buffer_4x4(buf, coeff, /*stride=*/4); + break; + case ADST_FLIPADST: + load_buffer_4x4(input, buf, input_stride, 1); + highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); + transpose_arrays_s32_4x4(buf, buf); + highbd_fadst4_x4_neon(buf, buf, 
av1_fwd_cos_bit_row[0][0]); + store_buffer_4x4(buf, coeff, /*stride=*/4); + break; + case FLIPADST_ADST: + load_buffer_4x4(input, buf, input_stride, 0); + highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); + transpose_arrays_s32_4x4(buf, buf); + highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); + store_buffer_4x4(buf, coeff, /*stride=*/4); + break; + case IDTX: + load_buffer_4x4(input, buf, input_stride, 0); + highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); + transpose_arrays_s32_4x4(buf, buf); + highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); + store_buffer_4x4(buf, coeff, /*stride=*/4); + break; + case V_DCT: + load_buffer_4x4(input, buf, input_stride, 0); + highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); + transpose_arrays_s32_4x4(buf, buf); + highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); + store_buffer_4x4(buf, coeff, /*stride=*/4); + break; + case H_DCT: + load_buffer_4x4(input, buf, input_stride, 0); + highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); + transpose_arrays_s32_4x4(buf, buf); + highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); + store_buffer_4x4(buf, coeff, /*stride=*/4); + break; + case V_ADST: + load_buffer_4x4(input, buf, input_stride, 0); + highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); + transpose_arrays_s32_4x4(buf, buf); + highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); + store_buffer_4x4(buf, coeff, /*stride=*/4); + break; + case H_ADST: + load_buffer_4x4(input, buf, input_stride, 0); + highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); + transpose_arrays_s32_4x4(buf, buf); + highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); + store_buffer_4x4(buf, coeff, /*stride=*/4); + break; + case V_FLIPADST: + load_buffer_4x4(input, buf, input_stride, 0); + highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); + transpose_arrays_s32_4x4(buf, buf); + highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); + store_buffer_4x4(buf, coeff, /*stride=*/4); + break; + case H_FLIPADST: + load_buffer_4x4(input, buf, input_stride, 1); + highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); + transpose_arrays_s32_4x4(buf, buf); + highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); + store_buffer_4x4(buf, coeff, /*stride=*/4); + break; + default: assert(0); + } +} + +// Butterfly pre-processing: +// e.g. n=4: +// out[0] = in[0] + in[3] +// out[1] = in[1] + in[2] +// out[2] = in[1] - in[2] +// out[3] = in[0] - in[3] + +static AOM_FORCE_INLINE void butterfly_dct_pre(const int32x4_t *input, + int32x4_t *output, int n) { + for (int i = 0; i < n / 2; ++i) { + output[i] = vaddq_s32(input[i], input[n - i - 1]); + } + for (int i = 0; i < n / 2; ++i) { + output[n / 2 + i] = vsubq_s32(input[n / 2 - i - 1], input[n / 2 + i]); + } +} + +// Butterfly post-processing: +// e.g. 
n=8: +// out[0] = in0[0] + in1[3]; +// out[1] = in0[1] + in1[2]; +// out[2] = in0[1] - in1[2]; +// out[3] = in0[0] - in1[3]; +// out[4] = in0[7] - in1[4]; +// out[5] = in0[6] - in1[5]; +// out[6] = in0[6] + in1[5]; +// out[7] = in0[7] + in1[4]; + +static AOM_FORCE_INLINE void butterfly_dct_post(const int32x4_t *in0, + const int32x4_t *in1, + int32x4_t *output, int n) { + for (int i = 0; i < n / 4; ++i) { + output[i] = vaddq_s32(in0[i], in1[n / 2 - i - 1]); + } + for (int i = 0; i < n / 4; ++i) { + output[n / 4 + i] = vsubq_s32(in0[n / 4 - i - 1], in1[n / 4 + i]); + } + for (int i = 0; i < n / 4; ++i) { + output[n / 2 + i] = vsubq_s32(in0[n - i - 1], in1[n / 2 + i]); + } + for (int i = 0; i < n / 4; ++i) { + output[(3 * n) / 4 + i] = + vaddq_s32(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]); + } +} + +static AOM_FORCE_INLINE void highbd_fdct8_x4_neon(const int32x4_t *in, + int32x4_t *out, int bit) { + const int32_t *const cospi = cospi_arr_s32(bit); + const int32x4_t v_bit = vdupq_n_s32(-bit); + + // stage 1 + int32x4_t a[8]; + butterfly_dct_pre(in, a, 8); + + // stage 2 + int32x4_t b[8]; + butterfly_dct_pre(a, b, 4); + butterfly_0130_neon(cospi, 32, a[5], a[6], &b[6], &b[5], v_bit); + + // stage 3 + int32x4_t c[8]; + butterfly_0130_neon(cospi, 32, b[1], b[0], &c[0], &c[1], v_bit); + butterfly_0112_neon(cospi, 16, b[3], b[2], &c[2], &c[3], v_bit); + butterfly_dct_post(a + 4, b + 4, c + 4, 4); + + // stage 4-5 + butterfly_0112_neon(cospi, 8, c[7], c[4], &out[1], &out[7], v_bit); + butterfly_0130_neon(cospi, 24, c[5], c[6], &out[5], &out[3], v_bit); + + out[0] = c[0]; + out[2] = c[2]; + out[4] = c[1]; + out[6] = c[3]; +} + +static AOM_FORCE_INLINE void highbd_fadst8_x4_neon(const int32x4_t *in, + int32x4_t *out, int bit) { + const int32_t *const cospi = cospi_arr_s32(bit); + const int32x4_t v_bit = vdupq_n_s32(-bit); + + int32x4_t u0, u1, u2, u3, u4, u5, u6, u7; + int32x4_t v0, v1, v2, v3, v4, v5, v6, v7; + + // stage 0-1 + u0 = in[0]; + u1 = in[7]; + u2 = in[3]; + u3 = in[4]; + u4 = in[1]; + u5 = in[6]; + u6 = in[2]; + u7 = in[5]; + + // stage 2 + v0 = u0; + v1 = u1; + butterfly_cospi32_0222_neon(cospi, u3, u2, &v2, &v3, v_bit); + v4 = u4; + v5 = u5; + butterfly_cospi32_0002_neon(cospi, u6, u7, &v7, &v6, v_bit); + + // stage 3 + u0 = vaddq_s32(v0, v2); + u1 = vsubq_s32(v3, v1); + u2 = vsubq_s32(v0, v2); + u3 = vaddq_s32(v1, v3); + u4 = vsubq_s32(v6, v4); + u5 = vaddq_s32(v5, v7); + u6 = vaddq_s32(v4, v6); + u7 = vsubq_s32(v5, v7); + + // stage 4 + v0 = u0; + v1 = u1; + v2 = u2; + v3 = u3; + + butterfly_0112_neon(cospi, 16, u4, u5, &v4, &v5, v_bit); + butterfly_0112_neon(cospi, 16, u7, u6, &v6, &v7, v_bit); + + // stage 5 + u0 = vaddq_s32(v0, v4); + u1 = vaddq_s32(v1, v5); + u2 = vaddq_s32(v2, v6); + u3 = vsubq_s32(v7, v3); + u4 = vsubq_s32(v0, v4); + u5 = vsubq_s32(v1, v5); + u6 = vsubq_s32(v2, v6); + u7 = vaddq_s32(v3, v7); + + // stage 6 + butterfly_0112_neon(cospi, 4, u0, u1, &v0, &v1, v_bit); + butterfly_0112_neon(cospi, 20, u2, u3, &v2, &v3, v_bit); + butterfly_0130_neon(cospi, 28, u5, u4, &v4, &v5, v_bit); + butterfly_0112_neon(cospi, 12, u6, u7, &v7, &v6, v_bit); + + // stage 7 + out[0] = v1; + out[1] = v6; + out[2] = v3; + out[3] = v4; + out[4] = v5; + out[5] = v2; + out[6] = v7; + out[7] = v0; +} + +static AOM_FORCE_INLINE void highbd_fidentity8_x4_neon(const int32x4_t *in, + int32x4_t *out, + int bit) { + (void)bit; + out[0] = vshlq_n_s32(in[0], 1); + out[1] = vshlq_n_s32(in[1], 1); + out[2] = vshlq_n_s32(in[2], 1); + out[3] = vshlq_n_s32(in[3], 1); + out[4] = vshlq_n_s32(in[4], 
1); + out[5] = vshlq_n_s32(in[5], 1); + out[6] = vshlq_n_s32(in[6], 1); + out[7] = vshlq_n_s32(in[7], 1); +} + +static AOM_FORCE_INLINE void highbd_fdct8_xn_neon(const int32x4_t *in, + int32x4_t *out, int bit, + int howmany) { + const int stride = 8; + int i = 0; + do { + highbd_fdct8_x4_neon(in + i * stride, out + i * stride, bit); + } while (++i < howmany); +} + +static AOM_FORCE_INLINE void highbd_fadst8_xn_neon(const int32x4_t *in, + int32x4_t *out, int bit, + int howmany) { + const int stride = 8; + int i = 0; + do { + highbd_fadst8_x4_neon(in + i * stride, out + i * stride, bit); + } while (++i < howmany); +} + +static AOM_FORCE_INLINE void highbd_fidentity8_xn_neon(const int32x4_t *in, + int32x4_t *out, int bit, + int howmany) { + (void)bit; + const int stride = 8; + int i = 0; + do { + highbd_fidentity8_x4_neon(in + i * stride, out + i * stride, bit); + } while (++i < howmany); +} + +void av1_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + (void)bd; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 8); + + // Workspaces for column/row-wise transforms. + int32x4_t buf0[16], buf1[16]; + + switch (tx_type) { + case DCT_DCT: + load_buffer_8x8(input, buf0, stride, 0); + highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); + shift_right_1_round_s32_x4(buf0, buf0, 16); + transpose_arrays_s32_8x8(buf0, buf1); + highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2); + store_buffer_8x8(buf1, coeff, /*stride=*/8); + break; + case ADST_DCT: + load_buffer_8x8(input, buf0, stride, 0); + highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); + shift_right_1_round_s32_x4(buf0, buf0, 16); + transpose_arrays_s32_8x8(buf0, buf1); + highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2); + store_buffer_8x8(buf1, coeff, /*stride=*/8); + break; + case DCT_ADST: + load_buffer_8x8(input, buf0, stride, 0); + highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); + shift_right_1_round_s32_x4(buf0, buf0, 16); + transpose_arrays_s32_8x8(buf0, buf1); + highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2); + store_buffer_8x8(buf1, coeff, /*stride=*/8); + break; + case ADST_ADST: + load_buffer_8x8(input, buf0, stride, 0); + highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); + shift_right_1_round_s32_x4(buf0, buf0, 16); + transpose_arrays_s32_8x8(buf0, buf1); + highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2); + store_buffer_8x8(buf1, coeff, /*stride=*/8); + break; + case FLIPADST_DCT: + load_buffer_8x8(input, buf0, stride, 0); + highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); + shift_right_1_round_s32_x4(buf0, buf0, 16); + transpose_arrays_s32_8x8(buf0, buf1); + highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2); + store_buffer_8x8(buf1, coeff, /*stride=*/8); + break; + case DCT_FLIPADST: + load_buffer_8x8(input, buf0, stride, 1); + highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); + shift_right_1_round_s32_x4(buf0, buf0, 16); + transpose_arrays_s32_8x8(buf0, buf1); + highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2); + store_buffer_8x8(buf1, coeff, /*stride=*/8); + break; + case FLIPADST_FLIPADST: + load_buffer_8x8(input, buf0, stride, 1); + highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); + shift_right_1_round_s32_x4(buf0, buf0, 16); + transpose_arrays_s32_8x8(buf0, buf1); + highbd_fadst8_xn_neon(buf1, buf1, 
av1_fwd_cos_bit_row[1][1], 2); + store_buffer_8x8(buf1, coeff, /*stride=*/8); + break; + case ADST_FLIPADST: + load_buffer_8x8(input, buf0, stride, 1); + highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); + shift_right_1_round_s32_x4(buf0, buf0, 16); + transpose_arrays_s32_8x8(buf0, buf1); + highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2); + store_buffer_8x8(buf1, coeff, /*stride=*/8); + break; + case FLIPADST_ADST: + load_buffer_8x8(input, buf0, stride, 0); + highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); + shift_right_1_round_s32_x4(buf0, buf0, 16); + transpose_arrays_s32_8x8(buf0, buf1); + highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2); + store_buffer_8x8(buf1, coeff, /*stride=*/8); + break; + case IDTX: + load_buffer_8x8(input, buf0, stride, 0); + highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); + shift_right_1_round_s32_x4(buf0, buf0, 16); + transpose_arrays_s32_8x8(buf0, buf1); + highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2); + store_buffer_8x8(buf1, coeff, /*stride=*/8); + break; + case V_DCT: + load_buffer_8x8(input, buf0, stride, 0); + highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); + shift_right_1_round_s32_x4(buf0, buf0, 16); + transpose_arrays_s32_8x8(buf0, buf1); + highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2); + store_buffer_8x8(buf1, coeff, /*stride=*/8); + break; + case H_DCT: + load_buffer_8x8(input, buf0, stride, 0); + highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); + shift_right_1_round_s32_x4(buf0, buf0, 16); + transpose_arrays_s32_8x8(buf0, buf1); + highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2); + store_buffer_8x8(buf1, coeff, /*stride=*/8); + break; + case V_ADST: + load_buffer_8x8(input, buf0, stride, 0); + highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); + shift_right_1_round_s32_x4(buf0, buf0, 16); + transpose_arrays_s32_8x8(buf0, buf1); + highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2); + store_buffer_8x8(buf1, coeff, /*stride=*/8); + break; + case H_ADST: + load_buffer_8x8(input, buf0, stride, 0); + highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); + shift_right_1_round_s32_x4(buf0, buf0, 16); + transpose_arrays_s32_8x8(buf0, buf1); + highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2); + store_buffer_8x8(buf1, coeff, /*stride=*/8); + break; + case V_FLIPADST: + load_buffer_8x8(input, buf0, stride, 0); + highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); + shift_right_1_round_s32_x4(buf0, buf0, 16); + transpose_arrays_s32_8x8(buf0, buf1); + highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2); + store_buffer_8x8(buf1, coeff, /*stride=*/8); + break; + case H_FLIPADST: + load_buffer_8x8(input, buf0, stride, 1); + highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); + shift_right_1_round_s32_x4(buf0, buf0, 16); + transpose_arrays_s32_8x8(buf0, buf1); + highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2); + store_buffer_8x8(buf1, coeff, /*stride=*/8); + break; + default: assert(0); + } +} + +static void highbd_fdct16_x4_neon(const int32x4_t *in, int32x4_t *out, + int bit) { + const int32_t *const cospi = cospi_arr_s32(bit); + const int32x4_t v_bit = vdupq_n_s32(-bit); + + int32x4_t u[16], v[16]; + + // stage 1 + butterfly_dct_pre(in, u, 16); + + // stage 2 + butterfly_dct_pre(u, v, 8); + v[8] = u[8]; + v[9] = u[9]; + butterfly_cospi32_0002_neon(cospi, u[13], u[10], 
&v[13], &v[10], v_bit); + butterfly_cospi32_0002_neon(cospi, u[12], u[11], &v[12], &v[11], v_bit); + v[14] = u[14]; + v[15] = u[15]; + + // stage 3 + butterfly_dct_pre(v, u, 4); + u[4] = v[4]; + butterfly_cospi32_0002_neon(cospi, v[6], v[5], &u[6], &u[5], v_bit); + u[7] = v[7]; + butterfly_dct_post(v + 8, v + 8, u + 8, 8); + + // stage 4 + butterfly_cospi32_0002_neon(cospi, u[0], u[1], &v[0], &v[1], v_bit); + butterfly_0112_neon(cospi, 16, u[3], u[2], &v[2], &v[3], v_bit); + butterfly_dct_post(u + 4, u + 4, v + 4, 4); + v[8] = u[8]; + butterfly_0112_neon(cospi, 16, u[14], u[9], &v[14], &v[9], v_bit); + butterfly_2312_neon(cospi, 16, u[13], u[10], &v[10], &v[13], v_bit); + v[11] = u[11]; + v[12] = u[12]; + v[15] = u[15]; + + // stage 5 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + butterfly_0112_neon(cospi, 8, v[7], v[4], &u[4], &u[7], v_bit); + butterfly_0130_neon(cospi, 24, v[5], v[6], &u[5], &u[6], v_bit); + butterfly_dct_post(v + 8, v + 8, u + 8, 4); + butterfly_dct_post(v + 12, v + 12, u + 12, 4); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + butterfly_0112_neon(cospi, 4, u[15], u[8], &v[8], &v[15], v_bit); + butterfly_0130_neon(cospi, 28, u[9], u[14], &v[9], &v[14], v_bit); + butterfly_0112_neon(cospi, 20, u[13], u[10], &v[10], &v[13], v_bit); + butterfly_0130_neon(cospi, 12, u[11], u[12], &v[11], &v[12], v_bit); + + out[0] = v[0]; + out[1] = v[8]; + out[2] = v[4]; + out[3] = v[12]; + out[4] = v[2]; + out[5] = v[10]; + out[6] = v[6]; + out[7] = v[14]; + out[8] = v[1]; + out[9] = v[9]; + out[10] = v[5]; + out[11] = v[13]; + out[12] = v[3]; + out[13] = v[11]; + out[14] = v[7]; + out[15] = v[15]; +} + +static void highbd_fadst16_x4_neon(const int32x4_t *in, int32x4_t *out, + int bit) { + const int32_t *const cospi = cospi_arr_s32(bit); + const int32x4_t v_bit = vdupq_n_s32(-bit); + + int32x4_t u[16], v[16]; + + // stage 0-1 + u[0] = in[0]; + u[1] = in[15]; + u[2] = in[7]; + u[3] = in[8]; + u[4] = in[3]; + u[5] = in[12]; + u[6] = in[4]; + u[7] = in[11]; + u[8] = in[1]; + u[9] = in[14]; + u[10] = in[6]; + u[11] = in[9]; + u[12] = in[2]; + u[13] = in[13]; + u[14] = in[5]; + u[15] = in[10]; + + // stage 2 + v[0] = u[0]; + v[1] = u[1]; + butterfly_cospi32_0222_neon(cospi, u[3], u[2], &v[2], &v[3], v_bit); + v[4] = u[4]; + v[5] = u[5]; + butterfly_cospi32_0002_neon(cospi, u[6], u[7], &v[7], &v[6], v_bit); + v[8] = u[8]; + v[9] = u[9]; + butterfly_cospi32_0002_neon(cospi, u[10], u[11], &v[11], &v[10], v_bit); + v[12] = u[12]; + v[13] = u[13]; + butterfly_cospi32_0222_neon(cospi, u[15], u[14], &v[14], &v[15], v_bit); + + // stage 3 + u[0] = vaddq_s32(v[0], v[2]); + u[1] = vsubq_s32(v[3], v[1]); + u[2] = vsubq_s32(v[0], v[2]); + u[3] = vaddq_s32(v[1], v[3]); + u[4] = vsubq_s32(v[6], v[4]); + u[5] = vaddq_s32(v[5], v[7]); + u[6] = vaddq_s32(v[4], v[6]); + u[7] = vsubq_s32(v[5], v[7]); + u[8] = vsubq_s32(v[10], v[8]); + u[9] = vaddq_s32(v[9], v[11]); + u[10] = vaddq_s32(v[8], v[10]); + u[11] = vsubq_s32(v[9], v[11]); + u[12] = vaddq_s32(v[12], v[14]); + u[13] = vsubq_s32(v[15], v[13]); + u[14] = vsubq_s32(v[12], v[14]); + u[15] = vaddq_s32(v[13], v[15]); + + // stage 4 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + butterfly_0112_neon(cospi, 16, u[4], u[5], &v[4], &v[5], v_bit); + butterfly_0112_neon(cospi, 16, u[7], u[6], &v[6], &v[7], v_bit); + + v[8] = u[8]; + v[9] = u[9]; + v[10] = u[10]; + v[11] = u[11]; + + butterfly_0112_neon(cospi, 16, u[12], u[13], &v[12], &v[13], 
v_bit); + butterfly_0332_neon(cospi, 16, u[14], u[15], &v[15], &v[14], v_bit); + + // stage 5 + u[0] = vaddq_s32(v[0], v[4]); + u[1] = vaddq_s32(v[1], v[5]); + u[2] = vaddq_s32(v[2], v[6]); + u[3] = vsubq_s32(v[7], v[3]); + u[4] = vsubq_s32(v[0], v[4]); + u[5] = vsubq_s32(v[1], v[5]); + u[6] = vsubq_s32(v[2], v[6]); + u[7] = vaddq_s32(v[3], v[7]); + u[8] = vaddq_s32(v[8], v[12]); + u[9] = vaddq_s32(v[9], v[13]); + u[10] = vsubq_s32(v[14], v[10]); + u[11] = vaddq_s32(v[11], v[15]); + u[12] = vsubq_s32(v[8], v[12]); + u[13] = vsubq_s32(v[9], v[13]); + u[14] = vaddq_s32(v[10], v[14]); + u[15] = vsubq_s32(v[11], v[15]); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + + butterfly_0112_neon(cospi, 8, u[8], u[9], &v[8], &v[9], v_bit); + butterfly_0130_neon(cospi, 8, u[12], u[13], &v[13], &v[12], v_bit); + butterfly_0130_neon(cospi, 24, u[11], u[10], &v[10], &v[11], v_bit); + butterfly_0130_neon(cospi, 24, u[14], u[15], &v[14], &v[15], v_bit); + + // stage 7 + u[0] = vaddq_s32(v[0], v[8]); + u[1] = vaddq_s32(v[1], v[9]); + u[2] = vaddq_s32(v[2], v[10]); + u[3] = vaddq_s32(v[3], v[11]); + u[4] = vaddq_s32(v[4], v[12]); + u[5] = vaddq_s32(v[5], v[13]); + u[6] = vaddq_s32(v[6], v[14]); + u[7] = vsubq_s32(v[15], v[7]); + u[8] = vsubq_s32(v[0], v[8]); + u[9] = vsubq_s32(v[1], v[9]); + u[10] = vsubq_s32(v[2], v[10]); + u[11] = vsubq_s32(v[3], v[11]); + u[12] = vsubq_s32(v[4], v[12]); + u[13] = vsubq_s32(v[5], v[13]); + u[14] = vsubq_s32(v[6], v[14]); + u[15] = vaddq_s32(v[7], v[15]); + + // stage 8 + butterfly_0112_neon(cospi, 2, u[0], u[1], &v[0], &v[1], v_bit); + butterfly_0112_neon(cospi, 10, u[2], u[3], &v[2], &v[3], v_bit); + butterfly_0112_neon(cospi, 18, u[4], u[5], &v[4], &v[5], v_bit); + butterfly_0112_neon(cospi, 26, u[6], u[7], &v[6], &v[7], v_bit); + butterfly_0130_neon(cospi, 30, u[9], u[8], &v[8], &v[9], v_bit); + butterfly_0130_neon(cospi, 22, u[11], u[10], &v[10], &v[11], v_bit); + butterfly_0130_neon(cospi, 14, u[13], u[12], &v[12], &v[13], v_bit); + butterfly_0112_neon(cospi, 6, u[14], u[15], &v[15], &v[14], v_bit); + + // stage 9 + out[0] = v[1]; + out[1] = v[14]; + out[2] = v[3]; + out[3] = v[12]; + out[4] = v[5]; + out[5] = v[10]; + out[6] = v[7]; + out[7] = v[8]; + out[8] = v[9]; + out[9] = v[6]; + out[10] = v[11]; + out[11] = v[4]; + out[12] = v[13]; + out[13] = v[2]; + out[14] = v[15]; + out[15] = v[0]; +} + +static void highbd_fidentity16_x4_neon(const int32x4_t *in, int32x4_t *out, + int bit) { + (void)bit; + const int32x4_t fact = vdupq_n_s32(2 * NewSqrt2); + const int32x4_t offset = vdupq_n_s32(1 << (NewSqrt2Bits - 1)); + + for (int i = 0; i < 16; i++) { + int32x4_t a = vmulq_s32(in[i], fact); + a = vaddq_s32(a, offset); + out[i] = vshrq_n_s32(a, NewSqrt2Bits); + } +} + +static void highbd_fdct16_xn_neon(const int32x4_t *in, int32x4_t *out, int bit, + const int howmany) { + const int stride = 16; + int i = 0; + do { + highbd_fdct16_x4_neon(in + i * stride, out + i * stride, bit); + } while (++i < howmany); +} + +static void highbd_fadst16_xn_neon(const int32x4_t *in, int32x4_t *out, int bit, + int howmany) { + const int stride = 16; + int i = 0; + do { + highbd_fadst16_x4_neon(in + i * stride, out + i * stride, bit); + } while (++i < howmany); +} + +static void highbd_fidentity16_xn_neon(const int32x4_t *in, int32x4_t *out, + int bit, int howmany) { + const int stride = 16; + int i = 0; + do { + highbd_fidentity16_x4_neon(in + i * stride, out + i * stride, bit); + } while (++i < howmany); 
+} + +void av1_fwd_txfm2d_16x16_neon(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + (void)bd; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 16); + + // Workspaces for column/row-wise transforms. + int32x4_t buf0[64], buf1[64]; + + switch (tx_type) { + case DCT_DCT: + load_buffer_16x16(input, buf0, stride, 0); + highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); + shift_right_2_round_s32_x4(buf0, buf0, 64); + transpose_arrays_s32_16x16(buf0, buf1); + highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); + store_buffer_16x16(buf1, coeff, /*stride=*/16); + break; + case ADST_DCT: + load_buffer_16x16(input, buf0, stride, 0); + highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); + shift_right_2_round_s32_x4(buf0, buf0, 64); + transpose_arrays_s32_16x16(buf0, buf1); + highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); + store_buffer_16x16(buf1, coeff, /*stride=*/16); + break; + case DCT_ADST: + load_buffer_16x16(input, buf0, stride, 0); + highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); + shift_right_2_round_s32_x4(buf0, buf0, 64); + transpose_arrays_s32_16x16(buf0, buf1); + highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); + store_buffer_16x16(buf1, coeff, /*stride=*/16); + break; + case ADST_ADST: + load_buffer_16x16(input, buf0, stride, 0); + highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); + shift_right_2_round_s32_x4(buf0, buf0, 64); + transpose_arrays_s32_16x16(buf0, buf1); + highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); + store_buffer_16x16(buf1, coeff, /*stride=*/16); + break; + case FLIPADST_DCT: + load_buffer_16x16(input, buf0, stride, 0); + highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); + shift_right_2_round_s32_x4(buf0, buf0, 64); + transpose_arrays_s32_16x16(buf0, buf1); + highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); + store_buffer_16x16(buf1, coeff, /*stride=*/16); + break; + case DCT_FLIPADST: + load_buffer_16x16(input, buf0, stride, 1); + highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); + shift_right_2_round_s32_x4(buf0, buf0, 64); + transpose_arrays_s32_16x16(buf0, buf1); + highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); + store_buffer_16x16(buf1, coeff, /*stride=*/16); + break; + case FLIPADST_FLIPADST: + load_buffer_16x16(input, buf0, stride, 1); + highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); + shift_right_2_round_s32_x4(buf0, buf0, 64); + transpose_arrays_s32_16x16(buf0, buf1); + highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); + store_buffer_16x16(buf1, coeff, /*stride=*/16); + break; + case ADST_FLIPADST: + load_buffer_16x16(input, buf0, stride, 1); + highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); + shift_right_2_round_s32_x4(buf0, buf0, 64); + transpose_arrays_s32_16x16(buf0, buf1); + highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); + store_buffer_16x16(buf1, coeff, /*stride=*/16); + break; + case FLIPADST_ADST: + load_buffer_16x16(input, buf0, stride, 0); + highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); + shift_right_2_round_s32_x4(buf0, buf0, 64); + transpose_arrays_s32_16x16(buf0, buf1); + highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); + store_buffer_16x16(buf1, coeff, /*stride=*/16); + break; + case IDTX: + load_buffer_16x16(input, buf0, stride, 0); + 
highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); + shift_right_2_round_s32_x4(buf0, buf0, 64); + transpose_arrays_s32_16x16(buf0, buf1); + highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); + store_buffer_16x16(buf1, coeff, /*stride=*/16); + break; + case V_DCT: + load_buffer_16x16(input, buf0, stride, 0); + highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); + shift_right_2_round_s32_x4(buf0, buf0, 64); + transpose_arrays_s32_16x16(buf0, buf1); + highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); + store_buffer_16x16(buf1, coeff, /*stride=*/16); + break; + case H_DCT: + load_buffer_16x16(input, buf0, stride, 0); + highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); + shift_right_2_round_s32_x4(buf0, buf0, 64); + transpose_arrays_s32_16x16(buf0, buf1); + highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); + store_buffer_16x16(buf1, coeff, /*stride=*/16); + break; + case V_ADST: + load_buffer_16x16(input, buf0, stride, 0); + highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); + shift_right_2_round_s32_x4(buf0, buf0, 64); + transpose_arrays_s32_16x16(buf0, buf1); + highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); + store_buffer_16x16(buf1, coeff, /*stride=*/16); + break; + case H_ADST: + load_buffer_16x16(input, buf0, stride, 0); + highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); + shift_right_2_round_s32_x4(buf0, buf0, 64); + transpose_arrays_s32_16x16(buf0, buf1); + highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); + store_buffer_16x16(buf1, coeff, /*stride=*/16); + break; + case V_FLIPADST: + load_buffer_16x16(input, buf0, stride, 0); + highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); + shift_right_2_round_s32_x4(buf0, buf0, 64); + transpose_arrays_s32_16x16(buf0, buf1); + highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); + store_buffer_16x16(buf1, coeff, /*stride=*/16); + break; + case H_FLIPADST: + load_buffer_16x16(input, buf0, stride, 1); + highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); + shift_right_2_round_s32_x4(buf0, buf0, 64); + transpose_arrays_s32_16x16(buf0, buf1); + highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); + store_buffer_16x16(buf1, coeff, /*stride=*/16); + break; + default: assert(0); + } +} + +typedef void (*fwd_transform_1d_col_neon)(const int16_t *in, int32x4_t *out, + int stride, int bit, int lr_flip); +typedef void (*fwd_transform_1d_col_many_neon)(const int16_t *in, + int32x4_t *out, int stride, + int bit, int lr_flip, + int howmany, int hm_stride); + +typedef void (*fwd_transform_1d_row_neon)(const int32x4_t *in, int32_t *out, + int bit, int stride); +typedef void (*fwd_transform_1d_row_many_neon)(const int32x4_t *in, + int32_t *out, int bit, + int howmany, int hm_stride, + int stride); + +// Construct component kernels that include the load_buffer and store_buffer +// stages to avoid the need to spill loaded data to the stack between these and +// the txfm kernel calls. +// The TRANSFORM_*_ONE cases are only ever called in situations where the +// howmany parameter would be one, so no need for the loop at all in these +// cases. 
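+//
+// As an editorial illustration (this expansion is implied by the macros below,
+// not spelled out upstream), TRANSFORM_COL_MANY(fdct4, 4) produces:
+//
+//   static void highbd_fdct4_col_many_neon(
+//       const int16_t *input, int32x4_t *output, int stride, int cos_bit,
+//       int lr_flip, int howmany, int hm_stride) {
+//     int i = 0;
+//     do {
+//       int32x4_t buf0[4];
+//       load_buffer_4x4(input + 4 * i, buf0, stride, lr_flip);
+//       highbd_fdct4_x4_neon(buf0, output + i * hm_stride, cos_bit);
+//     } while (++i < howmany);
+//   }
+//
+// Each 4-lane column strip is therefore loaded and transformed entirely in
+// registers before the next strip is touched.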
+ +#define TRANSFORM_COL_ONE(name, n) \ + static void highbd_##name##_col_neon(const int16_t *input, \ + int32x4_t *output, int stride, \ + int cos_bit, int lr_flip) { \ + int32x4_t buf0[n]; \ + load_buffer_4x##n(input, buf0, stride, lr_flip); \ + highbd_##name##_x4_neon(buf0, output, cos_bit); \ + } + +#define TRANSFORM_COL_MANY(name, n) \ + static void highbd_##name##_col_many_neon( \ + const int16_t *input, int32x4_t *output, int stride, int cos_bit, \ + int lr_flip, int howmany, int hm_stride) { \ + int i = 0; \ + do { \ + int32x4_t buf0[n]; \ + load_buffer_4x##n(input + 4 * i, buf0, stride, lr_flip); \ + highbd_##name##_x4_neon(buf0, output + i * hm_stride, cos_bit); \ + } while (++i < howmany); \ + } + +#define TRANSFORM_ROW_ONE(name, n) \ + static void highbd_##name##_row_neon( \ + const int32x4_t *input, int32_t *output, int cos_bit, int stride) { \ + int32x4_t buf0[n]; \ + highbd_##name##_x4_neon(input, buf0, cos_bit); \ + store_buffer_##n##x4(buf0, output, stride); \ + } + +#define TRANSFORM_ROW_RECT_ONE(name, n) \ + static void highbd_##name##_row_rect_neon( \ + const int32x4_t *input, int32_t *output, int cos_bit, int stride) { \ + int32x4_t buf0[n]; \ + highbd_##name##_x4_neon(input, buf0, cos_bit); \ + round_rect_array_s32_neon(buf0, buf0, (n)); \ + store_buffer_##n##x4(buf0, output, stride); \ + } + +#define TRANSFORM_ROW_MANY(name, n) \ + static void highbd_##name##_row_many_neon( \ + const int32x4_t *input, int32_t *output, int cos_bit, int howmany, \ + int hm_stride, int stride) { \ + int i = 0; \ + do { \ + int32x4_t buf0[n]; \ + highbd_##name##_x4_neon(input + hm_stride * i, buf0, cos_bit); \ + store_buffer_##n##x4(buf0, output + 4 * i, stride); \ + } while (++i < howmany); \ + } + +#define TRANSFORM_ROW_RECT_MANY(name, n) \ + static void highbd_##name##_row_rect_many_neon( \ + const int32x4_t *input, int32_t *output, int cos_bit, int howmany, \ + int hm_stride, int stride) { \ + int i = 0; \ + do { \ + int32x4_t buf0[n]; \ + highbd_##name##_x4_neon(input + hm_stride * i, buf0, cos_bit); \ + round_rect_array_s32_neon(buf0, buf0, (n)); \ + store_buffer_##n##x4(buf0, output + 4 * i, stride); \ + } while (++i < howmany); \ + } + +TRANSFORM_COL_ONE(fdct8, 8) +TRANSFORM_COL_ONE(fadst8, 8) +TRANSFORM_COL_ONE(fidentity8, 8) + +TRANSFORM_COL_MANY(fdct4, 4) +TRANSFORM_COL_MANY(fdct8, 8) +TRANSFORM_COL_MANY(fdct16, 16) +TRANSFORM_COL_MANY(fadst4, 4) +TRANSFORM_COL_MANY(fadst8, 8) +TRANSFORM_COL_MANY(fadst16, 16) +TRANSFORM_COL_MANY(fidentity4, 4) +TRANSFORM_COL_MANY(fidentity8, 8) +TRANSFORM_COL_MANY(fidentity16, 16) + +TRANSFORM_ROW_ONE(fdct16, 16) +TRANSFORM_ROW_ONE(fadst16, 16) +TRANSFORM_ROW_ONE(fidentity16, 16) + +TRANSFORM_ROW_RECT_ONE(fdct8, 8) +TRANSFORM_ROW_RECT_ONE(fadst8, 8) +TRANSFORM_ROW_RECT_ONE(fidentity8, 8) + +#if !CONFIG_REALTIME_ONLY +TRANSFORM_ROW_MANY(fdct4, 4) +TRANSFORM_ROW_MANY(fdct8, 8) +TRANSFORM_ROW_MANY(fadst4, 4) +TRANSFORM_ROW_MANY(fadst8, 8) +TRANSFORM_ROW_MANY(fidentity4, 4) +TRANSFORM_ROW_MANY(fidentity8, 8) +#endif + +TRANSFORM_ROW_RECT_MANY(fdct4, 4) +TRANSFORM_ROW_RECT_MANY(fdct8, 8) +TRANSFORM_ROW_RECT_MANY(fdct16, 16) +TRANSFORM_ROW_RECT_MANY(fadst4, 4) +TRANSFORM_ROW_RECT_MANY(fadst8, 8) +TRANSFORM_ROW_RECT_MANY(fadst16, 16) +TRANSFORM_ROW_RECT_MANY(fidentity4, 4) +TRANSFORM_ROW_RECT_MANY(fidentity8, 8) +TRANSFORM_ROW_RECT_MANY(fidentity16, 16) + +static const fwd_transform_1d_col_many_neon + col_highbd_txfm8_xn_arr[TX_TYPES] = { + highbd_fdct8_col_many_neon, // DCT_DCT + highbd_fadst8_col_many_neon, // ADST_DCT + highbd_fdct8_col_many_neon, 
// DCT_ADST + highbd_fadst8_col_many_neon, // ADST_ADST + highbd_fadst8_col_many_neon, // FLIPADST_DCT + highbd_fdct8_col_many_neon, // DCT_FLIPADST + highbd_fadst8_col_many_neon, // FLIPADST_FLIPADST + highbd_fadst8_col_many_neon, // ADST_FLIPADST + highbd_fadst8_col_many_neon, // FLIPADST_ADST + highbd_fidentity8_col_many_neon, // IDTX + highbd_fdct8_col_many_neon, // V_DCT + highbd_fidentity8_col_many_neon, // H_DCT + highbd_fadst8_col_many_neon, // V_ADST + highbd_fidentity8_col_many_neon, // H_ADST + highbd_fadst8_col_many_neon, // V_FLIPADST + highbd_fidentity8_col_many_neon // H_FLIPADST + }; + +static const fwd_transform_1d_col_neon col_highbd_txfm8_x4_arr[TX_TYPES] = { + highbd_fdct8_col_neon, // DCT_DCT + highbd_fadst8_col_neon, // ADST_DCT + highbd_fdct8_col_neon, // DCT_ADST + highbd_fadst8_col_neon, // ADST_ADST + highbd_fadst8_col_neon, // FLIPADST_DCT + highbd_fdct8_col_neon, // DCT_FLIPADST + highbd_fadst8_col_neon, // FLIPADST_FLIPADST + highbd_fadst8_col_neon, // ADST_FLIPADST + highbd_fadst8_col_neon, // FLIPADST_ADST + highbd_fidentity8_col_neon, // IDTX + highbd_fdct8_col_neon, // V_DCT + highbd_fidentity8_col_neon, // H_DCT + highbd_fadst8_col_neon, // V_ADST + highbd_fidentity8_col_neon, // H_ADST + highbd_fadst8_col_neon, // V_FLIPADST + highbd_fidentity8_col_neon // H_FLIPADST +}; + +static const fwd_transform_1d_col_many_neon + col_highbd_txfm16_xn_arr[TX_TYPES] = { + highbd_fdct16_col_many_neon, // DCT_DCT + highbd_fadst16_col_many_neon, // ADST_DCT + highbd_fdct16_col_many_neon, // DCT_ADST + highbd_fadst16_col_many_neon, // ADST_ADST + highbd_fadst16_col_many_neon, // FLIPADST_DCT + highbd_fdct16_col_many_neon, // DCT_FLIPADST + highbd_fadst16_col_many_neon, // FLIPADST_FLIPADST + highbd_fadst16_col_many_neon, // ADST_FLIPADST + highbd_fadst16_col_many_neon, // FLIPADST_ADST + highbd_fidentity16_col_many_neon, // IDTX + highbd_fdct16_col_many_neon, // V_DCT + highbd_fidentity16_col_many_neon, // H_DCT + highbd_fadst16_col_many_neon, // V_ADST + highbd_fidentity16_col_many_neon, // H_ADST + highbd_fadst16_col_many_neon, // V_FLIPADST + highbd_fidentity16_col_many_neon // H_FLIPADST + }; + +static const fwd_transform_1d_col_many_neon + col_highbd_txfm4_xn_arr[TX_TYPES] = { + highbd_fdct4_col_many_neon, // DCT_DCT + highbd_fadst4_col_many_neon, // ADST_DCT + highbd_fdct4_col_many_neon, // DCT_ADST + highbd_fadst4_col_many_neon, // ADST_ADST + highbd_fadst4_col_many_neon, // FLIPADST_DCT + highbd_fdct4_col_many_neon, // DCT_FLIPADST + highbd_fadst4_col_many_neon, // FLIPADST_FLIPADST + highbd_fadst4_col_many_neon, // ADST_FLIPADST + highbd_fadst4_col_many_neon, // FLIPADST_ADST + highbd_fidentity4_col_many_neon, // IDTX + highbd_fdct4_col_many_neon, // V_DCT + highbd_fidentity4_col_many_neon, // H_DCT + highbd_fadst4_col_many_neon, // V_ADST + highbd_fidentity4_col_many_neon, // H_ADST + highbd_fadst4_col_many_neon, // V_FLIPADST + highbd_fidentity4_col_many_neon // H_FLIPADST + }; + +static const fwd_transform_1d_row_neon row_highbd_txfm16_xn_arr[TX_TYPES] = { + highbd_fdct16_row_neon, // DCT_DCT + highbd_fdct16_row_neon, // ADST_DCT + highbd_fadst16_row_neon, // DCT_ADST + highbd_fadst16_row_neon, // ADST_ADST + highbd_fdct16_row_neon, // FLIPADST_DCT + highbd_fadst16_row_neon, // DCT_FLIPADST + highbd_fadst16_row_neon, // FLIPADST_FLIPADST + highbd_fadst16_row_neon, // ADST_FLIPADST + highbd_fadst16_row_neon, // FLIPADST_ADST + highbd_fidentity16_row_neon, // IDTX + highbd_fidentity16_row_neon, // V_DCT + highbd_fdct16_row_neon, // H_DCT + 
highbd_fidentity16_row_neon, // V_ADST + highbd_fadst16_row_neon, // H_ADST + highbd_fidentity16_row_neon, // V_FLIPADST + highbd_fadst16_row_neon // H_FLIPADST +}; + +static const fwd_transform_1d_row_many_neon + row_rect_highbd_txfm16_xn_arr[TX_TYPES] = { + highbd_fdct16_row_rect_many_neon, // DCT_DCT + highbd_fdct16_row_rect_many_neon, // ADST_DCT + highbd_fadst16_row_rect_many_neon, // DCT_ADST + highbd_fadst16_row_rect_many_neon, // ADST_ADST + highbd_fdct16_row_rect_many_neon, // FLIPADST_DCT + highbd_fadst16_row_rect_many_neon, // DCT_FLIPADST + highbd_fadst16_row_rect_many_neon, // FLIPADST_FLIPADST + highbd_fadst16_row_rect_many_neon, // ADST_FLIPADST + highbd_fadst16_row_rect_many_neon, // FLIPADST_ADST + highbd_fidentity16_row_rect_many_neon, // IDTX + highbd_fidentity16_row_rect_many_neon, // V_DCT + highbd_fdct16_row_rect_many_neon, // H_DCT + highbd_fidentity16_row_rect_many_neon, // V_ADST + highbd_fadst16_row_rect_many_neon, // H_ADST + highbd_fidentity16_row_rect_many_neon, // V_FLIPADST + highbd_fadst16_row_rect_many_neon // H_FLIPADST + }; + +#if !CONFIG_REALTIME_ONLY +static const fwd_transform_1d_row_many_neon + row_highbd_txfm8_xn_arr[TX_TYPES] = { + highbd_fdct8_row_many_neon, // DCT_DCT + highbd_fdct8_row_many_neon, // ADST_DCT + highbd_fadst8_row_many_neon, // DCT_ADST + highbd_fadst8_row_many_neon, // ADST_ADST + highbd_fdct8_row_many_neon, // FLIPADST_DCT + highbd_fadst8_row_many_neon, // DCT_FLIPADST + highbd_fadst8_row_many_neon, // FLIPADST_FLIPADST + highbd_fadst8_row_many_neon, // ADST_FLIPADST + highbd_fadst8_row_many_neon, // FLIPADST_ADST + highbd_fidentity8_row_many_neon, // IDTX + highbd_fidentity8_row_many_neon, // V_DCT + highbd_fdct8_row_many_neon, // H_DCT + highbd_fidentity8_row_many_neon, // V_ADST + highbd_fadst8_row_many_neon, // H_ADST + highbd_fidentity8_row_many_neon, // V_FLIPADST + highbd_fadst8_row_many_neon // H_FLIPADST + }; +#endif + +static const fwd_transform_1d_row_many_neon + row_rect_highbd_txfm8_xn_arr[TX_TYPES] = { + highbd_fdct8_row_rect_many_neon, // DCT_DCT + highbd_fdct8_row_rect_many_neon, // ADST_DCT + highbd_fadst8_row_rect_many_neon, // DCT_ADST + highbd_fadst8_row_rect_many_neon, // ADST_ADST + highbd_fdct8_row_rect_many_neon, // FLIPADST_DCT + highbd_fadst8_row_rect_many_neon, // DCT_FLIPADST + highbd_fadst8_row_rect_many_neon, // FLIPADST_FLIPADST + highbd_fadst8_row_rect_many_neon, // ADST_FLIPADST + highbd_fadst8_row_rect_many_neon, // FLIPADST_ADST + highbd_fidentity8_row_rect_many_neon, // IDTX + highbd_fidentity8_row_rect_many_neon, // V_DCT + highbd_fdct8_row_rect_many_neon, // H_DCT + highbd_fidentity8_row_rect_many_neon, // V_ADST + highbd_fadst8_row_rect_many_neon, // H_ADST + highbd_fidentity8_row_rect_many_neon, // V_FLIPADST + highbd_fadst8_row_rect_many_neon // H_FLIPADST + }; + +static const fwd_transform_1d_row_neon row_highbd_txfm8_x4_arr[TX_TYPES] = { + highbd_fdct8_row_rect_neon, // DCT_DCT + highbd_fdct8_row_rect_neon, // ADST_DCT + highbd_fadst8_row_rect_neon, // DCT_ADST + highbd_fadst8_row_rect_neon, // ADST_ADST + highbd_fdct8_row_rect_neon, // FLIPADST_DCT + highbd_fadst8_row_rect_neon, // DCT_FLIPADST + highbd_fadst8_row_rect_neon, // FLIPADST_FLIPADST + highbd_fadst8_row_rect_neon, // ADST_FLIPADST + highbd_fadst8_row_rect_neon, // FLIPADST_ADST + highbd_fidentity8_row_rect_neon, // IDTX + highbd_fidentity8_row_rect_neon, // V_DCT + highbd_fdct8_row_rect_neon, // H_DCT + highbd_fidentity8_row_rect_neon, // V_ADST + highbd_fadst8_row_rect_neon, // H_ADST + highbd_fidentity8_row_rect_neon, // 
V_FLIPADST + highbd_fadst8_row_rect_neon // H_FLIPADST +}; + +#if !CONFIG_REALTIME_ONLY +static const fwd_transform_1d_row_many_neon + row_highbd_txfm4_xn_arr[TX_TYPES] = { + highbd_fdct4_row_many_neon, // DCT_DCT + highbd_fdct4_row_many_neon, // ADST_DCT + highbd_fadst4_row_many_neon, // DCT_ADST + highbd_fadst4_row_many_neon, // ADST_ADST + highbd_fdct4_row_many_neon, // FLIPADST_DCT + highbd_fadst4_row_many_neon, // DCT_FLIPADST + highbd_fadst4_row_many_neon, // FLIPADST_FLIPADST + highbd_fadst4_row_many_neon, // ADST_FLIPADST + highbd_fadst4_row_many_neon, // FLIPADST_ADST + highbd_fidentity4_row_many_neon, // IDTX + highbd_fidentity4_row_many_neon, // V_DCT + highbd_fdct4_row_many_neon, // H_DCT + highbd_fidentity4_row_many_neon, // V_ADST + highbd_fadst4_row_many_neon, // H_ADST + highbd_fidentity4_row_many_neon, // V_FLIPADST + highbd_fadst4_row_many_neon // H_FLIPADST + }; +#endif + +static const fwd_transform_1d_row_many_neon + row_rect_highbd_txfm4_xn_arr[TX_TYPES] = { + highbd_fdct4_row_rect_many_neon, // DCT_DCT + highbd_fdct4_row_rect_many_neon, // ADST_DCT + highbd_fadst4_row_rect_many_neon, // DCT_ADST + highbd_fadst4_row_rect_many_neon, // ADST_ADST + highbd_fdct4_row_rect_many_neon, // FLIPADST_DCT + highbd_fadst4_row_rect_many_neon, // DCT_FLIPADST + highbd_fadst4_row_rect_many_neon, // FLIPADST_FLIPADST + highbd_fadst4_row_rect_many_neon, // ADST_FLIPADST + highbd_fadst4_row_rect_many_neon, // FLIPADST_ADST + highbd_fidentity4_row_rect_many_neon, // IDTX + highbd_fidentity4_row_rect_many_neon, // V_DCT + highbd_fdct4_row_rect_many_neon, // H_DCT + highbd_fidentity4_row_rect_many_neon, // V_ADST + highbd_fadst4_row_rect_many_neon, // H_ADST + highbd_fidentity4_row_rect_many_neon, // V_FLIPADST + highbd_fadst4_row_rect_many_neon // H_FLIPADST + }; + +static void highbd_fdct32_x4_neon(const int32x4_t *input, int32x4_t *output, + int cos_bit) { + const int32_t *const cospi = cospi_arr_s32(cos_bit); + const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit); + + // Workspaces for intermediate transform steps. 
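+  // buf0 and buf1 below are used in ping-pong fashion: each butterfly stage
+  // reads the previous stage's results from one buffer and writes into the
+  // other, so no copies are needed between stages. As a rough scalar sketch
+  // (assuming the butterfly helpers implement the usual DCT plane rotation),
+  // one butterfly pair computes approximately:
+  //   out0 = round_shift(w0 * in0 + w1 * in1, cos_bit);
+  //   out1 = round_shift(w1 * in0 - w0 * in1, cos_bit);
+  // with w0/w1 drawn from the cospi table at the given angle, and each
+  // int32x4_t lane carrying one of the 4 columns processed in parallel.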
+ int32x4_t buf0[32]; + int32x4_t buf1[32]; + + // stage 1 + butterfly_dct_pre(input, buf1, 32); + + // stage 2 + butterfly_dct_pre(buf1, buf0, 16); + buf0[16] = buf1[16]; + buf0[17] = buf1[17]; + buf0[18] = buf1[18]; + buf0[19] = buf1[19]; + butterfly_0112_neon(cospi, 32, buf1[27], buf1[20], &buf0[27], &buf0[20], + v_cos_bit); + butterfly_0112_neon(cospi, 32, buf1[26], buf1[21], &buf0[26], &buf0[21], + v_cos_bit); + butterfly_0112_neon(cospi, 32, buf1[25], buf1[22], &buf0[25], &buf0[22], + v_cos_bit); + butterfly_0112_neon(cospi, 32, buf1[24], buf1[23], &buf0[24], &buf0[23], + v_cos_bit); + buf0[28] = buf1[28]; + buf0[29] = buf1[29]; + buf0[30] = buf1[30]; + buf0[31] = buf1[31]; + + // stage 3 + butterfly_dct_pre(buf0, buf1, 8); + buf1[8] = buf0[8]; + buf1[9] = buf0[9]; + butterfly_0112_neon(cospi, 32, buf0[13], buf0[10], &buf1[13], &buf1[10], + v_cos_bit); + butterfly_0112_neon(cospi, 32, buf0[12], buf0[11], &buf1[12], &buf1[11], + v_cos_bit); + buf1[14] = buf0[14]; + buf1[15] = buf0[15]; + butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 16); + + // stage 4 + butterfly_dct_pre(buf1, buf0, 4); + buf0[4] = buf1[4]; + butterfly_0112_neon(cospi, 32, buf1[6], buf1[5], &buf0[6], &buf0[5], + v_cos_bit); + buf0[7] = buf1[7]; + butterfly_dct_post(buf1 + 8, buf1 + 8, buf0 + 8, 8); + buf0[16] = buf1[16]; + buf0[17] = buf1[17]; + butterfly_0112_neon(cospi, 16, buf1[29], buf1[18], &buf0[29], &buf0[18], + v_cos_bit); + butterfly_0112_neon(cospi, 16, buf1[28], buf1[19], &buf0[28], &buf0[19], + v_cos_bit); + butterfly_2312_neon(cospi, 16, buf1[27], buf1[20], &buf0[20], &buf0[27], + v_cos_bit); + butterfly_2312_neon(cospi, 16, buf1[26], buf1[21], &buf0[21], &buf0[26], + v_cos_bit); + buf0[22] = buf1[22]; + buf0[23] = buf1[23]; + buf0[24] = buf1[24]; + buf0[25] = buf1[25]; + buf0[30] = buf1[30]; + buf0[31] = buf1[31]; + + // stage 5 + butterfly_0112_neon(cospi, 32, buf0[0], buf0[1], &buf1[0], &buf1[1], + v_cos_bit); + butterfly_0112_neon(cospi, 16, buf0[3], buf0[2], &buf1[2], &buf1[3], + v_cos_bit); + butterfly_dct_post(buf0 + 4, buf0 + 4, buf1 + 4, 4); + buf1[8] = buf0[8]; + butterfly_0112_neon(cospi, 16, buf0[14], buf0[9], &buf1[14], &buf1[9], + v_cos_bit); + butterfly_2312_neon(cospi, 16, buf0[13], buf0[10], &buf1[10], &buf1[13], + v_cos_bit); + buf1[11] = buf0[11]; + buf1[12] = buf0[12]; + buf1[15] = buf0[15]; + butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 8); + butterfly_dct_post(buf0 + 24, buf0 + 24, buf1 + 24, 8); + + // stage 6 + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + buf0[2] = buf1[2]; + buf0[3] = buf1[3]; + + butterfly_0112_neon(cospi, 8, buf1[7], buf1[4], &buf0[4], &buf0[7], + v_cos_bit); + butterfly_0112_neon(cospi, 8, buf1[30], buf1[17], &buf0[30], &buf0[17], + v_cos_bit); + butterfly_2312_neon(cospi, 8, buf1[29], buf1[18], &buf0[18], &buf0[29], + v_cos_bit); + butterfly_dct_post(buf1 + 8, buf1 + 8, buf0 + 8, 4); + butterfly_dct_post(buf1 + 12, buf1 + 12, buf0 + 12, 4); + buf0[16] = buf1[16]; + buf0[19] = buf1[19]; + buf0[20] = buf1[20]; + + butterfly_0130_neon(cospi, 24, buf1[5], buf1[6], &buf0[5], &buf0[6], + v_cos_bit); + butterfly_0130_neon(cospi, 24, buf1[21], buf1[26], &buf0[26], &buf0[21], + v_cos_bit); + butterfly_0332_neon(cospi, 24, buf1[25], buf1[22], &buf0[25], &buf0[22], + v_cos_bit); + + buf0[23] = buf1[23]; + buf0[24] = buf1[24]; + buf0[27] = buf1[27]; + buf0[28] = buf1[28]; + buf0[31] = buf1[31]; + + // stage 7 + buf1[0] = buf0[0]; + buf1[1] = buf0[1]; + buf1[2] = buf0[2]; + buf1[3] = buf0[3]; + buf1[4] = buf0[4]; + buf1[5] = buf0[5]; + buf1[6] = buf0[6]; + 
buf1[7] = buf0[7]; + butterfly_0112_neon(cospi, 4, buf0[15], buf0[8], &buf1[8], &buf1[15], + v_cos_bit); + butterfly_0130_neon(cospi, 28, buf0[9], buf0[14], &buf1[9], &buf1[14], + v_cos_bit); + butterfly_0112_neon(cospi, 20, buf0[13], buf0[10], &buf1[10], &buf1[13], + v_cos_bit); + butterfly_0130_neon(cospi, 12, buf0[11], buf0[12], &buf1[11], &buf1[12], + v_cos_bit); + butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 4); + butterfly_dct_post(buf0 + 20, buf0 + 20, buf1 + 20, 4); + butterfly_dct_post(buf0 + 24, buf0 + 24, buf1 + 24, 4); + butterfly_dct_post(buf0 + 28, buf0 + 28, buf1 + 28, 4); + + // stage 8 + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + buf0[2] = buf1[2]; + buf0[3] = buf1[3]; + buf0[4] = buf1[4]; + buf0[5] = buf1[5]; + buf0[6] = buf1[6]; + buf0[7] = buf1[7]; + buf0[8] = buf1[8]; + buf0[9] = buf1[9]; + buf0[10] = buf1[10]; + buf0[11] = buf1[11]; + buf0[12] = buf1[12]; + buf0[13] = buf1[13]; + buf0[14] = buf1[14]; + buf0[15] = buf1[15]; + butterfly_0112_neon(cospi, 2, buf1[31], buf1[16], &buf0[16], &buf0[31], + v_cos_bit); + butterfly_0130_neon(cospi, 30, buf1[17], buf1[30], &buf0[17], &buf0[30], + v_cos_bit); + butterfly_0112_neon(cospi, 18, buf1[29], buf1[18], &buf0[18], &buf0[29], + v_cos_bit); + butterfly_0130_neon(cospi, 14, buf1[19], buf1[28], &buf0[19], &buf0[28], + v_cos_bit); + butterfly_0112_neon(cospi, 10, buf1[27], buf1[20], &buf0[20], &buf0[27], + v_cos_bit); + butterfly_0130_neon(cospi, 22, buf1[21], buf1[26], &buf0[21], &buf0[26], + v_cos_bit); + butterfly_0112_neon(cospi, 26, buf1[25], buf1[22], &buf0[22], &buf0[25], + v_cos_bit); + butterfly_0130_neon(cospi, 6, buf1[23], buf1[24], &buf0[23], &buf0[24], + v_cos_bit); + + // stage 9 + output[0] = buf0[0]; + output[1] = buf0[16]; + output[2] = buf0[8]; + output[3] = buf0[24]; + output[4] = buf0[4]; + output[5] = buf0[20]; + output[6] = buf0[12]; + output[7] = buf0[28]; + output[8] = buf0[2]; + output[9] = buf0[18]; + output[10] = buf0[10]; + output[11] = buf0[26]; + output[12] = buf0[6]; + output[13] = buf0[22]; + output[14] = buf0[14]; + output[15] = buf0[30]; + output[16] = buf0[1]; + output[17] = buf0[17]; + output[18] = buf0[9]; + output[19] = buf0[25]; + output[20] = buf0[5]; + output[21] = buf0[21]; + output[22] = buf0[13]; + output[23] = buf0[29]; + output[24] = buf0[3]; + output[25] = buf0[19]; + output[26] = buf0[11]; + output[27] = buf0[27]; + output[28] = buf0[7]; + output[29] = buf0[23]; + output[30] = buf0[15]; + output[31] = buf0[31]; +} + +static void highbd_fdct64_x4_neon(const int32x4_t *input, int32x4_t *output, + int8_t cos_bit) { + const int32_t *const cospi = cospi_arr_s32(cos_bit); + const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit); + + // stage 1 + int32x4_t x1[64]; + butterfly_dct_pre(input, x1, 64); + + // stage 2 + int32x4_t x2[64]; + butterfly_dct_pre(x1, x2, 32); + x2[32] = x1[32]; + x2[33] = x1[33]; + x2[34] = x1[34]; + x2[35] = x1[35]; + x2[36] = x1[36]; + x2[37] = x1[37]; + x2[38] = x1[38]; + x2[39] = x1[39]; + butterfly_0112_neon(cospi, 32, x1[55], x1[40], &x2[55], &x2[40], v_cos_bit); + butterfly_0112_neon(cospi, 32, x1[54], x1[41], &x2[54], &x2[41], v_cos_bit); + butterfly_0112_neon(cospi, 32, x1[53], x1[42], &x2[53], &x2[42], v_cos_bit); + butterfly_0112_neon(cospi, 32, x1[52], x1[43], &x2[52], &x2[43], v_cos_bit); + butterfly_0112_neon(cospi, 32, x1[51], x1[44], &x2[51], &x2[44], v_cos_bit); + butterfly_0112_neon(cospi, 32, x1[50], x1[45], &x2[50], &x2[45], v_cos_bit); + butterfly_0112_neon(cospi, 32, x1[49], x1[46], &x2[49], &x2[46], v_cos_bit); + butterfly_0112_neon(cospi, 
32, x1[48], x1[47], &x2[48], &x2[47], v_cos_bit); + x2[56] = x1[56]; + x2[57] = x1[57]; + x2[58] = x1[58]; + x2[59] = x1[59]; + x2[60] = x1[60]; + x2[61] = x1[61]; + x2[62] = x1[62]; + x2[63] = x1[63]; + + // stage 3 + int32x4_t x3[64]; + butterfly_dct_pre(x2, x3, 16); + x3[16] = x2[16]; + x3[17] = x2[17]; + x3[18] = x2[18]; + x3[19] = x2[19]; + butterfly_0112_neon(cospi, 32, x2[27], x2[20], &x3[27], &x3[20], v_cos_bit); + butterfly_0112_neon(cospi, 32, x2[26], x2[21], &x3[26], &x3[21], v_cos_bit); + butterfly_0112_neon(cospi, 32, x2[25], x2[22], &x3[25], &x3[22], v_cos_bit); + butterfly_0112_neon(cospi, 32, x2[24], x2[23], &x3[24], &x3[23], v_cos_bit); + x3[28] = x2[28]; + x3[29] = x2[29]; + x3[30] = x2[30]; + x3[31] = x2[31]; + butterfly_dct_post(x2 + 32, x2 + 32, x3 + 32, 32); + + // stage 4 + int32x4_t x4[64]; + butterfly_dct_pre(x3, x4, 8); + x4[8] = x3[8]; + x4[9] = x3[9]; + butterfly_0112_neon(cospi, 32, x3[13], x3[10], &x4[13], &x4[10], v_cos_bit); + butterfly_0112_neon(cospi, 32, x3[12], x3[11], &x4[12], &x4[11], v_cos_bit); + x4[14] = x3[14]; + x4[15] = x3[15]; + butterfly_dct_post(x3 + 16, x3 + 16, x4 + 16, 16); + x4[32] = x3[32]; + x4[33] = x3[33]; + x4[34] = x3[34]; + x4[35] = x3[35]; + butterfly_0112_neon(cospi, 16, x3[59], x3[36], &x4[59], &x4[36], v_cos_bit); + butterfly_0112_neon(cospi, 16, x3[58], x3[37], &x4[58], &x4[37], v_cos_bit); + butterfly_0112_neon(cospi, 16, x3[57], x3[38], &x4[57], &x4[38], v_cos_bit); + butterfly_0112_neon(cospi, 16, x3[56], x3[39], &x4[56], &x4[39], v_cos_bit); + butterfly_2312_neon(cospi, 16, x3[55], x3[40], &x4[40], &x4[55], v_cos_bit); + butterfly_2312_neon(cospi, 16, x3[54], x3[41], &x4[41], &x4[54], v_cos_bit); + butterfly_2312_neon(cospi, 16, x3[53], x3[42], &x4[42], &x4[53], v_cos_bit); + butterfly_2312_neon(cospi, 16, x3[52], x3[43], &x4[43], &x4[52], v_cos_bit); + x4[44] = x3[44]; + x4[45] = x3[45]; + x4[46] = x3[46]; + x4[47] = x3[47]; + x4[48] = x3[48]; + x4[49] = x3[49]; + x4[50] = x3[50]; + x4[51] = x3[51]; + x4[60] = x3[60]; + x4[61] = x3[61]; + x4[62] = x3[62]; + x4[63] = x3[63]; + + // stage 5 + int32x4_t x5[64]; + butterfly_dct_pre(x4, x5, 4); + x5[4] = x4[4]; + butterfly_0112_neon(cospi, 32, x4[6], x4[5], &x5[6], &x5[5], v_cos_bit); + x5[7] = x4[7]; + butterfly_dct_post(x4 + 8, x4 + 8, x5 + 8, 8); + x5[16] = x4[16]; + x5[17] = x4[17]; + butterfly_0112_neon(cospi, 16, x4[29], x4[18], &x5[29], &x5[18], v_cos_bit); + butterfly_0112_neon(cospi, 16, x4[28], x4[19], &x5[28], &x5[19], v_cos_bit); + butterfly_2312_neon(cospi, 16, x4[27], x4[20], &x5[20], &x5[27], v_cos_bit); + butterfly_2312_neon(cospi, 16, x4[26], x4[21], &x5[21], &x5[26], v_cos_bit); + x5[22] = x4[22]; + x5[23] = x4[23]; + x5[24] = x4[24]; + x5[25] = x4[25]; + x5[30] = x4[30]; + x5[31] = x4[31]; + butterfly_dct_post(x4 + 32, x4 + 32, x5 + 32, 16); + butterfly_dct_post(x4 + 48, x4 + 48, x5 + 48, 16); + + // stage 6 + int32x4_t x6[64]; + butterfly_0112_neon(cospi, 32, x5[0], x5[1], &x6[0], &x6[1], v_cos_bit); + butterfly_0112_neon(cospi, 16, x5[3], x5[2], &x6[2], &x6[3], v_cos_bit); + butterfly_dct_post(x5 + 4, x5 + 4, x6 + 4, 4); + x6[8] = x5[8]; + butterfly_0112_neon(cospi, 16, x5[14], x5[9], &x6[14], &x6[9], v_cos_bit); + butterfly_2312_neon(cospi, 16, x5[13], x5[10], &x6[10], &x6[13], v_cos_bit); + x6[11] = x5[11]; + x6[12] = x5[12]; + x6[15] = x5[15]; + butterfly_dct_post(x5 + 16, x5 + 16, x6 + 16, 8); + butterfly_dct_post(x5 + 24, x5 + 24, x6 + 24, 8); + x6[32] = x5[32]; + x6[33] = x5[33]; + butterfly_0112_neon(cospi, 8, x5[61], x5[34], &x6[61], &x6[34], 
v_cos_bit); + butterfly_0112_neon(cospi, 8, x5[60], x5[35], &x6[60], &x6[35], v_cos_bit); + butterfly_2312_neon(cospi, 8, x5[59], x5[36], &x6[36], &x6[59], v_cos_bit); + butterfly_2312_neon(cospi, 8, x5[58], x5[37], &x6[37], &x6[58], v_cos_bit); + x6[38] = x5[38]; + x6[39] = x5[39]; + x6[40] = x5[40]; + x6[41] = x5[41]; + butterfly_0130_neon(cospi, 24, x5[42], x5[53], &x6[53], &x6[42], v_cos_bit); + butterfly_0130_neon(cospi, 24, x5[43], x5[52], &x6[52], &x6[43], v_cos_bit); + butterfly_0332_neon(cospi, 24, x5[51], x5[44], &x6[51], &x6[44], v_cos_bit); + butterfly_0332_neon(cospi, 24, x5[50], x5[45], &x6[50], &x6[45], v_cos_bit); + x6[46] = x5[46]; + x6[47] = x5[47]; + x6[48] = x5[48]; + x6[49] = x5[49]; + x6[54] = x5[54]; + x6[55] = x5[55]; + x6[56] = x5[56]; + x6[57] = x5[57]; + x6[62] = x5[62]; + x6[63] = x5[63]; + + // stage 7 + int32x4_t x7[64]; + x7[0] = x6[0]; + x7[1] = x6[1]; + x7[2] = x6[2]; + x7[3] = x6[3]; + butterfly_0112_neon(cospi, 8, x6[7], x6[4], &x7[4], &x7[7], v_cos_bit); + butterfly_0130_neon(cospi, 24, x6[5], x6[6], &x7[5], &x7[6], v_cos_bit); + butterfly_dct_post(x6 + 8, x6 + 8, x7 + 8, 4); + butterfly_dct_post(x6 + 12, x6 + 12, x7 + 12, 4); + x7[16] = x6[16]; + butterfly_0112_neon(cospi, 8, x6[30], x6[17], &x7[30], &x7[17], v_cos_bit); + butterfly_2312_neon(cospi, 8, x6[29], x6[18], &x7[18], &x7[29], v_cos_bit); + x7[19] = x6[19]; + x7[20] = x6[20]; + butterfly_0130_neon(cospi, 24, x6[21], x6[26], &x7[26], &x7[21], v_cos_bit); + butterfly_0332_neon(cospi, 24, x6[25], x6[22], &x7[25], &x7[22], v_cos_bit); + x7[23] = x6[23]; + x7[24] = x6[24]; + x7[27] = x6[27]; + x7[28] = x6[28]; + x7[31] = x6[31]; + butterfly_dct_post(x6 + 32, x6 + 32, x7 + 32, 8); + butterfly_dct_post(x6 + 40, x6 + 40, x7 + 40, 8); + butterfly_dct_post(x6 + 48, x6 + 48, x7 + 48, 8); + butterfly_dct_post(x6 + 56, x6 + 56, x7 + 56, 8); + + // stage 8 + int32x4_t x8[64]; + x8[0] = x7[0]; + x8[1] = x7[1]; + x8[2] = x7[2]; + x8[3] = x7[3]; + x8[4] = x7[4]; + x8[5] = x7[5]; + x8[6] = x7[6]; + x8[7] = x7[7]; + + butterfly_0112_neon(cospi, 4, x7[15], x7[8], &x8[8], &x8[15], v_cos_bit); + butterfly_0130_neon(cospi, 28, x7[9], x7[14], &x8[9], &x8[14], v_cos_bit); + butterfly_0112_neon(cospi, 20, x7[13], x7[10], &x8[10], &x8[13], v_cos_bit); + butterfly_0130_neon(cospi, 12, x7[11], x7[12], &x8[11], &x8[12], v_cos_bit); + butterfly_dct_post(x7 + 16, x7 + 16, x8 + 16, 4); + butterfly_dct_post(x7 + 20, x7 + 20, x8 + 20, 4); + butterfly_dct_post(x7 + 24, x7 + 24, x8 + 24, 4); + butterfly_dct_post(x7 + 28, x7 + 28, x8 + 28, 4); + x8[32] = x7[32]; + butterfly_0112_neon(cospi, 4, x7[62], x7[33], &x8[62], &x8[33], v_cos_bit); + butterfly_2312_neon(cospi, 4, x7[61], x7[34], &x8[34], &x8[61], v_cos_bit); + x8[35] = x7[35]; + x8[36] = x7[36]; + butterfly_0130_neon(cospi, 28, x7[37], x7[58], &x8[58], &x8[37], v_cos_bit); + butterfly_0332_neon(cospi, 28, x7[57], x7[38], &x8[57], &x8[38], v_cos_bit); + x8[39] = x7[39]; + x8[40] = x7[40]; + butterfly_0112_neon(cospi, 20, x7[54], x7[41], &x8[54], &x8[41], v_cos_bit); + butterfly_2312_neon(cospi, 20, x7[53], x7[42], &x8[42], &x8[53], v_cos_bit); + x8[43] = x7[43]; + x8[44] = x7[44]; + butterfly_0130_neon(cospi, 12, x7[45], x7[50], &x8[50], &x8[45], v_cos_bit); + butterfly_0332_neon(cospi, 12, x7[49], x7[46], &x8[49], &x8[46], v_cos_bit); + x8[47] = x7[47]; + x8[48] = x7[48]; + x8[51] = x7[51]; + x8[52] = x7[52]; + x8[55] = x7[55]; + x8[56] = x7[56]; + x8[59] = x7[59]; + x8[60] = x7[60]; + x8[63] = x7[63]; + + // stage 9 + int32x4_t x9[64]; + x9[0] = x8[0]; + x9[1] = x8[1]; + 
x9[2] = x8[2]; + x9[3] = x8[3]; + x9[4] = x8[4]; + x9[5] = x8[5]; + x9[6] = x8[6]; + x9[7] = x8[7]; + x9[8] = x8[8]; + x9[9] = x8[9]; + x9[10] = x8[10]; + x9[11] = x8[11]; + x9[12] = x8[12]; + x9[13] = x8[13]; + x9[14] = x8[14]; + x9[15] = x8[15]; + butterfly_0112_neon(cospi, 2, x8[31], x8[16], &x9[16], &x9[31], v_cos_bit); + butterfly_0130_neon(cospi, 30, x8[17], x8[30], &x9[17], &x9[30], v_cos_bit); + butterfly_0112_neon(cospi, 18, x8[29], x8[18], &x9[18], &x9[29], v_cos_bit); + butterfly_0130_neon(cospi, 14, x8[19], x8[28], &x9[19], &x9[28], v_cos_bit); + butterfly_0112_neon(cospi, 10, x8[27], x8[20], &x9[20], &x9[27], v_cos_bit); + butterfly_0130_neon(cospi, 22, x8[21], x8[26], &x9[21], &x9[26], v_cos_bit); + butterfly_0112_neon(cospi, 26, x8[25], x8[22], &x9[22], &x9[25], v_cos_bit); + butterfly_0130_neon(cospi, 6, x8[23], x8[24], &x9[23], &x9[24], v_cos_bit); + butterfly_dct_post(x8 + 32, x8 + 32, x9 + 32, 4); + butterfly_dct_post(x8 + 36, x8 + 36, x9 + 36, 4); + butterfly_dct_post(x8 + 40, x8 + 40, x9 + 40, 4); + butterfly_dct_post(x8 + 44, x8 + 44, x9 + 44, 4); + butterfly_dct_post(x8 + 48, x8 + 48, x9 + 48, 4); + butterfly_dct_post(x8 + 52, x8 + 52, x9 + 52, 4); + butterfly_dct_post(x8 + 56, x8 + 56, x9 + 56, 4); + butterfly_dct_post(x8 + 60, x8 + 60, x9 + 60, 4); + + // stage 10 + int32x4_t x10[64]; + x10[0] = x9[0]; + x10[1] = x9[1]; + x10[2] = x9[2]; + x10[3] = x9[3]; + x10[4] = x9[4]; + x10[5] = x9[5]; + x10[6] = x9[6]; + x10[7] = x9[7]; + x10[8] = x9[8]; + x10[9] = x9[9]; + x10[10] = x9[10]; + x10[11] = x9[11]; + x10[12] = x9[12]; + x10[13] = x9[13]; + x10[14] = x9[14]; + x10[15] = x9[15]; + x10[16] = x9[16]; + x10[17] = x9[17]; + x10[18] = x9[18]; + x10[19] = x9[19]; + x10[20] = x9[20]; + x10[21] = x9[21]; + x10[22] = x9[22]; + x10[23] = x9[23]; + x10[24] = x9[24]; + x10[25] = x9[25]; + x10[26] = x9[26]; + x10[27] = x9[27]; + x10[28] = x9[28]; + x10[29] = x9[29]; + x10[30] = x9[30]; + x10[31] = x9[31]; + butterfly_0112_neon(cospi, 1, x9[63], x9[32], &x10[32], &x10[63], v_cos_bit); + butterfly_0130_neon(cospi, 31, x9[33], x9[62], &x10[33], &x10[62], v_cos_bit); + butterfly_0112_neon(cospi, 17, x9[61], x9[34], &x10[34], &x10[61], v_cos_bit); + butterfly_0130_neon(cospi, 15, x9[35], x9[60], &x10[35], &x10[60], v_cos_bit); + butterfly_0112_neon(cospi, 9, x9[59], x9[36], &x10[36], &x10[59], v_cos_bit); + butterfly_0130_neon(cospi, 23, x9[37], x9[58], &x10[37], &x10[58], v_cos_bit); + butterfly_0112_neon(cospi, 25, x9[57], x9[38], &x10[38], &x10[57], v_cos_bit); + butterfly_0130_neon(cospi, 7, x9[39], x9[56], &x10[39], &x10[56], v_cos_bit); + butterfly_0112_neon(cospi, 5, x9[55], x9[40], &x10[40], &x10[55], v_cos_bit); + butterfly_0130_neon(cospi, 27, x9[41], x9[54], &x10[41], &x10[54], v_cos_bit); + butterfly_0112_neon(cospi, 21, x9[53], x9[42], &x10[42], &x10[53], v_cos_bit); + butterfly_0130_neon(cospi, 11, x9[43], x9[52], &x10[43], &x10[52], v_cos_bit); + butterfly_0112_neon(cospi, 13, x9[51], x9[44], &x10[44], &x10[51], v_cos_bit); + butterfly_0130_neon(cospi, 19, x9[45], x9[50], &x10[45], &x10[50], v_cos_bit); + butterfly_0112_neon(cospi, 29, x9[49], x9[46], &x10[46], &x10[49], v_cos_bit); + butterfly_0130_neon(cospi, 3, x9[47], x9[48], &x10[47], &x10[48], v_cos_bit); + + // stage 11 + output[0] = x10[0]; + output[1] = x10[32]; + output[2] = x10[16]; + output[3] = x10[48]; + output[4] = x10[8]; + output[5] = x10[40]; + output[6] = x10[24]; + output[7] = x10[56]; + output[8] = x10[4]; + output[9] = x10[36]; + output[10] = x10[20]; + output[11] = x10[52]; + output[12] = 
x10[12]; + output[13] = x10[44]; + output[14] = x10[28]; + output[15] = x10[60]; + output[16] = x10[2]; + output[17] = x10[34]; + output[18] = x10[18]; + output[19] = x10[50]; + output[20] = x10[10]; + output[21] = x10[42]; + output[22] = x10[26]; + output[23] = x10[58]; + output[24] = x10[6]; + output[25] = x10[38]; + output[26] = x10[22]; + output[27] = x10[54]; + output[28] = x10[14]; + output[29] = x10[46]; + output[30] = x10[30]; + output[31] = x10[62]; + output[32] = x10[1]; + output[33] = x10[33]; + output[34] = x10[17]; + output[35] = x10[49]; + output[36] = x10[9]; + output[37] = x10[41]; + output[38] = x10[25]; + output[39] = x10[57]; + output[40] = x10[5]; + output[41] = x10[37]; + output[42] = x10[21]; + output[43] = x10[53]; + output[44] = x10[13]; + output[45] = x10[45]; + output[46] = x10[29]; + output[47] = x10[61]; + output[48] = x10[3]; + output[49] = x10[35]; + output[50] = x10[19]; + output[51] = x10[51]; + output[52] = x10[11]; + output[53] = x10[43]; + output[54] = x10[27]; + output[55] = x10[59]; + output[56] = x10[7]; + output[57] = x10[39]; + output[58] = x10[23]; + output[59] = x10[55]; + output[60] = x10[15]; + output[61] = x10[47]; + output[62] = x10[31]; + output[63] = x10[63]; +} + +static void highbd_fidentity32_x4_neon(const int32x4_t *input, + int32x4_t *output, int cos_bit) { + (void)cos_bit; + for (int i = 0; i < 32; i++) { + output[i] = vshlq_n_s32(input[i], 2); + } +} + +TRANSFORM_COL_MANY(fdct32, 32) +TRANSFORM_COL_MANY(fidentity32, 32) + +static const fwd_transform_1d_col_many_neon + col_highbd_txfm32_x4_arr[TX_TYPES] = { + highbd_fdct32_col_many_neon, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + highbd_fidentity32_col_many_neon, // IDTX + NULL, // V_DCT + NULL, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST + }; + +TRANSFORM_ROW_MANY(fdct32, 32) +TRANSFORM_ROW_MANY(fidentity32, 32) + +static const fwd_transform_1d_row_many_neon + row_highbd_txfm32_x4_arr[TX_TYPES] = { + highbd_fdct32_row_many_neon, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + highbd_fidentity32_row_many_neon, // IDTX + NULL, // V_DCT + NULL, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST + }; + +TRANSFORM_ROW_RECT_MANY(fdct32, 32) +TRANSFORM_ROW_RECT_MANY(fidentity32, 32) + +static const fwd_transform_1d_row_many_neon + row_rect_highbd_txfm32_x4_arr[TX_TYPES] = { + highbd_fdct32_row_rect_many_neon, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + highbd_fidentity32_row_rect_many_neon, // IDTX + NULL, // V_DCT + NULL, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST + }; + +void av1_fwd_txfm2d_16x8_neon(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + (void)bd; + const fwd_transform_1d_col_many_neon col_txfm = + col_highbd_txfm8_xn_arr[tx_type]; + const fwd_transform_1d_row_many_neon row_txfm = + row_rect_highbd_txfm16_xn_arr[tx_type]; + int bit = av1_fwd_cos_bit_col[2][1]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + 
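+  // Flip handling: an up-down flip is applied by ud_adjust_input_and_stride
+  // below (repointing `input` at the bottom row and negating `stride`),
+  // while a left-right flip is applied by writing the transformed columns in
+  // reverse order (starting at buf0 + 3 * 8 and stepping with a negative
+  // hm_stride), as in the lr_flip branch that follows.
+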
ud_adjust_input_and_stride(ud_flip, &input, &stride, 8); + + // Column-wise transform. + int32x4_t buf0[32]; + if (lr_flip) { + col_txfm(input, buf0 + 3 * 8, stride, bit, /*lr_flip=*/1, /*howmany=*/4, + /*hm_stride=*/-8); + } else { + col_txfm(input, buf0, stride, bit, /*lr_flip=*/0, /*howmany=*/4, + /*hm_stride=*/8); + } + shift_right_2_round_s32_x4(buf0, buf0, 32); + + int32x4_t buf1[32]; + transpose_arrays_s32_16x8(buf0, buf1); + + // Row-wise transform. + row_txfm(buf1, coeff, bit, /*howmany=*/2, /*hm_stride=*/16, /*stride=*/8); +} + +void av1_fwd_txfm2d_8x16_neon(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + (void)bd; + const fwd_transform_1d_col_many_neon col_txfm = + col_highbd_txfm16_xn_arr[tx_type]; + const fwd_transform_1d_row_many_neon row_txfm = + row_rect_highbd_txfm8_xn_arr[tx_type]; + int bit = av1_fwd_cos_bit_col[1][2]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 16); + + // Column-wise transform. + int32x4_t buf0[32]; + if (lr_flip) { + col_txfm(input, buf0 + 16, stride, bit, /*lr_flip=*/1, /*howmany=*/2, + /*hm_stride=*/-16); + } else { + col_txfm(input, buf0, stride, bit, /*lr_flip=*/0, /*howmany=*/2, + /*hm_stride=*/16); + } + shift_right_2_round_s32_x4(buf0, buf0, 32); + + int32x4_t buf1[32]; + transpose_arrays_s32_8x16(buf0, buf1); + + // Row-wise transform. + row_txfm(buf1, coeff, bit, /*howmany=*/4, /*hm_stride=*/8, /*stride=*/16); +} + +#if !CONFIG_REALTIME_ONLY +void av1_fwd_txfm2d_4x16_neon(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + (void)bd; + int bitcol = av1_fwd_cos_bit_col[0][2]; + int bitrow = av1_fwd_cos_bit_row[0][2]; + const fwd_transform_1d_col_many_neon col_txfm = + col_highbd_txfm16_xn_arr[tx_type]; + const fwd_transform_1d_row_many_neon row_txfm = + row_highbd_txfm4_xn_arr[tx_type]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 16); + + // Column-wise transform. + int32x4_t buf0[16]; + if (lr_flip) { + col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/1, /*howmany=*/1, + /*hm_stride=*/0); + } else { + col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/1, + /*hm_stride=*/0); + } + shift_right_1_round_s32_x4(buf0, buf0, 16); + + int32x4_t buf1[16]; + transpose_arrays_s32_4x16(buf0, buf1); + + // Row-wise transform. + row_txfm(buf1, coeff, bitrow, /*howmany=*/4, /*hm_stride=*/4, /*stride=*/16); +} +#endif + +void av1_fwd_txfm2d_16x4_neon(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + (void)bd; + int bitcol = av1_fwd_cos_bit_col[2][0]; + int bitrow = av1_fwd_cos_bit_row[2][0]; + const fwd_transform_1d_col_many_neon col_txfm = + col_highbd_txfm4_xn_arr[tx_type]; + const fwd_transform_1d_row_neon row_txfm = row_highbd_txfm16_xn_arr[tx_type]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 4); + + // Column-wise transform. + int32x4_t buf0[16]; + if (lr_flip) { + col_txfm(input, buf0 + 3 * 4, stride, bitcol, /*lr_flip=*/1, /*howmany=*/4, + /*hm_stride=*/-4); + } else { + col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/4, + /*hm_stride=*/4); + } + + shift_right_1_round_s32_x4(buf0, buf0, 16); + transpose_arrays_s32_4x16(buf0, buf0); + + // Row-wise transform. 
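+  // Only 4 rows remain after the transpose here, so unlike the larger blocks
+  // above the 16-point row kernel is invoked just once, transforming all 4
+  // rows simultaneously (one row per int32x4_t lane) with no
+  // howmany/hm_stride bookkeeping.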
+ row_txfm(buf0, coeff, bitrow, /*stride=*/4); +} + +void av1_fwd_txfm2d_16x32_neon(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + (void)bd; + const fwd_transform_1d_col_many_neon col_txfm = + col_highbd_txfm32_x4_arr[tx_type]; + const fwd_transform_1d_row_many_neon row_txfm = + row_rect_highbd_txfm16_xn_arr[tx_type]; + int bitcol = av1_fwd_cos_bit_col[2][3]; + int bitrow = av1_fwd_cos_bit_row[2][3]; + + // Column-wise transform. + int32x4_t buf0[128]; + col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/4, + /*hm_stride=*/32); + shift_right_4_round_s32_x4(buf0, buf0, 128); + + int32x4_t buf1[128]; + transpose_arrays_s32_16x32(buf0, buf1); + + // Row-wise transform. + row_txfm(buf1, coeff, bitrow, /*howmany=*/8, /*hm_stride=*/16, /*stride=*/32); +} + +void av1_fwd_txfm2d_32x64_neon(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + int bitcol = av1_fwd_cos_bit_col[3][4]; + int bitrow = av1_fwd_cos_bit_row[3][4]; + + // Column-wise transform. + int32x4_t buf0[512]; + load_buffer_32x64(input, buf0, stride, 0); + for (int i = 0; i < 8; i++) { + highbd_fdct64_x4_neon(buf0 + i * 64, buf0 + i * 64, bitcol); + } + shift_right_2_round_s32_x4(buf0, buf0, 512); + + int32x4_t buf1[512]; + transpose_arrays_s32_32x64(buf0, buf1); + + // Row-wise transform. + for (int i = 0; i < 16; i++) { + highbd_fdct32_x4_neon(buf1 + i * 32, buf1 + i * 32, bitrow); + } + round_shift2_rect_array_s32_neon(buf1, buf1, 512); + store_buffer_32x32(buf1, coeff, /*stride=*/32); +} + +void av1_fwd_txfm2d_64x32_neon(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + int bitcol = av1_fwd_cos_bit_col[4][3]; + int bitrow = av1_fwd_cos_bit_row[4][3]; + + // Column-wise transform. + int32x4_t buf0[512]; + load_buffer_64x32(input, buf0, stride, 0); + for (int i = 0; i < 16; i++) { + highbd_fdct32_x4_neon(buf0 + i * 32, buf0 + i * 32, bitcol); + } + shift_right_4_round_s32_x4(buf0, buf0, 512); + + int32x4_t buf1[512]; + transpose_arrays_s32_64x32(buf0, buf1); + + // Row-wise transform. + for (int i = 0; i < 8; i++) { + highbd_fdct64_x4_neon(buf1 + i * 64, buf1 + i * 64, bitrow); + } + round_shift2_rect_array_s32_neon(buf1, buf1, 512); + store_buffer_64x32(buf1, coeff, /*stride=*/32); +} + +void av1_fwd_txfm2d_32x16_neon(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + (void)bd; + const fwd_transform_1d_col_many_neon col_txfm = + col_highbd_txfm16_xn_arr[tx_type]; + const fwd_transform_1d_row_many_neon row_txfm = + row_rect_highbd_txfm32_x4_arr[tx_type]; + int bitcol = av1_fwd_cos_bit_col[3][2]; + int bitrow = av1_fwd_cos_bit_row[3][2]; + + // Column-wise transform. + int32x4_t buf0[128]; + col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/8, + /*hm_stride=*/16); + shift_right_4_round_s32_x4(buf0, buf0, 128); + + int32x4_t buf1[128]; + transpose_arrays_s32_32x16(buf0, buf1); + + // Row-wise transform. + row_txfm(buf1, coeff, bitrow, /*howmany=*/4, /*hm_stride=*/32, /*stride=*/16); +} + +#if !CONFIG_REALTIME_ONLY +void av1_fwd_txfm2d_8x32_neon(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + (void)bd; + const fwd_transform_1d_col_many_neon col_txfm = + col_highbd_txfm32_x4_arr[tx_type]; + const fwd_transform_1d_row_many_neon row_txfm = + row_highbd_txfm8_xn_arr[tx_type]; + int bitcol = av1_fwd_cos_bit_col[1][3]; + int bitrow = av1_fwd_cos_bit_row[1][3]; + + // Column-wise transform. 
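+  // Buffer sizing: an 8x32 block holds 8 * 32 = 256 32-bit coefficients,
+  // i.e. 64 int32x4_t vectors. The 32-point column kernel transforms 4
+  // columns per call (one per lane), so howmany=2 covers all 8 columns and
+  // hm_stride=32 steps between the two groups of 32 output vectors.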
+ int32x4_t buf0[64]; + col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/2, + /*hm_stride=*/32); + shift_right_2_round_s32_x4(buf0, buf0, 64); + + int32x4_t buf1[64]; + transpose_arrays_s32_8x32(buf0, buf1); + + // Row-wise transform. + row_txfm(buf1, coeff, bitrow, /*howmany=*/8, /*hm_stride=*/8, /*stride=*/32); +} + +void av1_fwd_txfm2d_32x8_neon(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + (void)bd; + const fwd_transform_1d_col_many_neon col_txfm = + col_highbd_txfm8_xn_arr[tx_type]; + const fwd_transform_1d_row_many_neon row_txfm = + row_highbd_txfm32_x4_arr[tx_type]; + int bitcol = av1_fwd_cos_bit_col[3][1]; + int bitrow = av1_fwd_cos_bit_row[3][1]; + + // Column-wise transform. + int32x4_t buf0[64]; + col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/8, + /*hm_stride=*/8); + shift_right_2_round_s32_x4(buf0, buf0, 64); + + int32x4_t buf1[64]; + transpose_arrays_s32_32x8(buf0, buf1); + + // Row-wise transform. + row_txfm(buf1, coeff, bitrow, /*howmany=*/2, /*hm_stride=*/32, /*stride=*/8); +} +#endif + +void av1_fwd_txfm2d_4x8_neon(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + (void)bd; + int bitcol = av1_fwd_cos_bit_col[0][1]; + int bitrow = av1_fwd_cos_bit_row[0][1]; + const fwd_transform_1d_col_neon col_txfm = col_highbd_txfm8_x4_arr[tx_type]; + const fwd_transform_1d_row_many_neon row_txfm = + row_rect_highbd_txfm4_xn_arr[tx_type]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 8); + + // Column-wise transform. + int32x4_t buf0[8]; + col_txfm(input, buf0, stride, bitcol, lr_flip); + shift_right_1_round_s32_x4(buf0, buf0, 8); + + int32x4_t buf1[8]; + transpose_arrays_s32_4x8(buf0, buf1); + + // Row-wise transform. + row_txfm(buf1, coeff, bitrow, /*howmany=*/2, /*hm_stride=*/4, /*stride=*/8); +} + +void av1_fwd_txfm2d_8x4_neon(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + (void)bd; + const int bitcol = av1_fwd_cos_bit_col[1][0]; + const int bitrow = av1_fwd_cos_bit_row[1][0]; + const fwd_transform_1d_col_many_neon col_txfm = + col_highbd_txfm4_xn_arr[tx_type]; + const fwd_transform_1d_row_neon row_txfm = row_highbd_txfm8_x4_arr[tx_type]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 4); + + // Column-wise transform. + int32x4_t buf0[8]; + if (lr_flip) { + col_txfm(input, buf0 + 4, stride, bitcol, /*lr_flip=*/1, /*howmany=*/2, + /*hm_stride=*/-4); + } else { + col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/2, + /*hm_stride=*/4); + } + + shift_right_1_round_s32_x4(buf0, buf0, 8); + + int32x4_t buf1[8]; + transpose_arrays_s32_8x4(buf0, buf1); + + // Row-wise transform. + row_txfm(buf1, coeff, bitrow, /*stride=*/4); +} + +#if !CONFIG_REALTIME_ONLY +void av1_fwd_txfm2d_16x64_neon(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + (void)bd; + const int bitcol = av1_fwd_cos_bit_col[2][4]; + const int bitrow = av1_fwd_cos_bit_row[2][4]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 64); + + // Column-wise transform. 
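+  // The 16 columns are processed as 4 groups of 4 lanes, each group running
+  // a 64-point column DCT. After the 16-point row transform, only the first
+  // 32 of the 64 transformed rows are stored (store_buffer_16x32), matching
+  // AV1's convention of keeping only the lower-frequency half of 64-point
+  // transform outputs.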
+ int32x4_t buf0[256]; + load_buffer_16x64(input, buf0, stride, lr_flip); + for (int i = 0; i < 4; i++) { + highbd_fdct64_x4_neon(buf0 + i * 64, buf0 + i * 64, bitcol); + } + shift_right_2_round_s32_x4(buf0, buf0, 256); + + int32x4_t buf1[256]; + transpose_arrays_s32_16x64(buf0, buf1); + + // Row-wise transform. + highbd_fdct16_xn_neon(buf1, buf1, bitrow, 8); + store_buffer_16x32(buf1, coeff, /*stride=*/32); +} + +void av1_fwd_txfm2d_64x16_neon(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + (void)bd; + const int bitcol = av1_fwd_cos_bit_col[4][2]; + const int bitrow = av1_fwd_cos_bit_row[4][2]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 16); + + // Column-wise transform. + int32x4_t buf0[256]; + load_buffer_64x16(input, buf0, stride, lr_flip); + highbd_fdct16_xn_neon(buf0, buf0, bitcol, 16); + shift_right_4_round_s32_x4(buf0, buf0, 256); + + int32x4_t buf1[256]; + transpose_arrays_s32_64x16(buf0, buf1); + + // Row-wise transform. + for (int i = 0; i < 4; i++) { + highbd_fdct64_x4_neon(buf1 + i * 64, buf1 + i * 64, bitrow); + } + store_buffer_64x16(buf1, coeff, /*stride=*/16); + memset(coeff + 16 * 32, 0, 16 * 32 * sizeof(*coeff)); +} +#endif + +void av1_fwd_txfm2d_32x32_neon(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + const fwd_transform_1d_col_many_neon col_txfm = + col_highbd_txfm32_x4_arr[tx_type]; + const fwd_transform_1d_row_many_neon row_txfm = + row_highbd_txfm32_x4_arr[tx_type]; + + // Column-wise transform. + int32x4_t buf0[256]; + col_txfm(input, buf0, stride, /*cos_bit=*/12, /*lr_flip=*/0, /*howmany=*/8, + /*hm_stride=*/32); + shift_right_4_round_s32_x4(buf0, buf0, 256); + + int32x4_t buf1[256]; + transpose_arrays_s32_32x32(buf0, buf1); + + // Row-wise transform. + row_txfm(buf1, output, /*cos_bit=*/12, /*howmany=*/8, /*hm_stride=*/32, + /*stride=*/32); +} + +void av1_fwd_txfm2d_64x64_neon(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + + // Column-wise transform. + int32x4_t buf0[1024]; + load_buffer_64x64(input, buf0, stride, 0); + for (int col = 0; col < 16; col++) { + highbd_fdct64_x4_neon(buf0 + col * 64, buf0 + col * 64, 13); + } + shift_right_2_round_s32_x4(buf0, buf0, 1024); + + int32x4_t buf1[1024]; + transpose_arrays_s32_64x64(buf0, buf1); + + // Row-wise transform. + for (int col = 0; col < 8; col++) { + highbd_fdct64_x4_neon(buf1 + col * 64, buf1 + col * 64, 10); + } + shift_right_2_round_s32_x4(buf1, buf1, 512); + store_buffer_64x32(buf1, output, /*stride=*/32); +} diff --git a/third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c b/third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c new file mode 100644 index 0000000000..47b5f5cfb7 --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c @@ -0,0 +1,1207 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <arm_neon.h> +#include <assert.h> +#include <stdint.h> + +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" +#include "av1/encoder/arm/neon/pickrst_neon.h" +#include "av1/encoder/pickrst.h" + +static INLINE void highbd_calc_proj_params_r0_r1_neon( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { + assert(width % 8 == 0); + const int size = width * height; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + + int64x2_t h00_lo = vdupq_n_s64(0); + int64x2_t h00_hi = vdupq_n_s64(0); + int64x2_t h11_lo = vdupq_n_s64(0); + int64x2_t h11_hi = vdupq_n_s64(0); + int64x2_t h01_lo = vdupq_n_s64(0); + int64x2_t h01_hi = vdupq_n_s64(0); + int64x2_t c0_lo = vdupq_n_s64(0); + int64x2_t c0_hi = vdupq_n_s64(0); + int64x2_t c1_lo = vdupq_n_s64(0); + int64x2_t c1_hi = vdupq_n_s64(0); + + do { + const uint16_t *src_ptr = src; + const uint16_t *dat_ptr = dat; + int32_t *flt0_ptr = flt0; + int32_t *flt1_ptr = flt1; + int w = width; + + do { + uint16x8_t s = vld1q_u16(src_ptr); + uint16x8_t d = vld1q_u16(dat_ptr); + int32x4_t f0_lo = vld1q_s32(flt0_ptr); + int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4); + int32x4_t f1_lo = vld1q_s32(flt1_ptr); + int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4); + + int32x4_t u_lo = + vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(d), SGRPROJ_RST_BITS)); + int32x4_t u_hi = vreinterpretq_s32_u32( + vshll_n_u16(vget_high_u16(d), SGRPROJ_RST_BITS)); + int32x4_t s_lo = + vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(s), SGRPROJ_RST_BITS)); + int32x4_t s_hi = vreinterpretq_s32_u32( + vshll_n_u16(vget_high_u16(s), SGRPROJ_RST_BITS)); + s_lo = vsubq_s32(s_lo, u_lo); + s_hi = vsubq_s32(s_hi, u_hi); + + f0_lo = vsubq_s32(f0_lo, u_lo); + f0_hi = vsubq_s32(f0_hi, u_hi); + f1_lo = vsubq_s32(f1_lo, u_lo); + f1_hi = vsubq_s32(f1_hi, u_hi); + + h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo)); + h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo)); + h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi)); + h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi)); + + h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo)); + h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo)); + h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi)); + h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi)); + + h01_lo = vmlal_s32(h01_lo, vget_low_s32(f0_lo), vget_low_s32(f1_lo)); + h01_lo = vmlal_s32(h01_lo, vget_high_s32(f0_lo), vget_high_s32(f1_lo)); + h01_hi = vmlal_s32(h01_hi, vget_low_s32(f0_hi), vget_low_s32(f1_hi)); + h01_hi = vmlal_s32(h01_hi, vget_high_s32(f0_hi), vget_high_s32(f1_hi)); + + c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo)); + c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo)); + c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi)); + c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi)); + + c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo)); + c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo)); + c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi)); + c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi)); + + src_ptr += 8; + dat_ptr += 8; + flt0_ptr += 8; + flt1_ptr += 8; + w -= 8; + } while (w != 0); + + src += src_stride; + dat += dat_stride; + flt0
+= flt0_stride; + flt1 += flt1_stride; + } while (--height != 0); + + H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size; + H[0][1] = horizontal_add_s64x2(vaddq_s64(h01_lo, h01_hi)) / size; + H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size; + H[1][0] = H[0][1]; + C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size; + C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size; +} + +static INLINE void highbd_calc_proj_params_r0_neon( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int64_t H[2][2], int64_t C[2]) { + assert(width % 8 == 0); + const int size = width * height; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + + int64x2_t h00_lo = vdupq_n_s64(0); + int64x2_t h00_hi = vdupq_n_s64(0); + int64x2_t c0_lo = vdupq_n_s64(0); + int64x2_t c0_hi = vdupq_n_s64(0); + + do { + const uint16_t *src_ptr = src; + const uint16_t *dat_ptr = dat; + int32_t *flt0_ptr = flt0; + int w = width; + + do { + uint16x8_t s = vld1q_u16(src_ptr); + uint16x8_t d = vld1q_u16(dat_ptr); + int32x4_t f0_lo = vld1q_s32(flt0_ptr); + int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4); + + int32x4_t u_lo = + vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(d), SGRPROJ_RST_BITS)); + int32x4_t u_hi = vreinterpretq_s32_u32( + vshll_n_u16(vget_high_u16(d), SGRPROJ_RST_BITS)); + int32x4_t s_lo = + vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(s), SGRPROJ_RST_BITS)); + int32x4_t s_hi = vreinterpretq_s32_u32( + vshll_n_u16(vget_high_u16(s), SGRPROJ_RST_BITS)); + s_lo = vsubq_s32(s_lo, u_lo); + s_hi = vsubq_s32(s_hi, u_hi); + + f0_lo = vsubq_s32(f0_lo, u_lo); + f0_hi = vsubq_s32(f0_hi, u_hi); + + h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo)); + h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo)); + h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi)); + h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi)); + + c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo)); + c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo)); + c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi)); + c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi)); + + src_ptr += 8; + dat_ptr += 8; + flt0_ptr += 8; + w -= 8; + } while (w != 0); + + src += src_stride; + dat += dat_stride; + flt0 += flt0_stride; + } while (--height != 0); + + H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size; + C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size; +} + +static INLINE void highbd_calc_proj_params_r1_neon( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride, + int64_t H[2][2], int64_t C[2]) { + assert(width % 8 == 0); + const int size = width * height; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + + int64x2_t h11_lo = vdupq_n_s64(0); + int64x2_t h11_hi = vdupq_n_s64(0); + int64x2_t c1_lo = vdupq_n_s64(0); + int64x2_t c1_hi = vdupq_n_s64(0); + + do { + const uint16_t *src_ptr = src; + const uint16_t *dat_ptr = dat; + int32_t *flt1_ptr = flt1; + int w = width; + + do { + uint16x8_t s = vld1q_u16(src_ptr); + uint16x8_t d = vld1q_u16(dat_ptr); + int32x4_t f1_lo = vld1q_s32(flt1_ptr); + int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4); + + int32x4_t u_lo = + vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(d), 
SGRPROJ_RST_BITS)); + int32x4_t u_hi = vreinterpretq_s32_u32( + vshll_n_u16(vget_high_u16(d), SGRPROJ_RST_BITS)); + int32x4_t s_lo = + vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(s), SGRPROJ_RST_BITS)); + int32x4_t s_hi = vreinterpretq_s32_u32( + vshll_n_u16(vget_high_u16(s), SGRPROJ_RST_BITS)); + s_lo = vsubq_s32(s_lo, u_lo); + s_hi = vsubq_s32(s_hi, u_hi); + + f1_lo = vsubq_s32(f1_lo, u_lo); + f1_hi = vsubq_s32(f1_hi, u_hi); + + h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo)); + h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo)); + h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi)); + h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi)); + + c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo)); + c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo)); + c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi)); + c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi)); + + src_ptr += 8; + dat_ptr += 8; + flt1_ptr += 8; + w -= 8; + } while (w != 0); + + src += src_stride; + dat += dat_stride; + flt1 += flt1_stride; + } while (--height != 0); + + H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size; + C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size; +} + +// The function calls 3 subfunctions for the following cases : +// 1) When params->r[0] > 0 and params->r[1] > 0. In this case all elements +// of C and H need to be computed. +// 2) When only params->r[0] > 0. In this case only H[0][0] and C[0] are +// non-zero and need to be computed. +// 3) When only params->r[1] > 0. In this case only H[1][1] and C[1] are +// non-zero and need to be computed. +void av1_calc_proj_params_high_bd_neon(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, int dat_stride, + int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, + int64_t H[2][2], int64_t C[2], + const sgr_params_type *params) { + if ((params->r[0] > 0) && (params->r[1] > 0)) { + highbd_calc_proj_params_r0_r1_neon(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, flt1, + flt1_stride, H, C); + } else if (params->r[0] > 0) { + highbd_calc_proj_params_r0_neon(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, H, C); + } else if (params->r[1] > 0) { + highbd_calc_proj_params_r1_neon(src8, width, height, src_stride, dat8, + dat_stride, flt1, flt1_stride, H, C); + } +} + +static INLINE int16x8_t tbl2q(int16x8_t a, int16x8_t b, uint8x16_t idx) { +#if AOM_ARCH_AARCH64 + uint8x16x2_t table = { { vreinterpretq_u8_s16(a), vreinterpretq_u8_s16(b) } }; + return vreinterpretq_s16_u8(vqtbl2q_u8(table, idx)); +#else + uint8x8x4_t table = { { vreinterpret_u8_s16(vget_low_s16(a)), + vreinterpret_u8_s16(vget_high_s16(a)), + vreinterpret_u8_s16(vget_low_s16(b)), + vreinterpret_u8_s16(vget_high_s16(b)) } }; + return vreinterpretq_s16_u8(vcombine_u8(vtbl4_u8(table, vget_low_u8(idx)), + vtbl4_u8(table, vget_high_u8(idx)))); +#endif +} + +static INLINE int16x8_t tbl3q(int16x8_t a, int16x8_t b, int16x8_t c, + uint8x16_t idx) { +#if AOM_ARCH_AARCH64 + uint8x16x3_t table = { { vreinterpretq_u8_s16(a), vreinterpretq_u8_s16(b), + vreinterpretq_u8_s16(c) } }; + return vreinterpretq_s16_u8(vqtbl3q_u8(table, idx)); +#else + // This is a specific implementation working only for compute stats with + // wiener_win == 5. 
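+  // Armv7 vtbl3_u8 can only index a 24-byte table, so the 48-byte
+  // three-vector lookup is emulated with two overlapping 24-byte tables: the
+  // low 8 bytes of idx index table_lo directly, while the high 8 bytes index
+  // table_hi after being rebased by -16. This works because, for the
+  // wiener_win == 5 shuffles, the low index bytes stay below 24 and the high
+  // ones stay within [16, 40).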
+ uint8x8x3_t table_lo = { { vreinterpret_u8_s16(vget_low_s16(a)), + vreinterpret_u8_s16(vget_high_s16(a)), + vreinterpret_u8_s16(vget_low_s16(b)) } }; + uint8x8x3_t table_hi = { { vreinterpret_u8_s16(vget_low_s16(b)), + vreinterpret_u8_s16(vget_high_s16(b)), + vreinterpret_u8_s16(vget_low_s16(c)) } }; + return vreinterpretq_s16_u8(vcombine_u8( + vtbl3_u8(table_lo, vget_low_u8(idx)), + vtbl3_u8(table_hi, vsub_u8(vget_high_u8(idx), vdup_n_u8(16))))); +#endif +} + +static INLINE int64_t div_shift_s64(int64_t x, int power) { + return (x < 0 ? x + (1ll << power) - 1 : x) >> power; +} + +// The M matrix is accumulated in a bitdepth-dependent number of steps to +// speed up the computation. This function computes the final M from the +// accumulated (src_s64) and the residual parts (src_s32). It also transposes +// the result as the output needs to be column-major. +static INLINE void acc_transpose_M(int64_t *dst, const int64_t *src_s64, + const int32_t *src_s32, const int wiener_win, + int shift) { + for (int i = 0; i < wiener_win; ++i) { + for (int j = 0; j < wiener_win; ++j) { + int tr_idx = j * wiener_win + i; + *dst++ = div_shift_s64(src_s64[tr_idx] + src_s32[tr_idx], shift); + } + } +} + +// The resulting H is a column-major matrix accumulated from the transposed +// (column-major) samples of the filter kernel (5x5 or 7x7) viewed as a single +// vector. For the 7x7 filter case: H(49x49) = [49 x 1] x [1 x 49]. This +// function transforms back to the originally expected format (double +// transpose). The H matrix is accumulated in a bitdepth-dependent number of +// steps to speed up the computation. This function computes the final H from +// the accumulated (src_s64) and the residual parts (src_s32). The computed H is +// only an upper triangle matrix, this function also fills the lower triangle of +// the resulting matrix. +static INLINE void update_H(int64_t *dst, const int64_t *src_s64, + const int32_t *src_s32, const int wiener_win, + int stride, int shift) { + // For a simplified theoretical 3x3 case where `wiener_win` is 3 and + // `wiener_win2` is 9, the M matrix is 3x3: + // 0, 3, 6 + // 1, 4, 7 + // 2, 5, 8 + // + // This is viewed as a vector to compute H (9x9) by vector outer product: + // 0, 3, 6, 1, 4, 7, 2, 5, 8 + // + // Double transpose and upper triangle remapping for 3x3 -> 9x9 case: + // 0, 3, 6, 1, 4, 7, 2, 5, 8, + // 3, 30, 33, 12, 31, 34, 21, 32, 35, + // 6, 33, 60, 15, 42, 61, 24, 51, 62, + // 1, 12, 15, 10, 13, 16, 11, 14, 17, + // 4, 31, 42, 13, 40, 43, 22, 41, 44, + // 7, 34, 61, 16, 43, 70, 25, 52, 71, + // 2, 21, 24, 11, 22, 25, 20, 23, 26, + // 5, 32, 51, 14, 41, 52, 23, 50, 53, + // 8, 35, 62, 17, 44, 71, 26, 53, 80, + const int wiener_win2 = wiener_win * wiener_win; + + // Loop through the indices according to the remapping above, along the + // columns: + // 0, wiener_win, 2 * wiener_win, ..., 1, 1 + 2 * wiener_win, ..., + // wiener_win - 1, wiener_win - 1 + wiener_win, ... + // For the 3x3 case `j` will be: 0, 3, 6, 1, 4, 7, 2, 5, 8. + for (int i = 0; i < wiener_win; ++i) { + for (int j = i; j < wiener_win2; j += wiener_win) { + // These two inner loops are the same as the two outer loops, but running + // along rows instead of columns. For the 3x3 case `l` will be: + // 0, 3, 6, 1, 4, 7, 2, 5, 8. 
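+      // Worked example for the 3x3 case (taking stride = 9 for
+      // illustration): the pairs (j=6, l=3) and (j=3, l=6) both resolve to
+      // tr_idx = 9 * min(3, 6) + max(3, 6) = 9 * 3 + 6 = 33, so the single
+      // upper-triangle entry 33 serves both symmetric positions, matching
+      // the remapping table above.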
+ for (int k = 0; k < wiener_win; ++k) { + for (int l = k; l < wiener_win2; l += wiener_win) { + // The nominal double transpose indexing would be: + // int idx = stride * j + l; + // However we need the upper-right triangle, it is easy with some + // min/max operations. + int tr_idx = stride * AOMMIN(j, l) + AOMMAX(j, l); + + // Resulting matrix is filled by combining the 64-bit and the residual + // 32-bit matrices together with scaling. + *dst++ = div_shift_s64(src_s64[tr_idx] + src_s32[tr_idx], shift); + } + } + } + } +} + +// Load 7x7 matrix into 7 128-bit vectors from consecutive rows, the last load +// address is offset to prevent out-of-bounds access. +static INLINE void load_and_pack_s16_8x7(int16x8_t dst[7], const int16_t *src, + ptrdiff_t stride) { + dst[0] = vld1q_s16(src); + src += stride; + dst[1] = vld1q_s16(src); + src += stride; + dst[2] = vld1q_s16(src); + src += stride; + dst[3] = vld1q_s16(src); + src += stride; + dst[4] = vld1q_s16(src); + src += stride; + dst[5] = vld1q_s16(src); + src += stride; + dst[6] = vld1q_s16(src - 1); +} + +static INLINE void highbd_compute_stats_win7_neon( + const uint16_t *dgd, const uint16_t *src, int avg, int width, int height, + int dgd_stride, int src_stride, int64_t *M, int64_t *H, + aom_bit_depth_t bit_depth) { + // Matrix names are capitalized to help readability. + DECLARE_ALIGNED(64, int16_t, DGD_AVG0[WIENER_WIN2_ALIGN3]); + DECLARE_ALIGNED(64, int16_t, DGD_AVG1[WIENER_WIN2_ALIGN3]); + DECLARE_ALIGNED(64, int32_t, M_s32[WIENER_WIN2_ALIGN3]); + DECLARE_ALIGNED(64, int64_t, M_s64[WIENER_WIN2_ALIGN3]); + DECLARE_ALIGNED(64, int32_t, H_s32[WIENER_WIN2 * WIENER_WIN2_ALIGN2]); + DECLARE_ALIGNED(64, int64_t, H_s64[WIENER_WIN2 * WIENER_WIN2_ALIGN2]); + + memset(M_s32, 0, sizeof(M_s32)); + memset(M_s64, 0, sizeof(M_s64)); + memset(H_s32, 0, sizeof(H_s32)); + memset(H_s64, 0, sizeof(H_s64)); + + // Look-up tables to create 8x6 matrix with consecutive elements from two 7x7 + // matrices. 
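+  // The table entries are byte indices over 16-bit elements, so each pair
+  // (2k, 2k+1) selects element k; byte values of 16 and above address the
+  // second source vector of tbl2q. For instance, the first row below gathers
+  // elements 0..6 of one dgd row and element 0 of the next, i.e. 8
+  // consecutive samples of the flattened 7x7 window.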
+ // clang-format off + DECLARE_ALIGNED(16, static const uint8_t, shuffle_stats7_highbd[192]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17, + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19, + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21, + 6, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21, 22, 23, + 8, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + 10, 11, 12, 13, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 18, 19, + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22, 23, + 8, 9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22, 23, 24, 25, + 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, + 12, 13, 14, 15, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + }; + // clang-format on + + const uint8x16_t lut0 = vld1q_u8(shuffle_stats7_highbd + 0); + const uint8x16_t lut1 = vld1q_u8(shuffle_stats7_highbd + 16); + const uint8x16_t lut2 = vld1q_u8(shuffle_stats7_highbd + 32); + const uint8x16_t lut3 = vld1q_u8(shuffle_stats7_highbd + 48); + const uint8x16_t lut4 = vld1q_u8(shuffle_stats7_highbd + 64); + const uint8x16_t lut5 = vld1q_u8(shuffle_stats7_highbd + 80); + const uint8x16_t lut6 = vld1q_u8(shuffle_stats7_highbd + 96); + const uint8x16_t lut7 = vld1q_u8(shuffle_stats7_highbd + 112); + const uint8x16_t lut8 = vld1q_u8(shuffle_stats7_highbd + 128); + const uint8x16_t lut9 = vld1q_u8(shuffle_stats7_highbd + 144); + const uint8x16_t lut10 = vld1q_u8(shuffle_stats7_highbd + 160); + const uint8x16_t lut11 = vld1q_u8(shuffle_stats7_highbd + 176); + + // We can accumulate up to 65536/4096/256 8/10/12-bit multiplication results + // in 32-bit. We are processing 2 pixels at a time, so the accumulator max can + // be as high as 32768/2048/128 for the compute stats. + const int acc_cnt_max = (1 << (32 - 2 * bit_depth)) >> 1; + int acc_cnt = acc_cnt_max; + const int src_next = src_stride - width; + const int dgd_next = dgd_stride - width; + const int16x8_t avg_s16 = vdupq_n_s16(avg); + + do { + int j = width; + while (j >= 2) { + // Load two adjacent, overlapping 7x7 matrices: a 8x7 matrix with the + // middle 6x7 elements being shared. + int16x8_t dgd_rows[7]; + load_and_pack_s16_8x7(dgd_rows, (const int16_t *)dgd, dgd_stride); + + const int16_t *dgd_ptr = (const int16_t *)dgd + dgd_stride * 6; + dgd += 2; + + dgd_rows[0] = vsubq_s16(dgd_rows[0], avg_s16); + dgd_rows[1] = vsubq_s16(dgd_rows[1], avg_s16); + dgd_rows[2] = vsubq_s16(dgd_rows[2], avg_s16); + dgd_rows[3] = vsubq_s16(dgd_rows[3], avg_s16); + dgd_rows[4] = vsubq_s16(dgd_rows[4], avg_s16); + dgd_rows[5] = vsubq_s16(dgd_rows[5], avg_s16); + dgd_rows[6] = vsubq_s16(dgd_rows[6], avg_s16); + + // Re-arrange the combined 8x7 matrix to have the 2 whole 7x7 matrices (1 + // for each of the 2 pixels) separated into distinct int16x8_t[6] arrays. + // These arrays contain 48 elements of the 49 (7x7). Compute `dgd - avg` + // for both buffers. Each DGD_AVG buffer contains 49 consecutive elements. 
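+      // Conceptually, with win(r, c) denoting the 7x7 window sample at row r
+      // and column c for the first of the two pixels, the shuffles below
+      // compute
+      //   DGD_AVG0[r * 7 + c] = win(r, c) - avg
+      // for r, c in [0, 7), and DGD_AVG1 the same for the window shifted one
+      // column to the right, just with the subtraction and packing done 8
+      // lanes at a time.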
+ int16x8_t dgd_avg0[6]; + int16x8_t dgd_avg1[6]; + + dgd_avg0[0] = tbl2q(dgd_rows[0], dgd_rows[1], lut0); + dgd_avg1[0] = tbl2q(dgd_rows[0], dgd_rows[1], lut6); + dgd_avg0[1] = tbl2q(dgd_rows[1], dgd_rows[2], lut1); + dgd_avg1[1] = tbl2q(dgd_rows[1], dgd_rows[2], lut7); + dgd_avg0[2] = tbl2q(dgd_rows[2], dgd_rows[3], lut2); + dgd_avg1[2] = tbl2q(dgd_rows[2], dgd_rows[3], lut8); + dgd_avg0[3] = tbl2q(dgd_rows[3], dgd_rows[4], lut3); + dgd_avg1[3] = tbl2q(dgd_rows[3], dgd_rows[4], lut9); + dgd_avg0[4] = tbl2q(dgd_rows[4], dgd_rows[5], lut4); + dgd_avg1[4] = tbl2q(dgd_rows[4], dgd_rows[5], lut10); + dgd_avg0[5] = tbl2q(dgd_rows[5], dgd_rows[6], lut5); + dgd_avg1[5] = tbl2q(dgd_rows[5], dgd_rows[6], lut11); + + vst1q_s16(DGD_AVG0, dgd_avg0[0]); + vst1q_s16(DGD_AVG1, dgd_avg1[0]); + vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]); + vst1q_s16(DGD_AVG1 + 8, dgd_avg1[1]); + vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]); + vst1q_s16(DGD_AVG1 + 16, dgd_avg1[2]); + vst1q_s16(DGD_AVG0 + 24, dgd_avg0[3]); + vst1q_s16(DGD_AVG1 + 24, dgd_avg1[3]); + vst1q_s16(DGD_AVG0 + 32, dgd_avg0[4]); + vst1q_s16(DGD_AVG1 + 32, dgd_avg1[4]); + vst1q_s16(DGD_AVG0 + 40, dgd_avg0[5]); + vst1q_s16(DGD_AVG1 + 40, dgd_avg1[5]); + + // The remaining last (49th) elements of `dgd - avg`. + DGD_AVG0[48] = dgd_ptr[6] - avg; + DGD_AVG1[48] = dgd_ptr[7] - avg; + + // Accumulate into row-major variant of matrix M (cross-correlation) for 2 + // output pixels at a time. M is of size 7 * 7. It needs to be filled such + // that multiplying one element from src with each element of a row of the + // wiener window will fill one column of M. However this is not very + // convenient in terms of memory access, as it means we do contiguous + // loads of dgd but strided stores to M. As a result, we use an + // intermediate matrix M_s32 which is instead filled such that one row of + // the wiener window gives one row of M_s32. Once fully computed, M_s32 is + // then transposed to return M. + int src_avg0 = *src++ - avg; + int src_avg1 = *src++ - avg; + int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0); + int16x4_t src_avg1_s16 = vdup_n_s16(src_avg1); + update_M_2pixels(M_s32 + 0, src_avg0_s16, src_avg1_s16, dgd_avg0[0], + dgd_avg1[0]); + update_M_2pixels(M_s32 + 8, src_avg0_s16, src_avg1_s16, dgd_avg0[1], + dgd_avg1[1]); + update_M_2pixels(M_s32 + 16, src_avg0_s16, src_avg1_s16, dgd_avg0[2], + dgd_avg1[2]); + update_M_2pixels(M_s32 + 24, src_avg0_s16, src_avg1_s16, dgd_avg0[3], + dgd_avg1[3]); + update_M_2pixels(M_s32 + 32, src_avg0_s16, src_avg1_s16, dgd_avg0[4], + dgd_avg1[4]); + update_M_2pixels(M_s32 + 40, src_avg0_s16, src_avg1_s16, dgd_avg0[5], + dgd_avg1[5]); + + // Last (49th) element of M_s32 can be computed as scalar more efficiently + // for 2 output pixels. + M_s32[48] += DGD_AVG0[48] * src_avg0 + DGD_AVG1[48] * src_avg1; + + // Start accumulating into row-major version of matrix H + // (auto-covariance), it expects the DGD_AVG[01] matrices to also be + // row-major. H is of size 49 * 49. It is filled by multiplying every pair + // of elements of the wiener window together (vector outer product). Since + // it is a symmetric matrix, we only compute the upper-right triangle, and + // then copy it down to the lower-left later. The upper triangle is + // covered by 4x4 tiles. The original algorithm assumes the M matrix is + // column-major and the resulting H matrix is also expected to be + // column-major. It is not efficient to work with column-major matrices, + // so we accumulate into a row-major matrix H_s32. 
At the end of the
+      // algorithm a double transpose transformation will convert H_s32 back
+      // to the expected output layout.
+      update_H_7x7_2pixels(H_s32, DGD_AVG0, DGD_AVG1);
+
+      // The last element of the triangle of H_s32 matrix can be computed as
+      // a scalar more efficiently.
+      H_s32[48 * WIENER_WIN2_ALIGN2 + 48] +=
+          DGD_AVG0[48] * DGD_AVG0[48] + DGD_AVG1[48] * DGD_AVG1[48];
+
+      // Accumulate into 64-bit after a bit depth dependent number of
+      // iterations to prevent overflow.
+      if (--acc_cnt == 0) {
+        acc_cnt = acc_cnt_max;
+
+        accumulate_and_clear(M_s64, M_s32, WIENER_WIN2_ALIGN2);
+
+        // The widening accumulation is only needed for the upper triangle
+        // part of the matrix.
+        int64_t *lh = H_s64;
+        int32_t *lh32 = H_s32;
+        for (int k = 0; k < WIENER_WIN2; ++k) {
+          // The widening accumulation is only run for the relevant part of
+          // each row (the upper-right triangle), starting from a 4-element
+          // aligned position.
+          int k4 = k / 4 * 4;
+          accumulate_and_clear(lh + k4, lh32 + k4, 48 - k4);
+
+          // Last element of the row is computed separately.
+          lh[48] += lh32[48];
+          lh32[48] = 0;
+
+          lh += WIENER_WIN2_ALIGN2;
+          lh32 += WIENER_WIN2_ALIGN2;
+        }
+      }
+
+      j -= 2;
+    }
+
+    // Computations for odd pixel in the row.
+    if (width & 1) {
+      // Load two adjacent, overlapping 7x7 matrices: an 8x7 matrix with the
+      // middle 6x7 elements being shared.
+      int16x8_t dgd_rows[7];
+      load_and_pack_s16_8x7(dgd_rows, (const int16_t *)dgd, dgd_stride);
+
+      const int16_t *dgd_ptr = (const int16_t *)dgd + dgd_stride * 6;
+      ++dgd;
+
+      // Re-arrange the combined 8x7 matrix to have a whole 7x7 matrix
+      // tightly packed into an int16x8_t[6] array. This array contains 48
+      // elements of the 49 (7x7). Compute `dgd - avg` for the whole buffer.
+      // The DGD_AVG buffer contains 49 consecutive elements.
+      int16x8_t dgd_avg0[6];
+
+      dgd_avg0[0] = vsubq_s16(tbl2q(dgd_rows[0], dgd_rows[1], lut0), avg_s16);
+      dgd_avg0[1] = vsubq_s16(tbl2q(dgd_rows[1], dgd_rows[2], lut1), avg_s16);
+      dgd_avg0[2] = vsubq_s16(tbl2q(dgd_rows[2], dgd_rows[3], lut2), avg_s16);
+      dgd_avg0[3] = vsubq_s16(tbl2q(dgd_rows[3], dgd_rows[4], lut3), avg_s16);
+      dgd_avg0[4] = vsubq_s16(tbl2q(dgd_rows[4], dgd_rows[5], lut4), avg_s16);
+      dgd_avg0[5] = vsubq_s16(tbl2q(dgd_rows[5], dgd_rows[6], lut5), avg_s16);
+
+      vst1q_s16(DGD_AVG0, dgd_avg0[0]);
+      vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]);
+      vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]);
+      vst1q_s16(DGD_AVG0 + 24, dgd_avg0[3]);
+      vst1q_s16(DGD_AVG0 + 32, dgd_avg0[4]);
+      vst1q_s16(DGD_AVG0 + 40, dgd_avg0[5]);
+
+      // The remaining last (49th) element of `dgd - avg`.
+      DGD_AVG0[48] = dgd_ptr[6] - avg;
+
+      // Accumulate into row-major order variant of matrix M
+      // (cross-correlation) for 1 output pixel at a time. M is of size
+      // 7 * 7. It needs to be filled such that multiplying one element from
+      // src with each element of a row of the wiener window will fill one
+      // column of M. However this is not very convenient in terms of memory
+      // access, as it means we do contiguous loads of dgd but strided stores
+      // to M. As a result, we use an intermediate matrix M_s32 which is
+      // instead filled such that one row of the wiener window gives one row
+      // of M_s32. Once fully computed, M_s32 is then transposed to return M.
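+      // In scalar terms the six update_M_1pixel calls below, together with
+      // the separately handled element 48, reduce to:
+      //   for (int k = 0; k < 49; ++k) M_s32[k] += DGD_AVG0[k] * src_avg0;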
+      int src_avg0 = *src++ - avg;
+      int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
+      update_M_1pixel(M_s32 + 0, src_avg0_s16, dgd_avg0[0]);
+      update_M_1pixel(M_s32 + 8, src_avg0_s16, dgd_avg0[1]);
+      update_M_1pixel(M_s32 + 16, src_avg0_s16, dgd_avg0[2]);
+      update_M_1pixel(M_s32 + 24, src_avg0_s16, dgd_avg0[3]);
+      update_M_1pixel(M_s32 + 32, src_avg0_s16, dgd_avg0[4]);
+      update_M_1pixel(M_s32 + 40, src_avg0_s16, dgd_avg0[5]);
+
+      // Last (49th) element of M_s32 can be computed as scalar more
+      // efficiently for 1 output pixel.
+      M_s32[48] += DGD_AVG0[48] * src_avg0;
+
+      // Start accumulating into row-major order version of matrix H
+      // (auto-covariance), it expects the DGD_AVG0 matrix to also be
+      // row-major. H is of size 49 * 49. It is filled by multiplying every
+      // pair of elements of the wiener window together (vector outer
+      // product). Since it is a symmetric matrix, we only compute the
+      // upper-right triangle, and then copy it down to the lower-left later.
+      // The upper triangle is covered by 4x4 tiles. The original algorithm
+      // assumes the M matrix is column-major and the resulting H matrix is
+      // also expected to be column-major. It is not efficient to work with
+      // column-major matrices, so we accumulate into a row-major matrix
+      // H_s32. At the end of the algorithm a double transpose transformation
+      // will convert H_s32 back to the expected output layout.
+      update_H_1pixel(H_s32, DGD_AVG0, WIENER_WIN2_ALIGN2, 48);
+
+      // The last element of the triangle of H_s32 matrix can be computed as
+      // scalar more efficiently.
+      H_s32[48 * WIENER_WIN2_ALIGN2 + 48] += DGD_AVG0[48] * DGD_AVG0[48];
+    }
+
+    src += src_next;
+    dgd += dgd_next;
+  } while (--height != 0);
+
+  int bit_depth_shift = bit_depth - AOM_BITS_8;
+
+  acc_transpose_M(M, M_s64, M_s32, WIENER_WIN, bit_depth_shift);
+
+  update_H(H, H_s64, H_s32, WIENER_WIN, WIENER_WIN2_ALIGN2, bit_depth_shift);
+}
+
+// Load a 6x5 matrix into 5 128-bit vectors from consecutive rows; the last
+// load address is offset to prevent out-of-bounds access.
+static INLINE void load_and_pack_s16_6x5(int16x8_t dst[5], const int16_t *src,
+                                         ptrdiff_t stride) {
+  dst[0] = vld1q_s16(src);
+  src += stride;
+  dst[1] = vld1q_s16(src);
+  src += stride;
+  dst[2] = vld1q_s16(src);
+  src += stride;
+  dst[3] = vld1q_s16(src);
+  src += stride;
+  dst[4] = vld1q_s16(src - 3);
+}
+
+static void highbd_compute_stats_win5_neon(const uint16_t *dgd,
+                                           const uint16_t *src, int avg,
+                                           int width, int height,
+                                           int dgd_stride, int src_stride,
+                                           int64_t *M, int64_t *H,
+                                           aom_bit_depth_t bit_depth) {
+  // Matrix names are capitalized to help readability.
+  DECLARE_ALIGNED(64, int16_t, DGD_AVG0[WIENER_WIN2_REDUCED_ALIGN3]);
+  DECLARE_ALIGNED(64, int16_t, DGD_AVG1[WIENER_WIN2_REDUCED_ALIGN3]);
+  DECLARE_ALIGNED(64, int32_t, M_s32[WIENER_WIN2_REDUCED_ALIGN3]);
+  DECLARE_ALIGNED(64, int64_t, M_s64[WIENER_WIN2_REDUCED_ALIGN3]);
+  DECLARE_ALIGNED(64, int32_t,
+                  H_s32[WIENER_WIN2_REDUCED * WIENER_WIN2_REDUCED_ALIGN2]);
+  DECLARE_ALIGNED(64, int64_t,
+                  H_s64[WIENER_WIN2_REDUCED * WIENER_WIN2_REDUCED_ALIGN2]);
+
+  memset(M_s32, 0, sizeof(M_s32));
+  memset(M_s64, 0, sizeof(M_s64));
+  memset(H_s32, 0, sizeof(H_s32));
+  memset(H_s64, 0, sizeof(H_s64));
+
+  // Look-up tables to create an 8x3 matrix with consecutive elements from
+  // the 5x5 matrix.
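+  // Each pair of byte indices in the tables selects one 16-bit element:
+  // byte indices 0-15 address the first source vector of a lookup, 16-31
+  // the second, and 32-47 the third (for the tbl3q lookups).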
+ DECLARE_ALIGNED(16, static const uint8_t, shuffle_stats5_highbd[96]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 17, 18, 19, 20, 21, + 6, 7, 8, 9, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 32, 33, + 2, 3, 4, 5, 6, 7, 8, 9, 22, 23, 24, 25, 26, 27, 28, 29, + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 18, 19, 20, 21, 22, 23, + 8, 9, 10, 11, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 34, 35, + 4, 5, 6, 7, 8, 9, 10, 11, 24, 25, 26, 27, 28, 29, 30, 31, + }; + + const uint8x16_t lut0 = vld1q_u8(shuffle_stats5_highbd + 0); + const uint8x16_t lut1 = vld1q_u8(shuffle_stats5_highbd + 16); + const uint8x16_t lut2 = vld1q_u8(shuffle_stats5_highbd + 32); + const uint8x16_t lut3 = vld1q_u8(shuffle_stats5_highbd + 48); + const uint8x16_t lut4 = vld1q_u8(shuffle_stats5_highbd + 64); + const uint8x16_t lut5 = vld1q_u8(shuffle_stats5_highbd + 80); + + // We can accumulate up to 65536/4096/256 8/10/12-bit multiplication results + // in 32-bit. We are processing 2 pixels at a time, so the accumulator max can + // be as high as 32768/2048/128 for the compute stats. + const int acc_cnt_max = (1 << (32 - 2 * bit_depth)) >> 1; + int acc_cnt = acc_cnt_max; + const int src_next = src_stride - width; + const int dgd_next = dgd_stride - width; + const int16x8_t avg_s16 = vdupq_n_s16(avg); + + do { + int j = width; + while (j >= 2) { + // Load two adjacent, overlapping 5x5 matrices: a 6x5 matrix with the + // middle 4x5 elements being shared. + int16x8_t dgd_rows[5]; + load_and_pack_s16_6x5(dgd_rows, (const int16_t *)dgd, dgd_stride); + + const int16_t *dgd_ptr = (const int16_t *)dgd + dgd_stride * 4; + dgd += 2; + + dgd_rows[0] = vsubq_s16(dgd_rows[0], avg_s16); + dgd_rows[1] = vsubq_s16(dgd_rows[1], avg_s16); + dgd_rows[2] = vsubq_s16(dgd_rows[2], avg_s16); + dgd_rows[3] = vsubq_s16(dgd_rows[3], avg_s16); + dgd_rows[4] = vsubq_s16(dgd_rows[4], avg_s16); + + // Re-arrange the combined 6x5 matrix to have the 2 whole 5x5 matrices (1 + // for each of the 2 pixels) separated into distinct int16x8_t[3] arrays. + // These arrays contain 24 elements of the 25 (5x5). Compute `dgd - avg` + // for both buffers. Each DGD_AVG buffer contains 25 consecutive elements. + int16x8_t dgd_avg0[3]; + int16x8_t dgd_avg1[3]; + + dgd_avg0[0] = tbl2q(dgd_rows[0], dgd_rows[1], lut0); + dgd_avg1[0] = tbl2q(dgd_rows[0], dgd_rows[1], lut3); + dgd_avg0[1] = tbl3q(dgd_rows[1], dgd_rows[2], dgd_rows[3], lut1); + dgd_avg1[1] = tbl3q(dgd_rows[1], dgd_rows[2], dgd_rows[3], lut4); + dgd_avg0[2] = tbl2q(dgd_rows[3], dgd_rows[4], lut2); + dgd_avg1[2] = tbl2q(dgd_rows[3], dgd_rows[4], lut5); + + vst1q_s16(DGD_AVG0, dgd_avg0[0]); + vst1q_s16(DGD_AVG1, dgd_avg1[0]); + vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]); + vst1q_s16(DGD_AVG1 + 8, dgd_avg1[1]); + vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]); + vst1q_s16(DGD_AVG1 + 16, dgd_avg1[2]); + + // The remaining last (25th) elements of `dgd - avg`. + DGD_AVG0[24] = dgd_ptr[4] - avg; + DGD_AVG1[24] = dgd_ptr[5] - avg; + + // Accumulate into row-major variant of matrix M (cross-correlation) for 2 + // output pixels at a time. M is of size 5 * 5. It needs to be filled such + // that multiplying one element from src with each element of a row of the + // wiener window will fill one column of M. However this is not very + // convenient in terms of memory access, as it means we do contiguous + // loads of dgd but strided stores to M. As a result, we use an + // intermediate matrix M_s32 which is instead filled such that one row of + // the wiener window gives one row of M_s32. Once fully computed, M_s32 is + // then transposed to return M. 
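+      // Each update_M_2pixels call below widens the 16-bit products to
+      // 32-bit and accumulates both pixels into the same 8 lanes of M_s32,
+      // i.e. M_s32[k] += DGD_AVG0[k] * src_avg0 + DGD_AVG1[k] * src_avg1
+      // for the 8 window elements k at the given offset.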
+      int src_avg0 = *src++ - avg;
+      int src_avg1 = *src++ - avg;
+      int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
+      int16x4_t src_avg1_s16 = vdup_n_s16(src_avg1);
+      update_M_2pixels(M_s32 + 0, src_avg0_s16, src_avg1_s16, dgd_avg0[0],
+                       dgd_avg1[0]);
+      update_M_2pixels(M_s32 + 8, src_avg0_s16, src_avg1_s16, dgd_avg0[1],
+                       dgd_avg1[1]);
+      update_M_2pixels(M_s32 + 16, src_avg0_s16, src_avg1_s16, dgd_avg0[2],
+                       dgd_avg1[2]);
+
+      // Last (25th) element of M_s32 can be computed as scalar more
+      // efficiently for 2 output pixels.
+      M_s32[24] += DGD_AVG0[24] * src_avg0 + DGD_AVG1[24] * src_avg1;
+
+      // Start accumulating into row-major version of matrix H
+      // (auto-covariance), it expects the DGD_AVG[01] matrices to also be
+      // row-major. H is of size 25 * 25. It is filled by multiplying every
+      // pair of elements of the wiener window together (vector outer
+      // product). Since it is a symmetric matrix, we only compute the
+      // upper-right triangle, and then copy it down to the lower-left later.
+      // The upper triangle is covered by 4x4 tiles. The original algorithm
+      // assumes the M matrix is column-major and the resulting H matrix is
+      // also expected to be column-major. It is not efficient to work with
+      // column-major matrices, so we accumulate into a row-major matrix
+      // H_s32. At the end of the algorithm a double transpose transformation
+      // will convert H_s32 back to the expected output layout.
+      update_H_5x5_2pixels(H_s32, DGD_AVG0, DGD_AVG1);
+
+      // The last element of the triangle of H_s32 matrix can be computed as
+      // a scalar more efficiently.
+      H_s32[24 * WIENER_WIN2_REDUCED_ALIGN2 + 24] +=
+          DGD_AVG0[24] * DGD_AVG0[24] + DGD_AVG1[24] * DGD_AVG1[24];
+
+      // Accumulate into 64-bit after a bit depth dependent number of
+      // iterations to prevent overflow.
+      if (--acc_cnt == 0) {
+        acc_cnt = acc_cnt_max;
+
+        accumulate_and_clear(M_s64, M_s32, WIENER_WIN2_REDUCED_ALIGN2);
+
+        // The widening accumulation is only needed for the upper triangle
+        // part of the matrix.
+        int64_t *lh = H_s64;
+        int32_t *lh32 = H_s32;
+        for (int k = 0; k < WIENER_WIN2_REDUCED; ++k) {
+          // The widening accumulation is only run for the relevant part of
+          // each row (the upper-right triangle), starting from a 4-element
+          // aligned position.
+          int k4 = k / 4 * 4;
+          accumulate_and_clear(lh + k4, lh32 + k4, 24 - k4);
+
+          // Last element of the row is computed separately.
+          lh[24] += lh32[24];
+          lh32[24] = 0;
+
+          lh += WIENER_WIN2_REDUCED_ALIGN2;
+          lh32 += WIENER_WIN2_REDUCED_ALIGN2;
+        }
+      }
+
+      j -= 2;
+    }
+
+    // Computations for odd pixel in the row.
+    if (width & 1) {
+      // Load two adjacent, overlapping 5x5 matrices: a 6x5 matrix with the
+      // middle 4x5 elements being shared.
+      int16x8_t dgd_rows[5];
+      load_and_pack_s16_6x5(dgd_rows, (const int16_t *)dgd, dgd_stride);
+
+      const int16_t *dgd_ptr = (const int16_t *)dgd + dgd_stride * 4;
+      ++dgd;
+
+      // Re-arrange the combined 6x5 matrix to have a whole 5x5 matrix
+      // tightly packed into an int16x8_t[3] array. This array contains 24
+      // elements of the 25 (5x5). Compute `dgd - avg` for the whole buffer.
+      // The DGD_AVG buffer contains 25 consecutive elements.
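+      // Unlike the 2-pixel path above, the mean subtraction here is folded
+      // in after the shuffle: each tbl output is de-averaged with a single
+      // vsubq_s16 instead of de-averaging every source row first.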
+      int16x8_t dgd_avg0[3];
+
+      dgd_avg0[0] = vsubq_s16(tbl2q(dgd_rows[0], dgd_rows[1], lut0), avg_s16);
+      dgd_avg0[1] = vsubq_s16(
+          tbl3q(dgd_rows[1], dgd_rows[2], dgd_rows[3], lut1), avg_s16);
+      dgd_avg0[2] = vsubq_s16(tbl2q(dgd_rows[3], dgd_rows[4], lut2), avg_s16);
+
+      vst1q_s16(DGD_AVG0, dgd_avg0[0]);
+      vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]);
+      vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]);
+
+      // The remaining last (25th) element of `dgd - avg`. Only DGD_AVG0 is
+      // needed in this single-pixel path.
+      DGD_AVG0[24] = dgd_ptr[4] - avg;
+
+      // Accumulate into row-major order variant of matrix M
+      // (cross-correlation) for 1 output pixel at a time. M is of size
+      // 5 * 5. It needs to be filled such that multiplying one element from
+      // src with each element of a row of the wiener window will fill one
+      // column of M. However this is not very convenient in terms of memory
+      // access, as it means we do contiguous loads of dgd but strided stores
+      // to M. As a result, we use an intermediate matrix M_s32 which is
+      // instead filled such that one row of the wiener window gives one row
+      // of M_s32. Once fully computed, M_s32 is then transposed to return M.
+      int src_avg0 = *src++ - avg;
+      int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
+      update_M_1pixel(M_s32 + 0, src_avg0_s16, dgd_avg0[0]);
+      update_M_1pixel(M_s32 + 8, src_avg0_s16, dgd_avg0[1]);
+      update_M_1pixel(M_s32 + 16, src_avg0_s16, dgd_avg0[2]);
+
+      // Last (25th) element of M_s32 can be computed as scalar more
+      // efficiently for 1 output pixel.
+      M_s32[24] += DGD_AVG0[24] * src_avg0;
+
+      // Start accumulating into row-major order version of matrix H
+      // (auto-covariance), it expects the DGD_AVG0 matrix to also be
+      // row-major. H is of size 25 * 25. It is filled by multiplying every
+      // pair of elements of the wiener window together (vector outer
+      // product). Since it is a symmetric matrix, we only compute the
+      // upper-right triangle, and then copy it down to the lower-left later.
+      // The upper triangle is covered by 4x4 tiles. The original algorithm
+      // assumes the M matrix is column-major and the resulting H matrix is
+      // also expected to be column-major. It is not efficient to work with
+      // column-major matrices, so we accumulate into a row-major matrix
+      // H_s32. At the end of the algorithm a double transpose transformation
+      // will convert H_s32 back to the expected output layout.
+      update_H_1pixel(H_s32, DGD_AVG0, WIENER_WIN2_REDUCED_ALIGN2, 24);
+
+      // The last element of the triangle of H_s32 matrix can be computed as
+      // a scalar more efficiently.
+ H_s32[24 * WIENER_WIN2_REDUCED_ALIGN2 + 24] += + DGD_AVG0[24] * DGD_AVG0[24]; + } + + src += src_next; + dgd += dgd_next; + } while (--height != 0); + + int bit_depth_shift = bit_depth - AOM_BITS_8; + + acc_transpose_M(M, M_s64, M_s32, WIENER_WIN_REDUCED, bit_depth_shift); + + update_H(H, H_s64, H_s32, WIENER_WIN_REDUCED, WIENER_WIN2_REDUCED_ALIGN2, + bit_depth_shift); +} + +static uint16_t highbd_find_average_neon(const uint16_t *src, int src_stride, + int width, int height) { + assert(width > 0); + assert(height > 0); + + uint64x2_t sum_u64 = vdupq_n_u64(0); + uint64_t sum = 0; + + int h = height; + do { + uint32x4_t sum_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int w = width; + const uint16_t *row = src; + while (w >= 32) { + uint16x8_t s0 = vld1q_u16(row + 0); + uint16x8_t s1 = vld1q_u16(row + 8); + uint16x8_t s2 = vld1q_u16(row + 16); + uint16x8_t s3 = vld1q_u16(row + 24); + + s0 = vaddq_u16(s0, s1); + s2 = vaddq_u16(s2, s3); + sum_u32[0] = vpadalq_u16(sum_u32[0], s0); + sum_u32[1] = vpadalq_u16(sum_u32[1], s2); + + row += 32; + w -= 32; + } + + if (w >= 16) { + uint16x8_t s0 = vld1q_u16(row + 0); + uint16x8_t s1 = vld1q_u16(row + 8); + + s0 = vaddq_u16(s0, s1); + sum_u32[0] = vpadalq_u16(sum_u32[0], s0); + + row += 16; + w -= 16; + } + + if (w >= 8) { + uint16x8_t s0 = vld1q_u16(row); + sum_u32[1] = vpadalq_u16(sum_u32[1], s0); + + row += 8; + w -= 8; + } + + if (w >= 4) { + uint16x8_t s0 = vcombine_u16(vld1_u16(row), vdup_n_u16(0)); + sum_u32[0] = vpadalq_u16(sum_u32[0], s0); + + row += 4; + w -= 4; + } + + while (w-- > 0) { + sum += *row++; + } + + sum_u64 = vpadalq_u32(sum_u64, vaddq_u32(sum_u32[0], sum_u32[1])); + + src += src_stride; + } while (--h != 0); + + return (uint16_t)((horizontal_add_u64x2(sum_u64) + sum) / (height * width)); +} + +void av1_compute_stats_highbd_neon(int wiener_win, const uint8_t *dgd8, + const uint8_t *src8, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, + int src_stride, int64_t *M, int64_t *H, + aom_bit_depth_t bit_depth) { + assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_REDUCED); + + const int wiener_halfwin = wiener_win >> 1; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8); + const int height = v_end - v_start; + const int width = h_end - h_start; + + const uint16_t *dgd_start = dgd + h_start + v_start * dgd_stride; + const uint16_t *src_start = src + h_start + v_start * src_stride; + + // The wiener window will slide along the dgd frame, centered on each pixel. + // For the top left pixel and all the pixels on the side of the frame this + // means half of the window will be outside of the frame. As such the actual + // buffer that we need to subtract the avg from will be 2 * wiener_halfwin + // wider and 2 * wiener_halfwin higher than the original dgd buffer. 
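+  // For example, with wiener_win == 7 (wiener_halfwin == 3) and
+  // h_start == v_start == 0, dgd_win starts 3 rows above and 3 columns to
+  // the left of dgd_start.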
+ const int vert_offset = v_start - wiener_halfwin; + const int horiz_offset = h_start - wiener_halfwin; + const uint16_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride; + + uint16_t avg = highbd_find_average_neon(dgd_start, dgd_stride, width, height); + + if (wiener_win == WIENER_WIN) { + highbd_compute_stats_win7_neon(dgd_win, src_start, avg, width, height, + dgd_stride, src_stride, M, H, bit_depth); + } else { + highbd_compute_stats_win5_neon(dgd_win, src_start, avg, width, height, + dgd_stride, src_stride, M, H, bit_depth); + } +} + +int64_t av1_highbd_pixel_proj_error_neon( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + int64_t sse = 0; + int64x2_t sse_s64 = vdupq_n_s64(0); + + if (params->r[0] > 0 && params->r[1] > 0) { + int32x2_t xq_v = vld1_s32(xq); + int32x2_t xq_sum_v = vshl_n_s32(vpadd_s32(xq_v, xq_v), 4); + + do { + int j = 0; + int32x4_t sse_s32 = vdupq_n_s32(0); + + do { + const uint16x8_t d = vld1q_u16(&dat[j]); + const uint16x8_t s = vld1q_u16(&src[j]); + int32x4_t flt0_0 = vld1q_s32(&flt0[j]); + int32x4_t flt0_1 = vld1q_s32(&flt0[j + 4]); + int32x4_t flt1_0 = vld1q_s32(&flt1[j]); + int32x4_t flt1_1 = vld1q_s32(&flt1[j + 4]); + + int32x4_t d_s32_lo = vreinterpretq_s32_u32( + vmull_lane_u16(vget_low_u16(d), vreinterpret_u16_s32(xq_sum_v), 0)); + int32x4_t d_s32_hi = vreinterpretq_s32_u32(vmull_lane_u16( + vget_high_u16(d), vreinterpret_u16_s32(xq_sum_v), 0)); + + int32x4_t v0 = vsubq_s32( + vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)), + d_s32_lo); + int32x4_t v1 = vsubq_s32( + vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)), + d_s32_hi); + + v0 = vmlaq_lane_s32(v0, flt0_0, xq_v, 0); + v1 = vmlaq_lane_s32(v1, flt0_1, xq_v, 0); + v0 = vmlaq_lane_s32(v0, flt1_0, xq_v, 1); + v1 = vmlaq_lane_s32(v1, flt1_1, xq_v, 1); + + int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS); + int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS); + + int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1), + vreinterpretq_s16_u16(vsubq_u16(d, s))); + int16x4_t e_lo = vget_low_s16(e); + int16x4_t e_hi = vget_high_s16(e); + + sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo); + sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi); + + j += 8; + } while (j <= width - 8); + + for (int k = j; k < width; ++k) { + int32_t v = 1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1); + v += xq[0] * (flt0[k]) + xq[1] * (flt1[k]); + v -= (xq[1] + xq[0]) * (int32_t)(dat[k] << 4); + int32_t e = + (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + dat[k] - src[k]; + sse += ((int64_t)e * e); + } + + sse_s64 = vpadalq_s32(sse_s64, sse_s32); + + dat += dat_stride; + src += src_stride; + flt0 += flt0_stride; + flt1 += flt1_stride; + } while (--height != 0); + } else if (params->r[0] > 0 || params->r[1] > 0) { + int xq_active = (params->r[0] > 0) ? xq[0] : xq[1]; + int32_t *flt = (params->r[0] > 0) ? flt0 : flt1; + int flt_stride = (params->r[0] > 0) ? 
flt0_stride : flt1_stride;
+    int32x4_t xq_v = vdupq_n_s32(xq_active);
+
+    do {
+      int j = 0;
+      int32x4_t sse_s32 = vdupq_n_s32(0);
+      do {
+        const uint16x8_t d0 = vld1q_u16(&dat[j]);
+        const uint16x8_t s0 = vld1q_u16(&src[j]);
+        int32x4_t flt0_0 = vld1q_s32(&flt[j]);
+        int32x4_t flt0_1 = vld1q_s32(&flt[j + 4]);
+
+        uint16x8_t d_u16 = vshlq_n_u16(d0, 4);
+        int32x4_t sub0 = vreinterpretq_s32_u32(
+            vsubw_u16(vreinterpretq_u32_s32(flt0_0), vget_low_u16(d_u16)));
+        int32x4_t sub1 = vreinterpretq_s32_u32(
+            vsubw_u16(vreinterpretq_u32_s32(flt0_1), vget_high_u16(d_u16)));
+
+        int32x4_t v0 = vmlaq_s32(
+            vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)), sub0,
+            xq_v);
+        int32x4_t v1 = vmlaq_s32(
+            vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)), sub1,
+            xq_v);
+
+        int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
+        int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
+
+        int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1),
+                                vreinterpretq_s16_u16(vsubq_u16(d0, s0)));
+        int16x4_t e_lo = vget_low_s16(e);
+        int16x4_t e_hi = vget_high_s16(e);
+
+        sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo);
+        sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi);
+
+        j += 8;
+      } while (j <= width - 8);
+
+      for (int k = j; k < width; ++k) {
+        int32_t v = 1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1);
+        v += xq_active * (int32_t)((uint32_t)flt[k] - (uint16_t)(dat[k] << 4));
+        const int32_t e =
+            (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + dat[k] - src[k];
+        sse += ((int64_t)e * e);
+      }
+
+      sse_s64 = vpadalq_s32(sse_s64, sse_s32);
+
+      dat += dat_stride;
+      flt += flt_stride;
+      src += src_stride;
+    } while (--height != 0);
+  } else {
+    do {
+      int j = 0;
+
+      do {
+        const uint16x8_t d = vld1q_u16(&dat[j]);
+        const uint16x8_t s = vld1q_u16(&src[j]);
+
+        uint16x8_t diff = vabdq_u16(d, s);
+        uint16x4_t diff_lo = vget_low_u16(diff);
+        uint16x4_t diff_hi = vget_high_u16(diff);
+
+        uint32x4_t sqr_lo = vmull_u16(diff_lo, diff_lo);
+        uint32x4_t sqr_hi = vmull_u16(diff_hi, diff_hi);
+
+        sse_s64 = vpadalq_s32(sse_s64, vreinterpretq_s32_u32(sqr_lo));
+        sse_s64 = vpadalq_s32(sse_s64, vreinterpretq_s32_u32(sqr_hi));
+
+        j += 8;
+      } while (j <= width - 8);
+
+      for (int k = j; k < width; ++k) {
+        int32_t e = dat[k] - src[k];
+        sse += e * e;
+      }
+
+      dat += dat_stride;
+      src += src_stride;
+    } while (--height != 0);
+  }
+
+  sse += horizontal_add_s64x2(sse_s64);
+  return sse;
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/highbd_rdopt_neon.c b/third_party/aom/av1/encoder/arm/neon/highbd_rdopt_neon.c
new file mode 100644
index 0000000000..4bf7ae6ce4
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/highbd_rdopt_neon.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+int64_t av1_highbd_block_error_neon(const tran_low_t *coeff,
+                                    const tran_low_t *dqcoeff,
+                                    intptr_t block_size, int64_t *ssz,
+                                    int bd) {
+  uint64x2_t err_u64 = vdupq_n_u64(0);
+  int64x2_t ssz_s64 = vdupq_n_s64(0);
+
+  const int shift = 2 * (bd - 8);
+  const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+
+  assert(block_size >= 16);
+  assert((block_size % 16) == 0);
+
+  do {
+    const int32x4_t c = vld1q_s32(coeff);
+    const int32x4_t d = vld1q_s32(dqcoeff);
+
+    const uint32x4_t diff = vreinterpretq_u32_s32(vabdq_s32(c, d));
+
+    err_u64 = vmlal_u32(err_u64, vget_low_u32(diff), vget_low_u32(diff));
+    err_u64 = vmlal_u32(err_u64, vget_high_u32(diff), vget_high_u32(diff));
+
+    ssz_s64 = vmlal_s32(ssz_s64, vget_low_s32(c), vget_low_s32(c));
+    ssz_s64 = vmlal_s32(ssz_s64, vget_high_s32(c), vget_high_s32(c));
+
+    coeff += 4;
+    dqcoeff += 4;
+    block_size -= 4;
+  } while (block_size != 0);
+
+  *ssz = (horizontal_add_s64x2(ssz_s64) + rounding) >> shift;
+  return ((int64_t)horizontal_add_u64x2(err_u64) + rounding) >> shift;
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/highbd_temporal_filter_neon.c b/third_party/aom/av1/encoder/arm/neon/highbd_temporal_filter_neon.c
new file mode 100644
index 0000000000..88e176f56c
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/highbd_temporal_filter_neon.c
@@ -0,0 +1,562 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+#include "aom_dsp/mathutils.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE void get_squared_error(
+    const uint16_t *frame1, const uint32_t stride1, const uint16_t *frame2,
+    const uint32_t stride2, const uint32_t block_width,
+    const uint32_t block_height, uint32_t *frame_sse,
+    const unsigned int dst_stride) {
+  uint32_t *dst = frame_sse;
+
+  uint32_t i = 0;
+  do {
+    uint32_t j = 0;
+    do {
+      uint16x8_t s = vld1q_u16(frame1 + i * stride1 + j);
+      uint16x8_t r = vld1q_u16(frame2 + i * stride2 + j);
+
+      uint16x8_t abs_diff = vabdq_u16(s, r);
+      uint32x4_t sse_lo =
+          vmull_u16(vget_low_u16(abs_diff), vget_low_u16(abs_diff));
+      uint32x4_t sse_hi =
+          vmull_u16(vget_high_u16(abs_diff), vget_high_u16(abs_diff));
+
+      vst1q_u32(dst + j, sse_lo);
+      vst1q_u32(dst + j + 4, sse_hi);
+
+      j += 8;
+    } while (j < block_width);
+
+    dst += dst_stride;
+    i++;
+  } while (i < block_height);
+}
+
+static uint32_t sum_kernel5x5_mask_single(const uint32x4_t vsrc[5][2],
+                                          const uint32x4_t mask_single) {
+  uint32x4_t vsums = vmulq_u32(vsrc[0][0], mask_single);
+  vsums = vmlaq_u32(vsums, vsrc[1][0], mask_single);
+  vsums = vmlaq_u32(vsums, vsrc[2][0], mask_single);
+  vsums = vmlaq_u32(vsums, vsrc[3][0], mask_single);
+  vsums = vmlaq_u32(vsums, vsrc[4][0], mask_single);
+  return horizontal_add_u32x4(vsums);
+}
+
+static uint32x4_t sum_kernel5x5_mask_double(const uint32x4_t vsrc[5][2],
+                                            const uint32x4_t mask1,
+                                            const uint32x4_t mask2) {
+  uint32x4_t vsums = vmulq_u32(vsrc[0][0], mask1);
+  vsums = vmlaq_u32(vsums, vsrc[1][0], mask1);
+  vsums = vmlaq_u32(vsums, vsrc[2][0], mask1);
+  vsums = vmlaq_u32(vsums, vsrc[3][0], mask1);
+  vsums = vmlaq_u32(vsums, vsrc[4][0], mask1);
+  vsums = vmlaq_u32(vsums, vsrc[0][1], mask2);
+  vsums = vmlaq_u32(vsums, vsrc[1][1], mask2);
+  vsums = vmlaq_u32(vsums, vsrc[2][1], mask2);
+  vsums = vmlaq_u32(vsums, vsrc[3][1], mask2);
+  vsums = vmlaq_u32(vsums, vsrc[4][1], mask2);
+  return vsums;
+}
+
+static void highbd_apply_temporal_filter(
+    const uint16_t *frame, const unsigned int stride,
+    const uint32_t block_width, const uint32_t block_height,
+    const int *subblock_mses, unsigned int *accumulator, uint16_t *count,
+    const uint32_t *frame_sse, const uint32_t frame_sse_stride,
+    const uint32_t *luma_sse_sum, const double inv_num_ref_pixels,
+    const double decay_factor, const double inv_factor,
+    const double weight_factor, const double *d_factor, int tf_wgt_calc_lvl,
+    int bd) {
+  assert(((block_width == 16) || (block_width == 32)) &&
+         ((block_height == 16) || (block_height == 32)));
+
+  uint32_t acc_5x5_neon[BH][BW] = { 0 };
+  const int half_window = TF_WINDOW_LENGTH >> 1;
+
+  uint32x4_t vsrc[5][2] = { 0 };
+  const uint32x4_t k0000 = vdupq_n_u32(0);
+  const uint32x4_t k1111 = vdupq_n_u32(1);
+  const uint32_t k3110_u32[4] = { 0, 1, 1, 3 };
+  const uint32_t k2111_u32[4] = { 1, 1, 1, 2 };
+  const uint32_t k1112_u32[4] = { 2, 1, 1, 1 };
+  const uint32_t k0113_u32[4] = { 3, 1, 1, 0 };
+  const uint32x4_t k3110 = vld1q_u32(k3110_u32);
+  const uint32x4_t k2111 = vld1q_u32(k2111_u32);
+  const uint32x4_t k1112 = vld1q_u32(k1112_u32);
+  const uint32x4_t k0113 = vld1q_u32(k0113_u32);
+
+  uint32x4_t vmask1[4], vmask2[4];
+  vmask1[0] = k1111;
+  vmask2[0] = vextq_u32(k1111, k0000, 3);
+  vmask1[1] = vextq_u32(k0000, k1111, 3);
+  vmask2[1] = vextq_u32(k1111, k0000, 2);
+  vmask1[2] = vextq_u32(k0000, k1111, 2);
+  vmask2[2] = vextq_u32(k1111, k0000, 1);
+  vmask1[3] = vextq_u32(k0000, k1111, 1);
+  vmask2[3] = k1111;
+
+  uint32_t row = 0;
+  do {
+    uint32_t col = 0;
+    const uint32_t *src = frame_sse + row * frame_sse_stride;
+    if (row == 0) {
+      vsrc[2][0] = vld1q_u32(src);
+      vsrc[3][0] = vld1q_u32(src + frame_sse_stride);
+      vsrc[4][0] = vld1q_u32(src + 2 * frame_sse_stride);
+
+      // First 2 rows of the 5x5 matrix are padded from the 1st.
+      vsrc[0][0] = vsrc[2][0];
+      vsrc[1][0] = vsrc[2][0];
+    } else if (row == 1) {
+      vsrc[1][0] = vld1q_u32(src - frame_sse_stride);
+      vsrc[2][0] = vld1q_u32(src);
+      vsrc[3][0] = vld1q_u32(src + frame_sse_stride);
+      vsrc[4][0] = vld1q_u32(src + 2 * frame_sse_stride);
+
+      // First row of the 5x5 matrix is padded from the 1st.
+      vsrc[0][0] = vsrc[1][0];
+    } else if (row == block_height - 2) {
+      vsrc[0][0] = vld1q_u32(src - 2 * frame_sse_stride);
+      vsrc[1][0] = vld1q_u32(src - frame_sse_stride);
+      vsrc[2][0] = vld1q_u32(src);
+      vsrc[3][0] = vld1q_u32(src + frame_sse_stride);
+
+      // Last row of the 5x5 matrix is padded from the one before.
+      vsrc[4][0] = vsrc[3][0];
+    } else if (row == block_height - 1) {
+      vsrc[0][0] = vld1q_u32(src - 2 * frame_sse_stride);
+      vsrc[1][0] = vld1q_u32(src - frame_sse_stride);
+      vsrc[2][0] = vld1q_u32(src);
+
+      // Last 2 rows of the 5x5 matrix are padded from the 3rd.
+      vsrc[3][0] = vsrc[2][0];
+      vsrc[4][0] = vsrc[2][0];
+    } else {
+      vsrc[0][0] = vld1q_u32(src - 2 * frame_sse_stride);
+      vsrc[1][0] = vld1q_u32(src - frame_sse_stride);
+      vsrc[2][0] = vld1q_u32(src);
+      vsrc[3][0] = vld1q_u32(src + frame_sse_stride);
+      vsrc[4][0] = vld1q_u32(src + 2 * frame_sse_stride);
+    }
+
+    acc_5x5_neon[row][0] = sum_kernel5x5_mask_single(vsrc, k0113);
+    acc_5x5_neon[row][1] = sum_kernel5x5_mask_single(vsrc, k1112);
+
+    col += 4;
+    src += 4;
+    // Traverse 4 columns at a time
+    do {
+      if (row == 0) {
+        vsrc[2][1] = vld1q_u32(src);
+        vsrc[3][1] = vld1q_u32(src + frame_sse_stride);
+        vsrc[4][1] = vld1q_u32(src + 2 * frame_sse_stride);
+
+        // First 2 rows of the 5x5 matrix are padded from the 1st.
+        vsrc[0][1] = vsrc[2][1];
+        vsrc[1][1] = vsrc[2][1];
+      } else if (row == 1) {
+        vsrc[1][1] = vld1q_u32(src - frame_sse_stride);
+        vsrc[2][1] = vld1q_u32(src);
+        vsrc[3][1] = vld1q_u32(src + frame_sse_stride);
+        vsrc[4][1] = vld1q_u32(src + 2 * frame_sse_stride);
+
+        // First row of the 5x5 matrix is padded from the 1st.
+        vsrc[0][1] = vsrc[1][1];
+      } else if (row == block_height - 2) {
+        vsrc[0][1] = vld1q_u32(src - 2 * frame_sse_stride);
+        vsrc[1][1] = vld1q_u32(src - frame_sse_stride);
+        vsrc[2][1] = vld1q_u32(src);
+        vsrc[3][1] = vld1q_u32(src + frame_sse_stride);
+
+        // Last row of the 5x5 matrix is padded from the one before.
+        vsrc[4][1] = vsrc[3][1];
+      } else if (row == block_height - 1) {
+        vsrc[0][1] = vld1q_u32(src - 2 * frame_sse_stride);
+        vsrc[1][1] = vld1q_u32(src - frame_sse_stride);
+        vsrc[2][1] = vld1q_u32(src);
+
+        // Last 2 rows of the 5x5 matrix are padded from the 3rd.
+ vsrc[3][1] = vsrc[2][1]; + vsrc[4][1] = vsrc[2][1]; + } else { + vsrc[0][1] = vld1q_u32(src - 2 * frame_sse_stride); + vsrc[1][1] = vld1q_u32(src - frame_sse_stride); + vsrc[2][1] = vld1q_u32(src); + vsrc[3][1] = vld1q_u32(src + frame_sse_stride); + vsrc[4][1] = vld1q_u32(src + 2 * frame_sse_stride); + } + + uint32x4_t sums[4]; + sums[0] = sum_kernel5x5_mask_double(vsrc, vmask1[0], vmask2[0]); + sums[1] = sum_kernel5x5_mask_double(vsrc, vmask1[1], vmask2[1]); + sums[2] = sum_kernel5x5_mask_double(vsrc, vmask1[2], vmask2[2]); + sums[3] = sum_kernel5x5_mask_double(vsrc, vmask1[3], vmask2[3]); + vst1q_u32(&acc_5x5_neon[row][col - half_window], + horizontal_add_4d_u32x4(sums)); + + vsrc[0][0] = vsrc[0][1]; + vsrc[1][0] = vsrc[1][1]; + vsrc[2][0] = vsrc[2][1]; + vsrc[3][0] = vsrc[3][1]; + vsrc[4][0] = vsrc[4][1]; + + src += 4; + col += 4; + } while (col <= block_width - 4); + + acc_5x5_neon[row][col - half_window] = + sum_kernel5x5_mask_single(vsrc, k2111); + acc_5x5_neon[row][col - half_window + 1] = + sum_kernel5x5_mask_single(vsrc, k3110); + + row++; + } while (row < block_height); + + // Perform filtering. + if (tf_wgt_calc_lvl == 0) { + for (unsigned int i = 0, k = 0; i < block_height; i++) { + for (unsigned int j = 0; j < block_width; j++, k++) { + const int pixel_value = frame[i * stride + j]; + // Scale down the difference for high bit depth input. + const uint32_t diff_sse = + (acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j]) >> ((bd - 8) * 2); + + const double window_error = diff_sse * inv_num_ref_pixels; + const int subblock_idx = + (i >= block_height / 2) * 2 + (j >= block_width / 2); + const double block_error = (double)subblock_mses[subblock_idx]; + const double combined_error = + weight_factor * window_error + block_error * inv_factor; + // Compute filter weight. + double scaled_error = + combined_error * d_factor[subblock_idx] * decay_factor; + scaled_error = AOMMIN(scaled_error, 7); + const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE); + accumulator[k] += weight * pixel_value; + count[k] += weight; + } + } + } else { + for (unsigned int i = 0, k = 0; i < block_height; i++) { + for (unsigned int j = 0; j < block_width; j++, k++) { + const int pixel_value = frame[i * stride + j]; + // Scale down the difference for high bit depth input. + const uint32_t diff_sse = + (acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j]) >> ((bd - 8) * 2); + + const double window_error = diff_sse * inv_num_ref_pixels; + const int subblock_idx = + (i >= block_height / 2) * 2 + (j >= block_width / 2); + const double block_error = (double)subblock_mses[subblock_idx]; + const double combined_error = + weight_factor * window_error + block_error * inv_factor; + // Compute filter weight. 
+ double scaled_error = + combined_error * d_factor[subblock_idx] * decay_factor; + scaled_error = AOMMIN(scaled_error, 7); + const float fweight = + approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE; + const int weight = iroundpf(fweight); + accumulator[k] += weight * pixel_value; + count[k] += weight; + } + } + } +} + +void av1_highbd_apply_temporal_filter_neon( + const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd, + const BLOCK_SIZE block_size, const int mb_row, const int mb_col, + const int num_planes, const double *noise_levels, const MV *subblock_mvs, + const int *subblock_mses, const int q_factor, const int filter_strength, + int tf_wgt_calc_lvl, const uint8_t *pred8, uint32_t *accum, + uint16_t *count) { + const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH; + assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with Neon!"); + assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE); + (void)is_high_bitdepth; + assert(is_high_bitdepth); + + // Block information. + const int mb_height = block_size_high[block_size]; + const int mb_width = block_size_wide[block_size]; + // Frame information. + const int frame_height = frame_to_filter->y_crop_height; + const int frame_width = frame_to_filter->y_crop_width; + const int min_frame_size = AOMMIN(frame_height, frame_width); + // Variables to simplify combined error calculation. + const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) * + TF_SEARCH_ERROR_NORM_WEIGHT); + const double weight_factor = + (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor; + // Adjust filtering based on q. + // Larger q -> stronger filtering -> larger weight. + // Smaller q -> weaker filtering -> smaller weight. + double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2); + q_decay = CLIP(q_decay, 1e-5, 1); + if (q_factor >= TF_QINDEX_CUTOFF) { + // Max q_factor is 255, therefore the upper bound of q_decay is 8. + // We do not need a clip here. + q_decay = 0.5 * pow((double)q_factor / 64, 2); + } + // Smaller strength -> smaller filtering weight. + double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2); + s_decay = CLIP(s_decay, 1e-5, 1); + double d_factor[4] = { 0 }; + uint32_t frame_sse[BW * BH] = { 0 }; + uint32_t luma_sse_sum[BW * BH] = { 0 }; + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + + for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) { + // Larger motion vector -> smaller filtering weight. + const MV mv = subblock_mvs[subblock_idx]; + const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2)); + double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD; + distance_threshold = AOMMAX(distance_threshold, 1); + d_factor[subblock_idx] = distance / distance_threshold; + d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1); + } + + // Handle planes in sequence. + int plane_offset = 0; + for (int plane = 0; plane < num_planes; ++plane) { + const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y; + const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x; + const uint32_t frame_stride = + frame_to_filter->strides[plane == AOM_PLANE_Y ? 
0 : 1]; + const uint32_t frame_sse_stride = plane_w; + const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w; + + const uint16_t *ref = + CONVERT_TO_SHORTPTR(frame_to_filter->buffers[plane]) + frame_offset; + const int ss_x_shift = + mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x; + const int ss_y_shift = + mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y; + const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH + + ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0); + const double inv_num_ref_pixels = 1.0 / num_ref_pixels; + // Larger noise -> larger filtering weight. + const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0); + // Decay factors for non-local mean approach. + const double decay_factor = 1 / (n_decay * q_decay * s_decay); + + // Filter U-plane and V-plane using Y-plane. This is because motion + // search is only done on Y-plane, so the information from Y-plane + // will be more accurate. The luma sse sum is reused in both chroma + // planes. + if (plane == AOM_PLANE_U) { + for (unsigned int i = 0; i < plane_h; i++) { + for (unsigned int j = 0; j < plane_w; j++) { + for (int ii = 0; ii < (1 << ss_y_shift); ++ii) { + for (int jj = 0; jj < (1 << ss_x_shift); ++jj) { + const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane. + const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane. + const int ww = frame_sse_stride + << ss_x_shift; // Width of Y-plane. + luma_sse_sum[i * BW + j] += frame_sse[yy * ww + xx]; + } + } + } + } + } + get_squared_error(ref, frame_stride, pred + plane_offset, plane_w, plane_w, + plane_h, frame_sse, frame_sse_stride); + + highbd_apply_temporal_filter( + pred + plane_offset, plane_w, plane_w, plane_h, subblock_mses, + accum + plane_offset, count + plane_offset, frame_sse, frame_sse_stride, + luma_sse_sum, inv_num_ref_pixels, decay_factor, inv_factor, + weight_factor, d_factor, tf_wgt_calc_lvl, mbd->bd); + + plane_offset += plane_h * plane_w; + } +} + +double av1_highbd_estimate_noise_from_single_plane_neon(const uint16_t *src, + int height, int width, + int stride, + int bitdepth, + int edge_thresh) { + uint16x8_t thresh = vdupq_n_u16(edge_thresh); + uint64x2_t acc = vdupq_n_u64(0); + // Count is in theory positive as it counts the number of times we're under + // the threshold, but it will be counted negatively in order to make best use + // of the vclt instruction, which sets every bit of a lane to 1 when the + // condition is true. + int32x4_t count = vdupq_n_s32(0); + int final_count = 0; + uint64_t final_acc = 0; + const uint16_t *src_start = src + stride + 1; + int h = 1; + + do { + int w = 1; + const uint16_t *src_ptr = src_start; + + while (w <= (width - 1) - 8) { + uint16x8_t mat[3][3]; + mat[0][0] = vld1q_u16(src_ptr - stride - 1); + mat[0][1] = vld1q_u16(src_ptr - stride); + mat[0][2] = vld1q_u16(src_ptr - stride + 1); + mat[1][0] = vld1q_u16(src_ptr - 1); + mat[1][1] = vld1q_u16(src_ptr); + mat[1][2] = vld1q_u16(src_ptr + 1); + mat[2][0] = vld1q_u16(src_ptr + stride - 1); + mat[2][1] = vld1q_u16(src_ptr + stride); + mat[2][2] = vld1q_u16(src_ptr + stride + 1); + + // Compute Sobel gradients. 
+      uint16x8_t gxa = vaddq_u16(mat[0][0], mat[2][0]);
+      uint16x8_t gxb = vaddq_u16(mat[0][2], mat[2][2]);
+      gxa = vaddq_u16(gxa, vaddq_u16(mat[1][0], mat[1][0]));
+      gxb = vaddq_u16(gxb, vaddq_u16(mat[1][2], mat[1][2]));
+
+      uint16x8_t gya = vaddq_u16(mat[0][0], mat[0][2]);
+      uint16x8_t gyb = vaddq_u16(mat[2][0], mat[2][2]);
+      gya = vaddq_u16(gya, vaddq_u16(mat[0][1], mat[0][1]));
+      gyb = vaddq_u16(gyb, vaddq_u16(mat[2][1], mat[2][1]));
+
+      uint16x8_t ga = vabaq_u16(vabdq_u16(gxa, gxb), gya, gyb);
+      ga = vrshlq_u16(ga, vdupq_n_s16(8 - bitdepth));
+
+      // Check which vector elements are under the threshold. The Laplacian
+      // is then unconditionally computed and we accumulate zeros if we're
+      // not under the threshold. This is much faster than using an if
+      // statement.
+      uint16x8_t thresh_u16 = vcltq_u16(ga, thresh);
+
+      uint16x8_t center = vshlq_n_u16(mat[1][1], 2);
+
+      uint16x8_t adj0 = vaddq_u16(mat[0][1], mat[2][1]);
+      uint16x8_t adj1 = vaddq_u16(mat[1][0], mat[1][2]);
+      uint16x8_t adj = vaddq_u16(adj0, adj1);
+      adj = vaddq_u16(adj, adj);
+
+      uint16x8_t diag0 = vaddq_u16(mat[0][0], mat[0][2]);
+      uint16x8_t diag1 = vaddq_u16(mat[2][0], mat[2][2]);
+      uint16x8_t diag = vaddq_u16(diag0, diag1);
+
+      uint16x8_t v = vabdq_u16(vaddq_u16(center, diag), adj);
+      v = vandq_u16(vrshlq_u16(v, vdupq_n_s16(8 - bitdepth)), thresh_u16);
+      uint32x4_t v_u32 = vpaddlq_u16(v);
+
+      acc = vpadalq_u32(acc, v_u32);
+      // Add -1 for each lane where the gradient is under the threshold.
+      count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16));
+
+      w += 8;
+      src_ptr += 8;
+    }
+
+    if (w <= (width - 1) - 4) {
+      uint16x4_t mat[3][3];
+      mat[0][0] = vld1_u16(src_ptr - stride - 1);
+      mat[0][1] = vld1_u16(src_ptr - stride);
+      mat[0][2] = vld1_u16(src_ptr - stride + 1);
+      mat[1][0] = vld1_u16(src_ptr - 1);
+      mat[1][1] = vld1_u16(src_ptr);
+      mat[1][2] = vld1_u16(src_ptr + 1);
+      mat[2][0] = vld1_u16(src_ptr + stride - 1);
+      mat[2][1] = vld1_u16(src_ptr + stride);
+      mat[2][2] = vld1_u16(src_ptr + stride + 1);
+
+      // Compute Sobel gradients.
+      uint16x4_t gxa = vadd_u16(mat[0][0], mat[2][0]);
+      uint16x4_t gxb = vadd_u16(mat[0][2], mat[2][2]);
+      gxa = vadd_u16(gxa, vadd_u16(mat[1][0], mat[1][0]));
+      gxb = vadd_u16(gxb, vadd_u16(mat[1][2], mat[1][2]));
+
+      uint16x4_t gya = vadd_u16(mat[0][0], mat[0][2]);
+      uint16x4_t gyb = vadd_u16(mat[2][0], mat[2][2]);
+      gya = vadd_u16(gya, vadd_u16(mat[0][1], mat[0][1]));
+      gyb = vadd_u16(gyb, vadd_u16(mat[2][1], mat[2][1]));
+
+      uint16x4_t ga = vaba_u16(vabd_u16(gxa, gxb), gya, gyb);
+      ga = vrshl_u16(ga, vdup_n_s16(8 - bitdepth));
+
+      // Check which vector elements are under the threshold. The Laplacian
+      // is then unconditionally computed and we accumulate zeros if we're
+      // not under the threshold. This is much faster than using an if
+      // statement.
+      uint16x4_t thresh_u16 = vclt_u16(ga, vget_low_u16(thresh));
+
+      uint16x4_t center = vshl_n_u16(mat[1][1], 2);
+
+      uint16x4_t adj0 = vadd_u16(mat[0][1], mat[2][1]);
+      uint16x4_t adj1 = vadd_u16(mat[1][0], mat[1][2]);
+      uint16x4_t adj = vadd_u16(adj0, adj1);
+      adj = vadd_u16(adj, adj);
+
+      uint16x4_t diag0 = vadd_u16(mat[0][0], mat[0][2]);
+      uint16x4_t diag1 = vadd_u16(mat[2][0], mat[2][2]);
+      uint16x4_t diag = vadd_u16(diag0, diag1);
+
+      uint16x4_t v = vabd_u16(vadd_u16(center, diag), adj);
+      v = vand_u16(v, thresh_u16);
+      uint32x4_t v_u32 = vmovl_u16(vrshl_u16(v, vdup_n_s16(8 - bitdepth)));
+
+      acc = vpadalq_u32(acc, v_u32);
+      // Add -1 for each lane where the gradient is under the threshold.
+      count = vaddw_s16(count, vreinterpret_s16_u16(thresh_u16));
+
+      w += 4;
+      src_ptr += 4;
+    }
+
+    while (w < width - 1) {
+      int mat[3][3];
+      mat[0][0] = *(src_ptr - stride - 1);
+      mat[0][1] = *(src_ptr - stride);
+      mat[0][2] = *(src_ptr - stride + 1);
+      mat[1][0] = *(src_ptr - 1);
+      mat[1][1] = *(src_ptr);
+      mat[1][2] = *(src_ptr + 1);
+      mat[2][0] = *(src_ptr + stride - 1);
+      mat[2][1] = *(src_ptr + stride);
+      mat[2][2] = *(src_ptr + stride + 1);
+
+      // Compute Sobel gradients.
+      const int gx = (mat[0][0] - mat[0][2]) + (mat[2][0] - mat[2][2]) +
+                     2 * (mat[1][0] - mat[1][2]);
+      const int gy = (mat[0][0] - mat[2][0]) + (mat[0][2] - mat[2][2]) +
+                     2 * (mat[0][1] - mat[2][1]);
+      const int ga = ROUND_POWER_OF_TWO(abs(gx) + abs(gy), bitdepth - 8);
+
+      // Accumulate Laplacian.
+      const int is_under = ga < edge_thresh;
+      const int v = 4 * mat[1][1] -
+                    2 * (mat[0][1] + mat[2][1] + mat[1][0] + mat[1][2]) +
+                    (mat[0][0] + mat[0][2] + mat[2][0] + mat[2][2]);
+      final_acc += ROUND_POWER_OF_TWO(abs(v), bitdepth - 8) * is_under;
+      final_count += is_under;
+
+      src_ptr++;
+      w++;
+    }
+    src_start += stride;
+  } while (++h < height - 1);
+
+  // We counted negatively, so subtract to get the final value.
+  final_count -= horizontal_add_s32x4(count);
+  final_acc += horizontal_add_u64x2(acc);
+  return (final_count < 16)
+             ? -1.0
+             : (double)final_acc / (6 * final_count) * SQRT_PI_BY_2;
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c b/third_party/aom/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c
new file mode 100644
index 0000000000..6cf835a243
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/txfm_common.h"
+
+static void transpose4x4(int16x8_t in[2], int16x4_t out[4]) {
+  int32x4x2_t b0 =
+      vtrnq_s32(vreinterpretq_s32_s16(in[0]), vreinterpretq_s32_s16(in[1]));
+  int16x4x2_t c0 = vtrn_s16(vreinterpret_s16_s32(vget_low_s32(b0.val[0])),
+                            vreinterpret_s16_s32(vget_high_s32(b0.val[0])));
+  int16x4x2_t c1 = vtrn_s16(vreinterpret_s16_s32(vget_low_s32(b0.val[1])),
+                            vreinterpret_s16_s32(vget_high_s32(b0.val[1])));
+  out[0] = c0.val[0];
+  out[1] = c0.val[1];
+  out[2] = c1.val[0];
+  out[3] = c1.val[1];
+}
+
+void av1_fwht4x4_neon(const int16_t *input, tran_low_t *output, int stride) {
+  // Load the 4x4 source in transposed form.
+  int16x4_t a1, b1, c1, d1, e;
+  a1 = vld1_s16(&input[0]);
+  b1 = vld1_s16(&input[1 * stride]);
+  c1 = vld1_s16(&input[2 * stride]);
+  d1 = vld1_s16(&input[3 * stride]);
+
+  // WHT.
+
+  // Row transforms.
+  a1 = vadd_s16(a1, b1);
+  d1 = vsub_s16(d1, c1);
+  e = vhsub_s16(a1, d1);
+  b1 = vsub_s16(e, b1);
+  c1 = vsub_s16(e, c1);
+  a1 = vsub_s16(a1, c1);
+  d1 = vadd_s16(d1, b1);
+
+  int16x8_t x[2];
+  x[0] = vcombine_s16(a1, c1);
+  x[1] = vcombine_s16(d1, b1);
+
+  int16x4_t s[4];
+  transpose4x4(x, s);
+
+  a1 = s[0];
+  b1 = s[1];
+  c1 = s[2];
+  d1 = s[3];
+
+  // Row transforms.
+  a1 = vadd_s16(a1, b1);
+  d1 = vsub_s16(d1, c1);
+  e = vhsub_s16(a1, d1);
+  b1 = vsub_s16(e, b1);
+  c1 = vsub_s16(e, c1);
+  a1 = vsub_s16(a1, c1);
+  d1 = vadd_s16(d1, b1);
+
+  vst1q_s32(&output[0], vshll_n_s16(a1, UNIT_QUANT_SHIFT));
+  vst1q_s32(&output[4], vshll_n_s16(c1, UNIT_QUANT_SHIFT));
+  vst1q_s32(&output[8], vshll_n_s16(d1, UNIT_QUANT_SHIFT));
+  vst1q_s32(&output[12], vshll_n_s16(b1, UNIT_QUANT_SHIFT));
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/ml_neon.c b/third_party/aom/av1/encoder/arm/neon/ml_neon.c
new file mode 100644
index 0000000000..be6ddfd763
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/ml_neon.c
@@ -0,0 +1,339 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdbool.h>
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "av1/encoder/ml.h"
+
+static void nn_activate8(float32x4_t *out_h, float32x4_t *out_l,
+                         const float32x4_t *zero) {
+  *out_h = vmaxq_f32(*out_h, *zero);
+  *out_l = vmaxq_f32(*out_l, *zero);
+}
+
+static void nn_activate4(float32x4_t *x, const float32x4_t *zero) {
+  *x = vmaxq_f32(*x, *zero);
+}
+
+#define CLAMP_0(x) (x = x > 0 ? x : 0)
+
+static void nn_propagate_8to1(int num_inputs, const float *const inputs,
+                              const float *const weights,
+                              const float *layer_bias,
+                              float *const output_nodes, bool output_layer) {
+  const float32x4_t zero = vdupq_n_f32(0);
+  float32x4_t vadd = zero;
+  float total = *layer_bias;
+
+  for (int in = 0; in < num_inputs; in += 8) {
+    const float32x4_t inputs_h = vld1q_f32(&inputs[in + 4]);
+    const float32x4_t inputs_l = vld1q_f32(&inputs[in]);
+
+    const float32x4_t weights_h = vld1q_f32(&weights[in + 4]);
+    const float32x4_t weights_l = vld1q_f32(&weights[in]);
+
+    vadd = vmlaq_f32(vadd, inputs_h, weights_h);
+    vadd = vmlaq_f32(vadd, inputs_l, weights_l);
+  }
+#if AOM_ARCH_AARCH64
+  total += vaddvq_f32(vadd);
+#else
+  float32x2_t vadd_lo = vadd_f32(vget_low_f32(vadd), vget_high_f32(vadd));
+  vadd_lo = vpadd_f32(vadd_lo, vadd_lo);
+  total += vget_lane_f32(vadd_lo, 0);
+#endif
+
+  if (!output_layer) CLAMP_0(total);
+  *output_nodes = total;
+}
+
+static void nn_propagate_xto1(int num_inputs, const float *const inputs,
+                              const float *const weights,
+                              const float *layer_bias,
+                              float *const output_nodes) {
+  float32x4_t vadd = vdupq_n_f32(0);
+
+  float total = *layer_bias;
+  int j = num_inputs;
+  int in = 0;
+  while (j > 7) {
+    const float32x4_t inputs_h = vld1q_f32(&inputs[in + 4]);
+    const float32x4_t inputs_l = vld1q_f32(&inputs[in]);
+
+    const float32x4_t weights_h = vld1q_f32(&weights[in + 4]);
+    const float32x4_t weights_l = vld1q_f32(&weights[in]);
+
+    vadd = vmlaq_f32(vadd, inputs_h, weights_h);
+    vadd = vmlaq_f32(vadd, inputs_l, weights_l);
+    in += 8;
+    j -= 8;
+  }
+
+#if AOM_ARCH_AARCH64
+  total += vaddvq_f32(vadd);
+
+#else
+  float32x2_t vadd_lo = vadd_f32(vget_low_f32(vadd), vget_high_f32(vadd));
+  vadd_lo = vpadd_f32(vadd_lo, vadd_lo);
+  total += vget_lane_f32(vadd_lo, 0);
+#endif
+  for (; in < num_inputs; in++) total += weights[in] * inputs[in];
+
+  *output_nodes =
CLAMP_0(total); +} + +static void nn_propagate_xsto1(int num_inputs, const float *const inputs, + const float *const weights, + const float *layer_bias, + float *const output_nodes) { + float total = *layer_bias; +#if AOM_ARCH_AARCH64 + const float32x4_t v_inputs = vld1q_f32(inputs); + const float32x4_t v_weights = vld1q_f32(weights); + const float32x4_t vadd = vmulq_f32(v_inputs, v_weights); + total += vaddvq_f32(vadd); + int in = 4; +#else + int in = 0; +#endif + for (; in < num_inputs; in++) total += weights[in] * inputs[in]; + + *output_nodes = CLAMP_0(total); +} + +static void nn_propagate_4to1(int num_inputs, const float *const inputs, + const float *const weights, + const float *layer_bias, + float *const output_nodes, bool output_layer) { + const float32x4_t zero = vdupq_n_f32(0); + float32x4_t vadd = zero; + float total = *layer_bias; + + for (int in = 0; in < num_inputs; in += 4) { + const float32x4_t v_inputs = vld1q_f32(&inputs[in]); + const float32x4_t v_weights = vld1q_f32(&weights[in]); + vadd = vmlaq_f32(vadd, v_inputs, v_weights); + } + +#if AOM_ARCH_AARCH64 + total += vaddvq_f32(vadd); +#else + float32x2_t vadd_lo = vadd_f32(vget_low_f32(vadd), vget_high_f32(vadd)); + vadd_lo = vpadd_f32(vadd_lo, vadd_lo); + total += vget_lane_f32(vadd_lo, 0); +#endif + + if (!output_layer) CLAMP_0(total); + *output_nodes = total; +} + +static void nn_propagate_4to4(int num_inputs, const float *const inputs, + const float *const weights, + const float *layer_bias, + float *const output_nodes, bool output_layer) { + float32x4_t outputs = vld1q_f32(layer_bias); + const float32x4_t zero = vdupq_n_f32(0); + + float32x4_t mul0[2] = { zero, zero }; + float32x4_t mul1[2] = { zero, zero }; + for (int in = 0; in < num_inputs; in += 4) { + const float32x4_t v_input = vld1q_f32(&inputs[in]); + + for (int i = 0; i < 2; i++) { + const float32x4_t weight0 = vld1q_f32(&weights[in + 2 * i * num_inputs]); + mul0[i] = vmlaq_f32(mul0[i], weight0, v_input); + const float32x4_t weight1 = + vld1q_f32(&weights[in + (2 * i + 1) * num_inputs]); + mul1[i] = vmlaq_f32(mul1[i], weight1, v_input); + } + } + for (int i = 0; i < 2; i++) +#if AOM_ARCH_AARCH64 + mul0[i] = vpaddq_f32(mul0[i], mul1[i]); + const float32x4_t hh = vpaddq_f32(mul0[0], mul0[1]); +#else + mul0[i] = + vcombine_f32(vpadd_f32(vget_low_f32(mul0[i]), vget_high_f32(mul0[i])), + vpadd_f32(vget_low_f32(mul1[i]), vget_high_f32(mul1[i]))); + const float32x4_t hh = + vcombine_f32(vpadd_f32(vget_low_f32(mul0[0]), vget_high_f32(mul0[0])), + vpadd_f32(vget_low_f32(mul0[1]), vget_high_f32(mul0[1]))); +#endif + + outputs = vaddq_f32(outputs, hh); + if (!output_layer) nn_activate4(&outputs, &zero); + vst1q_f32(output_nodes, outputs); +} + +static void nn_propagate_4to8(const int num_inputs, const float *const inputs, + const float *const weights, + const float *layer_bias, + float *const output_nodes, bool output_layer) { + float32x4_t out_h = vld1q_f32(&layer_bias[4]); + float32x4_t out_l = vld1q_f32(layer_bias); + const float32x4_t zero = vdupq_n_f32(0); + float32x4_t mul0[4] = { zero, zero, zero, zero }; + float32x4_t mul1[4] = { zero, zero, zero, zero }; + + for (int in = 0; in < num_inputs; in += 4) { + const float32x4_t v_input = vld1q_f32(&inputs[in]); + for (int i = 0; i < 4; i++) { + const float32x4_t weight0 = vld1q_f32(&weights[in + 2 * i * num_inputs]); + const float32x4_t weight1 = + vld1q_f32(&weights[in + (2 * i + 1) * num_inputs]); + mul0[i] = vmlaq_f32(mul0[i], v_input, weight0); + mul1[i] = vmlaq_f32(mul1[i], v_input, weight1); + } + } + for 
(int i = 0; i < 4; i++) +#if AOM_ARCH_AARCH64 + mul0[i] = vpaddq_f32(mul0[i], mul1[i]); + const float32x4_t hh0 = vpaddq_f32(mul0[0], mul0[1]); + const float32x4_t hh1 = vpaddq_f32(mul0[2], mul0[3]); +#else + mul0[i] = + vcombine_f32(vpadd_f32(vget_low_f32(mul0[i]), vget_high_f32(mul0[i])), + vpadd_f32(vget_low_f32(mul1[i]), vget_high_f32(mul1[i]))); + const float32x4_t hh0 = + vcombine_f32(vpadd_f32(vget_low_f32(mul0[0]), vget_high_f32(mul0[0])), + vpadd_f32(vget_low_f32(mul0[1]), vget_high_f32(mul0[1]))); + const float32x4_t hh1 = + vcombine_f32(vpadd_f32(vget_low_f32(mul0[2]), vget_high_f32(mul0[2])), + vpadd_f32(vget_low_f32(mul0[3]), vget_high_f32(mul0[3]))); +#endif + + out_h = vaddq_f32(out_h, hh1); + out_l = vaddq_f32(out_l, hh0); + + if (!output_layer) nn_activate8(&out_h, &out_l, &zero); + vst1q_f32(&output_nodes[4], out_h); + vst1q_f32(output_nodes, out_l); +} + +static void nn_propagate_8to4(const int num_inputs, const float *const inputs, + const float *const weights, + const float *layer_bias, + float *const output_nodes, bool output_layer) { + float32x4_t outputs = vld1q_f32(layer_bias); + const float32x4_t zero = vdupq_n_f32(0); + float32x4_t add[4] = { zero, zero, zero, zero }; + for (int in = 0; in < num_inputs; in += 8) { + const float32x4_t inputs_l = vld1q_f32(&inputs[in]); + const float32x4_t inputs_h = vld1q_f32(&inputs[in + 4]); + + for (int i = 0; i < 4; i++) { + const float32x4_t weight_l = vld1q_f32(&weights[in + i * num_inputs]); + const float32x4_t weight_h = vld1q_f32(&weights[in + i * num_inputs + 4]); + add[i] = vmlaq_f32(add[i], inputs_l, weight_l); + add[i] = vmlaq_f32(add[i], inputs_h, weight_h); + } + } +#if AOM_ARCH_AARCH64 + const float32x4_t hadd_h = vpaddq_f32(add[2], add[3]); + const float32x4_t hadd_l = vpaddq_f32(add[0], add[1]); + const float32x4_t haddhadd = vpaddq_f32(hadd_l, hadd_h); +#else + const float32x4_t hadd_h = + vcombine_f32(vpadd_f32(vget_low_f32(add[2]), vget_high_f32(add[2])), + vpadd_f32(vget_low_f32(add[3]), vget_high_f32(add[3]))); + const float32x4_t hadd_l = + vcombine_f32(vpadd_f32(vget_low_f32(add[0]), vget_high_f32(add[0])), + vpadd_f32(vget_low_f32(add[1]), vget_high_f32(add[1]))); + const float32x4_t haddhadd = + vcombine_f32(vpadd_f32(vget_low_f32(hadd_l), vget_high_f32(hadd_l)), + vpadd_f32(vget_low_f32(hadd_h), vget_high_f32(hadd_h))); +#endif + + outputs = vaddq_f32(outputs, haddhadd); + if (!output_layer) nn_activate4(&outputs, &zero); + vst1q_f32(output_nodes, outputs); +} + +// Calculate prediction based on the given input features and neural net config. +// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden +// layer. +void av1_nn_predict_neon(const float *input_nodes, + const NN_CONFIG *const nn_config, int reduce_prec, + float *const output) { + float buf[2][NN_MAX_NODES_PER_LAYER]; + int buf_index = 0; + int num_inputs = nn_config->num_inputs; + // Hidden layers, except the final iteration is the output layer. + for (int layer = 0; layer <= nn_config->num_hidden_layers; layer++) { + const float *layer_weights = nn_config->weights[layer]; + const float *layer_bias = nn_config->bias[layer]; + bool output_layer = (layer == nn_config->num_hidden_layers); + float *const output_nodes = output_layer ? output : buf[buf_index]; + const int num_outputs = output_layer ? 
nn_config->num_outputs + : nn_config->num_hidden_nodes[layer]; + + if (num_inputs % 4 == 0 && num_outputs % 8 == 0) { + for (int out = 0; out < num_outputs; out += 8) { + nn_propagate_4to8(num_inputs, input_nodes, + &layer_weights[out * num_inputs], &layer_bias[out], + &output_nodes[out], output_layer); + } + } else if (num_inputs % 8 == 0 && num_outputs % 4 == 0) { + for (int out = 0; out < num_outputs; out += 4) { + nn_propagate_8to4(num_inputs, input_nodes, + &layer_weights[out * num_inputs], &layer_bias[out], + &output_nodes[out], output_layer); + } + } else if (num_inputs % 4 == 0 && num_outputs % 4 == 0) { + for (int out = 0; out < num_outputs; out += 4) { + nn_propagate_4to4(num_inputs, input_nodes, + &layer_weights[out * num_inputs], &layer_bias[out], + &output_nodes[out], output_layer); + } + } else if (num_inputs % 8 == 0) { + for (int out = 0; out < num_outputs; out++) { + nn_propagate_8to1(num_inputs, input_nodes, + &layer_weights[out * num_inputs], &layer_bias[out], + &output_nodes[out], output_layer); + } + } else if (num_inputs % 4 == 0) { + for (int out = 0; out < num_outputs; out++) { + nn_propagate_4to1(num_inputs, input_nodes, + &layer_weights[out * num_inputs], &layer_bias[out], + &output_nodes[out], output_layer); + } + } else if (num_inputs > 8) { + for (int out = 0; out < num_outputs; out++) { + nn_propagate_xto1(num_inputs, input_nodes, + &layer_weights[out * num_inputs], &layer_bias[out], + &output_nodes[out]); + } + } else if (num_inputs >= 4) { + for (int out = 0; out < num_outputs; out++) { + nn_propagate_xsto1(num_inputs, input_nodes, + &layer_weights[out * num_inputs], &layer_bias[out], + &output_nodes[out]); + } + } else { + for (int node = 0; node < num_outputs; ++node) { + float val = layer_bias[node]; + for (int i = 0; i < num_inputs; ++i) + val += layer_weights[node * num_inputs + i] * input_nodes[i]; + // ReLU as activation function. + val = val > 0.0f ? val : 0.0f; // Could use AOMMAX(). + output_nodes[node] = val; + } + } + input_nodes = output_nodes; + num_inputs = num_outputs; + buf_index = 1 - buf_index; + } + if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs); +} diff --git a/third_party/aom/av1/encoder/arm/neon/pickrst_neon.c b/third_party/aom/av1/encoder/arm/neon/pickrst_neon.c new file mode 100644 index 0000000000..2e4761f9a4 --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/pickrst_neon.c @@ -0,0 +1,1217 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/arm/sum_neon.h"
+#include "av1/common/restoration.h"
+#include "av1/encoder/arm/neon/pickrst_neon.h"
+#include "av1/encoder/pickrst.h"
+
+int64_t av1_lowbd_pixel_proj_error_neon(
+    const uint8_t *src, int width, int height, int src_stride,
+    const uint8_t *dat, int dat_stride, int32_t *flt0, int flt0_stride,
+    int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
+  int64_t sse = 0;
+  int64x2_t sse_s64 = vdupq_n_s64(0);
+
+  if (params->r[0] > 0 && params->r[1] > 0) {
+    int32x2_t xq_v = vld1_s32(xq);
+    int32x2_t xq_sum_v = vshl_n_s32(vpadd_s32(xq_v, xq_v), SGRPROJ_RST_BITS);
+
+    do {
+      int j = 0;
+      int32x4_t sse_s32 = vdupq_n_s32(0);
+
+      do {
+        const uint8x8_t d = vld1_u8(&dat[j]);
+        const uint8x8_t s = vld1_u8(&src[j]);
+        int32x4_t flt0_0 = vld1q_s32(&flt0[j]);
+        int32x4_t flt0_1 = vld1q_s32(&flt0[j + 4]);
+        int32x4_t flt1_0 = vld1q_s32(&flt1[j]);
+        int32x4_t flt1_1 = vld1q_s32(&flt1[j + 4]);
+
+        int32x4_t offset =
+            vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1));
+        int32x4_t v0 = vmlaq_lane_s32(offset, flt0_0, xq_v, 0);
+        int32x4_t v1 = vmlaq_lane_s32(offset, flt0_1, xq_v, 0);
+
+        v0 = vmlaq_lane_s32(v0, flt1_0, xq_v, 1);
+        v1 = vmlaq_lane_s32(v1, flt1_1, xq_v, 1);
+
+        int16x8_t d_s16 = vreinterpretq_s16_u16(vmovl_u8(d));
+        v0 = vmlsl_lane_s16(v0, vget_low_s16(d_s16),
+                            vreinterpret_s16_s32(xq_sum_v), 0);
+        v1 = vmlsl_lane_s16(v1, vget_high_s16(d_s16),
+                            vreinterpret_s16_s32(xq_sum_v), 0);
+
+        int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
+        int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
+
+        int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(d, s));
+        int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1), diff);
+        int16x4_t e_lo = vget_low_s16(e);
+        int16x4_t e_hi = vget_high_s16(e);
+
+        sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo);
+        sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi);
+
+        j += 8;
+      } while (j <= width - 8);
+
+      for (int k = j; k < width; ++k) {
+        int32_t u = (dat[k] << SGRPROJ_RST_BITS);
+        int32_t v = (1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)) +
+                    xq[0] * flt0[k] + xq[1] * flt1[k] - u * (xq[0] + xq[1]);
+        int32_t e =
+            (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + dat[k] - src[k];
+        sse += e * e;
+      }
+
+      sse_s64 = vpadalq_s32(sse_s64, sse_s32);
+
+      dat += dat_stride;
+      src += src_stride;
+      flt0 += flt0_stride;
+      flt1 += flt1_stride;
+    } while (--height != 0);
+  } else if (params->r[0] > 0 || params->r[1] > 0) {
+    int xq_active = (params->r[0] > 0) ? xq[0] : xq[1];
+    int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
+    int flt_stride = (params->r[0] > 0) ?
flt0_stride : flt1_stride; + int32x2_t xq_v = vdup_n_s32(xq_active); + + do { + int32x4_t sse_s32 = vdupq_n_s32(0); + int j = 0; + + do { + const uint8x8_t d = vld1_u8(&dat[j]); + const uint8x8_t s = vld1_u8(&src[j]); + int32x4_t flt_0 = vld1q_s32(&flt[j]); + int32x4_t flt_1 = vld1q_s32(&flt[j + 4]); + int16x8_t d_s16 = + vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS)); + + int32x4_t sub_0 = vsubw_s16(flt_0, vget_low_s16(d_s16)); + int32x4_t sub_1 = vsubw_s16(flt_1, vget_high_s16(d_s16)); + + int32x4_t offset = + vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)); + int32x4_t v0 = vmlaq_lane_s32(offset, sub_0, xq_v, 0); + int32x4_t v1 = vmlaq_lane_s32(offset, sub_1, xq_v, 0); + + int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS); + int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS); + + int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(d, s)); + int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1), diff); + int16x4_t e_lo = vget_low_s16(e); + int16x4_t e_hi = vget_high_s16(e); + + sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo); + sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi); + + j += 8; + } while (j <= width - 8); + + for (int k = j; k < width; ++k) { + int32_t u = dat[k] << SGRPROJ_RST_BITS; + int32_t v = xq_active * (flt[k] - u); + int32_t e = ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) + + dat[k] - src[k]; + sse += e * e; + } + + sse_s64 = vpadalq_s32(sse_s64, sse_s32); + + dat += dat_stride; + src += src_stride; + flt += flt_stride; + } while (--height != 0); + } else { + uint32x4_t sse_s32 = vdupq_n_u32(0); + + do { + int j = 0; + + do { + const uint8x16_t d = vld1q_u8(&dat[j]); + const uint8x16_t s = vld1q_u8(&src[j]); + + uint8x16_t diff = vabdq_u8(d, s); + uint8x8_t diff_lo = vget_low_u8(diff); + uint8x8_t diff_hi = vget_high_u8(diff); + + sse_s32 = vpadalq_u16(sse_s32, vmull_u8(diff_lo, diff_lo)); + sse_s32 = vpadalq_u16(sse_s32, vmull_u8(diff_hi, diff_hi)); + + j += 16; + } while (j <= width - 16); + + for (int k = j; k < width; ++k) { + int32_t e = dat[k] - src[k]; + sse += e * e; + } + + dat += dat_stride; + src += src_stride; + } while (--height != 0); + + sse_s64 = vreinterpretq_s64_u64(vpaddlq_u32(sse_s32)); + } + + sse += horizontal_add_s64x2(sse_s64); + return sse; +} + +// We can accumulate up to 65536 8-bit multiplication results in 32-bit. We are +// processing 2 pixels at a time, so the accumulator max can be as high as 32768 +// for the compute stats. +#define STAT_ACCUMULATOR_MAX 32768 + +static INLINE uint8x8_t tbl2(uint8x16_t a, uint8x16_t b, uint8x8_t idx) { +#if AOM_ARCH_AARCH64 + uint8x16x2_t table = { { a, b } }; + return vqtbl2_u8(table, idx); +#else + uint8x8x4_t table = { { vget_low_u8(a), vget_high_u8(a), vget_low_u8(b), + vget_high_u8(b) } }; + return vtbl4_u8(table, idx); +#endif +} + +static INLINE uint8x16_t tbl2q(uint8x16_t a, uint8x16_t b, uint8x16_t idx) { +#if AOM_ARCH_AARCH64 + uint8x16x2_t table = { { a, b } }; + return vqtbl2q_u8(table, idx); +#else + uint8x8x4_t table = { { vget_low_u8(a), vget_high_u8(a), vget_low_u8(b), + vget_high_u8(b) } }; + return vcombine_u8(vtbl4_u8(table, vget_low_u8(idx)), + vtbl4_u8(table, vget_high_u8(idx))); +#endif +} + +// The M matrix is accumulated in STAT_ACCUMULATOR_MAX steps to speed-up the +// computation. This function computes the final M from the accumulated +// (src_s64) and the residual parts (src_s32). It also transposes the result as +// the output needs to be column-major. 
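The split between the 64-bit (`src_s64`) and 32-bit (`src_s32`) accumulators described above is a periodic-flush pattern. A minimal scalar sketch of the idea, with hypothetical names (`accumulate_with_flush`, `period`, `sample`) that are not part of the patch:

    #include <stdint.h>

    // Keep a cheap 32-bit running sum and fold it into a 64-bit base every
    // `period` steps so the narrow part can never overflow. Returning
    // base + residual mirrors the (src_s64[i] + src_s32[i]) combination done
    // per element by acc_transpose_M() below.
    static int64_t accumulate_with_flush(const int16_t *sample,
                                         int num_samples, int period) {
      int64_t base = 0;
      int32_t residual = 0;
      int countdown = period;  // e.g. STAT_ACCUMULATOR_MAX
      for (int n = 0; n < num_samples; ++n) {
        residual += sample[n];
        if (--countdown == 0) {  // flush before 32 bits can overflow
          base += residual;
          residual = 0;
          countdown = period;
        }
      }
      return base + residual;  // fold in the remaining residual part
    }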
+static INLINE void acc_transpose_M(int64_t *dst, const int64_t *src_s64, + const int32_t *src_s32, const int wiener_win, + int scale) { + for (int i = 0; i < wiener_win; ++i) { + for (int j = 0; j < wiener_win; ++j) { + int tr_idx = j * wiener_win + i; + *dst++ += (int64_t)(src_s64[tr_idx] + src_s32[tr_idx]) * scale; + } + } +} + +// The resulting H is a column-major matrix accumulated from the transposed +// (column-major) samples of the filter kernel (5x5 or 7x7) viewed as a single +// vector. For the 7x7 filter case: H(49x49) = [49 x 1] x [1 x 49]. This +// function transforms back to the originally expected format (double +// transpose). The H matrix is accumulated in STAT_ACCUMULATOR_MAX steps to +// speed-up the computation. This function computes the final H from the +// accumulated (src_s64) and the residual parts (src_s32). The computed H is +// only an upper triangle matrix, this function also fills the lower triangle of +// the resulting matrix. +static void update_H(int64_t *dst, const int64_t *src_s64, + const int32_t *src_s32, const int wiener_win, int stride, + int scale) { + // For a simplified theoretical 3x3 case where `wiener_win` is 3 and + // `wiener_win2` is 9, the M matrix is 3x3: + // 0, 3, 6 + // 1, 4, 7 + // 2, 5, 8 + // + // This is viewed as a vector to compute H (9x9) by vector outer product: + // 0, 3, 6, 1, 4, 7, 2, 5, 8 + // + // Double transpose and upper triangle remapping for 3x3 -> 9x9 case: + // 0, 3, 6, 1, 4, 7, 2, 5, 8, + // 3, 30, 33, 12, 31, 34, 21, 32, 35, + // 6, 33, 60, 15, 42, 61, 24, 51, 62, + // 1, 12, 15, 10, 13, 16, 11, 14, 17, + // 4, 31, 42, 13, 40, 43, 22, 41, 44, + // 7, 34, 61, 16, 43, 70, 25, 52, 71, + // 2, 21, 24, 11, 22, 25, 20, 23, 26, + // 5, 32, 51, 14, 41, 52, 23, 50, 53, + // 8, 35, 62, 17, 44, 71, 26, 53, 80, + const int wiener_win2 = wiener_win * wiener_win; + + // Loop through the indices according to the remapping above, along the + // columns: + // 0, wiener_win, 2 * wiener_win, ..., 1, 1 + 2 * wiener_win, ..., + // wiener_win - 1, wiener_win - 1 + wiener_win, ... + // For the 3x3 case `j` will be: 0, 3, 6, 1, 4, 7, 2, 5, 8. + for (int i = 0; i < wiener_win; ++i) { + for (int j = i; j < wiener_win2; j += wiener_win) { + // These two inner loops are the same as the two outer loops, but running + // along rows instead of columns. For the 3x3 case `l` will be: + // 0, 3, 6, 1, 4, 7, 2, 5, 8. + for (int k = 0; k < wiener_win; ++k) { + for (int l = k; l < wiener_win2; l += wiener_win) { + // The nominal double transpose indexing would be: + // int idx = stride * j + l; + // However we need the upper-triangle indices, it is easy with some + // min/max operations. + int tr_idx = stride * AOMMIN(j, l) + AOMMAX(j, l); + + // Resulting matrix is filled by combining the 64-bit and the residual + // 32-bit matrices together with scaling. + *dst++ += (int64_t)(src_s64[tr_idx] + src_s32[tr_idx]) * scale; + } + } + } + } +} + +// Load 7x7 matrix into 3 and a half 128-bit vectors from consecutive rows, the +// last load address is offset to prevent out-of-bounds access. 
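To make the upper-triangle remapping in `update_H()` above concrete: only entries with row <= col are stored, so each (j, l) pair is reordered through min/max before indexing. A small illustrative helper (the name `tr_idx_ref` is hypothetical, not part of the patch):

    #include <assert.h>

    // Reference for the remap used in update_H(): reorder (j, l) so the read
    // always lands in the stored upper triangle. By construction,
    // tr_idx_ref(s, j, l) == tr_idx_ref(s, l, j), matching H's symmetry.
    static int tr_idx_ref(int stride, int j, int l) {
      const int row = j < l ? j : l;  // AOMMIN(j, l)
      const int col = j < l ? l : j;  // AOMMAX(j, l)
      assert(row <= col);
      return stride * row + col;
    }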
+static INLINE void load_and_pack_u8_8x7(uint8x16_t dst[4], const uint8_t *src, + ptrdiff_t stride) { + dst[0] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride)); + src += 2 * stride; + dst[1] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride)); + src += 2 * stride; + dst[2] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride)); + src += 2 * stride; + dst[3] = vcombine_u8(vld1_u8(src - 1), vdup_n_u8(0)); +} + +static INLINE void compute_stats_win7_neon(const uint8_t *dgd, + const uint8_t *src, int width, + int height, int dgd_stride, + int src_stride, int avg, int64_t *M, + int64_t *H, int downsample_factor) { + // Matrix names are capitalized to help readability. + DECLARE_ALIGNED(64, int16_t, DGD_AVG0[WIENER_WIN2_ALIGN3]); + DECLARE_ALIGNED(64, int16_t, DGD_AVG1[WIENER_WIN2_ALIGN3]); + DECLARE_ALIGNED(64, int32_t, M_s32[WIENER_WIN2_ALIGN3]); + DECLARE_ALIGNED(64, int64_t, M_s64[WIENER_WIN2_ALIGN3]); + DECLARE_ALIGNED(64, int32_t, H_s32[WIENER_WIN2 * WIENER_WIN2_ALIGN2]); + DECLARE_ALIGNED(64, int64_t, H_s64[WIENER_WIN2 * WIENER_WIN2_ALIGN2]); + + memset(M_s32, 0, sizeof(M_s32)); + memset(M_s64, 0, sizeof(M_s64)); + memset(H_s32, 0, sizeof(H_s32)); + memset(H_s64, 0, sizeof(H_s64)); + + // Look-up tables to create 8x6 matrix with consecutive elements from two 7x7 + // matrices. + // clang-format off + DECLARE_ALIGNED(16, static const uint8_t, shuffle_stats7[96]) = { + 0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 16, 17, + 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, + 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 17, 18, 19, 20, 21, 22, + 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 18, + 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20, + 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22, 23, + }; + // clang-format on + + const uint8x16_t lut0 = vld1q_u8(shuffle_stats7 + 0); + const uint8x16_t lut1 = vld1q_u8(shuffle_stats7 + 16); + const uint8x16_t lut2 = vld1q_u8(shuffle_stats7 + 32); + const uint8x16_t lut3 = vld1q_u8(shuffle_stats7 + 48); + const uint8x16_t lut4 = vld1q_u8(shuffle_stats7 + 64); + const uint8x16_t lut5 = vld1q_u8(shuffle_stats7 + 80); + + int acc_cnt = STAT_ACCUMULATOR_MAX; + const int src_next = downsample_factor * src_stride - width; + const int dgd_next = downsample_factor * dgd_stride - width; + const uint8x8_t avg_u8 = vdup_n_u8(avg); + + do { + int j = width; + while (j >= 2) { + // Load two adjacent, overlapping 7x7 matrices: a 8x7 matrix with the + // middle 6x7 elements being shared. + uint8x16_t dgd_rows[4]; + load_and_pack_u8_8x7(dgd_rows, dgd, dgd_stride); + + const uint8_t *dgd_ptr = dgd + dgd_stride * 6; + dgd += 2; + + // Re-arrange (and widen) the combined 8x7 matrix to have the 2 whole 7x7 + // matrices (1 for each of the 2 pixels) separated into distinct + // int16x8_t[6] arrays. These arrays contain 48 elements of the 49 (7x7). + // Compute `dgd - avg` for both buffers. Each DGD_AVG buffer contains 49 + // consecutive elements. 
+ int16x8_t dgd_avg0[6]; + int16x8_t dgd_avg1[6]; + uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0); + uint8x16_t dgd_shuf3 = tbl2q(dgd_rows[0], dgd_rows[1], lut3); + + dgd_avg0[0] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8)); + dgd_avg0[1] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8)); + dgd_avg1[0] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf3), avg_u8)); + dgd_avg1[1] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf3), avg_u8)); + + vst1q_s16(DGD_AVG0, dgd_avg0[0]); + vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]); + vst1q_s16(DGD_AVG1, dgd_avg1[0]); + vst1q_s16(DGD_AVG1 + 8, dgd_avg1[1]); + + uint8x16_t dgd_shuf1 = tbl2q(dgd_rows[1], dgd_rows[2], lut1); + uint8x16_t dgd_shuf4 = tbl2q(dgd_rows[1], dgd_rows[2], lut4); + + dgd_avg0[2] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf1), avg_u8)); + dgd_avg0[3] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf1), avg_u8)); + dgd_avg1[2] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf4), avg_u8)); + dgd_avg1[3] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf4), avg_u8)); + + vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]); + vst1q_s16(DGD_AVG0 + 24, dgd_avg0[3]); + vst1q_s16(DGD_AVG1 + 16, dgd_avg1[2]); + vst1q_s16(DGD_AVG1 + 24, dgd_avg1[3]); + + uint8x16_t dgd_shuf2 = tbl2q(dgd_rows[2], dgd_rows[3], lut2); + uint8x16_t dgd_shuf5 = tbl2q(dgd_rows[2], dgd_rows[3], lut5); + + dgd_avg0[4] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf2), avg_u8)); + dgd_avg0[5] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf2), avg_u8)); + dgd_avg1[4] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf5), avg_u8)); + dgd_avg1[5] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf5), avg_u8)); + + vst1q_s16(DGD_AVG0 + 32, dgd_avg0[4]); + vst1q_s16(DGD_AVG0 + 40, dgd_avg0[5]); + vst1q_s16(DGD_AVG1 + 32, dgd_avg1[4]); + vst1q_s16(DGD_AVG1 + 40, dgd_avg1[5]); + + // The remaining last (49th) elements of `dgd - avg`. + DGD_AVG0[48] = dgd_ptr[6] - avg; + DGD_AVG1[48] = dgd_ptr[7] - avg; + + // Accumulate into row-major variant of matrix M (cross-correlation) for 2 + // output pixels at a time. M is of size 7 * 7. It needs to be filled such + // that multiplying one element from src with each element of a row of the + // wiener window will fill one column of M. However this is not very + // convenient in terms of memory access, as it means we do contiguous + // loads of dgd but strided stores to M. As a result, we use an + // intermediate matrix M_s32 which is instead filled such that one row of + // the wiener window gives one row of M_s32. Once fully computed, M_s32 is + // then transposed to return M. + int src_avg0 = *src++ - avg; + int src_avg1 = *src++ - avg; + int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0); + int16x4_t src_avg1_s16 = vdup_n_s16(src_avg1); + update_M_2pixels(M_s32 + 0, src_avg0_s16, src_avg1_s16, dgd_avg0[0], + dgd_avg1[0]); + update_M_2pixels(M_s32 + 8, src_avg0_s16, src_avg1_s16, dgd_avg0[1], + dgd_avg1[1]); + update_M_2pixels(M_s32 + 16, src_avg0_s16, src_avg1_s16, dgd_avg0[2], + dgd_avg1[2]); + update_M_2pixels(M_s32 + 24, src_avg0_s16, src_avg1_s16, dgd_avg0[3], + dgd_avg1[3]); + update_M_2pixels(M_s32 + 32, src_avg0_s16, src_avg1_s16, dgd_avg0[4], + dgd_avg1[4]); + update_M_2pixels(M_s32 + 40, src_avg0_s16, src_avg1_s16, dgd_avg0[5], + dgd_avg1[5]); + + // Last (49th) element of M_s32 can be computed as scalar more efficiently + // for 2 output pixels. 
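+      // The six vector updates above cover taps 0..47 of M_s32; the 49th tap
+      // does not fill a full eight-lane group, hence the scalar handling.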
+ M_s32[48] += DGD_AVG0[48] * src_avg0 + DGD_AVG1[48] * src_avg1; + + // Start accumulating into row-major version of matrix H + // (auto-covariance), it expects the DGD_AVG[01] matrices to also be + // row-major. H is of size 49 * 49. It is filled by multiplying every pair + // of elements of the wiener window together (vector outer product). Since + // it is a symmetric matrix, we only compute the upper-right triangle, and + // then copy it down to the lower-left later. The upper triangle is + // covered by 4x4 tiles. The original algorithm assumes the M matrix is + // column-major and the resulting H matrix is also expected to be + // column-major. It is not efficient to work with column-major matrices, + // so we accumulate into a row-major matrix H_s32. At the end of the + // algorithm a double transpose transformation will convert H_s32 back to + // the expected output layout. + update_H_7x7_2pixels(H_s32, DGD_AVG0, DGD_AVG1); + + // The last element of the triangle of H_s32 matrix can be computed as a + // scalar more efficiently. + H_s32[48 * WIENER_WIN2_ALIGN2 + 48] += + DGD_AVG0[48] * DGD_AVG0[48] + DGD_AVG1[48] * DGD_AVG1[48]; + + // Accumulate into 64-bit after STAT_ACCUMULATOR_MAX iterations to prevent + // overflow. + if (--acc_cnt == 0) { + acc_cnt = STAT_ACCUMULATOR_MAX; + + accumulate_and_clear(M_s64, M_s32, WIENER_WIN2_ALIGN2); + + // The widening accumulation is only needed for the upper triangle part + // of the matrix. + int64_t *lh = H_s64; + int32_t *lh32 = H_s32; + for (int k = 0; k < WIENER_WIN2; ++k) { + // The widening accumulation is only run for the relevant parts + // (upper-right triangle) in a row 4-element aligned. + int k4 = k / 4 * 4; + accumulate_and_clear(lh + k4, lh32 + k4, 48 - k4); + + // Last element of the row is computed separately. + lh[48] += lh32[48]; + lh32[48] = 0; + + lh += WIENER_WIN2_ALIGN2; + lh32 += WIENER_WIN2_ALIGN2; + } + } + + j -= 2; + } + + // Computations for odd pixel in the row. + if (width & 1) { + // Load two adjacent, overlapping 7x7 matrices: a 8x7 matrix with the + // middle 6x7 elements being shared. + uint8x16_t dgd_rows[4]; + load_and_pack_u8_8x7(dgd_rows, dgd, dgd_stride); + + const uint8_t *dgd_ptr = dgd + dgd_stride * 6; + ++dgd; + + // Re-arrange (and widen) the combined 8x7 matrix to have a whole 7x7 + // matrix tightly packed into a int16x8_t[6] array. This array contains + // 48 elements of the 49 (7x7). Compute `dgd - avg` for the whole buffer. + // The DGD_AVG buffer contains 49 consecutive elements. + int16x8_t dgd_avg0[6]; + uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0); + dgd_avg0[0] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8)); + dgd_avg0[1] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8)); + vst1q_s16(DGD_AVG0, dgd_avg0[0]); + vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]); + + uint8x16_t dgd_shuf1 = tbl2q(dgd_rows[1], dgd_rows[2], lut1); + dgd_avg0[2] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf1), avg_u8)); + dgd_avg0[3] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf1), avg_u8)); + vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]); + vst1q_s16(DGD_AVG0 + 24, dgd_avg0[3]); + + uint8x16_t dgd_shuf2 = tbl2q(dgd_rows[2], dgd_rows[3], lut2); + dgd_avg0[4] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf2), avg_u8)); + dgd_avg0[5] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf2), avg_u8)); + vst1q_s16(DGD_AVG0 + 32, dgd_avg0[4]); + vst1q_s16(DGD_AVG0 + 40, dgd_avg0[5]); + + // The remaining last (49th) element of `dgd - avg`. 
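+      // dgd_ptr was set to the last (7th) row of the window above, so index 6
+      // is the bottom-right pixel of this 7x7 window.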
+      DGD_AVG0[48] = dgd_ptr[6] - avg;
+
+      // Accumulate into row-major order variant of matrix M (cross-correlation)
+      // for 1 output pixel at a time. M is of size 7 * 7. It needs to be filled
+      // such that multiplying one element from src with each element of a row
+      // of the wiener window will fill one column of M. However this is not
+      // very convenient in terms of memory access, as it means we do
+      // contiguous loads of dgd but strided stores to M. As a result, we use an
+      // intermediate matrix M_s32 which is instead filled such that one row of
+      // the wiener window gives one row of M_s32. Once fully computed, M_s32 is
+      // then transposed to return M.
+      int src_avg0 = *src++ - avg;
+      int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
+      update_M_1pixel(M_s32 + 0, src_avg0_s16, dgd_avg0[0]);
+      update_M_1pixel(M_s32 + 8, src_avg0_s16, dgd_avg0[1]);
+      update_M_1pixel(M_s32 + 16, src_avg0_s16, dgd_avg0[2]);
+      update_M_1pixel(M_s32 + 24, src_avg0_s16, dgd_avg0[3]);
+      update_M_1pixel(M_s32 + 32, src_avg0_s16, dgd_avg0[4]);
+      update_M_1pixel(M_s32 + 40, src_avg0_s16, dgd_avg0[5]);
+
+      // Last (49th) element of M_s32 can be computed as scalar more efficiently
+      // for 1 output pixel.
+      M_s32[48] += DGD_AVG0[48] * src_avg0;
+
+      // Start accumulating into row-major order version of matrix H
+      // (auto-covariance), it expects the DGD_AVG0 matrix to also be row-major.
+      // H is of size 49 * 49. It is filled by multiplying every pair of
+      // elements of the wiener window together (vector outer product). Since it
+      // is a symmetric matrix, we only compute the upper-right triangle, and
+      // then copy it down to the lower-left later. The upper triangle is
+      // covered by 4x4 tiles. The original algorithm assumes the M matrix is
+      // column-major and the resulting H matrix is also expected to be
+      // column-major. It is not efficient to work with column-major matrices,
+      // so we accumulate into a row-major matrix H_s32. At the end of the
+      // algorithm a double transpose transformation will convert H_s32 back to
+      // the expected output layout.
+      update_H_1pixel(H_s32, DGD_AVG0, WIENER_WIN2_ALIGN2, 48);
+
+      // The last element of the triangle of H_s32 matrix can be computed as
+      // scalar more efficiently.
+      H_s32[48 * WIENER_WIN2_ALIGN2 + 48] += DGD_AVG0[48] * DGD_AVG0[48];
+    }
+
+    src += src_next;
+    dgd += dgd_next;
+  } while (--height != 0);
+
+  acc_transpose_M(M, M_s64, M_s32, WIENER_WIN, downsample_factor);
+
+  update_H(H, H_s64, H_s32, WIENER_WIN, WIENER_WIN2_ALIGN2, downsample_factor);
+}
+
+// Load 5x5 matrix into 2 and a half 128-bit vectors from consecutive rows, the
+// last load address is offset to prevent out-of-bounds access.
+static INLINE void load_and_pack_u8_6x5(uint8x16_t dst[3], const uint8_t *src,
+                                        ptrdiff_t stride) {
+  dst[0] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride));
+  src += 2 * stride;
+  dst[1] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride));
+  src += 2 * stride;
+  dst[2] = vcombine_u8(vld1_u8(src - 3), vdup_n_u8(0));
+}
+
+static INLINE void compute_stats_win5_neon(const uint8_t *dgd,
+                                           const uint8_t *src, int width,
+                                           int height, int dgd_stride,
+                                           int src_stride, int avg, int64_t *M,
+                                           int64_t *H, int downsample_factor) {
+  // Matrix names are capitalized to help readability.
+ DECLARE_ALIGNED(64, int16_t, DGD_AVG0[WIENER_WIN2_REDUCED_ALIGN3]); + DECLARE_ALIGNED(64, int16_t, DGD_AVG1[WIENER_WIN2_REDUCED_ALIGN3]); + DECLARE_ALIGNED(64, int32_t, M_s32[WIENER_WIN2_REDUCED_ALIGN3]); + DECLARE_ALIGNED(64, int64_t, M_s64[WIENER_WIN2_REDUCED_ALIGN3]); + DECLARE_ALIGNED(64, int32_t, + H_s32[WIENER_WIN2_REDUCED * WIENER_WIN2_REDUCED_ALIGN2]); + DECLARE_ALIGNED(64, int64_t, + H_s64[WIENER_WIN2_REDUCED * WIENER_WIN2_REDUCED_ALIGN2]); + + memset(M_s32, 0, sizeof(M_s32)); + memset(M_s64, 0, sizeof(M_s64)); + memset(H_s32, 0, sizeof(H_s32)); + memset(H_s64, 0, sizeof(H_s64)); + + // Look-up tables to create 8x3 matrix with consecutive elements from two 5x5 + // matrices. + // clang-format off + DECLARE_ALIGNED(16, static const uint8_t, shuffle_stats5[48]) = { + 0, 1, 2, 3, 4, 8, 9, 10, 11, 12, 16, 17, 18, 19, 20, 24, + 1, 2, 3, 4, 5, 9, 10, 11, 12, 13, 17, 18, 19, 20, 21, 25, + 9, 10, 11, 12, 19, 20, 21, 22, 10, 11, 12, 13, 20, 21, 22, 23, + }; + // clang-format on + + const uint8x16_t lut0 = vld1q_u8(shuffle_stats5 + 0); + const uint8x16_t lut1 = vld1q_u8(shuffle_stats5 + 16); + const uint8x16_t lut2 = vld1q_u8(shuffle_stats5 + 32); + + int acc_cnt = STAT_ACCUMULATOR_MAX; + const int src_next = downsample_factor * src_stride - width; + const int dgd_next = downsample_factor * dgd_stride - width; + const uint8x8_t avg_u8 = vdup_n_u8(avg); + + do { + int j = width; + while (j >= 2) { + // Load two adjacent, overlapping 5x5 matrices: a 6x5 matrix with the + // middle 4x5 elements being shared. + uint8x16_t dgd_rows[3]; + load_and_pack_u8_6x5(dgd_rows, dgd, dgd_stride); + + const uint8_t *dgd_ptr = dgd + dgd_stride * 4; + dgd += 2; + + // Re-arrange (and widen) the combined 6x5 matrix to have the 2 whole 5x5 + // matrices (1 for each of the 2 pixels) separated into distinct + // int16x8_t[3] arrays. These arrays contain 24 elements of the 25 (5x5). + // Compute `dgd - avg` for both buffers. Each DGD_AVG buffer contains 25 + // consecutive elements. + int16x8_t dgd_avg0[3]; + int16x8_t dgd_avg1[3]; + uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0); + uint8x16_t dgd_shuf1 = tbl2q(dgd_rows[0], dgd_rows[1], lut1); + uint8x16_t dgd_shuf2 = tbl2q(dgd_rows[1], dgd_rows[2], lut2); + + dgd_avg0[0] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8)); + dgd_avg0[1] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8)); + dgd_avg0[2] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf2), avg_u8)); + dgd_avg1[0] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf1), avg_u8)); + dgd_avg1[1] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf1), avg_u8)); + dgd_avg1[2] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf2), avg_u8)); + + vst1q_s16(DGD_AVG0 + 0, dgd_avg0[0]); + vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]); + vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]); + vst1q_s16(DGD_AVG1 + 0, dgd_avg1[0]); + vst1q_s16(DGD_AVG1 + 8, dgd_avg1[1]); + vst1q_s16(DGD_AVG1 + 16, dgd_avg1[2]); + + // The remaining last (25th) elements of `dgd - avg`. + DGD_AVG0[24] = dgd_ptr[4] - avg; + DGD_AVG1[24] = dgd_ptr[5] - avg; + + // Accumulate into row-major variant of matrix M (cross-correlation) for 2 + // output pixels at a time. M is of size 5 * 5. It needs to be filled such + // that multiplying one element from src with each element of a row of the + // wiener window will fill one column of M. However this is not very + // convenient in terms of memory access, as it means we do contiguous + // loads of dgd but strided stores to M. 
As a result, we use an + // intermediate matrix M_s32 which is instead filled such that one row of + // the wiener window gives one row of M_s32. Once fully computed, M_s32 is + // then transposed to return M. + int src_avg0 = *src++ - avg; + int src_avg1 = *src++ - avg; + int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0); + int16x4_t src_avg1_s16 = vdup_n_s16(src_avg1); + update_M_2pixels(M_s32 + 0, src_avg0_s16, src_avg1_s16, dgd_avg0[0], + dgd_avg1[0]); + update_M_2pixels(M_s32 + 8, src_avg0_s16, src_avg1_s16, dgd_avg0[1], + dgd_avg1[1]); + update_M_2pixels(M_s32 + 16, src_avg0_s16, src_avg1_s16, dgd_avg0[2], + dgd_avg1[2]); + + // Last (25th) element of M_s32 can be computed as scalar more efficiently + // for 2 output pixels. + M_s32[24] += DGD_AVG0[24] * src_avg0 + DGD_AVG1[24] * src_avg1; + + // Start accumulating into row-major version of matrix H + // (auto-covariance), it expects the DGD_AVG[01] matrices to also be + // row-major. H is of size 25 * 25. It is filled by multiplying every pair + // of elements of the wiener window together (vector outer product). Since + // it is a symmetric matrix, we only compute the upper-right triangle, and + // then copy it down to the lower-left later. The upper triangle is + // covered by 4x4 tiles. The original algorithm assumes the M matrix is + // column-major and the resulting H matrix is also expected to be + // column-major. It is not efficient to work with column-major matrices, + // so we accumulate into a row-major matrix H_s32. At the end of the + // algorithm a double transpose transformation will convert H_s32 back to + // the expected output layout. + update_H_5x5_2pixels(H_s32, DGD_AVG0, DGD_AVG1); + + // The last element of the triangle of H_s32 matrix can be computed as a + // scalar more efficiently. + H_s32[24 * WIENER_WIN2_REDUCED_ALIGN2 + 24] += + DGD_AVG0[24] * DGD_AVG0[24] + DGD_AVG1[24] * DGD_AVG1[24]; + + // Accumulate into 64-bit after STAT_ACCUMULATOR_MAX iterations to prevent + // overflow. + if (--acc_cnt == 0) { + acc_cnt = STAT_ACCUMULATOR_MAX; + + accumulate_and_clear(M_s64, M_s32, WIENER_WIN2_REDUCED_ALIGN2); + + // The widening accumulation is only needed for the upper triangle part + // of the matrix. + int64_t *lh = H_s64; + int32_t *lh32 = H_s32; + for (int k = 0; k < WIENER_WIN2_REDUCED; ++k) { + // The widening accumulation is only run for the relevant parts + // (upper-right triangle) in a row 4-element aligned. + int k4 = k / 4 * 4; + accumulate_and_clear(lh + k4, lh32 + k4, 24 - k4); + + // Last element of the row is computed separately. + lh[24] += lh32[24]; + lh32[24] = 0; + + lh += WIENER_WIN2_REDUCED_ALIGN2; + lh32 += WIENER_WIN2_REDUCED_ALIGN2; + } + } + + j -= 2; + } + + // Computations for odd pixel in the row. + if (width & 1) { + // Load two adjacent, overlapping 5x5 matrices: a 6x5 matrix with the + // middle 4x5 elements being shared. + uint8x16_t dgd_rows[3]; + load_and_pack_u8_6x5(dgd_rows, dgd, dgd_stride); + + const uint8_t *dgd_ptr = dgd + dgd_stride * 4; + ++dgd; + + // Re-arrange (and widen) the combined 6x5 matrix to have a whole 5x5 + // matrix tightly packed into a int16x8_t[3] array. This array contains + // 24 elements of the 25 (5x5). Compute `dgd - avg` for the whole buffer. + // The DGD_AVG buffer contains 25 consecutive elements. 
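+      // Only the low half of lut2 is needed in this single-pixel path: one
+      // 5x5 window packs into 24 int16 elements, so the third shuffle only
+      // has to produce eight bytes (hence tbl2() rather than tbl2q()).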
+      int16x8_t dgd_avg0[3];
+      uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0);
+      uint8x8_t dgd_shuf1 = tbl2(dgd_rows[1], dgd_rows[2], vget_low_u8(lut2));
+
+      dgd_avg0[0] =
+          vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8));
+      dgd_avg0[1] =
+          vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8));
+      dgd_avg0[2] = vreinterpretq_s16_u16(vsubl_u8(dgd_shuf1, avg_u8));
+
+      vst1q_s16(DGD_AVG0 + 0, dgd_avg0[0]);
+      vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]);
+      vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]);
+
+      // The remaining last (25th) element of `dgd - avg`.
+      DGD_AVG0[24] = dgd_ptr[4] - avg;
+
+      // Accumulate into row-major order variant of matrix M (cross-correlation)
+      // for 1 output pixel at a time. M is of size 5 * 5. It needs to be filled
+      // such that multiplying one element from src with each element of a row
+      // of the wiener window will fill one column of M. However this is not
+      // very convenient in terms of memory access, as it means we do
+      // contiguous loads of dgd but strided stores to M. As a result, we use an
+      // intermediate matrix M_s32 which is instead filled such that one row of
+      // the wiener window gives one row of M_s32. Once fully computed, M_s32 is
+      // then transposed to return M.
+      int src_avg0 = *src++ - avg;
+      int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
+      update_M_1pixel(M_s32 + 0, src_avg0_s16, dgd_avg0[0]);
+      update_M_1pixel(M_s32 + 8, src_avg0_s16, dgd_avg0[1]);
+      update_M_1pixel(M_s32 + 16, src_avg0_s16, dgd_avg0[2]);
+
+      // Last (25th) element of M_s32 can be computed as scalar more efficiently
+      // for 1 output pixel.
+      M_s32[24] += DGD_AVG0[24] * src_avg0;
+
+      // Start accumulating into row-major order version of matrix H
+      // (auto-covariance), it expects the DGD_AVG0 matrix to also be row-major.
+      // H is of size 25 * 25. It is filled by multiplying every pair of
+      // elements of the wiener window together (vector outer product). Since it
+      // is a symmetric matrix, we only compute the upper-right triangle, and
+      // then copy it down to the lower-left later. The upper triangle is
+      // covered by 4x4 tiles. The original algorithm assumes the M matrix is
+      // column-major and the resulting H matrix is also expected to be
+      // column-major. It is not efficient to work with column-major matrices,
+      // so we accumulate into a row-major matrix H_s32. At the end of the
+      // algorithm a double transpose transformation will convert H_s32 back to
+      // the expected output layout.
+      update_H_1pixel(H_s32, DGD_AVG0, WIENER_WIN2_REDUCED_ALIGN2, 24);
+
+      // The last element of the triangle of H_s32 matrix can be computed as a
+      // scalar more efficiently.
+      H_s32[24 * WIENER_WIN2_REDUCED_ALIGN2 + 24] +=
+          DGD_AVG0[24] * DGD_AVG0[24];
+    }
+
+    src += src_next;
+    dgd += dgd_next;
+  } while (--height != 0);
+
+  acc_transpose_M(M, M_s64, M_s32, WIENER_WIN_REDUCED, downsample_factor);
+
+  update_H(H, H_s64, H_s32, WIENER_WIN_REDUCED, WIENER_WIN2_REDUCED_ALIGN2,
+           downsample_factor);
+}
+
+static INLINE uint8_t find_average_neon(const uint8_t *src, int src_stride,
+                                        int width, int height) {
+  uint64_t sum = 0;
+
+  if (width >= 16) {
+    int h = 0;
+    // We can accumulate up to 257 8-bit values in a 16-bit element. Given
+    // that each 16-bit vector has 8 elements, we can process up to
+    // int(257*8/width) rows before we need to widen to 32-bit vector
+    // elements.
+    int h_overflow = 257 * 8 / width;
+    int h_limit = height > h_overflow ? h_overflow : height;
+    uint32x4_t avg_u32 = vdupq_n_u32(0);
+    do {
+      uint16x8_t avg_u16 = vdupq_n_u16(0);
+      do {
+        int j = width;
+        const uint8_t *src_ptr = src;
+        do {
+          uint8x16_t s = vld1q_u8(src_ptr);
+          avg_u16 = vpadalq_u8(avg_u16, s);
+          j -= 16;
+          src_ptr += 16;
+        } while (j >= 16);
+        if (j >= 8) {
+          uint8x8_t s = vld1_u8(src_ptr);
+          avg_u16 = vaddw_u8(avg_u16, s);
+          j -= 8;
+          src_ptr += 8;
+        }
+        // Scalar tail case.
+        while (j > 0) {
+          sum += src[width - j];
+          j--;
+        }
+        src += src_stride;
+      } while (++h < h_limit);
+      avg_u32 = vpadalq_u16(avg_u32, avg_u16);
+
+      h_limit += h_overflow;
+      h_limit = height > h_limit ? h_limit : height;
+    } while (h < height);
+    return (uint8_t)((horizontal_long_add_u32x4(avg_u32) + sum) /
+                     (width * height));
+  }
+  if (width >= 8) {
+    int h = 0;
+    // We can accumulate up to 257 8-bit values in a 16-bit element. Given
+    // that each 16-bit vector has 4 elements, we can process up to
+    // int(257*4/width) rows before we need to widen to 32-bit vector
+    // elements.
+    int h_overflow = 257 * 4 / width;
+    int h_limit = height > h_overflow ? h_overflow : height;
+    uint32x2_t avg_u32 = vdup_n_u32(0);
+    do {
+      uint16x4_t avg_u16 = vdup_n_u16(0);
+      do {
+        int j = width;
+        const uint8_t *src_ptr = src;
+        uint8x8_t s = vld1_u8(src_ptr);
+        avg_u16 = vpadal_u8(avg_u16, s);
+        j -= 8;
+        src_ptr += 8;
+        // Scalar tail case.
+        while (j > 0) {
+          sum += src[width - j];
+          j--;
+        }
+        src += src_stride;
+      } while (++h < h_limit);
+      avg_u32 = vpadal_u16(avg_u32, avg_u16);
+
+      h_limit += h_overflow;
+      h_limit = height > h_limit ? h_limit : height;
+    } while (h < height);
+    return (uint8_t)((horizontal_long_add_u32x2(avg_u32) + sum) /
+                     (width * height));
+  }
+  int i = height;
+  do {
+    int j = 0;
+    do {
+      sum += src[j];
+    } while (++j < width);
+    src += src_stride;
+  } while (--i != 0);
+  return (uint8_t)(sum / (width * height));
+}
+
+void av1_compute_stats_neon(int wiener_win, const uint8_t *dgd,
+                            const uint8_t *src, int16_t *dgd_avg,
+                            int16_t *src_avg, int h_start, int h_end,
+                            int v_start, int v_end, int dgd_stride,
+                            int src_stride, int64_t *M, int64_t *H,
+                            int use_downsampled_wiener_stats) {
+  assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA);
+  assert(WIENER_STATS_DOWNSAMPLE_FACTOR == 4);
+  (void)dgd_avg;
+  (void)src_avg;
+
+  const int wiener_win2 = wiener_win * wiener_win;
+  const int wiener_halfwin = wiener_win >> 1;
+  const int width = h_end - h_start;
+  const int height = v_end - v_start;
+
+  const uint8_t *dgd_start = dgd + h_start + v_start * dgd_stride;
+  const uint8_t *src_start = src + h_start + v_start * src_stride;
+
+  // The wiener window will slide along the dgd frame, centered on each pixel.
+  // For the top left pixel and all the pixels on the side of the frame this
+  // means half of the window will be outside of the frame. As such the actual
+  // buffer that we need to subtract the avg from will be 2 * wiener_halfwin
+  // wider and 2 * wiener_halfwin higher than the original dgd buffer.
+  const int vert_offset = v_start - wiener_halfwin;
+  const int horiz_offset = h_start - wiener_halfwin;
+  const uint8_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride;
+
+  uint8_t avg = find_average_neon(dgd_start, dgd_stride, width, height);
+
+  // Since the height is not necessarily a multiple of the downsample factor,
+  // the last line of src will be scaled according to how many rows remain.
+  int downsample_factor =
+      use_downsampled_wiener_stats ?
WIENER_STATS_DOWNSAMPLE_FACTOR : 1; + + int downsampled_height = height / downsample_factor; + int downsample_remainder = height % downsample_factor; + + memset(M, 0, wiener_win2 * sizeof(*M)); + memset(H, 0, wiener_win2 * wiener_win2 * sizeof(*H)); + + // Calculate the M and H matrices for the normal and downsampled cases. + if (downsampled_height > 0) { + if (wiener_win == WIENER_WIN) { + compute_stats_win7_neon(dgd_win, src_start, width, downsampled_height, + dgd_stride, src_stride, avg, M, H, + downsample_factor); + } else { + compute_stats_win5_neon(dgd_win, src_start, width, downsampled_height, + dgd_stride, src_stride, avg, M, H, + downsample_factor); + } + } + + // Accumulate the remaining last rows in the downsampled case. + if (downsample_remainder > 0) { + int remainder_offset = height - downsample_remainder; + if (wiener_win == WIENER_WIN) { + compute_stats_win7_neon(dgd_win + remainder_offset * dgd_stride, + src_start + remainder_offset * src_stride, width, + 1, dgd_stride, src_stride, avg, M, H, + downsample_remainder); + } else { + compute_stats_win5_neon(dgd_win + remainder_offset * dgd_stride, + src_start + remainder_offset * src_stride, width, + 1, dgd_stride, src_stride, avg, M, H, + downsample_remainder); + } + } +} + +static INLINE void calc_proj_params_r0_r1_neon( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { + assert(width % 8 == 0); + const int size = width * height; + + int64x2_t h00_lo = vdupq_n_s64(0); + int64x2_t h00_hi = vdupq_n_s64(0); + int64x2_t h11_lo = vdupq_n_s64(0); + int64x2_t h11_hi = vdupq_n_s64(0); + int64x2_t h01_lo = vdupq_n_s64(0); + int64x2_t h01_hi = vdupq_n_s64(0); + int64x2_t c0_lo = vdupq_n_s64(0); + int64x2_t c0_hi = vdupq_n_s64(0); + int64x2_t c1_lo = vdupq_n_s64(0); + int64x2_t c1_hi = vdupq_n_s64(0); + + do { + const uint8_t *src_ptr = src8; + const uint8_t *dat_ptr = dat8; + int32_t *flt0_ptr = flt0; + int32_t *flt1_ptr = flt1; + int w = width; + + do { + uint8x8_t s = vld1_u8(src_ptr); + uint8x8_t d = vld1_u8(dat_ptr); + int32x4_t f0_lo = vld1q_s32(flt0_ptr); + int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4); + int32x4_t f1_lo = vld1q_s32(flt1_ptr); + int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4); + + int16x8_t u = vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS)); + int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, SGRPROJ_RST_BITS)); + + int32x4_t s_lo = vsubl_s16(vget_low_s16(s_s16), vget_low_s16(u)); + int32x4_t s_hi = vsubl_s16(vget_high_s16(s_s16), vget_high_s16(u)); + f0_lo = vsubw_s16(f0_lo, vget_low_s16(u)); + f0_hi = vsubw_s16(f0_hi, vget_high_s16(u)); + f1_lo = vsubw_s16(f1_lo, vget_low_s16(u)); + f1_hi = vsubw_s16(f1_hi, vget_high_s16(u)); + + h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo)); + h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo)); + h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi)); + h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi)); + + h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo)); + h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo)); + h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi)); + h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi)); + + h01_lo = vmlal_s32(h01_lo, vget_low_s32(f0_lo), vget_low_s32(f1_lo)); + h01_lo = vmlal_s32(h01_lo, vget_high_s32(f0_lo), vget_high_s32(f1_lo)); + h01_hi = 
vmlal_s32(h01_hi, vget_low_s32(f0_hi), vget_low_s32(f1_hi)); + h01_hi = vmlal_s32(h01_hi, vget_high_s32(f0_hi), vget_high_s32(f1_hi)); + + c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo)); + c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo)); + c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi)); + c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi)); + + c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo)); + c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo)); + c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi)); + c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi)); + + src_ptr += 8; + dat_ptr += 8; + flt0_ptr += 8; + flt1_ptr += 8; + w -= 8; + } while (w != 0); + + src8 += src_stride; + dat8 += dat_stride; + flt0 += flt0_stride; + flt1 += flt1_stride; + } while (--height != 0); + + H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size; + H[0][1] = horizontal_add_s64x2(vaddq_s64(h01_lo, h01_hi)) / size; + H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size; + H[1][0] = H[0][1]; + C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size; + C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size; +} + +static INLINE void calc_proj_params_r0_neon(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, int dat_stride, + int32_t *flt0, int flt0_stride, + int64_t H[2][2], int64_t C[2]) { + assert(width % 8 == 0); + const int size = width * height; + + int64x2_t h00_lo = vdupq_n_s64(0); + int64x2_t h00_hi = vdupq_n_s64(0); + int64x2_t c0_lo = vdupq_n_s64(0); + int64x2_t c0_hi = vdupq_n_s64(0); + + do { + const uint8_t *src_ptr = src8; + const uint8_t *dat_ptr = dat8; + int32_t *flt0_ptr = flt0; + int w = width; + + do { + uint8x8_t s = vld1_u8(src_ptr); + uint8x8_t d = vld1_u8(dat_ptr); + int32x4_t f0_lo = vld1q_s32(flt0_ptr); + int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4); + + int16x8_t u = vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS)); + int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, SGRPROJ_RST_BITS)); + + int32x4_t s_lo = vsubl_s16(vget_low_s16(s_s16), vget_low_s16(u)); + int32x4_t s_hi = vsubl_s16(vget_high_s16(s_s16), vget_high_s16(u)); + f0_lo = vsubw_s16(f0_lo, vget_low_s16(u)); + f0_hi = vsubw_s16(f0_hi, vget_high_s16(u)); + + h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo)); + h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo)); + h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi)); + h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi)); + + c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo)); + c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo)); + c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi)); + c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi)); + + src_ptr += 8; + dat_ptr += 8; + flt0_ptr += 8; + w -= 8; + } while (w != 0); + + src8 += src_stride; + dat8 += dat_stride; + flt0 += flt0_stride; + } while (--height != 0); + + H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size; + C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size; +} + +static INLINE void calc_proj_params_r1_neon(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, int dat_stride, + int32_t *flt1, int flt1_stride, + int64_t H[2][2], int64_t C[2]) { + assert(width % 8 == 0); + const int size = width * height; + + int64x2_t h11_lo = 
vdupq_n_s64(0);
+  int64x2_t h11_hi = vdupq_n_s64(0);
+  int64x2_t c1_lo = vdupq_n_s64(0);
+  int64x2_t c1_hi = vdupq_n_s64(0);
+
+  do {
+    const uint8_t *src_ptr = src8;
+    const uint8_t *dat_ptr = dat8;
+    int32_t *flt1_ptr = flt1;
+    int w = width;
+
+    do {
+      uint8x8_t s = vld1_u8(src_ptr);
+      uint8x8_t d = vld1_u8(dat_ptr);
+      int32x4_t f1_lo = vld1q_s32(flt1_ptr);
+      int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4);
+
+      int16x8_t u = vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS));
+      int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, SGRPROJ_RST_BITS));
+
+      int32x4_t s_lo = vsubl_s16(vget_low_s16(s_s16), vget_low_s16(u));
+      int32x4_t s_hi = vsubl_s16(vget_high_s16(s_s16), vget_high_s16(u));
+      f1_lo = vsubw_s16(f1_lo, vget_low_s16(u));
+      f1_hi = vsubw_s16(f1_hi, vget_high_s16(u));
+
+      h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo));
+      h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo));
+      h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi));
+      h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi));
+
+      c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo));
+      c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo));
+      c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi));
+      c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi));
+
+      src_ptr += 8;
+      dat_ptr += 8;
+      flt1_ptr += 8;
+      w -= 8;
+    } while (w != 0);
+
+    src8 += src_stride;
+    dat8 += dat_stride;
+    flt1 += flt1_stride;
+  } while (--height != 0);
+
+  H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size;
+  C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size;
+}
+
+// The function calls 3 subfunctions for the following cases:
+// 1) When params->r[0] > 0 and params->r[1] > 0. In this case all elements
+// of C and H need to be computed.
+// 2) When only params->r[0] > 0. In this case only H[0][0] and C[0] are
+// non-zero and need to be computed.
+// 3) When only params->r[1] > 0. In this case only H[1][1] and C[1] are
+// non-zero and need to be computed.
+void av1_calc_proj_params_neon(const uint8_t *src8, int width, int height,
+                               int src_stride, const uint8_t *dat8,
+                               int dat_stride, int32_t *flt0, int flt0_stride,
+                               int32_t *flt1, int flt1_stride, int64_t H[2][2],
+                               int64_t C[2], const sgr_params_type *params) {
+  if ((params->r[0] > 0) && (params->r[1] > 0)) {
+    calc_proj_params_r0_r1_neon(src8, width, height, src_stride, dat8,
+                                dat_stride, flt0, flt0_stride, flt1,
+                                flt1_stride, H, C);
+  } else if (params->r[0] > 0) {
+    calc_proj_params_r0_neon(src8, width, height, src_stride, dat8, dat_stride,
+                             flt0, flt0_stride, H, C);
+  } else if (params->r[1] > 0) {
+    calc_proj_params_r1_neon(src8, width, height, src_stride, dat8, dat_stride,
+                             flt1, flt1_stride, H, C);
+  }
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/pickrst_neon.h b/third_party/aom/av1/encoder/arm/neon/pickrst_neon.h
new file mode 100644
index 0000000000..7b72dca34d
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/pickrst_neon.h
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ARM_NEON_PICKRST_NEON_H_
+#define AOM_AV1_ENCODER_ARM_NEON_PICKRST_NEON_H_
+
+#include <arm_neon.h>
+
+#include "av1/common/restoration.h"
+
+// Aligned sizes for Wiener filters.
+#define WIENER_WIN2_ALIGN2 ALIGN_POWER_OF_TWO(WIENER_WIN2, 2)
+#define WIENER_WIN2_ALIGN3 ALIGN_POWER_OF_TWO(WIENER_WIN2, 3)
+#define WIENER_WIN2_REDUCED ((WIENER_WIN_REDUCED) * (WIENER_WIN_REDUCED))
+#define WIENER_WIN2_REDUCED_ALIGN2 ALIGN_POWER_OF_TWO(WIENER_WIN2_REDUCED, 2)
+#define WIENER_WIN2_REDUCED_ALIGN3 ALIGN_POWER_OF_TWO(WIENER_WIN2_REDUCED, 3)
+
+// Compute 8 values of M (cross correlation) for a single source pixel and
+// accumulate.
+static INLINE void update_M_1pixel(int32_t *M_s32, int16x4_t src_avg,
+                                   int16x8_t dgd_avg) {
+  int32x4_t lo = vld1q_s32(M_s32 + 0);
+  int32x4_t hi = vld1q_s32(M_s32 + 4);
+
+  lo = vmlal_s16(lo, vget_low_s16(dgd_avg), src_avg);
+  hi = vmlal_s16(hi, vget_high_s16(dgd_avg), src_avg);
+
+  vst1q_s32(M_s32 + 0, lo);
+  vst1q_s32(M_s32 + 4, hi);
+}
+
+// Compute 8 values of M (cross correlation) for two source pixels and
+// accumulate.
+static INLINE void update_M_2pixels(int32_t *M_s32, int16x4_t src_avg0,
+                                    int16x4_t src_avg1, int16x8_t dgd_avg0,
+                                    int16x8_t dgd_avg1) {
+  int32x4_t lo = vld1q_s32(M_s32 + 0);
+  int32x4_t hi = vld1q_s32(M_s32 + 4);
+
+  lo = vmlal_s16(lo, vget_low_s16(dgd_avg0), src_avg0);
+  hi = vmlal_s16(hi, vget_high_s16(dgd_avg0), src_avg0);
+  lo = vmlal_s16(lo, vget_low_s16(dgd_avg1), src_avg1);
+  hi = vmlal_s16(hi, vget_high_s16(dgd_avg1), src_avg1);
+
+  vst1q_s32(M_s32 + 0, lo);
+  vst1q_s32(M_s32 + 4, hi);
+}
+
+static INLINE void update_H_1pixel(int32_t *H_s32, const int16_t *dgd_avg,
+                                   int width, int height) {
+  for (int i = 0; i < height; i += 4) {
+    int16x4_t di = vld1_s16(dgd_avg + i);
+
+    for (int j = i; j < width; j += 4) {
+      int16x4_t dj = vld1_s16(dgd_avg + j);
+      int32x4_t h0 = vld1q_s32(H_s32 + 0 * width + j);
+      int32x4_t h1 = vld1q_s32(H_s32 + 1 * width + j);
+      int32x4_t h2 = vld1q_s32(H_s32 + 2 * width + j);
+      int32x4_t h3 = vld1q_s32(H_s32 + 3 * width + j);
+
+      h0 = vmlal_lane_s16(h0, dj, di, 0);
+      h1 = vmlal_lane_s16(h1, dj, di, 1);
+      h2 = vmlal_lane_s16(h2, dj, di, 2);
+      h3 = vmlal_lane_s16(h3, dj, di, 3);
+
+      vst1q_s32(H_s32 + 0 * width + j, h0);
+      vst1q_s32(H_s32 + 1 * width + j, h1);
+      vst1q_s32(H_s32 + 2 * width + j, h2);
+      vst1q_s32(H_s32 + 3 * width + j, h3);
+    }
+    H_s32 += 4 * width;
+  }
+}
+
+static INLINE void update_H_5x5_2pixels(int32_t *H_s32, const int16_t *dgd_avg0,
+                                        const int16_t *dgd_avg1) {
+  for (int i = 0; i < 24; i += 4) {
+    int16x4_t di0 = vld1_s16(dgd_avg0 + i);
+    int16x4_t di1 = vld1_s16(dgd_avg1 + i);
+
+    for (int j = i + 0; j < WIENER_WIN2_REDUCED_ALIGN2; j += 4) {
+      int16x4_t dj0 = vld1_s16(dgd_avg0 + j);
+      int16x4_t dj1 = vld1_s16(dgd_avg1 + j);
+      int32x4_t h0 = vld1q_s32(H_s32 + 0 * WIENER_WIN2_REDUCED_ALIGN2 + j);
+      int32x4_t h1 = vld1q_s32(H_s32 + 1 * WIENER_WIN2_REDUCED_ALIGN2 + j);
+      int32x4_t h2 = vld1q_s32(H_s32 + 2 * WIENER_WIN2_REDUCED_ALIGN2 + j);
+      int32x4_t h3 = vld1q_s32(H_s32 + 3 * WIENER_WIN2_REDUCED_ALIGN2 + j);
+
+      h0 = vmlal_lane_s16(h0, dj0, di0, 0);
+      h0 = vmlal_lane_s16(h0, dj1, di1, 0);
+      h1 = vmlal_lane_s16(h1, dj0, di0, 1);
+      h1 = vmlal_lane_s16(h1, dj1, di1, 1);
+      h2 = vmlal_lane_s16(h2, dj0, di0, 2);
+      h2 = vmlal_lane_s16(h2, dj1, di1, 2);
+      h3 = vmlal_lane_s16(h3, dj0, di0, 3);
+      h3 =
vmlal_lane_s16(h3, dj1, di1, 3); + + vst1q_s32(H_s32 + 0 * WIENER_WIN2_REDUCED_ALIGN2 + j, h0); + vst1q_s32(H_s32 + 1 * WIENER_WIN2_REDUCED_ALIGN2 + j, h1); + vst1q_s32(H_s32 + 2 * WIENER_WIN2_REDUCED_ALIGN2 + j, h2); + vst1q_s32(H_s32 + 3 * WIENER_WIN2_REDUCED_ALIGN2 + j, h3); + } + H_s32 += 4 * WIENER_WIN2_REDUCED_ALIGN2; + } +} + +static INLINE void update_H_7x7_2pixels(int32_t *H_s32, const int16_t *dgd_avg0, + const int16_t *dgd_avg1) { + for (int i = 0; i < 48; i += 4) { + int16x4_t di0 = vld1_s16(dgd_avg0 + i); + int16x4_t di1 = vld1_s16(dgd_avg1 + i); + + int32x4_t h0 = vld1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + i); + int32x4_t h1 = vld1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + i); + int32x4_t h2 = vld1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + i); + int32x4_t h3 = vld1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + i); + + h0 = vmlal_lane_s16(h0, di0, di0, 0); + h0 = vmlal_lane_s16(h0, di1, di1, 0); + h1 = vmlal_lane_s16(h1, di0, di0, 1); + h1 = vmlal_lane_s16(h1, di1, di1, 1); + h2 = vmlal_lane_s16(h2, di0, di0, 2); + h2 = vmlal_lane_s16(h2, di1, di1, 2); + h3 = vmlal_lane_s16(h3, di0, di0, 3); + h3 = vmlal_lane_s16(h3, di1, di1, 3); + + vst1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + i, h0); + vst1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + i, h1); + vst1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + i, h2); + vst1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + i, h3); + + for (int j = i + 4; j < WIENER_WIN2_ALIGN2; j += 4) { + int16x4_t dj0 = vld1_s16(dgd_avg0 + j); + int16x4_t dj1 = vld1_s16(dgd_avg1 + j); + h0 = vld1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + j); + h1 = vld1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + j); + h2 = vld1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + j); + h3 = vld1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + j); + + h0 = vmlal_lane_s16(h0, dj0, di0, 0); + h0 = vmlal_lane_s16(h0, dj1, di1, 0); + h1 = vmlal_lane_s16(h1, dj0, di0, 1); + h1 = vmlal_lane_s16(h1, dj1, di1, 1); + h2 = vmlal_lane_s16(h2, dj0, di0, 2); + h2 = vmlal_lane_s16(h2, dj1, di1, 2); + h3 = vmlal_lane_s16(h3, dj0, di0, 3); + h3 = vmlal_lane_s16(h3, dj1, di1, 3); + + vst1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + j, h0); + vst1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + j, h1); + vst1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + j, h2); + vst1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + j, h3); + } + H_s32 += 4 * WIENER_WIN2_ALIGN2; + } +} + +// Widen 32-bit src data and accumulate into 64-bit dst. Clear src data. +static INLINE void accumulate_and_clear(int64_t *dst, int32_t *src, + int length) { + do { + int32x4_t s32 = vld1q_s32(src); + vst1q_s32(src, vdupq_n_s32(0)); + src += 4; + + int64x2_t d_lo = vld1q_s64(dst + 0); + int64x2_t d_hi = vld1q_s64(dst + 2); + + d_lo = vaddw_s32(d_lo, vget_low_s32(s32)); + d_hi = vaddw_s32(d_hi, vget_high_s32(s32)); + + vst1q_s64(dst + 0, d_lo); + vst1q_s64(dst + 2, d_hi); + + dst += 4; + length -= 4; + } while (length > 0); +} + +#endif // AOM_AV1_ENCODER_ARM_NEON_PICKRST_NEON_H_ diff --git a/third_party/aom/av1/encoder/arm/neon/quantize_neon.c b/third_party/aom/av1/encoder/arm/neon/quantize_neon.c new file mode 100644 index 0000000000..c3b57ce206 --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/quantize_neon.c @@ -0,0 +1,928 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include <math.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/quant_common.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/rd.h"
+
+static INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
+#if AOM_ARCH_AARCH64
+  return (uint16_t)vmaxvq_s16(v_eobmax);
+#else
+  const int16x4_t v_eobmax_3210 =
+      vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax));
+  const int64x1_t v_eobmax_xx32 =
+      vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
+  const int16x4_t v_eobmax_tmp =
+      vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
+  const int64x1_t v_eobmax_xxx3 =
+      vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
+  const int16x4_t v_eobmax_final =
+      vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
+  return (uint16_t)vget_lane_s16(v_eobmax_final, 0);
+#endif
+}
+
+static INLINE int16x8_t get_max_lane_eob(const int16_t *iscan,
+                                         int16x8_t v_eobmax,
+                                         uint16x8_t v_mask) {
+  const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+  const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, vdupq_n_s16(1));
+  const int16x8_t v_nz_iscan = vbslq_s16(v_mask, v_iscan_plus1, vdupq_n_s16(0));
+  return vmaxq_s16(v_eobmax, v_nz_iscan);
+}
+
+static INLINE uint16x8_t quantize_fp_8(const tran_low_t *coeff_ptr,
+                                       tran_low_t *qcoeff_ptr,
+                                       tran_low_t *dqcoeff_ptr,
+                                       int16x8_t v_quant, int16x8_t v_dequant,
+                                       int16x8_t v_round, int16x8_t v_zero) {
+  const int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
+  const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+  const int16x8_t v_abs = vabsq_s16(v_coeff);
+  const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round);
+  const int16x8_t v_tmp2 = vshrq_n_s16(vqdmulhq_s16(v_tmp, v_quant), 1);
+  const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero);
+  const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+  const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+  const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
+  store_s16q_to_tran_low(&qcoeff_ptr[0], v_qcoeff);
+  store_s16q_to_tran_low(&dqcoeff_ptr[0], v_dqcoeff);
+  return v_nz_mask;
+}
+
+void av1_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
+                          const int16_t *zbin_ptr, const int16_t *round_ptr,
+                          const int16_t *quant_ptr,
+                          const int16_t *quant_shift_ptr,
+                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                          const int16_t *scan, const int16_t *iscan) {
+  // TODO(jingning) Decide the need of these arguments after the
+  // quantization process is completed.
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)scan;
+
+  // Quantization pass: All coefficients with index >= zero_flag are
+  // skippable. Note: zero_flag can be zero.
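+  // Per lane in the quantize_fp_8() helper above: tmp = saturating(|coeff| +
+  // round), then qcoeff = (tmp * quant) >> 16, realised as vqdmulhq_s16()
+  // (which computes (2 * a * b) >> 16) followed by a further right shift by
+  // one.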
+  const int16x8_t v_zero = vdupq_n_s16(0);
+  int16x8_t v_quant = vld1q_s16(quant_ptr);
+  int16x8_t v_dequant = vld1q_s16(dequant_ptr);
+  int16x8_t v_round = vld1q_s16(round_ptr);
+  int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
+  uint16x8_t v_nz_mask;
+  // process dc and the first seven ac coeffs
+  v_nz_mask = quantize_fp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
+                            v_dequant, v_round, v_zero);
+  v_eobmax_76543210 = get_max_lane_eob(&iscan[0], v_eobmax_76543210, v_nz_mask);
+  // overwrite the dc constants with ac constants
+  v_quant = vdupq_lane_s16(vget_low_s16(v_quant), 1);
+  v_dequant = vdupq_lane_s16(vget_low_s16(v_dequant), 1);
+  v_round = vdupq_lane_s16(vget_low_s16(v_round), 1);
+
+  count -= 8;
+  // now process the rest of the ac coeffs
+  do {
+    coeff_ptr += 8;
+    qcoeff_ptr += 8;
+    dqcoeff_ptr += 8;
+    iscan += 8;
+    v_nz_mask = quantize_fp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
+                              v_dequant, v_round, v_zero);
+    v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
+    count -= 8;
+  } while (count > 0);
+  *eob_ptr = get_max_eob(v_eobmax_76543210);
+}
+
+static INLINE uint16x8_t quantize_lp_8(const int16_t *coeff_ptr,
+                                       int16_t *qcoeff_ptr,
+                                       int16_t *dqcoeff_ptr, int16x8_t v_quant,
+                                       int16x8_t v_dequant, int16x8_t v_round,
+                                       int16x8_t v_zero) {
+  const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[0]);
+  const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+  const int16x8_t v_abs = vabsq_s16(v_coeff);
+  const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round);
+  const int16x8_t v_tmp2 = vshrq_n_s16(vqdmulhq_s16(v_tmp, v_quant), 1);
+  const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero);
+  const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+  const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+  const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
+  vst1q_s16(qcoeff_ptr, v_qcoeff);
+  vst1q_s16(dqcoeff_ptr, v_dqcoeff);
+  return v_nz_mask;
+}
+
+void av1_quantize_lp_neon(const int16_t *coeff_ptr, intptr_t n_coeffs,
+                          const int16_t *round_ptr, const int16_t *quant_ptr,
+                          int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                          const int16_t *scan, const int16_t *iscan) {
+  (void)scan;
+  // Quantization pass: same low-precision flow as above; every coefficient
+  // is quantized and the end-of-block index is tracked along the way.
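+  // End-of-block bookkeeping, for reference: get_max_lane_eob() keeps a
+  // per-lane maximum of (iscan + 1) over the non-zero lanes, so the final
+  // get_max_eob() reduction yields the scan index one past the last
+  // non-zero coefficient.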
+ const int16x8_t v_zero = vdupq_n_s16(0); + int16x8_t v_quant = vld1q_s16(quant_ptr); + int16x8_t v_dequant = vld1q_s16(dequant_ptr); + int16x8_t v_round = vld1q_s16(round_ptr); + int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1); + uint16x8_t v_nz_mask; + intptr_t count = n_coeffs; + + // process dc and the first seven ac coeffs + v_nz_mask = quantize_lp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant, + v_dequant, v_round, v_zero); + v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask); + // overwrite the dc constants with ac constants + v_quant = vdupq_lane_s16(vget_low_s16(v_quant), 1); + v_dequant = vdupq_lane_s16(vget_low_s16(v_dequant), 1); + v_round = vdupq_lane_s16(vget_low_s16(v_round), 1); + + count -= 8; + // now process the rest of the ac coeffs + do { + coeff_ptr += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + iscan += 8; + v_nz_mask = quantize_lp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant, + v_dequant, v_round, v_zero); + v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask); + count -= 8; + } while (count != 0); + *eob_ptr = get_max_eob(v_eobmax_76543210); +} + +static AOM_FORCE_INLINE uint16x8_t quantize_fp_logscale_8( + const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, int16x8_t v_quant, int16x8_t v_dequant, + int16x8_t v_round, int16x8_t v_zero, int log_scale) { + const int16x8_t v_log_scale_minus_1 = vdupq_n_s16(log_scale - 1); + const int16x8_t v_neg_log_scale_plus_1 = vdupq_n_s16(-(1 + log_scale)); + const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_abs_coeff = vabsq_s16(v_coeff); + const uint16x8_t v_mask = + vcgeq_s16(v_abs_coeff, vshlq_s16(v_dequant, v_neg_log_scale_plus_1)); + // const int64_t tmp = vmask ? (int64_t)abs_coeff + log_scaled_round : 0 + const int16x8_t v_tmp = vandq_s16(vqaddq_s16(v_abs_coeff, v_round), + vreinterpretq_s16_u16(v_mask)); + const int16x8_t v_tmp2 = + vqdmulhq_s16(vshlq_s16(v_tmp, v_log_scale_minus_1), v_quant); + const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero); + const int16x8_t v_qcoeff = + vsubq_s16(veorq_s16(v_tmp2, v_coeff_sign), v_coeff_sign); + // Multiplying by dequant here will use all 16 bits. Cast to unsigned before + // shifting right. (vshlq_s16 will shift right if shift value is negative) + const uint16x8_t v_abs_dqcoeff = + vshlq_u16(vreinterpretq_u16_s16(vmulq_s16(v_tmp2, v_dequant)), + vdupq_n_s16(-log_scale)); + const int16x8_t v_dqcoeff = + vsubq_s16(veorq_s16(vreinterpretq_s16_u16(v_abs_dqcoeff), v_coeff_sign), + v_coeff_sign); + store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff); + store_s16q_to_tran_low(dqcoeff_ptr, v_dqcoeff); + return v_nz_mask; +} + +static AOM_FORCE_INLINE uint16x8_t quantize_fp_logscale2_8( + const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, int16x8_t v_quant, int16x8_t v_dequant, + int16x8_t v_round, int16x8_t v_zero) { + const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_abs_coeff = vabsq_s16(v_coeff); + const uint16x8_t v_mask = + vcgeq_u16(vshlq_n_u16(vreinterpretq_u16_s16(v_abs_coeff), 1), + vshrq_n_u16(vreinterpretq_u16_s16(v_dequant), 2)); + // abs_coeff = vmask ? 
(int64_t)abs_coeff + log_scaled_round : 0 + const int16x8_t v_tmp = vandq_s16(vqaddq_s16(v_abs_coeff, v_round), + vreinterpretq_s16_u16(v_mask)); + // tmp32 = (int)((abs_coeff * quant_ptr[rc != 0]) >> (16 - log_scale)); + const int16x8_t v_tmp2 = + vorrq_s16(vshlq_n_s16(vqdmulhq_s16(v_tmp, v_quant), 1), + vreinterpretq_s16_u16(vshrq_n_u16( + vreinterpretq_u16_s16(vmulq_s16(v_tmp, v_quant)), 14))); + const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero); + const int16x8_t v_qcoeff = + vsubq_s16(veorq_s16(v_tmp2, v_coeff_sign), v_coeff_sign); + // const tran_low_t abs_dqcoeff = (tmp32 * dequant_ptr[rc != 0]) >> log_scale; + const int16x8_t v_abs_dqcoeff = + vorrq_s16(vshlq_n_s16(vqdmulhq_s16(v_tmp2, v_dequant), 13), + vreinterpretq_s16_u16(vshrq_n_u16( + vreinterpretq_u16_s16(vmulq_s16(v_tmp2, v_dequant)), 2))); + const int16x8_t v_dqcoeff = + vsubq_s16(veorq_s16(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign); + store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff); + store_s16q_to_tran_low(dqcoeff_ptr, v_dqcoeff); + return v_nz_mask; +} + +static AOM_FORCE_INLINE void quantize_fp_no_qmatrix_neon( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *iscan, + int log_scale) { + const int16x8_t v_zero = vdupq_n_s16(0); + int16x8_t v_quant = vld1q_s16(quant_ptr); + int16x8_t v_dequant = vld1q_s16(dequant_ptr); + const int16x8_t v_round_no_scale = vld1q_s16(round_ptr); + int16x8_t v_round = + vqrdmulhq_n_s16(v_round_no_scale, (int16_t)(1 << (15 - log_scale))); + int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1); + intptr_t non_zero_count = n_coeffs; + + assert(n_coeffs > 16); + // Pre-scan pass + const int16x8_t v_dequant_scaled = + vshlq_s16(v_dequant, vdupq_n_s16(-(1 + log_scale))); + const int16x8_t v_zbin_s16 = + vdupq_lane_s16(vget_low_s16(v_dequant_scaled), 1); + intptr_t i = n_coeffs; + do { + const int16x8_t v_coeff_a = load_tran_low_to_s16q(coeff_ptr + i - 8); + const int16x8_t v_coeff_b = load_tran_low_to_s16q(coeff_ptr + i - 16); + const int16x8_t v_abs_coeff_a = vabsq_s16(v_coeff_a); + const int16x8_t v_abs_coeff_b = vabsq_s16(v_coeff_b); + const uint16x8_t v_mask_a = vcgeq_s16(v_abs_coeff_a, v_zbin_s16); + const uint16x8_t v_mask_b = vcgeq_s16(v_abs_coeff_b, v_zbin_s16); + // If the coefficient is in the base ZBIN range, then discard. 
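+    // For reference: horizontal_long_add_u16x8() sums both comparison masks;
+    // a zero total means all 16 |coeff| values fall below the AC zbin
+    // (dequant >> (1 + log_scale)), so this whole group of 16 can be trimmed
+    // from the tail before the main quantization loop.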
+ if (horizontal_long_add_u16x8(v_mask_a, v_mask_b) == 0) { + non_zero_count -= 16; + } else { + break; + } + i -= 16; + } while (i > 0); + + const intptr_t remaining_zcoeffs = n_coeffs - non_zero_count; + memset(qcoeff_ptr + non_zero_count, 0, + remaining_zcoeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr + non_zero_count, 0, + remaining_zcoeffs * sizeof(*dqcoeff_ptr)); + + // process dc and the first seven ac coeffs + uint16x8_t v_nz_mask; + if (log_scale == 2) { + v_nz_mask = quantize_fp_logscale2_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, + v_quant, v_dequant, v_round, v_zero); + } else { + v_nz_mask = + quantize_fp_logscale_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant, + v_dequant, v_round, v_zero, log_scale); + } + v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask); + // overwrite the dc constants with ac constants + v_quant = vdupq_lane_s16(vget_low_s16(v_quant), 1); + v_dequant = vdupq_lane_s16(vget_low_s16(v_dequant), 1); + v_round = vdupq_lane_s16(vget_low_s16(v_round), 1); + + for (intptr_t count = non_zero_count - 8; count > 0; count -= 8) { + coeff_ptr += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + iscan += 8; + if (log_scale == 2) { + v_nz_mask = quantize_fp_logscale2_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, + v_quant, v_dequant, v_round, v_zero); + } else { + v_nz_mask = + quantize_fp_logscale_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant, + v_dequant, v_round, v_zero, log_scale); + } + v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask); + } + *eob_ptr = get_max_eob(v_eobmax_76543210); +} + +void av1_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + (void)zbin_ptr; + (void)quant_shift_ptr; + (void)scan; + quantize_fp_no_qmatrix_neon(coeff_ptr, n_coeffs, round_ptr, quant_ptr, + qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, + iscan, 1); +} + +void av1_quantize_fp_64x64_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + (void)zbin_ptr; + (void)quant_shift_ptr; + (void)scan; + quantize_fp_no_qmatrix_neon(coeff_ptr, n_coeffs, round_ptr, quant_ptr, + qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, + iscan, 2); +} + +void aom_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { + (void)quant_shift_ptr; + (void)scan; + + const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + const int16x8_t zero = vdupq_n_s16(0); + int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero)); + + int16x8_t vzbins = vdupq_n_s16(zbins[1]), vround = vdupq_n_s16(round_ptr[1]); + int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]); + int16x8_t vquant = vdupq_n_s16(quant_ptr[1]); + int16x8_t vquant_shift = 
vdupq_n_s16(quant_shift_ptr[1]); + + int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]); + int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + int16x8_t v_abs = vabsq_s16(v_coeff); + + vzbins = vsetq_lane_s16(zbins[0], vzbins, 0); + + uint16x8_t vcond = vcgeq_s16(v_abs, vzbins); + uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0); + if (nz_check) { + vround = vsetq_lane_s16(round_ptr[0], vround, 0); + vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0); + vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0); + vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0); + + int16x8_t vtmp = vqaddq_s16(v_abs, vround); + int16x8_t vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1); + vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1); + + int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign); + int16x8_t coeff_nz_mask = + vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0])); + store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask); + int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant); + + vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign); + coeff_nz_mask = + vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0])); + store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask); + + vround = vsetq_lane_s16(round_ptr[1], vround, 0); + vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0); + vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0); + vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0); + + uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero); + const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond); + int16x8_t v_iscan = vld1q_s16(&iscan[0]); + vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210)); + v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210); + } + vzbins = vsetq_lane_s16(zbins[1], vzbins, 0); + + for (int i = 8; i < n_coeffs; i += 8) { + v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]); + v_coeff_sign = vshrq_n_s16(v_coeff, 15); + v_abs = vabsq_s16(v_coeff); + vcond = vcgeq_s16(v_abs, vzbins); + + nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0); + if (nz_check) { + int16x8_t vtmp = vqaddq_s16(v_abs, vround); + int16x8_t vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1); + + vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1); + int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign); + int16x8_t coeff_nz_mask = + vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i])); + store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask); + int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant); + vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign); + coeff_nz_mask = + vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i])); + store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask); + + uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero); + const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond); + int16x8_t v_iscan = vld1q_s16(&iscan[i]); + vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210)); + v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210); + } + } + *eob_ptr = get_max_eob(v_eobmax_76543210) + 1; +} + +#define QM_MULL_SHIFT(x0, x1) \ + vreinterpretq_s16_u16(vorrq_u16( \ + vreinterpretq_u16_s16(vshlq_n_s16( \ + vqdmulhq_s16(x0, vreinterpretq_s16_u16(x1)), 15 - AOM_QM_BITS)), \ + vshrq_n_u16(vmulq_u16(vreinterpretq_u16_s16(x0), x1), AOM_QM_BITS))) + +static void aom_quantize_b_helper_16x16_neon( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const 
int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr) { + (void)scan; + + uint16x8_t vwt, viwt; + const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + const int16x8_t zero = vdupq_n_s16(0); + int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero)); + + int16x8_t vzbins = vdupq_n_s16(zbins[1]), vround = vdupq_n_s16(round_ptr[1]); + int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]); + int16x8_t vquant = vdupq_n_s16(quant_ptr[1]); + int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]); + + int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]); + int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + int16x8_t v_abs = vabsq_s16(v_coeff); + vzbins = vsetq_lane_s16(zbins[0], vzbins, 0); + uint16x8_t vcond; + if (qm_ptr == NULL) { + vcond = vcgeq_s16(v_abs, vzbins); + } else { + vwt = vmovl_u8(vld1_u8(&qm_ptr[0])); + vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins); + } + uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0); + if (nz_check) { + vround = vsetq_lane_s16(round_ptr[0], vround, 0); + vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0); + vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0); + vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0); + + int16x8_t vtmp = vqaddq_s16(v_abs, vround); + + int16x8_t vtmp2; + if (qm_ptr == NULL) { + vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1); + } else { + vtmp2 = QM_MULL_SHIFT(vtmp, vwt); + vtmp2 = vaddq_s16(vtmp2, vtmp); + } + + vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1); + int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign); + int16x8_t coeff_nz_mask = + vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0])); + store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask); + + if (iqm_ptr != NULL) { + viwt = vmovl_u8(vld1_u8(&iqm_ptr[0])); + vdequant = QM_MULL_SHIFT(vdequant, viwt); + } + int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant); + vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign); + coeff_nz_mask = + vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0])); + store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask); + + vround = vsetq_lane_s16(round_ptr[1], vround, 0); + vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0); + vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0); + vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0); + + uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero); + const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond); + int16x8_t v_iscan = vld1q_s16(&iscan[0]); + vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210)); + v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210); + } + vzbins = vsetq_lane_s16(zbins[1], vzbins, 0); + + for (int i = 8; i < n_coeffs; i += 8) { + v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]); + v_coeff_sign = vshrq_n_s16(v_coeff, 15); + v_abs = vabsq_s16(v_coeff); + + if (qm_ptr == NULL) { + vcond = vcgeq_s16(v_abs, vzbins); + } else { + vwt = vmovl_u8(vld1_u8(&qm_ptr[i])); + vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins); + } + nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0); + if (nz_check) { + int16x8_t vtmp = vqaddq_s16(v_abs, 
vround); + + int16x8_t vtmp2; + if (qm_ptr == NULL) { + vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1); + } else { + vtmp2 = QM_MULL_SHIFT(vtmp, vwt); + vtmp2 = vaddq_s16(vtmp2, vtmp); + } + + vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1); + int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign); + int16x8_t coeff_nz_mask = + vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i])); + store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask); + + if (iqm_ptr != NULL) { + viwt = vmovl_u8(vld1_u8(&iqm_ptr[i])); + vdequant = QM_MULL_SHIFT(vdequant, viwt); + } + int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant); + vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign); + coeff_nz_mask = + vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i])); + store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask); + + uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero); + const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond); + int16x8_t v_iscan = vld1q_s16(&iscan[i]); + vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210)); + v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210); + } + } + *eob_ptr = get_max_eob(v_eobmax_76543210) + 1; +} + +static void aom_quantize_b_helper_32x32_neon( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr) { + (void)scan; + + uint16x8_t vwt, viwt; + const int log_scale = 1; + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + const int16x8_t zero = vdupq_n_s16(0); + int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero)); + const int16x8_t v_log_scale = v_eobmax_76543210; + + int16x8_t vzbins = vdupq_n_s16(zbins[1]), + vround = vdupq_n_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale)); + int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]); + int16x8_t vquant = vdupq_n_s16(quant_ptr[1]); + int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]); + + int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]); + int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + int16x8_t v_abs = vabsq_s16(v_coeff); + vzbins = vsetq_lane_s16(zbins[0], vzbins, 0); + uint16x8_t vcond; + if (qm_ptr == NULL) { + vcond = vcgeq_s16(v_abs, vzbins); + } else { + vwt = vmovl_u8(vld1_u8(&qm_ptr[0])); + vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins); + } + uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0); + if (nz_check) { + vround = + vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[0], log_scale), vround, 0); + vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0); + vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0); + vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0); + + int16x8_t vtmp = vqaddq_s16(v_abs, vround); + + int16x8_t vtmp2; + if (qm_ptr == NULL) { + vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1); + } else { + vtmp2 = QM_MULL_SHIFT(vtmp, vwt); + vtmp2 = vaddq_s16(vtmp2, vtmp); + } + + vtmp2 = vqdmulhq_s16(vtmp2, vquant_shift); + int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign); + int16x8_t coeff_nz_mask = + vbslq_s16(vcond, vdest, 
load_tran_low_to_s16q(&qcoeff_ptr[0])); + store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask); + + if (iqm_ptr != NULL) { + viwt = vmovl_u8(vld1_u8(&iqm_ptr[0])); + vdequant = QM_MULL_SHIFT(vdequant, viwt); + } + int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16( + vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale)); + vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign); + coeff_nz_mask = + vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0])); + store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask); + + vzbins = vsetq_lane_s16(zbins[1], vzbins, 0); + vround = + vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale), vround, 0); + vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0); + vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0); + vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0); + + uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero); + const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond); + int16x8_t v_iscan = vld1q_s16(&iscan[0]); + vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210)); + v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210); + } + vzbins = vsetq_lane_s16(zbins[1], vzbins, 0); + + for (int i = 8; i < n_coeffs; i += 8) { + v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]); + v_coeff_sign = vshrq_n_s16(v_coeff, 15); + v_abs = vabsq_s16(v_coeff); + + if (qm_ptr == NULL) { + vcond = vcgeq_s16(v_abs, vzbins); + } else { + vwt = vmovl_u8(vld1_u8(&qm_ptr[i])); + vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins); + } + nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0); + if (nz_check) { + int16x8_t vtmp = vqaddq_s16(v_abs, vround); + + int16x8_t vtmp2; + if (qm_ptr == NULL) { + vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1); + } else { + vtmp2 = QM_MULL_SHIFT(vtmp, vwt); + vtmp2 = vaddq_s16(vtmp2, vtmp); + } + vtmp2 = vqdmulhq_s16(vtmp2, vquant_shift); + + int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign); + int16x8_t coeff_nz_mask = + vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i])); + store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask); + + if (iqm_ptr != NULL) { + viwt = vmovl_u8(vld1_u8(&iqm_ptr[i])); + vdequant = QM_MULL_SHIFT(vdequant, viwt); + } + int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16( + vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale)); + vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign); + coeff_nz_mask = + vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i])); + store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask); + + uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero); + const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond); + int16x8_t v_iscan = vld1q_s16(&iscan[i]); + vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210)); + v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210); + } + } + *eob_ptr = get_max_eob(v_eobmax_76543210) + 1; +} + +static void aom_quantize_b_helper_64x64_neon( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr) { + (void)scan; + + uint16x8_t vwt, viwt; + const int log_scale = 2; + const int16x8_t v_log_scale = + vreinterpretq_s16_s64(vdupq_n_s64(0xFFFEFFFEFFFEFFFE)); + + const int zbins[2] = { 
ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + const int16x8_t zero = vdupq_n_s16(0); + int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero)); + int16x8_t v_ones = vnegq_s16(v_eobmax_76543210); + + int16x8_t vzbins = vdupq_n_s16(zbins[1]), + vround = vdupq_n_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale)); + int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]); + int16x8_t vquant = vdupq_n_s16(quant_ptr[1]); + int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]); + + int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]); + int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + int16x8_t v_abs = vabsq_s16(v_coeff); + vzbins = vsetq_lane_s16(zbins[0], vzbins, 0); + uint16x8_t vcond; + if (qm_ptr == NULL) { + vcond = vcgeq_s16(v_abs, vzbins); + } else { + vwt = vmovl_u8(vld1_u8(&qm_ptr[0])); + vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins); + } + uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0); + if (nz_check) { + vround = + vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[0], log_scale), vround, 0); + vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0); + vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0); + vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0); + int16x8_t vtmp = vqaddq_s16(v_abs, vround); + + int16x8_t vtmp2; + if (qm_ptr == NULL) { + vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1); + } else { + vtmp2 = QM_MULL_SHIFT(vtmp, vwt); + vtmp2 = vaddq_s16(vtmp2, vtmp); + } + + int16x8_t ones = + vandq_s16(vshrq_n_s16(vmulq_s16(vtmp2, vquant_shift), 14), v_ones); + vtmp2 = + vaddq_s16(vshlq_s16(vqdmulhq_s16(vtmp2, vquant_shift), v_ones), ones); + int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign); + int16x8_t coeff_nz_mask = + vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0])); + store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask); + + if (iqm_ptr != NULL) { + viwt = vmovl_u8(vld1_u8(&iqm_ptr[0])); + vdequant = QM_MULL_SHIFT(vdequant, viwt); + } + int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16( + vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale)); + v_deq_abs = + vorrq_s16(vshlq_n_s16(vqdmulhq_s16(vtmp2, vdequant), 13), v_deq_abs); + vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign); + coeff_nz_mask = + vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0])); + store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask); + + vround = + vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale), vround, 0); + vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0); + vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0); + vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0); + + uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero); + const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond); + int16x8_t v_iscan = vld1q_s16(&iscan[0]); + vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210)); + v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210); + } + vzbins = vsetq_lane_s16(zbins[1], vzbins, 0); + + for (int i = 8; i < n_coeffs; i += 8) { + v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]); + v_coeff_sign = vshrq_n_s16(v_coeff, 15); + v_abs = vabsq_s16(v_coeff); + + if (qm_ptr == NULL) { + vcond = vcgeq_s16(v_abs, vzbins); + } else { + vwt = vmovl_u8(vld1_u8(&qm_ptr[i])); + vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins); + } + nz_check 
= vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0); + if (nz_check) { + int16x8_t vtmp = vqaddq_s16(v_abs, vround); + + int16x8_t vtmp2; + if (qm_ptr == NULL) { + vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1); + } else { + vtmp2 = QM_MULL_SHIFT(vtmp, vwt); + vtmp2 = vaddq_s16(vtmp2, vtmp); + } + + int16x8_t ones = + vandq_s16(vshrq_n_s16(vmulq_s16(vtmp2, vquant_shift), 14), v_ones); + vtmp2 = + vaddq_s16(vshlq_s16(vqdmulhq_s16(vtmp2, vquant_shift), v_ones), ones); + int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign); + int16x8_t coeff_nz_mask = + vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i])); + store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask); + + if (iqm_ptr != NULL) { + viwt = vmovl_u8(vld1_u8(&iqm_ptr[i])); + vdequant = QM_MULL_SHIFT(vdequant, viwt); + } + int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16( + vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale)); + v_deq_abs = + vorrq_s16(vshlq_n_s16(vqdmulhq_s16(vtmp2, vdequant), 13), v_deq_abs); + vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign); + coeff_nz_mask = + vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i])); + store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask); + + uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero); + const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond); + int16x8_t v_iscan = vld1q_s16(&iscan[i]); + vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210)); + v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210); + } + } + *eob_ptr = get_max_eob(v_eobmax_76543210) + 1; +} + +void aom_quantize_b_helper_neon( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, const int log_scale) { + switch (log_scale) { // log_scale for AV1 encoder can be only 0, 1, 2 + case 0: + aom_quantize_b_helper_16x16_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, + iscan, qm_ptr, iqm_ptr); + break; + case 1: + aom_quantize_b_helper_32x32_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, + iscan, qm_ptr, iqm_ptr); + break; + case 2: + aom_quantize_b_helper_64x64_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, + iscan, qm_ptr, iqm_ptr); + break; + } +} + +void aom_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_quantize_b_helper_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, + NULL, NULL, 1); +} + +void aom_quantize_b_64x64_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t 
*eob_ptr,
+                               const int16_t *scan, const int16_t *iscan) {
+  aom_quantize_b_helper_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                             quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                             dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+                             NULL, NULL, 2);
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/rdopt_neon.c b/third_party/aom/av1/encoder/arm/neon/rdopt_neon.c
new file mode 100644
index 0000000000..7d3bd4c606
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/rdopt_neon.c
@@ -0,0 +1,459 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include <math.h>
+
+#include "av1/encoder/rdopt.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+// Process horizontal and vertical correlations in a 4x4 block of pixels.
+// We actually use the 4x4 pixels to calculate correlations corresponding to
+// the top-left 3x3 pixels, so this function must be called with 1x1 overlap,
+// moving the window along/down by 3 pixels at a time.
+INLINE static void horver_correlation_4x4(const int16_t *diff, int stride,
+                                          int32x4_t *xy_sum_32,
+                                          int32x4_t *xz_sum_32,
+                                          int32x4_t *x_sum_32,
+                                          int32x4_t *x2_sum_32) {
+  // Pixels in this 4x4   [ a b c d ]
+  // are referred to as:  [ e f g h ]
+  //                      [ i j k l ]
+  //                      [ m n o p ]
+
+  const int16x4_t pixelsa_2_lo = vld1_s16(diff + (0 * stride));
+  const int16x4_t pixelsa_2_sli =
+      vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(pixelsa_2_lo), 16));
+  const int16x4_t pixelsb_2_lo = vld1_s16(diff + (1 * stride));
+  const int16x4_t pixelsb_2_sli =
+      vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(pixelsb_2_lo), 16));
+  const int16x4_t pixelsa_1_lo = vld1_s16(diff + (2 * stride));
+  const int16x4_t pixelsa_1_sli =
+      vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(pixelsa_1_lo), 16));
+  const int16x4_t pixelsb_1_lo = vld1_s16(diff + (3 * stride));
+  const int16x4_t pixelsb_1_sli =
+      vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(pixelsb_1_lo), 16));
+
+  const int16x8_t slli_a = vcombine_s16(pixelsa_1_sli, pixelsa_2_sli);
+
+  *xy_sum_32 = vmlal_s16(*xy_sum_32, pixelsa_1_lo, pixelsa_1_sli);
+  *xy_sum_32 = vmlal_s16(*xy_sum_32, pixelsa_2_lo, pixelsa_2_sli);
+  *xy_sum_32 = vmlal_s16(*xy_sum_32, pixelsb_2_lo, pixelsb_2_sli);
+
+  *xz_sum_32 = vmlal_s16(*xz_sum_32, pixelsa_1_sli, pixelsb_1_sli);
+  *xz_sum_32 = vmlal_s16(*xz_sum_32, pixelsa_2_sli, pixelsb_2_sli);
+  *xz_sum_32 = vmlal_s16(*xz_sum_32, pixelsa_1_sli, pixelsb_2_sli);
+
+  // Now calculate the straight sums, x_sum += a+b+c+e+f+g+i+j+k
+  // (sum up every element in slli_a and pixelsb_2_sli)
+  *x_sum_32 = vpadalq_s16(*x_sum_32, slli_a);
+  *x_sum_32 = vaddw_s16(*x_sum_32, pixelsb_2_sli);
+
+  // Also sum their squares
+  *x2_sum_32 = vmlal_s16(*x2_sum_32, pixelsa_1_sli, pixelsa_1_sli);
+  *x2_sum_32 = vmlal_s16(*x2_sum_32, pixelsa_2_sli, pixelsa_2_sli);
+  *x2_sum_32 = vmlal_s16(*x2_sum_32, pixelsb_2_sli, pixelsb_2_sli);
+}
+
+void av1_get_horver_correlation_full_neon(const int16_t *diff, int stride,
+                                          int width, int height, float *hcorr,
+                                          float *vcorr) {
+  // The following notation is used:
+  // x - current pixel
+  // y
- right neighbour pixel + // z - below neighbour pixel + // w - down-right neighbour pixel + int64_t xy_sum = 0, xz_sum = 0; + int64_t x_sum = 0, x2_sum = 0; + int32x4_t zero = vdupq_n_s32(0); + int64x2_t v_x_sum = vreinterpretq_s64_s32(zero); + int64x2_t v_xy_sum = vreinterpretq_s64_s32(zero); + int64x2_t v_xz_sum = vreinterpretq_s64_s32(zero); + int64x2_t v_x2_sum = vreinterpretq_s64_s32(zero); + // Process horizontal and vertical correlations through the body in 4x4 + // blocks. This excludes the final row and column and possibly one extra + // column depending how 3 divides into width and height + + for (int i = 0; i <= height - 4; i += 3) { + int32x4_t xy_sum_32 = zero; + int32x4_t xz_sum_32 = zero; + int32x4_t x_sum_32 = zero; + int32x4_t x2_sum_32 = zero; + for (int j = 0; j <= width - 4; j += 3) { + horver_correlation_4x4(&diff[i * stride + j], stride, &xy_sum_32, + &xz_sum_32, &x_sum_32, &x2_sum_32); + } + v_xy_sum = vpadalq_s32(v_xy_sum, xy_sum_32); + v_xz_sum = vpadalq_s32(v_xz_sum, xz_sum_32); + v_x_sum = vpadalq_s32(v_x_sum, x_sum_32); + v_x2_sum = vpadalq_s32(v_x2_sum, x2_sum_32); + } +#if AOM_ARCH_AARCH64 + xy_sum = vaddvq_s64(v_xy_sum); + xz_sum = vaddvq_s64(v_xz_sum); + x2_sum = vaddvq_s64(v_x2_sum); + x_sum = vaddvq_s64(v_x_sum); +#else + xy_sum = vget_lane_s64( + vadd_s64(vget_low_s64(v_xy_sum), vget_high_s64(v_xy_sum)), 0); + xz_sum = vget_lane_s64( + vadd_s64(vget_low_s64(v_xz_sum), vget_high_s64(v_xz_sum)), 0); + x2_sum = vget_lane_s64( + vadd_s64(vget_low_s64(v_x2_sum), vget_high_s64(v_x2_sum)), 0); + x_sum = + vget_lane_s64(vadd_s64(vget_low_s64(v_x_sum), vget_high_s64(v_x_sum)), 0); +#endif + // x_sum now covers every pixel except the final 1-2 rows and 1-2 cols + int64_t x_finalrow = 0, x_finalcol = 0, x2_finalrow = 0, x2_finalcol = 0; + + // Do we have 2 rows remaining or just the one? Note that width and height + // are powers of 2, so each modulo 3 must be 1 or 2. 
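+  // Worked example: width = 16 gives 16 % 3 == 1, so a single final column
+  // remains; width = 32 gives 32 % 3 == 2, leaving two columns. The same
+  // reasoning applies to height in the branches below.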
+ if (height % 3 == 1) { // Just horiz corrs on the final row + const int16_t x0 = diff[(height - 1) * stride]; + x_sum += x0; + x_finalrow += x0; + x2_sum += x0 * x0; + x2_finalrow += x0 * x0; + if (width >= 8) { + int32x4_t v_y_sum = zero; + int32x4_t v_y2_sum = zero; + int32x4_t v_xy_sum_a = zero; + int k = width - 1; + int j = 0; + while ((k - 8) > 0) { + const int16x8_t v_x = vld1q_s16(&diff[(height - 1) * stride + j]); + const int16x8_t v_y = vld1q_s16(&diff[(height - 1) * stride + j + 1]); + const int16x4_t v_x_lo = vget_low_s16(v_x); + const int16x4_t v_x_hi = vget_high_s16(v_x); + const int16x4_t v_y_lo = vget_low_s16(v_y); + const int16x4_t v_y_hi = vget_high_s16(v_y); + v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo); + v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi); + v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo); + v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi); + v_y_sum = vpadalq_s16(v_y_sum, v_y); + k -= 8; + j += 8; + } + + const int16x8_t v_l = vld1q_s16(&diff[(height - 1) * stride] + j); + const int16x8_t v_x = + vextq_s16(vextq_s16(vreinterpretq_s16_s32(zero), v_l, 7), + vreinterpretq_s16_s32(zero), 1); + const int16x8_t v_y = vextq_s16(v_l, vreinterpretq_s16_s32(zero), 1); + const int16x4_t v_x_lo = vget_low_s16(v_x); + const int16x4_t v_x_hi = vget_high_s16(v_x); + const int16x4_t v_y_lo = vget_low_s16(v_y); + const int16x4_t v_y_hi = vget_high_s16(v_y); + v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo); + v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi); + v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo); + v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi); + const int32x4_t v_y_sum_a = vpadalq_s16(v_y_sum, v_y); + const int64x2_t v_xy_sum2 = vpaddlq_s32(v_xy_sum_a); +#if AOM_ARCH_AARCH64 + const int64x2_t v_y2_sum_a = vpaddlq_s32(v_y2_sum); + xy_sum += vaddvq_s64(v_xy_sum2); + const int32_t y = vaddvq_s32(v_y_sum_a); + const int64_t y2 = vaddvq_s64(v_y2_sum_a); +#else + xy_sum += vget_lane_s64( + vadd_s64(vget_low_s64(v_xy_sum2), vget_high_s64(v_xy_sum2)), 0); + const int64x2_t v_y_a = vpaddlq_s32(v_y_sum_a); + const int64_t y = + vget_lane_s64(vadd_s64(vget_low_s64(v_y_a), vget_high_s64(v_y_a)), 0); + const int64x2_t v_y2_sum_b = vpaddlq_s32(v_y2_sum); + int64_t y2 = vget_lane_s64( + vadd_s64(vget_low_s64(v_y2_sum_b), vget_high_s64(v_y2_sum_b)), 0); +#endif + x_sum += y; + x2_sum += y2; + x_finalrow += y; + x2_finalrow += y2; + } else { + for (int j = 0; j < width - 1; ++j) { + const int16_t x = diff[(height - 1) * stride + j]; + const int16_t y = diff[(height - 1) * stride + j + 1]; + xy_sum += x * y; + x_sum += y; + x2_sum += y * y; + x_finalrow += y; + x2_finalrow += y * y; + } + } + } else { // Two rows remaining to do + const int16_t x0 = diff[(height - 2) * stride]; + const int16_t z0 = diff[(height - 1) * stride]; + x_sum += x0 + z0; + x2_sum += x0 * x0 + z0 * z0; + x_finalrow += z0; + x2_finalrow += z0 * z0; + if (width >= 8) { + int32x4_t v_y2_sum = zero; + int32x4_t v_w2_sum = zero; + int32x4_t v_xy_sum_a = zero; + int32x4_t v_xz_sum_a = zero; + int32x4_t v_x_sum_a = zero; + int32x4_t v_w_sum = zero; + int k = width - 1; + int j = 0; + while ((k - 8) > 0) { + const int16x8_t v_x = vld1q_s16(&diff[(height - 2) * stride + j]); + const int16x8_t v_y = vld1q_s16(&diff[(height - 2) * stride + j + 1]); + const int16x8_t v_z = vld1q_s16(&diff[(height - 1) * stride + j]); + const int16x8_t v_w = vld1q_s16(&diff[(height - 1) * stride + j + 1]); + + const int16x4_t v_x_lo = vget_low_s16(v_x); + const int16x4_t v_y_lo = vget_low_s16(v_y); + const int16x4_t 
v_z_lo = vget_low_s16(v_z); + const int16x4_t v_w_lo = vget_low_s16(v_w); + const int16x4_t v_x_hi = vget_high_s16(v_x); + const int16x4_t v_y_hi = vget_high_s16(v_y); + const int16x4_t v_z_hi = vget_high_s16(v_z); + const int16x4_t v_w_hi = vget_high_s16(v_w); + + v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo); + v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi); + v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_lo, v_w_lo); + v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_hi, v_w_hi); + + v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_lo, v_z_lo); + v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_hi, v_z_hi); + + v_w2_sum = vmlal_s16(v_w2_sum, v_w_lo, v_w_lo); + v_w2_sum = vmlal_s16(v_w2_sum, v_w_hi, v_w_hi); + v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo); + v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi); + + v_w_sum = vpadalq_s16(v_w_sum, v_w); + v_x_sum_a = vpadalq_s16(v_x_sum_a, v_y); + v_x_sum_a = vpadalq_s16(v_x_sum_a, v_w); + + k -= 8; + j += 8; + } + const int16x8_t v_l = vld1q_s16(&diff[(height - 2) * stride] + j); + const int16x8_t v_x = + vextq_s16(vextq_s16(vreinterpretq_s16_s32(zero), v_l, 7), + vreinterpretq_s16_s32(zero), 1); + const int16x8_t v_y = vextq_s16(v_l, vreinterpretq_s16_s32(zero), 1); + const int16x8_t v_l_2 = vld1q_s16(&diff[(height - 1) * stride] + j); + const int16x8_t v_z = + vextq_s16(vextq_s16(vreinterpretq_s16_s32(zero), v_l_2, 7), + vreinterpretq_s16_s32(zero), 1); + const int16x8_t v_w = vextq_s16(v_l_2, vreinterpretq_s16_s32(zero), 1); + + const int16x4_t v_x_lo = vget_low_s16(v_x); + const int16x4_t v_y_lo = vget_low_s16(v_y); + const int16x4_t v_z_lo = vget_low_s16(v_z); + const int16x4_t v_w_lo = vget_low_s16(v_w); + const int16x4_t v_x_hi = vget_high_s16(v_x); + const int16x4_t v_y_hi = vget_high_s16(v_y); + const int16x4_t v_z_hi = vget_high_s16(v_z); + const int16x4_t v_w_hi = vget_high_s16(v_w); + + v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo); + v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi); + v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_lo, v_w_lo); + v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_hi, v_w_hi); + + v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_lo, v_z_lo); + v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_hi, v_z_hi); + + v_w2_sum = vmlal_s16(v_w2_sum, v_w_lo, v_w_lo); + v_w2_sum = vmlal_s16(v_w2_sum, v_w_hi, v_w_hi); + v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo); + v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi); + + v_w_sum = vpadalq_s16(v_w_sum, v_w); + v_x_sum_a = vpadalq_s16(v_x_sum_a, v_y); + v_x_sum_a = vpadalq_s16(v_x_sum_a, v_w); + +#if AOM_ARCH_AARCH64 + xy_sum += vaddvq_s64(vpaddlq_s32(v_xy_sum_a)); + xz_sum += vaddvq_s64(vpaddlq_s32(v_xz_sum_a)); + x_sum += vaddvq_s32(v_x_sum_a); + x_finalrow += vaddvq_s32(v_w_sum); + int64_t y2 = vaddvq_s64(vpaddlq_s32(v_y2_sum)); + int64_t w2 = vaddvq_s64(vpaddlq_s32(v_w2_sum)); +#else + const int64x2_t v_xy_sum2 = vpaddlq_s32(v_xy_sum_a); + xy_sum += vget_lane_s64( + vadd_s64(vget_low_s64(v_xy_sum2), vget_high_s64(v_xy_sum2)), 0); + const int64x2_t v_xz_sum2 = vpaddlq_s32(v_xz_sum_a); + xz_sum += vget_lane_s64( + vadd_s64(vget_low_s64(v_xz_sum2), vget_high_s64(v_xz_sum2)), 0); + const int64x2_t v_x_sum2 = vpaddlq_s32(v_x_sum_a); + x_sum += vget_lane_s64( + vadd_s64(vget_low_s64(v_x_sum2), vget_high_s64(v_x_sum2)), 0); + const int64x2_t v_w_sum_a = vpaddlq_s32(v_w_sum); + x_finalrow += vget_lane_s64( + vadd_s64(vget_low_s64(v_w_sum_a), vget_high_s64(v_w_sum_a)), 0); + const int64x2_t v_y2_sum_a = vpaddlq_s32(v_y2_sum); + int64_t y2 = vget_lane_s64( + vadd_s64(vget_low_s64(v_y2_sum_a), vget_high_s64(v_y2_sum_a)), 0); + const 
int64x2_t v_w2_sum_a = vpaddlq_s32(v_w2_sum); + int64_t w2 = vget_lane_s64( + vadd_s64(vget_low_s64(v_w2_sum_a), vget_high_s64(v_w2_sum_a)), 0); +#endif + x2_sum += y2 + w2; + x2_finalrow += w2; + } else { + for (int j = 0; j < width - 1; ++j) { + const int16_t x = diff[(height - 2) * stride + j]; + const int16_t y = diff[(height - 2) * stride + j + 1]; + const int16_t z = diff[(height - 1) * stride + j]; + const int16_t w = diff[(height - 1) * stride + j + 1]; + + // Horizontal and vertical correlations for the penultimate row: + xy_sum += x * y; + xz_sum += x * z; + + // Now just horizontal correlations for the final row: + xy_sum += z * w; + + x_sum += y + w; + x2_sum += y * y + w * w; + x_finalrow += w; + x2_finalrow += w * w; + } + } + } + + // Do we have 2 columns remaining or just the one? + if (width % 3 == 1) { // Just vert corrs on the final col + const int16_t x0 = diff[width - 1]; + x_sum += x0; + x_finalcol += x0; + x2_sum += x0 * x0; + x2_finalcol += x0 * x0; + for (int i = 0; i < height - 1; ++i) { + const int16_t x = diff[i * stride + width - 1]; + const int16_t z = diff[(i + 1) * stride + width - 1]; + xz_sum += x * z; + x_finalcol += z; + x2_finalcol += z * z; + // So the bottom-right elements don't get counted twice: + if (i < height - (height % 3 == 1 ? 2 : 3)) { + x_sum += z; + x2_sum += z * z; + } + } + } else { // Two cols remaining + const int16_t x0 = diff[width - 2]; + const int16_t y0 = diff[width - 1]; + x_sum += x0 + y0; + x2_sum += x0 * x0 + y0 * y0; + x_finalcol += y0; + x2_finalcol += y0 * y0; + for (int i = 0; i < height - 1; ++i) { + const int16_t x = diff[i * stride + width - 2]; + const int16_t y = diff[i * stride + width - 1]; + const int16_t z = diff[(i + 1) * stride + width - 2]; + const int16_t w = diff[(i + 1) * stride + width - 1]; + + // Horizontal and vertical correlations for the penultimate col: + // Skip these on the last iteration of this loop if we also had two + // rows remaining, otherwise the final horizontal and vertical correlation + // get erroneously processed twice + if (i < height - 2 || height % 3 == 1) { + xy_sum += x * y; + xz_sum += x * z; + } + + x_finalcol += w; + x2_finalcol += w * w; + // So the bottom-right elements don't get counted twice: + if (i < height - (height % 3 == 1 ? 
2 : 3)) { + x_sum += z + w; + x2_sum += z * z + w * w; + } + + // Now just vertical correlations for the final column: + xz_sum += y * w; + } + } + + // Calculate the simple sums and squared-sums + int64_t x_firstrow = 0, x_firstcol = 0; + int64_t x2_firstrow = 0, x2_firstcol = 0; + + if (width >= 8) { + int32x4_t v_x_firstrow = zero; + int32x4_t v_x2_firstrow = zero; + for (int j = 0; j < width; j += 8) { + const int16x8_t v_diff = vld1q_s16(diff + j); + const int16x4_t v_diff_lo = vget_low_s16(v_diff); + const int16x4_t v_diff_hi = vget_high_s16(v_diff); + v_x_firstrow = vpadalq_s16(v_x_firstrow, v_diff); + v_x2_firstrow = vmlal_s16(v_x2_firstrow, v_diff_lo, v_diff_lo); + v_x2_firstrow = vmlal_s16(v_x2_firstrow, v_diff_hi, v_diff_hi); + } +#if AOM_ARCH_AARCH64 + x_firstrow += vaddvq_s32(v_x_firstrow); + x2_firstrow += vaddvq_s32(v_x2_firstrow); +#else + const int64x2_t v_x_firstrow_64 = vpaddlq_s32(v_x_firstrow); + x_firstrow += vget_lane_s64( + vadd_s64(vget_low_s64(v_x_firstrow_64), vget_high_s64(v_x_firstrow_64)), + 0); + const int64x2_t v_x2_firstrow_64 = vpaddlq_s32(v_x2_firstrow); + x2_firstrow += vget_lane_s64(vadd_s64(vget_low_s64(v_x2_firstrow_64), + vget_high_s64(v_x2_firstrow_64)), + 0); +#endif + } else { + for (int j = 0; j < width; ++j) { + x_firstrow += diff[j]; + x2_firstrow += diff[j] * diff[j]; + } + } + for (int i = 0; i < height; ++i) { + x_firstcol += diff[i * stride]; + x2_firstcol += diff[i * stride] * diff[i * stride]; + } + + int64_t xhor_sum = x_sum - x_finalcol; + int64_t xver_sum = x_sum - x_finalrow; + int64_t y_sum = x_sum - x_firstcol; + int64_t z_sum = x_sum - x_firstrow; + int64_t x2hor_sum = x2_sum - x2_finalcol; + int64_t x2ver_sum = x2_sum - x2_finalrow; + int64_t y2_sum = x2_sum - x2_firstcol; + int64_t z2_sum = x2_sum - x2_firstrow; + + const float num_hor = (float)(height * (width - 1)); + const float num_ver = (float)((height - 1) * width); + + const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor; + const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver; + + const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor; + const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver; + + const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor; + const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver; + + if (xhor_var_n > 0 && y_var_n > 0) { + *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n); + *hcorr = *hcorr < 0 ? 0 : *hcorr; + } else { + *hcorr = 1.0; + } + if (xver_var_n > 0 && z_var_n > 0) { + *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n); + *vcorr = *vcorr < 0 ? 0 : *vcorr; + } else { + *vcorr = 1.0; + } +} diff --git a/third_party/aom/av1/encoder/arm/neon/reconinter_enc_neon.c b/third_party/aom/av1/encoder/arm/neon/reconinter_enc_neon.c new file mode 100644 index 0000000000..3d17723224 --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/reconinter_enc_neon.c @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/mem_neon.h"
+
+#include "av1/encoder/reconinter_enc.h"
+
+void aom_upsampled_pred_neon(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+                             int mi_row, int mi_col, const MV *const mv,
+                             uint8_t *comp_pred, int width, int height,
+                             int subpel_x_q3, int subpel_y_q3,
+                             const uint8_t *ref, int ref_stride,
+                             int subpel_search) {
+  // expect xd == NULL only in tests
+  if (xd != NULL) {
+    const MB_MODE_INFO *mi = xd->mi[0];
+    const int ref_num = 0;
+    const int is_intrabc = is_intrabc_block(mi);
+    const struct scale_factors *const sf =
+        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
+    const int is_scaled = av1_is_scaled(sf);
+
+    if (is_scaled) {
+      int plane = 0;
+      const int mi_x = mi_col * MI_SIZE;
+      const int mi_y = mi_row * MI_SIZE;
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      const struct buf_2d *const dst_buf = &pd->dst;
+      const struct buf_2d *const pre_buf =
+          is_intrabc ? dst_buf : &pd->pre[ref_num];
+
+      InterPredParams inter_pred_params;
+      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+      const int_interpfilters filters =
+          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+      av1_init_inter_params(
+          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
+          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
+          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
+      av1_enc_build_one_inter_predictor(comp_pred, width, mv,
+                                        &inter_pred_params);
+      return;
+    }
+  }
+
+  const InterpFilterParams *filter_params = av1_get_filter(subpel_search);
+
+  if (!subpel_x_q3 && !subpel_y_q3) {
+    if (width > 8) {
+      assert(width % 16 == 0);
+      int i = height;
+      do {
+        int j = 0;
+        do {
+          uint8x16_t r = vld1q_u8(ref + j);
+          vst1q_u8(comp_pred + j, r);
+          j += 16;
+        } while (j < width);
+        ref += ref_stride;
+        comp_pred += width;
+      } while (--i != 0);
+    } else if (width == 8) {
+      int i = height;
+      do {
+        uint8x8_t r = vld1_u8(ref);
+        vst1_u8(comp_pred, r);
+        ref += ref_stride;
+        comp_pred += width;
+      } while (--i != 0);
+    } else {
+      assert(width == 4);
+      int i = height / 2;
+      do {
+        uint8x8_t r = load_unaligned_u8(ref, ref_stride);
+        vst1_u8(comp_pred, r);
+        ref += 2 * ref_stride;
+        comp_pred += 2 * width;
+      } while (--i != 0);
+    }
+  } else if (!subpel_y_q3) {
+    const int16_t *const filter_x =
+        av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q3 << 1);
+    aom_convolve8_horiz(ref, ref_stride, comp_pred, width, filter_x, 16, NULL,
+                        -1, width, height);
+  } else if (!subpel_x_q3) {
+    const int16_t *const filter_y =
+        av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q3 << 1);
+    aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, filter_y,
+                       16, width, height);
+  } else {
+    DECLARE_ALIGNED(16, uint8_t,
+                    im_block[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
+
+    const int16_t *const filter_x =
+        av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q3 << 1);
+    const int16_t *const filter_y =
+        av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q3 << 1);
+
+    const int im_stride = MAX_SB_SIZE;
+    const int im_height = (((height - 1) * 8 + subpel_y_q3) >> 3) + SUBPEL_TAPS;
+
+    const int ref_vert_offset = ref_stride * ((SUBPEL_TAPS >> 1) - 1);
+    const int im_vert_offset = im_stride * ((filter_params->taps >> 1) - 1);
+
+    assert(im_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+    aom_convolve8_horiz(ref - ref_vert_offset, ref_stride, im_block,
+                        MAX_SB_SIZE, filter_x, 16, NULL, -1,
width, im_height); + aom_convolve8_vert(im_block + im_vert_offset, MAX_SB_SIZE, comp_pred, width, + NULL, -1, filter_y, 16, width, height); + } +} + +void aom_comp_avg_upsampled_pred_neon(MACROBLOCKD *xd, + const AV1_COMMON *const cm, int mi_row, + int mi_col, const MV *const mv, + uint8_t *comp_pred, const uint8_t *pred, + int width, int height, int subpel_x_q3, + int subpel_y_q3, const uint8_t *ref, + int ref_stride, int subpel_search) { + aom_upsampled_pred_neon(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride, + subpel_search); + + aom_comp_avg_pred_neon(comp_pred, pred, width, height, comp_pred, width); +} + +void aom_dist_wtd_comp_avg_upsampled_pred_neon( + MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) { + aom_upsampled_pred_neon(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride, + subpel_search); + + aom_dist_wtd_comp_avg_pred_neon(comp_pred, pred, width, height, comp_pred, + width, jcp_param); +} + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_upsampled_pred_neon(MACROBLOCKD *xd, + const struct AV1Common *const cm, + int mi_row, int mi_col, const MV *const mv, + uint8_t *comp_pred8, int width, int height, + int subpel_x_q3, int subpel_y_q3, + const uint8_t *ref8, int ref_stride, int bd, + int subpel_search) { + // expect xd == NULL only in tests + if (xd != NULL) { + const MB_MODE_INFO *mi = xd->mi[0]; + const int ref_num = 0; + const int is_intrabc = is_intrabc_block(mi); + const struct scale_factors *const sf = + is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num]; + const int is_scaled = av1_is_scaled(sf); + + if (is_scaled) { + int plane = 0; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const struct buf_2d *const dst_buf = &pd->dst; + const struct buf_2d *const pre_buf = + is_intrabc ? 
dst_buf : &pd->pre[ref_num]; + + InterPredParams inter_pred_params; + inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); + const int_interpfilters filters = + av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + av1_init_inter_params( + &inter_pred_params, width, height, mi_y >> pd->subsampling_y, + mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y, + xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters); + av1_enc_build_one_inter_predictor(comp_pred8, width, mv, + &inter_pred_params); + return; + } + } + + const InterpFilterParams *filter = av1_get_filter(subpel_search); + + if (!subpel_x_q3 && !subpel_y_q3) { + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + if (width > 4) { + assert(width % 8 == 0); + int i = height; + do { + int j = 0; + do { + uint16x8_t r = vld1q_u16(ref + j); + vst1q_u16(comp_pred + j, r); + j += 8; + } while (j < width); + ref += ref_stride; + comp_pred += width; + } while (--i != 0); + } else if (width == 4) { + int i = height; + do { + uint16x4_t r = vld1_u16(ref); + vst1_u16(comp_pred, r); + ref += ref_stride; + comp_pred += width; + } while (--i != 0); + } else { + assert(width == 2); + int i = height / 2; + do { + uint16x4_t r = load_u16_2x2(ref, ref_stride); + store_u16x2_strided_x2(comp_pred, width, r); + ref += 2 * ref_stride; + comp_pred += 2 * width; + } while (--i != 0); + } + } else if (!subpel_y_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + aom_highbd_convolve8_horiz_neon(ref8, ref_stride, comp_pred8, width, kernel, + 16, NULL, -1, width, height, bd); + } else if (!subpel_x_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + aom_highbd_convolve8_vert_neon(ref8, ref_stride, comp_pred8, width, NULL, + -1, kernel, 16, width, height, bd); + } else { + DECLARE_ALIGNED(16, uint16_t, + temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]); + const int16_t *const kernel_x = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + const int16_t *const kernel_y = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + const int intermediate_height = + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps; + assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); + aom_highbd_convolve8_horiz_neon( + ref8 - ref_stride * ((filter->taps >> 1) - 1), ref_stride, + CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, + intermediate_height, bd); + aom_highbd_convolve8_vert_neon( + CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)), + MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height, + bd); + } +} + +void aom_highbd_comp_avg_upsampled_pred_neon( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, int bd, int subpel_search) { + aom_highbd_upsampled_pred_neon(xd, cm, mi_row, mi_col, mv, comp_pred8, width, + height, subpel_x_q3, subpel_y_q3, ref8, + ref_stride, bd, subpel_search); + + aom_highbd_comp_avg_pred_neon(comp_pred8, pred8, width, height, comp_pred8, + width); +} + +void aom_highbd_dist_wtd_comp_avg_upsampled_pred_neon( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int 
subpel_y_q3, const uint8_t *ref8,
+    int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param,
+    int subpel_search) {
+  aom_highbd_upsampled_pred_neon(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+                                 height, subpel_x_q3, subpel_y_q3, ref8,
+                                 ref_stride, bd, subpel_search);
+
+  aom_highbd_dist_wtd_comp_avg_pred_neon(comp_pred8, pred8, width, height,
+                                         comp_pred8, width, jcp_param);
+}
+
+#endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/av1/encoder/arm/neon/shift_neon.h b/third_party/aom/av1/encoder/arm/neon/shift_neon.h
new file mode 100644
index 0000000000..d73aef2f25
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/shift_neon.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ARM_NEON_SHIFT_NEON_H_
+#define AOM_AV1_ENCODER_ARM_NEON_SHIFT_NEON_H_
+
+#include <arm_neon.h>
+
+#include "aom/aom_integer.h"  // For AOM_INLINE.
+
+#define SHIFT_LOOP_HELPER(name, type, intrinsic, arg)                \
+  static AOM_INLINE void name(const type *in, type *out, int size) { \
+    int i = 0;                                                       \
+    do {                                                             \
+      out[i] = intrinsic(in[i], arg);                                \
+    } while (++i < size);                                            \
+  }
+
+SHIFT_LOOP_HELPER(shift_left_2_s16_x4, int16x4_t, vshl_n_s16, 2)
+SHIFT_LOOP_HELPER(shift_left_2_s16_x8, int16x8_t, vshlq_n_s16, 2)
+SHIFT_LOOP_HELPER(shift_left_2_s32_x4, int32x4_t, vshlq_n_s32, 2)
+SHIFT_LOOP_HELPER(shift_right_2_round_s16_x8, int16x8_t, vrshrq_n_s16, 2)
+SHIFT_LOOP_HELPER(shift_right_2_round_s32_x4, int32x4_t, vrshrq_n_s32, 2)
+SHIFT_LOOP_HELPER(shift_right_4_round_s16_x8, int16x8_t, vrshrq_n_s16, 4)
+SHIFT_LOOP_HELPER(shift_right_4_round_s32_x4, int32x4_t, vrshrq_n_s32, 4)
+
+// Addition instructions have slightly better performance compared to shift
+// instructions on some micro-architectures, so use these for shifts by one.
+
+SHIFT_LOOP_HELPER(shift_left_1_s16_x4, int16x4_t, vadd_s16, in[i])
+SHIFT_LOOP_HELPER(shift_left_1_s16_x8, int16x8_t, vaddq_s16, in[i])
+SHIFT_LOOP_HELPER(shift_right_1_round_s16_x4, int16x4_t, vrhadd_s16,
+                  vdup_n_s16(0))
+SHIFT_LOOP_HELPER(shift_right_1_round_s16_x8, int16x8_t, vrhaddq_s16,
+                  vdupq_n_s16(0))
+SHIFT_LOOP_HELPER(shift_right_1_round_s32_x4, int32x4_t, vrhaddq_s32,
+                  vdupq_n_s32(0))
+
+#undef SHIFT_LOOP_HELPER
+
+#endif  // AOM_AV1_ENCODER_ARM_NEON_SHIFT_NEON_H_
diff --git a/third_party/aom/av1/encoder/arm/neon/temporal_filter_neon.c b/third_party/aom/av1/encoder/arm/neon/temporal_filter_neon.c
new file mode 100644
index 0000000000..986f143864
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/temporal_filter_neon.c
@@ -0,0 +1,548 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+#include "aom_dsp/mathutils.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+// For the squared error buffer, add padding for 4 samples.
+#define SSE_STRIDE (BW + 4)
+
+// When using vld1q_u16_x4 compilers may insert an alignment hint of 256 bits.
+DECLARE_ALIGNED(32, static const uint16_t, kSlidingWindowMask[]) = {
+  0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000,
+  0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000,
+  0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000,
+  0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF
+};
+
+static INLINE void get_squared_error(
+    const uint8_t *frame1, const uint32_t stride1, const uint8_t *frame2,
+    const uint32_t stride2, const uint32_t block_width,
+    const uint32_t block_height, uint16_t *frame_sse,
+    const unsigned int dst_stride) {
+  uint16_t *dst = frame_sse;
+
+  uint32_t i = 0;
+  do {
+    uint32_t j = 0;
+    do {
+      uint8x16_t s = vld1q_u8(frame1 + i * stride1 + j);
+      uint8x16_t r = vld1q_u8(frame2 + i * stride2 + j);
+
+      uint8x16_t abs_diff = vabdq_u8(s, r);
+      uint16x8_t sse_lo =
+          vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff));
+      uint16x8_t sse_hi =
+          vmull_u8(vget_high_u8(abs_diff), vget_high_u8(abs_diff));
+
+      vst1q_u16(dst + j + 2, sse_lo);
+      vst1q_u16(dst + j + 10, sse_hi);
+
+      j += 16;
+    } while (j < block_width);
+
+    dst += dst_stride;
+  } while (++i < block_height);
+}
+
+static INLINE uint16x8_t load_and_pad(const uint16_t *src, const uint32_t col,
+                                      const uint32_t block_width) {
+  uint16x8_t s = vld1q_u16(src);
+
+  if (col == 0) {
+    const uint16_t lane2 = vgetq_lane_u16(s, 2);
+    s = vsetq_lane_u16(lane2, s, 0);
+    s = vsetq_lane_u16(lane2, s, 1);
+  } else if (col >= block_width - 4) {
+    const uint16_t lane5 = vgetq_lane_u16(s, 5);
+    s = vsetq_lane_u16(lane5, s, 6);
+    s = vsetq_lane_u16(lane5, s, 7);
+  }
+  return s;
+}
+
+static void apply_temporal_filter(
+    const uint8_t *frame, const unsigned int stride,
+    const uint32_t block_width, const uint32_t block_height,
+    const int *subblock_mses, unsigned int *accumulator, uint16_t *count,
+    const uint16_t *frame_sse, const uint32_t *luma_sse_sum,
+    const double inv_num_ref_pixels, const double decay_factor,
+    const double inv_factor, const double weight_factor,
+    const double *d_factor, int tf_wgt_calc_lvl) {
+  assert(((block_width == 16) || (block_width == 32)) &&
+         ((block_height == 16) || (block_height == 32)));
+
+  uint32_t acc_5x5_neon[BH][BW];
+  const uint16x8x4_t vmask = vld1q_u16_x4(kSlidingWindowMask);
+
+  // Traverse 4 columns at a time - first and last two columns need padding.
+  for (uint32_t col = 0; col < block_width; col += 4) {
+    uint16x8_t vsrc[5];
+    const uint16_t *src = frame_sse + col;
+
+    // Load and pad (for first and last two columns) 3 rows from the top.
+    for (int i = 2; i < 5; i++) {
+      vsrc[i] = load_and_pad(src, col, block_width);
+      src += SSE_STRIDE;
+    }
+
+    // Pad the top 2 rows.
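+    // (Rows 0 and 1 replicate row 2, mirroring what load_and_pad does for
+    // the first and last two columns, so the 5x5 window is fully defined at
+    // the block edges.)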
+ vsrc[0] = vsrc[2]; + vsrc[1] = vsrc[2]; + + for (unsigned int row = 0; row < block_height; row++) { + for (int i = 0; i < 4; i++) { + uint32x4_t vsum = vdupq_n_u32(0); + for (int j = 0; j < 5; j++) { + vsum = vpadalq_u16(vsum, vandq_u16(vsrc[j], vmask.val[i])); + } + acc_5x5_neon[row][col + i] = horizontal_add_u32x4(vsum); + } + + // Push all rows in the sliding window up one. + for (int i = 0; i < 4; i++) { + vsrc[i] = vsrc[i + 1]; + } + + if (row <= block_height - 4) { + // Load next row into the bottom of the sliding window. + vsrc[4] = load_and_pad(src, col, block_width); + src += SSE_STRIDE; + } else { + // Pad the bottom 2 rows. + vsrc[4] = vsrc[3]; + } + } + } + + // Perform filtering. + if (tf_wgt_calc_lvl == 0) { + for (unsigned int i = 0, k = 0; i < block_height; i++) { + for (unsigned int j = 0; j < block_width; j++, k++) { + const int pixel_value = frame[i * stride + j]; + const uint32_t diff_sse = acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j]; + + const double window_error = diff_sse * inv_num_ref_pixels; + const int subblock_idx = + (i >= block_height / 2) * 2 + (j >= block_width / 2); + const double block_error = (double)subblock_mses[subblock_idx]; + const double combined_error = + weight_factor * window_error + block_error * inv_factor; + // Compute filter weight. + double scaled_error = + combined_error * d_factor[subblock_idx] * decay_factor; + scaled_error = AOMMIN(scaled_error, 7); + const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE); + accumulator[k] += weight * pixel_value; + count[k] += weight; + } + } + } else { + for (unsigned int i = 0, k = 0; i < block_height; i++) { + for (unsigned int j = 0; j < block_width; j++, k++) { + const int pixel_value = frame[i * stride + j]; + const uint32_t diff_sse = acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j]; + + const double window_error = diff_sse * inv_num_ref_pixels; + const int subblock_idx = + (i >= block_height / 2) * 2 + (j >= block_width / 2); + const double block_error = (double)subblock_mses[subblock_idx]; + const double combined_error = + weight_factor * window_error + block_error * inv_factor; + // Compute filter weight. + double scaled_error = + combined_error * d_factor[subblock_idx] * decay_factor; + scaled_error = AOMMIN(scaled_error, 7); + const float fweight = + approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE; + const int weight = iroundpf(fweight); + accumulator[k] += weight * pixel_value; + count[k] += weight; + } + } + } +} + +void av1_apply_temporal_filter_neon( + const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd, + const BLOCK_SIZE block_size, const int mb_row, const int mb_col, + const int num_planes, const double *noise_levels, const MV *subblock_mvs, + const int *subblock_mses, const int q_factor, const int filter_strength, + int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, + uint16_t *count) { + const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH; + assert(block_size == BLOCK_32X32 && "Only support 32x32 block with Neon!"); + assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with Neon!"); + assert(!is_high_bitdepth && "Only support low bit-depth with Neon!"); + assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE); + (void)is_high_bitdepth; + + // Block information. + const int mb_height = block_size_high[block_size]; + const int mb_width = block_size_wide[block_size]; + // Frame information. 
+ const int frame_height = frame_to_filter->y_crop_height; + const int frame_width = frame_to_filter->y_crop_width; + const int min_frame_size = AOMMIN(frame_height, frame_width); + // Variables to simplify combined error calculation. + const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) * + TF_SEARCH_ERROR_NORM_WEIGHT); + const double weight_factor = + (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor; + // Adjust filtering based on q. + // Larger q -> stronger filtering -> larger weight. + // Smaller q -> weaker filtering -> smaller weight. + double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2); + q_decay = CLIP(q_decay, 1e-5, 1); + if (q_factor >= TF_QINDEX_CUTOFF) { + // Max q_factor is 255, therefore the upper bound of q_decay is 8. + // We do not need a clip here. + q_decay = 0.5 * pow((double)q_factor / 64, 2); + } + // Smaller strength -> smaller filtering weight. + double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2); + s_decay = CLIP(s_decay, 1e-5, 1); + double d_factor[4] = { 0 }; + uint16_t frame_sse[SSE_STRIDE * BH] = { 0 }; + uint32_t luma_sse_sum[BW * BH] = { 0 }; + + for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) { + // Larger motion vector -> smaller filtering weight. + const MV mv = subblock_mvs[subblock_idx]; + const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2)); + double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD; + distance_threshold = AOMMAX(distance_threshold, 1); + d_factor[subblock_idx] = distance / distance_threshold; + d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1); + } + + // Handle planes in sequence. + int plane_offset = 0; + for (int plane = 0; plane < num_planes; ++plane) { + const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y; + const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x; + const uint32_t frame_stride = + frame_to_filter->strides[plane == AOM_PLANE_Y ? 0 : 1]; + const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w; + + const uint8_t *ref = frame_to_filter->buffers[plane] + frame_offset; + const int ss_x_shift = + mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x; + const int ss_y_shift = + mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y; + const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH + + ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0); + const double inv_num_ref_pixels = 1.0 / num_ref_pixels; + // Larger noise -> larger filtering weight. + const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0); + // Decay factors for non-local mean approach. + const double decay_factor = 1 / (n_decay * q_decay * s_decay); + + // Filter U-plane and V-plane using Y-plane. This is because motion + // search is only done on Y-plane, so the information from Y-plane + // will be more accurate. The luma sse sum is reused in both chroma + // planes. + if (plane == AOM_PLANE_U) { + for (unsigned int i = 0; i < plane_h; i++) { + for (unsigned int j = 0; j < plane_w; j++) { + for (int ii = 0; ii < (1 << ss_y_shift); ++ii) { + for (int jj = 0; jj < (1 << ss_x_shift); ++jj) { + const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane. + const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane. 
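+              // Accumulate the squared error of every luma sample that this
+              // chroma position covers; the "+ 2" skips the two columns of
+              // horizontal padding in the SSE buffer (see SSE_STRIDE).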
+ luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx + 2]; + } + } + } + } + } + + get_squared_error(ref, frame_stride, pred + plane_offset, plane_w, plane_w, + plane_h, frame_sse, SSE_STRIDE); + + apply_temporal_filter(pred + plane_offset, plane_w, plane_w, plane_h, + subblock_mses, accum + plane_offset, + count + plane_offset, frame_sse, luma_sse_sum, + inv_num_ref_pixels, decay_factor, inv_factor, + weight_factor, d_factor, tf_wgt_calc_lvl); + + plane_offset += plane_h * plane_w; + } +} + +double av1_estimate_noise_from_single_plane_neon(const uint8_t *src, int height, + int width, int stride, + int edge_thresh) { + uint16x8_t thresh = vdupq_n_u16(edge_thresh); + uint32x4_t acc = vdupq_n_u32(0); + // Count is in theory positive as it counts the number of times we're under + // the threshold, but it will be counted negatively in order to make best use + // of the vclt instruction, which sets every bit of a lane to 1 when the + // condition is true. + int32x4_t count = vdupq_n_s32(0); + int final_count = 0; + int64_t final_acc = 0; + const uint8_t *src_start = src + stride + 1; + int h = 1; + + do { + int w = 1; + const uint8_t *src_ptr = src_start; + + while (w <= (width - 1) - 16) { + uint8x16_t mat[3][3]; + mat[0][0] = vld1q_u8(src_ptr - stride - 1); + mat[0][1] = vld1q_u8(src_ptr - stride); + mat[0][2] = vld1q_u8(src_ptr - stride + 1); + mat[1][0] = vld1q_u8(src_ptr - 1); + mat[1][1] = vld1q_u8(src_ptr); + mat[1][2] = vld1q_u8(src_ptr + 1); + mat[2][0] = vld1q_u8(src_ptr + stride - 1); + mat[2][1] = vld1q_u8(src_ptr + stride); + mat[2][2] = vld1q_u8(src_ptr + stride + 1); + + // Compute Sobel gradients. + uint16x8_t gxa_lo = + vaddl_u8(vget_low_u8(mat[0][0]), vget_low_u8(mat[2][0])); + uint16x8_t gxa_hi = + vaddl_u8(vget_high_u8(mat[0][0]), vget_high_u8(mat[2][0])); + uint16x8_t gxb_lo = + vaddl_u8(vget_low_u8(mat[0][2]), vget_low_u8(mat[2][2])); + uint16x8_t gxb_hi = + vaddl_u8(vget_high_u8(mat[0][2]), vget_high_u8(mat[2][2])); + gxa_lo = vaddq_u16( + gxa_lo, vaddl_u8(vget_low_u8(mat[1][0]), vget_low_u8(mat[1][0]))); + gxa_hi = vaddq_u16( + gxa_hi, vaddl_u8(vget_high_u8(mat[1][0]), vget_high_u8(mat[1][0]))); + gxb_lo = vaddq_u16( + gxb_lo, vaddl_u8(vget_low_u8(mat[1][2]), vget_low_u8(mat[1][2]))); + gxb_hi = vaddq_u16( + gxb_hi, vaddl_u8(vget_high_u8(mat[1][2]), vget_high_u8(mat[1][2]))); + + uint16x8_t gya_lo = + vaddl_u8(vget_low_u8(mat[0][0]), vget_low_u8(mat[0][2])); + uint16x8_t gya_hi = + vaddl_u8(vget_high_u8(mat[0][0]), vget_high_u8(mat[0][2])); + uint16x8_t gyb_lo = + vaddl_u8(vget_low_u8(mat[2][0]), vget_low_u8(mat[2][2])); + uint16x8_t gyb_hi = + vaddl_u8(vget_high_u8(mat[2][0]), vget_high_u8(mat[2][2])); + gya_lo = vaddq_u16( + gya_lo, vaddl_u8(vget_low_u8(mat[0][1]), vget_low_u8(mat[0][1]))); + gya_hi = vaddq_u16( + gya_hi, vaddl_u8(vget_high_u8(mat[0][1]), vget_high_u8(mat[0][1]))); + gyb_lo = vaddq_u16( + gyb_lo, vaddl_u8(vget_low_u8(mat[2][1]), vget_low_u8(mat[2][1]))); + gyb_hi = vaddq_u16( + gyb_hi, vaddl_u8(vget_high_u8(mat[2][1]), vget_high_u8(mat[2][1]))); + + uint16x8_t ga_lo = vabaq_u16(vabdq_u16(gxa_lo, gxb_lo), gya_lo, gyb_lo); + uint16x8_t ga_hi = vabaq_u16(vabdq_u16(gxa_hi, gxb_hi), gya_hi, gyb_hi); + + // Check which vector elements are under the threshold. The Laplacian is + // then unconditionally computed and we accumulate zeros if we're not + // under the threshold. This is much faster than using an if statement. 
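+      // The value accumulated below is the 3x3 Laplacian
+      //   [ 1 -2  1; -2  4 -2; 1 -2  1 ]
+      // i.e. 4 * center + diagonals - 2 * adjacent, matching the scalar tail
+      // loop at the end of this function.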
+ uint16x8_t thresh_u16_lo = vcltq_u16(ga_lo, thresh); + uint16x8_t thresh_u16_hi = vcltq_u16(ga_hi, thresh); + + uint16x8_t center_lo = vshll_n_u8(vget_low_u8(mat[1][1]), 2); + uint16x8_t center_hi = vshll_n_u8(vget_high_u8(mat[1][1]), 2); + + uint16x8_t adj0_lo = + vaddl_u8(vget_low_u8(mat[0][1]), vget_low_u8(mat[2][1])); + uint16x8_t adj0_hi = + vaddl_u8(vget_high_u8(mat[0][1]), vget_high_u8(mat[2][1])); + uint16x8_t adj1_lo = + vaddl_u8(vget_low_u8(mat[1][0]), vget_low_u8(mat[1][2])); + uint16x8_t adj1_hi = + vaddl_u8(vget_high_u8(mat[1][0]), vget_high_u8(mat[1][2])); + uint16x8_t adj_lo = vaddq_u16(adj0_lo, adj1_lo); + adj_lo = vaddq_u16(adj_lo, adj_lo); + uint16x8_t adj_hi = vaddq_u16(adj0_hi, adj1_hi); + adj_hi = vaddq_u16(adj_hi, adj_hi); + + uint16x8_t diag0_lo = + vaddl_u8(vget_low_u8(mat[0][0]), vget_low_u8(mat[0][2])); + uint16x8_t diag0_hi = + vaddl_u8(vget_high_u8(mat[0][0]), vget_high_u8(mat[0][2])); + uint16x8_t diag1_lo = + vaddl_u8(vget_low_u8(mat[2][0]), vget_low_u8(mat[2][2])); + uint16x8_t diag1_hi = + vaddl_u8(vget_high_u8(mat[2][0]), vget_high_u8(mat[2][2])); + uint16x8_t diag_lo = vaddq_u16(diag0_lo, diag1_lo); + uint16x8_t diag_hi = vaddq_u16(diag0_hi, diag1_hi); + + uint16x8_t v_lo = vaddq_u16(center_lo, diag_lo); + v_lo = vabdq_u16(v_lo, adj_lo); + uint16x8_t v_hi = vaddq_u16(center_hi, diag_hi); + v_hi = vabdq_u16(v_hi, adj_hi); + + acc = vpadalq_u16(acc, vandq_u16(v_lo, thresh_u16_lo)); + acc = vpadalq_u16(acc, vandq_u16(v_hi, thresh_u16_hi)); + + // Add -1 for each lane where the gradient is under the threshold. + count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16_lo)); + count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16_hi)); + + w += 16; + src_ptr += 16; + } + + if (w <= (width - 1) - 8) { + uint8x8_t mat[3][3]; + mat[0][0] = vld1_u8(src_ptr - stride - 1); + mat[0][1] = vld1_u8(src_ptr - stride); + mat[0][2] = vld1_u8(src_ptr - stride + 1); + mat[1][0] = vld1_u8(src_ptr - 1); + mat[1][1] = vld1_u8(src_ptr); + mat[1][2] = vld1_u8(src_ptr + 1); + mat[2][0] = vld1_u8(src_ptr + stride - 1); + mat[2][1] = vld1_u8(src_ptr + stride); + mat[2][2] = vld1_u8(src_ptr + stride + 1); + + // Compute Sobel gradients. + uint16x8_t gxa = vaddl_u8(mat[0][0], mat[2][0]); + uint16x8_t gxb = vaddl_u8(mat[0][2], mat[2][2]); + gxa = vaddq_u16(gxa, vaddl_u8(mat[1][0], mat[1][0])); + gxb = vaddq_u16(gxb, vaddl_u8(mat[1][2], mat[1][2])); + + uint16x8_t gya = vaddl_u8(mat[0][0], mat[0][2]); + uint16x8_t gyb = vaddl_u8(mat[2][0], mat[2][2]); + gya = vaddq_u16(gya, vaddl_u8(mat[0][1], mat[0][1])); + gyb = vaddq_u16(gyb, vaddl_u8(mat[2][1], mat[2][1])); + + uint16x8_t ga = vabaq_u16(vabdq_u16(gxa, gxb), gya, gyb); + + // Check which vector elements are under the threshold. The Laplacian is + // then unconditionally computed and we accumulate zeros if we're not + // under the threshold. This is much faster than using an if statement. + uint16x8_t thresh_u16 = vcltq_u16(ga, thresh); + + uint16x8_t center = vshll_n_u8(mat[1][1], 2); + + uint16x8_t adj0 = vaddl_u8(mat[0][1], mat[2][1]); + uint16x8_t adj1 = vaddl_u8(mat[1][0], mat[1][2]); + uint16x8_t adj = vaddq_u16(adj0, adj1); + adj = vaddq_u16(adj, adj); + + uint16x8_t diag0 = vaddl_u8(mat[0][0], mat[0][2]); + uint16x8_t diag1 = vaddl_u8(mat[2][0], mat[2][2]); + uint16x8_t diag = vaddq_u16(diag0, diag1); + + uint16x8_t v = vaddq_u16(center, diag); + v = vabdq_u16(v, adj); + + acc = vpadalq_u16(acc, vandq_u16(v, thresh_u16)); + // Add -1 for each lane where the gradient is under the threshold. 
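+      // (Each all-ones comparison lane reads as -1 once reinterpreted as
+      // signed, so the pairwise add-accumulate below decrements the running
+      // count once per qualifying pixel.)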
+ count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16)); + + w += 8; + src_ptr += 8; + } + + if (w <= (width - 1) - 4) { + uint16x8_t mask = vcombine_u16(vdup_n_u16(65535), vdup_n_u16(0)); + uint8x8_t mat[3][3]; + mat[0][0] = load_u8_4x1(src_ptr - stride - 1); + mat[0][1] = load_u8_4x1(src_ptr - stride); + mat[0][2] = load_u8_4x1(src_ptr - stride + 1); + mat[1][0] = load_u8_4x1(src_ptr - 1); + mat[1][1] = load_u8_4x1(src_ptr); + mat[1][2] = load_u8_4x1(src_ptr + 1); + mat[2][0] = load_u8_4x1(src_ptr + stride - 1); + mat[2][1] = load_u8_4x1(src_ptr + stride); + mat[2][2] = load_u8_4x1(src_ptr + stride + 1); + + // Compute Sobel gradients. + uint16x8_t gxa = vaddl_u8(mat[0][0], mat[2][0]); + uint16x8_t gxb = vaddl_u8(mat[0][2], mat[2][2]); + gxa = vaddq_u16(gxa, vaddl_u8(mat[1][0], mat[1][0])); + gxb = vaddq_u16(gxb, vaddl_u8(mat[1][2], mat[1][2])); + + uint16x8_t gya = vaddl_u8(mat[0][0], mat[0][2]); + uint16x8_t gyb = vaddl_u8(mat[2][0], mat[2][2]); + gya = vaddq_u16(gya, vaddl_u8(mat[0][1], mat[0][1])); + gyb = vaddq_u16(gyb, vaddl_u8(mat[2][1], mat[2][1])); + + uint16x8_t ga = vabaq_u16(vabdq_u16(gxa, gxb), gya, gyb); + + // Check which vector elements are under the threshold. The Laplacian is + // then unconditionally computed and we accumulate zeros if we're not + // under the threshold. This is much faster than using an if statement. + uint16x8_t thresh_u16 = vandq_u16(vcltq_u16(ga, thresh), mask); + + uint16x8_t center = vshll_n_u8(mat[1][1], 2); + + uint16x8_t adj0 = vaddl_u8(mat[0][1], mat[2][1]); + uint16x8_t adj1 = vaddl_u8(mat[1][0], mat[1][2]); + uint16x8_t adj = vaddq_u16(adj0, adj1); + adj = vaddq_u16(adj, adj); + + uint16x8_t diag0 = vaddl_u8(mat[0][0], mat[0][2]); + uint16x8_t diag1 = vaddl_u8(mat[2][0], mat[2][2]); + uint16x8_t diag = vaddq_u16(diag0, diag1); + + uint16x8_t v = vaddq_u16(center, diag); + v = vabdq_u16(v, adj); + + acc = vpadalq_u16(acc, vandq_u16(v, thresh_u16)); + // Add -1 for each lane where the gradient is under the threshold. + count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16)); + + w += 4; + src_ptr += 4; + } + + while (w < width - 1) { + int mat[3][3]; + mat[0][0] = *(src_ptr - stride - 1); + mat[0][1] = *(src_ptr - stride); + mat[0][2] = *(src_ptr - stride + 1); + mat[1][0] = *(src_ptr - 1); + mat[1][1] = *(src_ptr); + mat[1][2] = *(src_ptr + 1); + mat[2][0] = *(src_ptr + stride - 1); + mat[2][1] = *(src_ptr + stride); + mat[2][2] = *(src_ptr + stride + 1); + + // Compute Sobel gradients. + const int gx = (mat[0][0] - mat[0][2]) + (mat[2][0] - mat[2][2]) + + 2 * (mat[1][0] - mat[1][2]); + const int gy = (mat[0][0] - mat[2][0]) + (mat[0][2] - mat[2][2]) + + 2 * (mat[0][1] - mat[2][1]); + const int ga = abs(gx) + abs(gy); + + // Accumulate Laplacian. + const int is_under = ga < edge_thresh; + const int v = 4 * mat[1][1] - + 2 * (mat[0][1] + mat[2][1] + mat[1][0] + mat[1][2]) + + (mat[0][0] + mat[0][2] + mat[2][0] + mat[2][2]); + final_acc += abs(v) * is_under; + final_count += is_under; + + src_ptr++; + w++; + } + src_start += stride; + } while (++h < height - 1); + + // We counted negatively, so subtract to get the final value. + final_count -= horizontal_add_s32x4(count); + final_acc += horizontal_long_add_u32x4(acc); + return (final_count < 16) + ? 
-1.0
+             : (double)final_acc / (6 * final_count) * SQRT_PI_BY_2;
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c b/third_party/aom/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c
new file mode 100644
index 0000000000..5a52e701a2
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c
@@ -0,0 +1,299 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+#include "aom_dsp/mathutils.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+// For the squared error buffer, add padding for 4 samples.
+#define SSE_STRIDE (BW + 4)
+
+// clang-format off
+
+DECLARE_ALIGNED(16, static const uint8_t, kSlidingWindowMask[]) = {
+  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00,
+  0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00,
+  0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00,
+  0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
+};
+
+// clang-format on
+
+static INLINE void get_abs_diff(const uint8_t *frame1, const uint32_t stride1,
+                                const uint8_t *frame2, const uint32_t stride2,
+                                const uint32_t block_width,
+                                const uint32_t block_height,
+                                uint8_t *frame_abs_diff,
+                                const unsigned int dst_stride) {
+  uint8_t *dst = frame_abs_diff;
+
+  uint32_t i = 0;
+  do {
+    uint32_t j = 0;
+    do {
+      uint8x16_t s = vld1q_u8(frame1 + i * stride1 + j);
+      uint8x16_t r = vld1q_u8(frame2 + i * stride2 + j);
+      uint8x16_t abs_diff = vabdq_u8(s, r);
+      vst1q_u8(dst + j + 2, abs_diff);
+      j += 16;
+    } while (j < block_width);
+
+    dst += dst_stride;
+  } while (++i < block_height);
+}
+
+static INLINE uint8x16_t load_and_pad(const uint8_t *src, const uint32_t col,
+                                      const uint32_t block_width) {
+  uint8x8_t s = vld1_u8(src);
+
+  if (col == 0) {
+    const uint8_t lane2 = vget_lane_u8(s, 2);
+    s = vset_lane_u8(lane2, s, 0);
+    s = vset_lane_u8(lane2, s, 1);
+  } else if (col >= block_width - 4) {
+    const uint8_t lane5 = vget_lane_u8(s, 5);
+    s = vset_lane_u8(lane5, s, 6);
+    s = vset_lane_u8(lane5, s, 7);
+  }
+  return vcombine_u8(s, s);
+}
+
+static void apply_temporal_filter(
+    const uint8_t *frame, const unsigned int stride,
+    const uint32_t block_width, const uint32_t block_height,
+    const int *subblock_mses, unsigned int *accumulator, uint16_t *count,
+    const uint8_t *frame_abs_diff, const uint32_t *luma_sse_sum,
+    const double inv_num_ref_pixels, const double decay_factor,
+    const double inv_factor, const double weight_factor,
+    const double *d_factor, int tf_wgt_calc_lvl) {
+  assert(((block_width == 16) || (block_width == 32)) &&
+         ((block_height == 16) || (block_height == 32)));
+
+  uint32_t acc_5x5_neon[BH][BW];
+  const uint8x16x2_t vmask = vld1q_u8_x2(kSlidingWindowMask);
+
+  // Traverse 4 columns at a time - first and last two columns need padding.
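+  // Keeping 8-bit absolute differences (rather than pre-squared 16-bit
+  // values) lets vdotq_u32 do the squaring and the accumulation in one step:
+  // masking a row with the sliding window mask and taking its dot product
+  // with itself yields the 5-tap horizontal sum of squared errors.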
+ for (uint32_t col = 0; col < block_width; col += 4) { + uint8x16_t vsrc[5][2]; + const uint8_t *src = frame_abs_diff + col; + + // Load, pad (for first and last two columns) and mask 3 rows from the top. + for (int i = 2; i < 5; i++) { + const uint8x16_t s = load_and_pad(src, col, block_width); + vsrc[i][0] = vandq_u8(s, vmask.val[0]); + vsrc[i][1] = vandq_u8(s, vmask.val[1]); + src += SSE_STRIDE; + } + + // Pad the top 2 rows. + vsrc[0][0] = vsrc[2][0]; + vsrc[0][1] = vsrc[2][1]; + vsrc[1][0] = vsrc[2][0]; + vsrc[1][1] = vsrc[2][1]; + + for (unsigned int row = 0; row < block_height; row++) { + uint32x4_t sum_01 = vdupq_n_u32(0); + uint32x4_t sum_23 = vdupq_n_u32(0); + + sum_01 = vdotq_u32(sum_01, vsrc[0][0], vsrc[0][0]); + sum_01 = vdotq_u32(sum_01, vsrc[1][0], vsrc[1][0]); + sum_01 = vdotq_u32(sum_01, vsrc[2][0], vsrc[2][0]); + sum_01 = vdotq_u32(sum_01, vsrc[3][0], vsrc[3][0]); + sum_01 = vdotq_u32(sum_01, vsrc[4][0], vsrc[4][0]); + + sum_23 = vdotq_u32(sum_23, vsrc[0][1], vsrc[0][1]); + sum_23 = vdotq_u32(sum_23, vsrc[1][1], vsrc[1][1]); + sum_23 = vdotq_u32(sum_23, vsrc[2][1], vsrc[2][1]); + sum_23 = vdotq_u32(sum_23, vsrc[3][1], vsrc[3][1]); + sum_23 = vdotq_u32(sum_23, vsrc[4][1], vsrc[4][1]); + + vst1q_u32(&acc_5x5_neon[row][col], vpaddq_u32(sum_01, sum_23)); + + // Push all rows in the sliding window up one. + for (int i = 0; i < 4; i++) { + vsrc[i][0] = vsrc[i + 1][0]; + vsrc[i][1] = vsrc[i + 1][1]; + } + + if (row <= block_height - 4) { + // Load next row into the bottom of the sliding window. + uint8x16_t s = load_and_pad(src, col, block_width); + vsrc[4][0] = vandq_u8(s, vmask.val[0]); + vsrc[4][1] = vandq_u8(s, vmask.val[1]); + src += SSE_STRIDE; + } else { + // Pad the bottom 2 rows. + vsrc[4][0] = vsrc[3][0]; + vsrc[4][1] = vsrc[3][1]; + } + } + } + + // Perform filtering. + if (tf_wgt_calc_lvl == 0) { + for (unsigned int i = 0, k = 0; i < block_height; i++) { + for (unsigned int j = 0; j < block_width; j++, k++) { + const int pixel_value = frame[i * stride + j]; + const uint32_t diff_sse = acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j]; + + const double window_error = diff_sse * inv_num_ref_pixels; + const int subblock_idx = + (i >= block_height / 2) * 2 + (j >= block_width / 2); + const double block_error = (double)subblock_mses[subblock_idx]; + const double combined_error = + weight_factor * window_error + block_error * inv_factor; + // Compute filter weight. + double scaled_error = + combined_error * d_factor[subblock_idx] * decay_factor; + scaled_error = AOMMIN(scaled_error, 7); + const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE); + accumulator[k] += weight * pixel_value; + count[k] += weight; + } + } + } else { + for (unsigned int i = 0, k = 0; i < block_height; i++) { + for (unsigned int j = 0; j < block_width; j++, k++) { + const int pixel_value = frame[i * stride + j]; + const uint32_t diff_sse = acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j]; + + const double window_error = diff_sse * inv_num_ref_pixels; + const int subblock_idx = + (i >= block_height / 2) * 2 + (j >= block_width / 2); + const double block_error = (double)subblock_mses[subblock_idx]; + const double combined_error = + weight_factor * window_error + block_error * inv_factor; + // Compute filter weight. 
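+        // (When tf_wgt_calc_lvl != 0 this branch uses the cheaper
+        // approx_exp() approximation in place of exp().)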
+ double scaled_error = + combined_error * d_factor[subblock_idx] * decay_factor; + scaled_error = AOMMIN(scaled_error, 7); + const float fweight = + approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE; + const int weight = iroundpf(fweight); + accumulator[k] += weight * pixel_value; + count[k] += weight; + } + } + } +} + +void av1_apply_temporal_filter_neon_dotprod( + const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd, + const BLOCK_SIZE block_size, const int mb_row, const int mb_col, + const int num_planes, const double *noise_levels, const MV *subblock_mvs, + const int *subblock_mses, const int q_factor, const int filter_strength, + int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, + uint16_t *count) { + const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH; + assert(block_size == BLOCK_32X32 && "Only support 32x32 block with Neon!"); + assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with Neon!"); + assert(!is_high_bitdepth && "Only support low bit-depth with Neon!"); + assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE); + (void)is_high_bitdepth; + + // Block information. + const int mb_height = block_size_high[block_size]; + const int mb_width = block_size_wide[block_size]; + // Frame information. + const int frame_height = frame_to_filter->y_crop_height; + const int frame_width = frame_to_filter->y_crop_width; + const int min_frame_size = AOMMIN(frame_height, frame_width); + // Variables to simplify combined error calculation. + const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) * + TF_SEARCH_ERROR_NORM_WEIGHT); + const double weight_factor = + (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor; + // Adjust filtering based on q. + // Larger q -> stronger filtering -> larger weight. + // Smaller q -> weaker filtering -> smaller weight. + double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2); + q_decay = CLIP(q_decay, 1e-5, 1); + if (q_factor >= TF_QINDEX_CUTOFF) { + // Max q_factor is 255, therefore the upper bound of q_decay is 8. + // We do not need a clip here. + q_decay = 0.5 * pow((double)q_factor / 64, 2); + } + // Smaller strength -> smaller filtering weight. + double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2); + s_decay = CLIP(s_decay, 1e-5, 1); + double d_factor[4] = { 0 }; + uint8_t frame_abs_diff[SSE_STRIDE * BH] = { 0 }; + uint32_t luma_sse_sum[BW * BH] = { 0 }; + + for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) { + // Larger motion vector -> smaller filtering weight. + const MV mv = subblock_mvs[subblock_idx]; + const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2)); + double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD; + distance_threshold = AOMMAX(distance_threshold, 1); + d_factor[subblock_idx] = distance / distance_threshold; + d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1); + } + + // Handle planes in sequence. + int plane_offset = 0; + for (int plane = 0; plane < num_planes; ++plane) { + const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y; + const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x; + const uint32_t frame_stride = + frame_to_filter->strides[plane == AOM_PLANE_Y ? 
0 : 1]; + const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w; + + const uint8_t *ref = frame_to_filter->buffers[plane] + frame_offset; + const int ss_x_shift = + mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x; + const int ss_y_shift = + mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y; + const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH + + ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0); + const double inv_num_ref_pixels = 1.0 / num_ref_pixels; + // Larger noise -> larger filtering weight. + const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0); + // Decay factors for non-local mean approach. + const double decay_factor = 1 / (n_decay * q_decay * s_decay); + + // Filter U-plane and V-plane using Y-plane. This is because motion + // search is only done on Y-plane, so the information from Y-plane + // will be more accurate. The luma sse sum is reused in both chroma + // planes. + if (plane == AOM_PLANE_U) { + for (unsigned int i = 0; i < plane_h; i++) { + for (unsigned int j = 0; j < plane_w; j++) { + for (int ii = 0; ii < (1 << ss_y_shift); ++ii) { + for (int jj = 0; jj < (1 << ss_x_shift); ++jj) { + const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane. + const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane. + luma_sse_sum[i * BW + j] += + (frame_abs_diff[yy * SSE_STRIDE + xx + 2] * + frame_abs_diff[yy * SSE_STRIDE + xx + 2]); + } + } + } + } + } + + get_abs_diff(ref, frame_stride, pred + plane_offset, plane_w, plane_w, + plane_h, frame_abs_diff, SSE_STRIDE); + + apply_temporal_filter(pred + plane_offset, plane_w, plane_w, plane_h, + subblock_mses, accum + plane_offset, + count + plane_offset, frame_abs_diff, luma_sse_sum, + inv_num_ref_pixels, decay_factor, inv_factor, + weight_factor, d_factor, tf_wgt_calc_lvl); + + plane_offset += plane_h * plane_w; + } +} diff --git a/third_party/aom/av1/encoder/arm/neon/txfm_neon.h b/third_party/aom/av1/encoder/arm/neon/txfm_neon.h new file mode 100644 index 0000000000..635364f46a --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/txfm_neon.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_ARM_NEON_TXFM_NEON_H_ +#define AOM_AV1_ENCODER_ARM_NEON_TXFM_NEON_H_ + +#include "aom/aom_integer.h" // For AOM_INLINE. + +static AOM_INLINE void ud_adjust_input_and_stride(int ud_flip, + const int16_t **input, + int *stride, int out_size) { + if (ud_flip) { + *input = *input + (out_size - 1) * *stride; + *stride = -*stride; + } +} + +#endif // AOM_AV1_ENCODER_ARM_NEON_TXFM_NEON_H_ diff --git a/third_party/aom/av1/encoder/arm/neon/wedge_utils_neon.c b/third_party/aom/av1/encoder/arm/neon/wedge_utils_neon.c new file mode 100644 index 0000000000..1b35269b33 --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/wedge_utils_neon.c @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2022, Alliance for Open Media. 
All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/sum_neon.h"
+#include "av1/common/reconinter.h"
+
+#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
+
+/**
+ * See av1_wedge_sse_from_residuals_c for details of the parameters and
+ * computation.
+ */
+uint64_t av1_wedge_sse_from_residuals_neon(const int16_t *r1, const int16_t *d,
+                                           const uint8_t *m, int N) {
+  assert(N % 64 == 0);
+
+  uint64x2_t v_csse[2] = { vdupq_n_u64(0), vdupq_n_u64(0) };
+
+  int i = 0;
+  do {
+    int32x4_t sum[4];
+    int32x4_t sse[2];
+    int16x4_t sum_s16[4];
+
+    const int16x8_t r1_l = vld1q_s16(r1 + i);
+    const int16x8_t r1_h = vld1q_s16(r1 + i + 8);
+    const int16x8_t d_l = vld1q_s16(d + i);
+    const int16x8_t d_h = vld1q_s16(d + i + 8);
+    // The following three lines are a bit inelegant compared to using a pair
+    // of vmovl_u8()... but it forces the compiler to emit a ZIP1, ZIP2 pair -
+    // which can be executed in parallel with the subsequent SSHL instructions.
+    // (SSHL can only be executed on half of the Neon pipes in modern Arm
+    // cores, whereas ZIP1/2 can be executed on all of them.)
+    const uint8x16x2_t m_u16 = vzipq_u8(vld1q_u8(m + i), vdupq_n_u8(0));
+    const int16x8_t m_l = vreinterpretq_s16_u8(m_u16.val[0]);
+    const int16x8_t m_h = vreinterpretq_s16_u8(m_u16.val[1]);
+
+    sum[0] = vshll_n_s16(vget_low_s16(r1_l), WEDGE_WEIGHT_BITS);
+    sum[1] = vshll_n_s16(vget_high_s16(r1_l), WEDGE_WEIGHT_BITS);
+    sum[2] = vshll_n_s16(vget_low_s16(r1_h), WEDGE_WEIGHT_BITS);
+    sum[3] = vshll_n_s16(vget_high_s16(r1_h), WEDGE_WEIGHT_BITS);
+
+    sum[0] = vmlal_s16(sum[0], vget_low_s16(m_l), vget_low_s16(d_l));
+    sum[1] = vmlal_s16(sum[1], vget_high_s16(m_l), vget_high_s16(d_l));
+    sum[2] = vmlal_s16(sum[2], vget_low_s16(m_h), vget_low_s16(d_h));
+    sum[3] = vmlal_s16(sum[3], vget_high_s16(m_h), vget_high_s16(d_h));
+
+    sum_s16[0] = vqmovn_s32(sum[0]);
+    sum_s16[1] = vqmovn_s32(sum[1]);
+    sum_s16[2] = vqmovn_s32(sum[2]);
+    sum_s16[3] = vqmovn_s32(sum[3]);
+
+    sse[0] = vmull_s16(sum_s16[0], sum_s16[0]);
+    sse[1] = vmull_s16(sum_s16[2], sum_s16[2]);
+    sse[0] = vmlal_s16(sse[0], sum_s16[1], sum_s16[1]);
+    sse[1] = vmlal_s16(sse[1], sum_s16[3], sum_s16[3]);
+
+    v_csse[0] = vpadalq_u32(v_csse[0], vreinterpretq_u32_s32(sse[0]));
+    v_csse[1] = vpadalq_u32(v_csse[1], vreinterpretq_u32_s32(sse[1]));
+
+    i += 16;
+  } while (i < N);
+
+  uint64_t csse = horizontal_add_u64x2(vaddq_u64(v_csse[0], v_csse[1]));
+  return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
+}
+
+int8_t av1_wedge_sign_from_residuals_neon(const int16_t *ds, const uint8_t *m,
+                                          int N, int64_t limit) {
+  int32x4_t acc[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
+                       vdupq_n_s32(0) };
+
+  do {
+    int16x8_t ds_l = vld1q_s16(ds);
+    int16x8_t ds_h = vld1q_s16(ds + 8);
+
+    int8x16_t m_s8 = vreinterpretq_s8_u8(vld1q_u8(m));
+    int16x8_t m_l = vmovl_s8(vget_low_s8(m_s8));
+    int16x8_t m_h = vmovl_s8(vget_high_s8(m_s8));
+
+    acc[0] = vmlal_s16(acc[0], vget_low_s16(ds_l), vget_low_s16(m_l));
+    acc[1] = vmlal_s16(acc[1], vget_high_s16(ds_l), vget_high_s16(m_l));
+    acc[2] = vmlal_s16(acc[2], vget_low_s16(ds_h), vget_low_s16(m_h));
+    acc[3] = vmlal_s16(acc[3], vget_high_s16(ds_h), vget_high_s16(m_h));
+
+    ds += 16;
+    m += 16;
+    N -= 16;
+  } while (N != 0);
+
+  int64x2_t sum = vpaddlq_s32(acc[0]);
+  sum = vpadalq_s32(sum, acc[1]);
+  sum = vpadalq_s32(sum, acc[2]);
+  sum = vpadalq_s32(sum, acc[3]);
+
+  return (horizontal_add_s64x2(sum) > limit);
+}
+
+void av1_wedge_compute_delta_squares_neon(int16_t *d_ptr, const int16_t *a_ptr,
+                                          const int16_t *b_ptr, int N) {
+  do {
+    int16x8_t a = vld1q_s16(a_ptr);
+    int16x8_t b = vld1q_s16(b_ptr);
+
+    int32x4_t sq_lo = vmull_s16(vget_low_s16(a), vget_low_s16(a));
+    int32x4_t sq_hi = vmull_s16(vget_high_s16(a), vget_high_s16(a));
+
+    sq_lo = vmlsl_s16(sq_lo, vget_low_s16(b), vget_low_s16(b));
+    sq_hi = vmlsl_s16(sq_hi, vget_high_s16(b), vget_high_s16(b));
+
+    int16x8_t res = vcombine_s16(vqmovn_s32(sq_lo), vqmovn_s32(sq_hi));
+
+    vst1q_s16(d_ptr, res);
+
+    d_ptr += 8;
+    a_ptr += 8;
+    b_ptr += 8;
+    N -= 8;
+  } while (N != 0);
+}
diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d.c b/third_party/aom/av1/encoder/av1_fwd_txfm1d.c
new file mode 100644
index 0000000000..6601c19ab3
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d.c
@@ -0,0 +1,1885 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include "av1/encoder/av1_fwd_txfm1d.h"
+#include "av1/common/av1_txfm.h"
+
+void av1_fdct4(const int32_t *input, int32_t *output, int8_t cos_bit,
+               const int8_t *stage_range) {
+  const int32_t size = 4;
+  const int32_t *cospi;
+
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;
+  int32_t step[4];
+
+  // stage 0;
+  av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+  // stage 1;
+  stage++;
+  bf1 = output;
+  bf1[0] = input[0] + input[3];
+  bf1[1] = input[1] + input[2];
+  bf1[2] = -input[2] + input[1];
+  bf1[3] = -input[3] + input[0];
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2
+  stage++;
+  cospi = cospi_arr(cos_bit);
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
+  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
+  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[2];
+  bf1[2] = bf0[1];
+  bf1[3] = bf0[3];
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+}
+
+void av1_fdct8(const int32_t *input, int32_t *output, int8_t cos_bit,
+               const int8_t *stage_range) {
+  const int32_t size = 8;
+  const int32_t *cospi;
+
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;
+  int32_t step[8];
+
+  // stage 0;
+  av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+  // stage 1;
+  stage++;
+  bf1 = output;
+  bf1[0] = input[0] + input[7];
+  bf1[1] = input[1] + input[6];
+  bf1[2] = input[2] + input[5];
+  bf1[3] = input[3] + input[4];
+  bf1[4] = -input[4] + input[3];
+  bf1[5] =
-input[5] + input[2]; + bf1[6] = -input[6] + input[1]; + bf1[7] = -input[7] + input[0]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0] + bf0[3]; + bf1[1] = bf0[1] + bf0[2]; + bf1[2] = -bf0[2] + bf0[1]; + bf1[3] = -bf0[3] + bf0[0]; + bf1[4] = bf0[4]; + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit); + bf1[7] = bf0[7]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit); + bf1[4] = bf0[4] + bf0[5]; + bf1[5] = -bf0[5] + bf0[4]; + bf1[6] = -bf0[6] + bf0[7]; + bf1[7] = bf0[7] + bf0[6]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit); + bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[4]; + bf1[2] = bf0[2]; + bf1[3] = bf0[6]; + bf1[4] = bf0[1]; + bf1[5] = bf0[5]; + bf1[6] = bf0[3]; + bf1[7] = bf0[7]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); +} + +void av1_fdct16(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + const int32_t size = 16; + const int32_t *cospi; + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[16]; + + // stage 0; + av1_range_check_buf(stage, input, input, size, stage_range[stage]); + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0] + input[15]; + bf1[1] = input[1] + input[14]; + bf1[2] = input[2] + input[13]; + bf1[3] = input[3] + input[12]; + bf1[4] = input[4] + input[11]; + bf1[5] = input[5] + input[10]; + bf1[6] = input[6] + input[9]; + bf1[7] = input[7] + input[8]; + bf1[8] = -input[8] + input[7]; + bf1[9] = -input[9] + input[6]; + bf1[10] = -input[10] + input[5]; + bf1[11] = -input[11] + input[4]; + bf1[12] = -input[12] + input[3]; + bf1[13] = -input[13] + input[2]; + bf1[14] = -input[14] + input[1]; + bf1[15] = -input[15] + input[0]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0] + bf0[7]; + bf1[1] = bf0[1] + bf0[6]; + bf1[2] = bf0[2] + bf0[5]; + bf1[3] = bf0[3] + bf0[4]; + bf1[4] = -bf0[4] + bf0[3]; + bf1[5] = -bf0[5] + bf0[2]; + bf1[6] = -bf0[6] + bf0[1]; + bf1[7] = -bf0[7] + bf0[0]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit); + bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit); + bf1[14] = bf0[14]; + bf1[15] = 
bf0[15]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[3]; + bf1[1] = bf0[1] + bf0[2]; + bf1[2] = -bf0[2] + bf0[1]; + bf1[3] = -bf0[3] + bf0[0]; + bf1[4] = bf0[4]; + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit); + bf1[7] = bf0[7]; + bf1[8] = bf0[8] + bf0[11]; + bf1[9] = bf0[9] + bf0[10]; + bf1[10] = -bf0[10] + bf0[9]; + bf1[11] = -bf0[11] + bf0[8]; + bf1[12] = -bf0[12] + bf0[15]; + bf1[13] = -bf0[13] + bf0[14]; + bf1[14] = bf0[14] + bf0[13]; + bf1[15] = bf0[15] + bf0[12]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit); + bf1[4] = bf0[4] + bf0[5]; + bf1[5] = -bf0[5] + bf0[4]; + bf1[6] = -bf0[6] + bf0[7]; + bf1[7] = bf0[7] + bf0[6]; + bf1[8] = bf0[8]; + bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); + bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit); + bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit); + bf1[15] = bf0[15]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit); + bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit); + bf1[8] = bf0[8] + bf0[9]; + bf1[9] = -bf0[9] + bf0[8]; + bf1[10] = -bf0[10] + bf0[11]; + bf1[11] = bf0[11] + bf0[10]; + bf1[12] = bf0[12] + bf0[13]; + bf1[13] = -bf0[13] + bf0[12]; + bf1[14] = -bf0[14] + bf0[15]; + bf1[15] = bf0[15] + bf0[14]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit); + bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit); + bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit); + bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit); + bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit); + bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit); + bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[8]; + bf1[2] = bf0[4]; + bf1[3] = bf0[12]; + bf1[4] = bf0[2]; + bf1[5] = bf0[10]; + bf1[6] = bf0[6]; + bf1[7] = bf0[14]; + bf1[8] = bf0[1]; + bf1[9] = bf0[9]; + bf1[10] = 
bf0[5]; + bf1[11] = bf0[13]; + bf1[12] = bf0[3]; + bf1[13] = bf0[11]; + bf1[14] = bf0[7]; + bf1[15] = bf0[15]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); +} + +void av1_fdct32(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + const int32_t size = 32; + const int32_t *cospi; + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[32]; + + // stage 0; + av1_range_check_buf(stage, input, input, size, stage_range[stage]); + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0] + input[31]; + bf1[1] = input[1] + input[30]; + bf1[2] = input[2] + input[29]; + bf1[3] = input[3] + input[28]; + bf1[4] = input[4] + input[27]; + bf1[5] = input[5] + input[26]; + bf1[6] = input[6] + input[25]; + bf1[7] = input[7] + input[24]; + bf1[8] = input[8] + input[23]; + bf1[9] = input[9] + input[22]; + bf1[10] = input[10] + input[21]; + bf1[11] = input[11] + input[20]; + bf1[12] = input[12] + input[19]; + bf1[13] = input[13] + input[18]; + bf1[14] = input[14] + input[17]; + bf1[15] = input[15] + input[16]; + bf1[16] = -input[16] + input[15]; + bf1[17] = -input[17] + input[14]; + bf1[18] = -input[18] + input[13]; + bf1[19] = -input[19] + input[12]; + bf1[20] = -input[20] + input[11]; + bf1[21] = -input[21] + input[10]; + bf1[22] = -input[22] + input[9]; + bf1[23] = -input[23] + input[8]; + bf1[24] = -input[24] + input[7]; + bf1[25] = -input[25] + input[6]; + bf1[26] = -input[26] + input[5]; + bf1[27] = -input[27] + input[4]; + bf1[28] = -input[28] + input[3]; + bf1[29] = -input[29] + input[2]; + bf1[30] = -input[30] + input[1]; + bf1[31] = -input[31] + input[0]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0] + bf0[15]; + bf1[1] = bf0[1] + bf0[14]; + bf1[2] = bf0[2] + bf0[13]; + bf1[3] = bf0[3] + bf0[12]; + bf1[4] = bf0[4] + bf0[11]; + bf1[5] = bf0[5] + bf0[10]; + bf1[6] = bf0[6] + bf0[9]; + bf1[7] = bf0[7] + bf0[8]; + bf1[8] = -bf0[8] + bf0[7]; + bf1[9] = -bf0[9] + bf0[6]; + bf1[10] = -bf0[10] + bf0[5]; + bf1[11] = -bf0[11] + bf0[4]; + bf1[12] = -bf0[12] + bf0[3]; + bf1[13] = -bf0[13] + bf0[2]; + bf1[14] = -bf0[14] + bf0[1]; + bf1[15] = -bf0[15] + bf0[0]; + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = bf0[18]; + bf1[19] = bf0[19]; + bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); + bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit); + bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit); + bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit); + bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit); + bf1[28] = bf0[28]; + bf1[29] = bf0[29]; + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[7]; + bf1[1] = bf0[1] + bf0[6]; + bf1[2] = bf0[2] + bf0[5]; + bf1[3] = bf0[3] + bf0[4]; + bf1[4] = -bf0[4] + bf0[3]; + bf1[5] = -bf0[5] + bf0[2]; + bf1[6] = -bf0[6] + bf0[1]; + bf1[7] = -bf0[7] + bf0[0]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], 
bf0[12], cos_bit); + bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit); + bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = bf0[16] + bf0[23]; + bf1[17] = bf0[17] + bf0[22]; + bf1[18] = bf0[18] + bf0[21]; + bf1[19] = bf0[19] + bf0[20]; + bf1[20] = -bf0[20] + bf0[19]; + bf1[21] = -bf0[21] + bf0[18]; + bf1[22] = -bf0[22] + bf0[17]; + bf1[23] = -bf0[23] + bf0[16]; + bf1[24] = -bf0[24] + bf0[31]; + bf1[25] = -bf0[25] + bf0[30]; + bf1[26] = -bf0[26] + bf0[29]; + bf1[27] = -bf0[27] + bf0[28]; + bf1[28] = bf0[28] + bf0[27]; + bf1[29] = bf0[29] + bf0[26]; + bf1[30] = bf0[30] + bf0[25]; + bf1[31] = bf0[31] + bf0[24]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0] + bf0[3]; + bf1[1] = bf0[1] + bf0[2]; + bf1[2] = -bf0[2] + bf0[1]; + bf1[3] = -bf0[3] + bf0[0]; + bf1[4] = bf0[4]; + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit); + bf1[7] = bf0[7]; + bf1[8] = bf0[8] + bf0[11]; + bf1[9] = bf0[9] + bf0[10]; + bf1[10] = -bf0[10] + bf0[9]; + bf1[11] = -bf0[11] + bf0[8]; + bf1[12] = -bf0[12] + bf0[15]; + bf1[13] = -bf0[13] + bf0[14]; + bf1[14] = bf0[14] + bf0[13]; + bf1[15] = bf0[15] + bf0[12]; + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit); + bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit); + bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit); + bf1[22] = bf0[22]; + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = bf0[25]; + bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit); + bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit); + bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit); + bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit); + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit); + bf1[4] = bf0[4] + bf0[5]; + bf1[5] = -bf0[5] + bf0[4]; + bf1[6] = -bf0[6] + bf0[7]; + bf1[7] = bf0[7] + bf0[6]; + bf1[8] = bf0[8]; + bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); + bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit); + bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit); + bf1[15] = bf0[15]; + bf1[16] = bf0[16] + bf0[19]; + bf1[17] = bf0[17] + bf0[18]; + bf1[18] = -bf0[18] + bf0[17]; + bf1[19] = -bf0[19] + bf0[16]; + bf1[20] = -bf0[20] + bf0[23]; + bf1[21] = -bf0[21] + bf0[22]; + bf1[22] = bf0[22] + bf0[21]; + bf1[23] = bf0[23] + bf0[20]; + bf1[24] = bf0[24] + bf0[27]; + bf1[25] = bf0[25] + bf0[26]; + bf1[26] = -bf0[26] + bf0[25]; + bf1[27] = -bf0[27] + bf0[24]; + bf1[28] = -bf0[28] + bf0[31]; + bf1[29] = -bf0[29] + bf0[30]; + bf1[30] = bf0[30] + bf0[29]; + bf1[31] = bf0[31] + bf0[28]; + 
av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit); + bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit); + bf1[8] = bf0[8] + bf0[9]; + bf1[9] = -bf0[9] + bf0[8]; + bf1[10] = -bf0[10] + bf0[11]; + bf1[11] = bf0[11] + bf0[10]; + bf1[12] = bf0[12] + bf0[13]; + bf1[13] = -bf0[13] + bf0[12]; + bf1[14] = -bf0[14] + bf0[15]; + bf1[15] = bf0[15] + bf0[14]; + bf1[16] = bf0[16]; + bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit); + bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit); + bf1[19] = bf0[19]; + bf1[20] = bf0[20]; + bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit); + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit); + bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit); + bf1[27] = bf0[27]; + bf1[28] = bf0[28]; + bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit); + bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit); + bf1[31] = bf0[31]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit); + bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit); + bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit); + bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit); + bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit); + bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit); + bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit); + bf1[16] = bf0[16] + bf0[17]; + bf1[17] = -bf0[17] + bf0[16]; + bf1[18] = -bf0[18] + bf0[19]; + bf1[19] = bf0[19] + bf0[18]; + bf1[20] = bf0[20] + bf0[21]; + bf1[21] = -bf0[21] + bf0[20]; + bf1[22] = -bf0[22] + bf0[23]; + bf1[23] = bf0[23] + bf0[22]; + bf1[24] = bf0[24] + bf0[25]; + bf1[25] = -bf0[25] + bf0[24]; + bf1[26] = -bf0[26] + bf0[27]; + bf1[27] = bf0[27] + bf0[26]; + bf1[28] = bf0[28] + bf0[29]; + bf1[29] = -bf0[29] + bf0[28]; + bf1[30] = -bf0[30] + bf0[31]; + bf1[31] = bf0[31] + bf0[30]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 8 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit); + bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit); + bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit); + bf1[19] = 
half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit); + bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit); + bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit); + bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit); + bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit); + bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit); + bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit); + bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit); + bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit); + bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit); + bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit); + bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 9 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[16]; + bf1[2] = bf0[8]; + bf1[3] = bf0[24]; + bf1[4] = bf0[4]; + bf1[5] = bf0[20]; + bf1[6] = bf0[12]; + bf1[7] = bf0[28]; + bf1[8] = bf0[2]; + bf1[9] = bf0[18]; + bf1[10] = bf0[10]; + bf1[11] = bf0[26]; + bf1[12] = bf0[6]; + bf1[13] = bf0[22]; + bf1[14] = bf0[14]; + bf1[15] = bf0[30]; + bf1[16] = bf0[1]; + bf1[17] = bf0[17]; + bf1[18] = bf0[9]; + bf1[19] = bf0[25]; + bf1[20] = bf0[5]; + bf1[21] = bf0[21]; + bf1[22] = bf0[13]; + bf1[23] = bf0[29]; + bf1[24] = bf0[3]; + bf1[25] = bf0[19]; + bf1[26] = bf0[11]; + bf1[27] = bf0[27]; + bf1[28] = bf0[7]; + bf1[29] = bf0[23]; + bf1[30] = bf0[15]; + bf1[31] = bf0[31]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); +} + +void av1_fadst4(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + int bit = cos_bit; + const int32_t *sinpi = sinpi_arr(bit); + int32_t x0, x1, x2, x3; + int32_t s0, s1, s2, s3, s4, s5, s6, s7; + + // stage 0 + av1_range_check_buf(0, input, input, 4, stage_range[0]); + x0 = input[0]; + x1 = input[1]; + x2 = input[2]; + x3 = input[3]; + + if (!(x0 | x1 | x2 | x3)) { + output[0] = output[1] = output[2] = output[3] = 0; + return; + } + + // stage 1 + s0 = range_check_value(sinpi[1] * x0, bit + stage_range[1]); + s1 = range_check_value(sinpi[4] * x0, bit + stage_range[1]); + s2 = range_check_value(sinpi[2] * x1, bit + stage_range[1]); + s3 = range_check_value(sinpi[1] * x1, bit + stage_range[1]); + s4 = range_check_value(sinpi[3] * x2, bit + stage_range[1]); + s5 = range_check_value(sinpi[4] * x3, bit + stage_range[1]); + s6 = range_check_value(sinpi[2] * x3, bit + stage_range[1]); + s7 = range_check_value(x0 + x1, stage_range[1]); + + // stage 2 + s7 = range_check_value(s7 - x3, stage_range[2]); + + // stage 3 + x0 = range_check_value(s0 + s2, bit + stage_range[3]); + x1 = range_check_value(sinpi[3] * s7, bit + stage_range[3]); + x2 = range_check_value(s1 - s3, bit + stage_range[3]); + x3 = range_check_value(s4, bit + stage_range[3]); + + // stage 4 + x0 = range_check_value(x0 + s5, bit + stage_range[4]); + x2 = range_check_value(x2 + s6, bit + stage_range[4]); + + // stage 5 + s0 = range_check_value(x0 + x3, bit + stage_range[5]); + s1 = range_check_value(x1, bit + stage_range[5]); + s2 = range_check_value(x2 - x3, bit + stage_range[5]); + s3 = range_check_value(x2 - x0, bit + stage_range[5]); + + // stage 6 + s3 = range_check_value(s3 + x3, bit + stage_range[6]); + + // 1-D transform scaling factor is sqrt(2). 
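+  // A worked rounding example (a sketch; round_shift() is assumed to be the
+  // usual add-half-then-shift helper, (value + (1 << (bit - 1))) >> bit).
+  // With bit == 12, s0 == 10000 yields (10000 + 2048) >> 12 == 2, i.e.
+  // round(10000 / 4096). This strips the 2^bit scale carried by the sinpi[]
+  // multiplications of stage 1.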
+ output[0] = round_shift(s0, bit); + output[1] = round_shift(s1, bit); + output[2] = round_shift(s2, bit); + output[3] = round_shift(s3, bit); + av1_range_check_buf(6, input, output, 4, stage_range[6]); +} + +void av1_fadst8(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + const int32_t size = 8; + const int32_t *cospi; + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[8]; + + // stage 0; + av1_range_check_buf(stage, input, input, size, stage_range[stage]); + + // stage 1; + stage++; + assert(output != input); + bf1 = output; + bf1[0] = input[0]; + bf1[1] = -input[7]; + bf1[2] = -input[3]; + bf1[3] = input[4]; + bf1[4] = -input[1]; + bf1[5] = input[6]; + bf1[6] = input[2]; + bf1[7] = -input[5]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit); + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[2]; + bf1[1] = bf0[1] + bf0[3]; + bf1[2] = bf0[0] - bf0[2]; + bf1[3] = bf0[1] - bf0[3]; + bf1[4] = bf0[4] + bf0[6]; + bf1[5] = bf0[5] + bf0[7]; + bf1[6] = bf0[4] - bf0[6]; + bf1[7] = bf0[5] - bf0[7]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit); + bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[4]; + bf1[1] = bf0[1] + bf0[5]; + bf1[2] = bf0[2] + bf0[6]; + bf1[3] = bf0[3] + bf0[7]; + bf1[4] = bf0[0] - bf0[4]; + bf1[5] = bf0[1] - bf0[5]; + bf1[6] = bf0[2] - bf0[6]; + bf1[7] = bf0[3] - bf0[7]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit); + bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit); + bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[1]; + bf1[1] = bf0[6]; + bf1[2] = bf0[3]; + bf1[3] = bf0[4]; + bf1[4] = bf0[5]; + bf1[5] = bf0[2]; + bf1[6] = bf0[7]; + bf1[7] = bf0[0]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); +} + +void av1_fadst16(const int32_t *input, int32_t 
*output, int8_t cos_bit, + const int8_t *stage_range) { + const int32_t size = 16; + const int32_t *cospi; + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[16]; + + // stage 0; + av1_range_check_buf(stage, input, input, size, stage_range[stage]); + + // stage 1; + stage++; + assert(output != input); + bf1 = output; + bf1[0] = input[0]; + bf1[1] = -input[15]; + bf1[2] = -input[7]; + bf1[3] = input[8]; + bf1[4] = -input[3]; + bf1[5] = input[12]; + bf1[6] = input[4]; + bf1[7] = -input[11]; + bf1[8] = -input[1]; + bf1[9] = input[14]; + bf1[10] = input[6]; + bf1[11] = -input[9]; + bf1[12] = input[2]; + bf1[13] = -input[13]; + bf1[14] = -input[5]; + bf1[15] = input[10]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit); + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit); + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit); + bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit); + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[2]; + bf1[1] = bf0[1] + bf0[3]; + bf1[2] = bf0[0] - bf0[2]; + bf1[3] = bf0[1] - bf0[3]; + bf1[4] = bf0[4] + bf0[6]; + bf1[5] = bf0[5] + bf0[7]; + bf1[6] = bf0[4] - bf0[6]; + bf1[7] = bf0[5] - bf0[7]; + bf1[8] = bf0[8] + bf0[10]; + bf1[9] = bf0[9] + bf0[11]; + bf1[10] = bf0[8] - bf0[10]; + bf1[11] = bf0[9] - bf0[11]; + bf1[12] = bf0[12] + bf0[14]; + bf1[13] = bf0[13] + bf0[15]; + bf1[14] = bf0[12] - bf0[14]; + bf1[15] = bf0[13] - bf0[15]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit); + bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit); + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + bf1[11] = bf0[11]; + bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit); + bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit); + bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[4]; + bf1[1] = bf0[1] + bf0[5]; + bf1[2] = bf0[2] + bf0[6]; + bf1[3] = bf0[3] + bf0[7]; + bf1[4] = bf0[0] - bf0[4]; + bf1[5] = bf0[1] - bf0[5]; + bf1[6] = bf0[2] - bf0[6]; + bf1[7] = bf0[3] - bf0[7]; + bf1[8] = bf0[8] + bf0[12]; + bf1[9] = bf0[9] + bf0[13]; + bf1[10] = bf0[10] + bf0[14]; + bf1[11] = bf0[11] + bf0[15]; + bf1[12] = bf0[8] - bf0[12]; + bf1[13] = bf0[9] - bf0[13]; + bf1[14] = 
bf0[10] - bf0[14]; + bf1[15] = bf0[11] - bf0[15]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit); + bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit); + bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit); + bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit); + bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit); + bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit); + bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[8]; + bf1[1] = bf0[1] + bf0[9]; + bf1[2] = bf0[2] + bf0[10]; + bf1[3] = bf0[3] + bf0[11]; + bf1[4] = bf0[4] + bf0[12]; + bf1[5] = bf0[5] + bf0[13]; + bf1[6] = bf0[6] + bf0[14]; + bf1[7] = bf0[7] + bf0[15]; + bf1[8] = bf0[0] - bf0[8]; + bf1[9] = bf0[1] - bf0[9]; + bf1[10] = bf0[2] - bf0[10]; + bf1[11] = bf0[3] - bf0[11]; + bf1[12] = bf0[4] - bf0[12]; + bf1[13] = bf0[5] - bf0[13]; + bf1[14] = bf0[6] - bf0[14]; + bf1[15] = bf0[7] - bf0[15]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 8 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit); + bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit); + bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit); + bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit); + bf1[9] = half_btf(cospi[30], bf0[8], -cospi[34], bf0[9], cos_bit); + bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit); + bf1[11] = half_btf(cospi[22], bf0[10], -cospi[42], bf0[11], cos_bit); + bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit); + bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 9 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[1]; + bf1[1] = bf0[14]; + bf1[2] = bf0[3]; + bf1[3] = bf0[12]; + bf1[4] = bf0[5]; + bf1[5] = bf0[10]; + bf1[6] = bf0[7]; + bf1[7] = bf0[8]; + bf1[8] = bf0[9]; + bf1[9] = bf0[6]; + bf1[10] = bf0[11]; + bf1[11] = bf0[4]; + bf1[12] = bf0[13]; + bf1[13] = bf0[2]; + bf1[14] = bf0[15]; + bf1[15] = bf0[0]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); +} + +void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + (void)cos_bit; + for (int i = 0; i < 4; ++i) + output[i] = round_shift((int64_t)input[i] * NewSqrt2, NewSqrt2Bits); + assert(stage_range[0] + NewSqrt2Bits <= 
32); + av1_range_check_buf(0, input, output, 4, stage_range[0]); +} + +void av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + (void)cos_bit; + for (int i = 0; i < 8; ++i) output[i] = input[i] * 2; + av1_range_check_buf(0, input, output, 8, stage_range[0]); +} + +void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + (void)cos_bit; + for (int i = 0; i < 16; ++i) + output[i] = round_shift((int64_t)input[i] * 2 * NewSqrt2, NewSqrt2Bits); + assert(stage_range[0] + NewSqrt2Bits <= 32); + av1_range_check_buf(0, input, output, 16, stage_range[0]); +} + +void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + (void)cos_bit; + for (int i = 0; i < 32; ++i) output[i] = input[i] * 4; + av1_range_check_buf(0, input, output, 32, stage_range[0]); +} + +void av1_fdct64(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + const int32_t size = 64; + const int32_t *cospi; + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[64]; + + // stage 0; + av1_range_check_buf(stage, input, input, size, stage_range[stage]); + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0] + input[63]; + bf1[1] = input[1] + input[62]; + bf1[2] = input[2] + input[61]; + bf1[3] = input[3] + input[60]; + bf1[4] = input[4] + input[59]; + bf1[5] = input[5] + input[58]; + bf1[6] = input[6] + input[57]; + bf1[7] = input[7] + input[56]; + bf1[8] = input[8] + input[55]; + bf1[9] = input[9] + input[54]; + bf1[10] = input[10] + input[53]; + bf1[11] = input[11] + input[52]; + bf1[12] = input[12] + input[51]; + bf1[13] = input[13] + input[50]; + bf1[14] = input[14] + input[49]; + bf1[15] = input[15] + input[48]; + bf1[16] = input[16] + input[47]; + bf1[17] = input[17] + input[46]; + bf1[18] = input[18] + input[45]; + bf1[19] = input[19] + input[44]; + bf1[20] = input[20] + input[43]; + bf1[21] = input[21] + input[42]; + bf1[22] = input[22] + input[41]; + bf1[23] = input[23] + input[40]; + bf1[24] = input[24] + input[39]; + bf1[25] = input[25] + input[38]; + bf1[26] = input[26] + input[37]; + bf1[27] = input[27] + input[36]; + bf1[28] = input[28] + input[35]; + bf1[29] = input[29] + input[34]; + bf1[30] = input[30] + input[33]; + bf1[31] = input[31] + input[32]; + bf1[32] = -input[32] + input[31]; + bf1[33] = -input[33] + input[30]; + bf1[34] = -input[34] + input[29]; + bf1[35] = -input[35] + input[28]; + bf1[36] = -input[36] + input[27]; + bf1[37] = -input[37] + input[26]; + bf1[38] = -input[38] + input[25]; + bf1[39] = -input[39] + input[24]; + bf1[40] = -input[40] + input[23]; + bf1[41] = -input[41] + input[22]; + bf1[42] = -input[42] + input[21]; + bf1[43] = -input[43] + input[20]; + bf1[44] = -input[44] + input[19]; + bf1[45] = -input[45] + input[18]; + bf1[46] = -input[46] + input[17]; + bf1[47] = -input[47] + input[16]; + bf1[48] = -input[48] + input[15]; + bf1[49] = -input[49] + input[14]; + bf1[50] = -input[50] + input[13]; + bf1[51] = -input[51] + input[12]; + bf1[52] = -input[52] + input[11]; + bf1[53] = -input[53] + input[10]; + bf1[54] = -input[54] + input[9]; + bf1[55] = -input[55] + input[8]; + bf1[56] = -input[56] + input[7]; + bf1[57] = -input[57] + input[6]; + bf1[58] = -input[58] + input[5]; + bf1[59] = -input[59] + input[4]; + bf1[60] = -input[60] + input[3]; + bf1[61] = -input[61] + input[2]; + bf1[62] = -input[62] + input[1]; + bf1[63] = -input[63] + input[0]; + av1_range_check_buf(stage, input, bf1, size, 
stage_range[stage]); + + // stage 2 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0] + bf0[31]; + bf1[1] = bf0[1] + bf0[30]; + bf1[2] = bf0[2] + bf0[29]; + bf1[3] = bf0[3] + bf0[28]; + bf1[4] = bf0[4] + bf0[27]; + bf1[5] = bf0[5] + bf0[26]; + bf1[6] = bf0[6] + bf0[25]; + bf1[7] = bf0[7] + bf0[24]; + bf1[8] = bf0[8] + bf0[23]; + bf1[9] = bf0[9] + bf0[22]; + bf1[10] = bf0[10] + bf0[21]; + bf1[11] = bf0[11] + bf0[20]; + bf1[12] = bf0[12] + bf0[19]; + bf1[13] = bf0[13] + bf0[18]; + bf1[14] = bf0[14] + bf0[17]; + bf1[15] = bf0[15] + bf0[16]; + bf1[16] = -bf0[16] + bf0[15]; + bf1[17] = -bf0[17] + bf0[14]; + bf1[18] = -bf0[18] + bf0[13]; + bf1[19] = -bf0[19] + bf0[12]; + bf1[20] = -bf0[20] + bf0[11]; + bf1[21] = -bf0[21] + bf0[10]; + bf1[22] = -bf0[22] + bf0[9]; + bf1[23] = -bf0[23] + bf0[8]; + bf1[24] = -bf0[24] + bf0[7]; + bf1[25] = -bf0[25] + bf0[6]; + bf1[26] = -bf0[26] + bf0[5]; + bf1[27] = -bf0[27] + bf0[4]; + bf1[28] = -bf0[28] + bf0[3]; + bf1[29] = -bf0[29] + bf0[2]; + bf1[30] = -bf0[30] + bf0[1]; + bf1[31] = -bf0[31] + bf0[0]; + bf1[32] = bf0[32]; + bf1[33] = bf0[33]; + bf1[34] = bf0[34]; + bf1[35] = bf0[35]; + bf1[36] = bf0[36]; + bf1[37] = bf0[37]; + bf1[38] = bf0[38]; + bf1[39] = bf0[39]; + bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit); + bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit); + bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit); + bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit); + bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit); + bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit); + bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit); + bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit); + bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit); + bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit); + bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit); + bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit); + bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit); + bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit); + bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit); + bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit); + bf1[56] = bf0[56]; + bf1[57] = bf0[57]; + bf1[58] = bf0[58]; + bf1[59] = bf0[59]; + bf1[60] = bf0[60]; + bf1[61] = bf0[61]; + bf1[62] = bf0[62]; + bf1[63] = bf0[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[15]; + bf1[1] = bf0[1] + bf0[14]; + bf1[2] = bf0[2] + bf0[13]; + bf1[3] = bf0[3] + bf0[12]; + bf1[4] = bf0[4] + bf0[11]; + bf1[5] = bf0[5] + bf0[10]; + bf1[6] = bf0[6] + bf0[9]; + bf1[7] = bf0[7] + bf0[8]; + bf1[8] = -bf0[8] + bf0[7]; + bf1[9] = -bf0[9] + bf0[6]; + bf1[10] = -bf0[10] + bf0[5]; + bf1[11] = -bf0[11] + bf0[4]; + bf1[12] = -bf0[12] + bf0[3]; + bf1[13] = -bf0[13] + bf0[2]; + bf1[14] = -bf0[14] + bf0[1]; + bf1[15] = -bf0[15] + bf0[0]; + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = bf0[18]; + bf1[19] = bf0[19]; + bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); + bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], 
cos_bit); + bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit); + bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit); + bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit); + bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit); + bf1[28] = bf0[28]; + bf1[29] = bf0[29]; + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + bf1[32] = bf0[32] + bf0[47]; + bf1[33] = bf0[33] + bf0[46]; + bf1[34] = bf0[34] + bf0[45]; + bf1[35] = bf0[35] + bf0[44]; + bf1[36] = bf0[36] + bf0[43]; + bf1[37] = bf0[37] + bf0[42]; + bf1[38] = bf0[38] + bf0[41]; + bf1[39] = bf0[39] + bf0[40]; + bf1[40] = -bf0[40] + bf0[39]; + bf1[41] = -bf0[41] + bf0[38]; + bf1[42] = -bf0[42] + bf0[37]; + bf1[43] = -bf0[43] + bf0[36]; + bf1[44] = -bf0[44] + bf0[35]; + bf1[45] = -bf0[45] + bf0[34]; + bf1[46] = -bf0[46] + bf0[33]; + bf1[47] = -bf0[47] + bf0[32]; + bf1[48] = -bf0[48] + bf0[63]; + bf1[49] = -bf0[49] + bf0[62]; + bf1[50] = -bf0[50] + bf0[61]; + bf1[51] = -bf0[51] + bf0[60]; + bf1[52] = -bf0[52] + bf0[59]; + bf1[53] = -bf0[53] + bf0[58]; + bf1[54] = -bf0[54] + bf0[57]; + bf1[55] = -bf0[55] + bf0[56]; + bf1[56] = bf0[56] + bf0[55]; + bf1[57] = bf0[57] + bf0[54]; + bf1[58] = bf0[58] + bf0[53]; + bf1[59] = bf0[59] + bf0[52]; + bf1[60] = bf0[60] + bf0[51]; + bf1[61] = bf0[61] + bf0[50]; + bf1[62] = bf0[62] + bf0[49]; + bf1[63] = bf0[63] + bf0[48]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0] + bf0[7]; + bf1[1] = bf0[1] + bf0[6]; + bf1[2] = bf0[2] + bf0[5]; + bf1[3] = bf0[3] + bf0[4]; + bf1[4] = -bf0[4] + bf0[3]; + bf1[5] = -bf0[5] + bf0[2]; + bf1[6] = -bf0[6] + bf0[1]; + bf1[7] = -bf0[7] + bf0[0]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit); + bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = bf0[16] + bf0[23]; + bf1[17] = bf0[17] + bf0[22]; + bf1[18] = bf0[18] + bf0[21]; + bf1[19] = bf0[19] + bf0[20]; + bf1[20] = -bf0[20] + bf0[19]; + bf1[21] = -bf0[21] + bf0[18]; + bf1[22] = -bf0[22] + bf0[17]; + bf1[23] = -bf0[23] + bf0[16]; + bf1[24] = -bf0[24] + bf0[31]; + bf1[25] = -bf0[25] + bf0[30]; + bf1[26] = -bf0[26] + bf0[29]; + bf1[27] = -bf0[27] + bf0[28]; + bf1[28] = bf0[28] + bf0[27]; + bf1[29] = bf0[29] + bf0[26]; + bf1[30] = bf0[30] + bf0[25]; + bf1[31] = bf0[31] + bf0[24]; + bf1[32] = bf0[32]; + bf1[33] = bf0[33]; + bf1[34] = bf0[34]; + bf1[35] = bf0[35]; + bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit); + bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit); + bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit); + bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit); + bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit); + bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit); + bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit); + bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit); + bf1[44] = bf0[44]; + bf1[45] = bf0[45]; + bf1[46] = bf0[46]; + bf1[47] = bf0[47]; + bf1[48] = bf0[48]; + bf1[49] = bf0[49]; + bf1[50] = bf0[50]; + bf1[51] = bf0[51]; + bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit); + bf1[53] = 
half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit); + bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit); + bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit); + bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit); + bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit); + bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit); + bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit); + bf1[60] = bf0[60]; + bf1[61] = bf0[61]; + bf1[62] = bf0[62]; + bf1[63] = bf0[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[3]; + bf1[1] = bf0[1] + bf0[2]; + bf1[2] = -bf0[2] + bf0[1]; + bf1[3] = -bf0[3] + bf0[0]; + bf1[4] = bf0[4]; + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit); + bf1[7] = bf0[7]; + bf1[8] = bf0[8] + bf0[11]; + bf1[9] = bf0[9] + bf0[10]; + bf1[10] = -bf0[10] + bf0[9]; + bf1[11] = -bf0[11] + bf0[8]; + bf1[12] = -bf0[12] + bf0[15]; + bf1[13] = -bf0[13] + bf0[14]; + bf1[14] = bf0[14] + bf0[13]; + bf1[15] = bf0[15] + bf0[12]; + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit); + bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit); + bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit); + bf1[22] = bf0[22]; + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = bf0[25]; + bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit); + bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit); + bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit); + bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit); + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + bf1[32] = bf0[32] + bf0[39]; + bf1[33] = bf0[33] + bf0[38]; + bf1[34] = bf0[34] + bf0[37]; + bf1[35] = bf0[35] + bf0[36]; + bf1[36] = -bf0[36] + bf0[35]; + bf1[37] = -bf0[37] + bf0[34]; + bf1[38] = -bf0[38] + bf0[33]; + bf1[39] = -bf0[39] + bf0[32]; + bf1[40] = -bf0[40] + bf0[47]; + bf1[41] = -bf0[41] + bf0[46]; + bf1[42] = -bf0[42] + bf0[45]; + bf1[43] = -bf0[43] + bf0[44]; + bf1[44] = bf0[44] + bf0[43]; + bf1[45] = bf0[45] + bf0[42]; + bf1[46] = bf0[46] + bf0[41]; + bf1[47] = bf0[47] + bf0[40]; + bf1[48] = bf0[48] + bf0[55]; + bf1[49] = bf0[49] + bf0[54]; + bf1[50] = bf0[50] + bf0[53]; + bf1[51] = bf0[51] + bf0[52]; + bf1[52] = -bf0[52] + bf0[51]; + bf1[53] = -bf0[53] + bf0[50]; + bf1[54] = -bf0[54] + bf0[49]; + bf1[55] = -bf0[55] + bf0[48]; + bf1[56] = -bf0[56] + bf0[63]; + bf1[57] = -bf0[57] + bf0[62]; + bf1[58] = -bf0[58] + bf0[61]; + bf1[59] = -bf0[59] + bf0[60]; + bf1[60] = bf0[60] + bf0[59]; + bf1[61] = bf0[61] + bf0[58]; + bf1[62] = bf0[62] + bf0[57]; + bf1[63] = bf0[63] + bf0[56]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit); + bf1[4] = bf0[4] + bf0[5]; + bf1[5] = -bf0[5] + bf0[4]; + bf1[6] = -bf0[6] + bf0[7]; + bf1[7] = bf0[7] + bf0[6]; 
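+  // For reference, the butterfly primitive used throughout this file:
+  // half_btf(w0, in0, w1, in1, cos_bit) is assumed to compute
+  // round_shift(w0 * in0 + w1 * in1, cos_bit), a fixed-point rotation whose
+  // weights cospi_arr(cos_bit)[i] are assumed to be round(cos(i * PI / 128)
+  // * (1 << cos_bit)). E.g. with cos_bit == 12, cospi[32] == 2896, so
+  // half_btf(cospi[32], a, cospi[32], b, 12) ~= 0.7071 * (a + b).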
+ bf1[8] = bf0[8]; + bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); + bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit); + bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit); + bf1[15] = bf0[15]; + bf1[16] = bf0[16] + bf0[19]; + bf1[17] = bf0[17] + bf0[18]; + bf1[18] = -bf0[18] + bf0[17]; + bf1[19] = -bf0[19] + bf0[16]; + bf1[20] = -bf0[20] + bf0[23]; + bf1[21] = -bf0[21] + bf0[22]; + bf1[22] = bf0[22] + bf0[21]; + bf1[23] = bf0[23] + bf0[20]; + bf1[24] = bf0[24] + bf0[27]; + bf1[25] = bf0[25] + bf0[26]; + bf1[26] = -bf0[26] + bf0[25]; + bf1[27] = -bf0[27] + bf0[24]; + bf1[28] = -bf0[28] + bf0[31]; + bf1[29] = -bf0[29] + bf0[30]; + bf1[30] = bf0[30] + bf0[29]; + bf1[31] = bf0[31] + bf0[28]; + bf1[32] = bf0[32]; + bf1[33] = bf0[33]; + bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit); + bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit); + bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit); + bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit); + bf1[38] = bf0[38]; + bf1[39] = bf0[39]; + bf1[40] = bf0[40]; + bf1[41] = bf0[41]; + bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit); + bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit); + bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit); + bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit); + bf1[46] = bf0[46]; + bf1[47] = bf0[47]; + bf1[48] = bf0[48]; + bf1[49] = bf0[49]; + bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit); + bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit); + bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit); + bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit); + bf1[54] = bf0[54]; + bf1[55] = bf0[55]; + bf1[56] = bf0[56]; + bf1[57] = bf0[57]; + bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit); + bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit); + bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit); + bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit); + bf1[62] = bf0[62]; + bf1[63] = bf0[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit); + bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit); + bf1[8] = bf0[8] + bf0[9]; + bf1[9] = -bf0[9] + bf0[8]; + bf1[10] = -bf0[10] + bf0[11]; + bf1[11] = bf0[11] + bf0[10]; + bf1[12] = bf0[12] + bf0[13]; + bf1[13] = -bf0[13] + bf0[12]; + bf1[14] = -bf0[14] + bf0[15]; + bf1[15] = bf0[15] + bf0[14]; + bf1[16] = bf0[16]; + bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit); + bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit); + bf1[19] = bf0[19]; + bf1[20] = bf0[20]; + bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit); + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], 
cos_bit); + bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit); + bf1[27] = bf0[27]; + bf1[28] = bf0[28]; + bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit); + bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit); + bf1[31] = bf0[31]; + bf1[32] = bf0[32] + bf0[35]; + bf1[33] = bf0[33] + bf0[34]; + bf1[34] = -bf0[34] + bf0[33]; + bf1[35] = -bf0[35] + bf0[32]; + bf1[36] = -bf0[36] + bf0[39]; + bf1[37] = -bf0[37] + bf0[38]; + bf1[38] = bf0[38] + bf0[37]; + bf1[39] = bf0[39] + bf0[36]; + bf1[40] = bf0[40] + bf0[43]; + bf1[41] = bf0[41] + bf0[42]; + bf1[42] = -bf0[42] + bf0[41]; + bf1[43] = -bf0[43] + bf0[40]; + bf1[44] = -bf0[44] + bf0[47]; + bf1[45] = -bf0[45] + bf0[46]; + bf1[46] = bf0[46] + bf0[45]; + bf1[47] = bf0[47] + bf0[44]; + bf1[48] = bf0[48] + bf0[51]; + bf1[49] = bf0[49] + bf0[50]; + bf1[50] = -bf0[50] + bf0[49]; + bf1[51] = -bf0[51] + bf0[48]; + bf1[52] = -bf0[52] + bf0[55]; + bf1[53] = -bf0[53] + bf0[54]; + bf1[54] = bf0[54] + bf0[53]; + bf1[55] = bf0[55] + bf0[52]; + bf1[56] = bf0[56] + bf0[59]; + bf1[57] = bf0[57] + bf0[58]; + bf1[58] = -bf0[58] + bf0[57]; + bf1[59] = -bf0[59] + bf0[56]; + bf1[60] = -bf0[60] + bf0[63]; + bf1[61] = -bf0[61] + bf0[62]; + bf1[62] = bf0[62] + bf0[61]; + bf1[63] = bf0[63] + bf0[60]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 8 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit); + bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit); + bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit); + bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit); + bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit); + bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit); + bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit); + bf1[16] = bf0[16] + bf0[17]; + bf1[17] = -bf0[17] + bf0[16]; + bf1[18] = -bf0[18] + bf0[19]; + bf1[19] = bf0[19] + bf0[18]; + bf1[20] = bf0[20] + bf0[21]; + bf1[21] = -bf0[21] + bf0[20]; + bf1[22] = -bf0[22] + bf0[23]; + bf1[23] = bf0[23] + bf0[22]; + bf1[24] = bf0[24] + bf0[25]; + bf1[25] = -bf0[25] + bf0[24]; + bf1[26] = -bf0[26] + bf0[27]; + bf1[27] = bf0[27] + bf0[26]; + bf1[28] = bf0[28] + bf0[29]; + bf1[29] = -bf0[29] + bf0[28]; + bf1[30] = -bf0[30] + bf0[31]; + bf1[31] = bf0[31] + bf0[30]; + bf1[32] = bf0[32]; + bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit); + bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit); + bf1[35] = bf0[35]; + bf1[36] = bf0[36]; + bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit); + bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit); + bf1[39] = bf0[39]; + bf1[40] = bf0[40]; + bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit); + bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit); + bf1[43] = bf0[43]; + bf1[44] = bf0[44]; + bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit); + bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit); + bf1[47] = bf0[47]; + bf1[48] = bf0[48]; + bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit); + bf1[50] = half_btf(cospi[52], bf0[50], 
cospi[12], bf0[45], cos_bit); + bf1[51] = bf0[51]; + bf1[52] = bf0[52]; + bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit); + bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit); + bf1[55] = bf0[55]; + bf1[56] = bf0[56]; + bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit); + bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit); + bf1[59] = bf0[59]; + bf1[60] = bf0[60]; + bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit); + bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit); + bf1[63] = bf0[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 9 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit); + bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit); + bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit); + bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit); + bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit); + bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit); + bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit); + bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit); + bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit); + bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit); + bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit); + bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit); + bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit); + bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit); + bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit); + bf1[32] = bf0[32] + bf0[33]; + bf1[33] = -bf0[33] + bf0[32]; + bf1[34] = -bf0[34] + bf0[35]; + bf1[35] = bf0[35] + bf0[34]; + bf1[36] = bf0[36] + bf0[37]; + bf1[37] = -bf0[37] + bf0[36]; + bf1[38] = -bf0[38] + bf0[39]; + bf1[39] = bf0[39] + bf0[38]; + bf1[40] = bf0[40] + bf0[41]; + bf1[41] = -bf0[41] + bf0[40]; + bf1[42] = -bf0[42] + bf0[43]; + bf1[43] = bf0[43] + bf0[42]; + bf1[44] = bf0[44] + bf0[45]; + bf1[45] = -bf0[45] + bf0[44]; + bf1[46] = -bf0[46] + bf0[47]; + bf1[47] = bf0[47] + bf0[46]; + bf1[48] = bf0[48] + bf0[49]; + bf1[49] = -bf0[49] + bf0[48]; + bf1[50] = -bf0[50] + bf0[51]; + bf1[51] = bf0[51] + bf0[50]; + bf1[52] = bf0[52] + bf0[53]; + bf1[53] = -bf0[53] + bf0[52]; + bf1[54] = -bf0[54] + bf0[55]; + bf1[55] = bf0[55] + bf0[54]; + bf1[56] = bf0[56] + bf0[57]; + bf1[57] = -bf0[57] + bf0[56]; + bf1[58] = -bf0[58] + bf0[59]; + bf1[59] = bf0[59] + bf0[58]; + bf1[60] = bf0[60] + bf0[61]; + bf1[61] = -bf0[61] + bf0[60]; + bf1[62] = -bf0[62] + bf0[63]; + bf1[63] = bf0[63] + bf0[62]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 10 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = bf0[8]; + 
bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = bf0[18]; + bf1[19] = bf0[19]; + bf1[20] = bf0[20]; + bf1[21] = bf0[21]; + bf1[22] = bf0[22]; + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = bf0[25]; + bf1[26] = bf0[26]; + bf1[27] = bf0[27]; + bf1[28] = bf0[28]; + bf1[29] = bf0[29]; + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit); + bf1[33] = half_btf(cospi[31], bf0[33], cospi[33], bf0[62], cos_bit); + bf1[34] = half_btf(cospi[47], bf0[34], cospi[17], bf0[61], cos_bit); + bf1[35] = half_btf(cospi[15], bf0[35], cospi[49], bf0[60], cos_bit); + bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit); + bf1[37] = half_btf(cospi[23], bf0[37], cospi[41], bf0[58], cos_bit); + bf1[38] = half_btf(cospi[39], bf0[38], cospi[25], bf0[57], cos_bit); + bf1[39] = half_btf(cospi[7], bf0[39], cospi[57], bf0[56], cos_bit); + bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit); + bf1[41] = half_btf(cospi[27], bf0[41], cospi[37], bf0[54], cos_bit); + bf1[42] = half_btf(cospi[43], bf0[42], cospi[21], bf0[53], cos_bit); + bf1[43] = half_btf(cospi[11], bf0[43], cospi[53], bf0[52], cos_bit); + bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit); + bf1[45] = half_btf(cospi[19], bf0[45], cospi[45], bf0[50], cos_bit); + bf1[46] = half_btf(cospi[35], bf0[46], cospi[29], bf0[49], cos_bit); + bf1[47] = half_btf(cospi[3], bf0[47], cospi[61], bf0[48], cos_bit); + bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit); + bf1[49] = half_btf(cospi[35], bf0[49], -cospi[29], bf0[46], cos_bit); + bf1[50] = half_btf(cospi[19], bf0[50], -cospi[45], bf0[45], cos_bit); + bf1[51] = half_btf(cospi[51], bf0[51], -cospi[13], bf0[44], cos_bit); + bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit); + bf1[53] = half_btf(cospi[43], bf0[53], -cospi[21], bf0[42], cos_bit); + bf1[54] = half_btf(cospi[27], bf0[54], -cospi[37], bf0[41], cos_bit); + bf1[55] = half_btf(cospi[59], bf0[55], -cospi[5], bf0[40], cos_bit); + bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit); + bf1[57] = half_btf(cospi[39], bf0[57], -cospi[25], bf0[38], cos_bit); + bf1[58] = half_btf(cospi[23], bf0[58], -cospi[41], bf0[37], cos_bit); + bf1[59] = half_btf(cospi[55], bf0[59], -cospi[9], bf0[36], cos_bit); + bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit); + bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit); + bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit); + bf1[63] = half_btf(cospi[63], bf0[63], -cospi[1], bf0[32], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 11 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[32]; + bf1[2] = bf0[16]; + bf1[3] = bf0[48]; + bf1[4] = bf0[8]; + bf1[5] = bf0[40]; + bf1[6] = bf0[24]; + bf1[7] = bf0[56]; + bf1[8] = bf0[4]; + bf1[9] = bf0[36]; + bf1[10] = bf0[20]; + bf1[11] = bf0[52]; + bf1[12] = bf0[12]; + bf1[13] = bf0[44]; + bf1[14] = bf0[28]; + bf1[15] = bf0[60]; + bf1[16] = bf0[2]; + bf1[17] = bf0[34]; + bf1[18] = bf0[18]; + bf1[19] = bf0[50]; + bf1[20] = bf0[10]; + bf1[21] = bf0[42]; + bf1[22] = bf0[26]; + bf1[23] = bf0[58]; + bf1[24] = bf0[6]; + bf1[25] = bf0[38]; + bf1[26] = bf0[22]; + bf1[27] = bf0[54]; + bf1[28] = bf0[14]; + bf1[29] = bf0[46]; + bf1[30] = bf0[30]; + bf1[31] = bf0[62]; + bf1[32] = 
bf0[1]; + bf1[33] = bf0[33]; + bf1[34] = bf0[17]; + bf1[35] = bf0[49]; + bf1[36] = bf0[9]; + bf1[37] = bf0[41]; + bf1[38] = bf0[25]; + bf1[39] = bf0[57]; + bf1[40] = bf0[5]; + bf1[41] = bf0[37]; + bf1[42] = bf0[21]; + bf1[43] = bf0[53]; + bf1[44] = bf0[13]; + bf1[45] = bf0[45]; + bf1[46] = bf0[29]; + bf1[47] = bf0[61]; + bf1[48] = bf0[3]; + bf1[49] = bf0[35]; + bf1[50] = bf0[19]; + bf1[51] = bf0[51]; + bf1[52] = bf0[11]; + bf1[53] = bf0[43]; + bf1[54] = bf0[27]; + bf1[55] = bf0[59]; + bf1[56] = bf0[7]; + bf1[57] = bf0[39]; + bf1[58] = bf0[23]; + bf1[59] = bf0[55]; + bf1[60] = bf0[15]; + bf1[61] = bf0[47]; + bf1[62] = bf0[31]; + bf1[63] = bf0[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); +} diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d.h b/third_party/aom/av1/encoder/av1_fwd_txfm1d.h new file mode 100644 index 0000000000..9ef54fe4de --- /dev/null +++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_ +#define AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_ + +#include "av1/common/av1_txfm.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_fdct4(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fdct8(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fdct16(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fdct32(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fdct64(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fadst4(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fadst8(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fadst16(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +#ifdef __cplusplus +} +#endif + +#endif // AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_ diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h b/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h new file mode 100644 index 0000000000..2777cc25bc --- /dev/null +++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_ +#define AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_ +#include "av1/common/enums.h" +#include "av1/encoder/av1_fwd_txfm1d.h" +extern const int8_t *av1_fwd_txfm_shift_ls[TX_SIZES_ALL]; +extern const int8_t av1_fwd_cos_bit_col[5][5]; +extern const int8_t av1_fwd_cos_bit_row[5][5]; +#endif // AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_ diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm2d.c b/third_party/aom/av1/encoder/av1_fwd_txfm2d.c new file mode 100644 index 0000000000..12a9535a7c --- /dev/null +++ b/third_party/aom/av1/encoder/av1_fwd_txfm2d.c @@ -0,0 +1,423 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> + +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/txfm_common.h" +#include "av1/common/enums.h" +#include "av1/common/av1_txfm.h" +#include "av1/encoder/av1_fwd_txfm1d.h" +#include "av1/encoder/av1_fwd_txfm1d_cfg.h" + +static INLINE TxfmFunc fwd_txfm_type_to_func(TXFM_TYPE txfm_type) { + switch (txfm_type) { + case TXFM_TYPE_DCT4: return av1_fdct4; + case TXFM_TYPE_DCT8: return av1_fdct8; + case TXFM_TYPE_DCT16: return av1_fdct16; + case TXFM_TYPE_DCT32: return av1_fdct32; + case TXFM_TYPE_DCT64: return av1_fdct64; + case TXFM_TYPE_ADST4: return av1_fadst4; + case TXFM_TYPE_ADST8: return av1_fadst8; + case TXFM_TYPE_ADST16: return av1_fadst16; + case TXFM_TYPE_IDENTITY4: return av1_fidentity4_c; + case TXFM_TYPE_IDENTITY8: return av1_fidentity8_c; + case TXFM_TYPE_IDENTITY16: return av1_fidentity16_c; + case TXFM_TYPE_IDENTITY32: return av1_fidentity32_c; + default: assert(0); return NULL; + } +} + +void av1_gen_fwd_stage_range(int8_t *stage_range_col, int8_t *stage_range_row, + const TXFM_2D_FLIP_CFG *cfg, int bd) { + // Take the shift from the larger dimension in the rectangular case. + const int8_t *shift = cfg->shift; + // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning + for (int i = 0; i < cfg->stage_num_col && i < MAX_TXFM_STAGE_NUM; ++i) { + stage_range_col[i] = cfg->stage_range_col[i] + shift[0] + bd + 1; + } + + // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning + for (int i = 0; i < cfg->stage_num_row && i < MAX_TXFM_STAGE_NUM; ++i) { + stage_range_row[i] = cfg->stage_range_row[i] + shift[0] + shift[1] + bd + 1; + } +} + +static INLINE void fwd_txfm2d_c(const int16_t *input, int32_t *output, + const int stride, const TXFM_2D_FLIP_CFG *cfg, + int32_t *buf, int bd) { + int c, r; + // Note when assigning txfm_size_col, we use the txfm_size from the + // row configuration and vice versa. This is intentionally done to + // accurately perform rectangular transforms.
When the transform is + // rectangular, the number of columns will be the same as the + // txfm_size stored in the row cfg struct. It will make no difference + // for square transforms. + const int txfm_size_col = tx_size_wide[cfg->tx_size]; + const int txfm_size_row = tx_size_high[cfg->tx_size]; + // Take the shift from the larger dimension in the rectangular case. + const int8_t *shift = cfg->shift; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + int8_t stage_range_col[MAX_TXFM_STAGE_NUM]; + int8_t stage_range_row[MAX_TXFM_STAGE_NUM]; + assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM); + assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM); + av1_gen_fwd_stage_range(stage_range_col, stage_range_row, cfg, bd); + + const int8_t cos_bit_col = cfg->cos_bit_col; + const int8_t cos_bit_row = cfg->cos_bit_row; + const TxfmFunc txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col); + const TxfmFunc txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row); + + // use output buffer as temp buffer + int32_t *temp_in = output; + int32_t *temp_out = output + txfm_size_row; + + // Columns + for (c = 0; c < txfm_size_col; ++c) { + if (cfg->ud_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) temp_in[r] = input[r * stride + c]; + } else { + for (r = 0; r < txfm_size_row; ++r) + // flip upside down + temp_in[r] = input[(txfm_size_row - r - 1) * stride + c]; + } + av1_round_shift_array(temp_in, txfm_size_row, -shift[0]); + txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + if (cfg->lr_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) + buf[r * txfm_size_col + c] = temp_out[r]; + } else { + for (r = 0; r < txfm_size_row; ++r) + // flip from left to right + buf[r * txfm_size_col + (txfm_size_col - c - 1)] = temp_out[r]; + } + } + + DECLARE_ALIGNED(16, int32_t, row_buffer[MAX_TX_SIZE]); + + // Rows + for (r = 0; r < txfm_size_row; ++r) { + txfm_func_row(buf + r * txfm_size_col, row_buffer, cos_bit_row, + stage_range_row); + av1_round_shift_array(row_buffer, txfm_size_col, -shift[2]); + if (abs(rect_type) == 1) { + // Multiply everything by Sqrt2 if the transform is rectangular and the + // size difference is a factor of 2. 
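+      // The constants are assumed from av1_txfm.h: NewSqrt2Bits == 12 and
+      // NewSqrt2 == 5793 ~= sqrt(2) * (1 << 12), so the loop below computes
+      // round_shift(row_buffer[c] * 5793, 12) ~= row_buffer[c] * 1.4142,
+      // keeping rectangular sizes on the same overall scale as square ones.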
+ for (c = 0; c < txfm_size_col; ++c) { + row_buffer[c] = + round_shift((int64_t)row_buffer[c] * NewSqrt2, NewSqrt2Bits); + } + } + for (c = 0; c < txfm_size_col; ++c) { + output[c * txfm_size_row + r] = row_buffer[c]; + } + } +} + +void av1_fwd_txfm2d_4x8_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 8]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_4X8, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_8x4_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[8 * 4]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_8X4, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_8x16_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[8 * 16]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_8X16, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_16x8_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[16 * 8]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_16X8, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_16x32_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[16 * 32]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_16X32, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_32x16_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[32 * 16]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_32X16, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_4x16_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 16]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_4X16, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_16x4_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[16 * 4]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_16X4, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_8x32_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[32 * 8]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_8X32, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_32x8_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[32 * 8]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_32X8, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[4 * 4]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_4X4, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_8x8_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[8 * 8]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_8X8, &cfg); + 
fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_16x16_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[16 * 16]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_16X16, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_32x32_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[32 * 32]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_32X32, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_64x64_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[64 * 64]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_64X64, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); + + // Zero out top-right 32x32 area. + for (int col = 0; col < 32; ++col) { + memset(output + col * 64 + 32, 0, 32 * sizeof(*output)); + } + // Zero out the bottom 64x32 area. + memset(output + 32 * 64, 0, 32 * 64 * sizeof(*output)); + // Re-pack non-zero coeffs in the first 32x32 indices. + for (int col = 1; col < 32; ++col) { + memcpy(output + col * 32, output + col * 64, 32 * sizeof(*output)); + } +} + +void av1_fwd_txfm2d_32x64_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[32 * 64]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_32X64, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); + // Zero out right 32x32 area. + for (int col = 0; col < 32; ++col) { + memset(output + col * 64 + 32, 0, 32 * sizeof(*output)); + } + // Re-pack non-zero coeffs in the first 32x32 indices. + for (int col = 1; col < 32; ++col) { + memcpy(output + col * 32, output + col * 64, 32 * sizeof(*output)); + } +} + +void av1_fwd_txfm2d_64x32_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[64 * 32]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_64X32, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); + // Zero out the bottom 32x32 area. + memset(output + 32 * 32, 0, 32 * 32 * sizeof(*output)); + // Note: no repacking needed here. +} + +void av1_fwd_txfm2d_16x64_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[64 * 16]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_16X64, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); + // Zero out right 32x16 area. + for (int row = 0; row < 16; ++row) { + memset(output + row * 64 + 32, 0, 32 * sizeof(*output)); + } + // Re-pack non-zero coeffs in the first 32x16 indices. + for (int row = 1; row < 16; ++row) { + memcpy(output + row * 32, output + row * 64, 32 * sizeof(*output)); + } +} + +void av1_fwd_txfm2d_64x16_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[64 * 16]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_64X16, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); + // Zero out the bottom 16x32 area. + memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output)); + // Note: no repacking needed here. 
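/* Illustrative aside (not part of the patch): the zero-out/repack
 * pattern in the 64-point wrappers above. Repacking is only needed when
 * the kept width (32) is narrower than the transform width (64), since
 * row r's surviving coefficients must move from output[r * 64] down to
 * output[r * 32]; when only bottom rows are zeroed (the 64x32 and 64x16
 * cases) every kept row already sits at its final offset. Toy sizes: */
#include <stdint.h>
#include <string.h>

/* Keep the left half of each row of a rows x full_w block and pack the
 * kept halves contiguously. Row 0 is already in place; for r >= 1 the
 * destination [r * half_w, r * half_w + half_w) cannot overlap the
 * source [r * full_w, ...), because r * half_w + half_w <= r * full_w. */
static void repack_left_half(int32_t *buf, int rows, int full_w) {
  const int half_w = full_w / 2;
  for (int r = 1; r < rows; ++r)
    memcpy(buf + r * half_w, buf + r * full_w, half_w * sizeof(*buf));
}

int main(void) {
  int32_t buf[4 * 8];
  for (int32_t i = 0; i < 4 * 8; ++i) buf[i] = i;
  repack_left_half(buf, 4, 8); /* kept rows now start at 0, 4, 8, 12 */
  return buf[4] == 8 ? 0 : 1;  /* row 1 previously started at buf[8] */
}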
+} + +static const int8_t fwd_shift_4x4[3] = { 2, 0, 0 }; +static const int8_t fwd_shift_8x8[3] = { 2, -1, 0 }; +static const int8_t fwd_shift_16x16[3] = { 2, -2, 0 }; +static const int8_t fwd_shift_32x32[3] = { 2, -4, 0 }; +static const int8_t fwd_shift_64x64[3] = { 0, -2, -2 }; +static const int8_t fwd_shift_4x8[3] = { 2, -1, 0 }; +static const int8_t fwd_shift_8x4[3] = { 2, -1, 0 }; +static const int8_t fwd_shift_8x16[3] = { 2, -2, 0 }; +static const int8_t fwd_shift_16x8[3] = { 2, -2, 0 }; +static const int8_t fwd_shift_16x32[3] = { 2, -4, 0 }; +static const int8_t fwd_shift_32x16[3] = { 2, -4, 0 }; +static const int8_t fwd_shift_32x64[3] = { 0, -2, -2 }; +static const int8_t fwd_shift_64x32[3] = { 2, -4, -2 }; +static const int8_t fwd_shift_4x16[3] = { 2, -1, 0 }; +static const int8_t fwd_shift_16x4[3] = { 2, -1, 0 }; +static const int8_t fwd_shift_8x32[3] = { 2, -2, 0 }; +static const int8_t fwd_shift_32x8[3] = { 2, -2, 0 }; +static const int8_t fwd_shift_16x64[3] = { 0, -2, 0 }; +static const int8_t fwd_shift_64x16[3] = { 2, -4, 0 }; + +const int8_t *av1_fwd_txfm_shift_ls[TX_SIZES_ALL] = { + fwd_shift_4x4, fwd_shift_8x8, fwd_shift_16x16, fwd_shift_32x32, + fwd_shift_64x64, fwd_shift_4x8, fwd_shift_8x4, fwd_shift_8x16, + fwd_shift_16x8, fwd_shift_16x32, fwd_shift_32x16, fwd_shift_32x64, + fwd_shift_64x32, fwd_shift_4x16, fwd_shift_16x4, fwd_shift_8x32, + fwd_shift_32x8, fwd_shift_16x64, fwd_shift_64x16, +}; + +const int8_t av1_fwd_cos_bit_col[MAX_TXWH_IDX /*txw_idx*/] + [MAX_TXWH_IDX /*txh_idx*/] = { + { 13, 13, 13, 0, 0 }, + { 13, 13, 13, 12, 0 }, + { 13, 13, 13, 12, 13 }, + { 0, 13, 13, 12, 13 }, + { 0, 0, 13, 12, 13 } + }; + +const int8_t av1_fwd_cos_bit_row[MAX_TXWH_IDX /*txw_idx*/] + [MAX_TXWH_IDX /*txh_idx*/] = { + { 13, 13, 12, 0, 0 }, + { 13, 13, 13, 12, 0 }, + { 13, 13, 12, 13, 12 }, + { 0, 12, 13, 12, 11 }, + { 0, 0, 12, 11, 10 } + }; + +static const int8_t fdct4_range_mult2[4] = { 0, 2, 3, 3 }; +static const int8_t fdct8_range_mult2[6] = { 0, 2, 4, 5, 5, 5 }; +static const int8_t fdct16_range_mult2[8] = { 0, 2, 4, 6, 7, 7, 7, 7 }; +static const int8_t fdct32_range_mult2[10] = { 0, 2, 4, 6, 8, 9, 9, 9, 9, 9 }; +static const int8_t fdct64_range_mult2[12] = { 0, 2, 4, 6, 8, 10, + 11, 11, 11, 11, 11, 11 }; + +static const int8_t fadst4_range_mult2[7] = { 0, 2, 4, 3, 3, 3, 3 }; +static const int8_t fadst8_range_mult2[8] = { 0, 0, 1, 3, 3, 5, 5, 5 }; +static const int8_t fadst16_range_mult2[10] = { 0, 0, 1, 3, 3, 5, 5, 7, 7, 7 }; + +static const int8_t fidtx4_range_mult2[1] = { 1 }; +static const int8_t fidtx8_range_mult2[1] = { 2 }; +static const int8_t fidtx16_range_mult2[1] = { 3 }; +static const int8_t fidtx32_range_mult2[1] = { 4 }; + +static const int8_t *fwd_txfm_range_mult2_list[TXFM_TYPES] = { + fdct4_range_mult2, fdct8_range_mult2, fdct16_range_mult2, + fdct32_range_mult2, fdct64_range_mult2, fadst4_range_mult2, + fadst8_range_mult2, fadst16_range_mult2, fidtx4_range_mult2, + fidtx8_range_mult2, fidtx16_range_mult2, fidtx32_range_mult2 +}; + +static INLINE void set_fwd_txfm_non_scale_range(TXFM_2D_FLIP_CFG *cfg) { + av1_zero(cfg->stage_range_col); + av1_zero(cfg->stage_range_row); + + const int8_t *const range_mult2_col = + fwd_txfm_range_mult2_list[cfg->txfm_type_col]; + const int stage_num_col = cfg->stage_num_col; + // i < MAX_TXFM_STAGE_NUM will quiet -Wstringop-overflow. 
+ for (int i = 0; i < stage_num_col && i < MAX_TXFM_STAGE_NUM; ++i) + cfg->stage_range_col[i] = (range_mult2_col[i] + 1) >> 1; + + const int8_t *const range_mult2_row = + fwd_txfm_range_mult2_list[cfg->txfm_type_row]; + const int stage_num_row = cfg->stage_num_row; + // i < MAX_TXFM_STAGE_NUM will quiet -Wstringop-overflow. + for (int i = 0; i < stage_num_row && i < MAX_TXFM_STAGE_NUM; ++i) { + cfg->stage_range_row[i] = + (range_mult2_col[stage_num_col - 1] + range_mult2_row[i] + 1) >> 1; + } +} + +void av1_get_fwd_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size, + TXFM_2D_FLIP_CFG *cfg) { + assert(cfg != NULL); + cfg->tx_size = tx_size; + set_flip_cfg(tx_type, cfg); + const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type]; + const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + cfg->shift = av1_fwd_txfm_shift_ls[tx_size]; + cfg->cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + cfg->cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + cfg->txfm_type_col = av1_txfm_type_ls[txh_idx][tx_type_1d_col]; + assert(cfg->txfm_type_col != TXFM_TYPE_INVALID); + cfg->txfm_type_row = av1_txfm_type_ls[txw_idx][tx_type_1d_row]; + assert(cfg->txfm_type_row != TXFM_TYPE_INVALID); + cfg->stage_num_col = av1_txfm_stage_num_list[cfg->txfm_type_col]; + cfg->stage_num_row = av1_txfm_stage_num_list[cfg->txfm_type_row]; + set_fwd_txfm_non_scale_range(cfg); +} diff --git a/third_party/aom/av1/encoder/av1_ml_partition_models.h b/third_party/aom/av1/encoder/av1_ml_partition_models.h new file mode 100644 index 0000000000..2572b138d5 --- /dev/null +++ b/third_party/aom/av1/encoder/av1_ml_partition_models.h @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_AV1_ML_PARTITION_MODELS_H_ +#define AOM_AV1_ENCODER_AV1_ML_PARTITION_MODELS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/ml.h" + +// TODO(kyslov): Replace with proper weights after training AV1 models + +#define FEATURES 6 +static const float av1_var_part_nn_weights_64_layer0[FEATURES * 8] = { + 0.35755366f, 0.86281112f, -0.20871686f, 0.0409634f, 0.97305766f, + 0.75510254f, 0.04860447f, 0.77095283f, -0.44105278f, -0.3755049f, + -0.08456618f, 1.1821136f, -0.73956301f, 1.30016453f, 0.45566902f, + 0.4742967f, 0.44213975f, 0.4876028f, 0.26720522f, -0.34429858f, + -0.25148252f, -0.49623932f, -0.46747941f, -0.36656624f, 0.10213375f, + 0.60262819f, -0.54788715f, -0.27272022f, 1.0995462f, -0.36338376f, + -0.64836313f, 0.16057039f, 1.02782791f, 0.9985311f, 0.90607883f, + 0.80570411f, -0.07750863f, -0.74006402f, 1.72839526f, 1.72355343f, + 1.69288916f, 1.59102043f, 0.14140216f, -1.47262839f, 0.4262519f, + -0.33805936f, -0.02449707f, 0.67203692f +}; + +static const float av1_var_part_nn_bias_64_layer0[8] = { + 0.39995694f, 0.65593756f, 1.12876737f, 1.28790576f, + 0.53468556f, 0.3177908f, -0.74388266f, -1.81131248f +}; + +static const float av1_var_part_nn_weights_64_layer1[8] = { + -1.31174053f, 0.69696917f, 0.78721456f, 0.45326379f, + 0.79258322f, 1.74626188f, -5.41831f, 3.33887435f +}; + +static const float av1_var_part_nn_bias_64_layer1[1] = { -0.90951047f }; + +static const float av1_var_part_means_64[FEATURES] = { + 5.36750249f, 11.58023127f, 0.25550964f, 0.23809917f, 0.24650665f, 0.22117687f +}; +static const float av1_var_part_vars_64[FEATURES] = { + 0.89599769f, 2.2686018f, 0.02568608f, 0.02523411f, 0.02443085f, 0.01922085f +}; + +static const NN_CONFIG av1_var_part_nnconfig_64 = { + FEATURES, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + av1_var_part_nn_weights_64_layer0, + av1_var_part_nn_weights_64_layer1, + }, + { + av1_var_part_nn_bias_64_layer0, + av1_var_part_nn_bias_64_layer1, + }, +}; + +static const float av1_var_part_nn_weights_32_layer0[FEATURES * 8] = { + 0.97886049f, -1.66262011f, 0.94902798f, 0.7080922f, 0.91181186f, + 0.35222601f, -0.04428585f, 0.42086472f, -0.0206325f, -0.77937809f, + -0.70947522f, -1.24463119f, 0.23739497f, -1.34327359f, 0.01024804f, + 0.4544633f, -0.96907661f, 0.67279522f, 0.23180693f, 1.54063368f, + -0.15700707f, 0.18597331f, 0.34167589f, 0.40736558f, 0.69213366f, + -1.33584593f, 1.21190814f, 1.26725267f, 1.21284802f, 1.26611399f, + 0.17546514f, -0.30248399f, -1.32589316f, -1.37432674f, -1.37423023f, + -1.26890855f, 0.12166347f, -0.94565678f, -1.47475267f, -0.69279948f, + -0.10166587f, -0.23489881f, 0.57123565f, 0.80051137f, -1.28411946f, + -1.36576732f, -1.30257508f, -1.30575106f +}; + +static const float av1_var_part_nn_bias_32_layer0[8] = { + -1.6301435f, 0.61879037f, -1.68612662f, 1.66960165f, + -0.0838243f, 0.32253287f, -0.65755282f, 0.96661531f +}; + +static const float av1_var_part_nn_weights_32_layer1[8] = { + 1.99257161f, 0.7331492f, 1.33539961f, 1.13501456f, + -2.21154528f, 1.85858542f, -0.85565298f, -1.96410246f +}; + +static const float av1_var_part_nn_bias_32_layer1[1] = { -0.14880827f }; + +static const float av1_var_part_means_32[FEATURES] = { + 5.36360686f, 9.88421868f, 0.23543671f, 0.23621205f, 0.23409667f, 0.22855539f +}; + +static const float av1_var_part_vars_32[FEATURES] = { + 0.89077225f, 2.32312894f, 0.02167654f, 0.02392842f, 0.02466495f, 0.02047641f +}; + +static const NN_CONFIG av1_var_part_nnconfig_32 = { + FEATURES, // 
num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + av1_var_part_nn_weights_32_layer0, + av1_var_part_nn_weights_32_layer1, + }, + { + av1_var_part_nn_bias_32_layer0, + av1_var_part_nn_bias_32_layer1, + }, +}; + +static const float av1_var_part_nn_weights_16_layer0[FEATURES * 8] = { + 0.45118305f, -0.22068295f, 0.4604435f, -0.1446326f, -0.15765035f, + 0.42260198f, -0.0945916f, 0.49544996f, 0.62781567f, -0.41564372f, + -0.39103292f, 0.44407624f, 0.48382613f, -0.85424238f, -0.00961433f, + 0.25383582f, 0.14403897f, 0.00901859f, -0.83201967f, -0.19323284f, + 0.59271213f, 0.69487457f, 0.6897112f, 0.62768521f, 0.9204492f, + -1.42448347f, -0.16491054f, -0.10114424f, -0.1069687f, -0.11289049f, + 0.26290832f, -0.41850393f, 0.17239733f, 0.41770622f, 0.43725942f, + 0.19362467f, -0.35955731f, -0.899446f, 0.49726389f, 0.66569571f, + 0.65893982f, 0.53199654f, -0.1158694f, -0.26472603f, 0.4155923f, + 0.15059544f, 0.09596755f, 0.26247133f +}; + +static const float av1_var_part_nn_bias_16_layer0[8] = { + 1.64486321f, -0.11851574f, 1.29322833f, -0.61193136f, + 0.33027532f, 1.04197232f, -0.80716674f, 0.88681233f +}; + +static const float av1_var_part_nn_weights_16_layer1[8] = { + -1.02832118f, 0.72800106f, -0.42904783f, 1.44490586f, + -1.03888227f, -0.9023916f, -1.51543102f, -0.43059521f +}; + +static const float av1_var_part_nn_bias_16_layer1[1] = { -0.85087946f }; + +static const float av1_var_part_means_16[FEATURES] = { + 5.32551326f, 8.218448f, 0.21954822f, 0.22808377f, 0.23019798f, 0.22320699f +}; + +static const float av1_var_part_vars_16[FEATURES] = { 0.86806032f, 2.39938956f, + 0.01958579f, 0.02437927f, + 0.02420755f, 0.0192003f }; + +static const NN_CONFIG av1_var_part_nnconfig_16 = { + FEATURES, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + av1_var_part_nn_weights_16_layer0, + av1_var_part_nn_weights_16_layer1, + }, + { + av1_var_part_nn_bias_16_layer0, + av1_var_part_nn_bias_16_layer1, + }, +}; + +#undef FEATURES + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_AV1_ML_PARTITION_MODELS_H_ diff --git a/third_party/aom/av1/encoder/av1_noise_estimate.c b/third_party/aom/av1/encoder/av1_noise_estimate.c new file mode 100644 index 0000000000..25007bb6d4 --- /dev/null +++ b/third_party/aom/av1/encoder/av1_noise_estimate.c @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <math.h> +#include <limits.h> + +#include "config/aom_dsp_rtcd.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_scale/yv12config.h" +#include "aom/aom_integer.h" +#include "av1/encoder/context_tree.h" +#include "av1/encoder/av1_noise_estimate.h" +#include "av1/encoder/encoder.h" +#if CONFIG_AV1_TEMPORAL_DENOISING +#include "av1/encoder/av1_temporal_denoiser.h" +#endif + +#if CONFIG_AV1_TEMPORAL_DENOISING +// For SVC: only do noise estimation on top spatial layer.
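/* Illustrative aside (not part of the patch): the NN_CONFIG tables in
 * av1_ml_partition_models.h above describe one-hidden-layer perceptrons
 * (6 inputs, 8 hidden nodes, 1 output) that libaom evaluates with
 * av1_nn_predict() in av1/encoder/ml.c, using node-major weights and
 * ReLU on the hidden layer only. A sketch of that forward pass; the
 * z-score normalization with the means/vars tables is an assumption
 * inferred from the table names (normalization happens in the caller): */
#include <math.h>

static float var_part_nn_sketch(const float raw[6], const float means[6],
                                const float vars[6],
                                const float w0[48], /* 8 nodes x 6 inputs */
                                const float b0[8], const float w1[8],
                                float b1) {
  float in[6], hidden[8];
  for (int i = 0; i < 6; ++i) /* assumed z-scoring of the raw features */
    in[i] = (raw[i] - means[i]) / sqrtf(vars[i]);
  for (int n = 0; n < 8; ++n) {
    float acc = b0[n];
    for (int i = 0; i < 6; ++i) acc += w0[n * 6 + i] * in[i];
    hidden[n] = acc > 0.0f ? acc : 0.0f; /* ReLU on the hidden layer */
  }
  float out = b1;
  for (int n = 0; n < 8; ++n) out += w1[n] * hidden[n];
  return out; /* the caller thresholds this score to force/skip a split */
}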
+static INLINE int noise_est_svc(const struct AV1_COMP *const cpi) { + return (!cpi->ppi->use_svc || + (cpi->ppi->use_svc && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)); +} +#endif + +void av1_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) { + const int64_t area = (int64_t)width * height; + ne->enabled = 0; + ne->level = (area < 1280 * 720) ? kLowLow : kLow; + ne->value = 0; + ne->count = 0; + ne->thresh = 90; + ne->last_w = 0; + ne->last_h = 0; + if (area >= 1920 * 1080) { + ne->thresh = 200; + } else if (area >= 1280 * 720) { + ne->thresh = 140; + } else if (area >= 640 * 360) { + ne->thresh = 115; + } + ne->num_frames_estimate = 15; + ne->adapt_thresh = (3 * ne->thresh) >> 1; +} + +static int enable_noise_estimation(AV1_COMP *const cpi) { + const int resize_pending = is_frame_resize_pending(cpi); + +#if CONFIG_AV1_HIGHBITDEPTH + if (cpi->common.seq_params->use_highbitdepth) return 0; +#endif +// Enable noise estimation if denoising is on. +#if CONFIG_AV1_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) && + cpi->common.width >= 320 && cpi->common.height >= 180) + return 1; +#endif + // Only allow noise estimate under certain encoding mode. + // Enabled for 1 pass CBR, speed >=5, and if resolution is same as original. + // Not enabled for SVC mode and screen_content_mode. + // Not enabled for low resolutions. + if (cpi->oxcf.pass == AOM_RC_ONE_PASS && cpi->oxcf.rc_cfg.mode == AOM_CBR && + cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.speed >= 5 && + resize_pending == 0 && !cpi->ppi->use_svc && + cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN && + cpi->common.width * cpi->common.height >= 640 * 360) + return 1; + else + return 0; +} + +#if CONFIG_AV1_TEMPORAL_DENOISING +static void copy_frame(YV12_BUFFER_CONFIG *const dest, + const YV12_BUFFER_CONFIG *const src) { + const uint8_t *srcbuf = src->y_buffer; + uint8_t *destbuf = dest->y_buffer; + + assert(dest->y_width == src->y_width); + assert(dest->y_height == src->y_height); + + for (int r = 0; r < dest->y_height; ++r) { + memcpy(destbuf, srcbuf, dest->y_width); + destbuf += dest->y_stride; + srcbuf += src->y_stride; + } +} +#endif // CONFIG_AV1_TEMPORAL_DENOISING + +NOISE_LEVEL av1_noise_estimate_extract_level(NOISE_ESTIMATE *const ne) { + int noise_level = kLowLow; + if (ne->value > (ne->thresh << 1)) { + noise_level = kHigh; + } else { + if (ne->value > ne->thresh) + noise_level = kMedium; + else if (ne->value > (ne->thresh >> 1)) + noise_level = kLow; + else + noise_level = kLowLow; + } + return noise_level; +} + +void av1_update_noise_estimate(AV1_COMP *const cpi) { + const AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + + NOISE_ESTIMATE *const ne = &cpi->noise_estimate; + const int low_res = (cm->width <= 352 && cm->height <= 288); + // Estimate of noise level every frame_period frames. + int frame_period = 8; + int thresh_consec_zeromv = 2; + int frame_counter = cm->current_frame.frame_number; + // Estimate is between current source and last source. + YV12_BUFFER_CONFIG *last_source = cpi->last_source; +#if CONFIG_AV1_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) { + last_source = &cpi->denoiser.last_source; + // Tune these thresholds for different resolutions when denoising is + // enabled. 
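/* Illustrative aside (not part of the patch): a usage sketch for
 * av1_noise_estimate_extract_level() above. av1_noise_estimate_init()
 * picks thresh = 200 for frames of 1920x1080 and larger, so the level
 * boundaries sit at thresh/2 = 100, thresh = 200, and 2*thresh = 400: */
#include <stdio.h>

int main(void) {
  const int thresh = 200; /* init value for >= 1920x1080 */
  const int samples[4] = { 60, 150, 250, 500 };
  for (int i = 0; i < 4; ++i) {
    const int v = samples[i];
    const char *level = (v > (thresh << 1))   ? "kHigh"
                        : (v > thresh)        ? "kMedium"
                        : (v > (thresh >> 1)) ? "kLow"
                                              : "kLowLow";
    printf("value %3d -> %s\n", v, level);
  }
  return 0; /* prints kLowLow, kLow, kMedium, kHigh */
}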
+ if (cm->width > 640 && cm->width <= 1920) { + thresh_consec_zeromv = 2; + } + } +#endif + ne->enabled = enable_noise_estimation(cpi); + if (cpi->svc.number_spatial_layers > 1) + frame_counter = cpi->svc.current_superframe; + if (!ne->enabled || frame_counter % frame_period != 0 || + last_source == NULL || + (cpi->svc.number_spatial_layers == 1 && + (ne->last_w != cm->width || ne->last_h != cm->height))) { +#if CONFIG_AV1_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) + copy_frame(&cpi->denoiser.last_source, cpi->source); +#endif + if (last_source != NULL) { + ne->last_w = cm->width; + ne->last_h = cm->height; + } + return; + } else if (frame_counter > 60 && cpi->svc.num_encoded_top_layer > 1 && + cpi->rc.frames_since_key > cpi->svc.number_spatial_layers && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1 && + cpi->rc.avg_frame_low_motion < (low_res ? 60 : 40)) { + // Force noise estimation to 0 and denoiser off if content has high motion. + ne->level = kLowLow; + ne->count = 0; + ne->num_frames_estimate = 10; +#if CONFIG_AV1_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) && + cpi->svc.current_superframe > 1) { + av1_denoiser_set_noise_level(cpi, ne->level); + copy_frame(&cpi->denoiser.last_source, cpi->source); + } +#endif + return; + } else { + unsigned int bin_size = 100; + unsigned int hist[MAX_VAR_HIST_BINS] = { 0 }; + unsigned int hist_avg[MAX_VAR_HIST_BINS]; + unsigned int max_bin = 0; + unsigned int max_bin_count = 0; + unsigned int bin_cnt; + BLOCK_SIZE bsize = BLOCK_16X16; + // Loop over sub-sample of 16x16 blocks of frame, and for blocks that have + // been encoded as zero/small mv at least x consecutive frames, compute + // the variance to update estimate of noise in the source. + const uint8_t *src_y = cpi->source->y_buffer; + const int src_ystride = cpi->source->y_stride; + const uint8_t *last_src_y = last_source->y_buffer; + const int last_src_ystride = last_source->y_stride; + int mi_row, mi_col; + int num_low_motion = 0; + int frame_low_motion = 1; + for (mi_row = 0; mi_row < mi_params->mi_rows; mi_row += 2) { + for (mi_col = 0; mi_col < mi_params->mi_cols; mi_col += 2) { + int bl_index = + (mi_row >> 1) * (mi_params->mi_cols >> 1) + (mi_col >> 1); + if (cpi->consec_zero_mv[bl_index] > thresh_consec_zeromv) + num_low_motion++; + } + } + if (num_low_motion < + (((3 * (mi_params->mi_rows * mi_params->mi_cols) >> 2)) >> 3)) + frame_low_motion = 0; + for (mi_row = 0; mi_row < mi_params->mi_rows; mi_row++) { + for (mi_col = 0; mi_col < mi_params->mi_cols; mi_col++) { + // 16x16 blocks, 1/4 sample of frame. + if (mi_row % 8 == 0 && mi_col % 8 == 0 && + mi_row < mi_params->mi_rows - 3 && + mi_col < mi_params->mi_cols - 3) { + int bl_index = + (mi_row >> 1) * (mi_params->mi_cols >> 1) + (mi_col >> 1); + int bl_index1 = bl_index + 1; + int bl_index2 = bl_index + (mi_params->mi_cols >> 1); + int bl_index3 = bl_index2 + 1; + int consec_zeromv = + AOMMIN(cpi->consec_zero_mv[bl_index], + AOMMIN(cpi->consec_zero_mv[bl_index1], + AOMMIN(cpi->consec_zero_mv[bl_index2], + cpi->consec_zero_mv[bl_index3]))); + // Only consider blocks that are likely steady background. i.e, have + // been encoded as zero/low motion x (= thresh_consec_zeromv) frames + // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all + // 4 sub-blocks for 16x16 block. And exclude this frame if + // high_source_sad is true (i.e., scene/content change). 
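/* Illustrative aside (not part of the patch): the index arithmetic used
 * by the sampling loop here, restated. AV1 mode-info (mi) units are 4x4
 * luma pixels; consec_zero_mv[] (a uint8_t array in the encoder struct)
 * keeps one counter per 8x8-pixel block, hence the >> 1 on the mi
 * coordinates; and each 16x16 candidate, taken every 8 mi = 32 pixels
 * (a 1/4 area subsample), covers a 2x2 group of those counters: */
#include <stdint.h>

/* Counter index of the 8x8 block containing mi position (mi_row, mi_col). */
static int zero_mv_idx(int mi_row, int mi_col, int mi_cols) {
  return (mi_row >> 1) * (mi_cols >> 1) + (mi_col >> 1);
}

/* Minimum over the four counters covering a 16x16 block: the entry
 * itself, its right neighbor, and the pair one 8x8 row below -- the
 * AOMMIN cascade above, unrolled. */
static uint8_t min_consec_zeromv(const uint8_t *consec_zero_mv, int mi_row,
                                 int mi_col, int mi_cols) {
  const int i0 = zero_mv_idx(mi_row, mi_col, mi_cols);
  const int i2 = i0 + (mi_cols >> 1);
  uint8_t m = consec_zero_mv[i0];
  if (consec_zero_mv[i0 + 1] < m) m = consec_zero_mv[i0 + 1];
  if (consec_zero_mv[i2] < m) m = consec_zero_mv[i2];
  if (consec_zero_mv[i2 + 1] < m) m = consec_zero_mv[i2 + 1];
  return m;
}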
+ if (frame_low_motion && consec_zeromv > thresh_consec_zeromv && + !cpi->rc.high_source_sad) { + unsigned int sse; + // Compute variance between co-located blocks from current and + // last input frames. + unsigned int variance = cpi->ppi->fn_ptr[bsize].vf( + src_y, src_ystride, last_src_y, last_src_ystride, &sse); + unsigned int hist_index = variance / bin_size; + if (hist_index < MAX_VAR_HIST_BINS) + hist[hist_index]++; + else if (hist_index < 3 * (MAX_VAR_HIST_BINS >> 1)) + hist[MAX_VAR_HIST_BINS - 1]++; // Account for the tail + } + } + src_y += 4; + last_src_y += 4; + } + src_y += (src_ystride << 2) - (mi_params->mi_cols << 2); + last_src_y += (last_src_ystride << 2) - (mi_params->mi_cols << 2); + } + ne->last_w = cm->width; + ne->last_h = cm->height; + // Adjust histogram to account for effect that histogram flattens + // and shifts to zero as scene darkens. + if (hist[0] > 10 && (hist[MAX_VAR_HIST_BINS - 1] > hist[0] >> 2)) { + hist[0] = 0; + hist[1] >>= 2; + hist[2] >>= 2; + hist[3] >>= 2; + hist[4] >>= 1; + hist[5] >>= 1; + hist[6] = 3 * hist[6] >> 1; + hist[MAX_VAR_HIST_BINS - 1] >>= 1; + } + + // Average hist[] and find largest bin + for (bin_cnt = 0; bin_cnt < MAX_VAR_HIST_BINS; bin_cnt++) { + if (bin_cnt == 0) + hist_avg[bin_cnt] = (hist[0] + hist[1] + hist[2]) / 3; + else if (bin_cnt == MAX_VAR_HIST_BINS - 1) + hist_avg[bin_cnt] = hist[MAX_VAR_HIST_BINS - 1] >> 2; + else if (bin_cnt == MAX_VAR_HIST_BINS - 2) + hist_avg[bin_cnt] = (hist[bin_cnt - 1] + 2 * hist[bin_cnt] + + (hist[bin_cnt + 1] >> 1) + 2) >> + 2; + else + hist_avg[bin_cnt] = + (hist[bin_cnt - 1] + 2 * hist[bin_cnt] + hist[bin_cnt + 1] + 2) >> + 2; + + if (hist_avg[bin_cnt] > max_bin_count) { + max_bin_count = hist_avg[bin_cnt]; + max_bin = bin_cnt; + } + } + // Scale by 40 to work with existing thresholds + ne->value = (int)((3 * ne->value + max_bin * 40) >> 2); + // Quickly increase VNR strength when the noise level increases suddenly. + if (ne->level < kMedium && ne->value > ne->adapt_thresh) { + ne->count = ne->num_frames_estimate; + } else { + ne->count++; + } + if (ne->count == ne->num_frames_estimate) { + // Reset counter and check noise level condition. + ne->num_frames_estimate = 30; + ne->count = 0; + ne->level = av1_noise_estimate_extract_level(ne); +#if CONFIG_AV1_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) + av1_denoiser_set_noise_level(cpi, ne->level); +#endif + } + } +#if CONFIG_AV1_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) + copy_frame(&cpi->denoiser.last_source, cpi->source); +#endif +} diff --git a/third_party/aom/av1/encoder/av1_noise_estimate.h b/third_party/aom/av1/encoder/av1_noise_estimate.h new file mode 100644 index 0000000000..85530666f6 --- /dev/null +++ b/third_party/aom/av1/encoder/av1_noise_estimate.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_AV1_NOISE_ESTIMATE_H_ +#define AOM_AV1_ENCODER_AV1_NOISE_ESTIMATE_H_ + +#include "av1/encoder/block.h" +#include "aom_scale/yv12config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_VAR_HIST_BINS 20 + +typedef enum noise_level { kLowLow, kLow, kMedium, kHigh } NOISE_LEVEL; + +typedef struct noise_estimate { + int enabled; + NOISE_LEVEL level; + int value; + int thresh; + int adapt_thresh; + int count; + int last_w; + int last_h; + int num_frames_estimate; +} NOISE_ESTIMATE; + +struct AV1_COMP; + +void av1_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height); + +NOISE_LEVEL av1_noise_estimate_extract_level(NOISE_ESTIMATE *const ne); + +void av1_update_noise_estimate(struct AV1_COMP *const cpi); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_AV1_NOISE_ESTIMATE_H_ diff --git a/third_party/aom/av1/encoder/av1_quantize.c b/third_party/aom/av1/encoder/av1_quantize.c new file mode 100644 index 0000000000..110d17f434 --- /dev/null +++ b/third_party/aom/av1/encoder/av1_quantize.c @@ -0,0 +1,917 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <math.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/quantize.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/bitops.h" +#include "aom_ports/mem.h" + +#include "av1/common/idct.h" +#include "av1/common/quant_common.h" +#include "av1/common/scan.h" +#include "av1/common/seg_common.h" + +#include "av1/encoder/av1_quantize.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/rd.h" + +void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) { + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + *eob_ptr = 0; +} + +int av1_quantize_fp_no_qmatrix(const int16_t quant_ptr[2], + const int16_t dequant_ptr[2], + const int16_t round_ptr[2], int log_scale, + const int16_t *scan, int coeff_count, + const tran_low_t *coeff_ptr, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr) { + memset(qcoeff_ptr, 0, coeff_count * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, coeff_count * sizeof(*dqcoeff_ptr)); + const int rounding[2] = { ROUND_POWER_OF_TWO(round_ptr[0], log_scale), + ROUND_POWER_OF_TWO(round_ptr[1], log_scale) }; + int eob = 0; + for (int i = 0; i < coeff_count; i++) { + const int rc = scan[i]; + const int32_t thresh = (int32_t)(dequant_ptr[rc != 0]); + const int coeff = coeff_ptr[rc]; + const int coeff_sign = AOMSIGN(coeff); + int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int tmp32 = 0; + if ((abs_coeff << (1 + log_scale)) >= thresh) { + abs_coeff = clamp64(abs_coeff + rounding[rc != 0], INT16_MIN, INT16_MAX); + tmp32 = (int)((abs_coeff * quant_ptr[rc != 0]) >> (16 - log_scale)); + if (tmp32) { + qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; + const tran_low_t abs_dqcoeff = + (tmp32 * dequant_ptr[rc != 0]) >> log_scale; + dqcoeff_ptr[rc] = (abs_dqcoeff ^ coeff_sign) - coeff_sign; + } + } + if (tmp32) eob = i + 1; + } + return
eob; +} + +static void quantize_fp_helper_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, int log_scale) { + int i, eob = -1; + const int rounding[2] = { ROUND_POWER_OF_TWO(round_ptr[0], log_scale), + ROUND_POWER_OF_TWO(round_ptr[1], log_scale) }; + // TODO(jingning) Decide the need of these arguments after the + // quantization process is completed. + (void)zbin_ptr; + (void)quant_shift_ptr; + (void)iscan; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (qm_ptr == NULL && iqm_ptr == NULL) { + *eob_ptr = av1_quantize_fp_no_qmatrix(quant_ptr, dequant_ptr, round_ptr, + log_scale, scan, (int)n_coeffs, + coeff_ptr, qcoeff_ptr, dqcoeff_ptr); + } else { + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const qm_val_t wt = qm_ptr ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const qm_val_t iwt = iqm_ptr ? iqm_ptr[rc] : (1 << AOM_QM_BITS); + const int dequant = + (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> + AOM_QM_BITS; + const int coeff_sign = AOMSIGN(coeff); + int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int tmp32 = 0; + if (abs_coeff * wt >= + (dequant_ptr[rc != 0] << (AOM_QM_BITS - (1 + log_scale)))) { + abs_coeff += rounding[rc != 0]; + abs_coeff = clamp64(abs_coeff, INT16_MIN, INT16_MAX); + tmp32 = (int)((abs_coeff * wt * quant_ptr[rc != 0]) >> + (16 - log_scale + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; + const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale; + dqcoeff_ptr[rc] = (abs_dqcoeff ^ coeff_sign) - coeff_sign; + } + + if (tmp32) eob = i; + } + *eob_ptr = eob + 1; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void highbd_quantize_fp_helper_c( + const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, int log_scale) { + int i; + int eob = -1; + const int shift = 16 - log_scale; + // TODO(jingning) Decide the need of these arguments after the + // quantization process is completed. + (void)zbin_ptr; + (void)quant_shift_ptr; + (void)iscan; + + if (qm_ptr || iqm_ptr) { + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const qm_val_t iwt = iqm_ptr != NULL ? 
iqm_ptr[rc] : (1 << AOM_QM_BITS); + const int dequant = + (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> + AOM_QM_BITS; + const int coeff_sign = AOMSIGN(coeff); + const int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int abs_qcoeff = 0; + if (abs_coeff * wt >= + (dequant_ptr[rc != 0] << (AOM_QM_BITS - (1 + log_scale)))) { + const int64_t tmp = + abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale); + abs_qcoeff = + (int)((tmp * quant_ptr[rc != 0] * wt) >> (shift + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale; + dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + if (abs_qcoeff) eob = i; + } else { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + } + } else { + const int log_scaled_round_arr[2] = { + ROUND_POWER_OF_TWO(round_ptr[0], log_scale), + ROUND_POWER_OF_TWO(round_ptr[1], log_scale), + }; + for (i = 0; i < count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int rc01 = (rc != 0); + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int log_scaled_round = log_scaled_round_arr[rc01]; + if ((abs_coeff << (1 + log_scale)) >= dequant_ptr[rc01]) { + const int quant = quant_ptr[rc01]; + const int dequant = dequant_ptr[rc01]; + const int64_t tmp = (int64_t)abs_coeff + log_scaled_round; + const int abs_qcoeff = (int)((tmp * quant) >> shift); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale; + if (abs_qcoeff) eob = i; + dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + } else { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + } + } + *eob_ptr = eob + 1; +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +void av1_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 0); +} + +void av1_quantize_lp_c(const int16_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *round_ptr, const int16_t *quant_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + (void)iscan; + int eob = -1; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. 
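/* Illustrative aside (not part of the patch): a worked example of the
 * scalar "fp" quantizer math used in the loop that follows, with an
 * illustrative AC step size of 64. The quant table holds the 16-bit
 * reciprocal (1 << 16) / step and the fp rounding term is half a step,
 * matching how av1_build_quantizer() further down fills y_quant_fp and
 * y_round_fp: */
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int16_t dequant = 64;                /* illustrative step size */
  const int16_t quant = (1 << 16) / dequant; /* 1024, like y_quant_fp  */
  const int16_t round = dequant / 2;         /* 32, like y_round_fp    */
  const int coeff = 150;
  const int level = ((coeff + round) * quant) >> 16; /* (182*1024)>>16 */
  printf("level=%d recon=%d\n", level, level * dequant); /* 2, 128 */
  return 0;
}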
+ for (int i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = (tmp * quant_ptr[rc != 0]) >> 16; + + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + + if (tmp) eob = i; + } + *eob_ptr = eob + 1; +} + +void av1_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 1); +} + +void av1_quantize_fp_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 2); +} + +void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, const QUANT_PARAM *qparam) { + const qm_val_t *qm_ptr = qparam->qmatrix; + const qm_val_t *iqm_ptr = qparam->iqmatrix; + if (qm_ptr != NULL && iqm_ptr != NULL) { + quantize_fp_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); + } else { + switch (qparam->log_scale) { + case 0: + av1_quantize_fp(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); + break; + case 1: + av1_quantize_fp_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); + break; + case 2: + av1_quantize_fp_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); + break; + default: assert(0); + } + } +} + +void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, const QUANT_PARAM *qparam) { + const qm_val_t *qm_ptr = qparam->qmatrix; + const qm_val_t *iqm_ptr = qparam->iqmatrix; +#if !CONFIG_REALTIME_ONLY + if (qparam->use_quant_b_adapt) { + // TODO(sarahparker) These quantize_b optimizations need SIMD + // implementations + if (qm_ptr != NULL && iqm_ptr != NULL) { + aom_quantize_b_adaptive_helper_c( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, + sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); + } else { 
+ switch (qparam->log_scale) { + case 0: + aom_quantize_b_adaptive(coeff_ptr, n_coeffs, p->zbin_QTX, + p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, + p->dequant_QTX, eob_ptr, sc->scan, sc->iscan); + break; + case 1: + aom_quantize_b_32x32_adaptive( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); + break; + case 2: + aom_quantize_b_64x64_adaptive( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); + break; + default: assert(0); + } + } + return; + } +#endif // !CONFIG_REALTIME_ONLY + + if (qm_ptr != NULL && iqm_ptr != NULL) { + aom_quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); + } else { + switch (qparam->log_scale) { + case 0: + aom_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); + break; + case 1: + aom_quantize_b_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); + break; + case 2: + aom_quantize_b_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); + break; + default: assert(0); + } + } +} + +static void quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t quant, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, + uint16_t *eob_ptr, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, const int log_scale) { + const int rc = 0; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int64_t tmp; + int eob = -1; + int32_t tmp32; + int dequant; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + const int wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const int iwt = iqm_ptr != NULL ? 
iqm_ptr[rc] : (1 << AOM_QM_BITS); + tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale), + INT16_MIN, INT16_MAX); + tmp32 = (int32_t)((tmp * wt * quant) >> (16 - log_scale + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; + dequant = (dequant_ptr * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale; + dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + if (tmp32) eob = 0; + } + *eob_ptr = eob + 1; +} + +void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, const QUANT_PARAM *qparam) { + // obsolete skip_block + const int skip_block = 0; + (void)sc; + assert(qparam->log_scale >= 0 && qparam->log_scale < (3)); + const qm_val_t *qm_ptr = qparam->qmatrix; + const qm_val_t *iqm_ptr = qparam->iqmatrix; + quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round_QTX, + p->quant_fp_QTX[0], qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX[0], + eob_ptr, qm_ptr, iqm_ptr, qparam->log_scale); +} + +#if CONFIG_AV1_HIGHBITDEPTH +void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr, + intptr_t n_coeffs, const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam) { + const qm_val_t *qm_ptr = qparam->qmatrix; + const qm_val_t *iqm_ptr = qparam->iqmatrix; + if (qm_ptr != NULL && iqm_ptr != NULL) { + highbd_quantize_fp_helper_c( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, + sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); + } else { + av1_highbd_quantize_fp(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan, qparam->log_scale); + } +} + +void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr, + intptr_t n_coeffs, const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam) { + const qm_val_t *qm_ptr = qparam->qmatrix; + const qm_val_t *iqm_ptr = qparam->iqmatrix; +#if !CONFIG_REALTIME_ONLY + if (qparam->use_quant_b_adapt) { + if (qm_ptr != NULL && iqm_ptr != NULL) { + aom_highbd_quantize_b_adaptive_helper_c( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, + sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); + } else { + switch (qparam->log_scale) { + case 0: + aom_highbd_quantize_b_adaptive( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); + break; + case 1: + aom_highbd_quantize_b_32x32_adaptive( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); + break; + case 2: + aom_highbd_quantize_b_64x64_adaptive( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); + break; + default: assert(0); + } + } + return; + } +#endif // !CONFIG_REALTIME_ONLY + + if (qm_ptr != NULL && iqm_ptr != NULL) { + aom_highbd_quantize_b_helper_c( + 
coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, + sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); + } else { + switch (qparam->log_scale) { + case 0: + aom_highbd_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); + break; + case 1: + aom_highbd_quantize_b_32x32( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); + break; + case 2: + aom_highbd_quantize_b_64x64( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); + break; + default: assert(0); + } + } +} + +static INLINE void highbd_quantize_dc( + const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, + const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr, + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, const int log_scale) { + int eob = -1; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[0] : (1 << AOM_QM_BITS); + const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[0] : (1 << AOM_QM_BITS); + const int coeff = coeff_ptr[0]; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], log_scale); + const int64_t tmpw = tmp * wt; + const int abs_qcoeff = + (int)((tmpw * quant) >> (16 - log_scale + AOM_QM_BITS)); + qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + const int dequant = + (dequant_ptr * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + + const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale; + dqcoeff_ptr[0] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + if (abs_qcoeff) eob = 0; + } + *eob_ptr = eob + 1; +} + +void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr, + intptr_t n_coeffs, const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam) { + // obsolete skip_block + const int skip_block = 0; + const qm_val_t *qm_ptr = qparam->qmatrix; + const qm_val_t *iqm_ptr = qparam->iqmatrix; + (void)sc; + + highbd_quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round_QTX, + p->quant_fp_QTX[0], qcoeff_ptr, dqcoeff_ptr, + p->dequant_QTX[0], eob_ptr, qm_ptr, iqm_ptr, + qparam->log_scale); +} + +void av1_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, + int log_scale) { + highbd_quantize_fp_helper_c(coeff_ptr, count, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, + dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, + log_scale); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static void invert_quant(int16_t *quant, int16_t *shift, int d) { + uint32_t t; + int l, m; + t = d; + l = get_msb(t); + m = 1 + (1 << (16 + l)) / d; + *quant = 
(int16_t)(m - (1 << 16)); + *shift = 1 << (16 - l); +} + +static int get_qzbin_factor(int q, aom_bit_depth_t bit_depth) { + const int quant = av1_dc_quant_QTX(q, 0, bit_depth); + switch (bit_depth) { + case AOM_BITS_8: return q == 0 ? 64 : (quant < 148 ? 84 : 80); + case AOM_BITS_10: return q == 0 ? 64 : (quant < 592 ? 84 : 80); + case AOM_BITS_12: return q == 0 ? 64 : (quant < 2368 ? 84 : 80); + default: + assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); + return -1; + } +} + +void av1_build_quantizer(aom_bit_depth_t bit_depth, int y_dc_delta_q, + int u_dc_delta_q, int u_ac_delta_q, int v_dc_delta_q, + int v_ac_delta_q, QUANTS *const quants, + Dequants *const deq) { + int i, q, quant_QTX; + + for (q = 0; q < QINDEX_RANGE; q++) { + const int qzbin_factor = get_qzbin_factor(q, bit_depth); + const int qrounding_factor = q == 0 ? 64 : 48; + + for (i = 0; i < 2; ++i) { + const int qrounding_factor_fp = 64; + // y quantizer with TX scale + quant_QTX = i == 0 ? av1_dc_quant_QTX(q, y_dc_delta_q, bit_depth) + : av1_ac_quant_QTX(q, 0, bit_depth); + invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i], + quant_QTX); + quants->y_quant_fp[q][i] = (1 << 16) / quant_QTX; + quants->y_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7; + quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7); + quants->y_round[q][i] = (qrounding_factor * quant_QTX) >> 7; + deq->y_dequant_QTX[q][i] = quant_QTX; + + // u quantizer with TX scale + quant_QTX = i == 0 ? av1_dc_quant_QTX(q, u_dc_delta_q, bit_depth) + : av1_ac_quant_QTX(q, u_ac_delta_q, bit_depth); + invert_quant(&quants->u_quant[q][i], &quants->u_quant_shift[q][i], + quant_QTX); + quants->u_quant_fp[q][i] = (1 << 16) / quant_QTX; + quants->u_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7; + quants->u_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7); + quants->u_round[q][i] = (qrounding_factor * quant_QTX) >> 7; + deq->u_dequant_QTX[q][i] = quant_QTX; + + // v quantizer with TX scale + quant_QTX = i == 0 ? 
av1_dc_quant_QTX(q, v_dc_delta_q, bit_depth) + : av1_ac_quant_QTX(q, v_ac_delta_q, bit_depth); + invert_quant(&quants->v_quant[q][i], &quants->v_quant_shift[q][i], + quant_QTX); + quants->v_quant_fp[q][i] = (1 << 16) / quant_QTX; + quants->v_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7; + quants->v_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7); + quants->v_round[q][i] = (qrounding_factor * quant_QTX) >> 7; + deq->v_dequant_QTX[q][i] = quant_QTX; + } + + for (i = 2; i < 8; i++) { // 8: SIMD width + quants->y_quant[q][i] = quants->y_quant[q][1]; + quants->y_quant_fp[q][i] = quants->y_quant_fp[q][1]; + quants->y_round_fp[q][i] = quants->y_round_fp[q][1]; + quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1]; + quants->y_zbin[q][i] = quants->y_zbin[q][1]; + quants->y_round[q][i] = quants->y_round[q][1]; + deq->y_dequant_QTX[q][i] = deq->y_dequant_QTX[q][1]; + + quants->u_quant[q][i] = quants->u_quant[q][1]; + quants->u_quant_fp[q][i] = quants->u_quant_fp[q][1]; + quants->u_round_fp[q][i] = quants->u_round_fp[q][1]; + quants->u_quant_shift[q][i] = quants->u_quant_shift[q][1]; + quants->u_zbin[q][i] = quants->u_zbin[q][1]; + quants->u_round[q][i] = quants->u_round[q][1]; + deq->u_dequant_QTX[q][i] = deq->u_dequant_QTX[q][1]; + + quants->v_quant[q][i] = quants->v_quant[q][1]; + quants->v_quant_fp[q][i] = quants->v_quant_fp[q][1]; + quants->v_round_fp[q][i] = quants->v_round_fp[q][1]; + quants->v_quant_shift[q][i] = quants->v_quant_shift[q][1]; + quants->v_zbin[q][i] = quants->v_zbin[q][1]; + quants->v_round[q][i] = quants->v_round[q][1]; + deq->v_dequant_QTX[q][i] = deq->v_dequant_QTX[q][1]; + } + } +} + +static INLINE bool deltaq_params_have_changed( + const DeltaQuantParams *prev_deltaq_params, + const CommonQuantParams *quant_params) { + return (prev_deltaq_params->y_dc_delta_q != quant_params->y_dc_delta_q || + prev_deltaq_params->u_dc_delta_q != quant_params->u_dc_delta_q || + prev_deltaq_params->v_dc_delta_q != quant_params->v_dc_delta_q || + prev_deltaq_params->u_ac_delta_q != quant_params->u_ac_delta_q || + prev_deltaq_params->v_ac_delta_q != quant_params->v_ac_delta_q); +} + +void av1_init_quantizer(EncQuantDequantParams *const enc_quant_dequant_params, + const CommonQuantParams *quant_params, + aom_bit_depth_t bit_depth) { + DeltaQuantParams *const prev_deltaq_params = + &enc_quant_dequant_params->prev_deltaq_params; + + // Re-initialize the quantizer only if any of the dc/ac deltaq parameters + // change. + if (!deltaq_params_have_changed(prev_deltaq_params, quant_params)) return; + QUANTS *const quants = &enc_quant_dequant_params->quants; + Dequants *const dequants = &enc_quant_dequant_params->dequants; + av1_build_quantizer(bit_depth, quant_params->y_dc_delta_q, + quant_params->u_dc_delta_q, quant_params->u_ac_delta_q, + quant_params->v_dc_delta_q, quant_params->v_ac_delta_q, + quants, dequants); + + // Record the state of deltaq parameters. 
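/* Illustrative aside (not part of the patch): what invert_quant() above
 * precomputes. With l = floor(log2(d)) and m = 1 + 2^(16+l)/d, the
 * two-stage multiply used by the aom_quantize_b() family,
 *   ((((x * (m - 2^16)) >> 16) + x) * 2^(16-l)) >> 16,
 * approximates x / d while keeping the stored quant in int16 range.
 * Standalone numeric check (relies on arithmetic right shift of
 * negative values, as the encoder itself does): */
#include <assert.h>
#include <stdint.h>

static int msb(uint32_t v) { /* floor(log2(v)) for v > 0 */
  int l = 0;
  while (v >>= 1) ++l;
  return l;
}

int main(void) {
  const int d = 44;                               /* example step size */
  const int l = msb(d);                           /* 5 */
  const int m = 1 + (1 << (16 + l)) / d;          /* 47663 */
  const int16_t quant = (int16_t)(m - (1 << 16)); /* -17873 */
  const int16_t shift = (int16_t)(1 << (16 - l)); /* 2048 */
  const int x = 1000;
  /* ((x * quant) >> 16) + x reconstructs (x * m) >> 16 from the int16. */
  const int64_t t = ((((int64_t)x * quant) >> 16) + x) * shift;
  assert((int)(t >> 16) == x / d); /* 22 == 22 */
  return 0;
}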
+ prev_deltaq_params->y_dc_delta_q = quant_params->y_dc_delta_q; + prev_deltaq_params->u_dc_delta_q = quant_params->u_dc_delta_q; + prev_deltaq_params->v_dc_delta_q = quant_params->v_dc_delta_q; + prev_deltaq_params->u_ac_delta_q = quant_params->u_ac_delta_q; + prev_deltaq_params->v_ac_delta_q = quant_params->v_ac_delta_q; +} + +void av1_set_q_index(const EncQuantDequantParams *enc_quant_dequant_params, + int qindex, MACROBLOCK *x) { + const QUANTS *const quants = &enc_quant_dequant_params->quants; + const Dequants *const dequants = &enc_quant_dequant_params->dequants; + x->qindex = qindex; + x->seg_skip_block = + 0; // TODO(angiebird): Find a proper place to init this variable. + + // Y + x->plane[0].quant_QTX = quants->y_quant[qindex]; + x->plane[0].quant_fp_QTX = quants->y_quant_fp[qindex]; + x->plane[0].round_fp_QTX = quants->y_round_fp[qindex]; + x->plane[0].quant_shift_QTX = quants->y_quant_shift[qindex]; + x->plane[0].zbin_QTX = quants->y_zbin[qindex]; + x->plane[0].round_QTX = quants->y_round[qindex]; + x->plane[0].dequant_QTX = dequants->y_dequant_QTX[qindex]; + + // U + x->plane[1].quant_QTX = quants->u_quant[qindex]; + x->plane[1].quant_fp_QTX = quants->u_quant_fp[qindex]; + x->plane[1].round_fp_QTX = quants->u_round_fp[qindex]; + x->plane[1].quant_shift_QTX = quants->u_quant_shift[qindex]; + x->plane[1].zbin_QTX = quants->u_zbin[qindex]; + x->plane[1].round_QTX = quants->u_round[qindex]; + x->plane[1].dequant_QTX = dequants->u_dequant_QTX[qindex]; + + // V + x->plane[2].quant_QTX = quants->v_quant[qindex]; + x->plane[2].quant_fp_QTX = quants->v_quant_fp[qindex]; + x->plane[2].round_fp_QTX = quants->v_round_fp[qindex]; + x->plane[2].quant_shift_QTX = quants->v_quant_shift[qindex]; + x->plane[2].zbin_QTX = quants->v_zbin[qindex]; + x->plane[2].round_QTX = quants->v_round[qindex]; + x->plane[2].dequant_QTX = dequants->v_dequant_QTX[qindex]; +} + +void av1_set_qmatrix(const CommonQuantParams *quant_params, int segment_id, + MACROBLOCKD *xd) { + const int use_qmatrix = av1_use_qmatrix(quant_params, xd, segment_id); + const int qmlevel_y = + use_qmatrix ? quant_params->qmatrix_level_y : NUM_QM_LEVELS - 1; + const int qmlevel_u = + use_qmatrix ? quant_params->qmatrix_level_u : NUM_QM_LEVELS - 1; + const int qmlevel_v = + use_qmatrix ? quant_params->qmatrix_level_v : NUM_QM_LEVELS - 1; + const int qmlevel_ls[MAX_MB_PLANE] = { qmlevel_y, qmlevel_u, qmlevel_v }; + for (int i = 0; i < MAX_MB_PLANE; ++i) { + const int qmlevel = qmlevel_ls[i]; + memcpy(&xd->plane[i].seg_qmatrix[segment_id], + quant_params->gqmatrix[qmlevel][i], + sizeof(quant_params->gqmatrix[qmlevel][i])); + memcpy(&xd->plane[i].seg_iqmatrix[segment_id], + quant_params->giqmatrix[qmlevel][i], + sizeof(quant_params->giqmatrix[qmlevel][i])); + } +} + +void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x, + int segment_id, const int do_update) { + const AV1_COMMON *const cm = &cpi->common; + const CommonQuantParams *const quant_params = &cm->quant_params; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100)); + const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6); + const FRAME_TYPE frame_type = cm->current_frame.frame_type; + int qindex_rd; + + const int current_qindex = AOMMAX( + 0, + AOMMIN(QINDEX_RANGE - 1, cm->delta_q_info.delta_q_present_flag + ? 
quant_params->base_qindex + x->delta_qindex + : quant_params->base_qindex)); + const int qindex = av1_get_qindex(&cm->seg, segment_id, current_qindex); + + if (cpi->oxcf.sb_qp_sweep) { + const int current_rd_qindex = + AOMMAX(0, AOMMIN(QINDEX_RANGE - 1, cm->delta_q_info.delta_q_present_flag + ? quant_params->base_qindex + + x->rdmult_delta_qindex + : quant_params->base_qindex)); + qindex_rd = av1_get_qindex(&cm->seg, segment_id, current_rd_qindex); + } else { + qindex_rd = qindex; + } + + const int qindex_rdmult = qindex_rd + quant_params->y_dc_delta_q; + const int rdmult = av1_compute_rd_mult( + qindex_rdmult, cm->seq_params->bit_depth, + cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth, + boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets, + is_stat_consumption_stage(cpi)); + + const int qindex_change = x->qindex != qindex; + if (qindex_change || do_update) { + av1_set_q_index(&cpi->enc_quant_dequant_params, qindex, x); + } + + MACROBLOCKD *const xd = &x->e_mbd; + if ((segment_id != x->prev_segment_id) || + av1_use_qmatrix(quant_params, xd, segment_id)) { + av1_set_qmatrix(quant_params, segment_id, xd); + } + + x->seg_skip_block = segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP); + + av1_set_error_per_bit(&x->errorperbit, rdmult); + av1_set_sad_per_bit(cpi, &x->sadperbit, qindex_rd); + + x->prev_segment_id = segment_id; +} + +void av1_frame_init_quantizer(AV1_COMP *cpi) { + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + x->prev_segment_id = -1; + av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id, 1); +} + +static int adjust_hdr_cb_deltaq(int base_qindex) { + double baseQp = base_qindex / QP_SCALE_FACTOR; + const double chromaQp = CHROMA_QP_SCALE * baseQp + CHROMA_QP_OFFSET; + const double dcbQP = CHROMA_CB_QP_SCALE * chromaQp * QP_SCALE_FACTOR; + int dqpCb = (int)(dcbQP + (dcbQP < 0 ? -0.5 : 0.5)); + dqpCb = AOMMIN(0, dqpCb); + dqpCb = (int)CLIP(dqpCb, -12 * QP_SCALE_FACTOR, 12 * QP_SCALE_FACTOR); + return dqpCb; +} + +static int adjust_hdr_cr_deltaq(int base_qindex) { + double baseQp = base_qindex / QP_SCALE_FACTOR; + const double chromaQp = CHROMA_QP_SCALE * baseQp + CHROMA_QP_OFFSET; + const double dcrQP = CHROMA_CR_QP_SCALE * chromaQp * QP_SCALE_FACTOR; + int dqpCr = (int)(dcrQP + (dcrQP < 0 ? -0.5 : 0.5)); + dqpCr = AOMMIN(0, dqpCr); + dqpCr = (int)CLIP(dqpCr, -12 * QP_SCALE_FACTOR, 12 * QP_SCALE_FACTOR); + return dqpCr; +} + +void av1_set_quantizer(AV1_COMMON *const cm, int min_qmlevel, int max_qmlevel, + int q, int enable_chroma_deltaq, int enable_hdr_deltaq) { + // quantizer has to be reinitialized with av1_init_quantizer() if any + // delta_q changes. 
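+ // (Note: the AOMMAX below keeps base_qindex at least 1 whenever delta_q is + // present; a base qindex of 0 would otherwise signal lossless coding.)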
+ CommonQuantParams *quant_params = &cm->quant_params; + quant_params->base_qindex = AOMMAX(cm->delta_q_info.delta_q_present_flag, q); + quant_params->y_dc_delta_q = 0; + + if (enable_chroma_deltaq) { + // TODO(aomedia:2717): need to design better delta + quant_params->u_dc_delta_q = 2; + quant_params->u_ac_delta_q = 2; + quant_params->v_dc_delta_q = 2; + quant_params->v_ac_delta_q = 2; + } else { + quant_params->u_dc_delta_q = 0; + quant_params->u_ac_delta_q = 0; + quant_params->v_dc_delta_q = 0; + quant_params->v_ac_delta_q = 0; + } + + // following section 8.3.2 in T-REC-H.Sup15 document + // to apply to AV1 qindex in the range of [0, 255] + if (enable_hdr_deltaq) { + int dqpCb = adjust_hdr_cb_deltaq(quant_params->base_qindex); + int dqpCr = adjust_hdr_cr_deltaq(quant_params->base_qindex); + quant_params->u_dc_delta_q = quant_params->u_ac_delta_q = dqpCb; + quant_params->v_dc_delta_q = quant_params->v_ac_delta_q = dqpCr; + if (dqpCb != dqpCr) { + cm->seq_params->separate_uv_delta_q = 1; + } + } + + quant_params->qmatrix_level_y = + aom_get_qmlevel(quant_params->base_qindex, min_qmlevel, max_qmlevel); + quant_params->qmatrix_level_u = + aom_get_qmlevel(quant_params->base_qindex + quant_params->u_ac_delta_q, + min_qmlevel, max_qmlevel); + + if (!cm->seq_params->separate_uv_delta_q) + quant_params->qmatrix_level_v = quant_params->qmatrix_level_u; + else + quant_params->qmatrix_level_v = + aom_get_qmlevel(quant_params->base_qindex + quant_params->v_ac_delta_q, + min_qmlevel, max_qmlevel); +} + +// Table that converts 0-63 Q-range values passed in outside to the Qindex +// range used internally. +static const int quantizer_to_qindex[] = { + 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, + 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, + 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152, + 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204, + 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 249, 255, +}; + +int av1_quantizer_to_qindex(int quantizer) { + return quantizer_to_qindex[quantizer]; +} + +int av1_qindex_to_quantizer(int qindex) { + int quantizer; + + for (quantizer = 0; quantizer < 64; ++quantizer) + if (quantizer_to_qindex[quantizer] >= qindex) return quantizer; + + return 63; +} diff --git a/third_party/aom/av1/encoder/av1_quantize.h b/third_party/aom/av1/encoder/av1_quantize.h new file mode 100644 index 0000000000..040973376d --- /dev/null +++ b/third_party/aom/av1/encoder/av1_quantize.h @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_AV1_QUANTIZE_H_ +#define AOM_AV1_ENCODER_AV1_QUANTIZE_H_ + +#include "config/aom_config.h" + +#include "av1/common/quant_common.h" +#include "av1/common/scan.h" +#include "av1/encoder/block.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct QUANT_PARAM { + int log_scale; + TX_SIZE tx_size; + const qm_val_t *qmatrix; + const qm_val_t *iqmatrix; + int use_quant_b_adapt; + int use_optimize_b; + int xform_quant_idx; +} QUANT_PARAM; + +typedef void (*AV1_QUANT_FACADE)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam); + +// The QUANTS structure is used only for internal quantizer setup in +// av1_quantize.c. +// All of its fields use the same coefficient shift/scaling at TX. +typedef struct { + // 0: dc 1: ac 2-8: ac repeated to SIMD width + DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, y_zbin[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, y_round[QINDEX_RANGE][8]); + + // TODO(jingning): in progress of re-working the quantization. will decide + // if we want to deprecate the current use of y_quant. + DECLARE_ALIGNED(16, int16_t, y_quant_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, u_quant_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, v_quant_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, y_round_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, u_round_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, v_round_fp[QINDEX_RANGE][8]); + + DECLARE_ALIGNED(16, int16_t, u_quant[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, v_quant[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, u_quant_shift[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, v_quant_shift[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, u_zbin[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, v_zbin[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, u_round[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, v_round[QINDEX_RANGE][8]); +} QUANTS; + +// The Dequants structure is used only for internal quantizer setup in +// av1_quantize.c. +// Fields are suffixed according to whether or not they're expressed in +// the same coefficient shift/precision as TX or a fixed Q3 format. +typedef struct { + DECLARE_ALIGNED(16, int16_t, + y_dequant_QTX[QINDEX_RANGE][8]); // 8: SIMD width + DECLARE_ALIGNED(16, int16_t, + u_dequant_QTX[QINDEX_RANGE][8]); // 8: SIMD width + DECLARE_ALIGNED(16, int16_t, + v_dequant_QTX[QINDEX_RANGE][8]); // 8: SIMD width +} Dequants; + +// The DeltaQuantParams structure holds the dc/ac deltaq parameters. +typedef struct { + int y_dc_delta_q; + int u_dc_delta_q; + int u_ac_delta_q; + int v_dc_delta_q; + int v_ac_delta_q; +} DeltaQuantParams; + +typedef struct { + // Quantization parameters for internal quantizer setup. + QUANTS quants; + // Dequantization parameters for internal quantizer setup. + Dequants dequants; + // Deltaq parameters to track the state of the dc/ac deltaq parameters in + // cm->quant_params. It is used to decide whether the quantizer tables need + // to be re-initialized. 
+ DeltaQuantParams prev_deltaq_params; +} EncQuantDequantParams; + +struct AV1_COMP; +struct AV1Common; + +void av1_frame_init_quantizer(struct AV1_COMP *cpi); + +void av1_init_plane_quantizers(const struct AV1_COMP *cpi, MACROBLOCK *x, + int segment_id, const int do_update); + +void av1_build_quantizer(aom_bit_depth_t bit_depth, int y_dc_delta_q, + int u_dc_delta_q, int u_ac_delta_q, int v_dc_delta_q, + int v_ac_delta_q, QUANTS *const quants, + Dequants *const deq); + +void av1_init_quantizer(EncQuantDequantParams *const enc_quant_dequant_params, + const CommonQuantParams *quant_params, + aom_bit_depth_t bit_depth); + +void av1_set_quantizer(struct AV1Common *const cm, int min_qmlevel, + int max_qmlevel, int q, int enable_chroma_deltaq, + int enable_hdr_deltaq); + +int av1_quantizer_to_qindex(int quantizer); + +int av1_qindex_to_quantizer(int qindex); + +void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr); + +/*!\brief Quantize transform coefficients without using qmatrix + * + * quant_ptr, dequant_ptr and round_ptr are size 2 arrays, + * where index 0 corresponds to dc coeff and index 1 corresponds to ac coeffs. + * + * \param[in] quant_ptr 16-bit fixed point representation of inverse + * quantize step size, i.e. 2^16/dequant + * \param[in] dequant_ptr quantize step size + * \param[in] round_ptr rounding + * \param[in] log_scale the relative log scale of the transform + * coefficients + * \param[in] scan scan[i] indicates the position of ith to-be-coded + * coefficient + * \param[in] coeff_count number of coefficients + * \param[out] qcoeff_ptr quantized coefficients + * \param[out] dqcoeff_ptr dequantized coefficients + * + * \return The last non-zero coefficient's scan index plus 1 + */ +int av1_quantize_fp_no_qmatrix(const int16_t quant_ptr[2], + const int16_t dequant_ptr[2], + const int16_t round_ptr[2], int log_scale, + const int16_t *scan, int coeff_count, + const tran_low_t *coeff_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr); + +void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, const QUANT_PARAM *qparam); + +void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, const QUANT_PARAM *qparam); + +void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, const QUANT_PARAM *qparam); + +/*!\brief Update quantize parameters in MACROBLOCK + * + * \param[in] enc_quant_dequant_params This parameter cached the quantize and + * dequantize parameters for all q + * indices. + * \param[in] qindex Quantize index used for the current + * superblock. + * \param[out] x A superblock data structure for + * encoder. + */ +void av1_set_q_index(const EncQuantDequantParams *enc_quant_dequant_params, + int qindex, MACROBLOCK *x); + +/*!\brief Update quantize matrix in MACROBLOCKD based on segment id + * + * \param[in] quant_params Quantize parameters used by encoder and decoder + * \param[in] segment_id Segment id. + * \param[out] xd A superblock data structure used by encoder and + * decoder. 
+ */ + +void av1_set_qmatrix(const CommonQuantParams *quant_params, int segment_id, + MACROBLOCKD *xd); + +#if CONFIG_AV1_HIGHBITDEPTH +void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr, + intptr_t n_coeffs, const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam); + +void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr, + intptr_t n_coeffs, const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam); + +void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr, + intptr_t n_coeffs, const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam); + +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_AV1_QUANTIZE_H_ diff --git a/third_party/aom/av1/encoder/av1_temporal_denoiser.c b/third_party/aom/av1/encoder/av1_temporal_denoiser.c new file mode 100644 index 0000000000..3012df6311 --- /dev/null +++ b/third_party/aom/av1/encoder/av1_temporal_denoiser.c @@ -0,0 +1,805 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <limits.h> +#include <math.h> + +#include "config/aom_dsp_rtcd.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_scale/yv12config.h" +#include "aom/aom_integer.h" +#include "av1/common/reconinter.h" +#include "av1/encoder/reconinter_enc.h" +#include "av1/encoder/context_tree.h" +#include "av1/encoder/av1_temporal_denoiser.h" +#include "av1/encoder/encoder.h" + +#ifdef OUTPUT_YUV_DENOISED +static void make_grayscale(YV12_BUFFER_CONFIG *yuv); +#endif + +static int absdiff_thresh(BLOCK_SIZE bs, int increase_denoising) { + (void)bs; + return 3 + (increase_denoising ? 1 : 0); +} + +static int delta_thresh(BLOCK_SIZE bs, int increase_denoising) { + (void)bs; + (void)increase_denoising; + return 4; +} + +static int noise_motion_thresh(BLOCK_SIZE bs, int increase_denoising) { + (void)bs; + (void)increase_denoising; + return 625; +} + +static unsigned int sse_thresh(BLOCK_SIZE bs, int increase_denoising) { + return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 80 : 40); +} + +static int sse_diff_thresh(BLOCK_SIZE bs, int increase_denoising, + int motion_magnitude) { + if (motion_magnitude > noise_motion_thresh(bs, increase_denoising)) { + if (increase_denoising) + return (1 << num_pels_log2_lookup[bs]) << 2; + else + return 0; + } else { + return (1 << num_pels_log2_lookup[bs]) << 4; + } +} + +static int total_adj_weak_thresh(BLOCK_SIZE bs, int increase_denoising) { + return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 3 : 2); +} + +// TODO(kyslov): If increase_denoising is enabled in the future, +// we might need to update the code for calculating 'total_adj' in +// case the C code is not bit-exact with corresponding sse2 code.
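+// Illustrative, self-contained sketch (not part of upstream libaom) of the
+// per-pixel rule applied by av1_denoiser_filter_c() below, assuming
+// increase_denoising == 0 and low motion magnitude, and ignoring the
+// total_adj budget tracking; the helper name is hypothetical and abs() would
+// come from <stdlib.h>.
+#if 0
+static int example_denoise_pixel(int sig, int mc_avg, const int adj_val[3]) {
+  const int diff = mc_avg - sig;
+  const int absdiff = abs(diff);
+  // A pixel close to the motion-compensated average snaps straight to it.
+  if (absdiff <= 3) return mc_avg;
+  // Larger differences are nudged toward the average by a step that grows
+  // with the difference magnitude.
+  const int adj =
+      absdiff <= 7 ? adj_val[0] : absdiff <= 15 ? adj_val[1] : adj_val[2];
+  const int out = diff > 0 ? sig + adj : sig - adj;
+  return out < 0 ? 0 : out > 255 ? 255 : out;  // clamp to the 8-bit range
+}
+#endif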
+int av1_denoiser_filter_c(const uint8_t *sig, int sig_stride, + const uint8_t *mc_avg, int mc_avg_stride, + uint8_t *avg, int avg_stride, int increase_denoising, + BLOCK_SIZE bs, int motion_magnitude) { + int r, c; + const uint8_t *sig_start = sig; + const uint8_t *mc_avg_start = mc_avg; + uint8_t *avg_start = avg; + int diff, adj, absdiff, delta; + int adj_val[] = { 3, 4, 6 }; + int total_adj = 0; + int shift_inc = 1; + + // If motion_magnitude is small, make the denoiser more aggressive by + // increasing the adjustment for each level. Add another increment for + // blocks that are labeled for increased denoising. + if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) { + if (increase_denoising) { + shift_inc = 2; + } + adj_val[0] += shift_inc; + adj_val[1] += shift_inc; + adj_val[2] += shift_inc; + } + + // First attempt to apply a strong temporal denoising filter. + for (r = 0; r < block_size_high[bs]; ++r) { + for (c = 0; c < block_size_wide[bs]; ++c) { + diff = mc_avg[c] - sig[c]; + absdiff = abs(diff); + + if (absdiff <= absdiff_thresh(bs, increase_denoising)) { + avg[c] = mc_avg[c]; + total_adj += diff; + } else { + switch (absdiff) { + case 4: + case 5: + case 6: + case 7: adj = adj_val[0]; break; + case 8: + case 9: + case 10: + case 11: + case 12: + case 13: + case 14: + case 15: adj = adj_val[1]; break; + default: adj = adj_val[2]; + } + if (diff > 0) { + avg[c] = AOMMIN(UINT8_MAX, sig[c] + adj); + total_adj += adj; + } else { + avg[c] = AOMMAX(0, sig[c] - adj); + total_adj -= adj; + } + } + } + sig += sig_stride; + avg += avg_stride; + mc_avg += mc_avg_stride; + } + + // If the strong filter did not modify the signal too much, we're all set. + if (abs(total_adj) <= total_adj_strong_thresh(bs, increase_denoising)) { + return FILTER_BLOCK; + } + + // Otherwise, we try to dampen the filter if the delta is not too high. + delta = ((abs(total_adj) - total_adj_strong_thresh(bs, increase_denoising)) >> + num_pels_log2_lookup[bs]) + + 1; + + if (delta >= delta_thresh(bs, increase_denoising)) { + return COPY_BLOCK; + } + + mc_avg = mc_avg_start; + avg = avg_start; + sig = sig_start; + for (r = 0; r < block_size_high[bs]; ++r) { + for (c = 0; c < block_size_wide[bs]; ++c) { + diff = mc_avg[c] - sig[c]; + adj = abs(diff); + if (adj > delta) { + adj = delta; + } + if (diff > 0) { + // Diff positive means we made a positive adjustment in the first + // pass, so now make a negative adjustment to bring the denoised + // signal down. + avg[c] = AOMMAX(0, avg[c] - adj); + total_adj -= adj; + } else { + // Diff negative means we made a negative adjustment in the first + // pass, so now make a positive adjustment to bring the denoised + // signal up.
+ avg[c] = AOMMIN(UINT8_MAX, avg[c] + adj); + total_adj += adj; + } + } + sig += sig_stride; + avg += avg_stride; + mc_avg += mc_avg_stride; + } + + // We can use the filter if it has been sufficiently dampened + if (abs(total_adj) <= total_adj_weak_thresh(bs, increase_denoising)) { + return FILTER_BLOCK; + } + return COPY_BLOCK; +} + +static uint8_t *block_start(uint8_t *framebuf, int stride, int mi_row, + int mi_col) { + return framebuf + (stride * mi_row << 2) + (mi_col << 2); +} + +static AV1_DENOISER_DECISION perform_motion_compensation( + AV1_COMMON *const cm, AV1_DENOISER *denoiser, MACROBLOCK *mb, BLOCK_SIZE bs, + int increase_denoising, int mi_row, int mi_col, PICK_MODE_CONTEXT *ctx, + int motion_magnitude, int *zeromv_filter, int num_spatial_layers, int width, + int lst_fb_idx, int gld_fb_idx, int use_svc, int spatial_layer, + int use_gf_temporal_ref) { + const int sse_diff = (ctx->newmv_sse == UINT_MAX) + ? 0 + : ((int)ctx->zeromv_sse - (int)ctx->newmv_sse); + int frame; + int denoise_layer_idx = 0; + MACROBLOCKD *filter_mbd = &mb->e_mbd; + MB_MODE_INFO *mi = filter_mbd->mi[0]; + MB_MODE_INFO saved_mi; + int i; + struct buf_2d saved_dst[MAX_MB_PLANE]; + struct buf_2d saved_pre[MAX_MB_PLANE]; + // const RefBuffer *saved_block_refs[2]; + MV_REFERENCE_FRAME saved_frame; + + frame = ctx->best_reference_frame; + + saved_mi = *mi; + + // Avoid denoising small blocks. When noise > kDenLow or frame width > 480, + // denoise 16x16 blocks. + if (bs == BLOCK_8X8 || bs == BLOCK_8X16 || bs == BLOCK_16X8 || + (bs == BLOCK_16X16 && width > 480 && + denoiser->denoising_level <= kDenLow)) + return COPY_BLOCK; + + // If the best reference frame uses inter-prediction and there is enough of a + // difference in sum-squared-error, use it. + if (frame != INTRA_FRAME && frame != ALTREF_FRAME && frame != GOLDEN_FRAME && + sse_diff > sse_diff_thresh(bs, increase_denoising, motion_magnitude)) { + mi->ref_frame[0] = ctx->best_reference_frame; + mi->mode = ctx->best_sse_inter_mode; + mi->mv[0] = ctx->best_sse_mv; + } else { + // Otherwise, use the zero reference frame. + frame = ctx->best_zeromv_reference_frame; + ctx->newmv_sse = ctx->zeromv_sse; + // Bias to last reference. + if ((num_spatial_layers > 1 && !use_gf_temporal_ref) || + frame == ALTREF_FRAME || + (frame == GOLDEN_FRAME && use_gf_temporal_ref) || + (frame != LAST_FRAME && + ((ctx->zeromv_lastref_sse < (5 * ctx->zeromv_sse) >> 2) || + denoiser->denoising_level >= kDenHigh))) { + frame = LAST_FRAME; + ctx->newmv_sse = ctx->zeromv_lastref_sse; + } + mi->ref_frame[0] = frame; + mi->mode = GLOBALMV; + mi->mv[0].as_int = 0; + ctx->best_sse_inter_mode = GLOBALMV; + ctx->best_sse_mv.as_int = 0; + *zeromv_filter = 1; + if (denoiser->denoising_level > kDenMedium) { + motion_magnitude = 0; + } + } + + saved_frame = frame; + // When using SVC, we need to map REF_FRAME to the frame buffer index. + if (use_svc) { + if (frame == LAST_FRAME) + frame = lst_fb_idx + 1; + else if (frame == GOLDEN_FRAME) + frame = gld_fb_idx + 1; + // Shift for the second spatial layer. + if (num_spatial_layers - spatial_layer == 2) + frame = frame + denoiser->num_ref_frames; + denoise_layer_idx = num_spatial_layers - spatial_layer - 1; + } + + // Force copy (no denoise, copy source in denoised buffer) if + // running_avg_y[frame] is NULL. 
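+ // (The buffer can be NULL here when the mapped reference slot has never + // been refreshed, so no denoiser buffer has been allocated for it yet.)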
+ if (denoiser->running_avg_y[frame].buffer_alloc == NULL) { + // Restore everything to its original state + *mi = saved_mi; + return COPY_BLOCK; + } + + if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) { + // Restore everything to its original state + *mi = saved_mi; + return COPY_BLOCK; + } + if (motion_magnitude > (noise_motion_thresh(bs, increase_denoising) << 3)) { + // Restore everything to its original state + *mi = saved_mi; + return COPY_BLOCK; + } + + // We will restore these after motion compensation. + for (i = 0; i < MAX_MB_PLANE; ++i) { + saved_pre[i] = filter_mbd->plane[i].pre[0]; + saved_dst[i] = filter_mbd->plane[i].dst; + } + + // Set the pointers in the MACROBLOCKD to point to the buffers in the denoiser + // struct. + set_ref_ptrs(cm, filter_mbd, saved_frame, NONE); + av1_setup_pre_planes(filter_mbd, 0, &(denoiser->running_avg_y[frame]), mi_row, + mi_col, filter_mbd->block_ref_scale_factors[0], 1); + av1_setup_dst_planes(filter_mbd->plane, bs, + &(denoiser->mc_running_avg_y[denoise_layer_idx]), mi_row, + mi_col, 0, 1); + + av1_enc_build_inter_predictor_y(filter_mbd, mi_row, mi_col); + + // Restore everything to its original state + *mi = saved_mi; + for (i = 0; i < MAX_MB_PLANE; ++i) { + filter_mbd->plane[i].pre[0] = saved_pre[i]; + filter_mbd->plane[i].dst = saved_dst[i]; + } + + return FILTER_BLOCK; +} + +void av1_denoiser_denoise(AV1_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, + BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx, + AV1_DENOISER_DECISION *denoiser_decision, + int use_gf_temporal_ref) { + int mv_col, mv_row; + int motion_magnitude = 0; + int zeromv_filter = 0; + AV1_DENOISER *denoiser = &cpi->denoiser; + AV1_DENOISER_DECISION decision = COPY_BLOCK; + + const int shift = + cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2 + ? denoiser->num_ref_frames + : 0; + YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME + shift]; + const int denoise_layer_index = + cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id - 1; + YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y[denoise_layer_index]; + uint8_t *avg_start = block_start(avg.y_buffer, avg.y_stride, mi_row, mi_col); + + uint8_t *mc_avg_start = + block_start(mc_avg.y_buffer, mc_avg.y_stride, mi_row, mi_col); + struct buf_2d src = mb->plane[0].src; + int increase_denoising = 0; + int last_is_reference = cpi->ref_frame_flags & AOM_LAST_FLAG; + mv_col = ctx->best_sse_mv.as_mv.col; + mv_row = ctx->best_sse_mv.as_mv.row; + motion_magnitude = mv_row * mv_row + mv_col * mv_col; + + if (denoiser->denoising_level == kDenHigh) increase_denoising = 1; + + // Copy block if LAST_FRAME is not a reference. + // Last doesn't always exist when SVC layers are dynamically changed, e.g. top + // spatial layer doesn't have last reference when it's brought up for the + // first time on the fly. 
+ if (last_is_reference && denoiser->denoising_level >= kDenLow && + !ctx->sb_skip_denoising) + decision = perform_motion_compensation( + &cpi->common, denoiser, mb, bs, increase_denoising, mi_row, mi_col, ctx, + motion_magnitude, &zeromv_filter, cpi->svc.number_spatial_layers, + cpi->source->y_width, cpi->ppi->rtc_ref.ref_idx[0], + cpi->ppi->rtc_ref.ref_idx[3], cpi->ppi->use_svc, + cpi->svc.spatial_layer_id, use_gf_temporal_ref); + + if (decision == FILTER_BLOCK) { + decision = av1_denoiser_filter(src.buf, src.stride, mc_avg_start, + mc_avg.y_stride, avg_start, avg.y_stride, + increase_denoising, bs, motion_magnitude); + } + + if (decision == FILTER_BLOCK) { + aom_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride, + block_size_wide[bs], block_size_high[bs]); + } else { // COPY_BLOCK + aom_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride, + block_size_wide[bs], block_size_high[bs]); + } + *denoiser_decision = decision; + if (decision == FILTER_BLOCK && zeromv_filter == 1) + *denoiser_decision = FILTER_ZEROMV_BLOCK; +} + +static void copy_frame(YV12_BUFFER_CONFIG *const dest, + const YV12_BUFFER_CONFIG *const src) { + int r; + const uint8_t *srcbuf = src->y_buffer; + uint8_t *destbuf = dest->y_buffer; + + assert(dest->y_width == src->y_width); + assert(dest->y_height == src->y_height); + + for (r = 0; r < dest->y_height; ++r) { + memcpy(destbuf, srcbuf, dest->y_width); + destbuf += dest->y_stride; + srcbuf += src->y_stride; + } +} + +static void swap_frame_buffer(YV12_BUFFER_CONFIG *const dest, + YV12_BUFFER_CONFIG *const src) { + uint8_t *tmp_buf = dest->y_buffer; + assert(dest->y_width == src->y_width); + assert(dest->y_height == src->y_height); + dest->y_buffer = src->y_buffer; + src->y_buffer = tmp_buf; +} + +void av1_denoiser_update_frame_info( + AV1_DENOISER *denoiser, YV12_BUFFER_CONFIG src, struct RTC_REF *rtc_ref, + struct SVC *svc, FRAME_TYPE frame_type, int refresh_alt_ref_frame, + int refresh_golden_frame, int refresh_last_frame, int alt_fb_idx, + int gld_fb_idx, int lst_fb_idx, int resized, + int svc_refresh_denoiser_buffers, int second_spatial_layer) { + const int shift = second_spatial_layer ? denoiser->num_ref_frames : 0; + // Copy source into denoised reference buffers on KEY_FRAME or + // if the just encoded frame was resized. For SVC, copy source if the base + // spatial layer was key frame. + if (frame_type == KEY_FRAME || resized != 0 || denoiser->reset || + svc_refresh_denoiser_buffers) { + int i; + // Start at 1 so as not to overwrite the INTRA_FRAME + for (i = 1; i < denoiser->num_ref_frames; ++i) { + if (denoiser->running_avg_y[i + shift].buffer_alloc != NULL) + copy_frame(&denoiser->running_avg_y[i + shift], &src); + } + denoiser->reset = 0; + return; + } + + if (rtc_ref->set_ref_frame_config) { + int i; + for (i = 0; i < REF_FRAMES; i++) { + if (rtc_ref->refresh[svc->spatial_layer_id] & (1 << i)) + copy_frame(&denoiser->running_avg_y[i + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } + } else { + // If more than one refresh occurs, must copy frame buffer. 
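+ // (A single refresh is handled with a cheap buffer swap below; with + // multiple refreshes the same denoised output must land in several slots, + // so full-frame copies are required.)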
+ if ((refresh_alt_ref_frame + refresh_golden_frame + refresh_last_frame) > + 1) { + if (refresh_alt_ref_frame) { + copy_frame(&denoiser->running_avg_y[alt_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } + if (refresh_golden_frame) { + copy_frame(&denoiser->running_avg_y[gld_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } + if (refresh_last_frame) { + copy_frame(&denoiser->running_avg_y[lst_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } + } else { + if (refresh_alt_ref_frame) { + swap_frame_buffer(&denoiser->running_avg_y[alt_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } + if (refresh_golden_frame) { + swap_frame_buffer(&denoiser->running_avg_y[gld_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } + if (refresh_last_frame) { + swap_frame_buffer(&denoiser->running_avg_y[lst_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } + } + } +} + +void av1_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx) { + ctx->zeromv_sse = INT64_MAX; + ctx->newmv_sse = INT64_MAX; + ctx->zeromv_lastref_sse = INT64_MAX; + ctx->best_sse_mv.as_int = 0; +} + +void av1_denoiser_update_frame_stats(MB_MODE_INFO *mi, int64_t sse, + PREDICTION_MODE mode, + PICK_MODE_CONTEXT *ctx) { + if (mi->mv[0].as_int == 0 && sse < ctx->zeromv_sse) { + ctx->zeromv_sse = sse; + ctx->best_zeromv_reference_frame = mi->ref_frame[0]; + if (mi->ref_frame[0] == LAST_FRAME) ctx->zeromv_lastref_sse = sse; + } + + if (mi->mv[0].as_int != 0 && sse < ctx->newmv_sse) { + ctx->newmv_sse = sse; + ctx->best_sse_inter_mode = mode; + ctx->best_sse_mv = mi->mv[0]; + ctx->best_reference_frame = mi->ref_frame[0]; + } +} + +static int av1_denoiser_realloc_svc_helper(AV1_COMMON *cm, + AV1_DENOISER *denoiser, int fb_idx) { + int fail = 0; + if (denoiser->running_avg_y[fb_idx].buffer_alloc == NULL) { + fail = aom_alloc_frame_buffer( + &denoiser->running_avg_y[fb_idx], cm->width, cm->height, + cm->seq_params->subsampling_x, cm->seq_params->subsampling_y, + cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, + cm->features.byte_alignment, 0, 0); + if (fail) { + av1_denoiser_free(denoiser); + return 1; + } + } + return 0; +} + +int av1_denoiser_realloc_svc(AV1_COMMON *cm, AV1_DENOISER *denoiser, + struct RTC_REF *rtc_ref, struct SVC *svc, + int svc_buf_shift, int refresh_alt, + int refresh_gld, int refresh_lst, int alt_fb_idx, + int gld_fb_idx, int lst_fb_idx) { + int fail = 0; + if (rtc_ref->set_ref_frame_config) { + int i; + for (i = 0; i < REF_FRAMES; i++) { + if (cm->current_frame.frame_type == KEY_FRAME || + rtc_ref->refresh[svc->spatial_layer_id] & (1 << i)) { + fail = av1_denoiser_realloc_svc_helper(cm, denoiser, + i + 1 + svc_buf_shift); + } + } + } else { + if (refresh_alt) { + // Increase the frame buffer index by 1 to map it to the buffer index in + // the denoiser. 
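+ // (Slot [0] of running_avg_y holds the current denoised frame, so the + // reference slots start at index 1.)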
+ fail = av1_denoiser_realloc_svc_helper(cm, denoiser, + alt_fb_idx + 1 + svc_buf_shift); + if (fail) return 1; + } + if (refresh_gld) { + fail = av1_denoiser_realloc_svc_helper(cm, denoiser, + gld_fb_idx + 1 + svc_buf_shift); + if (fail) return 1; + } + if (refresh_lst) { + fail = av1_denoiser_realloc_svc_helper(cm, denoiser, + lst_fb_idx + 1 + svc_buf_shift); + if (fail) return 1; + } + } + return 0; +} + +int av1_denoiser_alloc(AV1_COMMON *cm, struct SVC *svc, AV1_DENOISER *denoiser, + int use_svc, int noise_sen, int width, int height, + int ssx, int ssy, int use_highbitdepth, int border) { + int i, layer, fail, init_num_ref_frames; + const int legacy_byte_alignment = 0; + int num_layers = 1; + int scaled_width = width; + int scaled_height = height; + if (use_svc) { + LAYER_CONTEXT *lc = &svc->layer_context[svc->spatial_layer_id * + svc->number_temporal_layers + + svc->temporal_layer_id]; + av1_get_layer_resolution(width, height, lc->scaling_factor_num, + lc->scaling_factor_den, &scaled_width, + &scaled_height); + // For SVC: only denoise at most 2 spatial (highest) layers. + if (noise_sen >= 2) + // Denoise from one spatial layer below the top. + svc->first_layer_denoise = AOMMAX(svc->number_spatial_layers - 2, 0); + else + // Only denoise the top spatial layer. + svc->first_layer_denoise = AOMMAX(svc->number_spatial_layers - 1, 0); + num_layers = svc->number_spatial_layers - svc->first_layer_denoise; + } + assert(denoiser != NULL); + denoiser->num_ref_frames = use_svc ? SVC_REF_FRAMES : NONSVC_REF_FRAMES; + init_num_ref_frames = use_svc ? REF_FRAMES : NONSVC_REF_FRAMES; + denoiser->num_layers = num_layers; + CHECK_MEM_ERROR(cm, denoiser->running_avg_y, + aom_calloc(denoiser->num_ref_frames * num_layers, + sizeof(denoiser->running_avg_y[0]))); + CHECK_MEM_ERROR( + cm, denoiser->mc_running_avg_y, + aom_calloc(num_layers, sizeof(denoiser->mc_running_avg_y[0]))); + + for (layer = 0; layer < num_layers; ++layer) { + const int denoise_width = (layer == 0) ? width : scaled_width; + const int denoise_height = (layer == 0) ? height : scaled_height; + for (i = 0; i < init_num_ref_frames; ++i) { + fail = aom_alloc_frame_buffer( + &denoiser->running_avg_y[i + denoiser->num_ref_frames * layer], + denoise_width, denoise_height, ssx, ssy, use_highbitdepth, border, + legacy_byte_alignment, 0, 0); + if (fail) { + av1_denoiser_free(denoiser); + return 1; + } +#ifdef OUTPUT_YUV_DENOISED + make_grayscale(&denoiser->running_avg_y[i]); +#endif + } + + fail = aom_alloc_frame_buffer( + &denoiser->mc_running_avg_y[layer], denoise_width, denoise_height, ssx, + ssy, use_highbitdepth, border, legacy_byte_alignment, 0, 0); + if (fail) { + av1_denoiser_free(denoiser); + return 1; + } + } + + // denoiser->last_source only used for noise_estimation, so only for top + // layer. 
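+ // (It is therefore allocated once at the full, unscaled width and height.)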
+ fail = aom_alloc_frame_buffer(&denoiser->last_source, width, height, ssx, ssy, + use_highbitdepth, border, legacy_byte_alignment, + 0, 0); + if (fail) { + av1_denoiser_free(denoiser); + return 1; + } +#ifdef OUTPUT_YUV_DENOISED + make_grayscale(&denoiser->running_avg_y[i]); +#endif + denoiser->frame_buffer_initialized = 1; + denoiser->denoising_level = kDenMedium; + denoiser->prev_denoising_level = kDenMedium; + denoiser->reset = 0; + denoiser->current_denoiser_frame = 0; + return 0; +} + +void av1_denoiser_free(AV1_DENOISER *denoiser) { + int i; + if (denoiser == NULL) { + return; + } + denoiser->frame_buffer_initialized = 0; + for (i = 0; i < denoiser->num_ref_frames * denoiser->num_layers; ++i) { + aom_free_frame_buffer(&denoiser->running_avg_y[i]); + } + aom_free(denoiser->running_avg_y); + denoiser->running_avg_y = NULL; + + for (i = 0; i < denoiser->num_layers; ++i) { + aom_free_frame_buffer(&denoiser->mc_running_avg_y[i]); + } + + aom_free(denoiser->mc_running_avg_y); + denoiser->mc_running_avg_y = NULL; + aom_free_frame_buffer(&denoiser->last_source); +} + +// TODO(kyslov) Enable when SVC temporal denoising is implemented +#if 0 +static void force_refresh_longterm_ref(AV1_COMP *const cpi) { + SVC *const svc = &cpi->svc; + // If long term reference is used, force refresh of that slot, so + // denoiser buffer for long term reference stays in sync. + if (svc->use_gf_temporal_ref_current_layer) { + int index = svc->spatial_layer_id; + if (svc->number_spatial_layers == 3) index = svc->spatial_layer_id - 1; + assert(index >= 0); + cpi->alt_fb_idx = svc->buffer_gf_temporal_ref[index].idx; + cpi->refresh_alt_ref_frame = 1; + } +} +#endif + +void av1_denoiser_set_noise_level(AV1_COMP *const cpi, int noise_level) { + AV1_DENOISER *const denoiser = &cpi->denoiser; + denoiser->denoising_level = noise_level; + if (denoiser->denoising_level > kDenLowLow && + denoiser->prev_denoising_level == kDenLowLow) { + denoiser->reset = 1; +// TODO(kyslov) Enable when SVC temporal denoising is implemented +#if 0 + force_refresh_longterm_ref(cpi); +#endif + } else { + denoiser->reset = 0; + } + denoiser->prev_denoising_level = denoiser->denoising_level; +} + +// Scale/increase the partition threshold +// for denoiser speed-up. +int64_t av1_scale_part_thresh(int64_t threshold, AV1_DENOISER_LEVEL noise_level, + CONTENT_STATE_SB content_state, + int temporal_layer_id) { + if ((content_state.source_sad_nonrd <= kLowSad && + content_state.low_sumdiff) || + (content_state.source_sad_nonrd == kHighSad && + content_state.low_sumdiff) || + (content_state.lighting_change && !content_state.low_sumdiff) || + (noise_level == kDenHigh) || (temporal_layer_id != 0)) { + int64_t scaled_thr = + (temporal_layer_id < 2) ? (3 * threshold) >> 1 : (7 * threshold) >> 2; + return scaled_thr; + } else { + return (5 * threshold) >> 2; + } +} + +// Scale/increase the ac skip threshold for +// denoiser speed-up. +int64_t av1_scale_acskip_thresh(int64_t threshold, + AV1_DENOISER_LEVEL noise_level, int abs_sumdiff, + int temporal_layer_id) { + if (noise_level >= kDenLow && abs_sumdiff < 5) + threshold *= (noise_level == kDenLow) ? 2 + : (temporal_layer_id == 2) ?
10 + : 6; + return threshold; +} + +void av1_denoiser_reset_on_first_frame(AV1_COMP *const cpi) { + if (/*av1_denoise_svc_non_key(cpi) &&*/ + cpi->denoiser.current_denoiser_frame == 0) { + cpi->denoiser.reset = 1; +// TODO(kyslov) Enable when SVC temporal denoising is implemented +#if 0 + force_refresh_longterm_ref(cpi); +#endif + } +} + +void av1_denoiser_update_ref_frame(AV1_COMP *const cpi) { + AV1_COMMON *const cm = &cpi->common; + RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref; + SVC *const svc = &cpi->svc; + + if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && + cpi->denoiser.denoising_level > kDenLowLow) { + int svc_refresh_denoiser_buffers = 0; + int denoise_svc_second_layer = 0; + FRAME_TYPE frame_type = cm->current_frame.frame_type == INTRA_ONLY_FRAME + ? KEY_FRAME + : cm->current_frame.frame_type; + cpi->denoiser.current_denoiser_frame++; + const int resize_pending = is_frame_resize_pending(cpi); + + if (cpi->ppi->use_svc) { +// TODO(kyslov) Enable when SVC temporal denoising is implemented +#if 0 + const int svc_buf_shift = + svc->number_spatial_layers - svc->spatial_layer_id == 2 + ? cpi->denoiser.num_ref_frames + : 0; + int layer = + LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id, + svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + svc_refresh_denoiser_buffers = + lc->is_key_frame || svc->spatial_layer_sync[svc->spatial_layer_id]; + denoise_svc_second_layer = + svc->number_spatial_layers - svc->spatial_layer_id == 2 ? 1 : 0; + // Check if we need to allocate extra buffers in the denoiser + // for refreshed frames. + if (av1_denoiser_realloc_svc(cm, &cpi->denoiser, rtc_ref, + svc, svc_buf_shift, + cpi->refresh_alt_ref_frame, + cpi->refresh_golden_frame, + cpi->refresh_last_frame, cpi->alt_fb_idx, + cpi->gld_fb_idx, cpi->lst_fb_idx)) + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to re-allocate denoiser for SVC"); +#endif + } + av1_denoiser_update_frame_info( + &cpi->denoiser, *cpi->source, rtc_ref, svc, frame_type, + cpi->refresh_frame.alt_ref_frame, cpi->refresh_frame.golden_frame, 1, + rtc_ref->ref_idx[6], rtc_ref->ref_idx[3], rtc_ref->ref_idx[0], + resize_pending, svc_refresh_denoiser_buffers, denoise_svc_second_layer); + } +} + +#ifdef OUTPUT_YUV_DENOISED +static void make_grayscale(YV12_BUFFER_CONFIG *yuv) { + int r, c; + uint8_t *u = yuv->u_buffer; + uint8_t *v = yuv->v_buffer; + + for (r = 0; r < yuv->uv_height; ++r) { + for (c = 0; c < yuv->uv_width; ++c) { + u[c] = UINT8_MAX / 2; + v[c] = UINT8_MAX / 2; + } + u += yuv->uv_stride; + v += yuv->uv_stride; + } +} + +void aom_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s) { + unsigned char *src = s->y_buffer; + int h = s->y_crop_height; + + do { + fwrite(src, s->y_width, 1, yuv_file); + src += s->y_stride; + } while (--h); + + src = s->u_buffer; + h = s->uv_crop_height; + + do { + fwrite(src, s->uv_width, 1, yuv_file); + src += s->uv_stride; + } while (--h); + + src = s->v_buffer; + h = s->uv_crop_height; + + do { + fwrite(src, s->uv_width, 1, yuv_file); + src += s->uv_stride; + } while (--h); +} +#endif diff --git a/third_party/aom/av1/encoder/av1_temporal_denoiser.h b/third_party/aom/av1/encoder/av1_temporal_denoiser.h new file mode 100644 index 0000000000..14dcccce69 --- /dev/null +++ b/third_party/aom/av1/encoder/av1_temporal_denoiser.h @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2020, Alliance for Open Media.
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_AV1_TEMPORAL_DENOISER_H_ +#define AOM_AV1_ENCODER_AV1_TEMPORAL_DENOISER_H_ + +#include "av1/encoder/block.h" +#include "aom_scale/yv12config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MOTION_MAGNITUDE_THRESHOLD (8 * 3) + +// The denoiser is used in non-SVC real-time mode, which does not use alt-ref, +// so there is no need to allocate a buffer for it; hence REF_FRAMES - 1. +#define NONSVC_REF_FRAMES (REF_FRAMES - 1) + +// Number of frame buffers when SVC is used: [0] for the current denoised +// buffer and [1..8] for REF_FRAMES. +#define SVC_REF_FRAMES 9 + +typedef enum av1_denoiser_decision { + COPY_BLOCK, + FILTER_BLOCK, + FILTER_ZEROMV_BLOCK +} AV1_DENOISER_DECISION; + +typedef enum av1_denoiser_level { + kDenLowLow, + kDenLow, + kDenMedium, + kDenHigh +} AV1_DENOISER_LEVEL; + +typedef struct av1_denoiser { + YV12_BUFFER_CONFIG *running_avg_y; + YV12_BUFFER_CONFIG *mc_running_avg_y; + YV12_BUFFER_CONFIG last_source; + int frame_buffer_initialized; + int reset; + int num_ref_frames; + int num_layers; + unsigned int current_denoiser_frame; + AV1_DENOISER_LEVEL denoising_level; + AV1_DENOISER_LEVEL prev_denoising_level; +} AV1_DENOISER; + +typedef struct { + int64_t zero_last_cost_orig; + unsigned int *ref_frame_cost; + int_mv (*frame_mv)[REF_FRAMES]; + int reuse_inter_pred; + TX_SIZE best_tx_size; + PREDICTION_MODE best_mode; + MV_REFERENCE_FRAME best_ref_frame; + int_interpfilters best_pred_filter; + uint8_t best_mode_skip_txfm; +} AV1_PICKMODE_CTX_DEN; + +struct AV1_COMP; +struct SVC; +struct RTC_REF; + +void av1_denoiser_update_frame_info( + AV1_DENOISER *denoiser, YV12_BUFFER_CONFIG src, struct RTC_REF *rtc_ref, + struct SVC *svc, FRAME_TYPE frame_type, int refresh_alt_ref_frame, + int refresh_golden_frame, int refresh_last_frame, int alt_fb_idx, + int gld_fb_idx, int lst_fb_idx, int resized, + int svc_refresh_denoiser_buffers, int second_spatial_layer); + +void av1_denoiser_denoise(struct AV1_COMP *cpi, MACROBLOCK *mb, int mi_row, + int mi_col, BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx, + AV1_DENOISER_DECISION *denoiser_decision, + int use_gf_temporal_ref); + +void av1_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx); + +void av1_denoiser_update_frame_stats(MB_MODE_INFO *mi, int64_t sse, + PREDICTION_MODE mode, + PICK_MODE_CONTEXT *ctx); + +int av1_denoiser_realloc_svc(AV1_COMMON *cm, AV1_DENOISER *denoiser, + struct RTC_REF *rtc, struct SVC *svc, + int svc_buf_shift, int refresh_alt, + int refresh_gld, int refresh_lst, int alt_fb_idx, + int gld_fb_idx, int lst_fb_idx); + +int av1_denoiser_alloc(AV1_COMMON *cm, struct SVC *svc, AV1_DENOISER *denoiser, + int use_svc, int noise_sen, int width, int height, + int ssx, int ssy, int use_highbitdepth, int border); + +#if CONFIG_AV1_TEMPORAL_DENOISING +// This function is used by both c and sse2 denoiser implementations. +// Define it as a static function within the scope where av1_denoiser.h +// is referenced.
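+// A small illustrative check (not part of the upstream header; assumes
+// <assert.h>) of the arithmetic in the helper below: for a 16x16 block,
+// num_pels_log2_lookup[BLOCK_16X16] is 8, so the strong filter may adjust a
+// block by at most 256 * 2 = 512 in total, or 256 * 3 = 768 when
+// increase_denoising is set.
+#if 0
+static void example_total_adj_strong_thresh(void) {
+  assert(total_adj_strong_thresh(BLOCK_16X16, /*increase_denoising=*/0) == 512);
+  assert(total_adj_strong_thresh(BLOCK_16X16, /*increase_denoising=*/1) == 768);
+}
+#endif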
+static INLINE int total_adj_strong_thresh(BLOCK_SIZE bs, + int increase_denoising) { + return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 3 : 2); +} +#endif + +void av1_denoiser_free(AV1_DENOISER *denoiser); + +void av1_denoiser_set_noise_level(struct AV1_COMP *const cpi, int noise_level); + +void av1_denoiser_reset_on_first_frame(struct AV1_COMP *const cpi); + +int64_t av1_scale_part_thresh(int64_t threshold, AV1_DENOISER_LEVEL noise_level, + CONTENT_STATE_SB content_state, + int temporal_layer_id); + +int64_t av1_scale_acskip_thresh(int64_t threshold, + AV1_DENOISER_LEVEL noise_level, int abs_sumdiff, + int temporal_layer_id); + +void av1_denoiser_update_ref_frame(struct AV1_COMP *const cpi); + +void aom_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_AV1_TEMPORAL_DENOISER_H_ diff --git a/third_party/aom/av1/encoder/bitstream.c b/third_party/aom/av1/encoder/bitstream.c new file mode 100644 index 0000000000..219784fedf --- /dev/null +++ b/third_party/aom/av1/encoder/bitstream.c @@ -0,0 +1,4248 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <limits.h> +#include <stdio.h> + +#include "aom/aom_encoder.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/binary_codes_writer.h" +#include "aom_dsp/bitwriter_buffer.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/bitops.h" +#include "aom_ports/mem_ops.h" +#if CONFIG_BITSTREAM_DEBUG +#include "aom_util/debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG + +#include "av1/common/cdef.h" +#include "av1/common/cfl.h" +#include "av1/common/entropy.h" +#include "av1/common/entropymode.h" +#include "av1/common/entropymv.h" +#include "av1/common/mvref_common.h" +#include "av1/common/pred_common.h" +#include "av1/common/reconinter.h" +#include "av1/common/reconintra.h" +#include "av1/common/seg_common.h" +#include "av1/common/tile_common.h" + +#include "av1/encoder/bitstream.h" +#include "av1/encoder/cost.h" +#include "av1/encoder/encodemv.h" +#include "av1/encoder/encodetxb.h" +#include "av1/encoder/ethread.h" +#include "av1/encoder/mcomp.h" +#include "av1/encoder/palette.h" +#include "av1/encoder/pickrst.h" +#include "av1/encoder/segmentation.h" +#include "av1/encoder/tokenize.h" + +#define ENC_MISMATCH_DEBUG 0 +#define SETUP_TIME_OH_CONST 5 // Setup time overhead constant per worker +#define JOB_DISP_TIME_OH_CONST 1 // Job dispatch time overhead per tile + +static INLINE void write_uniform(aom_writer *w, int n, int v) { + const int l = get_unsigned_bits(n); + const int m = (1 << l) - n; + if (l == 0) return; + if (v < m) { + aom_write_literal(w, v, l - 1); + } else { + aom_write_literal(w, m + ((v - m) >> 1), l - 1); + aom_write_literal(w, (v - m) & 1, 1); + } +} + +#if !CONFIG_REALTIME_ONLY +static AOM_INLINE void loop_restoration_write_sb_coeffs( + const AV1_COMMON *const cm, MACROBLOCKD *xd, int runit_idx, + aom_writer *const w, int plane, FRAME_COUNTS *counts); +#endif + +static AOM_INLINE void write_intra_y_mode_kf(FRAME_CONTEXT *frame_ctx, + const
MB_MODE_INFO *mi, + const MB_MODE_INFO *above_mi, + const MB_MODE_INFO *left_mi, + PREDICTION_MODE mode, + aom_writer *w) { + assert(!is_intrabc_block(mi)); + (void)mi; + aom_write_symbol(w, mode, get_y_mode_cdf(frame_ctx, above_mi, left_mi), + INTRA_MODES); +} + +static AOM_INLINE void write_inter_mode(aom_writer *w, PREDICTION_MODE mode, + FRAME_CONTEXT *ec_ctx, + const int16_t mode_ctx) { + const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK; + + aom_write_symbol(w, mode != NEWMV, ec_ctx->newmv_cdf[newmv_ctx], 2); + + if (mode != NEWMV) { + const int16_t zeromv_ctx = + (mode_ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; + aom_write_symbol(w, mode != GLOBALMV, ec_ctx->zeromv_cdf[zeromv_ctx], 2); + + if (mode != GLOBALMV) { + int16_t refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK; + aom_write_symbol(w, mode != NEARESTMV, ec_ctx->refmv_cdf[refmv_ctx], 2); + } + } +} + +static AOM_INLINE void write_drl_idx( + FRAME_CONTEXT *ec_ctx, const MB_MODE_INFO *mbmi, + const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame, aom_writer *w) { + assert(mbmi->ref_mv_idx < 3); + + const int new_mv = mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV; + if (new_mv) { + int idx; + for (idx = 0; idx < 2; ++idx) { + if (mbmi_ext_frame->ref_mv_count > idx + 1) { + uint8_t drl_ctx = av1_drl_ctx(mbmi_ext_frame->weight, idx); + + aom_write_symbol(w, mbmi->ref_mv_idx != idx, ec_ctx->drl_cdf[drl_ctx], + 2); + if (mbmi->ref_mv_idx == idx) return; + } + } + return; + } + + if (have_nearmv_in_inter_mode(mbmi->mode)) { + int idx; + // TODO(jingning): Temporary solution to compensate the NEARESTMV offset. + for (idx = 1; idx < 3; ++idx) { + if (mbmi_ext_frame->ref_mv_count > idx + 1) { + uint8_t drl_ctx = av1_drl_ctx(mbmi_ext_frame->weight, idx); + aom_write_symbol(w, mbmi->ref_mv_idx != (idx - 1), + ec_ctx->drl_cdf[drl_ctx], 2); + if (mbmi->ref_mv_idx == (idx - 1)) return; + } + } + return; + } +} + +static AOM_INLINE void write_inter_compound_mode(MACROBLOCKD *xd, aom_writer *w, + PREDICTION_MODE mode, + const int16_t mode_ctx) { + assert(is_inter_compound_mode(mode)); + aom_write_symbol(w, INTER_COMPOUND_OFFSET(mode), + xd->tile_ctx->inter_compound_mode_cdf[mode_ctx], + INTER_COMPOUND_MODES); +} + +static AOM_INLINE void write_tx_size_vartx(MACROBLOCKD *xd, + const MB_MODE_INFO *mbmi, + TX_SIZE tx_size, int depth, + int blk_row, int blk_col, + aom_writer *w) { + FRAME_CONTEXT *const ec_ctx = xd->tile_ctx; + const int max_blocks_high = max_block_high(xd, mbmi->bsize, 0); + const int max_blocks_wide = max_block_wide(xd, mbmi->bsize, 0); + + if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; + + if (depth == MAX_VARTX_DEPTH) { + txfm_partition_update(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, tx_size, tx_size); + return; + } + + const int ctx = txfm_partition_context(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, + mbmi->bsize, tx_size); + const int txb_size_index = + av1_get_txb_size_index(mbmi->bsize, blk_row, blk_col); + const int write_txfm_partition = + tx_size == mbmi->inter_tx_size[txb_size_index]; + if (write_txfm_partition) { + aom_write_symbol(w, 0, ec_ctx->txfm_partition_cdf[ctx], 2); + + txfm_partition_update(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, tx_size, tx_size); + // TODO(yuec): set correct txfm partition update for qttx + } else { + const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; + + aom_write_symbol(w, 1, 
ec_ctx->txfm_partition_cdf[ctx], 2); + + if (sub_txs == TX_4X4) { + txfm_partition_update(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, sub_txs, tx_size); + return; + } + + assert(bsw > 0 && bsh > 0); + for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) { + const int offsetr = blk_row + row; + for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { + const int offsetc = blk_col + col; + write_tx_size_vartx(xd, mbmi, sub_txs, depth + 1, offsetr, offsetc, w); + } + } + } +} + +static AOM_INLINE void write_selected_tx_size(const MACROBLOCKD *xd, + aom_writer *w) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const BLOCK_SIZE bsize = mbmi->bsize; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + if (block_signals_txsize(bsize)) { + const TX_SIZE tx_size = mbmi->tx_size; + const int tx_size_ctx = get_tx_size_context(xd); + const int depth = tx_size_to_depth(tx_size, bsize); + const int max_depths = bsize_to_max_depth(bsize); + const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize); + + assert(depth >= 0 && depth <= max_depths); + assert(!is_inter_block(mbmi)); + assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed(xd, mbmi))); + + aom_write_symbol(w, depth, ec_ctx->tx_size_cdf[tx_size_cat][tx_size_ctx], + max_depths + 1); + } +} + +static int write_skip(const AV1_COMMON *cm, const MACROBLOCKD *xd, + uint8_t segment_id, const MB_MODE_INFO *mi, + aom_writer *w) { + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { + return 1; + } else { + const int skip_txfm = mi->skip_txfm; + const int ctx = av1_get_skip_txfm_context(xd); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + aom_write_symbol(w, skip_txfm, ec_ctx->skip_txfm_cdfs[ctx], 2); + return skip_txfm; + } +} + +static int write_skip_mode(const AV1_COMMON *cm, const MACROBLOCKD *xd, + uint8_t segment_id, const MB_MODE_INFO *mi, + aom_writer *w) { + if (!cm->current_frame.skip_mode_info.skip_mode_flag) return 0; + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { + return 0; + } + const int skip_mode = mi->skip_mode; + if (!is_comp_ref_allowed(mi->bsize)) { + assert(!skip_mode); + return 0; + } + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME) || + segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) { + // These features imply single-reference mode, while skip mode implies + // compound reference. Hence, the two are mutually exclusive. + // In other words, skip_mode is implicitly 0 here. + assert(!skip_mode); + return 0; + } + const int ctx = av1_get_skip_mode_context(xd); + aom_write_symbol(w, skip_mode, xd->tile_ctx->skip_mode_cdfs[ctx], 2); + return skip_mode; +} + +static AOM_INLINE void write_is_inter(const AV1_COMMON *cm, + const MACROBLOCKD *xd, uint8_t segment_id, + aom_writer *w, const int is_inter) { + if (!segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) { + assert(is_inter); + return; + } + const int ctx = av1_get_intra_inter_context(xd); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + aom_write_symbol(w, is_inter, ec_ctx->intra_inter_cdf[ctx], 2); + } +} + +static AOM_INLINE void write_motion_mode(const AV1_COMMON *cm, MACROBLOCKD *xd, + const MB_MODE_INFO *mbmi, + aom_writer *w) { + MOTION_MODE last_motion_mode_allowed = + cm->features.switchable_motion_mode + ? 
motion_mode_allowed(cm->global_motion, xd, mbmi, + cm->features.allow_warped_motion) + : SIMPLE_TRANSLATION; + assert(mbmi->motion_mode <= last_motion_mode_allowed); + switch (last_motion_mode_allowed) { + case SIMPLE_TRANSLATION: break; + case OBMC_CAUSAL: + aom_write_symbol(w, mbmi->motion_mode == OBMC_CAUSAL, + xd->tile_ctx->obmc_cdf[mbmi->bsize], 2); + break; + default: + aom_write_symbol(w, mbmi->motion_mode, + xd->tile_ctx->motion_mode_cdf[mbmi->bsize], + MOTION_MODES); + } +} + +static AOM_INLINE void write_delta_qindex(const MACROBLOCKD *xd, + int delta_qindex, aom_writer *w) { + int sign = delta_qindex < 0; + int abs = sign ? -delta_qindex : delta_qindex; + int rem_bits, thr; + int smallval = abs < DELTA_Q_SMALL ? 1 : 0; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + + aom_write_symbol(w, AOMMIN(abs, DELTA_Q_SMALL), ec_ctx->delta_q_cdf, + DELTA_Q_PROBS + 1); + + if (!smallval) { + rem_bits = get_msb(abs - 1); + thr = (1 << rem_bits) + 1; + aom_write_literal(w, rem_bits - 1, 3); + aom_write_literal(w, abs - thr, rem_bits); + } + if (abs > 0) { + aom_write_bit(w, sign); + } +} + +static AOM_INLINE void write_delta_lflevel(const AV1_COMMON *cm, + const MACROBLOCKD *xd, int lf_id, + int delta_lflevel, + int delta_lf_multi, aom_writer *w) { + int sign = delta_lflevel < 0; + int abs = sign ? -delta_lflevel : delta_lflevel; + int rem_bits, thr; + int smallval = abs < DELTA_LF_SMALL ? 1 : 0; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + (void)cm; + + if (delta_lf_multi) { + assert(lf_id >= 0 && lf_id < (av1_num_planes(cm) > 1 ? FRAME_LF_COUNT + : FRAME_LF_COUNT - 2)); + aom_write_symbol(w, AOMMIN(abs, DELTA_LF_SMALL), + ec_ctx->delta_lf_multi_cdf[lf_id], DELTA_LF_PROBS + 1); + } else { + aom_write_symbol(w, AOMMIN(abs, DELTA_LF_SMALL), ec_ctx->delta_lf_cdf, + DELTA_LF_PROBS + 1); + } + + if (!smallval) { + rem_bits = get_msb(abs - 1); + thr = (1 << rem_bits) + 1; + aom_write_literal(w, rem_bits - 1, 3); + aom_write_literal(w, abs - thr, rem_bits); + } + if (abs > 0) { + aom_write_bit(w, sign); + } +} + +static AOM_INLINE void pack_map_tokens(aom_writer *w, const TokenExtra **tp, + int n, int num, MapCdf map_pb_cdf) { + const TokenExtra *p = *tp; + const int palette_size_idx = n - PALETTE_MIN_SIZE; + write_uniform(w, n, p->token); // The first color index. + ++p; + --num; + for (int i = 0; i < num; ++i) { + assert((p->color_ctx >= 0) && + (p->color_ctx < PALETTE_COLOR_INDEX_CONTEXTS)); + aom_cdf_prob *color_map_cdf = map_pb_cdf[palette_size_idx][p->color_ctx]; + aom_write_symbol(w, p->token, color_map_cdf, n); + ++p; + } + *tp = p; +} + +static AOM_INLINE void pack_txb_tokens( + aom_writer *w, AV1_COMMON *cm, MACROBLOCK *const x, const TokenExtra **tp, + const TokenExtra *const tok_end, MACROBLOCKD *xd, MB_MODE_INFO *mbmi, + int plane, BLOCK_SIZE plane_bsize, aom_bit_depth_t bit_depth, int block, + int blk_row, int blk_col, TX_SIZE tx_size, TOKEN_STATS *token_stats) { + const int max_blocks_high = max_block_high(xd, plane_bsize, plane); + const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); + + if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; + + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const TX_SIZE plane_tx_size = + plane ? 
av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x, + pd->subsampling_y) + : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row, + blk_col)]; + + if (tx_size == plane_tx_size || plane) { + av1_write_coeffs_txb(cm, x, w, blk_row, blk_col, plane, block, tx_size); +#if CONFIG_RD_DEBUG + TOKEN_STATS tmp_token_stats; + init_token_stats(&tmp_token_stats); + token_stats->cost += tmp_token_stats.cost; +#endif + } else { + const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; + const int step = bsh * bsw; + const int row_end = + AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row); + const int col_end = + AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col); + + assert(bsw > 0 && bsh > 0); + + for (int r = 0; r < row_end; r += bsh) { + const int offsetr = blk_row + r; + for (int c = 0; c < col_end; c += bsw) { + const int offsetc = blk_col + c; + pack_txb_tokens(w, cm, x, tp, tok_end, xd, mbmi, plane, plane_bsize, + bit_depth, block, offsetr, offsetc, sub_txs, + token_stats); + block += step; + } + } + } +} + +static INLINE void set_spatial_segment_id( + const CommonModeInfoParams *const mi_params, uint8_t *segment_ids, + BLOCK_SIZE bsize, int mi_row, int mi_col, uint8_t segment_id) { + const int mi_offset = mi_row * mi_params->mi_cols + mi_col; + const int bw = mi_size_wide[bsize]; + const int bh = mi_size_high[bsize]; + const int xmis = AOMMIN(mi_params->mi_cols - mi_col, bw); + const int ymis = AOMMIN(mi_params->mi_rows - mi_row, bh); + + const int mi_stride = mi_params->mi_cols; + + set_segment_id(segment_ids, mi_offset, xmis, ymis, mi_stride, segment_id); +} + +int av1_neg_interleave(int x, int ref, int max) { + assert(x < max); + const int diff = x - ref; + if (!ref) return x; + if (ref >= (max - 1)) return -x + max - 1; + if (2 * ref < max) { + if (abs(diff) <= ref) { + if (diff > 0) + return (diff << 1) - 1; + else + return ((-diff) << 1); + } + return x; + } else { + if (abs(diff) < (max - ref)) { + if (diff > 0) + return (diff << 1) - 1; + else + return ((-diff) << 1); + } + return (max - x) - 1; + } +} + +static AOM_INLINE void write_segment_id(AV1_COMP *cpi, MACROBLOCKD *const xd, + const MB_MODE_INFO *const mbmi, + aom_writer *w, + const struct segmentation *seg, + struct segmentation_probs *segp, + int skip_txfm) { + if (!seg->enabled || !seg->update_map) return; + + AV1_COMMON *const cm = &cpi->common; + int cdf_num; + const uint8_t pred = av1_get_spatial_seg_pred( + cm, xd, &cdf_num, cpi->cyclic_refresh->skip_over4x4); + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + if (skip_txfm) { + // Still need to transmit tx size for intra blocks even if skip_txfm is + // true. Changing segment_id may make the tx size become invalid, e.g + // changing from lossless to lossy. 
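+ // Substituting the predicted segment id below is therefore only safe when
+ // it cannot toggle an intra block between lossless and lossy, which is
+ // exactly what the assert directly below guarantees.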
+ assert(is_inter_block(mbmi) || !cpi->enc_seg.has_lossless_segment); + + set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map, mbmi->bsize, + mi_row, mi_col, pred); + set_spatial_segment_id(&cm->mi_params, cpi->enc_seg.map, mbmi->bsize, + mi_row, mi_col, pred); + /* mbmi is read only but we need to update segment_id */ + ((MB_MODE_INFO *)mbmi)->segment_id = pred; + return; + } + + const int coded_id = + av1_neg_interleave(mbmi->segment_id, pred, seg->last_active_segid + 1); + aom_cdf_prob *pred_cdf = segp->spatial_pred_seg_cdf[cdf_num]; + aom_write_symbol(w, coded_id, pred_cdf, MAX_SEGMENTS); + set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map, mbmi->bsize, + mi_row, mi_col, mbmi->segment_id); +} + +#define WRITE_REF_BIT(bname, pname) \ + aom_write_symbol(w, bname, av1_get_pred_cdf_##pname(xd), 2) + +// This function encodes the reference frame +static AOM_INLINE void write_ref_frames(const AV1_COMMON *cm, + const MACROBLOCKD *xd, aom_writer *w) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const int is_compound = has_second_ref(mbmi); + const uint8_t segment_id = mbmi->segment_id; + + // If segment level coding of this signal is disabled... + // or the segment allows multiple reference frame options + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { + assert(!is_compound); + assert(mbmi->ref_frame[0] == + get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME)); + } else if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP) || + segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) { + assert(!is_compound); + assert(mbmi->ref_frame[0] == LAST_FRAME); + } else { + // does the feature use compound prediction or not + // (if not specified at the frame/segment level) + if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) { + if (is_comp_ref_allowed(mbmi->bsize)) + aom_write_symbol(w, is_compound, av1_get_reference_mode_cdf(xd), 2); + } else { + assert((!is_compound) == + (cm->current_frame.reference_mode == SINGLE_REFERENCE)); + } + + if (is_compound) { + const COMP_REFERENCE_TYPE comp_ref_type = has_uni_comp_refs(mbmi) + ? 
UNIDIR_COMP_REFERENCE + : BIDIR_COMP_REFERENCE; + aom_write_symbol(w, comp_ref_type, av1_get_comp_reference_type_cdf(xd), + 2); + + if (comp_ref_type == UNIDIR_COMP_REFERENCE) { + const int bit = mbmi->ref_frame[0] == BWDREF_FRAME; + WRITE_REF_BIT(bit, uni_comp_ref_p); + + if (!bit) { + assert(mbmi->ref_frame[0] == LAST_FRAME); + const int bit1 = mbmi->ref_frame[1] == LAST3_FRAME || + mbmi->ref_frame[1] == GOLDEN_FRAME; + WRITE_REF_BIT(bit1, uni_comp_ref_p1); + if (bit1) { + const int bit2 = mbmi->ref_frame[1] == GOLDEN_FRAME; + WRITE_REF_BIT(bit2, uni_comp_ref_p2); + } + } else { + assert(mbmi->ref_frame[1] == ALTREF_FRAME); + } + + return; + } + + assert(comp_ref_type == BIDIR_COMP_REFERENCE); + + const int bit = (mbmi->ref_frame[0] == GOLDEN_FRAME || + mbmi->ref_frame[0] == LAST3_FRAME); + WRITE_REF_BIT(bit, comp_ref_p); + + if (!bit) { + const int bit1 = mbmi->ref_frame[0] == LAST2_FRAME; + WRITE_REF_BIT(bit1, comp_ref_p1); + } else { + const int bit2 = mbmi->ref_frame[0] == GOLDEN_FRAME; + WRITE_REF_BIT(bit2, comp_ref_p2); + } + + const int bit_bwd = mbmi->ref_frame[1] == ALTREF_FRAME; + WRITE_REF_BIT(bit_bwd, comp_bwdref_p); + + if (!bit_bwd) { + WRITE_REF_BIT(mbmi->ref_frame[1] == ALTREF2_FRAME, comp_bwdref_p1); + } + + } else { + const int bit0 = (mbmi->ref_frame[0] <= ALTREF_FRAME && + mbmi->ref_frame[0] >= BWDREF_FRAME); + WRITE_REF_BIT(bit0, single_ref_p1); + + if (bit0) { + const int bit1 = mbmi->ref_frame[0] == ALTREF_FRAME; + WRITE_REF_BIT(bit1, single_ref_p2); + + if (!bit1) { + WRITE_REF_BIT(mbmi->ref_frame[0] == ALTREF2_FRAME, single_ref_p6); + } + } else { + const int bit2 = (mbmi->ref_frame[0] == LAST3_FRAME || + mbmi->ref_frame[0] == GOLDEN_FRAME); + WRITE_REF_BIT(bit2, single_ref_p3); + + if (!bit2) { + const int bit3 = mbmi->ref_frame[0] != LAST_FRAME; + WRITE_REF_BIT(bit3, single_ref_p4); + } else { + const int bit4 = mbmi->ref_frame[0] != LAST3_FRAME; + WRITE_REF_BIT(bit4, single_ref_p5); + } + } + } + } +} + +static AOM_INLINE void write_filter_intra_mode_info( + const AV1_COMMON *cm, const MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi, + aom_writer *w) { + if (av1_filter_intra_allowed(cm, mbmi)) { + aom_write_symbol(w, mbmi->filter_intra_mode_info.use_filter_intra, + xd->tile_ctx->filter_intra_cdfs[mbmi->bsize], 2); + if (mbmi->filter_intra_mode_info.use_filter_intra) { + const FILTER_INTRA_MODE mode = + mbmi->filter_intra_mode_info.filter_intra_mode; + aom_write_symbol(w, mode, xd->tile_ctx->filter_intra_mode_cdf, + FILTER_INTRA_MODES); + } + } +} + +static AOM_INLINE void write_angle_delta(aom_writer *w, int angle_delta, + aom_cdf_prob *cdf) { + aom_write_symbol(w, angle_delta + MAX_ANGLE_DELTA, cdf, + 2 * MAX_ANGLE_DELTA + 1); +} + +static AOM_INLINE void write_mb_interp_filter(AV1_COMMON *const cm, + ThreadData *td, aom_writer *w) { + const MACROBLOCKD *xd = &td->mb.e_mbd; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + + if (!av1_is_interp_needed(xd)) { + int_interpfilters filters = av1_broadcast_interp_filter( + av1_unswitchable_filter(cm->features.interp_filter)); + assert(mbmi->interp_filters.as_int == filters.as_int); + (void)filters; + return; + } + if (cm->features.interp_filter == SWITCHABLE) { + int dir; + for (dir = 0; dir < 2; ++dir) { + const int ctx = av1_get_pred_context_switchable_interp(xd, dir); + InterpFilter filter = + av1_extract_interp_filter(mbmi->interp_filters, dir); + aom_write_symbol(w, filter, ec_ctx->switchable_interp_cdf[ctx], + SWITCHABLE_FILTERS); + ++td->interp_filter_selected[filter]; + 
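+ // Note: with dual filter disabled a single, shared filter is coded for
+ // both directions, so signaling stops after the first loop iteration.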
if (cm->seq_params->enable_dual_filter == 0) return; + } + } +} + +// Transmit color values with delta encoding. Write the first value as +// literal, and the deltas between each value and the previous one. "min_val" is +// the smallest possible value of the deltas. +static AOM_INLINE void delta_encode_palette_colors(const int *colors, int num, + int bit_depth, int min_val, + aom_writer *w) { + if (num <= 0) return; + assert(colors[0] < (1 << bit_depth)); + aom_write_literal(w, colors[0], bit_depth); + if (num == 1) return; + int max_delta = 0; + int deltas[PALETTE_MAX_SIZE]; + memset(deltas, 0, sizeof(deltas)); + for (int i = 1; i < num; ++i) { + assert(colors[i] < (1 << bit_depth)); + const int delta = colors[i] - colors[i - 1]; + deltas[i - 1] = delta; + assert(delta >= min_val); + if (delta > max_delta) max_delta = delta; + } + const int min_bits = bit_depth - 3; + int bits = AOMMAX(av1_ceil_log2(max_delta + 1 - min_val), min_bits); + assert(bits <= bit_depth); + int range = (1 << bit_depth) - colors[0] - min_val; + aom_write_literal(w, bits - min_bits, 2); + for (int i = 0; i < num - 1; ++i) { + aom_write_literal(w, deltas[i] - min_val, bits); + range -= deltas[i]; + bits = AOMMIN(bits, av1_ceil_log2(range)); + } +} + +// Transmit luma palette color values. First signal if each color in the color +// cache is used. Those colors that are not in the cache are transmitted with +// delta encoding. +static AOM_INLINE void write_palette_colors_y( + const MACROBLOCKD *const xd, const PALETTE_MODE_INFO *const pmi, + int bit_depth, aom_writer *w) { + const int n = pmi->palette_size[0]; + uint16_t color_cache[2 * PALETTE_MAX_SIZE]; + const int n_cache = av1_get_palette_cache(xd, 0, color_cache); + int out_cache_colors[PALETTE_MAX_SIZE]; + uint8_t cache_color_found[2 * PALETTE_MAX_SIZE]; + const int n_out_cache = + av1_index_color_cache(color_cache, n_cache, pmi->palette_colors, n, + cache_color_found, out_cache_colors); + int n_in_cache = 0; + for (int i = 0; i < n_cache && n_in_cache < n; ++i) { + const int found = cache_color_found[i]; + aom_write_bit(w, found); + n_in_cache += found; + } + assert(n_in_cache + n_out_cache == n); + delta_encode_palette_colors(out_cache_colors, n_out_cache, bit_depth, 1, w); +} + +// Write chroma palette color values. U channel is handled similarly to the luma +// channel. For v channel, either use delta encoding or transmit raw values +// directly, whichever costs less. +static AOM_INLINE void write_palette_colors_uv( + const MACROBLOCKD *const xd, const PALETTE_MODE_INFO *const pmi, + int bit_depth, aom_writer *w) { + const int n = pmi->palette_size[1]; + const uint16_t *colors_u = pmi->palette_colors + PALETTE_MAX_SIZE; + const uint16_t *colors_v = pmi->palette_colors + 2 * PALETTE_MAX_SIZE; + // U channel colors. + uint16_t color_cache[2 * PALETTE_MAX_SIZE]; + const int n_cache = av1_get_palette_cache(xd, 1, color_cache); + int out_cache_colors[PALETTE_MAX_SIZE]; + uint8_t cache_color_found[2 * PALETTE_MAX_SIZE]; + const int n_out_cache = av1_index_color_cache( + color_cache, n_cache, colors_u, n, cache_color_found, out_cache_colors); + int n_in_cache = 0; + for (int i = 0; i < n_cache && n_in_cache < n; ++i) { + const int found = cache_color_found[i]; + aom_write_bit(w, found); + n_in_cache += found; + } + delta_encode_palette_colors(out_cache_colors, n_out_cache, bit_depth, 0, w); + + // V channel colors. Don't use color cache as the colors are not sorted. 
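+ // Illustrative example of the wraparound delta coded below (8-bit depth,
+ // max_val = 256): for colors_v = {250, 3} the raw delta is 247 with a
+ // negative sign, but since 247 > max_val - 247 = 9 the encoder codes 9
+ // with the sign flipped; the decoder adds 9 to 250 and wraps modulo 256
+ // to recover 3.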
+ const int max_val = 1 << bit_depth; + int zero_count = 0, min_bits_v = 0; + int bits_v = + av1_get_palette_delta_bits_v(pmi, bit_depth, &zero_count, &min_bits_v); + const int rate_using_delta = + 2 + bit_depth + (bits_v + 1) * (n - 1) - zero_count; + const int rate_using_raw = bit_depth * n; + if (rate_using_delta < rate_using_raw) { // delta encoding + assert(colors_v[0] < (1 << bit_depth)); + aom_write_bit(w, 1); + aom_write_literal(w, bits_v - min_bits_v, 2); + aom_write_literal(w, colors_v[0], bit_depth); + for (int i = 1; i < n; ++i) { + assert(colors_v[i] < (1 << bit_depth)); + if (colors_v[i] == colors_v[i - 1]) { // No need to signal sign bit. + aom_write_literal(w, 0, bits_v); + continue; + } + const int delta = abs((int)colors_v[i] - colors_v[i - 1]); + const int sign_bit = colors_v[i] < colors_v[i - 1]; + if (delta <= max_val - delta) { + aom_write_literal(w, delta, bits_v); + aom_write_bit(w, sign_bit); + } else { + aom_write_literal(w, max_val - delta, bits_v); + aom_write_bit(w, !sign_bit); + } + } + } else { // Transmit raw values. + aom_write_bit(w, 0); + for (int i = 0; i < n; ++i) { + assert(colors_v[i] < (1 << bit_depth)); + aom_write_literal(w, colors_v[i], bit_depth); + } + } +} + +static AOM_INLINE void write_palette_mode_info(const AV1_COMMON *cm, + const MACROBLOCKD *xd, + const MB_MODE_INFO *const mbmi, + aom_writer *w) { + const int num_planes = av1_num_planes(cm); + const BLOCK_SIZE bsize = mbmi->bsize; + assert(av1_allow_palette(cm->features.allow_screen_content_tools, bsize)); + const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const int bsize_ctx = av1_get_palette_bsize_ctx(bsize); + + if (mbmi->mode == DC_PRED) { + const int n = pmi->palette_size[0]; + const int palette_y_mode_ctx = av1_get_palette_mode_ctx(xd); + aom_write_symbol( + w, n > 0, + xd->tile_ctx->palette_y_mode_cdf[bsize_ctx][palette_y_mode_ctx], 2); + if (n > 0) { + aom_write_symbol(w, n - PALETTE_MIN_SIZE, + xd->tile_ctx->palette_y_size_cdf[bsize_ctx], + PALETTE_SIZES); + write_palette_colors_y(xd, pmi, cm->seq_params->bit_depth, w); + } + } + + const int uv_dc_pred = + num_planes > 1 && mbmi->uv_mode == UV_DC_PRED && xd->is_chroma_ref; + if (uv_dc_pred) { + const int n = pmi->palette_size[1]; + const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0); + aom_write_symbol(w, n > 0, + xd->tile_ctx->palette_uv_mode_cdf[palette_uv_mode_ctx], 2); + if (n > 0) { + aom_write_symbol(w, n - PALETTE_MIN_SIZE, + xd->tile_ctx->palette_uv_size_cdf[bsize_ctx], + PALETTE_SIZES); + write_palette_colors_uv(xd, pmi, cm->seq_params->bit_depth, w); + } + } +} + +void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd, + TX_TYPE tx_type, TX_SIZE tx_size, aom_writer *w) { + MB_MODE_INFO *mbmi = xd->mi[0]; + const FeatureFlags *const features = &cm->features; + const int is_inter = is_inter_block(mbmi); + if (get_ext_tx_types(tx_size, is_inter, features->reduced_tx_set_used) > 1 && + ((!cm->seg.enabled && cm->quant_params.base_qindex > 0) || + (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) && + !mbmi->skip_txfm && + !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const TX_SIZE square_tx_size = txsize_sqr_map[tx_size]; + const TxSetType tx_set_type = av1_get_ext_tx_set_type( + tx_size, is_inter, features->reduced_tx_set_used); + const int eset = + get_ext_tx_set(tx_size, is_inter, features->reduced_tx_set_used); + // eset == 0 should correspond to a set with only DCT_DCT and there + // is no need to send the tx_type + 
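+ // (The enclosing condition already requires more than one available tx
+ // type, so a DCT_DCT-only set cannot reach this point.)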
assert(eset > 0); + assert(av1_ext_tx_used[tx_set_type][tx_type]); + if (is_inter) { + aom_write_symbol(w, av1_ext_tx_ind[tx_set_type][tx_type], + ec_ctx->inter_ext_tx_cdf[eset][square_tx_size], + av1_num_ext_tx_set[tx_set_type]); + } else { + PREDICTION_MODE intra_dir; + if (mbmi->filter_intra_mode_info.use_filter_intra) + intra_dir = + fimode_to_intradir[mbmi->filter_intra_mode_info.filter_intra_mode]; + else + intra_dir = mbmi->mode; + aom_write_symbol( + w, av1_ext_tx_ind[tx_set_type][tx_type], + ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][intra_dir], + av1_num_ext_tx_set[tx_set_type]); + } + } +} + +static AOM_INLINE void write_intra_y_mode_nonkf(FRAME_CONTEXT *frame_ctx, + BLOCK_SIZE bsize, + PREDICTION_MODE mode, + aom_writer *w) { + aom_write_symbol(w, mode, frame_ctx->y_mode_cdf[size_group_lookup[bsize]], + INTRA_MODES); +} + +static AOM_INLINE void write_intra_uv_mode(FRAME_CONTEXT *frame_ctx, + UV_PREDICTION_MODE uv_mode, + PREDICTION_MODE y_mode, + CFL_ALLOWED_TYPE cfl_allowed, + aom_writer *w) { + aom_write_symbol(w, uv_mode, frame_ctx->uv_mode_cdf[cfl_allowed][y_mode], + UV_INTRA_MODES - !cfl_allowed); +} + +static AOM_INLINE void write_cfl_alphas(FRAME_CONTEXT *const ec_ctx, + uint8_t idx, int8_t joint_sign, + aom_writer *w) { + aom_write_symbol(w, joint_sign, ec_ctx->cfl_sign_cdf, CFL_JOINT_SIGNS); + // Magnitudes are only signaled for nonzero codes. + if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) { + aom_cdf_prob *cdf_u = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)]; + aom_write_symbol(w, CFL_IDX_U(idx), cdf_u, CFL_ALPHABET_SIZE); + } + if (CFL_SIGN_V(joint_sign) != CFL_SIGN_ZERO) { + aom_cdf_prob *cdf_v = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)]; + aom_write_symbol(w, CFL_IDX_V(idx), cdf_v, CFL_ALPHABET_SIZE); + } +} + +static AOM_INLINE void write_cdef(AV1_COMMON *cm, MACROBLOCKD *const xd, + aom_writer *w, int skip) { + if (cm->features.coded_lossless || cm->features.allow_intrabc) return; + + // At the start of a superblock, mark that we haven't yet written CDEF + // strengths for any of the CDEF units contained in this superblock. + const int sb_mask = (cm->seq_params->mib_size - 1); + const int mi_row_in_sb = (xd->mi_row & sb_mask); + const int mi_col_in_sb = (xd->mi_col & sb_mask); + if (mi_row_in_sb == 0 && mi_col_in_sb == 0) { + xd->cdef_transmitted[0] = xd->cdef_transmitted[1] = + xd->cdef_transmitted[2] = xd->cdef_transmitted[3] = false; + } + + // CDEF unit size is 64x64 irrespective of the superblock size. + const int cdef_size = 1 << (6 - MI_SIZE_LOG2); + + // Find index of this CDEF unit in this superblock. + const int index_mask = cdef_size; + const int cdef_unit_row_in_sb = ((xd->mi_row & index_mask) != 0); + const int cdef_unit_col_in_sb = ((xd->mi_col & index_mask) != 0); + const int index = (cm->seq_params->sb_size == BLOCK_128X128) + ? cdef_unit_col_in_sb + 2 * cdef_unit_row_in_sb + : 0; + + // Write CDEF strength to the first non-skip coding block in this CDEF unit. + if (!xd->cdef_transmitted[index] && !skip) { + // CDEF strength for this CDEF unit needs to be stored in the MB_MODE_INFO + // of the 1st block in this CDEF unit. 
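+ // Illustrative: cdef_size is 16 mi units (64 pixels of 4x4 mi), so the
+ // mask ~(cdef_size - 1) rounds mi_row/mi_col down to the CDEF unit
+ // origin, e.g. mi_row 21 -> 16.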
+ const int first_block_mask = ~(cdef_size - 1); + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int grid_idx = + get_mi_grid_idx(mi_params, xd->mi_row & first_block_mask, + xd->mi_col & first_block_mask); + const MB_MODE_INFO *const mbmi = mi_params->mi_grid_base[grid_idx]; + aom_write_literal(w, mbmi->cdef_strength, cm->cdef_info.cdef_bits); + xd->cdef_transmitted[index] = true; + } +} + +static AOM_INLINE void write_inter_segment_id( + AV1_COMP *cpi, MACROBLOCKD *const xd, aom_writer *w, + const struct segmentation *const seg, struct segmentation_probs *const segp, + int skip, int preskip) { + MB_MODE_INFO *const mbmi = xd->mi[0]; + AV1_COMMON *const cm = &cpi->common; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + if (seg->update_map) { + if (preskip) { + if (!seg->segid_preskip) return; + } else { + if (seg->segid_preskip) return; + if (skip) { + write_segment_id(cpi, xd, mbmi, w, seg, segp, 1); + if (seg->temporal_update) mbmi->seg_id_predicted = 0; + return; + } + } + if (seg->temporal_update) { + const int pred_flag = mbmi->seg_id_predicted; + aom_cdf_prob *pred_cdf = av1_get_pred_cdf_seg_id(segp, xd); + aom_write_symbol(w, pred_flag, pred_cdf, 2); + if (!pred_flag) { + write_segment_id(cpi, xd, mbmi, w, seg, segp, 0); + } + if (pred_flag) { + set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map, + mbmi->bsize, mi_row, mi_col, mbmi->segment_id); + } + } else { + write_segment_id(cpi, xd, mbmi, w, seg, segp, 0); + } + } +} + +// If delta q is present, writes delta_q index. +// Also writes delta_q loop filter levels, if present. +static AOM_INLINE void write_delta_q_params(AV1_COMMON *const cm, + MACROBLOCKD *const xd, int skip, + aom_writer *w) { + const DeltaQInfo *const delta_q_info = &cm->delta_q_info; + + if (delta_q_info->delta_q_present_flag) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const BLOCK_SIZE bsize = mbmi->bsize; + const int super_block_upper_left = + ((xd->mi_row & (cm->seq_params->mib_size - 1)) == 0) && + ((xd->mi_col & (cm->seq_params->mib_size - 1)) == 0); + + if ((bsize != cm->seq_params->sb_size || skip == 0) && + super_block_upper_left) { + assert(mbmi->current_qindex > 0); + const int reduced_delta_qindex = + (mbmi->current_qindex - xd->current_base_qindex) / + delta_q_info->delta_q_res; + write_delta_qindex(xd, reduced_delta_qindex, w); + xd->current_base_qindex = mbmi->current_qindex; + if (delta_q_info->delta_lf_present_flag) { + if (delta_q_info->delta_lf_multi) { + const int frame_lf_count = + av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { + int reduced_delta_lflevel = + (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) / + delta_q_info->delta_lf_res; + write_delta_lflevel(cm, xd, lf_id, reduced_delta_lflevel, 1, w); + xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id]; + } + } else { + int reduced_delta_lflevel = + (mbmi->delta_lf_from_base - xd->delta_lf_from_base) / + delta_q_info->delta_lf_res; + write_delta_lflevel(cm, xd, -1, reduced_delta_lflevel, 0, w); + xd->delta_lf_from_base = mbmi->delta_lf_from_base; + } + } + } + } +} + +static AOM_INLINE void write_intra_prediction_modes(const AV1_COMMON *cm, + MACROBLOCKD *const xd, + int is_keyframe, + aom_writer *w) { + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const PREDICTION_MODE mode = mbmi->mode; + const BLOCK_SIZE bsize = mbmi->bsize; + + // Y mode. 
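+ // On key frames the Y mode CDF is conditioned on the above/left neighbor
+ // modes; on non-key frames it depends only on the block-size group.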
+ if (is_keyframe) { + const MB_MODE_INFO *const above_mi = xd->above_mbmi; + const MB_MODE_INFO *const left_mi = xd->left_mbmi; + write_intra_y_mode_kf(ec_ctx, mbmi, above_mi, left_mi, mode, w); + } else { + write_intra_y_mode_nonkf(ec_ctx, bsize, mode, w); + } + + // Y angle delta. + const int use_angle_delta = av1_use_angle_delta(bsize); + if (use_angle_delta && av1_is_directional_mode(mode)) { + write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_Y], + ec_ctx->angle_delta_cdf[mode - V_PRED]); + } + + // UV mode and UV angle delta. + if (!cm->seq_params->monochrome && xd->is_chroma_ref) { + const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode; + write_intra_uv_mode(ec_ctx, uv_mode, mode, is_cfl_allowed(xd), w); + if (uv_mode == UV_CFL_PRED) + write_cfl_alphas(ec_ctx, mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, w); + const PREDICTION_MODE intra_mode = get_uv_mode(uv_mode); + if (use_angle_delta && av1_is_directional_mode(intra_mode)) { + write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_UV], + ec_ctx->angle_delta_cdf[intra_mode - V_PRED]); + } + } + + // Palette. + if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) { + write_palette_mode_info(cm, xd, mbmi, w); + } + + // Filter intra. + write_filter_intra_mode_info(cm, xd, mbmi, w); +} + +static INLINE int16_t mode_context_analyzer( + const int16_t mode_context, const MV_REFERENCE_FRAME *const rf) { + if (rf[1] <= INTRA_FRAME) return mode_context; + + const int16_t newmv_ctx = mode_context & NEWMV_CTX_MASK; + const int16_t refmv_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK; + + const int16_t comp_ctx = compound_mode_ctx_map[refmv_ctx >> 1][AOMMIN( + newmv_ctx, COMP_NEWMV_CTXS - 1)]; + return comp_ctx; +} + +static INLINE int_mv get_ref_mv_from_stack( + int ref_idx, const MV_REFERENCE_FRAME *ref_frame, int ref_mv_idx, + const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame) { + const int8_t ref_frame_type = av1_ref_frame_type(ref_frame); + const CANDIDATE_MV *curr_ref_mv_stack = mbmi_ext_frame->ref_mv_stack; + + if (ref_frame[1] > INTRA_FRAME) { + assert(ref_idx == 0 || ref_idx == 1); + return ref_idx ? curr_ref_mv_stack[ref_mv_idx].comp_mv + : curr_ref_mv_stack[ref_mv_idx].this_mv; + } + + assert(ref_idx == 0); + return ref_mv_idx < mbmi_ext_frame->ref_mv_count + ? 
curr_ref_mv_stack[ref_mv_idx].this_mv + : mbmi_ext_frame->global_mvs[ref_frame_type]; +} + +static INLINE int_mv get_ref_mv(const MACROBLOCK *x, int ref_idx) { + const MACROBLOCKD *xd = &x->e_mbd; + const MB_MODE_INFO *mbmi = xd->mi[0]; + int ref_mv_idx = mbmi->ref_mv_idx; + if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) { + assert(has_second_ref(mbmi)); + ref_mv_idx += 1; + } + return get_ref_mv_from_stack(ref_idx, mbmi->ref_frame, ref_mv_idx, + x->mbmi_ext_frame); +} + +static AOM_INLINE void pack_inter_mode_mvs(AV1_COMP *cpi, ThreadData *const td, + aom_writer *w) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const struct segmentation *const seg = &cm->seg; + struct segmentation_probs *const segp = &ec_ctx->seg; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const MB_MODE_INFO_EXT_FRAME *const mbmi_ext_frame = x->mbmi_ext_frame; + const PREDICTION_MODE mode = mbmi->mode; + const uint8_t segment_id = mbmi->segment_id; + const BLOCK_SIZE bsize = mbmi->bsize; + const int allow_hp = cm->features.allow_high_precision_mv; + const int is_inter = is_inter_block(mbmi); + const int is_compound = has_second_ref(mbmi); + int ref; + + write_inter_segment_id(cpi, xd, w, seg, segp, 0, 1); + + write_skip_mode(cm, xd, segment_id, mbmi, w); + + assert(IMPLIES(mbmi->skip_mode, mbmi->skip_txfm)); + const int skip = + mbmi->skip_mode ? 1 : write_skip(cm, xd, segment_id, mbmi, w); + + write_inter_segment_id(cpi, xd, w, seg, segp, skip, 0); + + write_cdef(cm, xd, w, skip); + + write_delta_q_params(cm, xd, skip, w); + + if (!mbmi->skip_mode) write_is_inter(cm, xd, mbmi->segment_id, w, is_inter); + + if (mbmi->skip_mode) return; + + if (!is_inter) { + write_intra_prediction_modes(cm, xd, 0, w); + } else { + int16_t mode_ctx; + + av1_collect_neighbors_ref_counts(xd); + + write_ref_frames(cm, xd, w); + + mode_ctx = + mode_context_analyzer(mbmi_ext_frame->mode_context, mbmi->ref_frame); + + // If segment skip is not enabled code the mode. 
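+ // (The mode symbol is coded first, compound or single-reference; NEWMV
+ // and the NEARMV-class modes then send a DRL index into the
+ // reference-MV stack.)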
+ if (!segfeature_active(seg, segment_id, SEG_LVL_SKIP)) { + if (is_inter_compound_mode(mode)) + write_inter_compound_mode(xd, w, mode, mode_ctx); + else if (is_inter_singleref_mode(mode)) + write_inter_mode(w, mode, ec_ctx, mode_ctx); + + if (mode == NEWMV || mode == NEW_NEWMV || have_nearmv_in_inter_mode(mode)) + write_drl_idx(ec_ctx, mbmi, mbmi_ext_frame, w); + else + assert(mbmi->ref_mv_idx == 0); + } + + if (mode == NEWMV || mode == NEW_NEWMV) { + for (ref = 0; ref < 1 + is_compound; ++ref) { + nmv_context *nmvc = &ec_ctx->nmvc; + const int_mv ref_mv = get_ref_mv(x, ref); + av1_encode_mv(cpi, w, td, &mbmi->mv[ref].as_mv, &ref_mv.as_mv, nmvc, + allow_hp); + } + } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) { + nmv_context *nmvc = &ec_ctx->nmvc; + const int_mv ref_mv = get_ref_mv(x, 1); + av1_encode_mv(cpi, w, td, &mbmi->mv[1].as_mv, &ref_mv.as_mv, nmvc, + allow_hp); + } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) { + nmv_context *nmvc = &ec_ctx->nmvc; + const int_mv ref_mv = get_ref_mv(x, 0); + av1_encode_mv(cpi, w, td, &mbmi->mv[0].as_mv, &ref_mv.as_mv, nmvc, + allow_hp); + } + + if (cpi->common.current_frame.reference_mode != COMPOUND_REFERENCE && + cpi->common.seq_params->enable_interintra_compound && + is_interintra_allowed(mbmi)) { + const int interintra = mbmi->ref_frame[1] == INTRA_FRAME; + const int bsize_group = size_group_lookup[bsize]; + aom_write_symbol(w, interintra, ec_ctx->interintra_cdf[bsize_group], 2); + if (interintra) { + aom_write_symbol(w, mbmi->interintra_mode, + ec_ctx->interintra_mode_cdf[bsize_group], + INTERINTRA_MODES); + if (av1_is_wedge_used(bsize)) { + aom_write_symbol(w, mbmi->use_wedge_interintra, + ec_ctx->wedge_interintra_cdf[bsize], 2); + if (mbmi->use_wedge_interintra) { + aom_write_symbol(w, mbmi->interintra_wedge_index, + ec_ctx->wedge_idx_cdf[bsize], MAX_WEDGE_TYPES); + } + } + } + } + + if (mbmi->ref_frame[1] != INTRA_FRAME) write_motion_mode(cm, xd, mbmi, w); + + // First write idx to indicate current compound inter prediction mode group + // Group A (0): dist_wtd_comp, compound_average + // Group B (1): interintra, compound_diffwtd, wedge + if (has_second_ref(mbmi)) { + const int masked_compound_used = is_any_masked_compound_used(bsize) && + cm->seq_params->enable_masked_compound; + + if (masked_compound_used) { + const int ctx_comp_group_idx = get_comp_group_idx_context(xd); + aom_write_symbol(w, mbmi->comp_group_idx, + ec_ctx->comp_group_idx_cdf[ctx_comp_group_idx], 2); + } else { + assert(mbmi->comp_group_idx == 0); + } + + if (mbmi->comp_group_idx == 0) { + if (mbmi->compound_idx) + assert(mbmi->interinter_comp.type == COMPOUND_AVERAGE); + + if (cm->seq_params->order_hint_info.enable_dist_wtd_comp) { + const int comp_index_ctx = get_comp_index_context(cm, xd); + aom_write_symbol(w, mbmi->compound_idx, + ec_ctx->compound_index_cdf[comp_index_ctx], 2); + } else { + assert(mbmi->compound_idx == 1); + } + } else { + assert(cpi->common.current_frame.reference_mode != SINGLE_REFERENCE && + is_inter_compound_mode(mbmi->mode) && + mbmi->motion_mode == SIMPLE_TRANSLATION); + assert(masked_compound_used); + // compound_diffwtd, wedge + assert(mbmi->interinter_comp.type == COMPOUND_WEDGE || + mbmi->interinter_comp.type == COMPOUND_DIFFWTD); + + if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) + aom_write_symbol(w, mbmi->interinter_comp.type - COMPOUND_WEDGE, + ec_ctx->compound_type_cdf[bsize], + MASKED_COMPOUND_TYPES); + + if (mbmi->interinter_comp.type == COMPOUND_WEDGE) { + 
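+ // Wedge compound: code the wedge shape index plus, roughly speaking, a
+ // sign bit giving the mask polarity (which side takes which predictor).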
assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize)); + aom_write_symbol(w, mbmi->interinter_comp.wedge_index, + ec_ctx->wedge_idx_cdf[bsize], MAX_WEDGE_TYPES); + aom_write_bit(w, mbmi->interinter_comp.wedge_sign); + } else { + assert(mbmi->interinter_comp.type == COMPOUND_DIFFWTD); + aom_write_literal(w, mbmi->interinter_comp.mask_type, + MAX_DIFFWTD_MASK_BITS); + } + } + } + write_mb_interp_filter(cm, td, w); + } +} + +static AOM_INLINE void write_intrabc_info( + MACROBLOCKD *xd, const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame, + aom_writer *w) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + int use_intrabc = is_intrabc_block(mbmi); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + aom_write_symbol(w, use_intrabc, ec_ctx->intrabc_cdf, 2); + if (use_intrabc) { + assert(mbmi->mode == DC_PRED); + assert(mbmi->uv_mode == UV_DC_PRED); + assert(mbmi->motion_mode == SIMPLE_TRANSLATION); + int_mv dv_ref = mbmi_ext_frame->ref_mv_stack[0].this_mv; + av1_encode_dv(w, &mbmi->mv[0].as_mv, &dv_ref.as_mv, &ec_ctx->ndvc); + } +} + +static AOM_INLINE void write_mb_modes_kf( + AV1_COMP *cpi, MACROBLOCKD *xd, + const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame, aom_writer *w) { + AV1_COMMON *const cm = &cpi->common; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const struct segmentation *const seg = &cm->seg; + struct segmentation_probs *const segp = &ec_ctx->seg; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + + if (seg->segid_preskip && seg->update_map) + write_segment_id(cpi, xd, mbmi, w, seg, segp, 0); + + const int skip = write_skip(cm, xd, mbmi->segment_id, mbmi, w); + + if (!seg->segid_preskip && seg->update_map) + write_segment_id(cpi, xd, mbmi, w, seg, segp, skip); + + write_cdef(cm, xd, w, skip); + + write_delta_q_params(cm, xd, skip, w); + + if (av1_allow_intrabc(cm)) { + write_intrabc_info(xd, mbmi_ext_frame, w); + if (is_intrabc_block(mbmi)) return; + } + + write_intra_prediction_modes(cm, xd, 1, w); +} + +#if CONFIG_RD_DEBUG +static AOM_INLINE void dump_mode_info(MB_MODE_INFO *mi) { + printf("\nmi->mi_row == %d\n", mi->mi_row); + printf("&& mi->mi_col == %d\n", mi->mi_col); + printf("&& mi->bsize == %d\n", mi->bsize); + printf("&& mi->tx_size == %d\n", mi->tx_size); + printf("&& mi->mode == %d\n", mi->mode); +} + +static int rd_token_stats_mismatch(RD_STATS *rd_stats, TOKEN_STATS *token_stats, + int plane) { + if (rd_stats->txb_coeff_cost[plane] != token_stats->cost) { + printf("\nplane %d rd_stats->txb_coeff_cost %d token_stats->cost %d\n", + plane, rd_stats->txb_coeff_cost[plane], token_stats->cost); + return 1; + } + return 0; +} +#endif + +#if ENC_MISMATCH_DEBUG +static AOM_INLINE void enc_dump_logs( + const AV1_COMMON *const cm, + const MBMIExtFrameBufferInfo *const mbmi_ext_info, int mi_row, int mi_col) { + const MB_MODE_INFO *const mbmi = *( + cm->mi_params.mi_grid_base + (mi_row * cm->mi_params.mi_stride + mi_col)); + const MB_MODE_INFO_EXT_FRAME *const mbmi_ext_frame = + mbmi_ext_info->frame_base + get_mi_ext_idx(mi_row, mi_col, + cm->mi_params.mi_alloc_bsize, + mbmi_ext_info->stride); + if (is_inter_block(mbmi)) { +#define FRAME_TO_CHECK 11 + if (cm->current_frame.frame_number == FRAME_TO_CHECK && + cm->show_frame == 1) { + const BLOCK_SIZE bsize = mbmi->bsize; + + int_mv mv[2] = { 0 }; + const int is_comp_ref = has_second_ref(mbmi); + + for (int ref = 0; ref < 1 + is_comp_ref; ++ref) + mv[ref].as_mv = mbmi->mv[ref].as_mv; + + if (!is_comp_ref) { + mv[1].as_int = 0; + } + + const int16_t mode_ctx = + is_comp_ref ? 
0 + : mode_context_analyzer(mbmi_ext_frame->mode_context, + mbmi->ref_frame); + + const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK; + int16_t zeromv_ctx = -1; + int16_t refmv_ctx = -1; + + if (mbmi->mode != NEWMV) { + zeromv_ctx = (mode_ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; + if (mbmi->mode != GLOBALMV) + refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK; + } + + printf( + "=== ENCODER ===: " + "Frame=%d, (mi_row,mi_col)=(%d,%d), skip_mode=%d, mode=%d, bsize=%d, " + "show_frame=%d, mv[0]=(%d,%d), mv[1]=(%d,%d), ref[0]=%d, " + "ref[1]=%d, motion_mode=%d, mode_ctx=%d, " + "newmv_ctx=%d, zeromv_ctx=%d, refmv_ctx=%d, tx_size=%d\n", + cm->current_frame.frame_number, mi_row, mi_col, mbmi->skip_mode, + mbmi->mode, bsize, cm->show_frame, mv[0].as_mv.row, mv[0].as_mv.col, + mv[1].as_mv.row, mv[1].as_mv.col, mbmi->ref_frame[0], + mbmi->ref_frame[1], mbmi->motion_mode, mode_ctx, newmv_ctx, + zeromv_ctx, refmv_ctx, mbmi->tx_size); + } + } +} +#endif // ENC_MISMATCH_DEBUG + +static AOM_INLINE void write_mbmi_b(AV1_COMP *cpi, ThreadData *const td, + aom_writer *w) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &td->mb.e_mbd; + MB_MODE_INFO *m = xd->mi[0]; + + if (frame_is_intra_only(cm)) { + write_mb_modes_kf(cpi, xd, td->mb.mbmi_ext_frame, w); + } else { + // has_subpel_mv_component needs the ref frame buffers set up to look + // up if they are scaled. has_subpel_mv_component is in turn needed by + // write_switchable_interp_filter, which is called by pack_inter_mode_mvs. + set_ref_ptrs(cm, xd, m->ref_frame[0], m->ref_frame[1]); + +#if ENC_MISMATCH_DEBUG + enc_dump_logs(cm, &cpi->mbmi_ext_info, xd->mi_row, xd->mi_col); +#endif // ENC_MISMATCH_DEBUG + + pack_inter_mode_mvs(cpi, td, w); + } +} + +static AOM_INLINE void write_inter_txb_coeff( + AV1_COMMON *const cm, MACROBLOCK *const x, MB_MODE_INFO *const mbmi, + aom_writer *w, const TokenExtra **tok, const TokenExtra *const tok_end, + TOKEN_STATS *token_stats, const int row, const int col, int *block, + const int plane) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE bsize = mbmi->bsize; + assert(bsize < BLOCK_SIZES_ALL); + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y); + assert(plane_bsize < BLOCK_SIZES_ALL); + const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane); + const int step = + tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size]; + const int bkw = tx_size_wide_unit[max_tx_size]; + const int bkh = tx_size_high_unit[max_tx_size]; + const BLOCK_SIZE max_unit_bsize = + get_plane_block_size(BLOCK_64X64, ss_x, ss_y); + const int num_4x4_w = mi_size_wide[plane_bsize]; + const int num_4x4_h = mi_size_high[plane_bsize]; + const int mu_blocks_wide = mi_size_wide[max_unit_bsize]; + const int mu_blocks_high = mi_size_high[max_unit_bsize]; + const int unit_height = AOMMIN(mu_blocks_high + (row >> ss_y), num_4x4_h); + const int unit_width = AOMMIN(mu_blocks_wide + (col >> ss_x), num_4x4_w); + for (int blk_row = row >> ss_y; blk_row < unit_height; blk_row += bkh) { + for (int blk_col = col >> ss_x; blk_col < unit_width; blk_col += bkw) { + pack_txb_tokens(w, cm, x, tok, tok_end, xd, mbmi, plane, plane_bsize, + cm->seq_params->bit_depth, *block, blk_row, blk_col, + max_tx_size, token_stats); + *block += step; + } + } +} + +static AOM_INLINE void write_tokens_b(AV1_COMP *cpi, MACROBLOCK *const x, + aom_writer *w, const 
TokenExtra **tok, + const TokenExtra *const tok_end) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const BLOCK_SIZE bsize = mbmi->bsize; + + assert(!mbmi->skip_txfm); + + const int is_inter = is_inter_block(mbmi); + if (!is_inter) { + av1_write_intra_coeffs_mb(cm, x, w, bsize); + } else { + int block[MAX_MB_PLANE] = { 0 }; + assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x, + xd->plane[0].subsampling_y)); + const int num_4x4_w = mi_size_wide[bsize]; + const int num_4x4_h = mi_size_high[bsize]; + TOKEN_STATS token_stats; + init_token_stats(&token_stats); + + const BLOCK_SIZE max_unit_bsize = BLOCK_64X64; + assert(max_unit_bsize == get_plane_block_size(BLOCK_64X64, + xd->plane[0].subsampling_x, + xd->plane[0].subsampling_y)); + int mu_blocks_wide = mi_size_wide[max_unit_bsize]; + int mu_blocks_high = mi_size_high[max_unit_bsize]; + mu_blocks_wide = AOMMIN(num_4x4_w, mu_blocks_wide); + mu_blocks_high = AOMMIN(num_4x4_h, mu_blocks_high); + + const int num_planes = av1_num_planes(cm); + for (int row = 0; row < num_4x4_h; row += mu_blocks_high) { + for (int col = 0; col < num_4x4_w; col += mu_blocks_wide) { + for (int plane = 0; plane < num_planes; ++plane) { + if (plane && !xd->is_chroma_ref) break; + write_inter_txb_coeff(cm, x, mbmi, w, tok, tok_end, &token_stats, row, + col, &block[plane], plane); + } + } + } +#if CONFIG_RD_DEBUG + for (int plane = 0; plane < num_planes; ++plane) { + if (mbmi->bsize >= BLOCK_8X8 && + rd_token_stats_mismatch(&mbmi->rd_stats, &token_stats, plane)) { + dump_mode_info(mbmi); + assert(0); + } + } +#endif // CONFIG_RD_DEBUG + } +} + +static AOM_INLINE void write_modes_b(AV1_COMP *cpi, ThreadData *const td, + const TileInfo *const tile, aom_writer *w, + const TokenExtra **tok, + const TokenExtra *const tok_end, + int mi_row, int mi_col) { + const AV1_COMMON *cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + MACROBLOCKD *xd = &td->mb.e_mbd; + FRAME_CONTEXT *tile_ctx = xd->tile_ctx; + const int grid_idx = mi_row * mi_params->mi_stride + mi_col; + xd->mi = mi_params->mi_grid_base + grid_idx; + td->mb.mbmi_ext_frame = + cpi->mbmi_ext_info.frame_base + + get_mi_ext_idx(mi_row, mi_col, cm->mi_params.mi_alloc_bsize, + cpi->mbmi_ext_info.stride); + xd->tx_type_map = mi_params->tx_type_map + grid_idx; + xd->tx_type_map_stride = mi_params->mi_stride; + + const MB_MODE_INFO *mbmi = xd->mi[0]; + const BLOCK_SIZE bsize = mbmi->bsize; + assert(bsize <= cm->seq_params->sb_size || + (bsize >= BLOCK_SIZES && bsize < BLOCK_SIZES_ALL)); + + const int bh = mi_size_high[bsize]; + const int bw = mi_size_wide[bsize]; + set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, mi_params->mi_rows, + mi_params->mi_cols); + + xd->above_txfm_context = cm->above_contexts.txfm[tile->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + + write_mbmi_b(cpi, td, w); + + for (int plane = 0; plane < AOMMIN(2, av1_num_planes(cm)); ++plane) { + const uint8_t palette_size_plane = + mbmi->palette_mode_info.palette_size[plane]; + assert(!mbmi->skip_mode || !palette_size_plane); + if (palette_size_plane > 0) { + assert(mbmi->use_intrabc == 0); + assert(av1_allow_palette(cm->features.allow_screen_content_tools, + mbmi->bsize)); + assert(!plane || xd->is_chroma_ref); + int rows, cols; + av1_get_block_dimensions(mbmi->bsize, plane, xd, NULL, NULL, &rows, + &cols); + assert(*tok < tok_end); + MapCdf map_pb_cdf = plane ? 
tile_ctx->palette_uv_color_index_cdf + : tile_ctx->palette_y_color_index_cdf; + pack_map_tokens(w, tok, palette_size_plane, rows * cols, map_pb_cdf); + } + } + + const int is_inter_tx = is_inter_block(mbmi); + const int skip_txfm = mbmi->skip_txfm; + const uint8_t segment_id = mbmi->segment_id; + if (cm->features.tx_mode == TX_MODE_SELECT && block_signals_txsize(bsize) && + !(is_inter_tx && skip_txfm) && !xd->lossless[segment_id]) { + if (is_inter_tx) { // This implies skip flag is 0. + const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, bsize, 0); + const int txbh = tx_size_high_unit[max_tx_size]; + const int txbw = tx_size_wide_unit[max_tx_size]; + const int width = mi_size_wide[bsize]; + const int height = mi_size_high[bsize]; + for (int idy = 0; idy < height; idy += txbh) { + for (int idx = 0; idx < width; idx += txbw) { + write_tx_size_vartx(xd, mbmi, max_tx_size, 0, idy, idx, w); + } + } + } else { + write_selected_tx_size(xd, w); + set_txfm_ctxs(mbmi->tx_size, xd->width, xd->height, 0, xd); + } + } else { + set_txfm_ctxs(mbmi->tx_size, xd->width, xd->height, + skip_txfm && is_inter_tx, xd); + } + + if (!mbmi->skip_txfm) { + int start = aom_tell_size(w); + + write_tokens_b(cpi, &td->mb, w, tok, tok_end); + + const int end = aom_tell_size(w); + td->coefficient_size += end - start; + } +} + +static AOM_INLINE void write_partition(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, int hbs, + int mi_row, int mi_col, PARTITION_TYPE p, + BLOCK_SIZE bsize, aom_writer *w) { + const int is_partition_point = bsize >= BLOCK_8X8; + + if (!is_partition_point) return; + + const int has_rows = (mi_row + hbs) < cm->mi_params.mi_rows; + const int has_cols = (mi_col + hbs) < cm->mi_params.mi_cols; + const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + + if (!has_rows && !has_cols) { + assert(p == PARTITION_SPLIT); + return; + } + + if (has_rows && has_cols) { + aom_write_symbol(w, p, ec_ctx->partition_cdf[ctx], + partition_cdf_length(bsize)); + } else if (!has_rows && has_cols) { + assert(p == PARTITION_SPLIT || p == PARTITION_HORZ); + assert(bsize > BLOCK_8X8); + aom_cdf_prob cdf[2]; + partition_gather_vert_alike(cdf, ec_ctx->partition_cdf[ctx], bsize); + aom_write_cdf(w, p == PARTITION_SPLIT, cdf, 2); + } else { + assert(has_rows && !has_cols); + assert(p == PARTITION_SPLIT || p == PARTITION_VERT); + assert(bsize > BLOCK_8X8); + aom_cdf_prob cdf[2]; + partition_gather_horz_alike(cdf, ec_ctx->partition_cdf[ctx], bsize); + aom_write_cdf(w, p == PARTITION_SPLIT, cdf, 2); + } +} + +static AOM_INLINE void write_modes_sb( + AV1_COMP *const cpi, ThreadData *const td, const TileInfo *const tile, + aom_writer *const w, const TokenExtra **tok, + const TokenExtra *const tok_end, int mi_row, int mi_col, BLOCK_SIZE bsize) { + const AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + MACROBLOCKD *const xd = &td->mb.e_mbd; + assert(bsize < BLOCK_SIZES_ALL); + const int hbs = mi_size_wide[bsize] / 2; + const int quarter_step = mi_size_wide[bsize] / 4; + int i; + const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize); + const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition); + + if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return; + +#if !CONFIG_REALTIME_ONLY + const int num_planes = av1_num_planes(cm); + for (int plane = 0; plane < num_planes; ++plane) { + int rcol0, rcol1, rrow0, rrow1; + + // Skip some unnecessary work if loop restoration is 
disabled + if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue; + + if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize, + &rcol0, &rcol1, &rrow0, &rrow1)) { + const int rstride = cm->rst_info[plane].horz_units; + for (int rrow = rrow0; rrow < rrow1; ++rrow) { + for (int rcol = rcol0; rcol < rcol1; ++rcol) { + const int runit_idx = rcol + rrow * rstride; + loop_restoration_write_sb_coeffs(cm, xd, runit_idx, w, plane, + td->counts); + } + } + } + } +#endif + + write_partition(cm, xd, hbs, mi_row, mi_col, partition, bsize, w); + switch (partition) { + case PARTITION_NONE: + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col); + break; + case PARTITION_HORZ: + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col); + if (mi_row + hbs < mi_params->mi_rows) + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col); + break; + case PARTITION_VERT: + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col); + if (mi_col + hbs < mi_params->mi_cols) + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs); + break; + case PARTITION_SPLIT: + write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row, mi_col, subsize); + write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs, + subsize); + write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col, + subsize); + write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs, + subsize); + break; + case PARTITION_HORZ_A: + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col); + break; + case PARTITION_HORZ_B: + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs); + break; + case PARTITION_VERT_A: + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs); + break; + case PARTITION_VERT_B: + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs); + break; + case PARTITION_HORZ_4: + for (i = 0; i < 4; ++i) { + int this_mi_row = mi_row + i * quarter_step; + if (i > 0 && this_mi_row >= mi_params->mi_rows) break; + + write_modes_b(cpi, td, tile, w, tok, tok_end, this_mi_row, mi_col); + } + break; + case PARTITION_VERT_4: + for (i = 0; i < 4; ++i) { + int this_mi_col = mi_col + i * quarter_step; + if (i > 0 && this_mi_col >= mi_params->mi_cols) break; + + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, this_mi_col); + } + break; + default: assert(0); + } + + // update partition context + update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition); +} + +// Populate token pointers appropriately based on token_info. 
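+// When no token buffer was allocated both pointers come back NULL, giving
+// an empty token range for the superblock row.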
+static AOM_INLINE void get_token_pointers(const TokenInfo *token_info, + const int tile_row, int tile_col, + const int sb_row_in_tile, + const TokenExtra **tok, + const TokenExtra **tok_end) { + if (!is_token_info_allocated(token_info)) { + *tok = NULL; + *tok_end = NULL; + return; + } + *tok = token_info->tplist[tile_row][tile_col][sb_row_in_tile].start; + *tok_end = + *tok + token_info->tplist[tile_row][tile_col][sb_row_in_tile].count; +} + +static AOM_INLINE void write_modes(AV1_COMP *const cpi, ThreadData *const td, + const TileInfo *const tile, + aom_writer *const w, int tile_row, + int tile_col) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &td->mb.e_mbd; + const int mi_row_start = tile->mi_row_start; + const int mi_row_end = tile->mi_row_end; + const int mi_col_start = tile->mi_col_start; + const int mi_col_end = tile->mi_col_end; + const int num_planes = av1_num_planes(cm); + + av1_zero_above_context(cm, xd, mi_col_start, mi_col_end, tile->tile_row); + av1_init_above_context(&cm->above_contexts, num_planes, tile->tile_row, xd); + + if (cpi->common.delta_q_info.delta_q_present_flag) { + xd->current_base_qindex = cpi->common.quant_params.base_qindex; + if (cpi->common.delta_q_info.delta_lf_present_flag) { + av1_reset_loop_filter_delta(xd, num_planes); + } + } + + for (int mi_row = mi_row_start; mi_row < mi_row_end; + mi_row += cm->seq_params->mib_size) { + const int sb_row_in_tile = + (mi_row - tile->mi_row_start) >> cm->seq_params->mib_size_log2; + const TokenInfo *token_info = &cpi->token_info; + const TokenExtra *tok; + const TokenExtra *tok_end; + get_token_pointers(token_info, tile_row, tile_col, sb_row_in_tile, &tok, + &tok_end); + + av1_zero_left_context(xd); + + for (int mi_col = mi_col_start; mi_col < mi_col_end; + mi_col += cm->seq_params->mib_size) { + td->mb.cb_coef_buff = av1_get_cb_coeff_buffer(cpi, mi_row, mi_col); + write_modes_sb(cpi, td, tile, w, &tok, tok_end, mi_row, mi_col, + cm->seq_params->sb_size); + } + assert(tok == tok_end); + } +} + +static AOM_INLINE void encode_restoration_mode( + AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { + assert(!cm->features.all_lossless); + if (!cm->seq_params->enable_restoration) return; + if (cm->features.allow_intrabc) return; + const int num_planes = av1_num_planes(cm); + int all_none = 1, chroma_none = 1; + for (int p = 0; p < num_planes; ++p) { + RestorationInfo *rsi = &cm->rst_info[p]; + if (rsi->frame_restoration_type != RESTORE_NONE) { + all_none = 0; + chroma_none &= p == 0; + } + switch (rsi->frame_restoration_type) { + case RESTORE_NONE: + aom_wb_write_bit(wb, 0); + aom_wb_write_bit(wb, 0); + break; + case RESTORE_WIENER: + aom_wb_write_bit(wb, 1); + aom_wb_write_bit(wb, 0); + break; + case RESTORE_SGRPROJ: + aom_wb_write_bit(wb, 1); + aom_wb_write_bit(wb, 1); + break; + case RESTORE_SWITCHABLE: + aom_wb_write_bit(wb, 0); + aom_wb_write_bit(wb, 1); + break; + default: assert(0); + } + } + if (!all_none) { + assert(cm->seq_params->sb_size == BLOCK_64X64 || + cm->seq_params->sb_size == BLOCK_128X128); + const int sb_size = cm->seq_params->sb_size == BLOCK_128X128 ? 
128 : 64; + + RestorationInfo *rsi = &cm->rst_info[0]; + + assert(rsi->restoration_unit_size >= sb_size); + assert(RESTORATION_UNITSIZE_MAX == 256); + + if (sb_size == 64) { + aom_wb_write_bit(wb, rsi->restoration_unit_size > 64); + } + if (rsi->restoration_unit_size > 64) { + aom_wb_write_bit(wb, rsi->restoration_unit_size > 128); + } + } + + if (num_planes > 1) { + int s = + AOMMIN(cm->seq_params->subsampling_x, cm->seq_params->subsampling_y); + if (s && !chroma_none) { + aom_wb_write_bit(wb, cm->rst_info[1].restoration_unit_size != + cm->rst_info[0].restoration_unit_size); + assert(cm->rst_info[1].restoration_unit_size == + cm->rst_info[0].restoration_unit_size || + cm->rst_info[1].restoration_unit_size == + (cm->rst_info[0].restoration_unit_size >> s)); + assert(cm->rst_info[2].restoration_unit_size == + cm->rst_info[1].restoration_unit_size); + } else if (!s) { + assert(cm->rst_info[1].restoration_unit_size == + cm->rst_info[0].restoration_unit_size); + assert(cm->rst_info[2].restoration_unit_size == + cm->rst_info[1].restoration_unit_size); + } + } +} + +#if !CONFIG_REALTIME_ONLY +static AOM_INLINE void write_wiener_filter(int wiener_win, + const WienerInfo *wiener_info, + WienerInfo *ref_wiener_info, + aom_writer *wb) { + if (wiener_win == WIENER_WIN) + aom_write_primitive_refsubexpfin( + wb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1, + WIENER_FILT_TAP0_SUBEXP_K, + ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV, + wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV); + else + assert(wiener_info->vfilter[0] == 0 && + wiener_info->vfilter[WIENER_WIN - 1] == 0); + aom_write_primitive_refsubexpfin( + wb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1, + WIENER_FILT_TAP1_SUBEXP_K, + ref_wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV, + wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV); + aom_write_primitive_refsubexpfin( + wb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1, + WIENER_FILT_TAP2_SUBEXP_K, + ref_wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV, + wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV); + if (wiener_win == WIENER_WIN) + aom_write_primitive_refsubexpfin( + wb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1, + WIENER_FILT_TAP0_SUBEXP_K, + ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV, + wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV); + else + assert(wiener_info->hfilter[0] == 0 && + wiener_info->hfilter[WIENER_WIN - 1] == 0); + aom_write_primitive_refsubexpfin( + wb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1, + WIENER_FILT_TAP1_SUBEXP_K, + ref_wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV, + wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV); + aom_write_primitive_refsubexpfin( + wb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1, + WIENER_FILT_TAP2_SUBEXP_K, + ref_wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV, + wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV); + memcpy(ref_wiener_info, wiener_info, sizeof(*wiener_info)); +} + +static AOM_INLINE void write_sgrproj_filter(const SgrprojInfo *sgrproj_info, + SgrprojInfo *ref_sgrproj_info, + aom_writer *wb) { + aom_write_literal(wb, sgrproj_info->ep, SGRPROJ_PARAMS_BITS); + const sgr_params_type *params = &av1_sgr_params[sgrproj_info->ep]; + + if (params->r[0] == 0) { + assert(sgrproj_info->xqd[0] == 0); + aom_write_primitive_refsubexpfin( + wb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, + sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1); + } else if (params->r[1] == 0) { + aom_write_primitive_refsubexpfin( + wb, 
SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, + sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0); + } else { + aom_write_primitive_refsubexpfin( + wb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, + sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0); + aom_write_primitive_refsubexpfin( + wb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, + sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1); + } + + memcpy(ref_sgrproj_info, sgrproj_info, sizeof(*sgrproj_info)); +} + +static AOM_INLINE void loop_restoration_write_sb_coeffs( + const AV1_COMMON *const cm, MACROBLOCKD *xd, int runit_idx, + aom_writer *const w, int plane, FRAME_COUNTS *counts) { + const RestorationUnitInfo *rui = &cm->rst_info[plane].unit_info[runit_idx]; + const RestorationInfo *rsi = cm->rst_info + plane; + RestorationType frame_rtype = rsi->frame_restoration_type; + assert(frame_rtype != RESTORE_NONE); + + (void)counts; + assert(!cm->features.all_lossless); + + const int wiener_win = (plane > 0) ? WIENER_WIN_CHROMA : WIENER_WIN; + WienerInfo *ref_wiener_info = &xd->wiener_info[plane]; + SgrprojInfo *ref_sgrproj_info = &xd->sgrproj_info[plane]; + RestorationType unit_rtype = rui->restoration_type; + + if (frame_rtype == RESTORE_SWITCHABLE) { + aom_write_symbol(w, unit_rtype, xd->tile_ctx->switchable_restore_cdf, + RESTORE_SWITCHABLE_TYPES); +#if CONFIG_ENTROPY_STATS + ++counts->switchable_restore[unit_rtype]; +#endif + switch (unit_rtype) { + case RESTORE_WIENER: +#if DEBUG_LR_COSTING + assert(!memcmp( + ref_wiener_info, + &lr_ref_params[RESTORE_SWITCHABLE][plane][runit_idx].wiener_info, + sizeof(*ref_wiener_info))); +#endif + write_wiener_filter(wiener_win, &rui->wiener_info, ref_wiener_info, w); + break; + case RESTORE_SGRPROJ: +#if DEBUG_LR_COSTING + assert(!memcmp(&ref_sgrproj_info->xqd, + &lr_ref_params[RESTORE_SWITCHABLE][plane][runit_idx] + .sgrproj_info.xqd, + sizeof(ref_sgrproj_info->xqd))); +#endif + write_sgrproj_filter(&rui->sgrproj_info, ref_sgrproj_info, w); + break; + default: assert(unit_rtype == RESTORE_NONE); break; + } + } else if (frame_rtype == RESTORE_WIENER) { + aom_write_symbol(w, unit_rtype != RESTORE_NONE, + xd->tile_ctx->wiener_restore_cdf, 2); +#if CONFIG_ENTROPY_STATS + ++counts->wiener_restore[unit_rtype != RESTORE_NONE]; +#endif + if (unit_rtype != RESTORE_NONE) { +#if DEBUG_LR_COSTING + assert( + !memcmp(ref_wiener_info, + &lr_ref_params[RESTORE_WIENER][plane][runit_idx].wiener_info, + sizeof(*ref_wiener_info))); +#endif + write_wiener_filter(wiener_win, &rui->wiener_info, ref_wiener_info, w); + } + } else if (frame_rtype == RESTORE_SGRPROJ) { + aom_write_symbol(w, unit_rtype != RESTORE_NONE, + xd->tile_ctx->sgrproj_restore_cdf, 2); +#if CONFIG_ENTROPY_STATS + ++counts->sgrproj_restore[unit_rtype != RESTORE_NONE]; +#endif + if (unit_rtype != RESTORE_NONE) { +#if DEBUG_LR_COSTING + assert(!memcmp( + &ref_sgrproj_info->xqd, + &lr_ref_params[RESTORE_SGRPROJ][plane][runit_idx].sgrproj_info.xqd, + sizeof(ref_sgrproj_info->xqd))); +#endif + write_sgrproj_filter(&rui->sgrproj_info, ref_sgrproj_info, w); + } + } +} +#endif // !CONFIG_REALTIME_ONLY + +// Only write out the ref delta section if any of the elements +// will signal a delta. 
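+// For reference, the comparison baseline below is the primary reference +// frame's saved deltas (or the defaults when there is no primary reference), +// i.e. the state the decoder reconstructs before parsing this frame.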
+static bool is_mode_ref_delta_meaningful(AV1_COMMON *cm) { + struct loopfilter *lf = &cm->lf; + if (!lf->mode_ref_delta_update) { + return 0; + } + const RefCntBuffer *buf = get_primary_ref_frame_buf(cm); + int8_t last_ref_deltas[REF_FRAMES]; + int8_t last_mode_deltas[MAX_MODE_LF_DELTAS]; + if (buf == NULL) { + av1_set_default_ref_deltas(last_ref_deltas); + av1_set_default_mode_deltas(last_mode_deltas); + } else { + memcpy(last_ref_deltas, buf->ref_deltas, REF_FRAMES); + memcpy(last_mode_deltas, buf->mode_deltas, MAX_MODE_LF_DELTAS); + } + for (int i = 0; i < REF_FRAMES; i++) { + if (lf->ref_deltas[i] != last_ref_deltas[i]) { + return true; + } + } + for (int i = 0; i < MAX_MODE_LF_DELTAS; i++) { + if (lf->mode_deltas[i] != last_mode_deltas[i]) { + return true; + } + } + return false; +} + +static AOM_INLINE void encode_loopfilter(AV1_COMMON *cm, + struct aom_write_bit_buffer *wb) { + assert(!cm->features.coded_lossless); + if (cm->features.allow_intrabc) return; + const int num_planes = av1_num_planes(cm); + struct loopfilter *lf = &cm->lf; + + // Encode the loop filter level and type + aom_wb_write_literal(wb, lf->filter_level[0], 6); + aom_wb_write_literal(wb, lf->filter_level[1], 6); + if (num_planes > 1) { + if (lf->filter_level[0] || lf->filter_level[1]) { + aom_wb_write_literal(wb, lf->filter_level_u, 6); + aom_wb_write_literal(wb, lf->filter_level_v, 6); + } + } + aom_wb_write_literal(wb, lf->sharpness_level, 3); + + aom_wb_write_bit(wb, lf->mode_ref_delta_enabled); + + // Write out loop filter deltas applied at the MB level based on mode or + // ref frame (if they are enabled), only if there is information to write. + int meaningful = is_mode_ref_delta_meaningful(cm); + aom_wb_write_bit(wb, meaningful); + if (!meaningful) { + return; + } + + const RefCntBuffer *buf = get_primary_ref_frame_buf(cm); + int8_t last_ref_deltas[REF_FRAMES]; + int8_t last_mode_deltas[MAX_MODE_LF_DELTAS]; + if (buf == NULL) { + av1_set_default_ref_deltas(last_ref_deltas); + av1_set_default_mode_deltas(last_mode_deltas); + } else { + memcpy(last_ref_deltas, buf->ref_deltas, REF_FRAMES); + memcpy(last_mode_deltas, buf->mode_deltas, MAX_MODE_LF_DELTAS); + } + for (int i = 0; i < REF_FRAMES; i++) { + const int delta = lf->ref_deltas[i]; + const int changed = delta != last_ref_deltas[i]; + aom_wb_write_bit(wb, changed); + if (changed) aom_wb_write_inv_signed_literal(wb, delta, 6); + } + for (int i = 0; i < MAX_MODE_LF_DELTAS; i++) { + const int delta = lf->mode_deltas[i]; + const int changed = delta != last_mode_deltas[i]; + aom_wb_write_bit(wb, changed); + if (changed) aom_wb_write_inv_signed_literal(wb, delta, 6); + } +} + +static AOM_INLINE void encode_cdef(const AV1_COMMON *cm, + struct aom_write_bit_buffer *wb) { + assert(!cm->features.coded_lossless); + if (!cm->seq_params->enable_cdef) return; + if (cm->features.allow_intrabc) return; + const int num_planes = av1_num_planes(cm); + int i; + aom_wb_write_literal(wb, cm->cdef_info.cdef_damping - 3, 2); + aom_wb_write_literal(wb, cm->cdef_info.cdef_bits, 2); + for (i = 0; i < cm->cdef_info.nb_cdef_strengths; i++) { + aom_wb_write_literal(wb, cm->cdef_info.cdef_strengths[i], + CDEF_STRENGTH_BITS); + if (num_planes > 1) + aom_wb_write_literal(wb, cm->cdef_info.cdef_uv_strengths[i], + CDEF_STRENGTH_BITS); + } +} + +static AOM_INLINE void write_delta_q(struct aom_write_bit_buffer *wb, + int delta_q) { + if (delta_q != 0) { + aom_wb_write_bit(wb, 1); + aom_wb_write_inv_signed_literal(wb, delta_q, 6); + } else { + aom_wb_write_bit(wb, 0); + } +} + +static 
AOM_INLINE void encode_quantization( + const CommonQuantParams *const quant_params, int num_planes, + bool separate_uv_delta_q, struct aom_write_bit_buffer *wb) { + aom_wb_write_literal(wb, quant_params->base_qindex, QINDEX_BITS); + write_delta_q(wb, quant_params->y_dc_delta_q); + if (num_planes > 1) { + int diff_uv_delta = + (quant_params->u_dc_delta_q != quant_params->v_dc_delta_q) || + (quant_params->u_ac_delta_q != quant_params->v_ac_delta_q); + if (separate_uv_delta_q) aom_wb_write_bit(wb, diff_uv_delta); + write_delta_q(wb, quant_params->u_dc_delta_q); + write_delta_q(wb, quant_params->u_ac_delta_q); + if (diff_uv_delta) { + write_delta_q(wb, quant_params->v_dc_delta_q); + write_delta_q(wb, quant_params->v_ac_delta_q); + } + } + aom_wb_write_bit(wb, quant_params->using_qmatrix); + if (quant_params->using_qmatrix) { + aom_wb_write_literal(wb, quant_params->qmatrix_level_y, QM_LEVEL_BITS); + aom_wb_write_literal(wb, quant_params->qmatrix_level_u, QM_LEVEL_BITS); + if (!separate_uv_delta_q) + assert(quant_params->qmatrix_level_u == quant_params->qmatrix_level_v); + else + aom_wb_write_literal(wb, quant_params->qmatrix_level_v, QM_LEVEL_BITS); + } +} + +static AOM_INLINE void encode_segmentation(AV1_COMMON *cm, + struct aom_write_bit_buffer *wb) { + int i, j; + struct segmentation *seg = &cm->seg; + + aom_wb_write_bit(wb, seg->enabled); + if (!seg->enabled) return; + + // Write update flags + if (cm->features.primary_ref_frame != PRIMARY_REF_NONE) { + aom_wb_write_bit(wb, seg->update_map); + if (seg->update_map) aom_wb_write_bit(wb, seg->temporal_update); + aom_wb_write_bit(wb, seg->update_data); + } + + // Segmentation data + if (seg->update_data) { + for (i = 0; i < MAX_SEGMENTS; i++) { + for (j = 0; j < SEG_LVL_MAX; j++) { + const int active = segfeature_active(seg, i, j); + aom_wb_write_bit(wb, active); + if (active) { + const int data_max = av1_seg_feature_data_max(j); + const int data_min = -data_max; + const int ubits = get_unsigned_bits(data_max); + const int data = clamp(get_segdata(seg, i, j), data_min, data_max); + + if (av1_is_segfeature_signed(j)) { + aom_wb_write_inv_signed_literal(wb, data, ubits); + } else { + aom_wb_write_literal(wb, data, ubits); + } + } + } + } + } +} + +static AOM_INLINE void write_frame_interp_filter( + InterpFilter filter, struct aom_write_bit_buffer *wb) { + aom_wb_write_bit(wb, filter == SWITCHABLE); + if (filter != SWITCHABLE) + aom_wb_write_literal(wb, filter, LOG_SWITCHABLE_FILTERS); +} + +// Same function as write_uniform but writing to the uncompressed header wb +static AOM_INLINE void wb_write_uniform(struct aom_write_bit_buffer *wb, int n, + int v) { + const int l = get_unsigned_bits(n); + const int m = (1 << l) - n; + if (l == 0) return; + if (v < m) { + aom_wb_write_literal(wb, v, l - 1); + } else { + aom_wb_write_literal(wb, m + ((v - m) >> 1), l - 1); + aom_wb_write_literal(wb, (v - m) & 1, 1); + } +} + +static AOM_INLINE void write_tile_info_max_tile( + const AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) { + int width_sb = + CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, cm->seq_params->mib_size_log2); + int height_sb = + CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params->mib_size_log2); + int size_sb, i; + const CommonTileParams *const tiles = &cm->tiles; + + aom_wb_write_bit(wb, tiles->uniform_spacing); + + if (tiles->uniform_spacing) { + int ones = tiles->log2_cols - tiles->min_log2_cols; + while (ones--) { + aom_wb_write_bit(wb, 1); + } + if (tiles->log2_cols < tiles->max_log2_cols) { + aom_wb_write_bit(wb, 0); + } + + // rows
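+ // As with the columns above, log2_rows is coded in unary relative to + // min_log2_rows: one 1-bit per level above the minimum, terminated by a + // 0-bit unless the maximum is reached. E.g. min 0, max 3, log2_rows 1 is + // sent as the bits 1, 0.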
+ ones = tiles->log2_rows - tiles->min_log2_rows; + while (ones--) { + aom_wb_write_bit(wb, 1); + } + if (tiles->log2_rows < tiles->max_log2_rows) { + aom_wb_write_bit(wb, 0); + } + } else { + // Explicit tiles with configurable tile widths and heights + // columns + for (i = 0; i < tiles->cols; i++) { + size_sb = tiles->col_start_sb[i + 1] - tiles->col_start_sb[i]; + wb_write_uniform(wb, AOMMIN(width_sb, tiles->max_width_sb), size_sb - 1); + width_sb -= size_sb; + } + assert(width_sb == 0); + + // rows + for (i = 0; i < tiles->rows; i++) { + size_sb = tiles->row_start_sb[i + 1] - tiles->row_start_sb[i]; + wb_write_uniform(wb, AOMMIN(height_sb, tiles->max_height_sb), + size_sb - 1); + height_sb -= size_sb; + } + assert(height_sb == 0); + } +} + +static AOM_INLINE void write_tile_info(const AV1_COMMON *const cm, + struct aom_write_bit_buffer *saved_wb, + struct aom_write_bit_buffer *wb) { + write_tile_info_max_tile(cm, wb); + + *saved_wb = *wb; + if (cm->tiles.rows * cm->tiles.cols > 1) { + // tile id used for cdf update + aom_wb_write_literal(wb, 0, cm->tiles.log2_cols + cm->tiles.log2_rows); + // Number of bytes in tile size - 1 + aom_wb_write_literal(wb, 3, 2); + } +} + +static AOM_INLINE void write_ext_tile_info( + const AV1_COMMON *const cm, struct aom_write_bit_buffer *saved_wb, + struct aom_write_bit_buffer *wb) { + // This information is stored as a separate byte. + int mod = wb->bit_offset % CHAR_BIT; + if (mod > 0) aom_wb_write_literal(wb, 0, CHAR_BIT - mod); + assert(aom_wb_is_byte_aligned(wb)); + + *saved_wb = *wb; + if (cm->tiles.rows * cm->tiles.cols > 1) { + // Note that the last item in the uncompressed header is the data + // describing tile configuration. + // Number of bytes in tile column size - 1 + aom_wb_write_literal(wb, 0, 2); + // Number of bytes in tile size - 1 + aom_wb_write_literal(wb, 0, 2); + } +} + +static INLINE int find_identical_tile( + const int tile_row, const int tile_col, + TileBufferEnc (*const tile_buffers)[MAX_TILE_COLS]) { + const MV32 candidate_offset[1] = { { 1, 0 } }; + const uint8_t *const cur_tile_data = + tile_buffers[tile_row][tile_col].data + 4; + const size_t cur_tile_size = tile_buffers[tile_row][tile_col].size; + + int i; + + if (tile_row == 0) return 0; + + // (TODO: yunqingwang) For now, only above tile is checked and used. + // More candidates such as left tile can be added later. + for (i = 0; i < 1; i++) { + int row_offset = candidate_offset[0].row; + int col_offset = candidate_offset[0].col; + int row = tile_row - row_offset; + int col = tile_col - col_offset; + const uint8_t *tile_data; + TileBufferEnc *candidate; + + if (row < 0 || col < 0) continue; + + const uint32_t tile_hdr = mem_get_le32(tile_buffers[row][col].data); + + // Read out tile-copy-mode bit: + if ((tile_hdr >> 31) == 1) { + // The candidate is a copy tile itself: the offset is stored in bits + // 30 through 24 inclusive. 
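+ // If so, redirect the comparison to the tile the candidate itself + // copied from, so a run of identical tiles shares a single source.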
+ row_offset += (tile_hdr >> 24) & 0x7f; + row = tile_row - row_offset; + } + + candidate = &tile_buffers[row][col]; + + if (row_offset >= 128 || candidate->size != cur_tile_size) continue; + + tile_data = candidate->data + 4; + + if (memcmp(tile_data, cur_tile_data, cur_tile_size) != 0) continue; + + // Identical tile found + assert(row_offset > 0); + return row_offset; + } + + // No identical tile found + return 0; +} + +static AOM_INLINE void write_render_size(const AV1_COMMON *cm, + struct aom_write_bit_buffer *wb) { + const int scaling_active = av1_resize_scaled(cm); + aom_wb_write_bit(wb, scaling_active); + if (scaling_active) { + aom_wb_write_literal(wb, cm->render_width - 1, 16); + aom_wb_write_literal(wb, cm->render_height - 1, 16); + } +} + +static AOM_INLINE void write_superres_scale(const AV1_COMMON *const cm, + struct aom_write_bit_buffer *wb) { + const SequenceHeader *const seq_params = cm->seq_params; + if (!seq_params->enable_superres) { + assert(cm->superres_scale_denominator == SCALE_NUMERATOR); + return; + } + + // First bit is whether to scale or not + if (cm->superres_scale_denominator == SCALE_NUMERATOR) { + aom_wb_write_bit(wb, 0); // no scaling + } else { + aom_wb_write_bit(wb, 1); // scaling, write scale factor + assert(cm->superres_scale_denominator >= SUPERRES_SCALE_DENOMINATOR_MIN); + assert(cm->superres_scale_denominator < + SUPERRES_SCALE_DENOMINATOR_MIN + (1 << SUPERRES_SCALE_BITS)); + aom_wb_write_literal( + wb, cm->superres_scale_denominator - SUPERRES_SCALE_DENOMINATOR_MIN, + SUPERRES_SCALE_BITS); + } +} + +static AOM_INLINE void write_frame_size(const AV1_COMMON *cm, + int frame_size_override, + struct aom_write_bit_buffer *wb) { + const int coded_width = cm->superres_upscaled_width - 1; + const int coded_height = cm->superres_upscaled_height - 1; + + if (frame_size_override) { + const SequenceHeader *seq_params = cm->seq_params; + int num_bits_width = seq_params->num_bits_width; + int num_bits_height = seq_params->num_bits_height; + aom_wb_write_literal(wb, coded_width, num_bits_width); + aom_wb_write_literal(wb, coded_height, num_bits_height); + } + + write_superres_scale(cm, wb); + write_render_size(cm, wb); +} + +static AOM_INLINE void write_frame_size_with_refs( + const AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) { + int found = 0; + + MV_REFERENCE_FRAME ref_frame; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + const YV12_BUFFER_CONFIG *cfg = get_ref_frame_yv12_buf(cm, ref_frame); + + if (cfg != NULL) { + found = cm->superres_upscaled_width == cfg->y_crop_width && + cm->superres_upscaled_height == cfg->y_crop_height; + found &= cm->render_width == cfg->render_width && + cm->render_height == cfg->render_height; + } + aom_wb_write_bit(wb, found); + if (found) { + write_superres_scale(cm, wb); + break; + } + } + + if (!found) { + int frame_size_override = 1; // Always equal to 1 in this function + write_frame_size(cm, frame_size_override, wb); + } +} + +static AOM_INLINE void write_profile(BITSTREAM_PROFILE profile, + struct aom_write_bit_buffer *wb) { + assert(profile >= PROFILE_0 && profile < MAX_PROFILES); + aom_wb_write_literal(wb, profile, PROFILE_BITS); +} + +static AOM_INLINE void write_bitdepth(const SequenceHeader *const seq_params, + struct aom_write_bit_buffer *wb) { + // Profile 0/1: [0] for 8-bit, [1] for 10-bit + // Profile 2: [0] for 8-bit, [10] for 10-bit, [11] for 12-bit + aom_wb_write_bit(wb, seq_params->bit_depth == AOM_BITS_8 ? 
0 : 1); + if (seq_params->profile == PROFILE_2 && seq_params->bit_depth != AOM_BITS_8) { + aom_wb_write_bit(wb, seq_params->bit_depth == AOM_BITS_10 ? 0 : 1); + } +} + +static AOM_INLINE void write_color_config( + const SequenceHeader *const seq_params, struct aom_write_bit_buffer *wb) { + write_bitdepth(seq_params, wb); + const int is_monochrome = seq_params->monochrome; + // monochrome bit + if (seq_params->profile != PROFILE_1) + aom_wb_write_bit(wb, is_monochrome); + else + assert(!is_monochrome); + if (seq_params->color_primaries == AOM_CICP_CP_UNSPECIFIED && + seq_params->transfer_characteristics == AOM_CICP_TC_UNSPECIFIED && + seq_params->matrix_coefficients == AOM_CICP_MC_UNSPECIFIED) { + aom_wb_write_bit(wb, 0); // No color description present + } else { + aom_wb_write_bit(wb, 1); // Color description present + aom_wb_write_literal(wb, seq_params->color_primaries, 8); + aom_wb_write_literal(wb, seq_params->transfer_characteristics, 8); + aom_wb_write_literal(wb, seq_params->matrix_coefficients, 8); + } + if (is_monochrome) { + // 0: [16, 235] (i.e. xvYCC), 1: [0, 255] + aom_wb_write_bit(wb, seq_params->color_range); + return; + } + if (seq_params->color_primaries == AOM_CICP_CP_BT_709 && + seq_params->transfer_characteristics == AOM_CICP_TC_SRGB && + seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) { + assert(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0); + assert(seq_params->profile == PROFILE_1 || + (seq_params->profile == PROFILE_2 && + seq_params->bit_depth == AOM_BITS_12)); + } else { + // 0: [16, 235] (i.e. xvYCC), 1: [0, 255] + aom_wb_write_bit(wb, seq_params->color_range); + if (seq_params->profile == PROFILE_0) { + // 420 only + assert(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1); + } else if (seq_params->profile == PROFILE_1) { + // 444 only + assert(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0); + } else if (seq_params->profile == PROFILE_2) { + if (seq_params->bit_depth == AOM_BITS_12) { + // 420, 444 or 422 + aom_wb_write_bit(wb, seq_params->subsampling_x); + if (seq_params->subsampling_x == 0) { + assert(seq_params->subsampling_y == 0 && + "4:4:0 subsampling not allowed in AV1"); + } else { + aom_wb_write_bit(wb, seq_params->subsampling_y); + } + } else { + // 422 only + assert(seq_params->subsampling_x == 1 && + seq_params->subsampling_y == 0); + } + } + if (seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) { + assert(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0); + } + if (seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1) { + aom_wb_write_literal(wb, seq_params->chroma_sample_position, 2); + } + } + aom_wb_write_bit(wb, seq_params->separate_uv_delta_q); +} + +static AOM_INLINE void write_timing_info_header( + const aom_timing_info_t *const timing_info, + struct aom_write_bit_buffer *wb) { + aom_wb_write_unsigned_literal(wb, timing_info->num_units_in_display_tick, 32); + aom_wb_write_unsigned_literal(wb, timing_info->time_scale, 32); + aom_wb_write_bit(wb, timing_info->equal_picture_interval); + if (timing_info->equal_picture_interval) { + aom_wb_write_uvlc(wb, timing_info->num_ticks_per_picture - 1); + } +} + +static AOM_INLINE void write_decoder_model_info( + const aom_dec_model_info_t *const decoder_model_info, + struct aom_write_bit_buffer *wb) { + aom_wb_write_literal( + wb, decoder_model_info->encoder_decoder_buffer_delay_length - 1, 5); + aom_wb_write_unsigned_literal( + wb, decoder_model_info->num_units_in_decoding_tick, 32); + 
aom_wb_write_literal(wb, decoder_model_info->buffer_removal_time_length - 1, + 5); + aom_wb_write_literal( + wb, decoder_model_info->frame_presentation_time_length - 1, 5); +} + +static AOM_INLINE void write_dec_model_op_parameters( + const aom_dec_model_op_parameters_t *op_params, int buffer_delay_length, + struct aom_write_bit_buffer *wb) { + aom_wb_write_unsigned_literal(wb, op_params->decoder_buffer_delay, + buffer_delay_length); + aom_wb_write_unsigned_literal(wb, op_params->encoder_buffer_delay, + buffer_delay_length); + aom_wb_write_bit(wb, op_params->low_delay_mode_flag); +} + +static AOM_INLINE void write_tu_pts_info(AV1_COMMON *const cm, + struct aom_write_bit_buffer *wb) { + aom_wb_write_unsigned_literal( + wb, cm->frame_presentation_time, + cm->seq_params->decoder_model_info.frame_presentation_time_length); +} + +static AOM_INLINE void write_film_grain_params( + const AV1_COMP *const cpi, struct aom_write_bit_buffer *wb) { + const AV1_COMMON *const cm = &cpi->common; + const aom_film_grain_t *const pars = &cm->cur_frame->film_grain_params; + aom_wb_write_bit(wb, pars->apply_grain); + if (!pars->apply_grain) return; + + aom_wb_write_literal(wb, pars->random_seed, 16); + + if (cm->current_frame.frame_type == INTER_FRAME) + aom_wb_write_bit(wb, pars->update_parameters); + + if (!pars->update_parameters) { + int ref_frame, ref_idx; + for (ref_frame = LAST_FRAME; ref_frame < REF_FRAMES; ref_frame++) { + ref_idx = get_ref_frame_map_idx(cm, ref_frame); + assert(ref_idx != INVALID_IDX); + const RefCntBuffer *const buf = cm->ref_frame_map[ref_idx]; + if (buf->film_grain_params_present && + aom_check_grain_params_equiv(pars, &buf->film_grain_params)) { + break; + } + } + assert(ref_frame < REF_FRAMES); + aom_wb_write_literal(wb, ref_idx, 3); + return; + } + + // Scaling functions parameters + aom_wb_write_literal(wb, pars->num_y_points, 4); // max 14 + for (int i = 0; i < pars->num_y_points; i++) { + aom_wb_write_literal(wb, pars->scaling_points_y[i][0], 8); + aom_wb_write_literal(wb, pars->scaling_points_y[i][1], 8); + } + + if (!cm->seq_params->monochrome) { + aom_wb_write_bit(wb, pars->chroma_scaling_from_luma); + } else { + assert(!pars->chroma_scaling_from_luma); + } + + if (cm->seq_params->monochrome || pars->chroma_scaling_from_luma || + ((cm->seq_params->subsampling_x == 1) && + (cm->seq_params->subsampling_y == 1) && (pars->num_y_points == 0))) { + assert(pars->num_cb_points == 0 && pars->num_cr_points == 0); + } else { + aom_wb_write_literal(wb, pars->num_cb_points, 4); // max 10 + for (int i = 0; i < pars->num_cb_points; i++) { + aom_wb_write_literal(wb, pars->scaling_points_cb[i][0], 8); + aom_wb_write_literal(wb, pars->scaling_points_cb[i][1], 8); + } + + aom_wb_write_literal(wb, pars->num_cr_points, 4); // max 10 + for (int i = 0; i < pars->num_cr_points; i++) { + aom_wb_write_literal(wb, pars->scaling_points_cr[i][0], 8); + aom_wb_write_literal(wb, pars->scaling_points_cr[i][1], 8); + } + } + + aom_wb_write_literal(wb, pars->scaling_shift - 8, 2); // 8 + value + + // AR coefficients + // Only sent if the corresponding scaling function has + // more than 0 points + + aom_wb_write_literal(wb, pars->ar_coeff_lag, 2); + + int num_pos_luma = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1); + int num_pos_chroma = num_pos_luma; + if (pars->num_y_points > 0) ++num_pos_chroma; + + if (pars->num_y_points) + for (int i = 0; i < num_pos_luma; i++) + aom_wb_write_literal(wb, pars->ar_coeffs_y[i] + 128, 8); + + if (pars->num_cb_points || pars->chroma_scaling_from_luma) + for (int i = 
0; i < num_pos_chroma; i++) + aom_wb_write_literal(wb, pars->ar_coeffs_cb[i] + 128, 8); + + if (pars->num_cr_points || pars->chroma_scaling_from_luma) + for (int i = 0; i < num_pos_chroma; i++) + aom_wb_write_literal(wb, pars->ar_coeffs_cr[i] + 128, 8); + + aom_wb_write_literal(wb, pars->ar_coeff_shift - 6, 2); // 6 + value + + aom_wb_write_literal(wb, pars->grain_scale_shift, 2); + + if (pars->num_cb_points) { + aom_wb_write_literal(wb, pars->cb_mult, 8); + aom_wb_write_literal(wb, pars->cb_luma_mult, 8); + aom_wb_write_literal(wb, pars->cb_offset, 9); + } + + if (pars->num_cr_points) { + aom_wb_write_literal(wb, pars->cr_mult, 8); + aom_wb_write_literal(wb, pars->cr_luma_mult, 8); + aom_wb_write_literal(wb, pars->cr_offset, 9); + } + + aom_wb_write_bit(wb, pars->overlap_flag); + + aom_wb_write_bit(wb, pars->clip_to_restricted_range); +} + +static AOM_INLINE void write_sb_size(const SequenceHeader *const seq_params, + struct aom_write_bit_buffer *wb) { + (void)seq_params; + (void)wb; + assert(seq_params->mib_size == mi_size_wide[seq_params->sb_size]); + assert(seq_params->mib_size == 1 << seq_params->mib_size_log2); + assert(seq_params->sb_size == BLOCK_128X128 || + seq_params->sb_size == BLOCK_64X64); + aom_wb_write_bit(wb, seq_params->sb_size == BLOCK_128X128 ? 1 : 0); +} + +static AOM_INLINE void write_sequence_header( + const SequenceHeader *const seq_params, struct aom_write_bit_buffer *wb) { + aom_wb_write_literal(wb, seq_params->num_bits_width - 1, 4); + aom_wb_write_literal(wb, seq_params->num_bits_height - 1, 4); + aom_wb_write_literal(wb, seq_params->max_frame_width - 1, + seq_params->num_bits_width); + aom_wb_write_literal(wb, seq_params->max_frame_height - 1, + seq_params->num_bits_height); + + if (!seq_params->reduced_still_picture_hdr) { + aom_wb_write_bit(wb, seq_params->frame_id_numbers_present_flag); + if (seq_params->frame_id_numbers_present_flag) { + // We must always have delta_frame_id_length < frame_id_length, + // in order for a frame to be referenced with a unique delta. + // Avoid wasting bits by using a coding that enforces this restriction. 
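+      // The coding below sends delta_frame_id_length - 2 in 4 bits, then the + // surplus frame_id_length - delta_frame_id_length - 1 in 3 bits, so the + // restriction holds by construction.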
+ aom_wb_write_literal(wb, seq_params->delta_frame_id_length - 2, 4); + aom_wb_write_literal( + wb, + seq_params->frame_id_length - seq_params->delta_frame_id_length - 1, + 3); + } + } + + write_sb_size(seq_params, wb); + + aom_wb_write_bit(wb, seq_params->enable_filter_intra); + aom_wb_write_bit(wb, seq_params->enable_intra_edge_filter); + + if (!seq_params->reduced_still_picture_hdr) { + aom_wb_write_bit(wb, seq_params->enable_interintra_compound); + aom_wb_write_bit(wb, seq_params->enable_masked_compound); + aom_wb_write_bit(wb, seq_params->enable_warped_motion); + aom_wb_write_bit(wb, seq_params->enable_dual_filter); + + aom_wb_write_bit(wb, seq_params->order_hint_info.enable_order_hint); + + if (seq_params->order_hint_info.enable_order_hint) { + aom_wb_write_bit(wb, seq_params->order_hint_info.enable_dist_wtd_comp); + aom_wb_write_bit(wb, seq_params->order_hint_info.enable_ref_frame_mvs); + } + if (seq_params->force_screen_content_tools == 2) { + aom_wb_write_bit(wb, 1); + } else { + aom_wb_write_bit(wb, 0); + aom_wb_write_bit(wb, seq_params->force_screen_content_tools); + } + if (seq_params->force_screen_content_tools > 0) { + if (seq_params->force_integer_mv == 2) { + aom_wb_write_bit(wb, 1); + } else { + aom_wb_write_bit(wb, 0); + aom_wb_write_bit(wb, seq_params->force_integer_mv); + } + } else { + assert(seq_params->force_integer_mv == 2); + } + if (seq_params->order_hint_info.enable_order_hint) + aom_wb_write_literal( + wb, seq_params->order_hint_info.order_hint_bits_minus_1, 3); + } + + aom_wb_write_bit(wb, seq_params->enable_superres); + aom_wb_write_bit(wb, seq_params->enable_cdef); + aom_wb_write_bit(wb, seq_params->enable_restoration); +} + +static AOM_INLINE void write_global_motion_params( + const WarpedMotionParams *params, const WarpedMotionParams *ref_params, + struct aom_write_bit_buffer *wb, int allow_hp) { + const TransformationType type = params->wmtype; + + // As a workaround for an AV1 spec bug, we avoid choosing TRANSLATION + // type models. Check here that we don't accidentally pick one somehow. + // See comments in gm_get_motion_vector() for details on the bug we're + // working around here + assert(type != TRANSLATION); + + aom_wb_write_bit(wb, type != IDENTITY); + if (type != IDENTITY) { + aom_wb_write_bit(wb, type == ROTZOOM); + if (type != ROTZOOM) aom_wb_write_bit(wb, type == TRANSLATION); + } + + if (type >= ROTZOOM) { + aom_wb_write_signed_primitive_refsubexpfin( + wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[2] >> GM_ALPHA_PREC_DIFF) - + (1 << GM_ALPHA_PREC_BITS), + (params->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS)); + aom_wb_write_signed_primitive_refsubexpfin( + wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[3] >> GM_ALPHA_PREC_DIFF), + (params->wmmat[3] >> GM_ALPHA_PREC_DIFF)); + } + + if (type >= AFFINE) { + aom_wb_write_signed_primitive_refsubexpfin( + wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[4] >> GM_ALPHA_PREC_DIFF), + (params->wmmat[4] >> GM_ALPHA_PREC_DIFF)); + aom_wb_write_signed_primitive_refsubexpfin( + wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[5] >> GM_ALPHA_PREC_DIFF) - + (1 << GM_ALPHA_PREC_BITS), + (params->wmmat[5] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS)); + } + + if (type >= TRANSLATION) { + const int trans_bits = (type == TRANSLATION) + ? GM_ABS_TRANS_ONLY_BITS - !allow_hp + : GM_ABS_TRANS_BITS; + const int trans_prec_diff = (type == TRANSLATION) + ? 
GM_TRANS_ONLY_PREC_DIFF + !allow_hp + : GM_TRANS_PREC_DIFF; + aom_wb_write_signed_primitive_refsubexpfin( + wb, (1 << trans_bits) + 1, SUBEXPFIN_K, + (ref_params->wmmat[0] >> trans_prec_diff), + (params->wmmat[0] >> trans_prec_diff)); + aom_wb_write_signed_primitive_refsubexpfin( + wb, (1 << trans_bits) + 1, SUBEXPFIN_K, + (ref_params->wmmat[1] >> trans_prec_diff), + (params->wmmat[1] >> trans_prec_diff)); + } +} + +static AOM_INLINE void write_global_motion(AV1_COMP *cpi, + struct aom_write_bit_buffer *wb) { + AV1_COMMON *const cm = &cpi->common; + int frame; + for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) { + const WarpedMotionParams *ref_params = + cm->prev_frame ? &cm->prev_frame->global_motion[frame] + : &default_warp_params; + write_global_motion_params(&cm->global_motion[frame], ref_params, wb, + cm->features.allow_high_precision_mv); + // TODO(sarahparker, debargha): The logic in the commented out code below + // does not work currently and causes mismatches when resize is on. + // Fix it before turning the optimization back on. + /* + YV12_BUFFER_CONFIG *ref_buf = get_ref_frame_yv12_buf(cpi, frame); + if (cpi->source->y_crop_width == ref_buf->y_crop_width && + cpi->source->y_crop_height == ref_buf->y_crop_height) { + write_global_motion_params(&cm->global_motion[frame], + &cm->prev_frame->global_motion[frame], wb, + cm->features.allow_high_precision_mv); + } else { + assert(cm->global_motion[frame].wmtype == IDENTITY && + "Invalid warp type for frames of different resolutions"); + } + */ + /* + printf("Frame %d/%d: Enc Ref %d: %d %d %d %d\n", + cm->current_frame.frame_number, cm->show_frame, frame, + cm->global_motion[frame].wmmat[0], + cm->global_motion[frame].wmmat[1], cm->global_motion[frame].wmmat[2], + cm->global_motion[frame].wmmat[3]); + */ + } +} + +static int check_frame_refs_short_signaling(AV1_COMMON *const cm, + bool enable_ref_short_signaling) { + // In rtc case when res < 360p and speed >= 9, we turn on + // frame_refs_short_signaling if it won't break the decoder. + if (enable_ref_short_signaling) { + const int gld_map_idx = get_ref_frame_map_idx(cm, GOLDEN_FRAME); + const int base = + 1 << (cm->seq_params->order_hint_info.order_hint_bits_minus_1 + 1); + + const int order_hint_group_cur = + cm->current_frame.display_order_hint / base; + const int order_hint_group_gld = + cm->ref_frame_map[gld_map_idx]->display_order_hint / base; + const int relative_dist = cm->current_frame.order_hint - + cm->ref_frame_map[gld_map_idx]->order_hint; + + // If the current frame and the GOLDEN frame are in the same order_hint + // group, and they are not far apart (i.e., at most 64 frames), return 1. + if (order_hint_group_cur == order_hint_group_gld && relative_dist >= 0 && + relative_dist <= 64) { + return 1; + } + return 0; + } + + // Check whether all references are distinct frames. + const RefCntBuffer *seen_bufs[INTER_REFS_PER_FRAME] = { NULL }; + int num_refs = 0; + for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); + if (buf != NULL) { + int seen = 0; + for (int i = 0; i < num_refs; i++) { + if (seen_bufs[i] == buf) { + seen = 1; + break; + } + } + if (!seen) seen_bufs[num_refs++] = buf; + } + } + + // We only turn on frame_refs_short_signaling when all references are + // distinct. + if (num_refs < INTER_REFS_PER_FRAME) { + // It indicates that more than one reference frame points to the same + // reference buffer, i.e. two or more references are duplicates. 
+ return 0; + } + + // Check whether the encoder side ref frame choices are aligned with that to + // be derived at the decoder side. + int remapped_ref_idx_decoder[REF_FRAMES]; + + const int lst_map_idx = get_ref_frame_map_idx(cm, LAST_FRAME); + const int gld_map_idx = get_ref_frame_map_idx(cm, GOLDEN_FRAME); + + // Set up the frame refs mapping indexes according to the + // frame_refs_short_signaling policy. + av1_set_frame_refs(cm, remapped_ref_idx_decoder, lst_map_idx, gld_map_idx); + + // We only turn on frame_refs_short_signaling when the encoder side decision + // on ref frames is identical to that at the decoder side. + int frame_refs_short_signaling = 1; + for (int ref_idx = 0; ref_idx < INTER_REFS_PER_FRAME; ++ref_idx) { + // Compare the buffer index between two reference frames indexed + // respectively by the encoder and the decoder side decisions. + RefCntBuffer *ref_frame_buf_new = NULL; + if (remapped_ref_idx_decoder[ref_idx] != INVALID_IDX) { + ref_frame_buf_new = cm->ref_frame_map[remapped_ref_idx_decoder[ref_idx]]; + } + if (get_ref_frame_buf(cm, LAST_FRAME + ref_idx) != ref_frame_buf_new) { + frame_refs_short_signaling = 0; + break; + } + } + +#if 0 // For debug + printf("\nFrame=%d: \n", cm->current_frame.frame_number); + printf("***frame_refs_short_signaling=%d\n", frame_refs_short_signaling); + for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + printf("enc_ref(map_idx=%d)=%d, vs. " + "dec_ref(map_idx=%d)=%d\n", + get_ref_frame_map_idx(cm, ref_frame), ref_frame, + cm->remapped_ref_idx[ref_frame - LAST_FRAME], + ref_frame); + } +#endif // 0 + + return frame_refs_short_signaling; +} + +// New function based on HLS R18 +static AOM_INLINE void write_uncompressed_header_obu( + AV1_COMP *cpi, MACROBLOCKD *const xd, struct aom_write_bit_buffer *saved_wb, + struct aom_write_bit_buffer *wb) { + AV1_COMMON *const cm = &cpi->common; + const SequenceHeader *const seq_params = cm->seq_params; + const CommonQuantParams *quant_params = &cm->quant_params; + CurrentFrame *const current_frame = &cm->current_frame; + FeatureFlags *const features = &cm->features; + + if (!cpi->sf.rt_sf.enable_ref_short_signaling || + !seq_params->order_hint_info.enable_order_hint || + seq_params->order_hint_info.enable_ref_frame_mvs) { + current_frame->frame_refs_short_signaling = 0; + } else { + current_frame->frame_refs_short_signaling = 1; + } + + if (seq_params->still_picture) { + assert(cm->show_existing_frame == 0); + assert(cm->show_frame == 1); + assert(current_frame->frame_type == KEY_FRAME); + } + if (!seq_params->reduced_still_picture_hdr) { + if (encode_show_existing_frame(cm)) { + aom_wb_write_bit(wb, 1); // show_existing_frame + aom_wb_write_literal(wb, cpi->existing_fb_idx_to_show, 3); + + if (seq_params->decoder_model_info_present_flag && + seq_params->timing_info.equal_picture_interval == 0) { + write_tu_pts_info(cm, wb); + } + if (seq_params->frame_id_numbers_present_flag) { + int frame_id_len = seq_params->frame_id_length; + int display_frame_id = cm->ref_frame_id[cpi->existing_fb_idx_to_show]; + aom_wb_write_literal(wb, display_frame_id, frame_id_len); + } + return; + } else { + aom_wb_write_bit(wb, 0); // show_existing_frame + } + + aom_wb_write_literal(wb, current_frame->frame_type, 2); + + aom_wb_write_bit(wb, cm->show_frame); + if (cm->show_frame) { + if (seq_params->decoder_model_info_present_flag && + seq_params->timing_info.equal_picture_interval == 0) + write_tu_pts_info(cm, wb); + } else { + aom_wb_write_bit(wb, cm->showable_frame); + } + if 
(frame_is_sframe(cm)) { + assert(features->error_resilient_mode); + } else if (!(current_frame->frame_type == KEY_FRAME && cm->show_frame)) { + aom_wb_write_bit(wb, features->error_resilient_mode); + } + } + aom_wb_write_bit(wb, features->disable_cdf_update); + + if (seq_params->force_screen_content_tools == 2) { + aom_wb_write_bit(wb, features->allow_screen_content_tools); + } else { + assert(features->allow_screen_content_tools == + seq_params->force_screen_content_tools); + } + + if (features->allow_screen_content_tools) { + if (seq_params->force_integer_mv == 2) { + aom_wb_write_bit(wb, features->cur_frame_force_integer_mv); + } else { + assert(features->cur_frame_force_integer_mv == + seq_params->force_integer_mv); + } + } else { + assert(features->cur_frame_force_integer_mv == 0); + } + + int frame_size_override_flag = 0; + + if (seq_params->reduced_still_picture_hdr) { + assert(cm->superres_upscaled_width == seq_params->max_frame_width && + cm->superres_upscaled_height == seq_params->max_frame_height); + } else { + if (seq_params->frame_id_numbers_present_flag) { + int frame_id_len = seq_params->frame_id_length; + aom_wb_write_literal(wb, cm->current_frame_id, frame_id_len); + } + + if (cm->superres_upscaled_width > seq_params->max_frame_width || + cm->superres_upscaled_height > seq_params->max_frame_height) { + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Frame dimensions are larger than the maximum values"); + } + + frame_size_override_flag = + frame_is_sframe(cm) + ? 1 + : (cm->superres_upscaled_width != seq_params->max_frame_width || + cm->superres_upscaled_height != seq_params->max_frame_height); + if (!frame_is_sframe(cm)) aom_wb_write_bit(wb, frame_size_override_flag); + + if (seq_params->order_hint_info.enable_order_hint) + aom_wb_write_literal( + wb, current_frame->order_hint, + seq_params->order_hint_info.order_hint_bits_minus_1 + 1); + + if (!features->error_resilient_mode && !frame_is_intra_only(cm)) { + aom_wb_write_literal(wb, features->primary_ref_frame, PRIMARY_REF_BITS); + } + } + + if (seq_params->decoder_model_info_present_flag) { + aom_wb_write_bit(wb, cpi->ppi->buffer_removal_time_present); + if (cpi->ppi->buffer_removal_time_present) { + for (int op_num = 0; + op_num < seq_params->operating_points_cnt_minus_1 + 1; op_num++) { + if (seq_params->op_params[op_num].decoder_model_param_present_flag) { + if (seq_params->operating_point_idc[op_num] == 0 || + ((seq_params->operating_point_idc[op_num] >> + cm->temporal_layer_id) & + 0x1 && + (seq_params->operating_point_idc[op_num] >> + (cm->spatial_layer_id + 8)) & + 0x1)) { + aom_wb_write_unsigned_literal( + wb, cm->buffer_removal_times[op_num], + seq_params->decoder_model_info.buffer_removal_time_length); + cm->buffer_removal_times[op_num]++; + if (cm->buffer_removal_times[op_num] == 0) { + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "buffer_removal_time overflowed"); + } + } + } + } + } + } + + // Shown keyframes and switch-frames automatically refresh all reference + // frames. For all other frame types, we need to write refresh_frame_flags. 
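+  // (For a shown key frame or a switch frame the decoder infers + // refresh_frame_flags to be 0xff, so the field would be redundant there.)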
+ if ((current_frame->frame_type == KEY_FRAME && !cm->show_frame) || + current_frame->frame_type == INTER_FRAME || + current_frame->frame_type == INTRA_ONLY_FRAME) + aom_wb_write_literal(wb, current_frame->refresh_frame_flags, REF_FRAMES); + + if (!frame_is_intra_only(cm) || current_frame->refresh_frame_flags != 0xff) { + // Write all ref frame order hints if error_resilient_mode == 1 + if (features->error_resilient_mode && + seq_params->order_hint_info.enable_order_hint) { + for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) { + aom_wb_write_literal( + wb, cm->ref_frame_map[ref_idx]->order_hint, + seq_params->order_hint_info.order_hint_bits_minus_1 + 1); + } + } + } + + if (current_frame->frame_type == KEY_FRAME) { + write_frame_size(cm, frame_size_override_flag, wb); + assert(!av1_superres_scaled(cm) || !features->allow_intrabc); + if (features->allow_screen_content_tools && !av1_superres_scaled(cm)) + aom_wb_write_bit(wb, features->allow_intrabc); + } else { + if (current_frame->frame_type == INTRA_ONLY_FRAME) { + write_frame_size(cm, frame_size_override_flag, wb); + assert(!av1_superres_scaled(cm) || !features->allow_intrabc); + if (features->allow_screen_content_tools && !av1_superres_scaled(cm)) + aom_wb_write_bit(wb, features->allow_intrabc); + } else if (current_frame->frame_type == INTER_FRAME || + frame_is_sframe(cm)) { + MV_REFERENCE_FRAME ref_frame; + + // NOTE: Error resilient mode turns off frame_refs_short_signaling + // automatically. +#define FRAME_REFS_SHORT_SIGNALING 0 +#if FRAME_REFS_SHORT_SIGNALING + current_frame->frame_refs_short_signaling = + seq_params->order_hint_info.enable_order_hint; +#endif // FRAME_REFS_SHORT_SIGNALING + + if (current_frame->frame_refs_short_signaling) { + // In rtc case when cpi->sf.rt_sf.enable_ref_short_signaling is true, + // we turn on frame_refs_short_signaling when the current frame and + // golden frame are in the same order_hint group, and their relative + // distance is <= 64 (in order to be decodable). + + // For other cases, an example solution for encoder-side + // implementation on frame_refs_short_signaling is also provided in + // this function, where frame_refs_short_signaling is only turned on + // when the encoder side decision on ref frames is identical to that + // at the decoder side. 
+ + current_frame->frame_refs_short_signaling = + check_frame_refs_short_signaling( + cm, cpi->sf.rt_sf.enable_ref_short_signaling); + } + + if (seq_params->order_hint_info.enable_order_hint) + aom_wb_write_bit(wb, current_frame->frame_refs_short_signaling); + + if (current_frame->frame_refs_short_signaling) { + const int lst_ref = get_ref_frame_map_idx(cm, LAST_FRAME); + aom_wb_write_literal(wb, lst_ref, REF_FRAMES_LOG2); + + const int gld_ref = get_ref_frame_map_idx(cm, GOLDEN_FRAME); + aom_wb_write_literal(wb, gld_ref, REF_FRAMES_LOG2); + } + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + assert(get_ref_frame_map_idx(cm, ref_frame) != INVALID_IDX); + if (!current_frame->frame_refs_short_signaling) + aom_wb_write_literal(wb, get_ref_frame_map_idx(cm, ref_frame), + REF_FRAMES_LOG2); + if (seq_params->frame_id_numbers_present_flag) { + int i = get_ref_frame_map_idx(cm, ref_frame); + int frame_id_len = seq_params->frame_id_length; + int diff_len = seq_params->delta_frame_id_length; + int delta_frame_id_minus_1 = + ((cm->current_frame_id - cm->ref_frame_id[i] + + (1 << frame_id_len)) % + (1 << frame_id_len)) - + 1; + if (delta_frame_id_minus_1 < 0 || + delta_frame_id_minus_1 >= (1 << diff_len)) { + aom_internal_error(cm->error, AOM_CODEC_ERROR, + "Invalid delta_frame_id_minus_1"); + } + aom_wb_write_literal(wb, delta_frame_id_minus_1, diff_len); + } + } + + if (!features->error_resilient_mode && frame_size_override_flag) { + write_frame_size_with_refs(cm, wb); + } else { + write_frame_size(cm, frame_size_override_flag, wb); + } + + if (!features->cur_frame_force_integer_mv) + aom_wb_write_bit(wb, features->allow_high_precision_mv); + write_frame_interp_filter(features->interp_filter, wb); + aom_wb_write_bit(wb, features->switchable_motion_mode); + if (frame_might_allow_ref_frame_mvs(cm)) { + aom_wb_write_bit(wb, features->allow_ref_frame_mvs); + } else { + assert(features->allow_ref_frame_mvs == 0); + } + } + } + + const int might_bwd_adapt = !(seq_params->reduced_still_picture_hdr) && + !(features->disable_cdf_update); + if (cm->tiles.large_scale) + assert(features->refresh_frame_context == REFRESH_FRAME_CONTEXT_DISABLED); + + if (might_bwd_adapt) { + aom_wb_write_bit( + wb, features->refresh_frame_context == REFRESH_FRAME_CONTEXT_DISABLED); + } + + write_tile_info(cm, saved_wb, wb); + encode_quantization(quant_params, av1_num_planes(cm), + cm->seq_params->separate_uv_delta_q, wb); + encode_segmentation(cm, wb); + + const DeltaQInfo *const delta_q_info = &cm->delta_q_info; + if (delta_q_info->delta_q_present_flag) assert(quant_params->base_qindex > 0); + if (quant_params->base_qindex > 0) { + aom_wb_write_bit(wb, delta_q_info->delta_q_present_flag); + if (delta_q_info->delta_q_present_flag) { + aom_wb_write_literal(wb, get_msb(delta_q_info->delta_q_res), 2); + xd->current_base_qindex = quant_params->base_qindex; + if (features->allow_intrabc) + assert(delta_q_info->delta_lf_present_flag == 0); + else + aom_wb_write_bit(wb, delta_q_info->delta_lf_present_flag); + if (delta_q_info->delta_lf_present_flag) { + aom_wb_write_literal(wb, get_msb(delta_q_info->delta_lf_res), 2); + aom_wb_write_bit(wb, delta_q_info->delta_lf_multi); + av1_reset_loop_filter_delta(xd, av1_num_planes(cm)); + } + } + } + + if (features->all_lossless) { + assert(!av1_superres_scaled(cm)); + } else { + if (!features->coded_lossless) { + encode_loopfilter(cm, wb); + encode_cdef(cm, wb); + } + encode_restoration_mode(cm, wb); + } + + // Write TX mode + if (features->coded_lossless) + 
assert(features->tx_mode == ONLY_4X4); + else + aom_wb_write_bit(wb, features->tx_mode == TX_MODE_SELECT); + + if (!frame_is_intra_only(cm)) { + const int use_hybrid_pred = + current_frame->reference_mode == REFERENCE_MODE_SELECT; + + aom_wb_write_bit(wb, use_hybrid_pred); + } + + if (current_frame->skip_mode_info.skip_mode_allowed) + aom_wb_write_bit(wb, current_frame->skip_mode_info.skip_mode_flag); + + if (frame_might_allow_warped_motion(cm)) + aom_wb_write_bit(wb, features->allow_warped_motion); + else + assert(!features->allow_warped_motion); + + aom_wb_write_bit(wb, features->reduced_tx_set_used); + + if (!frame_is_intra_only(cm)) write_global_motion(cpi, wb); + + if (seq_params->film_grain_params_present && + (cm->show_frame || cm->showable_frame)) + write_film_grain_params(cpi, wb); + + if (cm->tiles.large_scale) write_ext_tile_info(cm, saved_wb, wb); +} + +static int choose_size_bytes(uint32_t size, int spare_msbs) { + // Choose the number of bytes required to represent size, without + // using the 'spare_msbs' number of most significant bits. + + // Make sure we will fit in 4 bytes to start with.. + if (spare_msbs > 0 && size >> (32 - spare_msbs) != 0) return -1; + + // Normalise to 32 bits + size <<= spare_msbs; + + if (size >> 24 != 0) + return 4; + else if (size >> 16 != 0) + return 3; + else if (size >> 8 != 0) + return 2; + else + return 1; +} + +static AOM_INLINE void mem_put_varsize(uint8_t *const dst, const int sz, + const int val) { + switch (sz) { + case 1: dst[0] = (uint8_t)(val & 0xff); break; + case 2: mem_put_le16(dst, val); break; + case 3: mem_put_le24(dst, val); break; + case 4: mem_put_le32(dst, val); break; + default: assert(0 && "Invalid size"); break; + } +} + +static int remux_tiles(const CommonTileParams *const tiles, uint8_t *dst, + const uint32_t data_size, const uint32_t max_tile_size, + const uint32_t max_tile_col_size, + int *const tile_size_bytes, + int *const tile_col_size_bytes) { + // Choose the tile size bytes (tsb) and tile column size bytes (tcsb) + int tsb; + int tcsb; + + if (tiles->large_scale) { + // The top bit in the tile size field indicates tile copy mode, so we + // have 1 less bit to code the tile size + tsb = choose_size_bytes(max_tile_size, 1); + tcsb = choose_size_bytes(max_tile_col_size, 0); + } else { + tsb = choose_size_bytes(max_tile_size, 0); + tcsb = 4; // This is ignored + (void)max_tile_col_size; + } + + assert(tsb > 0); + assert(tcsb > 0); + + *tile_size_bytes = tsb; + *tile_col_size_bytes = tcsb; + if (tsb == 4 && tcsb == 4) return data_size; + + uint32_t wpos = 0; + uint32_t rpos = 0; + + if (tiles->large_scale) { + int tile_row; + int tile_col; + + for (tile_col = 0; tile_col < tiles->cols; tile_col++) { + // All but the last column has a column header + if (tile_col < tiles->cols - 1) { + uint32_t tile_col_size = mem_get_le32(dst + rpos); + rpos += 4; + + // Adjust the tile column size by the number of bytes removed + // from the tile size fields. + tile_col_size -= (4 - tsb) * tiles->rows; + + mem_put_varsize(dst + wpos, tcsb, tile_col_size); + wpos += tcsb; + } + + for (tile_row = 0; tile_row < tiles->rows; tile_row++) { + // All, including the last row has a header + uint32_t tile_header = mem_get_le32(dst + rpos); + rpos += 4; + + // If this is a copy tile, we need to shift the MSB to the + // top bit of the new width, and there is no data to copy. 
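+        // E.g. with tsb == 2 the 32-bit header 0x80xx0000 is shifted right + // by 16, leaving 0x80xx with the copy bit still in the most significant + // position of the 2-byte field.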
+ if (tile_header >> 31 != 0) { + if (tsb < 4) tile_header >>= 32 - 8 * tsb; + mem_put_varsize(dst + wpos, tsb, tile_header); + wpos += tsb; + } else { + mem_put_varsize(dst + wpos, tsb, tile_header); + wpos += tsb; + + tile_header += AV1_MIN_TILE_SIZE_BYTES; + memmove(dst + wpos, dst + rpos, tile_header); + rpos += tile_header; + wpos += tile_header; + } + } + } + + assert(rpos > wpos); + assert(rpos == data_size); + + return wpos; + } + const int n_tiles = tiles->cols * tiles->rows; + int n; + + for (n = 0; n < n_tiles; n++) { + int tile_size; + + if (n == n_tiles - 1) { + tile_size = data_size - rpos; + } else { + tile_size = mem_get_le32(dst + rpos); + rpos += 4; + mem_put_varsize(dst + wpos, tsb, tile_size); + tile_size += AV1_MIN_TILE_SIZE_BYTES; + wpos += tsb; + } + + memmove(dst + wpos, dst + rpos, tile_size); + + rpos += tile_size; + wpos += tile_size; + } + + assert(rpos > wpos); + assert(rpos == data_size); + + return wpos; +} + +uint32_t av1_write_obu_header(AV1LevelParams *const level_params, + int *frame_header_count, OBU_TYPE obu_type, + int obu_extension, uint8_t *const dst) { + if (level_params->keep_level_stats && + (obu_type == OBU_FRAME || obu_type == OBU_FRAME_HEADER)) + ++(*frame_header_count); + + struct aom_write_bit_buffer wb = { dst, 0 }; + uint32_t size = 0; + + aom_wb_write_literal(&wb, 0, 1); // forbidden bit. + aom_wb_write_literal(&wb, (int)obu_type, 4); + aom_wb_write_literal(&wb, obu_extension ? 1 : 0, 1); + aom_wb_write_literal(&wb, 1, 1); // obu_has_size_field + aom_wb_write_literal(&wb, 0, 1); // reserved + + if (obu_extension) { + aom_wb_write_literal(&wb, obu_extension & 0xFF, 8); + } + + size = aom_wb_bytes_written(&wb); + return size; +} + +int av1_write_uleb_obu_size(size_t obu_header_size, size_t obu_payload_size, + uint8_t *dest) { + const size_t offset = obu_header_size; + size_t coded_obu_size = 0; + const uint32_t obu_size = (uint32_t)obu_payload_size; + assert(obu_size == obu_payload_size); + + if (aom_uleb_encode(obu_size, sizeof(obu_size), dest + offset, + &coded_obu_size) != 0) { + return AOM_CODEC_ERROR; + } + + return AOM_CODEC_OK; +} + +size_t av1_obu_memmove(size_t obu_header_size, size_t obu_payload_size, + uint8_t *data) { + const size_t length_field_size = aom_uleb_size_in_bytes(obu_payload_size); + const size_t move_dst_offset = length_field_size + obu_header_size; + const size_t move_src_offset = obu_header_size; + const size_t move_size = obu_payload_size; + memmove(data + move_dst_offset, data + move_src_offset, move_size); + return length_field_size; +} + +static AOM_INLINE void add_trailing_bits(struct aom_write_bit_buffer *wb) { + if (aom_wb_is_byte_aligned(wb)) { + aom_wb_write_literal(wb, 0x80, 8); + } else { + // assumes that the other bits are already 0s + aom_wb_write_bit(wb, 1); + } +} + +static AOM_INLINE void write_bitstream_level(AV1_LEVEL seq_level_idx, + struct aom_write_bit_buffer *wb) { + assert(is_valid_seq_level_idx(seq_level_idx)); + aom_wb_write_literal(wb, seq_level_idx, LEVEL_BITS); +} + +uint32_t av1_write_sequence_header_obu(const SequenceHeader *seq_params, + uint8_t *const dst) { + struct aom_write_bit_buffer wb = { dst, 0 }; + uint32_t size = 0; + + write_profile(seq_params->profile, &wb); + + // Still picture or not + aom_wb_write_bit(&wb, seq_params->still_picture); + assert(IMPLIES(!seq_params->still_picture, + !seq_params->reduced_still_picture_hdr)); + // whether to use reduced still picture header + aom_wb_write_bit(&wb, seq_params->reduced_still_picture_hdr); + + if 
(seq_params->reduced_still_picture_hdr) { + assert(seq_params->timing_info_present == 0); + assert(seq_params->decoder_model_info_present_flag == 0); + assert(seq_params->display_model_info_present_flag == 0); + write_bitstream_level(seq_params->seq_level_idx[0], &wb); + } else { + aom_wb_write_bit( + &wb, seq_params->timing_info_present); // timing info present flag + + if (seq_params->timing_info_present) { + // timing_info + write_timing_info_header(&seq_params->timing_info, &wb); + aom_wb_write_bit(&wb, seq_params->decoder_model_info_present_flag); + if (seq_params->decoder_model_info_present_flag) { + write_decoder_model_info(&seq_params->decoder_model_info, &wb); + } + } + aom_wb_write_bit(&wb, seq_params->display_model_info_present_flag); + aom_wb_write_literal(&wb, seq_params->operating_points_cnt_minus_1, + OP_POINTS_CNT_MINUS_1_BITS); + int i; + for (i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; i++) { + aom_wb_write_literal(&wb, seq_params->operating_point_idc[i], + OP_POINTS_IDC_BITS); + write_bitstream_level(seq_params->seq_level_idx[i], &wb); + if (seq_params->seq_level_idx[i] >= SEQ_LEVEL_4_0) + aom_wb_write_bit(&wb, seq_params->tier[i]); + if (seq_params->decoder_model_info_present_flag) { + aom_wb_write_bit( + &wb, seq_params->op_params[i].decoder_model_param_present_flag); + if (seq_params->op_params[i].decoder_model_param_present_flag) { + write_dec_model_op_parameters( + &seq_params->op_params[i], + seq_params->decoder_model_info + .encoder_decoder_buffer_delay_length, + &wb); + } + } + if (seq_params->display_model_info_present_flag) { + aom_wb_write_bit( + &wb, seq_params->op_params[i].display_model_param_present_flag); + if (seq_params->op_params[i].display_model_param_present_flag) { + assert(seq_params->op_params[i].initial_display_delay >= 1); + assert(seq_params->op_params[i].initial_display_delay <= 10); + aom_wb_write_literal( + &wb, seq_params->op_params[i].initial_display_delay - 1, 4); + } + } + } + } + write_sequence_header(seq_params, &wb); + + write_color_config(seq_params, &wb); + + aom_wb_write_bit(&wb, seq_params->film_grain_params_present); + + add_trailing_bits(&wb); + + size = aom_wb_bytes_written(&wb); + return size; +} + +static uint32_t write_frame_header_obu(AV1_COMP *cpi, MACROBLOCKD *const xd, + struct aom_write_bit_buffer *saved_wb, + uint8_t *const dst, + int append_trailing_bits) { + struct aom_write_bit_buffer wb = { dst, 0 }; + write_uncompressed_header_obu(cpi, xd, saved_wb, &wb); + if (append_trailing_bits) add_trailing_bits(&wb); + return aom_wb_bytes_written(&wb); +} + +static uint32_t write_tile_group_header(uint8_t *const dst, int start_tile, + int end_tile, int tiles_log2, + int tile_start_and_end_present_flag) { + struct aom_write_bit_buffer wb = { dst, 0 }; + uint32_t size = 0; + + if (!tiles_log2) return size; + + aom_wb_write_bit(&wb, tile_start_and_end_present_flag); + + if (tile_start_and_end_present_flag) { + aom_wb_write_literal(&wb, start_tile, tiles_log2); + aom_wb_write_literal(&wb, end_tile, tiles_log2); + } + + size = aom_wb_bytes_written(&wb); + return size; +} + +extern void av1_print_uncompressed_frame_header(const uint8_t *data, int size, + const char *filename); + +typedef struct { + uint32_t tg_hdr_size; + uint32_t frame_header_size; +} LargeTileFrameOBU; + +// Initialize OBU header for large scale tile case. 
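+// Returns the frame header size in bytes; the OBU_FRAME header size is +// reported through lst_obu->tg_hdr_size, and *data is advanced past both +// headers.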
+static uint32_t init_large_scale_tile_obu_header( + AV1_COMP *const cpi, uint8_t **data, struct aom_write_bit_buffer *saved_wb, + LargeTileFrameOBU *lst_obu) { + AV1LevelParams *const level_params = &cpi->ppi->level_params; + CurrentFrame *const current_frame = &cpi->common.current_frame; + // For large_scale_tile case, we always have only one tile group, so it can + // be written as an OBU_FRAME. + const OBU_TYPE obu_type = OBU_FRAME; + lst_obu->tg_hdr_size = av1_write_obu_header( + level_params, &cpi->frame_header_count, obu_type, 0, *data); + *data += lst_obu->tg_hdr_size; + + const uint32_t frame_header_size = + write_frame_header_obu(cpi, &cpi->td.mb.e_mbd, saved_wb, *data, 0); + *data += frame_header_size; + lst_obu->frame_header_size = frame_header_size; + // (yunqing) This test ensures the correctness of large scale tile coding. + if (cpi->oxcf.tile_cfg.enable_ext_tile_debug) { + char fn[20] = "./fh"; + fn[4] = current_frame->frame_number / 100 + '0'; + fn[5] = (current_frame->frame_number % 100) / 10 + '0'; + fn[6] = (current_frame->frame_number % 10) + '0'; + fn[7] = '\0'; + av1_print_uncompressed_frame_header(*data - frame_header_size, + frame_header_size, fn); + } + return frame_header_size; +} + +// Write total buffer size and related information into the OBU header for large +// scale tile case. +static void write_large_scale_tile_obu_size( + const CommonTileParams *const tiles, uint8_t *const dst, uint8_t *data, + struct aom_write_bit_buffer *saved_wb, LargeTileFrameOBU *const lst_obu, + int have_tiles, uint32_t *total_size, int max_tile_size, + int max_tile_col_size) { + int tile_size_bytes = 0; + int tile_col_size_bytes = 0; + if (have_tiles) { + *total_size = remux_tiles( + tiles, data, *total_size - lst_obu->frame_header_size, max_tile_size, + max_tile_col_size, &tile_size_bytes, &tile_col_size_bytes); + *total_size += lst_obu->frame_header_size; + } + + // In EXT_TILE case, only use 1 tile group. Follow the obu syntax, write + // current tile group size before tile data(include tile column header). + // Tile group size doesn't include the bytes storing tg size. + *total_size += lst_obu->tg_hdr_size; + const uint32_t obu_payload_size = *total_size - lst_obu->tg_hdr_size; + const size_t length_field_size = + av1_obu_memmove(lst_obu->tg_hdr_size, obu_payload_size, dst); + if (av1_write_uleb_obu_size(lst_obu->tg_hdr_size, obu_payload_size, dst) != + AOM_CODEC_OK) + assert(0); + + *total_size += (uint32_t)length_field_size; + saved_wb->bit_buffer += length_field_size; + + // Now fill in the gaps in the uncompressed header. + if (have_tiles) { + assert(tile_col_size_bytes >= 1 && tile_col_size_bytes <= 4); + aom_wb_overwrite_literal(saved_wb, tile_col_size_bytes - 1, 2); + + assert(tile_size_bytes >= 1 && tile_size_bytes <= 4); + aom_wb_overwrite_literal(saved_wb, tile_size_bytes - 1, 2); + } +} + +// Store information on each large scale tile in the OBU header. 
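+// Tiles are entropy coded in place with provisional 4-byte size fields and +// 4-byte column headers; write_large_scale_tile_obu_size() later compacts +// them via remux_tiles() once the maximum sizes are known.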
+static void write_large_scale_tile_obu(
+    AV1_COMP *const cpi, uint8_t *const dst, LargeTileFrameOBU *const lst_obu,
+    int *const largest_tile_id, uint32_t *total_size, const int have_tiles,
+    unsigned int *const max_tile_size, unsigned int *const max_tile_col_size) {
+  AV1_COMMON *const cm = &cpi->common;
+  const CommonTileParams *const tiles = &cm->tiles;
+
+  TileBufferEnc tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS];
+  const int tile_cols = tiles->cols;
+  const int tile_rows = tiles->rows;
+  unsigned int tile_size = 0;
+
+  av1_reset_pack_bs_thread_data(&cpi->td);
+  for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
+    TileInfo tile_info;
+    const int is_last_col = (tile_col == tile_cols - 1);
+    const uint32_t col_offset = *total_size;
+
+    av1_tile_set_col(&tile_info, cm, tile_col);
+
+    // The last column does not have a column header
+    if (!is_last_col) *total_size += 4;
+
+    for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
+      TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col];
+      const int data_offset = have_tiles ? 4 : 0;
+      const int tile_idx = tile_row * tile_cols + tile_col;
+      TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
+      av1_tile_set_row(&tile_info, cm, tile_row);
+      aom_writer mode_bc;
+
+      buf->data = dst + *total_size + lst_obu->tg_hdr_size;
+
+      // If CONFIG_EXT_TILE = 1, every tile in the row has a header,
+      // even for the last one, unless no tiling is used at all.
+      *total_size += data_offset;
+      cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx;
+      mode_bc.allow_update_cdf = !tiles->large_scale;
+      mode_bc.allow_update_cdf =
+          mode_bc.allow_update_cdf && !cm->features.disable_cdf_update;
+      aom_start_encode(&mode_bc, buf->data + data_offset);
+      write_modes(cpi, &cpi->td, &tile_info, &mode_bc, tile_row, tile_col);
+      if (aom_stop_encode(&mode_bc) < 0) {
+        aom_internal_error(cm->error, AOM_CODEC_ERROR, "Error writing modes");
+      }
+      tile_size = mode_bc.pos;
+      buf->size = tile_size;
+
+      // Record the maximum tile size we see, so we can compact headers later.
+      if (tile_size > *max_tile_size) {
+        *max_tile_size = tile_size;
+        *largest_tile_id = tile_cols * tile_row + tile_col;
+      }
+
+      if (have_tiles) {
+        // tile header: size of this tile, or copy offset
+        uint32_t tile_header = tile_size - AV1_MIN_TILE_SIZE_BYTES;
+        const int tile_copy_mode =
+            ((AOMMAX(tiles->width, tiles->height) << MI_SIZE_LOG2) <= 256) ? 1
+                                                                           : 0;
+
+        // If tile_copy_mode = 1, check if this tile is a copy tile.
+        // Very low chances to have copy tiles on the key frames, so don't
+        // search on key frames to reduce unnecessary search.
+        if (cm->current_frame.frame_type != KEY_FRAME && tile_copy_mode) {
+          const int identical_tile_offset =
+              find_identical_tile(tile_row, tile_col, tile_buffers);
+
+          // Indicate a copy-tile by setting the most significant bit.
+          // The row-offset to copy from is stored in the highest byte.
+          // remux_tiles() will move these around later.
+          if (identical_tile_offset > 0) {
+            tile_size = 0;
+            tile_header = identical_tile_offset | 0x80;
+            tile_header <<= 24;
+          }
+        }
+
+        mem_put_le32(buf->data, (MEM_VALUE_T)tile_header);
+      }
+
+      *total_size += tile_size;
+    }
+    if (!is_last_col) {
+      uint32_t col_size = *total_size - col_offset - 4;
+      mem_put_le32(dst + col_offset + lst_obu->tg_hdr_size, col_size);
+
+      // Record the maximum tile column size we see.
+      *max_tile_col_size = AOMMAX(*max_tile_col_size, col_size);
+    }
+  }
+  av1_accumulate_pack_bs_thread_data(cpi, &cpi->td);
+}
+
+// Packs information in the obu header for large scale tiles.
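+// (The sequence is: write the OBU and frame headers, then all tile payloads,
+// and finally patch up the sizes -- remux_tiles() plus the uleb length
+// field -- once the maximum tile and tile-column sizes are known.)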
+static INLINE uint32_t pack_large_scale_tiles_in_tg_obus( + AV1_COMP *const cpi, uint8_t *const dst, + struct aom_write_bit_buffer *saved_wb, int *const largest_tile_id) { + AV1_COMMON *const cm = &cpi->common; + const CommonTileParams *const tiles = &cm->tiles; + uint32_t total_size = 0; + unsigned int max_tile_size = 0; + unsigned int max_tile_col_size = 0; + const int have_tiles = tiles->cols * tiles->rows > 1; + uint8_t *data = dst; + + LargeTileFrameOBU lst_obu; + + total_size += + init_large_scale_tile_obu_header(cpi, &data, saved_wb, &lst_obu); + + write_large_scale_tile_obu(cpi, dst, &lst_obu, largest_tile_id, &total_size, + have_tiles, &max_tile_size, &max_tile_col_size); + + write_large_scale_tile_obu_size(tiles, dst, data, saved_wb, &lst_obu, + have_tiles, &total_size, max_tile_size, + max_tile_col_size); + + return total_size; +} + +// Writes obu, tile group and uncompressed headers to bitstream. +void av1_write_obu_tg_tile_headers(AV1_COMP *const cpi, MACROBLOCKD *const xd, + PackBSParams *const pack_bs_params, + const int tile_idx) { + AV1_COMMON *const cm = &cpi->common; + const CommonTileParams *const tiles = &cm->tiles; + int *const curr_tg_hdr_size = &pack_bs_params->curr_tg_hdr_size; + const int tg_size = + (tiles->rows * tiles->cols + cpi->num_tg - 1) / cpi->num_tg; + + // Write Tile group, frame and OBU header + // A new tile group begins at this tile. Write the obu header and + // tile group header + const OBU_TYPE obu_type = (cpi->num_tg == 1) ? OBU_FRAME : OBU_TILE_GROUP; + *curr_tg_hdr_size = av1_write_obu_header( + &cpi->ppi->level_params, &cpi->frame_header_count, obu_type, + pack_bs_params->obu_extn_header, pack_bs_params->tile_data_curr); + pack_bs_params->obu_header_size = *curr_tg_hdr_size; + + if (cpi->num_tg == 1) + *curr_tg_hdr_size += write_frame_header_obu( + cpi, xd, pack_bs_params->saved_wb, + pack_bs_params->tile_data_curr + *curr_tg_hdr_size, 0); + *curr_tg_hdr_size += write_tile_group_header( + pack_bs_params->tile_data_curr + *curr_tg_hdr_size, tile_idx, + AOMMIN(tile_idx + tg_size - 1, tiles->cols * tiles->rows - 1), + (tiles->log2_rows + tiles->log2_cols), cpi->num_tg > 1); + *pack_bs_params->total_size += *curr_tg_hdr_size; +} + +// Pack tile data in the bitstream with tile_group, frame +// and OBU header. +void av1_pack_tile_info(AV1_COMP *const cpi, ThreadData *const td, + PackBSParams *const pack_bs_params) { + aom_writer mode_bc; + AV1_COMMON *const cm = &cpi->common; + int tile_row = pack_bs_params->tile_row; + int tile_col = pack_bs_params->tile_col; + uint32_t *const total_size = pack_bs_params->total_size; + TileInfo tile_info; + av1_tile_set_col(&tile_info, cm, tile_col); + av1_tile_set_row(&tile_info, cm, tile_row); + mode_bc.allow_update_cdf = 1; + mode_bc.allow_update_cdf = + mode_bc.allow_update_cdf && !cm->features.disable_cdf_update; + + unsigned int tile_size; + + const int num_planes = av1_num_planes(cm); + av1_reset_loop_restoration(&td->mb.e_mbd, num_planes); + + pack_bs_params->buf.data = pack_bs_params->dst + *total_size; + + // The last tile of the tile group does not have a header. 
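+  // Every other tile is preceded by a 4-byte little-endian size field,
+  // written via mem_put_le32() below once the packed tile size is known.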
+ if (!pack_bs_params->is_last_tile_in_tg) *total_size += 4; + + // Pack tile data + aom_start_encode(&mode_bc, pack_bs_params->dst + *total_size); + write_modes(cpi, td, &tile_info, &mode_bc, tile_row, tile_col); + if (aom_stop_encode(&mode_bc) < 0) { + aom_internal_error(td->mb.e_mbd.error_info, AOM_CODEC_ERROR, + "Error writing modes"); + } + tile_size = mode_bc.pos; + assert(tile_size >= AV1_MIN_TILE_SIZE_BYTES); + + pack_bs_params->buf.size = tile_size; + + // Write tile size + if (!pack_bs_params->is_last_tile_in_tg) { + // size of this tile + mem_put_le32(pack_bs_params->buf.data, tile_size - AV1_MIN_TILE_SIZE_BYTES); + } +} + +void av1_write_last_tile_info( + AV1_COMP *const cpi, const FrameHeaderInfo *fh_info, + struct aom_write_bit_buffer *saved_wb, size_t *curr_tg_data_size, + uint8_t *curr_tg_start, uint32_t *const total_size, + uint8_t **tile_data_start, int *const largest_tile_id, + int *const is_first_tg, uint32_t obu_header_size, uint8_t obu_extn_header) { + // write current tile group size + const uint32_t obu_payload_size = + (uint32_t)(*curr_tg_data_size) - obu_header_size; + const size_t length_field_size = + av1_obu_memmove(obu_header_size, obu_payload_size, curr_tg_start); + if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, + curr_tg_start) != AOM_CODEC_OK) { + assert(0); + } + *curr_tg_data_size += (int)length_field_size; + *total_size += (uint32_t)length_field_size; + *tile_data_start += length_field_size; + if (cpi->num_tg == 1) { + // if this tg is combined with the frame header then update saved + // frame header base offset according to length field size + saved_wb->bit_buffer += length_field_size; + } + + if (!(*is_first_tg) && cpi->common.features.error_resilient_mode) { + // Make room for a duplicate Frame Header OBU. + memmove(curr_tg_start + fh_info->total_length, curr_tg_start, + *curr_tg_data_size); + + // Insert a copy of the Frame Header OBU. + memcpy(curr_tg_start, fh_info->frame_header, fh_info->total_length); + + // Force context update tile to be the first tile in error + // resilient mode as the duplicate frame headers will have + // context_update_tile_id set to 0 + *largest_tile_id = 0; + + // Rewrite the OBU header to change the OBU type to Redundant Frame + // Header. + av1_write_obu_header(&cpi->ppi->level_params, &cpi->frame_header_count, + OBU_REDUNDANT_FRAME_HEADER, obu_extn_header, + &curr_tg_start[fh_info->obu_header_byte_offset]); + + *curr_tg_data_size += (int)(fh_info->total_length); + *total_size += (uint32_t)(fh_info->total_length); + } + *is_first_tg = 0; +} + +void av1_reset_pack_bs_thread_data(ThreadData *const td) { + td->coefficient_size = 0; + td->max_mv_magnitude = 0; + av1_zero(td->interp_filter_selected); +} + +void av1_accumulate_pack_bs_thread_data(AV1_COMP *const cpi, + ThreadData const *td) { + int do_max_mv_magnitude_update = 1; + cpi->rc.coefficient_size += td->coefficient_size; + + // Disable max_mv_magnitude update for parallel frames based on update flag. + if (!cpi->do_frame_data_update) do_max_mv_magnitude_update = 0; + + if (cpi->sf.mv_sf.auto_mv_step_size && do_max_mv_magnitude_update) + cpi->mv_search_params.max_mv_magnitude = + AOMMAX(cpi->mv_search_params.max_mv_magnitude, td->max_mv_magnitude); + + for (InterpFilter filter = EIGHTTAP_REGULAR; filter < SWITCHABLE; filter++) + cpi->common.cur_frame->interp_filter_selected[filter] += + td->interp_filter_selected[filter]; +} + +// Store information related to each default tile in the OBU header. 
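+// (Resulting tile-group layout, inferred from the helpers used below:
+//   [OBU header][uleb obu_size][frame header, only when num_tg == 1]
+//   [tile group header][tile 0 size][tile 0 data] ... [last tile, no size]
+// where each tile size is a 4-byte field until remux_tiles() compacts it.)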
+static void write_tile_obu( + AV1_COMP *const cpi, uint8_t *const dst, uint32_t *total_size, + struct aom_write_bit_buffer *saved_wb, uint8_t obu_extn_header, + const FrameHeaderInfo *fh_info, int *const largest_tile_id, + unsigned int *max_tile_size, uint32_t *const obu_header_size, + uint8_t **tile_data_start) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + const CommonTileParams *const tiles = &cm->tiles; + const int tile_cols = tiles->cols; + const int tile_rows = tiles->rows; + // Fixed size tile groups for the moment + const int num_tg_hdrs = cpi->num_tg; + const int tg_size = (tile_rows * tile_cols + num_tg_hdrs - 1) / num_tg_hdrs; + int tile_count = 0; + size_t curr_tg_data_size = 0; + uint8_t *tile_data_curr = dst; + int new_tg = 1; + int is_first_tg = 1; + + av1_reset_pack_bs_thread_data(&cpi->td); + for (int tile_row = 0; tile_row < tile_rows; tile_row++) { + for (int tile_col = 0; tile_col < tile_cols; tile_col++) { + const int tile_idx = tile_row * tile_cols + tile_col; + TileDataEnc *this_tile = &cpi->tile_data[tile_idx]; + + int is_last_tile_in_tg = 0; + if (new_tg) { + tile_data_curr = dst + *total_size; + tile_count = 0; + } + tile_count++; + + if (tile_count == tg_size || tile_idx == (tile_cols * tile_rows - 1)) + is_last_tile_in_tg = 1; + + xd->tile_ctx = &this_tile->tctx; + + // PackBSParams stores all parameters required to pack tile and header + // info. + PackBSParams pack_bs_params; + pack_bs_params.dst = dst; + pack_bs_params.curr_tg_hdr_size = 0; + pack_bs_params.is_last_tile_in_tg = is_last_tile_in_tg; + pack_bs_params.new_tg = new_tg; + pack_bs_params.obu_extn_header = obu_extn_header; + pack_bs_params.obu_header_size = 0; + pack_bs_params.saved_wb = saved_wb; + pack_bs_params.tile_col = tile_col; + pack_bs_params.tile_row = tile_row; + pack_bs_params.tile_data_curr = tile_data_curr; + pack_bs_params.total_size = total_size; + + if (new_tg) + av1_write_obu_tg_tile_headers(cpi, xd, &pack_bs_params, tile_idx); + + av1_pack_tile_info(cpi, &cpi->td, &pack_bs_params); + + if (new_tg) { + curr_tg_data_size = pack_bs_params.curr_tg_hdr_size; + *tile_data_start += pack_bs_params.curr_tg_hdr_size; + *obu_header_size = pack_bs_params.obu_header_size; + new_tg = 0; + } + if (is_last_tile_in_tg) new_tg = 1; + + curr_tg_data_size += + (pack_bs_params.buf.size + (is_last_tile_in_tg ? 0 : 4)); + + if (pack_bs_params.buf.size > *max_tile_size) { + *largest_tile_id = tile_idx; + *max_tile_size = (unsigned int)pack_bs_params.buf.size; + } + + if (is_last_tile_in_tg) + av1_write_last_tile_info(cpi, fh_info, saved_wb, &curr_tg_data_size, + tile_data_curr, total_size, tile_data_start, + largest_tile_id, &is_first_tg, + *obu_header_size, obu_extn_header); + *total_size += (uint32_t)pack_bs_params.buf.size; + } + } + av1_accumulate_pack_bs_thread_data(cpi, &cpi->td); +} + +// Write total buffer size and related information into the OBU header for +// default tile case. +static void write_tile_obu_size(AV1_COMP *const cpi, uint8_t *const dst, + struct aom_write_bit_buffer *saved_wb, + int largest_tile_id, uint32_t *const total_size, + unsigned int max_tile_size, + uint32_t obu_header_size, + uint8_t *tile_data_start) { + const CommonTileParams *const tiles = &cpi->common.tiles; + + // Fill in context_update_tile_id indicating the tile to use for the + // cdf update. 
The encoder currently sets it to the largest tile
+  // (but this is up to the encoder).
+  aom_wb_overwrite_literal(saved_wb, largest_tile_id,
+                           (tiles->log2_cols + tiles->log2_rows));
+  // If there is more than one tile group, tile_size_bytes takes the default
+  // value 4 and does not need to be set. For a single tile group it is set
+  // in the section below.
+  if (cpi->num_tg != 1) return;
+  int tile_size_bytes = 4, unused;
+  const uint32_t tile_data_offset = (uint32_t)(tile_data_start - dst);
+  const uint32_t tile_data_size = *total_size - tile_data_offset;
+
+  *total_size = remux_tiles(tiles, tile_data_start, tile_data_size,
+                            max_tile_size, 0, &tile_size_bytes, &unused);
+  *total_size += tile_data_offset;
+  assert(tile_size_bytes >= 1 && tile_size_bytes <= 4);
+
+  aom_wb_overwrite_literal(saved_wb, tile_size_bytes - 1, 2);
+
+  // Update the OBU length if remux_tiles() reduced the size.
+  uint64_t payload_size;
+  size_t length_field_size;
+  int res =
+      aom_uleb_decode(dst + obu_header_size, *total_size - obu_header_size,
+                      &payload_size, &length_field_size);
+  assert(res == 0);
+  (void)res;
+
+  const uint64_t new_payload_size =
+      *total_size - obu_header_size - length_field_size;
+  if (new_payload_size != payload_size) {
+    size_t new_length_field_size;
+    res = aom_uleb_encode(new_payload_size, length_field_size,
+                          dst + obu_header_size, &new_length_field_size);
+    assert(res == 0);
+    if (new_length_field_size < length_field_size) {
+      const size_t src_offset = obu_header_size + length_field_size;
+      const size_t dst_offset = obu_header_size + new_length_field_size;
+      memmove(dst + dst_offset, dst + src_offset, (size_t)payload_size);
+      *total_size -= (int)(length_field_size - new_length_field_size);
+    }
+  }
+}
+
+// As per the experiments, single-thread bitstream packing is better for
+// frames with a smaller bitstream size. This is because the setup-time
+// overhead of the multithreaded path exceeds the time required to pack the
+// smaller bitstream of such frames. This function computes the required
+// number of workers based on the setup-time overhead and the job-dispatch
+// overhead for the given tiles and available workers.
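+// Concretely, reading off the loop below: for each candidate worker count w
+// in [2, avail_workers] the function scores
+//   score(w) = S * (w - 1) / w - w * SETUP_TIME_OH_CONST
+//              - (num_tiles * JOB_DISP_TIME_OH_CONST) / w
+// where S is the frame's accumulated absolute sum of coefficient levels, and
+// returns the w with the largest positive score, falling back to 1.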
+int calc_pack_bs_mt_workers(const TileDataEnc *tile_data, int num_tiles, + int avail_workers, bool pack_bs_mt_enabled) { + if (!pack_bs_mt_enabled) return 1; + + uint64_t frame_abs_sum_level = 0; + + for (int idx = 0; idx < num_tiles; idx++) + frame_abs_sum_level += tile_data[idx].abs_sum_level; + + int ideal_num_workers = 1; + const float job_disp_time_const = (float)num_tiles * JOB_DISP_TIME_OH_CONST; + float max_sum = 0.0; + + for (int num_workers = avail_workers; num_workers > 1; num_workers--) { + const float fas_per_worker_const = + ((float)(num_workers - 1) / num_workers) * frame_abs_sum_level; + const float setup_time_const = (float)num_workers * SETUP_TIME_OH_CONST; + const float this_sum = fas_per_worker_const - setup_time_const - + job_disp_time_const / num_workers; + + if (this_sum > max_sum) { + max_sum = this_sum; + ideal_num_workers = num_workers; + } + } + return ideal_num_workers; +} + +static INLINE uint32_t pack_tiles_in_tg_obus( + AV1_COMP *const cpi, uint8_t *const dst, + struct aom_write_bit_buffer *saved_wb, uint8_t obu_extension_header, + const FrameHeaderInfo *fh_info, int *const largest_tile_id) { + const CommonTileParams *const tiles = &cpi->common.tiles; + uint32_t total_size = 0; + unsigned int max_tile_size = 0; + uint32_t obu_header_size = 0; + uint8_t *tile_data_start = dst; + const int tile_cols = tiles->cols; + const int tile_rows = tiles->rows; + const int num_tiles = tile_rows * tile_cols; + + const int num_workers = calc_pack_bs_mt_workers( + cpi->tile_data, num_tiles, cpi->mt_info.num_mod_workers[MOD_PACK_BS], + cpi->mt_info.pack_bs_mt_enabled); + + if (num_workers > 1) { + av1_write_tile_obu_mt(cpi, dst, &total_size, saved_wb, obu_extension_header, + fh_info, largest_tile_id, &max_tile_size, + &obu_header_size, &tile_data_start, num_workers); + } else { + write_tile_obu(cpi, dst, &total_size, saved_wb, obu_extension_header, + fh_info, largest_tile_id, &max_tile_size, &obu_header_size, + &tile_data_start); + } + + if (num_tiles > 1) + write_tile_obu_size(cpi, dst, saved_wb, *largest_tile_id, &total_size, + max_tile_size, obu_header_size, tile_data_start); + return total_size; +} + +static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst, + struct aom_write_bit_buffer *saved_wb, + uint8_t obu_extension_header, + const FrameHeaderInfo *fh_info, + int *const largest_tile_id) { + AV1_COMMON *const cm = &cpi->common; + const CommonTileParams *const tiles = &cm->tiles; + *largest_tile_id = 0; + + // Select the coding strategy (temporal or spatial) + if (cm->seg.enabled && cm->seg.update_map) { + if (cm->features.primary_ref_frame == PRIMARY_REF_NONE) { + cm->seg.temporal_update = 0; + } else { + cm->seg.temporal_update = 1; + if (cpi->td.rd_counts.seg_tmp_pred_cost[0] < + cpi->td.rd_counts.seg_tmp_pred_cost[1]) + cm->seg.temporal_update = 0; + } + } + + if (tiles->large_scale) + return pack_large_scale_tiles_in_tg_obus(cpi, dst, saved_wb, + largest_tile_id); + + return pack_tiles_in_tg_obus(cpi, dst, saved_wb, obu_extension_header, + fh_info, largest_tile_id); +} + +static size_t av1_write_metadata_obu(const aom_metadata_t *metadata, + uint8_t *const dst) { + size_t coded_metadata_size = 0; + const uint64_t metadata_type = (uint64_t)metadata->type; + if (aom_uleb_encode(metadata_type, sizeof(metadata_type), dst, + &coded_metadata_size) != 0) { + return 0; + } + memcpy(dst + coded_metadata_size, metadata->payload, metadata->sz); + // Add trailing bits. 
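+  // A single 0x80 byte: a '1' bit followed by zero bits, i.e. the
+  // byte-aligned trailing-bits pattern (compare add_trailing_bits() above).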
+  dst[coded_metadata_size + metadata->sz] = 0x80;
+  return (uint32_t)(coded_metadata_size + metadata->sz + 1);
+}
+
+static size_t av1_write_metadata_array(AV1_COMP *const cpi, uint8_t *dst) {
+  if (!cpi->source) return 0;
+  AV1_COMMON *const cm = &cpi->common;
+  aom_metadata_array_t *arr = cpi->source->metadata;
+  if (!arr) return 0;
+  size_t obu_header_size = 0;
+  size_t obu_payload_size = 0;
+  size_t total_bytes_written = 0;
+  size_t length_field_size = 0;
+  for (size_t i = 0; i < arr->sz; i++) {
+    aom_metadata_t *current_metadata = arr->metadata_array[i];
+    if (current_metadata && current_metadata->payload) {
+      if ((cm->current_frame.frame_type == KEY_FRAME &&
+           current_metadata->insert_flag == AOM_MIF_KEY_FRAME) ||
+          (cm->current_frame.frame_type != KEY_FRAME &&
+           current_metadata->insert_flag == AOM_MIF_NON_KEY_FRAME) ||
+          current_metadata->insert_flag == AOM_MIF_ANY_FRAME) {
+        obu_header_size = av1_write_obu_header(&cpi->ppi->level_params,
+                                               &cpi->frame_header_count,
+                                               OBU_METADATA, 0, dst);
+        obu_payload_size =
+            av1_write_metadata_obu(current_metadata, dst + obu_header_size);
+        length_field_size =
+            av1_obu_memmove(obu_header_size, obu_payload_size, dst);
+        if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, dst) ==
+            AOM_CODEC_OK) {
+          const size_t obu_size = obu_header_size + obu_payload_size;
+          dst += obu_size + length_field_size;
+          total_bytes_written += obu_size + length_field_size;
+        } else {
+          aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+                             "Error writing metadata OBU size");
+        }
+      }
+    }
+  }
+  return total_bytes_written;
+}
+
+int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size,
+                       int *const largest_tile_id) {
+  uint8_t *data = dst;
+  uint32_t data_size;
+  AV1_COMMON *const cm = &cpi->common;
+  AV1LevelParams *const level_params = &cpi->ppi->level_params;
+  uint32_t obu_header_size = 0;
+  uint32_t obu_payload_size = 0;
+  FrameHeaderInfo fh_info = { NULL, 0, 0 };
+  const uint8_t obu_extension_header =
+      cm->temporal_layer_id << 5 | cm->spatial_layer_id << 3 | 0;
+
+  // If no non-zero delta_q has been used, reset delta_q_present_flag
+  if (cm->delta_q_info.delta_q_present_flag && cpi->deltaq_used == 0) {
+    cm->delta_q_info.delta_q_present_flag = 0;
+  }
+
+#if CONFIG_BITSTREAM_DEBUG
+  bitstream_queue_reset_write();
+#endif
+
+  cpi->frame_header_count = 0;
+
+  // The TD is now written outside the frame encode loop
+
+  // write sequence header obu at each key frame or intra_only frame,
+  // preceded by its uleb-encoded size
+  if (cm->current_frame.frame_type == INTRA_ONLY_FRAME ||
+      cm->current_frame.frame_type == KEY_FRAME) {
+    obu_header_size = av1_write_obu_header(
+        level_params, &cpi->frame_header_count, OBU_SEQUENCE_HEADER, 0, data);
+    obu_payload_size =
+        av1_write_sequence_header_obu(cm->seq_params, data + obu_header_size);
+    const size_t length_field_size =
+        av1_obu_memmove(obu_header_size, obu_payload_size, data);
+    if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, data) !=
+        AOM_CODEC_OK) {
+      return AOM_CODEC_ERROR;
+    }
+
+    data += obu_header_size + obu_payload_size + length_field_size;
+  }
+
+  // write metadata obus before the frame obu that has the show_frame flag set
+  if (cm->show_frame) data += av1_write_metadata_array(cpi, data);
+
+  const int write_frame_header =
+      (cpi->num_tg > 1 || encode_show_existing_frame(cm));
+  struct aom_write_bit_buffer saved_wb = { NULL, 0 };
+  size_t length_field = 0;
+  if (write_frame_header) {
+    // Write Frame Header OBU.
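+    // A standalone OBU_FRAME_HEADER is needed only when the frame is split
+    // into multiple tile groups or is coded as show_existing_frame (see the
+    // write_frame_header condition above); otherwise the header travels
+    // inside the single OBU_FRAME.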
+    fh_info.frame_header = data;
+    obu_header_size =
+        av1_write_obu_header(level_params, &cpi->frame_header_count,
+                             OBU_FRAME_HEADER, obu_extension_header, data);
+    obu_payload_size = write_frame_header_obu(cpi, &cpi->td.mb.e_mbd, &saved_wb,
+                                              data + obu_header_size, 1);
+
+    length_field = av1_obu_memmove(obu_header_size, obu_payload_size, data);
+    if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, data) !=
+        AOM_CODEC_OK) {
+      return AOM_CODEC_ERROR;
+    }
+
+    fh_info.obu_header_byte_offset = 0;
+    fh_info.total_length = obu_header_size + obu_payload_size + length_field;
+    data += fh_info.total_length;
+  }
+
+  if (encode_show_existing_frame(cm)) {
+    data_size = 0;
+  } else {
+    // Since length_field is determined adaptively after frame header
+    // encoding, saved_wb must be adjusted accordingly.
+    if (saved_wb.bit_buffer != NULL) {
+      saved_wb.bit_buffer += length_field;
+    }
+
+    // Each tile group obu will be preceded by its uleb-encoded size.
+    data_size = write_tiles_in_tg_obus(
+        cpi, data, &saved_wb, obu_extension_header, &fh_info, largest_tile_id);
+  }
+  data += data_size;
+  *size = data - dst;
+  return AOM_CODEC_OK;
+}
diff --git a/third_party/aom/av1/encoder/bitstream.h b/third_party/aom/av1/encoder/bitstream.h
new file mode 100644
index 0000000000..12e8a630db
--- /dev/null
+++ b/third_party/aom/av1/encoder/bitstream.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_BITSTREAM_H_
+#define AOM_AV1_ENCODER_BITSTREAM_H_
+
+#ifdef __cplusplus
extern "C" {
+#endif
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+#include "av1/encoder/level.h"
+#include "aom_dsp/bitwriter.h"
+
+struct aom_write_bit_buffer;
+struct AV1_COMP;
+struct ThreadData;
+
+/*!\cond */
+
+// Stores the location and size of a tile's data in the bitstream.
Used for
+// later identifying identical tiles.
+typedef struct {
+  uint8_t *data;
+  size_t size;
+} TileBufferEnc;
+
+typedef struct {
+  uint8_t *frame_header;
+  size_t obu_header_byte_offset;
+  size_t total_length;
+} FrameHeaderInfo;
+
+typedef struct {
+  struct aom_write_bit_buffer *saved_wb;  // Bit stream buffer writer structure
+  TileBufferEnc buf;        // Structure to hold bitstream buffer and size
+  uint32_t *total_size;     // Pointer to the running total size (in bytes) of
+                            // the bitstream packed so far
+  uint8_t *dst;             // Base address of tile bitstream buffer
+  uint8_t *tile_data_curr;  // Base address of tile-group bitstream buffer
+  size_t tile_buf_size;     // Available bitstream buffer for the tile in bytes
+  uint8_t obu_extn_header;  // OBU extension header byte (zero when absent)
+  uint32_t obu_header_size;  // Size of the OBU header
+  int curr_tg_hdr_size;     // Size of the obu, tg, frame headers
+  int tile_size_mi;         // Tile size in mi units
+  int tile_row;             // Row index of the current tile
+  int tile_col;             // Column index of the current tile
+  int is_last_tile_in_tg;   // Flag to indicate last tile in a tile-group
+  int new_tg;               // Flag to indicate starting of a new tile-group
+} PackBSParams;
+
+typedef struct {
+  uint64_t abs_sum_level;
+  uint16_t tile_idx;
+} PackBSTileOrder;
+
+// Synchronization data for pack-bitstream multithreading.
+typedef struct {
+#if CONFIG_MULTITHREAD
+  // Mutex lock used while dispatching jobs.
+  pthread_mutex_t *mutex_;
+#endif
+  // Tile order structure of pack bitstream multithreading.
+  PackBSTileOrder pack_bs_tile_order[MAX_TILES];
+
+  // Index of next job to be processed.
+  int next_job_idx;
+  // Initialized to false, set to true by the worker thread that encounters an
+  // error in order to abort the processing of other worker threads.
+  bool pack_bs_mt_exit;
+} AV1EncPackBSSync;
+
+/*!\endcond */
+
+// Writes only the OBU Sequence Header payload, and returns the size of the
+// payload written to 'dst'. This function does not write the OBU header, the
+// optional extension, or the OBU size to 'dst'.
+uint32_t av1_write_sequence_header_obu(const SequenceHeader *seq_params,
+                                       uint8_t *const dst);
+
+// Writes the OBU header byte, and the OBU header extension byte when
+// 'obu_extension' is non-zero. Returns number of bytes written to 'dst'.
+uint32_t av1_write_obu_header(AV1LevelParams *const level_params,
+                              int *frame_header_count, OBU_TYPE obu_type,
+                              int obu_extension, uint8_t *const dst);
+
+int av1_write_uleb_obu_size(size_t obu_header_size, size_t obu_payload_size,
+                            uint8_t *dest);
+
+// Pack tile data in the bitstream with tile_group, frame
+// and OBU header.
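+// (Declared here, defined in bitstream.c: callers fill in a PackBSParams for
+// one tile and invoke this once per tile, as write_tile_obu() does.)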
+void av1_pack_tile_info(struct AV1_COMP *const cpi, struct ThreadData *const td, + PackBSParams *const pack_bs_params); + +void av1_write_last_tile_info( + struct AV1_COMP *const cpi, const FrameHeaderInfo *fh_info, + struct aom_write_bit_buffer *saved_wb, size_t *curr_tg_data_size, + uint8_t *curr_tg_start, uint32_t *const total_size, + uint8_t **tile_data_start, int *const largest_tile_id, + int *const is_first_tg, uint32_t obu_header_size, uint8_t obu_extn_header); + +/*!\brief Pack the bitstream for one frame + * + * \ingroup high_level_algo + * \callgraph + */ +int av1_pack_bitstream(struct AV1_COMP *const cpi, uint8_t *dst, size_t *size, + int *const largest_tile_id); + +void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd, + TX_TYPE tx_type, TX_SIZE tx_size, aom_writer *w); + +void av1_reset_pack_bs_thread_data(struct ThreadData *const td); + +void av1_accumulate_pack_bs_thread_data(struct AV1_COMP *const cpi, + struct ThreadData const *td); + +void av1_write_obu_tg_tile_headers(struct AV1_COMP *const cpi, + MACROBLOCKD *const xd, + PackBSParams *const pack_bs_params, + const int tile_idx); + +int av1_neg_interleave(int x, int ref, int max); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_BITSTREAM_H_ diff --git a/third_party/aom/av1/encoder/block.h b/third_party/aom/av1/encoder/block.h new file mode 100644 index 0000000000..33d2d8c2a0 --- /dev/null +++ b/third_party/aom/av1/encoder/block.h @@ -0,0 +1,1515 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*! \file + * Declares various structs used to encode the current partition block. + */ +#ifndef AOM_AV1_ENCODER_BLOCK_H_ +#define AOM_AV1_ENCODER_BLOCK_H_ + +#include "av1/common/blockd.h" +#include "av1/common/entropymv.h" +#include "av1/common/entropy.h" +#include "av1/common/enums.h" +#include "av1/common/mvref_common.h" + +#include "av1/encoder/enc_enums.h" +#include "av1/encoder/mcomp_structs.h" +#if !CONFIG_REALTIME_ONLY +#include "av1/encoder/partition_cnn_weights.h" +#endif + +#include "av1/encoder/hash_motion.h" + +#ifdef __cplusplus +extern "C" { +#endif + +//! Minimum linear dimension of a tpl block +#define MIN_TPL_BSIZE_1D 16 +//! Maximum number of tpl block in a super block +#define MAX_TPL_BLK_IN_SB (MAX_SB_SIZE / MIN_TPL_BSIZE_1D) +//! Number of txfm hash records kept for the partition block. +#define RD_RECORD_BUFFER_LEN 8 + +/*! Maximum value taken by transform type probabilities */ +#define MAX_TX_TYPE_PROB 1024 + +//! Compute color sensitivity index for given plane +#define COLOR_SENS_IDX(plane) ((plane)-1) + +//! 
Enable timer statistics of mode search in non-rd
+#define COLLECT_NONRD_PICK_MODE_STAT 0
+
+/*!\cond */
+#if COLLECT_NONRD_PICK_MODE_STAT
+#include "aom_ports/aom_timer.h"
+
+typedef struct _mode_search_stat_nonrd {
+  int32_t num_blocks[BLOCK_SIZES];
+  int64_t total_block_times[BLOCK_SIZES];
+  int32_t num_searches[BLOCK_SIZES][MB_MODE_COUNT];
+  int32_t num_nonskipped_searches[BLOCK_SIZES][MB_MODE_COUNT];
+  int64_t search_times[BLOCK_SIZES][MB_MODE_COUNT];
+  int64_t nonskipped_search_times[BLOCK_SIZES][MB_MODE_COUNT];
+  int64_t ms_time[BLOCK_SIZES][MB_MODE_COUNT];
+  int64_t ifs_time[BLOCK_SIZES][MB_MODE_COUNT];
+  int64_t model_rd_time[BLOCK_SIZES][MB_MODE_COUNT];
+  int64_t txfm_time[BLOCK_SIZES][MB_MODE_COUNT];
+  struct aom_usec_timer timer1;
+  struct aom_usec_timer timer2;
+  struct aom_usec_timer bsize_timer;
+} mode_search_stat_nonrd;
+#endif  // COLLECT_NONRD_PICK_MODE_STAT
+/*!\endcond */
+
+/*! \brief Superblock level encoder info
+ *
+ * SuperBlockEnc stores superblock level information used by the encoder for
+ * more efficient encoding. Currently this is mostly used to store TPL data
+ * for the current superblock.
+ */
+typedef struct {
+  //! Minimum partition size for the sb.
+  BLOCK_SIZE min_partition_size;
+  //! Maximum partition size for the sb.
+  BLOCK_SIZE max_partition_size;
+
+  /*****************************************************************************
+   * \name TPL Info
+   *
+   * Information gathered from tpl_model at tpl block precision for the
+   * superblock to speed up the encoding process.
+   ****************************************************************************/
+  /**@{*/
+  //! Number of TPL blocks in this superblock.
+  int tpl_data_count;
+  //! TPL's estimate of inter cost for each tpl block.
+  int64_t tpl_inter_cost[MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB];
+  //! TPL's estimate of intra cost for each tpl block.
+  int64_t tpl_intra_cost[MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB];
+  //! Motion vectors found by TPL model for each tpl block.
+  int_mv tpl_mv[MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB][INTER_REFS_PER_FRAME];
+  //! TPL's stride for the arrays in this struct.
+  int tpl_stride;
+  /**@}*/
+} SuperBlockEnc;
+
+/*! \brief Stores the best performing modes.
+ */
+typedef struct {
+  //! The mbmi used to reconstruct the winner mode.
+  MB_MODE_INFO mbmi;
+  //! Rdstats of the winner mode.
+  RD_STATS rd_cost;
+  //! Rdcost of the winner mode.
+  int64_t rd;
+  //! Luma rate of the winner mode.
+  int rate_y;
+  //! Chroma rate of the winner mode.
+  int rate_uv;
+  //! The color map needed to reconstruct palette mode.
+  uint8_t color_index_map[MAX_SB_SQUARE];
+  //! The current winner mode.
+  THR_MODES mode_index;
+} WinnerModeStats;
+
+/*! \brief Each source plane of the current macroblock
+ *
+ * This struct also stores the txfm buffers and quantizer settings.
+ */
+typedef struct macroblock_plane {
+  //! Stores source - pred so the txfm can be computed later
+  int16_t *src_diff;
+  //! Dequantized coefficients
+  tran_low_t *dqcoeff;
+  //! Quantized coefficients
+  tran_low_t *qcoeff;
+  //! Transformed coefficients
+  tran_low_t *coeff;
+  //! Location of the end of qcoeff (end of block).
+  uint16_t *eobs;
+  //! Contexts used to code the transform coefficients.
+  uint8_t *txb_entropy_ctx;
+  //! A buffer containing the source frame.
+  struct buf_2d src;
+
+  /*! \name Quantizer Settings
+   *
+   * \attention These are used/accessed only in the quantization process.
+   * RDO does not and *must not* depend on any of these values.
+   * All values below share the coefficient scale/shift used in TX.
+   */
+  /**@{*/
+  //! Quantization step size used by AV1_XFORM_QUANT_FP.
+  const int16_t *quant_fp_QTX;
+  //! Offset used for rounding in the quantizer process by AV1_XFORM_QUANT_FP.
+  const int16_t *round_fp_QTX;
+  //! Quantization step size used by AV1_XFORM_QUANT_B.
+  const int16_t *quant_QTX;
+  //! Offset used for rounding in the quantizer process by AV1_XFORM_QUANT_B.
+  const int16_t *round_QTX;
+  //! Scale factor to shift coefficients toward zero. Only used by QUANT_B.
+  const int16_t *quant_shift_QTX;
+  //! Size of the quantization bin around 0. Only used by QUANT_B.
+  const int16_t *zbin_QTX;
+  //! Dequantizer
+  const int16_t *dequant_QTX;
+  /**@}*/
+} MACROBLOCK_PLANE;
+
+/*! \brief Costs for encoding the coefficients within a level.
+ *
+ * Covers everything including txb_skip, eob, and dc_sign.
+ */
+typedef struct {
+  //! Cost to skip txfm for the current txfm block.
+  int txb_skip_cost[TXB_SKIP_CONTEXTS][2];
+  /*! \brief Cost for encoding the base_eob of a level.
+   *
+   * Decoder uses base_eob to derive the base_level as
+   * base_level := base_eob + 1.
+   */
+  int base_eob_cost[SIG_COEF_CONTEXTS_EOB][3];
+  /*! \brief Cost for encoding the base level of a coefficient.
+   *
+   * Decoder derives coeff_base as coeff_base := base_eob + 1.
+   */
+  int base_cost[SIG_COEF_CONTEXTS][8];
+  /*! \brief Cost for encoding the last non-zero coefficient.
+   *
+   * Eob is derived from eob_extra at the decoder as eob := eob_extra + 1.
+   */
+  int eob_extra_cost[EOB_COEF_CONTEXTS][2];
+  //! Cost for encoding the dc_sign
+  int dc_sign_cost[DC_SIGN_CONTEXTS][2];
+  //! Cost for encoding an increment to the coefficient
+  int lps_cost[LEVEL_CONTEXTS][COEFF_BASE_RANGE + 1 + COEFF_BASE_RANGE + 1];
+} LV_MAP_COEFF_COST;
+
+/*! \brief Costs for encoding the eob.
+ */
+typedef struct {
+  //! eob_cost.
+  int eob_cost[2][11];
+} LV_MAP_EOB_COST;
+
+/*! \brief Stores the transform coefficients for the whole superblock.
+ */
+typedef struct {
+  //! The transformed coefficients.
+  tran_low_t *tcoeff[MAX_MB_PLANE];
+  //! Where the transformed coefficients end.
+  uint16_t *eobs[MAX_MB_PLANE];
+  /*! \brief Transform block entropy contexts.
+   *
+   * Each element is used as a bit field.
+   * - Bits 0~3: txb_skip_ctx
+   * - Bits 4~5: dc_sign_ctx.
+   */
+  uint8_t *entropy_ctx[MAX_MB_PLANE];
+} CB_COEFF_BUFFER;
+
+/*! \brief Extended mode info derived from mbmi.
+ */
+typedef struct {
+  // TODO(angiebird): Reduce the buffer size according to sb_type
+  //! The reference mv list for the current block.
+  CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][USABLE_REF_MV_STACK_SIZE];
+  //! The weights used to compute the ref mvs.
+  uint16_t weight[MODE_CTX_REF_FRAMES][USABLE_REF_MV_STACK_SIZE];
+  //! Number of ref mvs in the drl.
+  uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
+  //! Global mvs
+  int_mv global_mvs[REF_FRAMES];
+  //! Context used to encode the current mode.
+  int16_t mode_context[MODE_CTX_REF_FRAMES];
+} MB_MODE_INFO_EXT;
+
+/*! \brief Stores best extended mode information at frame level.
+ *
+ * The frame-level information here is used in the bitstream preparation
+ * stage. The information in \ref MB_MODE_INFO_EXT is copied to this struct
+ * to save memory.
+ */
+typedef struct {
+  //! \copydoc MB_MODE_INFO_EXT::ref_mv_stack
+  CANDIDATE_MV ref_mv_stack[USABLE_REF_MV_STACK_SIZE];
+  //! \copydoc MB_MODE_INFO_EXT::weight
+  uint16_t weight[USABLE_REF_MV_STACK_SIZE];
+  //!
\copydoc MB_MODE_INFO_EXT::ref_mv_count
+  uint8_t ref_mv_count;
+  // TODO(Ravi/Remya): Reduce the buffer size of global_mvs
+  //! \copydoc MB_MODE_INFO_EXT::global_mvs
+  int_mv global_mvs[REF_FRAMES];
+  //! \copydoc MB_MODE_INFO_EXT::mode_context
+  int16_t mode_context;
+  //! Offset of current coding block's coeff buffer relative to the sb.
+  uint16_t cb_offset[PLANE_TYPES];
+} MB_MODE_INFO_EXT_FRAME;
+
+/*! \brief Inter-mode txfm results for a partition block.
+ */
+typedef struct {
+  //! Txfm size used if the current mode is intra mode.
+  TX_SIZE tx_size;
+  //! Txfm sizes used if the current mode is inter mode.
+  TX_SIZE inter_tx_size[INTER_TX_SIZE_BUF_LEN];
+  //! Map showing which txfm block skips the txfm process.
+  uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  //! Map showing the txfm types for each block.
+  uint8_t tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  //! Rd_stats for the whole partition block.
+  RD_STATS rd_stats;
+  //! Hash value of the current record.
+  uint32_t hash_value;
+} MB_RD_INFO;
+
+/*! \brief Hash records of the inter-mode transform results
+ *
+ * Hash records of the inter-mode transform results for a whole partition block
+ * based on the residue. Since this operates on the partition block level, this
+ * can give us a whole txfm partition tree.
+ */
+typedef struct {
+  /*! Circular buffer that stores the inter-mode txfm results of a partition
+   *  block.
+   */
+  MB_RD_INFO mb_rd_info[RD_RECORD_BUFFER_LEN];
+  //! Index to insert the newest rd record.
+  int index_start;
+  //! Number of entries stored in this record.
+  int num;
+  //! Hash function
+  CRC32C crc_calculator;
+} MB_RD_RECORD;
+
+//! Number of compound rd stats
+#define MAX_COMP_RD_STATS 64
+/*! \brief Rdcost stats in compound mode.
+ */
+typedef struct {
+  //! Rate of the compound modes.
+  int32_t rate[COMPOUND_TYPES];
+  //! Distortion of the compound modes.
+  int64_t dist[COMPOUND_TYPES];
+  //! Estimated rate of the compound modes.
+  int32_t model_rate[COMPOUND_TYPES];
+  //! Estimated distortion of the compound modes.
+  int64_t model_dist[COMPOUND_TYPES];
+  //! Rate needed to send the mask type.
+  int comp_rs2[COMPOUND_TYPES];
+  //! Motion vector for each predictor.
+  int_mv mv[2];
+  //! Ref frame for each predictor.
+  MV_REFERENCE_FRAME ref_frames[2];
+  //! Current prediction mode.
+  PREDICTION_MODE mode;
+  //! Current interpolation filter.
+  int_interpfilters filter;
+  //! Refmv index in the drl.
+  int ref_mv_idx;
+  //! Whether the predictors are GLOBALMV.
+  int is_global[2];
+  //! Current parameters for interinter mode.
+  INTERINTER_COMPOUND_DATA interinter_comp;
+} COMP_RD_STATS;
+
+/*! \brief Contains buffers used to speed up rdopt for obmc.
+ *
+ * See the comments for calc_target_weighted_pred for details.
+ */
+typedef struct {
+  /*! \brief A new source weighted with the above and left predictors.
+   *
+   * Used to efficiently construct multiple obmc predictors during rdopt.
+   */
+  int32_t *wsrc;
+  /*! \brief A new mask constructed from the original horz/vert mask.
+   *
+   * \copydetails wsrc
+   */
+  int32_t *mask;
+  /*! \brief Prediction from the up predictor.
+   *
+   * Used to build the obmc predictor.
+   */
+  uint8_t *above_pred;
+  /*! \brief Prediction from the left predictor.
+   *
+   * \copydetails above_pred
+   */
+  uint8_t *left_pred;
+} OBMCBuffer;
+
+/*! \brief Contains color maps used in palette mode.
+ */
+typedef struct {
+  //! The best color map found.
+  uint8_t best_palette_color_map[MAX_PALETTE_SQUARE];
+  //! A temporary buffer used for k-means clustering.
+  int16_t kmeans_data_buf[2 * MAX_PALETTE_SQUARE];
+} PALETTE_BUFFER;
+
+/*! \brief Contains buffers used by av1_compound_type_rd()
+ *
+ * For sizes and alignment of these arrays, refer to
+ * alloc_compound_type_rd_buffers() function.
+ */
+typedef struct {
+  //! First prediction.
+  uint8_t *pred0;
+  //! Second prediction.
+  uint8_t *pred1;
+  //! Source - first prediction.
+  int16_t *residual1;
+  //! Second prediction - first prediction.
+  int16_t *diff10;
+  //! Backup of the best segmentation mask.
+  uint8_t *tmp_best_mask_buf;
+} CompoundTypeRdBuffers;
+
+/*! \brief Holds some parameters related to partitioning schemes in AV1.
+ */
+// TODO(chiyotsai@google.com): Consolidate this with SIMPLE_MOTION_DATA_TREE
+typedef struct {
+#if !CONFIG_REALTIME_ONLY
+  // The following 4 parameters are used for cnn-based partitioning on intra
+  // frame.
+  /*! \brief Current index on the partition block quad tree.
+   *
+   * Used to index into the cnn buffer for partition decision.
+   */
+  int quad_tree_idx;
+  //! Whether the CNN buffer contains valid output.
+  int cnn_output_valid;
+  //! A buffer used by our segmentation CNN for intra-frame partitioning.
+  float cnn_buffer[CNN_OUT_BUF_SIZE];
+  //! log of the quantization parameter of the ancestor BLOCK_64X64.
+  float log_q;
+#endif
+
+  /*! \brief Variance of the subblocks in the superblock.
+   *
+   * This is used by rt mode for variance based partitioning.
+   * The indices correspond to the following block sizes:
+   * - 0      - 128x128
+   * - 1-2    - 128x64
+   * - 3-4    - 64x128
+   * - 5-8    - 64x64
+   * - 9-16   - 64x32
+   * - 17-24  - 32x64
+   * - 25-40  - 32x32
+   * - 41-104 - 16x16
+   */
+  uint8_t variance_low[105];
+} PartitionSearchInfo;
+
+/*!\cond */
+enum {
+  /**
+   * Do not prune transform depths.
+   */
+  TX_PRUNE_NONE = 0,
+  /**
+   * Prune largest transform (depth 0) based on NN model.
+   */
+  TX_PRUNE_LARGEST = 1,
+  /**
+   * Prune split transforms (depth>=1) based on NN model.
+   */
+  TX_PRUNE_SPLIT = 2,
+} UENUM1BYTE(TX_PRUNE_TYPE);
+/*!\endcond */
+
+/*! \brief Defines the parameters used to perform txfm search.
+ *
+ * For the most part, this determines how various speed features are used.
+ */
+typedef struct {
+  /*! \brief Whether to limit the intra txfm search type to the default txfm.
+   *
+   * This could be a result of either the sequence parameter or the speed
+   * features.
+   */
+  int use_default_intra_tx_type;
+
+  /*! Probability threshold used for conditionally forcing tx type */
+  int default_inter_tx_type_prob_thresh;
+
+  //! Whether to prune 2d transforms based on 1d transform results.
+  int prune_2d_txfm_mode;
+
+  /*! \brief Variable from \ref WinnerModeParams based on current eval mode.
+   *
+   * See the documentation for \ref WinnerModeParams for more detail.
+   */
+  unsigned int coeff_opt_thresholds[2];
+  /*! \copydoc coeff_opt_thresholds */
+  unsigned int tx_domain_dist_threshold;
+  /*! \copydoc coeff_opt_thresholds */
+  TX_SIZE_SEARCH_METHOD tx_size_search_method;
+  /*! \copydoc coeff_opt_thresholds */
+  unsigned int use_transform_domain_distortion;
+  /*! \copydoc coeff_opt_thresholds */
+  unsigned int skip_txfm_level;
+
+  /*! \brief How to search for the optimal tx_size
+   *
+   * If ONLY_4X4, use TX_4X4; if TX_MODE_LARGEST, use the largest tx_size for
+   * the current partition block; if TX_MODE_SELECT, search through the whole
+   * tree.
+   *
+   * \attention
+   * Although this looks suspiciously similar to a bitstream element, this
+   * tx_mode_search_type is only used internally by the encoder, and is *not*
+   * written to the bitstream.
It determines what kind of tx_mode would be
+   * searched. For example, we might set it to TX_MODE_LARGEST to find a good
+   * candidate, then code it as TX_MODE_SELECT.
+   */
+  TX_MODE tx_mode_search_type;
+
+  /*!
+   * Determines whether a block can be predicted as transform skip or DC only
+   * based on residual mean and variance.
+   * Type 0 : No skip block or DC only block prediction
+   * Type 1 : Prediction of skip block based on residual mean and variance
+   * Type 2 : Prediction of skip block or DC only block based on residual mean
+   * and variance
+   */
+  unsigned int predict_dc_level;
+
+  /*!
+   * Whether or not we should use the quantization matrix as weights for PSNR
+   * during RD search.
+   */
+  int use_qm_dist_metric;
+
+  /*!
+   * Keep track of previous mode evaluation stage type. This will be used to
+   * reset mb rd hash record when mode evaluation type changes.
+   */
+  int mode_eval_type;
+
+#if !CONFIG_REALTIME_ONLY
+  //! Indicates the transform depths for which RD evaluation is skipped.
+  TX_PRUNE_TYPE nn_prune_depths_for_intra_tx;
+
+  /*! \brief Indicates if NN model should be invoked to prune transform depths.
+   *
+   * Used to signal whether NN model should be evaluated to prune the R-D
+   * evaluation of specific transform depths.
+   */
+  bool enable_nn_prune_intra_tx_depths;
+#endif
+} TxfmSearchParams;
+
+/*!\cond */
+#define MAX_NUM_8X8_TXBS ((MAX_MIB_SIZE >> 1) * (MAX_MIB_SIZE >> 1))
+#define MAX_NUM_16X16_TXBS ((MAX_MIB_SIZE >> 2) * (MAX_MIB_SIZE >> 2))
+#define MAX_NUM_32X32_TXBS ((MAX_MIB_SIZE >> 3) * (MAX_MIB_SIZE >> 3))
+#define MAX_NUM_64X64_TXBS ((MAX_MIB_SIZE >> 4) * (MAX_MIB_SIZE >> 4))
+/*!\endcond */
+
+/*! \brief Stores various encoding/search decisions related to txfm search.
+ *
+ * This struct contains a cache of previous txfm results, and some buffers for
+ * the current txfm decision.
+ */
+typedef struct {
+  //! Whether to skip transform and quantization on a partition block level.
+  uint8_t skip_txfm;
+
+  /*! \brief Whether to skip transform and quantization on a txfm block level.
+   *
+   * Skips transform and quantization on a transform block level inside the
+   * current partition block. Each element of this array is used as a
+   * bit-field. So, for example, if we are skipping the transform on the luma
+   * plane, then the last bit would be set to 1.
+   */
+  uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+
+  /*! \brief Transform types inside the partition block
+   *
+   * Keeps a record of what kind of transform to use for each of the transform
+   * blocks inside the partition block.
+   * \attention The buffer here is *never* directly used. Instead, this just
+   * allocates the memory for MACROBLOCKD::tx_type_map during rdopt on the
+   * partition block. So if we need to save memory, we could move the
+   * allocation to pick_sb_mode instead.
+   */
+  uint8_t tx_type_map_[MAX_MIB_SIZE * MAX_MIB_SIZE];
+
+  //! Txfm hash records of inter-modes.
+  MB_RD_RECORD *mb_rd_record;
+
+  /*! \brief Number of txb splits.
+   *
+   * Keep track of how many times we've used split tx partition for transform
+   * blocks. Somewhat misleadingly, this parameter doesn't actually keep track
+   * of the count of the current block. Instead, it's a cumulative count
+   * across the whole frame. The main usage is that if txb_split_count is
+   * zero, then we can signal TX_MODE_LARGEST at frame level.
+   */
+  // TODO(chiyotsai@google.com): Move this to a more appropriate location such
+  // as ThreadData.
+  unsigned int txb_split_count;
+#if CONFIG_SPEED_STATS
+  //! For debugging.
Used to check how many txfm searches we are doing.
+  unsigned int tx_search_count;
+#endif  // CONFIG_SPEED_STATS
+} TxfmSearchInfo;
+#undef MAX_NUM_8X8_TXBS
+#undef MAX_NUM_16X16_TXBS
+#undef MAX_NUM_32X32_TXBS
+#undef MAX_NUM_64X64_TXBS
+
+/*! \brief Holds the entropy costs for various modes sent to the bitstream.
+ *
+ * \attention This does not include the costs for mv and transformed
+ * coefficients.
+ */
+typedef struct {
+  /*****************************************************************************
+   * \name Partition Costs
+   ****************************************************************************/
+  /**@{*/
+  //! Cost for coding the partition.
+  int partition_cost[PARTITION_CONTEXTS][EXT_PARTITION_TYPES];
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Intra Costs: General
+   ****************************************************************************/
+  /**@{*/
+  //! Luma mode cost for inter frame.
+  int mbmode_cost[BLOCK_SIZE_GROUPS][INTRA_MODES];
+  //! Luma mode cost for intra frame.
+  int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
+  //! Chroma mode cost
+  int intra_uv_mode_cost[CFL_ALLOWED_TYPES][INTRA_MODES][UV_INTRA_MODES];
+  //! filter_intra_cost
+  int filter_intra_cost[BLOCK_SIZES_ALL][2];
+  //! filter_intra_mode_cost
+  int filter_intra_mode_cost[FILTER_INTRA_MODES];
+  //! angle_delta_cost
+  int angle_delta_cost[DIRECTIONAL_MODES][2 * MAX_ANGLE_DELTA + 1];
+
+  //! Rate associated with each alpha codeword
+  int cfl_cost[CFL_JOINT_SIGNS][CFL_PRED_PLANES][CFL_ALPHABET_SIZE];
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Intra Costs: Screen Contents
+   ****************************************************************************/
+  /**@{*/
+  //! intrabc_cost
+  int intrabc_cost[2];
+
+  //! palette_y_size_cost
+  int palette_y_size_cost[PALATTE_BSIZE_CTXS][PALETTE_SIZES];
+  //! palette_uv_size_cost
+  int palette_uv_size_cost[PALATTE_BSIZE_CTXS][PALETTE_SIZES];
+  //! palette_y_color_cost
+  int palette_y_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
+                          [PALETTE_COLORS];
+  //! palette_uv_color_cost
+  int palette_uv_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
+                           [PALETTE_COLORS];
+  //! palette_y_mode_cost
+  int palette_y_mode_cost[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][2];
+  //! palette_uv_mode_cost
+  int palette_uv_mode_cost[PALETTE_UV_MODE_CONTEXTS][2];
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Inter Costs: MV Modes
+   ****************************************************************************/
+  /**@{*/
+  //! skip_mode_cost
+  int skip_mode_cost[SKIP_MODE_CONTEXTS][2];
+  //! newmv_mode_cost
+  int newmv_mode_cost[NEWMV_MODE_CONTEXTS][2];
+  //! zeromv_mode_cost
+  int zeromv_mode_cost[GLOBALMV_MODE_CONTEXTS][2];
+  //! refmv_mode_cost
+  int refmv_mode_cost[REFMV_MODE_CONTEXTS][2];
+  //! drl_mode_cost0
+  int drl_mode_cost0[DRL_MODE_CONTEXTS][2];
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Inter Costs: Ref Frame Types
+   ****************************************************************************/
+  /**@{*/
+  //! single_ref_cost
+  int single_ref_cost[REF_CONTEXTS][SINGLE_REFS - 1][2];
+  //! comp_inter_cost
+  int comp_inter_cost[COMP_INTER_CONTEXTS][2];
+  //! comp_ref_type_cost
+  int comp_ref_type_cost[COMP_REF_TYPE_CONTEXTS]
+                        [CDF_SIZE(COMP_REFERENCE_TYPES)];
+  //!
uni_comp_ref_cost + int uni_comp_ref_cost[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1] + [CDF_SIZE(2)]; + /*! \brief Cost for signaling ref_frame[0] in bidir-comp mode + * + * Includes LAST_FRAME, LAST2_FRAME, LAST3_FRAME, and GOLDEN_FRAME. + */ + int comp_ref_cost[REF_CONTEXTS][FWD_REFS - 1][2]; + /*! \brief Cost for signaling ref_frame[1] in bidir-comp mode + * + * Includes ALTREF_FRAME, ALTREF2_FRAME, and BWDREF_FRAME. + */ + int comp_bwdref_cost[REF_CONTEXTS][BWD_REFS - 1][2]; + /**@}*/ + + /***************************************************************************** + * \name Inter Costs: Compound Types + ****************************************************************************/ + /**@{*/ + //! intra_inter_cost + int intra_inter_cost[INTRA_INTER_CONTEXTS][2]; + //! inter_compound_mode_cost + int inter_compound_mode_cost[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES]; + //! compound_type_cost + int compound_type_cost[BLOCK_SIZES_ALL][MASKED_COMPOUND_TYPES]; + //! wedge_idx_cost + int wedge_idx_cost[BLOCK_SIZES_ALL][16]; + //! interintra_cost + int interintra_cost[BLOCK_SIZE_GROUPS][2]; + //! wedge_interintra_cost + int wedge_interintra_cost[BLOCK_SIZES_ALL][2]; + //! interintra_mode_cost + int interintra_mode_cost[BLOCK_SIZE_GROUPS][INTERINTRA_MODES]; + /**@}*/ + + /***************************************************************************** + * \name Inter Costs: Compound Masks + ****************************************************************************/ + /**@{*/ + //! comp_idx_cost + int comp_idx_cost[COMP_INDEX_CONTEXTS][2]; + //! comp_group_idx_cost + int comp_group_idx_cost[COMP_GROUP_IDX_CONTEXTS][2]; + /**@}*/ + + /***************************************************************************** + * \name Inter Costs: Motion Modes/Filters + ****************************************************************************/ + /**@{*/ + //! motion_mode_cost + int motion_mode_cost[BLOCK_SIZES_ALL][MOTION_MODES]; + //! motion_mode_cost1 + int motion_mode_cost1[BLOCK_SIZES_ALL][2]; + //! switchable_interp_costs + int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS]; + /**@}*/ + + /***************************************************************************** + * \name Txfm Mode Costs + ****************************************************************************/ + /**@{*/ + //! skip_txfm_cost + int skip_txfm_cost[SKIP_CONTEXTS][2]; + //! tx_size_cost + int tx_size_cost[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES]; + //! txfm_partition_cost + int txfm_partition_cost[TXFM_PARTITION_CONTEXTS][2]; + //! inter_tx_type_costs + int inter_tx_type_costs[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES]; + //! intra_tx_type_costs + int intra_tx_type_costs[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES] + [TX_TYPES]; + /**@}*/ + + /***************************************************************************** + * \name Restoration Mode Costs + ****************************************************************************/ + /**@{*/ + //! switchable_restore_cost + int switchable_restore_cost[RESTORE_SWITCHABLE_TYPES]; + //! wiener_restore_cost + int wiener_restore_cost[2]; + //! sgrproj_restore_cost + int sgrproj_restore_cost[2]; + /**@}*/ + + /***************************************************************************** + * \name Segmentation Mode Costs + ****************************************************************************/ + /**@{*/ + //! tmp_pred_cost + int tmp_pred_cost[SEG_TEMPORAL_PRED_CTXS][2]; + //! 
spatial_pred_cost
+  int spatial_pred_cost[SPATIAL_PREDICTION_PROBS][MAX_SEGMENTS];
+  /**@}*/
+} ModeCosts;
+
+/*! \brief Holds mv costs for encoding and motion search.
+ */
+typedef struct {
+  /*****************************************************************************
+   * \name Encoding Costs
+   * Here are the entropy costs needed to encode a given mv.
+   * \ref nmv_cost_alloc and \ref nmv_cost_hp_alloc are two arrays that hold
+   * the memory for holding the mv cost. But since the motion vectors can be
+   * negative, we shift them to the middle and store the resulting pointer in
+   * \ref nmv_cost and \ref nmv_cost_hp for easier referencing. Finally, \ref
+   * mv_cost_stack points to the \ref nmv_cost with the mv precision we are
+   * currently working with. In essence, only \ref mv_cost_stack is needed for
+   * motion search, the others can be considered private.
+   ****************************************************************************/
+  /**@{*/
+  //! Costs for coding the zero components.
+  int nmv_joint_cost[MV_JOINTS];
+
+  //! Allocates memory for 1/4-pel motion vector costs.
+  int nmv_cost_alloc[2][MV_VALS];
+  //! Allocates memory for 1/8-pel motion vector costs.
+  int nmv_cost_hp_alloc[2][MV_VALS];
+  //! Points to the middle of \ref nmv_cost_alloc
+  int *nmv_cost[2];
+  //! Points to the middle of \ref nmv_cost_hp_alloc
+  int *nmv_cost_hp[2];
+  //! Points to the nmv_cost or nmv_cost_hp array currently in use.
+  int **mv_cost_stack;
+  /**@}*/
+} MvCosts;
+
+/*! \brief Holds mv costs for intrabc.
+ */
+typedef struct {
+  /*! Costs for coding the joint mv. */
+  int joint_mv[MV_JOINTS];
+
+  /*! \brief Cost of transmitting the actual motion vector.
+   * dv_costs_alloc[0][i] is the cost of a motion vector with vertical
+   * component (mv_row) equal to i - MV_MAX. dv_costs_alloc[1][i] is the cost
+   * of a motion vector with horizontal component (mv_col) equal to
+   * i - MV_MAX.
+   */
+  int dv_costs_alloc[2][MV_VALS];
+
+  /*! Points to the middle of \ref dv_costs_alloc. */
+  int *dv_costs[2];
+} IntraBCMVCosts;
+
+/*! \brief Holds the costs needed to encode the coefficients
+ */
+typedef struct {
+  //! Costs for coding the coefficients.
+  LV_MAP_COEFF_COST coeff_costs[TX_SIZES][PLANE_TYPES];
+  //! Costs for coding the eobs.
+  LV_MAP_EOB_COST eob_costs[7][2];
+} CoeffCosts;
+
+/*!\cond */
+// 4: NEAREST, NEW, NEAR, GLOBAL
+#define SINGLE_REF_MODES ((REF_FRAMES - 1) * 4)
+/*!\endcond */
+struct inter_modes_info;
+
+/*! \brief Holds the motion samples for warp motion model estimation
+ */
+typedef struct {
+  //! Number of samples.
+  int num;
+  //! Sample locations in current frame.
+  int pts[16];
+  //! Sample location in the reference frame.
+  int pts_inref[16];
+} WARP_SAMPLE_INFO;
+
+/*!\cond */
+typedef enum {
+  kZeroSad = 0,
+  kVeryLowSad = 1,
+  kLowSad = 2,
+  kMedSad = 3,
+  kHighSad = 4
+} SOURCE_SAD;
+
+typedef struct {
+  //! SAD levels in non-rd path
+  SOURCE_SAD source_sad_nonrd;
+  //! SAD levels in rd-path for var-based part qindex thresholds
+  SOURCE_SAD source_sad_rd;
+  int lighting_change;
+  int low_sumdiff;
+} CONTENT_STATE_SB;
+
+// Structure to hold pixel level gradient info.
+typedef struct {
+  uint16_t abs_dx_abs_dy_sum;
+  int8_t hist_bin_idx;
+  bool is_dx_zero;
+} PixelLevelGradientInfo;
+
+// Structure to hold the variance and log(1 + variance) for 4x4 sub-blocks.
+typedef struct {
+  double log_var;
+  int var;
+} Block4x4VarInfo;
+
+#ifndef NDEBUG
+typedef struct SetOffsetsLoc {
+  int mi_row;
+  int mi_col;
+  BLOCK_SIZE bsize;
+} SetOffsetsLoc;
+#endif  // NDEBUG
+
+/*!\endcond */
+
+/*!
\brief Encoder's parameters related to the current coding block. + * + * This struct contains most of the information the encoder needs to encode the + * current coding block. This includes the src and pred buffers, a copy of the + * decoder's view of the current block, and the txfm coefficients. This struct + * also contains various buffers and data used to speed up the encoding + * process. + */ +typedef struct macroblock { + /***************************************************************************** + * \name Source, Buffers and Decoder + ****************************************************************************/ + /**@{*/ + /*! \brief Each of the encoding planes. + * + * An array holding the src buffer for each plane of the current block. It + * also contains the txfm and quantized txfm coefficients. + */ + struct macroblock_plane plane[MAX_MB_PLANE]; + + /*! \brief Decoder's view of current coding block. + * + * Contains the encoder's copy of what the decoder sees in the current block. + * Most importantly, this struct contains pointers to the mbmi that is used in + * final bitstream packing. + */ + MACROBLOCKD e_mbd; + + /*! \brief Derived coding information. + * + * Contains extra information that is not transmitted in the bitstream but is + * derived. For example, this contains the stack of ref_mvs. + */ + MB_MODE_INFO_EXT mbmi_ext; + + /*! \brief Finalized mbmi_ext for the whole frame. + * + * Contains the finalized info in mbmi_ext that gets used at the frame level + * for bitstream packing. + */ + MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame; + + //! Entropy context for the current row. + FRAME_CONTEXT *row_ctx; + /*! \brief Entropy context for the current tile. + * + * This context will be used to update the color_map_cdf pointer, which is + * used during bitstream packing. For the single-thread and + * tile-multithreading cases this pointer is the same as xd->tile_ctx, but in + * the row-mt case xd->tile_ctx points to a temporary context while + * tile_pb_ctx points to the accurate tile context. + */ + FRAME_CONTEXT *tile_pb_ctx; + + /*! \brief Buffer of transformed coefficients + * + * Points to cb_coef_buff in the AV1_COMP struct, which contains the finalized + * coefficients. This is here to conveniently copy the best coefficients to + * frame level for bitstream packing. Since CB_COEFF_BUFFER is allocated on a + * superblock level, we need to combine it with cb_offset to get the proper + * position for the current coding block. + */ + CB_COEFF_BUFFER *cb_coef_buff; + //! Offset of current coding block's coeff buffer relative to the sb. + uint16_t cb_offset[PLANE_TYPES]; + + //! Modified source and masks used for fast OBMC search. + OBMCBuffer obmc_buffer; + //! Buffer to store the best palette map. + PALETTE_BUFFER *palette_buffer; + //! Buffer used for compound_type_rd(). + CompoundTypeRdBuffers comp_rd_buffer; + //! Buffer to store the convolution result during averaging in compound mode. + CONV_BUF_TYPE *tmp_conv_dst; + + /*! \brief Temporary buffer to hold prediction. + * + * Points to a buffer that is used to hold temporary prediction results. This + * is used in two ways: + * - This is a temporary buffer used to ping-pong the prediction in + * handle_inter_mode. + * - xd->tmp_obmc_bufs also points to this buffer, and is used in obmc + * prediction.
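+ * + * (Editorial sketch, not upstream code: the ping-pong amounts to alternating + * the destination between the two halves, e.g. dst = tmp_pred_bufs[cand & 1] + * with a hypothetical candidate counter `cand`, so the best prediction found + * so far is never overwritten while the next candidate is being built.)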
+ */ + uint8_t *tmp_pred_bufs[2]; + /**@}*/ + + /***************************************************************************** + * \name Rdopt Costs + ****************************************************************************/ + /**@{*/ + /*! \brief Quantization index for the current partition block. + * + * This is used as the index to find the quantization parameters for the luma + * and chroma transformed coefficients. + */ + int qindex; + + /*! \brief Difference between frame-level qindex and current qindex. + * + * This is used to track whether a non-zero delta for qindex is used at least + * once in the current frame. + */ + int delta_qindex; + + /*! \brief Difference between frame-level qindex and qindex used to + * compute rdmult (lambda). + * + * rdmult_delta_qindex is assigned the same value as delta_qindex before the + * qp sweep. During the qp sweep, delta_qindex is changed and used to + * calculate the actual quant params, while rdmult_delta_qindex remains the + * same, and is used to calculate the rdmult in "set_deltaq_rdmult". + */ + int rdmult_delta_qindex; + + /*! \brief Current qindex (before being adjusted by delta_q_res) used to + * derive rdmult_delta_qindex. + */ + int rdmult_cur_qindex; + + /*! \brief Rate-distortion multiplier. + * + * The rd multiplier used to determine the rate-distortion trade-off. This is + * roughly proportional to the inverse of q-index for a given frame, but this + * can be manipulated for better rate-control. For example, in tune_ssim + * mode, this is scaled by a factor related to the variance of the current + * block. + */ + int rdmult; + + //! Intra only, per sb rd adjustment. + int intra_sb_rdmult_modifier; + + //! Superblock level distortion propagation factor. + double rb; + + //! Energy in the current source coding block. Used to calculate \ref rdmult + int mb_energy; + //! Energy in the current source superblock. Used to calculate \ref rdmult + int sb_energy_level; + + //! The rate needed to signal a mode to the bitstream. + ModeCosts mode_costs; + + //! The rate needed to encode a new motion vector to the bitstream and some + //! multipliers for motion search. + MvCosts *mv_costs; + + /*! The rate needed to encode a new motion vector to the bitstream in intrabc + * mode. + */ + IntraBCMVCosts *dv_costs; + + //! The rate needed to signal the txfm coefficients to the bitstream. + CoeffCosts coeff_costs; + /**@}*/ + + /***************************************************************************** + * \name Rate to Distortion Multipliers + ****************************************************************************/ + /**@{*/ + //! A multiplier that converts mv cost to l2 error. + int errorperbit; + //! A multiplier that converts mv cost to l1 error. + int sadperbit; + /**@}*/ + + /****************************************************************************** + * \name Segmentation + *****************************************************************************/ + /**@{*/ + /*! \brief Skip mode for the segment + * + * A syntax element of the segmentation mode. In skip_block mode, all mvs are + * set to 0 and all txfms are skipped. + */ + int seg_skip_block; + + /*! \brief Number of segment 1 blocks + * Actual number of (4x4) blocks to which delta-q was applied, + * for segment 1. + */ + int actual_num_seg1_blocks; + + /*!\brief Number of segment 2 blocks + * Actual number of (4x4) blocks to which delta-q was applied, + * for segment 2.
+ */ + int actual_num_seg2_blocks; + + /*!\brief Number of zero motion vectors + */ + int cnt_zeromv; + + /*!\brief Flag to force zeromv-skip at superblock level, for nonrd path. + * + * 0/1 imply zeromv-skip is disabled/enabled. 2 implies that the blocks + * in the superblock may be marked as zeromv-skip at block level. + */ + int force_zeromv_skip_for_sb; + + /*!\brief Flag to force zeromv-skip at block level, for nonrd path. + */ + int force_zeromv_skip_for_blk; + + /*! \brief Previous segment id for which qmatrices were updated. + * This is used to bypass setting of qmatrices if there is no change in + * qindex. + */ + int prev_segment_id; + /**@}*/ + + /***************************************************************************** + * \name Superblock + ****************************************************************************/ + /**@{*/ + //! Information on a whole superblock level. + // TODO(chiyotsai@google.com): Refactor this out of macroblock + SuperBlockEnc sb_enc; + + /*! \brief Characteristics of the current superblock. + * + * Characteristics like whether the block has high sad, low sad, etc. This is + * only used by av1 realtime mode. + */ + CONTENT_STATE_SB content_state_sb; + /**@}*/ + + /***************************************************************************** + * \name Reference Frame Search + ****************************************************************************/ + /**@{*/ + /*! \brief Sum absolute distortion of the predicted mv for each ref frame. + * + * This is used to measure how viable a reference frame is. + */ + int pred_mv_sad[REF_FRAMES]; + /*! \brief The minimum of \ref pred_mv_sad. + * + * Index 0 stores the minimum \ref pred_mv_sad across past reference frames. + * Index 1 stores the minimum \ref pred_mv_sad across future reference frames. + */ + int best_pred_mv_sad[2]; + //! The sad of the 1st mv ref (nearest). + int pred_mv0_sad[REF_FRAMES]; + //! The sad of the 2nd mv ref (near). + int pred_mv1_sad[REF_FRAMES]; + + /*! \brief Disables certain ref frame pruning based on tpl. + * + * Determines whether a given ref frame is "good" based on data from the TPL + * model. If so, this stops the selective ref frame search from pruning the + * given ref frame at the block level. + */ + uint8_t tpl_keep_ref_frame[REF_FRAMES]; + + /*! \brief Warp motion samples buffer. + * + * Stores the motion samples used for warp motion. + */ + WARP_SAMPLE_INFO warp_sample_info[REF_FRAMES]; + + /*! \brief Reference frames picked by the square subblocks in a superblock. + * + * Keeps track of ref frames that are selected by square partition blocks + * within a superblock, in MI resolution. They can be used to prune ref frames + * for rectangular blocks. + */ + int picked_ref_frames_mask[MAX_MIB_SIZE * MAX_MIB_SIZE]; + + /*! \brief Prune ref frames in real-time mode. + * + * Determines whether to prune reference frames in real-time mode. For the + * most part, this is the same as nonrd_prune_ref_frame_search in + * cpi->sf.rt_sf.nonrd_prune_ref_frame_search, but this can be selectively + * turned off if the only frame available is GOLDEN_FRAME. + */ + int nonrd_prune_ref_frame_search; + /**@}*/ + + /***************************************************************************** + * \name Partition Search + ****************************************************************************/ + /**@{*/ + //! Stores some partition-search related buffers. + PartitionSearchInfo part_search_info; + + /*! \brief Whether to disable some features to force a mode in the current + * block.
+ * + * In some cases, our speed features can be overly aggressive and prune away + * all the modes searched in the superblock. When this happens, we set + * must_find_valid_partition to 1 to reduce the number of speed features, and + * recode the superblock again. + */ + int must_find_valid_partition; + /**@}*/ + + /***************************************************************************** + * \name Prediction Mode Search + ****************************************************************************/ + /**@{*/ + /*! \brief Inter skip mode. + * + * Skip mode tries to use the closest forward and backward references for + * inter prediction. Skip here means to skip transmitting the reference + * frames, not to be confused with skip_txfm. + */ + int skip_mode; + + /*! \brief Factors used for rd-thresholding. + * + * Determines an rd threshold used to decide whether to continue searching + * the current mode. If the current best rd is already <= threshold, then we + * skip the current mode. + */ + int thresh_freq_fact[BLOCK_SIZES_ALL][MAX_MODES]; + + /*! \brief Tracks the winner modes in the current coding block. + * + * Winner mode is a two-pass strategy to find the best prediction mode. In the + * first pass, we search the prediction modes with a limited set of txfm + * options, and keep the top modes. These modes are called the winner modes. + * In the second pass, we retry the winner modes with more thorough txfm + * options. + */ + WinnerModeStats *winner_mode_stats; + //! Tracks how many winner modes there are. + int winner_mode_count; + + /*! \brief The model used for rd-estimation to avoid a full txfm search. + * + * These are for inter_mode_rd_model_estimation, which is another two-pass + * approach. In this speed feature, we collect data in the first couple of + * frames to build an rd model that estimates the rdcost of a prediction mode + * based on the residue error. Once enough data is collected, this speed + * feature uses the estimated rdcost to find the most performant prediction + * mode. Then we follow up with a second pass to find the best transform for + * the mode. Determines whether to use the reduced-complexity transform block + * search model to select prediction modes, or the full-complexity model to + * select the transform kernel. + */ + TXFM_RD_MODEL rd_model; + + /*! \brief Stores the inter mode information needed to build an rd model. + * + * These are for inter_mode_rd_model_estimation, which is another two-pass + * approach. In this speed feature, we collect data in the first couple of + * frames to build an rd model that estimates the rdcost of a prediction mode + * based on the residue error. Once enough data is collected, this speed + * feature uses the estimated rdcost to find the most performant prediction + * mode. Then we follow up with a second pass to find the best transform for + * the mode. + */ + // TODO(any): try to consolidate this speed feature with winner mode + // processing. + struct inter_modes_info *inter_modes_info; + + //! How to blend the compound predictions. + uint8_t compound_idx; + + //! A cache of compound type search results so they can be reused later. + COMP_RD_STATS comp_rd_stats[MAX_COMP_RD_STATS]; + //! The idx for the latest compound mode in the cache \ref comp_rd_stats. + int comp_rd_stats_idx; + + /*! \brief Whether to recompute the luma prediction. + * + * In interpolation search, we can usually skip recalculating the luma + * prediction because it is already calculated by a previous predictor.
This + * flag signifies that some modes might have been skipped, so we need to + * rebuild the prediction. + */ + int recalc_luma_mc_data; + + /*! \brief Data structure to speed up intrabc search. + * + * Contains the hash table, hash function, and buffer used for intrabc. + */ + IntraBCHashInfo intrabc_hash_info; + + /*! \brief Whether to reuse the mode stored in mb_mode_cache. */ + int use_mb_mode_cache; + /*! \brief The mode to reuse during \ref av1_rd_pick_intra_mode_sb and + * \ref av1_rd_pick_inter_mode. */ + const MB_MODE_INFO *mb_mode_cache; + /*! \brief Pointer to the buffer which caches gradient information. + * + * Pointer to the array of structures to store gradient information of each + * pixel in a superblock. The buffer consists of MAX_SB_SQUARE pixel-level + * structures for each of the plane types (PLANE_TYPE_Y and PLANE_TYPE_UV). + */ + PixelLevelGradientInfo *pixel_gradient_info; + /*! \brief Flags indicating the availability of cached gradient info. */ + bool is_sb_gradient_cached[PLANE_TYPES]; + + /*! \brief Flag to reuse predicted samples of inter block. */ + bool reuse_inter_pred; + /**@}*/ + + /***************************************************************************** + * \name MV Search + ****************************************************************************/ + /**@{*/ + /*! \brief Context used to determine the initial step size in motion search. + * + * This context is defined as the \f$l_\infty\f$ norm of the best ref_mvs for + * each frame. + */ + unsigned int max_mv_context[REF_FRAMES]; + + /*! \brief Limit for the range of motion vectors. + * + * These define limits to motion vector components to prevent them from + * extending outside the UMV borders. + */ + FullMvLimits mv_limits; + + /*! \brief Buffer for storing the search site config. + * + * When resize mode or super resolution mode is on, the stride of the + * reference frame does not always match what's specified in \ref + * MotionVectorSearchParams::search_site_cfg. When this happens, we update the + * search_site_config buffer here and use it for motion search. + */ + search_site_config search_site_cfg_buf[NUM_DISTINCT_SEARCH_METHODS]; + /**@}*/ + + /***************************************************************************** + * \name Txfm Search + ****************************************************************************/ + /**@{*/ + /*! \brief Parameters that control how the transform search is done. + * + * Stores various txfm search related parameters such as txfm_type, txfm_size, + * trellis eob search, etc. + */ + TxfmSearchParams txfm_search_params; + + /*! \brief Results of the txfm searches that have been done. + * + * Caches old txfm search results and keeps the current txfm decisions to + * facilitate rdopt. + */ + TxfmSearchInfo txfm_search_info; + + /*! \brief Whether there is a strong color activity. + * + * Used in REALTIME coding mode to enhance the visual quality at the boundary + * of moving color objects. + */ + uint8_t color_sensitivity_sb[MAX_MB_PLANE - 1]; + //! Color sensitivity flag for the superblock for golden reference. + uint8_t color_sensitivity_sb_g[MAX_MB_PLANE - 1]; + //! Color sensitivity flag for the superblock for altref reference. + uint8_t color_sensitivity_sb_alt[MAX_MB_PLANE - 1]; + //! Color sensitivity flag for the coding block. + uint8_t color_sensitivity[MAX_MB_PLANE - 1]; + //! Coding block distortion value for uv/color, minimum over the inter modes. + int64_t min_dist_inter_uv; + + //!
The buffer used by search_tx_type() to swap dqcoeff in macroblockd_plane + // so we can keep the dqcoeff of the best tx_type. + tran_low_t *dqcoeff_buf; + /**@}*/ + + /***************************************************************************** + * \name Misc + ****************************************************************************/ + /**@{*/ + //! Variance of the source frame. + unsigned int source_variance; + //! Flag to indicate coding block is zero sad. + int block_is_zero_sad; + //! Flag to indicate superblock ME in variance partition is determined to be + // good/reliable, and so the superblock MV will be tested in the + // nonrd_pickmode. This is only used for LAST_FRAME. + int sb_me_partition; + //! Flag to indicate to test the superblock MV for the coding block in the + // nonrd_pickmode. + int sb_me_block; + //! Superblock MV derived from int_pro_motion() in the variance + // partitioning. + int_mv sb_me_mv; + //! SSE of the current predictor. + unsigned int pred_sse[REF_FRAMES]; + //! Prediction for ML based partition. +#if CONFIG_RT_ML_PARTITIONING + DECLARE_ALIGNED(16, uint8_t, est_pred[128 * 128]); +#endif + /**@}*/ + + /*! \brief NONE partition evaluated for merge. + * + * In the variance based partitioning scheme, NONE & SPLIT partitions are + * evaluated to check whether the SPLIT partition can be merged into NONE. + * This flag signifies that the partition has been evaluated in the scheme. + */ + int try_merge_partition; + + /*! \brief Pointer to buffer which caches sub-block variances in a superblock. + * + * Pointer to the array of structures to store source variance information of + * each 4x4 sub-block in a superblock. The Block4x4VarInfo structure is used + * to store the source variance and the log of the source variance of each + * 4x4 sub-block. + */ + Block4x4VarInfo *src_var_info_of_4x4_sub_blocks; +#ifndef NDEBUG + /*! \brief A record to make sure av1_set_offsets is called */ + SetOffsetsLoc last_set_offsets_loc; +#endif // NDEBUG + +#if COLLECT_NONRD_PICK_MODE_STAT + mode_search_stat_nonrd ms_stat_nonrd; +#endif // COLLECT_NONRD_PICK_MODE_STAT + + /*!\brief Number of pixels in current thread that choose palette mode in the + * fast encoding stage for screen content tool determination. + */ + int palette_pixels; + + /*!\brief Pointer to the structure which stores the statistics used by + * sb-level multi-pass encoding. + */ + struct SB_FIRST_PASS_STATS *sb_stats_cache; + + /*!\brief Pointer to the structure which stores the statistics used by + * first-pass when superblock is searched twice consecutively. + */ + struct SB_FIRST_PASS_STATS *sb_fp_stats; + +#if CONFIG_PARTITION_SEARCH_ORDER + /*!\brief Pointer to RD_STATS structure to be used in + * av1_rd_partition_search(). + */ + RD_STATS *rdcost; +#endif // CONFIG_PARTITION_SEARCH_ORDER +} MACROBLOCK; +#undef SINGLE_REF_MODES + +/*!\cond */ +// Zeroes out 'n_stats' elements in the array x->winner_mode_stats. +// It only zeroes out what is necessary in 'color_index_map' (just the block +// size, not the whole array). +static INLINE void zero_winner_mode_stats(BLOCK_SIZE bsize, int n_stats, + WinnerModeStats *stats) { + // When winner mode stats are not required, the memory allocation is avoided + // for x->winner_mode_stats. The stats pointer will be NULL in such cases.
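+ // Editorial note: the reset below is done field by field because only the + // first block_width * block_height entries of color_index_map need to be + // cleared; memset-ing the whole array for every candidate is CPU intensive.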
+ if (stats == NULL) return; + + const int block_height = block_size_high[bsize]; + const int block_width = block_size_wide[bsize]; + for (int i = 0; i < n_stats; ++i) { + WinnerModeStats *const stat = &stats[i]; + memset(&stat->mbmi, 0, sizeof(stat->mbmi)); + memset(&stat->rd_cost, 0, sizeof(stat->rd_cost)); + memset(&stat->rd, 0, sizeof(stat->rd)); + memset(&stat->rate_y, 0, sizeof(stat->rate_y)); + memset(&stat->rate_uv, 0, sizeof(stat->rate_uv)); + // Do not reset the whole array as it is CPU intensive. + memset(&stat->color_index_map, 0, + block_width * block_height * sizeof(stat->color_index_map[0])); + memset(&stat->mode_index, 0, sizeof(stat->mode_index)); + } +} + +static INLINE int is_rect_tx_allowed_bsize(BLOCK_SIZE bsize) { + static const char LUT[BLOCK_SIZES_ALL] = { + 0, // BLOCK_4X4 + 1, // BLOCK_4X8 + 1, // BLOCK_8X4 + 0, // BLOCK_8X8 + 1, // BLOCK_8X16 + 1, // BLOCK_16X8 + 0, // BLOCK_16X16 + 1, // BLOCK_16X32 + 1, // BLOCK_32X16 + 0, // BLOCK_32X32 + 1, // BLOCK_32X64 + 1, // BLOCK_64X32 + 0, // BLOCK_64X64 + 0, // BLOCK_64X128 + 0, // BLOCK_128X64 + 0, // BLOCK_128X128 + 1, // BLOCK_4X16 + 1, // BLOCK_16X4 + 1, // BLOCK_8X32 + 1, // BLOCK_32X8 + 1, // BLOCK_16X64 + 1, // BLOCK_64X16 + }; + + return LUT[bsize]; +} + +static INLINE int is_rect_tx_allowed(const MACROBLOCKD *xd, + const MB_MODE_INFO *mbmi) { + return is_rect_tx_allowed_bsize(mbmi->bsize) && + !xd->lossless[mbmi->segment_id]; +} + +static INLINE int tx_size_to_depth(TX_SIZE tx_size, BLOCK_SIZE bsize) { + TX_SIZE ctx_size = max_txsize_rect_lookup[bsize]; + int depth = 0; + while (tx_size != ctx_size) { + depth++; + ctx_size = sub_tx_size_map[ctx_size]; + assert(depth <= MAX_TX_DEPTH); + } + return depth; +} + +static INLINE void set_blk_skip(uint8_t txb_skip[], int plane, int blk_idx, + int skip) { + if (skip) + txb_skip[blk_idx] |= 1UL << plane; + else + txb_skip[blk_idx] &= ~(1UL << plane); +#ifndef NDEBUG + // Set chroma planes to uninitialized states when luma is set to check if + // it will be set later + if (plane == 0) { + txb_skip[blk_idx] |= 1UL << (1 + 4); + txb_skip[blk_idx] |= 1UL << (2 + 4); + } + + // Clear the initialization checking bit + txb_skip[blk_idx] &= ~(1UL << (plane + 4)); +#endif +} + +static INLINE int is_blk_skip(uint8_t *txb_skip, int plane, int blk_idx) { +#ifndef NDEBUG + // Check if this is initialized + assert(!(txb_skip[blk_idx] & (1UL << (plane + 4)))); + + // The magic number is 0x77, this is to test if there is garbage data + assert((txb_skip[blk_idx] & 0x88) == 0); +#endif + return (txb_skip[blk_idx] >> plane) & 1; +} + +/*!\endcond */ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_BLOCK_H_ diff --git a/third_party/aom/av1/encoder/blockiness.c b/third_party/aom/av1/encoder/blockiness.c new file mode 100644 index 0000000000..6ad2ddaf25 --- /dev/null +++ b/third_party/aom/av1/encoder/blockiness.c @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "config/av1_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "av1/common/common.h" +#include "av1/common/filter.h" +#include "aom/aom_integer.h" +#include "aom_dsp/aom_filter.h" +#include "aom_ports/mem.h" + +static int horizontal_filter(const uint8_t *s) { + return (s[1] - s[-2]) * 2 + (s[-1] - s[0]) * 6; +} + +static int vertical_filter(const uint8_t *s, int p) { + return (s[p] - s[-2 * p]) * 2 + (s[-p] - s[0]) * 6; +} + +static int variance(int sum, int sum_squared, int size) { + return sum_squared / size - (sum / size) * (sum / size); +} +// Calculate a blockiness level for a vertical block edge. +// This function returns a new blockiness metric that's defined as + +// p0 p1 p2 p3 +// q0 q1 q2 q3 +// block edge -> +// r0 r1 r2 r3 +// s0 s1 s2 s3 + +// blockiness = p0*-2+q0*6+r0*-6+s0*2 + +// p1*-2+q1*6+r1*-6+s1*2 + +// p2*-2+q2*6+r2*-6+s2*2 + +// p3*-2+q3*6+r3*-6+s3*2 ; + +// reconstructed_blockiness = max(blockiness from reconstructed buffer - +// blockiness from source buffer, 0) +// +// I make the assumption that flat blocks are much more visible than high +// contrast blocks. As such, I scale the result of the blockiness calc +// by dividing the blockiness by the variance of the pixels on either side +// of the edge as follows: +// var_0 = (q0^2+q1^2+q2^2+q3^2)/4 - ((q0 + q1 + q2 + q3) / 4 )^2 +// var_1 = (r0^2+r1^2+r2^2+r3^2)/4 - ((r0 + r1 + r2 + r3) / 4 )^2 +// The returned blockiness is the scaled value: +// reconstructed blockiness / ( 1 + var_0 + var_1 ) ; +static int blockiness_vertical(const uint8_t *s, int sp, const uint8_t *r, + int rp, int size) { + int s_blockiness = 0; + int r_blockiness = 0; + int sum_0 = 0; + int sum_sq_0 = 0; + int sum_1 = 0; + int sum_sq_1 = 0; + int i; + int var_0; + int var_1; + for (i = 0; i < size; ++i, s += sp, r += rp) { + s_blockiness += horizontal_filter(s); + r_blockiness += horizontal_filter(r); + sum_0 += s[0]; + sum_sq_0 += s[0] * s[0]; + sum_1 += s[-1]; + sum_sq_1 += s[-1] * s[-1]; + } + var_0 = variance(sum_0, sum_sq_0, size); + var_1 = variance(sum_1, sum_sq_1, size); + r_blockiness = abs(r_blockiness); + s_blockiness = abs(s_blockiness); + + if (r_blockiness > s_blockiness) + return (r_blockiness - s_blockiness) / (1 + var_0 + var_1); + else + return 0; +} + +// Calculate a blockiness level for a horizontal block edge, +// same as above. +static int blockiness_horizontal(const uint8_t *s, int sp, const uint8_t *r, + int rp, int size) { + int s_blockiness = 0; + int r_blockiness = 0; + int sum_0 = 0; + int sum_sq_0 = 0; + int sum_1 = 0; + int sum_sq_1 = 0; + int i; + int var_0; + int var_1; + for (i = 0; i < size; ++i, ++s, ++r) { + s_blockiness += vertical_filter(s, sp); + r_blockiness += vertical_filter(r, rp); + sum_0 += s[0]; + sum_sq_0 += s[0] * s[0]; + sum_1 += s[-sp]; + sum_sq_1 += s[-sp] * s[-sp]; + } + var_0 = variance(sum_0, sum_sq_0, size); + var_1 = variance(sum_1, sum_sq_1, size); + r_blockiness = abs(r_blockiness); + s_blockiness = abs(s_blockiness); + + if (r_blockiness > s_blockiness) + return (r_blockiness - s_blockiness) / (1 + var_0 + var_1); + else + return 0; +} + +// This function returns the blockiness for the entire frame, currently by +// looking at all block borders in steps of 4 pixels.
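+// Editorial usage sketch (hypothetical src_y/recon_y buffers, not upstream +// code): double b = av1_get_blockiness(src_y, src_stride, recon_y, +// recon_stride, width, height); the accumulated sum is normalized by the +// number of 4x4 blocks, width * height / 16.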
+double av1_get_blockiness(const unsigned char *img1, int img1_pitch, + const unsigned char *img2, int img2_pitch, int width, + int height) { + double blockiness = 0; + int i, j; + for (i = 0; i < height; + i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) { + for (j = 0; j < width; j += 4) { + if (i > 0 && i < height && j > 0 && j < width) { + blockiness += + blockiness_vertical(img1 + j, img1_pitch, img2 + j, img2_pitch, 4); + blockiness += blockiness_horizontal(img1 + j, img1_pitch, img2 + j, + img2_pitch, 4); + } + } + } + blockiness /= width * height / 16; + return blockiness; +} diff --git a/third_party/aom/av1/encoder/cnn.c b/third_party/aom/av1/encoder/cnn.c new file mode 100644 index 0000000000..598b362753 --- /dev/null +++ b/third_party/aom/av1/encoder/cnn.c @@ -0,0 +1,1189 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <math.h> +#include <stdbool.h> + +#include "aom_dsp/aom_dsp_common.h" +#include "av1/common/av1_common_int.h" +#include "av1/encoder/cnn.h" + +#define CLAMPINDEX(a, hi) ((a) < 0 ? 0 : ((a) >= (hi) ? ((hi)-1) : (a))) + +typedef struct { + const float **input; + int in_width; + int in_height; + int in_stride; + const CNN_LAYER_CONFIG *layer_config; + float **output; + int out_stride; + int start_idx; + int th_step; +} CONVOLVE_OPS; + +static INLINE float softsign(float x) { return x / (fabsf(x) + 1.0f); } + +static INLINE float relu(float x) { return (x < 0) ?
0 : x; } + +typedef struct { + int allocsize; + int channels; + int width, height, stride; + float *buf[CNN_MAX_CHANNELS]; +} TENSOR; + +static void init_tensor(TENSOR *tensor) { memset(tensor, 0, sizeof(*tensor)); } + +static void free_tensor(TENSOR *tensor) { + if (tensor->allocsize) { + aom_free(tensor->buf[0]); + tensor->buf[0] = NULL; + tensor->allocsize = 0; + } +} + +static bool realloc_tensor(TENSOR *tensor, int channels, int width, + int height) { + const int newallocsize = channels * width * height; + if (tensor->allocsize < newallocsize) { + free_tensor(tensor); + tensor->buf[0] = + (float *)aom_malloc(sizeof(*tensor->buf[0]) * newallocsize); + if (!tensor->buf[0]) return false; + tensor->allocsize = newallocsize; + } + tensor->width = width; + tensor->height = height; + tensor->stride = width; + tensor->channels = channels; + for (int c = 1; c < channels; ++c) + tensor->buf[c] = &tensor->buf[0][c * width * height]; + return true; +} + +static void copy_tensor(const TENSOR *src, int copy_channels, int dst_offset, + TENSOR *dst) { + assert(src->width == dst->width); + assert(src->height == dst->height); + assert(copy_channels <= src->channels); + if (src->stride == dst->width && dst->stride == dst->width) { + for (int c = 0; c < copy_channels; ++c) { + memcpy(dst->buf[dst_offset + c], src->buf[c], + sizeof(*dst->buf[0]) * src->width * src->height); + } + } else { + for (int c = 0; c < copy_channels; ++c) { + for (int r = 0; r < dst->height; ++r) { + memcpy(&dst->buf[dst_offset + c][r * dst->stride], + &src->buf[c][r * src->stride], + dst->width * sizeof(*dst->buf[c])); + } + } + } +} + +static void assign_tensor(TENSOR *tensor, float *buf[CNN_MAX_CHANNELS], + int channels, int width, int height, int stride) { + tensor->allocsize = 0; + tensor->channels = channels; + tensor->width = width; + tensor->height = height; + tensor->stride = stride; + if (buf) { + for (int c = 0; c < channels; ++c) tensor->buf[c] = buf[c]; + } else { + for (int c = 0; c < channels; ++c) tensor->buf[c] = NULL; + } +} + +static void swap_tensor(TENSOR *t1, TENSOR *t2) { + TENSOR t = *t1; + *t1 = *t2; + *t2 = t; +} + +// The concatenated tensor goes into dst with first the channels in +// original dst followed by the channels in the src +static bool concat_tensor(const TENSOR *src, TENSOR *dst) { + assert(src->width == dst->width); + assert(src->height == dst->height); + + const int dst_channels = dst->channels; + const int channels = dst->channels + src->channels; + const int newallocsize = channels * dst->width * dst->height; + if (dst->allocsize < newallocsize) { + TENSOR t; + init_tensor(&t); + // allocate new buffers and copy first the dst channels + if (!realloc_tensor(&t, channels, dst->width, dst->height)) return false; + copy_tensor(dst, dst->channels, 0, &t); + // Swap the tensors and free the old buffers + swap_tensor(dst, &t); + free_tensor(&t); + } + for (int c = 1; c < channels; ++c) + dst->buf[c] = &dst->buf[0][c * dst->width * dst->height]; + // Copy the channels in src after the first dst_channels channels. 
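+ // Editorial note: the buf[] fan-out above uses the post-concat channel + // count, so dst->buf[dst_channels] onwards already point at the region into + // which the src channels are copied next.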
+ copy_tensor(src, src->channels, dst_channels, dst); + return true; +} + +int check_tensor_equal_dims(TENSOR *t1, TENSOR *t2) { + return (t1->width == t2->width && t1->height == t2->height); +} + +int check_tensor_equal_size(TENSOR *t1, TENSOR *t2) { + return (t1->channels == t2->channels && t1->width == t2->width && + t1->height == t2->height); +} + +void av1_find_cnn_layer_output_size(int in_width, int in_height, + const CNN_LAYER_CONFIG *layer_config, + int *out_width, int *out_height) { + assert(layer_config->skip_width > 0); + assert(layer_config->skip_height > 0); + if (!layer_config->deconvolve) { + switch (layer_config->pad) { + case PADDING_SAME_ZERO: + case PADDING_SAME_REPLICATE: + *out_width = (in_width + layer_config->skip_width - 1) / + layer_config->skip_width; + *out_height = (in_height + layer_config->skip_height - 1) / + layer_config->skip_height; + break; + case PADDING_VALID: + *out_width = + (in_width - layer_config->filter_width + layer_config->skip_width) / + layer_config->skip_width; + *out_height = (in_height - layer_config->filter_height + + layer_config->skip_height) / + layer_config->skip_height; + break; + default: assert(0 && "Unknown padding type"); + } + } else { + switch (layer_config->pad) { + case PADDING_SAME_ZERO: + case PADDING_SAME_REPLICATE: + *out_width = in_width * layer_config->skip_width; + *out_height = in_height * layer_config->skip_height; + break; + case PADDING_VALID: + *out_width = (in_width - 1) * layer_config->skip_width + + layer_config->filter_width; + *out_height = (in_height - 1) * layer_config->skip_height + + layer_config->filter_height; + break; + default: assert(0 && "Unknown padding type"); + } + } +} + +void find_cnn_out_channels(const CNN_LAYER_CONFIG *layer_config, + int channels_per_branch[]) { + int branch = layer_config->branch; + const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config; + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + if ((branch_config->input_to_branches & (1 << b)) && b != branch) { + if (layer_config->branch_copy_type == BRANCH_INPUT) { + channels_per_branch[b] = layer_config->in_channels; + } else if (layer_config->branch_copy_type == BRANCH_OUTPUT) { + channels_per_branch[b] = layer_config->out_channels; + } else if (layer_config->branch_copy_type == BRANCH_COMBINED) { + channels_per_branch[b] = layer_config->out_channels; + for (int c = 0; c < CNN_MAX_BRANCHES; ++c) { + if ((branch_config->branches_to_combine & (1 << c)) && c != branch) { + assert(channels_per_branch[c] > 0); + channels_per_branch[b] += channels_per_branch[c]; + } + } + } + } + } + channels_per_branch[branch] = layer_config->out_channels; + for (int c = 0; c < CNN_MAX_BRANCHES; ++c) { + if ((branch_config->branches_to_combine & (1 << c)) && c != branch) { + assert(channels_per_branch[c] > 0); + channels_per_branch[branch] += channels_per_branch[c]; + } + } +} + +#if CONFIG_DEBUG +static INLINE int cnn_has_at_least_one_output(const CNN_CONFIG *cnn_config) { + const int num_layers = cnn_config->num_layers; + const CNN_LAYER_CONFIG *layer_configs = cnn_config->layer_config; + + for (int idx = 0; idx < num_layers; idx++) { + if (layer_configs[idx].output_num != -1) { + return 1; + } + } + return 0; +} +#endif + +void av1_find_cnn_output_size(int in_width, int in_height, + const CNN_CONFIG *cnn_config, int *out_width, + int *out_height, int *out_channels) { + int channels_per_branch[CNN_MAX_BRANCHES] = { 0 }; + int i_width[CNN_MAX_BRANCHES] = { 0 }; + int i_height[CNN_MAX_BRANCHES] = { 0 }; + i_width[0] = in_width + 
cnn_config->ext_width * 2; + i_height[0] = in_height + cnn_config->ext_height * 2; + +#if CONFIG_DEBUG + assert(cnn_has_at_least_one_output(cnn_config)); +#endif + + for (int i = 0; i < cnn_config->num_layers; ++i) { + const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[i]; + const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config; + const int branch = layer_config->branch; + int o_width = 0, o_height = 0; + + if (layer_config->branch_copy_type == BRANCH_INPUT) { + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + if ((branch_config->input_to_branches & (1 << b)) && b != branch) { + assert(i_width[branch] > 0 && i_height[branch] > 0); + i_width[b] = i_width[branch]; + i_height[b] = i_height[branch]; + } + } + } + + av1_find_cnn_layer_output_size(i_width[branch], i_height[branch], + layer_config, &o_width, &o_height); + i_width[branch] = o_width; + i_height[branch] = o_height; + + if (layer_config->branch_copy_type == BRANCH_OUTPUT) { + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + if ((branch_config->input_to_branches & (1 << b)) && b != branch) { + i_width[b] = o_width; + i_height[b] = o_height; + } + } + } + + find_cnn_out_channels(layer_config, channels_per_branch); + + const int output_num = layer_config->output_num; + if (output_num != -1) { // Current layer is an output layer + out_width[output_num] = o_width; + out_height[output_num] = o_height; + out_channels[output_num] = channels_per_branch[layer_config->branch]; + } + } +} + +static INLINE int get_start_shift_convolve(int width, int filt_width, + int stride) { + const int mod = (width % stride); + const int filt_off = (filt_width - 1) / 2; + const int dif = (mod ? mod - 1 : stride - 1); + return AOMMIN((dif + (filt_width % 2)) / 2, filt_off); +} + +void av1_cnn_add_c(float **output, int channels, int width, int height, + int stride, const float **add) { + for (int c = 0; c < channels; ++c) { + for (int i = 0; i < height; ++i) + for (int j = 0; j < width; ++j) + output[c][i * stride + j] += add[c][i * stride + j]; + } +} + +void av1_cnn_activate_c(float **output, int channels, int width, int height, + int stride, ACTIVATION layer_activation) { + if (layer_activation == RELU) { + for (int c = 0; c < channels; ++c) { + for (int i = 0; i < height; ++i) + for (int j = 0; j < width; ++j) + output[c][i * stride + j] = relu(output[c][i * stride + j]); + } + } else if (layer_activation == SOFTSIGN) { + for (int c = 0; c < channels; ++c) { + for (int i = 0; i < height; ++i) + for (int j = 0; j < width; ++j) + output[c][i * stride + j] = softsign(output[c][i * stride + j]); + } + } else if (layer_activation == SIGMOID) { + assert(0 && "Sigmoid has not been supported in CNN."); // TODO + } else if (layer_activation != NONE) { + assert(0 && "Unknown activation type"); + } +} + +static bool copy_active_tensor_to_branches(const TENSOR *layer_active_tensor, + const CNN_LAYER_CONFIG *layer_config, + int branch, TENSOR branch_output[]) { + const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config; + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + if ((branch_config->input_to_branches & (1 << b)) && b != branch) { + // Copy the layer's active tensor to the output tensor of branch b if it + // is set in the mask. That output then becomes the input to branch b's + // first layer, since branch b has not yet run any layer of its own. + int copy_channels = branch_config->channels_to_copy > 0 + ?
branch_config->channels_to_copy + : layer_active_tensor->channels; + if (!realloc_tensor(&branch_output[b], copy_channels, + layer_active_tensor->width, + layer_active_tensor->height)) { + return false; + } + copy_tensor(layer_active_tensor, copy_channels, 0, &branch_output[b]); + } + } + return true; +} + +// CNNConvolve specific to maxpool set as 1, either skip_width or skip_height +// greater than 1 and padding equal to PADDING_SAME_ZERO. +static void convolve_maxpool_padding_zero( + const float **input, int in_width, int in_height, int in_stride, + const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride, + const int cstep, const int filter_width_half, + const int filter_height_half) { + for (int i = 0; i < layer_config->out_channels; ++i) { + for (int h = 0, u = 0; h < in_height; h += layer_config->skip_height, ++u) { + for (int w = 0, v = 0; w < in_width; w += layer_config->skip_width, ++v) { + for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height); + ++hh) { + for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width); + ++ww) { + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + int off = k * layer_config->out_channels + i; + for (int l = 0; l < layer_config->filter_height; ++l) { + const int ii = hh + l - filter_height_half; + for (int m = 0; m < layer_config->filter_width; + ++m, off += cstep) { + const int jj = ww + m - filter_width_half; + if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width) + continue; + sum += layer_config->weights[off] * + input[k][ii * in_stride + jj]; + } + } + } + const float a = sum; + if (h == hh && w == ww) + output[i][u * out_stride + v] = a; + else + output[i][u * out_stride + v] = + AOMMAX(output[i][u * out_stride + v], a); + } + } + } + } + } +} + +// CNNConvolve specific to maxpool set as 1, either skip_width or skip_height +// greater than 1 and padding equal to PADDING_SAME_REPLICATE. +static void convolve_maxpool_padding_replicate( + const float **input, int in_width, int in_height, int in_stride, + const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride, + const int cstep, const int filter_width_half, + const int filter_height_half) { + for (int i = 0; i < layer_config->out_channels; ++i) { + for (int h = 0, u = 0; h < in_height; h += layer_config->skip_height, ++u) { + for (int w = 0, v = 0; w < in_width; w += layer_config->skip_width, ++v) { + for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height); + ++hh) { + for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width); + ++ww) { + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + int off = k * layer_config->out_channels + i; + for (int l = 0; l < layer_config->filter_height; ++l) { + const int ii = + CLAMPINDEX(hh + l - filter_height_half, in_height); + for (int m = 0; m < layer_config->filter_width; + ++m, off += cstep) { + const int jj = + CLAMPINDEX(ww + m - filter_width_half, in_width); + assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width); + sum += layer_config->weights[off] * + input[k][ii * in_stride + jj]; + } + } + } + const float a = sum; + if (h == hh && w == ww) + output[i][u * out_stride + v] = a; + else + output[i][u * out_stride + v] = + AOMMAX(output[i][u * out_stride + v], a); + } + } + } + } + } +} + +// CNNConvolve specific to maxpool set as 1, either skip_width or skip_height +// greater than 1 and padding equal to PADDING_VALID. 
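+// Editorial note: with valid padding the filter window must lie entirely +// inside the image, so the loops below only visit h <= in_height - +// filter_height and w <= in_width - filter_width, in steps of +// skip_height/skip_width.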
+static void convolve_maxpool_padding_valid( + const float **input, int in_width, int in_height, int in_stride, + const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride, + const int cstep) { + for (int i = 0; i < layer_config->out_channels; ++i) { + for (int h = 0, u = 0; h < in_height - layer_config->filter_height + 1; + h += layer_config->skip_height, ++u) { + for (int w = 0, v = 0; w < in_width - layer_config->filter_width + 1; + w += layer_config->skip_width, ++v) { + for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height); + ++hh) { + for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width); + ++ww) { + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + int off = k * layer_config->out_channels + i; + for (int l = 0; l < layer_config->filter_height; ++l) { + const int ii = hh + l; + for (int m = 0; m < layer_config->filter_width; + ++m, off += cstep) { + const int jj = ww + m; + assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width); + sum += layer_config->weights[off] * + input[k][ii * in_stride + jj]; + } + } + } + const float a = sum; + if (h == hh && w == ww) + output[i][u * out_stride + v] = a; + else + output[i][u * out_stride + v] = + AOMMAX(output[i][u * out_stride + v], a); + } + } + } + } + } +} + +// CNNConvolve specific to maxpool set as 0 with filter_height and filter_width +// equal to 1. +static void convolve_element_wise(const float **input, int in_width, + int in_height, int in_stride, + const CNN_LAYER_CONFIG *const layer_config, + float **output, int out_stride, int start_idx, + int step) { + const int start_h = get_start_shift_convolve( + in_height, layer_config->filter_height, layer_config->skip_height); + const int start_w = + get_start_shift_convolve(in_width, layer_config->filter_width, + layer_config->skip_width) + + start_idx * layer_config->skip_width; + const int out_w_step = AOMMAX(step, 1); + const int in_w_step = layer_config->skip_width * out_w_step; + for (int i = 0; i < layer_config->out_channels; ++i) { + for (int h = start_h, u = 0; h < in_height; + h += layer_config->skip_height, ++u) { + const int in_h = h * in_stride; + const int out_h = u * out_stride + start_idx; + for (int w = start_w, out_index = out_h; w < in_width; + w += in_w_step, out_index += out_w_step) { + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + sum += layer_config->weights[k * layer_config->out_channels + i] * + input[k][in_h + w]; + } + output[i][out_index] = sum; + } + } + } +} + +// CNNConvolve specific to maxpool set as 0 and padding equal to +// PADDING_SAME_ZERO. +static void convolve_no_maxpool_padding_zero( + const float **input, int in_width, int in_height, int in_stride, + const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride, + int start_idx, const int cstep, const int filter_width_half, + const int filter_height_half, const int ii_shift, const int jj_shift, + const int channel_step) { + const int start_h = get_start_shift_convolve( + in_height, layer_config->filter_height, layer_config->skip_height); + const int start_w = get_start_shift_convolve( + in_width, layer_config->filter_width, layer_config->skip_width); + const int end_ii_shift = filter_height_half + 1; + const int end_jj_shift = filter_width_half + 1; + // *_filter_margin stores the number of pixels along a dimension in the + // intersection of the complement of the image in the extended image + // and the filter. 
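+ // Editorial example: with a 3x3 filter, jj_shift is 1, so for a pixel on + // the left image border (w == 0) left_cstep skips exactly one weight per + // filter row via AOMMAX(0, jj_shift - w) * cstep.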
+ const int top_filter_margin = layer_config->filter_width * ii_shift; + const int right_filter_margin = end_jj_shift - in_width; + for (int i = start_idx; i < layer_config->out_channels; i += channel_step) { + for (int h = start_h, u = 0; h < in_height; + h += layer_config->skip_height, ++u) { + const int out_h = u * out_stride; + const int top_cstep = + AOMMAX(0, top_filter_margin - h * layer_config->filter_width) * + cstep + + i; + const int start_ii = AOMMAX(0, h - ii_shift); + const int end_ii = AOMMIN(in_height, h + end_ii_shift); + for (int w = start_w, out_index = out_h; w < in_width; + w += layer_config->skip_width, ++out_index) { + const int left_cstep = AOMMAX(0, jj_shift - w) * cstep; + const int right_cstep = AOMMAX(0, right_filter_margin + w) * cstep; + const int start_jj = AOMMAX(0, w - jj_shift); + const int end_jj = AOMMIN(in_width, w + end_jj_shift); + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + int off = k * layer_config->out_channels + top_cstep; + for (int ii = start_ii; ii < end_ii; ++ii) { + off += left_cstep; + for (int jj = start_jj; jj < end_jj; ++jj, off += cstep) { + sum += layer_config->weights[off] * input[k][ii * in_stride + jj]; + } + off += right_cstep; + } + } + output[i][out_index] = sum; + } + } + } +} + +// CNNConvolve specific to maxpool set as 0 and padding equal to +// PADDING_SAME_REPLICATE. +static void convolve_no_maxpool_padding_replicate( + const float **input, int in_width, int in_height, int in_stride, + const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride, + int start_idx, const int cstep, const int ii_shift, const int jj_shift, + const int channel_step) { + // h and w are shifted to an offset coordinate system to reduce in-loop + // computation. + const int start_h = + get_start_shift_convolve(in_height, layer_config->filter_height, + layer_config->skip_height) - + ii_shift; + const int start_w = + get_start_shift_convolve(in_width, layer_config->filter_width, + layer_config->skip_width) - + jj_shift; + const int end_h = in_height - ii_shift; + const int end_w = in_width - jj_shift; + for (int i = start_idx; i < layer_config->out_channels; i += channel_step) { + for (int h = start_h, u = 0; h < end_h; + h += layer_config->skip_height, ++u) { + const int out_h = u * out_stride; + const int upper_ii_index = layer_config->filter_height + h; + for (int w = start_w, out_index = out_h; w < end_w; + w += layer_config->skip_width, ++out_index) { + const int upper_jj_index = layer_config->filter_width + w; + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + int off = k * layer_config->out_channels + i; + for (int ii = h; ii < upper_ii_index; ++ii) { + const int clamped_ii = CLAMPINDEX(ii, in_height); + for (int jj = w; jj < upper_jj_index; ++jj) { + const int clamped_jj = CLAMPINDEX(jj, in_width); + assert(clamped_ii >= 0 && clamped_ii < in_height && + clamped_jj >= 0 && clamped_jj < in_width); + sum += layer_config->weights[off] * + input[k][clamped_ii * in_stride + clamped_jj]; + off += cstep; + } + } + } + output[i][out_index] = sum; + } + } + } +} + +// CNNConvolve specific to maxpool set as 0 and padding equal to +// PADDING_VALID. 
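+// Editorial note: per av1_find_cnn_layer_output_size(), valid padding yields +// out = (in - filter + skip) / skip; e.g. a 16x16 input with a 3x3 filter and +// skip 1 produces a 14x14 output.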
+void av1_cnn_convolve_no_maxpool_padding_valid_c( + const float **input, int in_width, int in_height, int in_stride, + const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride, + int start_idx, int cstep, int channel_step) { + assert((layer_config->skip_height == 1 && layer_config->skip_width == 1) || + !layer_config->maxpool); + assert(layer_config->filter_height > 1 || layer_config->filter_width > 1); + assert(layer_config->pad == PADDING_VALID); + for (int i = start_idx; i < layer_config->out_channels; i += channel_step) { + for (int h = 0, u = 0; h < in_height - layer_config->filter_height + 1; + h += layer_config->skip_height, ++u) { + const int out_h = u * out_stride; + const int upper_ii_index = layer_config->filter_height + h; + for (int w = 0, out_index = out_h; + w < in_width - layer_config->filter_width + 1; + w += layer_config->skip_width, ++out_index) { + const int upper_jj_index = layer_config->filter_width + w; + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + int off = k * layer_config->out_channels + i; + for (int ii = h; ii < upper_ii_index; ++ii) { + for (int jj = w; jj < upper_jj_index; ++jj) { + assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width); + sum += layer_config->weights[off] * input[k][ii * in_stride + jj]; + off += cstep; + } + } + } + output[i][out_index] = sum; + } + } + } +} + +static void av1_cnn_convolve(const float **input, int in_width, int in_height, + int in_stride, + const CNN_LAYER_CONFIG *layer_config, + float **output, int out_stride, int start_idx, + int step) { + assert(!layer_config->deconvolve); + const int cstep = layer_config->in_channels * layer_config->out_channels; + const int filter_height_half = layer_config->filter_height >> 1; + const int filter_width_half = layer_config->filter_width >> 1; + const int channel_step = AOMMAX(step, 1); + + if (layer_config->maxpool && + (layer_config->skip_height > 1 || layer_config->skip_width > 1)) { + switch (layer_config->pad) { + case PADDING_SAME_ZERO: + convolve_maxpool_padding_zero(input, in_width, in_height, in_stride, + layer_config, output, out_stride, cstep, + filter_width_half, filter_height_half); + break; + case PADDING_SAME_REPLICATE: + convolve_maxpool_padding_replicate( + input, in_width, in_height, in_stride, layer_config, output, + out_stride, cstep, filter_width_half, filter_height_half); + break; + case PADDING_VALID: + convolve_maxpool_padding_valid(input, in_width, in_height, in_stride, + layer_config, output, out_stride, cstep); + break; + default: assert(0 && "Unknown padding type"); + } + } else { + // Results in element-wise matrix multiplication. 
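+ // Editorial note: a 1x1 filter makes each output pixel a bias plus a dot + // product over input channels, + // out[i] = bias[i] + sum_k weights[k * out_channels + i] * in[k], + // which is what convolve_element_wise() computes per pixel.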
+ if (layer_config->filter_height == 1 && layer_config->filter_width == 1) { + convolve_element_wise(input, in_width, in_height, in_stride, layer_config, + output, out_stride, start_idx, step); + return; + } + const int ii_shift = + filter_height_half - (layer_config->filter_height - 1) % 2; + const int jj_shift = + filter_width_half - (layer_config->filter_width - 1) % 2; + switch (layer_config->pad) { + case PADDING_SAME_ZERO: + convolve_no_maxpool_padding_zero( + input, in_width, in_height, in_stride, layer_config, output, + out_stride, start_idx, cstep, filter_width_half, filter_height_half, + ii_shift, jj_shift, channel_step); + break; + case PADDING_SAME_REPLICATE: + convolve_no_maxpool_padding_replicate( + input, in_width, in_height, in_stride, layer_config, output, + out_stride, start_idx, cstep, ii_shift, jj_shift, channel_step); + break; + case PADDING_VALID: + av1_cnn_convolve_no_maxpool_padding_valid( + input, in_width, in_height, in_stride, layer_config, output, + out_stride, start_idx, cstep, channel_step); + break; + default: assert(0 && "Unknown padding type"); + } + } +} + +static int convolve_layer(void *arg1, void *arg2) { + const CONVOLVE_OPS *convolve_ops = arg1; + (void)arg2; + av1_cnn_convolve( + convolve_ops->input, convolve_ops->in_width, convolve_ops->in_height, + convolve_ops->in_stride, convolve_ops->layer_config, convolve_ops->output, + convolve_ops->out_stride, convolve_ops->start_idx, convolve_ops->th_step); + return 1; +} + +static void convolve_layer_mt(const float **input, int in_width, int in_height, + int in_stride, + const CNN_LAYER_CONFIG *layer_config, + const CNN_THREAD_DATA *thread_data, + float **output, int out_stride) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + const int num_workers = thread_data->num_workers; + assert(thread_data->workers); + + CONVOLVE_OPS convolve_ops[CNN_MAX_THREADS]; + for (int th = 0; th < AOMMIN(num_workers, CNN_MAX_THREADS); ++th) { + AVxWorker *const worker = &thread_data->workers[th]; + winterface->reset(worker); + + CONVOLVE_OPS convolve_op = { input, in_width, in_height, + in_stride, layer_config, output, + out_stride, th, num_workers }; + convolve_ops[th] = convolve_op; + worker->hook = convolve_layer; + worker->data1 = &(convolve_ops[th]); + worker->data2 = NULL; + + // Start convolving. + if (th == num_workers - 1) { + winterface->execute(worker); + } else { + winterface->launch(worker); + } + } + + // Wait until all workers have finished. 
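+ // Editorial note: the last worker was run synchronously via execute(), so + // this loop effectively waits on the launched workers only; syncing the + // executed worker is harmless.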
+ for (int th = 0; th < AOMMIN(num_workers, CNN_MAX_THREADS); ++th) { + winterface->sync(&thread_data->workers[th]); + } +} + +static INLINE int get_start_shift_deconvolve(int filt_width, int stride) { + const int dif = AOMMAX(filt_width - stride, 0); + return dif / 2; +} + +void av1_cnn_batchnorm_c(float **image, int channels, int width, int height, + int stride, const float *gamma, const float *beta, + const float *mean, const float *std) { + assert(gamma && beta && mean && std && "batchnorm has null parameter!"); + for (int ch = 0; ch < channels; ch++) { + const float ch_gamma = gamma[ch]; + const float ch_beta = beta[ch]; + const float ch_mean = mean[ch]; + const float ch_std = std[ch]; + float *image_row = image[ch]; + + for (int row = 0; row < height; row++) { + for (int col = 0; col < width; col++) { + image_row[col] = + ch_gamma * (image_row[col] - ch_mean) / ch_std + ch_beta; + } + image_row += stride; + } + } +} + +void av1_cnn_deconvolve_c(const float **input, int in_width, int in_height, + int in_stride, const CNN_LAYER_CONFIG *layer_config, + float **output, int out_stride) { + assert(layer_config->deconvolve); + + const int cstep = layer_config->in_channels * layer_config->out_channels; + + int out_width = 0; + int out_height = 0; + av1_find_cnn_layer_output_size(in_width, in_height, layer_config, &out_width, + &out_height); + switch (layer_config->pad) { + case PADDING_SAME_ZERO: + for (int i = 0; i < layer_config->out_channels; ++i) { + for (int u = 0; u < out_height; ++u) { + for (int v = 0; v < out_width; ++v) { + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + int off = k * layer_config->out_channels + i; + for (int l = 0; l < layer_config->filter_height; ++l) { + const int h = + u - l + + get_start_shift_deconvolve(layer_config->filter_height, + layer_config->skip_height); + for (int m = 0; m < layer_config->filter_width; + ++m, off += cstep) { + const int w = + v - m + + get_start_shift_deconvolve(layer_config->filter_width, + layer_config->skip_width); + if ((h % layer_config->skip_height) != 0 || + (w % layer_config->skip_width) != 0) + continue; + const int ii = h / layer_config->skip_height; + const int jj = w / layer_config->skip_width; + if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width) + continue; + sum += layer_config->weights[off] * + input[k][ii * in_stride + jj]; + } + } + } + output[i][u * out_stride + v] = sum; + } + } + } + break; + case PADDING_SAME_REPLICATE: + for (int i = 0; i < layer_config->out_channels; ++i) { + for (int u = 0; u < out_height; ++u) { + for (int v = 0; v < out_width; ++v) { + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + int off = k * layer_config->out_channels + i; + for (int l = 0; l < layer_config->filter_height; ++l) { + const int h = + u - l + + get_start_shift_deconvolve(layer_config->filter_height, + layer_config->skip_height); + for (int m = 0; m < layer_config->filter_width; + ++m, off += cstep) { + const int w = + v - m + + get_start_shift_deconvolve(layer_config->filter_width, + layer_config->skip_width); + if ((h % layer_config->skip_height) != 0 || + (w % layer_config->skip_width) != 0) + continue; + const int ii = + CLAMPINDEX(h / layer_config->skip_height, in_height); + const int jj = + CLAMPINDEX(w / layer_config->skip_width, in_width); + assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width); + sum += layer_config->weights[off] * + input[k][ii * in_stride + jj]; + } + } + } + output[i][u * out_stride + v] =
sum; + } + } + } + break; + case PADDING_VALID: + for (int i = 0; i < layer_config->out_channels; ++i) { + for (int u = 0; u < out_height; ++u) { + for (int v = 0; v < out_width; ++v) { + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + int off = k * layer_config->out_channels + i; + for (int l = 0; l < layer_config->filter_height; ++l) { + const int h = u - l; + for (int m = 0; m < layer_config->filter_width; + ++m, off += cstep) { + const int w = v - m; + if ((h % layer_config->skip_height) != 0 || + (w % layer_config->skip_width) != 0) + continue; + const int ii = h / layer_config->skip_height; + const int jj = w / layer_config->skip_width; + if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width) + continue; + sum += layer_config->weights[off] * + input[k][ii * in_stride + jj]; + } + } + } + output[i][u * out_stride + v] = sum; + } + } + } + break; + default: assert(0 && "Unknown padding type"); + } +} + +bool av1_cnn_predict_c(const float **input, int in_width, int in_height, + int in_stride, const CNN_CONFIG *cnn_config, + const CNN_THREAD_DATA *thread_data, + CNN_MULTI_OUT *output_struct) { + bool success = false; + TENSOR tensor1[CNN_MAX_BRANCHES] = { { 0 } }; + TENSOR tensor2[CNN_MAX_BRANCHES] = { { 0 } }; + + float **output[CNN_MAX_BRANCHES]; + const int *out_chs = output_struct->output_channels; + output[0] = output_struct->output_buffer; + for (int out_idx = 1; out_idx < output_struct->num_outputs; out_idx++) { + output[out_idx] = output[out_idx - 1] + out_chs[out_idx - 1]; + } + + int i_width = in_width; + int i_height = in_height; + int o_width = 0, o_height = 0; + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + init_tensor(&tensor1[b]); + init_tensor(&tensor2[b]); + } + + const int *out_stride = output_struct->output_strides; + for (int layer = 0; layer < cnn_config->num_layers; ++layer) { + const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[layer]; + const int branch = layer_config->branch; + const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config; + + // Allocate input tensor + if (layer == 0) { // First layer + assert(branch == 0); // First layer must be primary branch + assign_tensor(&tensor1[branch], (float **)input, + layer_config->in_channels, in_width, in_height, in_stride); + } else { // Non-first layer + // Swap tensor1 and tensor2 + swap_tensor(&tensor1[branch], &tensor2[branch]); + + i_width = tensor1[branch].width; + i_height = tensor1[branch].height; + } + + // Allocate output tensor + av1_find_cnn_layer_output_size(i_width, i_height, layer_config, &o_width, + &o_height); + const int output_num = layer_config->output_num; + if (output_num == -1) { // Non-output layer + if (!realloc_tensor(&tensor2[branch], layer_config->out_channels, o_width, + o_height)) { + goto Error; + } + } else { // Output layer + free_tensor(&tensor2[branch]); + assign_tensor(&tensor2[branch], output[output_num], + layer_config->out_channels, o_width, o_height, + out_stride[output_num]); + } + + // If we are combining branches make sure that the branch to combine + // is different from the current branch. 
+ assert(IMPLIES(layer_config->branch_combine_type != BRANCH_NOC, + !(branch_config->branches_to_combine & (1 << branch)))); + + if (layer_config->branch_copy_type == BRANCH_INPUT) { + if (!copy_active_tensor_to_branches(&tensor1[branch], layer_config, + branch, tensor2)) { + goto Error; + } + } + // Check consistency of input and output channels + assert(tensor1[branch].channels == layer_config->in_channels); + assert(tensor2[branch].channels == layer_config->out_channels); + + // Convolve/Deconvolve + if (!cnn_config->layer_config[layer].deconvolve) { + if (thread_data->num_workers > 1) { + convolve_layer_mt((const float **)tensor1[branch].buf, + tensor1[branch].width, tensor1[branch].height, + tensor1[branch].stride, layer_config, thread_data, + tensor2[branch].buf, tensor2[branch].stride); + } else { + av1_cnn_convolve((const float **)tensor1[branch].buf, + tensor1[branch].width, tensor1[branch].height, + tensor1[branch].stride, layer_config, + tensor2[branch].buf, tensor2[branch].stride, 0, 1); + } + } else { + av1_cnn_deconvolve((const float **)tensor1[branch].buf, + tensor1[branch].width, tensor1[branch].height, + tensor1[branch].stride, layer_config, + tensor2[branch].buf, tensor2[branch].stride); + } + + if (layer_config->branch_copy_type == BRANCH_OUTPUT) { + if (!copy_active_tensor_to_branches(&tensor2[branch], layer_config, + branch, tensor2)) { + goto Error; + } + } + + // Add tensors from other branches if needed + if (layer_config->branch_combine_type == BRANCH_ADD) { + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + if ((branch_config->branches_to_combine & (1 << b)) && b != branch) { + assert(check_tensor_equal_size(&tensor2[b], &tensor2[branch])); + av1_cnn_add(tensor2[branch].buf, tensor2[branch].channels, + tensor2[branch].width, tensor2[branch].height, + tensor2[branch].stride, (const float **)tensor2[b].buf); + } + } + } + + // Non-linearity + av1_cnn_activate(tensor2[branch].buf, tensor2[branch].channels, + tensor2[branch].width, tensor2[branch].height, + tensor2[branch].stride, layer_config->activation); + + if (layer_config->bn_params.bn_gamma) { + av1_cnn_batchnorm( + tensor2[branch].buf, tensor2[branch].channels, tensor2[branch].width, + tensor2[branch].height, tensor2[branch].stride, + layer_config->bn_params.bn_gamma, layer_config->bn_params.bn_beta, + layer_config->bn_params.bn_mean, layer_config->bn_params.bn_std); + } + + // Concatenate tensors + if (layer_config->branch_combine_type == BRANCH_CAT) { + if (output_num == -1) { // Non-output layer + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + if ((branch_config->branches_to_combine & (1 << b)) && b != branch) { + assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch])); + assert(tensor2[b].channels > 0); + if (!concat_tensor(&tensor2[b], &tensor2[branch])) goto Error; + } + } + } else { // Output layer + const int existing_channels = tensor2[branch].channels; + int num_chs = existing_channels; + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + if ((branch_config->branches_to_combine & (1 << b)) && b != branch) { + assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch])); + // Needed only to assign the new channel buffers + num_chs += tensor2[b].channels; + } + } + assign_tensor(&tensor2[branch], output[output_num], num_chs, o_width, + o_height, out_stride[output_num]); + + num_chs = existing_channels; + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + if ((branch_config->branches_to_combine & (1 << b)) && b != branch) { + assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch])); + // Needed 
only to assign the new channel buffers + copy_tensor(&tensor2[b], tensor2[b].channels, num_chs, + &tensor2[branch]); + num_chs += tensor2[b].channels; + } + } + } + } + + if (layer_config->branch_copy_type == BRANCH_COMBINED) { + if (!copy_active_tensor_to_branches(&tensor2[branch], layer_config, + branch, tensor2)) { + goto Error; + } + } + } + + success = true; +Error: + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + free_tensor(&tensor1[b]); + free_tensor(&tensor2[b]); + } + return success; +} + +// Assume output already has proper allocation +// Assume input image buffers all have same resolution and strides +bool av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height, + int stride, const CNN_CONFIG *cnn_config, + const CNN_THREAD_DATA *thread_data, + CNN_MULTI_OUT *output) { + const float max_val = 255.0; + + const int in_width = width + 2 * cnn_config->ext_width; + const int in_height = height + 2 * cnn_config->ext_height; + const int in_channels = cnn_config->layer_config[0].in_channels; + float *inputs[CNN_MAX_CHANNELS]; + float *input_ = + (float *)aom_malloc(in_width * in_height * in_channels * sizeof(*input_)); + if (!input_) return false; + const int in_stride = in_width; + + for (int c = 0; c < in_channels; ++c) { + inputs[c] = input_ + c * in_stride * in_height; + float *input = + inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width; + + if (cnn_config->strict_bounds) { + for (int i = 0; i < height; ++i) + for (int j = 0; j < width; ++j) + input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val; + // extend left and right + for (int i = 0; i < height; ++i) { + for (int j = -cnn_config->ext_width; j < 0; ++j) + input[i * in_stride + j] = input[i * in_stride]; + for (int j = width; j < width + cnn_config->ext_width; ++j) + input[i * in_stride + j] = input[i * in_stride + width - 1]; + } + // extend top and bottom + for (int i = -cnn_config->ext_height; i < 0; ++i) + memcpy(&input[i * in_stride - cnn_config->ext_width], + &input[-cnn_config->ext_width], in_width * sizeof(*input)); + for (int i = height; i < height + cnn_config->ext_height; ++i) + memcpy(&input[i * in_stride - cnn_config->ext_width], + &input[(height - 1) * in_stride - cnn_config->ext_width], + in_width * sizeof(*input)); + } else { + for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height; + ++i) + for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width; + ++j) + input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val; + } + } + bool success = av1_cnn_predict((const float **)inputs, in_width, in_height, + in_stride, cnn_config, thread_data, output); + + aom_free(input_); + return success; +} + +// Assume output already has proper allocation +// Assume input image buffers all have same resolution and strides +bool av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width, int height, + int stride, + const CNN_CONFIG *cnn_config, + const CNN_THREAD_DATA *thread_data, + int bit_depth, + CNN_MULTI_OUT *output) { + const float max_val = (float)((1 << bit_depth) - 1); + + const int in_width = width + 2 * cnn_config->ext_width; + const int in_height = height + 2 * cnn_config->ext_height; + const int in_channels = cnn_config->layer_config[0].in_channels; + float *inputs[CNN_MAX_CHANNELS]; + float *input_ = + (float *)aom_malloc(in_width * in_height * in_channels * sizeof(*input_)); + if (!input_) return false; + const int in_stride = in_width; + + for (int c = 0; c < in_channels; ++c) { + inputs[c] = input_ + c * in_stride * 
in_height;
+    float *input =
+        inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width;
+
+    if (cnn_config->strict_bounds) {
+      for (int i = 0; i < height; ++i)
+        for (int j = 0; j < width; ++j)
+          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
+      // extend left and right
+      for (int i = 0; i < height; ++i) {
+        for (int j = -cnn_config->ext_width; j < 0; ++j)
+          input[i * in_stride + j] = input[i * in_stride];
+        for (int j = width; j < width + cnn_config->ext_width; ++j)
+          input[i * in_stride + j] = input[i * in_stride + width - 1];
+      }
+      // extend top and bottom
+      for (int i = -cnn_config->ext_height; i < 0; ++i)
+        memcpy(&input[i * in_stride - cnn_config->ext_width],
+               &input[-cnn_config->ext_width], in_width * sizeof(*input));
+      for (int i = height; i < height + cnn_config->ext_height; ++i)
+        memcpy(&input[i * in_stride - cnn_config->ext_width],
+               &input[(height - 1) * in_stride - cnn_config->ext_width],
+               in_width * sizeof(*input));
+    } else {
+      for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height;
+           ++i)
+        for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width;
+             ++j)
+          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
+    }
+  }
+
+  bool success = av1_cnn_predict((const float **)inputs, in_width, in_height,
+                                 in_stride, cnn_config, thread_data, output);
+
+  aom_free(input_);
+  return success;
+}
diff --git a/third_party/aom/av1/encoder/cnn.h b/third_party/aom/av1/encoder/cnn.h
new file mode 100644
index 0000000000..df6401f73f
--- /dev/null
+++ b/third_party/aom/av1/encoder/cnn.h
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_CNN_H_
+#define AOM_AV1_ENCODER_CNN_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <math.h>
+#include <stdbool.h>
+
+#include "aom_util/aom_thread.h"
+#include "config/av1_rtcd.h"
+
+struct AV1Common;
+
+#define CNN_MAX_HIDDEN_LAYERS 64
+#define CNN_MAX_LAYERS (CNN_MAX_HIDDEN_LAYERS + 1)
+#define CNN_MAX_CHANNELS 256
+#define CNN_MAX_BRANCHES 4
+#define CNN_MAX_THREADS 32
+
+#define NO_BRANCH_CONFIG \
+  { 0, 0, 0 }
+#define NO_BN_PARAMS \
+  { NULL, NULL, NULL, NULL }
+
+enum {
+  PADDING_SAME_ZERO,       // tensorflow's SAME padding with pixels outside
+                           // the image area assumed to be 0 (default)
+  PADDING_SAME_REPLICATE,  // tensorflow's SAME padding with pixels outside
+                           // the image area replicated from closest edge
+  PADDING_VALID            // tensorflow's VALID padding
+} UENUM1BYTE(PADDING_TYPE);
+
+// enum { NONE, RELU, SOFTSIGN } UENUM1BYTE(ACTIVATION);
+
+// Times when input tensor may be copied to branches given in input_to_branches.
+// BRANCH_NO_COPY: doesn't copy any tensor.
+// BRANCH_INPUT: copies the input tensor to branches.
+// BRANCH_OUTPUT: copies the convolved tensor to branches.
+// BRANCH_COMBINED: copies the combined (after convolving and branch combining)
+//   tensor. If no combinations happen at this layer, then this option
+//   has the same effect as BRANCH_OUTPUT.
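To make the plumbing behind these copy points concrete: branches are addressed by a small bitmask (branch 0 is the primary branch), as documented for CNN_BRANCH_CONFIG further down in this header. A hedged sketch of how such a mask is interpreted (the mask value and branch count are made up for the demo):

```c
#include <stdio.h>

// Hedged illustration of the branch bitmask convention used by the branch
// fields in this header: bit b set means "branch b", with branch 0 the
// primary branch.
int main(void) {
  const int input_to_branches = 0x06;  // binary 110 -> branches 1 and 2
  for (int b = 0; b < 4; ++b) {        // 4 mirrors CNN_MAX_BRANCHES
    if (input_to_branches & (1 << b)) {
      printf("active tensor is copied to branch %d\n", b);
    }
  }
  return 0;
}
```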
+enum {
+  BRANCH_NO_COPY,
+  BRANCH_INPUT,
+  BRANCH_OUTPUT,
+  BRANCH_COMBINED
+} UENUM1BYTE(BRANCH_COPY);
+
+// Types of combining branches with output of current layer:
+// BRANCH_NOC: no branch combining
+// BRANCH_ADD: Add previously stored branch tensor to output of layer
+// BRANCH_CAT: Concatenate branch tensor to output of layer
+enum { BRANCH_NOC, BRANCH_ADD, BRANCH_CAT } UENUM1BYTE(BRANCH_COMBINE);
+
+// The parameters used to scale each channel in batch
+// normalization. The processing is done on a per-channel basis.
+// e.g. bn_mean[c] is the mean for all pixels in channel c. This
+// is always applied after activation. The output is given by
+// out[c,i,j] = norm[c,i,j] * bn_gamma[c] + bn_beta[c] where
+// norm[c,i,j] = (in[c,i,j] - bn_mean[c]) / bn_std[c]
+// Here we assume that the effect of variance_epsilon is already
+// taken into account when bn_std is calculated. The pointers
+// need to be either all zero or all valid. If all zero, then
+// batchnorm is disabled, else batchnorm is applied.
+struct CNN_BATCHNORM_PARAMS {
+  const float *bn_gamma;
+  const float *bn_beta;
+  const float *bn_mean;
+  const float *bn_std;
+};
+
+struct CNN_BRANCH_CONFIG {
+  int input_to_branches;  // If nonzero, copy the active tensor of the current
+                          // layer and store it for future use in the branches
+                          // specified in the field as a binary mask. For
+                          // example, if input_to_branches = 0x06, it means the
+                          // input tensor to the current branch is copied to
+                          // branches 1 and 2 (where 0 represents the primary
+                          // branch). One restriction is that the mask
+                          // cannot indicate copying to the current branch.
+  int channels_to_copy;   // If greater than 0, only copies the channels up
+                          // to the given index when the active tensor is
+                          // copied to the branches given in input_to_branches.
+  int branches_to_combine;  // mask of branches to combine with the output of
+                            // the current layer, if
+                            // branch_combine_type != BRANCH_NOC
+                            // For example, if branches_to_combine = 0x0A,
+                            // it means that branches 1 and 3 are combined
+                            // with the current branch.
+};
+
+struct CNN_LAYER_CONFIG {
+  int in_channels;
+  int filter_width;
+  int filter_height;
+  int out_channels;
+  int skip_width;
+  int skip_height;
+  int maxpool;            // whether to use maxpool or not (only effective when
+                          // skip_width or skip_height are > 1)
+  const float *weights;   // array of length filter_height x filter_width x
+                          // in_channels x out_channels where the inner-most
+                          // scan is out_channels and the outermost scan is
+                          // filter_height.
+  const float *bias;      // array of length out_channels
+  PADDING_TYPE pad;       // padding type
+  ACTIVATION activation;  // the activation function to use after convolution
+  int deconvolve;         // whether this is a deconvolution layer.
+                          // 0: If skip_width or skip_height are > 1, then we
+                          // reduce resolution
+                          // 1: If skip_width or skip_height are > 1, then we
+                          // increase resolution
+  int branch;  // branch index in [0, CNN_MAX_BRANCHES - 1], where
+               // 0 refers to the primary branch.
+  BRANCH_COPY branch_copy_type;
+  BRANCH_COMBINE branch_combine_type;
+  struct CNN_BRANCH_CONFIG branch_config;
+  struct CNN_BATCHNORM_PARAMS
+      bn_params;   // A struct that contains the parameters
+                   // used for batch normalization.
+  int output_num;  // The output buffer idx to which the layer output is
+                   // written. Set to -1 to disable writing it to the output. In
+                   // the case that branch_combine_type is BRANCH_CAT, all
+                   // concatenated channels will be written to output.
In the + // case of BRANCH_ADD, the output will be the result of + // summation. +}; + +struct CNN_CONFIG { + int num_layers; // number of CNN layers ( = number of hidden layers + 1) + int is_residue; // whether the output activation is a residue + int ext_width, ext_height; // extension horizontally and vertically + int strict_bounds; // whether the input bounds are strict or not. + // If strict, the extension area is filled by + // replication; if not strict, image data is + // assumed available beyond the bounds. + CNN_LAYER_CONFIG layer_config[CNN_MAX_LAYERS]; +}; + +struct CNN_THREAD_DATA { + int num_workers; + AVxWorker *workers; +}; + +struct CNN_MULTI_OUT { + int num_outputs; + const int *output_channels; + const int *output_strides; + float **output_buffer; +}; + +// Function to return size of output +void av1_find_cnn_output_size(int in_width, int in_height, + const CNN_CONFIG *cnn_config, int *out_width, + int *out_height, int *out_channels); + +// Function to return output width and output height of given layer. +void av1_find_cnn_layer_output_size(int in_width, int in_height, + const CNN_LAYER_CONFIG *layer_config, + int *out_width, int *out_height); + +// Prediction functions from set of input image buffers. This function supports +// CNN with multiple outputs. +bool av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height, + int stride, const CNN_CONFIG *cnn_config, + const CNN_THREAD_DATA *thread_data, + struct CNN_MULTI_OUT *output); +bool av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width, int height, + int stride, + const CNN_CONFIG *cnn_config, + const CNN_THREAD_DATA *thread_data, + int bit_depth, CNN_MULTI_OUT *output); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_CNN_H_ diff --git a/third_party/aom/av1/encoder/compound_type.c b/third_party/aom/av1/encoder/compound_type.c new file mode 100644 index 0000000000..3b0ee88241 --- /dev/null +++ b/third_party/aom/av1/encoder/compound_type.c @@ -0,0 +1,1678 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include "av1/common/pred_common.h"
+#include "av1/encoder/compound_type.h"
+#include "av1/encoder/encoder_alloc.h"
+#include "av1/encoder/model_rd.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/rdopt_utils.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/tx_search.h"
+
+typedef int64_t (*pick_interinter_mask_type)(
+    const AV1_COMP *const cpi, MACROBLOCK *x, const BLOCK_SIZE bsize,
+    const uint8_t *const p0, const uint8_t *const p1,
+    const int16_t *const residual1, const int16_t *const diff10,
+    uint64_t *best_sse);
+
+// Checks if the characteristics of the search match
+static INLINE int is_comp_rd_match(const AV1_COMP *const cpi,
+                                   const MACROBLOCK *const x,
+                                   const COMP_RD_STATS *st,
+                                   const MB_MODE_INFO *const mi,
+                                   int32_t *comp_rate, int64_t *comp_dist,
+                                   int32_t *comp_model_rate,
+                                   int64_t *comp_model_dist, int *comp_rs2) {
+  // TODO(ranjit): Ensure that compound type search always uses the regular
+  // filter and check if the following check can be removed
+  // Check if the interp filter matches with the previous case
+  if (st->filter.as_int != mi->interp_filters.as_int) return 0;
+
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  // Match MV and reference indices
+  for (int i = 0; i < 2; ++i) {
+    if ((st->ref_frames[i] != mi->ref_frame[i]) ||
+        (st->mv[i].as_int != mi->mv[i].as_int)) {
+      return 0;
+    }
+    const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[i]];
+    if (is_global_mv_block(mi, wm->wmtype) != st->is_global[i]) return 0;
+  }
+
+  int reuse_data[COMPOUND_TYPES] = { 1, 1, 0, 0 };
+  // For compound wedge, reuse data if newmv search is disabled when NEWMV is
+  // present or if NEWMV is not present in either of the directions
+  if ((!have_newmv_in_inter_mode(mi->mode) &&
+       !have_newmv_in_inter_mode(st->mode)) ||
+      (cpi->sf.inter_sf.disable_interinter_wedge_newmv_search))
+    reuse_data[COMPOUND_WEDGE] = 1;
+  // For compound diffwtd, reuse data if fast search is enabled (no newmv
+  // search when NEWMV is present) or if NEWMV is not present in either of
+  // the directions
+  if (cpi->sf.inter_sf.enable_fast_compound_mode_search ||
+      (!have_newmv_in_inter_mode(mi->mode) &&
+       !have_newmv_in_inter_mode(st->mode)))
+    reuse_data[COMPOUND_DIFFWTD] = 1;
+
+  // Store the stats for the different compound types
+  for (int comp_type = COMPOUND_AVERAGE; comp_type < COMPOUND_TYPES;
+       comp_type++) {
+    if (reuse_data[comp_type]) {
+      comp_rate[comp_type] = st->rate[comp_type];
+      comp_dist[comp_type] = st->dist[comp_type];
+      comp_model_rate[comp_type] = st->model_rate[comp_type];
+      comp_model_dist[comp_type] = st->model_dist[comp_type];
+      comp_rs2[comp_type] = st->comp_rs2[comp_type];
+    }
+  }
+  return 1;
+}
+
+// Checks if a similar compound type search case was accounted for earlier.
+// If found, returns the relevant rd data
+static INLINE int find_comp_rd_in_stats(const AV1_COMP *const cpi,
+                                        const MACROBLOCK *x,
+                                        const MB_MODE_INFO *const mbmi,
+                                        int32_t *comp_rate, int64_t *comp_dist,
+                                        int32_t *comp_model_rate,
+                                        int64_t *comp_model_dist, int *comp_rs2,
+                                        int *match_index) {
+  for (int j = 0; j < x->comp_rd_stats_idx; ++j) {
+    if (is_comp_rd_match(cpi, x, &x->comp_rd_stats[j], mbmi, comp_rate,
+                         comp_dist, comp_model_rate, comp_model_dist,
+                         comp_rs2)) {
+      *match_index = j;
+      return 1;
+    }
+  }
+  return 0;  // no match result found
+}
+
+static INLINE bool enable_wedge_search(
+    MACROBLOCK *const x, const unsigned int disable_wedge_var_thresh) {
+  // Enable wedge search if the source variance is above the threshold.
+  return x->source_variance > disable_wedge_var_thresh;
+}
+
+static INLINE bool enable_wedge_interinter_search(MACROBLOCK *const x,
+                                                  const AV1_COMP *const cpi) {
+  return enable_wedge_search(
+             x, cpi->sf.inter_sf.disable_interinter_wedge_var_thresh) &&
+         cpi->oxcf.comp_type_cfg.enable_interinter_wedge;
+}
+
+static INLINE bool enable_wedge_interintra_search(MACROBLOCK *const x,
+                                                  const AV1_COMP *const cpi) {
+  return enable_wedge_search(
+             x, cpi->sf.inter_sf.disable_interintra_wedge_var_thresh) &&
+         cpi->oxcf.comp_type_cfg.enable_interintra_wedge;
+}
+
+static int8_t estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x,
+                                  const BLOCK_SIZE bsize, const uint8_t *pred0,
+                                  int stride0, const uint8_t *pred1,
+                                  int stride1) {
+  static const BLOCK_SIZE split_qtr[BLOCK_SIZES_ALL] = {
+    // 4X4
+    BLOCK_INVALID,
+    // 4X8, 8X4, 8X8
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X4,
+    // 8X16, 16X8, 16X16
+    BLOCK_4X8, BLOCK_8X4, BLOCK_8X8,
+    // 16X32, 32X16, 32X32
+    BLOCK_8X16, BLOCK_16X8, BLOCK_16X16,
+    // 32X64, 64X32, 64X64
+    BLOCK_16X32, BLOCK_32X16, BLOCK_32X32,
+    // 64x128, 128x64, 128x128
+    BLOCK_32X64, BLOCK_64X32, BLOCK_64X64,
+    // 4X16, 16X4, 8X32
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X16,
+    // 32X8, 16X64, 64X16
+    BLOCK_16X4, BLOCK_8X32, BLOCK_32X8
+  };
+  const struct macroblock_plane *const p = &x->plane[0];
+  const uint8_t *src = p->src.buf;
+  int src_stride = p->src.stride;
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+  const int bw_by2 = bw >> 1;
+  const int bh_by2 = bh >> 1;
+  uint32_t esq[2][2];
+  int64_t tl, br;
+
+  const BLOCK_SIZE f_index = split_qtr[bsize];
+  assert(f_index != BLOCK_INVALID);
+
+  if (is_cur_buf_hbd(&x->e_mbd)) {
+    pred0 = CONVERT_TO_BYTEPTR(pred0);
+    pred1 = CONVERT_TO_BYTEPTR(pred1);
+  }
+
+  // Residual variance computation over relevant quadrants in order to
+  // find TL + BR, TL = sum(1st,2nd,3rd) quadrants of (pred0 - pred1),
+  // BR = sum(2nd,3rd,4th) quadrants of (pred1 - pred0)
+  // The 2nd and 3rd quadrants cancel out in TL + BR
+  // Hence TL + BR = 1st quadrant of (pred0-pred1) + 4th of (pred1-pred0)
+  // TODO(nithya): Sign estimation assumes 45 degrees (1st and 4th quadrants)
+  // for all codebooks; experiment with other quadrant combinations for
+  // 0, 90 and 135 degrees also.
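The four variance calls below fill esq[p][q] with the SSE between the source and predictor p over the top-left (q = 0) and bottom-right (q = 1) quadrants; the sign is then chosen by comparing which predictor fits which corner better. A small self-contained illustration of that decision rule (a sketch, not the aom implementation):

```c
#include <stdint.h>
#include <stdio.h>

// esq[p][q]: SSE between the source and predictor p over quadrant q
// (0 = top-left, 1 = bottom-right), as computed by the calls below.
static int8_t wedge_sign_from_quadrant_sse(const uint32_t esq[2][2]) {
  const int64_t tl = (int64_t)esq[0][0] - (int64_t)esq[1][0];
  const int64_t br = (int64_t)esq[1][1] - (int64_t)esq[0][1];
  // Positive tl + br: pred0 fits the top-left worse and the bottom-right
  // better, so the flipped wedge orientation (sign 1) is estimated.
  return (int8_t)(tl + br > 0);
}

int main(void) {
  // pred0 is poor in the top-left (1000 vs 400) and good in the
  // bottom-right (300 vs 900), so the estimated sign is 1.
  const uint32_t esq[2][2] = { { 1000, 300 }, { 400, 900 } };
  printf("estimated wedge sign: %d\n", wedge_sign_from_quadrant_sse(esq));
  return 0;
}
```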
+  cpi->ppi->fn_ptr[f_index].vf(src, src_stride, pred0, stride0, &esq[0][0]);
+  cpi->ppi->fn_ptr[f_index].vf(src + bh_by2 * src_stride + bw_by2, src_stride,
+                               pred0 + bh_by2 * stride0 + bw_by2, stride0,
+                               &esq[0][1]);
+  cpi->ppi->fn_ptr[f_index].vf(src, src_stride, pred1, stride1, &esq[1][0]);
+  cpi->ppi->fn_ptr[f_index].vf(src + bh_by2 * src_stride + bw_by2, src_stride,
+                               pred1 + bh_by2 * stride1 + bw_by2, stride1,
+                               &esq[1][1]);
+
+  tl = ((int64_t)esq[0][0]) - ((int64_t)esq[1][0]);
+  br = ((int64_t)esq[1][1]) - ((int64_t)esq[0][1]);
+  return (tl + br > 0);
+}
+
+// Choose the best wedge index and sign
+static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
+                          const BLOCK_SIZE bsize, const uint8_t *const p0,
+                          const int16_t *const residual1,
+                          const int16_t *const diff10,
+                          int8_t *const best_wedge_sign,
+                          int8_t *const best_wedge_index, uint64_t *best_sse) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const src = &x->plane[0].src;
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+  const int N = bw * bh;
+  assert(N >= 64);
+  int rate;
+  int64_t dist;
+  int64_t rd, best_rd = INT64_MAX;
+  int8_t wedge_index;
+  int8_t wedge_sign;
+  const int8_t wedge_types = get_wedge_types_lookup(bsize);
+  const uint8_t *mask;
+  uint64_t sse;
+  const int hbd = is_cur_buf_hbd(xd);
+  const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
+
+  DECLARE_ALIGNED(32, int16_t, residual0[MAX_SB_SQUARE]);  // src - pred0
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (hbd) {
+    aom_highbd_subtract_block(bh, bw, residual0, bw, src->buf, src->stride,
+                              CONVERT_TO_BYTEPTR(p0), bw);
+  } else {
+    aom_subtract_block(bh, bw, residual0, bw, src->buf, src->stride, p0, bw);
+  }
+#else
+  (void)hbd;
+  aom_subtract_block(bh, bw, residual0, bw, src->buf, src->stride, p0, bw);
+#endif
+
+  int64_t sign_limit = ((int64_t)aom_sum_squares_i16(residual0, N) -
+                        (int64_t)aom_sum_squares_i16(residual1, N)) *
+                       (1 << WEDGE_WEIGHT_BITS) / 2;
+  int16_t *ds = residual0;
+
+  av1_wedge_compute_delta_squares(ds, residual0, residual1, N);
+
+  for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+    mask = av1_get_contiguous_soft_mask(wedge_index, 0, bsize);
+
+    wedge_sign = av1_wedge_sign_from_residuals(ds, mask, N, sign_limit);
+
+    mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
+    sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N);
+    sse = ROUND_POWER_OF_TWO(sse, bd_round);
+
+    model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N,
+                                                  &rate, &dist);
+    // int rate2;
+    // int64_t dist2;
+    // model_rd_with_curvfit(cpi, x, bsize, 0, sse, N, &rate2, &dist2);
+    // printf("sse %"PRId64": legacy: %d %"PRId64", curvfit %d %"PRId64"\n",
+    // sse, rate, dist, rate2, dist2); dist = dist2;
+    // rate = rate2;
+
+    rate += x->mode_costs.wedge_idx_cost[bsize][wedge_index];
+    rd = RDCOST(x->rdmult, rate, dist);
+
+    if (rd < best_rd) {
+      *best_wedge_index = wedge_index;
+      *best_wedge_sign = wedge_sign;
+      best_rd = rd;
+      *best_sse = sse;
+    }
+  }
+
+  return best_rd -
+         RDCOST(x->rdmult,
+                x->mode_costs.wedge_idx_cost[bsize][*best_wedge_index], 0);
+}
+
+// Choose the best wedge index for the specified sign
+static int64_t pick_wedge_fixed_sign(
+    const AV1_COMP *const cpi, const MACROBLOCK *const x,
+    const BLOCK_SIZE bsize, const int16_t *const residual1,
+    const int16_t *const diff10, const int8_t wedge_sign,
+    int8_t *const best_wedge_index, uint64_t *best_sse) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+
+  const int bw = block_size_wide[bsize];
+
const int bh = block_size_high[bsize]; + const int N = bw * bh; + assert(N >= 64); + int rate; + int64_t dist; + int64_t rd, best_rd = INT64_MAX; + int8_t wedge_index; + const int8_t wedge_types = get_wedge_types_lookup(bsize); + const uint8_t *mask; + uint64_t sse; + const int hbd = is_cur_buf_hbd(xd); + const int bd_round = hbd ? (xd->bd - 8) * 2 : 0; + for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) { + mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize); + sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N); + sse = ROUND_POWER_OF_TWO(sse, bd_round); + + model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N, + &rate, &dist); + rate += x->mode_costs.wedge_idx_cost[bsize][wedge_index]; + rd = RDCOST(x->rdmult, rate, dist); + + if (rd < best_rd) { + *best_wedge_index = wedge_index; + best_rd = rd; + *best_sse = sse; + } + } + return best_rd - + RDCOST(x->rdmult, + x->mode_costs.wedge_idx_cost[bsize][*best_wedge_index], 0); +} + +static int64_t pick_interinter_wedge( + const AV1_COMP *const cpi, MACROBLOCK *const x, const BLOCK_SIZE bsize, + const uint8_t *const p0, const uint8_t *const p1, + const int16_t *const residual1, const int16_t *const diff10, + uint64_t *best_sse) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int bw = block_size_wide[bsize]; + + int64_t rd; + int8_t wedge_index = -1; + int8_t wedge_sign = 0; + + assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize)); + assert(cpi->common.seq_params->enable_masked_compound); + + if (cpi->sf.inter_sf.fast_wedge_sign_estimate) { + wedge_sign = estimate_wedge_sign(cpi, x, bsize, p0, bw, p1, bw); + rd = pick_wedge_fixed_sign(cpi, x, bsize, residual1, diff10, wedge_sign, + &wedge_index, best_sse); + } else { + rd = pick_wedge(cpi, x, bsize, p0, residual1, diff10, &wedge_sign, + &wedge_index, best_sse); + } + + mbmi->interinter_comp.wedge_sign = wedge_sign; + mbmi->interinter_comp.wedge_index = wedge_index; + return rd; +} + +static int64_t pick_interinter_seg(const AV1_COMP *const cpi, + MACROBLOCK *const x, const BLOCK_SIZE bsize, + const uint8_t *const p0, + const uint8_t *const p1, + const int16_t *const residual1, + const int16_t *const diff10, + uint64_t *best_sse) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + const int N = 1 << num_pels_log2_lookup[bsize]; + int rate; + int64_t dist; + DIFFWTD_MASK_TYPE cur_mask_type; + int64_t best_rd = INT64_MAX; + DIFFWTD_MASK_TYPE best_mask_type = 0; + const int hbd = is_cur_buf_hbd(xd); + const int bd_round = hbd ? 
(xd->bd - 8) * 2 : 0; + DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]); + uint8_t *tmp_mask[2] = { xd->seg_mask, seg_mask }; + // try each mask type and its inverse + for (cur_mask_type = 0; cur_mask_type < DIFFWTD_MASK_TYPES; cur_mask_type++) { + // build mask and inverse +#if CONFIG_AV1_HIGHBITDEPTH + if (hbd) + av1_build_compound_diffwtd_mask_highbd( + tmp_mask[cur_mask_type], cur_mask_type, CONVERT_TO_BYTEPTR(p0), bw, + CONVERT_TO_BYTEPTR(p1), bw, bh, bw, xd->bd); + else + av1_build_compound_diffwtd_mask(tmp_mask[cur_mask_type], cur_mask_type, + p0, bw, p1, bw, bh, bw); +#else + (void)hbd; + av1_build_compound_diffwtd_mask(tmp_mask[cur_mask_type], cur_mask_type, p0, + bw, p1, bw, bh, bw); +#endif // CONFIG_AV1_HIGHBITDEPTH + + // compute rd for mask + uint64_t sse = av1_wedge_sse_from_residuals(residual1, diff10, + tmp_mask[cur_mask_type], N); + sse = ROUND_POWER_OF_TWO(sse, bd_round); + + model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N, + &rate, &dist); + const int64_t rd0 = RDCOST(x->rdmult, rate, dist); + + if (rd0 < best_rd) { + best_mask_type = cur_mask_type; + best_rd = rd0; + *best_sse = sse; + } + } + mbmi->interinter_comp.mask_type = best_mask_type; + if (best_mask_type == DIFFWTD_38_INV) { + memcpy(xd->seg_mask, seg_mask, N * 2); + } + return best_rd; +} + +static int64_t pick_interintra_wedge(const AV1_COMP *const cpi, + const MACROBLOCK *const x, + const BLOCK_SIZE bsize, + const uint8_t *const p0, + const uint8_t *const p1) { + const MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + assert(av1_is_wedge_used(bsize)); + assert(cpi->common.seq_params->enable_interintra_compound); + + const struct buf_2d *const src = &x->plane[0].src; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + DECLARE_ALIGNED(32, int16_t, residual1[MAX_SB_SQUARE]); // src - pred1 + DECLARE_ALIGNED(32, int16_t, diff10[MAX_SB_SQUARE]); // pred1 - pred0 +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, + CONVERT_TO_BYTEPTR(p1), bw); + aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(p1), bw, + CONVERT_TO_BYTEPTR(p0), bw); + } else { + aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, p1, bw); + aom_subtract_block(bh, bw, diff10, bw, p1, bw, p0, bw); + } +#else + aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, p1, bw); + aom_subtract_block(bh, bw, diff10, bw, p1, bw, p0, bw); +#endif + int8_t wedge_index = -1; + uint64_t sse; + int64_t rd = pick_wedge_fixed_sign(cpi, x, bsize, residual1, diff10, 0, + &wedge_index, &sse); + + mbmi->interintra_wedge_index = wedge_index; + return rd; +} + +static AOM_INLINE void get_inter_predictors_masked_compound( + MACROBLOCK *x, const BLOCK_SIZE bsize, uint8_t **preds0, uint8_t **preds1, + int16_t *residual1, int16_t *diff10, int *strides) { + MACROBLOCKD *xd = &x->e_mbd; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + // get inter predictors to use for masked compound modes + av1_build_inter_predictors_for_planes_single_buf(xd, bsize, 0, 0, 0, preds0, + strides); + av1_build_inter_predictors_for_planes_single_buf(xd, bsize, 0, 0, 1, preds1, + strides); + const struct buf_2d *const src = &x->plane[0].src; +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, + CONVERT_TO_BYTEPTR(*preds1), bw); + aom_highbd_subtract_block(bh, bw, diff10, bw, 
CONVERT_TO_BYTEPTR(*preds1), + bw, CONVERT_TO_BYTEPTR(*preds0), bw); + } else { + aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, *preds1, + bw); + aom_subtract_block(bh, bw, diff10, bw, *preds1, bw, *preds0, bw); + } +#else + aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, *preds1, bw); + aom_subtract_block(bh, bw, diff10, bw, *preds1, bw, *preds0, bw); +#endif +} + +// Computes the rd cost for the given interintra mode and updates the best +static INLINE void compute_best_interintra_mode( + const AV1_COMP *const cpi, MB_MODE_INFO *mbmi, MACROBLOCKD *xd, + MACROBLOCK *const x, const int *const interintra_mode_cost, + const BUFFER_SET *orig_dst, uint8_t *intrapred, const uint8_t *tmp_buf, + INTERINTRA_MODE *best_interintra_mode, int64_t *best_interintra_rd, + INTERINTRA_MODE interintra_mode, BLOCK_SIZE bsize) { + const AV1_COMMON *const cm = &cpi->common; + int rate; + uint8_t skip_txfm_sb; + int64_t dist, skip_sse_sb; + const int bw = block_size_wide[bsize]; + mbmi->interintra_mode = interintra_mode; + int rmode = interintra_mode_cost[interintra_mode]; + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); + model_rd_sb_fn[MODELRD_TYPE_INTERINTRA](cpi, bsize, x, xd, 0, 0, &rate, &dist, + &skip_txfm_sb, &skip_sse_sb, NULL, + NULL, NULL); + int64_t rd = RDCOST(x->rdmult, rate + rmode, dist); + if (rd < *best_interintra_rd) { + *best_interintra_rd = rd; + *best_interintra_mode = mbmi->interintra_mode; + } +} + +static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs, + MACROBLOCK *x, int64_t ref_best_rd, + RD_STATS *rd_stats) { + MACROBLOCKD *const xd = &x->e_mbd; + if (ref_best_rd < 0) return INT64_MAX; + av1_subtract_plane(x, bs, 0); + const int64_t rd = av1_estimate_txfm_yrd(cpi, x, rd_stats, ref_best_rd, bs, + max_txsize_rect_lookup[bs]); + if (rd != INT64_MAX) { + const int skip_ctx = av1_get_skip_txfm_context(xd); + if (rd_stats->skip_txfm) { + const int s1 = x->mode_costs.skip_txfm_cost[skip_ctx][1]; + rd_stats->rate = s1; + } else { + const int s0 = x->mode_costs.skip_txfm_cost[skip_ctx][0]; + rd_stats->rate += s0; + } + } + return rd; +} + +// Computes the rd_threshold for smooth interintra rd search. 
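The helper below converts a best-so-far rd into a pruning threshold; later in this file the inverse check appears as `(best_rd >> INTER_INTRA_RD_THRESH_SHIFT) * INTER_INTRA_RD_THRESH_SCALE > ref_best_rd`. A hedged numeric sketch of that gate, with made-up stand-ins for the two constants (the real values are defined in the encoder headers):

```c
#include <stdint.h>
#include <stdio.h>

// Illustrative stand-ins only; the real INTER_INTRA_RD_THRESH_SHIFT and
// INTER_INTRA_RD_THRESH_SCALE are defined in the aom encoder headers.
#define THRESH_SHIFT 4
#define THRESH_SCALE 9

// A candidate survives while its rd, scaled by SCALE / 2^SHIFT, still
// beats the reference best rd (mirrors the early-exit check used here).
static int passes_rd_gate(int64_t rd, int64_t ref_best_rd) {
  return (rd >> THRESH_SHIFT) * THRESH_SCALE <= ref_best_rd;
}

int main(void) {
  printf("%d\n", passes_rd_gate(1600, 1000));  // 1600 * 9 / 16 = 900: pass
  printf("%d\n", passes_rd_gate(2000, 1000));  // 2000 * 9 / 16 = 1125: fail
  return 0;
}
```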
+static AOM_INLINE int64_t compute_rd_thresh(MACROBLOCK *const x, + int total_mode_rate, + int64_t ref_best_rd) { + const int64_t rd_thresh = get_rd_thresh_from_best_rd( + ref_best_rd, (1 << INTER_INTRA_RD_THRESH_SHIFT), + INTER_INTRA_RD_THRESH_SCALE); + const int64_t mode_rd = RDCOST(x->rdmult, total_mode_rate, 0); + return (rd_thresh - mode_rd); +} + +// Computes the best wedge interintra mode +static AOM_INLINE int64_t compute_best_wedge_interintra( + const AV1_COMP *const cpi, MB_MODE_INFO *mbmi, MACROBLOCKD *xd, + MACROBLOCK *const x, const int *const interintra_mode_cost, + const BUFFER_SET *orig_dst, uint8_t *intrapred_, uint8_t *tmp_buf_, + int *best_mode, int *best_wedge_index, BLOCK_SIZE bsize) { + const AV1_COMMON *const cm = &cpi->common; + const int bw = block_size_wide[bsize]; + int64_t best_interintra_rd_wedge = INT64_MAX; + int64_t best_total_rd = INT64_MAX; + uint8_t *intrapred = get_buf_by_bd(xd, intrapred_); + for (INTERINTRA_MODE mode = 0; mode < INTERINTRA_MODES; ++mode) { + mbmi->interintra_mode = mode; + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + int64_t rd = pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_); + const int rate_overhead = + interintra_mode_cost[mode] + + x->mode_costs.wedge_idx_cost[bsize][mbmi->interintra_wedge_index]; + const int64_t total_rd = rd + RDCOST(x->rdmult, rate_overhead, 0); + if (total_rd < best_total_rd) { + best_total_rd = total_rd; + best_interintra_rd_wedge = rd; + *best_mode = mbmi->interintra_mode; + *best_wedge_index = mbmi->interintra_wedge_index; + } + } + return best_interintra_rd_wedge; +} + +static int handle_smooth_inter_intra_mode( + const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, + MB_MODE_INFO *mbmi, int64_t ref_best_rd, int *rate_mv, + INTERINTRA_MODE *best_interintra_mode, int64_t *best_rd, + int *best_mode_rate, const BUFFER_SET *orig_dst, uint8_t *tmp_buf, + uint8_t *intrapred, HandleInterModeArgs *args) { + MACROBLOCKD *xd = &x->e_mbd; + const ModeCosts *mode_costs = &x->mode_costs; + const int *const interintra_mode_cost = + mode_costs->interintra_mode_cost[size_group_lookup[bsize]]; + const AV1_COMMON *const cm = &cpi->common; + const int bw = block_size_wide[bsize]; + + mbmi->use_wedge_interintra = 0; + + if (cpi->sf.inter_sf.reuse_inter_intra_mode == 0 || + *best_interintra_mode == INTERINTRA_MODES) { + int64_t best_interintra_rd = INT64_MAX; + for (INTERINTRA_MODE cur_mode = 0; cur_mode < INTERINTRA_MODES; + ++cur_mode) { + if ((!cpi->oxcf.intra_mode_cfg.enable_smooth_intra || + cpi->sf.intra_sf.disable_smooth_intra) && + cur_mode == II_SMOOTH_PRED) + continue; + compute_best_interintra_mode( + cpi, mbmi, xd, x, interintra_mode_cost, orig_dst, intrapred, tmp_buf, + best_interintra_mode, &best_interintra_rd, cur_mode, bsize); + } + args->inter_intra_mode[mbmi->ref_frame[0]] = *best_interintra_mode; + } + assert(IMPLIES(!cpi->oxcf.comp_type_cfg.enable_smooth_interintra, + *best_interintra_mode != II_SMOOTH_PRED)); + // Recompute prediction if required + bool interintra_mode_reuse = cpi->sf.inter_sf.reuse_inter_intra_mode || + *best_interintra_mode != INTERINTRA_MODES; + if (interintra_mode_reuse || *best_interintra_mode != INTERINTRA_MODES - 1) { + mbmi->interintra_mode = *best_interintra_mode; + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); + } + + // Compute rd cost for best smooth_interintra + RD_STATS rd_stats; + const int 
is_wedge_used = av1_is_wedge_used(bsize); + const int rmode = + interintra_mode_cost[*best_interintra_mode] + + (is_wedge_used ? mode_costs->wedge_interintra_cost[bsize][0] : 0); + const int total_mode_rate = rmode + *rate_mv; + const int64_t rd_thresh = compute_rd_thresh(x, total_mode_rate, ref_best_rd); + int64_t rd = estimate_yrd_for_sb(cpi, bsize, x, rd_thresh, &rd_stats); + if (rd != INT64_MAX) { + rd = RDCOST(x->rdmult, total_mode_rate + rd_stats.rate, rd_stats.dist); + } else { + return IGNORE_MODE; + } + *best_rd = rd; + *best_mode_rate = rmode; + // Return early if best rd not good enough + if (ref_best_rd < INT64_MAX && + (*best_rd >> INTER_INTRA_RD_THRESH_SHIFT) * INTER_INTRA_RD_THRESH_SCALE > + ref_best_rd) { + return IGNORE_MODE; + } + return 0; +} + +static int handle_wedge_inter_intra_mode( + const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, + MB_MODE_INFO *mbmi, int *rate_mv, INTERINTRA_MODE *best_interintra_mode, + int64_t *best_rd, const BUFFER_SET *orig_dst, uint8_t *tmp_buf_, + uint8_t *tmp_buf, uint8_t *intrapred_, uint8_t *intrapred, + HandleInterModeArgs *args, int *tmp_rate_mv, int *rate_overhead, + int_mv *tmp_mv, int64_t best_rd_no_wedge) { + MACROBLOCKD *xd = &x->e_mbd; + const ModeCosts *mode_costs = &x->mode_costs; + const int *const interintra_mode_cost = + mode_costs->interintra_mode_cost[size_group_lookup[bsize]]; + const AV1_COMMON *const cm = &cpi->common; + const int bw = block_size_wide[bsize]; + const int try_smooth_interintra = + cpi->oxcf.comp_type_cfg.enable_smooth_interintra; + + mbmi->use_wedge_interintra = 1; + + if (!cpi->sf.inter_sf.fast_interintra_wedge_search) { + // Exhaustive search of all wedge and mode combinations. + int best_mode = 0; + int best_wedge_index = 0; + *best_rd = compute_best_wedge_interintra( + cpi, mbmi, xd, x, interintra_mode_cost, orig_dst, intrapred_, tmp_buf_, + &best_mode, &best_wedge_index, bsize); + mbmi->interintra_mode = best_mode; + mbmi->interintra_wedge_index = best_wedge_index; + if (best_mode != INTERINTRA_MODES - 1) { + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + } + } else if (!try_smooth_interintra) { + if (*best_interintra_mode == INTERINTRA_MODES) { + mbmi->interintra_mode = INTERINTRA_MODES - 1; + *best_interintra_mode = INTERINTRA_MODES - 1; + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + // Pick wedge mask based on INTERINTRA_MODES - 1 + *best_rd = pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_); + // Find the best interintra mode for the chosen wedge mask + for (INTERINTRA_MODE cur_mode = 0; cur_mode < INTERINTRA_MODES; + ++cur_mode) { + compute_best_interintra_mode( + cpi, mbmi, xd, x, interintra_mode_cost, orig_dst, intrapred, + tmp_buf, best_interintra_mode, best_rd, cur_mode, bsize); + } + args->inter_intra_mode[mbmi->ref_frame[0]] = *best_interintra_mode; + mbmi->interintra_mode = *best_interintra_mode; + + // Recompute prediction if required + if (*best_interintra_mode != INTERINTRA_MODES - 1) { + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + } + } else { + // Pick wedge mask for the best interintra mode (reused) + mbmi->interintra_mode = *best_interintra_mode; + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + *best_rd = pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_); + } + } else { + // Pick wedge mask for the best interintra mode from smooth_interintra + *best_rd = 
pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_); + } + + *rate_overhead = + interintra_mode_cost[mbmi->interintra_mode] + + mode_costs->wedge_idx_cost[bsize][mbmi->interintra_wedge_index] + + mode_costs->wedge_interintra_cost[bsize][1]; + *best_rd += RDCOST(x->rdmult, *rate_overhead + *rate_mv, 0); + + int64_t rd = INT64_MAX; + const int_mv mv0 = mbmi->mv[0]; + // Refine motion vector for NEWMV case. + if (have_newmv_in_inter_mode(mbmi->mode)) { + int rate_sum; + uint8_t skip_txfm_sb; + int64_t dist_sum, skip_sse_sb; + // get negative of mask + const uint8_t *mask = + av1_get_contiguous_soft_mask(mbmi->interintra_wedge_index, 1, bsize); + av1_compound_single_motion_search(cpi, x, bsize, &tmp_mv->as_mv, intrapred, + mask, bw, tmp_rate_mv, 0); + if (mbmi->mv[0].as_int != tmp_mv->as_int) { + mbmi->mv[0].as_int = tmp_mv->as_int; + // Set ref_frame[1] to NONE_FRAME temporarily so that the intra + // predictor is not calculated again in av1_enc_build_inter_predictor(). + mbmi->ref_frame[1] = NONE_FRAME; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + mbmi->ref_frame[1] = INTRA_FRAME; + av1_combine_interintra(xd, bsize, 0, xd->plane[AOM_PLANE_Y].dst.buf, + xd->plane[AOM_PLANE_Y].dst.stride, intrapred, bw); + model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND]( + cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, &skip_txfm_sb, + &skip_sse_sb, NULL, NULL, NULL); + rd = + RDCOST(x->rdmult, *tmp_rate_mv + *rate_overhead + rate_sum, dist_sum); + } + } + if (rd >= *best_rd) { + tmp_mv->as_int = mv0.as_int; + *tmp_rate_mv = *rate_mv; + av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); + } + // Evaluate closer to true rd + RD_STATS rd_stats; + const int64_t mode_rd = RDCOST(x->rdmult, *rate_overhead + *tmp_rate_mv, 0); + const int64_t tmp_rd_thresh = best_rd_no_wedge - mode_rd; + rd = estimate_yrd_for_sb(cpi, bsize, x, tmp_rd_thresh, &rd_stats); + if (rd != INT64_MAX) { + rd = RDCOST(x->rdmult, *rate_overhead + *tmp_rate_mv + rd_stats.rate, + rd_stats.dist); + } else { + if (*best_rd == INT64_MAX) return IGNORE_MODE; + } + *best_rd = rd; + return 0; +} + +int av1_handle_inter_intra_mode(const AV1_COMP *const cpi, MACROBLOCK *const x, + BLOCK_SIZE bsize, MB_MODE_INFO *mbmi, + HandleInterModeArgs *args, int64_t ref_best_rd, + int *rate_mv, int *tmp_rate2, + const BUFFER_SET *orig_dst) { + const int try_smooth_interintra = + cpi->oxcf.comp_type_cfg.enable_smooth_interintra; + + const int is_wedge_used = av1_is_wedge_used(bsize); + const int try_wedge_interintra = + is_wedge_used && enable_wedge_interintra_search(x, cpi); + + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + const int bw = block_size_wide[bsize]; + DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_INTERINTRA_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_INTERINTRA_SB_SQUARE]); + uint8_t *tmp_buf = get_buf_by_bd(xd, tmp_buf_); + uint8_t *intrapred = get_buf_by_bd(xd, intrapred_); + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + // Single reference inter prediction + mbmi->ref_frame[1] = NONE_FRAME; + xd->plane[0].dst.buf = tmp_buf; + xd->plane[0].dst.stride = bw; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + const int num_planes = av1_num_planes(cm); + + // Restore the buffers for intra prediction + restore_dst_buf(xd, *orig_dst, num_planes); + mbmi->ref_frame[1] = INTRA_FRAME; + 
INTERINTRA_MODE best_interintra_mode = + args->inter_intra_mode[mbmi->ref_frame[0]]; + + // Compute smooth_interintra + int64_t best_interintra_rd_nowedge = INT64_MAX; + int best_mode_rate = INT_MAX; + if (try_smooth_interintra) { + int ret = handle_smooth_inter_intra_mode( + cpi, x, bsize, mbmi, ref_best_rd, rate_mv, &best_interintra_mode, + &best_interintra_rd_nowedge, &best_mode_rate, orig_dst, tmp_buf, + intrapred, args); + if (ret == IGNORE_MODE) { + return IGNORE_MODE; + } + } + + // Compute wedge interintra + int64_t best_interintra_rd_wedge = INT64_MAX; + const int_mv mv0 = mbmi->mv[0]; + int_mv tmp_mv = mv0; + int tmp_rate_mv = 0; + int rate_overhead = 0; + if (try_wedge_interintra) { + int ret = handle_wedge_inter_intra_mode( + cpi, x, bsize, mbmi, rate_mv, &best_interintra_mode, + &best_interintra_rd_wedge, orig_dst, tmp_buf_, tmp_buf, intrapred_, + intrapred, args, &tmp_rate_mv, &rate_overhead, &tmp_mv, + best_interintra_rd_nowedge); + if (ret == IGNORE_MODE) { + return IGNORE_MODE; + } + } + + if (best_interintra_rd_nowedge == INT64_MAX && + best_interintra_rd_wedge == INT64_MAX) { + return IGNORE_MODE; + } + if (best_interintra_rd_wedge < best_interintra_rd_nowedge) { + mbmi->mv[0].as_int = tmp_mv.as_int; + *tmp_rate2 += tmp_rate_mv - *rate_mv; + *rate_mv = tmp_rate_mv; + best_mode_rate = rate_overhead; + } else if (try_smooth_interintra && try_wedge_interintra) { + // If smooth was best, but we over-wrote the values when evaluating the + // wedge mode, we need to recompute the smooth values. + mbmi->use_wedge_interintra = 0; + mbmi->interintra_mode = best_interintra_mode; + mbmi->mv[0].as_int = mv0.as_int; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + } + *tmp_rate2 += best_mode_rate; + + if (num_planes > 1) { + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + AOM_PLANE_U, num_planes - 1); + } + return 0; +} + +// Computes the valid compound_types to be evaluated +static INLINE int compute_valid_comp_types(MACROBLOCK *x, + const AV1_COMP *const cpi, + BLOCK_SIZE bsize, + int masked_compound_used, + int mode_search_mask, + COMPOUND_TYPE *valid_comp_types) { + const AV1_COMMON *cm = &cpi->common; + int valid_type_count = 0; + int comp_type, valid_check; + int8_t enable_masked_type[MASKED_COMPOUND_TYPES] = { 0, 0 }; + + const int try_average_comp = (mode_search_mask & (1 << COMPOUND_AVERAGE)); + const int try_distwtd_comp = + ((mode_search_mask & (1 << COMPOUND_DISTWTD)) && + cm->seq_params->order_hint_info.enable_dist_wtd_comp == 1 && + cpi->sf.inter_sf.use_dist_wtd_comp_flag != DIST_WTD_COMP_DISABLED); + + // Check if COMPOUND_AVERAGE and COMPOUND_DISTWTD are valid cases + for (comp_type = COMPOUND_AVERAGE; comp_type <= COMPOUND_DISTWTD; + comp_type++) { + valid_check = + (comp_type == COMPOUND_AVERAGE) ? 
try_average_comp : try_distwtd_comp; + if (valid_check && is_interinter_compound_used(comp_type, bsize)) + valid_comp_types[valid_type_count++] = comp_type; + } + // Check if COMPOUND_WEDGE and COMPOUND_DIFFWTD are valid cases + if (masked_compound_used) { + // enable_masked_type[0] corresponds to COMPOUND_WEDGE + // enable_masked_type[1] corresponds to COMPOUND_DIFFWTD + enable_masked_type[0] = enable_wedge_interinter_search(x, cpi); + enable_masked_type[1] = cpi->oxcf.comp_type_cfg.enable_diff_wtd_comp; + for (comp_type = COMPOUND_WEDGE; comp_type <= COMPOUND_DIFFWTD; + comp_type++) { + if ((mode_search_mask & (1 << comp_type)) && + is_interinter_compound_used(comp_type, bsize) && + enable_masked_type[comp_type - COMPOUND_WEDGE]) + valid_comp_types[valid_type_count++] = comp_type; + } + } + return valid_type_count; +} + +// Calculates the cost for compound type mask +static INLINE void calc_masked_type_cost( + const ModeCosts *mode_costs, BLOCK_SIZE bsize, int comp_group_idx_ctx, + int comp_index_ctx, int masked_compound_used, int *masked_type_cost) { + av1_zero_array(masked_type_cost, COMPOUND_TYPES); + // Account for group index cost when wedge and/or diffwtd prediction are + // enabled + if (masked_compound_used) { + // Compound group index of average and distwtd is 0 + // Compound group index of wedge and diffwtd is 1 + masked_type_cost[COMPOUND_AVERAGE] += + mode_costs->comp_group_idx_cost[comp_group_idx_ctx][0]; + masked_type_cost[COMPOUND_DISTWTD] += masked_type_cost[COMPOUND_AVERAGE]; + masked_type_cost[COMPOUND_WEDGE] += + mode_costs->comp_group_idx_cost[comp_group_idx_ctx][1]; + masked_type_cost[COMPOUND_DIFFWTD] += masked_type_cost[COMPOUND_WEDGE]; + } + + // Compute the cost to signal compound index/type + masked_type_cost[COMPOUND_AVERAGE] += + mode_costs->comp_idx_cost[comp_index_ctx][1]; + masked_type_cost[COMPOUND_DISTWTD] += + mode_costs->comp_idx_cost[comp_index_ctx][0]; + masked_type_cost[COMPOUND_WEDGE] += mode_costs->compound_type_cost[bsize][0]; + masked_type_cost[COMPOUND_DIFFWTD] += + mode_costs->compound_type_cost[bsize][1]; +} + +// Updates mbmi structure with the relevant compound type info +static INLINE void update_mbmi_for_compound_type(MB_MODE_INFO *mbmi, + COMPOUND_TYPE cur_type) { + mbmi->interinter_comp.type = cur_type; + mbmi->comp_group_idx = (cur_type >= COMPOUND_WEDGE); + mbmi->compound_idx = (cur_type != COMPOUND_DISTWTD); +} + +// When match is found, populate the compound type data +// and calculate the rd cost using the stored stats and +// update the mbmi appropriately. 
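The helper that follows consumes a hit from find_comp_rd_in_stats(); the overall pattern is a small linear cache keyed by the motion and reference configuration, so a repeated compound search can reuse earlier rd results instead of recomputing them. A simplified, self-contained sketch of the lookup side (the struct is an illustrative stand-in, not the real COMP_RD_STATS):

```c
#include <stdio.h>

// Simplified stand-in for the cached compound search record; the real
// COMP_RD_STATS carries rates, distortions, and model stats per type.
typedef struct {
  int mv0, mv1, ref0, ref1;  // match key (MVs and reference frames)
  long long rd;              // cached result for reuse
} CachedStats;

// Linear scan, as in find_comp_rd_in_stats(): return the first record
// whose key matches, or NULL so the caller computes and stores a new one.
static const CachedStats *find_match(const CachedStats *cache, int n, int mv0,
                                     int mv1, int ref0, int ref1) {
  for (int i = 0; i < n; ++i) {
    if (cache[i].mv0 == mv0 && cache[i].mv1 == mv1 && cache[i].ref0 == ref0 &&
        cache[i].ref1 == ref1)
      return &cache[i];
  }
  return NULL;
}

int main(void) {
  const CachedStats cache[1] = { { 5, -3, 0, 2, 4200 } };
  const CachedStats *hit = find_match(cache, 1, 5, -3, 0, 2);
  printf("cached rd: %lld\n", hit ? hit->rd : -1LL);
  return 0;
}
```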
+static INLINE int populate_reuse_comp_type_data( + const MACROBLOCK *x, MB_MODE_INFO *mbmi, + BEST_COMP_TYPE_STATS *best_type_stats, int_mv *cur_mv, int32_t *comp_rate, + int64_t *comp_dist, int *comp_rs2, int *rate_mv, int64_t *rd, + int match_index) { + const int winner_comp_type = + x->comp_rd_stats[match_index].interinter_comp.type; + if (comp_rate[winner_comp_type] == INT_MAX) + return best_type_stats->best_compmode_interinter_cost; + update_mbmi_for_compound_type(mbmi, winner_comp_type); + mbmi->interinter_comp = x->comp_rd_stats[match_index].interinter_comp; + *rd = RDCOST( + x->rdmult, + comp_rs2[winner_comp_type] + *rate_mv + comp_rate[winner_comp_type], + comp_dist[winner_comp_type]); + mbmi->mv[0].as_int = cur_mv[0].as_int; + mbmi->mv[1].as_int = cur_mv[1].as_int; + return comp_rs2[winner_comp_type]; +} + +// Updates rd cost and relevant compound type data for the best compound type +static INLINE void update_best_info(const MB_MODE_INFO *const mbmi, int64_t *rd, + BEST_COMP_TYPE_STATS *best_type_stats, + int64_t best_rd_cur, + int64_t comp_model_rd_cur, int rs2) { + *rd = best_rd_cur; + best_type_stats->comp_best_model_rd = comp_model_rd_cur; + best_type_stats->best_compound_data = mbmi->interinter_comp; + best_type_stats->best_compmode_interinter_cost = rs2; +} + +// Updates best_mv for masked compound types +static INLINE void update_mask_best_mv(const MB_MODE_INFO *const mbmi, + int_mv *best_mv, int *best_tmp_rate_mv, + int tmp_rate_mv) { + *best_tmp_rate_mv = tmp_rate_mv; + best_mv[0].as_int = mbmi->mv[0].as_int; + best_mv[1].as_int = mbmi->mv[1].as_int; +} + +static INLINE void save_comp_rd_search_stat( + MACROBLOCK *x, const MB_MODE_INFO *const mbmi, const int32_t *comp_rate, + const int64_t *comp_dist, const int32_t *comp_model_rate, + const int64_t *comp_model_dist, const int_mv *cur_mv, const int *comp_rs2) { + const int offset = x->comp_rd_stats_idx; + if (offset < MAX_COMP_RD_STATS) { + COMP_RD_STATS *const rd_stats = x->comp_rd_stats + offset; + memcpy(rd_stats->rate, comp_rate, sizeof(rd_stats->rate)); + memcpy(rd_stats->dist, comp_dist, sizeof(rd_stats->dist)); + memcpy(rd_stats->model_rate, comp_model_rate, sizeof(rd_stats->model_rate)); + memcpy(rd_stats->model_dist, comp_model_dist, sizeof(rd_stats->model_dist)); + memcpy(rd_stats->comp_rs2, comp_rs2, sizeof(rd_stats->comp_rs2)); + memcpy(rd_stats->mv, cur_mv, sizeof(rd_stats->mv)); + memcpy(rd_stats->ref_frames, mbmi->ref_frame, sizeof(rd_stats->ref_frames)); + rd_stats->mode = mbmi->mode; + rd_stats->filter = mbmi->interp_filters; + rd_stats->ref_mv_idx = mbmi->ref_mv_idx; + const MACROBLOCKD *const xd = &x->e_mbd; + for (int i = 0; i < 2; ++i) { + const WarpedMotionParams *const wm = + &xd->global_motion[mbmi->ref_frame[i]]; + rd_stats->is_global[i] = is_global_mv_block(mbmi, wm->wmtype); + } + memcpy(&rd_stats->interinter_comp, &mbmi->interinter_comp, + sizeof(rd_stats->interinter_comp)); + ++x->comp_rd_stats_idx; + } +} + +static INLINE int get_interinter_compound_mask_rate( + const ModeCosts *const mode_costs, const MB_MODE_INFO *const mbmi) { + const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type; + // This function will be called only for COMPOUND_WEDGE and COMPOUND_DIFFWTD + if (compound_type == COMPOUND_WEDGE) { + return av1_is_wedge_used(mbmi->bsize) + ? 
av1_cost_literal(1) + + mode_costs + ->wedge_idx_cost[mbmi->bsize] + [mbmi->interinter_comp.wedge_index] + : 0; + } else { + assert(compound_type == COMPOUND_DIFFWTD); + return av1_cost_literal(1); + } +} + +// Takes a backup of rate, distortion and model_rd for future reuse +static INLINE void backup_stats(COMPOUND_TYPE cur_type, int32_t *comp_rate, + int64_t *comp_dist, int32_t *comp_model_rate, + int64_t *comp_model_dist, int rate_sum, + int64_t dist_sum, RD_STATS *rd_stats, + int *comp_rs2, int rs2) { + comp_rate[cur_type] = rd_stats->rate; + comp_dist[cur_type] = rd_stats->dist; + comp_model_rate[cur_type] = rate_sum; + comp_model_dist[cur_type] = dist_sum; + comp_rs2[cur_type] = rs2; +} + +static INLINE int save_mask_search_results(const PREDICTION_MODE this_mode, + const int reuse_level) { + if (reuse_level || (this_mode == NEW_NEWMV)) + return 1; + else + return 0; +} + +static INLINE int prune_mode_by_skip_rd(const AV1_COMP *const cpi, + MACROBLOCK *x, MACROBLOCKD *xd, + const BLOCK_SIZE bsize, + int64_t ref_skip_rd, int mode_rate) { + int eval_txfm = 1; + const int txfm_rd_gate_level = + get_txfm_rd_gate_level(cpi->common.seq_params->enable_masked_compound, + cpi->sf.inter_sf.txfm_rd_gate_level, bsize, + TX_SEARCH_COMP_TYPE_MODE, /*eval_motion_mode=*/0); + // Check if the mode is good enough based on skip rd + if (txfm_rd_gate_level) { + int64_t sse_y = compute_sse_plane(x, xd, PLANE_TYPE_Y, bsize); + int64_t skip_rd = RDCOST(x->rdmult, mode_rate, (sse_y << 4)); + eval_txfm = + check_txfm_eval(x, bsize, ref_skip_rd, skip_rd, txfm_rd_gate_level, 1); + } + return eval_txfm; +} + +static int64_t masked_compound_type_rd( + const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv, + const BLOCK_SIZE bsize, const PREDICTION_MODE this_mode, int *rs2, + int rate_mv, const BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0, + uint8_t **preds1, int16_t *residual1, int16_t *diff10, int *strides, + int mode_rate, int64_t rd_thresh, int *calc_pred_masked_compound, + int32_t *comp_rate, int64_t *comp_dist, int32_t *comp_model_rate, + int64_t *comp_model_dist, const int64_t comp_best_model_rd, + int64_t *const comp_model_rd_cur, int *comp_rs2, int64_t ref_skip_rd) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + int64_t best_rd_cur = INT64_MAX; + int64_t rd = INT64_MAX; + const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type; + // This function will be called only for COMPOUND_WEDGE and COMPOUND_DIFFWTD + assert(compound_type == COMPOUND_WEDGE || compound_type == COMPOUND_DIFFWTD); + int rate_sum; + uint8_t tmp_skip_txfm_sb; + int64_t dist_sum, tmp_skip_sse_sb; + pick_interinter_mask_type pick_interinter_mask[2] = { pick_interinter_wedge, + pick_interinter_seg }; + + // TODO(any): Save pred and mask calculation as well into records. However + // this may increase memory requirements as compound segment mask needs to be + // stored in each record. 
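+  // preds0/preds1 hold the two single-reference predictions; residual1 is
+  // the source block minus preds1 and diff10 is preds1 minus preds0. These
+  // are the inputs consumed by the wedge/diffwtd mask pickers below.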
+  if (*calc_pred_masked_compound) {
+    get_inter_predictors_masked_compound(x, bsize, preds0, preds1, residual1,
+                                         diff10, strides);
+    *calc_pred_masked_compound = 0;
+  }
+  if (compound_type == COMPOUND_WEDGE) {
+    unsigned int sse;
+    if (is_cur_buf_hbd(xd))
+      (void)cpi->ppi->fn_ptr[bsize].vf(CONVERT_TO_BYTEPTR(*preds0), *strides,
+                                       CONVERT_TO_BYTEPTR(*preds1), *strides,
+                                       &sse);
+    else
+      (void)cpi->ppi->fn_ptr[bsize].vf(*preds0, *strides, *preds1, *strides,
+                                       &sse);
+    const unsigned int mse =
+        ROUND_POWER_OF_TWO(sse, num_pels_log2_lookup[bsize]);
+    // If the two predictors are very similar, skip wedge compound mode search
+    if (mse < 8 || (!have_newmv_in_inter_mode(this_mode) && mse < 64)) {
+      *comp_model_rd_cur = INT64_MAX;
+      return INT64_MAX;
+    }
+  }
+  // Function pointer to pick the appropriate mask
+  // compound_type == COMPOUND_WEDGE, calls pick_interinter_wedge()
+  // compound_type == COMPOUND_DIFFWTD, calls pick_interinter_seg()
+  uint64_t cur_sse = UINT64_MAX;
+  best_rd_cur = pick_interinter_mask[compound_type - COMPOUND_WEDGE](
+      cpi, x, bsize, *preds0, *preds1, residual1, diff10, &cur_sse);
+  *rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi);
+  best_rd_cur += RDCOST(x->rdmult, *rs2 + rate_mv, 0);
+  assert(cur_sse != UINT64_MAX);
+  int64_t skip_rd_cur = RDCOST(x->rdmult, *rs2 + rate_mv, (cur_sse << 4));
+
+  // Although the true rate_mv might be different after motion search, it is
+  // unlikely to be the best mode considering the transform rd cost and other
+  // mode overhead cost
+  int64_t mode_rd = RDCOST(x->rdmult, *rs2 + mode_rate, 0);
+  if (mode_rd > rd_thresh) {
+    *comp_model_rd_cur = INT64_MAX;
+    return INT64_MAX;
+  }
+
+  // Check if the mode is good enough based on skip rd
+  // TODO(nithya): Handle wedge_newmv_search if extending for lower speed
+  // setting
+  const int txfm_rd_gate_level =
+      get_txfm_rd_gate_level(cm->seq_params->enable_masked_compound,
+                             cpi->sf.inter_sf.txfm_rd_gate_level, bsize,
+                             TX_SEARCH_COMP_TYPE_MODE, /*eval_motion_mode=*/0);
+  if (txfm_rd_gate_level) {
+    int eval_txfm = check_txfm_eval(x, bsize, ref_skip_rd, skip_rd_cur,
+                                    txfm_rd_gate_level, 1);
+    if (!eval_txfm) {
+      *comp_model_rd_cur = INT64_MAX;
+      return INT64_MAX;
+    }
+  }
+
+  // Compute the cost if a matching record is not found, else reuse the data
+  if (comp_rate[compound_type] == INT_MAX) {
+    // Check whether new MV search for wedge is to be done
+    int wedge_newmv_search =
+        have_newmv_in_inter_mode(this_mode) &&
+        (compound_type == COMPOUND_WEDGE) &&
+        (!cpi->sf.inter_sf.disable_interinter_wedge_newmv_search);
+
+    // Search for new MV if needed and build the predictor
+    if (wedge_newmv_search) {
+      *out_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv,
+                                                           bsize, this_mode);
+      const int mi_row = xd->mi_row;
+      const int mi_col = xd->mi_col;
+      av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, ctx, bsize,
+                                    AOM_PLANE_Y, AOM_PLANE_Y);
+    } else {
+      *out_rate_mv = rate_mv;
+      av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, strides,
+                                               preds1, strides);
+    }
+    // Get the RD cost from model RD
+    model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
+        cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, &tmp_skip_txfm_sb,
+        &tmp_skip_sse_sb, NULL, NULL, NULL);
+    rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum);
+    *comp_model_rd_cur = rd;
+    // Override with best if current is worse than best for new MV
+    if (wedge_newmv_search) {
+      if (rd >= best_rd_cur) {
+        mbmi->mv[0].as_int = cur_mv[0].as_int;
+        mbmi->mv[1].as_int = cur_mv[1].as_int;
+        *out_rate_mv = rate_mv;
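+        // Rebuild the single-buffer wedge predictor so that it matches the
+        // restored (pre-search) motion vectors.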
av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, + strides, preds1, strides); + *comp_model_rd_cur = best_rd_cur; + } + } + if (cpi->sf.inter_sf.prune_comp_type_by_model_rd && + (*comp_model_rd_cur > comp_best_model_rd) && + comp_best_model_rd != INT64_MAX) { + *comp_model_rd_cur = INT64_MAX; + return INT64_MAX; + } + // Compute RD cost for the current type + RD_STATS rd_stats; + const int64_t tmp_mode_rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv, 0); + const int64_t tmp_rd_thresh = rd_thresh - tmp_mode_rd; + rd = estimate_yrd_for_sb(cpi, bsize, x, tmp_rd_thresh, &rd_stats); + if (rd != INT64_MAX) { + rd = + RDCOST(x->rdmult, *rs2 + *out_rate_mv + rd_stats.rate, rd_stats.dist); + // Backup rate and distortion for future reuse + backup_stats(compound_type, comp_rate, comp_dist, comp_model_rate, + comp_model_dist, rate_sum, dist_sum, &rd_stats, comp_rs2, + *rs2); + } + } else { + // Reuse data as matching record is found + assert(comp_dist[compound_type] != INT64_MAX); + // When disable_interinter_wedge_newmv_search is set, motion refinement is + // disabled. Hence rate and distortion can be reused in this case as well + assert(IMPLIES((have_newmv_in_inter_mode(this_mode) && + (compound_type == COMPOUND_WEDGE)), + cpi->sf.inter_sf.disable_interinter_wedge_newmv_search)); + assert(mbmi->mv[0].as_int == cur_mv[0].as_int); + assert(mbmi->mv[1].as_int == cur_mv[1].as_int); + *out_rate_mv = rate_mv; + // Calculate RD cost based on stored stats + rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + comp_rate[compound_type], + comp_dist[compound_type]); + // Recalculate model rdcost with the updated rate + *comp_model_rd_cur = + RDCOST(x->rdmult, *rs2 + *out_rate_mv + comp_model_rate[compound_type], + comp_model_dist[compound_type]); + } + return rd; +} + +// scaling values to be used for gating wedge/compound segment based on best +// approximate rd +static int comp_type_rd_threshold_mul[3] = { 1, 11, 12 }; +static int comp_type_rd_threshold_div[3] = { 3, 16, 16 }; + +int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, + HandleInterModeArgs *args, BLOCK_SIZE bsize, + int_mv *cur_mv, int mode_search_mask, + int masked_compound_used, const BUFFER_SET *orig_dst, + const BUFFER_SET *tmp_dst, + const CompoundTypeRdBuffers *buffers, int *rate_mv, + int64_t *rd, RD_STATS *rd_stats, int64_t ref_best_rd, + int64_t ref_skip_rd, int *is_luma_interp_done, + int64_t rd_thresh) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + const PREDICTION_MODE this_mode = mbmi->mode; + int ref_frame = av1_ref_frame_type(mbmi->ref_frame); + const int bw = block_size_wide[bsize]; + int rs2; + int_mv best_mv[2]; + int best_tmp_rate_mv = *rate_mv; + BEST_COMP_TYPE_STATS best_type_stats; + // Initializing BEST_COMP_TYPE_STATS + best_type_stats.best_compound_data.type = COMPOUND_AVERAGE; + best_type_stats.best_compmode_interinter_cost = 0; + best_type_stats.comp_best_model_rd = INT64_MAX; + + uint8_t *preds0[1] = { buffers->pred0 }; + uint8_t *preds1[1] = { buffers->pred1 }; + int strides[1] = { bw }; + int tmp_rate_mv; + COMPOUND_TYPE cur_type; + // Local array to store the mask cost for different compound types + int masked_type_cost[COMPOUND_TYPES]; + + int calc_pred_masked_compound = 1; + int64_t comp_dist[COMPOUND_TYPES] = { INT64_MAX, INT64_MAX, INT64_MAX, + INT64_MAX }; + int32_t comp_rate[COMPOUND_TYPES] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX }; + int comp_rs2[COMPOUND_TYPES] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX }; + int32_t 
comp_model_rate[COMPOUND_TYPES] = { INT_MAX, INT_MAX, INT_MAX,
+                                              INT_MAX };
+  int64_t comp_model_dist[COMPOUND_TYPES] = { INT64_MAX, INT64_MAX, INT64_MAX,
+                                              INT64_MAX };
+  int match_index = 0;
+  const int match_found =
+      find_comp_rd_in_stats(cpi, x, mbmi, comp_rate, comp_dist, comp_model_rate,
+                            comp_model_dist, comp_rs2, &match_index);
+  best_mv[0].as_int = cur_mv[0].as_int;
+  best_mv[1].as_int = cur_mv[1].as_int;
+  *rd = INT64_MAX;
+
+  // Local array to store the valid compound types to be evaluated in the core
+  // loop
+  COMPOUND_TYPE valid_comp_types[COMPOUND_TYPES] = {
+    COMPOUND_AVERAGE, COMPOUND_DISTWTD, COMPOUND_WEDGE, COMPOUND_DIFFWTD
+  };
+  int valid_type_count = 0;
+  // compute_valid_comp_types() returns the number of valid compound types to
+  // be evaluated and populates them in the local array valid_comp_types[].
+  valid_type_count = compute_valid_comp_types(
+      x, cpi, bsize, masked_compound_used, mode_search_mask, valid_comp_types);
+
+  // The following context indices are independent of compound type
+  const int comp_group_idx_ctx = get_comp_group_idx_context(xd);
+  const int comp_index_ctx = get_comp_index_context(cm, xd);
+
+  // Populates the masked_type_cost local array for the 4 compound types
+  calc_masked_type_cost(&x->mode_costs, bsize, comp_group_idx_ctx,
+                        comp_index_ctx, masked_compound_used, masked_type_cost);
+
+  int64_t comp_model_rd_cur = INT64_MAX;
+  int64_t best_rd_cur = ref_best_rd;
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+
+  // If a match is found, calculate the rd cost using the
+  // stored stats and update the mbmi appropriately.
+  if (match_found && cpi->sf.inter_sf.reuse_compound_type_decision) {
+    return populate_reuse_comp_type_data(x, mbmi, &best_type_stats, cur_mv,
+                                         comp_rate, comp_dist, comp_rs2,
+                                         rate_mv, rd, match_index);
+  }
+
+  // If COMPOUND_AVERAGE is not valid, use the spare buffer
+  if (valid_comp_types[0] != COMPOUND_AVERAGE) restore_dst_buf(xd, *tmp_dst, 1);
+
+  // Loop over valid compound types
+  for (int i = 0; i < valid_type_count; i++) {
+    cur_type = valid_comp_types[i];
+
+    if (args->cmp_mode[ref_frame] == COMPOUND_AVERAGE) {
+      if (cur_type == COMPOUND_WEDGE) continue;
+    }
+
+    comp_model_rd_cur = INT64_MAX;
+    tmp_rate_mv = *rate_mv;
+    best_rd_cur = INT64_MAX;
+    ref_best_rd = AOMMIN(ref_best_rd, *rd);
+    update_mbmi_for_compound_type(mbmi, cur_type);
+    rs2 = masked_type_cost[cur_type];
+
+    int64_t mode_rd = RDCOST(x->rdmult, rs2 + rd_stats->rate, 0);
+    if (mode_rd >= ref_best_rd) continue;
+
+    // Derive the flags to indicate enabling/disabling of the MV refinement
+    // process.
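+    // (Levels 2 and 3 of enable_fast_compound_mode_search skip the refinement
+    // for COMPOUND_AVERAGE/COMPOUND_DISTWTD, with level 2 keeping it only for
+    // NEW_NEWMV; level 0 skips it for COMPOUND_DIFFWTD.)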
+ const int enable_fast_compound_mode_search = + cpi->sf.inter_sf.enable_fast_compound_mode_search; + const bool skip_mv_refinement_for_avg_distwtd = + enable_fast_compound_mode_search == 3 || + (enable_fast_compound_mode_search == 2 && (this_mode != NEW_NEWMV)); + const bool skip_mv_refinement_for_diffwtd = + (!enable_fast_compound_mode_search && cur_type == COMPOUND_DIFFWTD); + + // Case COMPOUND_AVERAGE and COMPOUND_DISTWTD + if (cur_type < COMPOUND_WEDGE) { + if (skip_mv_refinement_for_avg_distwtd) { + int rate_sum; + uint8_t tmp_skip_txfm_sb; + int64_t dist_sum, tmp_skip_sse_sb; + + // Reuse data if matching record is found + if (comp_rate[cur_type] == INT_MAX) { + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + if (cur_type == COMPOUND_AVERAGE) *is_luma_interp_done = 1; + // Compute RD cost for the current type + RD_STATS est_rd_stats; + const int64_t tmp_rd_thresh = AOMMIN(*rd, rd_thresh) - mode_rd; + int64_t est_rd = INT64_MAX; + int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd, + rs2 + *rate_mv); + // Evaluate further if skip rd is low enough + if (eval_txfm) { + est_rd = estimate_yrd_for_sb(cpi, bsize, x, tmp_rd_thresh, + &est_rd_stats); + } + if (est_rd != INT64_MAX) { + best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + est_rd_stats.rate, + est_rd_stats.dist); + model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND]( + cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL); + comp_model_rd_cur = + RDCOST(x->rdmult, rs2 + *rate_mv + rate_sum, dist_sum); + // Backup rate and distortion for future reuse + backup_stats(cur_type, comp_rate, comp_dist, comp_model_rate, + comp_model_dist, rate_sum, dist_sum, &est_rd_stats, + comp_rs2, rs2); + } + } else { + // Calculate RD cost based on stored stats + assert(comp_dist[cur_type] != INT64_MAX); + best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + comp_rate[cur_type], + comp_dist[cur_type]); + // Recalculate model rdcost with the updated rate + comp_model_rd_cur = + RDCOST(x->rdmult, rs2 + *rate_mv + comp_model_rate[cur_type], + comp_model_dist[cur_type]); + } + } else { + tmp_rate_mv = *rate_mv; + if (have_newmv_in_inter_mode(this_mode)) { + InterPredParams inter_pred_params; + av1_dist_wtd_comp_weight_assign( + &cpi->common, mbmi, &inter_pred_params.conv_params.fwd_offset, + &inter_pred_params.conv_params.bck_offset, + &inter_pred_params.conv_params.use_dist_wtd_comp_avg, 1); + int mask_value = inter_pred_params.conv_params.fwd_offset * 4; + memset(xd->seg_mask, mask_value, + sizeof(xd->seg_mask[0]) * 2 * MAX_SB_SQUARE); + tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv, + bsize, this_mode); + } + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + if (cur_type == COMPOUND_AVERAGE) *is_luma_interp_done = 1; + + int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd, + rs2 + *rate_mv); + if (eval_txfm) { + RD_STATS est_rd_stats; + estimate_yrd_for_sb(cpi, bsize, x, INT64_MAX, &est_rd_stats); + + best_rd_cur = RDCOST(x->rdmult, rs2 + tmp_rate_mv + est_rd_stats.rate, + est_rd_stats.dist); + } + } + + // use spare buffer for following compound type try + if (cur_type == COMPOUND_AVERAGE) restore_dst_buf(xd, *tmp_dst, 1); + } else if (cur_type == COMPOUND_WEDGE) { + int best_mask_index = 0; + int best_wedge_sign = 0; + int_mv tmp_mv[2] = { mbmi->mv[0], mbmi->mv[1] }; + int best_rs2 = 0; + int best_rate_mv = *rate_mv; + int wedge_mask_size = 
get_wedge_types_lookup(bsize); + int need_mask_search = args->wedge_index == -1; + int wedge_newmv_search = + have_newmv_in_inter_mode(this_mode) && + !cpi->sf.inter_sf.disable_interinter_wedge_newmv_search; + + if (need_mask_search && !wedge_newmv_search) { + // short cut repeated single reference block build + av1_build_inter_predictors_for_planes_single_buf(xd, bsize, 0, 0, 0, + preds0, strides); + av1_build_inter_predictors_for_planes_single_buf(xd, bsize, 0, 0, 1, + preds1, strides); + } + + for (int wedge_mask = 0; wedge_mask < wedge_mask_size && need_mask_search; + ++wedge_mask) { + for (int wedge_sign = 0; wedge_sign < 2; ++wedge_sign) { + tmp_rate_mv = *rate_mv; + mbmi->interinter_comp.wedge_index = wedge_mask; + mbmi->interinter_comp.wedge_sign = wedge_sign; + rs2 = masked_type_cost[cur_type]; + rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi); + + mode_rd = RDCOST(x->rdmult, rs2 + rd_stats->rate, 0); + if (mode_rd >= ref_best_rd / 2) continue; + + if (wedge_newmv_search) { + tmp_rate_mv = av1_interinter_compound_motion_search( + cpi, x, cur_mv, bsize, this_mode); + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, + bsize, AOM_PLANE_Y, AOM_PLANE_Y); + } else { + av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, + strides, preds1, strides); + } + + RD_STATS est_rd_stats; + int64_t this_rd_cur = INT64_MAX; + int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd, + rs2 + *rate_mv); + if (eval_txfm) { + this_rd_cur = estimate_yrd_for_sb( + cpi, bsize, x, AOMMIN(best_rd_cur, ref_best_rd), &est_rd_stats); + } + if (this_rd_cur < INT64_MAX) { + this_rd_cur = + RDCOST(x->rdmult, rs2 + tmp_rate_mv + est_rd_stats.rate, + est_rd_stats.dist); + } + if (this_rd_cur < best_rd_cur) { + best_mask_index = wedge_mask; + best_wedge_sign = wedge_sign; + best_rd_cur = this_rd_cur; + tmp_mv[0] = mbmi->mv[0]; + tmp_mv[1] = mbmi->mv[1]; + best_rate_mv = tmp_rate_mv; + best_rs2 = rs2; + } + } + // Consider the asymmetric partitions for oblique angle only if the + // corresponding symmetric partition is the best so far. + // Note: For horizontal and vertical types, both symmetric and + // asymmetric partitions are always considered. + if (cpi->sf.inter_sf.enable_fast_wedge_mask_search) { + // The first 4 entries in wedge_codebook_16_heqw/hltw/hgtw[16] + // correspond to symmetric partitions of the 4 oblique angles, the + // next 4 entries correspond to the vertical/horizontal + // symmetric/asymmetric partitions and the last 8 entries correspond + // to the asymmetric partitions of oblique types. + const int idx_before_asym_oblique = 7; + const int last_oblique_sym_idx = 3; + if (wedge_mask == idx_before_asym_oblique) { + if (best_mask_index > last_oblique_sym_idx) { + break; + } else { + // Asymmetric (Index-1) map for the corresponding oblique masks. 
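+          // The value stored below is (first asymmetric index - 1); with
+          // wedge_mask_size = wedge_mask + 3, the loop's ++wedge_mask then
+          // visits exactly the two asymmetric masks of the best oblique type.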
+ // WEDGE_OBLIQUE27: sym - 0, asym - 8, 9 + // WEDGE_OBLIQUE63: sym - 1, asym - 12, 13 + // WEDGE_OBLIQUE117: sym - 2, asym - 14, 15 + // WEDGE_OBLIQUE153: sym - 3, asym - 10, 11 + const int asym_mask_idx[4] = { 7, 11, 13, 9 }; + wedge_mask = asym_mask_idx[best_mask_index]; + wedge_mask_size = wedge_mask + 3; + } + } + } + } + + if (need_mask_search) { + if (save_mask_search_results( + this_mode, cpi->sf.inter_sf.reuse_mask_search_results)) { + args->wedge_index = best_mask_index; + args->wedge_sign = best_wedge_sign; + } + } else { + mbmi->interinter_comp.wedge_index = args->wedge_index; + mbmi->interinter_comp.wedge_sign = args->wedge_sign; + rs2 = masked_type_cost[cur_type]; + rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi); + + if (wedge_newmv_search) { + tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv, + bsize, this_mode); + } + + best_mask_index = args->wedge_index; + best_wedge_sign = args->wedge_sign; + tmp_mv[0] = mbmi->mv[0]; + tmp_mv[1] = mbmi->mv[1]; + best_rate_mv = tmp_rate_mv; + best_rs2 = masked_type_cost[cur_type]; + best_rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi); + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd, + best_rs2 + *rate_mv); + if (eval_txfm) { + RD_STATS est_rd_stats; + estimate_yrd_for_sb(cpi, bsize, x, INT64_MAX, &est_rd_stats); + best_rd_cur = + RDCOST(x->rdmult, best_rs2 + tmp_rate_mv + est_rd_stats.rate, + est_rd_stats.dist); + } + } + + mbmi->interinter_comp.wedge_index = best_mask_index; + mbmi->interinter_comp.wedge_sign = best_wedge_sign; + mbmi->mv[0] = tmp_mv[0]; + mbmi->mv[1] = tmp_mv[1]; + tmp_rate_mv = best_rate_mv; + rs2 = best_rs2; + } else if (skip_mv_refinement_for_diffwtd) { + int_mv tmp_mv[2]; + int best_mask_index = 0; + rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi); + + int need_mask_search = args->diffwtd_index == -1; + + for (int mask_index = 0; mask_index < 2 && need_mask_search; + ++mask_index) { + tmp_rate_mv = *rate_mv; + mbmi->interinter_comp.mask_type = mask_index; + if (have_newmv_in_inter_mode(this_mode)) { + // hard coded number for diff wtd + int mask_value = mask_index == 0 ? 38 : 26; + memset(xd->seg_mask, mask_value, + sizeof(xd->seg_mask[0]) * 2 * MAX_SB_SQUARE); + tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv, + bsize, this_mode); + } + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + RD_STATS est_rd_stats; + int64_t this_rd_cur = INT64_MAX; + int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd, + rs2 + *rate_mv); + if (eval_txfm) { + this_rd_cur = + estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &est_rd_stats); + } + if (this_rd_cur < INT64_MAX) { + this_rd_cur = RDCOST(x->rdmult, rs2 + tmp_rate_mv + est_rd_stats.rate, + est_rd_stats.dist); + } + + if (this_rd_cur < best_rd_cur) { + best_rd_cur = this_rd_cur; + best_mask_index = mbmi->interinter_comp.mask_type; + tmp_mv[0] = mbmi->mv[0]; + tmp_mv[1] = mbmi->mv[1]; + } + } + + if (need_mask_search) { + if (save_mask_search_results(this_mode, 0)) + args->diffwtd_index = best_mask_index; + } else { + mbmi->interinter_comp.mask_type = args->diffwtd_index; + rs2 = masked_type_cost[cur_type]; + rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi); + + int mask_value = mbmi->interinter_comp.mask_type == 0 ? 
38 : 26; + memset(xd->seg_mask, mask_value, + sizeof(xd->seg_mask[0]) * 2 * MAX_SB_SQUARE); + + if (have_newmv_in_inter_mode(this_mode)) { + tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv, + bsize, this_mode); + } + best_mask_index = mbmi->interinter_comp.mask_type; + tmp_mv[0] = mbmi->mv[0]; + tmp_mv[1] = mbmi->mv[1]; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + RD_STATS est_rd_stats; + int64_t this_rd_cur = INT64_MAX; + int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd, + rs2 + *rate_mv); + if (eval_txfm) { + this_rd_cur = + estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &est_rd_stats); + } + if (this_rd_cur < INT64_MAX) { + best_rd_cur = RDCOST(x->rdmult, rs2 + tmp_rate_mv + est_rd_stats.rate, + est_rd_stats.dist); + } + } + + mbmi->interinter_comp.mask_type = best_mask_index; + mbmi->mv[0] = tmp_mv[0]; + mbmi->mv[1] = tmp_mv[1]; + } else { + // Handle masked compound types + bool eval_masked_comp_type = true; + if (*rd != INT64_MAX) { + // Factors to control gating of compound type selection based on best + // approximate rd so far + const int max_comp_type_rd_threshold_mul = + comp_type_rd_threshold_mul[cpi->sf.inter_sf + .prune_comp_type_by_comp_avg]; + const int max_comp_type_rd_threshold_div = + comp_type_rd_threshold_div[cpi->sf.inter_sf + .prune_comp_type_by_comp_avg]; + // Evaluate COMPOUND_WEDGE / COMPOUND_DIFFWTD if approximated cost is + // within threshold + const int64_t approx_rd = ((*rd / max_comp_type_rd_threshold_div) * + max_comp_type_rd_threshold_mul); + if (approx_rd >= ref_best_rd) eval_masked_comp_type = false; + } + + if (eval_masked_comp_type) { + const int64_t tmp_rd_thresh = AOMMIN(*rd, rd_thresh); + best_rd_cur = masked_compound_type_rd( + cpi, x, cur_mv, bsize, this_mode, &rs2, *rate_mv, orig_dst, + &tmp_rate_mv, preds0, preds1, buffers->residual1, buffers->diff10, + strides, rd_stats->rate, tmp_rd_thresh, &calc_pred_masked_compound, + comp_rate, comp_dist, comp_model_rate, comp_model_dist, + best_type_stats.comp_best_model_rd, &comp_model_rd_cur, comp_rs2, + ref_skip_rd); + } + } + + // Update stats for best compound type + if (best_rd_cur < *rd) { + update_best_info(mbmi, rd, &best_type_stats, best_rd_cur, + comp_model_rd_cur, rs2); + if (have_newmv_in_inter_mode(this_mode)) + update_mask_best_mv(mbmi, best_mv, &best_tmp_rate_mv, tmp_rate_mv); + } + // reset to original mvs for next iteration + mbmi->mv[0].as_int = cur_mv[0].as_int; + mbmi->mv[1].as_int = cur_mv[1].as_int; + } + + mbmi->comp_group_idx = + (best_type_stats.best_compound_data.type < COMPOUND_WEDGE) ? 
0 : 1; + mbmi->compound_idx = + !(best_type_stats.best_compound_data.type == COMPOUND_DISTWTD); + mbmi->interinter_comp = best_type_stats.best_compound_data; + + if (have_newmv_in_inter_mode(this_mode)) { + mbmi->mv[0].as_int = best_mv[0].as_int; + mbmi->mv[1].as_int = best_mv[1].as_int; + rd_stats->rate += best_tmp_rate_mv - *rate_mv; + *rate_mv = best_tmp_rate_mv; + } + + if (this_mode == NEW_NEWMV) + args->cmp_mode[ref_frame] = mbmi->interinter_comp.type; + + restore_dst_buf(xd, *orig_dst, 1); + if (!match_found) + save_comp_rd_search_stat(x, mbmi, comp_rate, comp_dist, comp_model_rate, + comp_model_dist, cur_mv, comp_rs2); + return best_type_stats.best_compmode_interinter_cost; +} diff --git a/third_party/aom/av1/encoder/compound_type.h b/third_party/aom/av1/encoder/compound_type.h new file mode 100644 index 0000000000..a028a35093 --- /dev/null +++ b/third_party/aom/av1/encoder/compound_type.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_COMPOUND_TYPE_H_ +#define AOM_AV1_ENCODER_COMPOUND_TYPE_H_ + +#include "av1/encoder/encoder.h" +#include "av1/encoder/interp_search.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Structure to store the compound type related stats for best compound type +typedef struct { + INTERINTER_COMPOUND_DATA best_compound_data; + int64_t comp_best_model_rd; + int best_compmode_interinter_cost; +} BEST_COMP_TYPE_STATS; + +#define IGNORE_MODE -1 +// Searches for the best inter-intra mode. Returns IGNORE_MODE if no good mode +// is found, 0 otherwise. +int av1_handle_inter_intra_mode(const AV1_COMP *const cpi, MACROBLOCK *const x, + BLOCK_SIZE bsize, MB_MODE_INFO *mbmi, + HandleInterModeArgs *args, int64_t ref_best_rd, + int *rate_mv, int *tmp_rate2, + const BUFFER_SET *orig_dst); + +int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, + HandleInterModeArgs *args, BLOCK_SIZE bsize, + int_mv *cur_mv, int mode_search_mask, + int masked_compound_used, const BUFFER_SET *orig_dst, + const BUFFER_SET *tmp_dst, + const CompoundTypeRdBuffers *buffers, int *rate_mv, + int64_t *rd, RD_STATS *rd_stats, int64_t ref_best_rd, + int64_t ref_skip_rd, int *is_luma_interp_done, + int64_t rd_thresh); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_COMPOUND_TYPE_H_ diff --git a/third_party/aom/av1/encoder/context_tree.c b/third_party/aom/av1/encoder/context_tree.c new file mode 100644 index 0000000000..aafe55d2d0 --- /dev/null +++ b/third_party/aom/av1/encoder/context_tree.c @@ -0,0 +1,311 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/rd.h"
+#include <assert.h>
+
+void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx,
+                           PICK_MODE_CONTEXT *src_ctx) {
+  dst_ctx->mic = src_ctx->mic;
+  dst_ctx->mbmi_ext_best = src_ctx->mbmi_ext_best;
+
+  dst_ctx->num_4x4_blk = src_ctx->num_4x4_blk;
+  dst_ctx->skippable = src_ctx->skippable;
+#if CONFIG_INTERNAL_STATS
+  dst_ctx->best_mode_index = src_ctx->best_mode_index;
+#endif  // CONFIG_INTERNAL_STATS
+
+  memcpy(dst_ctx->blk_skip, src_ctx->blk_skip,
+         sizeof(uint8_t) * src_ctx->num_4x4_blk);
+  av1_copy_array(dst_ctx->tx_type_map, src_ctx->tx_type_map,
+                 src_ctx->num_4x4_blk);
+
+  dst_ctx->rd_stats = src_ctx->rd_stats;
+  dst_ctx->rd_mode_is_ready = src_ctx->rd_mode_is_ready;
+}
+
+void av1_setup_shared_coeff_buffer(const SequenceHeader *const seq_params,
+                                   PC_TREE_SHARED_BUFFERS *shared_bufs,
+                                   struct aom_internal_error_info *error) {
+  const int num_planes = seq_params->monochrome ? 1 : MAX_MB_PLANE;
+  const int max_sb_square_y = 1 << num_pels_log2_lookup[seq_params->sb_size];
+  const int max_sb_square_uv = max_sb_square_y >> (seq_params->subsampling_x +
+                                                   seq_params->subsampling_y);
+  for (int i = 0; i < num_planes; i++) {
+    const int max_num_pix =
+        (i == AOM_PLANE_Y) ? max_sb_square_y : max_sb_square_uv;
+    AOM_CHECK_MEM_ERROR(error, shared_bufs->coeff_buf[i],
+                        aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
+    AOM_CHECK_MEM_ERROR(error, shared_bufs->qcoeff_buf[i],
+                        aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
+    AOM_CHECK_MEM_ERROR(error, shared_bufs->dqcoeff_buf[i],
+                        aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
+  }
+}
+
+void av1_free_shared_coeff_buffer(PC_TREE_SHARED_BUFFERS *shared_bufs) {
+  for (int i = 0; i < 3; i++) {
+    aom_free(shared_bufs->coeff_buf[i]);
+    aom_free(shared_bufs->qcoeff_buf[i]);
+    aom_free(shared_bufs->dqcoeff_buf[i]);
+    shared_bufs->coeff_buf[i] = NULL;
+    shared_bufs->qcoeff_buf[i] = NULL;
+    shared_bufs->dqcoeff_buf[i] = NULL;
+  }
+}
+
+PICK_MODE_CONTEXT *av1_alloc_pmc(const struct AV1_COMP *const cpi,
+                                 BLOCK_SIZE bsize,
+                                 PC_TREE_SHARED_BUFFERS *shared_bufs) {
+  PICK_MODE_CONTEXT *volatile ctx = NULL;
+  const AV1_COMMON *const cm = &cpi->common;
+  struct aom_internal_error_info error;
+
+  if (setjmp(error.jmp)) {
+    av1_free_pmc(ctx, av1_num_planes(cm));
+    return NULL;
+  }
+  error.setjmp = 1;
+
+  AOM_CHECK_MEM_ERROR(&error, ctx, aom_calloc(1, sizeof(*ctx)));
+  ctx->rd_mode_is_ready = 0;
+
+  const int num_planes = av1_num_planes(cm);
+  const int num_pix = block_size_wide[bsize] * block_size_high[bsize];
+  const int num_blk = num_pix / 16;
+
+  AOM_CHECK_MEM_ERROR(&error, ctx->blk_skip,
+                      aom_calloc(num_blk, sizeof(*ctx->blk_skip)));
+  AOM_CHECK_MEM_ERROR(&error, ctx->tx_type_map,
+                      aom_calloc(num_blk, sizeof(*ctx->tx_type_map)));
+  ctx->num_4x4_blk = num_blk;
+
+  for (int i = 0; i < num_planes; ++i) {
+    ctx->coeff[i] = shared_bufs->coeff_buf[i];
+    ctx->qcoeff[i] = shared_bufs->qcoeff_buf[i];
+    ctx->dqcoeff[i] = shared_bufs->dqcoeff_buf[i];
+    AOM_CHECK_MEM_ERROR(&error, ctx->eobs[i],
+                        aom_memalign(32, num_blk * sizeof(*ctx->eobs[i])));
+    AOM_CHECK_MEM_ERROR(
+        &error, ctx->txb_entropy_ctx[i],
+        aom_memalign(32, num_blk * sizeof(*ctx->txb_entropy_ctx[i])));
+  }
+
+  if (num_pix <= MAX_PALETTE_SQUARE) {
+    for (int i = 0; i < 2; ++i) {
+      if (cm->features.allow_screen_content_tools) {
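+        // Palette mode is reachable only when screen content tools are
+        // allowed, so the color index maps are needed just for that case.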
+ AOM_CHECK_MEM_ERROR( + &error, ctx->color_index_map[i], + aom_memalign(32, num_pix * sizeof(*ctx->color_index_map[i]))); + } else { + ctx->color_index_map[i] = NULL; + } + } + } + + av1_invalid_rd_stats(&ctx->rd_stats); + + return ctx; +} + +void av1_reset_pmc(PICK_MODE_CONTEXT *ctx) { + av1_zero_array(ctx->blk_skip, ctx->num_4x4_blk); + av1_zero_array(ctx->tx_type_map, ctx->num_4x4_blk); + av1_invalid_rd_stats(&ctx->rd_stats); +} + +void av1_free_pmc(PICK_MODE_CONTEXT *ctx, int num_planes) { + if (ctx == NULL) return; + + aom_free(ctx->blk_skip); + ctx->blk_skip = NULL; + aom_free(ctx->tx_type_map); + for (int i = 0; i < num_planes; ++i) { + ctx->coeff[i] = NULL; + ctx->qcoeff[i] = NULL; + ctx->dqcoeff[i] = NULL; + aom_free(ctx->eobs[i]); + ctx->eobs[i] = NULL; + aom_free(ctx->txb_entropy_ctx[i]); + ctx->txb_entropy_ctx[i] = NULL; + } + + for (int i = 0; i < 2; ++i) { + if (ctx->color_index_map[i]) { + aom_free(ctx->color_index_map[i]); + ctx->color_index_map[i] = NULL; + } + } + + aom_free(ctx); +} + +PC_TREE *av1_alloc_pc_tree_node(BLOCK_SIZE bsize) { + PC_TREE *pc_tree = aom_calloc(1, sizeof(*pc_tree)); + if (pc_tree == NULL) return NULL; + + pc_tree->partitioning = PARTITION_NONE; + pc_tree->block_size = bsize; + + return pc_tree; +} + +#define FREE_PMC_NODE(CTX) \ + do { \ + av1_free_pmc(CTX, num_planes); \ + CTX = NULL; \ + } while (0) + +void av1_free_pc_tree_recursive(PC_TREE *pc_tree, int num_planes, int keep_best, + int keep_none, + PARTITION_SEARCH_TYPE partition_search_type) { + if (pc_tree == NULL) return; + + // Avoid freeing of extended partitions as they are not supported when + // partition_search_type is VAR_BASED_PARTITION. + if (partition_search_type == VAR_BASED_PARTITION && !keep_best && + !keep_none) { + FREE_PMC_NODE(pc_tree->none); + + for (int i = 0; i < 2; ++i) { + FREE_PMC_NODE(pc_tree->horizontal[i]); + FREE_PMC_NODE(pc_tree->vertical[i]); + } + +#if !defined(NDEBUG) && !CONFIG_REALTIME_ONLY + for (int i = 0; i < 3; ++i) { + assert(pc_tree->horizontala[i] == NULL); + assert(pc_tree->horizontalb[i] == NULL); + assert(pc_tree->verticala[i] == NULL); + assert(pc_tree->verticalb[i] == NULL); + } + for (int i = 0; i < 4; ++i) { + assert(pc_tree->horizontal4[i] == NULL); + assert(pc_tree->vertical4[i] == NULL); + } +#endif + + for (int i = 0; i < 4; ++i) { + if (pc_tree->split[i] != NULL) { + av1_free_pc_tree_recursive(pc_tree->split[i], num_planes, 0, 0, + partition_search_type); + pc_tree->split[i] = NULL; + } + } + aom_free(pc_tree); + return; + } + + const PARTITION_TYPE partition = pc_tree->partitioning; + + if (!keep_none && (!keep_best || (partition != PARTITION_NONE))) + FREE_PMC_NODE(pc_tree->none); + + for (int i = 0; i < 2; ++i) { + if (!keep_best || (partition != PARTITION_HORZ)) + FREE_PMC_NODE(pc_tree->horizontal[i]); + if (!keep_best || (partition != PARTITION_VERT)) + FREE_PMC_NODE(pc_tree->vertical[i]); + } +#if !CONFIG_REALTIME_ONLY + for (int i = 0; i < 3; ++i) { + if (!keep_best || (partition != PARTITION_HORZ_A)) + FREE_PMC_NODE(pc_tree->horizontala[i]); + if (!keep_best || (partition != PARTITION_HORZ_B)) + FREE_PMC_NODE(pc_tree->horizontalb[i]); + if (!keep_best || (partition != PARTITION_VERT_A)) + FREE_PMC_NODE(pc_tree->verticala[i]); + if (!keep_best || (partition != PARTITION_VERT_B)) + FREE_PMC_NODE(pc_tree->verticalb[i]); + } + for (int i = 0; i < 4; ++i) { + if (!keep_best || (partition != PARTITION_HORZ_4)) + FREE_PMC_NODE(pc_tree->horizontal4[i]); + if (!keep_best || (partition != PARTITION_VERT_4)) + 
FREE_PMC_NODE(pc_tree->vertical4[i]);
+  }
+#endif
+  if (!keep_best || (partition != PARTITION_SPLIT)) {
+    for (int i = 0; i < 4; ++i) {
+      if (pc_tree->split[i] != NULL) {
+        av1_free_pc_tree_recursive(pc_tree->split[i], num_planes, 0, 0,
+                                   partition_search_type);
+        pc_tree->split[i] = NULL;
+      }
+    }
+  }
+
+  if (!keep_best && !keep_none) aom_free(pc_tree);
+}
+
+int av1_setup_sms_tree(AV1_COMP *const cpi, ThreadData *td) {
+  // The structure 'sms_tree' is used to store the simple motion search data
+  // for partition pruning in inter frames. Hence, the memory allocations and
+  // initializations related to it are avoided for allintra encoding mode.
+  if (cpi->oxcf.kf_cfg.key_freq_max == 0) return 0;
+
+  AV1_COMMON *const cm = &cpi->common;
+  const int stat_generation_stage = is_stat_generation_stage(cpi);
+  const int is_sb_size_128 = cm->seq_params->sb_size == BLOCK_128X128;
+  const int tree_nodes =
+      av1_get_pc_tree_nodes(is_sb_size_128, stat_generation_stage);
+  int sms_tree_index = 0;
+  SIMPLE_MOTION_DATA_TREE *this_sms;
+  int square_index = 1;
+  int nodes;
+
+  aom_free(td->sms_tree);
+  td->sms_tree =
+      (SIMPLE_MOTION_DATA_TREE *)aom_calloc(tree_nodes, sizeof(*td->sms_tree));
+  if (!td->sms_tree) return -1;
+  this_sms = &td->sms_tree[0];
+
+  if (!stat_generation_stage) {
+    const int leaf_factor = is_sb_size_128 ? 4 : 1;
+    const int leaf_nodes = 256 * leaf_factor;
+
+    // Sets up all the leaf nodes in the tree.
+    for (sms_tree_index = 0; sms_tree_index < leaf_nodes; ++sms_tree_index) {
+      SIMPLE_MOTION_DATA_TREE *const tree = &td->sms_tree[sms_tree_index];
+      tree->block_size = square[0];
+    }
+
+    // Each node has 4 leaf nodes, fill each block_size level of the tree
+    // from the leaves to the root.
+    for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) {
+      for (int i = 0; i < nodes; ++i) {
+        SIMPLE_MOTION_DATA_TREE *const tree = &td->sms_tree[sms_tree_index];
+        tree->block_size = square[square_index];
+        for (int j = 0; j < 4; j++) tree->split[j] = this_sms++;
+        ++sms_tree_index;
+      }
+      ++square_index;
+    }
+  } else {
+    // Allocation for the firstpass/LAP stage
+    // TODO(Mufaddal): refactor square_index to use a common block_size macro
+    // from firstpass.c
+    SIMPLE_MOTION_DATA_TREE *const tree = &td->sms_tree[sms_tree_index];
+    square_index = 2;
+    tree->block_size = square[square_index];
+  }
+
+  // Set up the root node for the largest superblock size
+  td->sms_root = &td->sms_tree[tree_nodes - 1];
+  return 0;
+}
+
+void av1_free_sms_tree(ThreadData *td) {
+  aom_free(td->sms_tree);
+  td->sms_tree = NULL;
+}
diff --git a/third_party/aom/av1/encoder/context_tree.h b/third_party/aom/av1/encoder/context_tree.h
new file mode 100644
index 0000000000..0be7ccbb54
--- /dev/null
+++ b/third_party/aom/av1/encoder/context_tree.h
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_CONTEXT_TREE_H_
+#define AOM_AV1_ENCODER_CONTEXT_TREE_H_
+
+#include "config/aom_config.h"
+
+#include "av1/common/blockd.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/speed_features.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1_PRIMARY;
+struct AV1_COMP;
+struct AV1Common;
+struct ThreadData;
+
+typedef struct {
+  tran_low_t *coeff_buf[MAX_MB_PLANE];
+  tran_low_t *qcoeff_buf[MAX_MB_PLANE];
+  tran_low_t *dqcoeff_buf[MAX_MB_PLANE];
+} PC_TREE_SHARED_BUFFERS;
+
+// Structure to hold a snapshot of the coding context during the mode picking
+// process
+typedef struct PICK_MODE_CONTEXT {
+  MB_MODE_INFO mic;
+  MB_MODE_INFO_EXT_FRAME mbmi_ext_best;
+  uint8_t *color_index_map[2];
+  uint8_t *blk_skip;
+
+  tran_low_t *coeff[MAX_MB_PLANE];
+  tran_low_t *qcoeff[MAX_MB_PLANE];
+  tran_low_t *dqcoeff[MAX_MB_PLANE];
+  uint16_t *eobs[MAX_MB_PLANE];
+  uint8_t *txb_entropy_ctx[MAX_MB_PLANE];
+  uint8_t *tx_type_map;
+
+  int num_4x4_blk;
+  // For the current partition, skippable is set to 1 only if all Y, U, and V
+  // transform blocks' coefficients are quantized to 0.
+  int skippable;
+#if CONFIG_INTERNAL_STATS
+  THR_MODES best_mode_index;
+#endif  // CONFIG_INTERNAL_STATS
+  RD_STATS rd_stats;
+
+  int rd_mode_is_ready;  // Flag to indicate whether the rd pick mode decision
+                         // has been made.
+#if CONFIG_AV1_TEMPORAL_DENOISING
+  int64_t newmv_sse;
+  int64_t zeromv_sse;
+  int64_t zeromv_lastref_sse;
+  PREDICTION_MODE best_sse_inter_mode;
+  int_mv best_sse_mv;
+  MV_REFERENCE_FRAME best_reference_frame;
+  MV_REFERENCE_FRAME best_zeromv_reference_frame;
+  int sb_skip_denoising;
+#endif
+} PICK_MODE_CONTEXT;
+
+typedef struct PC_TREE {
+  PARTITION_TYPE partitioning;
+  BLOCK_SIZE block_size;
+  PICK_MODE_CONTEXT *none;
+  PICK_MODE_CONTEXT *horizontal[2];
+  PICK_MODE_CONTEXT *vertical[2];
+#if !CONFIG_REALTIME_ONLY
+  PICK_MODE_CONTEXT *horizontala[3];
+  PICK_MODE_CONTEXT *horizontalb[3];
+  PICK_MODE_CONTEXT *verticala[3];
+  PICK_MODE_CONTEXT *verticalb[3];
+  PICK_MODE_CONTEXT *horizontal4[4];
+  PICK_MODE_CONTEXT *vertical4[4];
+#endif
+  struct PC_TREE *split[4];
+  int index;
+} PC_TREE;
+
+typedef struct SIMPLE_MOTION_DATA_TREE {
+  BLOCK_SIZE block_size;
+  PARTITION_TYPE partitioning;
+  struct SIMPLE_MOTION_DATA_TREE *split[4];
+
+  // Simple motion search features
+  FULLPEL_MV start_mvs[REF_FRAMES];
+  unsigned int sms_none_feat[2];
+  unsigned int sms_rect_feat[8];
+  int sms_none_valid;
+  int sms_rect_valid;
+} SIMPLE_MOTION_DATA_TREE;
+
+void av1_setup_shared_coeff_buffer(const SequenceHeader *const seq_params,
+                                   PC_TREE_SHARED_BUFFERS *shared_bufs,
+                                   struct aom_internal_error_info *error);
+void av1_free_shared_coeff_buffer(PC_TREE_SHARED_BUFFERS *shared_bufs);
+
+PC_TREE *av1_alloc_pc_tree_node(BLOCK_SIZE bsize);
+void av1_free_pc_tree_recursive(PC_TREE *tree, int num_planes, int keep_best,
+                                int keep_none,
+                                PARTITION_SEARCH_TYPE partition_search_type);
+
+PICK_MODE_CONTEXT *av1_alloc_pmc(const struct AV1_COMP *const cpi,
+                                 BLOCK_SIZE bsize,
+                                 PC_TREE_SHARED_BUFFERS *shared_bufs);
+void av1_reset_pmc(PICK_MODE_CONTEXT *ctx);
+void av1_free_pmc(PICK_MODE_CONTEXT *ctx, int num_planes);
+void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx,
+                           PICK_MODE_CONTEXT *src_ctx);
+
+static const BLOCK_SIZE square[MAX_SB_SIZE_LOG2 - 1] = {
+  BLOCK_4X4, BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64, BLOCK_128X128,
+};
+
+static AOM_INLINE int av1_get_pc_tree_nodes(const int is_sb_size_128,
+                                            int stat_generation_stage) {
+  const int tree_nodes_inc = is_sb_size_128 ?
1024 : 0;
+  const int tree_nodes =
+      stat_generation_stage ? 1 : (tree_nodes_inc + 256 + 64 + 16 + 4 + 1);
+  return tree_nodes;
+}
+
+// Returns 0 on success, -1 on memory allocation failure.
+int av1_setup_sms_tree(struct AV1_COMP *const cpi, struct ThreadData *td);
+void av1_free_sms_tree(struct ThreadData *td);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_CONTEXT_TREE_H_
diff --git a/third_party/aom/av1/encoder/cost.c b/third_party/aom/av1/encoder/cost.c
new file mode 100644
index 0000000000..323e2aed58
--- /dev/null
+++ b/third_party/aom/av1/encoder/cost.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+
+#include "av1/encoder/cost.h"
+#include "av1/common/entropy.h"
+
+// round(-log2(i/256.) * (1 << AV1_PROB_COST_SHIFT)); i = 128~255.
+const uint16_t av1_prob_cost[128] = {
+  512, 506, 501, 495, 489, 484, 478, 473, 467, 462, 456, 451, 446, 441, 435,
+  430, 425, 420, 415, 410, 405, 400, 395, 390, 385, 380, 375, 371, 366, 361,
+  356, 352, 347, 343, 338, 333, 329, 324, 320, 316, 311, 307, 302, 298, 294,
+  289, 285, 281, 277, 273, 268, 264, 260, 256, 252, 248, 244, 240, 236, 232,
+  228, 224, 220, 216, 212, 209, 205, 201, 197, 194, 190, 186, 182, 179, 175,
+  171, 168, 164, 161, 157, 153, 150, 146, 143, 139, 136, 132, 129, 125, 122,
+  119, 115, 112, 109, 105, 102, 99,  95,  92,  89,  86,  82,  79,  76,  73,
+  70,  66,  63,  60,  57,  54,  51,  48,  45,  42,  38,  35,  32,  29,  26,
+  23,  20,  18,  15,  12,  9,   6,   3,
+};
+
+void av1_cost_tokens_from_cdf(int *costs, const aom_cdf_prob *cdf,
+                              const int *inv_map) {
+  int i;
+  aom_cdf_prob prev_cdf = 0;
+  for (i = 0;; ++i) {
+    aom_cdf_prob p15 = AOM_ICDF(cdf[i]) - prev_cdf;
+    p15 = (p15 < EC_MIN_PROB) ? EC_MIN_PROB : p15;
+    prev_cdf = AOM_ICDF(cdf[i]);
+
+    if (inv_map)
+      costs[inv_map[i]] = av1_cost_symbol(p15);
+    else
+      costs[i] = av1_cost_symbol(p15);
+
+    // Stop once we reach the end of the CDF
+    if (cdf[i] == AOM_ICDF(CDF_PROB_TOP)) break;
+  }
+}
diff --git a/third_party/aom/av1/encoder/cost.h b/third_party/aom/av1/encoder/cost.h
new file mode 100644
index 0000000000..be0241a820
--- /dev/null
+++ b/third_party/aom/av1/encoder/cost.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_COST_H_
+#define AOM_AV1_ENCODER_COST_H_
+
+#include "aom_dsp/prob.h"
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern const uint16_t av1_prob_cost[128];
+
+// The factor to scale from cost in bits to cost in av1_prob_cost units.
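+// One bit therefore costs (1 << AV1_PROB_COST_SHIFT) == 512 in this scale,
+// matching av1_prob_cost[0] above (the cost of a symbol with probability 1/2).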
+#define AV1_PROB_COST_SHIFT 9 + +// Cost of coding an n bit literal, using 128 (i.e. 50%) probability +// for each bit. +#define av1_cost_literal(n) ((n) * (1 << AV1_PROB_COST_SHIFT)) + +// Calculate the cost of a symbol with probability p15 / 2^15 +static INLINE int av1_cost_symbol(aom_cdf_prob p15) { + // p15 can be out of range [1, CDF_PROB_TOP - 1]. Clamping it, so that the + // following cost calculation works correctly. Otherwise, if p15 = + // CDF_PROB_TOP, shift would be -1, and "p15 << shift" would be wrong. + p15 = (aom_cdf_prob)clamp(p15, 1, CDF_PROB_TOP - 1); + assert(0 < p15 && p15 < CDF_PROB_TOP); + const int shift = CDF_PROB_BITS - 1 - get_msb(p15); + const int prob = get_prob(p15 << shift, CDF_PROB_TOP); + assert(prob >= 128); + return av1_prob_cost[prob - 128] + av1_cost_literal(shift); +} + +void av1_cost_tokens_from_cdf(int *costs, const aom_cdf_prob *cdf, + const int *inv_map); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_COST_H_ diff --git a/third_party/aom/av1/encoder/deltaq4_model.c b/third_party/aom/av1/encoder/deltaq4_model.c new file mode 100644 index 0000000000..60a7e6d2cf --- /dev/null +++ b/third_party/aom/av1/encoder/deltaq4_model.c @@ -0,0 +1,7776 @@ +/* Embedded file: model.tflite */ +const int av1_deltaq4_model_fsize = 101032; +const unsigned char av1_deltaq4_model_file[101032] = { + 0x1c, 0x00, 0x00, 0x00, 0x54, 0x46, 0x4c, 0x33, 0x14, 0x00, 0x20, 0x00, 0x1c, + 0x00, 0x18, 0x00, 0x14, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x08, 0x00, + 0x04, 0x00, 0x14, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, + 0x00, 0xc0, 0x00, 0x00, 0x00, 0xc0, 0x7e, 0x01, 0x00, 0xd0, 0x7e, 0x01, 0x00, + 0x24, 0x8a, 0x01, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, + 0x00, 0x00, 0x00, 0x6a, 0x80, 0xfe, 0xff, 0x0c, 0x00, 0x00, 0x00, 0x1c, 0x00, + 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x73, 0x65, 0x72, + 0x76, 0x69, 0x6e, 0x67, 0x5f, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xb4, 0xff, 0xff, 0xff, 0x14, + 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x64, 0x65, + 0x6e, 0x73, 0x65, 0x5f, 0x31, 0x36, 0x34, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, + 0x00, 0x04, 0x00, 0x00, 0x00, 0xca, 0x81, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, + 0x31, 0x5f, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x04, + 0x00, 0x08, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x13, 0x00, 0x00, 0x00, 0x6d, 0x69, 0x6e, 0x5f, 0x72, 0x75, 0x6e, 0x74, 0x69, + 0x6d, 0x65, 0x5f, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x00, 0x17, 0x00, + 0x00, 0x00, 0xfc, 0x7d, 0x01, 0x00, 0xf4, 0x7d, 0x01, 0x00, 0xdc, 0x7d, 0x01, + 0x00, 0x84, 0x7d, 0x01, 0x00, 0xf4, 0x7c, 0x01, 0x00, 0xa4, 0x7c, 0x01, 0x00, + 0x74, 0x7c, 0x01, 0x00, 0x5c, 0x7c, 0x01, 0x00, 0x4c, 0x5c, 0x00, 0x00, 0xbc, + 0x5b, 0x00, 0x00, 0x8c, 0x5a, 0x00, 0x00, 0x7c, 0x48, 0x00, 0x00, 0x6c, 0x00, + 0x00, 0x00, 0x64, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, + 0x00, 0x4c, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, + 0x34, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x04, + 0x00, 0x00, 0x00, 0x7e, 0x82, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x31, 0x2e, 0x35, 0x2e, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 
0x00, 0x50, 0x77, 0xfe, 0xff, 0x54, 0x77, 0xfe, 0xff, + 0x58, 0x77, 0xfe, 0xff, 0x5c, 0x77, 0xfe, 0xff, 0x60, 0x77, 0xfe, 0xff, 0x64, + 0x77, 0xfe, 0xff, 0x68, 0x77, 0xfe, 0xff, 0x6c, 0x77, 0xfe, 0xff, 0x70, 0x77, + 0xfe, 0xff, 0xbe, 0x82, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, + 0x00, 0x3e, 0x84, 0xfc, 0x3b, 0xef, 0x95, 0x2f, 0xbd, 0xd3, 0x21, 0x96, 0xbd, + 0x11, 0x9a, 0xc6, 0x3d, 0xd9, 0x7e, 0x0c, 0xbe, 0xcb, 0xd2, 0x8c, 0xbb, 0x60, + 0xf5, 0x92, 0xbd, 0x70, 0xce, 0x9e, 0x3d, 0x26, 0x67, 0xc4, 0x3d, 0x9b, 0x2a, + 0x8b, 0x3b, 0x3b, 0xdd, 0x2a, 0xbd, 0xf9, 0x09, 0x8a, 0xbd, 0x1b, 0xae, 0xd7, + 0x3c, 0xbf, 0x39, 0x87, 0xbd, 0x4c, 0x9e, 0xe2, 0x3d, 0x50, 0x9c, 0xe7, 0xbd, + 0x1e, 0x58, 0x57, 0x3d, 0x38, 0x8c, 0x58, 0xbd, 0x48, 0x9f, 0x4a, 0x3d, 0xcb, + 0x1c, 0x93, 0xbd, 0xeb, 0xb8, 0x5a, 0xbc, 0x63, 0x04, 0x4b, 0xbd, 0x9b, 0x76, + 0xa8, 0x3d, 0x20, 0xb4, 0x69, 0x3d, 0xee, 0xcc, 0xe5, 0x3a, 0x4f, 0x40, 0x02, + 0x3e, 0x21, 0x2e, 0x03, 0x3e, 0x25, 0x77, 0x99, 0xbd, 0xf5, 0xa1, 0xd0, 0x3c, + 0xc5, 0x15, 0xeb, 0x3c, 0x58, 0xb5, 0xb7, 0x3c, 0x80, 0x63, 0x33, 0xbd, 0xc9, + 0x66, 0x63, 0xbd, 0xf6, 0xef, 0xb8, 0xbd, 0xd7, 0xbf, 0x9f, 0x3b, 0x93, 0x68, + 0x35, 0x3d, 0x60, 0xfc, 0xf3, 0xbd, 0xed, 0xd9, 0x35, 0xbd, 0x57, 0xef, 0x8a, + 0x3d, 0x31, 0x97, 0xa4, 0x3d, 0x8e, 0x55, 0xe2, 0x3d, 0x27, 0xa5, 0xe9, 0x3d, + 0x36, 0x26, 0x67, 0xbc, 0xeb, 0xd1, 0x9e, 0xbd, 0xc7, 0xcd, 0x37, 0x3d, 0x31, + 0xfc, 0xce, 0x3d, 0x5e, 0xe3, 0x96, 0xbd, 0xeb, 0x24, 0x4d, 0x3c, 0xe6, 0x00, + 0xe2, 0xbd, 0x9b, 0x00, 0x17, 0xbd, 0xee, 0x9f, 0xc4, 0xbd, 0x6a, 0xcd, 0xba, + 0xbc, 0x2c, 0x2b, 0x97, 0xbd, 0x8a, 0x02, 0x68, 0xbc, 0xc3, 0x46, 0x9f, 0xbd, + 0x85, 0x3d, 0xc2, 0x3d, 0xbc, 0x16, 0x22, 0x3c, 0xf1, 0xca, 0xdf, 0x3d, 0xaf, + 0xef, 0xbc, 0x3c, 0x4c, 0xde, 0xe8, 0xbd, 0x5c, 0x5a, 0xc9, 0xbb, 0x35, 0xe5, + 0xc1, 0x3d, 0x14, 0xc7, 0xba, 0xbc, 0x05, 0xfb, 0x1d, 0x3d, 0x61, 0x23, 0xb7, + 0xbb, 0x17, 0x50, 0xb0, 0xbd, 0x14, 0x5b, 0xf4, 0xbd, 0xb1, 0x4d, 0x40, 0x3d, + 0x7e, 0x3d, 0xd8, 0x3d, 0x35, 0x2e, 0x90, 0x3d, 0x93, 0xcd, 0x0d, 0xbe, 0x8d, + 0x60, 0x70, 0x3d, 0x4a, 0x7c, 0xf2, 0x3c, 0x07, 0x2a, 0x7f, 0x3d, 0x2c, 0xab, + 0xd8, 0x3d, 0xb3, 0x1f, 0x1d, 0xbd, 0x44, 0x69, 0xf7, 0x3c, 0x71, 0xfd, 0x5e, + 0x3c, 0xc8, 0x14, 0x28, 0x3d, 0x71, 0x2e, 0x0c, 0x3b, 0x7f, 0xa3, 0xb5, 0x3d, + 0x55, 0x5c, 0x07, 0x3e, 0x0f, 0xf0, 0x3b, 0x3c, 0xd9, 0xc2, 0xbd, 0xbc, 0x71, + 0xaa, 0xc5, 0xbb, 0xa3, 0x86, 0xc7, 0x3d, 0xcf, 0x37, 0x95, 0xbd, 0x09, 0x63, + 0xc3, 0x3d, 0x0c, 0x01, 0x4e, 0xbd, 0xf1, 0xf9, 0x8d, 0x3d, 0xe2, 0x98, 0x45, + 0x3d, 0x76, 0xbc, 0x3b, 0x3d, 0x2a, 0xa2, 0x47, 0x3d, 0x8c, 0x1d, 0xae, 0xbd, + 0x5f, 0x35, 0x8c, 0xbd, 0x17, 0xeb, 0x05, 0x3d, 0x75, 0x62, 0xdb, 0xbd, 0x37, + 0xf8, 0xea, 0x3d, 0xf8, 0xa6, 0x6c, 0xbd, 0x8a, 0x86, 0x03, 0x3d, 0x67, 0x6c, + 0x8d, 0xbd, 0x58, 0xaf, 0xc5, 0xbd, 0x36, 0x51, 0x14, 0xbe, 0x60, 0xac, 0xe3, + 0x3d, 0x86, 0x4f, 0xf4, 0x3c, 0xf6, 0xa3, 0x29, 0x3d, 0xc3, 0x1d, 0x9a, 0x3c, + 0x44, 0xdc, 0x0e, 0xbc, 0x6b, 0x97, 0x8f, 0x3c, 0xc9, 0x3d, 0x88, 0xbc, 0x74, + 0x90, 0x9d, 0x3d, 0x0f, 0x02, 0xec, 0xbd, 0x12, 0xec, 0xb2, 0x3d, 0x6c, 0x32, + 0x31, 0x3d, 0x0b, 0x84, 0x35, 0x3d, 0xfc, 0xc2, 0x3c, 0x3d, 0x59, 0xdf, 0x16, + 0x3d, 0x8e, 0x29, 0xee, 0x3d, 0x83, 0xc3, 0xb7, 0xbd, 0x66, 0xbd, 0x84, 0xbd, + 0xb7, 0x49, 0x1b, 0x3d, 0x3f, 0xc1, 0x4a, 0x3d, 0x1a, 0x7d, 0xdf, 0x3d, 0xee, + 0x12, 0xb1, 0x3c, 0x29, 0x47, 0xe6, 0xbd, 0xd6, 0x04, 0xd6, 0x3d, 0xc2, 0x31, + 0x6f, 0xbd, 0xb0, 0x2c, 0x3e, 0xbd, 0x20, 0xd8, 0x43, 0xbd, 0x2d, 0x0c, 0x26, + 0xbd, 0x23, 0x47, 0x06, 0xbe, 0xb9, 0xd2, 0xb9, 0xbd, 
0x7b, 0xef, 0xc8, 0x3d, + 0x23, 0x06, 0x06, 0x3d, 0x65, 0xc6, 0x45, 0xbd, 0x20, 0xc9, 0x24, 0xbc, 0xf7, + 0x2b, 0xf5, 0x3d, 0x41, 0x91, 0x15, 0xbd, 0x90, 0xbe, 0x0f, 0x3d, 0xe8, 0x94, + 0x8c, 0xbd, 0xdf, 0x96, 0x72, 0x3c, 0x8d, 0xb4, 0xed, 0x3d, 0x33, 0xf0, 0xb3, + 0xbd, 0x60, 0x49, 0xbc, 0xbd, 0x32, 0xf2, 0xd5, 0x3d, 0x3e, 0x3e, 0x6b, 0xbd, + 0xb4, 0x31, 0x09, 0x3e, 0xc6, 0x40, 0xfb, 0xbc, 0x75, 0x1a, 0x88, 0xbd, 0xbf, + 0x13, 0xb2, 0xbd, 0xe3, 0x78, 0xc4, 0xba, 0x68, 0xfc, 0x10, 0x3e, 0x27, 0x4c, + 0xf5, 0x3c, 0xfc, 0x68, 0x27, 0x3d, 0xb2, 0x2c, 0xe0, 0x3c, 0x6e, 0x4f, 0x9a, + 0xbb, 0xbb, 0x9f, 0xa1, 0xbd, 0x91, 0x7b, 0x9a, 0xbc, 0x17, 0x21, 0x52, 0xba, + 0x39, 0x8e, 0x4c, 0xbd, 0x03, 0xf5, 0xe5, 0x3d, 0x3a, 0x22, 0xcd, 0xbd, 0x90, + 0x1c, 0x78, 0xbd, 0x3f, 0xb1, 0x8d, 0xbd, 0xfc, 0x77, 0x25, 0xbe, 0x48, 0x9a, + 0xfd, 0x3c, 0xca, 0x6a, 0xa2, 0x3d, 0x45, 0xd6, 0x7a, 0xbd, 0xce, 0x9d, 0xbf, + 0x3d, 0x94, 0x1c, 0xbe, 0xbd, 0xcc, 0xc4, 0x83, 0xbc, 0xe9, 0xc7, 0xf3, 0xbc, + 0xdc, 0x31, 0x19, 0x39, 0x3a, 0x36, 0xea, 0x3d, 0x40, 0xa6, 0x72, 0xbd, 0x66, + 0xeb, 0x85, 0xb9, 0x68, 0xa0, 0x97, 0xbd, 0xa7, 0xeb, 0xa9, 0x3c, 0x4d, 0x79, + 0xf9, 0x3c, 0x55, 0x67, 0xb2, 0x3c, 0x80, 0x2a, 0x8f, 0xbd, 0xd5, 0x70, 0x17, + 0x3b, 0x41, 0xfb, 0xed, 0xbd, 0xae, 0xfe, 0x0e, 0xbd, 0x6d, 0x06, 0xd6, 0xbc, + 0x90, 0xc9, 0xd1, 0x3d, 0xb4, 0x6c, 0x19, 0x3b, 0xa3, 0x4f, 0x11, 0x3c, 0xb1, + 0x71, 0xc1, 0xbd, 0xcc, 0x5b, 0x20, 0xbc, 0x7a, 0xb5, 0xe9, 0x3d, 0x6f, 0x8c, + 0x95, 0x3d, 0x10, 0x56, 0x79, 0xbd, 0x45, 0x06, 0x69, 0x3c, 0xe4, 0x89, 0x9f, + 0xbd, 0xad, 0x43, 0x82, 0xbd, 0x7a, 0x1f, 0xbd, 0xbd, 0xbb, 0x25, 0x9b, 0x3c, + 0x27, 0xdc, 0x0f, 0xbe, 0x42, 0x7b, 0xe1, 0x3d, 0xaa, 0xd9, 0xcb, 0xbd, 0xa4, + 0xdf, 0x0e, 0x3e, 0xdd, 0x57, 0xbe, 0xbd, 0xf0, 0xb7, 0x87, 0xbd, 0xbb, 0x8a, + 0x73, 0xbd, 0x20, 0x8b, 0xb5, 0x3c, 0xb3, 0xac, 0x57, 0xbd, 0x4a, 0x5c, 0x68, + 0x3d, 0x46, 0xc5, 0x6e, 0x3b, 0x44, 0xd8, 0x22, 0xbd, 0xc8, 0x88, 0x93, 0xbd, + 0x71, 0x42, 0xd3, 0xbc, 0x80, 0x60, 0xf6, 0xbc, 0xe0, 0xb7, 0x04, 0x3d, 0xcb, + 0x28, 0xf7, 0xbd, 0xfd, 0x2e, 0x9d, 0xbd, 0xd8, 0x81, 0x5b, 0x3d, 0x90, 0x88, + 0x06, 0xbd, 0xb1, 0x2d, 0x8b, 0xbc, 0x74, 0x4d, 0x80, 0xbd, 0x1b, 0xce, 0x54, + 0x3d, 0xd3, 0xea, 0x89, 0xbd, 0x7a, 0x0a, 0xc6, 0x3c, 0x8b, 0x33, 0xa2, 0x3d, + 0x68, 0xe5, 0x8b, 0x3d, 0xcf, 0x19, 0x63, 0xbd, 0x50, 0x05, 0xc1, 0xbd, 0x2b, + 0x1f, 0xc4, 0xbc, 0x9f, 0xed, 0xaf, 0xbd, 0xc6, 0x72, 0x07, 0xbb, 0xc1, 0x58, + 0xa2, 0x3d, 0xf6, 0x27, 0x43, 0xbc, 0xa1, 0x5b, 0x36, 0x3d, 0x6b, 0x6b, 0x20, + 0x3d, 0x03, 0xb0, 0xfb, 0xbd, 0xf9, 0xf7, 0x9b, 0xbd, 0x9a, 0xbf, 0x92, 0x3d, + 0xa2, 0x0c, 0x5c, 0x3d, 0xd2, 0xc2, 0x73, 0xbd, 0x5c, 0xd3, 0xac, 0x3d, 0x9f, + 0x28, 0xa6, 0x3d, 0x23, 0xf4, 0x46, 0xbd, 0xf5, 0xfe, 0x6b, 0x3d, 0x2d, 0x03, + 0x56, 0x3d, 0x0c, 0x21, 0xe8, 0x3c, 0x6f, 0xdb, 0xe5, 0xbd, 0xd4, 0x8c, 0xe3, + 0xbd, 0xdf, 0x9d, 0x62, 0x3d, 0x38, 0xa0, 0xd1, 0xbd, 0x67, 0x9e, 0x8d, 0xbc, + 0xab, 0x78, 0x46, 0x3d, 0xf8, 0x88, 0x8e, 0xbc, 0x5a, 0x87, 0xd3, 0xbd, 0x40, + 0xba, 0xab, 0xbd, 0x45, 0xf8, 0x9a, 0x3d, 0x77, 0x60, 0x49, 0xbd, 0xa5, 0x29, + 0x98, 0xbc, 0xf9, 0xa7, 0x6b, 0x3d, 0xf8, 0x57, 0x1b, 0x3e, 0xf9, 0x7f, 0xcb, + 0x3d, 0xc8, 0x38, 0x3f, 0xbb, 0x0e, 0x77, 0xd9, 0x3d, 0xa9, 0x8f, 0xca, 0x3d, + 0x78, 0xbc, 0x92, 0x3d, 0xde, 0xe4, 0x31, 0xbc, 0x7f, 0x35, 0xec, 0x3d, 0x0b, + 0x98, 0x5c, 0x3d, 0x3a, 0x86, 0xa0, 0x3d, 0x9d, 0xb7, 0xad, 0xbd, 0x42, 0x3c, + 0xc2, 0xbc, 0x26, 0x4b, 0x7b, 0x3d, 0xbe, 0x8b, 0x0a, 0xb9, 0x28, 0x3e, 0xc5, + 0x3d, 0xef, 0xac, 0xbb, 0xbd, 0xb3, 0xcc, 0x69, 0xbd, 0xb9, 0xff, 0x07, 0x3d, + 0x30, 
0xf6, 0x26, 0x3d, 0xa9, 0x18, 0xe6, 0x3d, 0x85, 0x72, 0xdb, 0xbd, 0xda, + 0x6e, 0xa1, 0x3d, 0x3b, 0x16, 0xf7, 0x3c, 0xb1, 0x3d, 0x96, 0xbd, 0xd9, 0x88, + 0xeb, 0x3b, 0x52, 0x76, 0x9a, 0xbd, 0xb9, 0x81, 0x1a, 0xbd, 0x81, 0x94, 0x96, + 0xbc, 0xd4, 0x4b, 0xe8, 0x3d, 0x0f, 0x6c, 0xe4, 0xbc, 0xc0, 0xbd, 0xab, 0x3c, + 0x1b, 0xdd, 0x76, 0x3c, 0x98, 0x18, 0xae, 0xbd, 0xfb, 0x1a, 0x6f, 0xbd, 0x72, + 0x50, 0x83, 0xbd, 0x46, 0x0b, 0x12, 0xbc, 0x64, 0x93, 0xf2, 0x3d, 0x1f, 0xad, + 0x71, 0x3b, 0xcf, 0x26, 0x77, 0xbd, 0x8b, 0x31, 0x2d, 0xbd, 0x0d, 0xb7, 0x54, + 0x3b, 0x5b, 0x00, 0xc4, 0x3d, 0x57, 0x4c, 0x58, 0x3d, 0x11, 0x4c, 0x15, 0x3d, + 0x1a, 0xfc, 0xa2, 0xbc, 0xf2, 0xed, 0xea, 0x3d, 0x9e, 0xad, 0xf7, 0xbd, 0x47, + 0x8d, 0x41, 0x3d, 0xce, 0xc5, 0x96, 0xbb, 0x2a, 0x72, 0xa0, 0xbd, 0x93, 0x27, + 0x9a, 0xbd, 0x3f, 0xcb, 0xef, 0xbb, 0xb5, 0xa5, 0x1e, 0x3d, 0xd6, 0x2a, 0xfd, + 0xbc, 0xf5, 0xe0, 0xd4, 0xbc, 0xa1, 0x7d, 0x9d, 0x3d, 0xbb, 0x60, 0x22, 0xbd, + 0x32, 0x15, 0x16, 0x3e, 0x80, 0x77, 0xb7, 0xbc, 0xba, 0x1c, 0xa4, 0xbd, 0x45, + 0xb7, 0x0b, 0xbd, 0x6a, 0x33, 0x9a, 0x3d, 0xfc, 0x27, 0xab, 0xbc, 0x10, 0xcd, + 0x2c, 0x3e, 0xb3, 0xf1, 0xa5, 0x3d, 0x03, 0xf7, 0xa3, 0x3c, 0x25, 0x0c, 0xe1, + 0x3c, 0xc4, 0x82, 0xaa, 0xbd, 0x3a, 0x4a, 0x15, 0x3c, 0x5c, 0x56, 0x9e, 0x3d, + 0x96, 0x52, 0xee, 0x3d, 0x67, 0xf7, 0x96, 0x3d, 0x3e, 0xb0, 0xd6, 0xbd, 0x6e, + 0xbd, 0x8e, 0xbd, 0x16, 0xb3, 0x85, 0x3d, 0x84, 0xca, 0x6e, 0xbd, 0x0f, 0xfc, + 0x40, 0x3d, 0x2d, 0xe0, 0xdc, 0x3d, 0xc1, 0xa1, 0xde, 0x39, 0x30, 0x79, 0xe7, + 0x3d, 0x0a, 0xab, 0xba, 0x3d, 0x35, 0x57, 0xc7, 0xbd, 0x7e, 0x38, 0xa1, 0x3d, + 0xe3, 0x25, 0x60, 0x3d, 0x47, 0xbd, 0x56, 0x3d, 0x62, 0xcf, 0xf6, 0x3d, 0xad, + 0x06, 0xd5, 0xbd, 0x41, 0xda, 0xe8, 0x3a, 0x81, 0xcb, 0xbb, 0x3d, 0xce, 0x38, + 0x4c, 0xbc, 0x17, 0xc0, 0x88, 0xbd, 0x12, 0x25, 0xd7, 0xbd, 0x3b, 0xf5, 0x9b, + 0xbd, 0x4e, 0xa0, 0xb1, 0xbc, 0xa1, 0x8c, 0x9c, 0x3d, 0xc5, 0x2f, 0xb3, 0x3d, + 0xe0, 0xc2, 0x08, 0x3e, 0x0b, 0xcc, 0x2f, 0x3d, 0x87, 0x3f, 0x1d, 0x3e, 0x76, + 0xcd, 0xc3, 0xbd, 0x4f, 0x1d, 0xd4, 0xbd, 0x65, 0x6f, 0x00, 0x3e, 0x95, 0x4f, + 0x9a, 0x3d, 0xa2, 0x66, 0x28, 0xbd, 0xaf, 0x81, 0x90, 0x3d, 0x16, 0x50, 0xde, + 0x3b, 0x65, 0xec, 0xe3, 0xbd, 0x47, 0x6c, 0x34, 0xbc, 0xae, 0xe8, 0xe5, 0xbd, + 0x5b, 0x7c, 0xa6, 0xbb, 0x1d, 0x4d, 0x8d, 0xbc, 0xb1, 0x7a, 0x1d, 0x3e, 0xbf, + 0x37, 0xe6, 0xbc, 0x7b, 0x0c, 0x70, 0x3d, 0x09, 0x57, 0xe2, 0x3d, 0x10, 0x4a, + 0x35, 0xbc, 0x5d, 0x58, 0xf5, 0xbc, 0xb9, 0x89, 0xa1, 0x3d, 0x6a, 0xb2, 0x68, + 0xbd, 0xf4, 0xf6, 0x03, 0x3e, 0xf1, 0xc6, 0x3a, 0xbd, 0xf5, 0x3b, 0xe2, 0x3d, + 0x3a, 0xd2, 0x4a, 0x3d, 0xe7, 0xb8, 0x9e, 0xbd, 0x18, 0xe7, 0xd9, 0x3c, 0x1d, + 0x95, 0x8e, 0x3d, 0xde, 0x6f, 0x9e, 0xbc, 0xae, 0x7d, 0x0f, 0x3e, 0xb0, 0xf3, + 0x04, 0x3d, 0xe0, 0xdc, 0x6b, 0x3d, 0x02, 0x2c, 0xee, 0xbd, 0x7c, 0xb2, 0x9f, + 0xbd, 0xae, 0x94, 0xc3, 0x3c, 0x82, 0xba, 0xab, 0x3d, 0x07, 0x80, 0xde, 0x3c, + 0x75, 0xec, 0xb3, 0xbd, 0x34, 0x42, 0x74, 0xbd, 0x44, 0xce, 0x7a, 0x3d, 0x21, + 0xac, 0x28, 0xbe, 0xb1, 0xbb, 0x14, 0xbd, 0xe2, 0xe1, 0xdb, 0x3c, 0x41, 0x82, + 0xc7, 0x3d, 0x3e, 0x0f, 0x9c, 0xbd, 0x92, 0x4e, 0x97, 0x3d, 0x69, 0x45, 0xf2, + 0x3d, 0xc3, 0x86, 0xc4, 0xbb, 0x57, 0x0f, 0xb1, 0x3d, 0x8c, 0xa7, 0xc6, 0x3d, + 0x27, 0xe2, 0xf3, 0xbc, 0xdd, 0x31, 0x44, 0xbd, 0x94, 0x2c, 0x29, 0xbc, 0xe6, + 0xeb, 0xd1, 0xbd, 0x74, 0xf9, 0x02, 0x3d, 0x43, 0x51, 0x92, 0xbd, 0x38, 0xb8, + 0x72, 0x3d, 0x73, 0xd3, 0x89, 0xbc, 0x06, 0x13, 0xdb, 0x3d, 0x75, 0xc5, 0xb2, + 0x3b, 0x9a, 0xe9, 0x95, 0xbc, 0xd2, 0x6a, 0x05, 0x3e, 0x65, 0xc5, 0xa3, 0x3d, + 0x59, 0x09, 0x72, 0xbd, 0x93, 0x0e, 
0x85, 0xbc, 0x0d, 0x55, 0x6b, 0xbd, 0x55, + 0x64, 0x16, 0xbd, 0x50, 0x04, 0x9f, 0x3d, 0x93, 0x37, 0x14, 0xbd, 0xe9, 0x24, + 0x58, 0x3d, 0x04, 0x8e, 0xe9, 0xbd, 0xe4, 0x6e, 0x2b, 0xbd, 0x43, 0xbc, 0xba, + 0xbd, 0x80, 0xa1, 0xc3, 0xbd, 0x32, 0x81, 0xf5, 0xbd, 0x94, 0x5a, 0x10, 0x3d, + 0xfb, 0x5d, 0x27, 0x3c, 0xd7, 0x26, 0xc5, 0x3d, 0xf5, 0xc3, 0x4b, 0x3d, 0x32, + 0xca, 0xdc, 0x3d, 0xb2, 0xe8, 0x35, 0xbc, 0xb2, 0x47, 0xb9, 0xbd, 0xfa, 0x59, + 0x29, 0xbe, 0xab, 0x6f, 0x0a, 0x3e, 0x81, 0xa5, 0x10, 0xbd, 0x73, 0x96, 0x99, + 0xbd, 0x39, 0x77, 0x23, 0xbc, 0xa8, 0x50, 0xf8, 0xbd, 0x4c, 0x1d, 0xdd, 0xbd, + 0xf8, 0xf5, 0xb9, 0xbd, 0x65, 0x4e, 0x12, 0x3e, 0xc0, 0xa1, 0x7a, 0xbd, 0x16, + 0x33, 0x27, 0x3d, 0xc4, 0xc6, 0x31, 0x3b, 0x0e, 0xcd, 0x48, 0xbd, 0xd2, 0x7f, + 0xb4, 0xbd, 0x2c, 0x3a, 0x8b, 0x3c, 0x6f, 0x43, 0x59, 0x3d, 0x4e, 0x8a, 0x52, + 0x3d, 0x91, 0x68, 0xc4, 0x3d, 0xa2, 0x78, 0x16, 0xbd, 0xe5, 0x2c, 0x60, 0x3d, + 0x7f, 0x73, 0x8f, 0x3d, 0x9f, 0x70, 0x09, 0xbe, 0xf2, 0xf2, 0x05, 0x3c, 0x1e, + 0x58, 0x98, 0x3d, 0xec, 0xfc, 0x03, 0x3e, 0x88, 0xbf, 0x56, 0xbd, 0x2b, 0xc8, + 0x99, 0xbd, 0x9e, 0x13, 0x9a, 0xbc, 0x4f, 0x72, 0xca, 0xbd, 0x79, 0x6e, 0xef, + 0x3d, 0x87, 0xc3, 0x80, 0xbc, 0xe7, 0xef, 0x05, 0x3d, 0xc7, 0x99, 0x0a, 0x3d, + 0x17, 0x7c, 0x56, 0x3d, 0x01, 0xab, 0xd3, 0xbd, 0x48, 0x8b, 0xa2, 0xbd, 0x06, + 0xad, 0xcc, 0xbc, 0xf0, 0xf5, 0x6d, 0xbd, 0x6a, 0x67, 0x0c, 0xbe, 0x7e, 0x2e, + 0x6e, 0x3d, 0x53, 0x50, 0x29, 0xbd, 0x8c, 0x40, 0xb3, 0x3d, 0x5c, 0x9a, 0x0f, + 0xbd, 0xe9, 0x4e, 0x0a, 0x3e, 0x4d, 0x05, 0xac, 0x3d, 0xf9, 0x1a, 0x8e, 0x3d, + 0x0d, 0x69, 0xa6, 0xbd, 0x88, 0x94, 0x60, 0x3d, 0x48, 0x2a, 0x8a, 0xbb, 0x5a, + 0x5d, 0x39, 0x3d, 0x88, 0x56, 0xc8, 0x3c, 0xb8, 0x91, 0x93, 0x3a, 0x64, 0x69, + 0x8b, 0x3d, 0x4b, 0x48, 0x43, 0xbd, 0xb8, 0x91, 0xa7, 0xbd, 0x92, 0x96, 0xe5, + 0x3d, 0x4c, 0x62, 0xd6, 0x3d, 0xa6, 0x7a, 0x88, 0xbd, 0x6c, 0xdb, 0xc6, 0x3d, + 0x1c, 0x4d, 0xab, 0x3d, 0xe0, 0x1d, 0x57, 0x3c, 0x2a, 0xa3, 0x0c, 0x3d, 0xac, + 0xff, 0xe8, 0xbb, 0x12, 0x86, 0x89, 0xbd, 0xc6, 0x68, 0xd3, 0xbd, 0xe7, 0xb0, + 0xa6, 0xbc, 0x3c, 0xd2, 0xfa, 0xbb, 0xf2, 0xd6, 0xda, 0xbd, 0x80, 0x95, 0xc5, + 0xbd, 0x0a, 0x19, 0x93, 0xbd, 0x94, 0xc1, 0xe4, 0xbd, 0xdd, 0x20, 0x18, 0x3e, + 0xb3, 0x48, 0xba, 0xbd, 0xdd, 0x6b, 0x86, 0xbd, 0x3d, 0xbc, 0xb1, 0xbd, 0xbe, + 0xc1, 0x7f, 0xbc, 0xfc, 0x54, 0x83, 0x3d, 0xb5, 0x4e, 0x1e, 0xbd, 0x5f, 0x54, + 0xc3, 0x3c, 0xe4, 0x2e, 0x0a, 0x3e, 0xc9, 0x05, 0x05, 0x3d, 0xc7, 0x8d, 0x2c, + 0xbc, 0x37, 0x21, 0xc2, 0xbc, 0xea, 0x7e, 0x96, 0x3d, 0x64, 0x7a, 0xca, 0x3d, + 0xcb, 0xcf, 0xc8, 0x3b, 0x5a, 0xd4, 0x00, 0xbe, 0x5f, 0x49, 0xd0, 0x3d, 0xbe, + 0x56, 0x15, 0x3e, 0x3f, 0x1d, 0x9e, 0xbd, 0xd4, 0x91, 0xa9, 0x3d, 0xf1, 0xea, + 0x4b, 0xbb, 0x78, 0x4a, 0xa5, 0x3c, 0xc2, 0x9b, 0xac, 0xbd, 0x8c, 0xd3, 0x94, + 0xbd, 0xb1, 0x52, 0x94, 0xbd, 0x55, 0xdd, 0x0d, 0xbe, 0x93, 0x2e, 0xa1, 0x3d, + 0x31, 0x1e, 0xe0, 0x3c, 0xaf, 0xba, 0x6c, 0x3d, 0x8e, 0xec, 0x8f, 0xbd, 0x38, + 0x79, 0xd2, 0xbc, 0x21, 0x7e, 0x9d, 0x3d, 0xbb, 0x21, 0xeb, 0x3d, 0x6e, 0x68, + 0xec, 0x3d, 0xc2, 0xf4, 0xb6, 0xbd, 0x80, 0xe2, 0x91, 0xbc, 0x45, 0xa5, 0x8f, + 0xbb, 0xf8, 0xb2, 0xc7, 0xbd, 0xe4, 0x47, 0x3a, 0xbd, 0xa2, 0x4f, 0xe9, 0xbd, + 0xcc, 0x37, 0x53, 0x3c, 0x51, 0x03, 0x4f, 0x3d, 0x35, 0xa2, 0xfa, 0x3d, 0xea, + 0x64, 0x7b, 0xbc, 0xbf, 0x49, 0xfb, 0x3d, 0x3d, 0x8e, 0x7b, 0x3b, 0x9c, 0x4b, + 0x35, 0xbd, 0x62, 0xf1, 0x10, 0xbe, 0xac, 0xd2, 0xd8, 0xbd, 0x80, 0x00, 0x9d, + 0x3d, 0xcc, 0x19, 0xaf, 0xbc, 0x97, 0x73, 0xdb, 0x3d, 0x6d, 0xb6, 0xf3, 0x3d, + 0x19, 0xe7, 0x7a, 0xbd, 0xcf, 0xba, 0xc6, 0x3c, 0x77, 0xfc, 0x23, 
0x3d, 0xd6, + 0xfe, 0x3f, 0x3d, 0x73, 0xf2, 0xdb, 0xbd, 0x3d, 0x21, 0x95, 0xbb, 0x58, 0xb8, + 0x86, 0xbd, 0x01, 0x3c, 0x6f, 0x3d, 0xaf, 0x2e, 0x3e, 0xbd, 0x7b, 0x6d, 0x73, + 0xbd, 0x33, 0xe2, 0x5f, 0xbc, 0x64, 0x5f, 0xdb, 0xbd, 0x31, 0xf5, 0xb6, 0xbd, + 0xfc, 0x90, 0xd4, 0xbd, 0x25, 0xd8, 0xc4, 0xbd, 0x38, 0xdf, 0xb9, 0x3d, 0x89, + 0x14, 0x8b, 0x3d, 0x8d, 0x05, 0x2c, 0xbd, 0x20, 0xb8, 0xa3, 0xbc, 0xaf, 0x68, + 0x12, 0x3d, 0xce, 0x53, 0xb0, 0xbd, 0xca, 0x8a, 0x95, 0x3d, 0x11, 0x84, 0x8a, + 0x3d, 0x6d, 0xbd, 0x67, 0xbb, 0xe8, 0xd5, 0x76, 0xbc, 0xac, 0xc8, 0xfb, 0xbd, + 0xa9, 0x8b, 0xa4, 0xbb, 0x3e, 0x3a, 0xba, 0x3d, 0xe2, 0xa5, 0x50, 0x3d, 0xf0, + 0x4d, 0x81, 0x3b, 0x96, 0x79, 0x31, 0xbd, 0x87, 0xaf, 0xe5, 0x3a, 0x27, 0xb7, + 0xa5, 0x3d, 0xd4, 0x71, 0xb5, 0xbd, 0x95, 0x06, 0xd1, 0xbd, 0x82, 0x3d, 0x1c, + 0xbc, 0xdc, 0xe4, 0x6e, 0x3d, 0x21, 0xcf, 0x80, 0xbc, 0xbe, 0xc7, 0xb7, 0xbc, + 0x21, 0x87, 0x3c, 0x3d, 0x11, 0x3a, 0x67, 0xbd, 0xa5, 0xd3, 0xe8, 0xbd, 0x9a, + 0xb7, 0xc2, 0x3d, 0x2e, 0xa7, 0x86, 0xbc, 0xbe, 0x03, 0x26, 0xbc, 0x5e, 0x12, + 0x08, 0xbe, 0x1d, 0xd9, 0xf8, 0xbd, 0xf3, 0x79, 0xe4, 0xbd, 0x38, 0xaa, 0x04, + 0x3e, 0x98, 0x40, 0xa7, 0x3d, 0xfa, 0xd9, 0xce, 0xbd, 0x08, 0x73, 0x16, 0xb9, + 0xd6, 0x47, 0x2c, 0x3d, 0x08, 0xb5, 0x8b, 0xbd, 0x04, 0x66, 0x70, 0x3c, 0x9f, + 0xe6, 0xe4, 0xbd, 0x7f, 0xcd, 0xa5, 0x3b, 0x5b, 0x92, 0x8b, 0xbd, 0x29, 0x55, + 0x19, 0xbd, 0x79, 0x98, 0x26, 0x3d, 0x32, 0x3d, 0xc3, 0xb9, 0x29, 0x8a, 0x05, + 0xbe, 0xe8, 0x61, 0x92, 0x3d, 0x4f, 0x64, 0xa9, 0x3d, 0x00, 0x9a, 0xa0, 0xbd, + 0x34, 0xcc, 0xd8, 0x3c, 0xcd, 0x8a, 0xaf, 0x3d, 0x69, 0xc6, 0x5c, 0x3c, 0xe0, + 0x76, 0xd3, 0x3d, 0x49, 0x6a, 0x79, 0x3b, 0x33, 0x10, 0xbd, 0x3c, 0xe9, 0x47, + 0x2a, 0xbd, 0x7f, 0xb4, 0x3e, 0xbb, 0x80, 0xd2, 0x18, 0xbe, 0xf3, 0x5c, 0x90, + 0xbd, 0x0b, 0x88, 0xaf, 0xbd, 0x24, 0x0c, 0x94, 0xbd, 0xfd, 0xa9, 0xa1, 0xbd, + 0x40, 0xc9, 0x82, 0xbd, 0x24, 0x56, 0xa0, 0x3c, 0xa0, 0x3e, 0x09, 0x3e, 0x30, + 0x93, 0xc7, 0x3d, 0x03, 0xa3, 0x0c, 0x3c, 0x88, 0xdc, 0x96, 0x3d, 0xac, 0x34, + 0xc7, 0xbd, 0x64, 0xb0, 0xe5, 0x3d, 0x61, 0x56, 0xc8, 0x3d, 0x08, 0x55, 0x99, + 0x3d, 0xb5, 0xa9, 0x56, 0xbd, 0xfb, 0x4f, 0x95, 0xbd, 0xe9, 0xeb, 0x55, 0x3d, + 0xbf, 0x4c, 0xdf, 0xbd, 0xbf, 0x4a, 0x12, 0xbb, 0x93, 0x9d, 0x65, 0xbd, 0x26, + 0xd0, 0xce, 0x3d, 0x89, 0x19, 0x64, 0xbd, 0x91, 0x3d, 0x3f, 0x3d, 0x23, 0x3a, + 0x3b, 0xbd, 0xc8, 0x9d, 0x20, 0xbc, 0xa1, 0x2c, 0xff, 0xbb, 0x8c, 0x39, 0xb2, + 0x3b, 0xf3, 0xbe, 0x86, 0x3d, 0xa3, 0xfa, 0xcc, 0xbd, 0x3d, 0x3c, 0x07, 0xbe, + 0xd4, 0xb4, 0xa7, 0xbd, 0x94, 0xfc, 0x71, 0x3d, 0x8b, 0xe6, 0x2e, 0x3d, 0x94, + 0x30, 0x41, 0xbd, 0xb3, 0x63, 0x18, 0x3d, 0xbf, 0x35, 0x3c, 0xbb, 0x4c, 0xaa, + 0xd9, 0xbd, 0x20, 0x83, 0xa1, 0x3d, 0xdb, 0xca, 0x49, 0x3c, 0x1d, 0xbb, 0xac, + 0xbb, 0x3c, 0xea, 0x1c, 0xbc, 0x5b, 0xc3, 0xd1, 0x3d, 0x15, 0xd3, 0xc9, 0xbd, + 0xb9, 0x30, 0x12, 0xbb, 0xe3, 0x34, 0xde, 0xbd, 0xa0, 0x31, 0xeb, 0xbd, 0xc2, + 0x64, 0xe2, 0x3d, 0xb2, 0xfd, 0xf4, 0xbd, 0x45, 0xa5, 0xbe, 0x3c, 0xa1, 0x40, + 0x56, 0xbd, 0x52, 0x01, 0xed, 0x3d, 0xd0, 0x6b, 0xfc, 0xbd, 0xef, 0x73, 0xb2, + 0xbd, 0x03, 0xa0, 0xcd, 0xbd, 0x24, 0x69, 0xbe, 0x3c, 0x76, 0xcd, 0x9e, 0x3d, + 0xbe, 0xcb, 0x3b, 0x3d, 0x55, 0x49, 0x4e, 0xbd, 0x99, 0xe9, 0xd5, 0xbc, 0x9c, + 0x73, 0x88, 0x3c, 0x9a, 0x64, 0x75, 0xbd, 0x53, 0x89, 0xb2, 0xbd, 0x73, 0xa4, + 0xb9, 0x3d, 0xa8, 0x68, 0xf3, 0xbd, 0x2a, 0xf3, 0x89, 0xbd, 0x8d, 0x63, 0x85, + 0x3c, 0xbb, 0x72, 0x63, 0x3d, 0x29, 0x8a, 0xe8, 0xbd, 0x87, 0x03, 0xab, 0x3d, + 0xbf, 0x88, 0x44, 0xbd, 0x74, 0x28, 0xae, 0xbd, 0xf7, 0xe8, 0x87, 0xbd, 0x16, + 0x46, 0x04, 0xbd, 
0x87, 0xf6, 0xcf, 0xbd, 0x8b, 0x67, 0x44, 0xbd, 0xac, 0xd4, + 0xa5, 0xbd, 0xed, 0x0b, 0xf2, 0xbd, 0x20, 0x9e, 0xf5, 0xbd, 0xc1, 0xbd, 0x70, + 0x3d, 0xae, 0xfe, 0x77, 0x3d, 0x27, 0x07, 0x82, 0xbd, 0xbe, 0x56, 0x19, 0xbd, + 0xae, 0x94, 0xc9, 0xbd, 0x7a, 0x52, 0xc6, 0xbd, 0x4e, 0x64, 0x4d, 0x3c, 0xf7, + 0xe4, 0x18, 0x3d, 0xef, 0x06, 0xa4, 0xbd, 0x8c, 0xad, 0xa8, 0xbd, 0xab, 0xcc, + 0x62, 0xbc, 0x4a, 0x7c, 0x09, 0xba, 0x01, 0x0d, 0x2b, 0xbd, 0x3d, 0x77, 0xb6, + 0x3b, 0xd3, 0x48, 0xc8, 0x3d, 0x89, 0xcf, 0x05, 0x3e, 0xdb, 0x48, 0x92, 0x3d, + 0x1e, 0xa5, 0xc9, 0x3c, 0xc7, 0xad, 0x74, 0x3d, 0x66, 0x26, 0x4e, 0xbd, 0x8f, + 0x4c, 0x85, 0x3d, 0xe2, 0x14, 0xe3, 0x3d, 0xad, 0x90, 0x2b, 0xbd, 0xcd, 0x7c, + 0xf4, 0x3d, 0xe6, 0xae, 0x98, 0x3c, 0xa6, 0x86, 0x66, 0x3c, 0x18, 0x11, 0x1f, + 0xbc, 0xb8, 0xe5, 0xa3, 0xbc, 0xea, 0xd7, 0x47, 0xbd, 0x39, 0x8a, 0xbb, 0x3d, + 0x1c, 0x27, 0x4c, 0xba, 0x50, 0x9a, 0x4b, 0xbd, 0xda, 0x55, 0x5c, 0xbd, 0xa7, + 0xd6, 0xb4, 0x3d, 0x40, 0x3f, 0xa0, 0xbd, 0x26, 0xa7, 0xba, 0xbd, 0x4c, 0xc0, + 0x5c, 0x3d, 0x5c, 0xe1, 0x96, 0x3d, 0x50, 0xd9, 0x36, 0xbb, 0x8b, 0xf8, 0x7e, + 0xbb, 0xb4, 0x9c, 0xf0, 0x3d, 0x88, 0xf4, 0xa8, 0xbd, 0x92, 0x72, 0x0e, 0xbd, + 0x18, 0xc1, 0xa0, 0x3c, 0x78, 0x3f, 0xc6, 0xbd, 0xfa, 0xec, 0xe8, 0xbd, 0xa4, + 0xbc, 0x3d, 0xbd, 0x47, 0x9d, 0xc6, 0xbc, 0x8e, 0x10, 0x4b, 0x3d, 0x18, 0x89, + 0x51, 0xbd, 0x26, 0xd5, 0x9b, 0xbd, 0xb9, 0xbb, 0x0a, 0xbe, 0xa7, 0x0f, 0x8f, + 0x3d, 0x62, 0x63, 0x4b, 0xbb, 0xfe, 0x46, 0x56, 0xbd, 0x64, 0xcc, 0xbb, 0x3d, + 0x85, 0x17, 0x52, 0x3d, 0x08, 0xa8, 0x0e, 0x3d, 0x75, 0xdc, 0x4c, 0xbd, 0xf9, + 0xc3, 0x92, 0x3d, 0xe0, 0x13, 0x84, 0x3d, 0xa1, 0x30, 0xe8, 0xbd, 0x2d, 0x2b, + 0xd0, 0xbd, 0x68, 0x62, 0x91, 0xbc, 0x32, 0xd7, 0xd3, 0xbb, 0xac, 0xd6, 0xdb, + 0x3d, 0x0d, 0x70, 0xe9, 0xbd, 0xed, 0xea, 0x69, 0x3d, 0xa4, 0xa3, 0x99, 0x3d, + 0x60, 0xa0, 0xcd, 0xbd, 0xd8, 0x9b, 0x20, 0x3c, 0x29, 0x39, 0xaf, 0x3d, 0xd3, + 0x2d, 0x2e, 0x3d, 0x10, 0xd7, 0x60, 0x3d, 0x2b, 0x82, 0xb1, 0xbd, 0x3d, 0x6b, + 0x94, 0xbd, 0x73, 0xa6, 0x24, 0x3d, 0x33, 0x6b, 0xf9, 0xbd, 0x94, 0xe1, 0xac, + 0x3d, 0xdf, 0x2c, 0x77, 0x3d, 0x82, 0x66, 0xa0, 0x3c, 0x9d, 0x7c, 0xd1, 0xbd, + 0x67, 0x66, 0x39, 0x3d, 0x1b, 0xb4, 0x5e, 0x3d, 0x0a, 0x50, 0x7f, 0x3d, 0x1a, + 0x08, 0x6c, 0x3d, 0x6c, 0x55, 0xac, 0xbd, 0x27, 0x4d, 0x04, 0xbc, 0x28, 0x6e, + 0x54, 0x3c, 0x8d, 0x2e, 0x95, 0xbd, 0x56, 0x25, 0xd5, 0x3a, 0x8d, 0xf8, 0xde, + 0xbd, 0x53, 0xd6, 0xe0, 0x3c, 0x09, 0xfc, 0x3f, 0x3d, 0x95, 0x29, 0xbe, 0xba, + 0x9b, 0x98, 0xa6, 0x3d, 0xfd, 0xd1, 0xe1, 0x3d, 0x00, 0x2a, 0x04, 0xbe, 0x06, + 0x73, 0x8b, 0xbd, 0x1e, 0x77, 0xcd, 0x3d, 0xf3, 0x47, 0x01, 0xbe, 0x41, 0x8d, + 0xd2, 0xbc, 0x98, 0xba, 0x02, 0xbe, 0x14, 0x4e, 0x84, 0xbc, 0x7b, 0xee, 0xc1, + 0x3d, 0x5c, 0x1f, 0x5f, 0xbd, 0x66, 0x1e, 0xd4, 0xbd, 0xa7, 0x18, 0x51, 0x3d, + 0xaa, 0xbb, 0x7f, 0x3b, 0x9a, 0x15, 0x33, 0x3d, 0xcd, 0x6b, 0x8d, 0x3d, 0x9c, + 0x73, 0x6d, 0xbd, 0x76, 0x3e, 0x54, 0x3c, 0x3d, 0x4f, 0xe4, 0x3d, 0x89, 0xaf, + 0xf9, 0x3d, 0x0f, 0x5f, 0x8b, 0xbd, 0x5d, 0xcc, 0x9c, 0xbd, 0x8b, 0x08, 0xf1, + 0xbd, 0xe3, 0xc3, 0x04, 0xbd, 0x5f, 0x0b, 0xf8, 0x3d, 0x4f, 0xd8, 0xaf, 0x3d, + 0x2f, 0xff, 0x3e, 0x3d, 0x07, 0xf0, 0x5f, 0xbb, 0xcd, 0x6b, 0xbd, 0xbd, 0x0a, + 0x80, 0xee, 0x3d, 0x58, 0xa2, 0xbd, 0x3c, 0xa6, 0x43, 0xf9, 0xbc, 0x7e, 0x76, + 0xbb, 0x3d, 0x0b, 0x75, 0x11, 0xb9, 0x7c, 0x78, 0x46, 0x3d, 0xe9, 0xf0, 0x73, + 0x3d, 0x6d, 0x01, 0x50, 0xbc, 0x6f, 0x55, 0x80, 0x3d, 0x88, 0x5d, 0xd4, 0xbc, + 0x20, 0x61, 0x94, 0xbd, 0xbd, 0x32, 0xa3, 0x3c, 0x91, 0x29, 0xb3, 0xbd, 0x7a, + 0x60, 0x62, 0xbc, 0xd8, 0x67, 0x99, 0xbb, 0xea, 
0xd6, 0x4a, 0xbd, 0xb2, 0xb3, + 0x14, 0xbd, 0x15, 0x9f, 0xf6, 0x3d, 0xc4, 0x35, 0xbe, 0xbd, 0xc6, 0x0b, 0x63, + 0x3d, 0x43, 0x76, 0x43, 0xbd, 0x4f, 0x5e, 0x18, 0xbc, 0x6b, 0xac, 0xb1, 0x3d, + 0x4e, 0xca, 0xd8, 0xbd, 0x2f, 0xef, 0xc3, 0x3d, 0x96, 0xc3, 0x48, 0x3c, 0x1c, + 0x73, 0x17, 0x3d, 0x56, 0x34, 0xfb, 0x3c, 0x25, 0xa7, 0xb2, 0x3d, 0x29, 0x5e, + 0xac, 0x3d, 0xdd, 0x3b, 0x80, 0x3d, 0x5a, 0xec, 0x37, 0x3c, 0xdc, 0xf9, 0x92, + 0x3b, 0x66, 0x0b, 0xc6, 0xbd, 0x75, 0x09, 0xfc, 0xbc, 0x55, 0xd9, 0xea, 0xbd, + 0x01, 0xed, 0x7a, 0x3c, 0x90, 0x7d, 0x5e, 0xbd, 0xb8, 0x38, 0xc9, 0x3d, 0xb8, + 0x23, 0xa6, 0x3d, 0xb8, 0x83, 0x01, 0x3e, 0xe8, 0x22, 0xda, 0x3c, 0x66, 0xf5, + 0x92, 0x3d, 0x82, 0xe0, 0x87, 0x3c, 0x6f, 0xa1, 0x6e, 0x3d, 0x27, 0xca, 0xaf, + 0x3c, 0x7f, 0x68, 0xd6, 0xbd, 0x38, 0x98, 0x93, 0x3d, 0x4d, 0xdc, 0x5e, 0x3d, + 0xc8, 0xb8, 0xb2, 0x3d, 0xab, 0xeb, 0x8a, 0xbb, 0x39, 0x48, 0xbb, 0xbd, 0x17, + 0xe6, 0x0f, 0x3d, 0x57, 0x79, 0xea, 0xbc, 0xb2, 0x5e, 0xdb, 0x3d, 0x0c, 0x19, + 0xc7, 0xbd, 0xeb, 0x33, 0x2b, 0x3d, 0x4b, 0x15, 0xf6, 0x3d, 0x96, 0x9b, 0xa1, + 0xbc, 0x5c, 0xc8, 0x03, 0xbd, 0x88, 0x56, 0x21, 0x3e, 0x85, 0x0c, 0xa5, 0x3c, + 0x85, 0xcb, 0xf4, 0xbd, 0x61, 0x03, 0x4d, 0x3c, 0xf1, 0xf4, 0x8c, 0xbd, 0x7b, + 0x39, 0x34, 0x3b, 0xf4, 0xa2, 0x47, 0xbc, 0x10, 0x2d, 0xfc, 0xbd, 0xe8, 0xdd, + 0xe6, 0x3c, 0xa5, 0x7c, 0x85, 0x3c, 0x3f, 0xcd, 0xeb, 0xbc, 0x42, 0x94, 0xba, + 0xbd, 0x50, 0x23, 0xe3, 0xbd, 0x92, 0xf6, 0xa7, 0xbd, 0x5c, 0x36, 0xd0, 0xbd, + 0x27, 0x9e, 0x18, 0x3e, 0x33, 0x9a, 0xe8, 0xbc, 0x80, 0x3a, 0x5d, 0x3d, 0xd0, + 0xdc, 0x9c, 0xbd, 0xa3, 0x93, 0x51, 0xbd, 0x36, 0xab, 0x7a, 0x3d, 0x74, 0x9c, + 0x63, 0x3d, 0x1c, 0x19, 0x9b, 0xbd, 0xa6, 0x10, 0xb4, 0xbd, 0xf4, 0x80, 0xb4, + 0xbc, 0xd3, 0x9c, 0xd2, 0xbc, 0x6d, 0x1b, 0x68, 0xbd, 0x31, 0x6a, 0xfd, 0xbd, + 0xdc, 0xa4, 0x82, 0xbd, 0xa7, 0xe7, 0x37, 0xbd, 0x5c, 0xd1, 0x07, 0xbd, 0x4e, + 0x82, 0x15, 0xbc, 0x31, 0x43, 0x16, 0x3e, 0xe2, 0xf3, 0x1e, 0x3e, 0x62, 0x22, + 0x14, 0x3e, 0x27, 0x65, 0x0d, 0x39, 0xaa, 0x9e, 0x8f, 0x3d, 0xdd, 0x59, 0x4c, + 0x3c, 0x4a, 0xc5, 0xc5, 0xbd, 0x4a, 0xa5, 0xc7, 0x3b, 0xb9, 0x73, 0xcc, 0x3d, + 0x10, 0x62, 0x5c, 0x3c, 0x87, 0xd8, 0xb2, 0xbd, 0x15, 0x50, 0xf8, 0x3d, 0xd7, + 0x7f, 0x91, 0xbd, 0xf4, 0x07, 0xfb, 0x3c, 0x93, 0x09, 0xae, 0xbc, 0x54, 0x19, + 0x76, 0x3a, 0x42, 0x4f, 0xbe, 0xbc, 0x6a, 0xef, 0xee, 0x3d, 0x98, 0x97, 0xb7, + 0x3d, 0x33, 0x07, 0x3c, 0xbd, 0xe0, 0xc2, 0x46, 0x3c, 0x33, 0x5f, 0x80, 0x3c, + 0x4d, 0x5e, 0xff, 0xbc, 0x4e, 0x02, 0xe8, 0xbc, 0x1f, 0x5b, 0xcd, 0xbc, 0x2d, + 0x41, 0x8a, 0x3d, 0x2d, 0xeb, 0x5e, 0xbd, 0xff, 0x53, 0xb0, 0x3d, 0x7c, 0x37, + 0xb0, 0x3c, 0x0b, 0xc9, 0x87, 0xbd, 0x32, 0xd1, 0xe6, 0xbb, 0xc0, 0x2f, 0xcf, + 0x3d, 0x42, 0x5e, 0xb5, 0x3d, 0xd4, 0xbf, 0x36, 0xbd, 0x26, 0xd8, 0xf1, 0xbd, + 0xf3, 0x8b, 0xc2, 0x3d, 0x1d, 0xd9, 0xe7, 0xbb, 0xab, 0xf9, 0x16, 0x3d, 0x13, + 0x82, 0x93, 0x3d, 0x5e, 0xab, 0xbc, 0xbd, 0x57, 0xf5, 0x2f, 0x3c, 0x86, 0x19, + 0x96, 0x3c, 0x17, 0xb1, 0x3e, 0x3d, 0xcd, 0xfd, 0x72, 0xbd, 0xae, 0x8d, 0xbf, + 0x3c, 0x5e, 0x94, 0x5c, 0x3d, 0x16, 0x67, 0x88, 0x3d, 0xf1, 0xcb, 0x43, 0xbd, + 0xc5, 0x5e, 0x6b, 0xbd, 0xa0, 0xc2, 0xdb, 0x3d, 0x94, 0x36, 0x11, 0xbd, 0x26, + 0xb6, 0xb2, 0xbd, 0xe6, 0x9d, 0x93, 0xbd, 0x66, 0x04, 0x5e, 0xbd, 0xed, 0xfe, + 0xaf, 0xbb, 0xbc, 0x70, 0x50, 0x3d, 0x0a, 0xeb, 0xd0, 0xbd, 0x3d, 0x06, 0xb5, + 0x3d, 0xa7, 0x77, 0x31, 0xbd, 0x5f, 0x4b, 0xa6, 0xbd, 0x9b, 0x0f, 0x96, 0xbc, + 0x7e, 0x02, 0xd4, 0xbc, 0x39, 0x52, 0xc4, 0xbd, 0xc3, 0x4e, 0x09, 0x3e, 0x5c, + 0xc9, 0x48, 0x3d, 0xa4, 0x28, 0x36, 0xbd, 0xe3, 0xa7, 0x31, 0x3b, 0xdd, 0x29, + 
0xf4, 0x3d, 0x30, 0x52, 0x76, 0x3d, 0x10, 0xa8, 0x27, 0x3c, 0x0c, 0x16, 0x56, + 0x3d, 0x84, 0xd6, 0x1a, 0xbd, 0x34, 0xea, 0xaa, 0x3c, 0x8b, 0xaa, 0x50, 0xbc, + 0x02, 0x56, 0xc2, 0x3c, 0xee, 0x61, 0xe8, 0xbd, 0xf2, 0xaa, 0xb0, 0x3d, 0x22, + 0xd5, 0x23, 0x3e, 0x2d, 0x7d, 0x62, 0xbd, 0x8a, 0x95, 0x6d, 0xbc, 0x6a, 0xaf, + 0xb4, 0xbb, 0x34, 0x65, 0xad, 0x3d, 0x14, 0xff, 0xda, 0xbd, 0x43, 0xdc, 0x04, + 0xbd, 0x26, 0xed, 0xa8, 0xbd, 0x97, 0xc7, 0xc3, 0x3d, 0x76, 0x2d, 0xd3, 0xbc, + 0xe1, 0xc3, 0xbd, 0xbd, 0x75, 0x52, 0xca, 0x3c, 0x84, 0xfa, 0x13, 0x3c, 0x2e, + 0xea, 0x00, 0xbd, 0xb9, 0xbc, 0xcf, 0x3d, 0xcb, 0x67, 0x65, 0xbd, 0xda, 0x95, + 0xac, 0xbd, 0x51, 0x71, 0xed, 0x3c, 0xaf, 0xe1, 0x2c, 0xbd, 0xbf, 0x09, 0x2c, + 0xba, 0xd1, 0xdc, 0xab, 0xbd, 0x60, 0xab, 0x71, 0xbc, 0x10, 0xa2, 0x2b, 0xbd, + 0xb7, 0xba, 0x8f, 0xbd, 0x5e, 0x4b, 0x18, 0x3d, 0x4f, 0x72, 0xa6, 0xbc, 0xbb, + 0x54, 0xc5, 0x3d, 0x2a, 0x54, 0xeb, 0xbd, 0x5b, 0x2e, 0x67, 0xbd, 0xc0, 0xd2, + 0x61, 0x3b, 0x30, 0x8d, 0x34, 0x3d, 0xaa, 0x2e, 0xfe, 0xbc, 0x37, 0xa2, 0x7b, + 0xbd, 0xb0, 0x0d, 0x7c, 0xbd, 0x05, 0x3f, 0x39, 0x3d, 0x52, 0xfc, 0xb2, 0x3d, + 0xe8, 0x4a, 0xe6, 0xbd, 0x49, 0x3f, 0xd0, 0x3c, 0x1d, 0x43, 0x1a, 0xbd, 0x52, + 0xcc, 0xc7, 0x3d, 0x6a, 0x3f, 0x72, 0x3b, 0x47, 0x6e, 0xdb, 0xbd, 0x6b, 0x97, + 0xc2, 0xbd, 0xa0, 0x78, 0xe5, 0xbc, 0x01, 0xb0, 0xd8, 0xbc, 0xd0, 0x9f, 0x9f, + 0xbc, 0x51, 0x99, 0x79, 0x3d, 0xf1, 0xd4, 0x1d, 0x3b, 0xe6, 0x19, 0x78, 0x3c, + 0xb0, 0x8a, 0x8e, 0xbd, 0x90, 0xfc, 0xc9, 0x3d, 0x91, 0xe7, 0x85, 0x3d, 0xdd, + 0xe2, 0x09, 0x3d, 0xb6, 0xf7, 0x5a, 0xbd, 0x26, 0xe8, 0xdc, 0xbd, 0x42, 0xca, + 0x18, 0xbd, 0x2a, 0x1d, 0xb4, 0xbd, 0x83, 0x0b, 0xf1, 0x3a, 0xbd, 0x7b, 0x15, + 0x3c, 0xf1, 0x7b, 0xa6, 0xbd, 0x55, 0xe4, 0x4d, 0xbd, 0xed, 0x07, 0xf8, 0xbc, + 0xf3, 0x73, 0xa0, 0x3d, 0x75, 0x8a, 0xc5, 0xbd, 0x44, 0x2f, 0x7f, 0x3d, 0x35, + 0x6c, 0x87, 0x3c, 0x61, 0x2c, 0x4b, 0xbc, 0x67, 0xde, 0x7d, 0xbd, 0x17, 0xaf, + 0xe9, 0x3c, 0xaa, 0xd5, 0x0c, 0x3d, 0x98, 0xf5, 0xd8, 0xbc, 0x86, 0xa5, 0x2c, + 0xbb, 0xad, 0x8e, 0x43, 0x3d, 0xd2, 0x59, 0xbd, 0xbd, 0x94, 0xc9, 0x69, 0xbd, + 0x15, 0xa0, 0x81, 0x3d, 0x18, 0x49, 0x1e, 0x3d, 0xe7, 0xd7, 0xb5, 0xbd, 0x1f, + 0x20, 0x10, 0xbd, 0xb0, 0x8b, 0xe0, 0xbd, 0xe0, 0x7c, 0x46, 0x3d, 0x1f, 0xc6, + 0x5c, 0xbd, 0xbc, 0xc1, 0x1b, 0x3d, 0xc1, 0x1c, 0xc5, 0xbd, 0xf3, 0x52, 0x48, + 0xbb, 0x39, 0x79, 0x86, 0x3d, 0x72, 0xbd, 0x36, 0x3c, 0xa5, 0xd7, 0x95, 0xbd, + 0x73, 0xe0, 0x13, 0x3c, 0xe4, 0x9a, 0x50, 0xbd, 0x90, 0x58, 0x93, 0xbd, 0x3d, + 0x9e, 0xac, 0x3d, 0x57, 0x08, 0xbb, 0x3d, 0x4e, 0xaf, 0x84, 0xbd, 0xdc, 0x16, + 0xbc, 0xbd, 0x51, 0x1a, 0xbf, 0x3d, 0x62, 0x61, 0x97, 0x3d, 0x7a, 0xeb, 0x45, + 0x3d, 0xa1, 0x27, 0xe7, 0x3d, 0x20, 0xcb, 0x45, 0xbd, 0xc3, 0x36, 0xda, 0x3d, + 0xa2, 0x88, 0x48, 0x3d, 0x7c, 0x0d, 0x0d, 0x3b, 0x00, 0xa8, 0xaf, 0xbd, 0xda, + 0x09, 0x51, 0xbd, 0xbd, 0xb3, 0x99, 0xbc, 0x6e, 0x40, 0x6a, 0xbd, 0x31, 0xdb, + 0x71, 0x3c, 0x14, 0x0e, 0x0b, 0xbd, 0xe8, 0x4f, 0xae, 0xbd, 0xbb, 0xf3, 0xd4, + 0x3d, 0xad, 0xdb, 0x8d, 0x3c, 0x72, 0x12, 0x66, 0xbd, 0x1f, 0xea, 0x98, 0xbd, + 0xf7, 0xd0, 0x68, 0x3d, 0x47, 0x27, 0x13, 0x3d, 0xe9, 0x9d, 0xa2, 0xbd, 0x01, + 0x07, 0xa9, 0x3d, 0x81, 0xa9, 0xa2, 0x3c, 0x54, 0x75, 0xb5, 0xbc, 0xbc, 0x9f, + 0x8e, 0x3c, 0xdd, 0x55, 0x8c, 0x3c, 0xf6, 0x8f, 0xdc, 0x3d, 0x63, 0x45, 0xe7, + 0x3c, 0xc2, 0x06, 0x48, 0x3c, 0x63, 0x7a, 0xe9, 0xbd, 0xb0, 0x14, 0x3f, 0x3d, + 0x1b, 0x99, 0xe4, 0xbd, 0x0d, 0xa5, 0x89, 0x3d, 0x5d, 0x1e, 0xc4, 0xbd, 0x9b, + 0x12, 0x8e, 0x3d, 0x47, 0xa7, 0xb6, 0xbc, 0xc7, 0x3f, 0xf3, 0xbd, 0x82, 0x32, + 0x8f, 0xbd, 0xed, 0x11, 0xbe, 
0x3d, 0xe4, 0x1e, 0xc6, 0xbc, 0x9d, 0x73, 0xee, + 0xbd, 0xce, 0x18, 0xe3, 0xbd, 0x3f, 0x2c, 0x90, 0xbd, 0xc6, 0x82, 0xad, 0x3d, + 0xa4, 0x9e, 0xf1, 0xbd, 0x6e, 0x4f, 0xe7, 0x3d, 0x63, 0x8b, 0x28, 0xbd, 0x0a, + 0x66, 0x80, 0xbd, 0xa0, 0xa5, 0x84, 0xbd, 0xb0, 0xce, 0xbb, 0xbd, 0x72, 0xba, + 0xa1, 0xbd, 0x42, 0x55, 0xa6, 0xbd, 0x36, 0x00, 0xce, 0x3d, 0x11, 0x44, 0xbc, + 0x3b, 0xb4, 0x63, 0xa9, 0x3d, 0x07, 0x61, 0x9b, 0x3d, 0x50, 0xb7, 0xb3, 0xbd, + 0xe1, 0xcc, 0x74, 0xbd, 0xa1, 0x8e, 0x6c, 0x3d, 0xa6, 0x54, 0xb6, 0xbd, 0xce, + 0xde, 0xb4, 0x3c, 0x29, 0xd3, 0x31, 0xbc, 0x74, 0x1c, 0x78, 0xbd, 0xa7, 0xa4, + 0x25, 0xbb, 0x01, 0xe0, 0x85, 0x3d, 0x67, 0xc7, 0xbd, 0xbc, 0xae, 0xdb, 0x3a, + 0xbd, 0xaa, 0x9c, 0xdd, 0xbd, 0x7a, 0x65, 0xaa, 0xbc, 0x11, 0x1d, 0x53, 0xbd, + 0xc0, 0xf8, 0x3a, 0xbd, 0x50, 0xd4, 0x84, 0xbc, 0x3b, 0x49, 0x7f, 0xbd, 0x44, + 0x79, 0xde, 0x3d, 0xb9, 0x83, 0xfb, 0x3d, 0x12, 0x34, 0x8d, 0xbd, 0x0a, 0x31, + 0xf0, 0x3c, 0x16, 0x71, 0x4e, 0xbd, 0xc4, 0x6a, 0x5f, 0x3d, 0x5a, 0xbe, 0x7e, + 0x3d, 0xca, 0x56, 0xe7, 0xbc, 0xe7, 0xa1, 0xb8, 0xbd, 0xf7, 0xac, 0x17, 0x3d, + 0xf1, 0x7c, 0x83, 0xbd, 0xe4, 0x5f, 0xec, 0xbd, 0x18, 0x92, 0xa9, 0xbb, 0x71, + 0x9a, 0x3d, 0xbd, 0xd1, 0x18, 0x20, 0xbd, 0x94, 0xfa, 0xbd, 0x3d, 0x2f, 0x1f, + 0x85, 0xbd, 0xc1, 0xc3, 0xa3, 0x3d, 0x36, 0xdb, 0x96, 0x3d, 0xa5, 0xae, 0x4e, + 0xbc, 0xaa, 0x11, 0x9c, 0xbd, 0x44, 0xa2, 0x95, 0x3d, 0xe7, 0x39, 0x73, 0x3b, + 0x1d, 0x57, 0x86, 0xbd, 0x14, 0x17, 0xa7, 0xbd, 0xaf, 0xc3, 0x09, 0xbd, 0x2f, + 0x90, 0x20, 0xbd, 0x08, 0x91, 0x9c, 0x3c, 0x88, 0x0c, 0xd1, 0x3d, 0x56, 0x99, + 0x9d, 0xbd, 0xb3, 0x75, 0xb2, 0x3d, 0xa1, 0x04, 0x59, 0xbb, 0x44, 0x0a, 0x6f, + 0x3b, 0x5a, 0x42, 0xce, 0xbd, 0x1b, 0x3b, 0x91, 0x3d, 0x14, 0xb8, 0xdf, 0xbd, + 0x85, 0x51, 0x8c, 0xbc, 0xa7, 0xd5, 0x5f, 0x3d, 0xe7, 0x88, 0x61, 0xbd, 0x97, + 0x11, 0xd9, 0x39, 0x5c, 0x0b, 0x6d, 0xbd, 0xe4, 0xe3, 0xb1, 0xbd, 0xeb, 0xfe, + 0xeb, 0xbd, 0xd3, 0x37, 0x66, 0x3c, 0x4b, 0x72, 0x49, 0xbd, 0x12, 0x06, 0xbf, + 0x3b, 0x12, 0x40, 0x77, 0x3d, 0x7c, 0x9d, 0x92, 0x3d, 0xb2, 0xcd, 0xad, 0x3d, + 0xb2, 0xe3, 0x65, 0x3d, 0x91, 0x55, 0xbd, 0x3c, 0x31, 0x00, 0xc0, 0xbd, 0xc9, + 0x3b, 0x46, 0x3d, 0x51, 0xd9, 0xa6, 0x3d, 0xb9, 0xcb, 0xaf, 0xbd, 0xf8, 0x85, + 0xd4, 0xbd, 0x47, 0x6f, 0xf2, 0xbd, 0x70, 0xd4, 0x13, 0x3d, 0x2c, 0x38, 0x55, + 0x3d, 0x61, 0x11, 0xd7, 0x3d, 0x62, 0x90, 0xed, 0xbc, 0xd0, 0x71, 0x79, 0xbd, + 0xc5, 0xc9, 0x87, 0xbd, 0x6d, 0x23, 0x96, 0xbc, 0xc1, 0x06, 0x9b, 0xbd, 0xc8, + 0x2d, 0xfc, 0xbc, 0x79, 0x8d, 0xb8, 0xbd, 0xb3, 0x32, 0xca, 0xbc, 0x17, 0x71, + 0xd3, 0xbd, 0x51, 0x07, 0xc6, 0xbc, 0x59, 0x04, 0x49, 0x3d, 0x15, 0x14, 0x8a, + 0xbd, 0xd0, 0xae, 0xa4, 0xbd, 0x4c, 0x5f, 0xdd, 0x3d, 0xb5, 0x52, 0xbc, 0x3b, + 0x4d, 0xca, 0x3f, 0xbd, 0x85, 0x21, 0xb0, 0xbd, 0x9e, 0x8b, 0xc3, 0xbd, 0x51, + 0xd9, 0xa8, 0x3d, 0x53, 0x49, 0xd1, 0x3c, 0x35, 0x6f, 0xe3, 0xbd, 0x7f, 0xe2, + 0x9e, 0xbd, 0x42, 0xd8, 0x14, 0xbd, 0x00, 0x6f, 0x19, 0x3d, 0xe1, 0x4e, 0x53, + 0x3d, 0xda, 0xc8, 0x66, 0xbd, 0xf1, 0x51, 0xea, 0xbd, 0x8a, 0x7f, 0xbb, 0x3d, + 0xa6, 0x85, 0x10, 0xbd, 0x4e, 0xcc, 0xd7, 0x3d, 0x8b, 0x94, 0xad, 0xbd, 0xaa, + 0x92, 0x92, 0xbc, 0xdb, 0xcd, 0x3a, 0x3d, 0x43, 0x71, 0x99, 0x3d, 0xa0, 0xeb, + 0xe1, 0x3d, 0xbe, 0x5e, 0xe3, 0x3c, 0x43, 0x28, 0x98, 0xbd, 0x04, 0x2b, 0x96, + 0xbd, 0xc6, 0x1a, 0x21, 0xbb, 0xce, 0xba, 0xd3, 0xbd, 0x57, 0xee, 0x04, 0x3d, + 0x87, 0xf6, 0x8a, 0xbb, 0xda, 0x72, 0x99, 0x3d, 0xcb, 0x2f, 0x8a, 0x3d, 0x1f, + 0x20, 0xb5, 0xbd, 0xbe, 0x1f, 0x1e, 0xbd, 0x17, 0x5e, 0x84, 0xbd, 0xfd, 0xce, + 0xb2, 0xbd, 0xfc, 0xcc, 0x74, 0x3d, 0x66, 0x53, 0xca, 0x3c, 
0x35, 0x5e, 0x9e, + 0x3d, 0x6c, 0x9b, 0xb4, 0x3d, 0x08, 0xbd, 0x90, 0x3d, 0x45, 0xc0, 0xc1, 0xbd, + 0x83, 0x2c, 0xd3, 0xbc, 0x85, 0xa9, 0x81, 0xbc, 0xa4, 0x47, 0xbc, 0x3d, 0xc2, + 0xc6, 0x91, 0xbb, 0x45, 0xf7, 0x51, 0x3d, 0x7c, 0x74, 0x32, 0x3d, 0x64, 0x6d, + 0x67, 0xbd, 0xaf, 0x34, 0x37, 0x3d, 0xea, 0xb0, 0x95, 0xbd, 0xe6, 0x42, 0x22, + 0x3d, 0xe4, 0x2b, 0xf9, 0xbd, 0x27, 0x85, 0x8c, 0xbc, 0x57, 0x16, 0xd4, 0x3d, + 0x0d, 0x41, 0xb9, 0xbc, 0xde, 0xf7, 0xb3, 0xbc, 0xb1, 0x86, 0x5a, 0x3d, 0x16, + 0x06, 0x99, 0x3d, 0x36, 0x5c, 0xf2, 0x3d, 0x96, 0x49, 0xfc, 0xbd, 0xd0, 0xda, + 0x0b, 0xbd, 0x74, 0x35, 0xfd, 0x3d, 0x3c, 0x9d, 0x12, 0xbd, 0x88, 0xae, 0xc0, + 0xbd, 0xd6, 0xe7, 0x5e, 0x3d, 0x31, 0x3f, 0xba, 0xbd, 0x0a, 0x05, 0xb9, 0xbd, + 0x8d, 0xe3, 0x35, 0xbd, 0x83, 0xd0, 0x26, 0xbd, 0x04, 0xba, 0x97, 0xbc, 0x46, + 0x99, 0xbf, 0xbd, 0xa1, 0x44, 0x75, 0x3b, 0xb8, 0x9b, 0x07, 0x3e, 0x32, 0xe6, + 0xd5, 0xbd, 0xc0, 0x9f, 0xf3, 0x3d, 0x7f, 0x4f, 0x36, 0xbc, 0x42, 0xda, 0xe3, + 0x3d, 0x3b, 0xb2, 0x5c, 0x3c, 0x97, 0x30, 0xd7, 0x3d, 0x51, 0xe8, 0xea, 0xbc, + 0x6e, 0x73, 0x4d, 0x3d, 0x2f, 0x77, 0xb5, 0x3b, 0x0b, 0x79, 0xc1, 0x3c, 0x2f, + 0xd9, 0x8c, 0xbd, 0x0e, 0x78, 0xbf, 0xbd, 0x3c, 0xec, 0x84, 0x3d, 0x59, 0xa9, + 0xaa, 0xbd, 0x35, 0xdc, 0xe4, 0xbd, 0x91, 0xcf, 0x2e, 0x3d, 0x3c, 0x17, 0x0d, + 0xbc, 0x10, 0xd0, 0xf9, 0x3d, 0xab, 0xca, 0xf9, 0xbd, 0x4b, 0xd7, 0x9b, 0x3d, + 0xd0, 0x10, 0xc9, 0xbd, 0x11, 0x82, 0x05, 0x3e, 0xd0, 0x14, 0x21, 0xbd, 0x6d, + 0x61, 0x99, 0xbd, 0xae, 0x85, 0x7a, 0xbd, 0x67, 0xc0, 0x86, 0xbb, 0x1e, 0xd0, + 0xbf, 0x3d, 0x92, 0x46, 0xf8, 0xbc, 0x0d, 0xad, 0xa1, 0x3c, 0xea, 0x8d, 0xd0, + 0x3c, 0x61, 0x10, 0x49, 0x3c, 0x8a, 0x7e, 0xe9, 0xbc, 0x31, 0x95, 0xdf, 0xb9, + 0xb5, 0x03, 0x0d, 0x3d, 0x0b, 0xf5, 0xd9, 0xbb, 0xba, 0x95, 0x8f, 0xbd, 0x7c, + 0x81, 0xde, 0xbd, 0xfc, 0x64, 0xcb, 0x3d, 0x0e, 0x80, 0x2c, 0x3d, 0x64, 0xa8, + 0x0b, 0x3d, 0x58, 0xd7, 0xcc, 0xbc, 0x06, 0x10, 0x81, 0x3d, 0xd6, 0x24, 0x2f, + 0xbe, 0x2f, 0x77, 0x4e, 0xbd, 0x53, 0x72, 0x1a, 0xbd, 0xc1, 0x05, 0x6e, 0x3d, + 0x0b, 0x99, 0x8e, 0xbd, 0x30, 0x10, 0x04, 0xbd, 0xc3, 0x1c, 0x00, 0xbd, 0xf1, + 0x16, 0xba, 0xbd, 0x00, 0x43, 0x03, 0xbc, 0xb8, 0x2d, 0xf4, 0x3c, 0x18, 0x18, + 0x4d, 0x3d, 0x70, 0x7c, 0x99, 0xb9, 0x49, 0xef, 0xd2, 0xbc, 0x8a, 0xa4, 0x11, + 0x3d, 0xe4, 0x8b, 0x5b, 0xbc, 0x16, 0xc1, 0x8c, 0xb9, 0x71, 0xa4, 0x37, 0x3d, + 0xb2, 0xa4, 0xb0, 0x3c, 0x79, 0x6c, 0x8a, 0x3d, 0xb6, 0x86, 0x96, 0x3c, 0x06, + 0xd1, 0x58, 0xbd, 0xae, 0x40, 0x92, 0xbc, 0x4c, 0x63, 0xa7, 0x3d, 0xac, 0x67, + 0xb4, 0xbd, 0x5b, 0xda, 0x17, 0xbd, 0xeb, 0xfc, 0x09, 0x3d, 0x44, 0x95, 0x68, + 0x3c, 0x03, 0xee, 0xd7, 0x3d, 0x57, 0x9f, 0xc2, 0x3d, 0x9c, 0xa6, 0xe7, 0x3b, + 0xff, 0x8e, 0xcd, 0xbc, 0x22, 0x41, 0xf7, 0x3c, 0x19, 0xe0, 0x1d, 0xbd, 0xae, + 0xcc, 0xe2, 0x3b, 0x70, 0xb1, 0x9f, 0x3d, 0xd8, 0x1d, 0xb7, 0x3d, 0xa1, 0xde, + 0x4d, 0x3c, 0x12, 0xb6, 0x08, 0x3e, 0x1d, 0x9c, 0xbf, 0x3d, 0xd8, 0x48, 0x4a, + 0xbb, 0x07, 0xd1, 0x5e, 0xbd, 0xd3, 0x82, 0xb1, 0x3d, 0x82, 0xef, 0x8d, 0x3d, + 0x40, 0x79, 0xe5, 0xbc, 0x3f, 0x85, 0x8b, 0x3d, 0x6a, 0xa3, 0xa7, 0xbd, 0xed, + 0xd4, 0xaf, 0xbd, 0x15, 0xf2, 0x96, 0xbd, 0x16, 0x8b, 0xf2, 0xbc, 0xdc, 0x5f, + 0xc8, 0xbd, 0xef, 0x46, 0xb3, 0xbd, 0x41, 0x7a, 0x8c, 0xbd, 0x24, 0xfe, 0x62, + 0xbd, 0xdf, 0xab, 0x89, 0xbb, 0xa9, 0x9c, 0xd6, 0x3d, 0xf5, 0xc0, 0x2c, 0x3d, + 0x20, 0x81, 0xef, 0x3d, 0x1d, 0x1f, 0xd8, 0x3d, 0xe3, 0xea, 0xb7, 0xbc, 0xe5, + 0x98, 0xb7, 0x3d, 0x97, 0x67, 0x48, 0x3d, 0x42, 0x5e, 0x10, 0xbe, 0x52, 0xdd, + 0xb2, 0xbd, 0x79, 0x0f, 0x60, 0x3d, 0x7e, 0xc5, 0x1c, 0x3d, 0x9b, 0x47, 0x8a, + 0xbd, 0xfe, 
0x5a, 0x90, 0xba, 0xb3, 0x60, 0x7e, 0xbd, 0x59, 0x16, 0x7e, 0xbd, + 0xb6, 0xb7, 0x01, 0x3d, 0x0d, 0x3c, 0xed, 0xbc, 0x0d, 0x44, 0x3c, 0xbb, 0x77, + 0x3f, 0xf6, 0xbc, 0x74, 0x91, 0xb9, 0x3d, 0x15, 0xa6, 0x38, 0xbd, 0x6f, 0xa1, + 0x39, 0x3d, 0xc8, 0x2e, 0xd8, 0x3d, 0x70, 0xf9, 0x7c, 0xbc, 0x17, 0x9c, 0xa5, + 0x3a, 0xfd, 0x15, 0x0a, 0x3d, 0x55, 0x8c, 0xa7, 0x3d, 0xff, 0x06, 0x22, 0xbd, + 0x2d, 0x31, 0x15, 0xbe, 0x70, 0x92, 0x92, 0xbd, 0x29, 0x8a, 0x0d, 0x3b, 0x6b, + 0xca, 0x3d, 0xbd, 0xf2, 0xe1, 0x28, 0xbc, 0x36, 0x7a, 0x44, 0xbc, 0xea, 0x62, + 0xd9, 0x3a, 0xd2, 0xdd, 0x9e, 0xbc, 0xda, 0xce, 0x16, 0xbe, 0x79, 0x5e, 0x97, + 0x3b, 0x26, 0x34, 0x38, 0xbd, 0x77, 0x5d, 0x97, 0x3c, 0xc6, 0xcb, 0x84, 0xbd, + 0xed, 0xa4, 0xda, 0x3d, 0xd2, 0x4f, 0x6d, 0xbc, 0x35, 0x16, 0xdc, 0xbd, 0xea, + 0xfb, 0x08, 0xbe, 0x84, 0xea, 0x1e, 0xbd, 0x0e, 0x3a, 0x60, 0xb8, 0x4f, 0x4b, + 0x0a, 0xbe, 0xfe, 0x33, 0x87, 0x3d, 0x63, 0x5e, 0x8d, 0x3d, 0x68, 0x29, 0x17, + 0x3e, 0xa5, 0x25, 0x8f, 0xbc, 0x0a, 0x09, 0x78, 0xbd, 0x43, 0x98, 0x6d, 0xbd, + 0x98, 0xa8, 0xa0, 0xbd, 0x7c, 0xa3, 0x13, 0x3d, 0xd4, 0xb8, 0x6d, 0xbc, 0x20, + 0x1f, 0xc5, 0xbc, 0x06, 0xb5, 0x16, 0x3e, 0xcd, 0x4d, 0x90, 0xbd, 0xb8, 0xcc, + 0xd4, 0x3d, 0xbd, 0xe9, 0xd1, 0xbd, 0x90, 0x68, 0xcf, 0x3d, 0xa7, 0xc6, 0x08, + 0xbe, 0x1c, 0xe5, 0x5c, 0xbd, 0x6e, 0x56, 0xa6, 0x3d, 0x74, 0x4f, 0xa5, 0x3d, + 0x96, 0x2b, 0x5a, 0x3d, 0xbe, 0xc6, 0x9b, 0xbd, 0x94, 0x33, 0x18, 0x3d, 0x57, + 0x1a, 0x6b, 0xbd, 0xd7, 0x3d, 0x03, 0xbe, 0x6a, 0x36, 0x65, 0xbd, 0x13, 0x36, + 0xbf, 0x3d, 0x82, 0x9a, 0x0a, 0x3d, 0x3c, 0x1d, 0xca, 0xbd, 0x0c, 0x40, 0x0e, + 0xbe, 0x3f, 0x94, 0xae, 0xbd, 0x1f, 0x7e, 0x89, 0x3d, 0xe3, 0xbf, 0x30, 0xbe, + 0x7a, 0x48, 0x23, 0x3a, 0xe5, 0x0e, 0x5d, 0x3d, 0x91, 0xd3, 0xf2, 0x3d, 0xb6, + 0xef, 0x4a, 0xbd, 0xd4, 0xb3, 0x08, 0xbe, 0xa9, 0xba, 0xac, 0x3d, 0x31, 0x40, + 0x86, 0x3d, 0xc2, 0xc7, 0x04, 0xbe, 0x7c, 0x3b, 0xdb, 0x3d, 0x11, 0x25, 0x04, + 0xbd, 0x3f, 0x5d, 0xf3, 0xbc, 0xc2, 0x3f, 0xfb, 0x3c, 0x12, 0xac, 0xf4, 0xbd, + 0xa7, 0xc4, 0x32, 0x3c, 0xc9, 0xea, 0xe3, 0x3c, 0x7d, 0xda, 0x36, 0x3c, 0x43, + 0x55, 0x09, 0x3e, 0x5f, 0xd8, 0x22, 0xbd, 0x33, 0xf5, 0x29, 0x3e, 0xb8, 0x23, + 0x8a, 0xbc, 0xfb, 0x3f, 0x52, 0xbe, 0xec, 0x1c, 0x79, 0x3d, 0x09, 0x9e, 0x24, + 0xbd, 0x5b, 0x3c, 0xd3, 0xbd, 0x9f, 0x0b, 0x1f, 0x3e, 0x1f, 0xa2, 0xfc, 0xbd, + 0x3b, 0x42, 0x9b, 0x3b, 0x0a, 0xae, 0xc4, 0xbc, 0x8b, 0xc8, 0xa7, 0x3d, 0x88, + 0xaa, 0x9b, 0xbd, 0xaa, 0x37, 0xb6, 0x3d, 0x0d, 0x6a, 0x15, 0x3d, 0x47, 0xa8, + 0x87, 0x3d, 0x53, 0xb1, 0xe3, 0x3d, 0xf7, 0x63, 0x0e, 0x3c, 0x37, 0x70, 0x8e, + 0xbc, 0xc5, 0x5c, 0x32, 0xbe, 0x72, 0x7a, 0xd5, 0x3d, 0xcb, 0xac, 0xc7, 0xbd, + 0x6f, 0xf1, 0x3a, 0xbd, 0x74, 0x40, 0x99, 0x3d, 0x35, 0x16, 0x88, 0xbc, 0xb4, + 0x80, 0x14, 0x3e, 0x0b, 0x98, 0xd9, 0x3c, 0xa7, 0x98, 0x17, 0xbc, 0x6e, 0xd0, + 0x60, 0xbb, 0xd9, 0xc2, 0x8f, 0x3d, 0xea, 0x37, 0xe1, 0xbd, 0x00, 0x42, 0xfd, + 0x3d, 0xde, 0xb0, 0x3a, 0x3d, 0x4f, 0xe2, 0x50, 0x3c, 0x76, 0x9f, 0x42, 0xbd, + 0x73, 0x18, 0x4e, 0xbe, 0x9b, 0xfd, 0x69, 0xbd, 0x69, 0xb2, 0x88, 0xbc, 0x6a, + 0x13, 0x3e, 0xbd, 0x29, 0xf0, 0x0c, 0x3c, 0x1f, 0x81, 0x18, 0x3d, 0x03, 0x2e, + 0x0c, 0x3e, 0xff, 0xf1, 0x4a, 0xbc, 0xb7, 0x9c, 0x14, 0xbe, 0xd5, 0x52, 0xce, + 0xbd, 0xf6, 0x45, 0xf0, 0x3d, 0x8d, 0xc8, 0x55, 0xbd, 0x8f, 0xf0, 0x88, 0x3d, + 0x8c, 0x8f, 0x20, 0xbd, 0x38, 0x7c, 0x4d, 0x3e, 0x6d, 0xba, 0x95, 0xbd, 0xdc, + 0x7b, 0x0d, 0xbe, 0x3d, 0xbf, 0x2d, 0x3c, 0xee, 0xf6, 0xcb, 0x3c, 0x42, 0x85, + 0x2e, 0x3d, 0x43, 0x4c, 0xb3, 0x3d, 0xe6, 0x70, 0x91, 0xbd, 0x58, 0x98, 0xfd, + 0x3d, 0x70, 0x75, 0x52, 0xbd, 0xb7, 0x44, 
0x34, 0xbe, 0x62, 0x65, 0xdc, 0xbd, + 0xb8, 0xc7, 0x83, 0x3c, 0x0d, 0x0a, 0xaa, 0xbd, 0x09, 0xcb, 0x92, 0x3c, 0xbd, + 0x5d, 0xc7, 0xb9, 0x3a, 0x4e, 0xa6, 0xbd, 0xd8, 0xfb, 0xa6, 0xbd, 0xcd, 0xfc, + 0x72, 0xbe, 0x12, 0xdc, 0x4d, 0xbd, 0x0a, 0x7c, 0x5d, 0x3d, 0x8c, 0xce, 0x7a, + 0x3d, 0xe8, 0x3d, 0x83, 0xbd, 0x0d, 0x6c, 0x9e, 0x3d, 0x14, 0xb3, 0x3c, 0x3d, + 0x05, 0x0e, 0xdf, 0x3d, 0xf7, 0x27, 0xb7, 0xbd, 0xa3, 0x18, 0x08, 0x3d, 0x54, + 0xdb, 0x6a, 0x3c, 0x93, 0x1a, 0x80, 0xbd, 0xf9, 0x13, 0x05, 0x3e, 0xd9, 0x61, + 0x87, 0x3d, 0x08, 0xa5, 0x9b, 0xbd, 0x70, 0x5d, 0xc9, 0xbc, 0x9b, 0x99, 0x94, + 0xbd, 0xc5, 0x6e, 0xd4, 0xbd, 0xc8, 0x60, 0xad, 0x3d, 0x29, 0x62, 0x05, 0xbd, + 0x83, 0xd8, 0xc1, 0xbd, 0xa2, 0x72, 0xf1, 0x3d, 0x57, 0x3f, 0x2e, 0xbb, 0xb8, + 0x1a, 0xcf, 0xbc, 0xc3, 0xda, 0x96, 0xbd, 0xd3, 0xbc, 0x81, 0xbd, 0xca, 0x52, + 0xa1, 0xbb, 0xe8, 0xaf, 0x6a, 0x3d, 0x49, 0xaa, 0xf8, 0x3c, 0x5f, 0x2a, 0x9a, + 0xbd, 0xcb, 0x12, 0x6b, 0xbd, 0xc9, 0x4a, 0x8f, 0xbc, 0xce, 0x3c, 0xfd, 0x3d, + 0x71, 0x17, 0xed, 0x3d, 0x54, 0x40, 0xea, 0xbd, 0xcb, 0x7f, 0x2d, 0xbd, 0x2c, + 0x13, 0x86, 0x3d, 0xcd, 0x8c, 0x44, 0xbd, 0xe4, 0x65, 0xa6, 0xbb, 0x06, 0x81, + 0x04, 0x3d, 0x64, 0x45, 0x8e, 0x3d, 0xef, 0x80, 0x22, 0xbd, 0x35, 0x90, 0xaa, + 0xbd, 0x02, 0xb6, 0x48, 0x3d, 0x76, 0xba, 0x39, 0x3d, 0xf3, 0xce, 0x66, 0xbd, + 0x3f, 0x8e, 0xf1, 0xbd, 0x2a, 0x81, 0x0e, 0xbd, 0x82, 0x05, 0x0b, 0x3e, 0x7b, + 0xdb, 0x2f, 0x3d, 0x86, 0xe3, 0xba, 0x3d, 0xac, 0x47, 0x17, 0x3e, 0xcb, 0x96, + 0x8f, 0x3c, 0x3b, 0x58, 0xe7, 0xbd, 0x38, 0x64, 0x46, 0xbe, 0x9e, 0x73, 0x88, + 0xbd, 0x0f, 0xf0, 0x8e, 0xbd, 0xc1, 0x4c, 0x00, 0xbd, 0x70, 0xbb, 0x54, 0xbd, + 0x74, 0x55, 0x20, 0x3b, 0x1f, 0x22, 0x8d, 0x3d, 0xc9, 0x1d, 0xce, 0x3c, 0xad, + 0x53, 0x3f, 0x3d, 0x7e, 0xd8, 0xb2, 0x3d, 0x9e, 0xc0, 0xf5, 0x3d, 0x79, 0x01, + 0x32, 0xbd, 0x49, 0x13, 0x2e, 0x3d, 0xff, 0x7a, 0xce, 0x3d, 0xb5, 0xbc, 0x46, + 0x3d, 0x43, 0xa5, 0xc8, 0xbd, 0xf2, 0x4d, 0xd3, 0x3b, 0x78, 0x3e, 0x39, 0x3d, + 0x2c, 0x01, 0xc7, 0xbd, 0x5d, 0x5b, 0x8d, 0xbd, 0xb1, 0x3b, 0xa3, 0xbd, 0x1f, + 0x70, 0x6e, 0x3c, 0x62, 0x07, 0x58, 0xbd, 0x29, 0xd9, 0xc8, 0xba, 0x13, 0xa6, + 0xd3, 0xbd, 0xc1, 0x45, 0xbf, 0xbc, 0x3e, 0x9f, 0xea, 0xbc, 0x7c, 0x4d, 0xcc, + 0x3d, 0x6c, 0x0c, 0x2e, 0xbd, 0xcf, 0xa0, 0x9a, 0x3b, 0x83, 0x9e, 0xfa, 0xbd, + 0x77, 0x21, 0xaa, 0x3d, 0xcf, 0x18, 0xf5, 0xbd, 0xfe, 0x30, 0x79, 0x3d, 0x24, + 0x33, 0x4d, 0x3d, 0xf7, 0x5f, 0x54, 0x3d, 0xda, 0x9d, 0xc9, 0xbd, 0x28, 0x08, + 0x16, 0x3d, 0x53, 0x5a, 0xf6, 0xbc, 0xa5, 0x86, 0x84, 0xbd, 0x91, 0x39, 0xc5, + 0xbc, 0x54, 0x2b, 0xda, 0xbd, 0x49, 0x34, 0xae, 0xbd, 0x9d, 0xad, 0x3a, 0xbd, + 0x43, 0x59, 0xf1, 0x3d, 0x5c, 0xef, 0x06, 0x3e, 0xc7, 0xe0, 0x32, 0x3d, 0x43, + 0xb3, 0x87, 0x3d, 0x12, 0x6c, 0x02, 0xbe, 0x9c, 0xdc, 0x02, 0x3e, 0x22, 0xcc, + 0x1b, 0xbe, 0x46, 0x37, 0xe8, 0x3d, 0xf0, 0x11, 0x3b, 0xbd, 0x0d, 0x62, 0x51, + 0x3d, 0x8b, 0x64, 0x2f, 0x3d, 0x57, 0x97, 0x5e, 0x3d, 0x53, 0xdd, 0xd6, 0x3c, + 0x00, 0xf5, 0xfb, 0xbc, 0x6f, 0x83, 0xea, 0x3b, 0xec, 0x88, 0x20, 0xbb, 0xe5, + 0x7f, 0xe6, 0x3d, 0xe6, 0xc4, 0xb5, 0x3d, 0x05, 0x76, 0x0f, 0xbe, 0x4a, 0x2f, + 0x61, 0xbd, 0xa0, 0x69, 0xe2, 0x3d, 0xab, 0xc9, 0xb4, 0x3d, 0xeb, 0xd7, 0x88, + 0xbc, 0x8f, 0x65, 0xfb, 0xbd, 0xc5, 0xca, 0x93, 0xbc, 0x1f, 0xe5, 0xa9, 0x3d, + 0x0b, 0x34, 0x06, 0x3e, 0xbd, 0x9e, 0xe1, 0x3d, 0x58, 0x9d, 0xec, 0xbd, 0x60, + 0x28, 0xe3, 0xbc, 0x62, 0x2e, 0x85, 0x3d, 0xec, 0x10, 0xb6, 0x3d, 0xd4, 0x0e, + 0x55, 0x3d, 0x6a, 0xd9, 0x22, 0xbd, 0xa4, 0x2c, 0xb0, 0xbd, 0x8f, 0x8c, 0x8b, + 0x3d, 0x05, 0xa0, 0xbb, 0x3d, 0x7b, 0xf7, 0xc0, 0x3d, 0xca, 0x2f, 0x90, 
0xbc, + 0x07, 0x79, 0xe3, 0xbd, 0x8b, 0x7d, 0x83, 0xbd, 0xfe, 0x8a, 0x93, 0xbc, 0xc0, + 0xe9, 0xd0, 0x3d, 0xfb, 0x88, 0x76, 0xbc, 0x2d, 0x4b, 0x99, 0x3c, 0x69, 0x04, + 0xd3, 0x3c, 0xb6, 0xd2, 0x88, 0x3d, 0xeb, 0xe2, 0x71, 0xbd, 0xa8, 0xb5, 0x98, + 0x3d, 0x08, 0x79, 0xea, 0xbd, 0x7c, 0x53, 0x03, 0xbd, 0xb1, 0xda, 0xf9, 0xbd, + 0xf1, 0x53, 0x83, 0xbc, 0xa0, 0xb3, 0x49, 0xbd, 0x7c, 0x79, 0x07, 0x3c, 0x68, + 0x60, 0x21, 0x3c, 0xb1, 0x1f, 0x38, 0x3d, 0x5d, 0x0c, 0x4e, 0x3d, 0x36, 0x83, + 0x62, 0x3c, 0x87, 0x96, 0x22, 0xbd, 0xd2, 0x3a, 0x09, 0x3c, 0xa2, 0x6e, 0x7a, + 0xbd, 0x54, 0xc7, 0x31, 0xbc, 0x3a, 0x58, 0x1e, 0xbd, 0x51, 0x31, 0x94, 0x3d, + 0x28, 0x85, 0xde, 0xbc, 0x52, 0x0e, 0xce, 0xbd, 0x79, 0x6a, 0xfb, 0xbd, 0x0f, + 0x76, 0x14, 0xbd, 0xb4, 0xf0, 0xb3, 0x3c, 0x30, 0x4e, 0xab, 0xbd, 0xbc, 0x21, + 0x2a, 0x3d, 0xa7, 0x29, 0x93, 0x3d, 0x05, 0x5e, 0x79, 0x3c, 0xc0, 0xdc, 0x93, + 0xbd, 0x8c, 0x46, 0xd3, 0x3d, 0x6d, 0xef, 0x21, 0x3d, 0xcd, 0x62, 0xe5, 0x3d, + 0xf2, 0x5f, 0xbc, 0xbd, 0xec, 0xb5, 0x6e, 0x3d, 0x8f, 0xdd, 0xd1, 0x3c, 0xb6, + 0x13, 0x93, 0xbd, 0x1e, 0x1d, 0x0a, 0x3e, 0xfe, 0x00, 0x0a, 0x3d, 0xfe, 0xea, + 0x70, 0x3c, 0x1e, 0x69, 0x94, 0xbd, 0x54, 0x92, 0xdf, 0x3d, 0x8d, 0xc4, 0xe3, + 0xbd, 0xa8, 0x26, 0xc1, 0x3d, 0x90, 0x69, 0x97, 0x3d, 0x5f, 0xf7, 0x21, 0x3e, + 0xd8, 0xf4, 0x13, 0x3d, 0x8e, 0x0f, 0x2a, 0x3d, 0x1a, 0xf3, 0xe8, 0x3d, 0xb1, + 0x70, 0x75, 0xbd, 0x3d, 0x10, 0x87, 0x3d, 0xf2, 0x55, 0x8f, 0xbd, 0x7f, 0x15, + 0x07, 0xbe, 0xe0, 0x3c, 0xba, 0x3d, 0x6d, 0x1f, 0xc2, 0xbc, 0xd6, 0xbf, 0x2c, + 0xbd, 0x01, 0x4c, 0x87, 0x3c, 0xd8, 0xe5, 0x93, 0x3d, 0x6e, 0x5a, 0x12, 0x3d, + 0xff, 0x3a, 0xd1, 0x3d, 0xfa, 0x05, 0x0a, 0x3d, 0x5a, 0xce, 0xa3, 0xbc, 0xc5, + 0x2b, 0xd8, 0x3d, 0x98, 0xb3, 0xce, 0xbd, 0x6b, 0x72, 0x90, 0x3d, 0xa7, 0x35, + 0xbb, 0xbd, 0xe2, 0xcb, 0xae, 0xbc, 0x8e, 0xe3, 0x74, 0x3d, 0xcd, 0x32, 0xcf, + 0xbd, 0x76, 0x8d, 0x1d, 0x3d, 0x27, 0xc5, 0x0c, 0xbe, 0x27, 0x7e, 0x6c, 0xbd, + 0x54, 0xf1, 0xdb, 0x3d, 0x39, 0x03, 0xed, 0xbc, 0xd7, 0x4b, 0xe1, 0x3a, 0x19, + 0x67, 0x90, 0x3d, 0xf5, 0x03, 0x89, 0x3d, 0x31, 0x9d, 0xd4, 0x3a, 0x06, 0x9d, + 0x05, 0x3e, 0xde, 0xaf, 0x63, 0xbd, 0xed, 0xfe, 0x54, 0x3c, 0xdd, 0x40, 0xc5, + 0xbd, 0xf5, 0x54, 0x0d, 0xbc, 0x3e, 0xaa, 0xcd, 0x3c, 0x08, 0x18, 0xbf, 0xbd, + 0x79, 0x2e, 0x90, 0xbd, 0x15, 0xe3, 0x8a, 0x3d, 0x7b, 0x54, 0x7c, 0xbd, 0x85, + 0x07, 0xd0, 0x3d, 0xfb, 0x39, 0x01, 0xbd, 0x12, 0x57, 0xf0, 0xbd, 0x56, 0x7c, + 0x8d, 0xbd, 0xae, 0x9e, 0xaf, 0x3c, 0x90, 0xc3, 0x85, 0x3d, 0x9c, 0x00, 0x88, + 0x3d, 0x1f, 0x9a, 0x8f, 0xbd, 0x80, 0xef, 0xc4, 0xb9, 0x60, 0xba, 0x5b, 0xbd, + 0x05, 0x25, 0xd8, 0x3c, 0x76, 0x60, 0x6d, 0x3d, 0xc5, 0xf0, 0xe1, 0x3c, 0x0d, + 0x00, 0xf7, 0x3d, 0x57, 0xb7, 0x24, 0x3d, 0x2c, 0x11, 0x06, 0xbe, 0x48, 0x15, + 0x5b, 0xbd, 0x0c, 0x67, 0x22, 0xbd, 0xc9, 0x10, 0x07, 0x3c, 0x69, 0x42, 0xbb, + 0xbd, 0x5b, 0x32, 0xb8, 0xbd, 0x62, 0x5e, 0x35, 0xbd, 0xfc, 0xe1, 0x22, 0xbd, + 0xff, 0xb3, 0x51, 0xbd, 0x6e, 0x4d, 0x2d, 0x3c, 0xfb, 0xca, 0xc5, 0xbd, 0x15, + 0x16, 0x32, 0x3d, 0x50, 0xff, 0xbe, 0xbd, 0xf7, 0x84, 0x5e, 0xbb, 0x27, 0xa2, + 0x17, 0x3c, 0x83, 0x85, 0xda, 0xbd, 0xd3, 0x8f, 0xd8, 0x3d, 0x19, 0xd4, 0x9d, + 0xbd, 0x05, 0x56, 0xbd, 0x3b, 0x80, 0x5c, 0x8d, 0xbd, 0x02, 0x07, 0x01, 0x3e, + 0x46, 0x0a, 0xd0, 0x3c, 0x28, 0x0a, 0x74, 0x3d, 0x45, 0xd8, 0x9c, 0x3d, 0x51, + 0x8c, 0xe1, 0x3d, 0x94, 0x9d, 0x44, 0xbc, 0x1a, 0xfd, 0x6d, 0x3d, 0x6a, 0xa7, + 0x00, 0x3e, 0x03, 0xb0, 0xa5, 0xbd, 0x84, 0xb6, 0x94, 0x3c, 0x6e, 0x1b, 0xd2, + 0xbd, 0xff, 0xcf, 0xbd, 0xbd, 0x7f, 0x7c, 0x6c, 0xbd, 0xa0, 0xb0, 0x4a, 0xbd, + 0x8c, 0xfc, 0xca, 0xbc, 
0xf4, 0xa1, 0x81, 0xbd, 0x22, 0xad, 0xe2, 0x3c, 0xfa, + 0x91, 0xaf, 0x3d, 0xf4, 0x2e, 0x19, 0xbd, 0x0b, 0x57, 0x71, 0xbc, 0x21, 0xca, + 0x8d, 0x3c, 0xee, 0x8c, 0x2b, 0x3a, 0x46, 0x1a, 0xc1, 0xbb, 0x51, 0xbe, 0x2c, + 0xbd, 0xc0, 0x3f, 0x40, 0x3d, 0xb2, 0xbb, 0x96, 0x3d, 0x88, 0x43, 0x23, 0xbe, + 0x26, 0xd9, 0xe8, 0xbd, 0xf7, 0xfc, 0x9d, 0xbd, 0x4e, 0xf6, 0xd3, 0xbc, 0x2a, + 0xda, 0xba, 0xbd, 0xe1, 0x21, 0xe1, 0x3d, 0x81, 0xea, 0x2e, 0xbd, 0xde, 0xaa, + 0xd2, 0xbb, 0xde, 0x20, 0xbe, 0x3d, 0x15, 0x2f, 0x44, 0x3d, 0x37, 0x58, 0x6e, + 0xbd, 0xcd, 0x34, 0x4c, 0xbb, 0x8d, 0xad, 0x08, 0xbc, 0xd9, 0xe2, 0x21, 0x3d, + 0xfe, 0x8b, 0xab, 0x3d, 0xa2, 0x7f, 0x47, 0xbd, 0xad, 0xbe, 0xe3, 0xbc, 0x5f, + 0x5d, 0x20, 0x3d, 0xa7, 0xa7, 0x19, 0xbe, 0x27, 0x1b, 0x8a, 0xbd, 0x2e, 0xcf, + 0x4d, 0x3d, 0x68, 0x43, 0xb0, 0x3d, 0x54, 0xe8, 0xec, 0x3b, 0x5f, 0x47, 0x57, + 0xbd, 0xde, 0x1b, 0xc4, 0x3d, 0xd2, 0x08, 0xfa, 0xbb, 0x23, 0x97, 0xe5, 0x3d, + 0xb3, 0x70, 0x6b, 0x3d, 0x33, 0x68, 0x2a, 0xbc, 0xbb, 0xc7, 0xb5, 0xbd, 0x31, + 0xe2, 0xcd, 0xbd, 0xe3, 0x77, 0x44, 0x3d, 0xb1, 0xf5, 0x60, 0x3d, 0x03, 0x24, + 0xf7, 0xbd, 0x6c, 0x04, 0xb0, 0x3c, 0xba, 0x53, 0xa9, 0xbd, 0xcb, 0x94, 0x03, + 0xbe, 0x19, 0x25, 0xfc, 0xbb, 0x8d, 0xaf, 0xe5, 0x3d, 0x95, 0xec, 0xa3, 0x3d, + 0xca, 0x8d, 0xcb, 0xbd, 0x71, 0x02, 0xee, 0x3c, 0x31, 0x55, 0xdf, 0xbd, 0x85, + 0xd6, 0x69, 0x3d, 0xa1, 0xd8, 0x1d, 0x3d, 0xd6, 0x60, 0x12, 0xbb, 0x46, 0x47, + 0x46, 0x3d, 0x75, 0xf9, 0x97, 0x3d, 0x4c, 0xd5, 0x87, 0x3d, 0xc4, 0x77, 0xb7, + 0x3c, 0x0a, 0xd5, 0x08, 0x3d, 0x7f, 0x4d, 0x74, 0xbd, 0xdd, 0x0e, 0x07, 0xbe, + 0x0d, 0xb1, 0x51, 0xbb, 0x95, 0xf0, 0xa7, 0x3d, 0x8d, 0xdc, 0xe7, 0xbd, 0x11, + 0x22, 0xd1, 0x3d, 0x81, 0xad, 0x8c, 0x3d, 0x51, 0x36, 0x1e, 0x3d, 0xe3, 0x75, + 0x01, 0x3e, 0xa1, 0xd1, 0x9a, 0x3d, 0x4f, 0xd4, 0xc4, 0x3d, 0x50, 0x2a, 0x61, + 0x3c, 0x9a, 0xd5, 0xbd, 0xbd, 0x37, 0xd1, 0xd5, 0x3c, 0xd5, 0x83, 0x8e, 0x3d, + 0xbd, 0x05, 0xb6, 0xbb, 0x52, 0x6b, 0x66, 0x3d, 0x25, 0xcb, 0x0c, 0xbe, 0x3a, + 0xff, 0xd3, 0xbd, 0xaf, 0xdc, 0xb3, 0xbd, 0xde, 0xdf, 0x06, 0x3d, 0x91, 0x0f, + 0xc8, 0xbd, 0x62, 0xa1, 0x8f, 0xbc, 0x1c, 0x36, 0x40, 0x3c, 0x7d, 0x4f, 0xfa, + 0x3d, 0x99, 0x76, 0xd5, 0x3d, 0xc3, 0x21, 0x5c, 0xbb, 0x61, 0x54, 0x52, 0xbc, + 0xc4, 0x07, 0x9b, 0xbd, 0xb3, 0x00, 0x44, 0xbc, 0xbe, 0x1b, 0x06, 0xbd, 0x35, + 0x4c, 0x5d, 0x3d, 0x6b, 0x45, 0x17, 0xbd, 0x10, 0xd6, 0xe5, 0xbd, 0x40, 0x57, + 0x83, 0x3d, 0x62, 0xd1, 0x64, 0xbd, 0x79, 0x90, 0xbd, 0xbc, 0xce, 0xf0, 0x07, + 0x3e, 0xc0, 0xbd, 0xaf, 0x3d, 0x88, 0xe1, 0x84, 0xbd, 0xf0, 0xdb, 0x4c, 0x3d, + 0x17, 0x35, 0x02, 0x3b, 0x30, 0x1c, 0xed, 0xbd, 0x4f, 0xfc, 0xda, 0x3d, 0x92, + 0x80, 0x87, 0xbc, 0x02, 0x74, 0x1a, 0xbe, 0xdc, 0xb1, 0xb3, 0xbd, 0x6c, 0x01, + 0xc0, 0xbc, 0x8f, 0x2d, 0x8c, 0x3d, 0xf5, 0x96, 0xc0, 0xbd, 0x77, 0xbc, 0x7f, + 0xbd, 0x8a, 0x64, 0xf1, 0x3c, 0xb7, 0x6c, 0xb4, 0xbd, 0x1c, 0x6f, 0x84, 0x3d, + 0xa1, 0xd5, 0xc0, 0xbd, 0xbf, 0x63, 0xd4, 0x3d, 0xd6, 0xd7, 0xe7, 0x3d, 0x89, + 0x1e, 0x64, 0x3c, 0xf3, 0x81, 0xbe, 0xbd, 0xb3, 0x57, 0xe9, 0xbd, 0x84, 0x5e, + 0x9a, 0x3d, 0x77, 0x22, 0x01, 0xbe, 0x53, 0xa3, 0xb8, 0xbd, 0xc0, 0x62, 0xff, + 0x3b, 0x9a, 0xfb, 0xbd, 0x3d, 0x13, 0x1a, 0xeb, 0x3b, 0x3b, 0x96, 0x78, 0x3d, + 0xfc, 0xc6, 0x93, 0x3d, 0xfc, 0x33, 0x92, 0x3d, 0xcc, 0xc1, 0x62, 0xbd, 0x63, + 0x7c, 0x77, 0xbd, 0x69, 0x92, 0x05, 0xbd, 0xbd, 0xee, 0xb8, 0x3a, 0xa2, 0x9d, + 0x0e, 0xbe, 0xf3, 0xba, 0xed, 0xbd, 0x2f, 0x6a, 0xaa, 0x3d, 0x77, 0x4a, 0xc6, + 0x3d, 0x4f, 0xe7, 0xa8, 0x3d, 0x1e, 0x3f, 0xbb, 0xbd, 0xae, 0x6c, 0xb8, 0xbc, + 0x75, 0xf1, 0x6d, 0xbd, 0xc1, 0x5d, 0x11, 0xbe, 0x2b, 
0xe2, 0x4f, 0xbd, 0x54, + 0x21, 0xf6, 0x3b, 0x5c, 0xe2, 0x96, 0x3c, 0xbe, 0xe8, 0x2e, 0x3d, 0x38, 0x39, + 0x93, 0x3c, 0xc3, 0x50, 0xbc, 0x3d, 0x67, 0x1d, 0xc4, 0x3d, 0xe6, 0x29, 0x56, + 0xbc, 0x4d, 0x70, 0x4d, 0x3c, 0xd2, 0xca, 0xc4, 0xbd, 0xa1, 0x30, 0x3b, 0xbd, + 0x97, 0x9b, 0xb5, 0xbd, 0x65, 0x99, 0x9b, 0xbd, 0xb5, 0x65, 0xb7, 0xbd, 0x51, + 0xe1, 0x9a, 0xbd, 0x2f, 0x56, 0x4a, 0xbb, 0x9c, 0x68, 0x98, 0xbd, 0x36, 0x75, + 0x73, 0xbd, 0x19, 0xe1, 0x83, 0xbd, 0x37, 0x69, 0xee, 0x3d, 0xe7, 0xd1, 0xad, + 0xbd, 0x3b, 0x29, 0x95, 0xbd, 0xcd, 0x10, 0x75, 0x3d, 0xb4, 0x82, 0xc2, 0xbc, + 0x72, 0xd7, 0x91, 0x3d, 0xc8, 0x77, 0x49, 0xbd, 0x96, 0x67, 0x4d, 0xbd, 0xc5, + 0x75, 0x98, 0xbd, 0x96, 0x67, 0xcc, 0x3d, 0xba, 0x7a, 0x1e, 0xbe, 0x30, 0x3a, + 0x02, 0x3d, 0xc1, 0xf8, 0x78, 0x3d, 0x46, 0xfc, 0xc1, 0x3d, 0x99, 0x3c, 0xc5, + 0xbd, 0xbc, 0x69, 0x39, 0x3d, 0x7f, 0x95, 0xf0, 0x3b, 0x50, 0x78, 0x57, 0xbd, + 0xfa, 0xf7, 0xa9, 0xbc, 0xb2, 0xae, 0x2b, 0x3c, 0x22, 0x75, 0x0d, 0x3e, 0x63, + 0xaa, 0x03, 0x3d, 0xfa, 0x00, 0xd7, 0x3d, 0xc3, 0xcb, 0x60, 0x3c, 0xab, 0xf2, + 0x61, 0x3c, 0x1b, 0x9a, 0x38, 0xbd, 0x1a, 0x33, 0xef, 0xbd, 0x9e, 0x11, 0xc5, + 0x3d, 0xf5, 0xb1, 0x99, 0xbc, 0x65, 0xee, 0x5e, 0xbc, 0xde, 0x02, 0xe8, 0xbd, + 0xef, 0x87, 0x58, 0x3d, 0x0e, 0x01, 0xcf, 0x3d, 0x51, 0xf7, 0xcb, 0xbc, 0x9e, + 0x48, 0x50, 0xbd, 0xd2, 0xc8, 0x88, 0xbc, 0x56, 0x0a, 0x18, 0x3e, 0x49, 0xa6, + 0xce, 0xbd, 0x9d, 0x8d, 0xf4, 0x3d, 0xd9, 0x71, 0x7e, 0x3d, 0x49, 0xcb, 0x67, + 0x3d, 0x3d, 0x4f, 0xdb, 0x3c, 0x8c, 0x3b, 0xaa, 0xbd, 0xce, 0xc4, 0x1f, 0x3d, + 0xda, 0x94, 0xaa, 0x3c, 0x4c, 0xae, 0x89, 0x3d, 0xac, 0x7e, 0x8d, 0x3d, 0xff, + 0xfe, 0xf7, 0x3d, 0x89, 0xba, 0xbd, 0xbd, 0x98, 0xc1, 0x5c, 0x3d, 0x9a, 0xcf, + 0x1b, 0xba, 0xdb, 0x22, 0xf3, 0x3d, 0x3a, 0xa6, 0x58, 0xbd, 0x6b, 0x7d, 0x2b, + 0x3d, 0x22, 0x6f, 0xa2, 0xbd, 0x95, 0xf3, 0x07, 0x3e, 0x14, 0xfb, 0x7a, 0x3d, + 0xda, 0x56, 0x40, 0xbd, 0x85, 0xe7, 0xcf, 0xbd, 0x7f, 0x4c, 0xb8, 0x3c, 0xf0, + 0x6d, 0xc1, 0xbd, 0xb1, 0x01, 0xbd, 0x3d, 0xb4, 0xc0, 0xc0, 0xbd, 0x4f, 0x5f, + 0xca, 0xbd, 0x4e, 0x96, 0xe1, 0x3d, 0x92, 0x0a, 0xa6, 0x3d, 0xd6, 0xd9, 0xb7, + 0x3d, 0x8b, 0x52, 0xa8, 0x3d, 0xa9, 0xe6, 0xb4, 0xbc, 0x16, 0x49, 0xc0, 0x3b, + 0xed, 0x64, 0xd1, 0x3d, 0xf1, 0xaf, 0x20, 0xbc, 0x8f, 0x44, 0xd9, 0x3b, 0xc0, + 0x7a, 0xb4, 0x3d, 0x31, 0xb6, 0x15, 0xbe, 0x82, 0x8e, 0x62, 0xbd, 0xb3, 0x93, + 0x1e, 0xbd, 0xae, 0x33, 0x8c, 0xbd, 0x82, 0xf3, 0xa6, 0x3c, 0xd2, 0x41, 0xb2, + 0xbc, 0x58, 0x37, 0xce, 0x3d, 0xb9, 0xd2, 0xce, 0x3d, 0x99, 0x90, 0x69, 0x3d, + 0xc3, 0x4b, 0xc8, 0x3d, 0xba, 0xfa, 0xcb, 0x3d, 0xee, 0x4a, 0xfe, 0xbc, 0x24, + 0xc5, 0x3c, 0xbd, 0x5a, 0x95, 0xb3, 0xbd, 0xb1, 0xc0, 0x1f, 0xbd, 0x61, 0x53, + 0xb4, 0x3c, 0x2e, 0x79, 0xc7, 0xbd, 0xd6, 0x70, 0x9d, 0xbd, 0x9d, 0xe7, 0x16, + 0x3d, 0x4f, 0xe9, 0xa9, 0xbc, 0x7d, 0xbb, 0x7c, 0xbd, 0xf0, 0xdf, 0xe9, 0xbc, + 0x66, 0xc4, 0x3f, 0xbd, 0xfc, 0xd3, 0x20, 0xbd, 0xd3, 0x4f, 0x36, 0xbd, 0x72, + 0x8d, 0xec, 0x3d, 0x79, 0xbc, 0xaa, 0x3d, 0x69, 0x95, 0xe7, 0x3d, 0x46, 0xb6, + 0xcc, 0xbc, 0xdd, 0x97, 0x70, 0xbd, 0x96, 0x31, 0x0c, 0xbe, 0x48, 0x86, 0xeb, + 0x3d, 0x74, 0xf6, 0xa3, 0x3c, 0xe8, 0x26, 0xa1, 0x3d, 0xe3, 0xdd, 0x70, 0xbd, + 0xcf, 0xbd, 0x02, 0x3c, 0x13, 0x3e, 0xbc, 0xbd, 0x69, 0xad, 0x05, 0xbd, 0xc0, + 0xad, 0x53, 0x3c, 0xb6, 0x7c, 0xb2, 0xbd, 0x27, 0xc3, 0xfd, 0xbc, 0x5f, 0x42, + 0xc5, 0x3d, 0x2f, 0x17, 0xd6, 0x3d, 0xb2, 0x68, 0xda, 0xbd, 0x95, 0xe5, 0x4f, + 0x3c, 0xae, 0x99, 0xe4, 0x3d, 0x8f, 0x5c, 0xde, 0xbd, 0xf1, 0x87, 0x02, 0xbb, + 0x17, 0x17, 0x7a, 0x3d, 0x75, 0x72, 0x1f, 0x3d, 0x70, 0x34, 0xa4, 0xbd, 0x43, + 0x2a, 
0xb2, 0x3d, 0xd9, 0x5a, 0xc7, 0x3d, 0xa5, 0x58, 0xc6, 0x3d, 0xa3, 0xb8, + 0x76, 0xbd, 0x5b, 0xf5, 0x27, 0x3c, 0x58, 0xfa, 0x60, 0x3c, 0xcc, 0x2e, 0xd4, + 0x3d, 0x71, 0xc3, 0x54, 0x3c, 0x75, 0xe3, 0x6b, 0x3d, 0x29, 0xf3, 0x9a, 0x3d, + 0x9d, 0x62, 0x8b, 0xbd, 0xcd, 0xa8, 0x9f, 0xbd, 0xee, 0xaa, 0xbf, 0x3c, 0xd7, + 0xe4, 0x20, 0xbd, 0x9f, 0x2c, 0xa4, 0x3c, 0x3a, 0x5e, 0x76, 0xbd, 0x9b, 0xcb, + 0x07, 0x3e, 0x3e, 0x33, 0x34, 0x3d, 0x69, 0x57, 0x26, 0x3c, 0xf5, 0x54, 0xef, + 0xbd, 0xf5, 0x3d, 0xe9, 0xbd, 0x8e, 0xed, 0x2b, 0x3d, 0x86, 0xf8, 0xb2, 0x3c, + 0xb2, 0x7f, 0x45, 0x3d, 0xe1, 0x4f, 0xbd, 0x3c, 0xa7, 0xc8, 0x91, 0xbd, 0xea, + 0x4c, 0xc5, 0x3d, 0x7a, 0x60, 0x7c, 0x3d, 0xce, 0x3e, 0xb6, 0x3d, 0xc3, 0x22, + 0x52, 0xbd, 0xbf, 0x54, 0xd3, 0xbc, 0xc7, 0xe0, 0xe1, 0xbd, 0x08, 0x86, 0xc8, + 0x3c, 0x98, 0x6c, 0xc3, 0xbd, 0xe6, 0xe1, 0x25, 0xbd, 0xdb, 0x07, 0x53, 0xbb, + 0xbd, 0x04, 0x5f, 0xbd, 0x12, 0xfd, 0xe6, 0xbd, 0x2d, 0x0f, 0xe8, 0x3d, 0x9e, + 0x08, 0x47, 0x3d, 0x93, 0xc8, 0xdc, 0xbd, 0x97, 0x91, 0xc9, 0xbd, 0xbd, 0x45, + 0x88, 0xbd, 0x45, 0x8e, 0x0b, 0xbe, 0x8f, 0xb7, 0xd1, 0xbd, 0x9b, 0x3c, 0xc2, + 0x3c, 0x04, 0xc5, 0xda, 0xba, 0xce, 0x19, 0x9a, 0x3d, 0xaf, 0xee, 0x25, 0x3e, + 0xdf, 0x56, 0x48, 0xbd, 0x9d, 0x42, 0x02, 0x3e, 0x2c, 0x6a, 0xef, 0x3c, 0x25, + 0x99, 0x07, 0x3c, 0x74, 0xa1, 0xca, 0x3c, 0xae, 0x08, 0x9e, 0x3c, 0xe5, 0xec, + 0x25, 0xbd, 0x63, 0x8f, 0xd5, 0x3d, 0xf3, 0x4a, 0xc5, 0xbc, 0xab, 0x02, 0x53, + 0xbd, 0x3e, 0xec, 0x5e, 0x3d, 0xea, 0xf2, 0x8f, 0x3d, 0xb9, 0xa3, 0x91, 0xbd, + 0xa9, 0x34, 0x93, 0xbd, 0xd4, 0x95, 0x78, 0x3d, 0x84, 0x2b, 0x04, 0x3e, 0xe7, + 0x61, 0x87, 0x3d, 0x41, 0x40, 0xe9, 0x3d, 0x3f, 0xea, 0xdc, 0xbc, 0xc9, 0xfd, + 0xa4, 0x3d, 0xf6, 0xd5, 0x69, 0x3d, 0xa5, 0x93, 0x99, 0xbb, 0x21, 0x84, 0x76, + 0x3d, 0xaa, 0xf2, 0x52, 0x3d, 0xbb, 0x3d, 0x9f, 0xbd, 0xd3, 0xd6, 0x6c, 0x3d, + 0xe6, 0xb2, 0xcc, 0xbc, 0x18, 0x3b, 0x30, 0x3d, 0x25, 0xcf, 0xc5, 0xbc, 0xe0, + 0xfd, 0xb4, 0x3c, 0x5c, 0x92, 0x6b, 0x3d, 0xa8, 0x01, 0x17, 0x3d, 0xf6, 0xed, + 0xa2, 0xbd, 0x42, 0x7b, 0xec, 0x3d, 0x8e, 0x87, 0xd7, 0x3d, 0xfa, 0x30, 0xb7, + 0x3d, 0x54, 0x66, 0x38, 0xbd, 0x68, 0xb5, 0xa9, 0xbd, 0x30, 0x1e, 0x7d, 0x3d, + 0x93, 0xf4, 0xd5, 0xbc, 0x69, 0x6a, 0x98, 0xbd, 0x8f, 0x2b, 0x4f, 0xbd, 0xd3, + 0x99, 0x9a, 0xbd, 0x9b, 0x72, 0xfe, 0xbc, 0xaf, 0xc3, 0xad, 0xbd, 0xe2, 0xdf, + 0xde, 0x3c, 0xdc, 0x3e, 0xd3, 0x3d, 0x46, 0xb7, 0x92, 0xbd, 0x22, 0xd0, 0x21, + 0xbd, 0x7a, 0x5e, 0xae, 0x3c, 0xb6, 0x91, 0xa4, 0x3d, 0xba, 0xda, 0x8f, 0xbc, + 0xad, 0xb4, 0x18, 0x3b, 0xb1, 0x16, 0x9c, 0xbd, 0x2f, 0xf7, 0x89, 0xbd, 0x89, + 0x33, 0xba, 0xbd, 0x03, 0x89, 0x61, 0xbd, 0xa8, 0x17, 0x50, 0xbd, 0xf5, 0xfe, + 0x1a, 0x3d, 0xd2, 0x25, 0x02, 0x3d, 0xbb, 0xc9, 0x67, 0xbd, 0xc8, 0x32, 0xe0, + 0x3d, 0x8e, 0xb2, 0x9e, 0xbd, 0x57, 0x57, 0x2a, 0xbc, 0xb4, 0xc4, 0x76, 0x3d, + 0xfd, 0x46, 0x11, 0x3b, 0x38, 0x45, 0xe8, 0x3a, 0x90, 0x49, 0xc6, 0xbd, 0xc3, + 0x50, 0x0b, 0xbe, 0x19, 0xca, 0xd9, 0x3d, 0x17, 0x4d, 0xe0, 0x3d, 0x68, 0x36, + 0x3f, 0xbc, 0x3a, 0x6e, 0xda, 0xbd, 0x50, 0xd8, 0xde, 0x3d, 0x6f, 0x09, 0x29, + 0xbe, 0x9d, 0x50, 0x03, 0xbd, 0x9a, 0x25, 0xf6, 0xbd, 0x43, 0xa2, 0xbc, 0x3d, + 0x9a, 0x55, 0xa5, 0x3d, 0xa9, 0x0d, 0x2f, 0xbd, 0x5c, 0x8e, 0x22, 0xbd, 0x2e, + 0xc1, 0x58, 0xbd, 0x5a, 0x05, 0x2c, 0xbd, 0xec, 0x19, 0xa1, 0xbd, 0xd7, 0x75, + 0x7b, 0x3d, 0x9a, 0xcf, 0x82, 0x3c, 0x46, 0xc6, 0xff, 0x3c, 0x37, 0xc8, 0xca, + 0x3d, 0xa0, 0xb7, 0x28, 0x3d, 0xaa, 0xb5, 0x2f, 0x3d, 0xaa, 0xa3, 0x9e, 0xbb, + 0x01, 0x2b, 0xd6, 0xbd, 0xa5, 0x6d, 0xb1, 0x3d, 0x2c, 0x3d, 0x97, 0xbc, 0x63, + 0xfb, 0x18, 0xbe, 0xb9, 0xa9, 0xcb, 
0x3d, 0xb0, 0x7d, 0xb4, 0x3d, 0x22, 0x6a, + 0x65, 0x3d, 0x7a, 0xaf, 0xf5, 0xba, 0xed, 0x29, 0x0e, 0x3d, 0x5c, 0xd5, 0x6f, + 0xbd, 0xbe, 0xd9, 0xa0, 0xbc, 0x05, 0x8b, 0xe2, 0x3c, 0x35, 0xec, 0x8b, 0xbc, + 0xa9, 0x59, 0x0d, 0x3c, 0x0b, 0x4c, 0x56, 0x3c, 0x39, 0x59, 0xad, 0xbd, 0x41, + 0x06, 0xe3, 0xbd, 0xb1, 0xcd, 0xaa, 0x3d, 0xa8, 0xcc, 0xa1, 0xbd, 0x35, 0x63, + 0x36, 0xbd, 0x44, 0xf9, 0x43, 0x3c, 0xee, 0x2c, 0xdb, 0x3c, 0x79, 0xd4, 0x78, + 0x3d, 0x81, 0x34, 0x96, 0x3d, 0xc0, 0x43, 0xda, 0x3b, 0x9f, 0x9c, 0x0b, 0xbd, + 0xaf, 0x07, 0xac, 0x3d, 0xcf, 0xe3, 0xf0, 0x3c, 0x44, 0x9b, 0xf8, 0x3d, 0xd4, + 0x1f, 0x4e, 0xbd, 0xa6, 0xab, 0x9f, 0x3d, 0xcb, 0xd4, 0x30, 0x3d, 0x4b, 0xd4, + 0x17, 0x3d, 0x7e, 0xf2, 0x3d, 0x3b, 0x47, 0x47, 0xac, 0x3b, 0x2f, 0xda, 0xa8, + 0xbd, 0xb0, 0x53, 0xde, 0xbd, 0x2e, 0x06, 0xdc, 0x3d, 0x9a, 0x92, 0x9a, 0xbd, + 0x86, 0xf9, 0xf2, 0xbd, 0xb0, 0x9b, 0xd6, 0xbd, 0x8f, 0x36, 0x53, 0x3d, 0x09, + 0x68, 0x99, 0x3d, 0x25, 0xbb, 0xeb, 0x3d, 0x76, 0x5e, 0xfb, 0xbc, 0x24, 0x11, + 0x05, 0xbd, 0xcf, 0xaf, 0xb7, 0xbd, 0x97, 0xcd, 0x65, 0xbd, 0xeb, 0x59, 0xf7, + 0xb8, 0x95, 0x28, 0xb1, 0xbc, 0xff, 0xba, 0x91, 0xbd, 0x58, 0x33, 0xf0, 0x3c, + 0x42, 0x68, 0xd9, 0xbd, 0xa7, 0x71, 0x95, 0xbb, 0x41, 0x0b, 0x6a, 0x3d, 0xe4, + 0x83, 0x06, 0x3d, 0xae, 0x90, 0xa0, 0xbd, 0xfe, 0xf5, 0x27, 0xbd, 0x7f, 0xdc, + 0xb4, 0x3d, 0x32, 0xf0, 0x75, 0xbd, 0x99, 0xfa, 0x7b, 0x3d, 0x5f, 0xca, 0x7a, + 0x3d, 0xd9, 0x7e, 0x49, 0xbd, 0x7f, 0x2b, 0x5b, 0x3d, 0x02, 0x92, 0x46, 0xbb, + 0x20, 0x77, 0x5b, 0x3c, 0x57, 0xa6, 0xd1, 0x3a, 0x74, 0x68, 0xb2, 0xbd, 0xa2, + 0x4c, 0x0a, 0xbe, 0xb9, 0xcf, 0x43, 0xbd, 0xd6, 0x2e, 0x2d, 0xbc, 0x0f, 0x5d, + 0xde, 0x3d, 0xfc, 0xdc, 0x1c, 0xb9, 0x6d, 0x7b, 0x91, 0xbc, 0x33, 0x39, 0x97, + 0x3d, 0x37, 0xcf, 0x1f, 0x3d, 0xb3, 0x0b, 0xe3, 0x3d, 0x45, 0xbe, 0xa0, 0x3d, + 0xda, 0x7c, 0x0e, 0x3d, 0x66, 0xd7, 0x25, 0xbd, 0xa7, 0xe0, 0x0f, 0x3d, 0xd2, + 0x48, 0x8f, 0xbc, 0x2b, 0xbd, 0x9a, 0x3d, 0xf9, 0xe3, 0xd9, 0x3d, 0x0d, 0x1e, + 0xf3, 0x3c, 0x12, 0xc5, 0xfe, 0xbc, 0x59, 0x75, 0x9f, 0x3c, 0x76, 0x0e, 0x46, + 0xbd, 0xa3, 0x5d, 0xb9, 0x3d, 0x8c, 0x5a, 0xc9, 0x3c, 0xb5, 0x90, 0xbd, 0x3d, + 0xe5, 0xaa, 0x42, 0x3d, 0xaf, 0x43, 0x9b, 0xbd, 0x50, 0x0e, 0xc9, 0xbc, 0xea, + 0x53, 0x75, 0x3d, 0xfd, 0x0d, 0x4b, 0x3d, 0x7d, 0xc8, 0x17, 0x3d, 0xdd, 0xf0, + 0xb5, 0xbd, 0x00, 0x53, 0xf4, 0xba, 0xa6, 0x3a, 0x54, 0xbd, 0x7f, 0x57, 0x5f, + 0xbd, 0x00, 0x98, 0x56, 0xbd, 0xe6, 0x33, 0xbe, 0x3c, 0xe2, 0x66, 0x96, 0x3c, + 0x41, 0x08, 0x88, 0x3c, 0x66, 0x40, 0x88, 0xbd, 0xfd, 0x89, 0xbb, 0x3d, 0xa6, + 0xde, 0x99, 0x3a, 0xa4, 0x22, 0xf4, 0x3c, 0x94, 0xbc, 0xaf, 0xbd, 0x94, 0x01, + 0xcd, 0xbd, 0x89, 0x93, 0x0d, 0x3d, 0x74, 0x5a, 0xdf, 0x3b, 0x5b, 0x0a, 0xce, + 0xbd, 0xee, 0x6d, 0x87, 0x3d, 0x7c, 0x6a, 0xb0, 0x3d, 0x6d, 0xb0, 0x7b, 0x3c, + 0x6f, 0xb8, 0x4e, 0x3d, 0x06, 0x6a, 0x25, 0xbd, 0x7c, 0xb9, 0xcc, 0x3d, 0xf5, + 0x54, 0xb0, 0xbd, 0xf3, 0xf9, 0xe1, 0xbd, 0xcf, 0x6d, 0x91, 0x3c, 0x8d, 0x15, + 0xa4, 0x3c, 0x15, 0xa1, 0x86, 0x3d, 0x47, 0x35, 0xc3, 0xbd, 0x34, 0xa8, 0x16, + 0xbd, 0x11, 0xda, 0x49, 0x3d, 0x45, 0xb4, 0x61, 0x3d, 0x41, 0x15, 0xbf, 0xbc, + 0xd4, 0x07, 0xfa, 0x3d, 0xb0, 0x3a, 0x18, 0x3d, 0xda, 0x7f, 0x69, 0xbd, 0x6b, + 0xec, 0x9f, 0xbd, 0x6e, 0xfc, 0xe6, 0x3d, 0xc9, 0x5d, 0xb4, 0x3d, 0xa2, 0x1d, + 0x12, 0xbc, 0x51, 0x23, 0xce, 0xbd, 0x0a, 0x20, 0x86, 0xbc, 0xc4, 0x1f, 0xbe, + 0x3d, 0x18, 0x10, 0x6a, 0x3d, 0xe1, 0x58, 0x9f, 0x3c, 0x22, 0x7f, 0xc9, 0xbc, + 0x1a, 0xed, 0x1e, 0xbe, 0x47, 0x93, 0x87, 0x3c, 0x4d, 0x77, 0x31, 0xbc, 0xf9, + 0x29, 0xb2, 0x3d, 0xa9, 0xb3, 0x77, 0xbd, 0x43, 0x16, 0x0a, 0x3d, 
0x88, 0x2f, + 0x98, 0x3d, 0x3b, 0x7c, 0x2b, 0x3d, 0xfc, 0x29, 0x07, 0x3e, 0xa6, 0x27, 0x93, + 0xbd, 0x5a, 0xa8, 0x13, 0xbe, 0xa8, 0xb8, 0x88, 0xbd, 0x9b, 0x64, 0xc5, 0xbc, + 0xef, 0xb1, 0xe6, 0x3d, 0x33, 0x47, 0xc3, 0x38, 0x56, 0x92, 0x7b, 0xbd, 0x87, + 0x81, 0xc7, 0x3c, 0x94, 0xe2, 0x21, 0x3c, 0xc2, 0x28, 0x75, 0x3d, 0xb7, 0x6f, + 0x8b, 0xbd, 0x2b, 0xdd, 0x09, 0xbc, 0x1f, 0xb9, 0xbc, 0xbd, 0xd6, 0xef, 0x90, + 0xbd, 0x52, 0xc7, 0xa5, 0xbc, 0xf7, 0x2c, 0x4d, 0x3c, 0xc7, 0xfe, 0x94, 0x3c, + 0x24, 0x12, 0x46, 0xbc, 0x95, 0x3b, 0x59, 0x3c, 0x64, 0x96, 0xd7, 0xbc, 0xb3, + 0x3c, 0xc7, 0xbd, 0xe6, 0x41, 0xbc, 0x3d, 0x70, 0xd8, 0x5c, 0x3b, 0xe2, 0x16, + 0x88, 0xbd, 0x21, 0x12, 0xfc, 0x3d, 0xbd, 0x55, 0x1e, 0xbe, 0x3a, 0xf9, 0x1f, + 0xbd, 0x59, 0xd3, 0x27, 0xbd, 0x14, 0x3b, 0xd7, 0x3d, 0x13, 0xf9, 0x66, 0x3d, + 0x79, 0x92, 0x77, 0xbd, 0x9a, 0x35, 0x63, 0x3d, 0x07, 0xf2, 0x75, 0xbc, 0xc1, + 0x6f, 0x73, 0x3d, 0x0f, 0x02, 0xc2, 0x3c, 0xd0, 0x45, 0x0c, 0x3d, 0x37, 0x87, + 0x5e, 0x3d, 0x03, 0x9e, 0xce, 0x3d, 0x2b, 0x90, 0x13, 0xbd, 0xf4, 0x1a, 0xc5, + 0xbd, 0xdf, 0x42, 0xdb, 0x3d, 0x47, 0x02, 0x58, 0xbd, 0x0f, 0x74, 0x1a, 0xbd, + 0x1d, 0x5f, 0x05, 0x3d, 0x99, 0x81, 0xff, 0xbc, 0x56, 0x85, 0xb3, 0x3d, 0xac, + 0x62, 0x17, 0xbd, 0xaa, 0x30, 0xc3, 0x3d, 0xdc, 0x53, 0x0f, 0xbe, 0x9b, 0x95, + 0x49, 0x3d, 0xf8, 0x4e, 0xa7, 0x3d, 0x76, 0x74, 0x10, 0xbd, 0x2c, 0xe0, 0x9c, + 0x3d, 0x7b, 0xc1, 0xc7, 0xbd, 0x15, 0x39, 0xe6, 0x3d, 0x52, 0xb3, 0xff, 0xbd, + 0x72, 0x77, 0xd3, 0x3d, 0x6a, 0xc4, 0xfb, 0x3c, 0x27, 0x15, 0x5b, 0x3d, 0xba, + 0xa2, 0x6b, 0xbd, 0x2b, 0xbc, 0x02, 0x3e, 0x6c, 0x7c, 0xda, 0x3c, 0x24, 0xa1, + 0x61, 0xbb, 0xfb, 0x9b, 0xc9, 0xbc, 0x20, 0xcb, 0x93, 0xbc, 0x95, 0x98, 0x6c, + 0xbd, 0x96, 0x34, 0xda, 0x3d, 0x5b, 0xa3, 0xe1, 0xbc, 0x71, 0xff, 0x07, 0x3d, + 0x5e, 0x18, 0xd0, 0xbd, 0xc1, 0x9e, 0x26, 0x3e, 0x8b, 0x3d, 0x9c, 0x3d, 0x90, + 0xe5, 0x84, 0x3d, 0x0d, 0xaa, 0x37, 0x3b, 0x99, 0x2d, 0xf6, 0x3c, 0x40, 0x23, + 0xca, 0x3d, 0x1c, 0x56, 0xb4, 0xbd, 0xa9, 0x04, 0x97, 0xbd, 0x41, 0xa7, 0x9e, + 0x3a, 0xb3, 0xfe, 0xb9, 0xbd, 0xf9, 0x34, 0x02, 0xbd, 0x44, 0x97, 0xb4, 0xbd, + 0x67, 0x43, 0x80, 0xbd, 0xb0, 0xce, 0x36, 0xbd, 0x28, 0x48, 0xa2, 0x3d, 0x32, + 0x52, 0xd3, 0x3d, 0x2a, 0xd4, 0x12, 0x3e, 0x8e, 0x41, 0xd5, 0x3c, 0x5e, 0x6b, + 0x64, 0xbd, 0x19, 0x1a, 0xee, 0xbd, 0x91, 0xf3, 0xb1, 0xbb, 0x9e, 0x4f, 0x9b, + 0x3d, 0x50, 0x3a, 0x9d, 0x3d, 0x25, 0xbc, 0xb5, 0xbd, 0xf7, 0xd6, 0x7b, 0x3d, + 0x69, 0x87, 0x94, 0xbb, 0xed, 0x33, 0x31, 0xbd, 0x8f, 0xf3, 0xaa, 0xbd, 0x5b, + 0x0b, 0xc0, 0x3d, 0xd9, 0xac, 0x60, 0xbd, 0x24, 0xa6, 0x9c, 0x3d, 0xfb, 0x17, + 0x3f, 0x3d, 0x49, 0x6a, 0x97, 0x3d, 0x02, 0xe9, 0xef, 0xbd, 0x44, 0xbe, 0xb5, + 0xbc, 0x61, 0x77, 0x94, 0xbb, 0x9e, 0x6d, 0xe1, 0xbc, 0xfa, 0x8c, 0xf2, 0xbc, + 0x9c, 0xfc, 0x45, 0xbd, 0xed, 0x91, 0xde, 0xbd, 0xcd, 0xa8, 0xe7, 0x3d, 0x4e, + 0x05, 0x10, 0xbe, 0x33, 0x4d, 0xa1, 0x3c, 0x01, 0x95, 0x91, 0x3d, 0x33, 0xf9, + 0x13, 0xbd, 0x78, 0x50, 0x03, 0xbd, 0x7f, 0xa1, 0xd7, 0xbd, 0x0f, 0xe3, 0x92, + 0x3d, 0x46, 0x19, 0x9e, 0x3d, 0xa8, 0xa7, 0x06, 0xbc, 0x0e, 0x64, 0xa6, 0x3d, + 0xb4, 0x52, 0xe8, 0xbd, 0x87, 0xc6, 0x8f, 0xbd, 0x50, 0x8c, 0xbf, 0xbb, 0x76, + 0x39, 0x34, 0x3d, 0xd2, 0x2f, 0x0b, 0xbd, 0xf4, 0xa3, 0x51, 0xbd, 0xb0, 0x28, + 0x7d, 0xbd, 0x83, 0x61, 0x57, 0x3d, 0xca, 0x95, 0xb5, 0x3d, 0xdc, 0x22, 0x32, + 0xbc, 0x58, 0xb3, 0x69, 0xbd, 0x09, 0x10, 0x79, 0x3c, 0x3c, 0x79, 0x35, 0xbd, + 0xa0, 0x99, 0xa9, 0xbd, 0xdf, 0x93, 0x18, 0x3e, 0x6f, 0x5f, 0xad, 0x3d, 0xb2, + 0x0b, 0x8e, 0xbd, 0xf5, 0xf2, 0xaa, 0x3d, 0xf2, 0x2e, 0xa9, 0xbd, 0xf6, 0xe2, + 0x23, 0x3d, 0x17, 
0xa2, 0xaf, 0x3d, 0xd9, 0x35, 0x8e, 0xbd, 0xf1, 0x8d, 0x08, + 0x3e, 0xcc, 0x76, 0xb4, 0xbd, 0x71, 0xb4, 0xc9, 0xbd, 0x00, 0x10, 0xd4, 0xbc, + 0xbe, 0x87, 0xf0, 0x3c, 0xe8, 0x15, 0xad, 0xbd, 0xfb, 0x2e, 0x5e, 0xbd, 0x6f, + 0x3b, 0x99, 0xbc, 0x77, 0xc7, 0xe5, 0xbd, 0xf4, 0x52, 0x03, 0xbe, 0x74, 0x7b, + 0x00, 0xbe, 0xe8, 0x51, 0x8c, 0x3d, 0xe1, 0x8d, 0x1c, 0xbc, 0x3d, 0x3c, 0x16, + 0x3d, 0x94, 0x51, 0xd5, 0x3d, 0xff, 0x2e, 0xb0, 0x3d, 0xf5, 0x3c, 0xaa, 0xbc, + 0x39, 0x6b, 0xb2, 0x3d, 0x1f, 0x8b, 0x44, 0x3d, 0xe4, 0xa4, 0xa8, 0x3d, 0xa9, + 0xbc, 0x81, 0x3d, 0x67, 0x10, 0x83, 0xbd, 0x03, 0x1b, 0x08, 0x3d, 0xed, 0xef, + 0x29, 0x3d, 0x46, 0x38, 0x58, 0xbc, 0x98, 0x03, 0xa3, 0x3d, 0x7d, 0xd6, 0x34, + 0xbd, 0x36, 0xbd, 0xf7, 0x3d, 0xe7, 0xf9, 0x5d, 0xbd, 0x9c, 0x88, 0x87, 0x3d, + 0x85, 0x7d, 0xa3, 0x3d, 0x81, 0x29, 0x75, 0xbc, 0xca, 0x17, 0x97, 0x3d, 0xbf, + 0xd1, 0x04, 0x3e, 0xc9, 0x18, 0xfa, 0x3b, 0x0f, 0x59, 0xc3, 0x3d, 0x40, 0xa6, + 0x05, 0xbd, 0x5e, 0x98, 0x8d, 0x3c, 0x8f, 0x73, 0xff, 0x3c, 0xb2, 0x58, 0xde, + 0xbc, 0x97, 0x10, 0x04, 0xbd, 0x2d, 0xd2, 0x1c, 0x3d, 0xac, 0x03, 0x6e, 0xbd, + 0xa8, 0x9a, 0xa8, 0x3d, 0x1c, 0x0e, 0x41, 0x3d, 0x30, 0x7a, 0xab, 0xbd, 0xec, + 0x58, 0x14, 0xbd, 0xac, 0xe9, 0x9e, 0xbb, 0x0b, 0x14, 0x02, 0x3d, 0xac, 0x78, + 0x00, 0x3e, 0xa1, 0xb6, 0xc2, 0xbd, 0x04, 0x51, 0x91, 0xbc, 0x57, 0x51, 0xf1, + 0xbd, 0x95, 0x42, 0x49, 0x3d, 0x91, 0x54, 0xa2, 0x3c, 0xbd, 0x0f, 0x03, 0xbe, + 0x0a, 0xf8, 0x17, 0xbd, 0xbb, 0x25, 0x14, 0x3d, 0xf2, 0x00, 0x19, 0xbd, 0x79, + 0xea, 0x85, 0xbd, 0x4a, 0xf9, 0xb6, 0xbc, 0x4f, 0x1c, 0x34, 0xbc, 0x2e, 0x3e, + 0x31, 0x3d, 0xe3, 0x63, 0x5e, 0xbd, 0x63, 0xf1, 0xaf, 0x3d, 0x4e, 0xee, 0xaa, + 0x3d, 0x91, 0xc0, 0xcc, 0xbc, 0xc3, 0x43, 0xb2, 0xbc, 0xab, 0x9d, 0x54, 0xbd, + 0x0b, 0x92, 0xa3, 0xbc, 0xc5, 0xe0, 0xf6, 0x3d, 0xb5, 0x2d, 0x52, 0xbd, 0x89, + 0x8d, 0xf0, 0xbd, 0xd4, 0x40, 0x0c, 0xbe, 0x88, 0xf8, 0xaa, 0x3d, 0xc6, 0x0d, + 0x10, 0x3d, 0xe0, 0x7d, 0xcb, 0xbc, 0x14, 0x58, 0xba, 0x3a, 0x11, 0x9d, 0x24, + 0xbd, 0x14, 0x54, 0x03, 0x3b, 0x2c, 0xb4, 0x7d, 0x3c, 0x5a, 0x71, 0x99, 0xbd, + 0x5d, 0xa3, 0xa3, 0xbd, 0xfc, 0xd0, 0xe5, 0x39, 0x4a, 0x6c, 0xf8, 0xbd, 0x81, + 0x0e, 0xab, 0x3d, 0x0d, 0x40, 0x9a, 0x3d, 0x89, 0xff, 0x07, 0x3d, 0xd4, 0x8c, + 0x97, 0x3b, 0x8a, 0x7a, 0xc5, 0x3c, 0xbb, 0xbf, 0xe3, 0x3a, 0xcb, 0x47, 0x41, + 0x3d, 0x80, 0x8d, 0x29, 0x3d, 0x16, 0xe7, 0xf6, 0xbc, 0x01, 0x5f, 0xc0, 0x3d, + 0xf1, 0x20, 0xe3, 0xbc, 0xec, 0x9f, 0x29, 0x3e, 0x8f, 0x46, 0x8d, 0x3d, 0x20, + 0x99, 0xe9, 0x3c, 0x90, 0x04, 0x00, 0x3e, 0x35, 0xda, 0xba, 0xbd, 0x6c, 0xc5, + 0x5b, 0x3d, 0x9a, 0x42, 0x41, 0xbd, 0x1a, 0x84, 0x6f, 0x3d, 0x94, 0xc4, 0x0c, + 0xbd, 0x08, 0x43, 0x8a, 0x3d, 0xd8, 0xdb, 0xa4, 0x3d, 0xac, 0xc6, 0xa8, 0x3d, + 0xa5, 0xf4, 0xff, 0xb9, 0xdc, 0x01, 0x58, 0xbc, 0x43, 0x37, 0xf0, 0x3d, 0xed, + 0x73, 0x3b, 0xbd, 0x8d, 0x1f, 0x00, 0x3c, 0x4c, 0x89, 0x71, 0x3d, 0xb0, 0xbf, + 0x4e, 0x3d, 0x1e, 0x61, 0x83, 0xbd, 0x82, 0xf6, 0x02, 0xbe, 0x3c, 0x97, 0xf9, + 0x3d, 0x06, 0x96, 0x97, 0x3d, 0x5c, 0x13, 0xd7, 0xbd, 0xce, 0x77, 0x88, 0xbd, + 0x26, 0x76, 0xba, 0x3c, 0x46, 0x28, 0xc4, 0x3d, 0x35, 0x72, 0x8d, 0x3c, 0x3e, + 0x63, 0x81, 0xbd, 0x06, 0x13, 0x9b, 0x3d, 0xf9, 0x80, 0x20, 0x3d, 0x9c, 0xfb, + 0x94, 0x3c, 0x50, 0x2c, 0x16, 0xbd, 0xdb, 0x7d, 0x59, 0xbd, 0x7a, 0xa8, 0x8d, + 0x3d, 0x8b, 0x56, 0x94, 0xbd, 0xa5, 0x49, 0x8b, 0x3d, 0x76, 0xae, 0x99, 0xbc, + 0x6e, 0x40, 0x84, 0x3d, 0xe0, 0x5a, 0x40, 0xbd, 0x33, 0xb8, 0x0b, 0xbd, 0x96, + 0x14, 0x25, 0x3c, 0x3e, 0x5c, 0x78, 0xbd, 0x31, 0x40, 0x06, 0x3e, 0x05, 0x0b, + 0xb7, 0x3c, 0x24, 0x3e, 0xe5, 0xbd, 0x94, 0x06, 
0x12, 0x3d, 0x14, 0x07, 0x96, + 0xbd, 0x14, 0x1d, 0x80, 0xbd, 0xfc, 0xd3, 0x66, 0xbd, 0xfa, 0xef, 0x67, 0x3d, + 0x62, 0x1e, 0x9f, 0x3c, 0x27, 0x05, 0x2a, 0xbc, 0xbb, 0x0b, 0xa2, 0x3d, 0x07, + 0x02, 0xaf, 0x3d, 0xcb, 0x9d, 0xc9, 0x3d, 0xbe, 0x5c, 0x15, 0x3b, 0x73, 0xc6, + 0x92, 0xbd, 0x70, 0x29, 0xe4, 0x3d, 0x46, 0xa2, 0xb2, 0xbc, 0x56, 0xb8, 0xe1, + 0x3d, 0x82, 0xf9, 0x0d, 0xbd, 0x9b, 0x59, 0xa8, 0xbd, 0x42, 0x59, 0x98, 0x3d, + 0xae, 0x31, 0x22, 0xbd, 0x0d, 0xa2, 0x1f, 0x3e, 0xc8, 0xfd, 0x58, 0xbc, 0x4e, + 0xd4, 0xca, 0x3d, 0xbd, 0x39, 0x81, 0xbd, 0x7c, 0x0a, 0x25, 0x3e, 0xdb, 0x88, + 0x7f, 0x3c, 0xf1, 0x64, 0x07, 0x3e, 0xd2, 0x99, 0x1d, 0x3d, 0x2c, 0xc9, 0xb0, + 0xbd, 0x7a, 0xe0, 0x9d, 0xbc, 0x9e, 0x93, 0x19, 0x3d, 0x7f, 0xfd, 0xd2, 0xbc, + 0xec, 0x44, 0xd5, 0x3d, 0x69, 0x81, 0xbf, 0x3d, 0x9e, 0xff, 0xac, 0x3c, 0x60, + 0x6b, 0x6a, 0xbd, 0xe6, 0x22, 0x48, 0xbd, 0x3b, 0xc4, 0xa3, 0xbd, 0x0c, 0xd3, + 0xf5, 0x3c, 0x08, 0x03, 0x62, 0x3c, 0x5c, 0x46, 0x16, 0x3e, 0xd3, 0x2a, 0xce, + 0x3c, 0xfc, 0x31, 0xa8, 0x3d, 0xbd, 0x02, 0x95, 0x3c, 0xe8, 0xc7, 0x7a, 0x3c, + 0xff, 0xc5, 0xf8, 0x3c, 0x3a, 0xb0, 0x79, 0x3b, 0xe6, 0xfd, 0x37, 0xbd, 0x5e, + 0xd3, 0x06, 0x3e, 0x21, 0x21, 0xe8, 0x3c, 0xa1, 0x6f, 0xf1, 0x3d, 0xa6, 0xc2, + 0x54, 0x3d, 0x9c, 0xae, 0x9c, 0x3d, 0xcb, 0xfd, 0x0a, 0x3c, 0x3e, 0x2e, 0x00, + 0xbd, 0xdc, 0xf2, 0x4b, 0xbd, 0x7a, 0xdf, 0xbd, 0x3d, 0xbd, 0x27, 0x8b, 0x3c, + 0x1c, 0x12, 0x2d, 0xbd, 0xf9, 0xf3, 0x28, 0x3e, 0x4c, 0x90, 0xb3, 0xbd, 0x49, + 0xfc, 0x84, 0x3d, 0x2e, 0xc1, 0x82, 0x3d, 0x54, 0xc7, 0x62, 0x3d, 0xcb, 0x24, + 0xf9, 0x3d, 0xf4, 0x6a, 0x2b, 0x3c, 0x38, 0x27, 0x1c, 0xbd, 0x05, 0xf1, 0xf5, + 0x3d, 0xc0, 0x87, 0xa2, 0x3d, 0x7e, 0x5c, 0x92, 0x3d, 0xef, 0x33, 0xad, 0x3d, + 0x34, 0xff, 0x43, 0x3d, 0x87, 0x47, 0xc6, 0x3d, 0x58, 0x18, 0x76, 0xbd, 0x1d, + 0x74, 0x9e, 0x3d, 0xae, 0x41, 0xb1, 0xbc, 0x7d, 0x42, 0x94, 0xbd, 0x37, 0x01, + 0x66, 0x3d, 0xb4, 0x18, 0x96, 0xbd, 0x69, 0x31, 0xc4, 0x3c, 0xe7, 0x09, 0x00, + 0xbe, 0x46, 0x1a, 0x2b, 0xbd, 0x76, 0xd4, 0x7b, 0xbd, 0x48, 0xcd, 0xfc, 0x3b, + 0xf9, 0x98, 0xf6, 0xbc, 0x33, 0x91, 0x2c, 0xbe, 0xe1, 0x08, 0xf5, 0xbd, 0xb0, + 0xcd, 0x79, 0x3d, 0xd3, 0x1d, 0x0f, 0x3e, 0x5a, 0x9f, 0x13, 0xbd, 0x7d, 0x6b, + 0x44, 0x3c, 0xcf, 0x14, 0x38, 0x3d, 0xe3, 0xfb, 0x47, 0x3d, 0x37, 0x1e, 0x2f, + 0x3c, 0x89, 0xa0, 0xb2, 0xbd, 0x89, 0x21, 0x81, 0xbd, 0x04, 0xda, 0xc5, 0x3d, + 0xa7, 0xa8, 0x16, 0xbc, 0x07, 0x2e, 0xc1, 0xbb, 0x8c, 0x6f, 0xc2, 0x3c, 0x3b, + 0x0c, 0x03, 0xbd, 0x74, 0xc2, 0xa5, 0x3d, 0x3f, 0xeb, 0xb2, 0xbd, 0x2f, 0x66, + 0x94, 0xbd, 0x4f, 0x30, 0xab, 0xbd, 0xc4, 0xdd, 0x45, 0x3d, 0x4a, 0xb7, 0x48, + 0x3d, 0x55, 0x77, 0x26, 0x3e, 0xbe, 0x1c, 0x96, 0xbb, 0x5b, 0xca, 0x62, 0xbd, + 0xcf, 0x1e, 0xd3, 0x3c, 0xa7, 0x0e, 0xb9, 0xbd, 0x67, 0x75, 0x2b, 0xbd, 0x26, + 0x12, 0xd5, 0xbc, 0xb6, 0x0f, 0xc0, 0xbd, 0x12, 0xab, 0x23, 0x3d, 0xf6, 0x23, + 0xb2, 0x3d, 0x3f, 0x71, 0x83, 0x3d, 0x2a, 0x08, 0x95, 0xbc, 0xd8, 0x6e, 0xdc, + 0xbd, 0x1c, 0x85, 0xa6, 0xbd, 0xc4, 0xbc, 0x52, 0xbd, 0xa8, 0xe0, 0x9c, 0x3d, + 0xf8, 0xa9, 0xe5, 0x3d, 0xfe, 0xbd, 0x9c, 0x3d, 0x9d, 0x62, 0xc3, 0x3c, 0xe6, + 0x95, 0xd6, 0xbc, 0x08, 0x07, 0x68, 0xbc, 0x99, 0x7b, 0xe4, 0xbd, 0xcf, 0x18, + 0xb0, 0x3d, 0xdb, 0x65, 0x8e, 0xbd, 0x47, 0x34, 0xa9, 0xbd, 0x65, 0xab, 0x0a, + 0xbe, 0xb3, 0x57, 0x24, 0xbe, 0x1f, 0xce, 0xa2, 0xbc, 0xd2, 0x8a, 0xb7, 0xbc, + 0x1e, 0xd4, 0x53, 0x3d, 0xec, 0x02, 0x14, 0xbd, 0xd7, 0xc2, 0x05, 0x3d, 0x05, + 0xe3, 0xcb, 0xbc, 0x18, 0xc7, 0x9d, 0x3d, 0x99, 0x69, 0x0a, 0xbe, 0xee, 0x58, + 0xa1, 0x3d, 0xae, 0xa3, 0x36, 0xbe, 0x5c, 0x5d, 0x9c, 0xbd, 0x39, 0xfb, 0x00, + 
0xbd, 0x38, 0xcd, 0x70, 0xbd, 0x2f, 0x77, 0xf2, 0xbd, 0x8a, 0x7d, 0x74, 0xbd, + 0x4b, 0x08, 0x7b, 0xbd, 0x42, 0xaf, 0x4a, 0xba, 0x56, 0x2e, 0x80, 0xbd, 0x81, + 0x9b, 0xb9, 0x3d, 0xf0, 0x6d, 0x86, 0x3c, 0xfe, 0x53, 0x82, 0xbd, 0xb8, 0xac, + 0x56, 0xbd, 0xf7, 0xc9, 0x14, 0x3d, 0xea, 0xe6, 0x1f, 0xbd, 0x9f, 0x23, 0xd0, + 0xbd, 0x73, 0xd5, 0x6a, 0x3d, 0x24, 0xdb, 0xba, 0xbd, 0xf5, 0xf1, 0xda, 0xbc, + 0xe6, 0x8b, 0x34, 0xbd, 0x6c, 0x15, 0x8a, 0x3c, 0x26, 0x05, 0x63, 0x3d, 0x27, + 0xc2, 0x8b, 0xbd, 0x62, 0xb2, 0x83, 0x3d, 0x71, 0x11, 0x50, 0xbc, 0x67, 0x3d, + 0xe4, 0x3d, 0xa5, 0x3d, 0x59, 0xbd, 0x18, 0xa4, 0x70, 0x3c, 0x6b, 0x86, 0x9c, + 0x3d, 0xa6, 0xe4, 0xbf, 0x3d, 0x3a, 0x8f, 0xe2, 0xbd, 0xd7, 0xf8, 0x71, 0x3d, + 0x1d, 0x46, 0x00, 0xbd, 0x3c, 0x59, 0xc0, 0xbc, 0x1f, 0x60, 0x50, 0xbd, 0x91, + 0xe2, 0xe6, 0xbd, 0x4c, 0x72, 0xb6, 0xbd, 0x49, 0x1e, 0xba, 0x3d, 0xdd, 0x1e, + 0x77, 0xbc, 0x35, 0x26, 0xab, 0x3c, 0x63, 0x83, 0xd7, 0xbd, 0x41, 0x6f, 0xa8, + 0x3d, 0x6d, 0xf0, 0x50, 0xbd, 0xdc, 0x5f, 0x2f, 0xbd, 0x73, 0x67, 0xce, 0xbc, + 0x10, 0x47, 0x0b, 0xbd, 0xdc, 0x85, 0x41, 0x3c, 0xcd, 0x61, 0xc9, 0xbd, 0x9d, + 0x79, 0x77, 0x3d, 0xbd, 0xe5, 0xb5, 0xbd, 0xa4, 0x88, 0xf7, 0xbd, 0x43, 0xf7, + 0x5e, 0x3b, 0x95, 0x23, 0x26, 0xbd, 0x39, 0x1e, 0xa7, 0x3d, 0x60, 0xd5, 0x2e, + 0xbd, 0x78, 0xa7, 0x1b, 0x3d, 0xad, 0x5b, 0xcd, 0x3d, 0x73, 0xba, 0x9d, 0xbd, + 0xb7, 0xe0, 0x91, 0x3d, 0xa7, 0x90, 0x8e, 0x3d, 0x12, 0x0d, 0x11, 0x3d, 0x6d, + 0xf8, 0x9b, 0xbd, 0x7d, 0xd4, 0xdf, 0x3d, 0x67, 0x4c, 0xa3, 0x3d, 0x21, 0x33, + 0x88, 0xbc, 0xc8, 0xd2, 0xc7, 0xbd, 0x93, 0xea, 0x80, 0xbd, 0x4d, 0xe7, 0x42, + 0xbd, 0x0b, 0x43, 0xfb, 0xbc, 0xb0, 0x8c, 0x7f, 0xbc, 0x16, 0x83, 0xc3, 0x3d, + 0x42, 0xd0, 0x86, 0xbd, 0x7f, 0x6f, 0xa6, 0x3d, 0xed, 0xee, 0x4c, 0x3d, 0xc9, + 0x3e, 0x03, 0x3d, 0x72, 0x47, 0x9e, 0xbd, 0x2f, 0x66, 0xda, 0x3d, 0x3d, 0x45, + 0x80, 0x3b, 0x3c, 0xab, 0xa6, 0xbd, 0x73, 0xe8, 0x9f, 0xbd, 0xf6, 0x76, 0xc2, + 0xbd, 0x18, 0xaf, 0xb4, 0x3d, 0x94, 0x94, 0x9f, 0xbd, 0x46, 0xcd, 0xad, 0xbd, + 0xdb, 0xe6, 0x87, 0xbd, 0x67, 0x03, 0x07, 0x3d, 0x05, 0xc2, 0x84, 0xbc, 0xb7, + 0x1f, 0x8d, 0xbd, 0x19, 0x72, 0xa1, 0x3d, 0xd8, 0xa5, 0x52, 0x3d, 0x63, 0x90, + 0x03, 0xbd, 0xf5, 0xe3, 0xcd, 0x3d, 0xd8, 0xfb, 0x9c, 0x3d, 0x74, 0xd7, 0x06, + 0xbd, 0x8c, 0xb5, 0xdd, 0xbd, 0x20, 0x07, 0xba, 0xbd, 0x83, 0xa1, 0xd2, 0x3d, + 0x4c, 0x58, 0xe3, 0x3d, 0x31, 0x7d, 0xe1, 0xbd, 0x29, 0x06, 0xa1, 0xbd, 0x64, + 0xa9, 0x2e, 0xbd, 0x79, 0x6c, 0xb5, 0xbd, 0x8f, 0xe5, 0xac, 0x3d, 0x68, 0xc1, + 0xc3, 0x3c, 0xd5, 0xa7, 0xf2, 0xbd, 0x2e, 0x24, 0x40, 0xbd, 0xd6, 0x39, 0xe7, + 0x3d, 0xe0, 0xaf, 0x02, 0xbd, 0xe1, 0xd6, 0xe1, 0xbd, 0xfa, 0xa0, 0x25, 0x3d, + 0x26, 0xe8, 0x57, 0x3d, 0xa5, 0x58, 0xf6, 0xbd, 0xd2, 0x32, 0x0f, 0xbd, 0x8e, + 0xa1, 0x8d, 0x3c, 0xb6, 0x98, 0xce, 0xbc, 0x71, 0x96, 0xfa, 0xbc, 0xe2, 0x69, + 0x35, 0x3c, 0x3d, 0x07, 0x21, 0x3d, 0xc1, 0x9f, 0x8a, 0x3d, 0x0a, 0x9e, 0x64, + 0xbd, 0x3b, 0x91, 0x57, 0xbb, 0x99, 0x41, 0x8c, 0x3d, 0xcf, 0x60, 0x8f, 0xbd, + 0x5e, 0xe6, 0x25, 0xbd, 0xec, 0x60, 0xb0, 0xbd, 0xcf, 0xd7, 0x87, 0x3d, 0x1a, + 0x3f, 0x4e, 0xbd, 0xd7, 0xbf, 0x78, 0xbd, 0xe3, 0x77, 0xd9, 0x3d, 0x81, 0xd8, + 0x81, 0xbd, 0x52, 0x2a, 0xd3, 0x3d, 0xc1, 0x32, 0x80, 0xbd, 0xaa, 0xbf, 0x9d, + 0x3d, 0xbf, 0x21, 0x3b, 0x3d, 0x30, 0x5e, 0x9e, 0xbd, 0xfa, 0xf3, 0xda, 0xbc, + 0x41, 0xeb, 0x9c, 0xbd, 0x71, 0x88, 0xd3, 0xbc, 0xf1, 0x4c, 0x00, 0xbd, 0x38, + 0xd5, 0x2f, 0x3c, 0xcd, 0xd9, 0x3e, 0x3d, 0xf4, 0xf8, 0xa4, 0x3d, 0xbc, 0x2f, + 0x0e, 0xbd, 0x28, 0x35, 0x34, 0x3d, 0x3a, 0x20, 0x5c, 0x3d, 0x97, 0x22, 0xdb, + 0xbd, 0x75, 0xd3, 0x5f, 0xbd, 
0xf9, 0x3b, 0x66, 0xbd, 0x4a, 0x18, 0xe7, 0xbb, + 0x4e, 0x21, 0x5d, 0xbd, 0x9c, 0x6c, 0x45, 0xbd, 0x2c, 0xb8, 0xe7, 0x3c, 0x65, + 0xbf, 0x45, 0x3d, 0x15, 0xbb, 0xa5, 0xbd, 0x7e, 0x1c, 0xba, 0xbd, 0xfa, 0x2d, + 0xfc, 0x3c, 0xc2, 0xfb, 0x20, 0xbd, 0x62, 0xc3, 0xa6, 0xbd, 0xae, 0x66, 0xc1, + 0x3b, 0x8e, 0x5e, 0x29, 0xbd, 0x1a, 0x5d, 0x27, 0xbd, 0xce, 0x36, 0xaf, 0xbd, + 0x6d, 0x03, 0xdd, 0x3d, 0xb5, 0x5d, 0x95, 0x3c, 0xd2, 0x9d, 0x60, 0xbd, 0xf0, + 0xb5, 0x60, 0xbc, 0x80, 0x21, 0x34, 0xbd, 0xf1, 0x05, 0xc8, 0x3b, 0x2c, 0x2a, + 0x2f, 0x3e, 0x99, 0x23, 0x3c, 0x3d, 0x73, 0x2f, 0xe4, 0x3d, 0xc8, 0x22, 0xce, + 0x3d, 0xbf, 0x98, 0xad, 0xbd, 0xa5, 0xb2, 0xd4, 0xbd, 0x6d, 0xca, 0x3b, 0xbe, + 0xd1, 0xa0, 0x95, 0x3c, 0xa0, 0xed, 0xe1, 0x3b, 0x8c, 0x5d, 0x6f, 0x3d, 0x10, + 0x04, 0x88, 0xbd, 0x76, 0x62, 0xe7, 0x3d, 0x53, 0x28, 0x8c, 0xbd, 0x7b, 0x4f, + 0x5d, 0xbd, 0x2e, 0x69, 0x8b, 0x3c, 0xe7, 0x7f, 0x79, 0x3c, 0x2e, 0xe5, 0xbf, + 0x3c, 0x56, 0x90, 0xf6, 0xbc, 0x8a, 0xc6, 0x3b, 0x3d, 0x86, 0xbf, 0xb8, 0xbd, + 0xe6, 0xf7, 0xd7, 0xbc, 0xc5, 0x96, 0xcb, 0x3d, 0x48, 0xe0, 0x9a, 0xbd, 0xd8, + 0xe1, 0x45, 0xbd, 0xa7, 0x00, 0xd7, 0xbd, 0xda, 0x57, 0x1c, 0xbc, 0x8e, 0x49, + 0x40, 0x3d, 0x8b, 0x52, 0x0a, 0x3d, 0xe2, 0xe8, 0x1b, 0xbd, 0x74, 0xd1, 0x0f, + 0x3e, 0x17, 0x20, 0xc1, 0x3d, 0x3a, 0xbe, 0x8a, 0xbd, 0xa4, 0xd5, 0xca, 0x3c, + 0x4f, 0x17, 0x82, 0xbc, 0x1f, 0xea, 0x09, 0xbd, 0x8e, 0xcb, 0xd0, 0x3d, 0x9c, + 0x1a, 0x36, 0xbd, 0x99, 0xee, 0x5b, 0xbd, 0x5c, 0x1d, 0x10, 0xbe, 0x9e, 0x99, + 0x22, 0x3d, 0x8f, 0x8f, 0xda, 0x3c, 0x42, 0xa7, 0x2e, 0x3d, 0x37, 0x33, 0x03, + 0xbe, 0x11, 0x7b, 0x8f, 0xbd, 0xb8, 0xa1, 0x7e, 0x3d, 0x31, 0x04, 0x62, 0x3d, + 0x93, 0x03, 0xfe, 0x3b, 0x59, 0x82, 0xa0, 0xbd, 0x07, 0xb8, 0x24, 0x3d, 0x7a, + 0x45, 0xf2, 0x3d, 0xab, 0xf4, 0xd7, 0xbd, 0x2f, 0xbd, 0xc6, 0x3d, 0xb2, 0x1c, + 0x47, 0x3d, 0xbe, 0xf6, 0xb2, 0x3d, 0xe2, 0xd0, 0x92, 0xbd, 0x0d, 0xec, 0xb2, + 0xbd, 0x40, 0x5c, 0xc0, 0xbd, 0xa8, 0xf7, 0x0e, 0x3c, 0xef, 0x56, 0xb1, 0xbd, + 0x91, 0x09, 0x4f, 0xbd, 0x47, 0x51, 0xcc, 0x3d, 0xcd, 0x6d, 0x85, 0xbd, 0xfe, + 0xb2, 0x6f, 0xbd, 0x3f, 0x9b, 0xec, 0x3c, 0x64, 0x20, 0x98, 0xbb, 0x82, 0x78, + 0x09, 0x3d, 0x2f, 0xbf, 0xe7, 0xbc, 0x5d, 0x5e, 0x01, 0xbd, 0x0c, 0xca, 0x4b, + 0x3d, 0xf2, 0xa2, 0x89, 0xbd, 0xa6, 0x59, 0x54, 0x3d, 0x62, 0x46, 0x04, 0x3c, + 0x99, 0x2f, 0x48, 0xbd, 0x22, 0x21, 0x1b, 0xbd, 0x07, 0x3b, 0xb4, 0xbd, 0x88, + 0x42, 0x0a, 0x3e, 0x7e, 0x29, 0xc3, 0xbb, 0xab, 0x7a, 0x86, 0x3d, 0xe7, 0x26, + 0xc0, 0x3c, 0xac, 0x99, 0x0f, 0xbd, 0x6e, 0xdb, 0x74, 0x3d, 0xba, 0x02, 0xdb, + 0x3d, 0x3c, 0x38, 0xae, 0x3d, 0xdf, 0x34, 0xe1, 0xbd, 0x53, 0xa6, 0x26, 0xbe, + 0x26, 0xa7, 0x82, 0x3d, 0x7b, 0x0f, 0x03, 0xbe, 0x85, 0xb6, 0xaa, 0xbc, 0xc5, + 0x08, 0xbf, 0x3c, 0x4f, 0xd1, 0xa8, 0xbb, 0x9f, 0x58, 0xa6, 0x3c, 0x51, 0xdc, + 0xfb, 0x3d, 0x2e, 0x30, 0xab, 0xbd, 0x38, 0x19, 0x19, 0x3c, 0xa2, 0x6a, 0x7c, + 0x3d, 0x1d, 0x52, 0xd5, 0xbc, 0x15, 0x5f, 0xb3, 0x3b, 0x9b, 0xd8, 0x75, 0xbd, + 0x5f, 0xa1, 0x13, 0xbd, 0xdc, 0xc7, 0xfd, 0xbb, 0x44, 0x9b, 0x73, 0xbd, 0x41, + 0x1d, 0x82, 0xbd, 0xa7, 0x0b, 0x15, 0x3c, 0x87, 0x91, 0x80, 0x3c, 0x74, 0x55, + 0xab, 0xbd, 0xf4, 0xb6, 0x3d, 0x3b, 0xa7, 0x2c, 0xcd, 0xbd, 0x19, 0xa5, 0x96, + 0xbc, 0xea, 0x8f, 0xfa, 0x3d, 0x98, 0x47, 0x12, 0xbd, 0xfc, 0x40, 0x62, 0x3d, + 0x72, 0x61, 0xa0, 0xbd, 0x79, 0x4d, 0x71, 0x3d, 0x2f, 0x4a, 0x89, 0x3d, 0xb8, + 0xdc, 0x98, 0x3d, 0x66, 0x46, 0x6f, 0x3d, 0xa2, 0xf2, 0x0d, 0x3d, 0x36, 0xf5, + 0xd4, 0x3c, 0xb9, 0xe5, 0x88, 0x3d, 0xa4, 0x93, 0x05, 0x3e, 0x64, 0x7e, 0x18, + 0xbe, 0xb6, 0x47, 0x76, 0x3d, 0x8e, 0x31, 0xca, 0x3d, 0x2f, 
0x72, 0xf3, 0x3d, + 0x73, 0x45, 0x0d, 0x3e, 0xf4, 0x52, 0xfa, 0xbc, 0x40, 0x37, 0x88, 0xbd, 0x44, + 0x13, 0xae, 0xbc, 0x25, 0x7e, 0x0a, 0xbd, 0xbe, 0x26, 0x45, 0xbd, 0x2c, 0xf1, + 0x37, 0x3d, 0x29, 0xbd, 0x9f, 0xbd, 0xcb, 0xff, 0x1c, 0xbd, 0x62, 0xf2, 0xa0, + 0xba, 0x20, 0x57, 0xa8, 0xbc, 0xaa, 0xc1, 0x9c, 0xbd, 0xfb, 0xd0, 0x3b, 0x3d, + 0xe2, 0xae, 0x3f, 0x3d, 0x41, 0x4d, 0x93, 0x3d, 0x28, 0x11, 0xcc, 0x3d, 0x52, + 0x6e, 0x06, 0x3e, 0x8f, 0x9b, 0xc0, 0x3d, 0x40, 0xb0, 0xa4, 0xbc, 0xb0, 0x45, + 0x86, 0x3d, 0xc9, 0x85, 0x40, 0xbd, 0xfa, 0xdb, 0xe3, 0xbd, 0xf3, 0x0e, 0x9b, + 0x3d, 0x48, 0x39, 0x03, 0xbe, 0xc4, 0xfc, 0x2f, 0xbd, 0xb9, 0xbf, 0xbe, 0x3d, + 0xd9, 0x2f, 0x11, 0xbd, 0x71, 0x6a, 0x75, 0x3c, 0x89, 0x2b, 0xc2, 0xbd, 0x21, + 0x82, 0xd4, 0xbd, 0x36, 0xcc, 0xf5, 0x3d, 0xa3, 0x91, 0x3d, 0x3d, 0x16, 0xd1, + 0x7d, 0xbd, 0x40, 0xba, 0x75, 0x3b, 0x5a, 0x82, 0xfa, 0x3d, 0xc1, 0x09, 0xaf, + 0x3d, 0x1e, 0x44, 0xa3, 0x3d, 0xd7, 0x2a, 0x37, 0xbd, 0xd9, 0x72, 0xcc, 0x3d, + 0x58, 0x58, 0x9a, 0xbd, 0xea, 0x90, 0x35, 0xbc, 0x0e, 0x69, 0x92, 0x3c, 0x68, + 0x7e, 0x5c, 0xbc, 0x0a, 0xba, 0x55, 0x3d, 0x7e, 0xd4, 0xb9, 0x3b, 0x45, 0x5b, + 0xe7, 0xbd, 0x6b, 0xe6, 0xd5, 0xbc, 0xbc, 0x3e, 0x14, 0xbd, 0xe8, 0xb5, 0x09, + 0x3d, 0xbd, 0xde, 0xaf, 0x3d, 0xcf, 0x2d, 0x94, 0xbd, 0x12, 0x0f, 0xac, 0x3d, + 0x21, 0x99, 0xc2, 0xbd, 0x45, 0x93, 0x0d, 0x3d, 0x8a, 0x1e, 0xe4, 0x3d, 0xe8, + 0xfe, 0xb2, 0x3d, 0x0e, 0x69, 0xb8, 0xbd, 0xab, 0x2a, 0x91, 0xbc, 0x02, 0x24, + 0x8f, 0xbd, 0xef, 0x96, 0xa7, 0x3b, 0x39, 0x39, 0xda, 0xbd, 0x31, 0x03, 0xcd, + 0x3d, 0xe5, 0xf7, 0x4c, 0x3c, 0xca, 0x45, 0x3f, 0x3c, 0xb4, 0xf6, 0x8c, 0xbd, + 0x4a, 0x36, 0x4f, 0x3c, 0x5c, 0xe7, 0x56, 0x3d, 0xe3, 0x81, 0xd6, 0xbd, 0x44, + 0x9d, 0x3d, 0xbd, 0xb2, 0xf5, 0xe2, 0x3d, 0xaa, 0xd0, 0xff, 0xbc, 0x49, 0x86, + 0x4b, 0x3d, 0x79, 0x40, 0x51, 0xbd, 0x60, 0xd2, 0x91, 0xbd, 0x9d, 0x61, 0x26, + 0xbe, 0x32, 0x82, 0xe5, 0x3d, 0xa3, 0x28, 0xc5, 0xbc, 0x3f, 0x02, 0x08, 0xbd, + 0x9b, 0xe8, 0xca, 0x3d, 0xb4, 0x34, 0xed, 0x3c, 0x48, 0x7f, 0xea, 0x3d, 0xd6, + 0x07, 0xa1, 0xbd, 0xf9, 0xad, 0x18, 0x3c, 0xba, 0x0d, 0x8b, 0x3d, 0xa6, 0x13, + 0x0f, 0x3e, 0x25, 0xfc, 0x99, 0x3c, 0xc4, 0x8e, 0xc1, 0x3c, 0xfe, 0xa2, 0x14, + 0x3d, 0x0f, 0x96, 0xd5, 0xbc, 0x21, 0x99, 0xbb, 0xbc, 0xd7, 0x9c, 0xd1, 0x3d, + 0x14, 0xd2, 0xa2, 0x3d, 0x8b, 0x64, 0xd9, 0xbd, 0x11, 0x36, 0xa2, 0x3c, 0xec, + 0xbe, 0x24, 0xbd, 0x9f, 0x0f, 0x2a, 0x3d, 0x9d, 0xd5, 0xa6, 0xbd, 0xba, 0xe4, + 0x83, 0xbd, 0xc1, 0xce, 0x45, 0xbd, 0x4a, 0x99, 0x8c, 0xbd, 0xa0, 0x8d, 0x99, + 0x3b, 0xf1, 0x4b, 0x7a, 0xbc, 0x9d, 0x76, 0xd1, 0xbd, 0x65, 0x96, 0xd5, 0x3d, + 0x65, 0xd5, 0x0a, 0xbd, 0x03, 0xb9, 0x60, 0x3c, 0xbe, 0xb3, 0x0e, 0xbe, 0xf3, + 0x86, 0xf3, 0x3d, 0x28, 0xc1, 0x0f, 0x3d, 0x88, 0x69, 0xc0, 0xbc, 0x0e, 0x06, + 0x7e, 0x3d, 0x42, 0x82, 0xa5, 0x3d, 0x28, 0x95, 0x1b, 0x3d, 0xb7, 0x6d, 0xac, + 0xbd, 0xe0, 0xc9, 0x14, 0xbd, 0x5c, 0xf4, 0xb3, 0x3d, 0x74, 0x9e, 0xd4, 0xbd, + 0x8d, 0x9a, 0xed, 0x3c, 0x9c, 0xe3, 0x01, 0x3d, 0x08, 0x0d, 0xc5, 0xbd, 0xc5, + 0xba, 0xa7, 0xbd, 0xf2, 0xf8, 0x30, 0x3c, 0x41, 0x3c, 0xa8, 0x3d, 0x15, 0x63, + 0x60, 0xbd, 0x31, 0x27, 0xc6, 0xbc, 0x61, 0x0f, 0xe8, 0xbd, 0xcf, 0x0c, 0xbb, + 0xbc, 0xf5, 0x06, 0xbd, 0x3d, 0x99, 0x20, 0xb4, 0x3c, 0x5c, 0x27, 0x2d, 0xbd, + 0x5f, 0x29, 0x4b, 0xbd, 0xe6, 0x17, 0xef, 0x3d, 0x9c, 0x60, 0x84, 0xbd, 0x6a, + 0x76, 0xce, 0x3d, 0xf7, 0x48, 0x92, 0x3d, 0x6a, 0x72, 0xa3, 0x3d, 0x07, 0x7e, + 0x04, 0x3e, 0x71, 0x2a, 0xa8, 0x3d, 0x9a, 0x94, 0x74, 0x3d, 0x78, 0x1b, 0xf6, + 0x3d, 0x98, 0x1e, 0xfd, 0xbc, 0x3a, 0xf5, 0xc4, 0x39, 0x5f, 0x45, 0xc6, 0x3d, + 0x14, 0xc4, 
0x8b, 0x3d, 0xea, 0x0c, 0x16, 0xbd, 0x43, 0x08, 0x98, 0x3c, 0x42, + 0x6d, 0x04, 0x3d, 0x8f, 0x4f, 0xc5, 0xbd, 0x88, 0x9e, 0x35, 0xbd, 0xfd, 0x1d, + 0xfc, 0xbc, 0x82, 0x9f, 0xa5, 0x3c, 0xfe, 0xe2, 0x30, 0xbc, 0x6a, 0x80, 0xf1, + 0x3c, 0xc0, 0x61, 0x39, 0x3d, 0xcd, 0x81, 0x08, 0xbe, 0x6f, 0xa9, 0xa9, 0xbd, + 0x51, 0x50, 0x2b, 0xba, 0xaa, 0xd4, 0xa1, 0xbd, 0x13, 0x64, 0xdf, 0xbd, 0xa4, + 0xd4, 0x5c, 0xbc, 0x2d, 0x83, 0xad, 0xbd, 0xc3, 0x31, 0x07, 0x3d, 0x7d, 0x7a, + 0x97, 0xbc, 0xa7, 0x23, 0xf7, 0xbd, 0x61, 0x7f, 0xda, 0xbd, 0x1d, 0x39, 0xd4, + 0xbd, 0x0b, 0x50, 0x8f, 0xbc, 0xfc, 0xa2, 0x06, 0x3e, 0x7b, 0x0e, 0x90, 0x3d, + 0xf8, 0xa0, 0x9d, 0xbd, 0x25, 0x0f, 0x6d, 0x3d, 0xae, 0x7f, 0xb7, 0xbc, 0xe9, + 0x1f, 0x10, 0xbe, 0x5b, 0x7f, 0x52, 0xbd, 0xe5, 0x86, 0x0d, 0xbd, 0x03, 0x12, + 0x58, 0x3c, 0xee, 0x04, 0xaa, 0xbd, 0x08, 0x85, 0x0a, 0x3d, 0x73, 0x0b, 0x93, + 0xbd, 0x4c, 0x42, 0x0d, 0xbd, 0xe9, 0xa4, 0x7f, 0x3d, 0x3b, 0x8a, 0xa8, 0x3c, + 0xa6, 0x4d, 0x88, 0x3d, 0x44, 0xe9, 0x1e, 0x3c, 0x05, 0x39, 0xd0, 0x3d, 0x09, + 0xc4, 0xc7, 0x3b, 0xdb, 0x43, 0x88, 0xbd, 0xb2, 0x44, 0x9d, 0x3d, 0x00, 0x42, + 0x13, 0xbe, 0x25, 0x15, 0x9a, 0x3d, 0xee, 0x5d, 0x9d, 0x3d, 0x04, 0x63, 0x5b, + 0xbb, 0x67, 0x1c, 0x9e, 0x3d, 0xe1, 0x8e, 0xb4, 0x3d, 0x68, 0xae, 0x8c, 0x3d, + 0x1a, 0xdc, 0xac, 0x3d, 0xdb, 0x00, 0x86, 0x3d, 0x60, 0xb7, 0x07, 0xbd, 0x92, + 0x7c, 0xbc, 0xbd, 0x47, 0xb6, 0x8f, 0x3c, 0x16, 0x03, 0xc1, 0x3d, 0xbb, 0x65, + 0x94, 0x3d, 0x0c, 0x98, 0x05, 0xbd, 0xf1, 0xe1, 0xc2, 0x3d, 0xb5, 0xf2, 0x01, + 0xbe, 0xf2, 0xe0, 0x01, 0x3d, 0xb4, 0x4a, 0xa5, 0x3d, 0x7c, 0x67, 0x97, 0x3d, + 0xa4, 0xbe, 0x52, 0x3d, 0x17, 0x60, 0x1c, 0x3d, 0x95, 0x83, 0x5b, 0xbc, 0x33, + 0x59, 0xd3, 0xbd, 0x45, 0x05, 0xf7, 0xbd, 0xa5, 0x82, 0xbe, 0x3d, 0x91, 0xc4, + 0x46, 0x3d, 0x5c, 0x4b, 0x27, 0xb8, 0x32, 0xe3, 0xf9, 0x3c, 0xdf, 0xcb, 0xcc, + 0x3d, 0xc3, 0x94, 0x6f, 0xbd, 0x10, 0xa2, 0xec, 0x3d, 0x2e, 0xaf, 0x09, 0xbc, + 0x49, 0x91, 0x8d, 0x3d, 0x6e, 0xc8, 0xc5, 0xbc, 0x45, 0x0e, 0x66, 0xbc, 0x37, + 0xd6, 0xfd, 0xbc, 0x2a, 0xea, 0x81, 0xbd, 0xf7, 0xc2, 0xc2, 0x3d, 0x12, 0x27, + 0x6b, 0x3c, 0x97, 0x69, 0xf3, 0x3b, 0xc8, 0xb7, 0xa6, 0xbc, 0xd6, 0xdf, 0x96, + 0xbc, 0xe0, 0x8a, 0x1b, 0x3e, 0xe3, 0x34, 0xc5, 0x3c, 0x96, 0xcd, 0x12, 0xbe, + 0xcd, 0x75, 0x5a, 0x3c, 0x81, 0xd5, 0xd6, 0xbd, 0x2f, 0x97, 0x6e, 0xbd, 0x92, + 0x28, 0x45, 0xbc, 0x81, 0xaf, 0xce, 0x3d, 0xc3, 0x35, 0xd3, 0x3d, 0x97, 0x1f, + 0x99, 0x3c, 0x48, 0xb6, 0x5b, 0x3d, 0x98, 0x96, 0x9d, 0x3d, 0xed, 0x0a, 0xa3, + 0x3c, 0x5e, 0x72, 0xe5, 0xbb, 0xad, 0x65, 0xaa, 0xbd, 0x16, 0x57, 0x8c, 0xbd, + 0x4a, 0x37, 0x6b, 0xbd, 0x18, 0x35, 0xbe, 0xbd, 0xa8, 0xaa, 0x07, 0xbd, 0xbe, + 0xcb, 0xf5, 0xbb, 0xbe, 0x69, 0xad, 0x3c, 0x1f, 0x82, 0x54, 0x3d, 0x32, 0xbe, + 0x87, 0xbd, 0x67, 0x54, 0x41, 0x3d, 0x46, 0xb6, 0x2e, 0xbd, 0x04, 0xb2, 0x75, + 0x3c, 0xb8, 0xf0, 0xcd, 0xbc, 0x63, 0x01, 0x7f, 0x3d, 0x92, 0xb6, 0x84, 0xbd, + 0x43, 0x6b, 0xe0, 0x3d, 0x4a, 0xa8, 0xb3, 0x3c, 0x05, 0x93, 0x8f, 0xbd, 0xca, + 0xa0, 0x84, 0x3d, 0x84, 0x4b, 0x27, 0x3e, 0x68, 0xce, 0xe2, 0xbd, 0x30, 0x5d, + 0x22, 0x3d, 0xa3, 0x3c, 0xc0, 0x3d, 0xc3, 0xa5, 0x37, 0xbd, 0xc8, 0xb2, 0xa3, + 0x3d, 0x79, 0xee, 0x82, 0x3d, 0xc6, 0xb3, 0xab, 0x3a, 0x72, 0xa4, 0x65, 0xbb, + 0x5c, 0x20, 0xa7, 0x3d, 0xdd, 0xd9, 0xe5, 0xba, 0xbe, 0xcb, 0x9d, 0xbd, 0xdc, + 0x19, 0xc5, 0xbd, 0xa8, 0x93, 0xc8, 0x3d, 0x4d, 0x2f, 0x1a, 0x3d, 0x24, 0x73, + 0xa2, 0x3d, 0x11, 0xb1, 0x08, 0x3e, 0x8a, 0x27, 0xcf, 0x3d, 0xb6, 0xee, 0xab, + 0xbd, 0x1f, 0xd7, 0xe1, 0x3d, 0x5d, 0xcf, 0x5f, 0xbd, 0x8e, 0xa9, 0xb0, 0x3c, + 0x86, 0xb9, 0x31, 0x3d, 0xd7, 0xa8, 0x92, 
0xbd, 0x7f, 0x37, 0xd0, 0x3d, 0x4c, + 0xbb, 0xb6, 0x3d, 0xa4, 0x4d, 0x09, 0xbd, 0xc5, 0x8e, 0x0f, 0xbd, 0xbf, 0x27, + 0xa8, 0xbd, 0x62, 0x94, 0xb2, 0x3d, 0x2d, 0x35, 0xe8, 0x3d, 0xd5, 0x78, 0xee, + 0xbd, 0x2a, 0x5b, 0x5a, 0xbd, 0x72, 0x89, 0x4d, 0x3d, 0x7f, 0x5b, 0xfd, 0xb8, + 0x11, 0x80, 0x58, 0xbd, 0x69, 0xa9, 0xbc, 0xbc, 0xdb, 0xe9, 0xd3, 0xbc, 0x45, + 0x3b, 0xf5, 0xbc, 0xa6, 0x28, 0xc5, 0x3d, 0xe2, 0x48, 0x31, 0x3d, 0x49, 0xab, + 0x36, 0x3b, 0xca, 0xd2, 0xc6, 0xbc, 0x29, 0x1f, 0x5a, 0x3d, 0x90, 0xe6, 0x3b, + 0xbd, 0xf7, 0x5f, 0xa0, 0x3d, 0xb7, 0xc1, 0x91, 0x3d, 0x18, 0xcc, 0xc4, 0x3c, + 0x0a, 0xc0, 0x8a, 0xbd, 0x2a, 0x5e, 0x63, 0xbd, 0xa1, 0x2f, 0xb7, 0xbc, 0xf2, + 0xfb, 0xac, 0x3b, 0xa4, 0xed, 0x17, 0x3d, 0xc1, 0x09, 0x59, 0xbd, 0xe9, 0xf7, + 0xf4, 0x3d, 0xad, 0xe5, 0x8f, 0xbd, 0xa9, 0x9e, 0xd0, 0x3d, 0x0a, 0x98, 0x40, + 0xbd, 0xbc, 0x1f, 0x95, 0x3d, 0x0b, 0x17, 0xf0, 0x3c, 0x64, 0x3f, 0x60, 0xbd, + 0xc0, 0xb2, 0xc7, 0x3b, 0x42, 0x3f, 0x62, 0x3c, 0x6a, 0x39, 0x8c, 0xbd, 0xbf, + 0x72, 0xfd, 0xbd, 0x47, 0x3d, 0xd1, 0xbd, 0x7c, 0x0b, 0x6d, 0x3d, 0xf3, 0x4a, + 0xda, 0xbc, 0xce, 0x57, 0x9d, 0x3d, 0xf0, 0x13, 0x53, 0x3b, 0x94, 0x39, 0x31, + 0x3d, 0x3d, 0xa7, 0x3f, 0xbd, 0xfa, 0x3e, 0x6b, 0x3d, 0xfb, 0x19, 0xa9, 0x3d, + 0x07, 0xfc, 0x5e, 0xbd, 0xfa, 0x47, 0xd3, 0x3d, 0xd6, 0x83, 0x9a, 0xbd, 0x2c, + 0xa9, 0x14, 0x3e, 0x01, 0xb5, 0x7e, 0x3d, 0x27, 0xfb, 0x00, 0x3a, 0x7d, 0xe5, + 0x35, 0xbd, 0x68, 0x50, 0x05, 0xbc, 0x87, 0xdb, 0x19, 0x3d, 0xbe, 0x2e, 0xe3, + 0x3d, 0xe4, 0x41, 0x07, 0xbd, 0x53, 0x57, 0xcc, 0xb9, 0x28, 0x92, 0x96, 0x3d, + 0xb6, 0x14, 0xa4, 0xbc, 0xad, 0x84, 0x69, 0x3c, 0x19, 0xe4, 0xde, 0xbd, 0x3b, + 0xad, 0x04, 0xbe, 0xd9, 0xe3, 0xbc, 0x3d, 0x5b, 0x59, 0xd3, 0x3d, 0x00, 0x12, + 0xcc, 0xbd, 0x2d, 0x0c, 0x8a, 0xbd, 0xc6, 0x1c, 0x79, 0x3d, 0x03, 0xf3, 0x14, + 0xbc, 0xb7, 0x28, 0xa6, 0x3d, 0x28, 0x0d, 0xa5, 0xbd, 0xa9, 0x8e, 0x32, 0x3b, + 0x60, 0xef, 0x30, 0x3d, 0x21, 0x9f, 0x68, 0xbc, 0x13, 0x02, 0x83, 0xbc, 0x21, + 0x90, 0x9e, 0x3c, 0x78, 0xfa, 0xf4, 0xbc, 0xf9, 0x40, 0x6e, 0x3a, 0x11, 0xdb, + 0x05, 0x3e, 0xc1, 0xb7, 0xff, 0x3b, 0x04, 0x47, 0x65, 0xbd, 0x6b, 0x8a, 0x85, + 0xbd, 0x30, 0xd5, 0x95, 0x3d, 0x3c, 0x4a, 0x92, 0x3d, 0xa6, 0x20, 0x11, 0x3d, + 0x03, 0xd8, 0xb1, 0x3c, 0x7d, 0x1e, 0x0b, 0xbd, 0xe9, 0x0a, 0x92, 0x3d, 0x7e, + 0x9d, 0xb8, 0x3c, 0xb5, 0x1e, 0x6d, 0x3d, 0x6d, 0x4e, 0x6f, 0x3d, 0xbc, 0x1e, + 0xdc, 0x3c, 0x2e, 0x87, 0xa0, 0x3d, 0x2d, 0x00, 0x5c, 0xb8, 0x8f, 0xfb, 0xb3, + 0xbd, 0x9e, 0x36, 0x08, 0x3d, 0xa4, 0x19, 0xe0, 0xbb, 0x5f, 0xc0, 0xb7, 0xbb, + 0xc7, 0x3c, 0x78, 0x3d, 0x53, 0xe4, 0x65, 0x3d, 0xca, 0xdf, 0xc9, 0x3d, 0x18, + 0x8b, 0x27, 0xbd, 0x19, 0x05, 0xa6, 0x3d, 0x23, 0xa2, 0xa2, 0x3d, 0xc2, 0x4b, + 0xac, 0xbd, 0x1b, 0x23, 0xd7, 0xbd, 0xc2, 0x53, 0x97, 0x3d, 0x2e, 0xb2, 0x45, + 0xbd, 0x73, 0x7b, 0xbc, 0xbd, 0x33, 0xfc, 0x47, 0xbc, 0x0b, 0x36, 0x91, 0x3d, + 0xaa, 0x1e, 0x0b, 0xbd, 0xc8, 0x3a, 0xda, 0x3c, 0x22, 0x29, 0xc5, 0x3d, 0x62, + 0x18, 0xf3, 0x3c, 0x75, 0x25, 0xc1, 0xbc, 0xe8, 0x19, 0xb8, 0x3d, 0x30, 0x46, + 0x47, 0x3d, 0x22, 0x80, 0x9f, 0xbc, 0x59, 0xcc, 0xcf, 0x3d, 0x00, 0x51, 0x95, + 0xbc, 0x8b, 0x00, 0xbf, 0xbc, 0xf5, 0xca, 0x89, 0xbd, 0xca, 0x56, 0xe4, 0x3d, + 0x7f, 0x86, 0x24, 0x3e, 0x23, 0xd7, 0x14, 0x3d, 0xe2, 0x8f, 0xa7, 0xbc, 0x1d, + 0x6d, 0xb3, 0x3c, 0xa4, 0x8a, 0x85, 0xbd, 0x4a, 0x36, 0x40, 0xbd, 0x20, 0xa4, + 0xa7, 0xbd, 0xfe, 0x10, 0xa3, 0xbc, 0xa3, 0x3b, 0xce, 0x3d, 0x88, 0x99, 0x12, + 0xbd, 0x3d, 0x58, 0xd5, 0xbd, 0x76, 0xe5, 0x7f, 0x3c, 0x87, 0xa0, 0x68, 0xbd, + 0x8a, 0xd4, 0xb7, 0xbd, 0xdb, 0x68, 0x6f, 0x3c, 0x22, 0x84, 0x2e, 0xbc, 
0x94, + 0x63, 0xa6, 0xbc, 0x35, 0xa4, 0xa9, 0x3d, 0x17, 0xec, 0x0d, 0xbd, 0xd4, 0x25, + 0x9b, 0xbd, 0xf1, 0x84, 0x04, 0xbd, 0x3a, 0x19, 0xdd, 0x3d, 0xd8, 0xba, 0xb1, + 0x3d, 0xb2, 0xb7, 0x21, 0xbd, 0xeb, 0x7e, 0x19, 0x3d, 0xb9, 0xd3, 0xb9, 0x3b, + 0xa5, 0x6a, 0x88, 0xbd, 0xdc, 0x78, 0x99, 0xbd, 0xf4, 0x9f, 0xc4, 0x3d, 0x23, + 0xfe, 0x49, 0xbb, 0xbe, 0xa0, 0x98, 0xbb, 0x05, 0xe8, 0x84, 0xbd, 0x0e, 0x24, + 0x20, 0x3d, 0x30, 0x96, 0x80, 0xbd, 0xd8, 0x1e, 0xef, 0x3c, 0x0a, 0xad, 0xfe, + 0x3d, 0xa3, 0xaa, 0x3b, 0xbd, 0x24, 0xd1, 0xb9, 0xbd, 0xfd, 0xb4, 0xd6, 0x3c, + 0xe7, 0xfe, 0xe9, 0xbb, 0xf7, 0xd6, 0xaa, 0x3c, 0xa5, 0x35, 0xc1, 0xbc, 0x39, + 0xbd, 0x00, 0xbe, 0x19, 0xed, 0x3b, 0x3d, 0x7f, 0x4e, 0x99, 0x3d, 0x09, 0x63, + 0xe3, 0xbd, 0x74, 0xc3, 0x73, 0xbd, 0xb7, 0x7d, 0xa4, 0x3d, 0x68, 0x37, 0x50, + 0xbd, 0xb0, 0xb0, 0xe8, 0xbd, 0x28, 0x4f, 0xa7, 0xbd, 0x22, 0x85, 0x9e, 0xbd, + 0x32, 0xce, 0x12, 0x3e, 0x60, 0x47, 0xbb, 0x3c, 0xdb, 0xa8, 0xc6, 0x3d, 0x50, + 0xcf, 0x0c, 0x3d, 0x4b, 0x7d, 0x9c, 0x3b, 0xa9, 0xeb, 0xb9, 0xbd, 0x07, 0x97, + 0x13, 0x3c, 0xbe, 0x6b, 0x8f, 0xbd, 0x9c, 0xb3, 0xa9, 0x3d, 0x64, 0xd6, 0x96, + 0xbd, 0x75, 0x6a, 0xc4, 0x3c, 0x20, 0xb6, 0x7e, 0x3d, 0x9b, 0x0e, 0x0c, 0x3e, + 0xf3, 0xd5, 0xc5, 0x3d, 0x54, 0xb8, 0xdf, 0xbd, 0x12, 0x6e, 0xf2, 0x3a, 0x7b, + 0xe4, 0xaa, 0x3c, 0xe3, 0x7c, 0xb5, 0xbd, 0xe6, 0x11, 0x05, 0x3d, 0xc6, 0x65, + 0xa2, 0x3d, 0x95, 0x9e, 0x0c, 0x3d, 0x7f, 0xfe, 0xea, 0xbc, 0x22, 0x51, 0xcf, + 0x3b, 0x7b, 0xdd, 0x98, 0xbc, 0x6e, 0x2f, 0xba, 0xbc, 0xb3, 0x8e, 0xe6, 0xbd, + 0x5e, 0x5e, 0x76, 0x3d, 0x3e, 0xd4, 0xaf, 0xbd, 0x25, 0xbc, 0xa8, 0x3d, 0xb0, + 0xd0, 0x81, 0x3c, 0x4c, 0x3f, 0x52, 0x3c, 0x10, 0xd7, 0x13, 0xbd, 0xd0, 0x83, + 0x02, 0x3e, 0xd3, 0x03, 0xa5, 0x3d, 0xeb, 0xa7, 0xca, 0xbd, 0x91, 0x09, 0x1b, + 0x3d, 0x7a, 0x8c, 0xbf, 0x3c, 0x89, 0x04, 0xdb, 0xbd, 0xf8, 0xfc, 0x56, 0xbd, + 0x8a, 0x66, 0x36, 0x3d, 0x42, 0x8f, 0x6e, 0xbd, 0xc9, 0x79, 0x87, 0x3d, 0xbf, + 0xfb, 0x26, 0x3d, 0x56, 0xeb, 0xbc, 0xbb, 0x3b, 0xa7, 0x17, 0x3d, 0x17, 0x46, + 0x27, 0x3d, 0x87, 0xfb, 0xb4, 0x3d, 0x09, 0x7b, 0x9d, 0xbc, 0xf4, 0xdc, 0x30, + 0x3d, 0xca, 0xee, 0xf7, 0xbd, 0x08, 0x73, 0xec, 0x3d, 0x60, 0xed, 0x24, 0x3d, + 0x77, 0xa3, 0x26, 0x3c, 0x07, 0x95, 0xe2, 0x3c, 0x27, 0x2f, 0xde, 0x3c, 0xd3, + 0x8a, 0x94, 0xbc, 0x58, 0x57, 0xaa, 0xbd, 0x86, 0xdd, 0x0d, 0x3d, 0x29, 0x14, + 0x56, 0x3d, 0x94, 0xdf, 0xa8, 0x3d, 0x33, 0x86, 0xbd, 0x3d, 0xb2, 0x8a, 0x7b, + 0x3c, 0x8d, 0x7b, 0x26, 0xbc, 0x2f, 0x59, 0xb8, 0xbd, 0x65, 0xc2, 0x87, 0xbd, + 0xd3, 0x4b, 0x76, 0x3d, 0x16, 0x20, 0x22, 0x3d, 0xb9, 0xef, 0x62, 0x3b, 0xda, + 0x3b, 0x6b, 0x3d, 0xce, 0x75, 0x59, 0x3d, 0x90, 0xde, 0x33, 0x3d, 0x77, 0x8b, + 0xf7, 0x3d, 0x98, 0xfd, 0xa0, 0xbd, 0xcc, 0xa0, 0xd2, 0x3d, 0xec, 0x73, 0x84, + 0xbd, 0x2c, 0x7a, 0x34, 0x3c, 0xbd, 0x44, 0x07, 0x3e, 0xd8, 0xf6, 0x74, 0xbd, + 0x0a, 0x72, 0x8c, 0xbd, 0xad, 0xd3, 0xd5, 0xbd, 0x78, 0xf7, 0xc9, 0x3d, 0x28, + 0xef, 0x5f, 0x3d, 0x01, 0xbf, 0x80, 0xbd, 0xcc, 0xd6, 0x01, 0xbd, 0x37, 0x34, + 0x75, 0xbd, 0x4a, 0x00, 0x87, 0x3d, 0x4c, 0xd9, 0x4c, 0xbb, 0xcd, 0x86, 0x42, + 0xbd, 0x7b, 0xef, 0x1a, 0x3d, 0x98, 0x2b, 0x3a, 0x3d, 0x97, 0x7a, 0x18, 0x3c, + 0xd0, 0x24, 0xe6, 0xbd, 0xcd, 0xc5, 0xc2, 0x3c, 0x8d, 0x69, 0x7f, 0xbc, 0xed, + 0xef, 0x88, 0xbd, 0x54, 0x72, 0xd6, 0x3d, 0xc4, 0x5b, 0xba, 0x3d, 0x13, 0xd9, + 0x1d, 0xbd, 0xa9, 0x69, 0xd5, 0x3d, 0xf6, 0xab, 0x4b, 0x3d, 0xaf, 0x3c, 0xab, + 0x3d, 0xad, 0x17, 0x02, 0x3d, 0xfe, 0x82, 0x97, 0xbd, 0xe7, 0x5b, 0xca, 0x3d, + 0x0d, 0x04, 0x1b, 0x3d, 0x6a, 0x95, 0xb5, 0x3d, 0xa7, 0x5f, 0xc5, 0x3d, 0x57, + 0xf4, 0xdc, 0x3d, 0x25, 
0xf3, 0xa2, 0xbd, 0xad, 0x96, 0xd3, 0x3d, 0x16, 0xb7, + 0x2f, 0xbe, 0x61, 0x4c, 0xaa, 0x3d, 0x71, 0x82, 0xcc, 0x3d, 0x44, 0x36, 0xbb, + 0x3d, 0xba, 0x8f, 0xca, 0xbc, 0xe0, 0xa3, 0x63, 0x3c, 0xfa, 0x02, 0xb3, 0xbd, + 0x0a, 0xcf, 0x00, 0xbe, 0x4b, 0xce, 0x7e, 0xbd, 0xe9, 0x90, 0xcf, 0x3b, 0x32, + 0x0d, 0xa9, 0xbd, 0x54, 0x4d, 0x42, 0x3d, 0x30, 0x36, 0x32, 0x3d, 0x04, 0xa6, + 0xb2, 0xbd, 0x79, 0x05, 0x0a, 0x3e, 0xbb, 0x45, 0xe6, 0x3c, 0xfd, 0xf6, 0x79, + 0x3d, 0x1c, 0x9f, 0x1d, 0x3d, 0xe5, 0x27, 0x97, 0x3c, 0x31, 0xf4, 0x02, 0xbd, + 0x30, 0x19, 0x45, 0x3d, 0xa4, 0x54, 0x06, 0x3d, 0x94, 0x4d, 0xb9, 0xbd, 0x3b, + 0x21, 0xdf, 0xbd, 0xbb, 0x79, 0x1f, 0xbd, 0x41, 0x34, 0x9f, 0x3d, 0x02, 0x58, + 0xb8, 0x3d, 0xe1, 0xb2, 0x03, 0xbe, 0x5e, 0x71, 0x29, 0x3d, 0x9e, 0xf7, 0xbf, + 0xbd, 0xc7, 0x01, 0x75, 0xbd, 0x0d, 0xe3, 0x14, 0xbd, 0x38, 0x23, 0xa3, 0x3d, + 0x93, 0xbc, 0xaa, 0xbd, 0xc9, 0x19, 0x91, 0x3d, 0xcb, 0xba, 0x69, 0x3d, 0xfc, + 0xfa, 0xd7, 0x3d, 0x95, 0xd9, 0x38, 0xbd, 0x4e, 0x3f, 0x75, 0x3d, 0x73, 0xdb, + 0x15, 0xbe, 0xdf, 0x76, 0x8d, 0x3d, 0x0f, 0xb1, 0x13, 0x3d, 0x90, 0x32, 0x24, + 0x3e, 0x3a, 0x17, 0xf9, 0xbd, 0xcd, 0xd1, 0x38, 0xbd, 0x27, 0xf4, 0x9b, 0xbd, + 0x10, 0x6c, 0xa3, 0xbc, 0x1e, 0x12, 0x42, 0x3d, 0xee, 0x38, 0xff, 0xbc, 0xb4, + 0x28, 0x2e, 0x3d, 0xba, 0x69, 0xbd, 0xbc, 0x7c, 0x69, 0xbb, 0xbc, 0x1a, 0xe8, + 0xde, 0xbd, 0xd8, 0xa2, 0x17, 0x3c, 0xb8, 0x9e, 0xb6, 0xbb, 0xae, 0x5e, 0x96, + 0x3c, 0x4f, 0xbb, 0x03, 0xbd, 0x8f, 0x72, 0xb4, 0xbc, 0x94, 0x57, 0xd7, 0x3d, + 0xf5, 0xe3, 0xaf, 0xbc, 0xa4, 0x0c, 0x0d, 0xbd, 0x13, 0xbb, 0x83, 0x3d, 0x62, + 0x06, 0xda, 0x3d, 0xb7, 0xa5, 0x1c, 0x3e, 0x90, 0xd8, 0x86, 0xbd, 0xf5, 0x7e, + 0xd0, 0xbd, 0x8b, 0x5e, 0xcb, 0xbd, 0x0e, 0x81, 0xf5, 0xbd, 0xfe, 0xf3, 0xe4, + 0xbc, 0xe2, 0xc9, 0xd6, 0xbc, 0x4c, 0xa9, 0xc8, 0x3b, 0x04, 0xd2, 0x49, 0xbc, + 0xf0, 0xb2, 0xa5, 0xbd, 0xc7, 0xd6, 0xea, 0x3d, 0xa6, 0xa6, 0x77, 0x3d, 0xdf, + 0x24, 0x03, 0x3d, 0x05, 0x9e, 0x86, 0xbd, 0xce, 0x27, 0x31, 0x3d, 0x46, 0x54, + 0xa4, 0x3d, 0x27, 0x9b, 0x35, 0xbd, 0x28, 0x86, 0x68, 0xbb, 0x2c, 0x1e, 0xc1, + 0xbd, 0xda, 0x7e, 0xa2, 0x3b, 0xa6, 0xe6, 0xe9, 0x3d, 0x8a, 0xcf, 0x0f, 0x3d, + 0x5e, 0xf0, 0x6f, 0xbd, 0xa0, 0xc6, 0xb1, 0xbb, 0x08, 0xc6, 0x77, 0xbc, 0x6d, + 0x17, 0x16, 0xbd, 0xf5, 0xc6, 0x21, 0x3d, 0x70, 0x2a, 0x11, 0xbd, 0x3f, 0x5a, + 0x6c, 0xbd, 0xfb, 0xd9, 0xbc, 0x3d, 0x91, 0x33, 0xb4, 0x3c, 0xc1, 0xc7, 0x84, + 0x3d, 0xd9, 0xca, 0x41, 0xbd, 0xd8, 0x5d, 0xec, 0x3d, 0x17, 0xe2, 0x94, 0x3d, + 0xbf, 0x3f, 0x04, 0xbe, 0x24, 0xa8, 0x66, 0xbd, 0xc4, 0xcd, 0xc0, 0x3d, 0x07, + 0xce, 0x9e, 0xbd, 0x67, 0x5d, 0xe0, 0x3d, 0x9e, 0xdd, 0x1c, 0xbe, 0x77, 0xe5, + 0x5c, 0x3d, 0x98, 0x1f, 0xaf, 0x3d, 0x8a, 0xfd, 0x02, 0x3e, 0x9f, 0x9a, 0xba, + 0xbc, 0x40, 0xe9, 0xbb, 0x3c, 0x4e, 0x51, 0x10, 0xbc, 0xc6, 0xcc, 0x81, 0x3d, + 0x83, 0x18, 0x78, 0xbc, 0x7f, 0x25, 0xe8, 0xbd, 0x2e, 0xa6, 0xcb, 0x3c, 0x2f, + 0x8c, 0x3e, 0x3c, 0x38, 0xdc, 0x67, 0xbb, 0x57, 0xf8, 0xbd, 0x3d, 0xa2, 0x4b, + 0x13, 0x3e, 0x6d, 0x76, 0x64, 0x3d, 0xcf, 0x5e, 0x98, 0x3c, 0x09, 0xc1, 0x8a, + 0x3c, 0x42, 0x2b, 0x82, 0x3d, 0xa3, 0x83, 0x4a, 0x3d, 0xe3, 0x74, 0xb9, 0xbb, + 0x26, 0xf8, 0x62, 0x3d, 0xd6, 0x4d, 0xa4, 0xbc, 0x68, 0x44, 0x13, 0x3d, 0x3b, + 0x7d, 0x54, 0x3d, 0xf4, 0xdf, 0x8c, 0x3d, 0xef, 0x72, 0xcf, 0xbd, 0x4e, 0xd6, + 0x85, 0x3c, 0x6a, 0x11, 0x38, 0xbc, 0xa5, 0xec, 0x83, 0xbd, 0x23, 0x95, 0x86, + 0xbd, 0x93, 0xa0, 0xbf, 0x3c, 0x91, 0xc5, 0x11, 0xbd, 0x96, 0x1b, 0x23, 0x3d, + 0xbc, 0x6d, 0x00, 0x3d, 0x55, 0xb7, 0x9d, 0x3d, 0x44, 0x45, 0x8d, 0x3c, 0x83, + 0x34, 0x19, 0xbd, 0x1c, 0x2e, 0xbe, 0xbd, 0xfb, 0x4b, 
0xd5, 0x3c, 0x25, 0xec, + 0xd9, 0xba, 0xe0, 0xcd, 0xa9, 0x3d, 0x72, 0x99, 0xa1, 0x3d, 0xa6, 0xa1, 0x91, + 0xbd, 0xc8, 0x70, 0x39, 0xbd, 0x33, 0x54, 0x24, 0x3d, 0x80, 0x25, 0xd8, 0x3c, + 0x3c, 0x36, 0xdb, 0x3b, 0x04, 0x22, 0x3c, 0xbd, 0xc8, 0x81, 0xfb, 0x3d, 0x89, + 0x15, 0xe1, 0x3d, 0xa5, 0x9d, 0x17, 0xbd, 0x68, 0xad, 0x64, 0xbd, 0xad, 0xbd, + 0x59, 0xbc, 0xfc, 0x1a, 0xa5, 0xbd, 0xf5, 0x88, 0x44, 0x3d, 0x53, 0xa7, 0x9b, + 0x3d, 0x2e, 0x00, 0x93, 0xbd, 0xbd, 0xb1, 0xb9, 0x3c, 0x61, 0x54, 0xc8, 0x3c, + 0xe3, 0xe9, 0xd7, 0x3d, 0x78, 0xe2, 0xe0, 0x3d, 0x6c, 0xe0, 0x08, 0xbe, 0x80, + 0xc2, 0xaf, 0x3d, 0x2a, 0x5c, 0x10, 0xbd, 0x60, 0xcb, 0xf0, 0x3d, 0x7a, 0xa1, + 0xf0, 0xbb, 0x02, 0x56, 0xa9, 0x3d, 0x11, 0xf1, 0x1c, 0x3c, 0x39, 0xec, 0xa9, + 0xbd, 0x73, 0xfd, 0x24, 0xbd, 0xd5, 0x86, 0x8c, 0x3d, 0xdc, 0x85, 0x21, 0x3c, + 0xa7, 0x6f, 0xf6, 0x3d, 0xe0, 0x6b, 0x0c, 0xbd, 0x08, 0x15, 0xf2, 0x3d, 0xd6, + 0x6a, 0xed, 0x3d, 0xda, 0xc1, 0x51, 0xbd, 0x27, 0x6e, 0x11, 0xbe, 0xbe, 0x8f, + 0xcf, 0xbc, 0xa9, 0xf1, 0x05, 0x3d, 0xa1, 0x30, 0x8d, 0xbd, 0x35, 0x5e, 0x97, + 0xbd, 0xee, 0x02, 0x9d, 0xbc, 0xf8, 0xba, 0xe9, 0xbd, 0x61, 0xe1, 0xb5, 0xbd, + 0xaa, 0x6d, 0x0c, 0xbd, 0xeb, 0x1f, 0x5d, 0xbd, 0x17, 0x11, 0xda, 0x3c, 0xe3, + 0x75, 0x55, 0xbd, 0x8b, 0x40, 0x4a, 0x3d, 0xb2, 0x5b, 0x17, 0xbd, 0xc2, 0xbb, + 0x66, 0xbd, 0x42, 0x20, 0xf7, 0x3d, 0x05, 0x75, 0xff, 0xbd, 0xce, 0xd3, 0xca, + 0x3c, 0x76, 0x10, 0xbb, 0x3d, 0x66, 0xa2, 0xcc, 0xbc, 0x96, 0x30, 0xf7, 0xba, + 0xad, 0xa8, 0x16, 0xbc, 0x32, 0x10, 0x77, 0x3b, 0x98, 0xde, 0x1f, 0xbd, 0xc7, + 0xd6, 0x72, 0x3d, 0x33, 0xea, 0xe1, 0x3d, 0xb5, 0x5d, 0x8d, 0x3c, 0xfe, 0xf1, + 0x64, 0x3d, 0x3f, 0xe1, 0x88, 0x3c, 0x0d, 0xa2, 0x92, 0x3d, 0x52, 0x90, 0x20, + 0xbd, 0xcd, 0x17, 0x88, 0xbd, 0xf7, 0xf1, 0x7b, 0x3d, 0x55, 0xbe, 0x9c, 0x3b, + 0x1a, 0x3f, 0xd1, 0x3c, 0x46, 0xbe, 0x0d, 0x3d, 0x53, 0xd7, 0xd9, 0x3d, 0xda, + 0x58, 0xb5, 0xbc, 0x3a, 0x41, 0x78, 0xbd, 0x78, 0xc0, 0x54, 0xbd, 0x3c, 0x27, + 0x10, 0x3e, 0x16, 0x00, 0xe9, 0x3b, 0x6e, 0xcd, 0xc5, 0x3d, 0xd9, 0xf0, 0x82, + 0x3d, 0x44, 0x3e, 0x82, 0x3d, 0xde, 0x31, 0x83, 0x3d, 0x10, 0x32, 0x4e, 0xbd, + 0x13, 0x46, 0xd7, 0xbd, 0x60, 0xa0, 0xbb, 0xbc, 0x33, 0xc9, 0xb0, 0xbd, 0x8d, + 0x52, 0xfb, 0x3d, 0x5e, 0xa7, 0x07, 0x3d, 0x05, 0xd7, 0xb7, 0x3d, 0x34, 0x8c, + 0x71, 0x3d, 0xcf, 0x5d, 0x66, 0xbd, 0x2a, 0x61, 0x1c, 0x3d, 0xa5, 0xa5, 0x70, + 0xbd, 0xd2, 0xb9, 0x67, 0x3b, 0x9e, 0x63, 0x5a, 0x3d, 0xbe, 0xea, 0xd4, 0xbc, + 0x57, 0xe9, 0xb5, 0x3d, 0x03, 0xe4, 0xa6, 0x3d, 0xc4, 0x6b, 0xb3, 0x3d, 0x6e, + 0x60, 0x9f, 0x3d, 0xac, 0x31, 0xa0, 0x3d, 0xcf, 0xcc, 0xb5, 0x3d, 0xd0, 0x80, + 0xd6, 0x3d, 0xb9, 0x3f, 0x96, 0xbd, 0x2d, 0x17, 0x17, 0xbb, 0x6f, 0xf2, 0xe4, + 0xbd, 0x17, 0x51, 0x6e, 0x3d, 0xc2, 0xe2, 0xc2, 0x3d, 0xfe, 0x71, 0x59, 0x3d, + 0x0e, 0x1c, 0x78, 0xbd, 0xc9, 0xc7, 0xbc, 0xbd, 0x40, 0xb0, 0xa8, 0x3d, 0xbf, + 0xff, 0x42, 0xbd, 0xe4, 0x2e, 0x67, 0x3d, 0xca, 0x73, 0x81, 0xbd, 0x0b, 0x0d, + 0xf3, 0x3d, 0xce, 0x97, 0x70, 0x3d, 0xe9, 0x59, 0xe9, 0x3d, 0x45, 0x22, 0x73, + 0xbd, 0x24, 0xb8, 0xdf, 0x3d, 0x96, 0xbb, 0x3f, 0x3c, 0x02, 0xed, 0x65, 0x3d, + 0x84, 0x40, 0x25, 0x3c, 0x6c, 0xc5, 0xd2, 0x3c, 0xea, 0x38, 0x4a, 0x3d, 0xf9, + 0xa2, 0xc9, 0x3d, 0x6f, 0x30, 0xbc, 0x3a, 0x2d, 0xd5, 0x81, 0xbd, 0xd2, 0xae, + 0xa3, 0xbb, 0x8e, 0x91, 0xe7, 0x3c, 0x28, 0x6b, 0xc4, 0xbd, 0xf3, 0x0c, 0xbf, + 0xbc, 0x66, 0xf8, 0xd3, 0x3b, 0x6d, 0x3e, 0x01, 0x3d, 0xf3, 0xbf, 0xc2, 0xbc, + 0x0d, 0xc5, 0x6f, 0xbd, 0xb7, 0x9b, 0x9c, 0x3d, 0xeb, 0x79, 0x88, 0x3d, 0x81, + 0x8a, 0x7d, 0xbc, 0xde, 0x8b, 0x14, 0x3d, 0xa4, 0x3f, 0x7d, 0x3d, 0xb4, 0x27, + 0xa9, 
0x3d, 0xb7, 0x75, 0x51, 0x3d, 0xff, 0x73, 0x85, 0x3d, 0x3f, 0xf3, 0x51, + 0x3d, 0xe6, 0xdd, 0xe2, 0xbb, 0x83, 0xc7, 0x65, 0xbd, 0x6a, 0x16, 0xb6, 0xbd, + 0xcf, 0xe8, 0x90, 0x3d, 0x5b, 0xc8, 0xad, 0xbc, 0xa1, 0x27, 0x29, 0xbd, 0x57, + 0xbd, 0x3d, 0x3d, 0x61, 0x4e, 0x41, 0xbc, 0x21, 0x2f, 0x29, 0x3d, 0x55, 0x0b, + 0xba, 0x3d, 0xaa, 0x67, 0xf3, 0xba, 0x7d, 0x60, 0xe4, 0x3d, 0xab, 0xe7, 0x20, + 0xbd, 0x01, 0x71, 0x9f, 0x3d, 0x5a, 0xd5, 0x95, 0xbd, 0x2f, 0x75, 0xd5, 0x3d, + 0x7c, 0x91, 0xf6, 0x3d, 0xaa, 0xd6, 0x0c, 0x3d, 0x6d, 0x1c, 0xd9, 0xbd, 0xb4, + 0x4e, 0x82, 0xbc, 0x3f, 0x5a, 0x1a, 0x3b, 0xb4, 0x94, 0xfb, 0x3d, 0x0a, 0x71, + 0x3c, 0xbd, 0x97, 0xba, 0x12, 0xbc, 0xfd, 0x3d, 0x33, 0xbd, 0xa3, 0x4d, 0x01, + 0x3e, 0x54, 0xe2, 0x33, 0xbd, 0x8d, 0x32, 0x5d, 0x3d, 0x92, 0x84, 0xcb, 0x3d, + 0x91, 0x67, 0xde, 0xbd, 0x4b, 0xfd, 0xc7, 0xbd, 0x4b, 0x11, 0x04, 0xbe, 0x3e, + 0xde, 0xac, 0x3d, 0xe4, 0x9e, 0x3c, 0x3d, 0x5e, 0x7d, 0xfb, 0x3d, 0xfd, 0x4d, + 0xae, 0x3d, 0x63, 0xcf, 0x6f, 0xbd, 0xa0, 0x4f, 0x8b, 0x3d, 0x46, 0x2c, 0x84, + 0xbd, 0xda, 0x69, 0x11, 0x3b, 0xca, 0x5b, 0x1c, 0xbd, 0x59, 0x23, 0x26, 0x3e, + 0x16, 0xb1, 0x68, 0xbd, 0x1c, 0xd4, 0x98, 0xbd, 0x9c, 0x91, 0x6e, 0xbd, 0xa5, + 0xc6, 0x55, 0xbc, 0xd0, 0xf3, 0xcc, 0xbd, 0xe8, 0x91, 0xe0, 0xbd, 0xdf, 0xe3, + 0xb4, 0x3d, 0x04, 0x77, 0xc2, 0xbd, 0xcc, 0x21, 0xda, 0xbd, 0x7d, 0xed, 0x1d, + 0x3d, 0x1c, 0xa9, 0x0f, 0x3e, 0x25, 0x19, 0x67, 0x3d, 0xcc, 0x29, 0x65, 0xbd, + 0x34, 0x00, 0xdd, 0x3d, 0xe3, 0x04, 0x15, 0xbd, 0x79, 0xb8, 0x50, 0xbd, 0x98, + 0x5b, 0x44, 0xbc, 0x32, 0x55, 0xd1, 0x3d, 0x19, 0x20, 0x2a, 0xbd, 0xbd, 0x28, + 0xb6, 0x3c, 0x33, 0xf4, 0xc4, 0xbb, 0x95, 0x26, 0x9f, 0xbb, 0x93, 0xb7, 0x7f, + 0x3d, 0x16, 0xbc, 0x5f, 0x3d, 0x0a, 0x14, 0x82, 0x3c, 0x3a, 0x40, 0x12, 0x3e, + 0x99, 0x9c, 0xbe, 0x3c, 0x6c, 0x22, 0x72, 0x3d, 0xb3, 0x18, 0x10, 0xbe, 0x2b, + 0x6f, 0x4b, 0x3d, 0xaf, 0x83, 0x90, 0x3c, 0x67, 0x6b, 0x57, 0x3d, 0xae, 0xba, + 0x1d, 0xbd, 0x42, 0x58, 0xda, 0xbd, 0xcd, 0x16, 0xc6, 0xbd, 0x28, 0x11, 0xa1, + 0xbd, 0xc3, 0xfa, 0x6b, 0x3d, 0xff, 0x35, 0xc4, 0x3d, 0xca, 0x54, 0x9d, 0x3d, + 0x65, 0xc0, 0x0a, 0x3d, 0xbe, 0xbd, 0x73, 0xbc, 0xee, 0xf8, 0xfb, 0x3a, 0x88, + 0xcf, 0x2c, 0x3d, 0xa4, 0x2d, 0xb9, 0x3d, 0x30, 0xbf, 0x9c, 0xbd, 0x16, 0xf6, + 0x97, 0x3c, 0x72, 0xf4, 0x12, 0x3d, 0x4c, 0xc6, 0x01, 0xbd, 0x68, 0x2e, 0xc0, + 0xbd, 0x38, 0xd4, 0x2c, 0x3d, 0xe6, 0xb4, 0xbf, 0x3d, 0xf5, 0x15, 0x66, 0xbd, + 0x29, 0x0f, 0x83, 0x3d, 0x44, 0x2b, 0xb0, 0x3d, 0xa1, 0x53, 0xeb, 0x3d, 0xc6, + 0x86, 0x8a, 0x3d, 0xe0, 0x36, 0x48, 0xbd, 0x29, 0xff, 0x22, 0xbd, 0xff, 0x33, + 0xae, 0x3d, 0xa2, 0x5b, 0x13, 0xbd, 0x1d, 0x6f, 0x9e, 0x3d, 0x0e, 0x6d, 0x09, + 0x3d, 0x7f, 0x06, 0x01, 0xbe, 0xc8, 0x08, 0xc7, 0x3d, 0xc2, 0xe8, 0xae, 0x3d, + 0xe6, 0x4a, 0xc7, 0x3d, 0x29, 0x40, 0xb3, 0x3d, 0xb5, 0x99, 0x83, 0xbd, 0xa4, + 0x23, 0x8f, 0x3d, 0x4a, 0xa2, 0x9c, 0x3d, 0x0d, 0xe2, 0x04, 0x3d, 0x40, 0xff, + 0x07, 0x3d, 0xa4, 0x8c, 0x30, 0x3d, 0x75, 0x00, 0x1c, 0x3d, 0x45, 0x9b, 0x02, + 0x3e, 0xb2, 0xce, 0x2e, 0x3d, 0x16, 0x9d, 0x3f, 0xbd, 0x8e, 0xf1, 0x1b, 0xbc, + 0x9b, 0x59, 0x04, 0xbd, 0xae, 0xd7, 0xd3, 0x3d, 0x2b, 0x15, 0x05, 0x3b, 0x12, + 0xec, 0x5d, 0x3c, 0x30, 0xe9, 0xea, 0x3d, 0x58, 0xe5, 0xe4, 0xbd, 0x9b, 0x54, + 0x86, 0xbd, 0xf0, 0x47, 0x4e, 0xbd, 0x21, 0xa7, 0xef, 0x3b, 0x89, 0xf9, 0x23, + 0x3d, 0xec, 0x14, 0x48, 0xbd, 0xfc, 0x86, 0x20, 0x3e, 0x08, 0x69, 0x95, 0x3d, + 0x26, 0x08, 0xb6, 0xbd, 0xd9, 0xe2, 0xb3, 0xbd, 0x27, 0x6f, 0xf0, 0x3d, 0x9d, + 0xc4, 0x1c, 0xbe, 0x1a, 0x6e, 0x22, 0x3d, 0xc5, 0xe3, 0x68, 0x3d, 0x45, 0x2d, + 0x8a, 0xbb, 0xbe, 0xf3, 0x84, 0x3d, 
0x63, 0xef, 0x10, 0x3d, 0x54, 0xfa, 0xde, + 0x3c, 0x57, 0x4c, 0xc4, 0x3d, 0xa7, 0x44, 0x8b, 0xbd, 0x9e, 0xf0, 0x33, 0xbd, + 0x9a, 0x6c, 0x89, 0x3d, 0x6c, 0xc9, 0x21, 0xbe, 0x0e, 0x60, 0x9d, 0xbd, 0xd9, + 0x35, 0x1f, 0xbd, 0x0d, 0x4f, 0x9a, 0x3d, 0xd4, 0x24, 0xca, 0x3d, 0xc4, 0x5c, + 0x45, 0xbd, 0x28, 0x24, 0xea, 0x3c, 0xee, 0xea, 0xef, 0xbd, 0x4d, 0xae, 0x89, + 0x3d, 0x91, 0x99, 0x79, 0xbc, 0xb6, 0x1b, 0xc2, 0x3d, 0xcb, 0x8d, 0xb4, 0xbc, + 0x63, 0xaa, 0x7f, 0xbd, 0x19, 0xbc, 0xe6, 0xbc, 0x82, 0x28, 0x4e, 0xbd, 0xf4, + 0x7a, 0xbc, 0x3d, 0xe4, 0xe7, 0xcd, 0xbd, 0x2c, 0xe3, 0xda, 0xbd, 0xc6, 0x98, + 0xec, 0x3d, 0xd7, 0xfc, 0xf8, 0xbc, 0xd4, 0x80, 0x76, 0x3d, 0xbf, 0x17, 0x3e, + 0xbd, 0x20, 0x69, 0x48, 0x3a, 0x1c, 0x2c, 0xa2, 0x3d, 0xc2, 0x8b, 0x95, 0x3d, + 0xc4, 0xb5, 0xa9, 0x3d, 0x43, 0x5b, 0xde, 0xbc, 0xf1, 0x1e, 0x0f, 0xbd, 0x52, + 0x3e, 0xbb, 0x3d, 0xff, 0xaf, 0xfd, 0x3d, 0x66, 0x65, 0x59, 0x3d, 0x03, 0x95, + 0x55, 0x3d, 0x97, 0x22, 0x04, 0xbe, 0xcb, 0x24, 0x32, 0xbd, 0xf3, 0x26, 0xa5, + 0xbd, 0xaa, 0xd3, 0xdb, 0xbc, 0x75, 0x5b, 0x41, 0xbd, 0x2e, 0x2c, 0xc4, 0x3d, + 0xd5, 0x98, 0xc4, 0x3c, 0xa3, 0x19, 0x01, 0x3c, 0x4e, 0x3f, 0x3c, 0x3d, 0xea, + 0xee, 0x2d, 0xbd, 0x3f, 0x97, 0x13, 0xbc, 0xed, 0xdd, 0x55, 0x3d, 0x49, 0xba, + 0xfb, 0xbd, 0x5c, 0xbd, 0xc9, 0xbd, 0xe8, 0x9f, 0xad, 0x3d, 0x9c, 0x26, 0x32, + 0xbd, 0xf6, 0xfa, 0x15, 0xbe, 0x09, 0x88, 0xc0, 0xbd, 0xe2, 0xcc, 0xaf, 0xbd, + 0xdb, 0x22, 0x56, 0x3d, 0x78, 0x3f, 0x0f, 0xbc, 0x50, 0xe5, 0x93, 0xbd, 0x55, + 0x90, 0x09, 0x3d, 0xac, 0xec, 0x6d, 0xbd, 0x93, 0x0e, 0xce, 0xbc, 0x5b, 0xde, + 0x85, 0x3d, 0x08, 0x1d, 0x4b, 0x3d, 0x8f, 0x16, 0xf4, 0xbd, 0x89, 0xf8, 0x83, + 0xbd, 0x65, 0xf3, 0xf8, 0xbc, 0xe3, 0x37, 0x09, 0x3b, 0x37, 0x89, 0x91, 0xbc, + 0x69, 0xea, 0x2f, 0xbd, 0x2c, 0xf2, 0xbf, 0x3c, 0xd0, 0x57, 0xa7, 0x3d, 0xae, + 0x94, 0xbf, 0x3d, 0x15, 0x1d, 0x63, 0x3d, 0x53, 0x20, 0x4b, 0xbd, 0x4f, 0xf2, + 0x00, 0x3e, 0x29, 0x36, 0x54, 0xbd, 0x49, 0x2d, 0x8c, 0xbd, 0x29, 0xbc, 0xb6, + 0x3d, 0x08, 0xc4, 0xc7, 0x3d, 0xb6, 0x3d, 0xf9, 0xbd, 0x84, 0x0f, 0xa1, 0x3d, + 0xe8, 0x20, 0xb1, 0xbd, 0x8b, 0xf6, 0xa8, 0xbd, 0x51, 0xec, 0x75, 0x3d, 0x85, + 0xeb, 0x13, 0xbe, 0x5c, 0xe5, 0x4f, 0x3d, 0xe5, 0x90, 0xf3, 0xbc, 0x5a, 0xb0, + 0x39, 0xbd, 0xbf, 0x7a, 0x63, 0x3d, 0xa4, 0x35, 0x08, 0x3e, 0xae, 0x8a, 0xa6, + 0xbd, 0x4d, 0x53, 0x46, 0xbd, 0x8e, 0xb0, 0x46, 0xbc, 0x9d, 0x94, 0x15, 0x3d, + 0x6d, 0xdc, 0x62, 0x3c, 0x75, 0x33, 0x29, 0x3d, 0x61, 0xba, 0x3d, 0x3d, 0x0a, + 0xdb, 0x72, 0xbc, 0x18, 0x43, 0xdb, 0xbc, 0xb0, 0xca, 0x83, 0xbc, 0x33, 0x9b, + 0x12, 0xbe, 0xdb, 0x85, 0xb2, 0xbd, 0xe1, 0x52, 0xc7, 0xbd, 0xd6, 0xbc, 0x12, + 0xbd, 0x19, 0x0f, 0x90, 0xbc, 0x75, 0xb0, 0x4c, 0x3d, 0x91, 0x46, 0xd2, 0x3b, + 0xae, 0x95, 0x0e, 0x3d, 0x51, 0xa0, 0x74, 0x3d, 0x9b, 0x73, 0x90, 0xba, 0xec, + 0x61, 0x85, 0x3c, 0xaa, 0x01, 0xb7, 0x3d, 0x83, 0x19, 0x96, 0xbd, 0xeb, 0x6f, + 0xce, 0x3c, 0x46, 0x50, 0x15, 0xbe, 0x4c, 0x9d, 0xe2, 0xbb, 0xee, 0x86, 0x59, + 0xbb, 0xd9, 0xea, 0x8c, 0x3d, 0x5e, 0x80, 0x96, 0x3b, 0x9e, 0x36, 0xf2, 0x3d, + 0xfc, 0x4e, 0xa8, 0x3c, 0x67, 0x32, 0xb0, 0x3d, 0x93, 0xf9, 0x1a, 0x3d, 0x71, + 0x3b, 0xaa, 0xbd, 0xd4, 0xcf, 0x34, 0x3d, 0x93, 0x11, 0x84, 0xbd, 0x76, 0x9c, + 0xc7, 0x3d, 0x6b, 0xee, 0xd5, 0xbd, 0xb6, 0x03, 0xd8, 0x3d, 0xb8, 0x56, 0x53, + 0xbd, 0x61, 0x89, 0xab, 0xbd, 0x69, 0x71, 0x46, 0xbc, 0x79, 0x31, 0x81, 0xbd, + 0xa0, 0xaa, 0x9d, 0xbc, 0xab, 0x17, 0x0c, 0x3d, 0x31, 0xb8, 0x0a, 0x3d, 0xc3, + 0x40, 0xb4, 0xbd, 0xab, 0xb6, 0x97, 0x3d, 0xc1, 0x3a, 0x47, 0x3d, 0x31, 0xdc, + 0xdb, 0xbc, 0xb4, 0x23, 0x60, 0xbc, 0x9d, 0x47, 0x93, 0x3d, 0xc9, 
0x69, 0xa1, + 0x3d, 0xbb, 0x2f, 0x7a, 0x3d, 0x07, 0x8d, 0x91, 0x3d, 0x20, 0xdb, 0xca, 0x3d, + 0xf8, 0x44, 0xd3, 0xbd, 0x68, 0xfc, 0x66, 0xbc, 0xfa, 0xab, 0x29, 0x3d, 0xcb, + 0xb6, 0xa4, 0x3d, 0x9e, 0xbd, 0x06, 0x3d, 0xd1, 0x54, 0xb1, 0x3d, 0x06, 0x7e, + 0xcb, 0xbd, 0x24, 0x71, 0xc4, 0x3d, 0x08, 0x17, 0x40, 0x3d, 0x7a, 0xf7, 0xae, + 0xbd, 0xc0, 0x66, 0xc1, 0xbd, 0xfa, 0x2a, 0x22, 0xbd, 0xf0, 0x3d, 0xd2, 0xbc, + 0x2e, 0xc7, 0x71, 0xbd, 0xc5, 0x4f, 0xd0, 0xbd, 0xf7, 0x68, 0x85, 0xbd, 0xab, + 0xeb, 0x92, 0xbd, 0x5e, 0xb7, 0xe8, 0xbd, 0x66, 0xc1, 0xef, 0xbd, 0xb7, 0x07, + 0x06, 0xbd, 0x5b, 0x2f, 0x40, 0x3d, 0xd6, 0xb0, 0xa8, 0xbd, 0xb8, 0x1a, 0xe8, + 0x3d, 0x9f, 0xb7, 0xc4, 0x3d, 0x3c, 0xb5, 0x8f, 0xbd, 0x23, 0x9f, 0xbc, 0x3d, + 0xfd, 0x90, 0x88, 0xbd, 0xa2, 0xa9, 0x27, 0xbc, 0x41, 0xe4, 0xd7, 0xbd, 0x29, + 0x97, 0x07, 0xbd, 0xff, 0x72, 0x04, 0x3c, 0x56, 0x5a, 0x34, 0xbd, 0xf4, 0x8a, + 0x9d, 0xbd, 0x7e, 0x5d, 0x83, 0xbd, 0xd2, 0x00, 0x4e, 0x3d, 0xbe, 0x7e, 0x5d, + 0x3d, 0x03, 0xd1, 0x38, 0xbd, 0xb2, 0x2b, 0xbc, 0xbd, 0x04, 0xa8, 0x4d, 0x3d, + 0xa8, 0x0b, 0xaa, 0xbd, 0x84, 0x50, 0xac, 0xbd, 0x09, 0xef, 0xbf, 0xbc, 0xfa, + 0xb8, 0xb2, 0xbd, 0xeb, 0x7e, 0xd9, 0x3d, 0x54, 0x08, 0xda, 0xbd, 0x21, 0x24, + 0x61, 0xbd, 0xae, 0x1e, 0xae, 0xbd, 0xb4, 0x50, 0x3a, 0xbc, 0x2e, 0x07, 0xe9, + 0xbd, 0xec, 0xb1, 0x9d, 0xbd, 0x88, 0x5d, 0xca, 0xbc, 0x0c, 0x8a, 0x8c, 0x3d, + 0x58, 0x56, 0xf9, 0x3c, 0x57, 0x0f, 0xe7, 0x3d, 0xd4, 0xd9, 0x1c, 0xbd, 0x87, + 0xfe, 0x38, 0xbd, 0x1c, 0x08, 0x17, 0xbd, 0x72, 0xbb, 0xc1, 0xbc, 0x5b, 0xa9, + 0xf7, 0xba, 0xf2, 0xd5, 0x34, 0xbd, 0x71, 0x2f, 0x4b, 0xbd, 0x6a, 0xd6, 0xab, + 0xbd, 0x07, 0x81, 0xcd, 0x3d, 0x03, 0xf0, 0x2e, 0x3d, 0xcd, 0x20, 0xd4, 0xbd, + 0x0e, 0xf4, 0x3f, 0xbc, 0xf3, 0xed, 0xe1, 0x3d, 0xf6, 0xc4, 0x82, 0x3d, 0x0b, + 0x42, 0x48, 0x3d, 0xf9, 0xcd, 0x87, 0x3d, 0x91, 0x7d, 0x49, 0x3b, 0x9a, 0xc7, + 0x28, 0xbd, 0xf6, 0x02, 0xc3, 0x3d, 0x6e, 0x82, 0xa4, 0xbd, 0x41, 0x1f, 0xe7, + 0x3d, 0x44, 0x06, 0x76, 0x3d, 0x3b, 0xbc, 0xc1, 0x3b, 0x20, 0xf7, 0x7c, 0xbd, + 0x0d, 0x0d, 0xe0, 0xbd, 0x2b, 0xa5, 0xc5, 0x3d, 0x51, 0x84, 0x6f, 0xbd, 0xd0, + 0x24, 0x22, 0x3d, 0x33, 0x68, 0xb7, 0x3d, 0x37, 0x88, 0x87, 0x3d, 0x24, 0x04, + 0x98, 0xbd, 0x1b, 0xba, 0x04, 0xbd, 0x48, 0x09, 0xdf, 0x3b, 0xac, 0x9e, 0x3c, + 0xbd, 0x4b, 0xbf, 0x2c, 0x3c, 0x07, 0xba, 0xf4, 0xbd, 0x6e, 0x91, 0x84, 0x3d, + 0x99, 0x5a, 0x7e, 0x3c, 0x21, 0x9e, 0xeb, 0x3c, 0xde, 0x69, 0x18, 0x3d, 0x1f, + 0x8f, 0xaa, 0x3d, 0x09, 0x55, 0x08, 0xbd, 0x42, 0xf3, 0xe5, 0xbd, 0x61, 0x6b, + 0x82, 0xbd, 0xe1, 0xe2, 0xd2, 0x3d, 0x3f, 0xd1, 0xb6, 0x3d, 0xf9, 0xf5, 0xc7, + 0xbd, 0x47, 0x47, 0x90, 0xbd, 0x74, 0xa3, 0x42, 0xbd, 0xa5, 0xda, 0x3e, 0x3d, + 0xaf, 0x45, 0xc1, 0x3d, 0x68, 0x46, 0xe5, 0xbd, 0x79, 0x83, 0x31, 0x3d, 0x7e, + 0xd3, 0xce, 0x3c, 0xea, 0x30, 0xca, 0xbd, 0x00, 0xb0, 0xae, 0x3b, 0x66, 0x91, + 0xde, 0xbd, 0x0e, 0x11, 0xc0, 0xbd, 0xd0, 0x6a, 0x41, 0xbd, 0x6d, 0x7a, 0x8e, + 0xbd, 0x0a, 0xe2, 0x70, 0x3d, 0x7b, 0x4d, 0xcf, 0x3d, 0x2c, 0x2b, 0x3d, 0xbd, + 0x7e, 0xc3, 0x6f, 0xbd, 0xd0, 0x38, 0xac, 0x3c, 0xac, 0x35, 0xd0, 0xbd, 0x88, + 0x08, 0xe3, 0xbd, 0x78, 0x27, 0xbf, 0x3d, 0x80, 0x1e, 0xf8, 0xbc, 0x52, 0x7a, + 0x84, 0xbc, 0x77, 0x84, 0xbb, 0xbc, 0x22, 0xdf, 0x2b, 0x3d, 0xa8, 0x16, 0xe9, + 0xbd, 0xec, 0xab, 0xda, 0x3b, 0xb9, 0x2f, 0x9b, 0x3d, 0x28, 0x97, 0xd6, 0x3d, + 0x08, 0xde, 0x2c, 0xbc, 0x8a, 0x6c, 0x29, 0x3d, 0xdd, 0xfe, 0xa4, 0xbc, 0x13, + 0xb3, 0x4e, 0xbc, 0x4f, 0x72, 0x81, 0xbc, 0x33, 0x6c, 0xcc, 0x3d, 0x1c, 0xbc, + 0x76, 0xbc, 0xfd, 0xd7, 0x8f, 0xbd, 0x99, 0xfd, 0x53, 0xbd, 0x2c, 0x76, 0x80, + 0xbd, 0x65, 0x2e, 
0x1d, 0xbd, 0x9d, 0xd5, 0x8e, 0x3d, 0xeb, 0x16, 0xac, 0x3d, + 0xa6, 0x14, 0x3d, 0x3d, 0x75, 0x14, 0x97, 0x3d, 0x5e, 0x11, 0xf5, 0xbc, 0xca, + 0x20, 0x46, 0xbb, 0xb1, 0x04, 0xa1, 0xbd, 0x90, 0xcd, 0x3a, 0x3d, 0x70, 0xaf, + 0x01, 0xbe, 0x9d, 0xe3, 0xb2, 0xbd, 0xc3, 0xdf, 0x99, 0x3d, 0x20, 0x09, 0xab, + 0x3d, 0x35, 0x91, 0x06, 0xbd, 0x10, 0x3a, 0xa0, 0xbc, 0xc2, 0xd1, 0xad, 0x3d, + 0x60, 0x90, 0xe4, 0x3d, 0x9f, 0x47, 0xfd, 0x3c, 0x84, 0xa1, 0x5f, 0x3d, 0x06, + 0x5e, 0xf0, 0x3c, 0xab, 0x8c, 0x07, 0xbc, 0xf4, 0x6c, 0x16, 0x3d, 0x64, 0x06, + 0x04, 0xbe, 0xa8, 0x16, 0x85, 0x3d, 0xea, 0x1a, 0xa1, 0xbd, 0x0d, 0xb4, 0xdc, + 0xbd, 0xf4, 0x77, 0xc0, 0xbc, 0x5d, 0x03, 0x28, 0xbd, 0x29, 0x7d, 0xcc, 0xbc, + 0xae, 0x19, 0x9f, 0x3d, 0x09, 0x2a, 0xcd, 0x3d, 0xa4, 0x58, 0xaa, 0xbd, 0x6d, + 0xb8, 0xa9, 0x3c, 0xa1, 0xb7, 0xe6, 0xbd, 0xa9, 0x41, 0x9a, 0xbd, 0x69, 0xa4, + 0xab, 0x3c, 0xdd, 0x32, 0xa9, 0x3d, 0x19, 0x90, 0xd4, 0x3d, 0x52, 0xa8, 0xea, + 0xbd, 0x1e, 0x3d, 0xd4, 0x39, 0x84, 0x91, 0x03, 0xbe, 0xc9, 0x63, 0x3f, 0x3d, + 0x81, 0x1e, 0xe0, 0x3d, 0x05, 0xc5, 0x95, 0xbd, 0x2e, 0x1d, 0xc9, 0xbd, 0xf2, + 0x9c, 0x7c, 0xbc, 0x69, 0x19, 0xdb, 0xbc, 0x09, 0x3d, 0x6f, 0xbd, 0x58, 0x94, + 0xf8, 0x3d, 0x2c, 0x78, 0xb6, 0x3d, 0x96, 0xbe, 0xf8, 0x3d, 0x98, 0x4e, 0xb6, + 0x3d, 0x1a, 0xa0, 0x90, 0x3d, 0xa3, 0xeb, 0xd2, 0xbd, 0x4c, 0xfb, 0x2d, 0xbd, + 0xcb, 0xca, 0xa8, 0xbc, 0xa7, 0xca, 0x80, 0xbd, 0x65, 0xe2, 0x87, 0xbd, 0x9d, + 0x9a, 0x25, 0x3c, 0xc7, 0xf2, 0xcc, 0x3c, 0x38, 0x81, 0x48, 0xbd, 0xd3, 0x83, + 0xea, 0x3d, 0x4f, 0x72, 0xad, 0xbd, 0x6d, 0xef, 0x3f, 0xbc, 0x22, 0xc7, 0xbf, + 0xbc, 0xb6, 0x25, 0x64, 0x3c, 0x82, 0x76, 0x53, 0xbd, 0xd7, 0x9a, 0x89, 0x3c, + 0x01, 0xa7, 0x40, 0x3d, 0xbe, 0x03, 0x69, 0xbd, 0x5c, 0x79, 0x0e, 0xbe, 0xeb, + 0x87, 0x9f, 0xbd, 0x14, 0xa6, 0xad, 0x3c, 0x78, 0x6b, 0x25, 0x3d, 0xea, 0xa0, + 0xd7, 0x3d, 0x19, 0xb6, 0x22, 0xbd, 0xc6, 0xf6, 0xba, 0xbc, 0xe9, 0xd6, 0xe4, + 0x3c, 0x55, 0x68, 0x2a, 0xbd, 0xc0, 0x4c, 0xb0, 0xbc, 0xf5, 0xa5, 0x01, 0x3e, + 0x59, 0x9a, 0xd0, 0xbd, 0x4a, 0xb2, 0xfc, 0x3d, 0x3a, 0x59, 0x8f, 0x3d, 0x4a, + 0x0a, 0xb4, 0xbd, 0x7d, 0xc4, 0x63, 0x3d, 0xb6, 0xb8, 0xb9, 0x3d, 0xb0, 0x95, + 0x81, 0x3c, 0x2f, 0x7a, 0x32, 0x3d, 0x32, 0x87, 0xe4, 0xbc, 0xf0, 0xfc, 0xd5, + 0x3d, 0xfc, 0xe6, 0xf1, 0x3d, 0x04, 0x66, 0x98, 0x3c, 0x14, 0x23, 0x72, 0x3c, + 0xfe, 0x50, 0x95, 0x3d, 0xdf, 0xe6, 0x4c, 0x3d, 0x84, 0x80, 0x8e, 0x3d, 0x13, + 0xe8, 0x4c, 0xbd, 0xd4, 0xca, 0x83, 0xbd, 0x20, 0x86, 0xb0, 0xbd, 0xed, 0x66, + 0x89, 0x3c, 0x6a, 0x59, 0x19, 0xbd, 0xc2, 0x32, 0xc3, 0xbd, 0x04, 0x3f, 0x8d, + 0xbc, 0x51, 0xcc, 0x23, 0xbc, 0xb4, 0x4f, 0xa3, 0xbc, 0x30, 0x98, 0xc8, 0x3d, + 0x29, 0xaa, 0xd4, 0xbb, 0x5c, 0x7d, 0x88, 0xbd, 0x3a, 0xe9, 0xa9, 0xbd, 0xc3, + 0x4f, 0x40, 0xbd, 0x2d, 0x12, 0x49, 0xbd, 0x9e, 0x4e, 0x9a, 0xbd, 0xf1, 0xa9, + 0x84, 0xbd, 0x29, 0x09, 0x94, 0x3d, 0x98, 0x3c, 0xf0, 0x3d, 0x5f, 0xfe, 0x2a, + 0xbd, 0xd8, 0xa8, 0x46, 0xbd, 0xa1, 0xc8, 0x1c, 0xbb, 0x12, 0x3d, 0xbc, 0x3d, + 0x38, 0x39, 0x51, 0x3c, 0x3a, 0x00, 0x95, 0x3d, 0xd8, 0x2e, 0x67, 0x3c, 0x48, + 0x7e, 0xe0, 0xbd, 0x8c, 0x90, 0x79, 0x3c, 0xf2, 0x3d, 0x50, 0x3d, 0xbc, 0x2f, + 0xa1, 0x3c, 0xf9, 0xf0, 0x8a, 0x3d, 0x0e, 0x11, 0x30, 0x3c, 0x7c, 0xc8, 0xf8, + 0x3c, 0xe0, 0x88, 0x10, 0x3d, 0x4b, 0xaa, 0xbe, 0xbd, 0xa4, 0x0a, 0x5b, 0x3d, + 0xe2, 0x3c, 0x94, 0x3d, 0xdd, 0x36, 0x95, 0xbd, 0xc7, 0x70, 0x89, 0xbd, 0x95, + 0xe7, 0x89, 0x3d, 0x91, 0x0e, 0x23, 0x3c, 0xfe, 0x32, 0x4f, 0x3b, 0xd4, 0x79, + 0xc2, 0x3d, 0x52, 0xab, 0xb4, 0xbd, 0xb3, 0x98, 0xd2, 0x3d, 0xb8, 0x70, 0x88, + 0xbd, 0x2e, 0x3e, 0x77, 0x3d, 0xb5, 0x44, 0x00, 
0x3d, 0xb4, 0xe9, 0x59, 0x3d, + 0xae, 0x3b, 0x9d, 0x3d, 0x3d, 0x89, 0x36, 0x3d, 0x22, 0x67, 0x9b, 0xbb, 0xca, + 0xca, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0xcf, 0x02, + 0xcf, 0x3d, 0x6b, 0xe2, 0x84, 0x3d, 0x62, 0xaa, 0xdc, 0x3d, 0xdf, 0x55, 0xef, + 0x3b, 0xc1, 0x2b, 0x41, 0xbd, 0x6e, 0x82, 0xb3, 0xbd, 0x08, 0xc0, 0x6c, 0xbd, + 0x7c, 0xb9, 0x10, 0xbe, 0x97, 0x76, 0xbb, 0xbc, 0xa3, 0x52, 0x00, 0xbe, 0xd9, + 0x90, 0x32, 0xbe, 0xac, 0x38, 0x62, 0x3d, 0x6c, 0xdc, 0xae, 0xbc, 0x2a, 0x7d, + 0x01, 0xbe, 0x2f, 0xf8, 0x30, 0xbd, 0x8f, 0x24, 0x45, 0xbe, 0x0c, 0x74, 0x1f, + 0xbe, 0x5e, 0x0b, 0x0f, 0xbd, 0xf7, 0xb6, 0xc5, 0x3d, 0xe9, 0x3c, 0xbb, 0xbd, + 0x61, 0x11, 0x19, 0x3d, 0x68, 0xf0, 0x44, 0x3e, 0x26, 0x64, 0x95, 0x3c, 0xa1, + 0xde, 0x54, 0x3d, 0x25, 0x8b, 0x14, 0x3e, 0x0f, 0xed, 0xfe, 0x3b, 0x1b, 0x37, + 0xf4, 0xbd, 0x9e, 0x28, 0xbd, 0x3d, 0x26, 0x5c, 0xca, 0x3d, 0xbb, 0xad, 0x02, + 0x3d, 0x1f, 0xc1, 0x25, 0x3e, 0x85, 0x0a, 0x39, 0xbe, 0xfa, 0xc3, 0xf7, 0xbd, + 0xda, 0x75, 0xc6, 0xbd, 0x06, 0x2d, 0x4a, 0x3c, 0x1a, 0xc1, 0x94, 0xbd, 0xb0, + 0x62, 0xa0, 0xbd, 0x63, 0x0c, 0x0e, 0xbe, 0xf3, 0x67, 0x01, 0xbe, 0xd9, 0x42, + 0x48, 0xbe, 0xaa, 0xf0, 0xf6, 0xbd, 0xc7, 0xa6, 0x39, 0xbe, 0xf6, 0xef, 0xb2, + 0x3d, 0xe6, 0x6f, 0xd7, 0xbd, 0x14, 0x4f, 0xfb, 0xbc, 0x7f, 0xb1, 0x86, 0x3d, + 0xcc, 0xca, 0xd9, 0xbd, 0x34, 0x6f, 0x3e, 0xbc, 0x90, 0x24, 0xe8, 0x3d, 0xda, + 0x5a, 0xf9, 0x3d, 0x78, 0xc9, 0xf0, 0xbd, 0x1e, 0x50, 0xa5, 0x3d, 0xce, 0xed, + 0x6d, 0xbd, 0x65, 0x3b, 0x62, 0xbd, 0x52, 0x36, 0x3d, 0xbd, 0xf8, 0x54, 0x70, + 0x3d, 0x01, 0x85, 0x39, 0x3c, 0x57, 0xf0, 0xa8, 0xbc, 0xf5, 0x69, 0xda, 0xbd, + 0xd5, 0x00, 0xda, 0x3d, 0x47, 0x0a, 0xe6, 0x3d, 0xf1, 0xed, 0xae, 0xbd, 0x1b, + 0x51, 0x93, 0x3d, 0x25, 0x8d, 0x1e, 0x3e, 0x65, 0x36, 0x24, 0x3e, 0xab, 0x4e, + 0x3b, 0xbe, 0x73, 0x91, 0x7b, 0x3d, 0x79, 0x2a, 0xa6, 0x3c, 0x6e, 0x13, 0x29, + 0x3e, 0xae, 0x98, 0x8b, 0x3d, 0x61, 0xec, 0x36, 0xbe, 0xee, 0xd9, 0x8a, 0x3d, + 0xe8, 0xd8, 0xff, 0xbd, 0x87, 0xae, 0x13, 0xbe, 0x45, 0x02, 0xae, 0x3d, 0xbc, + 0x03, 0x94, 0xbd, 0xf6, 0x5b, 0x17, 0xbe, 0x3c, 0x46, 0x15, 0x3e, 0x99, 0xe3, + 0x3b, 0x3e, 0x6c, 0x0a, 0x82, 0xbd, 0x67, 0xb1, 0xb4, 0x3c, 0x68, 0xc6, 0x0a, + 0x3e, 0x7f, 0xe1, 0xa5, 0x3d, 0x38, 0x5c, 0x61, 0x3e, 0x0d, 0x37, 0xdd, 0xbd, + 0x14, 0xae, 0xff, 0xbc, 0x00, 0xba, 0x97, 0x3d, 0x61, 0xf4, 0xd7, 0x3c, 0xb9, + 0x7e, 0x0b, 0xbe, 0x87, 0xa5, 0x59, 0xbc, 0x01, 0x95, 0x19, 0x3c, 0x3e, 0xf3, + 0x72, 0xbd, 0x8b, 0x32, 0x0e, 0xbe, 0x8e, 0x5c, 0x30, 0x3e, 0xd1, 0x09, 0x10, + 0x3e, 0xfb, 0xc9, 0x13, 0x3e, 0x82, 0x6f, 0xe2, 0x3d, 0x71, 0xd7, 0xc8, 0xbd, + 0x57, 0x14, 0xbb, 0xbd, 0x0f, 0x10, 0x40, 0x3d, 0xa6, 0x30, 0x1e, 0x3d, 0xc8, + 0x3f, 0x4a, 0x3e, 0x06, 0xe9, 0x15, 0xbd, 0x8a, 0x87, 0x11, 0x3e, 0xe2, 0xa4, + 0x0b, 0xbe, 0xe5, 0x96, 0x3d, 0x3e, 0x5e, 0x78, 0x0c, 0x3e, 0x32, 0x79, 0x7a, + 0xba, 0x24, 0x9f, 0x1f, 0xbe, 0xe1, 0x2d, 0xc3, 0xbc, 0xdf, 0x43, 0xb4, 0xbd, + 0xb1, 0x00, 0xde, 0x3d, 0x7e, 0x34, 0x4b, 0xbe, 0xeb, 0x21, 0xdd, 0xbd, 0xbe, + 0x43, 0xe2, 0xbd, 0x4b, 0x49, 0x9f, 0x3d, 0xa3, 0xd0, 0x8e, 0x3d, 0xdf, 0x84, + 0x17, 0xbe, 0x12, 0x0b, 0xc8, 0xbd, 0xcb, 0x0e, 0x64, 0xbd, 0xdd, 0x25, 0x83, + 0xbd, 0xa0, 0x78, 0x1b, 0x3e, 0x2e, 0x77, 0x1e, 0xbe, 0x94, 0x81, 0xc8, 0xbd, + 0x8d, 0x3e, 0xba, 0xbd, 0xff, 0xe9, 0x32, 0x3e, 0xb0, 0x76, 0xb9, 0xbd, 0xfd, + 0x8a, 0x71, 0xbd, 0xab, 0xf3, 0x4c, 0xbc, 0x0c, 0xa0, 0x0c, 0x3e, 0xa2, 0x36, + 0xb2, 0xbc, 0x1b, 0x34, 0xb2, 0xbd, 0x44, 0x18, 0x8c, 0xbd, 0xa3, 0xe3, 0x83, + 0xbd, 0x45, 0x8c, 0xae, 0xbd, 0x4e, 0x7d, 0x09, 0xbe, 0xdf, 0x58, 0x19, 0xbd, + 
0xae, 0x8f, 0x5f, 0x3d, 0xa7, 0x36, 0x80, 0xbd, 0xfb, 0x12, 0x22, 0x3e, 0x25, + 0x11, 0x99, 0xbb, 0x51, 0xc9, 0x4a, 0x3d, 0x99, 0x68, 0x32, 0x3e, 0x44, 0xcc, + 0x7a, 0xbc, 0xa8, 0x46, 0xb7, 0x3d, 0x5f, 0xbb, 0x8a, 0xbd, 0xd3, 0xbb, 0x3a, + 0x3e, 0x46, 0x2c, 0x89, 0x3d, 0x26, 0xcb, 0x79, 0x3d, 0xe1, 0x45, 0x40, 0xbd, + 0x01, 0xc4, 0xe3, 0x3d, 0x42, 0x18, 0x24, 0x3e, 0x34, 0x73, 0x19, 0x3e, 0x00, + 0x53, 0xb7, 0x3d, 0x33, 0x6d, 0xf8, 0x3c, 0x2c, 0x5d, 0x3f, 0xbd, 0x85, 0xa9, + 0x1b, 0xbe, 0x18, 0xda, 0xb8, 0xbc, 0xaa, 0x92, 0xb4, 0x3d, 0x53, 0x65, 0x43, + 0x3e, 0x4f, 0xda, 0x03, 0xbd, 0xba, 0x8e, 0x40, 0xbe, 0xc1, 0x11, 0xb8, 0xbb, + 0x3e, 0x07, 0x66, 0x3e, 0xb8, 0x25, 0xe0, 0x3c, 0x7f, 0x4d, 0x0f, 0xbd, 0x35, + 0x57, 0xaa, 0xbd, 0xe5, 0x8b, 0xec, 0xbd, 0x70, 0xda, 0x08, 0xbc, 0x03, 0xc2, + 0xf5, 0xbb, 0xa5, 0x57, 0x83, 0xbd, 0xf1, 0x0b, 0x74, 0x3e, 0x9a, 0x63, 0x5a, + 0xbd, 0x8f, 0xb3, 0xa1, 0xbb, 0xe3, 0x0a, 0xd1, 0x3c, 0xa8, 0xc3, 0xfd, 0x3d, + 0x58, 0x80, 0x04, 0xbe, 0xfb, 0xca, 0xe0, 0x3d, 0x01, 0x75, 0x04, 0xbe, 0xbe, + 0xa9, 0x55, 0xbd, 0x59, 0x90, 0xff, 0xbd, 0x6a, 0xf0, 0x64, 0xbd, 0x89, 0xdc, + 0x1d, 0xbe, 0xb8, 0x8f, 0x26, 0xbd, 0x3b, 0x31, 0xc8, 0xbd, 0x2c, 0x3d, 0x88, + 0xbd, 0x48, 0xea, 0x0f, 0xbd, 0xce, 0x3f, 0x22, 0x3d, 0x8b, 0x31, 0xe7, 0x3d, + 0xa1, 0x13, 0x55, 0xbd, 0x2a, 0x96, 0xcc, 0x3d, 0xa1, 0xd9, 0xcf, 0x3d, 0x9f, + 0x0f, 0xcf, 0x3c, 0xac, 0x8b, 0xa4, 0xbc, 0x88, 0x69, 0xb6, 0x3d, 0x35, 0x40, + 0xc8, 0x3d, 0x5a, 0x6e, 0x23, 0xbe, 0x5f, 0xd9, 0x17, 0xbe, 0x4b, 0x8e, 0x9f, + 0xbd, 0x44, 0xeb, 0x15, 0xbe, 0xe9, 0x93, 0xba, 0x3d, 0x4b, 0x93, 0x08, 0xbe, + 0x79, 0x4d, 0x09, 0x3e, 0x5a, 0x98, 0x6d, 0xbd, 0x02, 0x95, 0x24, 0xbe, 0x80, + 0x67, 0x9d, 0xbd, 0xd2, 0x10, 0x1f, 0xbe, 0x64, 0xd2, 0x62, 0xbd, 0x01, 0x92, + 0x09, 0x3e, 0x96, 0x6e, 0xca, 0xbd, 0x62, 0x32, 0xf3, 0xbd, 0xe1, 0x10, 0x50, + 0x3d, 0x61, 0x3e, 0xdc, 0x3d, 0x7e, 0x6e, 0xd5, 0xbd, 0xf4, 0xea, 0x1f, 0x3e, + 0x2a, 0xd2, 0x10, 0xbd, 0x04, 0xa4, 0xdd, 0x3b, 0x7f, 0x19, 0x50, 0xbd, 0xad, + 0x49, 0x0e, 0x3e, 0x63, 0x14, 0xe3, 0x3d, 0x6f, 0x2d, 0x99, 0x3d, 0x4a, 0x0b, + 0x08, 0xbe, 0xd6, 0x54, 0xdd, 0xbd, 0xfb, 0x6b, 0x9e, 0xbd, 0xc0, 0x42, 0xe9, + 0xbd, 0xba, 0xef, 0x40, 0xbb, 0x9c, 0x44, 0xc5, 0x3d, 0x1e, 0x3a, 0xde, 0xbd, + 0xce, 0x6d, 0xef, 0x3d, 0x92, 0x4d, 0xf6, 0xbd, 0xa3, 0xc5, 0x0c, 0xbe, 0x74, + 0x63, 0xd8, 0xbd, 0xff, 0xd4, 0x11, 0x3e, 0x02, 0x10, 0x28, 0xbd, 0x86, 0xf5, + 0x4f, 0x3d, 0x6a, 0xfb, 0xc6, 0x3d, 0x6d, 0x29, 0x1f, 0xbe, 0xa4, 0x55, 0xab, + 0x3d, 0xaa, 0xc8, 0xc7, 0x3d, 0xf4, 0xec, 0x59, 0x3d, 0xd1, 0x44, 0x75, 0x3d, + 0xe6, 0x18, 0x3c, 0x3e, 0xd7, 0x83, 0xb5, 0x3d, 0xdc, 0xa3, 0xb1, 0xbd, 0xbb, + 0xa7, 0x73, 0xbd, 0x03, 0x00, 0x3c, 0x3d, 0x3b, 0x59, 0x8d, 0xbd, 0x27, 0x1f, + 0x07, 0xbe, 0x46, 0x5f, 0xcf, 0xbd, 0x5b, 0xf5, 0x13, 0xbe, 0xe9, 0xa9, 0x1b, + 0x3e, 0x05, 0x6e, 0x0e, 0x3e, 0xd2, 0xa7, 0xad, 0xbc, 0x55, 0xda, 0x12, 0x3e, + 0xd4, 0xd5, 0xcc, 0xbd, 0x5e, 0x0d, 0x33, 0xbe, 0x5f, 0xfa, 0x99, 0xbd, 0xa1, + 0xd4, 0x96, 0xbd, 0x7b, 0xec, 0x08, 0x3d, 0xf0, 0x43, 0x04, 0xbe, 0xd6, 0x6a, + 0x3e, 0x3d, 0x9c, 0x4c, 0xa5, 0xbd, 0xc1, 0x25, 0xeb, 0x3c, 0x00, 0x84, 0x7f, + 0xbd, 0x8e, 0x5b, 0x2d, 0xbd, 0x5a, 0x0d, 0x93, 0x3c, 0x14, 0x09, 0x5e, 0x3d, + 0x0e, 0x7c, 0x25, 0x3d, 0x4b, 0x3f, 0x0f, 0xbe, 0xad, 0x31, 0xd8, 0xbd, 0x81, + 0xa4, 0x66, 0xbd, 0x25, 0x37, 0x32, 0xbe, 0x64, 0x42, 0x6f, 0x3d, 0x9c, 0xdb, + 0xc2, 0x3d, 0x1f, 0x78, 0xcc, 0x3c, 0x45, 0xa8, 0x0c, 0x3e, 0xe8, 0x27, 0xe3, + 0x3d, 0xbf, 0xb1, 0xff, 0x3d, 0x3e, 0x13, 0xc6, 0x3d, 0xf2, 0x5b, 0x64, 0x3d, + 0xf1, 0xf8, 0x16, 0x3e, 0x24, 
0x46, 0x40, 0x3d, 0xa1, 0x7e, 0x99, 0x3c, 0x6d, + 0x30, 0x1e, 0xbe, 0x04, 0xdd, 0x2a, 0xbe, 0x03, 0x25, 0x20, 0xbd, 0x07, 0xf4, + 0x74, 0xbc, 0xc8, 0x71, 0x03, 0xbd, 0x46, 0xf3, 0xd9, 0xbc, 0x33, 0x6d, 0xbb, + 0xbd, 0xbd, 0x8a, 0xd5, 0x3d, 0x68, 0xbd, 0x9e, 0xbc, 0x1c, 0x26, 0x09, 0xbe, + 0x0f, 0x3c, 0x9d, 0xbd, 0xde, 0x13, 0x53, 0xbd, 0x73, 0xe9, 0x90, 0x3d, 0xdc, + 0x50, 0xef, 0x3c, 0x6f, 0x00, 0x32, 0xbc, 0x42, 0x79, 0x18, 0x3e, 0xa8, 0xe4, + 0xb3, 0xbd, 0x04, 0x2f, 0x6e, 0xbd, 0x41, 0xb2, 0x51, 0x3e, 0x56, 0x54, 0xe7, + 0x3d, 0x0c, 0x44, 0xbb, 0xbd, 0xa4, 0xce, 0x8b, 0x3c, 0xad, 0x8a, 0xec, 0x3d, + 0xf7, 0xc9, 0x44, 0xbd, 0xc5, 0xdc, 0x2a, 0x3b, 0xde, 0x9e, 0xb6, 0x3d, 0x20, + 0x2c, 0x1c, 0xbe, 0x04, 0x0c, 0x9f, 0xbd, 0x41, 0x5f, 0xd4, 0xbd, 0x76, 0x92, + 0x06, 0xbe, 0x6a, 0x98, 0x30, 0xbe, 0xc4, 0xa0, 0xd3, 0x3c, 0x38, 0x33, 0xf5, + 0xbd, 0x94, 0x28, 0x0d, 0xbd, 0x42, 0x60, 0x1e, 0x3d, 0xfd, 0x72, 0xca, 0x3d, + 0xee, 0xf6, 0x0d, 0x3e, 0x35, 0xb3, 0x27, 0x3e, 0x15, 0xde, 0x08, 0xbe, 0x34, + 0xc4, 0x8b, 0xbd, 0x4a, 0x4f, 0x9a, 0x3d, 0x87, 0x8f, 0x06, 0xbc, 0x68, 0x43, + 0x10, 0xbd, 0x36, 0x40, 0xb6, 0xbc, 0xf2, 0xad, 0x82, 0xbd, 0xc5, 0xef, 0x13, + 0xbe, 0x4c, 0x38, 0xcd, 0xbd, 0x4a, 0xdf, 0x9d, 0x3c, 0x9d, 0xb0, 0x9a, 0x3d, + 0xe8, 0xf7, 0xd4, 0x3d, 0x9d, 0x50, 0x34, 0x3d, 0xc9, 0x92, 0xdf, 0x3d, 0x20, + 0x66, 0xeb, 0x3d, 0x54, 0x5c, 0x85, 0xbd, 0x2d, 0x0e, 0xc6, 0x3d, 0x90, 0xea, + 0x64, 0xbd, 0xcd, 0xa5, 0x5c, 0xbd, 0x77, 0x8d, 0x7b, 0x3d, 0xf7, 0xda, 0x98, + 0xbd, 0xc2, 0x98, 0xcb, 0x3d, 0x79, 0xa4, 0x2d, 0x3d, 0x52, 0x42, 0x15, 0x3e, + 0xc5, 0x68, 0x47, 0xbd, 0xbf, 0xa0, 0xe7, 0xbd, 0xbf, 0xa4, 0xbd, 0x3b, 0x6f, + 0xe3, 0x05, 0xbd, 0xd3, 0xda, 0xdb, 0xbd, 0x40, 0x3a, 0xa8, 0xbd, 0x87, 0x88, + 0x36, 0xbe, 0xaf, 0x1d, 0xe5, 0x3d, 0xf6, 0xe8, 0x2e, 0xbe, 0xbc, 0x78, 0x9b, + 0x3d, 0x8b, 0x27, 0xf6, 0xbd, 0x18, 0x45, 0xef, 0xbd, 0x8c, 0x3f, 0x3e, 0x3e, + 0x94, 0x69, 0x16, 0xbe, 0x4f, 0xce, 0x48, 0xbe, 0x0c, 0xfa, 0x0b, 0xbc, 0x01, + 0x50, 0x37, 0x3e, 0x87, 0x13, 0x0b, 0xbe, 0xd0, 0xb1, 0x38, 0x3e, 0x71, 0x2c, + 0xa1, 0x3d, 0x4a, 0x15, 0xb4, 0xbd, 0x80, 0x28, 0x2b, 0xbd, 0xc7, 0x3d, 0x7e, + 0x3c, 0xe5, 0xe1, 0xf1, 0x3d, 0x43, 0x56, 0x2c, 0x3d, 0x18, 0xba, 0x20, 0xbe, + 0x4e, 0x30, 0x8d, 0x3d, 0x0b, 0x52, 0x20, 0x3b, 0x2d, 0xbc, 0x48, 0xbd, 0xf8, + 0xff, 0xcf, 0xbb, 0x34, 0xb2, 0xaf, 0x3c, 0xea, 0xad, 0xf0, 0x3d, 0xed, 0xbd, + 0x8d, 0x3d, 0x41, 0x8c, 0xde, 0xbd, 0xb0, 0xb4, 0x32, 0x3e, 0xf8, 0x16, 0x2e, + 0xbe, 0x0c, 0x4a, 0x8c, 0x3d, 0x89, 0x92, 0x13, 0x3e, 0x8b, 0xd2, 0xbb, 0xbd, + 0xf5, 0xce, 0x0f, 0x3e, 0x31, 0x82, 0x7b, 0xbb, 0x7f, 0xac, 0x0e, 0x3e, 0x9f, + 0xe7, 0x0a, 0xbe, 0x5b, 0xef, 0x2b, 0x3d, 0xa9, 0x7f, 0x0d, 0x3e, 0xa4, 0xc0, + 0xde, 0x3d, 0xde, 0x0d, 0xbc, 0xbc, 0x59, 0x6f, 0x81, 0x3a, 0x46, 0x0c, 0x1b, + 0xbe, 0xd0, 0xba, 0xf5, 0xbc, 0xe5, 0x6d, 0x1d, 0x3e, 0x31, 0x08, 0x5a, 0x3d, + 0xab, 0x1c, 0xb5, 0xbc, 0xe7, 0xaa, 0x18, 0x3e, 0xaa, 0xcc, 0x14, 0x3e, 0x4e, + 0x1e, 0x08, 0xbd, 0xfc, 0x9f, 0xbe, 0xbd, 0x44, 0x7b, 0x2b, 0xbe, 0xf1, 0xfa, + 0x90, 0x3c, 0xa4, 0x75, 0x16, 0xbe, 0x27, 0x3b, 0x05, 0xbe, 0xf3, 0x41, 0xde, + 0xbd, 0xb9, 0x96, 0x10, 0xbd, 0xd0, 0x44, 0x6a, 0x3b, 0x5b, 0x04, 0x02, 0xbe, + 0x3c, 0xf7, 0x41, 0xbd, 0xe6, 0xaf, 0x06, 0xbe, 0x52, 0x74, 0x08, 0x3e, 0xda, + 0x81, 0x54, 0x3d, 0xcd, 0xe8, 0xbc, 0x3d, 0xf8, 0x07, 0xdc, 0x3d, 0x84, 0x6f, + 0xd8, 0xbd, 0xe0, 0x65, 0x2a, 0x3e, 0x04, 0xae, 0xe1, 0xbd, 0x34, 0xd5, 0x27, + 0xbd, 0x5c, 0xb4, 0x70, 0xbd, 0x0d, 0x68, 0xfa, 0x3d, 0x04, 0xb0, 0xc5, 0xbd, + 0xa0, 0xf7, 0x87, 0x3d, 0xdc, 0x08, 0x18, 0x3e, 0x86, 0xb9, 
0x0f, 0xbe, 0x21, + 0x03, 0x75, 0x3d, 0x2b, 0x4f, 0x15, 0xbd, 0x3c, 0x86, 0x8e, 0xbc, 0xc7, 0xd0, + 0x73, 0x3d, 0xe0, 0x50, 0x37, 0x3c, 0xd6, 0x8d, 0xce, 0x3d, 0x3b, 0x42, 0x1b, + 0x3e, 0xa9, 0xfc, 0x29, 0x3e, 0xe4, 0x58, 0x1d, 0x3d, 0x5d, 0xab, 0x3b, 0xbe, + 0x28, 0x32, 0x07, 0xbd, 0x54, 0x37, 0x9c, 0x3d, 0xd4, 0xdd, 0x04, 0x3d, 0x28, + 0xe1, 0xad, 0xbc, 0x98, 0x0e, 0x13, 0x3e, 0xae, 0x57, 0x2a, 0xbe, 0xc4, 0xf0, + 0x70, 0xbd, 0xf9, 0x8d, 0x0d, 0xbe, 0x5e, 0x46, 0x17, 0xbe, 0x90, 0x6a, 0xbc, + 0x3d, 0x12, 0xa1, 0xf3, 0xbd, 0x0f, 0xf9, 0x88, 0xbd, 0x60, 0xd9, 0x2f, 0xbd, + 0x07, 0x99, 0xa2, 0xbd, 0x0b, 0xa5, 0x1b, 0xbc, 0x92, 0x9d, 0xaf, 0xbc, 0x37, + 0xf5, 0x5a, 0x3c, 0x88, 0xf0, 0xcf, 0x3d, 0x96, 0xdd, 0x54, 0x3d, 0x2f, 0xd2, + 0x0a, 0x3e, 0xe5, 0xbd, 0x46, 0x3c, 0xd2, 0x65, 0xcb, 0xbd, 0x19, 0x00, 0x0b, + 0xbe, 0xd6, 0xf6, 0xb0, 0x3d, 0x39, 0xc2, 0x14, 0x3e, 0x44, 0x63, 0x3f, 0x3e, + 0x4a, 0x6c, 0x1d, 0x3e, 0xf3, 0x6a, 0xe1, 0xbc, 0x31, 0xa5, 0x28, 0xbe, 0x54, + 0x4d, 0x49, 0xbd, 0xd4, 0xbf, 0x64, 0xbd, 0xec, 0x58, 0xbc, 0xbd, 0xff, 0xc6, + 0xd0, 0x3c, 0xb7, 0xf1, 0xa7, 0x3d, 0x55, 0x15, 0x26, 0xbd, 0xe6, 0x14, 0xe2, + 0x3c, 0x6b, 0x28, 0x05, 0x3e, 0x83, 0xaf, 0xbc, 0xbd, 0xc6, 0xb7, 0x6a, 0x3d, + 0x6f, 0xa9, 0x01, 0x3e, 0x93, 0x78, 0x62, 0xb9, 0x23, 0x46, 0x3f, 0xbd, 0x89, + 0xbd, 0x88, 0x3d, 0x4d, 0xeb, 0xa0, 0x3d, 0x5e, 0x68, 0x74, 0xbd, 0x3d, 0xe2, + 0x86, 0xbd, 0x11, 0x15, 0x62, 0xbd, 0x01, 0xde, 0xc8, 0xbd, 0xf0, 0x96, 0xc0, + 0xbd, 0xf4, 0x9d, 0xff, 0xbd, 0x04, 0xcb, 0x80, 0x3c, 0x4f, 0x43, 0x35, 0x3d, + 0x65, 0x45, 0x6c, 0x3d, 0x45, 0x55, 0xaa, 0xbc, 0xe1, 0x1a, 0x59, 0x3d, 0x4c, + 0x54, 0x20, 0xbe, 0x35, 0xaf, 0xe3, 0x3d, 0xd2, 0x5e, 0xae, 0xbd, 0xa7, 0xaa, + 0x15, 0x3e, 0xea, 0x3c, 0xe9, 0x3c, 0xa4, 0xc9, 0x08, 0xbe, 0xca, 0xec, 0x82, + 0x3b, 0x8b, 0x49, 0xfa, 0xbd, 0x9d, 0x1e, 0x8b, 0xbc, 0x1b, 0xb4, 0xed, 0xbd, + 0x1d, 0xbe, 0xc9, 0x3d, 0x8c, 0xdf, 0x2a, 0xbe, 0x8c, 0xba, 0xe3, 0x3d, 0x1f, + 0xa2, 0x14, 0x3d, 0x61, 0xf2, 0xcf, 0xba, 0xd5, 0x67, 0x88, 0xbd, 0xa7, 0xd0, + 0x5d, 0x3e, 0x71, 0x6e, 0xfd, 0x3d, 0xd5, 0xcf, 0x02, 0xbd, 0x0c, 0x25, 0xb5, + 0x3c, 0xa6, 0x27, 0x90, 0x3c, 0x86, 0x80, 0x1c, 0x3e, 0x41, 0x4f, 0x02, 0xbe, + 0xe1, 0x7a, 0x28, 0x3e, 0xef, 0xf7, 0x96, 0xbd, 0x0f, 0x11, 0xd3, 0x3d, 0xd9, + 0x11, 0x00, 0x3e, 0x77, 0x16, 0x98, 0x3d, 0x6a, 0xbc, 0x03, 0xbe, 0xbc, 0x2b, + 0xc9, 0xbd, 0xc0, 0xc5, 0x99, 0x3d, 0xf4, 0x17, 0xc9, 0x3d, 0x37, 0xc7, 0xea, + 0x3d, 0xd0, 0x01, 0x29, 0xbe, 0xae, 0xfd, 0x37, 0xbd, 0x7a, 0xce, 0xba, 0xbc, + 0x7d, 0x16, 0x19, 0x3e, 0x2b, 0x5f, 0x32, 0x3a, 0x54, 0x01, 0x96, 0xbd, 0xd6, + 0xb6, 0x73, 0x3c, 0x8f, 0x5c, 0xa9, 0x3c, 0x67, 0x4e, 0xac, 0x3d, 0x52, 0x49, + 0xab, 0x3d, 0x05, 0x07, 0x29, 0x3e, 0x43, 0x4c, 0x28, 0xbe, 0x0c, 0x1a, 0x12, + 0xbe, 0x05, 0x18, 0x3c, 0x3c, 0x29, 0x0f, 0x22, 0x3e, 0xf3, 0x49, 0x54, 0x3e, + 0xbf, 0xcd, 0x46, 0x3d, 0xea, 0x9f, 0x53, 0x3d, 0xf6, 0xcc, 0xb5, 0x3d, 0x80, + 0x51, 0x9e, 0x3d, 0xff, 0xc1, 0x69, 0x3d, 0x94, 0x19, 0x41, 0xbd, 0x7b, 0x33, + 0x75, 0x3c, 0x9e, 0x51, 0x2f, 0x3e, 0x58, 0x6e, 0x21, 0x3c, 0x46, 0x38, 0x22, + 0x3e, 0x73, 0xf9, 0x15, 0xbe, 0xfa, 0x12, 0x04, 0xbe, 0xaf, 0x1d, 0x1e, 0xbe, + 0xad, 0x03, 0x11, 0xbe, 0xb3, 0xa7, 0x07, 0x3d, 0x4b, 0x76, 0x58, 0xbd, 0x68, + 0xaa, 0x21, 0xbe, 0x18, 0xb3, 0x24, 0xbe, 0x59, 0xa7, 0x9d, 0xbd, 0x8a, 0x64, + 0x92, 0x3d, 0xf4, 0xe8, 0x00, 0xbe, 0xed, 0xd4, 0x85, 0x3c, 0x77, 0x84, 0xf0, + 0xbd, 0x3f, 0x0d, 0x37, 0x3e, 0x2c, 0x42, 0x64, 0x3c, 0x5b, 0x23, 0x27, 0x3e, + 0x3e, 0xc6, 0xb0, 0x3d, 0x1c, 0xba, 0xfe, 0xbc, 0xcf, 0xde, 0xb4, 0xbc, 0x97, + 0x05, 0x1c, 
0xbd, 0x0d, 0xa5, 0x92, 0xbb, 0x6a, 0x79, 0x50, 0x3e, 0x62, 0x30, + 0x19, 0x3e, 0xd7, 0x23, 0x02, 0x3e, 0x9d, 0xc1, 0x7e, 0x3d, 0xb5, 0x03, 0x9c, + 0xbd, 0x7b, 0xc5, 0x72, 0x3d, 0xc3, 0xd4, 0x22, 0xbe, 0x55, 0x27, 0x63, 0x3d, + 0xb7, 0x8f, 0x2e, 0xbe, 0x18, 0xe1, 0xbd, 0xbd, 0xa9, 0x10, 0xf0, 0xbd, 0x51, + 0xd4, 0x4d, 0x3d, 0x62, 0x08, 0xe2, 0x3d, 0x3b, 0xf4, 0x5e, 0x3d, 0xa1, 0xeb, + 0xb4, 0x3d, 0xed, 0x6f, 0x72, 0x3d, 0x1c, 0x3b, 0xba, 0xbd, 0x56, 0xa6, 0xc8, + 0xbd, 0x1e, 0x39, 0x3b, 0xbe, 0x83, 0xc7, 0xb4, 0x3d, 0x04, 0xe6, 0xd6, 0x3d, + 0x2a, 0x2c, 0x91, 0x3d, 0x78, 0x72, 0x9f, 0x3d, 0x62, 0xf9, 0xdd, 0xbd, 0x21, + 0x97, 0x28, 0xbe, 0x52, 0xaa, 0x06, 0x3e, 0x55, 0x9e, 0x26, 0xbe, 0xb0, 0x2a, + 0x4f, 0xbd, 0x72, 0x66, 0xeb, 0x3c, 0xa8, 0x84, 0xed, 0x3d, 0x02, 0xca, 0xaf, + 0xbd, 0xbd, 0x90, 0x64, 0xbd, 0x91, 0xd5, 0x81, 0xbd, 0xcd, 0x4a, 0x24, 0x3e, + 0x57, 0x13, 0x44, 0xbd, 0x35, 0x93, 0x1b, 0xbb, 0x9e, 0x75, 0xe0, 0x3d, 0x86, + 0xfb, 0x25, 0xbe, 0x7a, 0xe1, 0xe5, 0x3d, 0x15, 0x97, 0x28, 0x3d, 0xa5, 0x78, + 0xe4, 0x3d, 0x22, 0xf8, 0x0d, 0x3d, 0x18, 0xbb, 0xcb, 0xbc, 0xfc, 0x53, 0x99, + 0xbd, 0xd5, 0x40, 0xcc, 0xbd, 0x2e, 0x47, 0xf6, 0x3d, 0xd0, 0x5c, 0x1c, 0xbb, + 0xac, 0x38, 0xb3, 0x3c, 0x25, 0xfd, 0x8e, 0x3c, 0xd0, 0xc9, 0x4c, 0xbd, 0x37, + 0xc4, 0xfe, 0xbd, 0x1d, 0xca, 0x17, 0xbe, 0x54, 0x50, 0x8f, 0xbd, 0xc1, 0xfb, + 0xed, 0xbd, 0xb9, 0x2f, 0x24, 0x3e, 0xc0, 0x6d, 0x1c, 0xbe, 0xe2, 0xd7, 0x95, + 0x3d, 0x21, 0xa6, 0x7c, 0x3d, 0x1b, 0x02, 0x3c, 0x3d, 0xc6, 0x73, 0x4b, 0x3d, + 0x28, 0x7a, 0xcf, 0x3d, 0x6c, 0x4f, 0xf5, 0x3c, 0x0a, 0x47, 0x88, 0xbd, 0xe1, + 0xc9, 0x39, 0xbe, 0x0d, 0x2d, 0x04, 0x3c, 0x80, 0xf8, 0xd7, 0xbb, 0x8e, 0xa6, + 0xf3, 0xbd, 0x10, 0x3c, 0xe1, 0x3d, 0xde, 0x10, 0xb2, 0xbd, 0x9c, 0x3f, 0x46, + 0xbd, 0xd4, 0x42, 0x01, 0x3e, 0x63, 0x0f, 0x82, 0x3d, 0xab, 0x71, 0xe9, 0xbd, + 0x06, 0xe4, 0x11, 0x3e, 0x12, 0x15, 0x0a, 0xbe, 0x46, 0x0a, 0x5a, 0xbd, 0x83, + 0xff, 0x9a, 0xbc, 0xe4, 0x96, 0xdc, 0xbd, 0xc7, 0xaf, 0x7a, 0x3d, 0x64, 0x84, + 0xbe, 0x3d, 0x90, 0x0c, 0x04, 0xbd, 0xb4, 0x26, 0xb1, 0xbc, 0x35, 0xf6, 0x23, + 0x3e, 0x81, 0x0c, 0x89, 0xbd, 0x8a, 0xe7, 0xd7, 0xbc, 0x3b, 0xce, 0xa5, 0x3d, + 0xc1, 0x40, 0x83, 0x3d, 0x44, 0x14, 0x9a, 0x3d, 0xeb, 0x57, 0xbe, 0x3c, 0xde, + 0x7c, 0x01, 0x3d, 0xa0, 0x13, 0xe4, 0xbc, 0x54, 0xae, 0xca, 0x3d, 0x9d, 0xd5, + 0xc7, 0x3b, 0x59, 0x7b, 0xfc, 0xbd, 0xae, 0x12, 0x00, 0x3e, 0x79, 0xac, 0x07, + 0x3e, 0x40, 0x9b, 0x83, 0xbd, 0x7b, 0xb9, 0xeb, 0xbb, 0x12, 0x58, 0xf6, 0x3d, + 0x10, 0x80, 0x8c, 0xbd, 0x73, 0x18, 0xc8, 0xbd, 0x5e, 0x85, 0xbc, 0xbd, 0xf4, + 0x7c, 0xd0, 0xbd, 0x3b, 0x06, 0x66, 0xbd, 0x88, 0xaf, 0x82, 0xbc, 0x43, 0x81, + 0x80, 0x3d, 0x03, 0x7a, 0x20, 0x3e, 0xc1, 0x44, 0xd1, 0x3c, 0x2f, 0xa0, 0x76, + 0x3d, 0x63, 0x3e, 0x06, 0x3c, 0x80, 0xb6, 0xa4, 0x3d, 0x6d, 0x3d, 0x20, 0x3e, + 0xee, 0xe4, 0xb3, 0x3d, 0x3f, 0xb3, 0xfc, 0x3c, 0x66, 0x46, 0x52, 0x3e, 0x93, + 0x86, 0x14, 0xbd, 0x1f, 0x77, 0x8e, 0xbd, 0x99, 0x66, 0x88, 0x3c, 0xbb, 0xb7, + 0xc1, 0x3d, 0x30, 0x43, 0xcd, 0xbd, 0xd6, 0x81, 0xbe, 0x39, 0x60, 0x9d, 0x21, + 0xbe, 0x77, 0xb4, 0x16, 0x3e, 0x50, 0x6b, 0x88, 0xbb, 0xbe, 0x2a, 0xe1, 0xbc, + 0x7e, 0xfb, 0x13, 0xbe, 0x04, 0xd2, 0x01, 0x3e, 0xd7, 0xf2, 0xfb, 0xbd, 0xa1, + 0x97, 0xa5, 0x3d, 0x51, 0xb1, 0x1d, 0x3e, 0xa6, 0xe9, 0x11, 0x3e, 0x28, 0xe3, + 0xb0, 0xbc, 0xd6, 0xd7, 0xcf, 0xbd, 0xf7, 0x89, 0x10, 0x3e, 0x2d, 0x9d, 0x0b, + 0xbe, 0x08, 0x0a, 0x0e, 0xbd, 0xc7, 0x1e, 0x08, 0x3d, 0x18, 0x40, 0xad, 0xbd, + 0xef, 0x48, 0x05, 0xbd, 0xf6, 0xc0, 0x23, 0xbe, 0xf6, 0x7d, 0xa6, 0x3d, 0x05, + 0xb5, 0x6c, 0x3d, 0x7f, 0x05, 0xd4, 0xbd, 
0xd5, 0x2a, 0x1f, 0x3e, 0x60, 0x90, + 0xee, 0xbd, 0x82, 0x03, 0x26, 0xbd, 0x27, 0x9d, 0x05, 0xbd, 0x2d, 0x05, 0x9c, + 0x3c, 0xa0, 0x72, 0xef, 0x3d, 0x4a, 0xd9, 0xad, 0x3d, 0x9f, 0x2a, 0x46, 0xbd, + 0x47, 0x6e, 0xfb, 0xbc, 0x43, 0x4b, 0xde, 0xbd, 0xf0, 0x40, 0x97, 0x3d, 0xd9, + 0xf7, 0xe1, 0xbd, 0xbd, 0xae, 0xce, 0x3c, 0x79, 0xae, 0x8c, 0xbd, 0x34, 0xc9, + 0x34, 0xbe, 0x99, 0x0a, 0xae, 0xbd, 0xae, 0xe2, 0xe9, 0x3d, 0xe7, 0x97, 0xf7, + 0x3d, 0xd1, 0x30, 0x05, 0x3e, 0x14, 0xd3, 0x0c, 0x3d, 0xcd, 0x90, 0x63, 0x3d, + 0x50, 0xac, 0x27, 0xbd, 0x06, 0x6c, 0x30, 0xbe, 0x31, 0x20, 0xa1, 0xbd, 0xf3, + 0x98, 0x87, 0x3d, 0x31, 0x34, 0xac, 0xbd, 0x2e, 0xc3, 0xb3, 0xbb, 0xec, 0xb6, + 0x4d, 0xbd, 0x6f, 0x2c, 0x02, 0xbc, 0xcc, 0xcb, 0x80, 0xbd, 0x7b, 0x15, 0x29, + 0xbe, 0x8f, 0xb6, 0x8b, 0x3c, 0xca, 0x8b, 0x51, 0xbd, 0x64, 0x5f, 0x45, 0xbd, + 0x0f, 0xa3, 0xa4, 0x3d, 0xed, 0x79, 0x9c, 0xbd, 0x31, 0xa0, 0xbb, 0x3d, 0xe9, + 0x06, 0x26, 0x3e, 0x85, 0x78, 0x21, 0x3e, 0x81, 0x35, 0xcd, 0xbd, 0x05, 0x31, + 0x11, 0xbe, 0x9d, 0x19, 0xde, 0xbd, 0x9a, 0xd3, 0x11, 0xbe, 0x58, 0xa7, 0xff, + 0xbc, 0x9f, 0x4a, 0x29, 0x3d, 0xda, 0x56, 0x8c, 0xbc, 0xf6, 0xf9, 0x79, 0x3d, + 0x11, 0xbe, 0x82, 0x3d, 0xda, 0x43, 0x04, 0x3e, 0xed, 0xce, 0xe1, 0x3d, 0x3a, + 0x95, 0x3a, 0x3d, 0x56, 0x31, 0x4e, 0x3d, 0x82, 0x65, 0xbd, 0x3b, 0x4c, 0x6f, + 0xa8, 0xbc, 0xa4, 0xa1, 0x25, 0xbc, 0xad, 0x79, 0x2f, 0xbe, 0x73, 0xac, 0x2b, + 0x3e, 0x2d, 0x80, 0x3f, 0xbd, 0x97, 0xee, 0x80, 0xbd, 0xd8, 0x02, 0x77, 0x3d, + 0xb2, 0xcb, 0x9b, 0x3d, 0x7c, 0x94, 0xc9, 0xbd, 0xce, 0xd1, 0xdd, 0x3d, 0x12, + 0xef, 0x8b, 0x3d, 0x3a, 0xbe, 0x08, 0x3e, 0x73, 0x80, 0x1d, 0xbe, 0x2f, 0xdb, + 0x2d, 0xbe, 0x58, 0x7d, 0xd7, 0xbd, 0x44, 0x0f, 0xae, 0x3d, 0xd6, 0xe7, 0x3d, + 0x3e, 0xe0, 0x3a, 0xad, 0x3c, 0x7b, 0x10, 0x19, 0x3e, 0x1b, 0x4e, 0x78, 0xbd, + 0x3f, 0xf3, 0x07, 0xbe, 0x8c, 0xcc, 0xf7, 0xbd, 0x5a, 0x20, 0xb9, 0xbd, 0x53, + 0x04, 0x34, 0x3d, 0x6b, 0xcf, 0x24, 0x3e, 0x32, 0x1b, 0xc2, 0xbd, 0x92, 0x01, + 0xee, 0x3c, 0x79, 0x75, 0xd8, 0xbd, 0xdf, 0x4b, 0x0a, 0x3c, 0xf3, 0x93, 0xce, + 0x3d, 0x76, 0xf7, 0x31, 0xbd, 0xd7, 0x71, 0x17, 0xbe, 0xac, 0xed, 0x1f, 0xbe, + 0xb5, 0x4d, 0x46, 0x3d, 0xb0, 0xb9, 0x0b, 0xbe, 0x02, 0xb8, 0x9f, 0x3d, 0x7d, + 0x42, 0x28, 0xbe, 0x65, 0x07, 0xc7, 0x3d, 0xb2, 0xd4, 0xb5, 0x3d, 0x28, 0x07, + 0xd3, 0x3c, 0x55, 0x93, 0x2c, 0xbe, 0x79, 0x7c, 0x29, 0x3e, 0x59, 0x10, 0x0a, + 0xbe, 0x9d, 0x0a, 0x08, 0xbd, 0xa3, 0x61, 0x5d, 0x3d, 0xf8, 0xb5, 0xde, 0xbb, + 0x54, 0x24, 0xa7, 0x3d, 0xe3, 0xe4, 0x32, 0xbe, 0x20, 0x3b, 0x3d, 0xbe, 0x48, + 0x67, 0xc2, 0xbd, 0x3c, 0x7b, 0x2b, 0xbd, 0x69, 0xee, 0x56, 0xbd, 0xa9, 0x90, + 0xcb, 0x3d, 0xff, 0xf1, 0xa7, 0xbd, 0xa9, 0xd8, 0x43, 0xbd, 0xb8, 0xcd, 0xb7, + 0x3c, 0xcd, 0xfb, 0xbb, 0x3d, 0xd6, 0x26, 0x8a, 0xbd, 0x45, 0xa4, 0x81, 0x3d, + 0xd2, 0xc9, 0x29, 0x3e, 0xdb, 0xf4, 0xdd, 0xbd, 0x93, 0x95, 0xa9, 0x3d, 0x11, + 0xbb, 0x12, 0x3e, 0xdf, 0xf4, 0xcd, 0xbd, 0xb9, 0xde, 0x82, 0x3c, 0xdf, 0x26, + 0x76, 0x3d, 0xb6, 0x47, 0x32, 0xbe, 0x91, 0x0f, 0x6f, 0x3b, 0x56, 0x16, 0x4c, + 0xbe, 0x77, 0x77, 0x00, 0xbe, 0x2c, 0x1f, 0xd1, 0xbd, 0xf6, 0x43, 0x12, 0x3e, + 0xd8, 0x7c, 0x16, 0x3e, 0x26, 0xec, 0x0c, 0xbe, 0xaf, 0x69, 0xe0, 0x3d, 0x5a, + 0x3b, 0xdf, 0x3d, 0xbb, 0x0f, 0x99, 0x3d, 0xe2, 0x32, 0x2b, 0xbd, 0xf3, 0x1e, + 0x1d, 0x3e, 0x9e, 0xdc, 0xf3, 0x3c, 0x77, 0x8b, 0xf7, 0xbd, 0x46, 0xb5, 0x48, + 0xbc, 0x28, 0xce, 0xbd, 0x3c, 0x22, 0x68, 0x1a, 0x3e, 0x92, 0x40, 0xf0, 0x3c, + 0x35, 0xf1, 0xbe, 0xbd, 0x8d, 0xed, 0xd0, 0x3d, 0x93, 0x67, 0x5e, 0xbd, 0xc8, + 0xa3, 0xb0, 0xbd, 0x83, 0x61, 0x2f, 0x3d, 0x39, 0xce, 0x81, 0x3b, 0xa5, 
0x87, + 0x1d, 0x3e, 0xe0, 0x8f, 0x38, 0x3c, 0xce, 0x6f, 0x26, 0x3d, 0x09, 0x7f, 0x9a, + 0x3d, 0x6c, 0x04, 0x8f, 0xbd, 0x31, 0x13, 0x9c, 0xbb, 0xab, 0xbc, 0x3f, 0xbd, + 0xe1, 0x11, 0xc2, 0xbd, 0x47, 0xa8, 0x3a, 0x3d, 0x76, 0xc5, 0x0b, 0xbe, 0x0d, + 0x71, 0xff, 0x3d, 0x30, 0x8e, 0x41, 0x3d, 0xdc, 0xf6, 0x2d, 0xbe, 0x1a, 0x84, + 0x1f, 0x3d, 0xe2, 0xd4, 0x09, 0x3e, 0xe7, 0x1f, 0x1d, 0xbd, 0x20, 0x25, 0x26, + 0x3d, 0x68, 0x8f, 0x61, 0x3d, 0xe7, 0xdf, 0x1f, 0xbe, 0xad, 0x57, 0x1b, 0xbe, + 0x3e, 0xec, 0x1b, 0xbe, 0x6f, 0xe4, 0x09, 0xbe, 0x87, 0x7d, 0xb5, 0xbc, 0xce, + 0x89, 0x07, 0x3d, 0x8a, 0x34, 0xbe, 0x3b, 0x7a, 0x7d, 0x24, 0x3e, 0xde, 0xc8, + 0xfa, 0x3d, 0xa4, 0xc7, 0x9e, 0xbd, 0x5b, 0x97, 0xf0, 0xbd, 0x16, 0xf7, 0x3b, + 0xbe, 0x91, 0xad, 0x27, 0x3e, 0x06, 0x69, 0xf3, 0xbd, 0x6d, 0xb9, 0xe6, 0xbd, + 0xfc, 0xa1, 0x33, 0x3e, 0x73, 0x47, 0xd4, 0xbd, 0xd1, 0x35, 0xc0, 0x3d, 0x74, + 0x47, 0x12, 0x3d, 0x2d, 0x04, 0x23, 0x3d, 0xfc, 0xc6, 0x1b, 0x3d, 0x75, 0x18, + 0x0e, 0xbe, 0xa5, 0x96, 0x55, 0x3c, 0xb8, 0x10, 0xad, 0xbc, 0x93, 0x9b, 0xde, + 0xbd, 0x9f, 0xa2, 0xf4, 0x3d, 0xb8, 0x21, 0xf6, 0xba, 0xd7, 0x96, 0x09, 0xbd, + 0x2a, 0x6c, 0xd9, 0xbd, 0xb1, 0x32, 0x45, 0x3d, 0xc0, 0x16, 0x94, 0xbd, 0x78, + 0xac, 0x97, 0xbd, 0x97, 0xd4, 0xdf, 0xbd, 0x68, 0x97, 0x36, 0xbd, 0x28, 0xce, + 0x2f, 0x3d, 0x12, 0x02, 0x3d, 0xbd, 0x5b, 0x8f, 0x23, 0x3d, 0xf5, 0xc3, 0xda, + 0xba, 0xa6, 0x72, 0x41, 0x3e, 0x27, 0xa9, 0xcd, 0xbd, 0x9c, 0x9a, 0x3c, 0x3d, + 0xf2, 0x7f, 0x45, 0x3e, 0x1c, 0x9f, 0x40, 0x3e, 0xa9, 0xdf, 0x74, 0x3c, 0x6a, + 0x72, 0x6e, 0xbd, 0x46, 0x83, 0xa5, 0x3d, 0x3b, 0x67, 0x6c, 0x3c, 0xfc, 0x84, + 0x2a, 0x3d, 0x3c, 0xf4, 0x35, 0x3e, 0xb4, 0x2c, 0x79, 0xbd, 0x43, 0xb9, 0xd6, + 0x3d, 0xe6, 0xae, 0x13, 0xbd, 0xeb, 0x77, 0xd0, 0xbd, 0x31, 0x51, 0xbe, 0x3d, + 0x5f, 0x2e, 0x23, 0x3c, 0x7a, 0xbe, 0x15, 0x3e, 0x4b, 0x59, 0xdc, 0xbd, 0xa0, + 0x8f, 0xe7, 0xbd, 0x76, 0xa8, 0xf3, 0xbd, 0x88, 0x1c, 0x74, 0x3d, 0x85, 0x4d, + 0xdd, 0xbd, 0x45, 0x96, 0x36, 0xbd, 0xe8, 0x39, 0x98, 0x3d, 0xbe, 0x82, 0xf9, + 0x3d, 0x1d, 0xdb, 0x2d, 0x3b, 0x6f, 0xac, 0x63, 0xbd, 0x8c, 0xc8, 0xe1, 0xbd, + 0xcf, 0x49, 0x73, 0xbd, 0x8a, 0xdd, 0xe3, 0xbd, 0xf8, 0x00, 0x19, 0xbd, 0x17, + 0xe8, 0xdf, 0xbd, 0xba, 0x22, 0x5b, 0x3c, 0xf1, 0x54, 0x21, 0xbe, 0x7b, 0x38, + 0x58, 0xbd, 0x48, 0x88, 0x67, 0xbd, 0x5e, 0xe2, 0x6c, 0x3d, 0xa5, 0x44, 0x20, + 0xbe, 0x69, 0x7f, 0xbf, 0xbc, 0x7c, 0xfa, 0x25, 0x3e, 0xc1, 0xd9, 0xd5, 0xbd, + 0x46, 0x87, 0x75, 0xbd, 0x13, 0x1c, 0x01, 0xbd, 0xe5, 0xc3, 0x19, 0xbb, 0x2d, + 0xc8, 0x30, 0xbe, 0xad, 0xd8, 0xf2, 0x3d, 0xd9, 0x37, 0x14, 0xbd, 0xd2, 0xb5, + 0x9a, 0x3d, 0xf4, 0x37, 0x8d, 0x3c, 0x2f, 0x8f, 0xc0, 0x3d, 0x8e, 0xe9, 0xc5, + 0xbd, 0xf5, 0x4d, 0x21, 0xbe, 0xfd, 0x9a, 0xaa, 0xbd, 0x91, 0xb6, 0x00, 0xbe, + 0xf0, 0x0d, 0xbf, 0x3c, 0xe4, 0x94, 0xed, 0x3d, 0x64, 0xbe, 0x8d, 0x3c, 0x27, + 0xcf, 0x2f, 0x3e, 0x22, 0xa5, 0xf1, 0x3d, 0x96, 0xf2, 0xbf, 0xbd, 0x62, 0xde, + 0xe5, 0xbd, 0x4b, 0x4a, 0x89, 0x3d, 0x7a, 0x3c, 0x1d, 0x3e, 0xfc, 0x83, 0xab, + 0xbc, 0x0f, 0x00, 0x2e, 0xbe, 0xd5, 0xd1, 0x93, 0x3d, 0x32, 0x51, 0xca, 0xbd, + 0x27, 0x77, 0x31, 0xbd, 0x6e, 0xe6, 0xe2, 0x3d, 0xdd, 0xb0, 0x03, 0xbe, 0xd7, + 0xec, 0xe5, 0xbd, 0x97, 0x8e, 0x82, 0x3b, 0x7b, 0xaf, 0x03, 0xbe, 0xbe, 0x24, + 0xc3, 0x3d, 0x1e, 0x4c, 0x51, 0x3e, 0x07, 0x32, 0x10, 0x3e, 0xac, 0xdb, 0x01, + 0xbe, 0xef, 0x14, 0x38, 0x3e, 0x1b, 0xbb, 0x73, 0x3d, 0x6a, 0x42, 0x35, 0xbd, + 0x79, 0x72, 0x13, 0xbe, 0x05, 0x8c, 0xe9, 0x3d, 0xc1, 0x57, 0xe5, 0x3b, 0x50, + 0x38, 0x71, 0x3d, 0x47, 0xb5, 0xe4, 0xbd, 0x0f, 0x18, 0x01, 0xbe, 0xd6, 0x1c, + 0x76, 0x3b, 0x99, 0x36, 
0x1c, 0xbe, 0x6d, 0xee, 0x1a, 0x3d, 0x2d, 0xcb, 0x39, + 0xbd, 0xc0, 0x54, 0x24, 0x3e, 0xcb, 0x5b, 0xfb, 0x3c, 0x8d, 0xc8, 0x85, 0x3a, + 0x10, 0xcb, 0xd6, 0x3c, 0xfd, 0x81, 0xd8, 0x3c, 0xc7, 0xab, 0x1b, 0xba, 0xf5, + 0xe1, 0xb5, 0xbd, 0x7a, 0x09, 0xfc, 0x3d, 0x98, 0x7b, 0x6b, 0xbd, 0x31, 0x74, + 0x46, 0xbe, 0x13, 0x26, 0x02, 0x3e, 0x67, 0x37, 0x03, 0xbe, 0x68, 0x29, 0xc4, + 0xbd, 0x8a, 0xc5, 0x8b, 0xbd, 0x50, 0x23, 0x22, 0xbc, 0x6d, 0x99, 0xf5, 0x3d, + 0x01, 0x6c, 0xc5, 0xbd, 0xd6, 0xce, 0x14, 0xbe, 0x29, 0xd4, 0xef, 0xbd, 0x7c, + 0xe1, 0x8b, 0x3c, 0x8f, 0x04, 0xd6, 0xbc, 0x29, 0xf1, 0x60, 0x3c, 0x02, 0x1a, + 0x2c, 0x3b, 0x76, 0x21, 0x00, 0xbe, 0x16, 0x98, 0x66, 0xbd, 0x2a, 0x64, 0x3f, + 0xbd, 0xbf, 0x81, 0x24, 0x3d, 0x30, 0x34, 0x27, 0x3e, 0x90, 0xee, 0x9b, 0x3d, + 0xe1, 0x6c, 0xdd, 0x3c, 0x25, 0x40, 0x25, 0x3e, 0xc0, 0x85, 0x57, 0x3b, 0x16, + 0xa8, 0x4f, 0x3e, 0xa9, 0xfb, 0x48, 0xbd, 0x38, 0x1c, 0xf8, 0x3b, 0x7a, 0x4a, + 0xb0, 0xbd, 0x29, 0xe7, 0xf3, 0xbd, 0xa5, 0x5c, 0x42, 0x3d, 0xab, 0x54, 0x09, + 0x3e, 0x94, 0x68, 0x75, 0x3d, 0x24, 0x37, 0x03, 0xbe, 0x4e, 0xba, 0x09, 0x3e, + 0x16, 0xba, 0x09, 0x3e, 0xbd, 0x97, 0x00, 0xbe, 0x92, 0xe4, 0x95, 0xbd, 0x74, + 0xf5, 0x9f, 0xbd, 0x40, 0x16, 0x81, 0x3d, 0x83, 0x4c, 0x26, 0x3e, 0x61, 0xd1, + 0x25, 0x3e, 0xfb, 0x74, 0x1d, 0xbe, 0x9b, 0x9f, 0x0f, 0x3d, 0xe8, 0x7e, 0x10, + 0x3d, 0x9e, 0xb0, 0x15, 0x3d, 0x34, 0xe6, 0xee, 0x3d, 0xaf, 0xef, 0xf0, 0xbb, + 0xaa, 0x06, 0x24, 0xbe, 0x43, 0x5e, 0xdb, 0x3d, 0x10, 0xd8, 0xa4, 0x3d, 0x6e, + 0xc9, 0x0c, 0xbd, 0x1c, 0xfe, 0xa9, 0x3d, 0xf0, 0xf3, 0x31, 0x3d, 0x38, 0xf5, + 0x7e, 0xba, 0x24, 0x31, 0xe0, 0x3d, 0x6e, 0xf2, 0xa2, 0x3d, 0xbe, 0x8b, 0xd4, + 0xbd, 0x65, 0xc3, 0x25, 0x3c, 0xa3, 0xde, 0x67, 0xba, 0x41, 0xe9, 0x13, 0xbe, + 0x83, 0xd0, 0x02, 0xbd, 0x8b, 0x91, 0x3a, 0x3d, 0x29, 0x20, 0x4c, 0xbc, 0xfc, + 0x3f, 0xcd, 0xbd, 0x5a, 0x01, 0xae, 0xbd, 0x6c, 0x48, 0x1e, 0xbe, 0xe0, 0x29, + 0x80, 0x3d, 0x18, 0x74, 0xa0, 0xbd, 0x2a, 0xeb, 0xbd, 0x39, 0x28, 0xe6, 0x2e, + 0xbe, 0x4b, 0x70, 0x59, 0x3d, 0xd7, 0xcf, 0xd7, 0xbc, 0x34, 0x77, 0xa5, 0x3c, + 0xef, 0x6d, 0x58, 0xbb, 0x31, 0xcc, 0xde, 0xbb, 0xf6, 0xe6, 0xc2, 0xbd, 0x8b, + 0xee, 0x14, 0x3e, 0xf3, 0x70, 0x12, 0xbe, 0x88, 0x93, 0xae, 0xbd, 0x57, 0xd4, + 0xfc, 0x3d, 0x48, 0x74, 0x36, 0x3e, 0xb5, 0xcb, 0x08, 0xbe, 0x32, 0x08, 0xbe, + 0xbd, 0x95, 0xe2, 0x2e, 0xbd, 0x6c, 0xa0, 0xc3, 0x3d, 0x83, 0xdb, 0xc4, 0x3a, + 0xc8, 0x25, 0xf0, 0x3d, 0x8a, 0x78, 0x0f, 0x3e, 0xed, 0xd4, 0x02, 0xbc, 0xd4, + 0x18, 0xad, 0xbd, 0x70, 0x10, 0xbf, 0xbd, 0x9f, 0x8e, 0x1c, 0xbe, 0x41, 0xdf, + 0xf2, 0x3d, 0x20, 0x72, 0x45, 0x3d, 0x7f, 0x52, 0x16, 0xbe, 0xd7, 0xf4, 0x25, + 0xbe, 0x6d, 0x3f, 0x3d, 0x3e, 0xd4, 0xb0, 0x26, 0xbe, 0x23, 0x8c, 0x87, 0x3d, + 0x6c, 0x4e, 0xb9, 0xbc, 0x67, 0x6c, 0x44, 0x3c, 0x35, 0x7b, 0xde, 0x3d, 0x19, + 0x66, 0xd7, 0x3d, 0x1c, 0xc9, 0xc2, 0x3d, 0xf1, 0xee, 0xba, 0xbd, 0xa3, 0xe1, + 0xc8, 0x3d, 0xf5, 0xf9, 0x82, 0x3c, 0x3d, 0x0e, 0x81, 0x3d, 0xea, 0xc7, 0x5d, + 0x3d, 0x19, 0x63, 0x25, 0x3e, 0x59, 0x2f, 0x13, 0xbd, 0xf2, 0x44, 0xeb, 0x3d, + 0xf0, 0xb5, 0xf1, 0xbc, 0x85, 0x77, 0x03, 0x3d, 0xda, 0x66, 0x11, 0xbd, 0xef, + 0xae, 0x1b, 0x3d, 0xe1, 0x4f, 0x94, 0xbd, 0x25, 0x17, 0x56, 0xbd, 0x74, 0x34, + 0x0c, 0x3e, 0xf8, 0x12, 0x88, 0x3d, 0x96, 0x08, 0x97, 0xbd, 0x04, 0xb9, 0x75, + 0xbc, 0x72, 0x9f, 0x8e, 0x3d, 0x0d, 0xf3, 0x7d, 0xbd, 0x51, 0xe7, 0x56, 0xbc, + 0x93, 0x6d, 0x08, 0xbe, 0xa7, 0xd8, 0x09, 0x3e, 0x80, 0xd5, 0xa8, 0xbd, 0x40, + 0x03, 0xd1, 0x3c, 0xe2, 0x44, 0x1f, 0xbd, 0x3e, 0x1f, 0xd6, 0xbd, 0x9f, 0x62, + 0xe7, 0x3c, 0xf7, 0x6d, 0xae, 0xbd, 0xf4, 0x14, 0xf6, 
0x3a, 0x54, 0x99, 0xea, + 0x3b, 0x9c, 0xab, 0xf7, 0xbd, 0x74, 0x21, 0xdd, 0x3d, 0x87, 0x18, 0x95, 0xbd, + 0x49, 0x55, 0x0c, 0xbe, 0xd6, 0xdc, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, 0x20, + 0x01, 0x00, 0x00, 0x5a, 0xd4, 0xee, 0x3d, 0x38, 0x39, 0x64, 0x3e, 0x55, 0xb4, + 0x79, 0x3d, 0x1d, 0xa3, 0xb9, 0x3d, 0xb9, 0x79, 0xe0, 0x3b, 0x30, 0xff, 0xd1, + 0x3d, 0x7a, 0x3b, 0x2d, 0xbd, 0x18, 0x51, 0x07, 0xbe, 0x5c, 0x31, 0x3d, 0x3e, + 0x46, 0x0f, 0x51, 0xbe, 0x29, 0x32, 0x13, 0x3e, 0x7c, 0x11, 0xf3, 0xbd, 0x3a, + 0xbd, 0x4a, 0xbd, 0x56, 0xb3, 0xce, 0xbd, 0x37, 0xd0, 0xf6, 0x3d, 0xd5, 0x9b, + 0xd8, 0x3d, 0xa8, 0xbc, 0x5a, 0xbe, 0x1b, 0x22, 0x0e, 0xbc, 0x03, 0x98, 0xf9, + 0x3d, 0x64, 0xf4, 0x47, 0x3e, 0xa2, 0xb5, 0x2f, 0xbe, 0x70, 0x7a, 0x89, 0xbe, + 0x9c, 0x58, 0x60, 0x3e, 0x71, 0xac, 0x25, 0xbe, 0x17, 0x1c, 0x01, 0x3e, 0x48, + 0x73, 0x93, 0xbd, 0x0d, 0x92, 0xa3, 0x3d, 0xf1, 0xff, 0x62, 0xbe, 0x56, 0xe9, + 0x71, 0xbe, 0x09, 0xf7, 0x96, 0xbe, 0x91, 0x7a, 0x0a, 0x3e, 0xc1, 0x6d, 0x88, + 0x3c, 0x6c, 0xd0, 0x4f, 0xbe, 0x71, 0x75, 0x99, 0xbd, 0x7d, 0x92, 0x01, 0xbe, + 0x35, 0x21, 0x96, 0xbe, 0xd9, 0x0e, 0x2d, 0x3e, 0x63, 0x17, 0x8b, 0x3d, 0x53, + 0x6d, 0xb7, 0x3c, 0xb9, 0x06, 0x20, 0x3d, 0xdf, 0x56, 0x11, 0x3e, 0xc4, 0xcd, + 0xa9, 0x3c, 0x7d, 0x0a, 0x3b, 0x3e, 0xd6, 0x23, 0x7f, 0xbc, 0xaf, 0x06, 0xc4, + 0xbc, 0xe0, 0xe3, 0x63, 0xbd, 0x34, 0x50, 0x2a, 0x3e, 0x1f, 0xff, 0x4c, 0x3e, + 0x34, 0x98, 0x79, 0xbe, 0x4c, 0xbd, 0x18, 0x3e, 0x5b, 0x8b, 0x0f, 0x3e, 0x33, + 0x44, 0x34, 0xbd, 0xd6, 0xd7, 0x90, 0xbe, 0x51, 0x5e, 0x55, 0x3d, 0x46, 0x2b, + 0x54, 0xbe, 0xd8, 0x49, 0x30, 0xbe, 0x45, 0xb3, 0x72, 0xbe, 0x93, 0x18, 0xcd, + 0x3d, 0x86, 0xe1, 0x73, 0xbd, 0x94, 0x56, 0xf3, 0x3d, 0x0a, 0x54, 0xd7, 0xbd, + 0x01, 0xd9, 0x98, 0x3e, 0xd5, 0x11, 0x01, 0xbb, 0x69, 0x07, 0x62, 0xbe, 0x81, + 0x33, 0x03, 0xbb, 0x98, 0xf9, 0x9f, 0x3c, 0xe8, 0x77, 0x96, 0x3e, 0x3a, 0xc2, + 0x73, 0x3e, 0xa1, 0x45, 0x35, 0xbe, 0xea, 0x1c, 0x86, 0xbc, 0xad, 0x90, 0x45, + 0xbe, 0x0b, 0xd2, 0x03, 0x3d, 0x02, 0xde, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x00, 0x00, 0xa1, 0xc6, 0xcd, 0xbe, 0x46, 0xa7, 0xbd, 0x3e, 0x7c, + 0xe3, 0x00, 0x3f, 0x13, 0x8d, 0xb6, 0xbe, 0x21, 0x72, 0x8b, 0x3e, 0x16, 0x68, + 0x68, 0x3e, 0x05, 0xb7, 0xb6, 0xbe, 0xa0, 0xd3, 0xd4, 0x3e, 0x98, 0x82, 0x83, + 0xbd, 0x8c, 0xb1, 0xe2, 0x3d, 0xd6, 0x94, 0x82, 0x3e, 0x07, 0x6a, 0x70, 0xbe, + 0x6b, 0x74, 0x0b, 0x3f, 0xd8, 0xf5, 0x3d, 0x3e, 0xfb, 0xf3, 0x19, 0xbd, 0x2c, + 0x72, 0xbf, 0x3e, 0xff, 0x95, 0x49, 0x3d, 0xee, 0x70, 0x78, 0x3e, 0xb0, 0x3f, + 0x58, 0x3d, 0x78, 0xea, 0x9d, 0xbe, 0x53, 0x1d, 0x15, 0x3f, 0x0d, 0xfc, 0xbe, + 0xbe, 0xad, 0x10, 0x07, 0xbf, 0xb4, 0x11, 0x87, 0xbe, 0x20, 0x92, 0x62, 0x3e, + 0x58, 0x61, 0xbd, 0x3e, 0xea, 0x54, 0x4a, 0xbd, 0xbd, 0x55, 0xce, 0xbe, 0x12, + 0x48, 0xa2, 0x3e, 0xe0, 0x74, 0x90, 0x3d, 0xce, 0x80, 0xf5, 0x3e, 0xa5, 0xb7, + 0x15, 0x3f, 0x8e, 0xde, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x20, 0x01, + 0x00, 0x2c, 0xcf, 0x79, 0xbd, 0x8c, 0x37, 0x5a, 0xbc, 0x00, 0x4c, 0x6f, 0x3c, + 0x14, 0x0b, 0x8e, 0x3d, 0xa8, 0xc3, 0x12, 0x3c, 0x10, 0x9f, 0xa5, 0xbb, 0xe8, + 0x7e, 0x17, 0xbd, 0x43, 0x60, 0x74, 0xbd, 0xc6, 0x62, 0x6f, 0x3d, 0x88, 0x83, + 0x6c, 0xbd, 0xf7, 0xf2, 0x36, 0xbd, 0xb7, 0x11, 0x81, 0xbd, 0x69, 0x1c, 0x30, + 0xbd, 0xde, 0xd0, 0x4e, 0x3c, 0xa4, 0x9f, 0x6e, 0xbc, 0x06, 0xd8, 0xd6, 0xbc, + 0x21, 0x75, 0x5f, 0xbd, 0x68, 0x6f, 0x0c, 0xbc, 0xbd, 0x21, 0xcf, 0xbb, 0x20, + 0x31, 0xb0, 0x3b, 0x88, 0xa3, 0x32, 0x3c, 0xa0, 0xec, 0x56, 0x3d, 0x19, 0xfd, + 0xf8, 0x3c, 0x99, 0xd1, 0x75, 0x3d, 0x99, 0x54, 0x3d, 0x3c, 0x4d, 0x0f, 0x12, + 0x3b, 
0x34, 0xf2, 0x37, 0xbd, 0xaa, 0x3b, 0x85, 0xbb, 0x23, 0xfe, 0xde, 0xbb, + 0x8a, 0xe4, 0x21, 0x3c, 0xbd, 0x46, 0x8d, 0x3d, 0xd8, 0xf0, 0x03, 0x3d, 0xfa, + 0xb6, 0xb6, 0x3c, 0xb8, 0x2e, 0xc9, 0xbc, 0xac, 0x52, 0x4a, 0xbd, 0xd2, 0x5d, + 0x00, 0x3c, 0x7d, 0x64, 0x6f, 0xbd, 0xe6, 0x47, 0x77, 0x3d, 0xe0, 0x29, 0xbe, + 0x3b, 0x5a, 0xb3, 0xee, 0xbc, 0x40, 0x76, 0xe3, 0xbb, 0x18, 0xf0, 0x8b, 0x3c, + 0xbc, 0x5f, 0x3a, 0x3d, 0x47, 0xdd, 0x08, 0x3d, 0x0b, 0xae, 0x39, 0xbc, 0xa1, + 0xca, 0xd9, 0xbc, 0xf8, 0x6b, 0x92, 0xbc, 0xf8, 0x2b, 0x42, 0x3d, 0xef, 0x4c, + 0x14, 0xbd, 0x64, 0xd7, 0x4b, 0xbd, 0x22, 0x18, 0x18, 0x3c, 0x20, 0xf8, 0x29, + 0xbd, 0x00, 0x5d, 0xdd, 0x3a, 0x56, 0x0c, 0x5f, 0xbd, 0x47, 0x5d, 0x84, 0xbd, + 0x5e, 0xea, 0xa1, 0x3c, 0xc4, 0x53, 0x89, 0xbd, 0x53, 0xde, 0x4d, 0xbc, 0xe7, + 0xc7, 0x88, 0xbc, 0x35, 0xef, 0x56, 0x3d, 0x45, 0x2c, 0xb4, 0x3c, 0xd8, 0x97, + 0x7b, 0xbd, 0x17, 0xec, 0x89, 0x3d, 0xe1, 0x90, 0x45, 0x3d, 0x89, 0xf2, 0x3f, + 0xbd, 0xf1, 0x11, 0xff, 0xbb, 0x1b, 0x6f, 0x03, 0xbd, 0xf7, 0xf7, 0x3d, 0x3b, + 0xc4, 0x7d, 0x91, 0x3c, 0x44, 0x07, 0x0b, 0x3d, 0x4a, 0xc0, 0x6f, 0x3d, 0x79, + 0x51, 0x8f, 0x3d, 0x66, 0x5e, 0x41, 0x3d, 0xf1, 0x9b, 0x8c, 0xbd, 0x38, 0xb9, + 0xca, 0x3c, 0xe3, 0xf8, 0xe8, 0x3c, 0xcd, 0xce, 0x8f, 0xbb, 0xe4, 0xe9, 0x6b, + 0x3c, 0x92, 0xd8, 0x39, 0x3d, 0xbe, 0x6d, 0x52, 0xbd, 0x38, 0xed, 0x4a, 0xbd, + 0x68, 0xd4, 0x28, 0xbc, 0x6f, 0x16, 0x67, 0xbd, 0xd7, 0x55, 0x8a, 0x3d, 0xe0, + 0x69, 0xb0, 0xbb, 0xfa, 0x9c, 0x93, 0xbd, 0x14, 0xe4, 0x21, 0x3d, 0x96, 0x1c, + 0x7b, 0x3d, 0x4c, 0x31, 0x34, 0x3c, 0xa8, 0x41, 0x5c, 0x3c, 0x90, 0xe5, 0x8c, + 0x3d, 0x11, 0x9f, 0x98, 0x3c, 0xf0, 0x3d, 0x16, 0x3d, 0x53, 0xd1, 0x91, 0xbd, + 0x50, 0xc5, 0xef, 0x3c, 0x25, 0x52, 0x83, 0x3c, 0x9e, 0xce, 0x1f, 0x3c, 0x91, + 0xa7, 0x0c, 0xbd, 0xb8, 0x95, 0x03, 0x3c, 0x7a, 0x4c, 0x35, 0x3d, 0x8e, 0xc4, + 0x44, 0x3d, 0x1c, 0x66, 0x2c, 0x3d, 0x00, 0x89, 0x40, 0xba, 0xe1, 0xa3, 0x83, + 0x3d, 0x68, 0xf2, 0x2b, 0xbd, 0x30, 0xd4, 0xde, 0x3b, 0xcf, 0xa1, 0xbc, 0x3c, + 0x24, 0x79, 0x39, 0xbd, 0xe5, 0xf4, 0xb7, 0xbc, 0x79, 0x8d, 0x25, 0x3c, 0x95, + 0xb6, 0x38, 0x3d, 0xd8, 0xc2, 0x74, 0x3c, 0xaa, 0x8e, 0x80, 0xbd, 0x0d, 0x74, + 0xf3, 0x3c, 0x73, 0x5b, 0x98, 0xbc, 0x00, 0x64, 0x5e, 0xbc, 0x44, 0x82, 0xcb, + 0x3c, 0x5a, 0x25, 0x53, 0xbd, 0xe2, 0xd0, 0x93, 0xbd, 0x3b, 0x7a, 0x77, 0xbd, + 0x93, 0x3e, 0xd4, 0x3c, 0x39, 0x81, 0x28, 0xbd, 0x54, 0xd5, 0xef, 0x3c, 0x6c, + 0x29, 0xe1, 0x3c, 0x69, 0xc8, 0x09, 0x3d, 0x83, 0xb3, 0x36, 0xbd, 0x90, 0xe1, + 0xd4, 0xbb, 0x95, 0xa7, 0x1a, 0xbd, 0x39, 0xf5, 0x2b, 0xbc, 0x0c, 0xdf, 0x64, + 0xbd, 0x74, 0xec, 0xdc, 0xbc, 0x20, 0xc6, 0x3b, 0x3d, 0x40, 0x20, 0x46, 0x3c, + 0x18, 0x09, 0x3f, 0xbd, 0x96, 0x4c, 0xdc, 0xbc, 0x98, 0x98, 0x8d, 0xbd, 0xb4, + 0xdd, 0x27, 0xbd, 0x74, 0x45, 0xbb, 0x3c, 0x49, 0xd9, 0x08, 0xbd, 0x8e, 0x06, + 0xa8, 0x3b, 0x91, 0x10, 0xb4, 0x3c, 0xf8, 0x58, 0xf3, 0xbc, 0x06, 0xe9, 0x5e, + 0x3d, 0x14, 0xc8, 0x26, 0x3d, 0xc5, 0xf7, 0x20, 0xbb, 0x6b, 0x78, 0xc0, 0x3c, + 0xae, 0x64, 0x7f, 0x3c, 0xbb, 0xbf, 0x8b, 0x3c, 0x82, 0x4e, 0x0c, 0xbd, 0xb0, + 0xd0, 0xdf, 0xbc, 0xfe, 0x53, 0x97, 0xbc, 0x8a, 0x9e, 0x24, 0xbd, 0xdf, 0x79, + 0x84, 0x3d, 0x7e, 0xff, 0x8e, 0xbd, 0x66, 0x7b, 0xda, 0x3c, 0xb0, 0xdd, 0x8d, + 0xbd, 0xab, 0x91, 0xbb, 0xbc, 0x23, 0x20, 0xb0, 0xbc, 0xbe, 0x43, 0x3f, 0xbd, + 0x64, 0x80, 0xda, 0x3c, 0x32, 0x00, 0xde, 0x3c, 0xb2, 0x8a, 0x86, 0x3c, 0x68, + 0x45, 0x05, 0x3d, 0x8b, 0x7c, 0xd8, 0x3b, 0x68, 0x97, 0xe7, 0x3c, 0x82, 0x8d, + 0x6b, 0x3d, 0xa6, 0x53, 0x2d, 0x3d, 0xc0, 0x43, 0x23, 0x3c, 0xaa, 0xe6, 0x2d, + 0xbd, 0x34, 0x06, 0x57, 0xbc, 0xfc, 
0x9f, 0x0c, 0xbd, 0x42, 0x77, 0xc6, 0x3c, + 0x51, 0x7a, 0x70, 0x3c, 0xe5, 0xe4, 0x7c, 0x3d, 0x86, 0x00, 0x67, 0xbd, 0x95, + 0xb8, 0x37, 0xbd, 0xdd, 0x7a, 0x8d, 0x3d, 0x97, 0x08, 0xa9, 0x3c, 0xfd, 0xb6, + 0x09, 0x3d, 0xdc, 0xb7, 0x81, 0x3d, 0xe0, 0x6c, 0x68, 0xbc, 0x79, 0x9b, 0x03, + 0xbd, 0xb8, 0xc7, 0x78, 0xbb, 0x94, 0x60, 0x0f, 0x3d, 0x3b, 0x0e, 0x80, 0x3d, + 0x11, 0xe6, 0x80, 0x3d, 0xb3, 0xab, 0x86, 0x3d, 0xed, 0xe6, 0x9d, 0xbc, 0xd8, + 0xeb, 0xd9, 0xbc, 0xaa, 0x62, 0x80, 0x3d, 0x12, 0xc5, 0x00, 0x3d, 0x2b, 0x4b, + 0x23, 0xbc, 0xc7, 0x31, 0xff, 0xbc, 0xe4, 0x95, 0xdb, 0x3b, 0xa7, 0x90, 0x66, + 0x3c, 0xd3, 0x65, 0xdb, 0xbc, 0x50, 0xe3, 0x47, 0x3d, 0xd4, 0x25, 0x84, 0xbd, + 0x5a, 0xd5, 0xae, 0xbc, 0x90, 0x5e, 0xba, 0x3c, 0x8c, 0x60, 0x90, 0xbd, 0xfc, + 0x57, 0x4c, 0x3d, 0x99, 0x08, 0x7d, 0xbd, 0x9f, 0xac, 0x3b, 0x3c, 0x1c, 0xb1, + 0x61, 0xbc, 0x6a, 0xb5, 0x33, 0xbc, 0x10, 0xb0, 0x28, 0x3c, 0x89, 0x5d, 0x9f, + 0x3c, 0xd2, 0x80, 0x84, 0xbc, 0xb4, 0xb1, 0xd5, 0xba, 0x41, 0x1e, 0xa0, 0x3c, + 0xd1, 0xd9, 0xd0, 0xbb, 0x04, 0xda, 0xd2, 0x3c, 0x58, 0x46, 0x90, 0xbc, 0xc1, + 0x5c, 0x19, 0xbc, 0x01, 0x66, 0x2c, 0xbd, 0xad, 0xdc, 0x88, 0xbd, 0x32, 0xab, + 0xb6, 0xbc, 0x14, 0x1f, 0x0b, 0x3d, 0x87, 0xf0, 0x69, 0x3d, 0x55, 0x30, 0x26, + 0xbd, 0x2e, 0x3a, 0x05, 0xbd, 0xda, 0x08, 0x0e, 0xbd, 0xef, 0x31, 0x57, 0xbd, + 0x0e, 0x44, 0x13, 0xbd, 0x53, 0x11, 0x29, 0xbd, 0x00, 0xd2, 0xea, 0x3a, 0x47, + 0x72, 0xae, 0xbc, 0x54, 0x4a, 0x4d, 0xbd, 0x8a, 0x13, 0x2b, 0xbd, 0xa3, 0xaf, + 0x92, 0x3d, 0x68, 0x15, 0x0d, 0x3c, 0x18, 0x17, 0x35, 0x3c, 0xb8, 0xf2, 0x6a, + 0x3c, 0x15, 0xf8, 0xb2, 0x3c, 0x1d, 0x9d, 0xcd, 0x3c, 0xd3, 0x90, 0x81, 0xbd, + 0x51, 0xe8, 0x21, 0x3d, 0x74, 0x43, 0xa9, 0x3c, 0x00, 0x0b, 0xa0, 0x3c, 0x8e, + 0x69, 0xfb, 0xba, 0x81, 0x27, 0xfa, 0x3c, 0x6b, 0x7c, 0xf5, 0xbc, 0x61, 0x68, + 0x84, 0x3d, 0xe4, 0x1a, 0x6b, 0xbd, 0xd0, 0xe9, 0xc8, 0x3c, 0x26, 0xff, 0x47, + 0xbd, 0x64, 0xb7, 0xe9, 0x3b, 0xf3, 0xad, 0x36, 0x3d, 0x8a, 0x00, 0x3f, 0xbd, + 0x94, 0x41, 0xcf, 0xbc, 0x01, 0xba, 0x55, 0x3d, 0x8c, 0x08, 0x36, 0xbd, 0xa4, + 0x6b, 0x1a, 0x3d, 0x59, 0xfd, 0x83, 0x3d, 0xcc, 0xdd, 0x60, 0xbd, 0x59, 0xc2, + 0xfe, 0xbc, 0xa6, 0x99, 0x2a, 0x3d, 0xbd, 0x45, 0x8b, 0x3d, 0xe2, 0x5e, 0x8c, + 0x3d, 0x18, 0x83, 0x87, 0xbc, 0x10, 0x63, 0xda, 0x3b, 0x58, 0xa1, 0xc2, 0x3c, + 0x78, 0xfa, 0x78, 0x3c, 0xfc, 0x33, 0xf0, 0x3c, 0xc4, 0xab, 0x5b, 0xbd, 0xde, + 0x4b, 0x07, 0x3d, 0x53, 0x76, 0x1b, 0xbd, 0xee, 0xd8, 0x86, 0x3d, 0x7f, 0xd6, + 0x7c, 0xbd, 0x68, 0xb5, 0x8e, 0x3c, 0x49, 0xdd, 0xd5, 0xbc, 0x83, 0x63, 0xed, + 0xbb, 0x4e, 0x00, 0x91, 0xbd, 0x69, 0xce, 0xd5, 0xbb, 0x2f, 0x57, 0x71, 0xbc, + 0x9a, 0xc3, 0x8f, 0xbd, 0x65, 0x27, 0x47, 0x3d, 0x2d, 0x6b, 0x77, 0xbd, 0xdd, + 0x54, 0x43, 0xbc, 0xf7, 0x1f, 0xe8, 0xbc, 0x12, 0x8f, 0x87, 0xbd, 0x4f, 0xcf, + 0x2f, 0x3d, 0x15, 0x51, 0x4b, 0xbd, 0x9d, 0x1f, 0x86, 0x3d, 0x68, 0x35, 0x58, + 0xbd, 0x16, 0xe4, 0x4e, 0xbd, 0xd0, 0x03, 0x91, 0xbd, 0x39, 0xc6, 0x90, 0x3c, + 0xdd, 0xbb, 0x0a, 0xbd, 0x58, 0x1b, 0x33, 0xbd, 0x55, 0x86, 0x91, 0xbd, 0x48, + 0xe7, 0x90, 0xbc, 0xf4, 0x14, 0x3f, 0xbc, 0xc0, 0x75, 0x9e, 0xba, 0x7e, 0x8f, + 0xa8, 0xbc, 0x8c, 0x2b, 0x55, 0x3d, 0x54, 0x4b, 0x70, 0xbd, 0x56, 0x74, 0x52, + 0x3d, 0x6d, 0xf4, 0x02, 0x3b, 0x7d, 0x46, 0x5c, 0x3b, 0x76, 0xf4, 0x0c, 0xbd, + 0xac, 0xa2, 0x1d, 0xbd, 0x5c, 0x63, 0xe2, 0xbc, 0x64, 0x4d, 0x31, 0x3c, 0xf9, + 0x3e, 0x3f, 0x3d, 0xed, 0x12, 0x2c, 0xbd, 0xc8, 0x12, 0xb0, 0xbc, 0x4d, 0x90, + 0x8f, 0x3d, 0x1d, 0xef, 0x89, 0x3d, 0xf0, 0x4f, 0x93, 0xbd, 0x88, 0x79, 0xd8, + 0x3c, 0x74, 0x42, 0x1f, 0xbd, 0xba, 0x43, 0x90, 0x3c, 0xd5, 0x7e, 
0xe3, 0xbc, + 0x71, 0x49, 0x7b, 0xbd, 0x5d, 0x36, 0x16, 0x3d, 0x91, 0xb8, 0x22, 0xbd, 0xd4, + 0x0e, 0x1e, 0x3d, 0xaa, 0x17, 0x2d, 0x3c, 0xca, 0x4d, 0xb9, 0x3b, 0x8a, 0x9d, + 0x01, 0x3d, 0x60, 0xcf, 0xc3, 0xbb, 0xc4, 0xc0, 0x00, 0x3b, 0x6d, 0xeb, 0x09, + 0xbd, 0x88, 0x55, 0x9e, 0xbc, 0x04, 0x54, 0xc3, 0xbc, 0x00, 0x93, 0xf2, 0x3a, + 0xe2, 0x88, 0x6e, 0x3d, 0xa0, 0xdb, 0xd4, 0xbc, 0x12, 0x3b, 0xa4, 0x3b, 0x5d, + 0x20, 0x88, 0x3d, 0xb4, 0xe5, 0xdc, 0xbc, 0x93, 0xf0, 0x70, 0xbc, 0xf6, 0x1a, + 0x31, 0xbd, 0xe0, 0xc3, 0x75, 0x3c, 0xbc, 0x2b, 0x96, 0x3c, 0x5b, 0x81, 0x44, + 0xbd, 0x6e, 0x2f, 0xab, 0xbc, 0x4c, 0x4e, 0x82, 0x3d, 0x6c, 0x17, 0x9b, 0xbc, + 0x70, 0x5a, 0x16, 0xbc, 0x70, 0x5e, 0x10, 0x3c, 0x81, 0xf0, 0x7d, 0xbd, 0x55, + 0xca, 0x3d, 0x3d, 0xca, 0x75, 0xa2, 0xbc, 0x7f, 0xc2, 0xe2, 0xbb, 0xc4, 0x59, + 0x82, 0x3d, 0xbd, 0xde, 0xd0, 0xbc, 0xe6, 0x4c, 0x3a, 0x3d, 0x62, 0xc7, 0x62, + 0x3d, 0x3e, 0xd2, 0xc1, 0xba, 0xeb, 0xae, 0xb3, 0xbb, 0x39, 0xf0, 0xa2, 0x3c, + 0xd0, 0xa2, 0x18, 0xbd, 0x65, 0xea, 0x99, 0x3b, 0xd0, 0x01, 0x8d, 0xbc, 0x34, + 0x0c, 0x84, 0xbd, 0xc3, 0x10, 0x3f, 0xbd, 0xb0, 0x26, 0xc4, 0x3b, 0xde, 0xc4, + 0x2e, 0x3d, 0xb4, 0x3f, 0xe5, 0x3c, 0x80, 0x6d, 0xda, 0x3b, 0xd3, 0x01, 0x8f, + 0x3d, 0x7b, 0x2e, 0x70, 0x3b, 0x95, 0x55, 0x51, 0xbd, 0xc2, 0x13, 0x4a, 0x3d, + 0x70, 0xd8, 0x4a, 0x3d, 0x6d, 0xf3, 0xc7, 0xbb, 0x40, 0x46, 0xe8, 0x3c, 0x71, + 0x53, 0x85, 0x3a, 0xea, 0x87, 0xf9, 0x3c, 0xb0, 0xb0, 0xf5, 0x3c, 0xf2, 0x2a, + 0x58, 0x3d, 0xe8, 0xd7, 0xc4, 0x3c, 0x57, 0xd9, 0xc8, 0x3c, 0xf3, 0x05, 0x79, + 0xbd, 0x9c, 0x0e, 0xf5, 0xbb, 0xcd, 0xaa, 0x1b, 0xbc, 0x42, 0xa2, 0x22, 0x3d, + 0x3e, 0x81, 0xe3, 0x3c, 0x66, 0x13, 0x2a, 0xbd, 0x6d, 0xfd, 0x8f, 0x3d, 0xd3, + 0x64, 0xab, 0x3c, 0x1e, 0x94, 0xba, 0x3c, 0x68, 0x42, 0x45, 0xbd, 0x4c, 0x0e, + 0xaf, 0xbc, 0x90, 0xbf, 0x7e, 0x3d, 0x6f, 0x71, 0x91, 0x3d, 0xc3, 0xb6, 0x80, + 0x3d, 0x3a, 0xbd, 0x32, 0xbd, 0x08, 0x63, 0x11, 0xbc, 0xec, 0xf4, 0x08, 0x3d, + 0x60, 0x5c, 0xcc, 0x3b, 0x66, 0x5b, 0x59, 0xbd, 0xb9, 0xcb, 0x8d, 0xbd, 0xfd, + 0x30, 0x54, 0x3d, 0x2e, 0xaa, 0x0f, 0xbc, 0x80, 0x26, 0x1a, 0xbb, 0x47, 0x43, + 0x19, 0xbd, 0x2c, 0x5d, 0xb8, 0x3c, 0x6c, 0xa6, 0xe8, 0x3c, 0xec, 0x3c, 0xcb, + 0xbc, 0x61, 0x53, 0xa4, 0x3c, 0x68, 0xf1, 0x0a, 0x3c, 0x9c, 0x5f, 0x30, 0x3d, + 0x5b, 0x39, 0xb8, 0xbc, 0xd2, 0x8d, 0x99, 0xbc, 0xe7, 0x1e, 0x31, 0xbd, 0x61, + 0x4e, 0x2c, 0xbd, 0x11, 0xeb, 0xb3, 0xbc, 0x80, 0x2e, 0x0b, 0xbc, 0x57, 0xbf, + 0x75, 0x3c, 0xbb, 0xd3, 0x2b, 0x3d, 0xba, 0xc5, 0x1b, 0x3d, 0x43, 0x78, 0x80, + 0x3d, 0xeb, 0x30, 0x0a, 0x3c, 0xf7, 0xf8, 0x04, 0x3d, 0x1f, 0x88, 0x17, 0xbd, + 0x7c, 0x55, 0xf0, 0xbc, 0x4a, 0x93, 0x3c, 0x3d, 0x7a, 0x12, 0x5c, 0xbd, 0x54, + 0x6b, 0x42, 0xbd, 0xa0, 0x16, 0xd8, 0x3b, 0x20, 0x3e, 0x3b, 0x3b, 0x3c, 0xde, + 0x72, 0xbd, 0x68, 0x37, 0x68, 0xbd, 0x37, 0x55, 0x97, 0xbb, 0x19, 0x7b, 0x43, + 0xbd, 0x82, 0xce, 0x8a, 0xbd, 0xcf, 0xc2, 0x88, 0xbd, 0x30, 0xde, 0xd8, 0x3b, + 0xf1, 0xc1, 0xa9, 0x3c, 0x68, 0x51, 0x2d, 0x3d, 0x76, 0xd5, 0xac, 0x3c, 0xb8, + 0x4b, 0x78, 0xbb, 0x0f, 0x1c, 0x5d, 0xbd, 0xf7, 0x31, 0x25, 0xbd, 0x72, 0x4c, + 0x91, 0x3d, 0x6e, 0x4f, 0x51, 0x3d, 0xb4, 0x9b, 0x21, 0xbd, 0x03, 0x73, 0xdd, + 0xbc, 0x38, 0x49, 0x4f, 0x3c, 0xb8, 0xc7, 0x4f, 0x3d, 0x6a, 0x17, 0x0a, 0xba, + 0xf4, 0x4f, 0xcd, 0x3c, 0x93, 0x14, 0x86, 0xbd, 0xde, 0x1e, 0x31, 0x3c, 0x57, + 0x45, 0xf1, 0x3c, 0x53, 0xc3, 0x7c, 0x3d, 0xc8, 0x1a, 0xd8, 0x3c, 0x85, 0xf4, + 0x8d, 0x3d, 0xf2, 0xaa, 0x46, 0x3d, 0xa6, 0x5c, 0x73, 0x3d, 0xf8, 0x5a, 0x3c, + 0x3d, 0xd0, 0x85, 0xaf, 0x3c, 0x60, 0x1f, 0xa0, 0x3c, 0xef, 0xcb, 0x45, 0xbd, + 0x68, 0xc2, 0x24, 
0x3d, 0x25, 0x65, 0x14, 0x3b, 0x0c, 0x01, 0x67, 0x3d, 0x43, + 0x57, 0x65, 0xbd, 0x50, 0x8f, 0xec, 0x3b, 0x88, 0xf5, 0x16, 0x3d, 0xde, 0xa3, + 0xe2, 0xbc, 0x92, 0x11, 0xfb, 0x3c, 0x35, 0x93, 0x26, 0x3d, 0x96, 0xe4, 0x70, + 0x3d, 0x30, 0xea, 0x40, 0x3c, 0x50, 0x65, 0x37, 0x3c, 0x56, 0xf8, 0x84, 0xbd, + 0x36, 0xc0, 0x8e, 0x3d, 0x58, 0x45, 0x6b, 0xbd, 0x46, 0xcc, 0x5e, 0xbc, 0x41, + 0x2a, 0x4f, 0xbd, 0x5f, 0xce, 0x80, 0xbb, 0xfb, 0x75, 0xae, 0xbc, 0x19, 0xe3, + 0x0b, 0xbd, 0x54, 0x3e, 0x8a, 0x3c, 0x41, 0x54, 0xb7, 0x39, 0x8f, 0xb4, 0x80, + 0x3d, 0xfb, 0x42, 0x00, 0x3d, 0x5e, 0x0b, 0x19, 0xbd, 0x5d, 0x03, 0xb5, 0x3c, + 0xd8, 0x30, 0x78, 0x3c, 0x3e, 0xef, 0x90, 0xbc, 0xe0, 0x2c, 0xdb, 0x3b, 0x0a, + 0x5a, 0xfc, 0xbc, 0x24, 0x7e, 0x90, 0xbd, 0x1a, 0xd4, 0x1b, 0x3d, 0x10, 0x0a, + 0x87, 0x3d, 0xa3, 0x8c, 0x3b, 0xbd, 0x3f, 0x54, 0xda, 0xbc, 0x0f, 0x59, 0xd8, + 0x3b, 0xbe, 0xea, 0xea, 0x3c, 0x39, 0x2d, 0x7e, 0xbd, 0x19, 0xa0, 0x73, 0xba, + 0x3c, 0xc5, 0x60, 0xbd, 0x57, 0x9e, 0x70, 0xbd, 0xdc, 0x65, 0xfb, 0x3b, 0xbc, + 0x13, 0x32, 0xbd, 0xa4, 0xd0, 0x81, 0xbd, 0x5f, 0x74, 0x85, 0x3d, 0x1a, 0xf5, + 0x58, 0x3d, 0xa3, 0x35, 0x7c, 0x3d, 0xb3, 0x3d, 0x87, 0x3c, 0x83, 0xc6, 0x6b, + 0x3d, 0xff, 0xe3, 0x8e, 0x3d, 0x97, 0xab, 0x01, 0xbd, 0x7c, 0xd4, 0x85, 0x3d, + 0xa0, 0xbd, 0x83, 0xbc, 0x04, 0x12, 0x41, 0x3d, 0x9e, 0x3d, 0x57, 0xbd, 0xa2, + 0x37, 0xc1, 0x3c, 0xf2, 0xa6, 0x81, 0xbd, 0xe0, 0xde, 0xe6, 0xbc, 0xa0, 0x4b, + 0xd4, 0xbb, 0xe8, 0x33, 0xd8, 0xbc, 0x9a, 0x4c, 0x55, 0x3d, 0x16, 0xc0, 0x91, + 0xbd, 0x28, 0xa0, 0x1e, 0x3c, 0xfc, 0xc7, 0x5f, 0xbc, 0xc1, 0x5e, 0x95, 0x3c, + 0xc4, 0x85, 0xa0, 0x3c, 0xf5, 0x01, 0xd7, 0xbc, 0xf3, 0x15, 0xcc, 0xbb, 0x52, + 0x0c, 0x2c, 0xbd, 0xea, 0xdf, 0x7b, 0x3d, 0x06, 0xe0, 0x26, 0xbc, 0x7a, 0x9a, + 0x8d, 0xbd, 0x9c, 0xdb, 0xac, 0x3c, 0x4b, 0xfa, 0x2f, 0x3d, 0xe4, 0x93, 0xf1, + 0x3c, 0x89, 0xe5, 0x91, 0xbd, 0xda, 0x41, 0x28, 0xbd, 0x52, 0x6f, 0x58, 0x3d, + 0x89, 0x2f, 0x43, 0xbd, 0x74, 0xe4, 0x00, 0xbd, 0x59, 0xd4, 0x26, 0xbd, 0x97, + 0x79, 0xa9, 0x3c, 0xb0, 0x62, 0x9f, 0xb9, 0xbc, 0xac, 0x04, 0x3d, 0x5c, 0xce, + 0x3d, 0xbd, 0x15, 0x58, 0x67, 0xbd, 0x0a, 0xce, 0xf4, 0xbc, 0x3a, 0x8f, 0x01, + 0xbd, 0x50, 0xd2, 0x73, 0xbc, 0x8e, 0x54, 0x16, 0xbc, 0xea, 0xd7, 0x3c, 0x3d, + 0xf0, 0xbe, 0xd7, 0x3c, 0x1a, 0x3d, 0x82, 0xbd, 0xba, 0x91, 0x2f, 0x3d, 0x10, + 0xb0, 0x92, 0xbd, 0xf8, 0x36, 0x1c, 0x3d, 0x50, 0x2a, 0x8f, 0xbd, 0xb0, 0x09, + 0x5e, 0x3d, 0x3b, 0xc8, 0x8f, 0xba, 0xf4, 0xce, 0x92, 0xbd, 0x38, 0xc4, 0x78, + 0xbd, 0xe0, 0x8c, 0x5c, 0xbc, 0x98, 0x6b, 0x8b, 0x3d, 0x16, 0x7f, 0x4a, 0x3d, + 0x18, 0xc0, 0xfe, 0xbc, 0x66, 0xbb, 0x4b, 0xbd, 0x90, 0xb6, 0xe1, 0x3b, 0x98, + 0xca, 0x8c, 0x3c, 0x05, 0xfe, 0xec, 0xbc, 0x58, 0x1c, 0x17, 0x3d, 0x37, 0x17, + 0x80, 0x3d, 0x41, 0x6e, 0x14, 0x3d, 0xee, 0x95, 0xcb, 0xbb, 0x1a, 0x56, 0x1f, + 0xbd, 0xae, 0xc7, 0x2c, 0x3c, 0x28, 0x3a, 0x80, 0x3b, 0x00, 0x13, 0x76, 0xbc, + 0x69, 0xaf, 0x5e, 0xbc, 0x80, 0xcc, 0x02, 0xbd, 0xa8, 0xea, 0x04, 0xba, 0xb8, + 0xae, 0x09, 0x3d, 0xb3, 0x0d, 0x8d, 0x3d, 0xc0, 0x22, 0x84, 0xba, 0x04, 0x62, + 0x5c, 0xbd, 0xd8, 0x28, 0x09, 0x3c, 0x68, 0xd3, 0x41, 0x3c, 0x62, 0x52, 0x1e, + 0x3d, 0x99, 0x42, 0x03, 0xbd, 0x3b, 0x4b, 0xd9, 0xba, 0x68, 0x5e, 0x32, 0xbd, + 0x8b, 0x9e, 0x26, 0xbb, 0x9c, 0xd7, 0xcd, 0x3c, 0x4e, 0xdc, 0x16, 0x3d, 0x42, + 0x1a, 0x07, 0x3d, 0xbb, 0xa6, 0x96, 0xbb, 0xf4, 0x47, 0x59, 0xbc, 0x13, 0xa3, + 0xa1, 0xbc, 0x8f, 0x58, 0x0f, 0xbc, 0x88, 0xd1, 0x1d, 0xbd, 0xe0, 0x0f, 0xfb, + 0x3c, 0x81, 0xd3, 0x90, 0x3d, 0xe0, 0x4b, 0x4f, 0xbd, 0x3f, 0x4a, 0x80, 0x3d, + 0x3a, 0x63, 0x67, 0x3d, 0xe2, 0xee, 0x1e, 0x3c, 
0xf8, 0x65, 0xdd, 0x3b, 0x1c, + 0x30, 0x09, 0xbd, 0xe9, 0x2f, 0xdb, 0xbc, 0x94, 0x36, 0x55, 0xbd, 0x2c, 0xa4, + 0x95, 0x3a, 0x78, 0x24, 0x2f, 0x3d, 0xc7, 0x9c, 0x44, 0xbd, 0xb5, 0x09, 0x10, + 0xbd, 0x7d, 0x10, 0x49, 0xbd, 0x60, 0xd3, 0x43, 0x3c, 0xef, 0x67, 0x05, 0xbd, + 0x0a, 0x1d, 0x6c, 0x3d, 0xaa, 0x4d, 0x0c, 0x3d, 0x84, 0xfc, 0x8a, 0xbc, 0x0d, + 0xf7, 0x65, 0xbd, 0x5c, 0x71, 0x93, 0xbc, 0xd8, 0xe9, 0x2a, 0x3d, 0x1d, 0xd9, + 0xc6, 0xbc, 0xd6, 0xeb, 0x70, 0xbd, 0xef, 0x92, 0x41, 0xbd, 0x4a, 0xd3, 0x83, + 0xbd, 0x1e, 0xf1, 0x74, 0x3b, 0xa3, 0xb4, 0x1e, 0xbc, 0x4f, 0x0c, 0x12, 0x3d, + 0x69, 0xf6, 0x25, 0x3d, 0x5a, 0x52, 0x35, 0x3d, 0xb5, 0x14, 0x37, 0x3d, 0x2b, + 0xf9, 0x2d, 0xbd, 0xb8, 0xc6, 0x12, 0x3d, 0x2e, 0xeb, 0xf8, 0xbb, 0x31, 0xe0, + 0x43, 0xbd, 0x37, 0x68, 0xf4, 0x3b, 0x4e, 0xd7, 0x55, 0xbd, 0xf2, 0x8f, 0x06, + 0x3d, 0xa3, 0xe0, 0x8a, 0x3d, 0x47, 0xcb, 0x91, 0x3d, 0xc3, 0xaa, 0x1c, 0xbd, + 0x43, 0x44, 0x24, 0x3d, 0x5a, 0xcc, 0x30, 0xbd, 0x72, 0xbe, 0x27, 0x3c, 0xfc, + 0xd5, 0xbe, 0x3c, 0x34, 0x0e, 0x3f, 0x3d, 0xdc, 0x3d, 0x7b, 0xbc, 0x64, 0xe1, + 0xa9, 0x3c, 0x00, 0x61, 0x80, 0x3b, 0x19, 0xd4, 0x82, 0xbd, 0x41, 0xef, 0x8c, + 0x3d, 0x90, 0x50, 0x11, 0xbd, 0x0d, 0x32, 0x8d, 0x3d, 0x56, 0x78, 0x5f, 0x3c, + 0x71, 0x44, 0x6c, 0x3d, 0x21, 0xe4, 0x22, 0x3d, 0x31, 0xfd, 0xb4, 0xbb, 0xcc, + 0x10, 0x7e, 0x3c, 0x7a, 0xb4, 0x06, 0x3d, 0xc5, 0xde, 0x22, 0xbc, 0xd2, 0x57, + 0xfe, 0x3c, 0x30, 0x95, 0x81, 0xbd, 0x00, 0x6d, 0xde, 0x39, 0xfd, 0x2b, 0x3f, + 0x3d, 0x8f, 0xe7, 0xf4, 0x3b, 0x2b, 0xf8, 0xa3, 0xbc, 0xcf, 0x7c, 0x4e, 0x3d, + 0x86, 0xee, 0xf7, 0x3c, 0x20, 0x5a, 0x22, 0xbb, 0x1a, 0xa9, 0x62, 0xbd, 0x0f, + 0x24, 0x7f, 0x3d, 0x74, 0x7e, 0x00, 0x3d, 0x24, 0xd2, 0xcb, 0xbc, 0x06, 0xc6, + 0x44, 0xbd, 0xe1, 0x53, 0xa3, 0x3c, 0x7d, 0x24, 0x08, 0x3d, 0xf6, 0x9f, 0x23, + 0xbd, 0x3f, 0xb0, 0x84, 0xbd, 0xb0, 0xbb, 0xbc, 0x3c, 0x74, 0x6c, 0x22, 0xbc, + 0x0b, 0x32, 0x50, 0xbd, 0x81, 0x6f, 0x8b, 0x3d, 0x98, 0x37, 0xc3, 0x3c, 0xfd, + 0x30, 0x08, 0xbd, 0x11, 0x42, 0x01, 0xbd, 0xd6, 0x91, 0x16, 0x3c, 0x6e, 0xf1, + 0xc2, 0x3a, 0xed, 0x4b, 0x8c, 0xbd, 0x51, 0x70, 0x34, 0xbd, 0x2a, 0x7e, 0x1c, + 0x3b, 0x5a, 0x96, 0xcd, 0x37, 0x9a, 0x8e, 0xf8, 0x3c, 0xce, 0x8a, 0x6d, 0x3d, + 0x62, 0xb2, 0x38, 0x3d, 0x70, 0x0a, 0xbe, 0xbc, 0xd0, 0x3f, 0x66, 0xbc, 0xf4, + 0xfe, 0x24, 0x3d, 0xbe, 0xf9, 0x89, 0x3c, 0xa0, 0x2b, 0xc1, 0xbc, 0x02, 0x6d, + 0x41, 0x3c, 0xa4, 0x00, 0x14, 0xbd, 0xbc, 0xa1, 0xd1, 0x3b, 0xbc, 0x27, 0xa6, + 0x3c, 0xc8, 0x08, 0xfd, 0xbc, 0xa1, 0x0e, 0x9c, 0xbc, 0xa1, 0x28, 0x07, 0xbc, + 0x33, 0xf3, 0x71, 0x3c, 0x96, 0xed, 0x1f, 0x3d, 0xf6, 0x6d, 0x5e, 0xbd, 0x30, + 0x7c, 0x12, 0xbc, 0xf2, 0xaf, 0x7b, 0x3d, 0x56, 0xfa, 0x36, 0xbd, 0x7a, 0x6f, + 0x3a, 0x3d, 0x40, 0x65, 0x8f, 0x3c, 0x2c, 0xa1, 0x4f, 0xbc, 0x80, 0x0f, 0x7b, + 0x3b, 0xaf, 0xc3, 0xf2, 0x3c, 0xae, 0x39, 0x8a, 0xbd, 0xd5, 0xf6, 0x42, 0xbd, + 0x12, 0x9c, 0x33, 0x3d, 0x88, 0x27, 0x4d, 0x3d, 0x61, 0x05, 0x1e, 0xbd, 0x02, + 0xcd, 0x04, 0xbd, 0xe8, 0x6f, 0xe1, 0x3c, 0xf8, 0xd2, 0x73, 0x3d, 0xb9, 0xa3, + 0x61, 0xbd, 0x64, 0x01, 0x92, 0x3c, 0x4f, 0x8e, 0x21, 0xbc, 0x8b, 0xf5, 0x18, + 0x3d, 0xce, 0x3b, 0x77, 0x3d, 0x8d, 0x0e, 0x97, 0x3a, 0x30, 0xfc, 0x85, 0x3c, + 0x1f, 0x24, 0x8e, 0x3a, 0xca, 0xdd, 0x4e, 0x3d, 0x5f, 0x7c, 0xfe, 0x3b, 0x84, + 0xdf, 0x2d, 0x3d, 0x7a, 0x5c, 0x8c, 0x3d, 0x90, 0xf3, 0x79, 0xbc, 0x4f, 0x99, + 0x17, 0xbd, 0x30, 0xb1, 0xd2, 0xbb, 0x1c, 0x5a, 0x32, 0xbd, 0xd4, 0x8c, 0xd9, + 0x3c, 0x08, 0x56, 0xec, 0x3c, 0xf0, 0xcf, 0x64, 0xbd, 0xf0, 0x2a, 0xf1, 0xbb, + 0x28, 0x09, 0x0c, 0xbc, 0x0f, 0xf7, 0x8d, 0xbd, 0x86, 0x8f, 0x59, 0xbd, 0xfa, + 
0xbf, 0x52, 0xbd, 0x76, 0x65, 0x4c, 0xbd, 0x79, 0xaa, 0x16, 0xbd, 0x9e, 0x6f, + 0xa7, 0xbc, 0xac, 0x9e, 0x8f, 0xbd, 0x5a, 0xfc, 0x7b, 0xbd, 0x90, 0xe3, 0x20, + 0x3d, 0xd0, 0x2b, 0x81, 0x3d, 0xc1, 0xbf, 0x85, 0x3d, 0x48, 0x79, 0x44, 0x3d, + 0x3e, 0x7b, 0x6d, 0x3d, 0x2b, 0x83, 0x11, 0x3d, 0x45, 0x84, 0x38, 0x3d, 0xbd, + 0x6d, 0x47, 0xb8, 0xe9, 0x7c, 0x29, 0xbd, 0x51, 0xd2, 0xc9, 0x3c, 0x77, 0x53, + 0xf0, 0x3b, 0xca, 0xc2, 0x17, 0xbd, 0xb2, 0xbc, 0x13, 0x3d, 0xbc, 0x58, 0xf9, + 0x3c, 0xed, 0x65, 0xed, 0x3c, 0x05, 0xdd, 0x8e, 0xbc, 0x0f, 0xa5, 0x96, 0xbc, + 0xd2, 0x96, 0x00, 0x3d, 0x90, 0xfe, 0x5c, 0x3d, 0x1f, 0x18, 0x90, 0xbd, 0x68, + 0xbb, 0xc8, 0x3c, 0x86, 0xae, 0xbb, 0xbc, 0x8a, 0x69, 0xea, 0xbc, 0x28, 0x6a, + 0x7c, 0x3c, 0x32, 0x5f, 0x70, 0x3d, 0xdd, 0x12, 0xd4, 0xba, 0xca, 0x54, 0x56, + 0xbd, 0x46, 0x94, 0x3f, 0xbd, 0x28, 0x3e, 0xa6, 0x3c, 0x93, 0x06, 0x43, 0xbd, + 0x58, 0xc7, 0xf0, 0x3c, 0x5d, 0x14, 0xa9, 0xbb, 0x58, 0x98, 0xc8, 0xbc, 0x89, + 0x34, 0x8d, 0x3d, 0x39, 0x90, 0x7b, 0x3d, 0x66, 0x18, 0x63, 0x3d, 0x60, 0x47, + 0x4d, 0x3b, 0x1d, 0x50, 0x6c, 0xbd, 0x55, 0x74, 0x27, 0x3d, 0x11, 0xf1, 0x66, + 0xbd, 0x14, 0xe6, 0x90, 0x3d, 0xdf, 0x99, 0x88, 0x3d, 0x9b, 0xc6, 0x67, 0x3d, + 0x16, 0xca, 0xd3, 0xbc, 0x79, 0xad, 0x87, 0x3d, 0x52, 0x56, 0x7b, 0x3d, 0x6e, + 0x19, 0x14, 0xbc, 0x12, 0x02, 0x26, 0x3d, 0xaf, 0x26, 0x1b, 0xbd, 0x5e, 0x09, + 0x8c, 0xbd, 0xa2, 0x3c, 0x5f, 0x3d, 0x60, 0x7e, 0x7d, 0xbd, 0x10, 0xc0, 0x85, + 0xbd, 0x70, 0x15, 0xc4, 0x3b, 0xe0, 0xfa, 0xf8, 0x3b, 0xe6, 0x2e, 0x00, 0x3d, + 0xf7, 0xd5, 0x1f, 0x3d, 0x48, 0x70, 0x60, 0x3d, 0x2a, 0x3a, 0xed, 0xbc, 0xfd, + 0x05, 0x26, 0xbc, 0x67, 0xf0, 0xee, 0x3a, 0x7e, 0x6e, 0x46, 0x3d, 0x57, 0x87, + 0x90, 0x3d, 0x22, 0xdb, 0x65, 0xbd, 0x70, 0xad, 0x7a, 0x3c, 0xa6, 0xb5, 0xc3, + 0x3c, 0xd4, 0xfa, 0x12, 0x3c, 0x4e, 0x84, 0x2f, 0xbd, 0x00, 0x37, 0x63, 0xbb, + 0xfb, 0x25, 0x41, 0xbc, 0x38, 0xa5, 0x84, 0x3d, 0x8a, 0xd7, 0x5a, 0xbd, 0x11, + 0xf7, 0xd6, 0xbb, 0xd1, 0x99, 0x22, 0xbd, 0xc8, 0xfc, 0x83, 0x3c, 0xd8, 0x91, + 0xd8, 0xbc, 0xa6, 0xf0, 0x3f, 0xbd, 0x08, 0x4d, 0x3b, 0x3d, 0xdd, 0x56, 0x4c, + 0xbd, 0xeb, 0x23, 0x8d, 0xbd, 0x23, 0x09, 0xcc, 0x3c, 0xbb, 0x3d, 0x8a, 0x3d, + 0x47, 0xb9, 0x75, 0xbd, 0x69, 0x75, 0x82, 0x3d, 0x30, 0x78, 0x86, 0x3c, 0x0c, + 0xc2, 0xd6, 0xbc, 0x2a, 0x22, 0x51, 0x3d, 0x9c, 0xfa, 0x3b, 0xbc, 0x00, 0x4b, + 0xbf, 0x39, 0x10, 0x58, 0xe6, 0xbb, 0x22, 0xa4, 0x47, 0x3d, 0x8b, 0xd1, 0x6f, + 0x3c, 0xf3, 0x8b, 0x23, 0xbd, 0xad, 0x67, 0x71, 0xbd, 0xa4, 0xbb, 0x71, 0xbc, + 0x68, 0x9d, 0x36, 0x3d, 0x79, 0xda, 0x00, 0x3d, 0x30, 0x88, 0x15, 0x3d, 0xc4, + 0x55, 0xab, 0x3c, 0xd0, 0xbe, 0x4f, 0x3d, 0x43, 0xa2, 0x8b, 0x3d, 0xc0, 0x0b, + 0x27, 0xbc, 0xfe, 0x35, 0x91, 0xbd, 0x27, 0x33, 0x5b, 0xbc, 0xc5, 0x00, 0x91, + 0xb9, 0x3e, 0x30, 0x74, 0xbd, 0x1c, 0x92, 0x70, 0xbd, 0xfe, 0x13, 0x56, 0xbb, + 0x63, 0x1b, 0x84, 0x3d, 0x24, 0x9a, 0xa1, 0x3c, 0x93, 0x78, 0x83, 0xbc, 0x29, + 0xb2, 0xce, 0x3c, 0x05, 0x6f, 0x8f, 0x3d, 0xe8, 0xb4, 0x3b, 0xbd, 0x12, 0x90, + 0x8e, 0x3d, 0x58, 0x6a, 0x76, 0xbd, 0xee, 0x8f, 0x90, 0xbd, 0x1e, 0x98, 0xde, + 0xbc, 0x88, 0x22, 0x40, 0x3d, 0x1b, 0x7f, 0x87, 0xbd, 0x3e, 0x25, 0x5e, 0x3d, + 0x38, 0xf3, 0x0c, 0xbc, 0x77, 0x6a, 0x8b, 0xbd, 0x0c, 0x98, 0x08, 0xbc, 0xbd, + 0x52, 0xf6, 0x3c, 0x2d, 0x2f, 0x03, 0xbd, 0x15, 0xbf, 0x91, 0x3d, 0xba, 0x41, + 0xef, 0xbc, 0xdf, 0x02, 0xab, 0xbc, 0xe4, 0xac, 0x7e, 0x3d, 0x9e, 0x8c, 0x51, + 0x3d, 0xcc, 0x12, 0x01, 0x3d, 0xfc, 0xfb, 0x1b, 0xbd, 0x75, 0x2b, 0x81, 0xbd, + 0x6a, 0xbf, 0x20, 0x3d, 0xbb, 0x3c, 0x77, 0xbd, 0xae, 0x2f, 0x74, 0xbd, 0x58, + 0x94, 0x53, 0xbd, 0xa0, 0xcf, 
0xd4, 0x3c, 0x68, 0x51, 0xd1, 0x3c, 0x1c, 0x40, + 0x22, 0xbd, 0x86, 0x62, 0x04, 0x3d, 0x9c, 0x10, 0x02, 0xbd, 0x5d, 0x31, 0x49, + 0xbb, 0x5d, 0x8e, 0xf5, 0xbc, 0xb8, 0xef, 0x44, 0xbc, 0x06, 0xe5, 0x50, 0xbd, + 0xe6, 0x33, 0x40, 0xbd, 0x20, 0x2e, 0x39, 0x3b, 0x00, 0x2f, 0x96, 0xbb, 0x75, + 0x2e, 0x80, 0xbd, 0x2c, 0x9f, 0x4e, 0x3d, 0xd0, 0x40, 0xf6, 0x3b, 0x2e, 0x56, + 0x8e, 0x3d, 0xcf, 0x00, 0x15, 0x3d, 0xae, 0x5d, 0xc7, 0x3b, 0x44, 0x47, 0x05, + 0x3d, 0x80, 0x19, 0x71, 0xbb, 0x8c, 0xce, 0x87, 0xbd, 0xd2, 0x30, 0x78, 0xbd, + 0xcc, 0x7b, 0x14, 0xbd, 0xf4, 0xb8, 0x91, 0xbd, 0xbe, 0x76, 0x64, 0x3d, 0xf9, + 0x7e, 0x80, 0x3d, 0xda, 0xf8, 0x13, 0xbd, 0x92, 0xd0, 0x11, 0xbd, 0x03, 0x64, + 0x55, 0xbc, 0x50, 0x1a, 0xe8, 0xbc, 0x97, 0xeb, 0x5e, 0xbd, 0x7c, 0xf8, 0x90, + 0x3d, 0xc4, 0x26, 0x4b, 0x3d, 0xc2, 0x04, 0x7d, 0xbd, 0x25, 0x41, 0x14, 0x3b, + 0xac, 0xc2, 0xdf, 0x3c, 0xda, 0x60, 0xd3, 0xbc, 0x1b, 0x00, 0x45, 0xbd, 0x7e, + 0x09, 0xac, 0xbc, 0x28, 0x65, 0xcb, 0xbc, 0xe6, 0xd0, 0xb2, 0xbc, 0xb8, 0xdf, + 0xae, 0x3c, 0xc8, 0xb7, 0xca, 0x3c, 0x98, 0x50, 0xa1, 0x3c, 0x5c, 0xa2, 0xa0, + 0xbc, 0x8c, 0x18, 0x56, 0x3d, 0xea, 0x98, 0x8e, 0xbd, 0xb5, 0xba, 0x49, 0x3b, + 0xff, 0x2b, 0xaf, 0x3c, 0x91, 0xf6, 0x49, 0xbd, 0x0a, 0x19, 0x4d, 0x3d, 0xa1, + 0x7e, 0x69, 0xbd, 0x6c, 0x77, 0x3e, 0xbc, 0xa0, 0x00, 0x6e, 0x3d, 0x81, 0xc6, + 0xb1, 0x3b, 0x8b, 0xbf, 0x40, 0xbd, 0x5e, 0x71, 0xf5, 0xbc, 0x74, 0x2c, 0x96, + 0xbc, 0x3d, 0x0c, 0x8b, 0xbd, 0x45, 0x9a, 0x8a, 0xbd, 0xdb, 0x49, 0xcb, 0x3c, + 0x9b, 0x5b, 0x10, 0x3d, 0xf5, 0x79, 0x45, 0x3d, 0x5a, 0x50, 0x86, 0xbd, 0xf9, + 0x2f, 0x7c, 0xbd, 0xf6, 0x3d, 0x19, 0xbd, 0x54, 0x10, 0x0c, 0x3b, 0xaf, 0x59, + 0x27, 0xbd, 0x1f, 0x75, 0x78, 0x3d, 0x10, 0xb2, 0x9a, 0xbc, 0xc3, 0xb1, 0x99, + 0xbc, 0xb4, 0x08, 0xac, 0x3c, 0x15, 0x41, 0x86, 0x3d, 0xc0, 0x2d, 0x46, 0xbb, + 0xc4, 0x49, 0x56, 0xbc, 0xef, 0x2e, 0x7b, 0xbd, 0x6c, 0xee, 0x14, 0x3d, 0x70, + 0xe7, 0x9c, 0x3c, 0x78, 0x7e, 0xfb, 0xbc, 0xf7, 0x06, 0x51, 0xbd, 0x52, 0xd4, + 0x1a, 0xbd, 0xb0, 0x2b, 0xeb, 0xbc, 0xad, 0xad, 0x4e, 0xbd, 0xa4, 0x7c, 0xe3, + 0x3c, 0x18, 0xa1, 0xd8, 0xbc, 0x6e, 0xa6, 0x8f, 0xbd, 0x79, 0x0d, 0xb7, 0xba, + 0xb2, 0x10, 0x10, 0x3d, 0xe6, 0xcf, 0x52, 0x3d, 0x8e, 0x88, 0x35, 0x3d, 0xdd, + 0x92, 0x8d, 0x3d, 0x54, 0x69, 0x83, 0xbc, 0xab, 0xa9, 0x88, 0xbd, 0xe0, 0xa7, + 0x1c, 0xbb, 0x86, 0x10, 0x2c, 0xbd, 0x24, 0xde, 0x18, 0x3d, 0x4a, 0x04, 0x87, + 0xbd, 0x42, 0x3c, 0x16, 0xbd, 0x62, 0x25, 0x90, 0xbd, 0xce, 0x01, 0x64, 0xbd, + 0x2c, 0x76, 0x6f, 0xbd, 0xd2, 0x15, 0x0b, 0xbd, 0x45, 0x72, 0x73, 0x3b, 0xeb, + 0x46, 0x02, 0xbd, 0x05, 0x12, 0x1c, 0xbd, 0xb8, 0x16, 0x22, 0xbd, 0xe5, 0x22, + 0x89, 0x3d, 0x8c, 0x8a, 0xf4, 0x3c, 0x40, 0x6b, 0xe4, 0x3a, 0x5c, 0xe2, 0x70, + 0xbd, 0x56, 0x08, 0x67, 0xbd, 0x5b, 0xec, 0x4d, 0x3d, 0xba, 0x4d, 0x2a, 0xbd, + 0xb9, 0x55, 0xa4, 0xbc, 0xb7, 0xd7, 0x39, 0x3d, 0xa0, 0x88, 0xfe, 0x3c, 0xbf, + 0x7d, 0x6b, 0xbd, 0xcd, 0xdf, 0xe3, 0xbc, 0x26, 0xa0, 0x3e, 0x3d, 0x19, 0x4b, + 0x17, 0x3d, 0x54, 0x84, 0xa7, 0xbc, 0x78, 0x9a, 0x6a, 0xbd, 0x80, 0xcc, 0xa7, + 0x3c, 0x58, 0x48, 0x3a, 0x3d, 0xd9, 0x9a, 0xe3, 0xbc, 0xe0, 0xa2, 0xb8, 0x3c, + 0x3f, 0x32, 0x4d, 0x3d, 0x8e, 0xa6, 0x80, 0xbc, 0x0f, 0xfc, 0xd6, 0xbb, 0x40, + 0x70, 0x8b, 0xbd, 0xe3, 0xa3, 0xf6, 0xbb, 0x40, 0x26, 0x33, 0xbb, 0x43, 0xb2, + 0x01, 0xbd, 0x2e, 0xf9, 0x27, 0xbd, 0x6c, 0xcf, 0x54, 0x3c, 0xae, 0xca, 0x4d, + 0x3c, 0x6e, 0x2d, 0x1d, 0x3a, 0x04, 0xda, 0x94, 0xbc, 0x2c, 0x2b, 0xc6, 0x3c, + 0x59, 0xc8, 0x1a, 0xbd, 0x80, 0x56, 0xcb, 0x3b, 0xf4, 0xce, 0xa1, 0x3c, 0x84, + 0xdd, 0xeb, 0x3c, 0x95, 0x36, 0x83, 0xbd, 0x60, 0xeb, 0x47, 
0x3d, 0x90, 0xf8, + 0x63, 0x3d, 0x8a, 0xc4, 0x6a, 0xbc, 0x40, 0x25, 0xa9, 0x3b, 0x7a, 0xfc, 0x65, + 0x3d, 0xe2, 0xcd, 0x33, 0x3d, 0x69, 0x80, 0xe5, 0xbc, 0xf7, 0xc5, 0x42, 0xbc, + 0x17, 0xf4, 0x31, 0xbd, 0xbe, 0xb3, 0x79, 0x3d, 0xff, 0xfc, 0x6c, 0x3d, 0xc5, + 0x04, 0x7d, 0xbc, 0xd9, 0x4f, 0x8e, 0x3d, 0xfe, 0xd3, 0x86, 0xbd, 0xcd, 0xeb, + 0x3f, 0x3d, 0xd8, 0x90, 0x2e, 0xbd, 0x56, 0x17, 0xbf, 0x3c, 0xbb, 0x23, 0x83, + 0xbd, 0x69, 0x4a, 0x43, 0x3d, 0x0a, 0x76, 0x5e, 0xbd, 0xee, 0x69, 0x8d, 0x3d, + 0x75, 0xda, 0x1c, 0x3c, 0xe8, 0xf7, 0xe0, 0xbc, 0x53, 0xbe, 0xda, 0xb8, 0xc2, + 0x03, 0x2e, 0xbd, 0xe4, 0xa0, 0x38, 0xbc, 0xbc, 0x5e, 0x3b, 0xbd, 0xfc, 0xfc, + 0xb7, 0x3c, 0xd4, 0xfb, 0x13, 0xbd, 0xf6, 0x8c, 0x44, 0x3d, 0x70, 0x13, 0x9d, + 0x3c, 0xf8, 0xb8, 0x11, 0xbc, 0xcc, 0x9b, 0x3b, 0xbd, 0xf7, 0x18, 0xe4, 0xbc, + 0x89, 0xc3, 0x31, 0x3d, 0xde, 0x7c, 0x32, 0xbd, 0x3c, 0xc7, 0x97, 0x3c, 0x2e, + 0xc0, 0xb8, 0xbc, 0xa2, 0xfe, 0x29, 0xbd, 0x17, 0xb2, 0x35, 0xbd, 0xaa, 0x83, + 0xdd, 0x3c, 0x1e, 0xfa, 0x83, 0x3d, 0xc6, 0x4c, 0x16, 0x3d, 0xfd, 0x0f, 0x29, + 0x3d, 0x2d, 0x90, 0xac, 0x3b, 0xfe, 0xe5, 0xc8, 0x3b, 0xac, 0x11, 0xc7, 0xbc, + 0x2d, 0xf3, 0xfa, 0x3c, 0x2a, 0x75, 0x81, 0xbd, 0x2d, 0x84, 0xb4, 0x3c, 0xfd, + 0xad, 0x66, 0xbc, 0xaa, 0x80, 0x2a, 0xbd, 0x58, 0x82, 0x8c, 0x3d, 0x75, 0x06, + 0x78, 0x3d, 0x1b, 0xdd, 0x21, 0xbc, 0x1c, 0x40, 0x38, 0x3d, 0xe0, 0xdc, 0x6e, + 0x3d, 0x50, 0xb8, 0x32, 0xbc, 0x80, 0x13, 0x4f, 0xbb, 0x32, 0x50, 0x6c, 0x3d, + 0xce, 0x1b, 0xf1, 0xbc, 0xd8, 0x20, 0x02, 0x3d, 0x43, 0x68, 0xa2, 0x3c, 0x9a, + 0x6c, 0x29, 0xbd, 0x8d, 0x90, 0x22, 0xbd, 0x14, 0xff, 0xe6, 0xbb, 0xb8, 0xcf, + 0xc1, 0x3c, 0xa6, 0x3b, 0x4a, 0x3d, 0xac, 0xad, 0x11, 0x3d, 0x60, 0x19, 0xc9, + 0x3c, 0x55, 0xae, 0xf1, 0xbc, 0x3d, 0xc0, 0x23, 0xbd, 0xa3, 0x00, 0xcd, 0xbb, + 0x44, 0x9e, 0x17, 0x3d, 0xc0, 0x31, 0xe2, 0x3a, 0x30, 0xdf, 0xf4, 0x3c, 0x31, + 0x09, 0x92, 0xbc, 0xa8, 0xbd, 0x66, 0x3c, 0xa5, 0x06, 0x4f, 0x3c, 0xdc, 0x2e, + 0x92, 0xbd, 0xfb, 0x54, 0x87, 0xb9, 0x9b, 0x34, 0x1f, 0x3d, 0xd8, 0xf7, 0xa7, + 0xbb, 0xff, 0x1d, 0x62, 0xbd, 0xe0, 0xf8, 0x3c, 0x3d, 0x85, 0x58, 0x8f, 0xbd, + 0x75, 0xf9, 0x62, 0xbd, 0xef, 0xf5, 0x7a, 0xbd, 0x58, 0x32, 0x86, 0x3d, 0x90, + 0x17, 0x29, 0x3c, 0x64, 0xcc, 0x4a, 0xbd, 0xf0, 0x07, 0xc1, 0xbc, 0x72, 0xdc, + 0x64, 0xbd, 0x68, 0x3e, 0x2e, 0x3c, 0x38, 0x6d, 0x60, 0xbd, 0x46, 0x1f, 0x59, + 0x3d, 0xd0, 0xa7, 0x3e, 0x3d, 0x77, 0x1d, 0x49, 0x3d, 0xcb, 0xed, 0x7f, 0xbd, + 0xd8, 0x47, 0x40, 0x3c, 0x00, 0xf0, 0xee, 0x39, 0xcc, 0xea, 0x57, 0x3d, 0x10, + 0x1d, 0x8a, 0xbd, 0xb9, 0x55, 0x5f, 0xbd, 0x17, 0x3c, 0x66, 0xbc, 0x02, 0xb8, + 0x06, 0xbd, 0x5f, 0xfb, 0x16, 0xbd, 0x58, 0x15, 0x8c, 0x3d, 0x18, 0x99, 0x5f, + 0x3d, 0x5f, 0x73, 0xb3, 0xbc, 0x61, 0x73, 0x63, 0x3d, 0x61, 0xf2, 0x7b, 0xbc, + 0xbd, 0x2b, 0xad, 0x3a, 0xda, 0x99, 0x5c, 0xbd, 0x81, 0xd1, 0xd0, 0x3c, 0xf0, + 0xf9, 0xb0, 0x3c, 0x84, 0x54, 0x68, 0x3c, 0x24, 0x10, 0x84, 0x3d, 0x4d, 0xec, + 0xa2, 0x3b, 0xd3, 0xab, 0x1e, 0xbd, 0xbd, 0x4d, 0x84, 0x3d, 0xd0, 0xd9, 0xb6, + 0x3c, 0x84, 0xdc, 0x71, 0xbd, 0x84, 0x4a, 0x03, 0x3d, 0x54, 0xb8, 0xc6, 0x3c, + 0x0a, 0x84, 0x0e, 0x3d, 0xdc, 0xfe, 0x64, 0xbd, 0xa6, 0xc2, 0x19, 0x3d, 0xd1, + 0x79, 0x4c, 0x3c, 0x7c, 0x16, 0xbd, 0x3c, 0xc1, 0x7d, 0x3c, 0xbc, 0xb2, 0xe7, + 0x94, 0xbc, 0xf0, 0x46, 0x69, 0xbc, 0x2d, 0x5f, 0x68, 0x3c, 0xbc, 0x78, 0x44, + 0xbd, 0xcf, 0x27, 0x97, 0xbd, 0x03, 0xfb, 0x4b, 0xbd, 0x0c, 0xc4, 0xcd, 0xbc, + 0xd7, 0xc5, 0x11, 0xbd, 0x6b, 0xe3, 0xf5, 0xbb, 0xda, 0x4d, 0x75, 0x3d, 0xb0, + 0xf1, 0x39, 0xbd, 0x02, 0x4e, 0x00, 0xbd, 0xcf, 0x22, 0x81, 0x3d, 0x48, 0x54, + 0x10, 0xbd, 
0x93, 0x8c, 0x42, 0x3a, 0x62, 0x1e, 0x18, 0x3d, 0xb5, 0x1d, 0x8d, + 0x3d, 0xbe, 0x37, 0x54, 0xbc, 0x9e, 0xa3, 0x92, 0xbc, 0x6a, 0x91, 0x7b, 0x3d, + 0xc5, 0x13, 0x8c, 0xbb, 0x30, 0x93, 0x55, 0xbd, 0x01, 0x29, 0x2b, 0xbd, 0xd4, + 0x57, 0x3a, 0xbd, 0xaf, 0xbc, 0xed, 0x3c, 0x65, 0xfe, 0x66, 0xbd, 0x2c, 0x98, + 0x11, 0x3d, 0x6e, 0xcf, 0x7c, 0xbd, 0xbe, 0xb4, 0x49, 0x3d, 0x17, 0x7c, 0x4f, + 0xbc, 0x13, 0xfc, 0x28, 0x3d, 0x28, 0xca, 0x2b, 0xbd, 0xdf, 0x3e, 0xa3, 0x3b, + 0x7e, 0xf4, 0x99, 0xbd, 0x9d, 0x89, 0x35, 0xbc, 0x70, 0x4c, 0x8a, 0xbd, 0xf9, + 0x58, 0x3a, 0xbd, 0x6f, 0xa9, 0x4f, 0x3d, 0x30, 0xce, 0x59, 0xbc, 0x52, 0xd4, + 0x41, 0xbd, 0x0d, 0x88, 0x2d, 0xbd, 0x94, 0xe1, 0x30, 0x3d, 0x7a, 0x53, 0xcd, + 0xbb, 0x2d, 0xcc, 0x75, 0x3c, 0x18, 0x30, 0x24, 0x3d, 0xfb, 0xa8, 0x07, 0x3d, + 0xa8, 0x1f, 0x19, 0xbc, 0xdf, 0x0a, 0x1c, 0x3d, 0x76, 0x06, 0x31, 0x3d, 0x6c, + 0x40, 0x82, 0x3c, 0x72, 0xb0, 0x82, 0xbd, 0x10, 0xae, 0x67, 0x3d, 0x00, 0x02, + 0xb5, 0x3a, 0x0a, 0xcd, 0x29, 0x3d, 0x7a, 0xf4, 0x27, 0x3c, 0x9d, 0xe2, 0x75, + 0xbd, 0x1e, 0xcd, 0x09, 0x3c, 0xa7, 0x3e, 0x25, 0xbd, 0x90, 0xb7, 0x8b, 0xbd, + 0xac, 0x2e, 0x6c, 0x3c, 0x22, 0x59, 0x79, 0x3d, 0xaf, 0x3b, 0x02, 0xba, 0x40, + 0xb8, 0x2c, 0x3d, 0xe8, 0x48, 0x6e, 0x3d, 0x13, 0xdb, 0x2f, 0x3b, 0x89, 0x0e, + 0x82, 0x3c, 0xdf, 0xe9, 0xc4, 0xbc, 0xc9, 0x26, 0x19, 0xbc, 0x67, 0x6b, 0x50, + 0x3d, 0xc0, 0x4c, 0x10, 0xbd, 0x30, 0xa9, 0x40, 0x3c, 0x12, 0x2f, 0xb1, 0x3c, + 0x3e, 0x0e, 0x00, 0xbd, 0xe9, 0x1b, 0x6f, 0xbd, 0xe4, 0x4b, 0x81, 0xbd, 0x93, + 0xc1, 0x7f, 0x3d, 0xb7, 0x8d, 0x04, 0xbd, 0x68, 0x33, 0x29, 0xbc, 0xa4, 0x5e, + 0x60, 0x3d, 0x23, 0xc0, 0x0a, 0xbd, 0xf0, 0x22, 0x80, 0xbd, 0x79, 0xea, 0x47, + 0x3d, 0x10, 0x77, 0x87, 0x3d, 0xc1, 0xfb, 0x19, 0xbd, 0x9c, 0xf7, 0x7c, 0x3d, + 0x27, 0x74, 0xb9, 0xbc, 0xc6, 0xea, 0x25, 0x3d, 0x54, 0xbc, 0xa4, 0x3c, 0x88, + 0x18, 0x36, 0x3d, 0x74, 0xd5, 0xd3, 0x3c, 0x68, 0x6e, 0x24, 0x3d, 0x36, 0xb4, + 0x49, 0x3d, 0x3e, 0x98, 0x2c, 0xbd, 0x99, 0x3e, 0x47, 0xbd, 0x21, 0xac, 0x15, + 0x3d, 0xef, 0x4f, 0x26, 0xbd, 0xb4, 0x49, 0x3f, 0xbd, 0xf5, 0xbc, 0x0a, 0xbd, + 0x04, 0x05, 0x6f, 0x3d, 0xf1, 0x5f, 0x15, 0x3d, 0xca, 0x51, 0x3f, 0x3d, 0xc2, + 0x88, 0x3a, 0xbd, 0x40, 0xeb, 0xbf, 0x3c, 0x4c, 0x13, 0xb6, 0x3c, 0xe6, 0x26, + 0xfe, 0x3c, 0xda, 0xab, 0x95, 0xbd, 0xd8, 0xcf, 0x81, 0x3d, 0xa2, 0x19, 0x53, + 0xbd, 0x5d, 0x5e, 0x0d, 0xbd, 0xfe, 0x6b, 0x36, 0x3d, 0xfb, 0x27, 0x4c, 0xbd, + 0x36, 0x92, 0x43, 0xbd, 0x94, 0xee, 0x45, 0xbc, 0x8a, 0x6d, 0xe4, 0x3c, 0xa8, + 0xb1, 0x52, 0xbc, 0x1f, 0x82, 0x88, 0xbb, 0x73, 0x6b, 0x53, 0xbd, 0x56, 0xc3, + 0x6f, 0x3d, 0x78, 0x17, 0x4a, 0x3d, 0xf2, 0x2e, 0x77, 0xbd, 0x2e, 0xae, 0x2a, + 0x3d, 0xa0, 0xd4, 0xa8, 0x3c, 0xe0, 0xb4, 0xd8, 0x3c, 0x24, 0x6d, 0x6a, 0xbd, + 0x16, 0xd2, 0x58, 0xbd, 0x56, 0xf5, 0x5d, 0x3b, 0xae, 0xdb, 0x76, 0xbd, 0x16, + 0x9a, 0x9a, 0xbd, 0x7c, 0x79, 0x51, 0x3d, 0x72, 0x5b, 0xa7, 0xbc, 0xce, 0xbf, + 0x62, 0x3d, 0xab, 0xd8, 0x23, 0x3d, 0x7e, 0xfd, 0x23, 0x3d, 0x0c, 0x3d, 0x6b, + 0x3d, 0x6c, 0x2f, 0x87, 0x3c, 0x1e, 0x26, 0x00, 0xbc, 0xc3, 0x94, 0x6f, 0xbd, + 0xb3, 0x7d, 0x24, 0xbd, 0x2a, 0xfb, 0x71, 0x3d, 0xee, 0x5a, 0xeb, 0xbc, 0x6c, + 0x3e, 0x60, 0xbd, 0x6c, 0x46, 0xf5, 0x3c, 0x83, 0xe3, 0x17, 0x3b, 0xe6, 0x15, + 0x32, 0xbd, 0x45, 0xba, 0x05, 0xbd, 0x18, 0x9a, 0x72, 0x3d, 0x45, 0x9c, 0x83, + 0xbd, 0x08, 0x2b, 0x5e, 0x3d, 0x75, 0xea, 0xe8, 0xbc, 0x81, 0xb6, 0x84, 0x3b, + 0x4b, 0xf4, 0x16, 0xbd, 0x90, 0xf4, 0x16, 0x3d, 0x2b, 0x95, 0x53, 0xbc, 0x53, + 0x27, 0x4b, 0xbd, 0x00, 0x6c, 0xe7, 0x3b, 0x62, 0xbd, 0x83, 0xbd, 0xd8, 0x6f, + 0x87, 0x3c, 0x3c, 0x17, 0x65, 0x3c, 0x3b, 
0x64, 0x7e, 0x3d, 0xbd, 0x05, 0x09, + 0xbd, 0x7f, 0x37, 0x88, 0xbd, 0x63, 0x0e, 0x98, 0xbd, 0x03, 0x67, 0x71, 0x3c, + 0x02, 0x06, 0xe5, 0x39, 0xe4, 0x9f, 0xe7, 0x3b, 0x93, 0x66, 0x93, 0xbd, 0xc6, + 0xcd, 0x7c, 0xbd, 0xde, 0xaf, 0x20, 0x3d, 0xd2, 0x18, 0x54, 0x3c, 0xac, 0xeb, + 0x62, 0xbd, 0x93, 0xf7, 0xa2, 0x3c, 0x4c, 0x4b, 0x00, 0x3d, 0x38, 0x67, 0x3d, + 0xbd, 0x81, 0xcb, 0xa2, 0x3c, 0x9b, 0xd5, 0x90, 0x3c, 0x35, 0x26, 0x0f, 0x3c, + 0xcb, 0x77, 0x45, 0xbd, 0x38, 0xe0, 0x48, 0xbd, 0x96, 0x9e, 0x1d, 0x3b, 0x7c, + 0x3f, 0xaf, 0xbc, 0xef, 0x49, 0xac, 0xbc, 0x07, 0x74, 0xcc, 0x3c, 0xc0, 0x22, + 0x42, 0xbb, 0x5b, 0x72, 0x62, 0x3d, 0xd0, 0x55, 0x95, 0xbd, 0xf7, 0x7d, 0x82, + 0x3d, 0x90, 0x79, 0xd9, 0x3b, 0xd0, 0xa1, 0x96, 0x3c, 0xbf, 0x32, 0x8a, 0x3d, + 0xbd, 0xf0, 0x57, 0x3d, 0x5f, 0xf9, 0x3b, 0x3c, 0x4f, 0xea, 0x86, 0x3d, 0xbb, + 0x72, 0xaa, 0x3c, 0x42, 0x3b, 0x4c, 0x3d, 0x86, 0x1d, 0x86, 0x3c, 0x90, 0xc6, + 0x2a, 0xbd, 0x4f, 0x86, 0x76, 0x3d, 0x92, 0x79, 0x3d, 0x3d, 0x0d, 0x95, 0x92, + 0x3d, 0xbf, 0x77, 0x4e, 0x3d, 0x8b, 0x45, 0x03, 0xbd, 0x95, 0x0c, 0xff, 0xbc, + 0x62, 0x35, 0x11, 0xbb, 0xbd, 0x74, 0x28, 0x3d, 0xaf, 0x87, 0x7f, 0xbd, 0x8e, + 0xb8, 0x06, 0xbd, 0x0f, 0xbd, 0x3e, 0x3d, 0xe6, 0xd4, 0x41, 0xbd, 0x80, 0x81, + 0xac, 0x3c, 0x7a, 0xec, 0x82, 0xbc, 0x01, 0xac, 0x93, 0xbd, 0xe8, 0xba, 0xb3, + 0xbb, 0xcf, 0x47, 0x8f, 0xbb, 0x11, 0x6f, 0x57, 0x3d, 0x74, 0xf5, 0x9d, 0x3c, + 0x67, 0x6e, 0x01, 0xbd, 0xa6, 0x8c, 0x8f, 0xbd, 0xe4, 0x48, 0x30, 0xbd, 0x80, + 0xa7, 0x88, 0xbb, 0x48, 0x69, 0xea, 0x3c, 0x20, 0x78, 0x14, 0x3b, 0x18, 0xc4, + 0xca, 0xbc, 0xd6, 0x83, 0xcb, 0x3c, 0x88, 0x63, 0xd1, 0x3c, 0x02, 0x3a, 0x1b, + 0xbc, 0x02, 0x15, 0x13, 0x3c, 0xbe, 0x71, 0xf0, 0xbb, 0xe1, 0x3c, 0x12, 0xbd, + 0xa6, 0x23, 0x33, 0x3c, 0xc8, 0x04, 0xee, 0x3c, 0x78, 0x7e, 0x4d, 0x3c, 0x7f, + 0xd1, 0x95, 0xbc, 0xa3, 0x48, 0x22, 0x3c, 0x6d, 0x33, 0x77, 0xbd, 0xfc, 0x4f, + 0xc7, 0xbc, 0x8c, 0x5c, 0x8c, 0xbd, 0x98, 0x32, 0x02, 0xbd, 0x5f, 0x37, 0x00, + 0x3d, 0x41, 0xea, 0x7f, 0x3d, 0x4b, 0x38, 0x77, 0xbc, 0x47, 0x90, 0x92, 0xbd, + 0x56, 0x10, 0x1f, 0xbd, 0x10, 0x70, 0x8e, 0xbb, 0x0a, 0x99, 0x7a, 0x3c, 0x46, + 0x4c, 0x7d, 0x3d, 0xc0, 0x71, 0x6d, 0x3d, 0xd8, 0x3f, 0x28, 0x3d, 0x84, 0xe3, + 0x2b, 0x3d, 0x31, 0xdc, 0x55, 0xbd, 0x6e, 0x0a, 0x34, 0x3d, 0x10, 0xff, 0x85, + 0x3c, 0x72, 0x7b, 0x1d, 0xbd, 0x7f, 0xf5, 0xb4, 0xbb, 0xfb, 0xef, 0x87, 0x3d, + 0xb5, 0x8a, 0x4f, 0x3c, 0x20, 0xd7, 0x40, 0xbd, 0x17, 0x2c, 0x38, 0xbd, 0xcb, + 0xd4, 0x6d, 0x3d, 0x3c, 0x24, 0x7a, 0xbd, 0xb3, 0x3d, 0x92, 0xbd, 0x18, 0xbe, + 0x99, 0xba, 0x29, 0xe3, 0x42, 0xbc, 0xf7, 0x2c, 0x8f, 0xbd, 0x34, 0xd9, 0xc7, + 0x3c, 0xac, 0x8c, 0x99, 0xbd, 0x40, 0xe4, 0xa5, 0x3c, 0x8d, 0xcf, 0x3d, 0x3d, + 0x81, 0xe9, 0x3e, 0x3d, 0x7a, 0xbb, 0x3f, 0x3d, 0xc7, 0x9b, 0x25, 0xbc, 0x84, + 0x26, 0xc3, 0xbb, 0x52, 0x3f, 0x7a, 0x3d, 0x7b, 0xdb, 0x69, 0xbd, 0x99, 0x0e, + 0x71, 0xbd, 0x4c, 0xb5, 0xa5, 0x3b, 0xcf, 0x2f, 0xfd, 0xbb, 0x6b, 0x5b, 0x0c, + 0x3b, 0x9e, 0xeb, 0x04, 0xbc, 0x00, 0x9d, 0xdc, 0xbb, 0x10, 0xc2, 0xc0, 0x3c, + 0x08, 0xa2, 0x31, 0xbd, 0xc0, 0x3c, 0xf9, 0x3a, 0xad, 0xd5, 0x55, 0xbd, 0x11, + 0xea, 0xf3, 0x3c, 0x80, 0x63, 0xfa, 0x3a, 0x30, 0x82, 0x48, 0x3b, 0x58, 0x5f, + 0x2c, 0xbd, 0xd4, 0x00, 0x83, 0xbd, 0x12, 0x38, 0x8a, 0xbd, 0xd2, 0xdf, 0x1e, + 0x3c, 0xd0, 0x71, 0x1b, 0x3d, 0x92, 0x5f, 0x56, 0xbd, 0x51, 0x29, 0x94, 0xbd, + 0x40, 0x81, 0x92, 0xbd, 0x04, 0x93, 0x82, 0xbd, 0x8c, 0xf7, 0x84, 0x3d, 0x8a, + 0x96, 0x85, 0xbd, 0x2a, 0x93, 0x3b, 0xba, 0xc7, 0x7c, 0x3b, 0xbd, 0xb0, 0x3d, + 0x50, 0x3d, 0xa0, 0xcb, 0x42, 0x3d, 0xad, 0x3c, 0x16, 0xbc, 0x59, 0xaa, 
0x30, + 0xbd, 0xcd, 0x10, 0x91, 0xbc, 0xe8, 0xea, 0x35, 0xbd, 0x53, 0x63, 0x36, 0xbd, + 0xa9, 0x85, 0x82, 0x3c, 0x23, 0xbd, 0x36, 0xbd, 0x25, 0x81, 0xe9, 0x3c, 0x76, + 0x54, 0x6d, 0x3d, 0xc1, 0x4f, 0x69, 0xbd, 0x55, 0x6c, 0x8f, 0x3d, 0xd5, 0x0a, + 0x7d, 0xbd, 0x48, 0xbe, 0xd2, 0x3c, 0x5b, 0xce, 0x84, 0x3d, 0xaa, 0x8e, 0x46, + 0xbc, 0x9c, 0x93, 0xc9, 0x3c, 0x66, 0xb1, 0x45, 0x3d, 0xf1, 0xc0, 0x90, 0xbc, + 0x2d, 0x09, 0x22, 0x3d, 0xcc, 0x52, 0x20, 0x3d, 0xaa, 0xec, 0x70, 0x3d, 0x3a, + 0xbd, 0xac, 0xbb, 0x70, 0x69, 0x81, 0x3d, 0x43, 0x3f, 0x8b, 0xbc, 0x46, 0x6a, + 0x04, 0xbd, 0xac, 0x25, 0x5a, 0xbd, 0xc2, 0xb9, 0x74, 0xbd, 0x35, 0x78, 0xeb, + 0x3c, 0xe2, 0x31, 0x54, 0xbd, 0xa0, 0xb1, 0xfe, 0x3c, 0xaf, 0xd2, 0xf8, 0x3c, + 0x00, 0x44, 0x82, 0x3a, 0x70, 0xcc, 0x91, 0xbd, 0x82, 0x1f, 0x57, 0xbd, 0xc2, + 0xe4, 0x03, 0x3d, 0xd0, 0xbd, 0x80, 0xbd, 0x7a, 0xde, 0x41, 0xbd, 0xe9, 0xf4, + 0x3b, 0x3c, 0xf9, 0x96, 0x1a, 0xbd, 0xe2, 0x2e, 0x46, 0xbd, 0xae, 0xbd, 0x34, + 0xbd, 0xb4, 0xa2, 0x8c, 0xbc, 0xa8, 0x0e, 0x30, 0xbd, 0x56, 0xf8, 0x33, 0xbd, + 0xce, 0x69, 0x35, 0x3d, 0x52, 0x2f, 0xeb, 0xbc, 0x9f, 0xe0, 0x0f, 0xbd, 0xc9, + 0x34, 0x29, 0xbd, 0x43, 0x26, 0x1e, 0x3d, 0xc8, 0x03, 0x05, 0x3c, 0x0f, 0x46, + 0x97, 0x3c, 0x18, 0x4c, 0x0c, 0xbd, 0xb8, 0xf9, 0x1c, 0xbd, 0xbd, 0x84, 0x86, + 0xbd, 0xbe, 0x50, 0xb1, 0xbc, 0x26, 0x15, 0x57, 0x3c, 0xca, 0x9f, 0x77, 0xbc, + 0xc0, 0xea, 0xca, 0xba, 0x23, 0xde, 0x41, 0xbd, 0x9d, 0xb4, 0x5c, 0xbd, 0x46, + 0x03, 0x30, 0xbd, 0xd0, 0xb3, 0x37, 0x3d, 0xfd, 0xe6, 0x3e, 0x3d, 0x8a, 0x0e, + 0x6a, 0xbd, 0xf8, 0x91, 0x64, 0x3d, 0xb4, 0x0b, 0x76, 0x3d, 0xf2, 0x94, 0x5f, + 0x3d, 0x98, 0xe6, 0x78, 0x3c, 0xc4, 0xab, 0x1e, 0xbd, 0xdd, 0xb6, 0x77, 0xbd, + 0x56, 0x1e, 0x8c, 0x3d, 0x0f, 0xee, 0x15, 0xbd, 0x42, 0xb6, 0x92, 0xbd, 0x2c, + 0xea, 0x96, 0xbc, 0x90, 0xc4, 0x30, 0xbd, 0x2e, 0xdc, 0xc8, 0xbb, 0xe4, 0x79, + 0xb0, 0xbc, 0x2e, 0xe6, 0x08, 0x3d, 0x74, 0x81, 0x34, 0x3d, 0xc0, 0xd5, 0x48, + 0xbc, 0xd3, 0xf2, 0x3c, 0xbd, 0x34, 0x47, 0xef, 0x3c, 0x9a, 0xcb, 0xe5, 0x3c, + 0xe0, 0x94, 0xef, 0xba, 0x80, 0x36, 0x23, 0xbc, 0x08, 0xf9, 0x35, 0xbd, 0x0f, + 0x9d, 0x99, 0xbd, 0x71, 0xdf, 0x2e, 0xbd, 0xb5, 0xa6, 0x78, 0xbd, 0xfa, 0xa8, + 0x69, 0x3d, 0x97, 0xc3, 0xda, 0xbb, 0x37, 0x74, 0xdf, 0x3c, 0x7f, 0xc2, 0x88, + 0xbd, 0x53, 0x20, 0xbe, 0x3b, 0x9c, 0x7a, 0xd9, 0x3c, 0xa9, 0x4b, 0x01, 0xbd, + 0xfb, 0xf7, 0x00, 0xbd, 0xd5, 0xda, 0x41, 0x3d, 0x9d, 0x2a, 0x82, 0x3d, 0x9a, + 0x03, 0x01, 0x3d, 0x38, 0xa7, 0x1b, 0x3d, 0x40, 0x75, 0xef, 0x3c, 0x4a, 0xdc, + 0x1b, 0xbc, 0xd1, 0x1a, 0x41, 0x3d, 0x04, 0xee, 0x74, 0x3d, 0xdb, 0x3f, 0x71, + 0xbd, 0x86, 0xc4, 0x22, 0x3d, 0x99, 0x74, 0x78, 0xbc, 0x48, 0x90, 0x54, 0xbd, + 0x88, 0xae, 0xf9, 0x3c, 0x4f, 0xbe, 0x10, 0x3d, 0x7d, 0x35, 0x68, 0xbd, 0xb3, + 0xf9, 0x3d, 0x3d, 0x1b, 0x89, 0x85, 0xbb, 0x85, 0x05, 0xae, 0x3c, 0xfd, 0x18, + 0x5b, 0xbd, 0x2d, 0xfa, 0x7f, 0xbd, 0x6e, 0xad, 0x8c, 0xbd, 0x67, 0x72, 0x28, + 0x3d, 0x2c, 0x8b, 0x9a, 0x3c, 0xb3, 0x94, 0x57, 0xbd, 0xa4, 0x3e, 0xa8, 0xbc, + 0xa6, 0x6a, 0x06, 0x3d, 0xf8, 0x03, 0x33, 0x3d, 0x56, 0xb0, 0x7a, 0xbd, 0x47, + 0x97, 0x68, 0xbc, 0xd0, 0x17, 0x7a, 0xbd, 0xe8, 0xab, 0x7d, 0xbd, 0xec, 0x67, + 0xf9, 0xbb, 0x3d, 0x92, 0x83, 0xbd, 0x36, 0xa4, 0x00, 0xbd, 0x00, 0x1b, 0x45, + 0x3a, 0x39, 0x13, 0x88, 0xbd, 0x05, 0x63, 0x26, 0x3c, 0x53, 0x7b, 0xc9, 0x3c, + 0x67, 0x97, 0x7a, 0xbb, 0xfe, 0x71, 0xd6, 0xbc, 0x24, 0x84, 0x1e, 0xbd, 0x02, + 0xa3, 0x76, 0x3d, 0xff, 0x16, 0x69, 0x3d, 0x80, 0xf0, 0x21, 0x3d, 0x90, 0x11, + 0x48, 0xbd, 0xc8, 0xa9, 0x3f, 0xbd, 0xc8, 0x06, 0x25, 0xbd, 0xaa, 0xfe, 0x96, + 0xbd, 0xa4, 0xbe, 0x57, 
0xbc, 0x6e, 0x82, 0x1d, 0x3d, 0xd6, 0xfa, 0x66, 0xbb, + 0x9a, 0x25, 0x20, 0x3d, 0xa3, 0x94, 0x27, 0xbb, 0x23, 0x2f, 0xcd, 0x3c, 0x5e, + 0xa4, 0x4e, 0x3d, 0x2a, 0x3b, 0x09, 0xbd, 0x4a, 0x40, 0x6f, 0x3d, 0xfe, 0xd8, + 0xe4, 0x3c, 0xab, 0xce, 0x56, 0xbd, 0x1d, 0x9a, 0x65, 0x3d, 0xb6, 0xf5, 0x76, + 0xbd, 0x88, 0x3d, 0x52, 0x3d, 0x0f, 0x1c, 0x50, 0xbd, 0x1d, 0x0d, 0x6a, 0x3d, + 0x99, 0x66, 0x98, 0xbd, 0x6e, 0xe2, 0xb9, 0x3c, 0x4c, 0x26, 0x82, 0xbd, 0xe2, + 0x3f, 0x65, 0xbd, 0x09, 0xa4, 0x8a, 0x3c, 0x19, 0x7d, 0x7d, 0xbd, 0xe6, 0xf8, + 0x1d, 0xbd, 0xfc, 0xe2, 0xee, 0xbc, 0x1d, 0xab, 0x89, 0x3d, 0x8e, 0xb4, 0xfe, + 0xbc, 0x68, 0x9c, 0x83, 0x3c, 0xf7, 0xa9, 0x0b, 0xbd, 0x3c, 0xed, 0x92, 0x3c, + 0x90, 0x72, 0xa5, 0x3c, 0x02, 0xd9, 0x69, 0xbd, 0xa9, 0x64, 0x2a, 0xbb, 0x6d, + 0x20, 0xf5, 0xbc, 0x0e, 0x44, 0x37, 0xbd, 0xc7, 0xf0, 0xde, 0x3c, 0xb6, 0xdb, + 0x71, 0x3d, 0xea, 0x6b, 0xda, 0xbc, 0xc8, 0x8f, 0x1d, 0xbd, 0xb9, 0x43, 0x05, + 0xbd, 0x6c, 0x4a, 0x78, 0xbc, 0xc0, 0xc3, 0x82, 0x3b, 0x4b, 0x41, 0x49, 0xbd, + 0xc1, 0xfc, 0xcb, 0x3b, 0x93, 0x21, 0x8d, 0xbd, 0xcf, 0x67, 0x7a, 0xbd, 0x58, + 0x9d, 0xdb, 0x3c, 0xd3, 0x71, 0x03, 0x3d, 0xaf, 0x55, 0x84, 0x3d, 0x71, 0x0c, + 0x5d, 0xbd, 0x4c, 0x19, 0x89, 0x3c, 0x7f, 0x29, 0x8b, 0x3d, 0xf6, 0xcd, 0xa9, + 0x3c, 0xaa, 0x00, 0x4c, 0x3d, 0x2b, 0xaa, 0x19, 0xbc, 0x93, 0xde, 0x16, 0xb9, + 0xda, 0xaf, 0x90, 0xbb, 0xf6, 0xde, 0x48, 0x3d, 0x00, 0x08, 0x29, 0x3b, 0xb2, + 0xe0, 0x82, 0xbc, 0x84, 0xf3, 0x40, 0xbc, 0xd4, 0x75, 0x08, 0x3d, 0x88, 0xe7, + 0x64, 0xbd, 0x68, 0xd6, 0x95, 0x3c, 0x1b, 0x70, 0x3f, 0x3d, 0x64, 0xfa, 0xfd, + 0xbc, 0xfc, 0x82, 0x61, 0x3d, 0x8e, 0x6e, 0x11, 0xbd, 0x0a, 0x0a, 0x9f, 0xbc, + 0xb5, 0x1d, 0x68, 0x3c, 0x7d, 0x9f, 0x86, 0x3d, 0xe6, 0x3f, 0x83, 0x3d, 0xf9, + 0xd6, 0xfe, 0x3c, 0x68, 0x0c, 0x61, 0xbd, 0x65, 0x33, 0x27, 0x3d, 0x2c, 0xcf, + 0x68, 0x3d, 0xb0, 0xc0, 0x14, 0xbd, 0xb0, 0xb2, 0x81, 0x3d, 0xc0, 0x9c, 0x89, + 0xbc, 0xae, 0x60, 0x8e, 0xbd, 0x92, 0xdd, 0x91, 0xbd, 0xc9, 0x0b, 0x85, 0x3d, + 0xa4, 0x00, 0xb1, 0xbc, 0x80, 0x9d, 0xf8, 0x3c, 0x1d, 0xc1, 0x98, 0xbd, 0x3e, + 0x88, 0xcd, 0x3c, 0x67, 0xc9, 0x66, 0x3c, 0x00, 0x46, 0x64, 0xba, 0x80, 0x3e, + 0x19, 0xbd, 0x18, 0xe0, 0x20, 0x3c, 0x50, 0xcb, 0xc0, 0x3b, 0xe3, 0xf3, 0x8c, + 0xbc, 0xac, 0x02, 0xd6, 0x3c, 0xca, 0x7a, 0x45, 0x3d, 0x95, 0xab, 0x47, 0xbd, + 0xe6, 0x14, 0x55, 0x3d, 0x88, 0x82, 0x09, 0x3d, 0x1c, 0x74, 0x91, 0x3c, 0xbf, + 0x00, 0x2f, 0x3c, 0x8c, 0xfc, 0x96, 0xbd, 0xcb, 0xa8, 0x9e, 0xbb, 0xb5, 0x6b, + 0x42, 0x3d, 0x0f, 0xed, 0x99, 0xbd, 0x6a, 0x9e, 0x45, 0xba, 0x50, 0xa3, 0x2d, + 0xbc, 0x6a, 0x95, 0x52, 0x3d, 0x18, 0x66, 0xd7, 0xbb, 0x65, 0x63, 0x7c, 0xbd, + 0xfe, 0xa8, 0xe1, 0xbc, 0x48, 0x89, 0x50, 0xbd, 0x64, 0x1d, 0xbe, 0x3c, 0x54, + 0xe9, 0x07, 0x3d, 0x2f, 0x27, 0x2b, 0x3d, 0x55, 0x02, 0x00, 0x3d, 0xb2, 0xbe, + 0x53, 0xbd, 0xd8, 0x03, 0x72, 0xbd, 0xd4, 0x63, 0x69, 0x3d, 0x1c, 0x9b, 0x7c, + 0xbd, 0x87, 0x6b, 0x83, 0xbd, 0xc8, 0x0e, 0x0f, 0xbd, 0xed, 0x88, 0x30, 0xbd, + 0xce, 0x02, 0x31, 0xbd, 0xae, 0xdd, 0x17, 0xbd, 0x03, 0x61, 0x43, 0xbd, 0xcf, + 0xd3, 0x03, 0xbd, 0x56, 0x0b, 0x57, 0xbd, 0x85, 0x33, 0x0d, 0xbd, 0x36, 0x8f, + 0x0b, 0xbd, 0x8e, 0x7d, 0x2c, 0xbc, 0x99, 0x21, 0x40, 0xbd, 0x9b, 0xf2, 0x62, + 0xbb, 0xcc, 0xaf, 0x3f, 0x3d, 0x3f, 0xc0, 0xab, 0x3c, 0xc1, 0x4d, 0x27, 0x3c, + 0x4b, 0x78, 0x30, 0x3d, 0x04, 0x65, 0xfe, 0x3b, 0xbe, 0x78, 0xb0, 0xbc, 0x9a, + 0xb9, 0xe8, 0xbc, 0x58, 0x9c, 0x5d, 0x3d, 0x95, 0x93, 0x65, 0x3d, 0xd9, 0xa8, + 0x41, 0xbd, 0x91, 0xb5, 0x36, 0x3d, 0x48, 0xc5, 0x84, 0xbd, 0xf8, 0x98, 0x3c, + 0x3c, 0x07, 0x2e, 0x96, 0xbd, 0xf2, 0xa1, 0x2b, 0xba, 
0xdc, 0xa1, 0x10, 0xbd, + 0x3a, 0xa4, 0xdb, 0xbc, 0x03, 0x75, 0x63, 0xbd, 0x5f, 0x46, 0x3d, 0x3a, 0x75, + 0x7d, 0x56, 0x3d, 0x68, 0x12, 0xa8, 0xbc, 0x03, 0xf5, 0x98, 0xbd, 0xe0, 0x3c, + 0xe7, 0xbc, 0x90, 0xb6, 0xbb, 0xbb, 0x48, 0x0e, 0x08, 0x3d, 0x68, 0x30, 0x35, + 0x3c, 0xb4, 0x17, 0xcf, 0x3c, 0xf9, 0xd9, 0xf8, 0x3c, 0xc8, 0x7e, 0x09, 0xbc, + 0x84, 0xde, 0x45, 0xbd, 0xfe, 0xad, 0xf7, 0xbc, 0xdb, 0x10, 0x8b, 0xbd, 0x65, + 0xac, 0x40, 0x3d, 0x2f, 0xc7, 0x12, 0x3c, 0x60, 0x81, 0x62, 0x3d, 0x96, 0xbd, + 0xf6, 0x3c, 0xee, 0x7e, 0x80, 0x3d, 0x76, 0x78, 0x25, 0x3d, 0xec, 0x17, 0x1b, + 0xbc, 0x17, 0xa7, 0x2f, 0xbd, 0x5c, 0x17, 0x4e, 0x3d, 0x92, 0x4e, 0x99, 0xbb, + 0xe6, 0xec, 0x1d, 0xbd, 0xcf, 0xd4, 0x15, 0x3d, 0x36, 0x68, 0xcb, 0x3c, 0x05, + 0xd3, 0x68, 0x3c, 0x4d, 0x37, 0x96, 0x3c, 0x85, 0x4b, 0x98, 0x3b, 0x3e, 0xf9, + 0x6a, 0x3d, 0x42, 0xd5, 0x85, 0xbc, 0x35, 0xf1, 0x48, 0xbd, 0xae, 0x5a, 0x69, + 0x3b, 0xfc, 0xc3, 0x81, 0xbd, 0x3d, 0xe3, 0x71, 0xbd, 0xdb, 0x3b, 0x18, 0xbd, + 0x40, 0x90, 0x26, 0xbd, 0x5d, 0xef, 0x80, 0xbc, 0x94, 0x89, 0x9a, 0xbc, 0x96, + 0x7a, 0x33, 0xbd, 0x94, 0x61, 0x71, 0x3d, 0xe6, 0xaf, 0x5a, 0x3d, 0x5f, 0x3d, + 0x6a, 0x3b, 0x22, 0xcf, 0x23, 0xbc, 0xb1, 0x6f, 0x4b, 0xbb, 0x9a, 0x4b, 0xbe, + 0x3c, 0xd7, 0x02, 0x95, 0xbc, 0xb5, 0xfa, 0x4b, 0xbd, 0x8d, 0x7e, 0x85, 0xbc, + 0x12, 0x0b, 0x3c, 0x3d, 0xa5, 0x2c, 0xfc, 0xbb, 0xb0, 0xcc, 0xb2, 0xbb, 0xf2, + 0x03, 0x4a, 0xbd, 0x87, 0xe3, 0x1d, 0xbd, 0xcc, 0xd7, 0xed, 0x3c, 0x16, 0x63, + 0x73, 0xbc, 0x18, 0x4e, 0x47, 0x3d, 0x70, 0x95, 0x37, 0xbd, 0xfb, 0xdd, 0xc4, + 0x3c, 0x3d, 0x65, 0xfb, 0x3c, 0x96, 0xa0, 0x84, 0x3d, 0x60, 0x19, 0xff, 0xbb, + 0xa4, 0xbf, 0x4b, 0x3c, 0x5b, 0x63, 0x03, 0xbd, 0x8d, 0x86, 0xcb, 0xbb, 0x62, + 0xee, 0x76, 0xbd, 0x9c, 0x16, 0x73, 0x3d, 0x4f, 0xd8, 0x81, 0x3d, 0xe2, 0x7d, + 0xba, 0xbc, 0xd6, 0x7a, 0xb4, 0x3b, 0x61, 0x45, 0x87, 0x3d, 0xe1, 0x5e, 0x8a, + 0xbd, 0xfc, 0x1f, 0xc0, 0xbc, 0xc0, 0x87, 0x14, 0xbd, 0x3d, 0x53, 0x16, 0x3d, + 0x86, 0x91, 0x17, 0x3c, 0xa6, 0x1a, 0x71, 0xbc, 0xe7, 0x57, 0xf9, 0xbc, 0x27, + 0x13, 0x87, 0x3d, 0x98, 0x4e, 0x02, 0x3d, 0xe5, 0x9d, 0x13, 0x3d, 0x89, 0xbf, + 0x2e, 0x3c, 0xa0, 0x5f, 0x21, 0x3b, 0x80, 0xc1, 0xf4, 0x3b, 0x14, 0x22, 0x2a, + 0xbc, 0x33, 0xd3, 0x93, 0x3c, 0xd7, 0x3d, 0x6e, 0x3d, 0x2e, 0xcd, 0x81, 0xbd, + 0x71, 0xa3, 0x45, 0xbd, 0xde, 0xd6, 0x4f, 0x3d, 0xb7, 0xe7, 0x41, 0xbd, 0x27, + 0x86, 0xd6, 0x3c, 0x6b, 0x72, 0x85, 0x3d, 0x6d, 0x89, 0x11, 0xbd, 0x21, 0x7b, + 0x1a, 0xbd, 0x18, 0xf1, 0x38, 0xbd, 0xc3, 0xf7, 0xb1, 0x3c, 0xd7, 0xa0, 0x8e, + 0xbd, 0x6e, 0x16, 0x24, 0x3d, 0xc2, 0x2b, 0x2f, 0x3d, 0xc8, 0x1c, 0x82, 0x3c, + 0x53, 0x30, 0x24, 0xbc, 0xd9, 0x49, 0x1f, 0xbd, 0xea, 0x81, 0x3f, 0x3d, 0xc4, + 0xb7, 0x1a, 0x3d, 0xc3, 0x0a, 0x0b, 0xbd, 0x29, 0x5d, 0x88, 0x3d, 0x3f, 0xb6, + 0x9f, 0xbc, 0x97, 0x16, 0x72, 0xbd, 0x67, 0x40, 0xa4, 0xbc, 0x67, 0x64, 0x59, + 0xbc, 0xd0, 0x90, 0xfd, 0xbc, 0x48, 0xa3, 0x1b, 0xbd, 0x5f, 0x6c, 0xf2, 0x3c, + 0xe4, 0x81, 0x97, 0xbd, 0x2b, 0xe9, 0x86, 0x3d, 0x6c, 0xa1, 0x06, 0xbd, 0xa8, + 0x7c, 0x2a, 0x3c, 0x07, 0xca, 0x8d, 0x3b, 0x1f, 0x0c, 0x21, 0xbd, 0xb0, 0x7f, + 0x90, 0xbd, 0xe5, 0x3f, 0x17, 0x3d, 0x03, 0x58, 0x43, 0xbd, 0xe7, 0x24, 0x42, + 0xbd, 0xdd, 0xf2, 0x95, 0xbd, 0x58, 0xd0, 0xd9, 0x3c, 0xa9, 0xbe, 0x00, 0x3d, + 0x40, 0x4c, 0x97, 0xbd, 0x06, 0x0f, 0x63, 0xbd, 0x44, 0x04, 0x42, 0xbd, 0x69, + 0xfa, 0xd6, 0xbb, 0x40, 0x95, 0xca, 0xba, 0xba, 0x29, 0x80, 0xbd, 0x40, 0x04, + 0x8f, 0xbd, 0x9b, 0xd2, 0x71, 0xbd, 0x16, 0x0f, 0x36, 0xbd, 0xcf, 0xe9, 0x77, + 0x3d, 0x00, 0x20, 0xe2, 0xb8, 0x77, 0xed, 0x89, 0xba, 0x27, 0x9d, 0x7d, 0xbd, + 0x8b, 
0x7d, 0xa1, 0x3c, 0xaf, 0x02, 0x41, 0xbd, 0x76, 0x0a, 0x80, 0xbd, 0xc5, + 0xbe, 0x0c, 0x3c, 0x65, 0xbc, 0x53, 0x3c, 0x23, 0x57, 0x71, 0x3d, 0x4c, 0x69, + 0xad, 0x3c, 0xe6, 0x35, 0x70, 0xbd, 0x4a, 0x71, 0x0f, 0x3d, 0x60, 0x74, 0x60, + 0xbd, 0x00, 0x21, 0xff, 0xbc, 0x2e, 0x9e, 0x15, 0xbd, 0x5b, 0xfa, 0xfb, 0xbc, + 0x70, 0x17, 0xe6, 0x3c, 0xb8, 0x5a, 0x03, 0x3d, 0x26, 0x71, 0x82, 0x3d, 0x40, + 0xf1, 0xe2, 0xbb, 0xad, 0xa1, 0x7d, 0xbd, 0xbb, 0x38, 0xb0, 0xbc, 0xa8, 0x2e, + 0x18, 0x3d, 0x29, 0xe4, 0x01, 0xbd, 0x3d, 0xed, 0x75, 0xbc, 0xc1, 0x90, 0x09, + 0x3d, 0x7a, 0x35, 0xf9, 0xbc, 0x0a, 0x1f, 0x8e, 0xbc, 0x7b, 0x9e, 0x05, 0xbc, + 0x00, 0xe1, 0x18, 0x3c, 0x90, 0xf1, 0xc1, 0xbc, 0xbc, 0xfc, 0x87, 0x3d, 0x28, + 0x2a, 0x48, 0x3c, 0xcf, 0x41, 0xf4, 0xbc, 0xa3, 0x20, 0x7a, 0xbd, 0x58, 0x65, + 0x0c, 0x3b, 0x5b, 0x8e, 0xd7, 0xbc, 0x09, 0x03, 0x87, 0x3d, 0xfa, 0xcf, 0xaa, + 0xbc, 0x12, 0x45, 0x83, 0xbd, 0x29, 0x24, 0x89, 0xbd, 0x77, 0x6e, 0x98, 0xbd, + 0x50, 0xf7, 0x91, 0xbb, 0x3e, 0x17, 0x86, 0x3c, 0xcf, 0x82, 0x54, 0x3d, 0x12, + 0x48, 0xff, 0xbb, 0xa8, 0x39, 0xa6, 0x3c, 0x57, 0xfc, 0xb4, 0xbc, 0xc5, 0x25, + 0x30, 0xbd, 0xcd, 0xbc, 0x04, 0xbd, 0x10, 0x87, 0xb4, 0xbc, 0x16, 0x7b, 0x6e, + 0xbd, 0xba, 0x00, 0x5f, 0xbd, 0xf8, 0x14, 0xac, 0x3c, 0xdf, 0x4d, 0x88, 0xbd, + 0x2e, 0xd2, 0xb6, 0xbc, 0x8e, 0x7a, 0x8e, 0xbd, 0xac, 0xdb, 0xe2, 0x3c, 0x7b, + 0x12, 0x8b, 0x3d, 0x03, 0xe2, 0x91, 0xbd, 0x43, 0xac, 0x3c, 0xbc, 0x5a, 0xc7, + 0x52, 0x3d, 0x5e, 0xec, 0x40, 0x3d, 0x1a, 0xb0, 0x1f, 0xbc, 0x1d, 0x9c, 0x92, + 0xbd, 0xd3, 0x03, 0xfd, 0x3c, 0xdd, 0x22, 0x0a, 0xbb, 0xe2, 0x2a, 0x89, 0x3d, + 0x94, 0xb6, 0xd4, 0xbb, 0x74, 0x26, 0xb8, 0xbc, 0xc6, 0x7a, 0x35, 0xbd, 0xa8, + 0xb7, 0x8e, 0xbd, 0xbe, 0x94, 0x36, 0xbd, 0x22, 0xc0, 0x03, 0xbd, 0x40, 0xb4, + 0xe5, 0x3a, 0x53, 0xb5, 0x14, 0xbc, 0xac, 0x00, 0x3a, 0xbc, 0xb3, 0xd9, 0xee, + 0x3c, 0xb5, 0x7c, 0xae, 0xbb, 0xd6, 0xb2, 0x75, 0x3c, 0x2f, 0x0e, 0x1a, 0xbd, + 0xf0, 0xb2, 0x47, 0xbd, 0xad, 0x36, 0x50, 0xbb, 0x19, 0x86, 0x36, 0xbd, 0xb4, + 0x02, 0xe4, 0xbc, 0xe2, 0x37, 0x10, 0x3d, 0x17, 0xcb, 0x86, 0xbd, 0x33, 0x35, + 0x5e, 0x3c, 0x63, 0xfe, 0x8f, 0x3d, 0x8e, 0x91, 0x6c, 0xbd, 0xf8, 0x55, 0x6f, + 0x3c, 0x60, 0xc0, 0xb6, 0x3c, 0x09, 0x23, 0x8d, 0xbd, 0x75, 0xae, 0x89, 0x3d, + 0x4e, 0xb2, 0x76, 0x3d, 0xbc, 0x52, 0x57, 0xbd, 0x5c, 0xf2, 0xde, 0xbc, 0x5a, + 0xc5, 0xc5, 0xbc, 0x01, 0xbf, 0x1a, 0xbd, 0xc4, 0x10, 0x37, 0xbd, 0xe9, 0xe5, + 0x7a, 0x3b, 0xa0, 0x03, 0x58, 0xbd, 0x4f, 0xe4, 0x66, 0x3d, 0xbd, 0xc0, 0xa8, + 0xbc, 0xd0, 0x05, 0xb9, 0x3c, 0xd3, 0xb7, 0xd9, 0x3c, 0xf2, 0x28, 0x2d, 0x3d, + 0x69, 0x78, 0x38, 0xbd, 0x55, 0x58, 0x49, 0xbc, 0xc5, 0x5b, 0xc2, 0x3c, 0x67, + 0x0d, 0x40, 0x3d, 0x02, 0xec, 0x2b, 0x3d, 0x60, 0x6a, 0xac, 0x3c, 0x6a, 0x9c, + 0x65, 0x3d, 0x19, 0x18, 0x4d, 0xbd, 0x05, 0xaf, 0xbd, 0xbc, 0x22, 0x2b, 0x54, + 0xbd, 0x1d, 0x0c, 0xd9, 0xbc, 0x0a, 0xf7, 0xfd, 0x3a, 0x5a, 0x18, 0x23, 0x3d, + 0xeb, 0xfc, 0x84, 0xbd, 0xaf, 0x71, 0x0c, 0xbc, 0x98, 0x72, 0x5e, 0x3c, 0x18, + 0x8b, 0x88, 0x3c, 0xa4, 0x1d, 0x8f, 0xbb, 0x3c, 0x3d, 0xbf, 0xbc, 0x18, 0x7a, + 0xc7, 0x3c, 0x2e, 0x1c, 0x77, 0xbd, 0x50, 0x47, 0x55, 0x3c, 0x5c, 0xa7, 0x23, + 0xbc, 0x0c, 0x4e, 0xda, 0x3c, 0x00, 0x25, 0x7f, 0x3d, 0xdc, 0xbd, 0x85, 0xbd, + 0xee, 0x84, 0x91, 0xbc, 0x0b, 0xcb, 0x81, 0x3d, 0x7a, 0x5f, 0x04, 0xbc, 0xde, + 0x3d, 0x7b, 0xbb, 0x05, 0xa9, 0x79, 0x3d, 0x6c, 0x47, 0x2e, 0xbd, 0x9a, 0x8c, + 0x7c, 0x3d, 0xee, 0xc6, 0x93, 0xbd, 0xaf, 0xd0, 0xd9, 0xbc, 0x33, 0x14, 0x3c, + 0xbd, 0xe3, 0x36, 0x6e, 0x3d, 0x0b, 0x9a, 0x55, 0xbc, 0xe9, 0x83, 0x84, 0x3d, + 0xd6, 0xb4, 0x6c, 0x3d, 0xc4, 0xea, 
0xd4, 0x3c, 0x48, 0xb4, 0x20, 0x3d, 0x6e, + 0xc9, 0x53, 0x3d, 0x4e, 0x95, 0xbb, 0xbc, 0x15, 0x0c, 0x86, 0x3d, 0xdc, 0x7a, + 0x40, 0xbd, 0x98, 0x24, 0x6d, 0xbc, 0x2f, 0xea, 0x8a, 0xbd, 0x78, 0x00, 0xb4, + 0x3c, 0x8f, 0x53, 0x52, 0x3d, 0xc2, 0xfb, 0x11, 0x3d, 0x10, 0x7e, 0x81, 0x3c, + 0xae, 0xf3, 0x3e, 0x3d, 0x34, 0x8d, 0xeb, 0x3c, 0x72, 0x86, 0xd6, 0xbc, 0xd5, + 0x02, 0xad, 0x3b, 0x9d, 0x1c, 0x41, 0xbd, 0xda, 0x6b, 0x23, 0x3d, 0xaf, 0xa0, + 0x2b, 0x3d, 0x91, 0xd9, 0x5c, 0x3d, 0xce, 0x13, 0x4c, 0xbd, 0xa8, 0x7a, 0x4a, + 0x3d, 0xfd, 0xc5, 0x29, 0xbd, 0xff, 0xa6, 0x50, 0xbd, 0x9d, 0x04, 0x43, 0x3d, + 0x49, 0x9f, 0x82, 0xbd, 0xe0, 0x8c, 0x87, 0xbd, 0xb7, 0xb5, 0x64, 0xbd, 0x5e, + 0x55, 0x27, 0x3d, 0x8d, 0xde, 0x41, 0x3d, 0x19, 0x6b, 0x23, 0xbc, 0x6f, 0x71, + 0xf6, 0x3c, 0x04, 0x56, 0x24, 0x3d, 0xb8, 0x20, 0x3a, 0x3c, 0x97, 0xb4, 0x91, + 0xbd, 0x87, 0xf5, 0x6d, 0x3d, 0x80, 0x5b, 0x9d, 0x3c, 0x70, 0x4c, 0xad, 0x3b, + 0xff, 0x49, 0x81, 0x3d, 0x88, 0x14, 0x89, 0xbc, 0x72, 0xde, 0x25, 0xbd, 0x62, + 0xa9, 0x21, 0x3d, 0x94, 0x43, 0x59, 0xbc, 0xb1, 0x5a, 0x92, 0x3d, 0x9d, 0x57, + 0x6b, 0x3c, 0x5d, 0xa8, 0x8d, 0x3d, 0xd7, 0xf7, 0x08, 0x3d, 0x1c, 0x07, 0xe3, + 0xbc, 0xdd, 0xfc, 0xb5, 0xbc, 0xbc, 0xca, 0x84, 0x3d, 0x5c, 0x9e, 0x18, 0xbd, + 0xd5, 0x6d, 0x86, 0x3d, 0x42, 0x2b, 0x58, 0x3c, 0x0a, 0xc6, 0x33, 0x3d, 0x2c, + 0x1e, 0xf6, 0xbc, 0xb8, 0x48, 0x46, 0xbd, 0x26, 0xd6, 0x88, 0xbd, 0xd8, 0x45, + 0x2e, 0x3d, 0x7f, 0x28, 0x4f, 0x3d, 0x52, 0x42, 0x40, 0xbc, 0xad, 0xc8, 0x45, + 0xbd, 0xaa, 0x1c, 0x27, 0xbd, 0x32, 0x83, 0x72, 0xbb, 0xd2, 0xc5, 0x33, 0x3b, + 0x1e, 0x2f, 0x6f, 0x3d, 0x9e, 0x5c, 0x1c, 0x3d, 0x2d, 0xfb, 0xc5, 0xbc, 0x3d, + 0x12, 0x68, 0x3b, 0xb4, 0x98, 0xe9, 0x3c, 0xb9, 0xbd, 0xdf, 0x3a, 0xe0, 0xac, + 0x2c, 0x3d, 0x10, 0x5c, 0x87, 0x3c, 0x80, 0xd6, 0x2d, 0xba, 0x18, 0x73, 0x94, + 0x3c, 0xb8, 0x3c, 0x39, 0xbc, 0x48, 0x64, 0xda, 0x3c, 0x54, 0xdf, 0x05, 0x3d, + 0x04, 0x35, 0xdf, 0x3c, 0xdb, 0xf8, 0xfb, 0xba, 0xc3, 0x2d, 0xc1, 0xb8, 0x0e, + 0x8c, 0xd1, 0x3c, 0x4f, 0x12, 0x14, 0x3d, 0x50, 0xbc, 0x7d, 0xbc, 0xc7, 0x20, + 0x88, 0xbd, 0x79, 0x45, 0x2f, 0xbd, 0x77, 0x83, 0x55, 0xbc, 0x42, 0x7e, 0x95, + 0xbd, 0x9d, 0xfb, 0x4d, 0xbd, 0x92, 0xcc, 0x89, 0xbd, 0x84, 0x1d, 0x03, 0xbd, + 0x1f, 0xe1, 0x86, 0xbb, 0xca, 0xee, 0x4e, 0x3c, 0x15, 0x39, 0x55, 0xbd, 0x94, + 0x4b, 0x87, 0xbd, 0xf3, 0xf0, 0x0d, 0xbd, 0x4d, 0x17, 0x7b, 0x3d, 0xe5, 0x0b, + 0x95, 0xbc, 0x10, 0x50, 0x20, 0xbd, 0x60, 0x74, 0x7c, 0xbd, 0x50, 0x76, 0xad, + 0xbc, 0xdd, 0x59, 0x89, 0x3c, 0xa1, 0xcc, 0x10, 0x3d, 0x23, 0x4c, 0x37, 0x3c, + 0x50, 0x0e, 0xa6, 0x3c, 0x02, 0x0e, 0x24, 0xbd, 0x9d, 0x9f, 0x40, 0xbd, 0xba, + 0xe1, 0x51, 0xbd, 0x9e, 0xe5, 0x2a, 0xbd, 0x44, 0x07, 0xc8, 0x3c, 0xc0, 0x11, + 0x85, 0x3c, 0x1c, 0xde, 0x40, 0xbd, 0x34, 0xd3, 0xe3, 0x3c, 0xf1, 0xae, 0xdb, + 0xbc, 0xea, 0xbb, 0xf0, 0xbc, 0x32, 0x81, 0xb7, 0x3c, 0x1b, 0xe9, 0x4f, 0xbd, + 0x47, 0xd3, 0xb7, 0xbc, 0xc4, 0x4b, 0xe7, 0xbc, 0xf3, 0x52, 0x3b, 0x3d, 0x10, + 0xb8, 0xb6, 0x3b, 0x0b, 0xb8, 0x33, 0xbc, 0xb1, 0xba, 0x29, 0x3d, 0x93, 0xfc, + 0x00, 0xbd, 0xdf, 0x63, 0x30, 0xbd, 0xac, 0x1d, 0x1e, 0x3d, 0x52, 0xf7, 0x15, + 0xbd, 0x7f, 0xea, 0x53, 0xbd, 0x29, 0xe4, 0x2f, 0xbc, 0x5e, 0xf0, 0xb7, 0x3c, + 0xb1, 0xff, 0x09, 0xbd, 0xc9, 0x0f, 0xae, 0x3c, 0x5a, 0xc0, 0x06, 0xbd, 0x34, + 0x15, 0x10, 0xbd, 0x76, 0xea, 0x95, 0xbc, 0x60, 0xd8, 0x2d, 0x3c, 0x4c, 0x12, + 0x77, 0xbc, 0x2d, 0xb6, 0x88, 0x3d, 0x7f, 0x15, 0xe4, 0x3c, 0xb0, 0xef, 0xf0, + 0xbc, 0x79, 0x32, 0x1c, 0xbd, 0x4d, 0xbc, 0x4b, 0xbd, 0xae, 0x6d, 0x64, 0x3d, + 0x0c, 0x44, 0x82, 0xbc, 0x15, 0x4f, 0x3e, 0xbd, 0x86, 0x54, 0xab, 
0xbc, 0x78, + 0xea, 0x0d, 0xbd, 0x73, 0xc6, 0x87, 0xbd, 0x06, 0xed, 0x32, 0xbd, 0xfd, 0x03, + 0x8a, 0xbd, 0x89, 0x8b, 0x30, 0xbd, 0x40, 0x73, 0x0d, 0xbd, 0xcf, 0x80, 0x84, + 0xbd, 0x3c, 0x00, 0x69, 0xbd, 0xeb, 0x8a, 0xf8, 0x3b, 0xc1, 0xa4, 0x93, 0xbd, + 0x25, 0x74, 0x69, 0xbd, 0x11, 0xe5, 0x00, 0x3d, 0x2d, 0xa0, 0x01, 0x3d, 0xf9, + 0x7d, 0x02, 0xbc, 0x55, 0x26, 0x30, 0x3d, 0xad, 0xf7, 0x50, 0x3c, 0xd6, 0xb1, + 0x68, 0x3d, 0xce, 0x49, 0x71, 0xbd, 0xcf, 0xde, 0xaa, 0x3b, 0x5d, 0x6e, 0x91, + 0xbd, 0xb4, 0xf1, 0x1a, 0xbd, 0xc7, 0xeb, 0xc2, 0x3c, 0x50, 0x74, 0xd4, 0xbb, + 0xe8, 0x25, 0x1f, 0x3d, 0xdb, 0x0a, 0x8e, 0xbc, 0x9d, 0x5d, 0x73, 0xbd, 0x70, + 0xce, 0x01, 0xbc, 0xc4, 0x22, 0x84, 0x3d, 0x80, 0x3b, 0x1d, 0x3c, 0x3d, 0xfa, + 0x15, 0xbd, 0x45, 0xd7, 0x9a, 0xbd, 0x4d, 0xa2, 0x4e, 0xbd, 0x41, 0x6e, 0x96, + 0xbc, 0xbf, 0xe4, 0x6c, 0x3d, 0x90, 0x3c, 0x21, 0x3d, 0x99, 0x76, 0x83, 0x3c, + 0xe1, 0xb9, 0x6f, 0x3d, 0x24, 0xb9, 0xcf, 0xbc, 0xc0, 0x33, 0xee, 0xbb, 0x8d, + 0xa6, 0xf0, 0xbc, 0x40, 0x81, 0x3f, 0x3d, 0x43, 0x82, 0x7e, 0x3c, 0xfa, 0x13, + 0x7a, 0x3d, 0x91, 0xcd, 0x0a, 0xbc, 0x80, 0x3e, 0x61, 0x3d, 0x65, 0xef, 0x56, + 0xbd, 0x44, 0x57, 0x90, 0xbd, 0xb4, 0x86, 0x7a, 0x3c, 0x70, 0xf5, 0xbd, 0x3c, + 0x90, 0x5c, 0xdc, 0x3c, 0x13, 0xe5, 0xeb, 0xbc, 0x30, 0x7a, 0x48, 0x3d, 0xfa, + 0x4c, 0xbe, 0x3c, 0x4d, 0x35, 0x2e, 0xbd, 0x32, 0x33, 0xdb, 0xbc, 0xab, 0x4c, + 0x0a, 0xbd, 0x12, 0x58, 0xad, 0xbc, 0x20, 0x07, 0x0c, 0x3c, 0xbc, 0xb5, 0xa6, + 0x3c, 0xb6, 0x70, 0x8f, 0xbd, 0xbc, 0x9a, 0x57, 0x3d, 0xb3, 0x6f, 0x82, 0xbd, + 0x52, 0xb9, 0x5c, 0x3c, 0x0d, 0x71, 0xd9, 0x3c, 0x18, 0x70, 0x0a, 0x3d, 0x80, + 0x7b, 0x0a, 0x3b, 0xee, 0x75, 0x27, 0xbc, 0x63, 0x74, 0x56, 0xbd, 0xf0, 0x20, + 0x5f, 0x3b, 0xfb, 0x77, 0x1e, 0xba, 0xb8, 0x6c, 0xee, 0x3c, 0x01, 0xd0, 0xef, + 0x3c, 0xb2, 0x68, 0x12, 0xbd, 0x51, 0xf6, 0x3c, 0xbd, 0x12, 0xb0, 0x2e, 0xbd, + 0x11, 0xfd, 0x5e, 0xbd, 0x48, 0xea, 0xb4, 0xbc, 0xce, 0xca, 0x88, 0x3d, 0x38, + 0x57, 0x40, 0x3d, 0x11, 0xfa, 0x8b, 0x3d, 0xc0, 0x34, 0x36, 0x3d, 0xe4, 0x82, + 0x8e, 0xbd, 0xbd, 0x95, 0x59, 0xbd, 0xf0, 0x8b, 0x43, 0xbd, 0x93, 0x9b, 0x0a, + 0xbc, 0xb7, 0x99, 0x4d, 0x3c, 0x46, 0x42, 0x1d, 0x3d, 0x00, 0x19, 0x3a, 0xbd, + 0x1c, 0xd3, 0x5a, 0xbd, 0xff, 0x09, 0x02, 0xbd, 0xa1, 0x01, 0x8e, 0x3d, 0xc3, + 0x9e, 0xd8, 0xbb, 0x28, 0xb5, 0x2d, 0x3d, 0x56, 0x9c, 0x16, 0x3d, 0x78, 0xe6, + 0x1e, 0xbc, 0x06, 0x56, 0x14, 0x3d, 0xbc, 0x3f, 0x88, 0xbd, 0x34, 0x45, 0x94, + 0xbc, 0xfb, 0xb1, 0x0a, 0xbd, 0x67, 0x87, 0x90, 0xbd, 0x4d, 0x75, 0x27, 0xbd, + 0x9f, 0xc8, 0x60, 0x3b, 0x02, 0xc4, 0xb0, 0xbc, 0x54, 0x5b, 0x5f, 0xbd, 0xe3, + 0x43, 0xff, 0xbc, 0xf6, 0xf7, 0x39, 0xbc, 0x99, 0x4c, 0x82, 0xbd, 0xda, 0x99, + 0xa9, 0x3b, 0x6a, 0xd5, 0xee, 0xbc, 0x1e, 0xc1, 0x93, 0xbd, 0xc2, 0x21, 0x52, + 0xbc, 0x52, 0xfc, 0x06, 0xbc, 0x70, 0x59, 0x85, 0xbd, 0x5d, 0xbd, 0x8a, 0xbd, + 0xe2, 0x10, 0x77, 0x3d, 0x36, 0x83, 0x90, 0xbd, 0x66, 0x9f, 0x90, 0xbc, 0x30, + 0x78, 0x4c, 0x3d, 0xd4, 0x2c, 0x8b, 0x3c, 0xe0, 0x8b, 0x4e, 0xbc, 0x31, 0x0f, + 0x80, 0xbd, 0x4a, 0xb7, 0x5b, 0xbd, 0x52, 0xd0, 0x1a, 0xbd, 0x5c, 0x20, 0xe3, + 0x3c, 0x5a, 0x77, 0x29, 0xbd, 0x90, 0x0b, 0x00, 0xbd, 0x62, 0x10, 0x4c, 0x3d, + 0x40, 0x52, 0x58, 0x3c, 0x18, 0x5e, 0x46, 0x3c, 0xc6, 0x6b, 0x37, 0x3d, 0x17, + 0x5c, 0x90, 0x3d, 0x28, 0x6c, 0xfd, 0xbc, 0x7e, 0x4b, 0x28, 0xbd, 0x86, 0x7b, + 0x1d, 0xbd, 0x2b, 0x78, 0x83, 0x3d, 0x48, 0x65, 0x53, 0x3d, 0x91, 0x41, 0x7b, + 0xbd, 0x0a, 0x32, 0x65, 0xbd, 0x80, 0xb5, 0x83, 0xbd, 0x93, 0x10, 0x8b, 0x3d, + 0x40, 0xc2, 0x9b, 0x3a, 0xe8, 0xe9, 0xcc, 0x3c, 0xb8, 0xf5, 0x00, 0x3d, 0x2a, + 0x60, 0x70, 0x3d, 
0xbb, 0xa9, 0x18, 0xbd, 0xbf, 0xca, 0x76, 0xbd, 0xf4, 0x83, + 0xda, 0xbc, 0xcc, 0x89, 0xeb, 0x3c, 0xa0, 0x01, 0x27, 0xbb, 0x90, 0x98, 0x1e, + 0x3d, 0x2d, 0x7a, 0x91, 0xbd, 0x00, 0x8e, 0x71, 0xbd, 0xc7, 0x30, 0x1a, 0xbd, + 0x22, 0xe9, 0x3d, 0x3d, 0x1a, 0xb3, 0x46, 0x3d, 0xbe, 0x20, 0x5a, 0x3d, 0x02, + 0x34, 0x0b, 0xbd, 0x8d, 0x91, 0x5c, 0xbd, 0x84, 0xeb, 0xdc, 0xbc, 0xaa, 0x4b, + 0xd6, 0xbc, 0xab, 0xd1, 0x91, 0x3d, 0xb8, 0x2c, 0x95, 0x3c, 0x0c, 0xf7, 0x59, + 0x3d, 0xc9, 0xea, 0x8e, 0xbd, 0x23, 0xb1, 0x83, 0xbd, 0x27, 0x20, 0x85, 0xbd, + 0x40, 0xdb, 0xaa, 0x3a, 0x4c, 0x7b, 0x48, 0xbc, 0x00, 0x62, 0x9d, 0x3b, 0xaf, + 0xeb, 0x83, 0x3d, 0xe0, 0x4e, 0x1d, 0x3b, 0x90, 0xf9, 0xdc, 0xbc, 0xd6, 0x49, + 0x60, 0x3d, 0x4e, 0x96, 0x66, 0x3d, 0xbe, 0x9e, 0x9b, 0xbc, 0xec, 0x9e, 0xff, + 0x3c, 0xd0, 0xa1, 0x0b, 0x3d, 0xb4, 0x2d, 0x39, 0x3d, 0x28, 0x62, 0x9a, 0x3c, + 0xce, 0xdc, 0x67, 0x3d, 0xe8, 0xb6, 0x68, 0x3c, 0xb6, 0x37, 0x87, 0xbd, 0xee, + 0xd3, 0x67, 0x3d, 0x18, 0xfb, 0x31, 0x3c, 0x27, 0x89, 0x26, 0xbd, 0x30, 0x9e, + 0xc0, 0x3c, 0xd0, 0x5b, 0x30, 0xbd, 0x90, 0x96, 0x33, 0x3c, 0x1e, 0xf8, 0x20, + 0xbd, 0x48, 0xa2, 0xa2, 0x3c, 0x2e, 0x6b, 0x3f, 0xbd, 0x32, 0x37, 0x1e, 0x3d, + 0x10, 0x9e, 0x26, 0xbd, 0x1c, 0xd5, 0x60, 0xbd, 0xf5, 0x5f, 0x06, 0xbd, 0x87, + 0xff, 0x71, 0xbd, 0x1d, 0xba, 0x8c, 0xbd, 0x00, 0xe0, 0x8c, 0xba, 0x20, 0x94, + 0x0d, 0xbc, 0x5a, 0x15, 0x84, 0xbc, 0x36, 0x58, 0x50, 0x3d, 0x7a, 0x21, 0x5c, + 0x3d, 0x78, 0x57, 0x39, 0xbd, 0x8d, 0x3b, 0x59, 0xbd, 0x90, 0x90, 0x80, 0xbb, + 0xf0, 0x93, 0xbe, 0x3b, 0x50, 0x34, 0xe1, 0xbb, 0xc0, 0xac, 0xd3, 0xba, 0x42, + 0x75, 0xb4, 0xbc, 0x38, 0xaa, 0x30, 0xbd, 0xa6, 0x79, 0x49, 0x3d, 0xfc, 0xd2, + 0x37, 0xbc, 0xe0, 0x0d, 0xd6, 0xbb, 0xc1, 0x2d, 0x73, 0xbd, 0x4a, 0xf1, 0x5b, + 0xbd, 0xd4, 0x0c, 0x82, 0x3c, 0xce, 0x51, 0x0c, 0xbd, 0xe0, 0x9c, 0x4e, 0xbd, + 0x3e, 0x98, 0x6a, 0x3d, 0x7e, 0xbf, 0x27, 0x3d, 0x00, 0xb2, 0x6f, 0xbd, 0x0c, + 0xcd, 0x4d, 0x3d, 0xfa, 0x7b, 0x22, 0x3d, 0x18, 0x3f, 0x02, 0xbc, 0xa4, 0x1a, + 0xb7, 0xbc, 0xe2, 0xf5, 0x45, 0x3d, 0xf0, 0x66, 0xe6, 0xbb, 0xd2, 0x56, 0x54, + 0x3d, 0x72, 0xff, 0x64, 0x3d, 0x68, 0xbf, 0x41, 0x3d, 0x8c, 0xa8, 0x39, 0xbd, + 0x4b, 0x80, 0x88, 0x3d, 0x40, 0x05, 0x8f, 0x3c, 0x9a, 0x58, 0x6b, 0xbd, 0xb6, + 0xc7, 0x58, 0xbd, 0x66, 0x73, 0x12, 0x3d, 0x9c, 0x2b, 0x50, 0xbd, 0xc8, 0x47, + 0x7d, 0xbc, 0xb7, 0x6a, 0x04, 0xbd, 0xe6, 0x6a, 0x23, 0x3d, 0xdb, 0x11, 0x1f, + 0xbd, 0x60, 0x1d, 0x5e, 0xbc, 0x80, 0x70, 0x72, 0xbd, 0x08, 0xed, 0x51, 0x3c, + 0xb8, 0x35, 0x0c, 0xbc, 0x2e, 0xef, 0x47, 0x3d, 0xd0, 0xfb, 0xdf, 0x3b, 0xee, + 0xea, 0x5c, 0x3d, 0x52, 0xa6, 0x7f, 0x3d, 0x1c, 0xd4, 0x92, 0x3c, 0x0c, 0xe1, + 0xe3, 0x3c, 0x0b, 0x0e, 0x8b, 0x3d, 0x1e, 0x6f, 0x20, 0x3d, 0xee, 0xf3, 0x45, + 0xbd, 0x28, 0xef, 0xfc, 0x3c, 0x48, 0x19, 0x8c, 0xbd, 0x02, 0x87, 0x7f, 0xbd, + 0x6c, 0xc1, 0x4b, 0x3d, 0x30, 0x88, 0x72, 0xbc, 0x00, 0xb2, 0xce, 0x39, 0x68, + 0x2f, 0xf1, 0xbc, 0x00, 0xa0, 0x3b, 0xb8, 0x0c, 0x90, 0x7b, 0xbd, 0xd0, 0x97, + 0x45, 0xbd, 0xf6, 0xf5, 0x5d, 0x3d, 0x50, 0x0b, 0x0e, 0x3c, 0x48, 0x51, 0xf9, + 0x3c, 0xb7, 0xe4, 0x4d, 0xbd, 0xca, 0x8d, 0xcf, 0xbc, 0x49, 0x0d, 0x88, 0xbd, + 0xb1, 0x3c, 0x8f, 0x3d, 0xef, 0x72, 0x8a, 0x3d, 0x90, 0x23, 0x02, 0x3d, 0xe8, + 0x60, 0x05, 0x3c, 0xc0, 0x9f, 0xb6, 0xba, 0xd5, 0x57, 0x03, 0xbd, 0x22, 0xae, + 0x66, 0x3d, 0x61, 0x03, 0x8b, 0xbd, 0xcc, 0x23, 0xea, 0xbc, 0x80, 0x58, 0x4f, + 0x3c, 0x60, 0xea, 0xd0, 0x3b, 0xae, 0x19, 0x2e, 0xbd, 0x5e, 0xee, 0xb5, 0xbc, + 0x50, 0x19, 0x18, 0x3c, 0x6d, 0xd7, 0x78, 0xbd, 0x40, 0xcb, 0xe9, 0xbc, 0xea, + 0x76, 0x53, 0xbd, 0x2c, 0x0e, 0x6b, 0xbc, 0xd8, 
0xd6, 0x6a, 0x3c, 0xe0, 0x3d, + 0x80, 0xbd, 0x80, 0x36, 0xf1, 0xba, 0x30, 0x30, 0x51, 0x3c, 0x40, 0x41, 0xa3, + 0xba, 0xc8, 0xe8, 0x80, 0xbd, 0x72, 0x33, 0x67, 0x3d, 0xdd, 0x7d, 0x0c, 0xbd, + 0x1c, 0xcf, 0xbe, 0x3c, 0x8c, 0x1d, 0x8f, 0xbd, 0x4c, 0x5a, 0x3a, 0x3d, 0xa0, + 0x35, 0xff, 0x3b, 0x50, 0xb8, 0xea, 0xbb, 0x58, 0x63, 0x26, 0xbc, 0x70, 0x33, + 0x0c, 0xbc, 0x58, 0xbb, 0x09, 0xbc, 0x1a, 0xd0, 0xf6, 0xbc, 0x02, 0xb0, 0x08, + 0x3d, 0x4c, 0x72, 0xa7, 0x3c, 0x10, 0xa0, 0xa7, 0x3b, 0x7c, 0xab, 0x3f, 0x3d, + 0x12, 0x95, 0xc6, 0xbc, 0x58, 0xe5, 0xac, 0xbc, 0x80, 0xbc, 0x56, 0x3b, 0x00, + 0xd2, 0xda, 0xbb, 0x26, 0xff, 0xaa, 0xbc, 0xf2, 0xdc, 0x71, 0x3d, 0x30, 0xaf, + 0x85, 0xbb, 0x88, 0xf9, 0x14, 0x3d, 0x50, 0x89, 0xc5, 0xbb, 0xc0, 0xd0, 0xf1, + 0x3b, 0x95, 0xf2, 0x7b, 0xbd, 0x66, 0x43, 0xfa, 0xbc, 0xa0, 0x68, 0xf3, 0xbb, + 0x60, 0xa0, 0xdc, 0x3c, 0x0e, 0x67, 0x6e, 0x3d, 0xdd, 0xec, 0x8a, 0xbd, 0xca, + 0x1e, 0x8f, 0xbd, 0x64, 0x84, 0x6c, 0xbd, 0xee, 0x7b, 0x7a, 0xbd, 0xd2, 0xdc, + 0x97, 0xbc, 0x84, 0x44, 0x77, 0xbd, 0xf8, 0xec, 0x0e, 0xbd, 0xea, 0x25, 0x03, + 0x3d, 0x8e, 0x42, 0x27, 0xbd, 0x31, 0x0b, 0x87, 0x3d, 0xba, 0x5e, 0x31, 0xbd, + 0x74, 0xee, 0xa5, 0x3c, 0xb5, 0xa1, 0x83, 0x3d, 0x48, 0x87, 0xad, 0x3c, 0x5c, + 0xc4, 0x04, 0xbd, 0xe6, 0xe7, 0x4e, 0x3d, 0x24, 0xa4, 0xb2, 0xbc, 0x02, 0x4a, + 0x8d, 0xbd, 0xfa, 0x96, 0x92, 0xbd, 0xf8, 0x1e, 0xaf, 0x3c, 0x80, 0xdb, 0xfe, + 0x3a, 0x20, 0x48, 0xff, 0xbb, 0xf2, 0xdd, 0x63, 0x3d, 0x2c, 0x12, 0xaf, 0x3c, + 0x8a, 0x05, 0xcf, 0xbc, 0xd8, 0x3a, 0x23, 0x3d, 0x2b, 0x32, 0x89, 0xbd, 0xd0, + 0xff, 0x8b, 0x3b, 0x58, 0xd1, 0x13, 0xbd, 0x00, 0xac, 0x96, 0x3a, 0x8a, 0x92, + 0x33, 0x3d, 0x1c, 0xdb, 0x2f, 0xbc, 0x8a, 0x30, 0x69, 0xbd, 0x80, 0xcc, 0x7a, + 0x3b, 0x88, 0xaa, 0x7b, 0xbd, 0x03, 0xda, 0x8e, 0xbd, 0x10, 0x40, 0xfe, 0x3b, + 0x74, 0x92, 0x0b, 0x3d, 0x54, 0x61, 0x7e, 0xbd, 0xdd, 0x2f, 0x75, 0xbd, 0xa8, + 0xcd, 0x52, 0x3c, 0x20, 0xf1, 0x57, 0x3d, 0x98, 0x18, 0x05, 0xbc, 0x86, 0x14, + 0x3a, 0x3d, 0xf0, 0xa5, 0x94, 0x3b, 0x13, 0xd7, 0x8b, 0x3d, 0xbe, 0x38, 0x1e, + 0x3d, 0xe6, 0xa2, 0x8d, 0xbc, 0xc0, 0x39, 0xdf, 0x3c, 0xf8, 0x3f, 0x8b, 0xbd, + 0xc9, 0x86, 0x8a, 0x3d, 0x51, 0xa4, 0x6d, 0xbd, 0x7b, 0xe0, 0x82, 0x3d, 0x50, + 0x6e, 0x6d, 0x3c, 0xd0, 0x15, 0x60, 0xbd, 0x46, 0xec, 0x06, 0xbd, 0x50, 0x8b, + 0x0f, 0x3d, 0x8e, 0x36, 0xab, 0xbc, 0x7f, 0x46, 0x74, 0xbd, 0x4e, 0x2b, 0x63, + 0xbd, 0x6e, 0xdf, 0x2c, 0x3d, 0xee, 0x87, 0x60, 0x3d, 0x4e, 0x24, 0x6e, 0xbd, + 0x06, 0xbf, 0x7d, 0x3d, 0x40, 0xf6, 0x25, 0x3c, 0xba, 0xea, 0x01, 0x3d, 0x29, + 0x4f, 0x8c, 0xbd, 0xf3, 0x02, 0x8b, 0xbd, 0x7c, 0x06, 0x30, 0xbd, 0xda, 0x97, + 0x1e, 0x3d, 0xad, 0x89, 0x8b, 0xbd, 0x90, 0x78, 0xd1, 0x3b, 0x2c, 0x75, 0xb5, + 0x3c, 0x41, 0x04, 0x40, 0xbd, 0x52, 0x9d, 0x08, 0x3d, 0xf4, 0x53, 0xbf, 0x3c, + 0x48, 0x82, 0x16, 0x3c, 0x3a, 0xa1, 0x72, 0x3d, 0xc8, 0x73, 0x32, 0x3d, 0x5a, + 0x20, 0x20, 0x3d, 0x08, 0xb1, 0x48, 0x3d, 0x46, 0x6e, 0x73, 0x3d, 0x59, 0x17, + 0x0f, 0xbd, 0xb8, 0xa7, 0x01, 0x3c, 0x10, 0x53, 0x46, 0x3c, 0x27, 0xc2, 0x3f, + 0xbd, 0x77, 0x6b, 0x91, 0x3d, 0xa8, 0x1c, 0xec, 0x3c, 0xfd, 0x09, 0x92, 0xbd, + 0x1c, 0x87, 0x89, 0xbd, 0x60, 0x10, 0xdc, 0xbb, 0x00, 0x40, 0xd1, 0x36, 0x48, + 0xb3, 0x28, 0x3c, 0xc8, 0xb3, 0x94, 0x3c, 0xfa, 0x6c, 0x8e, 0xbc, 0x98, 0x5b, + 0x68, 0xbc, 0x32, 0xc1, 0x3b, 0x3d, 0xb7, 0xd5, 0x81, 0x3d, 0x48, 0xb6, 0x10, + 0x3d, 0x5c, 0x95, 0x58, 0xbd, 0xf6, 0xb9, 0x00, 0xbd, 0xaa, 0xbe, 0x51, 0xbd, + 0x2e, 0xbc, 0x70, 0x3d, 0xc8, 0x89, 0x06, 0x3c, 0x00, 0x00, 0x41, 0xb9, 0x31, + 0x3e, 0x10, 0xbd, 0xf0, 0x26, 0x14, 0xbc, 0x98, 0xfc, 0xf2, 0x3c, 0xf3, 0x6d, + 
0x27, 0xbd, 0xd0, 0xdd, 0x2e, 0xbc, 0xee, 0x5b, 0x92, 0xbd, 0xc6, 0x4c, 0x24, + 0x3d, 0x3c, 0x5e, 0x01, 0x3d, 0x6a, 0xe6, 0x26, 0xbd, 0x90, 0xd6, 0x1f, 0x3c, + 0xbc, 0x88, 0xcd, 0x3c, 0xb0, 0xad, 0xee, 0x3c, 0xd4, 0xc5, 0xdf, 0x3c, 0xa6, + 0x0f, 0xe7, 0xbc, 0x51, 0x99, 0x84, 0x3d, 0xc4, 0x84, 0x6a, 0xbc, 0xa8, 0xb6, + 0x5c, 0xbc, 0x00, 0xba, 0x3a, 0x39, 0x28, 0x4f, 0x59, 0x3d, 0x80, 0x55, 0x45, + 0xba, 0x48, 0x20, 0x84, 0xbc, 0x3f, 0xfd, 0x90, 0x3d, 0x74, 0x17, 0x82, 0xbd, + 0x93, 0xd5, 0x26, 0xbd, 0xc0, 0x02, 0xbf, 0xbc, 0x42, 0xdf, 0x24, 0x3d, 0x0e, + 0xac, 0xd5, 0xbc, 0x42, 0xcc, 0x7a, 0xbd, 0xd0, 0x21, 0xf6, 0x3b, 0x88, 0x2e, + 0x63, 0xbd, 0x08, 0xdd, 0xc4, 0xbc, 0x08, 0xa7, 0x6b, 0x3c, 0x17, 0x07, 0x83, + 0xbd, 0x31, 0xfd, 0x81, 0x3d, 0x68, 0xb0, 0x3f, 0x3c, 0xec, 0x78, 0xc0, 0xbc, + 0x40, 0x91, 0x3b, 0x3c, 0x80, 0x96, 0xbf, 0x3a, 0x94, 0xed, 0xa7, 0x3c, 0xb0, + 0xf7, 0x2a, 0x3c, 0x00, 0x90, 0xc6, 0x37, 0xb4, 0x0d, 0x89, 0xbd, 0xd0, 0x28, + 0xb0, 0xbb, 0xf0, 0x65, 0x06, 0x3c, 0xcd, 0xc8, 0x8d, 0x3d, 0x66, 0xa5, 0x6f, + 0x3d, 0x36, 0x46, 0x4c, 0x3d, 0x00, 0x80, 0x67, 0x36, 0xaf, 0x78, 0x20, 0xbd, + 0xce, 0x83, 0x08, 0x3d, 0x7f, 0x32, 0x84, 0xbd, 0x23, 0x80, 0x8e, 0x3d, 0xb4, + 0xa5, 0x56, 0x3d, 0xe4, 0xc2, 0x10, 0xbd, 0xc0, 0xf4, 0xe9, 0xba, 0xa6, 0x4e, + 0x6d, 0x3d, 0x04, 0x19, 0xad, 0xbc, 0x0c, 0xf2, 0x38, 0x3d, 0xc6, 0x2c, 0x29, + 0xbd, 0xba, 0x51, 0x5c, 0x3d, 0x20, 0x92, 0xae, 0x3c, 0x68, 0x55, 0xf7, 0x3c, + 0x40, 0x10, 0x08, 0x3d, 0x86, 0x95, 0x62, 0x3d, 0x36, 0xef, 0x80, 0xbd, 0xd8, + 0x21, 0x37, 0xbd, 0x28, 0x37, 0x93, 0xbc, 0x20, 0xb5, 0x35, 0x3b, 0x2f, 0x41, + 0x86, 0xbd, 0xf0, 0xf4, 0xfd, 0xbc, 0x3e, 0xa1, 0x8a, 0xbd, 0x38, 0xf3, 0x8f, + 0xbd, 0x15, 0xd9, 0x6e, 0xbd, 0xb8, 0xd9, 0x4b, 0x3d, 0x6e, 0x7c, 0x61, 0xbd, + 0x00, 0x0e, 0x4d, 0xbb, 0xf8, 0xa5, 0x58, 0xbc, 0x20, 0x15, 0xb6, 0x3b, 0xa0, + 0x58, 0x09, 0x3b, 0xed, 0x15, 0x72, 0xbd, 0x00, 0xc6, 0x1a, 0x3a, 0x90, 0xdf, + 0x44, 0x3d, 0x70, 0xb4, 0x28, 0xbd, 0x66, 0x55, 0x7d, 0xbd, 0x94, 0x94, 0x84, + 0x3c, 0x49, 0xde, 0x32, 0xbd, 0x32, 0x47, 0x13, 0x3d, 0x2e, 0x3b, 0x4a, 0xbd, + 0x8a, 0x6d, 0x53, 0xbd, 0x88, 0x9e, 0x8b, 0xbc, 0xfe, 0x9b, 0xd0, 0xbc, 0xf0, + 0xb2, 0x16, 0x3c, 0x8c, 0x8a, 0x85, 0x3c, 0xd5, 0x73, 0x8b, 0xbd, 0xd6, 0xd6, + 0x02, 0xbd, 0x70, 0x96, 0x22, 0x3d, 0x8a, 0x4b, 0x1c, 0x3d, 0x80, 0x91, 0xeb, + 0x3a, 0x80, 0x29, 0x95, 0x3c, 0x71, 0xf1, 0x8d, 0x3d, 0x3e, 0x5e, 0x5e, 0xbd, + 0xd2, 0x53, 0x63, 0x3d, 0x0b, 0xcb, 0x8d, 0xbd, 0x58, 0x76, 0x5f, 0xbc, 0xc2, + 0xe8, 0x02, 0x3d, 0x9c, 0x96, 0x99, 0x3c, 0xbc, 0xe8, 0x96, 0x3c, 0xff, 0x05, + 0x45, 0xbd, 0x48, 0xa6, 0x02, 0x3d, 0x83, 0x34, 0x87, 0xbd, 0xe4, 0x9a, 0x47, + 0x3d, 0xd8, 0x5f, 0xc5, 0x3c, 0x0c, 0x1c, 0xee, 0xbc, 0x3e, 0x65, 0x46, 0x3d, + 0xe5, 0xd2, 0x10, 0xbd, 0x00, 0x98, 0x9a, 0xbb, 0x06, 0x89, 0x8d, 0xbc, 0xb8, + 0x08, 0xc5, 0xbc, 0x9e, 0xeb, 0xbd, 0xbc, 0x98, 0x4b, 0x78, 0xbd, 0x7d, 0x8a, + 0x7d, 0xbd, 0x00, 0x70, 0xf6, 0x39, 0xe0, 0x0c, 0xba, 0x3b, 0xa2, 0xf4, 0xdf, + 0xbc, 0xca, 0x61, 0x79, 0xbd, 0x44, 0x6f, 0xa3, 0xbc, 0x3c, 0x56, 0xe1, 0x3c, + 0x90, 0xfd, 0x3c, 0xbd, 0x71, 0x08, 0x35, 0xbd, 0xde, 0x28, 0x6b, 0xbd, 0xae, + 0xe2, 0x36, 0x3d, 0xe7, 0x04, 0x1e, 0xbd, 0x94, 0x0b, 0x1a, 0x3d, 0x3a, 0x8f, + 0x26, 0x3d, 0x40, 0xbe, 0x07, 0xbc, 0x10, 0x36, 0x8d, 0xbd, 0x40, 0x7b, 0x06, + 0x3b, 0xd8, 0x7b, 0x2c, 0x3d, 0x4f, 0x09, 0x59, 0xbd, 0x28, 0xc9, 0xeb, 0x3c, + 0x1c, 0xee, 0x7c, 0xbc, 0xf0, 0x79, 0x19, 0x3c, 0xf8, 0x06, 0x72, 0x3c, 0xe0, + 0x83, 0xb5, 0x3b, 0xc8, 0xca, 0x47, 0x3c, 0x88, 0x99, 0x0c, 0x3d, 0xe6, 0x5f, + 0xaf, 0xbc, 0x14, 0x1b, 0x4f, 
0xbc, 0x13, 0x70, 0x80, 0xbd, 0xdd, 0x13, 0x18, + 0xbd, 0x4e, 0xae, 0xe3, 0xbc, 0xaa, 0x98, 0x7d, 0x3d, 0x00, 0xf9, 0x2f, 0x3c, + 0xdd, 0xd1, 0x8c, 0x3d, 0x28, 0x5c, 0x3c, 0x3d, 0x90, 0x81, 0x38, 0x3d, 0x3a, + 0xf4, 0x5d, 0x3d, 0xc2, 0x24, 0x53, 0x3d, 0x00, 0x34, 0x42, 0xbb, 0x32, 0xc8, + 0x78, 0x3d, 0x7a, 0x94, 0xe6, 0xbc, 0x76, 0x8f, 0x80, 0xbc, 0x83, 0xca, 0x8b, + 0x3d, 0x62, 0xfb, 0x78, 0x3d, 0xe9, 0x00, 0x90, 0x3d, 0xe8, 0x9b, 0x1c, 0xbd, + 0x66, 0xd9, 0x8d, 0xbd, 0xa2, 0xe7, 0x73, 0x3d, 0xd8, 0xb6, 0xb9, 0xbc, 0xa0, + 0x55, 0x70, 0x3b, 0x08, 0x5b, 0x00, 0x3c, 0xb4, 0xd0, 0x58, 0xbd, 0xe4, 0x3b, + 0x52, 0xbd, 0xb0, 0x22, 0x3d, 0x3d, 0x4a, 0x4f, 0x81, 0xbd, 0x48, 0xf0, 0x6a, + 0x3c, 0x61, 0xf4, 0x65, 0xbd, 0x34, 0x4e, 0x00, 0x3d, 0xd1, 0x71, 0x3c, 0xbd, + 0x8e, 0x3e, 0x70, 0x3d, 0x55, 0x7a, 0x27, 0xbd, 0x68, 0x22, 0xd5, 0xbc, 0x59, + 0x71, 0x90, 0xbd, 0xc8, 0xb0, 0x60, 0x3c, 0x74, 0x5b, 0x36, 0xbd, 0xdc, 0x16, + 0xbf, 0x3c, 0x62, 0x7a, 0xe3, 0xbc, 0x00, 0x21, 0x8e, 0xba, 0x1e, 0x0d, 0x08, + 0xbd, 0xa3, 0x7a, 0x07, 0xbd, 0xb4, 0x92, 0xee, 0x3c, 0x8d, 0xd2, 0x81, 0x3d, + 0x40, 0xc6, 0x98, 0x3c, 0x78, 0xc1, 0x69, 0x3c, 0x36, 0x9a, 0x72, 0x3d, 0xd2, + 0xfa, 0xe3, 0xbc, 0x42, 0x4c, 0x0e, 0x3d, 0x97, 0x2c, 0x88, 0x3d, 0x78, 0x6f, + 0x13, 0xbc, 0x40, 0x90, 0x7a, 0x3b, 0x66, 0x40, 0x95, 0xbc, 0xb8, 0xe6, 0x33, + 0x3d, 0x64, 0x0c, 0xf1, 0x3c, 0xb3, 0xc0, 0x1f, 0xbd, 0x67, 0x03, 0x03, 0xbd, + 0xe4, 0x7c, 0xfb, 0x3c, 0x7e, 0x22, 0x0e, 0xbd, 0xd6, 0x60, 0x8d, 0xbd, 0xcc, + 0xa2, 0x2c, 0xbd, 0x00, 0xa4, 0xd6, 0x39, 0xf8, 0x7d, 0x8d, 0xbd, 0xe4, 0x27, + 0x9a, 0xbc, 0xd8, 0x19, 0x61, 0xbd, 0xb8, 0x49, 0x54, 0xbd, 0x70, 0xcb, 0xd3, + 0x3b, 0x49, 0xe1, 0x89, 0x3d, 0x06, 0x6c, 0x78, 0x3d, 0xc0, 0xbe, 0x82, 0x3c, + 0x4d, 0x99, 0x8f, 0x3d, 0xd8, 0x0d, 0xe6, 0x3c, 0x4e, 0x2d, 0x60, 0x3d, 0x1c, + 0xab, 0x99, 0x3c, 0x66, 0xc6, 0xcc, 0xbc, 0x28, 0x76, 0x0b, 0xbc, 0x7b, 0x6e, + 0x90, 0x3d, 0x3b, 0x2f, 0x1c, 0xbd, 0x60, 0x1e, 0x83, 0x3b, 0xc8, 0x88, 0xfd, + 0x3c, 0x00, 0x48, 0xa8, 0x3c, 0x40, 0x3d, 0xd4, 0x3b, 0xa4, 0x83, 0xfc, 0x3c, + 0x3c, 0xe7, 0xd8, 0x3c, 0xfe, 0xaa, 0x6f, 0x3d, 0xbb, 0x22, 0x90, 0xbd, 0xd6, + 0xf5, 0x29, 0x3d, 0x8e, 0x7e, 0x65, 0x3d, 0xae, 0x3b, 0xe4, 0xbc, 0xea, 0x04, + 0x54, 0x3d, 0x64, 0x22, 0x1f, 0x3d, 0x24, 0x95, 0x90, 0x3c, 0xcd, 0x7b, 0x21, + 0xbd, 0xd0, 0xf8, 0xb9, 0x3b, 0x26, 0xf8, 0x28, 0xbd, 0x6a, 0x37, 0x5b, 0x3d, + 0x6e, 0x7e, 0x70, 0x3d, 0xa0, 0x90, 0xec, 0x3c, 0x00, 0x8e, 0x0d, 0xbb, 0xe0, + 0xbe, 0x5b, 0xbb, 0x58, 0xf6, 0x9c, 0x3c, 0xbe, 0x59, 0xc0, 0xbc, 0x64, 0x78, + 0xa4, 0x3c, 0x79, 0xfb, 0x86, 0x3d, 0x60, 0x6c, 0x85, 0xbc, 0xba, 0x44, 0x18, + 0xbd, 0x5e, 0xea, 0x6a, 0xbd, 0x6c, 0xf4, 0x36, 0xbd, 0xee, 0xd4, 0x4c, 0xbd, + 0xa2, 0x17, 0x16, 0x3d, 0x98, 0x59, 0xb9, 0x3c, 0x90, 0x41, 0x3d, 0x3c, 0x66, + 0x14, 0x06, 0x3d, 0x40, 0xa2, 0x17, 0xbb, 0xdd, 0x83, 0x75, 0xbd, 0x2c, 0x19, + 0x8f, 0x3c, 0xfe, 0xde, 0x49, 0xbd, 0x57, 0x3d, 0x85, 0x3d, 0x1c, 0xb3, 0xef, + 0xbc, 0x58, 0xdb, 0x3f, 0xbd, 0x0e, 0x38, 0x20, 0x3d, 0x80, 0xbf, 0xa7, 0x3a, + 0xf0, 0xe2, 0x91, 0xbd, 0xcc, 0x0f, 0x0a, 0x3d, 0xc7, 0xad, 0x4d, 0xbd, 0x64, + 0x33, 0x69, 0xbd, 0xc0, 0xc0, 0xd7, 0xbb, 0xb0, 0x16, 0x83, 0xbd, 0xd0, 0xbf, + 0x3c, 0x3d, 0x11, 0x62, 0x87, 0x3d, 0x68, 0x04, 0x0f, 0x3d, 0x6e, 0xee, 0x2a, + 0x3d, 0xb8, 0x70, 0x37, 0xbc, 0x62, 0x76, 0x7e, 0x3d, 0x84, 0xbc, 0xa0, 0x3c, + 0xc0, 0xc9, 0x26, 0xbd, 0x82, 0x1a, 0x85, 0xbd, 0x80, 0x55, 0x8e, 0xbd, 0xe4, + 0xdb, 0x48, 0x3d, 0x60, 0xa5, 0xd6, 0x3b, 0x39, 0x18, 0x92, 0x3d, 0x36, 0x5a, + 0x6c, 0xbd, 0xe8, 0x77, 0xcb, 0x3c, 0x48, 0x9e, 0x12, 0x3d, 
0x3b, 0x40, 0x91, + 0xbd, 0x00, 0xe0, 0xf6, 0x38, 0xd6, 0xa0, 0x2f, 0xbd, 0xe0, 0xe2, 0x0f, 0xbc, + 0xf4, 0x85, 0x50, 0x3d, 0x64, 0xf7, 0x9b, 0x3c, 0xdc, 0x72, 0x53, 0x3d, 0x28, + 0x0b, 0x45, 0xbc, 0x4e, 0xb5, 0x3f, 0xbd, 0x34, 0x7a, 0xea, 0x3c, 0x58, 0xe1, + 0x71, 0x3c, 0x60, 0x5b, 0xf8, 0xbc, 0xf8, 0x3d, 0x52, 0x3c, 0xd0, 0xdc, 0x67, + 0xbd, 0xee, 0x2d, 0x0c, 0x3d, 0x70, 0x47, 0xb0, 0x3c, 0x70, 0x7c, 0x29, 0x3d, + 0xf4, 0x97, 0xc9, 0x3c, 0x74, 0x63, 0x32, 0x3d, 0x6c, 0x17, 0x94, 0x3c, 0x87, + 0xdc, 0x7a, 0xbd, 0xb6, 0xf5, 0x7c, 0x3d, 0x62, 0xd2, 0xe7, 0xbc, 0x99, 0xa5, + 0x50, 0xbd, 0x4c, 0xa2, 0xb1, 0xbc, 0xf0, 0x38, 0xdd, 0xbb, 0xac, 0x44, 0x3f, + 0xbd, 0x34, 0xb7, 0x06, 0x3d, 0xf6, 0x65, 0x25, 0x3d, 0xdb, 0x01, 0x1e, 0xbd, + 0x68, 0xee, 0x19, 0xbc, 0x4c, 0xdd, 0x8a, 0x3c, 0xe0, 0xe4, 0x14, 0xbc, 0x9e, + 0x6f, 0x21, 0x3d, 0x18, 0xd1, 0x59, 0x3d, 0x0c, 0xdd, 0xe1, 0xbc, 0x84, 0xa1, + 0xe6, 0x3c, 0x5c, 0x56, 0xfa, 0x3c, 0xc4, 0x30, 0x8d, 0x3c, 0x9c, 0xba, 0x12, + 0xbd, 0xe0, 0x85, 0xbf, 0xbc, 0x00, 0x1d, 0x62, 0xbb, 0xe4, 0x7a, 0x13, 0x3d, + 0x36, 0x6c, 0x07, 0x3d, 0x88, 0xb1, 0x2a, 0x3c, 0x06, 0xba, 0x16, 0xbd, 0x24, + 0x12, 0xaf, 0x3c, 0x7c, 0x97, 0x3b, 0xbc, 0xe4, 0x3d, 0x2e, 0xbd, 0x8c, 0x86, + 0xa9, 0xbc, 0x6c, 0x70, 0x06, 0x3d, 0x0b, 0x2c, 0x76, 0xbd, 0x72, 0x24, 0xe8, + 0xbc, 0x22, 0xeb, 0x70, 0x3d, 0xf0, 0xfb, 0x7b, 0x3c, 0x62, 0x51, 0x08, 0xbd, + 0x52, 0x97, 0x88, 0xbd, 0x58, 0x8d, 0x76, 0x3c, 0x3c, 0x79, 0xf1, 0x3c, 0x6c, + 0x9b, 0xbd, 0xbc, 0xa4, 0xf4, 0xe9, 0x3c, 0x80, 0x4d, 0x22, 0x3a, 0x78, 0x12, + 0x81, 0x3c, 0x9a, 0xc5, 0x4a, 0x3d, 0xfa, 0x9b, 0x4a, 0x3d, 0x0c, 0x20, 0x7f, + 0xbd, 0x36, 0x46, 0x06, 0xbd, 0x60, 0x13, 0xbd, 0xbb, 0x8e, 0x08, 0x92, 0xbc, + 0xca, 0x25, 0x1c, 0x3d, 0xb2, 0x84, 0x3f, 0x3d, 0x98, 0x3f, 0x47, 0x3d, 0x58, + 0x18, 0x4b, 0x3d, 0x60, 0x91, 0x63, 0xbb, 0xa2, 0x5c, 0xea, 0xbc, 0xc4, 0x8e, + 0x86, 0x3c, 0x5c, 0x76, 0x91, 0xbd, 0x10, 0xa2, 0x1d, 0xbc, 0xe0, 0xcb, 0xb5, + 0xbb, 0x50, 0xd2, 0xe2, 0x3c, 0x98, 0xbd, 0x88, 0xbd, 0x00, 0xd8, 0x0f, 0x39, + 0x72, 0x33, 0x20, 0x3d, 0x00, 0x13, 0xbd, 0x39, 0xae, 0xc3, 0xd1, 0xbc, 0xec, + 0x7e, 0xb8, 0xbc, 0x78, 0xb4, 0x90, 0xbc, 0xc2, 0x01, 0x68, 0x3d, 0x40, 0x0a, + 0x4f, 0xbb, 0xb7, 0xe6, 0x87, 0x3d, 0x35, 0xe8, 0x85, 0x3d, 0x94, 0x2a, 0xe6, + 0x3c, 0xd8, 0x5c, 0x69, 0x3c, 0x20, 0x8e, 0xc2, 0xbb, 0x4c, 0xa2, 0x92, 0x3c, + 0xd6, 0xc7, 0x73, 0x3d, 0xf8, 0x0c, 0xb8, 0x3c, 0x40, 0x90, 0xb9, 0x3a, 0x2e, + 0x2b, 0x31, 0x3d, 0x18, 0xf5, 0x8a, 0x3c, 0x91, 0x95, 0x5b, 0xbd, 0xc0, 0xfa, + 0xc8, 0x3a, 0x72, 0xf1, 0xa9, 0xbc, 0x36, 0x77, 0x48, 0xbd, 0x73, 0x0d, 0x6c, + 0xbd, 0x70, 0x22, 0xe4, 0xbb, 0x88, 0x5c, 0x28, 0x3d, 0xc6, 0x18, 0x3e, 0x3d, + 0x94, 0x3c, 0xd1, 0xbc, 0x7f, 0x43, 0x15, 0xbd, 0xee, 0x0d, 0x9e, 0xbc, 0x62, + 0xff, 0x29, 0x3d, 0xf0, 0x56, 0xf2, 0x3b, 0x22, 0x3f, 0x4e, 0x3d, 0xb6, 0x94, + 0x39, 0xbd, 0x9e, 0xf1, 0x45, 0xbd, 0x87, 0xdb, 0x85, 0x3d, 0xd8, 0x35, 0x65, + 0x3c, 0xcc, 0x13, 0x8a, 0x3c, 0x44, 0x89, 0x64, 0xbc, 0xe6, 0xb5, 0x2a, 0xbd, + 0x28, 0x4f, 0x69, 0x3c, 0x36, 0x45, 0x53, 0x3d, 0x3a, 0xd2, 0xfe, 0xbc, 0xce, + 0xa8, 0xa2, 0xbc, 0x8a, 0x16, 0x7d, 0xbd, 0xc2, 0xd5, 0xd9, 0xbc, 0xa0, 0x4a, + 0x87, 0xbd, 0x9e, 0xc2, 0x2c, 0x3d, 0xfc, 0x3a, 0xaf, 0x3c, 0x9e, 0x10, 0x40, + 0xbd, 0xe0, 0x3a, 0x82, 0x3b, 0x0c, 0xe4, 0xfc, 0x3c, 0xd8, 0x07, 0x57, 0xbd, + 0xba, 0x34, 0x91, 0xbd, 0xc6, 0x42, 0x51, 0x3d, 0xc0, 0xe9, 0xe1, 0x3b, 0x9c, + 0x4a, 0x2a, 0xbc, 0xc6, 0x92, 0x7b, 0x3d, 0x12, 0x9f, 0x59, 0xbd, 0x0c, 0x62, + 0xfd, 0xbc, 0x6c, 0x1a, 0xe6, 0x3c, 0x72, 0x2c, 0x4b, 0x3d, 0x7a, 0xa5, 0x3b, + 0xbd, 0xfa, 
0x37, 0x7b, 0x3d, 0xc0, 0xf0, 0x87, 0xbc, 0x28, 0xd1, 0x5a, 0x3c, + 0xd7, 0x35, 0x6b, 0xbd, 0x7e, 0x9c, 0x6f, 0x3d, 0x1a, 0xf6, 0x23, 0xbd, 0x66, + 0x3b, 0xa2, 0xbc, 0x00, 0xb5, 0x5d, 0xba, 0xbb, 0xc3, 0x52, 0xbd, 0x24, 0x0d, + 0x14, 0x3d, 0x6f, 0x6f, 0x7d, 0xbd, 0x74, 0x88, 0x90, 0xbd, 0xda, 0x8a, 0x68, + 0xbd, 0xb4, 0xe0, 0x5f, 0xbc, 0xb8, 0x32, 0x88, 0xbd, 0x13, 0xc0, 0x81, 0x3d, + 0x2c, 0x07, 0x2e, 0xbd, 0xd0, 0x8a, 0x8a, 0x3b, 0xe2, 0x9e, 0x8a, 0xbd, 0x60, + 0x09, 0x8a, 0x3b, 0xd5, 0x6b, 0x92, 0xbd, 0x90, 0x61, 0x50, 0x3d, 0x62, 0x32, + 0x0f, 0xbd, 0x9b, 0x7c, 0x6f, 0xbd, 0x10, 0x7c, 0xa3, 0x3c, 0x80, 0x22, 0xcc, + 0xbb, 0x20, 0xc6, 0x3a, 0x3d, 0x40, 0xcb, 0x3f, 0x3b, 0xca, 0xa4, 0xdd, 0xbc, + 0xc0, 0x36, 0xbf, 0x3c, 0x40, 0x4f, 0x85, 0x3b, 0x13, 0x52, 0x6c, 0xbd, 0x6b, + 0xa9, 0x6f, 0xbd, 0x58, 0x41, 0x5d, 0xbc, 0xa8, 0x0e, 0x82, 0x3c, 0x7c, 0x92, + 0xf5, 0x3c, 0xfa, 0xd8, 0x5a, 0xbd, 0xcc, 0x79, 0x54, 0x3d, 0xc4, 0x8f, 0x2a, + 0xbc, 0x78, 0xec, 0xdb, 0x3c, 0xf0, 0x95, 0xa9, 0x3b, 0x78, 0x9d, 0xf6, 0xbc, + 0x53, 0x59, 0x55, 0xbd, 0x08, 0x4e, 0xca, 0x3c, 0xcc, 0x95, 0xbb, 0x3c, 0xe4, + 0x91, 0xb4, 0xbc, 0xfb, 0x9d, 0x86, 0xbd, 0x08, 0x68, 0x3f, 0xbc, 0x5d, 0x1b, + 0x84, 0xbd, 0xd0, 0xc8, 0x83, 0x3b, 0x4a, 0x39, 0x54, 0x3d, 0x3c, 0x6e, 0xb6, + 0xbc, 0x70, 0xdd, 0x1b, 0x3c, 0xf4, 0xfc, 0x21, 0xbd, 0x68, 0x25, 0x5e, 0x3c, + 0x01, 0xfc, 0x8e, 0xbd, 0x60, 0xe5, 0x2a, 0x3b, 0x98, 0x51, 0x23, 0xbc, 0x00, + 0xef, 0x0a, 0xba, 0xfc, 0x95, 0x1f, 0xbc, 0xf4, 0x89, 0x55, 0x3d, 0x76, 0x2e, + 0x29, 0x3d, 0xdb, 0x02, 0x86, 0x3d, 0x64, 0xaa, 0x31, 0xbc, 0x7c, 0x3a, 0x9c, + 0xbc, 0x00, 0xf2, 0x64, 0xbd, 0x86, 0xf3, 0x51, 0xbd, 0xc0, 0x2f, 0x9a, 0x3a, + 0xf2, 0xf2, 0xd3, 0xbc, 0x1e, 0x43, 0xcb, 0xbc, 0x6d, 0x44, 0x92, 0x3d, 0x40, + 0xc6, 0x90, 0xba, 0xaa, 0xc9, 0x3e, 0xbd, 0x02, 0xc1, 0x5b, 0x3d, 0x66, 0xeb, + 0x1e, 0x3d, 0xf2, 0x34, 0x63, 0xbd, 0xea, 0xba, 0x66, 0x3d, 0xee, 0x8c, 0x1a, + 0x3d, 0x3b, 0xb9, 0x1e, 0xbd, 0x0a, 0xd2, 0x13, 0x3d, 0xa0, 0xaf, 0x3e, 0x3c, + 0xc0, 0x24, 0x83, 0x3c, 0x90, 0x69, 0xf0, 0xbb, 0x1f, 0x73, 0x86, 0x3d, 0x9d, + 0x21, 0x77, 0xbd, 0x45, 0x4f, 0x8c, 0x3d, 0x40, 0x6d, 0xfe, 0x3c, 0xcb, 0xa5, + 0x8d, 0xbd, 0x00, 0x8d, 0xe5, 0x39, 0x56, 0x9b, 0x55, 0x3d, 0x26, 0x49, 0x5a, + 0xbd, 0x66, 0x93, 0x7a, 0x3d, 0x80, 0x29, 0x4f, 0xba, 0xff, 0xff, 0x82, 0xbd, + 0x50, 0xf9, 0x65, 0x3c, 0x28, 0xa6, 0xb5, 0xbc, 0xdf, 0x70, 0x54, 0xbd, 0x17, + 0xd1, 0x8e, 0xbd, 0x00, 0x3a, 0xb9, 0x3b, 0x26, 0x45, 0x86, 0xbc, 0xad, 0x85, + 0x33, 0xbd, 0x94, 0x78, 0x32, 0x3d, 0x70, 0xcb, 0xa1, 0x3b, 0x40, 0xe5, 0x21, + 0x3d, 0x32, 0xd5, 0xc2, 0xbc, 0xf8, 0x3d, 0x27, 0x3d, 0x28, 0xc0, 0x39, 0xbc, + 0xac, 0xc8, 0x7a, 0xbc, 0xe6, 0xc2, 0xd4, 0xbc, 0x91, 0x81, 0x5c, 0xbd, 0xe1, + 0x6a, 0x90, 0xbd, 0xa9, 0xc8, 0x1d, 0xbd, 0x00, 0x94, 0xcb, 0xb9, 0xe0, 0x0d, + 0x31, 0x3c, 0x00, 0x2a, 0xbe, 0xbb, 0x9a, 0x1e, 0x2a, 0xbd, 0x06, 0xef, 0x7f, + 0x3d, 0xc0, 0xcc, 0x0d, 0x3c, 0xd6, 0x50, 0x74, 0xbd, 0x10, 0x24, 0xcd, 0x3b, + 0x22, 0x4f, 0x0c, 0xbd, 0xc8, 0xf2, 0xaa, 0x3c, 0x9e, 0x84, 0xc8, 0xbc, 0x80, + 0xf2, 0x4e, 0x3c, 0x0c, 0x38, 0x77, 0xbd, 0x6c, 0xab, 0x63, 0xbd, 0xb7, 0x31, + 0x11, 0xbd, 0x25, 0x39, 0x84, 0x3d, 0x31, 0x0b, 0x91, 0x3d, 0xe3, 0x1d, 0x08, + 0xbd, 0x92, 0xb6, 0x1b, 0xbd, 0x65, 0xca, 0x88, 0x3d, 0x1c, 0x62, 0x2c, 0xbd, + 0xda, 0x7b, 0x73, 0x3d, 0xff, 0xbb, 0x85, 0xbd, 0xc4, 0xc7, 0x51, 0x3d, 0x98, + 0xd2, 0x6f, 0xbd, 0x70, 0xa4, 0xe9, 0x3c, 0x74, 0x65, 0xd7, 0x3c, 0x18, 0xdd, + 0x5e, 0x3c, 0x78, 0x1d, 0x04, 0x3d, 0x2c, 0xef, 0x43, 0xbd, 0x48, 0x7d, 0x5e, + 0xbd, 0xd6, 0x02, 0x9f, 0xbc, 0x80, 0x29, 
0xa1, 0x3c, 0x70, 0x64, 0x54, 0x3d, + 0x3e, 0xe0, 0x50, 0x3d, 0xd3, 0x7d, 0x2e, 0xbd, 0x64, 0xdf, 0x55, 0xbd, 0x72, + 0x47, 0x8c, 0xbd, 0xfb, 0x45, 0x12, 0xbd, 0xd6, 0x49, 0x9d, 0xbc, 0xca, 0xd5, + 0x67, 0x3d, 0x50, 0xb9, 0xf4, 0x3c, 0x93, 0xca, 0x1f, 0xbd, 0xa7, 0xe1, 0x8f, + 0xbd, 0xcc, 0x00, 0x52, 0x3d, 0x07, 0xd3, 0x20, 0xbd, 0xd0, 0x26, 0x82, 0xbc, + 0x2a, 0x6e, 0x69, 0x3d, 0x0c, 0x67, 0x70, 0xbd, 0xaa, 0x35, 0xe9, 0xbc, 0xae, + 0x97, 0xba, 0xbc, 0xea, 0x69, 0x3d, 0xbd, 0x28, 0xa0, 0x6f, 0xbc, 0x2a, 0x6a, + 0x67, 0x3d, 0x50, 0xd0, 0x6e, 0x3c, 0x16, 0x90, 0x06, 0x3d, 0x4a, 0xdf, 0x3f, + 0x3d, 0xa0, 0x4e, 0x07, 0x3d, 0x48, 0x0d, 0x55, 0xbd, 0x50, 0x0b, 0xc6, 0xbc, + 0xc4, 0xf3, 0x47, 0xbd, 0x90, 0x09, 0xb3, 0xbb, 0x20, 0xe9, 0x7f, 0xbd, 0xbf, + 0x2e, 0x86, 0xbd, 0xba, 0xcf, 0x74, 0x3d, 0x86, 0xd8, 0xf6, 0xbc, 0x20, 0x65, + 0x57, 0x3d, 0x82, 0xc5, 0x50, 0xbd, 0xac, 0x70, 0x41, 0x3d, 0x0e, 0xb0, 0x40, + 0xbd, 0x4c, 0x30, 0x39, 0xbd, 0x80, 0xa0, 0xe5, 0x3c, 0x20, 0xc2, 0x86, 0xbb, + 0xb8, 0x3d, 0x8c, 0x3c, 0xdf, 0x7e, 0x5f, 0xbd, 0xe0, 0xfd, 0x37, 0x3b, 0x0b, + 0x70, 0x15, 0xbd, 0x00, 0xc1, 0x97, 0xba, 0x9a, 0x38, 0x56, 0xbd, 0x32, 0x67, + 0xdb, 0xbc, 0x4a, 0x22, 0x38, 0x3d, 0x12, 0x1c, 0x7f, 0x3d, 0x88, 0x38, 0xee, + 0x3c, 0x0a, 0x76, 0x61, 0x3d, 0x6d, 0xd7, 0x0a, 0xbd, 0xba, 0xb0, 0x3c, 0x3d, + 0x28, 0xbe, 0x91, 0xbc, 0xa8, 0x3e, 0x0b, 0x3c, 0x54, 0x53, 0xb7, 0x3c, 0x50, + 0x41, 0x57, 0x3c, 0xb4, 0x5d, 0x9b, 0x3c, 0x04, 0xb9, 0x18, 0xbd, 0xa8, 0xd5, + 0x9c, 0xbc, 0x7c, 0x5f, 0x15, 0xbd, 0x64, 0xf3, 0x0d, 0x3d, 0x17, 0x85, 0x90, + 0x3d, 0x5d, 0xf4, 0x51, 0xbd, 0x97, 0x93, 0x30, 0xbd, 0x40, 0x65, 0xe6, 0xbb, + 0x20, 0xa7, 0xc3, 0x3c, 0x10, 0xb1, 0x90, 0x3c, 0xc8, 0x2f, 0x36, 0x3c, 0x6b, + 0x38, 0x8e, 0xbd, 0xd6, 0x6c, 0x62, 0x3d, 0x94, 0x52, 0x4b, 0xbd, 0x48, 0xe5, + 0x15, 0x3d, 0x48, 0x7a, 0x3f, 0x3d, 0x60, 0xb0, 0xdf, 0xbb, 0xc2, 0x53, 0x05, + 0xbd, 0xc0, 0xaa, 0x94, 0x3a, 0xf2, 0xef, 0x68, 0xbd, 0xb0, 0x4d, 0x46, 0xbc, + 0xa0, 0xdc, 0x0e, 0x3b, 0x9c, 0x99, 0x5d, 0xbd, 0xd0, 0x37, 0x63, 0xbd, 0x61, + 0x02, 0x03, 0xbd, 0x80, 0x26, 0x51, 0x3a, 0xa0, 0xab, 0xb5, 0xbb, 0x65, 0x1e, + 0x8d, 0x3d, 0xa0, 0x46, 0xc6, 0x3c, 0x00, 0x48, 0xa3, 0x3c, 0x4d, 0xdf, 0x84, + 0x3d, 0x1c, 0xf1, 0x34, 0xbd, 0x1a, 0xb0, 0x00, 0x3d, 0x86, 0x6e, 0x5a, 0x3d, + 0x02, 0xfe, 0x8b, 0xbd, 0x0e, 0x96, 0x32, 0x3d, 0xe6, 0x1e, 0x91, 0xbc, 0x8a, + 0xe9, 0x6b, 0xbd, 0x4c, 0x53, 0x38, 0x3d, 0x39, 0xf5, 0x90, 0xbd, 0x66, 0x81, + 0x7e, 0x3d, 0xec, 0x33, 0xaa, 0xbc, 0x3e, 0xc4, 0x5c, 0x3d, 0xd8, 0x19, 0x87, + 0xbc, 0x70, 0xd6, 0x52, 0x3d, 0x00, 0x6a, 0xab, 0x3a, 0xda, 0x41, 0x81, 0xbc, + 0xf0, 0xbd, 0xe3, 0x3c, 0x38, 0x66, 0x1e, 0x3c, 0x62, 0x7d, 0x8e, 0xbd, 0xa5, + 0x2a, 0x15, 0xbd, 0xf6, 0x6a, 0x72, 0x3d, 0x72, 0x22, 0x33, 0x3d, 0x8c, 0xb7, + 0x8e, 0xbd, 0xe2, 0xf8, 0x6a, 0xbd, 0x01, 0x40, 0x35, 0xbd, 0xb3, 0xe4, 0x79, + 0xbd, 0xdc, 0xb4, 0x65, 0xbc, 0x3d, 0x74, 0x91, 0x3d, 0x94, 0x0a, 0xe8, 0x3c, + 0x16, 0x25, 0x57, 0xbd, 0xd6, 0x05, 0x0b, 0x3d, 0x16, 0x2b, 0x5f, 0x3d, 0x38, + 0x59, 0xcd, 0xbc, 0x8c, 0x9f, 0x0e, 0x3d, 0xac, 0x67, 0x9c, 0x3c, 0x00, 0xe1, + 0xb3, 0x39, 0x1c, 0x2e, 0xf8, 0x3c, 0xed, 0xfd, 0x80, 0x3d, 0xc6, 0x8b, 0x2b, + 0xbd, 0x08, 0x4d, 0xe0, 0x3c, 0xff, 0x55, 0x85, 0x3d, 0x3c, 0xd0, 0xe9, 0x3c, + 0x30, 0x7c, 0x79, 0x3c, 0xd0, 0xf7, 0x8c, 0x3b, 0x82, 0xe9, 0x7d, 0xbd, 0x54, + 0x3f, 0x46, 0x3d, 0xb8, 0x88, 0xc0, 0x3c, 0xc8, 0xf4, 0x35, 0xbc, 0xe9, 0x19, + 0x85, 0x3d, 0x01, 0x5f, 0x62, 0xbd, 0xea, 0x7f, 0x0f, 0x3d, 0xf8, 0x73, 0x42, + 0xbd, 0x41, 0x97, 0x8f, 0x3d, 0x13, 0xec, 0x80, 0x3d, 0xe7, 0xa8, 0x40, 
0xbd, + 0x08, 0x47, 0x4b, 0x3c, 0x80, 0xce, 0x77, 0xbc, 0xb6, 0x2d, 0x4f, 0xbd, 0xe0, + 0xa7, 0x0b, 0x3b, 0xda, 0xb6, 0x76, 0x3d, 0xc8, 0xce, 0x14, 0x3c, 0xe0, 0xbf, + 0x20, 0xbb, 0x10, 0xa1, 0x94, 0x3b, 0x02, 0x4e, 0x3f, 0x3d, 0xa0, 0xe9, 0x0c, + 0xbc, 0x6a, 0x57, 0x2b, 0xbd, 0x22, 0x09, 0x1d, 0xbd, 0xa8, 0xa6, 0x4c, 0x3c, + 0x21, 0x7d, 0x40, 0xbd, 0x91, 0xdf, 0x87, 0x3d, 0x65, 0xe4, 0x05, 0xbd, 0xdc, + 0xd6, 0x84, 0xbd, 0x22, 0x49, 0x79, 0x3d, 0xf4, 0xf7, 0x40, 0xbc, 0x2c, 0x16, + 0x86, 0xbc, 0xa8, 0x26, 0x40, 0x3d, 0xaa, 0x89, 0xa9, 0xbc, 0xc4, 0x74, 0xc5, + 0xbc, 0x3c, 0x76, 0x83, 0xbc, 0x2b, 0xf7, 0x90, 0x3d, 0xa8, 0x0c, 0x6f, 0xbc, + 0xdc, 0x96, 0x2c, 0x3d, 0xe0, 0x71, 0x88, 0x3c, 0x66, 0x9f, 0x2a, 0xbd, 0xf1, + 0x10, 0x82, 0x3d, 0x41, 0x73, 0x41, 0xbd, 0x7e, 0x2c, 0x21, 0xbd, 0xf0, 0xea, + 0x08, 0x3c, 0x54, 0xb4, 0x2a, 0xbc, 0xf6, 0xf5, 0x64, 0xbd, 0x46, 0xf9, 0x2a, + 0xbd, 0x54, 0xa4, 0x29, 0x3d, 0x1e, 0x79, 0xee, 0xbc, 0xf5, 0x8b, 0x83, 0x3d, + 0x30, 0x04, 0x10, 0x3d, 0x14, 0x83, 0x4e, 0x3d, 0x67, 0x9f, 0x62, 0xbd, 0x00, + 0x01, 0x10, 0xbd, 0x96, 0xc8, 0x2c, 0x3d, 0x3f, 0x58, 0x8e, 0x3d, 0x34, 0xeb, + 0xe1, 0x3c, 0x12, 0x5d, 0x87, 0xbc, 0x0b, 0x23, 0x80, 0x3d, 0x0a, 0x55, 0x81, + 0xbd, 0xc2, 0x80, 0x16, 0xbd, 0x58, 0xa6, 0x7a, 0x3c, 0xec, 0x9a, 0xf1, 0x3c, + 0xf0, 0x0e, 0xaa, 0x3c, 0xe2, 0x06, 0x9a, 0xbc, 0x20, 0x57, 0xec, 0xbb, 0xe8, + 0x5b, 0xc6, 0x3c, 0x40, 0x51, 0x3b, 0x3c, 0x47, 0xf6, 0x8e, 0x3d, 0x6e, 0xc5, + 0x06, 0xbd, 0xac, 0xf6, 0x2b, 0x3d, 0xec, 0x29, 0x05, 0x3d, 0x76, 0xd9, 0x2e, + 0x3d, 0x7c, 0x02, 0x40, 0xbc, 0x5e, 0x98, 0x8b, 0xbc, 0x20, 0xf8, 0x8b, 0x3c, + 0xcc, 0x04, 0x59, 0xbc, 0xd7, 0xfe, 0x8a, 0x3d, 0xda, 0xed, 0x1a, 0xbd, 0x82, + 0x45, 0x9b, 0xbc, 0xfc, 0xa0, 0x7b, 0xbc, 0x14, 0x19, 0x0a, 0x3d, 0x7c, 0x3a, + 0x7d, 0xbd, 0x46, 0x32, 0x91, 0xbd, 0xc0, 0xea, 0x8b, 0x3c, 0x0e, 0x44, 0x78, + 0x3d, 0x96, 0x53, 0x2a, 0x3d, 0x3a, 0xbb, 0x79, 0x3d, 0x1f, 0xe3, 0x19, 0xbd, + 0x56, 0xbb, 0x67, 0x3d, 0x44, 0x48, 0x86, 0x3c, 0x33, 0x5f, 0x8e, 0xbd, 0xc0, + 0x86, 0x8c, 0xbc, 0xb0, 0x2a, 0x8e, 0x3b, 0x20, 0xd2, 0x8f, 0xbd, 0x16, 0x08, + 0x67, 0x3d, 0x4a, 0xc7, 0x67, 0x3d, 0x50, 0x7c, 0xfd, 0xbc, 0xb0, 0xc1, 0x3f, + 0xbd, 0xc0, 0x77, 0xde, 0x3b, 0x98, 0x6b, 0x98, 0xbc, 0x10, 0x91, 0xa0, 0x3b, + 0x80, 0x9a, 0xed, 0x3c, 0xdd, 0xc9, 0x82, 0x3d, 0x2c, 0x20, 0x4d, 0x3d, 0x05, + 0xe9, 0x78, 0xbd, 0x44, 0xae, 0xcd, 0x3c, 0xd8, 0x92, 0x81, 0x3c, 0x57, 0xa3, + 0x77, 0xbd, 0xbe, 0x2e, 0x65, 0xbd, 0x74, 0xfc, 0x41, 0x3d, 0xa2, 0x99, 0x7b, + 0x3d, 0xe0, 0x55, 0x98, 0x3b, 0xe4, 0xdf, 0xa5, 0x3c, 0xcf, 0x0c, 0x16, 0xbd, + 0x68, 0x3f, 0x78, 0xbd, 0xbe, 0xe3, 0x4e, 0x3d, 0xf4, 0x7f, 0x4a, 0x3d, 0xaa, + 0x64, 0x3b, 0xbd, 0xa7, 0xe7, 0x83, 0xbd, 0xe0, 0x45, 0x60, 0x3b, 0x41, 0x1e, + 0x0c, 0xbd, 0x14, 0xa6, 0x90, 0xbd, 0x71, 0x37, 0x5f, 0xbd, 0x72, 0x90, 0xb8, + 0xbc, 0xc6, 0x6e, 0x3b, 0xbd, 0x4d, 0x5e, 0xe0, 0xbc, 0x40, 0x74, 0x5b, 0xbb, + 0xb2, 0x61, 0x06, 0x3d, 0xc8, 0xd6, 0xc1, 0x3c, 0xa9, 0x80, 0x85, 0xbd, 0x76, + 0xe9, 0x20, 0x3d, 0x1a, 0xcc, 0x80, 0x3d, 0x39, 0x17, 0xdf, 0xbc, 0xe1, 0x45, + 0x8c, 0x3c, 0x67, 0x35, 0x48, 0x3d, 0x9d, 0x17, 0x76, 0xbd, 0x38, 0xa6, 0xb2, + 0xba, 0xad, 0x55, 0xaf, 0x3c, 0xf4, 0x50, 0x5e, 0x3d, 0x02, 0x7b, 0xd9, 0xba, + 0x0a, 0x74, 0x0f, 0xbd, 0xa9, 0x69, 0x54, 0x3d, 0x3e, 0xa8, 0x6c, 0x3d, 0xcc, + 0xde, 0x27, 0xbd, 0x4f, 0x51, 0xa7, 0xbb, 0xbf, 0x78, 0x26, 0xbd, 0x66, 0xcc, + 0x84, 0xbd, 0xce, 0x30, 0xcd, 0xbc, 0xab, 0x28, 0x60, 0x3d, 0x97, 0xdb, 0x31, + 0xbd, 0x6f, 0x6f, 0xc3, 0x3b, 0xe0, 0x7e, 0x8c, 0xbd, 0x06, 0xe2, 0xc0, 0xbc, + 0xce, 0x5b, 0x7a, 0xbd, 
0xa5, 0xfb, 0xe1, 0xbc, 0xbd, 0x3b, 0x44, 0xbd, 0x90, + 0xa1, 0xbd, 0x3b, 0xc9, 0xba, 0x34, 0xbc, 0x5f, 0xab, 0x08, 0xbd, 0xf8, 0x5a, + 0x5f, 0x3c, 0x23, 0xbe, 0x8c, 0x3d, 0xbc, 0x19, 0xad, 0xbc, 0xb1, 0xd8, 0x19, + 0xbd, 0x33, 0x7a, 0x85, 0x3d, 0xa5, 0x19, 0xc7, 0x3b, 0x83, 0x55, 0x83, 0xbc, + 0x9d, 0x63, 0x08, 0x3d, 0x36, 0x98, 0x1c, 0x3d, 0x20, 0x2d, 0x2d, 0xbc, 0x6b, + 0xc3, 0x68, 0xbd, 0xbc, 0x22, 0xb6, 0x3c, 0x93, 0xdb, 0xc0, 0x3a, 0x88, 0x17, + 0xdf, 0x3c, 0x0d, 0x0d, 0x2c, 0xbd, 0xc0, 0x40, 0x60, 0x3b, 0xea, 0xf9, 0x3f, + 0xbd, 0x0d, 0xd7, 0x03, 0xbd, 0x45, 0x08, 0x68, 0xbd, 0xb3, 0xa4, 0xe9, 0xbc, + 0xfd, 0xe9, 0x5f, 0x3d, 0x4c, 0x45, 0x0c, 0x3d, 0xff, 0xdb, 0xa3, 0xbc, 0x12, + 0x16, 0x88, 0xbd, 0x70, 0x42, 0xe5, 0xbc, 0x60, 0xda, 0x1c, 0x3c, 0x2b, 0x55, + 0xf8, 0x3b, 0x07, 0x82, 0x87, 0x3c, 0x08, 0x94, 0x83, 0xbd, 0x66, 0xf3, 0x44, + 0x3d, 0x0b, 0xed, 0x10, 0x3c, 0x1b, 0x7e, 0x8f, 0xbd, 0xbe, 0x4c, 0xb5, 0xbc, + 0xc4, 0x84, 0x26, 0x3d, 0x80, 0x5f, 0x6a, 0xbc, 0xb8, 0x41, 0x29, 0x3d, 0xfa, + 0xbc, 0x4a, 0x3d, 0xbe, 0x44, 0x47, 0xbc, 0xc1, 0x9b, 0x21, 0x3d, 0x33, 0xb8, + 0xd7, 0xbc, 0x54, 0xe6, 0x53, 0x3d, 0xd8, 0x95, 0x3d, 0xbd, 0x2b, 0x4d, 0x90, + 0x3d, 0x0c, 0x3c, 0x3a, 0xbc, 0x6c, 0x41, 0x24, 0xbd, 0x31, 0xfd, 0x66, 0xbd, + 0x43, 0x29, 0x4a, 0x3d, 0x00, 0x8d, 0xc3, 0xb9, 0x20, 0xd6, 0xe2, 0xbb, 0xb7, + 0xf6, 0x22, 0xbd, 0xe9, 0xd7, 0x3f, 0x3d, 0x8d, 0xb7, 0xf7, 0x3c, 0x2b, 0x56, + 0x8b, 0x3d, 0xa6, 0xa7, 0x70, 0xbd, 0xdf, 0x62, 0x56, 0x3d, 0xe9, 0x4b, 0xb0, + 0x3c, 0x40, 0xb6, 0x04, 0x3c, 0x34, 0x8c, 0x04, 0xbd, 0xb9, 0x1a, 0x1b, 0x3d, + 0x25, 0xbc, 0x05, 0xbd, 0x3d, 0x10, 0x1c, 0xbd, 0x77, 0x24, 0x8c, 0xbd, 0x53, + 0x9b, 0xdf, 0x3b, 0x80, 0xc9, 0x53, 0x3d, 0x40, 0xc7, 0x6c, 0xbc, 0x00, 0xb3, + 0xbe, 0xba, 0xe5, 0xe9, 0x89, 0x3d, 0xb0, 0x72, 0x88, 0xbd, 0xcd, 0x2d, 0x0c, + 0xbd, 0x27, 0x35, 0x07, 0xbd, 0x6b, 0x6a, 0x49, 0xbd, 0x99, 0x9b, 0x51, 0xbd, + 0x1c, 0x94, 0x51, 0x3c, 0x78, 0x26, 0x6a, 0xbd, 0xc2, 0x3e, 0x04, 0x3d, 0xf3, + 0x19, 0x16, 0xbd, 0x9c, 0xb7, 0x0b, 0xbd, 0xb8, 0x3d, 0xf9, 0x3c, 0x69, 0xdb, + 0x14, 0x3d, 0x0a, 0xe3, 0x0f, 0xbd, 0x1a, 0xd5, 0x80, 0xbd, 0xed, 0x79, 0x8d, + 0x3c, 0x1b, 0x21, 0x00, 0xbb, 0x9a, 0x88, 0x0e, 0x3d, 0xc0, 0x1c, 0x66, 0x3d, + 0x60, 0x74, 0x82, 0xbd, 0x7b, 0x96, 0x1c, 0x3d, 0x53, 0x16, 0x49, 0x3d, 0xeb, + 0xfc, 0x8d, 0x3d, 0xb0, 0x52, 0x32, 0x3c, 0xa0, 0xa5, 0x5a, 0xbd, 0xfe, 0xf7, + 0x9c, 0xbc, 0x19, 0x78, 0x4a, 0x3c, 0x78, 0xd1, 0xc2, 0x3c, 0xb4, 0x51, 0x91, + 0xbd, 0x47, 0x08, 0x76, 0xbd, 0x7e, 0x70, 0x02, 0x3d, 0x8b, 0x90, 0x80, 0xbd, + 0xc0, 0xad, 0x10, 0xbd, 0xc6, 0x2e, 0x4d, 0xbd, 0x0e, 0xe4, 0x0b, 0x3d, 0x9e, + 0x8e, 0x8f, 0x3b, 0xd6, 0x81, 0x8a, 0xbd, 0xb9, 0x43, 0x05, 0xbd, 0xfd, 0xb4, + 0x3d, 0xbd, 0x69, 0x1b, 0xa9, 0xbb, 0x0b, 0xb6, 0x88, 0xbd, 0xe3, 0x8f, 0x64, + 0x3d, 0xd9, 0xda, 0x4d, 0x3c, 0xa8, 0xa9, 0x66, 0xbd, 0x87, 0x10, 0x23, 0x3d, + 0xf6, 0x03, 0x3b, 0x3d, 0xa4, 0xcb, 0x83, 0x3c, 0x36, 0xd0, 0x2a, 0xbd, 0x22, + 0x31, 0x27, 0x3d, 0xf0, 0xfb, 0x18, 0x3d, 0x8e, 0xa1, 0x04, 0x3d, 0x67, 0x0e, + 0x67, 0xbc, 0x77, 0x07, 0x90, 0x3d, 0xaf, 0x11, 0x72, 0x3d, 0x7b, 0xdd, 0x80, + 0x3d, 0x18, 0xd2, 0x6e, 0xbc, 0x0c, 0xfa, 0x5e, 0xbd, 0xe8, 0x92, 0xaf, 0xbc, + 0x8f, 0x89, 0xe9, 0x3c, 0x15, 0x06, 0x1d, 0x3c, 0x02, 0x7f, 0x81, 0x3d, 0x88, + 0xe0, 0x0f, 0xbd, 0x16, 0x6a, 0xab, 0xbc, 0xc4, 0x1f, 0xdf, 0x3c, 0x38, 0xab, + 0x4b, 0x3c, 0x40, 0xfd, 0x83, 0x3b, 0x71, 0x9a, 0x52, 0xbd, 0x90, 0x3f, 0x04, + 0xbd, 0xe4, 0x23, 0x81, 0x3d, 0x4a, 0xaa, 0x39, 0xbd, 0xc1, 0xb6, 0x7c, 0x3d, + 0xa4, 0xb4, 0x2d, 0x3d, 0x3c, 0x8b, 0xea, 0x3b, 0xf3, 
0x93, 0x8e, 0x3d, 0x9b, + 0xea, 0x87, 0xbc, 0x25, 0x22, 0x91, 0xbd, 0xeb, 0x03, 0x1a, 0x3d, 0xde, 0xb3, + 0x41, 0x3d, 0xb3, 0x03, 0x59, 0xbd, 0x98, 0xea, 0x1d, 0xbd, 0xaf, 0x46, 0xd9, + 0xbc, 0xc0, 0x55, 0x3e, 0xbd, 0x4d, 0xe2, 0x45, 0x3d, 0x85, 0xa0, 0x44, 0x3c, + 0x00, 0xe5, 0x3e, 0xbd, 0x6f, 0x4e, 0x4b, 0xbb, 0xe1, 0xcd, 0x86, 0x3c, 0x90, + 0xaa, 0x08, 0xbd, 0xb6, 0xb9, 0x7a, 0x3d, 0x45, 0x80, 0x5c, 0x3d, 0xda, 0x7b, + 0x28, 0xbd, 0x4e, 0x73, 0xc1, 0xbc, 0x8b, 0xff, 0x1b, 0x3d, 0xe0, 0xad, 0x71, + 0xbc, 0x5c, 0xa3, 0xd3, 0xbc, 0x93, 0x08, 0x85, 0x3d, 0xce, 0x42, 0x3a, 0x3d, + 0x31, 0x10, 0x86, 0x3d, 0x28, 0x95, 0x86, 0x3a, 0x81, 0x0e, 0x39, 0xbd, 0xa6, + 0xb2, 0x57, 0x3d, 0x97, 0xab, 0xf8, 0xbc, 0x53, 0x5b, 0x9f, 0xbc, 0x79, 0x78, + 0x54, 0x3d, 0xdc, 0x5b, 0x8b, 0x3d, 0xf5, 0xe7, 0x2d, 0x3d, 0xe7, 0x23, 0xa4, + 0xbc, 0x6a, 0xff, 0x83, 0x3d, 0x53, 0xe7, 0x48, 0x3d, 0x27, 0x3c, 0x8c, 0x3d, + 0x44, 0xdf, 0x74, 0xbd, 0x58, 0xe8, 0xf3, 0xbc, 0x4c, 0x9f, 0x57, 0x3c, 0x6c, + 0xb6, 0x95, 0x3c, 0xbd, 0x8e, 0x65, 0x3d, 0x11, 0x3e, 0xcb, 0x3c, 0x88, 0x0e, + 0x02, 0xbd, 0x68, 0x1c, 0x8d, 0xbb, 0xe9, 0xaa, 0x81, 0x3d, 0x00, 0xcc, 0x35, + 0xbd, 0x4f, 0x0b, 0x8f, 0xbd, 0xa4, 0xaa, 0x40, 0xbc, 0x0a, 0x00, 0xac, 0xbc, + 0xe2, 0x2a, 0x40, 0xbd, 0xc3, 0xff, 0x05, 0xbd, 0x09, 0xbe, 0x65, 0xbd, 0xe6, + 0xde, 0x7e, 0xbd, 0x30, 0x36, 0x17, 0x3c, 0x50, 0x30, 0x0e, 0xbc, 0x64, 0x36, + 0xfa, 0x3c, 0x9d, 0x5a, 0x85, 0xbb, 0x50, 0x2c, 0x65, 0xbc, 0x90, 0x5a, 0xae, + 0xbb, 0x37, 0xe6, 0x41, 0xbd, 0xfd, 0x21, 0xf7, 0xbc, 0xb5, 0x91, 0x8b, 0xbb, + 0x15, 0xaa, 0xbe, 0x3c, 0x86, 0x46, 0x78, 0xbd, 0xd4, 0x41, 0xf8, 0xbc, 0xf2, + 0xb7, 0xe4, 0x3c, 0x1b, 0x84, 0x5a, 0x3c, 0x5a, 0xc8, 0x5e, 0x3d, 0x74, 0xad, + 0xa8, 0x3c, 0x71, 0xbe, 0xa0, 0xbc, 0x9b, 0xaf, 0x2b, 0x3d, 0x43, 0x1b, 0x69, + 0xbd, 0xb3, 0xe7, 0x88, 0x3d, 0xbd, 0xe2, 0x5c, 0x3d, 0x6b, 0xa4, 0x35, 0xbd, + 0xe9, 0xbc, 0x8f, 0xbd, 0x16, 0xc0, 0x74, 0x3d, 0x92, 0xb9, 0x4c, 0x3d, 0x5d, + 0xee, 0x91, 0x3c, 0x74, 0xda, 0x1d, 0xbd, 0xda, 0x42, 0x5a, 0xbb, 0x70, 0x1b, + 0xbc, 0x3c, 0xc3, 0x23, 0xd9, 0xba, 0x6c, 0xf4, 0xa4, 0x3c, 0x9c, 0x95, 0x0a, + 0x3d, 0xb8, 0x03, 0x9e, 0x3c, 0x05, 0x7b, 0x84, 0x3d, 0x88, 0x24, 0x29, 0x3d, + 0x6e, 0xb3, 0x72, 0x3d, 0x36, 0x31, 0x62, 0x3c, 0xea, 0x27, 0x24, 0xbd, 0x6d, + 0xf3, 0xe5, 0x3c, 0x2e, 0x24, 0x1f, 0x3d, 0x69, 0x95, 0x6b, 0xbd, 0xa6, 0xdf, + 0x42, 0xba, 0xdd, 0x6e, 0x90, 0xbd, 0xb3, 0x52, 0x00, 0xbd, 0xbe, 0x22, 0x02, + 0x3d, 0xbf, 0x61, 0x80, 0xbd, 0x8d, 0xde, 0x82, 0x3d, 0xf4, 0x40, 0x28, 0x3d, + 0x7b, 0xeb, 0xb7, 0xba, 0xe1, 0x73, 0x94, 0x3c, 0xae, 0x7f, 0x12, 0xba, 0x02, + 0xf0, 0x40, 0xbb, 0xf1, 0xb7, 0x05, 0x3d, 0x0d, 0xbb, 0x6b, 0xbd, 0xe2, 0x4f, + 0x12, 0xbd, 0x0a, 0x66, 0x09, 0xbd, 0xb7, 0xe9, 0x8f, 0x3d, 0x0d, 0x7c, 0x14, + 0x3d, 0x11, 0xf4, 0xbe, 0xba, 0x09, 0x4d, 0x38, 0xbd, 0x80, 0x94, 0x41, 0x3a, + 0xd3, 0x89, 0xc2, 0x3c, 0xd8, 0x3a, 0x3d, 0x3c, 0x28, 0x00, 0x5f, 0xbc, 0xc4, + 0x2a, 0x91, 0xbc, 0x50, 0x98, 0xe6, 0xbc, 0xfa, 0x52, 0x16, 0x3d, 0x3c, 0xb5, + 0x87, 0x3d, 0xed, 0xcf, 0x70, 0x3c, 0x78, 0x9e, 0x72, 0xbb, 0x93, 0x6b, 0x23, + 0x3d, 0xf0, 0xaf, 0x64, 0xbd, 0xce, 0xd7, 0x5e, 0xbd, 0x6c, 0x20, 0x7b, 0xbc, + 0xd0, 0x7a, 0xe0, 0xbb, 0x60, 0xfd, 0xef, 0x3b, 0x95, 0xe5, 0x5f, 0xbd, 0xdf, + 0x49, 0x33, 0x3c, 0x11, 0x3d, 0x80, 0x3d, 0xd4, 0x04, 0xc8, 0x3c, 0x58, 0xc0, + 0x41, 0xbd, 0x50, 0x35, 0x63, 0x3d, 0xd2, 0x8a, 0xc8, 0xbc, 0x67, 0xf0, 0x8b, + 0xbd, 0x69, 0x02, 0x55, 0x3d, 0x0c, 0xa1, 0x76, 0xbd, 0xa8, 0x5e, 0x05, 0xbb, + 0xd0, 0xc3, 0x16, 0x3d, 0x78, 0x7f, 0x23, 0xbc, 0x59, 0x25, 0x5c, 0xbd, 0xb4, + 0xaf, 
0x36, 0xbd, 0x26, 0xc1, 0xd0, 0xb9, 0xa3, 0xb9, 0x54, 0x3d, 0xd3, 0x99, + 0xea, 0xbc, 0x56, 0x87, 0xfc, 0xbc, 0x86, 0x17, 0x16, 0xbd, 0x80, 0x75, 0x17, + 0xbd, 0xe9, 0xe9, 0x26, 0xbd, 0x73, 0xd9, 0x7f, 0xbd, 0x78, 0xf7, 0x08, 0x3d, + 0xb4, 0x6e, 0x24, 0x3d, 0xdb, 0x78, 0x04, 0x3d, 0x91, 0x4e, 0x5e, 0x3d, 0x93, + 0x73, 0x86, 0x3d, 0xd5, 0xc8, 0x41, 0xbd, 0x18, 0x68, 0x79, 0x3d, 0x1e, 0x5e, + 0x74, 0xbd, 0x05, 0x92, 0x43, 0x3d, 0xed, 0xd7, 0xcb, 0x3c, 0x90, 0x04, 0x48, + 0xbd, 0x2a, 0x81, 0x59, 0xbd, 0xa6, 0xf8, 0x8f, 0xbd, 0x21, 0x1b, 0x82, 0x3d, + 0x47, 0x2f, 0x03, 0xbd, 0x49, 0x8a, 0xea, 0x3b, 0x82, 0x20, 0x29, 0x3d, 0x3e, + 0x06, 0x0a, 0x3b, 0x0d, 0xe3, 0x93, 0x3c, 0x3f, 0xb2, 0x83, 0x3d, 0x57, 0x42, + 0xe4, 0x3b, 0x02, 0x82, 0xde, 0xbc, 0x75, 0x96, 0x0a, 0xbd, 0x66, 0xb5, 0x0a, + 0x3d, 0x11, 0xed, 0x8d, 0xbd, 0xc5, 0x7c, 0x61, 0xbd, 0x85, 0xde, 0x56, 0xbc, + 0x2f, 0x3e, 0x41, 0xbd, 0x65, 0x92, 0x70, 0x3d, 0x10, 0x6d, 0xd8, 0xbb, 0x6e, + 0x7b, 0x45, 0x3d, 0xe0, 0xcd, 0x58, 0x3d, 0x5a, 0xa0, 0x6c, 0xbd, 0x25, 0x13, + 0x2f, 0xbd, 0x95, 0xcf, 0x6b, 0xbd, 0x42, 0x36, 0x20, 0xbc, 0x3c, 0x82, 0x47, + 0x3c, 0x71, 0xef, 0x16, 0x3c, 0x50, 0xa2, 0xb8, 0xba, 0x7e, 0xc4, 0x61, 0x3c, + 0xa6, 0xc5, 0x78, 0xbd, 0xb9, 0x33, 0x32, 0xbd, 0x47, 0x60, 0x81, 0x3d, 0x58, + 0xd9, 0x16, 0x3d, 0x3a, 0x50, 0x7a, 0xbd, 0x47, 0xc7, 0x15, 0x3d, 0x00, 0xca, + 0x8a, 0xbd, 0x6f, 0x8f, 0x83, 0xbd, 0x7b, 0x4f, 0x58, 0xba, 0x30, 0x8f, 0x43, + 0xbd, 0xd1, 0x28, 0xd6, 0xbb, 0x20, 0x94, 0xf7, 0xbc, 0x84, 0xef, 0x25, 0xbd, + 0x06, 0x79, 0x6f, 0x3d, 0xdb, 0x3e, 0xcd, 0x3c, 0xc7, 0xce, 0x79, 0x3d, 0x23, + 0x71, 0x97, 0xbc, 0x5c, 0x5c, 0x38, 0x3d, 0xc8, 0xb6, 0x03, 0xbd, 0xd6, 0x31, + 0xc6, 0xbc, 0x33, 0xe1, 0xd0, 0xbb, 0x66, 0xf2, 0xd5, 0xbc, 0xe2, 0x07, 0x49, + 0x3d, 0x2c, 0x67, 0xc9, 0xbc, 0x71, 0xd2, 0x41, 0xbd, 0x1a, 0xb4, 0x81, 0x3c, + 0xf0, 0x27, 0x7d, 0x3d, 0xca, 0xcc, 0xd5, 0xbc, 0x3f, 0x3e, 0x30, 0xbd, 0x50, + 0xe1, 0x26, 0xba, 0x53, 0x7d, 0x00, 0x3d, 0x8e, 0x75, 0x4d, 0x3b, 0x0a, 0x56, + 0x20, 0x3d, 0x61, 0xaf, 0xf4, 0xbc, 0x55, 0x41, 0x98, 0xbc, 0x16, 0x66, 0x13, + 0x3d, 0x40, 0x96, 0x67, 0xbd, 0x40, 0x3a, 0x0b, 0xbd, 0xbe, 0x16, 0x88, 0xbc, + 0x54, 0xd1, 0x56, 0xbd, 0xd5, 0xa2, 0xba, 0xbb, 0x97, 0x30, 0x1f, 0xbb, 0x37, + 0x2d, 0x18, 0xbd, 0xe7, 0xe3, 0x8e, 0xbd, 0x82, 0x9b, 0x29, 0x3c, 0x8f, 0x41, + 0x24, 0xbd, 0xa2, 0x55, 0x8f, 0x3b, 0x25, 0xa4, 0x18, 0x3c, 0xb6, 0xee, 0xe7, + 0x3c, 0x3a, 0x0b, 0x12, 0xbd, 0x27, 0xfb, 0xb4, 0xb9, 0x70, 0x41, 0x0a, 0xbc, + 0xe8, 0x8b, 0x62, 0xbd, 0x04, 0x95, 0xc5, 0x3c, 0xa4, 0x51, 0x46, 0xbd, 0x42, + 0x1e, 0x65, 0xbd, 0x4f, 0x3d, 0x4a, 0x3d, 0x6f, 0x9d, 0x19, 0x3d, 0xb8, 0xdb, + 0x8c, 0xbd, 0x9a, 0xfe, 0x23, 0x3c, 0x0c, 0x8a, 0x58, 0x3d, 0xe2, 0x61, 0x62, + 0xbd, 0x1f, 0xee, 0x64, 0x3c, 0x0c, 0xb0, 0x9a, 0x3b, 0xe8, 0x9f, 0xf7, 0xbc, + 0x54, 0xf9, 0xef, 0xbc, 0xbb, 0x3b, 0x57, 0x3a, 0xcc, 0x92, 0xa6, 0x3c, 0xfa, + 0x7f, 0xf0, 0x3c, 0x92, 0x0c, 0x03, 0x3d, 0xc4, 0xa7, 0x0b, 0xbd, 0x3d, 0xf1, + 0x8b, 0xbd, 0x6a, 0x7a, 0x4c, 0xbd, 0xfe, 0x96, 0xdc, 0x3c, 0xf8, 0x93, 0x99, + 0x3b, 0xe4, 0xd7, 0x70, 0x3d, 0x72, 0x25, 0x4f, 0x3d, 0xc0, 0xa1, 0x80, 0xbd, + 0xb8, 0xac, 0x50, 0x3d, 0x87, 0x18, 0x87, 0xbc, 0xcc, 0xe2, 0x01, 0xbd, 0x70, + 0x67, 0xfb, 0xbb, 0xda, 0x29, 0x7c, 0x3d, 0xe6, 0xf0, 0x67, 0x3d, 0x98, 0xd8, + 0x0e, 0x3d, 0xe8, 0xf6, 0x45, 0xbd, 0xcc, 0x76, 0x57, 0xbd, 0x12, 0xec, 0x02, + 0x3d, 0x02, 0x73, 0xbf, 0x3c, 0xea, 0x67, 0x9e, 0x3a, 0x29, 0x29, 0x1f, 0x3d, + 0x19, 0x65, 0x2a, 0x3d, 0x9c, 0x3a, 0x86, 0x3d, 0xd8, 0xcd, 0x15, 0xbd, 0xf3, + 0xed, 0x75, 0xbd, 0xa6, 0x30, 0xff, 
0xbc, 0x87, 0x2e, 0xc7, 0x3c, 0xe6, 0x41, + 0xb9, 0x3c, 0x38, 0xf9, 0xb0, 0x3c, 0x49, 0x88, 0x8c, 0xbd, 0xf2, 0x2b, 0x70, + 0x3d, 0x3d, 0x58, 0xec, 0x3b, 0xa2, 0x59, 0x3a, 0x3c, 0x3f, 0x5f, 0x3a, 0x3d, + 0x5f, 0xb9, 0x48, 0xbd, 0x09, 0x9a, 0xc5, 0x3b, 0x12, 0x63, 0x84, 0xbd, 0x11, + 0x76, 0x5e, 0x3d, 0x4f, 0xa0, 0x84, 0x3d, 0x90, 0x8b, 0x29, 0xbd, 0x03, 0xcc, + 0x2c, 0xbd, 0xbe, 0x89, 0x8f, 0xbd, 0xa5, 0x7a, 0x81, 0x3d, 0x54, 0xa8, 0xd0, + 0x3c, 0x54, 0x70, 0x9d, 0xbb, 0x4a, 0xe4, 0xb9, 0xbc, 0x94, 0x65, 0xfe, 0xbc, + 0x3c, 0xef, 0xac, 0x3c, 0x4c, 0x87, 0x16, 0xbd, 0x0a, 0xda, 0x85, 0xbc, 0x89, + 0x04, 0x88, 0x3d, 0xb6, 0xe7, 0x19, 0x3d, 0x38, 0x06, 0x08, 0xbd, 0x37, 0x6c, + 0x3d, 0xbd, 0x75, 0x70, 0x09, 0x3d, 0x13, 0x5c, 0x7f, 0xbd, 0xe2, 0x25, 0xfb, + 0x3c, 0x74, 0xe4, 0x06, 0x3d, 0xd8, 0xcb, 0x82, 0x3d, 0xbc, 0xa0, 0xeb, 0xbc, + 0xaf, 0xb1, 0x8e, 0xbd, 0x30, 0x53, 0xdc, 0x3b, 0x4b, 0x94, 0x84, 0x3d, 0xc9, + 0x6d, 0xcd, 0x3c, 0xd1, 0x47, 0x8e, 0x3d, 0x5e, 0x1a, 0x15, 0xbc, 0x0b, 0xe3, + 0xb2, 0x3c, 0x4c, 0x7f, 0xfb, 0x3c, 0x6e, 0x6d, 0x53, 0x3d, 0xdc, 0xa5, 0x8d, + 0x3d, 0x71, 0x25, 0x85, 0xbd, 0xc8, 0xa9, 0x17, 0xbc, 0xe1, 0xcd, 0xf3, 0xbc, + 0xbd, 0xc5, 0x5f, 0xbd, 0xde, 0xbc, 0x07, 0x3d, 0x2a, 0x50, 0x91, 0x3c, 0x12, + 0x64, 0x9a, 0x3b, 0x54, 0x8b, 0x02, 0x3d, 0x2d, 0x77, 0x8b, 0xbd, 0x83, 0x37, + 0x82, 0x3d, 0x5f, 0xdb, 0x50, 0xbd, 0xba, 0xe6, 0x63, 0x3d, 0x2d, 0x97, 0x21, + 0x3d, 0xfe, 0xba, 0x80, 0x3d, 0xe4, 0xc2, 0x39, 0xbd, 0x8d, 0x37, 0x94, 0x3c, + 0x8d, 0xe8, 0xb0, 0xbc, 0x0e, 0xbc, 0xa9, 0xbc, 0xbb, 0xfb, 0xb1, 0xbb, 0xff, + 0xdb, 0x13, 0xbd, 0x15, 0x1e, 0x1f, 0xbd, 0xe6, 0x81, 0x51, 0xbd, 0xf1, 0x39, + 0xaf, 0xbc, 0x86, 0x69, 0x68, 0xbd, 0x33, 0x5c, 0xe8, 0x3c, 0x25, 0xd3, 0x5d, + 0xbd, 0x77, 0xf4, 0x0e, 0xbd, 0x5f, 0x4b, 0xec, 0x3c, 0xc4, 0x6c, 0xfc, 0x3c, + 0x39, 0x1e, 0xc9, 0x3c, 0x2c, 0xdc, 0x6f, 0xbd, 0xf0, 0xdd, 0x5b, 0x3c, 0xba, + 0x58, 0x63, 0x3d, 0x20, 0xb8, 0x9c, 0x3b, 0x58, 0x4e, 0xb6, 0xbc, 0x47, 0x2d, + 0xc4, 0xbc, 0x0c, 0x5b, 0x6b, 0x3d, 0x00, 0x18, 0xed, 0xb9, 0x96, 0xa9, 0x9e, + 0x3c, 0x42, 0x5c, 0x4a, 0xbb, 0x94, 0x9f, 0x85, 0xbd, 0x10, 0xdd, 0xcd, 0x3c, + 0x47, 0x98, 0x8c, 0xbd, 0x28, 0x33, 0x6f, 0xbd, 0x6c, 0x52, 0x21, 0x3d, 0x41, + 0x5c, 0x45, 0x3c, 0xf7, 0x7c, 0x36, 0xbd, 0x6d, 0xf5, 0xdb, 0xbc, 0x30, 0x95, + 0x87, 0x3d, 0xed, 0x8a, 0x8f, 0xbd, 0x79, 0x78, 0x88, 0xbd, 0x0c, 0x54, 0x1c, + 0xbc, 0x82, 0xa3, 0xa7, 0x3b, 0x1f, 0xcf, 0x76, 0xbd, 0x71, 0x23, 0x8b, 0x3c, + 0x01, 0xc3, 0x87, 0x3d, 0x54, 0xb5, 0xe5, 0x3c, 0x3e, 0x2f, 0x17, 0xbd, 0x99, + 0xb5, 0x13, 0x3d, 0x69, 0xf7, 0xad, 0x3c, 0xb1, 0x19, 0x13, 0xbc, 0x0e, 0xf8, + 0x5b, 0xbd, 0x74, 0x52, 0x82, 0x3d, 0x7a, 0x5f, 0xfd, 0xbb, 0x2b, 0x17, 0x15, + 0xbd, 0x05, 0x3c, 0x72, 0xbd, 0x18, 0xbd, 0xb9, 0xba, 0xaf, 0x8e, 0xc5, 0xbc, + 0x7a, 0x8f, 0xc3, 0xbb, 0xd9, 0x64, 0x14, 0xbd, 0x97, 0xdf, 0x55, 0x3d, 0x99, + 0x96, 0xac, 0xba, 0x4f, 0x5c, 0x84, 0x3d, 0xa4, 0x57, 0x27, 0x3d, 0xf8, 0x8e, + 0x81, 0xbd, 0xf8, 0xef, 0x55, 0x3c, 0x0e, 0x2d, 0x59, 0xbd, 0xf1, 0xeb, 0x52, + 0x3a, 0x06, 0xde, 0x94, 0x3c, 0x53, 0x8e, 0x17, 0xbd, 0x5d, 0x25, 0x86, 0x3c, + 0x1c, 0x8c, 0x8b, 0xbc, 0x32, 0xa0, 0x1c, 0x3d, 0x2e, 0xb3, 0x53, 0x3d, 0x2e, + 0x1c, 0x3f, 0x3d, 0x38, 0xb0, 0xf1, 0x3c, 0x95, 0xc2, 0x55, 0xbb, 0x74, 0x05, + 0x39, 0xbd, 0x4a, 0xa6, 0x27, 0x3b, 0xb3, 0x63, 0xd8, 0x3c, 0xd6, 0x03, 0x83, + 0x3d, 0x24, 0x65, 0x49, 0xbd, 0x18, 0x9e, 0xee, 0x3c, 0x26, 0xf0, 0x85, 0xbd, + 0xfc, 0xd0, 0x67, 0xbd, 0x43, 0xca, 0x12, 0xbd, 0xb1, 0xec, 0x03, 0x3d, 0x00, + 0x1e, 0x74, 0x3c, 0xb5, 0x32, 0xa6, 0xbc, 0x3d, 0x56, 0x65, 0x3d, 
0x8b, 0x0e, + 0xa9, 0xbc, 0x03, 0x1e, 0x91, 0x3d, 0x64, 0x8f, 0x88, 0x3d, 0x1c, 0x50, 0xb5, + 0xbc, 0xe4, 0xb3, 0x05, 0xbd, 0x2c, 0x4f, 0x59, 0xbd, 0x29, 0x30, 0x23, 0xbd, + 0x0c, 0x23, 0x56, 0xbd, 0x7d, 0x77, 0x82, 0xbc, 0x45, 0x1a, 0xa4, 0x3c, 0xb7, + 0x9c, 0x0f, 0xbc, 0xc5, 0x76, 0xd8, 0xbc, 0x7f, 0x4f, 0x78, 0xbd, 0xb4, 0x07, + 0x82, 0x3c, 0x56, 0xcc, 0x6a, 0xbd, 0xc3, 0x11, 0x29, 0x3c, 0xa5, 0xf6, 0x7a, + 0x3d, 0x8a, 0x88, 0xc4, 0x3c, 0x00, 0xf8, 0xa2, 0xbc, 0x30, 0x08, 0x50, 0xbd, + 0x59, 0xcf, 0xb1, 0xbc, 0xd1, 0xba, 0x52, 0xbd, 0xc0, 0xe8, 0xbe, 0x3b, 0xc3, + 0xb8, 0xfe, 0xbc, 0x22, 0xc5, 0x84, 0xbd, 0xef, 0x51, 0xbd, 0x3a, 0x75, 0x42, + 0xc8, 0xbc, 0x1a, 0x32, 0x88, 0x3d, 0x2a, 0x26, 0xc2, 0xbc, 0x66, 0x17, 0x2a, + 0xbd, 0x1d, 0x0f, 0x7f, 0x3d, 0x55, 0x2f, 0x8f, 0x3b, 0x01, 0x47, 0x8c, 0x3d, + 0x3a, 0x01, 0x18, 0x3d, 0xca, 0xa0, 0xea, 0xbc, 0x3e, 0x16, 0x34, 0xbd, 0xe8, + 0xf7, 0x75, 0x3c, 0x20, 0xee, 0x49, 0x3c, 0x6a, 0xc1, 0x3b, 0xbd, 0xa0, 0x98, + 0x5c, 0xbd, 0x60, 0x8e, 0x94, 0x3b, 0xa2, 0x9b, 0x8a, 0x3d, 0x10, 0x4d, 0x4f, + 0x3d, 0x87, 0xe4, 0x45, 0xbd, 0xb6, 0x17, 0xdd, 0x3b, 0xee, 0x06, 0x71, 0xbd, + 0xca, 0xb4, 0xe0, 0x3c, 0xd4, 0x9d, 0x0b, 0xbd, 0xba, 0x3a, 0x21, 0x3d, 0x6c, + 0xfd, 0xaa, 0x3c, 0x35, 0x20, 0x61, 0xbd, 0x20, 0x51, 0x52, 0x3d, 0x96, 0xcc, + 0x29, 0xbd, 0x9f, 0x99, 0x22, 0x3d, 0x06, 0x2d, 0xdb, 0xba, 0xdb, 0xf1, 0x90, + 0x3c, 0xf9, 0x05, 0x06, 0x3d, 0xdf, 0x02, 0xcb, 0x3c, 0x02, 0xb8, 0xf8, 0xbc, + 0x70, 0x14, 0x50, 0xbd, 0x51, 0xdc, 0x88, 0x3d, 0xa8, 0xa5, 0xd6, 0xbc, 0x69, + 0xd7, 0x8e, 0x3d, 0xbe, 0x91, 0x86, 0xbd, 0x5d, 0x93, 0x12, 0xbd, 0x7c, 0x23, + 0x60, 0xbd, 0xb2, 0x55, 0xb7, 0x3c, 0x38, 0xb8, 0x0e, 0x3d, 0x88, 0x86, 0x0e, + 0x3c, 0x9a, 0x4b, 0x0d, 0x3d, 0x00, 0xfa, 0x1a, 0x3b, 0xb8, 0x59, 0xbf, 0x3c, + 0xbe, 0xa8, 0xea, 0x3c, 0xfc, 0xf4, 0xf3, 0x3c, 0xbf, 0x69, 0x17, 0x3d, 0x82, + 0xe6, 0x84, 0xbd, 0x9d, 0xde, 0x3e, 0xbd, 0x3a, 0x02, 0x5b, 0xbd, 0x04, 0x34, + 0x8b, 0xbd, 0x83, 0x26, 0xc5, 0x3c, 0x71, 0x0c, 0x17, 0x3d, 0x44, 0x33, 0x5a, + 0xbd, 0xe0, 0x15, 0xe4, 0x3b, 0xd9, 0x25, 0x80, 0xbd, 0xbb, 0xac, 0x56, 0xbd, + 0x54, 0x26, 0x6f, 0xbd, 0x30, 0x23, 0xa2, 0x3b, 0x08, 0x7c, 0x27, 0xbd, 0xba, + 0x00, 0xde, 0xbc, 0x80, 0x47, 0x8f, 0xbd, 0xca, 0x52, 0x17, 0xbd, 0xf0, 0x9a, + 0x0a, 0x3d, 0xe9, 0x6a, 0xea, 0x3b, 0x12, 0xaa, 0x65, 0x3d, 0x3e, 0x1a, 0x49, + 0x3d, 0x3b, 0x68, 0x30, 0xbd, 0xfb, 0x34, 0x3d, 0x3d, 0x0c, 0x21, 0xe3, 0x3c, + 0x13, 0x68, 0x67, 0xbb, 0xe5, 0xaf, 0x8b, 0xbd, 0xfe, 0x2b, 0x00, 0xbd, 0x5e, + 0x1e, 0x4a, 0xbd, 0xb2, 0x94, 0x70, 0x3d, 0xa0, 0x7e, 0x47, 0x3b, 0xde, 0xa9, + 0xef, 0xbc, 0x84, 0x2f, 0x1a, 0x3a, 0x26, 0xb6, 0xf8, 0x3c, 0xe4, 0xab, 0xd9, + 0xbc, 0xa8, 0x0b, 0x87, 0xbd, 0x70, 0x2c, 0xbd, 0x3c, 0x32, 0xb2, 0x8c, 0x3c, + 0xce, 0x0f, 0x34, 0xba, 0xc7, 0xc9, 0x3b, 0xbd, 0x22, 0xdb, 0xf3, 0xbc, 0x8d, + 0x4e, 0x48, 0xbd, 0xf0, 0x63, 0x53, 0x3d, 0x04, 0xd6, 0xc7, 0x3b, 0xfa, 0x40, + 0x6c, 0xbd, 0x22, 0xfb, 0x80, 0x38, 0xe9, 0x8c, 0x0e, 0x3c, 0xc4, 0x60, 0x27, + 0x3d, 0xaa, 0xcf, 0x60, 0x3d, 0xfe, 0x59, 0x08, 0x3d, 0x6e, 0x69, 0x43, 0xbd, + 0xcb, 0xa1, 0x03, 0xbd, 0x16, 0x47, 0x72, 0x3d, 0xc1, 0x37, 0x5d, 0x3d, 0x53, + 0x6f, 0x8b, 0xbd, 0x50, 0x99, 0x18, 0x3d, 0x65, 0x92, 0x89, 0x3d, 0x12, 0x80, + 0x94, 0xbd, 0x8d, 0x1d, 0x21, 0xbd, 0x6e, 0xc6, 0x69, 0x3d, 0x18, 0x1d, 0x23, + 0x3d, 0x3e, 0x2b, 0x00, 0x3d, 0xe4, 0x71, 0x4f, 0xbd, 0xfb, 0xc5, 0x0e, 0xbd, + 0x6e, 0x24, 0x47, 0x3d, 0x34, 0xf0, 0x50, 0x3c, 0x3f, 0x38, 0x89, 0x3d, 0xb5, + 0x84, 0x41, 0xbc, 0xb8, 0xdc, 0x56, 0x3d, 0x3b, 0x56, 0x60, 0xbc, 0x5a, 0x3b, + 0x58, 0x3d, 0x86, 
0x56, 0x6d, 0xbd, 0x4f, 0x33, 0x43, 0x3d, 0x7e, 0x6c, 0x7d, + 0x3c, 0xb9, 0x4c, 0x8b, 0x3d, 0x00, 0x88, 0x3f, 0x3a, 0x3a, 0xb8, 0xc1, 0x3c, + 0x02, 0x18, 0x30, 0x3d, 0x6b, 0xb4, 0x4c, 0xbd, 0x0d, 0xd8, 0x3c, 0x3d, 0x9a, + 0x25, 0x61, 0xbd, 0x87, 0x7b, 0xa7, 0xbc, 0x76, 0x8e, 0x06, 0xbb, 0x47, 0xf9, + 0x73, 0xbd, 0x80, 0xfa, 0x28, 0xbb, 0xd4, 0xd1, 0x76, 0xbd, 0x9a, 0xcb, 0x29, + 0xbd, 0xf6, 0x0f, 0xe5, 0xbc, 0x6d, 0xeb, 0x4f, 0xbd, 0x46, 0xe8, 0x69, 0xbc, + 0x9a, 0x72, 0x69, 0x3d, 0x55, 0x19, 0x86, 0xbd, 0xba, 0x77, 0x0f, 0x3d, 0x4d, + 0xf6, 0x64, 0x3d, 0xf4, 0xf6, 0x19, 0x3d, 0xc3, 0x53, 0x4a, 0x3d, 0x83, 0xc4, + 0x7f, 0x3c, 0xb6, 0xcb, 0x53, 0xbd, 0xc5, 0x99, 0x83, 0xbd, 0xa9, 0xcb, 0x4e, + 0xbd, 0xbc, 0xc0, 0xf3, 0x3c, 0xc3, 0x45, 0x2c, 0x3d, 0x6a, 0x2f, 0x93, 0xbd, + 0x8d, 0x05, 0x67, 0x3d, 0xec, 0x6f, 0x3a, 0x3d, 0xf5, 0x47, 0x5a, 0x3d, 0xca, + 0xa6, 0x79, 0x3d, 0x16, 0x97, 0x7d, 0xbd, 0x53, 0x30, 0x52, 0x3d, 0x07, 0x81, + 0x52, 0x3d, 0xf7, 0xae, 0xa6, 0xbc, 0xa3, 0xc2, 0xa4, 0xbc, 0x5c, 0xd8, 0x23, + 0xbd, 0xc5, 0x77, 0x50, 0x3d, 0x28, 0x78, 0x47, 0x3c, 0xe7, 0xe2, 0x04, 0xbd, + 0xcc, 0x6f, 0x83, 0xbd, 0x4c, 0x2b, 0xfc, 0xbc, 0x42, 0xf8, 0xf6, 0x3c, 0x03, + 0x7c, 0x87, 0x3d, 0x2d, 0x4d, 0x80, 0xbd, 0x08, 0x59, 0x65, 0x3d, 0x2b, 0x4a, + 0x3a, 0xbd, 0xae, 0xec, 0x68, 0x3d, 0x1e, 0x42, 0x85, 0xbd, 0xd6, 0x06, 0x6a, + 0x3d, 0x6e, 0xfe, 0x65, 0xbd, 0x77, 0xef, 0xb0, 0x3c, 0x81, 0xb1, 0x48, 0x3c, + 0x86, 0x4b, 0x57, 0xbd, 0x1e, 0x45, 0x82, 0x3c, 0x9b, 0x6c, 0x0f, 0xbd, 0xeb, + 0x5f, 0x1c, 0xbd, 0xc3, 0x49, 0x3b, 0x3d, 0x5b, 0x31, 0x7b, 0xbd, 0xee, 0xcb, + 0x0c, 0xbd, 0x49, 0xa6, 0xa7, 0x3c, 0x89, 0x96, 0x73, 0xbd, 0x4d, 0xcf, 0x89, + 0x3d, 0xec, 0x73, 0xe1, 0x3b, 0x0e, 0x74, 0x0b, 0x3c, 0xc4, 0x52, 0xe1, 0xbc, + 0xf9, 0x15, 0x5f, 0x3d, 0x4a, 0x6c, 0x6c, 0xbd, 0x1d, 0x1d, 0xc7, 0xbb, 0xa2, + 0x11, 0x26, 0x3d, 0x92, 0xa6, 0x00, 0xbd, 0xe8, 0x29, 0x52, 0x3d, 0x6c, 0x9f, + 0xc3, 0x3c, 0xa9, 0xf6, 0xea, 0xbc, 0x0b, 0xce, 0x84, 0x3d, 0x3a, 0x7a, 0x83, + 0x3d, 0x95, 0x99, 0xff, 0x3c, 0x26, 0xc1, 0xae, 0xbc, 0x4c, 0x73, 0xab, 0x3c, + 0x10, 0x47, 0x5f, 0xbd, 0x6c, 0x99, 0xab, 0x3c, 0x40, 0x91, 0xee, 0x3a, 0x30, + 0xe9, 0x43, 0xbd, 0xd8, 0xdf, 0xed, 0x3c, 0x93, 0xd4, 0x98, 0xbc, 0x05, 0xf8, + 0x8c, 0x3d, 0x8d, 0x54, 0x89, 0xbd, 0x29, 0x6a, 0x5a, 0xbd, 0x54, 0x2f, 0x2d, + 0xbd, 0x11, 0x76, 0x90, 0xbd, 0x62, 0x24, 0xdf, 0x3c, 0x1f, 0x0c, 0x92, 0xbd, + 0x87, 0xb7, 0x06, 0xbd, 0x28, 0x1b, 0x92, 0xbd, 0x41, 0xb6, 0x19, 0xbd, 0x90, + 0xa9, 0xc8, 0xbc, 0x10, 0x06, 0xa2, 0x3c, 0x9b, 0x59, 0x72, 0x3d, 0x9f, 0x9b, + 0xc4, 0x3c, 0xc2, 0x44, 0xb9, 0xbb, 0xe4, 0x46, 0x90, 0x3d, 0xe9, 0x54, 0x40, + 0xbd, 0x18, 0xdd, 0xc8, 0xbc, 0xff, 0x78, 0x44, 0xbd, 0x6e, 0xaa, 0x92, 0xbc, + 0x76, 0xaa, 0x31, 0x3c, 0x37, 0x94, 0xe8, 0xbc, 0x2b, 0x84, 0xf6, 0x3c, 0xce, + 0x29, 0x8f, 0xbc, 0x37, 0xdc, 0xaf, 0x3c, 0x40, 0x76, 0xbd, 0x3c, 0xd6, 0x49, + 0x50, 0x3d, 0x48, 0x72, 0x36, 0xbd, 0xc7, 0x51, 0x63, 0xbd, 0x04, 0x47, 0x70, + 0xbc, 0x02, 0x99, 0x7c, 0xbc, 0x83, 0xb4, 0x44, 0xbd, 0x1d, 0x3b, 0x83, 0xbd, + 0x55, 0xe3, 0x41, 0x3d, 0x2c, 0x05, 0xcf, 0x3a, 0x52, 0x65, 0x2f, 0x3d, 0x8e, + 0x0d, 0x2d, 0x3d, 0x59, 0x13, 0x43, 0xbd, 0xe6, 0x6e, 0xf3, 0x3c, 0xc3, 0xfc, + 0xac, 0x3c, 0x82, 0x9e, 0x5f, 0xbc, 0x07, 0xd9, 0x6f, 0xbd, 0xf0, 0xf1, 0x9d, + 0x3b, 0x09, 0xcd, 0x07, 0xbd, 0x99, 0xc1, 0x87, 0x3d, 0xfa, 0xef, 0x73, 0x3d, + 0xe5, 0x18, 0xfc, 0x3c, 0xbc, 0x08, 0x06, 0x3d, 0x5e, 0x91, 0x90, 0xbd, 0x9c, + 0x69, 0xf7, 0x3b, 0x71, 0x14, 0xef, 0xbc, 0x90, 0x77, 0xf9, 0x3c, 0x4c, 0x17, + 0x6e, 0xbd, 0x59, 0x66, 0xe5, 0xbb, 0x6d, 0x0b, 
0x5f, 0xbc, 0x8a, 0xde, 0x57, + 0x3d, 0xdf, 0x37, 0x84, 0xbd, 0x6a, 0x62, 0x7b, 0x3d, 0x19, 0x4c, 0xc5, 0xbc, + 0xf0, 0x81, 0x2b, 0x3d, 0x0c, 0xe8, 0x3f, 0xbd, 0x2c, 0xac, 0x36, 0xbd, 0x2a, + 0x6a, 0x2e, 0x3d, 0x90, 0xcc, 0x94, 0xbb, 0x07, 0xfd, 0x28, 0xbd, 0x5e, 0x9f, + 0xb7, 0x3b, 0xcc, 0xf7, 0x83, 0xbd, 0x2e, 0x4f, 0xa0, 0xbc, 0x06, 0x60, 0xcc, + 0x3c, 0xc6, 0xbf, 0x5d, 0x3c, 0x48, 0x40, 0x6b, 0xbd, 0x69, 0x48, 0x03, 0x3d, + 0x75, 0x47, 0x48, 0x3d, 0xc4, 0x2f, 0x0f, 0x3d, 0x2d, 0xa5, 0x6e, 0xbd, 0x5a, + 0x05, 0x41, 0xbd, 0x7c, 0x10, 0xff, 0x3c, 0x2c, 0x2e, 0x78, 0xbd, 0x16, 0x4f, + 0x7d, 0x3d, 0xcf, 0x20, 0x5f, 0x3d, 0xd7, 0x5c, 0x87, 0xbd, 0x96, 0x63, 0x1e, + 0xbc, 0x2b, 0xf3, 0x8c, 0xbc, 0x6e, 0x52, 0x00, 0xbd, 0xb0, 0xb0, 0x47, 0x3d, + 0x6e, 0x8c, 0xa2, 0xbc, 0x26, 0xa4, 0xbd, 0x3c, 0x50, 0xfb, 0xc4, 0xbc, 0x16, + 0xc5, 0xe2, 0x3c, 0x34, 0xbe, 0xba, 0xbc, 0x58, 0x77, 0x06, 0xbc, 0xb6, 0x0f, + 0x02, 0x3d, 0x00, 0xc0, 0x67, 0xbd, 0x19, 0x7b, 0x0f, 0xbd, 0xdf, 0xca, 0x42, + 0xbd, 0x28, 0x6b, 0x5d, 0xbd, 0xe8, 0x7b, 0x0b, 0x3d, 0x0f, 0xd3, 0x9b, 0xbc, + 0x0e, 0x94, 0x3c, 0x3d, 0x56, 0xcd, 0x32, 0xbd, 0x39, 0x73, 0x82, 0xbd, 0x32, + 0x4b, 0x06, 0xbd, 0x77, 0xbe, 0x35, 0xbd, 0x4f, 0x03, 0x0b, 0x3d, 0x40, 0x14, + 0x8b, 0x3d, 0xe0, 0x32, 0x60, 0xbd, 0x4f, 0xd0, 0x85, 0x3d, 0x0f, 0xfc, 0x74, + 0xbc, 0xa1, 0xfc, 0xfa, 0xbb, 0x83, 0x11, 0x49, 0x3b, 0x48, 0x21, 0x1b, 0xbc, + 0x4d, 0x36, 0xe6, 0xbc, 0x27, 0x47, 0x6c, 0xbc, 0x6f, 0x04, 0x37, 0xbd, 0xc6, + 0x57, 0x6a, 0x3d, 0xa0, 0x16, 0x4d, 0x3b, 0x1a, 0xeb, 0x55, 0x3d, 0x6e, 0x5f, + 0x2d, 0xbd, 0xde, 0xff, 0x65, 0xbd, 0x68, 0x46, 0x49, 0x3c, 0x3c, 0x27, 0x3c, + 0xbd, 0xfd, 0xdc, 0x0e, 0xbd, 0xb9, 0xff, 0x24, 0xbd, 0xf0, 0x8f, 0x5c, 0xbd, + 0xa8, 0x9d, 0x32, 0x3d, 0x5c, 0x6d, 0x4d, 0xbd, 0x0d, 0xc2, 0x47, 0x3d, 0xf5, + 0xe0, 0x8b, 0x3c, 0x4e, 0xd4, 0xfb, 0xbc, 0x2f, 0xef, 0x7d, 0x3d, 0x0d, 0xbf, + 0x03, 0x3d, 0x54, 0x6e, 0x16, 0x3d, 0x51, 0x8b, 0x85, 0xbd, 0xac, 0x6b, 0x19, + 0xbb, 0x2e, 0x99, 0x9e, 0x3c, 0xd9, 0xa5, 0x35, 0x3d, 0x90, 0x56, 0x59, 0x3d, + 0xda, 0xee, 0x7c, 0x3d, 0x63, 0x87, 0x1b, 0xbb, 0x12, 0x90, 0x39, 0xbd, 0x4b, + 0xb8, 0x39, 0x3d, 0x3f, 0x49, 0x94, 0xbc, 0xeb, 0x8f, 0x80, 0x3d, 0x8a, 0x9f, + 0x81, 0xbd, 0xdb, 0x11, 0x0c, 0x3d, 0x13, 0x28, 0x29, 0x3d, 0x70, 0x84, 0xfc, + 0xbc, 0x48, 0x74, 0x10, 0x3c, 0xcc, 0xb3, 0x30, 0xbd, 0x48, 0x07, 0x16, 0x3c, + 0x5d, 0x4f, 0x19, 0xbd, 0x2b, 0x80, 0xf7, 0xbb, 0x16, 0x87, 0x08, 0xbd, 0x07, + 0x00, 0x88, 0x3d, 0x12, 0x69, 0x44, 0x3d, 0x18, 0x31, 0x0d, 0x3c, 0x57, 0xd3, + 0x06, 0x3d, 0x24, 0x3d, 0x07, 0x3d, 0xcc, 0x07, 0x7f, 0x3d, 0xab, 0x2a, 0x79, + 0xbd, 0x7e, 0x3c, 0x79, 0xbd, 0xa9, 0x22, 0xfb, 0xbc, 0x3d, 0xa3, 0x3f, 0x3d, + 0x9b, 0x63, 0x40, 0x3c, 0x8f, 0xd5, 0x9b, 0x3c, 0x38, 0x24, 0x2b, 0x3d, 0x73, + 0x53, 0x02, 0x3d, 0xf4, 0xe3, 0xfb, 0x3c, 0xab, 0x4b, 0x81, 0x3d, 0x6c, 0x44, + 0x17, 0x3d, 0xe9, 0xbe, 0x8e, 0x3d, 0x79, 0xc1, 0x23, 0x3c, 0x19, 0xfd, 0x91, + 0x3c, 0xf9, 0xea, 0x83, 0x3c, 0x5a, 0xee, 0x86, 0x3c, 0xa7, 0x51, 0x2f, 0xbd, + 0x4a, 0xa1, 0x43, 0x3d, 0xf7, 0xc3, 0xdd, 0x3b, 0x41, 0x5d, 0x48, 0xbd, 0x91, + 0x94, 0x92, 0xbd, 0x76, 0xb0, 0x87, 0x3d, 0xad, 0x39, 0x8e, 0x3d, 0xa0, 0x5a, + 0xc3, 0xbb, 0x13, 0xd2, 0x42, 0xbd, 0x93, 0x32, 0x41, 0xbc, 0x02, 0x56, 0x91, + 0xbd, 0x6e, 0x37, 0x12, 0xbd, 0x70, 0x73, 0xe7, 0x3b, 0x85, 0xd7, 0x78, 0x3b, + 0xb0, 0xfb, 0x3f, 0xbd, 0x44, 0xb8, 0x2e, 0xbd, 0xcd, 0x1c, 0x92, 0xbd, 0x78, + 0xee, 0xe1, 0xbc, 0xb4, 0x56, 0x52, 0xbd, 0xa6, 0xbd, 0x62, 0x3d, 0xdc, 0x38, + 0xe8, 0xbc, 0x30, 0xaf, 0x68, 0x3c, 0xe0, 0x72, 0x05, 0xbc, 0x06, 0xad, 0xd5, + 
0x3b, 0xd9, 0x62, 0x23, 0x3d, 0xf8, 0xa2, 0xee, 0xbc, 0x44, 0x13, 0x07, 0x3d, + 0x04, 0xcc, 0xf2, 0x3a, 0xce, 0x3f, 0x2c, 0x3d, 0x25, 0x8b, 0x28, 0x3c, 0x55, + 0xd2, 0x7a, 0xbc, 0x19, 0x6f, 0x83, 0x3d, 0x62, 0xaa, 0x32, 0xbd, 0xf2, 0x19, + 0x1c, 0xbc, 0x54, 0xc3, 0x8b, 0xbd, 0xdd, 0xeb, 0x52, 0x3c, 0x2a, 0xc7, 0x7c, + 0x3d, 0x04, 0xf0, 0xb9, 0x3b, 0xe8, 0x91, 0x84, 0x3d, 0x8d, 0xa2, 0xa3, 0x3c, + 0x01, 0xde, 0x7d, 0xbd, 0x14, 0xf3, 0x25, 0xbd, 0xde, 0x87, 0x8e, 0xbd, 0x6b, + 0x3b, 0x85, 0x3d, 0x02, 0x85, 0x84, 0xbd, 0x6b, 0x77, 0x6d, 0xbc, 0xb6, 0x9a, + 0x53, 0x3d, 0x0f, 0xb3, 0xaa, 0xbb, 0x13, 0x69, 0x55, 0xbd, 0x65, 0x98, 0x57, + 0xbd, 0xef, 0x9c, 0xb2, 0xbc, 0xd2, 0x02, 0xd4, 0x3c, 0x8e, 0xca, 0x27, 0x3d, + 0x64, 0xc8, 0x42, 0xbd, 0xca, 0x34, 0x39, 0xbd, 0xec, 0x45, 0x78, 0xbc, 0xe3, + 0xe3, 0x15, 0xbd, 0xad, 0x80, 0x30, 0x3d, 0xa3, 0xc8, 0x12, 0xbd, 0x11, 0x8e, + 0x40, 0x3d, 0x9a, 0x5f, 0x29, 0xbc, 0xbe, 0xc0, 0x8e, 0xbd, 0x2e, 0x01, 0x05, + 0xba, 0xde, 0x16, 0x2d, 0x3d, 0xce, 0xc7, 0x68, 0x3d, 0x08, 0x78, 0x4b, 0x3d, + 0xb9, 0xc7, 0x8f, 0xbd, 0x99, 0x7d, 0x71, 0x3d, 0x20, 0x52, 0x85, 0x3b, 0x8e, + 0x86, 0xcc, 0xbc, 0x18, 0x1e, 0x1e, 0x3d, 0x06, 0x84, 0x35, 0x3d, 0xd8, 0x65, + 0x71, 0xbd, 0xb1, 0x95, 0x1e, 0x3d, 0xa8, 0x12, 0x4f, 0x3d, 0xf0, 0x82, 0x6b, + 0x3c, 0x82, 0x05, 0x05, 0xbd, 0x78, 0x40, 0xef, 0x3c, 0xea, 0xf1, 0x91, 0xbd, + 0x06, 0x99, 0x82, 0x3d, 0x65, 0x80, 0x81, 0xbc, 0xc7, 0xd2, 0x98, 0xbc, 0x1b, + 0xab, 0x8c, 0x3b, 0x8d, 0xe6, 0xa2, 0x3c, 0x5a, 0xb0, 0xe8, 0xbc, 0x74, 0x5c, + 0x65, 0x3c, 0x53, 0x81, 0x88, 0x3d, 0x77, 0xe4, 0x83, 0xbd, 0x05, 0x68, 0x3f, + 0xbd, 0x7f, 0xa0, 0x34, 0xbd, 0x23, 0xc6, 0x57, 0xbd, 0xe8, 0x03, 0x4c, 0xbd, + 0xef, 0x5a, 0x91, 0x3c, 0x85, 0x78, 0x46, 0xbd, 0xc3, 0x5f, 0x2e, 0xbd, 0x38, + 0x74, 0x09, 0x3d, 0x71, 0x8d, 0x2a, 0xbd, 0x7c, 0xb3, 0x40, 0x3d, 0x26, 0xf6, + 0x72, 0xbd, 0x84, 0xfa, 0x4f, 0xbd, 0x34, 0x53, 0xa7, 0x3c, 0x2c, 0x63, 0x6f, + 0x3d, 0xe4, 0xa4, 0x29, 0xbd, 0x00, 0x17, 0x21, 0xbb, 0x82, 0x9e, 0x6f, 0x3d, + 0x8a, 0x61, 0x8d, 0xbd, 0xc4, 0xd7, 0x45, 0x3d, 0x20, 0x1a, 0xce, 0x3c, 0x86, + 0x39, 0x27, 0xbd, 0xf1, 0x45, 0x1f, 0xbd, 0xe0, 0x3e, 0xd4, 0x3c, 0x8a, 0x80, + 0x70, 0xbc, 0x80, 0xae, 0xd4, 0x3c, 0x04, 0x93, 0x0a, 0x3d, 0xff, 0x3c, 0x78, + 0x3d, 0x31, 0x0e, 0x48, 0x3c, 0x20, 0xa8, 0x89, 0xbd, 0x98, 0x75, 0x07, 0xbc, + 0x68, 0xa1, 0x71, 0x3d, 0xe0, 0xe8, 0x8e, 0xbc, 0xe9, 0x29, 0x19, 0x3d, 0x79, + 0x7c, 0x4f, 0xbc, 0x90, 0x98, 0xd5, 0x3c, 0x3b, 0xec, 0x1c, 0xbd, 0x36, 0x46, + 0x84, 0xb9, 0x18, 0x09, 0x8a, 0xbc, 0x84, 0xce, 0x0d, 0xbc, 0xb8, 0x2c, 0xa8, + 0x3c, 0x20, 0x84, 0x18, 0xbc, 0xa0, 0x54, 0x72, 0xbd, 0x5f, 0xd9, 0x82, 0xbd, + 0xe7, 0x32, 0x69, 0xbc, 0x58, 0xf3, 0x30, 0xbc, 0x12, 0xff, 0x89, 0x3b, 0x38, + 0xb3, 0x50, 0x3c, 0x5c, 0xf7, 0x48, 0x3c, 0x40, 0xb3, 0xb9, 0x3c, 0x08, 0x01, + 0x2b, 0x3d, 0xcb, 0x34, 0xc0, 0xbc, 0x9c, 0x64, 0x51, 0xbd, 0x58, 0x1a, 0x2f, + 0xbd, 0x4a, 0x45, 0x8a, 0xbc, 0x6a, 0x88, 0xe3, 0x3b, 0xf2, 0xe0, 0x74, 0x3d, + 0x08, 0xa7, 0x2d, 0xbd, 0x73, 0x61, 0x17, 0xbd, 0xf0, 0xee, 0xce, 0xbc, 0xda, + 0xbc, 0x20, 0xbd, 0x57, 0x27, 0xc6, 0x3c, 0x3c, 0xfc, 0xb2, 0x3d, 0xf9, 0x52, + 0x72, 0x3d, 0x98, 0x21, 0x23, 0x3a, 0x64, 0x0e, 0x39, 0xbd, 0x3c, 0x50, 0xff, + 0xbd, 0xf0, 0xb9, 0x36, 0xbd, 0xff, 0xe2, 0xa3, 0x3d, 0x1c, 0xad, 0x24, 0xbd, + 0x17, 0x26, 0x4b, 0x3d, 0x32, 0xdb, 0xca, 0x3b, 0xc6, 0x04, 0x3c, 0x3d, 0x3c, + 0x98, 0x9c, 0x3d, 0xd7, 0xd3, 0x80, 0xbc, 0x30, 0x4e, 0xd9, 0x3c, 0xff, 0xc1, + 0x21, 0x3d, 0x66, 0xcc, 0xa5, 0xbc, 0x61, 0x87, 0x98, 0x3d, 0x98, 0x20, 0x32, + 0x3d, 0xec, 0xf1, 0x87, 0xbd, 
0x40, 0x73, 0xb9, 0xbd, 0xed, 0x67, 0x98, 0x3d, + 0x82, 0xde, 0x83, 0x3c, 0xef, 0xb3, 0xe9, 0x3c, 0xf6, 0xd1, 0x2f, 0x3d, 0xb6, + 0xa2, 0x6c, 0xbd, 0xfa, 0x55, 0x87, 0xbd, 0x5e, 0x0d, 0x4b, 0xbd, 0x52, 0x83, + 0x1b, 0x3d, 0x38, 0xa3, 0x32, 0xbd, 0x68, 0xa3, 0xd0, 0x3c, 0x6b, 0x9b, 0x0e, + 0xbd, 0xe8, 0x58, 0x83, 0x3b, 0xac, 0xf2, 0x1d, 0x3d, 0xdc, 0x01, 0xfe, 0xbb, + 0x45, 0xd1, 0x37, 0x3d, 0x7d, 0x74, 0x10, 0x3d, 0x39, 0x6f, 0x42, 0xbd, 0x1f, + 0x11, 0xd3, 0xbc, 0x58, 0x36, 0x98, 0x3d, 0xe6, 0x99, 0x19, 0xbd, 0x2e, 0x3f, + 0x44, 0x3c, 0x04, 0xd0, 0x08, 0xbd, 0x9e, 0x8c, 0x74, 0xbc, 0x73, 0x43, 0xeb, + 0xbc, 0xa2, 0x01, 0x9b, 0xbd, 0x30, 0x8a, 0x29, 0xbd, 0x4d, 0xe1, 0x50, 0xbd, + 0xc8, 0x2a, 0x1d, 0x3d, 0x2d, 0x12, 0x7d, 0x3d, 0xdd, 0x75, 0x24, 0xbc, 0xd7, + 0x2b, 0x48, 0x3c, 0x84, 0x77, 0xf0, 0x3c, 0xf8, 0x69, 0x8a, 0x3d, 0x0d, 0x62, + 0x23, 0x3d, 0x8d, 0x2a, 0x65, 0x3d, 0x33, 0xc6, 0xce, 0x3b, 0x34, 0xb9, 0x97, + 0x3b, 0xf3, 0x86, 0xe2, 0xbb, 0x5d, 0x2a, 0x53, 0xbd, 0xea, 0x2b, 0x9a, 0xba, + 0xbf, 0xd8, 0x91, 0xbc, 0x3d, 0x5f, 0xfa, 0xbc, 0x04, 0x71, 0x82, 0x3d, 0x02, + 0x09, 0xbe, 0x3d, 0xa2, 0xb3, 0xad, 0x3c, 0x6c, 0x47, 0x28, 0xbd, 0xce, 0xd6, + 0x16, 0xbd, 0x95, 0x44, 0xff, 0x3c, 0x6c, 0x62, 0x82, 0x3d, 0x2a, 0x15, 0xba, + 0xbc, 0xc1, 0xa7, 0x83, 0xbb, 0x69, 0x42, 0x7c, 0xbd, 0x03, 0x6e, 0x01, 0x3d, + 0xd9, 0x8c, 0x1b, 0xbd, 0xc7, 0x85, 0xdc, 0x3c, 0x76, 0x04, 0x4d, 0x3d, 0x99, + 0x3b, 0x69, 0x3c, 0xee, 0x8a, 0x6f, 0x3d, 0x2c, 0xb5, 0x34, 0xbd, 0x95, 0xc2, + 0x32, 0xbd, 0x34, 0x5b, 0x8a, 0x3c, 0x0d, 0x52, 0x44, 0xbb, 0xe8, 0xfd, 0xe3, + 0xbc, 0x6c, 0x8f, 0x6c, 0x3d, 0x22, 0xe9, 0xce, 0xbc, 0x38, 0x1d, 0xa4, 0x3d, + 0x37, 0xb9, 0xcc, 0xbb, 0x58, 0x8e, 0xbb, 0xbc, 0x13, 0x85, 0x8d, 0x3d, 0x7b, + 0x10, 0x9d, 0xbd, 0xb0, 0x74, 0x20, 0xbd, 0xbf, 0x6b, 0x24, 0xbc, 0x0b, 0xb2, + 0x6f, 0xbd, 0xbe, 0x9c, 0xae, 0x3d, 0x64, 0xfc, 0x34, 0x3d, 0x84, 0x44, 0x59, + 0x3b, 0xc5, 0x97, 0xb6, 0xbc, 0x25, 0x1b, 0x42, 0xbd, 0x1c, 0x64, 0x59, 0x3d, + 0x00, 0x12, 0x82, 0x3d, 0x64, 0xac, 0x91, 0x3b, 0x3b, 0xae, 0x6b, 0xbd, 0x18, + 0x6c, 0xd0, 0x3d, 0x9e, 0xea, 0x60, 0x3d, 0xf3, 0xf6, 0x49, 0xbd, 0xd3, 0xfc, + 0x5b, 0xbc, 0xe5, 0x37, 0x64, 0x3c, 0xbe, 0x33, 0x9c, 0xbc, 0x0e, 0x7a, 0x70, + 0xbd, 0xf7, 0x19, 0x32, 0xbd, 0x7a, 0x54, 0xac, 0xbd, 0x94, 0x9a, 0x45, 0xbc, + 0xb6, 0xa0, 0x55, 0x3d, 0x72, 0x8b, 0x81, 0x3d, 0xec, 0xf7, 0x1d, 0x3c, 0x7c, + 0xc0, 0x65, 0xbd, 0x21, 0x3d, 0xa8, 0x3d, 0xfe, 0x98, 0x91, 0xbc, 0xfc, 0x4e, + 0x99, 0xbd, 0xd5, 0x77, 0xa0, 0xbd, 0x9a, 0xec, 0x0b, 0x3d, 0xc2, 0xc5, 0x2e, + 0xbd, 0x58, 0x39, 0x9b, 0x3d, 0x1a, 0x19, 0x4e, 0xbd, 0x32, 0x1e, 0x11, 0xbd, + 0xe2, 0x81, 0x2f, 0xbd, 0x72, 0x93, 0x82, 0x3d, 0xb5, 0x33, 0x96, 0x3d, 0xfd, + 0x32, 0x31, 0xbd, 0xf0, 0x5e, 0x7b, 0xbd, 0x37, 0x76, 0x4d, 0xbd, 0x5e, 0xa1, + 0x9a, 0x3d, 0x58, 0xb2, 0x89, 0xbd, 0xc0, 0x61, 0x93, 0x3a, 0x12, 0xf4, 0x7a, + 0x3d, 0xad, 0xe5, 0x32, 0xba, 0xf3, 0xfe, 0x75, 0x3d, 0xbd, 0xec, 0x57, 0xbd, + 0x4d, 0x5b, 0x09, 0x3d, 0x27, 0x1d, 0x1b, 0xbd, 0x26, 0x5e, 0x77, 0xbc, 0x33, + 0xd7, 0x30, 0xbd, 0x93, 0xde, 0x6d, 0xbd, 0xfe, 0xdd, 0x6f, 0x3d, 0x07, 0x21, + 0xad, 0x3d, 0xb6, 0xfb, 0x77, 0x3d, 0xc7, 0xd4, 0x12, 0x3d, 0xee, 0xd1, 0x1a, + 0x3b, 0x57, 0x6a, 0xdf, 0xbc, 0x9a, 0x69, 0x98, 0xbd, 0x18, 0xb5, 0x8b, 0xbd, + 0x3f, 0x2a, 0x1b, 0xbc, 0xba, 0x61, 0x4e, 0x3d, 0xf7, 0xfc, 0x15, 0x3d, 0x15, + 0x6a, 0x89, 0x3d, 0x0c, 0x26, 0x12, 0xbd, 0x3c, 0x56, 0x75, 0x3d, 0x31, 0x95, + 0x49, 0x3c, 0x80, 0x89, 0x27, 0xbd, 0xc5, 0xc8, 0x2d, 0xba, 0xd4, 0xb2, 0x99, + 0x3d, 0xbd, 0xfe, 0x19, 0xbd, 0x88, 0x62, 0x88, 0x3d, 0x1a, 
0xea, 0xb6, 0x3d, + 0x06, 0xc5, 0x95, 0xbd, 0xbe, 0x0c, 0x2d, 0xbd, 0x09, 0x1b, 0x59, 0x3d, 0xf7, + 0xd4, 0xbe, 0xba, 0x23, 0x7e, 0x0d, 0xbd, 0x3f, 0x6a, 0x9f, 0x3c, 0x29, 0x6c, + 0x86, 0x3c, 0x50, 0x53, 0xad, 0xbc, 0x4d, 0x7e, 0xd5, 0xbd, 0xd2, 0xac, 0x6b, + 0x3d, 0xfd, 0xc0, 0x8d, 0xbd, 0x96, 0xc2, 0x3f, 0x3d, 0xc7, 0x50, 0x9d, 0xbc, + 0xf8, 0x74, 0xa7, 0xbc, 0x20, 0xcb, 0xbe, 0xbd, 0x39, 0xaa, 0x5d, 0x3d, 0x53, + 0x49, 0x99, 0xbc, 0xfe, 0x92, 0xca, 0xbd, 0xf2, 0x46, 0x75, 0xbd, 0x71, 0xfe, + 0x6e, 0xbd, 0x9f, 0x2f, 0x59, 0xbd, 0x0b, 0xe7, 0x3f, 0xbc, 0xad, 0x3f, 0x80, + 0x3d, 0xec, 0x4d, 0x81, 0xbd, 0x53, 0x8f, 0x8a, 0x3d, 0xfb, 0x2c, 0x54, 0x3d, + 0x20, 0x2c, 0x57, 0xbd, 0xc1, 0xeb, 0xe2, 0xba, 0x98, 0xed, 0x46, 0x3d, 0x6a, + 0x20, 0xc1, 0x3c, 0x54, 0x95, 0x2c, 0xbd, 0xac, 0xc1, 0x2b, 0x3c, 0x29, 0x2a, + 0xf8, 0xbd, 0x4e, 0x69, 0x7f, 0x3d, 0x17, 0x04, 0x29, 0xbd, 0xf2, 0xbb, 0xeb, + 0xbb, 0xf1, 0x49, 0x40, 0x3d, 0x00, 0x69, 0x01, 0x3d, 0x8d, 0x53, 0x64, 0x3d, + 0xb7, 0x21, 0x0b, 0xbd, 0x43, 0xc5, 0xc7, 0xbd, 0x1b, 0xa3, 0x48, 0x3d, 0xcb, + 0x7c, 0x09, 0xbd, 0x20, 0xcb, 0x6e, 0xbb, 0x94, 0x3f, 0x2e, 0x3d, 0xf7, 0x32, + 0x72, 0xbd, 0x9a, 0x1e, 0x40, 0xbd, 0x5b, 0xf3, 0x47, 0x3d, 0x02, 0xea, 0x77, + 0xba, 0x63, 0xf3, 0xe8, 0x3c, 0xac, 0x35, 0x06, 0xbd, 0xbd, 0x03, 0x4c, 0xbd, + 0x11, 0xf6, 0x92, 0x3d, 0x1b, 0x1a, 0x64, 0x3d, 0x51, 0x88, 0x58, 0xbc, 0x61, + 0xbf, 0x83, 0xbd, 0xdd, 0x44, 0x73, 0xbd, 0xe7, 0xe5, 0xd0, 0x3c, 0xc9, 0x5f, + 0x87, 0x3d, 0xec, 0x20, 0xbe, 0x3d, 0xd9, 0x21, 0x0f, 0x3d, 0xf9, 0xdd, 0xe7, + 0xbc, 0xf3, 0x32, 0x91, 0xbd, 0x71, 0xb6, 0x4a, 0x3d, 0x29, 0x35, 0x86, 0x3d, + 0xba, 0xf4, 0x40, 0xbd, 0x1c, 0x2b, 0x17, 0xbd, 0x70, 0xfb, 0x3c, 0xbd, 0xed, + 0x3e, 0xdf, 0xbc, 0x60, 0xf1, 0x3d, 0x3d, 0x53, 0x6e, 0x87, 0xbd, 0x0f, 0x52, + 0x3d, 0x3d, 0x58, 0xd1, 0x47, 0xbd, 0xab, 0x7f, 0xc3, 0x3c, 0x3d, 0x5d, 0xa8, + 0xbd, 0xe9, 0x7f, 0x11, 0xbd, 0x88, 0x93, 0x50, 0xbd, 0xf2, 0xd2, 0x0f, 0x3d, + 0x24, 0x59, 0x90, 0x3a, 0x99, 0x86, 0x8b, 0xbd, 0x27, 0x21, 0x5f, 0xbd, 0xf4, + 0xa1, 0x80, 0x3d, 0x0b, 0xbb, 0x89, 0x3c, 0xbc, 0xda, 0x79, 0x3d, 0xe8, 0x9b, + 0x56, 0xbc, 0x42, 0xca, 0xf1, 0x3c, 0x74, 0xe2, 0x86, 0x3c, 0xe4, 0x85, 0x0f, + 0x3d, 0x07, 0x57, 0x2e, 0x3d, 0x41, 0x24, 0x85, 0x3d, 0x48, 0x7e, 0x08, 0xbd, + 0x91, 0xa8, 0xdd, 0x3c, 0x8c, 0xe1, 0xb7, 0xbc, 0x04, 0xae, 0x2f, 0x3d, 0xe4, + 0x63, 0xa2, 0x3c, 0x6e, 0x28, 0x06, 0xbc, 0x8d, 0xd9, 0x67, 0xbd, 0x88, 0x14, + 0x43, 0x3d, 0xe5, 0x9a, 0xde, 0x3c, 0x45, 0x3e, 0x9d, 0x3d, 0x03, 0x22, 0xcb, + 0xbc, 0x71, 0x92, 0x7c, 0x3d, 0xf7, 0xc6, 0x0d, 0x3d, 0xfb, 0x47, 0xa4, 0x3d, + 0x45, 0x18, 0x91, 0xbd, 0xda, 0x0b, 0x79, 0xbc, 0x18, 0x17, 0x71, 0xbd, 0xa2, + 0x74, 0x4e, 0xbd, 0xd7, 0xdb, 0x46, 0x3d, 0x35, 0x53, 0xbb, 0x3c, 0x0c, 0x62, + 0x0f, 0xbc, 0xe9, 0x2d, 0xdf, 0xbd, 0x33, 0xc7, 0x60, 0x3c, 0x18, 0x74, 0xa8, + 0x3c, 0xa3, 0x75, 0x87, 0xbd, 0x7b, 0x58, 0xf3, 0xbd, 0x30, 0xcd, 0xfa, 0x3c, + 0x35, 0xbd, 0x9c, 0xbd, 0x93, 0xcf, 0xdb, 0xbc, 0xc2, 0x35, 0xd9, 0xbc, 0x5e, + 0x5a, 0x06, 0x3d, 0x3d, 0x8b, 0x39, 0xbd, 0xb7, 0x5d, 0x33, 0xbc, 0x50, 0xca, + 0xb8, 0x3c, 0x8b, 0x71, 0xfb, 0x3c, 0x80, 0x8e, 0x2a, 0x3d, 0xa0, 0x72, 0x80, + 0xbc, 0x08, 0x4a, 0x00, 0xbd, 0x9b, 0x6f, 0xd2, 0x3b, 0xda, 0x83, 0xf9, 0xbc, + 0xed, 0x0c, 0x0b, 0x3c, 0x5d, 0x80, 0x40, 0xbc, 0x84, 0x40, 0x25, 0xbd, 0x52, + 0x1e, 0x03, 0x3d, 0x53, 0xd4, 0x54, 0x3c, 0x0b, 0x6b, 0xda, 0x3c, 0xcc, 0x67, + 0x17, 0x3b, 0x58, 0x05, 0xe5, 0xba, 0x63, 0x8d, 0x95, 0x3c, 0xc6, 0xa5, 0x5a, + 0x3d, 0xdf, 0x29, 0x23, 0xbd, 0x4b, 0x72, 0x9b, 0x3d, 0xef, 0x78, 0x4b, 0xbd, + 0xa5, 0x08, 
0xb7, 0xbd, 0x9c, 0xb5, 0x78, 0xbc, 0xdf, 0x0c, 0x88, 0x3d, 0x07, + 0xab, 0x19, 0x3d, 0xdc, 0xad, 0xc9, 0xbd, 0x5e, 0x37, 0x4f, 0x3d, 0xe6, 0x99, + 0x77, 0xbd, 0x12, 0x5f, 0x48, 0xbc, 0x89, 0x82, 0xf2, 0x3b, 0x86, 0x89, 0x44, + 0x3c, 0x66, 0x1b, 0xb7, 0xbc, 0x2f, 0x07, 0xd0, 0x3b, 0xb5, 0x85, 0x76, 0xb9, + 0xb2, 0xc4, 0x11, 0xbd, 0x5b, 0x02, 0x30, 0xbd, 0xed, 0xed, 0xee, 0x3c, 0x77, + 0xbd, 0x24, 0xbb, 0x36, 0xe9, 0x97, 0xbd, 0x2a, 0xe1, 0x6d, 0x3d, 0x75, 0x29, + 0xaf, 0x3d, 0xff, 0x38, 0xac, 0xbb, 0x76, 0x6d, 0xe4, 0xbc, 0xf8, 0x03, 0x15, + 0xbd, 0x6f, 0x3d, 0x9a, 0xbc, 0x6b, 0x64, 0x1f, 0x3d, 0xa6, 0x7c, 0x6f, 0xbd, + 0xa7, 0x60, 0x83, 0x3c, 0xe1, 0xa5, 0x53, 0xbd, 0x04, 0x4f, 0xb6, 0xbc, 0xe7, + 0x0b, 0x28, 0x3d, 0x4c, 0x15, 0xa9, 0xbc, 0x68, 0x90, 0x73, 0xbb, 0x77, 0x3e, + 0x8e, 0x3c, 0xdd, 0x42, 0x0c, 0xbd, 0x07, 0x7d, 0x22, 0xbd, 0x35, 0x15, 0x82, + 0xbd, 0xed, 0x56, 0xe0, 0x3c, 0xfa, 0x8d, 0x7e, 0x3d, 0xab, 0xb5, 0x85, 0xbd, + 0x8c, 0x4b, 0xa4, 0xbc, 0xe5, 0xee, 0x53, 0xbc, 0x9e, 0x26, 0x4f, 0xbd, 0xaa, + 0xdf, 0x63, 0xbd, 0xd2, 0x48, 0x11, 0x3c, 0xd6, 0x9c, 0x58, 0x3d, 0xa9, 0x90, + 0x00, 0x3d, 0x9b, 0xfa, 0x8c, 0x3b, 0x2a, 0x97, 0x1d, 0x3d, 0x37, 0xe9, 0x3e, + 0xbd, 0x51, 0xd8, 0xf0, 0xbd, 0x92, 0x65, 0x2b, 0xbd, 0x06, 0x73, 0x21, 0x3c, + 0x85, 0x89, 0xad, 0x3d, 0x50, 0x07, 0x60, 0x3d, 0x01, 0x61, 0x9a, 0x3d, 0xcf, + 0xba, 0x9c, 0x3d, 0x7c, 0x6f, 0x69, 0x3d, 0x20, 0x79, 0x71, 0xbd, 0xc8, 0x59, + 0xd1, 0xbc, 0x2f, 0x68, 0x1e, 0xbd, 0xb2, 0xed, 0x87, 0xbd, 0x3e, 0xe7, 0xa0, + 0xba, 0xb1, 0xf0, 0xd0, 0x3c, 0x1c, 0xf1, 0xdd, 0xbc, 0xb0, 0x4a, 0x83, 0xbb, + 0xb5, 0x00, 0x55, 0xbc, 0xc6, 0x63, 0x0b, 0x3d, 0xa8, 0x88, 0x2f, 0x3d, 0x3c, + 0x6e, 0xd7, 0x3c, 0x68, 0x1d, 0x14, 0xbc, 0xac, 0xd1, 0x37, 0x3d, 0x7f, 0xb7, + 0x66, 0x3d, 0xca, 0xd0, 0xc7, 0xbb, 0x72, 0x5a, 0x91, 0x3d, 0x64, 0x09, 0xaf, + 0x3c, 0xea, 0x7a, 0x0d, 0xbb, 0x87, 0xd8, 0x4f, 0xbb, 0x88, 0xdf, 0xa5, 0x3c, + 0x1a, 0xd5, 0x73, 0xbc, 0x55, 0x5b, 0xce, 0x3a, 0xff, 0x62, 0x16, 0x3d, 0xb9, + 0x06, 0xa8, 0xbd, 0xbc, 0x96, 0xc0, 0xbc, 0x77, 0x06, 0x17, 0xbc, 0xe9, 0xdf, + 0x7e, 0xba, 0x94, 0x5f, 0xcd, 0x3b, 0x7b, 0x66, 0xf2, 0xbc, 0xc3, 0xdf, 0x7d, + 0xbd, 0x9c, 0x07, 0x0e, 0xbd, 0xaa, 0x4e, 0x0a, 0xbd, 0x42, 0x2d, 0x7f, 0x3c, + 0x6f, 0x45, 0xb9, 0x3c, 0x6a, 0xf4, 0x2c, 0xbd, 0x66, 0x01, 0x23, 0xbd, 0x5a, + 0x2e, 0x12, 0xbc, 0x00, 0x0c, 0xc4, 0xbd, 0x56, 0xf3, 0xd9, 0xbc, 0x57, 0x20, + 0x14, 0xbd, 0x8f, 0xae, 0xbd, 0x3c, 0x0a, 0x85, 0xbb, 0xbd, 0x51, 0x63, 0x28, + 0xbd, 0xc3, 0x45, 0x19, 0xbd, 0x1a, 0xc0, 0x66, 0x3d, 0x58, 0xac, 0x77, 0xbd, + 0x2e, 0xb6, 0xdc, 0xbc, 0xaa, 0x45, 0xe6, 0xbc, 0x06, 0xba, 0x43, 0xbd, 0x71, + 0x36, 0xac, 0x3d, 0xf5, 0xcb, 0x96, 0x3d, 0x5b, 0x32, 0x58, 0xba, 0x6a, 0xe8, + 0xe0, 0xb9, 0x39, 0xb6, 0xbe, 0x3c, 0x56, 0xcc, 0xc5, 0x3b, 0x6b, 0xde, 0xad, + 0xbc, 0x6c, 0xd9, 0xf4, 0xbc, 0xb2, 0xe9, 0x43, 0x3d, 0xf9, 0xd2, 0x1b, 0xbc, + 0xb1, 0x0f, 0x19, 0x3d, 0xb3, 0xe0, 0x05, 0x3b, 0xdd, 0x85, 0xa8, 0x3d, 0x92, + 0x70, 0xc0, 0xbc, 0xaf, 0xa0, 0x22, 0xbd, 0x9f, 0x05, 0x33, 0xbd, 0x4a, 0xe4, + 0xa8, 0x3c, 0x80, 0xf3, 0xc9, 0xba, 0x9f, 0x4c, 0x31, 0xbd, 0x5e, 0x75, 0xa4, + 0xbc, 0x4e, 0xa3, 0x73, 0xbd, 0x32, 0x14, 0x96, 0xbd, 0xf1, 0xc8, 0xb1, 0x3c, + 0xa6, 0x72, 0x15, 0xbd, 0x06, 0xbc, 0x4c, 0x3d, 0xd6, 0x84, 0x96, 0x3b, 0xbd, + 0x95, 0x27, 0x3d, 0x89, 0x66, 0xd8, 0x3c, 0x14, 0xc8, 0xf8, 0xbc, 0x48, 0xc6, + 0x2a, 0x3d, 0x68, 0x7c, 0xa4, 0x3d, 0x0b, 0xfe, 0x48, 0x3d, 0x03, 0x4e, 0xa0, + 0x3c, 0x14, 0xeb, 0x9e, 0x3d, 0x54, 0x79, 0x17, 0xbd, 0x8d, 0xe5, 0x44, 0x3c, + 0x89, 0xb2, 0x14, 0xbc, 0x37, 0x64, 0x98, 
0x3d, 0xd5, 0x7d, 0x54, 0xbd, 0x82, + 0x97, 0x92, 0xbd, 0x97, 0x4c, 0x7c, 0x3b, 0xf8, 0x3f, 0x2b, 0x3d, 0xa2, 0x52, + 0xc8, 0x3c, 0x67, 0x7b, 0x49, 0xbd, 0x8b, 0xdc, 0x84, 0xbc, 0xfc, 0xd2, 0x1c, + 0xbd, 0x50, 0x53, 0x8d, 0xbb, 0xa7, 0x93, 0xfe, 0xbc, 0xab, 0xb3, 0xff, 0xbc, + 0xb0, 0x0d, 0x12, 0x3c, 0x90, 0xde, 0x69, 0x3d, 0x19, 0x4a, 0x31, 0x3d, 0xba, + 0x86, 0xbe, 0xbd, 0xf0, 0xd1, 0x6f, 0xbd, 0x2a, 0x37, 0xa2, 0x3c, 0xba, 0x72, + 0x91, 0xbc, 0x69, 0xfe, 0x8f, 0xbb, 0xb4, 0xe0, 0x26, 0x3d, 0x9e, 0x8e, 0x6f, + 0x3d, 0x28, 0x1c, 0xa4, 0xbc, 0xeb, 0x11, 0x0b, 0x3d, 0xd3, 0x1a, 0x27, 0x3c, + 0x89, 0x93, 0xa3, 0x3d, 0x22, 0xbf, 0x46, 0x3d, 0xe2, 0x27, 0xe5, 0xbc, 0xa1, + 0x10, 0x8a, 0xbc, 0xe9, 0x93, 0x65, 0xbd, 0xef, 0x81, 0xce, 0x3c, 0x0c, 0x10, + 0x44, 0x3c, 0xdc, 0x0d, 0x15, 0xbd, 0x8d, 0x3b, 0x09, 0x3d, 0xc2, 0xe2, 0x35, + 0xbd, 0xc3, 0xde, 0x09, 0x3c, 0x68, 0xc5, 0x8f, 0x3d, 0xa2, 0xb3, 0x38, 0x3d, + 0x94, 0xa6, 0x66, 0x3c, 0x5f, 0x15, 0x79, 0x3d, 0x74, 0x80, 0x7e, 0x3d, 0x00, + 0xb6, 0xb0, 0xbb, 0xdb, 0xb6, 0x98, 0xbb, 0x8c, 0x1a, 0xb7, 0xbc, 0xa0, 0xf9, + 0x7e, 0x3c, 0x66, 0x95, 0x47, 0x3d, 0xca, 0x33, 0xf0, 0xbc, 0xde, 0x00, 0xfa, + 0x3b, 0x57, 0x05, 0xfb, 0xbb, 0xfc, 0x7f, 0xcb, 0xbc, 0x31, 0x1c, 0x11, 0x3d, + 0x16, 0xe4, 0xfd, 0x3b, 0x3d, 0xd5, 0xb5, 0x3c, 0x8c, 0xd4, 0x69, 0xbd, 0x40, + 0x7f, 0x87, 0xbb, 0x26, 0x9d, 0x77, 0xbc, 0x6b, 0xa7, 0xde, 0x3c, 0xf4, 0xd2, + 0x00, 0x3c, 0xff, 0x0d, 0xbc, 0x3c, 0xab, 0xfb, 0x6f, 0x3d, 0x5a, 0x15, 0x8b, + 0x3b, 0x05, 0x27, 0x77, 0x3d, 0xd8, 0xa8, 0x54, 0x3d, 0xa7, 0xf2, 0x01, 0x3d, + 0x20, 0x41, 0x70, 0x3c, 0x19, 0x99, 0xfd, 0xbc, 0xc0, 0xea, 0x48, 0x3d, 0xd7, + 0x09, 0x26, 0x3b, 0x79, 0x58, 0x6b, 0x3d, 0x2b, 0x43, 0x2e, 0xbd, 0x58, 0x06, + 0x76, 0x3c, 0xc3, 0x4a, 0x8c, 0x3d, 0x4b, 0x5b, 0x62, 0x3d, 0xb2, 0xff, 0x1f, + 0xbd, 0xeb, 0x73, 0x08, 0x3d, 0x39, 0xd4, 0x77, 0xbd, 0xfc, 0x94, 0x83, 0xbc, + 0x0e, 0x0d, 0x6c, 0x3d, 0x5c, 0x29, 0x73, 0x3d, 0x96, 0xc4, 0x92, 0xba, 0x00, + 0x64, 0x97, 0xbd, 0x3b, 0x52, 0x3a, 0xbd, 0x3a, 0x2d, 0x91, 0xbd, 0x62, 0x65, + 0x97, 0xbd, 0x72, 0xde, 0xd2, 0xbd, 0x1d, 0x30, 0x00, 0xbd, 0x74, 0x93, 0x95, + 0xbd, 0xae, 0x2c, 0xd7, 0xbc, 0xe3, 0xae, 0x27, 0x3d, 0x67, 0x7f, 0x0b, 0x3c, + 0xfc, 0xcf, 0x74, 0xbc, 0x7f, 0x2b, 0x74, 0x3d, 0x00, 0x49, 0xa2, 0xba, 0x13, + 0xfa, 0x0e, 0xbd, 0x7e, 0xfe, 0x9f, 0xbc, 0xa6, 0x05, 0xc7, 0xbb, 0xc2, 0xa7, + 0x2a, 0xbc, 0xb3, 0x63, 0x9b, 0x3a, 0x9c, 0x14, 0x0e, 0x3d, 0x82, 0xc6, 0xb0, + 0xbc, 0xc1, 0x25, 0xc0, 0x3c, 0x03, 0x95, 0x45, 0xbd, 0x61, 0xb6, 0x50, 0xbd, + 0xf8, 0x77, 0xea, 0x3a, 0x9d, 0xa7, 0xaa, 0x3a, 0xf2, 0x18, 0x1d, 0xbd, 0x42, + 0x15, 0x94, 0x3d, 0x7e, 0x0e, 0x47, 0xbd, 0xa5, 0x82, 0x84, 0x3d, 0xed, 0xbe, + 0x3b, 0x3d, 0x3b, 0xdc, 0x2e, 0xbd, 0x5c, 0x8c, 0x4b, 0xbd, 0x37, 0xbc, 0x99, + 0xbb, 0xb7, 0x55, 0x54, 0x3d, 0x8e, 0x6d, 0xa8, 0xbd, 0x09, 0x3c, 0x3f, 0x3d, + 0x83, 0x0e, 0x3a, 0xbd, 0x8f, 0x1f, 0x91, 0x3d, 0x8b, 0x2b, 0x33, 0xbd, 0x92, + 0x57, 0x58, 0x3d, 0x71, 0xcd, 0x27, 0xbd, 0xcf, 0x53, 0x30, 0x3d, 0x20, 0x81, + 0x64, 0x3d, 0x50, 0x82, 0x60, 0xbd, 0x98, 0x46, 0x2f, 0x3d, 0x32, 0x95, 0x28, + 0xbd, 0x70, 0xf5, 0x71, 0x3c, 0x9d, 0x96, 0xb0, 0xbc, 0x5b, 0x59, 0x56, 0xbd, + 0x10, 0x59, 0x90, 0x3d, 0xc0, 0x1e, 0xbb, 0x3c, 0x5c, 0x37, 0x9d, 0x3d, 0xbd, + 0x75, 0x61, 0x3d, 0xcf, 0x8b, 0x84, 0xbc, 0xb2, 0x23, 0x46, 0x3d, 0x0a, 0x82, + 0x02, 0x3d, 0xaf, 0xd4, 0x8e, 0xbb, 0x60, 0x87, 0xca, 0x3c, 0xdb, 0x73, 0x1a, + 0xbd, 0x52, 0xa2, 0x09, 0x3d, 0xa2, 0x5b, 0x4a, 0xbd, 0x1d, 0x5d, 0xa0, 0xbb, + 0x30, 0x20, 0x7e, 0xbd, 0x84, 0x2a, 0x78, 0xbd, 0x74, 0x5f, 0x6a, 0xbd, 
0xa5, + 0x1a, 0xa5, 0xbd, 0xa8, 0x46, 0x92, 0x3c, 0xe5, 0x7e, 0x50, 0xbd, 0xc1, 0x19, + 0x4b, 0x3c, 0x1a, 0x20, 0x71, 0x3d, 0xa1, 0xa7, 0x48, 0xbc, 0xc3, 0xa7, 0xeb, + 0x3c, 0xd4, 0x58, 0x6c, 0xbd, 0x06, 0x40, 0x08, 0x3d, 0x07, 0x97, 0x93, 0x3d, + 0x36, 0xb8, 0x5c, 0xbd, 0x69, 0x31, 0xc4, 0x3d, 0x5d, 0x20, 0x62, 0xbc, 0x73, + 0x3a, 0xbf, 0xbc, 0xea, 0xff, 0x3f, 0x3d, 0x39, 0x07, 0xec, 0x3c, 0xeb, 0x30, + 0xb4, 0xbb, 0x0b, 0x38, 0x72, 0xbd, 0x12, 0x71, 0xfd, 0xbc, 0xc5, 0x09, 0x82, + 0x3b, 0x5d, 0x51, 0x84, 0xbd, 0xff, 0x16, 0x49, 0xbd, 0x5e, 0xd1, 0x13, 0xbd, + 0xd8, 0xaf, 0x96, 0x3c, 0xea, 0x7c, 0x7e, 0xbd, 0x9b, 0x71, 0x1c, 0x3d, 0xe0, + 0xff, 0xaf, 0xbc, 0xac, 0x24, 0x57, 0x3d, 0x8a, 0xf8, 0x49, 0x3d, 0x24, 0xfd, + 0xbc, 0xbc, 0x46, 0x2c, 0xac, 0xbd, 0xc8, 0xdf, 0x63, 0xbc, 0x61, 0xc6, 0x2e, + 0xbd, 0x9d, 0xec, 0xd9, 0xbc, 0xb1, 0x44, 0x86, 0xbd, 0x85, 0x38, 0x47, 0x3d, + 0x7b, 0x49, 0x5a, 0xbd, 0xb0, 0x9c, 0xee, 0xbc, 0x03, 0x6f, 0x33, 0xbd, 0x55, + 0x8c, 0x23, 0xbc, 0xd5, 0xcc, 0x82, 0xbc, 0x82, 0xc2, 0xcc, 0xbc, 0xac, 0x00, + 0x85, 0x3c, 0xf6, 0xf5, 0x70, 0x3d, 0xb0, 0x0f, 0x03, 0x37, 0xa3, 0xfd, 0x5a, + 0xbd, 0x13, 0x57, 0x38, 0x3c, 0x25, 0xe4, 0xea, 0xbc, 0x1a, 0xb8, 0x0e, 0x3c, + 0x80, 0x95, 0x20, 0xbb, 0x84, 0x35, 0x36, 0x3d, 0x27, 0x0c, 0x1f, 0xbd, 0x4e, + 0x46, 0x8d, 0x3d, 0xa4, 0xb0, 0xef, 0x3c, 0xe1, 0xf5, 0xce, 0xbc, 0x34, 0x54, + 0x9d, 0xbc, 0x9f, 0x03, 0xd9, 0x3b, 0x22, 0xe9, 0xed, 0xbc, 0xd3, 0x7d, 0x30, + 0xbd, 0xb8, 0x86, 0x1f, 0xbc, 0xed, 0xc3, 0x44, 0x3d, 0xbf, 0x32, 0xa1, 0x39, + 0x74, 0xe5, 0x38, 0xbd, 0xa3, 0xe4, 0x6c, 0xbd, 0x56, 0x19, 0x33, 0xbd, 0x17, + 0x60, 0xbd, 0xbc, 0xd5, 0xec, 0x4a, 0x3c, 0xa2, 0x27, 0xa4, 0x3d, 0x50, 0xea, + 0x77, 0xbd, 0x5a, 0xb3, 0x91, 0x39, 0xf3, 0xc2, 0x19, 0x3d, 0xd2, 0xb9, 0x4f, + 0xbd, 0x60, 0x90, 0x81, 0x3d, 0xbf, 0x14, 0x60, 0xbd, 0x7a, 0xdd, 0x62, 0x3c, + 0x43, 0x4c, 0xa5, 0xbb, 0xad, 0x1c, 0xe1, 0xbc, 0xc8, 0x0b, 0x15, 0x3d, 0xe1, + 0xbd, 0x0f, 0x3d, 0xc6, 0x1f, 0x92, 0x3d, 0xdf, 0x9a, 0x86, 0xbd, 0x08, 0x1a, + 0xed, 0x3c, 0xfa, 0x1f, 0x00, 0x3c, 0x90, 0x94, 0x1b, 0x3d, 0x4a, 0x1c, 0x25, + 0xbd, 0x79, 0xe4, 0xff, 0xbc, 0xdf, 0xeb, 0x91, 0x3d, 0x43, 0x22, 0x81, 0x3d, + 0x1f, 0x1c, 0xa2, 0xbd, 0x54, 0xaf, 0x48, 0xbd, 0xbb, 0x7d, 0x4a, 0x3c, 0x32, + 0xcd, 0x6a, 0x3d, 0xc0, 0x75, 0x8b, 0x3d, 0x9a, 0xad, 0x67, 0x3c, 0xd1, 0xe6, + 0x30, 0xbd, 0x85, 0x2b, 0x33, 0x3c, 0xee, 0x90, 0x69, 0x3b, 0x7b, 0xdc, 0x96, + 0xbd, 0x38, 0x29, 0xad, 0x3b, 0xd8, 0x2b, 0xff, 0xbb, 0x72, 0x62, 0x57, 0x3c, + 0x55, 0x29, 0x86, 0x3d, 0xc7, 0x7c, 0x90, 0xbd, 0xfa, 0xa6, 0x71, 0xbd, 0x7f, + 0x51, 0x15, 0x3c, 0x7a, 0x11, 0x61, 0xbd, 0xd8, 0xd1, 0x64, 0x3b, 0xbc, 0x7e, + 0x8e, 0x3c, 0x06, 0x60, 0xe6, 0x3b, 0x1a, 0xd8, 0x43, 0x3d, 0x9b, 0xa8, 0x99, + 0xbd, 0x30, 0x98, 0x17, 0x3d, 0x82, 0xd8, 0x7a, 0xbd, 0xca, 0x23, 0x14, 0x3d, + 0x45, 0x6d, 0x18, 0xbd, 0x0d, 0x33, 0x8d, 0x3c, 0xd9, 0x88, 0xb5, 0xbc, 0x9c, + 0x01, 0xc6, 0x3b, 0xc2, 0x52, 0xe5, 0x3c, 0xc6, 0xbf, 0x5a, 0x3d, 0xa8, 0x06, + 0x1f, 0xbd, 0x1f, 0xaf, 0x4e, 0x3d, 0x84, 0x35, 0xca, 0xbd, 0x50, 0xc8, 0xee, + 0x3c, 0x64, 0xe8, 0x35, 0xbd, 0xbc, 0x23, 0x31, 0x3d, 0x36, 0x1d, 0xbf, 0xbd, + 0x7c, 0x88, 0x94, 0xbc, 0x0f, 0x8f, 0x1b, 0x3d, 0x08, 0x54, 0x81, 0x3c, 0x12, + 0x2f, 0x8a, 0xbd, 0xd7, 0x70, 0x3c, 0xbc, 0xb8, 0x2a, 0x50, 0x3d, 0xc8, 0xed, + 0x0e, 0xbd, 0xb7, 0xa3, 0x54, 0x3d, 0xc9, 0x64, 0x6c, 0xbc, 0x89, 0x83, 0x25, + 0xbd, 0xef, 0x72, 0x3b, 0x3b, 0xeb, 0xf8, 0xec, 0x3b, 0xe6, 0x5e, 0x0b, 0xbc, + 0xd4, 0xc0, 0xf5, 0xbc, 0x8a, 0x04, 0x92, 0x3d, 0xe8, 0x04, 0x39, 0xbd, 0x0f, + 0x74, 0xea, 0x3c, 0xfc, 
0x8b, 0x01, 0xbc, 0xb2, 0xe0, 0x73, 0x3d, 0xc8, 0xa1, + 0xea, 0x3c, 0x99, 0xfe, 0x4f, 0x3d, 0xde, 0x4f, 0x36, 0xbd, 0x73, 0xe5, 0x76, + 0xbd, 0x8b, 0xd2, 0xdb, 0x3b, 0x96, 0x72, 0x79, 0x3c, 0xd0, 0x9b, 0x14, 0x3d, + 0x3d, 0x6f, 0x6a, 0x3d, 0x21, 0x55, 0x16, 0x3d, 0xeb, 0x2a, 0x91, 0x3d, 0x8c, + 0xd0, 0x33, 0xbd, 0x45, 0xdd, 0x54, 0xbd, 0x7e, 0x94, 0x90, 0xbc, 0xd4, 0x4c, + 0x8b, 0x3c, 0x4a, 0x6b, 0x19, 0x3d, 0x9e, 0x42, 0xeb, 0x3c, 0x7d, 0xf2, 0x4f, + 0x3d, 0x17, 0x4f, 0xab, 0x3c, 0x28, 0x37, 0xa1, 0x3c, 0x6d, 0xb8, 0x88, 0xbd, + 0xc1, 0xe3, 0x1e, 0xbd, 0x8f, 0x8c, 0x60, 0x3d, 0xe9, 0x88, 0x93, 0x3c, 0x54, + 0x12, 0x8e, 0x3d, 0x04, 0x68, 0xcb, 0xbc, 0x6e, 0xbf, 0xb0, 0xb9, 0xba, 0x8b, + 0x16, 0x3d, 0x3a, 0x30, 0xd5, 0x39, 0x89, 0x43, 0x89, 0x3c, 0x89, 0x8c, 0xc0, + 0x3b, 0x93, 0x98, 0xd9, 0xbd, 0xc5, 0x26, 0x3e, 0xbd, 0x2a, 0x4f, 0xa9, 0xbb, + 0x35, 0xa6, 0xe6, 0xbc, 0xeb, 0x89, 0x1f, 0x3d, 0xea, 0x85, 0xb7, 0xbc, 0xa7, + 0x52, 0xbb, 0xbc, 0x02, 0xda, 0x86, 0x3d, 0x82, 0xad, 0xfd, 0xba, 0x01, 0x20, + 0x2f, 0xbd, 0xb8, 0x8c, 0x9d, 0xbd, 0x9c, 0xbd, 0x1b, 0x3d, 0x1d, 0xad, 0xe6, + 0x3c, 0xac, 0x48, 0x6b, 0x3c, 0xdd, 0x13, 0xcb, 0xbd, 0xee, 0xcd, 0x8a, 0xbd, + 0x8b, 0x33, 0x7c, 0x3d, 0xc5, 0x0a, 0x2a, 0x3d, 0x13, 0x49, 0x77, 0x3d, 0x7e, + 0x78, 0xd1, 0xbd, 0xd3, 0x18, 0x3c, 0x3c, 0xb7, 0xaa, 0xb1, 0xbc, 0x54, 0x3a, + 0xce, 0xbc, 0x86, 0x08, 0x97, 0xbd, 0x04, 0x21, 0x01, 0xbc, 0x72, 0xa8, 0x65, + 0x3d, 0x71, 0x0b, 0xf3, 0x3b, 0x14, 0x9e, 0x88, 0x3c, 0x9c, 0xc6, 0x90, 0x3d, + 0x1d, 0xdb, 0x37, 0xbd, 0x8e, 0x9e, 0x59, 0x3c, 0xf6, 0xa9, 0x1a, 0xbd, 0xfd, + 0xec, 0x19, 0x3d, 0xa3, 0x01, 0x5a, 0xbd, 0xcc, 0xe7, 0x15, 0xbd, 0x26, 0xe6, + 0x51, 0x3d, 0xeb, 0x5f, 0x8d, 0x3d, 0x93, 0x7a, 0x73, 0x3c, 0x94, 0x02, 0x10, + 0x3d, 0x5d, 0x7e, 0xa7, 0x3c, 0x52, 0x78, 0x12, 0xbd, 0xe2, 0xfb, 0x44, 0x3d, + 0xb8, 0xdf, 0xa4, 0x3c, 0x84, 0x3d, 0x0e, 0xbd, 0xad, 0xae, 0x0e, 0x3c, 0x52, + 0xda, 0x1e, 0x3d, 0xfe, 0x93, 0x92, 0xbd, 0xe8, 0xe3, 0xde, 0xbd, 0x7a, 0xdc, + 0xd9, 0xbc, 0xc3, 0xb0, 0x68, 0x3d, 0x58, 0x56, 0x25, 0xbd, 0x3a, 0x61, 0xdc, + 0xbc, 0x71, 0xa2, 0xbc, 0x3c, 0x1b, 0xab, 0x30, 0x3d, 0x2a, 0x68, 0xbd, 0xbb, + 0x5e, 0xaf, 0x8b, 0xbd, 0xb4, 0x4d, 0x30, 0x3d, 0xa0, 0x46, 0x72, 0x3d, 0x4e, + 0xd2, 0x10, 0x3d, 0x71, 0x47, 0x4e, 0xbd, 0xe5, 0xd4, 0xe6, 0xbc, 0x25, 0x05, + 0x87, 0x3c, 0x33, 0x85, 0xec, 0x3c, 0x84, 0x58, 0x5f, 0xbd, 0xb0, 0xfa, 0xc0, + 0xbd, 0xc0, 0xdb, 0x87, 0xba, 0xa0, 0x30, 0x13, 0x3d, 0x84, 0x01, 0xe2, 0xbc, + 0xee, 0x8d, 0xa1, 0x3c, 0xc8, 0x8c, 0x24, 0x3c, 0x2b, 0x33, 0xf0, 0x3c, 0xc5, + 0xdd, 0x55, 0x3c, 0x89, 0x7c, 0xa5, 0xbc, 0x3b, 0x39, 0x19, 0xbd, 0xed, 0x0d, + 0x74, 0x3d, 0x98, 0xdf, 0x24, 0xbc, 0xdd, 0xdc, 0x38, 0xbd, 0xab, 0x9f, 0x75, + 0x3b, 0xd7, 0x20, 0xf3, 0x3c, 0x96, 0xa3, 0x78, 0x3c, 0x58, 0x44, 0x90, 0xbd, + 0x21, 0xcb, 0xf2, 0x3b, 0x18, 0x22, 0x58, 0xbd, 0x7c, 0x1c, 0x1b, 0xbd, 0xdc, + 0x4d, 0x19, 0xbd, 0xff, 0x68, 0x35, 0xbb, 0x34, 0xc5, 0x5e, 0x3c, 0x48, 0x3a, + 0x90, 0xbd, 0xa1, 0x84, 0xa7, 0x3c, 0x96, 0xc6, 0x46, 0xbd, 0x20, 0x22, 0xb3, + 0xbc, 0x16, 0x95, 0x18, 0x3d, 0x84, 0xa2, 0x5e, 0x3d, 0x78, 0x3a, 0x29, 0xbd, + 0x37, 0x9a, 0x5a, 0xbd, 0x93, 0x8b, 0x80, 0x3d, 0x25, 0xff, 0x49, 0xbd, 0xf0, + 0x1e, 0x8c, 0xbb, 0xde, 0xa1, 0x48, 0x3d, 0x58, 0x67, 0x2d, 0x3d, 0x09, 0x18, + 0x26, 0x3d, 0x37, 0x68, 0x85, 0x3d, 0xa0, 0x28, 0x70, 0x3d, 0x33, 0xf5, 0x9f, + 0xbc, 0x81, 0xcc, 0x97, 0xbd, 0x75, 0x24, 0x45, 0xbd, 0x60, 0x45, 0x29, 0x3d, + 0x6b, 0x87, 0x25, 0xbd, 0x67, 0xd9, 0xb5, 0xbc, 0x15, 0xcb, 0x01, 0xbd, 0x39, + 0xa5, 0xc6, 0xbd, 0xd2, 0xbe, 0xb9, 0xbd, 0x7c, 0x53, 
0x20, 0xbd, 0x1a, 0x64, + 0xb4, 0xbd, 0x5a, 0xc1, 0x1d, 0x3d, 0xdf, 0xdd, 0x50, 0xbc, 0x8e, 0x86, 0x2b, + 0x3d, 0x20, 0xeb, 0x4d, 0x3d, 0x9a, 0xf8, 0x88, 0x3d, 0x92, 0xf1, 0x5e, 0xbd, + 0x24, 0xb3, 0xd8, 0xbb, 0x19, 0xbc, 0xd9, 0xbc, 0x8d, 0x97, 0x8f, 0xbd, 0x6d, + 0xf5, 0x7b, 0x3c, 0xfe, 0x33, 0x66, 0xbc, 0x35, 0x64, 0xfa, 0x3b, 0xe6, 0x00, + 0x9d, 0xbc, 0xd6, 0x9c, 0x63, 0xbd, 0x02, 0xff, 0x8e, 0xbd, 0x10, 0xa1, 0x23, + 0xbd, 0x93, 0x33, 0x0f, 0xbd, 0x59, 0xfc, 0x1b, 0x3d, 0x43, 0x0c, 0x7f, 0x3d, + 0x06, 0xbd, 0x96, 0x3d, 0xe1, 0x5b, 0x9f, 0xbc, 0x44, 0x05, 0xf8, 0x3c, 0x1c, + 0x60, 0xec, 0xbd, 0x33, 0x7f, 0x8c, 0xbd, 0x93, 0xcb, 0x0c, 0xbc, 0xc0, 0x8d, + 0x0e, 0xbb, 0x16, 0x45, 0x65, 0xbd, 0x76, 0x93, 0x88, 0xbd, 0x49, 0xd0, 0xb3, + 0xbd, 0xeb, 0x0e, 0x56, 0xbd, 0x8f, 0x1a, 0xab, 0x3d, 0x30, 0xde, 0x72, 0xb8, + 0xcf, 0xc7, 0x1d, 0xbd, 0x12, 0xc3, 0x31, 0xbd, 0x6e, 0x1d, 0x47, 0xbd, 0xb3, + 0x0f, 0x8c, 0x3d, 0x31, 0x82, 0x80, 0x3d, 0x44, 0xc4, 0x6b, 0xbc, 0x07, 0x28, + 0x5a, 0x3d, 0xa3, 0x3c, 0x3d, 0xbd, 0x13, 0x5c, 0x6a, 0x3d, 0x1c, 0x3f, 0x11, + 0x3d, 0x50, 0xac, 0xb5, 0xbc, 0x9f, 0x0e, 0xd9, 0x3c, 0x55, 0xfb, 0xde, 0xbc, + 0x6b, 0x4f, 0x6a, 0xbd, 0x38, 0x5f, 0x3f, 0x3b, 0x5a, 0x26, 0x98, 0xbc, 0x32, + 0x8c, 0x36, 0x3d, 0x78, 0x0a, 0x73, 0x3c, 0x7f, 0xd4, 0x51, 0x3d, 0x69, 0xdb, + 0x97, 0x3d, 0x52, 0x37, 0x80, 0x3d, 0x9b, 0x10, 0x88, 0xbd, 0xc0, 0xbf, 0x90, + 0xbd, 0x43, 0x84, 0x44, 0x3d, 0x12, 0x73, 0xc8, 0xbc, 0x84, 0xe0, 0x42, 0x3d, + 0xf5, 0x79, 0xd2, 0xbc, 0x88, 0x3b, 0x05, 0x3d, 0xf6, 0x10, 0xf3, 0x3b, 0x73, + 0x77, 0x8d, 0x3d, 0x92, 0xf0, 0x77, 0x3d, 0xd4, 0xcd, 0x55, 0xbd, 0x44, 0x7c, + 0x88, 0xbd, 0x3b, 0xe3, 0x5f, 0xbd, 0x0c, 0x35, 0x87, 0x3c, 0x09, 0x68, 0xf0, + 0x3c, 0x60, 0x3e, 0x47, 0x3a, 0xf6, 0x12, 0xb2, 0xbd, 0x2b, 0xe9, 0x9d, 0x3d, + 0x8e, 0x7c, 0x97, 0xbc, 0xb1, 0x05, 0x2e, 0xbc, 0x99, 0x6b, 0x14, 0xbd, 0xb2, + 0xa1, 0x85, 0x3d, 0x1c, 0xd1, 0x31, 0x3d, 0x18, 0xe6, 0xf5, 0x3c, 0xa7, 0x25, + 0x5a, 0x3c, 0xe0, 0x75, 0x9e, 0xbd, 0x1b, 0xe1, 0x69, 0xbd, 0x1b, 0x22, 0xc0, + 0x3d, 0xc4, 0x04, 0x8e, 0x3d, 0x92, 0x7f, 0x9d, 0x3d, 0xd3, 0xf3, 0x80, 0xbb, + 0x69, 0x7a, 0x58, 0x3c, 0xd5, 0xc2, 0x92, 0xbc, 0x26, 0x08, 0xa2, 0xbd, 0x9f, + 0xe8, 0x45, 0x3d, 0x10, 0xc9, 0x44, 0x3d, 0x7e, 0xac, 0x61, 0x3d, 0x88, 0xa8, + 0xf1, 0x3c, 0xa2, 0xd1, 0x87, 0xbd, 0x8c, 0xa7, 0xd1, 0xbc, 0x77, 0x21, 0x86, + 0xbd, 0x3b, 0x5a, 0xaa, 0x3d, 0x27, 0x8b, 0xb7, 0x3d, 0xe2, 0x8c, 0x39, 0x3d, + 0x16, 0x70, 0xc0, 0xbc, 0x45, 0xcc, 0x81, 0xbd, 0xfd, 0x54, 0x09, 0x3d, 0x7f, + 0x19, 0x0d, 0x3c, 0x0a, 0xfe, 0x39, 0xbd, 0xaf, 0x91, 0x66, 0xbd, 0x1c, 0xf9, + 0xa3, 0x3d, 0x6d, 0xfa, 0xa7, 0x3b, 0x55, 0x1d, 0xa2, 0x3d, 0xd4, 0x1c, 0x8a, + 0x3d, 0x21, 0xeb, 0xbd, 0xbc, 0xd7, 0x77, 0x45, 0xbc, 0x2b, 0xb9, 0x37, 0xbd, + 0x7b, 0x7c, 0xbd, 0xbd, 0x59, 0xa0, 0x92, 0xbd, 0xb9, 0x28, 0x2f, 0xbd, 0x1c, + 0xb6, 0x8c, 0xbc, 0x48, 0x52, 0x58, 0xbd, 0x90, 0x67, 0xa3, 0x3b, 0x92, 0xff, + 0x79, 0x3d, 0x55, 0x80, 0x9d, 0x3c, 0x68, 0x54, 0x98, 0xbd, 0xc6, 0xff, 0xbc, + 0xbc, 0x76, 0xb5, 0x72, 0xbd, 0x00, 0x62, 0x86, 0xbd, 0x6b, 0x01, 0xe3, 0xbc, + 0x42, 0x03, 0x6e, 0xbd, 0xd6, 0xe1, 0x7d, 0xbd, 0xcd, 0xed, 0x8b, 0x3c, 0x67, + 0x9d, 0x49, 0x3d, 0x6a, 0xe8, 0x31, 0x3d, 0xfd, 0x25, 0x4c, 0x3d, 0x87, 0x12, + 0xe8, 0xbb, 0x31, 0x54, 0x92, 0xbc, 0xbe, 0xab, 0x98, 0xbb, 0x85, 0x6c, 0xf7, + 0x3b, 0xb8, 0x0e, 0xbc, 0xbc, 0xf8, 0xea, 0x9a, 0x3d, 0x36, 0x13, 0xe2, 0xbc, + 0x9f, 0xd7, 0x6d, 0x3d, 0x4f, 0x0a, 0xb1, 0x3d, 0xba, 0x5c, 0x6b, 0xbd, 0xae, + 0x73, 0x60, 0xbc, 0x61, 0xf2, 0x8b, 0x3c, 0x90, 0x4c, 0x7b, 0xbd, 0x50, 0xef, + 0xe9, 
0xbd, 0x54, 0x83, 0x99, 0xbc, 0x8f, 0xd5, 0x4d, 0x3d, 0x6b, 0x02, 0x37, + 0x3d, 0xc8, 0xe7, 0x84, 0x3d, 0x4e, 0x73, 0x87, 0x3d, 0x7a, 0xcc, 0xaa, 0x3c, + 0x0e, 0xde, 0x26, 0xbd, 0xef, 0xfb, 0xc8, 0xbd, 0x96, 0xe9, 0x11, 0xbd, 0xd2, + 0xd6, 0x26, 0xbc, 0x01, 0xea, 0x72, 0xbd, 0xf4, 0xb7, 0xad, 0xbb, 0x5b, 0xe7, + 0x9e, 0x3d, 0xe6, 0xa1, 0x06, 0xbe, 0x4d, 0xa9, 0xd4, 0x3c, 0x83, 0xc9, 0xdf, + 0x3c, 0x31, 0x26, 0x85, 0x3c, 0x4d, 0x25, 0xcf, 0xbb, 0x6c, 0xea, 0x91, 0x3d, + 0xb3, 0x55, 0x5d, 0x3c, 0x7f, 0x1d, 0x70, 0xbd, 0x0d, 0x6f, 0x85, 0x3d, 0xbe, + 0xe6, 0x35, 0xbd, 0x0f, 0x5b, 0x02, 0xbc, 0x1e, 0xad, 0x60, 0xbd, 0xeb, 0x48, + 0x4c, 0x3d, 0x73, 0x67, 0xaf, 0x3c, 0xda, 0x33, 0x03, 0x3d, 0xd9, 0xa3, 0x0d, + 0xbb, 0x6e, 0x31, 0x11, 0x3d, 0xb3, 0x7e, 0xfc, 0x3c, 0xc4, 0x86, 0x49, 0x3c, + 0x0a, 0x52, 0x0b, 0x3d, 0x68, 0x25, 0xae, 0x3d, 0xe0, 0x16, 0x02, 0x3d, 0xc0, + 0x47, 0x3f, 0xbd, 0x98, 0x55, 0x70, 0x3c, 0x1a, 0xbb, 0x38, 0x3d, 0xcf, 0x31, + 0xe4, 0xbc, 0xe0, 0x45, 0x39, 0xbd, 0x7c, 0xa1, 0x3f, 0xbd, 0xcc, 0x5b, 0x91, + 0xbd, 0x55, 0x28, 0x59, 0x3a, 0x75, 0xdc, 0x02, 0xbd, 0xd8, 0x0d, 0xfe, 0xbb, + 0x38, 0x7f, 0x92, 0xbd, 0x0f, 0xeb, 0x83, 0xbc, 0xcf, 0xe7, 0x0c, 0xbd, 0xb5, + 0xf8, 0x59, 0x3d, 0xfc, 0xd4, 0xcf, 0xbb, 0xa3, 0x75, 0x8a, 0x3d, 0xac, 0xe9, + 0x8e, 0xbd, 0x4a, 0xf9, 0x71, 0x3d, 0xee, 0x83, 0x32, 0xbc, 0x7c, 0x78, 0xa0, + 0xbd, 0x87, 0x86, 0x6a, 0xbd, 0x1a, 0x3c, 0xe4, 0xbc, 0x89, 0x4a, 0xa1, 0x3d, + 0xa0, 0x39, 0xdd, 0x3c, 0x93, 0xa3, 0x93, 0x3c, 0xdd, 0x08, 0xa2, 0x3d, 0x9a, + 0x87, 0x98, 0xbd, 0xe6, 0x5a, 0x32, 0xbd, 0xeb, 0x4d, 0xea, 0xbb, 0x48, 0xda, + 0x6b, 0x3c, 0x36, 0x23, 0x82, 0x3d, 0x80, 0x78, 0x90, 0x3d, 0x0e, 0x4c, 0x1b, + 0xbd, 0xb9, 0x3c, 0x54, 0x3d, 0x5f, 0x8b, 0xf5, 0xbb, 0x54, 0x40, 0x54, 0xbd, + 0x35, 0x04, 0x8e, 0xbc, 0x38, 0xcf, 0xe0, 0x3b, 0x2f, 0xf6, 0x55, 0xbd, 0xe0, + 0xed, 0x7e, 0x3c, 0x84, 0x12, 0x9c, 0x3d, 0x74, 0x34, 0xfb, 0xbc, 0x02, 0xd9, + 0x93, 0xbd, 0xff, 0x27, 0xa8, 0xbd, 0x83, 0xf3, 0xaf, 0xbb, 0x99, 0x16, 0x7d, + 0x3d, 0xc6, 0xd9, 0x32, 0xbd, 0xb1, 0xa4, 0xbd, 0xbc, 0xd2, 0x1c, 0x5b, 0x3d, + 0xb3, 0xdb, 0x31, 0x3d, 0xe4, 0x10, 0x03, 0x3c, 0x29, 0xb0, 0x0b, 0xbd, 0x16, + 0x47, 0x9b, 0x3d, 0x75, 0x6b, 0xfd, 0xbc, 0x09, 0x92, 0xac, 0x3c, 0x12, 0x2c, + 0x07, 0x3d, 0x5a, 0xb3, 0xa0, 0x3c, 0xc9, 0x3d, 0x21, 0xbd, 0xc1, 0x80, 0x6d, + 0xbd, 0xa9, 0x20, 0x9c, 0x3d, 0xf5, 0x5b, 0x07, 0xbe, 0x9a, 0x76, 0x6f, 0xbd, + 0xd5, 0x11, 0xff, 0x3d, 0x58, 0xda, 0xd4, 0x3c, 0x18, 0x2f, 0xb9, 0x3d, 0xd4, + 0xa0, 0x6c, 0xbd, 0x4d, 0xe5, 0x2b, 0xbc, 0x97, 0x9d, 0x5f, 0xbc, 0x55, 0xe6, + 0x9b, 0xbd, 0x61, 0xee, 0xb3, 0x3c, 0x24, 0x06, 0xbf, 0x3c, 0xc2, 0x90, 0x09, + 0xbd, 0x91, 0xaf, 0x63, 0x3d, 0xde, 0x86, 0x7b, 0x3c, 0xca, 0x42, 0x0d, 0x3c, + 0x5f, 0xda, 0xcd, 0xbc, 0x7b, 0x27, 0x13, 0x3d, 0xf9, 0xd1, 0x14, 0x3c, 0xb6, + 0x83, 0x4a, 0x3d, 0x37, 0x74, 0x63, 0xbd, 0xbb, 0x85, 0x40, 0xbd, 0x3e, 0x15, + 0x13, 0x3d, 0x00, 0xe1, 0x22, 0xbd, 0xef, 0xdd, 0x63, 0xbd, 0x95, 0xdb, 0xa6, + 0x3c, 0xf4, 0xc1, 0x86, 0xbd, 0xfd, 0xf0, 0xe5, 0x3c, 0x84, 0xc1, 0x69, 0xbd, + 0xe4, 0x85, 0xf5, 0x3c, 0x18, 0xfa, 0x79, 0xbd, 0xe3, 0xd5, 0x2e, 0xbd, 0x32, + 0x90, 0x8f, 0xbc, 0x40, 0xfa, 0x08, 0xbc, 0xa4, 0x5f, 0xcb, 0xbc, 0x5a, 0xa7, + 0x3f, 0x3d, 0x09, 0x40, 0x23, 0x3d, 0x7b, 0x17, 0x0e, 0xbd, 0x6e, 0x70, 0xb9, + 0x3b, 0xc7, 0x3d, 0x4d, 0xbd, 0xe9, 0x57, 0x5d, 0x3d, 0x5c, 0x02, 0x91, 0x3c, + 0xc8, 0x08, 0x31, 0xbd, 0x09, 0xea, 0xe3, 0x3c, 0x14, 0x23, 0xf6, 0x3c, 0x95, + 0xd1, 0x22, 0xbd, 0xba, 0x27, 0xce, 0x3c, 0xb2, 0x59, 0x42, 0xbd, 0x29, 0x50, + 0x6d, 0x3d, 0x20, 0xe5, 0x10, 0xbd, 
0xc2, 0x68, 0x5a, 0xbd, 0x04, 0x6e, 0x81, + 0xbd, 0xd6, 0xc7, 0xa4, 0xbc, 0x16, 0x22, 0x33, 0x3d, 0x80, 0xbf, 0x70, 0x3c, + 0xbf, 0x62, 0x02, 0xbd, 0xdd, 0x19, 0x28, 0xbd, 0x8d, 0x5c, 0x60, 0x3d, 0x96, + 0xb4, 0x24, 0xbd, 0x9a, 0xb5, 0x6e, 0xbd, 0x52, 0xb5, 0x81, 0x3d, 0xf3, 0x49, + 0x85, 0xbd, 0x4a, 0x65, 0xcc, 0x3c, 0x06, 0xca, 0x13, 0xbd, 0x18, 0x94, 0x07, + 0x3d, 0xde, 0x60, 0x45, 0x3c, 0x7a, 0x2d, 0x69, 0x3d, 0x7e, 0xc6, 0xba, 0xbc, + 0xff, 0xcf, 0x64, 0x3d, 0x3e, 0x22, 0x98, 0xbd, 0xe1, 0x87, 0xc8, 0x3c, 0xec, + 0x54, 0x90, 0xbd, 0x60, 0x0b, 0x09, 0x3d, 0x5e, 0xc7, 0x95, 0x3c, 0x54, 0x1c, + 0x5b, 0x3b, 0xac, 0x77, 0xfe, 0x3c, 0x4c, 0x43, 0xea, 0xbc, 0xe4, 0x4d, 0xb3, + 0x3c, 0xab, 0x96, 0x20, 0xbd, 0xf7, 0x8a, 0x48, 0xbd, 0xcc, 0xcb, 0x70, 0x3d, + 0x25, 0x01, 0x91, 0xbc, 0x9c, 0x9a, 0x96, 0x3c, 0x9c, 0x7d, 0x56, 0x3d, 0x3e, + 0x2b, 0x47, 0xbd, 0x44, 0x48, 0x15, 0xbd, 0x38, 0x4e, 0xc1, 0x3c, 0x9e, 0x72, + 0x05, 0x3d, 0xe9, 0xbd, 0x44, 0xbc, 0x96, 0xdd, 0x6f, 0x3d, 0x17, 0x2b, 0x4e, + 0x3c, 0x21, 0x91, 0x4c, 0x3d, 0x2f, 0x87, 0x8e, 0xbd, 0xf2, 0xd2, 0x31, 0x3d, + 0x47, 0x07, 0xad, 0xbc, 0x41, 0x54, 0x89, 0x3c, 0xee, 0xa9, 0x4d, 0x3d, 0xf2, + 0xb1, 0x80, 0x3d, 0x6a, 0xd9, 0x78, 0xbd, 0x55, 0x4a, 0x32, 0xbd, 0xd1, 0xd8, + 0x44, 0x3d, 0xda, 0x72, 0x7d, 0x3d, 0xa1, 0xd1, 0xbc, 0x3b, 0x7a, 0xf4, 0x32, + 0xbd, 0xf0, 0x44, 0x84, 0x3d, 0xd3, 0x0b, 0x8c, 0x3d, 0xd9, 0xc8, 0x58, 0xbd, + 0xdd, 0x2c, 0x7c, 0x3d, 0x49, 0x3e, 0x8f, 0x3d, 0x39, 0xbd, 0x95, 0xbd, 0x99, + 0x46, 0x25, 0x3d, 0x63, 0xfe, 0x20, 0xbd, 0x0a, 0x1d, 0x62, 0xbc, 0x4b, 0xae, + 0x3b, 0xbc, 0x3c, 0x28, 0x84, 0xbc, 0x79, 0x24, 0x25, 0xbd, 0x62, 0x6b, 0x56, + 0xbd, 0xe9, 0x9a, 0x88, 0x3d, 0xd6, 0x9f, 0x85, 0xbc, 0xad, 0xf6, 0x51, 0xbd, + 0xc2, 0x72, 0x85, 0x3d, 0xf6, 0x0d, 0x89, 0xbd, 0x3e, 0x76, 0xca, 0x39, 0x90, + 0x96, 0x89, 0x3d, 0xa1, 0x6e, 0x25, 0xbd, 0x4b, 0xbd, 0x18, 0x3c, 0x0e, 0x05, + 0x69, 0xbc, 0x03, 0x9e, 0x76, 0x3d, 0xa3, 0xae, 0x67, 0x3d, 0xc4, 0x38, 0x5a, + 0x3d, 0x8c, 0x9d, 0x53, 0xbd, 0x35, 0x24, 0x42, 0xbd, 0x36, 0xfa, 0xcf, 0x3c, + 0xe8, 0x09, 0x0f, 0xbd, 0xe9, 0x6e, 0x15, 0xbd, 0x51, 0x03, 0x1b, 0xbd, 0xf7, + 0x1d, 0x32, 0x3d, 0x08, 0xfc, 0x2f, 0xbd, 0x9d, 0x4c, 0x65, 0x3d, 0x9d, 0xf0, + 0x98, 0xbb, 0xb0, 0xba, 0x0d, 0xbc, 0x64, 0xee, 0x03, 0xbb, 0x92, 0x82, 0x16, + 0xbc, 0xa5, 0xa0, 0x94, 0xbd, 0xd0, 0x1f, 0xf1, 0x3c, 0xeb, 0x06, 0x8c, 0xbb, + 0xb5, 0xc2, 0x64, 0x3c, 0x7e, 0x30, 0x55, 0x3c, 0x68, 0x89, 0x64, 0x3c, 0xec, + 0x1e, 0x9e, 0x3c, 0xf0, 0xc9, 0x57, 0x3d, 0xfe, 0x25, 0x0c, 0xbd, 0x2f, 0xb4, + 0x0b, 0x3c, 0x32, 0x76, 0x7a, 0xbd, 0xd2, 0x15, 0xea, 0xba, 0xc0, 0xc9, 0x45, + 0xbd, 0xb7, 0xda, 0x48, 0xbc, 0x5e, 0x85, 0x6c, 0x3c, 0xbc, 0xda, 0x84, 0xbc, + 0xc6, 0x56, 0x35, 0xbd, 0x21, 0xfd, 0x7d, 0x3d, 0xbf, 0x0c, 0x0f, 0x3b, 0xc2, + 0x28, 0xa4, 0xbc, 0xad, 0xa3, 0xe7, 0xbb, 0x77, 0xd9, 0x55, 0x3d, 0x6d, 0x5a, + 0x21, 0xbc, 0x3f, 0xa0, 0xd9, 0xbc, 0x1b, 0x86, 0x85, 0x3d, 0x38, 0x2f, 0x1f, + 0xbd, 0xd5, 0xa5, 0x43, 0x3d, 0xdb, 0x04, 0x8d, 0xbd, 0xbc, 0x0d, 0x25, 0x3d, + 0xf5, 0x71, 0x86, 0x3d, 0xa8, 0x4e, 0x88, 0xbd, 0xca, 0xab, 0x24, 0x3c, 0x8d, + 0x03, 0xda, 0x3c, 0xad, 0x77, 0x19, 0xbc, 0x2e, 0x7c, 0xf5, 0x3c, 0x75, 0x45, + 0x6e, 0x3d, 0x9b, 0x9f, 0x80, 0xbd, 0x1d, 0xce, 0x85, 0x3d, 0xb6, 0xbe, 0x86, + 0xbc, 0xc0, 0x1c, 0x55, 0xbb, 0xd0, 0xc7, 0x5c, 0xbd, 0x1f, 0x60, 0x64, 0x3c, + 0x4f, 0x04, 0x60, 0xbd, 0x04, 0xc9, 0x64, 0x3d, 0x0a, 0xbb, 0x10, 0x3b, 0x08, + 0x41, 0x92, 0xbd, 0xac, 0x5b, 0x15, 0xbd, 0x44, 0xe8, 0x27, 0x3b, 0x9c, 0x98, + 0x0c, 0x3d, 0x09, 0x52, 0x7a, 0x3d, 0x33, 0xe4, 0xcd, 0xbc, 0xda, 
0x48, 0x17, + 0xbd, 0x26, 0xe5, 0x5d, 0xbb, 0x2f, 0xfc, 0x69, 0xbd, 0x9f, 0xfd, 0x54, 0x3d, + 0x1d, 0x45, 0x07, 0xbd, 0x86, 0x69, 0x91, 0x3c, 0x9e, 0x1a, 0xbe, 0xbc, 0xfa, + 0xf4, 0x5e, 0x3d, 0xb5, 0x9d, 0x00, 0xbd, 0xe0, 0xfd, 0x90, 0x3c, 0x3a, 0xac, + 0xc9, 0xbc, 0x11, 0xa7, 0xb0, 0xbb, 0x3e, 0x18, 0xa8, 0x3c, 0x79, 0x2e, 0x55, + 0xbd, 0xe0, 0xb2, 0xfd, 0xbb, 0x72, 0xb0, 0x5d, 0xbc, 0xe1, 0xd9, 0x6f, 0x3d, + 0xd5, 0x3a, 0x9f, 0xbc, 0xc8, 0x8f, 0x1a, 0xbd, 0x18, 0x60, 0x3b, 0x3c, 0xc0, + 0x90, 0x24, 0xbc, 0x78, 0xb6, 0x50, 0x3d, 0x84, 0xc6, 0x81, 0xbd, 0x98, 0x2d, + 0x46, 0x3d, 0x7f, 0x8a, 0x3b, 0x3d, 0x03, 0xd9, 0x7f, 0x3d, 0x50, 0x04, 0xae, + 0x3c, 0xaf, 0xae, 0x6b, 0xbd, 0xcd, 0x34, 0x48, 0xbd, 0xbd, 0x05, 0xa8, 0x3c, + 0x84, 0xc8, 0x3f, 0xbd, 0xcb, 0x46, 0x89, 0x3d, 0x92, 0x2b, 0x16, 0x3d, 0x98, + 0xfb, 0xcd, 0xbc, 0x80, 0x5b, 0x43, 0xbd, 0xac, 0x5e, 0x78, 0x3c, 0xd6, 0xbf, + 0x7e, 0x3b, 0x32, 0xec, 0x81, 0x3b, 0xce, 0xab, 0xf1, 0x3b, 0xb2, 0xd7, 0x86, + 0xbc, 0xb1, 0xe3, 0x09, 0x3d, 0x4f, 0xc6, 0xa5, 0xbc, 0x4c, 0x1b, 0x89, 0x3c, + 0xd6, 0x09, 0x2b, 0x3d, 0x61, 0x67, 0x4a, 0xbc, 0x7a, 0x5e, 0x87, 0xbc, 0x6c, + 0x32, 0x55, 0x3c, 0x6b, 0xe0, 0xa7, 0xba, 0x41, 0xc8, 0xb5, 0xbc, 0x94, 0x54, + 0x64, 0xbc, 0x81, 0xb6, 0x33, 0x3d, 0x3a, 0x05, 0x59, 0x3d, 0x42, 0x25, 0x46, + 0xbd, 0xfc, 0xda, 0x8c, 0xbd, 0x17, 0x64, 0x87, 0x3d, 0x55, 0x39, 0x61, 0x3d, + 0x4f, 0xcf, 0x25, 0xbd, 0xfc, 0x4d, 0x26, 0x3c, 0x7c, 0x18, 0xd8, 0x3c, 0x4f, + 0x1b, 0x5c, 0x3d, 0x3a, 0x09, 0xcd, 0x3c, 0x27, 0x4a, 0x00, 0x3d, 0x1c, 0xb7, + 0xb7, 0xbc, 0x0a, 0x1b, 0x38, 0xbc, 0x88, 0x6d, 0x2f, 0x3d, 0x96, 0xdf, 0x6a, + 0xbd, 0x7e, 0x7e, 0xa0, 0xb9, 0x10, 0x23, 0x10, 0xbc, 0xec, 0x6b, 0xbf, 0x3c, + 0x1a, 0x8e, 0x7a, 0xbc, 0x68, 0xb1, 0x7c, 0x3d, 0xb0, 0xcc, 0x30, 0xbd, 0xec, + 0x59, 0xef, 0x3c, 0x8d, 0xd5, 0x41, 0x3b, 0x82, 0xa1, 0xec, 0xbc, 0x29, 0x35, + 0x51, 0xbd, 0x6e, 0x6e, 0x91, 0xbc, 0xf9, 0x6d, 0x2a, 0x3d, 0x5d, 0x97, 0x17, + 0x3d, 0xcb, 0xad, 0x29, 0x3c, 0xc4, 0x47, 0x41, 0x3d, 0x40, 0x7c, 0x6a, 0xbc, + 0xa6, 0x09, 0x1e, 0x3d, 0x14, 0x9c, 0xf2, 0xbc, 0x70, 0x31, 0x5d, 0x3c, 0xd1, + 0x54, 0x70, 0xbc, 0xd8, 0x58, 0xdd, 0x3a, 0x65, 0x21, 0x6a, 0xbd, 0x64, 0x81, + 0x99, 0xbd, 0x51, 0x5a, 0x64, 0x3c, 0x8c, 0xa6, 0x90, 0x3c, 0xe6, 0xb6, 0x2a, + 0xbd, 0x3d, 0x2a, 0x15, 0xbd, 0x82, 0xbe, 0x8d, 0xbc, 0x65, 0x32, 0x68, 0xbd, + 0x0a, 0x5d, 0x6d, 0xbc, 0x24, 0x8c, 0xd6, 0xbc, 0x70, 0x4d, 0xe7, 0x3c, 0x06, + 0x58, 0x01, 0x3c, 0x22, 0xd2, 0x58, 0x3d, 0x62, 0x60, 0x88, 0x3c, 0xfc, 0xe6, + 0x12, 0x3d, 0x31, 0x59, 0xdb, 0x3c, 0x5d, 0xfb, 0x96, 0xbc, 0xb6, 0x50, 0x7f, + 0x3b, 0xd7, 0x01, 0x37, 0x3d, 0x6a, 0x71, 0xc4, 0xbc, 0x8d, 0x28, 0xc9, 0x3c, + 0x33, 0x39, 0x4f, 0xbb, 0x14, 0x14, 0x1b, 0x3d, 0x32, 0x36, 0x62, 0xbd, 0xa7, + 0xf1, 0x89, 0x3d, 0xc4, 0x12, 0x13, 0x3d, 0xf3, 0x79, 0xde, 0x3c, 0xc0, 0x39, + 0xb3, 0xbb, 0x36, 0xb5, 0x54, 0xbd, 0x04, 0xf2, 0xcc, 0xbc, 0x45, 0x14, 0xf8, + 0x3a, 0x4b, 0x1d, 0x55, 0xbd, 0x13, 0x35, 0xc6, 0xbc, 0x7a, 0x92, 0x1b, 0xbd, + 0x71, 0xb0, 0x3b, 0xbd, 0xfe, 0x84, 0x2f, 0xbd, 0xd4, 0x64, 0x60, 0x3d, 0xa7, + 0x0b, 0xb7, 0xbb, 0xd1, 0xc7, 0x8a, 0xbd, 0x21, 0x20, 0x78, 0x3d, 0x1b, 0x25, + 0x77, 0x3d, 0x5e, 0x06, 0x20, 0xbd, 0x7d, 0xfa, 0xe0, 0xbc, 0x5b, 0x2b, 0x38, + 0x3d, 0x8c, 0x10, 0x90, 0xbd, 0xbe, 0xc0, 0xb2, 0x3c, 0x5a, 0x88, 0x94, 0xbd, + 0x80, 0x87, 0x94, 0x3c, 0x73, 0xed, 0x81, 0xbd, 0x73, 0x42, 0x3f, 0xba, 0xdc, + 0xf8, 0x4e, 0x3d, 0x9a, 0xd4, 0x8d, 0xbc, 0x3a, 0x6f, 0x72, 0xbc, 0x37, 0xe8, + 0x06, 0x3d, 0xbb, 0x35, 0x61, 0x3d, 0x64, 0xc6, 0x4a, 0x3d, 0xee, 0x94, 0x13, + 0xb9, 0xc0, 0x4b, 
0xaf, 0xba, 0x60, 0x4b, 0x42, 0x3d, 0x40, 0x88, 0xb1, 0x3c, + 0xc6, 0x61, 0x6c, 0x3d, 0x92, 0xd0, 0x40, 0x3d, 0x32, 0xc0, 0x8d, 0xbd, 0x90, + 0x66, 0xc2, 0xbc, 0x52, 0x1f, 0x14, 0xbd, 0x03, 0x9d, 0x23, 0x3d, 0x81, 0x60, + 0xe1, 0x3c, 0xe3, 0x31, 0x5f, 0x3d, 0x38, 0xbc, 0x52, 0x3d, 0x23, 0x3e, 0x3b, + 0xbd, 0xf6, 0x53, 0x8e, 0xbd, 0xc9, 0xb1, 0x88, 0xbd, 0x02, 0x0c, 0xc6, 0xbc, + 0x2e, 0x6d, 0x26, 0xbd, 0xe2, 0x88, 0x87, 0xbd, 0x45, 0x45, 0x28, 0x3d, 0xbc, + 0x73, 0xd7, 0xba, 0x17, 0x1e, 0x15, 0xbc, 0xa6, 0x0c, 0x9c, 0xbc, 0x5a, 0x74, + 0x63, 0x3d, 0x05, 0x28, 0xf6, 0x3c, 0xe5, 0xda, 0x4d, 0xbd, 0x02, 0x69, 0x42, + 0xbd, 0x8a, 0xb0, 0x2c, 0x3d, 0x27, 0x22, 0x07, 0x3d, 0x6a, 0x7a, 0x08, 0x3b, + 0x88, 0xb6, 0x03, 0x3d, 0x80, 0xad, 0xac, 0xbb, 0xc9, 0x67, 0x6d, 0xbb, 0x80, + 0xf0, 0x8d, 0xbd, 0x53, 0x78, 0x85, 0x3d, 0x14, 0x99, 0x24, 0xbb, 0x86, 0x7c, + 0x0c, 0x3d, 0xbe, 0xff, 0x79, 0x3d, 0x01, 0x39, 0xb4, 0x3c, 0x19, 0x42, 0x52, + 0x3c, 0x4d, 0x8b, 0x73, 0x3d, 0xb4, 0x6b, 0xf1, 0x3a, 0x6e, 0x53, 0xb4, 0xbc, + 0x09, 0x88, 0x11, 0xbd, 0xdf, 0x5e, 0x86, 0xbd, 0x10, 0xdc, 0x5a, 0xbd, 0x6b, + 0xb3, 0x3a, 0xbd, 0x7e, 0x23, 0x84, 0xbd, 0x95, 0x50, 0x8c, 0xbd, 0xd1, 0x50, + 0x93, 0x3c, 0x5f, 0x43, 0x67, 0x3a, 0x92, 0xc2, 0x91, 0xbd, 0xbe, 0xb0, 0x4e, + 0xbd, 0x8c, 0xeb, 0x36, 0xbd, 0x4e, 0x0e, 0x82, 0xbd, 0xc5, 0x15, 0x0b, 0xbd, + 0x1c, 0x66, 0x5a, 0xbd, 0xf6, 0xe4, 0x19, 0x3b, 0x4d, 0x1c, 0x07, 0x3d, 0x70, + 0x1f, 0x24, 0x3d, 0x59, 0x80, 0x3b, 0xbd, 0x8e, 0x9e, 0xae, 0xbb, 0x11, 0x6f, + 0x8f, 0x3b, 0x5f, 0xc9, 0x74, 0xbd, 0x36, 0x65, 0x2b, 0x3c, 0x43, 0xb4, 0xcf, + 0x3c, 0x7f, 0xbf, 0x18, 0x3d, 0x91, 0x58, 0x16, 0xbd, 0x72, 0xc4, 0xf3, 0xbc, + 0x80, 0xd3, 0x8a, 0x3b, 0x95, 0x0e, 0xe7, 0x3c, 0xdd, 0x17, 0x1d, 0x3d, 0x55, + 0x74, 0x98, 0xbd, 0x5c, 0x6b, 0x1e, 0xbc, 0x02, 0x65, 0x61, 0xba, 0x01, 0x7f, + 0x81, 0xbc, 0x97, 0x95, 0x73, 0xbd, 0xd8, 0x60, 0xfd, 0xbc, 0xd4, 0x64, 0x8a, + 0x3a, 0xe5, 0x81, 0x24, 0x3c, 0xfd, 0x2b, 0x14, 0x3d, 0x60, 0x49, 0xff, 0x3b, + 0x6f, 0x63, 0x33, 0xbd, 0xe0, 0x83, 0x4b, 0xbd, 0xed, 0x7a, 0x10, 0x3d, 0x5b, + 0x26, 0x33, 0x3d, 0x03, 0xff, 0x2d, 0x3d, 0xcd, 0xca, 0x42, 0xbd, 0x4c, 0x09, + 0x3f, 0x3d, 0xcb, 0xcb, 0x95, 0xbc, 0xff, 0x04, 0x18, 0x3c, 0x99, 0x48, 0x6c, + 0xbd, 0xb6, 0x3f, 0x04, 0x3a, 0x68, 0x3d, 0x67, 0x3c, 0x71, 0xd9, 0x7a, 0xbc, + 0x88, 0x7d, 0x02, 0x3c, 0x0f, 0xfa, 0x3b, 0xbd, 0x78, 0x64, 0xfc, 0x3c, 0xab, + 0x8c, 0x37, 0x3d, 0x08, 0x19, 0xcf, 0xbc, 0x03, 0xe0, 0x85, 0xbd, 0x1b, 0xaf, + 0x79, 0xbd, 0x92, 0x9e, 0x67, 0x3d, 0x31, 0x3e, 0x94, 0xbd, 0xe8, 0xd1, 0x1f, + 0xbd, 0x4d, 0xa1, 0xcb, 0x3c, 0x9f, 0xc0, 0xf7, 0x3c, 0xa8, 0x88, 0xe1, 0xbc, + 0xf7, 0x13, 0x8b, 0x3c, 0x77, 0x1b, 0xfe, 0xbc, 0x11, 0xf0, 0x4d, 0x3d, 0x02, + 0x73, 0xff, 0xbc, 0x20, 0x4b, 0x2f, 0x3d, 0x50, 0x14, 0x28, 0x3c, 0xa2, 0x0a, + 0xc1, 0xbc, 0xb3, 0xf6, 0xe1, 0xbc, 0x32, 0x98, 0xa1, 0x3c, 0x3f, 0xef, 0xcc, + 0x3b, 0xd6, 0xbf, 0x37, 0xbd, 0x4e, 0x0a, 0x15, 0x3d, 0xfd, 0x81, 0x24, 0xbd, + 0x62, 0x05, 0x43, 0x3d, 0x4b, 0x8d, 0xb5, 0xbc, 0x0e, 0xe7, 0x7c, 0x3d, 0xd1, + 0x64, 0x88, 0xbd, 0xca, 0x03, 0xd3, 0xbb, 0xc9, 0xaa, 0x9f, 0xbb, 0xb5, 0x0e, + 0xbf, 0xbc, 0x48, 0x82, 0xe7, 0x3c, 0xa1, 0x4b, 0x10, 0x3d, 0x40, 0x51, 0x68, + 0xbb, 0xc0, 0x36, 0xc4, 0x3c, 0xcc, 0xd9, 0x37, 0xbc, 0xec, 0x40, 0xcf, 0x3c, + 0xb2, 0x38, 0x52, 0xbd, 0x15, 0xe7, 0x0c, 0xbd, 0x52, 0xea, 0x59, 0x3c, 0xcf, + 0xe3, 0xd1, 0xbc, 0x9e, 0xb7, 0x94, 0xbc, 0x1a, 0x13, 0xc8, 0x3c, 0x04, 0x51, + 0xa0, 0x3b, 0x7f, 0xb4, 0x32, 0x3d, 0x5e, 0x43, 0x5a, 0x3d, 0x8b, 0x6d, 0x98, + 0xba, 0xa4, 0x70, 0x47, 0x3d, 0xe6, 0x23, 0x60, 
0x3d, 0x48, 0xf3, 0x8b, 0xbc, + 0x85, 0xfe, 0x60, 0x3d, 0x33, 0x94, 0xc7, 0xbc, 0xdd, 0xbf, 0x80, 0xbd, 0x31, + 0x98, 0xbb, 0x3b, 0x76, 0x70, 0x8a, 0x3c, 0x72, 0xc5, 0x4e, 0x3c, 0x31, 0x53, + 0x20, 0x3d, 0xcd, 0xda, 0x03, 0x3b, 0x8c, 0xc0, 0x3d, 0x3d, 0x9c, 0xaa, 0x90, + 0xbd, 0xb5, 0x9f, 0xab, 0x3c, 0x45, 0x77, 0x31, 0xbd, 0xea, 0x85, 0x8e, 0xbd, + 0x15, 0x6d, 0x8b, 0xbc, 0xb9, 0x98, 0xb1, 0xbc, 0x09, 0x9b, 0xff, 0x3c, 0x1e, + 0xcf, 0x3c, 0x3d, 0x3c, 0xe3, 0x2a, 0xbd, 0x2a, 0xff, 0x20, 0x3d, 0xbb, 0x1c, + 0x4a, 0x3b, 0x8f, 0x19, 0x83, 0xbd, 0xad, 0x9f, 0xe5, 0x3c, 0x43, 0x3d, 0x44, + 0x3d, 0xaa, 0xb9, 0xe3, 0x3c, 0x8c, 0xd1, 0x86, 0x3d, 0xfa, 0x93, 0x7c, 0x3d, + 0x31, 0xe5, 0x67, 0xbc, 0x3f, 0x25, 0x8a, 0xbd, 0x90, 0x91, 0x5e, 0x3b, 0xbf, + 0xd8, 0xfe, 0xbc, 0x68, 0xaa, 0x85, 0x3c, 0xb3, 0xb6, 0x07, 0xbd, 0x6f, 0x51, + 0x91, 0xbd, 0x3c, 0x5d, 0xc8, 0xbc, 0xba, 0xf5, 0xd3, 0xbb, 0x8d, 0x90, 0xd5, + 0xbc, 0x02, 0x78, 0x2f, 0xbc, 0x12, 0x94, 0x10, 0x3d, 0xb2, 0x26, 0x82, 0xbd, + 0x49, 0x2a, 0x70, 0x3d, 0x9c, 0xf4, 0x67, 0xbd, 0x8d, 0x33, 0xf3, 0xbc, 0x22, + 0xa0, 0xc3, 0x3c, 0x38, 0xb2, 0x31, 0x3d, 0x71, 0xe9, 0x87, 0xbd, 0x7c, 0xc5, + 0x96, 0xbd, 0x5b, 0x13, 0xa5, 0xbc, 0x2d, 0x8a, 0x8a, 0x3d, 0x80, 0xc2, 0x24, + 0x3d, 0x1e, 0xc5, 0x74, 0x3d, 0xec, 0x3a, 0xca, 0x3c, 0x37, 0xb4, 0x00, 0xbc, + 0x29, 0xe2, 0x0c, 0x3d, 0xbc, 0x36, 0x20, 0x3d, 0x58, 0x3a, 0x5f, 0x3d, 0x8a, + 0xe4, 0x24, 0xbd, 0x22, 0x99, 0x45, 0xbd, 0xbe, 0xef, 0x0d, 0xbd, 0xbe, 0xae, + 0x0f, 0xbc, 0xe1, 0xe9, 0x4e, 0x3c, 0xd2, 0xed, 0x54, 0xbd, 0x62, 0xcb, 0x7d, + 0x3c, 0xc8, 0xe4, 0x0d, 0xbc, 0x61, 0xaa, 0xa8, 0x3b, 0x68, 0x56, 0x92, 0xbb, + 0x83, 0xb3, 0x25, 0xbd, 0x0a, 0x28, 0x39, 0xbd, 0x9d, 0xd4, 0x13, 0x3c, 0x5c, + 0x3c, 0x27, 0x3d, 0x34, 0x21, 0x30, 0x3d, 0x9d, 0xac, 0x54, 0xbd, 0xaa, 0xe8, + 0x60, 0x3d, 0xb4, 0xaf, 0xe5, 0x3c, 0xb0, 0x22, 0x1d, 0x3d, 0x9c, 0x7e, 0x64, + 0x3d, 0x3e, 0xd9, 0x7b, 0x3d, 0x55, 0x9e, 0x46, 0x3d, 0x47, 0xf9, 0xfe, 0x3a, + 0x00, 0xf0, 0x79, 0xbc, 0x49, 0x93, 0xd5, 0xbb, 0x98, 0x75, 0x29, 0xbc, 0xfb, + 0xdc, 0x37, 0xbd, 0x9a, 0x0e, 0x65, 0x3d, 0x7a, 0x74, 0x93, 0xbd, 0x39, 0x83, + 0xba, 0x3c, 0x20, 0xa3, 0x94, 0xbd, 0xbf, 0x32, 0x18, 0xbc, 0xbd, 0x90, 0x19, + 0x3c, 0x31, 0xbe, 0x94, 0xbd, 0x1f, 0xd5, 0x9b, 0x3a, 0x09, 0xa3, 0x44, 0xbd, + 0xe4, 0x91, 0xae, 0xbc, 0x98, 0x84, 0x73, 0xbd, 0xe6, 0x64, 0x70, 0x3d, 0xcc, + 0x0d, 0x01, 0xbd, 0xb0, 0xd6, 0xce, 0x3c, 0x2a, 0x8b, 0x78, 0xbd, 0x51, 0x8a, + 0xcd, 0x3c, 0x76, 0x3b, 0x0b, 0x3b, 0x85, 0xe3, 0x76, 0xbd, 0xad, 0x98, 0x6f, + 0x3d, 0xf8, 0xa1, 0x92, 0xbd, 0x22, 0xb9, 0x24, 0xbd, 0x81, 0xf4, 0x62, 0xbd, + 0xeb, 0x97, 0x83, 0x3d, 0x0d, 0xa9, 0x91, 0x3a, 0x62, 0x88, 0x0c, 0xbc, 0x99, + 0x64, 0x48, 0x3d, 0x0b, 0x11, 0x80, 0xba, 0x94, 0xe3, 0x70, 0xbc, 0xa3, 0x42, + 0x56, 0x3c, 0x1c, 0x41, 0xec, 0x3c, 0x68, 0x56, 0x29, 0x3c, 0x50, 0x4a, 0x05, + 0x3d, 0xfa, 0x33, 0x37, 0x3d, 0x5d, 0x7c, 0x8d, 0x3d, 0xa8, 0x02, 0x3f, 0x3c, + 0xa6, 0x1d, 0x68, 0x3d, 0x41, 0x3b, 0x76, 0x3d, 0x29, 0xa1, 0x56, 0xbd, 0xbd, + 0x90, 0x7c, 0x3b, 0xd9, 0x96, 0x62, 0xbd, 0xf2, 0x15, 0xd8, 0xbc, 0xad, 0x62, + 0x38, 0x3d, 0x19, 0xc7, 0x0d, 0x3d, 0xda, 0xcc, 0xf8, 0x3b, 0x63, 0xaf, 0x84, + 0xbd, 0x42, 0x94, 0x3f, 0xbc, 0x60, 0x67, 0x83, 0x3d, 0x13, 0xdb, 0xa8, 0x3c, + 0x8f, 0xcb, 0x5e, 0x3d, 0x97, 0x69, 0x14, 0xbd, 0xd5, 0x52, 0x97, 0x3c, 0x28, + 0xb2, 0x09, 0xbb, 0xd0, 0x5c, 0x0f, 0x3d, 0x08, 0x01, 0x38, 0xbd, 0x2a, 0xd1, + 0x75, 0xbd, 0xb6, 0x48, 0x5e, 0xbd, 0xe6, 0x3a, 0x40, 0x3d, 0x91, 0x52, 0xb5, + 0x3c, 0xe6, 0xe6, 0x2f, 0x3d, 0x7b, 0x0a, 0x0b, 0x3d, 0x05, 0xa6, 0xf1, 0xbb, + 
0xe5, 0x14, 0x12, 0x3c, 0x70, 0x4a, 0x61, 0xbd, 0xc0, 0xd5, 0x77, 0x3c, 0xea, + 0x92, 0x4e, 0x3d, 0xe8, 0xea, 0x7a, 0x3c, 0x85, 0xec, 0x8d, 0xbc, 0x1f, 0x06, + 0x3a, 0x3d, 0x24, 0x7d, 0x43, 0x3c, 0x3b, 0xfb, 0x4e, 0x3d, 0x10, 0xdb, 0x26, + 0xbc, 0x3c, 0xe4, 0x44, 0x3d, 0x5f, 0x54, 0xe6, 0x3c, 0x32, 0x15, 0xdf, 0xbc, + 0x07, 0x77, 0x1f, 0x3d, 0x68, 0x58, 0xea, 0x3c, 0xbe, 0x48, 0x90, 0xbc, 0x42, + 0x47, 0x35, 0x3d, 0x21, 0x06, 0x7d, 0xbd, 0x96, 0xd4, 0x67, 0x3c, 0x17, 0x5e, + 0x79, 0x3b, 0xd0, 0x09, 0x93, 0xbd, 0xaf, 0x34, 0x3d, 0x3d, 0xc6, 0xd3, 0x8f, + 0xbc, 0xae, 0x06, 0x0c, 0x3c, 0x84, 0xeb, 0x04, 0xbd, 0x44, 0xf4, 0x2e, 0xbd, + 0xad, 0x8d, 0x61, 0x3c, 0xb0, 0x1e, 0xaf, 0xb9, 0xb6, 0xd3, 0x57, 0xbc, 0x78, + 0x89, 0x97, 0x3c, 0x39, 0xa2, 0x41, 0xbd, 0x1c, 0xb3, 0x30, 0xbd, 0x44, 0xc4, + 0x90, 0x3c, 0xa3, 0x43, 0x03, 0xbd, 0xe0, 0xe2, 0xc4, 0xbb, 0xf0, 0xf3, 0x4d, + 0x3c, 0x6c, 0xf3, 0x85, 0x3d, 0x8f, 0xa9, 0x56, 0xbd, 0x36, 0x75, 0x5c, 0x3d, + 0x7e, 0x57, 0x89, 0x3c, 0x3a, 0xb8, 0x29, 0x3c, 0x2c, 0x10, 0x40, 0xbd, 0x5f, + 0x74, 0x32, 0xbd, 0xaf, 0x9e, 0x09, 0xbd, 0x60, 0xe4, 0x4b, 0xbd, 0x49, 0xb4, + 0xd7, 0x3c, 0xa0, 0x1f, 0x31, 0xbd, 0xd6, 0x5e, 0xde, 0x3c, 0x4e, 0xb1, 0xdb, + 0xbc, 0x98, 0x5a, 0x1e, 0x3d, 0x03, 0xe2, 0xa0, 0xba, 0x76, 0xc1, 0x63, 0xbd, + 0xbd, 0x03, 0xcf, 0x3c, 0xde, 0x4d, 0x22, 0x3d, 0x6a, 0x58, 0x5c, 0xbb, 0xc3, + 0xb8, 0x19, 0xbd, 0xf3, 0x01, 0x8f, 0x3d, 0x40, 0x62, 0xdc, 0x3b, 0x58, 0x64, + 0xa0, 0xbc, 0xdc, 0xd4, 0x6d, 0x3d, 0x62, 0x98, 0x1d, 0xbd, 0x96, 0x88, 0x4d, + 0x3b, 0x0e, 0xab, 0x46, 0x3d, 0xcb, 0xee, 0xce, 0x3b, 0xc5, 0x27, 0xe2, 0xbb, + 0xe4, 0xe4, 0x1c, 0x3d, 0x75, 0x86, 0x08, 0xbd, 0xf0, 0xce, 0x1c, 0x3d, 0xcb, + 0x9d, 0x7a, 0x3d, 0x24, 0x56, 0x42, 0xbc, 0x3a, 0x7f, 0xc4, 0xbc, 0x6e, 0xfd, + 0x6e, 0x3d, 0xa1, 0x3f, 0x80, 0x3d, 0xfb, 0x13, 0xc9, 0xbc, 0x5f, 0x8f, 0xb9, + 0x3c, 0xe3, 0xde, 0x94, 0xbd, 0x9f, 0x88, 0x88, 0xbd, 0x79, 0x27, 0x71, 0x3d, + 0xeb, 0xc8, 0x36, 0x3d, 0xe7, 0x2c, 0x9e, 0xbc, 0xb1, 0x19, 0x4d, 0xbd, 0x1e, + 0x82, 0x79, 0x3d, 0x75, 0xfe, 0x94, 0xbd, 0xdc, 0xd7, 0x96, 0xbd, 0x3a, 0x57, + 0x84, 0x3d, 0x70, 0xcd, 0x09, 0xbd, 0x08, 0xd9, 0x01, 0xbd, 0xa6, 0x1a, 0x85, + 0x3d, 0x5e, 0x34, 0xec, 0xbc, 0x3c, 0x0f, 0xa6, 0xbc, 0x0a, 0xc2, 0x6f, 0x3d, + 0x72, 0x1c, 0x89, 0x3d, 0xb0, 0x55, 0x12, 0xbd, 0x71, 0x87, 0x1f, 0x3d, 0x03, + 0xf0, 0x07, 0x3c, 0x52, 0x7d, 0x29, 0x3d, 0xe0, 0x13, 0x55, 0xbc, 0xe0, 0xac, + 0xbb, 0x3c, 0x36, 0x1f, 0x58, 0x3d, 0x34, 0x2f, 0xe3, 0x3c, 0xb5, 0xb7, 0x89, + 0xbc, 0x06, 0xfa, 0x93, 0xbd, 0xe7, 0x2e, 0x20, 0xbc, 0xc8, 0x71, 0x4c, 0x3d, + 0x03, 0x3b, 0xf6, 0xbb, 0x1c, 0xf7, 0x24, 0x3d, 0x88, 0x07, 0x09, 0x3d, 0xa6, + 0x16, 0xde, 0xbc, 0xd4, 0xfa, 0xf5, 0xbc, 0x2e, 0x35, 0x3f, 0x3d, 0x22, 0x36, + 0x5c, 0xbd, 0x99, 0xea, 0x90, 0x3d, 0x7c, 0xfd, 0xe6, 0x3c, 0xda, 0x89, 0x2e, + 0x3d, 0xea, 0x83, 0x39, 0x3c, 0xe2, 0x35, 0x12, 0x3d, 0xa6, 0xee, 0x46, 0x3d, + 0x7b, 0x4e, 0x36, 0xbd, 0x0a, 0x6d, 0xd1, 0x3b, 0x90, 0x59, 0x08, 0xbc, 0x3e, + 0xee, 0x86, 0x3b, 0x18, 0x92, 0x13, 0x3d, 0x71, 0xd5, 0x69, 0x3c, 0x5f, 0xc2, + 0x8d, 0xbd, 0xb0, 0x51, 0x81, 0x3c, 0x5a, 0x81, 0x9e, 0x3c, 0xcf, 0xae, 0x13, + 0x3d, 0xa4, 0x0d, 0x54, 0x3d, 0xb6, 0x82, 0x77, 0x3d, 0x6a, 0x20, 0xf7, 0xbc, + 0x60, 0xcc, 0x56, 0xbd, 0x45, 0x8f, 0x23, 0xbd, 0x92, 0x5c, 0x69, 0xbc, 0x8d, + 0xb5, 0x5d, 0xbd, 0x39, 0x60, 0x29, 0xbc, 0x06, 0x25, 0x6b, 0x3c, 0xad, 0x40, + 0x32, 0xbd, 0xcd, 0xbe, 0xf3, 0xbc, 0x7e, 0xd6, 0x74, 0x3d, 0x2e, 0x72, 0x63, + 0x3d, 0xc3, 0xaa, 0x0c, 0xbd, 0x74, 0xfc, 0x6a, 0xbd, 0xff, 0xa6, 0x7b, 0x3d, + 0xa8, 0x4f, 0xec, 0xbc, 0x8a, 
0x91, 0x39, 0xbd, 0xd1, 0xa4, 0x7b, 0x3d, 0xff, + 0x3a, 0x99, 0x3b, 0xe9, 0xd2, 0x4e, 0xbd, 0xc6, 0x84, 0x1e, 0x3d, 0xe7, 0x73, + 0xdf, 0xbc, 0x88, 0xfb, 0x08, 0x3d, 0xf9, 0x98, 0xa2, 0xbc, 0x41, 0x1d, 0x8d, + 0x3d, 0xe6, 0x32, 0x38, 0x3d, 0x5f, 0xea, 0x1a, 0xbd, 0xce, 0x8f, 0x92, 0xbd, + 0xea, 0x1f, 0x69, 0x3d, 0x5b, 0x6e, 0x58, 0xbc, 0x6d, 0xfc, 0x2d, 0x3d, 0xa9, + 0x01, 0x83, 0x3d, 0xbc, 0xdb, 0x53, 0x3d, 0x70, 0xea, 0x72, 0xbd, 0xa4, 0xc0, + 0xae, 0xbc, 0x80, 0x8a, 0x54, 0x3a, 0x4a, 0x00, 0x80, 0xbc, 0x4a, 0x66, 0x78, + 0xbc, 0xbe, 0x62, 0x79, 0xbd, 0xe8, 0x24, 0x84, 0xbc, 0x0d, 0xef, 0x0f, 0x3d, + 0xa9, 0xa6, 0x26, 0x3d, 0xb8, 0x68, 0x83, 0xbd, 0xe2, 0x7b, 0x27, 0xbd, 0xdc, + 0xda, 0x80, 0xbd, 0x5e, 0x50, 0x88, 0xbd, 0x76, 0x41, 0x8d, 0x3d, 0xee, 0x0a, + 0x95, 0xbc, 0xc4, 0x0b, 0x41, 0x3c, 0x6e, 0x16, 0xe0, 0xbc, 0xb2, 0x34, 0x58, + 0x3d, 0x65, 0xd4, 0x06, 0x3d, 0x8a, 0x8a, 0x18, 0xbd, 0x99, 0xdd, 0x47, 0x3d, + 0x2b, 0xec, 0x00, 0x3d, 0xc3, 0xb1, 0xad, 0xb9, 0xf9, 0x57, 0x77, 0x3c, 0xae, + 0xc6, 0x8a, 0xbd, 0x55, 0x51, 0x43, 0x3d, 0x34, 0xd3, 0x1b, 0xbd, 0xda, 0x9e, + 0x47, 0x3d, 0xe5, 0x3a, 0x1f, 0x3d, 0x6d, 0xf2, 0x59, 0x3d, 0x14, 0x27, 0xb7, + 0xbc, 0xb0, 0x72, 0x8f, 0x3d, 0xbe, 0x91, 0x83, 0xbd, 0xbb, 0x8f, 0x39, 0xbd, + 0x40, 0x7f, 0x7e, 0xbd, 0x2d, 0x3e, 0x86, 0x3b, 0xca, 0x43, 0x29, 0xbc, 0xe2, + 0xb8, 0x4d, 0x3d, 0x48, 0x31, 0x85, 0xbd, 0xcb, 0x54, 0x1b, 0x3d, 0xb4, 0xc8, + 0x56, 0x3d, 0x09, 0x2f, 0x1d, 0x3d, 0xca, 0x8f, 0x10, 0x3d, 0xe1, 0x8d, 0x4c, + 0x3a, 0xdb, 0x4d, 0xd2, 0xbc, 0x4a, 0xc7, 0xd1, 0xbc, 0xc8, 0x03, 0xfa, 0x3c, + 0x4e, 0x3f, 0xa4, 0xbc, 0x5f, 0x9e, 0x90, 0xbd, 0x13, 0x82, 0xc0, 0x3c, 0x59, + 0x55, 0x54, 0x3c, 0xb6, 0x95, 0xa5, 0xbb, 0xef, 0x59, 0xa4, 0x3b, 0x7e, 0x93, + 0x1e, 0xbd, 0xaf, 0x49, 0x81, 0xbc, 0xe7, 0xd1, 0xc6, 0xbb, 0xc0, 0xa3, 0xc9, + 0x3b, 0x53, 0xa9, 0x77, 0xbb, 0xfa, 0x26, 0x74, 0xbc, 0x06, 0x1b, 0x63, 0x3d, + 0xe4, 0x90, 0x0a, 0xbd, 0x64, 0x50, 0x31, 0x3d, 0xff, 0x66, 0x82, 0x3d, 0x9d, + 0x1c, 0x06, 0xbd, 0x38, 0x29, 0x40, 0xbd, 0x6f, 0xea, 0x89, 0x3d, 0xdc, 0x8a, + 0x3f, 0xbd, 0xd1, 0x88, 0x02, 0x3d, 0x2f, 0x23, 0x27, 0x3c, 0x9c, 0x85, 0x56, + 0x3d, 0x41, 0xc7, 0x41, 0xbd, 0x67, 0x51, 0x49, 0x3c, 0x5f, 0x41, 0xf9, 0xbb, + 0x15, 0x37, 0xdb, 0xbc, 0x51, 0x7a, 0xd9, 0x3a, 0x05, 0xc0, 0x90, 0xbd, 0x8f, + 0xdb, 0x84, 0xbd, 0x3a, 0xc1, 0x48, 0xb9, 0x22, 0x3c, 0xfb, 0x3c, 0x7d, 0xf5, + 0x14, 0xbd, 0x26, 0xe6, 0x53, 0xbc, 0xde, 0x94, 0xa0, 0xbc, 0xd9, 0xc4, 0x5e, + 0x3d, 0xd4, 0xcf, 0xa6, 0xba, 0xfa, 0x43, 0x18, 0xbd, 0xee, 0x62, 0x19, 0xbd, + 0xfb, 0x61, 0x66, 0xbb, 0x1e, 0x8b, 0x82, 0xbd, 0x26, 0xec, 0x87, 0xbd, 0xc2, + 0xf6, 0x04, 0x3d, 0x2b, 0x2e, 0xe4, 0xbc, 0x60, 0xa6, 0x4e, 0x3d, 0x21, 0x99, + 0x5c, 0x3d, 0xdd, 0xde, 0x37, 0x3d, 0x8e, 0xfc, 0xf5, 0x3c, 0x6d, 0x33, 0xc2, + 0x39, 0x48, 0xea, 0x34, 0x3d, 0x79, 0x3e, 0x85, 0xbd, 0x20, 0xb1, 0x3d, 0xbb, + 0xdc, 0xe9, 0x64, 0xbc, 0xd2, 0xac, 0x4a, 0xbd, 0x1a, 0x4a, 0x8d, 0xbd, 0xb5, + 0xa2, 0xf3, 0x3c, 0xcd, 0x54, 0xb6, 0xbc, 0xc1, 0x9b, 0x2c, 0x3c, 0xd0, 0xea, + 0xad, 0xbc, 0x3f, 0xbc, 0x7f, 0x3c, 0xde, 0xe3, 0xe9, 0xbc, 0x1e, 0x28, 0x6f, + 0xbc, 0xd1, 0xce, 0xfe, 0xbc, 0xcc, 0x16, 0x21, 0x3d, 0x2a, 0x10, 0x18, 0xbd, + 0x5e, 0x73, 0xe9, 0xbb, 0xb3, 0x67, 0xa1, 0xbb, 0x94, 0x7d, 0x0d, 0x3c, 0x1d, + 0x67, 0x3b, 0xbd, 0xa9, 0xb9, 0x84, 0x3c, 0xe1, 0xc1, 0x89, 0xba, 0x49, 0x7f, + 0x91, 0xbd, 0x47, 0xf8, 0x57, 0xbc, 0x00, 0x6a, 0x24, 0x3d, 0x61, 0x71, 0x6f, + 0x3c, 0xd7, 0x6e, 0x4e, 0xbc, 0x07, 0xda, 0x60, 0xbb, 0x2d, 0xd9, 0x8e, 0x3d, + 0x0d, 0x9d, 0xc5, 0x3b, 0x50, 0x74, 0xe2, 0xbc, 0xaf, 0x90, 
0x2d, 0xbd, 0xce, + 0x93, 0x2a, 0x3d, 0x56, 0xee, 0xee, 0xbc, 0x62, 0x58, 0x0a, 0x3d, 0x25, 0x7c, + 0x64, 0x3d, 0x23, 0x8d, 0x80, 0x3d, 0x3b, 0xfd, 0x55, 0xbd, 0x8f, 0x71, 0xe2, + 0xbc, 0x9c, 0xae, 0x07, 0x3d, 0x0e, 0xe4, 0xdd, 0xbc, 0x93, 0xc9, 0xd7, 0x3c, + 0x87, 0x9c, 0xe5, 0xbb, 0xa3, 0xd5, 0x5d, 0x3d, 0x23, 0xdb, 0x3a, 0xbd, 0x67, + 0xb3, 0x1a, 0x3d, 0x9e, 0xa1, 0x6b, 0x3d, 0x93, 0x17, 0xc2, 0xbc, 0x0c, 0xb7, + 0x33, 0xbd, 0xc0, 0xba, 0xeb, 0xbc, 0x16, 0x2c, 0x4d, 0xbd, 0xed, 0x60, 0x78, + 0x3c, 0x54, 0xa3, 0x93, 0xbd, 0x62, 0xa6, 0x8a, 0xbd, 0xdc, 0x16, 0x25, 0xbd, + 0xa9, 0xaf, 0x76, 0xbd, 0xab, 0x3c, 0x5d, 0xbd, 0xcf, 0x78, 0x9c, 0x3c, 0x74, + 0xf2, 0x97, 0x3c, 0xaa, 0x5d, 0x3b, 0x3d, 0x9c, 0xd2, 0xef, 0x3c, 0xd8, 0x6a, + 0x37, 0x3c, 0x44, 0xd2, 0xb9, 0xbc, 0x41, 0x5d, 0x7e, 0x3d, 0x74, 0x3c, 0x7d, + 0xbd, 0x40, 0x08, 0x0c, 0xbd, 0xbb, 0xc3, 0x04, 0xbd, 0xd7, 0xd3, 0x5d, 0xbd, + 0x41, 0xe7, 0x7c, 0x3d, 0x65, 0x20, 0x6f, 0x3b, 0x4e, 0xef, 0x81, 0x3a, 0xae, + 0xe0, 0x5d, 0xbd, 0x3f, 0xfb, 0x82, 0xbd, 0xf1, 0xc5, 0x58, 0xbd, 0x96, 0xab, + 0x45, 0x3b, 0x97, 0x5f, 0xcd, 0x3b, 0x39, 0x48, 0x5b, 0x3b, 0x6d, 0xf0, 0x28, + 0xbd, 0x08, 0xcc, 0x9f, 0x3c, 0x21, 0xd5, 0x2b, 0xbd, 0xc1, 0xe3, 0x1c, 0x3d, + 0x86, 0x52, 0xb4, 0x3c, 0x02, 0xd4, 0xc6, 0xbc, 0xbe, 0xab, 0x27, 0xbd, 0x18, + 0x8f, 0x84, 0x3c, 0x7d, 0x47, 0x2e, 0x3d, 0x0a, 0x58, 0x9c, 0x3b, 0x52, 0x72, + 0xe4, 0xbc, 0x98, 0x57, 0x5e, 0x3c, 0x24, 0xf1, 0x04, 0xbc, 0x3b, 0xec, 0x0f, + 0xbd, 0xf5, 0x54, 0x13, 0x3d, 0x6f, 0xf9, 0x80, 0x3c, 0x80, 0x19, 0xa2, 0xbc, + 0xfa, 0x89, 0x35, 0x3d, 0xd8, 0x61, 0x82, 0x3c, 0x21, 0x81, 0x8b, 0x3d, 0x40, + 0x2d, 0x65, 0xbc, 0xc6, 0x21, 0x61, 0x3d, 0x51, 0x3d, 0xa9, 0xbc, 0x47, 0x12, + 0x55, 0x3d, 0x7e, 0x85, 0x71, 0xbd, 0x22, 0x14, 0x05, 0x3d, 0x94, 0x35, 0x97, + 0xbd, 0x3c, 0x00, 0x86, 0xbd, 0x3a, 0x46, 0x5f, 0x3d, 0x18, 0x14, 0x06, 0xbd, + 0xb4, 0xea, 0x8c, 0xbd, 0xdc, 0x2e, 0xfe, 0x3b, 0x21, 0x96, 0x3d, 0xbd, 0x3a, + 0xf6, 0x8b, 0xbc, 0x3a, 0x3b, 0x6d, 0xbb, 0x39, 0x87, 0x13, 0x3c, 0x15, 0xbc, + 0x92, 0xbd, 0x24, 0xb7, 0x13, 0x3d, 0x9c, 0x66, 0x7a, 0xbd, 0x6b, 0xf2, 0x41, + 0xbd, 0x1d, 0x15, 0x6a, 0xbc, 0x20, 0x2a, 0x73, 0x3d, 0x25, 0x95, 0x40, 0x3d, + 0x23, 0x8f, 0x90, 0xbd, 0xd6, 0x95, 0xa7, 0xbc, 0xbe, 0xce, 0x4f, 0x3d, 0xaf, + 0xe0, 0x3f, 0x3d, 0x1b, 0x9f, 0x47, 0x3c, 0x57, 0x37, 0x14, 0x3d, 0x33, 0x06, + 0x86, 0x3d, 0xe5, 0x3c, 0x77, 0x3d, 0x60, 0x46, 0x95, 0x3b, 0xee, 0xd2, 0x97, + 0xbc, 0x38, 0x20, 0x9c, 0x3c, 0xe6, 0x90, 0xdf, 0xba, 0x77, 0x4f, 0x30, 0x3d, + 0x54, 0x87, 0x03, 0x3d, 0x86, 0x7c, 0x25, 0x3d, 0xdb, 0x5a, 0x18, 0x3d, 0x60, + 0x84, 0xf9, 0xbc, 0x84, 0x3c, 0xd0, 0xbc, 0xe9, 0x8c, 0x87, 0xbb, 0x39, 0xb9, + 0x81, 0x3d, 0x2e, 0x3e, 0x67, 0x3d, 0x5d, 0x57, 0xf8, 0xba, 0x60, 0x31, 0x38, + 0x3c, 0xf4, 0x31, 0x02, 0xbd, 0x31, 0x10, 0x98, 0x3c, 0x85, 0x28, 0x16, 0x3d, + 0xc5, 0xcd, 0xef, 0x3c, 0x92, 0x8d, 0x59, 0x3d, 0x6a, 0x54, 0x27, 0xbc, 0x72, + 0x4a, 0xf7, 0xbc, 0x0d, 0x8d, 0x81, 0x3d, 0xbd, 0x74, 0x8f, 0xbd, 0x80, 0xed, + 0x5c, 0x3b, 0xbe, 0x52, 0x7e, 0x3d, 0x49, 0x3f, 0x28, 0xbd, 0xcc, 0xc5, 0xea, + 0xbc, 0x2f, 0x46, 0x6b, 0xbd, 0x05, 0xd4, 0x0c, 0xbc, 0x41, 0x09, 0x02, 0x3d, + 0x2e, 0xa8, 0x53, 0xbc, 0xc7, 0x56, 0x56, 0xbd, 0xc2, 0x01, 0x88, 0xbd, 0x7a, + 0x9c, 0x6f, 0x3d, 0x3c, 0x49, 0x1c, 0x3d, 0x2b, 0x80, 0xe3, 0x3b, 0x43, 0x27, + 0x7d, 0x3d, 0x91, 0xa0, 0x58, 0x3d, 0xdb, 0x70, 0x76, 0xbc, 0xc4, 0xfa, 0x04, + 0xbd, 0x5e, 0x76, 0xcc, 0x3b, 0x0a, 0xcf, 0xc0, 0xbc, 0xfa, 0x3f, 0x08, 0xbd, + 0x26, 0x65, 0xaa, 0x3c, 0x2f, 0xec, 0x37, 0x3d, 0xa0, 0xae, 0x51, 0x3d, 0xbd, + 0x0e, 0x4e, 
0x3d, 0x4d, 0x36, 0xae, 0xbc, 0xf1, 0xc8, 0x3f, 0xbd, 0x79, 0xe5, + 0x84, 0xbc, 0xac, 0x19, 0xf7, 0x3b, 0x5f, 0x52, 0x70, 0xbd, 0x46, 0x15, 0x01, + 0xbd, 0x17, 0xb1, 0xb1, 0x3c, 0x2e, 0x19, 0x87, 0xbd, 0x0c, 0xe6, 0x98, 0x3c, + 0x35, 0xd0, 0x22, 0xbd, 0xe3, 0x8f, 0x8a, 0xbd, 0x23, 0x8b, 0xfa, 0x3c, 0x01, + 0x67, 0x80, 0x3d, 0x6c, 0x9e, 0xb2, 0x3a, 0x6b, 0xbe, 0x8b, 0x3d, 0x74, 0x68, + 0xdb, 0x3c, 0x4c, 0x13, 0xae, 0xbc, 0x94, 0xfe, 0x50, 0xbd, 0xdc, 0x7e, 0x2f, + 0x3d, 0x78, 0x0a, 0x6e, 0xbc, 0x0e, 0x2b, 0xe9, 0xbc, 0x3b, 0x4b, 0x08, 0x3d, + 0x4d, 0x1a, 0x3d, 0xbd, 0x55, 0x7e, 0x51, 0xbb, 0x15, 0xa6, 0xb4, 0xbc, 0xac, + 0x1b, 0x86, 0xbb, 0x8a, 0x27, 0x22, 0x3d, 0x39, 0xc8, 0x34, 0xbc, 0x65, 0x0e, + 0x1a, 0xbb, 0x4c, 0x08, 0xdb, 0x3b, 0x60, 0x75, 0x2d, 0xbc, 0x25, 0xba, 0x64, + 0xbc, 0x8c, 0x05, 0x70, 0x3d, 0x0e, 0xdc, 0xaa, 0xbc, 0x63, 0x17, 0x03, 0x3d, + 0x03, 0x9d, 0x36, 0x3c, 0xe3, 0xf5, 0x6e, 0x3d, 0x01, 0xf8, 0x12, 0xbd, 0x15, + 0x62, 0xb3, 0x3c, 0xe1, 0x20, 0x1f, 0x3d, 0xbd, 0x41, 0x8d, 0x3d, 0x7b, 0x02, + 0x47, 0x3d, 0x8e, 0x9c, 0x93, 0xbc, 0x82, 0xa1, 0x81, 0xbd, 0xb9, 0x59, 0x6e, + 0x3c, 0xc6, 0x93, 0x07, 0xbd, 0x4c, 0x87, 0x44, 0x3d, 0x6a, 0x66, 0x49, 0xbd, + 0x80, 0xd5, 0x4b, 0xbb, 0x70, 0xd5, 0x09, 0x3c, 0x20, 0x85, 0x06, 0x3c, 0x7e, + 0xd6, 0x42, 0x3d, 0x5d, 0x10, 0x01, 0x3c, 0x71, 0xbe, 0x6c, 0xbc, 0xcc, 0xba, + 0x2d, 0xbd, 0xbf, 0xf6, 0x90, 0xbd, 0x59, 0xb8, 0x8c, 0x3d, 0x4a, 0xe8, 0x87, + 0xbc, 0xee, 0xd3, 0xd1, 0x3c, 0xde, 0xdd, 0xa6, 0xbb, 0x26, 0x06, 0x6a, 0xbc, + 0x1f, 0xa2, 0x88, 0xbd, 0x00, 0x6c, 0x24, 0xbb, 0x36, 0xf0, 0x00, 0x3c, 0x1e, + 0x54, 0x86, 0xbb, 0x55, 0x5e, 0x01, 0xbc, 0x3e, 0x0e, 0xe8, 0x3c, 0xbd, 0x02, + 0x70, 0xbb, 0x8e, 0xb9, 0x85, 0x3d, 0x8e, 0x8a, 0x5d, 0xbb, 0xa4, 0x21, 0x13, + 0x3d, 0xd1, 0x77, 0x16, 0xbc, 0x40, 0x95, 0x1d, 0x3c, 0x58, 0x2f, 0xbb, 0x3c, + 0xf5, 0x88, 0x86, 0xbb, 0xa0, 0x02, 0x83, 0xbd, 0x93, 0xb8, 0x0a, 0x3c, 0xfd, + 0x65, 0xe2, 0xbb, 0x24, 0x21, 0x11, 0x3d, 0xc6, 0x89, 0x8c, 0xbd, 0xc3, 0xa9, + 0x7a, 0xbd, 0x43, 0xcf, 0x81, 0xbd, 0xde, 0x81, 0x58, 0xbd, 0x3d, 0x35, 0x23, + 0x3d, 0xbe, 0x81, 0x90, 0xbd, 0xd3, 0xd2, 0xbb, 0x3c, 0x60, 0x68, 0xe5, 0xbc, + 0x25, 0x64, 0xa8, 0xbb, 0x8e, 0x5e, 0x4e, 0xbd, 0xc3, 0xa4, 0xd3, 0xbc, 0xb0, + 0x99, 0xf7, 0xbc, 0x2d, 0x56, 0x17, 0xbd, 0x44, 0x65, 0x2b, 0x3d, 0xa7, 0x80, + 0x05, 0xbd, 0xfc, 0xe1, 0x02, 0x3d, 0x65, 0xa7, 0x68, 0x3d, 0x52, 0x5d, 0x8b, + 0xbd, 0x6a, 0x9e, 0x83, 0xbd, 0xd4, 0xac, 0x1a, 0xbc, 0x3e, 0x6b, 0x7d, 0xbc, + 0xeb, 0xff, 0x40, 0xbd, 0xcd, 0xd2, 0x21, 0x3d, 0x7e, 0xf1, 0x70, 0xbd, 0x9b, + 0xc6, 0x6a, 0xbb, 0x1e, 0xb9, 0x20, 0x3d, 0xfd, 0x9b, 0x61, 0xbd, 0x57, 0xf3, + 0x5a, 0xbd, 0x5d, 0xbe, 0xbb, 0x3b, 0xd3, 0xc8, 0x50, 0xbd, 0x38, 0x8a, 0x5e, + 0xbd, 0x86, 0x65, 0x57, 0x3d, 0x02, 0xc7, 0x85, 0xbd, 0x95, 0x0a, 0x80, 0x3d, + 0x08, 0xcd, 0x66, 0x3c, 0x68, 0x38, 0x3d, 0x3c, 0xad, 0x64, 0x12, 0xbd, 0x20, + 0x0d, 0xcc, 0x3c, 0x63, 0x2c, 0x3f, 0x3d, 0xf6, 0xe1, 0xdc, 0x3c, 0x5f, 0xa6, + 0x35, 0x3d, 0x7b, 0xf6, 0x68, 0xbd, 0x9e, 0x65, 0xd2, 0x3c, 0x13, 0x63, 0x9d, + 0xbb, 0xd6, 0x42, 0x51, 0xbc, 0xa2, 0xc5, 0x52, 0xbc, 0x6a, 0x3d, 0x3f, 0x3d, + 0xa6, 0xde, 0xf8, 0xbc, 0x01, 0xa1, 0x5b, 0x3d, 0x8d, 0xdf, 0x16, 0xbd, 0x62, + 0x4d, 0x35, 0xba, 0x22, 0xca, 0x30, 0xbd, 0x50, 0x22, 0x72, 0xbc, 0xf1, 0xaa, + 0x96, 0xbd, 0x52, 0xf4, 0xd9, 0x3c, 0x08, 0x89, 0x6d, 0x3d, 0x90, 0x97, 0xa9, + 0x3c, 0x20, 0x9d, 0x0b, 0x3c, 0x47, 0x97, 0xf5, 0xbc, 0x7f, 0xc1, 0x3c, 0x3d, + 0x77, 0xa7, 0xeb, 0x3b, 0xe2, 0x0c, 0x77, 0x3d, 0xca, 0x57, 0x3e, 0x3d, 0x16, + 0x46, 0x38, 0xbd, 0x15, 0xde, 0x87, 0x3d, 
0x10, 0x09, 0x0a, 0xbd, 0xa0, 0xfa, + 0x56, 0x3b, 0xba, 0x6c, 0x2f, 0x3d, 0x0f, 0xb9, 0x70, 0x3c, 0x35, 0xb8, 0x8c, + 0xbd, 0x88, 0xad, 0xc5, 0xbc, 0xb2, 0x0b, 0x40, 0xbd, 0x63, 0x62, 0x80, 0xbd, + 0xb4, 0xd9, 0x78, 0x3c, 0x91, 0x49, 0x8a, 0xbd, 0x59, 0x3c, 0x47, 0x3d, 0xb1, + 0xb7, 0x3a, 0xbd, 0x0f, 0x07, 0xea, 0x3b, 0xca, 0x89, 0x50, 0xbd, 0xf6, 0x2c, + 0x27, 0xbd, 0x3f, 0xf7, 0x37, 0x3c, 0x1c, 0x12, 0x23, 0x3c, 0x6d, 0x88, 0x97, + 0xbd, 0x06, 0x09, 0x66, 0x3d, 0x40, 0xac, 0x80, 0xbc, 0xac, 0xea, 0x7c, 0xbd, + 0x7e, 0xfb, 0x1a, 0x3d, 0x11, 0xd1, 0x65, 0x3d, 0x56, 0x13, 0xee, 0xbc, 0xa5, + 0xe1, 0x69, 0xbd, 0x47, 0xff, 0x45, 0xbc, 0x20, 0xba, 0x2e, 0xbd, 0xff, 0x15, + 0x48, 0xbc, 0x01, 0xd5, 0x8f, 0x3d, 0x42, 0x0f, 0x37, 0x3c, 0x68, 0xbc, 0xcc, + 0x3c, 0xf4, 0x1e, 0x39, 0xbd, 0x00, 0x6c, 0x07, 0xb9, 0xe4, 0x6e, 0xb2, 0x3c, + 0x9b, 0x53, 0x88, 0xbd, 0x20, 0xf2, 0xef, 0xbc, 0xd3, 0xf3, 0x8e, 0x3d, 0xbc, + 0xe9, 0xa6, 0xbc, 0xa3, 0xb6, 0x6b, 0xbc, 0x73, 0xeb, 0xdd, 0xbc, 0xdf, 0xa3, + 0x04, 0xbd, 0x1a, 0x9f, 0x21, 0x3c, 0x1d, 0xb7, 0x89, 0xbb, 0x28, 0x66, 0x85, + 0xbc, 0xf9, 0x7f, 0x95, 0xbd, 0x4c, 0x07, 0xfa, 0xbc, 0x52, 0x7d, 0x29, 0x3d, + 0x66, 0x78, 0x24, 0xbc, 0xd4, 0x70, 0xfa, 0xbc, 0x20, 0xdb, 0x02, 0xbd, 0x51, + 0x27, 0x09, 0xbd, 0xb6, 0xb6, 0x42, 0x3d, 0x37, 0xa4, 0x3f, 0xbd, 0xfc, 0x30, + 0xb2, 0xbb, 0x2b, 0xa7, 0xb7, 0x3c, 0x77, 0xf6, 0x2e, 0x3d, 0x4e, 0x18, 0x6c, + 0x3d, 0xb0, 0xb9, 0xe4, 0x3c, 0xa6, 0xce, 0x89, 0xbd, 0x18, 0x9a, 0xc2, 0x3c, + 0x8d, 0xdc, 0x51, 0xbd, 0x50, 0x09, 0x0a, 0x3d, 0xd8, 0x90, 0x6c, 0xbc, 0x28, + 0x48, 0x96, 0xbc, 0x50, 0x5f, 0x62, 0xbc, 0x8b, 0xbc, 0x82, 0xbd, 0xb0, 0x24, + 0xce, 0x3b, 0x54, 0xb0, 0x4b, 0x3c, 0xd8, 0x02, 0x59, 0x3c, 0x0b, 0x7d, 0xa0, + 0x3c, 0x2a, 0x6f, 0xfa, 0xbc, 0x51, 0xf4, 0x0a, 0xbd, 0xe5, 0xdd, 0x45, 0x3d, + 0x69, 0xcb, 0x5f, 0x3d, 0x59, 0xee, 0x1b, 0x3d, 0x15, 0x0c, 0x6d, 0x3d, 0xb4, + 0xe8, 0x3a, 0x3c, 0xd6, 0x4c, 0x71, 0x3d, 0x2c, 0x6c, 0x5f, 0xbc, 0x23, 0xc7, + 0x96, 0x3c, 0x90, 0xfd, 0xef, 0xb9, 0x80, 0x9a, 0xce, 0xbc, 0xc8, 0xa7, 0xfa, + 0xbc, 0x3f, 0x84, 0x4d, 0xbc, 0xb9, 0x1e, 0x63, 0x3d, 0x91, 0xff, 0x16, 0xbd, + 0xe4, 0x6d, 0x65, 0xbc, 0xbb, 0x19, 0x69, 0xbc, 0xf0, 0xba, 0xfe, 0xbc, 0xbb, + 0xe6, 0x30, 0x3d, 0x12, 0x3a, 0x4d, 0x3d, 0x08, 0xa7, 0x79, 0x3d, 0x37, 0x6c, + 0x88, 0x3d, 0xb4, 0x66, 0xf1, 0xba, 0xb8, 0x48, 0xcc, 0xbc, 0x61, 0xb9, 0x1d, + 0xbd, 0x8a, 0x51, 0x45, 0xbd, 0x2e, 0x8a, 0x59, 0x3d, 0x88, 0xe0, 0x7d, 0xbd, + 0x53, 0xc6, 0x8e, 0xbd, 0x0e, 0x7b, 0x5a, 0x3d, 0x13, 0xc2, 0xcb, 0xbc, 0x57, + 0xcd, 0x8b, 0xbd, 0x60, 0x8c, 0x4e, 0xbd, 0xe2, 0x03, 0x07, 0x3d, 0x5f, 0x0d, + 0x80, 0x3c, 0x5f, 0xc8, 0x3d, 0x3d, 0x89, 0x06, 0xc8, 0x3c, 0x17, 0x2b, 0x88, + 0x3d, 0xf6, 0x31, 0x63, 0x3d, 0x51, 0x2b, 0x60, 0xbd, 0xc9, 0x26, 0x67, 0xbd, + 0x02, 0x8e, 0x4f, 0xbd, 0xbd, 0x67, 0x20, 0x3d, 0x53, 0xfa, 0x64, 0xbb, 0x27, + 0x16, 0x28, 0xbd, 0x45, 0x52, 0xfb, 0xbb, 0x66, 0x53, 0x8d, 0x3c, 0x0c, 0x18, + 0x74, 0xbc, 0x60, 0x98, 0x19, 0x3d, 0xd2, 0x7c, 0x3c, 0x3d, 0x77, 0x65, 0x90, + 0xbc, 0x69, 0x1e, 0x3e, 0xbd, 0x04, 0x22, 0x7f, 0xbc, 0x7c, 0x5d, 0x2c, 0xbc, + 0x51, 0xb3, 0x1f, 0xbc, 0xc4, 0xaf, 0xbf, 0xbc, 0xa8, 0xc5, 0x59, 0x3c, 0xfe, + 0x08, 0x62, 0x3d, 0x7c, 0x3a, 0x56, 0x3d, 0x4a, 0xaf, 0x38, 0x3d, 0xd9, 0x9e, + 0x26, 0xbd, 0x48, 0xc2, 0x16, 0xbc, 0x6e, 0xcc, 0xec, 0xbc, 0x05, 0x78, 0x0e, + 0xbc, 0xd2, 0x5c, 0x51, 0xbd, 0x44, 0x63, 0x6b, 0x3d, 0x7c, 0xfd, 0xca, 0xbb, + 0x62, 0xda, 0x30, 0x3c, 0xc4, 0xcc, 0x61, 0x3d, 0xdc, 0xa6, 0x34, 0xbd, 0xff, + 0x8f, 0x24, 0xbc, 0x68, 0x37, 0xf6, 0xbc, 0xd1, 0x4d, 0x25, 0xbd, 0x33, 
0x6e, + 0x91, 0x3c, 0x60, 0x57, 0x6b, 0x3d, 0x04, 0xf7, 0x34, 0xbd, 0x90, 0xe7, 0x30, + 0x3d, 0x8e, 0x22, 0x65, 0xbd, 0x62, 0xcf, 0xb6, 0x3c, 0xce, 0x5d, 0x9f, 0x3c, + 0xa0, 0x0a, 0x43, 0xbd, 0x1e, 0x7b, 0x56, 0xbd, 0x1f, 0x6a, 0x93, 0xbd, 0x60, + 0x5e, 0x39, 0x3d, 0x4d, 0x17, 0x8e, 0xbd, 0x28, 0x00, 0xad, 0x3c, 0x79, 0xd0, + 0xab, 0xbb, 0x15, 0xf3, 0x1a, 0xbd, 0x28, 0x13, 0x05, 0x3c, 0x90, 0x55, 0x20, + 0x3d, 0x98, 0x9b, 0xc4, 0x3c, 0x32, 0x5f, 0x86, 0xbd, 0x6d, 0xf8, 0x52, 0xbd, + 0xcc, 0x28, 0xae, 0x3c, 0x96, 0xc7, 0x81, 0x3d, 0x04, 0x2e, 0x5b, 0xbc, 0xdd, + 0xce, 0xb2, 0x3c, 0x14, 0x5d, 0x67, 0x3d, 0x74, 0xe8, 0x77, 0x3d, 0x2e, 0xf5, + 0x51, 0x3d, 0x21, 0x78, 0x7a, 0xbd, 0x62, 0xea, 0x6a, 0xbd, 0x36, 0x1c, 0xf4, + 0xbc, 0xd0, 0x98, 0xda, 0x3b, 0x26, 0x14, 0x8a, 0xbd, 0xf2, 0xa4, 0x67, 0xbd, + 0xb2, 0xa7, 0x39, 0xbd, 0x93, 0xa6, 0xd6, 0x3c, 0xe1, 0xa9, 0xe4, 0x3b, 0x49, + 0xca, 0x3f, 0x3d, 0x07, 0xe3, 0x64, 0x3d, 0x1e, 0xf5, 0x4d, 0xbd, 0x4e, 0xc3, + 0x8a, 0xbd, 0x88, 0xf9, 0xf8, 0x3c, 0xc6, 0x2a, 0xba, 0xbc, 0x56, 0xd7, 0xb1, + 0xbc, 0xbd, 0xff, 0x10, 0x3c, 0xfe, 0x3d, 0x16, 0xbd, 0x88, 0xdd, 0x5f, 0x3c, + 0x66, 0xd4, 0x50, 0xbd, 0xe2, 0x59, 0x62, 0x3d, 0x1c, 0xdf, 0xac, 0x3c, 0xc2, + 0x72, 0xb7, 0xbc, 0xe2, 0x19, 0x4d, 0xbd, 0xc1, 0xbb, 0xa1, 0x3c, 0xf2, 0x8f, + 0x24, 0x3d, 0x2f, 0xb1, 0xeb, 0xbc, 0xa7, 0xe6, 0x13, 0xbd, 0x4c, 0x51, 0x7c, + 0xbd, 0x23, 0x87, 0x3e, 0xbd, 0x65, 0x03, 0x86, 0x3b, 0x5d, 0x13, 0x15, 0x3d, + 0x44, 0x77, 0x96, 0xba, 0xe9, 0x74, 0x0a, 0x3d, 0xb4, 0xd0, 0x59, 0xbd, 0x4c, + 0x9a, 0x22, 0x3d, 0x82, 0x1b, 0x85, 0x3d, 0x09, 0x1e, 0xf9, 0x3c, 0x20, 0xcf, + 0x97, 0xbd, 0xf9, 0x46, 0x0e, 0xbd, 0xba, 0x0d, 0x82, 0x3d, 0xf6, 0xf1, 0xd7, + 0x3c, 0x8e, 0x08, 0xf8, 0xbc, 0x4d, 0xbf, 0x22, 0xbd, 0xd0, 0x25, 0x8a, 0x3c, + 0xa8, 0x71, 0x2e, 0xbd, 0xd9, 0xaa, 0x24, 0x3a, 0x48, 0x85, 0x6c, 0xbd, 0x90, + 0x0e, 0x8c, 0x3c, 0x3c, 0x45, 0x50, 0x3d, 0x71, 0xab, 0x65, 0x3d, 0x60, 0x38, + 0xdb, 0x3b, 0x9b, 0x94, 0x81, 0xbd, 0xc0, 0xaa, 0xb3, 0xbc, 0xc8, 0x46, 0x93, + 0xbc, 0x3a, 0x19, 0xea, 0xbc, 0x16, 0xab, 0x36, 0xbc, 0x20, 0x52, 0x74, 0xbd, + 0xbd, 0x3b, 0x75, 0x3d, 0xea, 0xef, 0xc3, 0xbc, 0x54, 0xbe, 0x26, 0xbd, 0x88, + 0x03, 0x6c, 0x3d, 0xa0, 0x3e, 0x4a, 0x3d, 0x46, 0x60, 0x0a, 0x3d, 0xf9, 0x88, + 0x59, 0x3d, 0xa2, 0x8a, 0x87, 0xbd, 0xde, 0x60, 0x48, 0x3d, 0xc6, 0x87, 0x60, + 0x3d, 0x05, 0x18, 0x3d, 0xbc, 0xa8, 0x15, 0x01, 0x3d, 0x68, 0x46, 0x41, 0xbd, + 0x7f, 0x8e, 0x58, 0x3d, 0xc6, 0xa4, 0xf6, 0x3c, 0x22, 0xbc, 0x73, 0x3d, 0xe8, + 0x2d, 0x83, 0x3c, 0x97, 0x7f, 0x8b, 0xbb, 0xe6, 0x83, 0x81, 0xbc, 0x42, 0x79, + 0x5b, 0x3d, 0x62, 0xfb, 0xd4, 0x3b, 0xf3, 0x51, 0x06, 0xbd, 0xb0, 0x65, 0x79, + 0x3d, 0xbc, 0x83, 0xdc, 0x3c, 0xbe, 0xbd, 0x8c, 0x3d, 0x64, 0xdf, 0x13, 0x3d, + 0x1f, 0xa8, 0x44, 0xbd, 0x1e, 0x7f, 0x87, 0xbc, 0x15, 0x05, 0x6c, 0xbd, 0x43, + 0x6b, 0x75, 0xbd, 0x38, 0x5a, 0x64, 0x3d, 0xb8, 0x35, 0x2c, 0x3c, 0x93, 0x41, + 0xd5, 0xb9, 0xf4, 0x66, 0x79, 0xbc, 0xd9, 0xda, 0xae, 0xbc, 0xd6, 0x82, 0xd4, + 0x3b, 0x48, 0x9e, 0x3e, 0xbd, 0x0c, 0x2c, 0xb7, 0xbc, 0xba, 0x9c, 0x2f, 0xbd, + 0x9c, 0x53, 0x4f, 0x3d, 0xf5, 0x5f, 0xe6, 0x3c, 0x60, 0x8e, 0x1f, 0x3b, 0xa6, + 0x27, 0x4a, 0xbd, 0xe5, 0x82, 0x9b, 0x3c, 0xb7, 0xe1, 0x84, 0x3d, 0x13, 0x34, + 0x34, 0xbc, 0x58, 0xca, 0x09, 0x3d, 0xe2, 0x9f, 0x70, 0x3d, 0x7b, 0x73, 0xa1, + 0xbc, 0xdb, 0x26, 0x08, 0xbd, 0xc0, 0x46, 0xce, 0xba, 0xfc, 0xde, 0xe1, 0x3c, + 0xf5, 0xd5, 0xbc, 0x3c, 0x03, 0x9b, 0x16, 0x3d, 0x61, 0xda, 0x16, 0xbd, 0x9c, + 0x34, 0x15, 0xbd, 0x6c, 0xae, 0x50, 0xbd, 0xc0, 0x47, 0x89, 0xbd, 0xf0, 0xff, + 0x52, 0x3d, 0xa2, 0xf2, 
0x01, 0x3d, 0x7c, 0x68, 0x1a, 0x3d, 0x70, 0x77, 0x58, + 0xbd, 0x62, 0xb8, 0xb3, 0x3c, 0xd8, 0x2e, 0x07, 0xbc, 0xe6, 0x32, 0x8b, 0x3d, + 0x6b, 0xa2, 0x53, 0x3d, 0x12, 0xfa, 0x55, 0xbd, 0x7d, 0x83, 0x28, 0x3d, 0x92, + 0xa8, 0x73, 0xbd, 0xd5, 0xd5, 0x9c, 0x3c, 0xe5, 0x93, 0x83, 0x3c, 0xf9, 0xc8, + 0xb3, 0xbc, 0xfb, 0x27, 0x78, 0xbd, 0xa6, 0x7d, 0x5b, 0x3d, 0x9c, 0x51, 0x4d, + 0x3d, 0x25, 0x60, 0x4b, 0x3d, 0xba, 0x91, 0x96, 0xb9, 0xd7, 0xaf, 0xc3, 0x3c, + 0x34, 0x25, 0x3c, 0x3d, 0x3a, 0x04, 0x3a, 0x3d, 0x86, 0xb2, 0x30, 0x3c, 0x90, + 0xcf, 0x46, 0x3d, 0x96, 0xee, 0xe2, 0xbc, 0x9c, 0x30, 0xa7, 0x3c, 0x56, 0xe3, + 0x5a, 0xbd, 0x2f, 0xb6, 0x23, 0x3d, 0xda, 0x3e, 0x3c, 0xbd, 0x6e, 0xa0, 0x5c, + 0x3d, 0x28, 0xe0, 0x6e, 0xbd, 0x1a, 0x52, 0x34, 0x3d, 0xb8, 0xcd, 0x27, 0xbc, + 0x4a, 0xb4, 0x22, 0x3d, 0x1c, 0xd7, 0x64, 0xbc, 0x8f, 0xd9, 0x1d, 0xbd, 0xa2, + 0x1e, 0x17, 0x3d, 0x78, 0xed, 0xe2, 0x3c, 0x82, 0x5e, 0x0d, 0x3c, 0x93, 0x9d, + 0x58, 0xbd, 0x35, 0x43, 0x8a, 0xbd, 0xbd, 0xa6, 0xdf, 0x3c, 0x11, 0xc3, 0x3b, + 0x3d, 0x6c, 0xad, 0x58, 0xbd, 0x2e, 0x39, 0x1f, 0x3d, 0x45, 0x7d, 0x00, 0x3a, + 0xa9, 0xb2, 0x5b, 0x3d, 0x00, 0x38, 0x81, 0x38, 0xaa, 0x9f, 0xc9, 0x3a, 0xaa, + 0x79, 0x73, 0xbd, 0x39, 0x7b, 0xf7, 0x3b, 0xc4, 0x9f, 0x4e, 0xbd, 0xa1, 0x0c, + 0x64, 0x3a, 0x9b, 0x06, 0x5f, 0xbd, 0x32, 0x21, 0x6d, 0xbd, 0xbe, 0x94, 0x4e, + 0x3d, 0x7c, 0x40, 0xf9, 0x3c, 0xc8, 0xac, 0xca, 0x3c, 0x30, 0x76, 0x50, 0xbd, + 0x08, 0x66, 0x93, 0xbd, 0x0b, 0x4c, 0xb9, 0x3c, 0x8e, 0xef, 0x26, 0x3d, 0xe3, + 0x00, 0x68, 0x3d, 0x51, 0x3a, 0x84, 0xbd, 0x54, 0xac, 0xb3, 0xbc, 0x95, 0x17, + 0x91, 0xbd, 0x04, 0xf2, 0x31, 0x3d, 0x48, 0xbb, 0x20, 0x3c, 0xf3, 0x82, 0x88, + 0xbd, 0xdd, 0x5e, 0x4e, 0xbd, 0x95, 0x9e, 0x45, 0xbd, 0x62, 0xce, 0x51, 0xbd, + 0xa3, 0x8b, 0x3b, 0x3d, 0x40, 0xdb, 0x85, 0x3d, 0x33, 0xdc, 0xc1, 0xbc, 0xa7, + 0xb6, 0x7d, 0xbd, 0xd3, 0x99, 0x40, 0xbc, 0x6b, 0x63, 0x18, 0x3d, 0x73, 0x2f, + 0x63, 0xbc, 0xf8, 0xa2, 0x4a, 0xbc, 0xa5, 0x0b, 0x76, 0x3d, 0xd5, 0x88, 0x79, + 0x3d, 0x97, 0x41, 0x98, 0x3c, 0xe8, 0x20, 0x16, 0x3d, 0xcc, 0x47, 0x78, 0xbd, + 0xfd, 0x9a, 0xae, 0x3c, 0xf2, 0xe2, 0x8a, 0xbd, 0x07, 0xd1, 0x19, 0x3d, 0xd4, + 0xef, 0x68, 0xbc, 0x82, 0x5d, 0x51, 0x3d, 0x0c, 0x61, 0xc8, 0xba, 0xc1, 0xd5, + 0x36, 0xbd, 0xf2, 0x3c, 0x1d, 0x3d, 0x86, 0xdf, 0x65, 0x3d, 0x04, 0x4c, 0x87, + 0x3d, 0xe9, 0x46, 0x91, 0x3d, 0xc0, 0x63, 0x33, 0xbc, 0x7c, 0xd0, 0xbf, 0x3c, + 0xe8, 0xfe, 0x55, 0xbd, 0x18, 0x50, 0x53, 0x3c, 0x51, 0x99, 0xb0, 0xbb, 0x50, + 0x90, 0xec, 0x3b, 0x3d, 0x3a, 0x69, 0xbd, 0x6e, 0x49, 0x09, 0xbc, 0x74, 0x12, + 0xde, 0xbc, 0xad, 0x0c, 0x87, 0x3c, 0x35, 0x8f, 0x41, 0x3d, 0x5e, 0xa8, 0x3b, + 0xbd, 0x28, 0x85, 0x61, 0x3d, 0xfe, 0xb2, 0xe1, 0x3b, 0xec, 0xbb, 0x0e, 0x3d, + 0x04, 0xe3, 0x05, 0x3d, 0x10, 0xeb, 0x07, 0xbd, 0x63, 0x3a, 0x68, 0x3d, 0x55, + 0x9c, 0x49, 0x3b, 0x58, 0xdc, 0x62, 0x3d, 0x33, 0x78, 0x03, 0x3d, 0x0f, 0xc8, + 0x7a, 0xbd, 0xa3, 0x94, 0x83, 0xbd, 0xf7, 0x86, 0x5d, 0xbd, 0xcb, 0xd6, 0x82, + 0x3d, 0xcb, 0x78, 0x82, 0xbd, 0xcb, 0x8b, 0x46, 0xbc, 0x44, 0xff, 0x75, 0xbd, + 0x63, 0xc6, 0x48, 0x3d, 0x50, 0x1b, 0x14, 0xbc, 0x57, 0xd1, 0xe1, 0x3c, 0x60, + 0xa8, 0xe2, 0x3c, 0x00, 0xa0, 0xf8, 0xb9, 0x9c, 0x9f, 0x24, 0x3d, 0x10, 0x2c, + 0x4a, 0x3c, 0x90, 0xdf, 0xbc, 0xbc, 0x9e, 0xae, 0xa4, 0xbc, 0xf7, 0x31, 0x66, + 0xbd, 0x1e, 0x83, 0x14, 0x3c, 0x9b, 0xaa, 0x91, 0x3b, 0x91, 0x24, 0x11, 0xbd, + 0x54, 0x0b, 0x90, 0x3b, 0x30, 0xa4, 0x64, 0x3d, 0x69, 0xa8, 0x81, 0x3d, 0x5e, + 0x35, 0x03, 0xbb, 0xcc, 0xce, 0xa6, 0x3c, 0x2f, 0x18, 0xfd, 0xbc, 0x50, 0x81, + 0xe2, 0xbb, 0x40, 0x4b, 0x16, 0x3d, 0xc0, 0x66, 0x63, 
0xbd, 0x5f, 0xcd, 0x9b, + 0xbc, 0x2f, 0xf8, 0x25, 0xbd, 0xa0, 0x4d, 0x7a, 0x3c, 0x81, 0x0c, 0x5a, 0xbd, + 0x54, 0xa9, 0x6a, 0x3d, 0xc0, 0x3b, 0x3c, 0xbd, 0xb4, 0x63, 0xfb, 0x3c, 0x26, + 0x9c, 0x11, 0x3d, 0x06, 0xea, 0xa3, 0xbc, 0x3f, 0x44, 0x92, 0xbc, 0x00, 0x88, + 0x6f, 0x3b, 0xd8, 0x6f, 0x36, 0xbd, 0xe0, 0xad, 0x89, 0x3d, 0x52, 0xfb, 0x72, + 0x3d, 0x64, 0x05, 0x64, 0xbc, 0xd7, 0x2a, 0x57, 0xbd, 0x02, 0x49, 0xad, 0xbc, + 0x38, 0xf1, 0x2d, 0xbd, 0x8a, 0x2e, 0x8b, 0x3d, 0x39, 0x44, 0x12, 0xbd, 0xfc, + 0xa0, 0xb8, 0xbc, 0x32, 0x17, 0x8a, 0xbd, 0x7e, 0xbf, 0x6b, 0x3d, 0x32, 0x76, + 0xad, 0xbc, 0xb0, 0x21, 0x58, 0x3d, 0x62, 0xf5, 0x59, 0x3d, 0xb3, 0x5f, 0x98, + 0x3c, 0xa4, 0x02, 0x2c, 0x3b, 0x59, 0x69, 0x97, 0xbd, 0x70, 0xcf, 0x91, 0x3b, + 0x6b, 0xc3, 0x47, 0xbd, 0x10, 0xfe, 0xd4, 0xbc, 0x08, 0x93, 0xd1, 0x3b, 0xf5, + 0xe9, 0x14, 0xbd, 0x9a, 0x9c, 0x7b, 0x3d, 0x15, 0x75, 0x54, 0x3d, 0x09, 0xbf, + 0x57, 0xbc, 0xbf, 0x09, 0x29, 0xbb, 0xf5, 0x6d, 0x91, 0xbd, 0xb8, 0x41, 0xbd, + 0x3c, 0x80, 0x60, 0x6e, 0x3c, 0xab, 0xf2, 0x4f, 0xbd, 0x81, 0x36, 0x79, 0x3d, + 0x6a, 0x5a, 0x85, 0xbd, 0xf2, 0xac, 0x36, 0x3d, 0x92, 0x7c, 0xc0, 0xbc, 0x00, + 0x12, 0x06, 0x3c, 0xfe, 0x9c, 0x66, 0x3d, 0xa0, 0xf3, 0xbb, 0xbb, 0x37, 0xb0, + 0x74, 0xbd, 0x18, 0xb1, 0x10, 0xbd, 0x82, 0xd7, 0xe2, 0xbc, 0x87, 0xee, 0x14, + 0x3d, 0xe9, 0x2a, 0x40, 0xbd, 0xe3, 0x0d, 0x53, 0x3c, 0x5c, 0x02, 0x93, 0x3c, + 0x25, 0x0f, 0x49, 0xbd, 0x88, 0xd8, 0x3f, 0x3d, 0x58, 0xf0, 0x39, 0xbd, 0xe3, + 0x0a, 0x3b, 0xbd, 0xeb, 0x61, 0x01, 0x3d, 0xb4, 0xa0, 0x6b, 0xbd, 0x1d, 0x4b, + 0x90, 0xbd, 0xb2, 0x31, 0x34, 0xbd, 0xaa, 0x20, 0xad, 0x3a, 0xd5, 0x1e, 0x3a, + 0xbd, 0xf4, 0x05, 0x38, 0x3d, 0x1b, 0xb2, 0x46, 0xbc, 0x2c, 0xd7, 0x3e, 0x3d, + 0xec, 0x98, 0xc7, 0x3c, 0xe7, 0xd3, 0x21, 0xbd, 0x07, 0x35, 0x60, 0xbd, 0x2b, + 0xb9, 0xfd, 0xbc, 0x9b, 0x69, 0x36, 0x3d, 0xdf, 0xdf, 0x6f, 0xbd, 0x5a, 0x80, + 0x81, 0xbd, 0x9b, 0x67, 0xf2, 0x3b, 0x20, 0x94, 0xde, 0xbb, 0xc5, 0xfc, 0x29, + 0xbd, 0x0c, 0x34, 0x30, 0xbd, 0x50, 0xbb, 0xc9, 0xbc, 0x92, 0x32, 0x93, 0xbc, + 0x12, 0xf9, 0x69, 0xbd, 0x1c, 0x84, 0x3a, 0xbc, 0x88, 0x93, 0x84, 0xbd, 0x07, + 0x7e, 0xb5, 0x3c, 0xe6, 0xb8, 0x4a, 0x3d, 0xde, 0x7c, 0x55, 0x3d, 0x16, 0x69, + 0xf0, 0xbc, 0x91, 0x57, 0x5b, 0xbd, 0xa2, 0x4a, 0x26, 0x3d, 0x5b, 0xdc, 0xaf, + 0xba, 0xe8, 0x30, 0xe1, 0xbc, 0xf8, 0x97, 0x21, 0x3d, 0x00, 0x3e, 0x11, 0x3c, + 0x92, 0x1c, 0xb1, 0xbc, 0xce, 0x5f, 0xa3, 0x3c, 0x2d, 0x13, 0x88, 0xbd, 0xbc, + 0x64, 0xbc, 0x3c, 0xd1, 0x47, 0x97, 0xbb, 0xf2, 0x46, 0x55, 0x3d, 0x70, 0x6e, + 0x09, 0x3d, 0x6b, 0x66, 0x93, 0xbd, 0x26, 0xf4, 0xcb, 0xbc, 0x59, 0xb5, 0x84, + 0xbc, 0x13, 0x19, 0x8d, 0x3d, 0x35, 0xf3, 0x3e, 0xbc, 0x9d, 0xf8, 0x78, 0x3d, + 0x75, 0x6d, 0x4f, 0x3d, 0xd4, 0x8a, 0xd7, 0x3c, 0x74, 0x49, 0x0d, 0xbd, 0x40, + 0x3d, 0xcd, 0x3a, 0xa2, 0xb6, 0x64, 0x3d, 0x73, 0xc5, 0x90, 0x3d, 0x5b, 0x4e, + 0x85, 0xbd, 0xf6, 0x1b, 0x64, 0x3d, 0x15, 0x44, 0xbf, 0xbc, 0x4c, 0xb6, 0x0e, + 0x3d, 0xaf, 0x91, 0x06, 0xbc, 0xa0, 0xc6, 0xdf, 0x3c, 0xb7, 0xb5, 0x66, 0x3d, + 0x23, 0x0d, 0x68, 0xbd, 0xcf, 0x9f, 0xe9, 0xbc, 0xcd, 0xa5, 0x1f, 0xbd, 0x92, + 0x3c, 0x5b, 0x3d, 0x0c, 0x92, 0x57, 0x3d, 0x73, 0xa2, 0x2e, 0xbd, 0x4a, 0xeb, + 0x23, 0xbc, 0x6b, 0xa1, 0x3c, 0xba, 0xd2, 0x19, 0xbb, 0xbc, 0x44, 0x55, 0x29, + 0xbd, 0xcd, 0x07, 0x34, 0xbd, 0xbf, 0xaa, 0xf9, 0xba, 0x18, 0x7b, 0x8a, 0xbc, + 0x4a, 0xe1, 0x5d, 0x3d, 0x28, 0x1b, 0x38, 0x3c, 0xfd, 0x1b, 0xd0, 0x3b, 0xdd, + 0x1c, 0x92, 0xbb, 0xf4, 0x64, 0x31, 0x3c, 0x82, 0x22, 0x44, 0x3d, 0x22, 0xd5, + 0x0c, 0xbd, 0x63, 0x1f, 0x24, 0xbd, 0xd0, 0xe3, 0x03, 0x3c, 0xfc, 0x32, 0x22, + 0xbc, 
0x26, 0x4e, 0xba, 0xbc, 0xf2, 0x18, 0xa8, 0xbc, 0x1d, 0xb1, 0x43, 0xbc, + 0x4b, 0x52, 0x17, 0xbd, 0xe1, 0xf7, 0x05, 0x3d, 0xdb, 0xfb, 0xd9, 0x3c, 0x0b, + 0x58, 0x8e, 0xbc, 0xc1, 0x1f, 0x81, 0x3d, 0xa0, 0x6f, 0x36, 0xbd, 0x52, 0xec, + 0x57, 0xbd, 0x6a, 0x3b, 0x06, 0xbd, 0xb5, 0x5b, 0x9c, 0xbc, 0x08, 0xb1, 0x32, + 0xbc, 0xc0, 0xde, 0x85, 0xbd, 0x2d, 0xd5, 0xd2, 0x3c, 0xa6, 0x1d, 0x14, 0xbc, + 0x8d, 0x5e, 0xd8, 0x3c, 0x83, 0x8e, 0xcf, 0xbc, 0xa0, 0xc2, 0x83, 0xbd, 0xce, + 0x5f, 0x3b, 0xbd, 0x60, 0xbc, 0x7d, 0xbc, 0x8e, 0x9c, 0x7f, 0xbd, 0xb3, 0x61, + 0x0b, 0xbd, 0x1c, 0x2b, 0xc9, 0x3c, 0xbc, 0xb7, 0x6f, 0x3c, 0x61, 0x58, 0xda, + 0xbc, 0xcc, 0x72, 0x23, 0x3c, 0x28, 0x64, 0x61, 0x3c, 0x5a, 0x19, 0x42, 0x3d, + 0xb0, 0x39, 0x13, 0x3c, 0xe6, 0x3a, 0xf7, 0xbc, 0xc4, 0xaf, 0xc4, 0x3c, 0xd2, + 0x14, 0xd0, 0xbc, 0x1a, 0x00, 0xb8, 0xbc, 0xf9, 0x9e, 0x23, 0xbd, 0xdf, 0x82, + 0x6a, 0xbd, 0x7a, 0xc2, 0x18, 0xbc, 0xbf, 0xb0, 0x11, 0xbc, 0x2d, 0x48, 0x5b, + 0xbd, 0xff, 0xff, 0x46, 0x3c, 0x6c, 0x6c, 0x36, 0x3c, 0xec, 0x21, 0x8a, 0xbd, + 0x02, 0x85, 0xe0, 0x3c, 0xdf, 0x2e, 0x42, 0xbd, 0xf0, 0xa5, 0x24, 0x3d, 0x0a, + 0xd1, 0x00, 0x3d, 0x58, 0x44, 0xb3, 0x3c, 0xc9, 0xe4, 0x33, 0x39, 0xba, 0x0f, + 0xb9, 0xbc, 0xba, 0x18, 0x64, 0x3c, 0x9e, 0xc4, 0x50, 0xbc, 0x5f, 0x96, 0x4c, + 0x3d, 0xbc, 0xdc, 0x61, 0x3d, 0xba, 0xaf, 0x38, 0x3d, 0xf1, 0x21, 0x89, 0x3d, + 0x60, 0x95, 0x05, 0x3c, 0xc6, 0xb2, 0x6e, 0xbc, 0x5f, 0x2d, 0x21, 0xbd, 0xee, + 0x52, 0x23, 0x3d, 0x3c, 0xc0, 0x1d, 0xbc, 0x3e, 0xcd, 0x84, 0x3d, 0x00, 0xc5, + 0xa8, 0x39, 0x06, 0x5b, 0x4a, 0xbd, 0xec, 0x4b, 0x1b, 0xbd, 0x05, 0x4c, 0x17, + 0xbd, 0x18, 0x01, 0x56, 0x3c, 0xcd, 0x05, 0x87, 0xbd, 0xe4, 0x37, 0x41, 0xbc, + 0xdc, 0x36, 0x84, 0x3d, 0xa1, 0xd7, 0x09, 0x3d, 0x44, 0xf4, 0x63, 0xbd, 0x56, + 0x62, 0x78, 0xbd, 0x12, 0x57, 0x3b, 0xbd, 0x43, 0xcd, 0x71, 0xbb, 0xa3, 0xf6, + 0x10, 0x3d, 0x3a, 0x9f, 0xff, 0xbc, 0x6f, 0xdd, 0x8d, 0x3d, 0xb3, 0xd7, 0x08, + 0xbd, 0x3e, 0x97, 0x76, 0x3d, 0x99, 0x60, 0x02, 0xbd, 0x08, 0x27, 0x8d, 0x3d, + 0xf1, 0x51, 0x29, 0x3d, 0x48, 0x9d, 0xfe, 0x3c, 0x97, 0xb9, 0x72, 0xbd, 0x35, + 0x21, 0xab, 0xbc, 0xc3, 0x96, 0x69, 0x3c, 0x05, 0x44, 0x05, 0x3d, 0x80, 0x79, + 0x75, 0x3a, 0x94, 0x62, 0xfe, 0x3b, 0x47, 0xb4, 0x64, 0x3c, 0xbb, 0x50, 0x29, + 0xbd, 0xe9, 0xb8, 0x6e, 0xbd, 0x2e, 0xab, 0x26, 0xbc, 0x54, 0x42, 0xb6, 0xbc, + 0x08, 0xdb, 0x22, 0xbd, 0xae, 0x42, 0x78, 0x3d, 0x3c, 0xba, 0x2c, 0xbc, 0x46, + 0xf1, 0x6e, 0x3d, 0xed, 0xb1, 0x88, 0xbd, 0x96, 0x2c, 0x75, 0x3d, 0x26, 0x69, + 0x90, 0xbd, 0x9b, 0x7b, 0x77, 0xbc, 0x9a, 0xbc, 0x05, 0xbd, 0x85, 0xb1, 0x19, + 0xbd, 0xb8, 0x33, 0x8b, 0xbd, 0xfa, 0xa3, 0x8b, 0xbc, 0xc6, 0x36, 0xf2, 0x3c, + 0x4e, 0x81, 0xa2, 0xbc, 0xa7, 0x85, 0x73, 0xbd, 0xca, 0xe5, 0x93, 0xbc, 0xc8, + 0x3d, 0x0e, 0x3d, 0x75, 0x3c, 0x00, 0xbd, 0x28, 0x32, 0x0e, 0x3d, 0x8f, 0x29, + 0x04, 0xbc, 0x0c, 0x29, 0x37, 0xbd, 0x47, 0x11, 0x83, 0xbd, 0x82, 0x57, 0x2a, + 0xbd, 0x45, 0x1f, 0x6b, 0xbc, 0x66, 0xaf, 0x7d, 0xbd, 0xa8, 0x5a, 0x25, 0xbd, + 0x96, 0xc0, 0x14, 0x3b, 0xba, 0xf0, 0x1b, 0xbd, 0xe0, 0x71, 0x44, 0xbb, 0x9c, + 0x09, 0xb9, 0xbc, 0x45, 0xda, 0x77, 0x3c, 0x2b, 0x5d, 0x80, 0x3d, 0xaa, 0xf0, + 0x21, 0x3d, 0xa0, 0x25, 0x31, 0x3d, 0x34, 0xc8, 0x3b, 0xbd, 0x90, 0x50, 0xf6, + 0xbc, 0x53, 0xed, 0x04, 0x3a, 0x26, 0xf8, 0x6e, 0x3d, 0x6d, 0x73, 0x0f, 0x3d, + 0xe8, 0xac, 0x43, 0x3d, 0xf1, 0x03, 0x8a, 0x3c, 0xc4, 0x94, 0x3d, 0x3d, 0x3c, + 0x89, 0x8b, 0x3d, 0x62, 0x99, 0x0f, 0x3d, 0xb6, 0x30, 0x8d, 0x3c, 0xfa, 0x8f, + 0x25, 0x3c, 0x4c, 0x45, 0xd2, 0xbc, 0x00, 0x5d, 0xc0, 0x3c, 0xae, 0x8d, 0x6c, + 0xbd, 0xcb, 0xa3, 0x92, 0xbd, 0xc4, 
0x1e, 0xbb, 0xbc, 0x63, 0xf8, 0xaa, 0x3c, + 0xd7, 0x7c, 0x81, 0x3d, 0xbf, 0x33, 0x41, 0x3c, 0x80, 0x59, 0x69, 0xbb, 0x0a, + 0x75, 0x37, 0xbd, 0x29, 0xdc, 0x1b, 0xbd, 0x10, 0x1f, 0x46, 0xbd, 0xee, 0xb4, + 0x5d, 0x3d, 0xfa, 0x40, 0x95, 0xbd, 0x02, 0xd8, 0x19, 0xbd, 0xa8, 0xd0, 0xf0, + 0xbc, 0x0a, 0xb8, 0xc4, 0x3c, 0x68, 0xa8, 0x11, 0xbd, 0x24, 0x4f, 0x3e, 0x3d, + 0x39, 0x99, 0x90, 0xbd, 0x7c, 0x43, 0x13, 0xbd, 0x86, 0xe5, 0x8f, 0xbd, 0xa4, + 0x16, 0xb4, 0xbc, 0xa0, 0xe9, 0xf2, 0x3c, 0x91, 0x68, 0x5d, 0xbd, 0x51, 0x92, + 0x85, 0x3d, 0xd2, 0x4d, 0x35, 0xbd, 0xc7, 0x44, 0x3e, 0xbd, 0x20, 0xf6, 0xe0, + 0x3c, 0x6b, 0x38, 0x35, 0x3d, 0xd2, 0x2b, 0x2a, 0xbb, 0xc8, 0xbf, 0x0c, 0xbd, + 0xec, 0xd6, 0xfc, 0x3b, 0x1c, 0xae, 0xa9, 0xbc, 0x28, 0x65, 0xb3, 0x3c, 0xdf, + 0x29, 0x98, 0xbc, 0x11, 0x52, 0xbd, 0x3c, 0x4d, 0x7d, 0xac, 0x3c, 0x95, 0xcb, + 0x09, 0xbc, 0xc5, 0xc5, 0xf8, 0xbc, 0xe6, 0x99, 0x3f, 0x3c, 0xb0, 0x51, 0xfd, + 0xbc, 0x88, 0x6b, 0xe0, 0xbc, 0xaa, 0x84, 0x83, 0xbd, 0x98, 0x79, 0x8d, 0x3c, + 0xda, 0x5f, 0xf2, 0x3c, 0xb3, 0xcc, 0x7a, 0x3d, 0xc9, 0x55, 0x08, 0x3d, 0xd1, + 0x83, 0x33, 0x3d, 0x6c, 0xc1, 0x66, 0xbc, 0x80, 0xf9, 0x62, 0xba, 0xe4, 0xd5, + 0x88, 0xbd, 0x60, 0x31, 0xd2, 0xbc, 0x2b, 0x89, 0x86, 0x3d, 0x1b, 0x1e, 0x53, + 0xbd, 0xfa, 0x0c, 0x07, 0xbd, 0x50, 0xe8, 0xb5, 0xbc, 0x4f, 0xc6, 0x65, 0xbd, + 0xef, 0x09, 0x75, 0xbd, 0xd5, 0x47, 0x0c, 0xbd, 0xcc, 0x4e, 0x89, 0xbd, 0x9c, + 0x69, 0xe3, 0x3c, 0x52, 0xea, 0x9d, 0xbc, 0x01, 0x0e, 0x86, 0xbc, 0x2a, 0x61, + 0x72, 0xbd, 0x85, 0xbc, 0x87, 0x3d, 0x21, 0xf7, 0x42, 0x3d, 0x0b, 0x60, 0x23, + 0xbd, 0x0f, 0x0f, 0xed, 0xbc, 0x7d, 0x05, 0xd2, 0xbc, 0x6e, 0x5e, 0x5f, 0xbd, + 0x36, 0x52, 0x92, 0xbd, 0x7e, 0x96, 0x05, 0xbb, 0x6e, 0x51, 0x98, 0x3a, 0xe5, + 0x11, 0x19, 0xbd, 0x00, 0xcf, 0x84, 0xbb, 0x61, 0x5e, 0xed, 0x3c, 0x60, 0xcf, + 0x50, 0xbb, 0xce, 0xbe, 0x07, 0x3c, 0x5c, 0x81, 0x20, 0x3d, 0x45, 0x85, 0xf6, + 0xbc, 0x1d, 0xb7, 0x91, 0x3d, 0x38, 0x08, 0x59, 0x3c, 0x28, 0x93, 0x4b, 0x3d, + 0x3a, 0xc4, 0x87, 0xbd, 0x44, 0x7f, 0x04, 0xbd, 0xdd, 0x17, 0x81, 0x3d, 0xbe, + 0x94, 0x48, 0x3d, 0x88, 0x6a, 0xce, 0xba, 0x93, 0x5b, 0x20, 0x3d, 0xab, 0x05, + 0x90, 0xbd, 0xf9, 0x71, 0xc4, 0x3c, 0x6c, 0xd4, 0x7a, 0x3d, 0x4a, 0x2d, 0x20, + 0x3d, 0x94, 0xd7, 0x88, 0x3d, 0x82, 0xb5, 0x87, 0xbd, 0x55, 0x15, 0xec, 0x3b, + 0xc0, 0x09, 0xe4, 0xba, 0x31, 0x50, 0xfc, 0x3c, 0x25, 0x49, 0x6e, 0x3c, 0x5c, + 0x79, 0x92, 0xbc, 0xed, 0xab, 0x14, 0xbd, 0x24, 0x3e, 0xaa, 0x3c, 0x98, 0x43, + 0x58, 0x3d, 0x2f, 0x00, 0x62, 0x3d, 0x3c, 0x09, 0x2d, 0x3d, 0xe3, 0x27, 0x85, + 0x3c, 0x7a, 0x37, 0x06, 0x3d, 0x49, 0xe6, 0x62, 0xbd, 0x71, 0x53, 0x94, 0xbd, + 0xc4, 0xeb, 0xd0, 0xbb, 0xd8, 0xed, 0x11, 0x3c, 0xfe, 0x75, 0x8c, 0xbc, 0xc4, + 0xeb, 0x16, 0xbd, 0xb8, 0xb8, 0xf7, 0x3c, 0x30, 0x85, 0xaa, 0xbb, 0xcb, 0x9f, + 0x16, 0xbd, 0x1d, 0xed, 0x8d, 0x3d, 0x0f, 0xf3, 0x08, 0xbd, 0x8e, 0x3c, 0x13, + 0x3d, 0xc4, 0x04, 0x74, 0x3d, 0x60, 0xeb, 0x35, 0xbd, 0xe7, 0xcf, 0x38, 0x3d, + 0x12, 0xde, 0xaf, 0x3c, 0xca, 0x71, 0x04, 0x3d, 0x1c, 0xd8, 0xeb, 0x3c, 0xc6, + 0xfc, 0xb3, 0x3c, 0xa0, 0x37, 0x5a, 0x3d, 0xbe, 0xcc, 0x59, 0x3c, 0x4c, 0x95, + 0x9a, 0xbc, 0xa6, 0xff, 0xa8, 0x3b, 0xcd, 0x7d, 0x7d, 0xbd, 0x5c, 0xe7, 0xba, + 0x3c, 0xf9, 0x97, 0x02, 0xbd, 0x3a, 0xd3, 0x80, 0xbd, 0xcd, 0xbe, 0x97, 0xbd, + 0x3b, 0x0d, 0x35, 0xba, 0x76, 0x27, 0x44, 0x3d, 0x63, 0xae, 0x8a, 0x3d, 0x03, + 0x4c, 0x68, 0xbd, 0xe5, 0x9d, 0x0f, 0xbc, 0x6f, 0x5d, 0x45, 0xbb, 0x48, 0x3a, + 0x74, 0x3d, 0x85, 0xfa, 0x37, 0xbd, 0x31, 0xf5, 0x1c, 0x3d, 0x0b, 0x19, 0x52, + 0xbd, 0x00, 0xcd, 0x9e, 0xb9, 0xdb, 0xe5, 0x84, 0xbd, 0x83, 0xf1, 
0x7f, 0xbd, + 0xb7, 0x44, 0x63, 0xbd, 0x44, 0x0a, 0x98, 0xbd, 0x60, 0xd8, 0x23, 0xbb, 0xd1, + 0x69, 0x61, 0xbd, 0x71, 0x41, 0x5a, 0xbd, 0x2f, 0xd9, 0x70, 0xbd, 0xc3, 0xb8, + 0xd3, 0x3c, 0x38, 0xa7, 0x99, 0x3c, 0xe0, 0xa0, 0x21, 0xbd, 0xd2, 0x90, 0xa8, + 0xb8, 0xff, 0xae, 0x32, 0x3c, 0x65, 0x1a, 0x0d, 0x3d, 0xa6, 0xd0, 0x39, 0xbd, + 0xdd, 0xb4, 0x18, 0xbd, 0xb0, 0xa0, 0xbc, 0x3c, 0xa0, 0xe4, 0x8b, 0x3d, 0x90, + 0xe6, 0x25, 0x3d, 0x7c, 0x20, 0x5d, 0x3d, 0x74, 0x50, 0xda, 0xbb, 0x4a, 0xe0, + 0x70, 0x3d, 0x02, 0x36, 0x13, 0x3d, 0xaa, 0xab, 0x05, 0xbd, 0xec, 0xda, 0x10, + 0xbd, 0xd1, 0x40, 0x35, 0xbd, 0xd2, 0x14, 0x3a, 0xbd, 0xd6, 0x7f, 0x06, 0xbd, + 0x55, 0xf8, 0x31, 0x3d, 0xea, 0xc4, 0x5c, 0x3d, 0xd6, 0x89, 0x52, 0x3d, 0x68, + 0xe6, 0x44, 0x3d, 0xd5, 0x64, 0x20, 0xbd, 0x18, 0x41, 0xc8, 0x3c, 0x10, 0xfa, + 0x44, 0x3d, 0x30, 0x39, 0x20, 0xbc, 0x27, 0x26, 0x85, 0x3d, 0x9e, 0x02, 0x48, + 0x3d, 0x59, 0xbb, 0xad, 0xbc, 0x67, 0x3c, 0xe3, 0xbc, 0xcc, 0x6e, 0x4b, 0xbd, + 0x08, 0xf9, 0x1c, 0xbd, 0x50, 0x02, 0xa8, 0x3c, 0x77, 0x8c, 0x21, 0xbd, 0x1b, + 0x8e, 0x0c, 0x3c, 0x0a, 0xe3, 0x76, 0x3d, 0x60, 0xa0, 0xa6, 0xbc, 0x30, 0x1d, + 0x2c, 0x3d, 0x89, 0xab, 0x57, 0xbd, 0x39, 0xdf, 0x8e, 0x3b, 0x4e, 0xd0, 0x81, + 0x3d, 0x6f, 0xc7, 0x0c, 0x3d, 0xb8, 0x21, 0x12, 0x3d, 0x32, 0xe6, 0x5a, 0x3d, + 0x26, 0xbf, 0x64, 0x3c, 0xa8, 0xaf, 0x35, 0x3d, 0x0e, 0x6e, 0xb4, 0xbc, 0x78, + 0x59, 0xa8, 0x3c, 0xd1, 0xca, 0x5c, 0xbd, 0x3a, 0x40, 0x53, 0x3d, 0x30, 0x50, + 0x0c, 0xbc, 0x11, 0xd3, 0x35, 0xbd, 0x06, 0x5b, 0x89, 0xbd, 0x2e, 0xe3, 0x63, + 0x3d, 0xc5, 0xdc, 0x0e, 0xbd, 0x60, 0x04, 0x2d, 0xbb, 0xae, 0xfb, 0x42, 0x3d, + 0x83, 0x52, 0xcd, 0xbc, 0x20, 0x53, 0x06, 0x3d, 0xd5, 0xc6, 0x38, 0x3c, 0xa7, + 0xa9, 0xf4, 0xbc, 0x9b, 0x2d, 0x89, 0x3d, 0x70, 0x74, 0x83, 0x3c, 0x06, 0x87, + 0xe7, 0x3b, 0x97, 0xa3, 0x92, 0x3c, 0x38, 0x5f, 0xf7, 0x3c, 0xdf, 0x71, 0x3b, + 0xbd, 0xfe, 0x14, 0x4d, 0x3d, 0x0a, 0x42, 0xb8, 0xbc, 0xb4, 0xf6, 0x2f, 0x3c, + 0x33, 0xe6, 0x94, 0xbd, 0x26, 0x39, 0x71, 0xbd, 0x10, 0xf4, 0x6e, 0xbd, 0xe4, + 0x3f, 0x09, 0xbd, 0x35, 0xe6, 0xb7, 0x3c, 0x9b, 0x3a, 0x10, 0xbd, 0x4d, 0x58, + 0x43, 0xbd, 0x3e, 0x25, 0x2c, 0xbd, 0x38, 0xdc, 0x4f, 0x3c, 0x06, 0xf5, 0xff, + 0xbc, 0x33, 0x3e, 0x81, 0xbd, 0x27, 0x99, 0x8e, 0xbb, 0x27, 0xc9, 0x68, 0xbd, + 0xce, 0x6c, 0x81, 0x3c, 0x0e, 0xab, 0x67, 0xbd, 0x50, 0x8a, 0x2f, 0x3c, 0x30, + 0x32, 0x37, 0x3d, 0x49, 0xd1, 0x0e, 0xbd, 0x60, 0xe2, 0x38, 0x3d, 0xf8, 0xd0, + 0x9f, 0x3c, 0x3e, 0x8a, 0x0d, 0x3d, 0x7e, 0x2f, 0x6a, 0xbd, 0xe8, 0x0f, 0xab, + 0x3b, 0x6e, 0x3d, 0x49, 0xbd, 0xba, 0xdd, 0x00, 0x3d, 0x80, 0x40, 0xdc, 0x3b, + 0x18, 0x06, 0x76, 0x3d, 0x48, 0xe5, 0x6d, 0x3d, 0xca, 0xcf, 0xa9, 0xbc, 0x3c, + 0xb8, 0x50, 0xbc, 0x70, 0xbf, 0x76, 0x3c, 0x0c, 0xbc, 0x1c, 0x3d, 0x59, 0x70, + 0xf3, 0xbc, 0x21, 0xaa, 0x83, 0xbc, 0xf6, 0x67, 0x4f, 0xbd, 0x86, 0xa6, 0x71, + 0x3c, 0x69, 0xd6, 0x48, 0x3c, 0x50, 0x60, 0x56, 0x3d, 0x9c, 0x25, 0x50, 0xbd, + 0x10, 0x27, 0x76, 0x3c, 0x98, 0x24, 0x7b, 0xbd, 0x6c, 0xb9, 0x01, 0xbc, 0xe6, + 0xea, 0x85, 0x3d, 0x0e, 0xa0, 0xf5, 0x3b, 0xb4, 0xb3, 0x0e, 0x3d, 0xe2, 0xc0, + 0xa1, 0x3c, 0x4c, 0x2c, 0xf6, 0xbc, 0xc8, 0x58, 0x25, 0x3c, 0xd0, 0x2c, 0xeb, + 0x3c, 0xa8, 0x0f, 0xfa, 0x3c, 0x50, 0xc1, 0xd6, 0xbb, 0x42, 0x81, 0x4d, 0xbd, + 0x37, 0x4c, 0x88, 0xbd, 0xf4, 0x1a, 0xd2, 0xbc, 0x94, 0xb7, 0xaf, 0xbb, 0xaf, + 0xeb, 0x0f, 0x3d, 0xed, 0x56, 0xa3, 0x3c, 0x5e, 0x0a, 0x87, 0x3d, 0x5c, 0x4a, + 0x64, 0xbc, 0x37, 0x90, 0x62, 0x3c, 0x57, 0xcd, 0xbb, 0x3b, 0x50, 0x0c, 0x76, + 0xbd, 0x1c, 0x48, 0x87, 0xbc, 0x38, 0x8a, 0x4e, 0x3c, 0xda, 0x2b, 0x3a, 0x3d, + 0xba, 0x1a, 0x81, 
0xbc, 0x29, 0xca, 0xba, 0x3c, 0x78, 0x39, 0x2b, 0xbd, 0xd4, + 0x80, 0xe2, 0xbb, 0x08, 0x96, 0x95, 0x3c, 0x55, 0x08, 0x50, 0x3c, 0xbd, 0xed, + 0x15, 0xbd, 0xd0, 0xeb, 0xe5, 0xbb, 0xa5, 0x5a, 0x22, 0xbc, 0x6c, 0xe7, 0x8f, + 0xbc, 0x63, 0x73, 0xb2, 0x3c, 0xc0, 0xae, 0x13, 0x3c, 0x54, 0xbd, 0x6f, 0xbd, + 0x9e, 0x5a, 0x60, 0x3d, 0x62, 0xe8, 0x34, 0x3d, 0x38, 0x91, 0x24, 0x3d, 0x10, + 0xac, 0x03, 0x3c, 0x04, 0xc0, 0x83, 0xbd, 0x16, 0x48, 0x7e, 0xbd, 0x64, 0x7a, + 0x40, 0xbc, 0x52, 0xcf, 0x4a, 0x3d, 0xa1, 0x54, 0x1f, 0xb9, 0x61, 0x19, 0x8c, + 0x3d, 0x08, 0xfa, 0x5a, 0xbd, 0x2a, 0xf5, 0x67, 0x3d, 0xb3, 0xcc, 0x12, 0xbd, + 0xc3, 0x2a, 0x65, 0x3d, 0x06, 0xbb, 0x41, 0xbd, 0xfc, 0xc0, 0x09, 0xbd, 0x2c, + 0xdf, 0xa7, 0xbc, 0xb7, 0xfe, 0x5d, 0xbd, 0xcb, 0x10, 0xa3, 0xbb, 0x75, 0xc3, + 0xcd, 0x3c, 0x2b, 0xd5, 0x0e, 0x3d, 0x11, 0x1c, 0x83, 0x3d, 0x71, 0xdc, 0xb2, + 0xbc, 0xda, 0xe1, 0x86, 0xbd, 0x39, 0xf2, 0x50, 0x3c, 0x40, 0x25, 0x50, 0x3b, + 0x18, 0x17, 0x43, 0xbc, 0x6b, 0xa6, 0x88, 0x3c, 0x60, 0x10, 0x5d, 0xbd, 0x0e, + 0x88, 0xa1, 0x3c, 0xa6, 0xd3, 0xe4, 0xbc, 0x11, 0x76, 0x88, 0xbc, 0x1e, 0x07, + 0x6c, 0x3d, 0xa6, 0x6e, 0x1b, 0x3d, 0xc0, 0x30, 0x30, 0x3d, 0xf2, 0x34, 0x8d, + 0xbd, 0xc0, 0xe2, 0x18, 0x3b, 0xce, 0xef, 0x83, 0xbc, 0xe7, 0x31, 0x0e, 0xbd, + 0xd1, 0xf1, 0x8b, 0xbd, 0xba, 0x6e, 0x3e, 0xbc, 0xc7, 0x45, 0x08, 0xbd, 0x57, + 0x7e, 0x56, 0x3d, 0x6d, 0xaf, 0x68, 0xbd, 0xef, 0x94, 0x28, 0xbd, 0x65, 0xf5, + 0xa5, 0x3c, 0xea, 0x2c, 0x43, 0xbd, 0x5c, 0xc6, 0x5d, 0x3c, 0x3e, 0x7e, 0x3f, + 0xbd, 0xd4, 0xa5, 0x7c, 0xbd, 0x14, 0x39, 0x35, 0xbd, 0xc5, 0x8a, 0x08, 0xbd, + 0x7e, 0xc0, 0x0c, 0x3d, 0x45, 0xbb, 0x84, 0x3c, 0x0d, 0x10, 0x6f, 0x39, 0x81, + 0x04, 0x4b, 0x3c, 0x5b, 0x45, 0xff, 0x3c, 0xab, 0xd1, 0x74, 0xbd, 0x98, 0x8a, + 0x38, 0x3c, 0xe3, 0xc7, 0xa9, 0x3c, 0x8b, 0x12, 0x7f, 0xbd, 0x6f, 0xb7, 0xc5, + 0x3a, 0x95, 0x7e, 0xaf, 0x3c, 0x50, 0xc8, 0xc5, 0x3b, 0xf9, 0x02, 0x89, 0xbd, + 0x6e, 0x63, 0xa2, 0xbc, 0x0c, 0x74, 0x32, 0x3d, 0xea, 0x32, 0x79, 0x3d, 0x0e, + 0x34, 0x91, 0xbd, 0xa1, 0x87, 0xec, 0xbc, 0x1c, 0xd4, 0x17, 0x3d, 0xe1, 0xb0, + 0x74, 0x3d, 0xe9, 0x8e, 0xc6, 0x3c, 0x8a, 0x62, 0x55, 0xbc, 0x51, 0x37, 0x95, + 0xbd, 0x2b, 0xc8, 0xbd, 0xbc, 0x8e, 0xe4, 0xef, 0xbc, 0x11, 0x49, 0x0d, 0x3d, + 0xe8, 0xcc, 0x16, 0x3d, 0xc6, 0xa8, 0xc8, 0x3c, 0x98, 0x01, 0x88, 0x3c, 0xbd, + 0x8e, 0x46, 0xbd, 0xab, 0x7d, 0xd4, 0xbc, 0x7a, 0xde, 0xb6, 0xbc, 0xf9, 0x44, + 0xcd, 0xbc, 0xad, 0xae, 0x13, 0xbc, 0x8d, 0xb5, 0x21, 0xbd, 0x48, 0xfb, 0x05, + 0xbc, 0x1d, 0x6d, 0x84, 0x3d, 0x4c, 0x32, 0x8a, 0x3c, 0xa8, 0xe9, 0x69, 0x3c, + 0xa6, 0xba, 0x1b, 0xbd, 0xe5, 0xfa, 0x12, 0x3d, 0xea, 0xea, 0x11, 0x3d, 0xa4, + 0xa1, 0x10, 0xbd, 0x0c, 0x0e, 0xad, 0x3d, 0x04, 0xeb, 0x1c, 0xbd, 0xe5, 0x6d, + 0x0f, 0xbd, 0x1e, 0x40, 0xea, 0x3d, 0xfa, 0xc5, 0x36, 0x3d, 0x7a, 0xd3, 0x34, + 0xbd, 0xe2, 0xe5, 0x4b, 0xbd, 0x27, 0x35, 0xf0, 0xbd, 0x60, 0x53, 0xc6, 0xbc, + 0xb4, 0x7c, 0x0b, 0xbd, 0x0c, 0xc1, 0xbd, 0x39, 0x4b, 0xfb, 0x67, 0x3c, 0x4c, + 0x65, 0xc4, 0x3c, 0x23, 0x9d, 0x88, 0x3c, 0x7c, 0x7e, 0xa0, 0x3b, 0x7f, 0xd2, + 0x94, 0x3b, 0x45, 0xd2, 0x24, 0x3d, 0x00, 0xd4, 0xf5, 0xbb, 0x13, 0xf0, 0x99, + 0x3d, 0xd6, 0x36, 0xa0, 0x3a, 0x28, 0xb0, 0x5d, 0x3d, 0x9f, 0xf9, 0x81, 0xbd, + 0x42, 0x4b, 0x98, 0x3d, 0x29, 0x10, 0x7d, 0x3d, 0x8e, 0xe9, 0xf5, 0xbc, 0xfb, + 0xc1, 0x91, 0xbc, 0x71, 0xda, 0xe2, 0xbc, 0x1e, 0x75, 0x3b, 0xbd, 0xbe, 0x22, + 0x2f, 0x3d, 0xfa, 0xb6, 0x27, 0xba, 0x8c, 0x36, 0x86, 0x3c, 0x45, 0x63, 0xcf, + 0xbc, 0x13, 0x05, 0x5e, 0xbc, 0xba, 0xc5, 0x24, 0xbd, 0xcd, 0x6d, 0x0b, 0x3c, + 0x5d, 0xe6, 0x00, 0x3b, 0x82, 0xbb, 0xcf, 0xbc, 
0xdb, 0x1f, 0x31, 0xbd, 0x91, + 0x32, 0x95, 0xbc, 0x81, 0xff, 0x0b, 0xba, 0xa7, 0xe4, 0x0f, 0x3d, 0x50, 0xd4, + 0x2c, 0x3d, 0x4c, 0x82, 0x27, 0x3c, 0x54, 0x76, 0x69, 0x3c, 0xef, 0x41, 0x53, + 0xbb, 0x7b, 0x88, 0x26, 0xbd, 0xfa, 0x19, 0x51, 0x3d, 0x83, 0xe9, 0x89, 0xbd, + 0x96, 0xa7, 0x4a, 0x3d, 0x87, 0xf0, 0xe6, 0xbc, 0x2b, 0x59, 0x61, 0xbc, 0x4a, + 0x9a, 0x7d, 0x3d, 0x7c, 0x95, 0x54, 0x38, 0xa6, 0x6e, 0x69, 0x3d, 0xf3, 0x84, + 0x27, 0xbd, 0x84, 0x7f, 0x26, 0x3c, 0xc3, 0xe1, 0x58, 0x3b, 0xa7, 0x2d, 0xa5, + 0x3d, 0x13, 0x70, 0x2a, 0xbd, 0xae, 0x66, 0x1f, 0x3d, 0x6d, 0x44, 0xff, 0xbc, + 0x66, 0x10, 0xb2, 0x3c, 0x94, 0xd5, 0x98, 0xb9, 0x00, 0xc8, 0xef, 0x3d, 0x5c, + 0x00, 0x2f, 0xbc, 0xd7, 0xb1, 0xf6, 0x3c, 0x1b, 0xdb, 0xe1, 0x3c, 0xaa, 0x78, + 0xe0, 0x3c, 0xb5, 0xe8, 0xd1, 0x3c, 0xda, 0x9e, 0x39, 0xbc, 0xe4, 0x90, 0x84, + 0xbc, 0x42, 0x92, 0x6f, 0xbd, 0xdd, 0xd7, 0x8a, 0x3d, 0xd3, 0x62, 0x90, 0x3c, + 0x1c, 0x20, 0x52, 0x3d, 0x1e, 0x29, 0x72, 0xbd, 0xf4, 0x8e, 0x1c, 0x3d, 0xd9, + 0xda, 0xaf, 0xbc, 0x60, 0x11, 0x8e, 0xbb, 0x71, 0xc1, 0xbf, 0xbc, 0xec, 0x7f, + 0x3d, 0x3c, 0xe5, 0x10, 0x3d, 0xbd, 0x1a, 0xbf, 0x69, 0x3d, 0x3f, 0x56, 0x0b, + 0xbb, 0x19, 0x64, 0x9d, 0x3c, 0xe1, 0x00, 0x05, 0x3d, 0x4f, 0x77, 0x8e, 0x3d, + 0x0f, 0x4d, 0x35, 0x3d, 0xe5, 0x6d, 0x4d, 0xbd, 0x9d, 0xb6, 0x58, 0x3c, 0x64, + 0x44, 0x30, 0xba, 0x08, 0xe8, 0xaa, 0x3c, 0x73, 0xe7, 0x0b, 0x3d, 0x71, 0x00, + 0x8c, 0x3d, 0x1a, 0xd9, 0xeb, 0x3c, 0xde, 0x78, 0xf2, 0xbb, 0xe5, 0x50, 0xcb, + 0x3d, 0x03, 0x80, 0x7f, 0x3b, 0xb4, 0xf7, 0x1a, 0x3d, 0x32, 0xf5, 0xb0, 0x3d, + 0x1c, 0x38, 0xe5, 0x3c, 0xb1, 0x72, 0x05, 0x3d, 0xc3, 0x92, 0xcf, 0x3c, 0xdc, + 0x7b, 0x0c, 0xbe, 0x95, 0x0b, 0xfc, 0x3c, 0x5f, 0x34, 0x18, 0x3d, 0xc2, 0x08, + 0x19, 0xbd, 0x25, 0xd4, 0x7b, 0x3d, 0x1e, 0xca, 0x88, 0xbd, 0x57, 0x5f, 0x9a, + 0x3d, 0x57, 0x98, 0x80, 0x3d, 0x20, 0x7d, 0xdd, 0x3c, 0xdf, 0xb3, 0x65, 0x3d, + 0x88, 0xde, 0x8d, 0xbd, 0x45, 0x90, 0x9d, 0x3d, 0x8a, 0xf8, 0xfa, 0xbc, 0xdf, + 0xe2, 0xef, 0xb9, 0x21, 0x8d, 0x5a, 0xbc, 0x3e, 0x45, 0x17, 0x3c, 0x11, 0x8d, + 0x8d, 0xbd, 0xb9, 0xd3, 0x2b, 0xb9, 0xd1, 0x2b, 0x24, 0xbc, 0x7e, 0x0e, 0x00, + 0x3b, 0xfd, 0xc2, 0x2e, 0xbd, 0x80, 0x7d, 0x0d, 0x3d, 0x91, 0x8a, 0x49, 0x3d, + 0xba, 0x7e, 0x10, 0x3d, 0xc3, 0x56, 0x2a, 0x3d, 0x1a, 0x4d, 0x6e, 0x3d, 0x20, + 0x44, 0x90, 0x3c, 0x2f, 0xd8, 0x79, 0x3d, 0x7b, 0x5c, 0xab, 0x3d, 0x64, 0xa5, + 0xe1, 0x3c, 0x26, 0x94, 0x31, 0x3d, 0xcc, 0xaf, 0xec, 0xbd, 0xc0, 0x25, 0x4b, + 0xbd, 0xd1, 0x06, 0x87, 0x3d, 0x97, 0x3c, 0x44, 0xbd, 0x9c, 0x81, 0xc2, 0xbc, + 0x0a, 0xd3, 0x1a, 0xbd, 0x0d, 0xe3, 0x00, 0xbd, 0x08, 0x6e, 0x53, 0xbd, 0x67, + 0x84, 0x1a, 0x3d, 0xeb, 0xd0, 0x2f, 0x3d, 0x76, 0xea, 0x46, 0x3b, 0x3e, 0x6e, + 0xbe, 0xbc, 0xf3, 0x6a, 0x11, 0x3d, 0x13, 0xed, 0xb8, 0x3c, 0xc1, 0x4f, 0x9a, + 0x3d, 0xd6, 0x9a, 0x31, 0xbd, 0xcc, 0x51, 0x0e, 0x3d, 0x60, 0x8c, 0x89, 0x3d, + 0x66, 0xc1, 0x41, 0xbd, 0x75, 0x80, 0xa2, 0x3d, 0x40, 0xbb, 0x5c, 0x3b, 0x6f, + 0xb6, 0x90, 0x3d, 0xb7, 0x62, 0x02, 0x3c, 0x54, 0x75, 0x78, 0x3d, 0x3d, 0x29, + 0xaf, 0x3d, 0x53, 0x5f, 0x97, 0x3d, 0xaf, 0x83, 0x91, 0xbc, 0xc9, 0x29, 0x55, + 0x3d, 0xda, 0x00, 0x82, 0xbb, 0x8d, 0xcd, 0x2e, 0x3d, 0x9d, 0xcb, 0x88, 0xbd, + 0x4d, 0x93, 0x3d, 0xbd, 0x55, 0xb8, 0x66, 0xbd, 0x98, 0xf2, 0x4e, 0xbc, 0xf9, + 0xe0, 0x28, 0xbc, 0x6f, 0x30, 0x2d, 0x3d, 0xd8, 0xe6, 0x9e, 0x3d, 0x81, 0xcf, + 0x31, 0xbd, 0x31, 0x50, 0x45, 0xbd, 0x90, 0x9e, 0x2f, 0xbd, 0x4b, 0x9a, 0x9a, + 0x3d, 0x2f, 0x1a, 0xb3, 0xbc, 0x05, 0x59, 0x9b, 0xbc, 0xa6, 0x4f, 0x9b, 0xbc, + 0x24, 0x10, 0x9e, 0xbd, 0x91, 0x8e, 0xa5, 0x3c, 0x0c, 0x2a, 0x43, 0x3d, 0x85, + 
0x85, 0x87, 0xbd, 0x00, 0x61, 0x36, 0xbd, 0x10, 0xb9, 0x43, 0xbc, 0x58, 0x2c, + 0x24, 0x3b, 0xb7, 0x4f, 0x80, 0x3d, 0x46, 0x0f, 0x29, 0xbd, 0x76, 0x68, 0x44, + 0xbd, 0x57, 0xcf, 0x18, 0xbd, 0x24, 0x15, 0x94, 0x3d, 0x13, 0x57, 0x98, 0x3d, + 0x5e, 0xd6, 0x9c, 0x3d, 0xa0, 0x16, 0x9e, 0x3d, 0x66, 0x87, 0x83, 0xbd, 0x19, + 0x6d, 0x8b, 0x3d, 0x24, 0x60, 0x9a, 0xbc, 0x00, 0x60, 0xea, 0xbb, 0xba, 0x09, + 0x5f, 0xbd, 0xdc, 0xdd, 0xaa, 0x3b, 0x95, 0x08, 0xe9, 0xbc, 0x82, 0x0c, 0xc6, + 0x3c, 0x19, 0xb1, 0xda, 0xbc, 0x80, 0x2e, 0x4b, 0x3c, 0xed, 0xab, 0x29, 0x3d, + 0x17, 0x38, 0x51, 0x3d, 0x52, 0xa3, 0xef, 0x3c, 0xfd, 0x1c, 0x88, 0xbc, 0x40, + 0x9f, 0x3a, 0x3c, 0x87, 0x8a, 0xbe, 0xbc, 0xe5, 0xf4, 0x2a, 0xbd, 0x01, 0x1f, + 0x32, 0x3d, 0x2c, 0xbf, 0x3d, 0xbc, 0x33, 0xd3, 0xf9, 0xbb, 0xc4, 0x58, 0x2d, + 0xbd, 0x5d, 0xa3, 0x8f, 0x3d, 0x27, 0x5d, 0x90, 0xbc, 0xcf, 0x00, 0x82, 0x3d, + 0x0b, 0x65, 0xa7, 0x3d, 0x52, 0x11, 0xff, 0xbc, 0x37, 0xca, 0x18, 0xbd, 0xb9, + 0x2f, 0x9d, 0x3c, 0x36, 0x90, 0x68, 0x3d, 0x85, 0x61, 0x6b, 0x3d, 0x27, 0xb0, + 0x89, 0xbc, 0xcb, 0xb5, 0xac, 0xbb, 0xf4, 0x4b, 0x79, 0xbc, 0x34, 0x73, 0xe7, + 0xbc, 0x81, 0x9b, 0x86, 0x3c, 0x58, 0xc2, 0xce, 0x3c, 0x0a, 0x63, 0x2c, 0xbd, + 0xf6, 0xd3, 0xcf, 0xbd, 0xea, 0xf1, 0x01, 0xbd, 0x7a, 0x64, 0xe0, 0xbc, 0x12, + 0x3a, 0x28, 0x3d, 0x98, 0xe9, 0x98, 0x3d, 0x95, 0xf1, 0xa8, 0xbc, 0x88, 0xb4, + 0x2a, 0x3d, 0x81, 0xdf, 0xc4, 0xbc, 0x62, 0xb8, 0xfb, 0xbc, 0x46, 0xd2, 0x90, + 0xbd, 0x74, 0x0a, 0xc4, 0x3c, 0x8e, 0x57, 0x6f, 0x3d, 0xf9, 0xea, 0x78, 0x3d, + 0xdc, 0x6e, 0x62, 0xbd, 0x46, 0xe2, 0x16, 0xbd, 0xa6, 0x36, 0x37, 0xbd, 0xf5, + 0x36, 0x35, 0xbd, 0x9a, 0x4f, 0xb8, 0xbc, 0xf2, 0xab, 0x15, 0x3c, 0xee, 0x55, + 0xd7, 0x3b, 0xfa, 0xd0, 0x1c, 0xbd, 0xd4, 0x6b, 0x97, 0xbc, 0x91, 0x57, 0x51, + 0xbd, 0x7c, 0xc9, 0x64, 0x3d, 0xf8, 0x29, 0xcd, 0xbc, 0x75, 0x65, 0x67, 0x3d, + 0xaa, 0xd9, 0xa3, 0x3c, 0x55, 0xff, 0x8f, 0x3c, 0x7c, 0x18, 0x46, 0xbd, 0x92, + 0x18, 0x2c, 0x3d, 0x3a, 0x9f, 0x8a, 0xbc, 0xee, 0xd4, 0x05, 0x3d, 0x37, 0x03, + 0xaa, 0xbd, 0xe9, 0x50, 0x07, 0xbe, 0x1a, 0x94, 0x18, 0x3d, 0x79, 0x69, 0x03, + 0xbd, 0x7f, 0xc8, 0xd4, 0xbc, 0x25, 0xa7, 0x86, 0x3a, 0x17, 0xf1, 0x00, 0x3c, + 0xfd, 0x40, 0x10, 0x3d, 0x6e, 0x29, 0xf7, 0x3c, 0x05, 0xb0, 0x38, 0xbd, 0x7e, + 0x44, 0x5a, 0xbc, 0x0e, 0xdf, 0x66, 0x3d, 0x08, 0x9d, 0x10, 0xbc, 0xff, 0x12, + 0x8e, 0xbb, 0x01, 0x3f, 0x67, 0xbc, 0x6e, 0xa6, 0x4f, 0x3d, 0xca, 0x07, 0x63, + 0xbd, 0x97, 0x61, 0x4b, 0x3d, 0x71, 0x21, 0x34, 0x3d, 0x4f, 0xa2, 0x6d, 0x3d, + 0x8f, 0xf5, 0xe8, 0xbd, 0x72, 0x55, 0x4b, 0xbd, 0xee, 0xb2, 0xe9, 0xbc, 0xf2, + 0x49, 0xa7, 0x3d, 0x89, 0x22, 0xf5, 0x3c, 0xd8, 0x73, 0xcb, 0x3d, 0xbb, 0x15, + 0x81, 0x3d, 0x33, 0xf1, 0x5c, 0x3d, 0xa7, 0x30, 0x96, 0xbd, 0x4b, 0x2c, 0x58, + 0xbd, 0x34, 0x05, 0x00, 0x3d, 0xbd, 0x81, 0x92, 0x3d, 0x67, 0x5b, 0x5f, 0xbc, + 0xb4, 0x1e, 0xe6, 0xbd, 0x7c, 0x56, 0x00, 0x3c, 0x7c, 0x6d, 0xa8, 0x3c, 0x9b, + 0x21, 0xbd, 0xbb, 0x71, 0xf4, 0x48, 0xbd, 0xf8, 0xe1, 0x87, 0xbd, 0xd7, 0x4f, + 0xaf, 0xbc, 0x08, 0xef, 0xd9, 0x3c, 0x3e, 0x7b, 0x24, 0x3c, 0xa8, 0xcc, 0xe7, + 0x3c, 0xf0, 0xa0, 0x4a, 0xbd, 0x45, 0xbf, 0x39, 0xbd, 0x4e, 0xb6, 0xd6, 0x3c, + 0xfb, 0xfb, 0x49, 0x3d, 0xdd, 0x90, 0x4e, 0x3c, 0x0c, 0xb0, 0x83, 0x3d, 0x2d, + 0x83, 0x42, 0x3c, 0x1f, 0x45, 0xeb, 0xbb, 0xd3, 0x7e, 0xf2, 0x3b, 0x4d, 0x22, + 0xa6, 0xbd, 0x40, 0x45, 0x5c, 0xbb, 0x8c, 0xa5, 0x1c, 0xbd, 0x57, 0xd9, 0x86, + 0x3d, 0x45, 0xfc, 0x4e, 0x3d, 0xc5, 0x64, 0x24, 0x3d, 0xc9, 0xf4, 0x27, 0x3c, + 0xc7, 0x86, 0x08, 0x3d, 0x9c, 0x3c, 0x13, 0x3b, 0xab, 0x69, 0x12, 0x3d, 0x0d, + 0xfa, 0x80, 0x3d, 0x6b, 0x86, 
0x15, 0xbd, 0x93, 0x11, 0x1e, 0xbd, 0x70, 0x3b, + 0x02, 0x3b, 0x50, 0x75, 0x06, 0xbd, 0x61, 0xe8, 0x7b, 0xbc, 0x5a, 0x15, 0xa7, + 0x3d, 0x47, 0x26, 0x0b, 0x3c, 0xb8, 0x03, 0x98, 0x3c, 0xce, 0xcc, 0x8e, 0x3d, + 0x12, 0x6c, 0xba, 0xbc, 0xca, 0x74, 0x5f, 0xbd, 0x84, 0x45, 0xd6, 0x3d, 0x2a, + 0xc6, 0xb3, 0xbc, 0x75, 0x88, 0x53, 0x3d, 0x44, 0xc0, 0x37, 0x3c, 0x69, 0x7c, + 0x59, 0x3d, 0xc1, 0xa5, 0xe5, 0xbc, 0x61, 0xc0, 0x9f, 0x3c, 0xbc, 0x7d, 0x7e, + 0xbc, 0x9c, 0x18, 0x79, 0xbd, 0x09, 0x70, 0x16, 0x3d, 0xdd, 0x36, 0x0b, 0x3d, + 0xcc, 0xba, 0xc8, 0x3c, 0xe6, 0xae, 0x18, 0xbc, 0xd6, 0x1a, 0x20, 0xbd, 0x43, + 0x22, 0x24, 0xbc, 0xcc, 0x3e, 0xd4, 0x3c, 0xe2, 0x43, 0x1a, 0xbb, 0x02, 0x94, + 0xd5, 0x3c, 0x24, 0x73, 0x3d, 0x3d, 0x4d, 0x1c, 0xce, 0x3c, 0x94, 0xea, 0x4a, + 0x3d, 0x33, 0x7a, 0x09, 0x3d, 0xf4, 0xcc, 0x66, 0xbd, 0x13, 0xb9, 0x9e, 0xbd, + 0x98, 0xbe, 0xb4, 0xbc, 0x19, 0x14, 0x21, 0x3d, 0x97, 0xca, 0x50, 0x3d, 0x8f, + 0x3f, 0x2f, 0xbc, 0x69, 0x98, 0x25, 0x3d, 0x55, 0x13, 0x80, 0xbc, 0xef, 0x2e, + 0x82, 0x3d, 0x24, 0xea, 0x71, 0xbd, 0x84, 0x97, 0x32, 0xbd, 0xb0, 0xaa, 0xaf, + 0x3c, 0xfa, 0x13, 0x9b, 0x3d, 0x56, 0xa5, 0x2b, 0x3d, 0x03, 0x06, 0x2d, 0xbc, + 0x6c, 0x24, 0x39, 0xbd, 0x46, 0x80, 0x29, 0x3d, 0x64, 0xdb, 0x61, 0xbb, 0x85, + 0x2a, 0x22, 0xbd, 0x9f, 0x47, 0xc1, 0x3d, 0x71, 0xc5, 0x85, 0xbd, 0x00, 0x31, + 0x9c, 0xb9, 0xc4, 0xd0, 0x2e, 0xbd, 0x08, 0x5d, 0x36, 0x3d, 0x41, 0x70, 0x3f, + 0xbd, 0x01, 0xc0, 0x87, 0x3c, 0x05, 0xf1, 0x37, 0xbc, 0xaf, 0x5d, 0xd4, 0xbb, + 0x10, 0xa9, 0x1c, 0x3d, 0xb8, 0xa9, 0x62, 0xba, 0xae, 0x29, 0x71, 0x3d, 0x51, + 0x57, 0x73, 0xbc, 0x05, 0x0a, 0xb8, 0xbd, 0xe3, 0x38, 0xa1, 0xbd, 0x3d, 0x08, + 0x13, 0x3d, 0x54, 0x69, 0x80, 0xbd, 0xe9, 0x65, 0x60, 0xbd, 0x2e, 0x02, 0x88, + 0x3d, 0x00, 0xdf, 0x58, 0xbb, 0xde, 0x06, 0x35, 0xbd, 0x1e, 0x3f, 0x0a, 0xbd, + 0x35, 0xe2, 0x15, 0xbd, 0xa6, 0xe3, 0x99, 0x3d, 0x42, 0x8e, 0x2e, 0xbd, 0x9b, + 0x10, 0x97, 0xbd, 0xd9, 0x36, 0xca, 0x3b, 0x27, 0x9f, 0x5c, 0xbd, 0xb8, 0x0c, + 0x25, 0xbd, 0x61, 0xe3, 0x8e, 0x3d, 0x8b, 0x23, 0xa5, 0xbc, 0xf4, 0xda, 0x47, + 0xbd, 0x30, 0x95, 0xac, 0x3c, 0xe1, 0xb0, 0xab, 0xbd, 0xb0, 0x5a, 0x15, 0x3d, + 0x58, 0x7e, 0x35, 0x3d, 0x13, 0xeb, 0x48, 0xbc, 0x00, 0xe6, 0x80, 0x3c, 0x39, + 0x59, 0x21, 0xbb, 0xca, 0xf7, 0xbe, 0x3d, 0x2a, 0xb9, 0x37, 0x3d, 0x26, 0x13, + 0x80, 0x3d, 0x9e, 0xbd, 0xc7, 0x3c, 0xb6, 0xd6, 0x50, 0xbd, 0xa6, 0x52, 0x82, + 0x3d, 0x39, 0xa3, 0x81, 0xb9, 0xe3, 0xb2, 0xf8, 0xbd, 0xc5, 0x84, 0x54, 0xbd, + 0xba, 0xea, 0x27, 0x3d, 0x1e, 0xce, 0xcf, 0x3c, 0x0d, 0xd3, 0x6f, 0x3c, 0xa7, + 0xce, 0x87, 0xbc, 0x67, 0xe3, 0x5e, 0xbd, 0xf6, 0xdc, 0x3b, 0x3d, 0xca, 0x8f, + 0x23, 0xbd, 0x69, 0x20, 0x9e, 0x3b, 0x32, 0x59, 0x2e, 0x3d, 0x12, 0x32, 0x09, + 0xbd, 0xa1, 0xc3, 0x2a, 0x3c, 0x68, 0x2a, 0x6b, 0xbc, 0xf7, 0xbf, 0x92, 0xbc, + 0x97, 0x8c, 0x97, 0x3d, 0x8e, 0xc6, 0x74, 0x3c, 0x04, 0x01, 0x47, 0x3c, 0x6b, + 0x51, 0xf0, 0x3d, 0x0e, 0xf6, 0x3b, 0x3b, 0xee, 0xeb, 0x5d, 0x3d, 0x98, 0x69, + 0x9b, 0x3c, 0xb5, 0x47, 0xfc, 0xbc, 0x5e, 0x56, 0x40, 0xbc, 0x15, 0x4e, 0xad, + 0xbb, 0x84, 0xcf, 0x96, 0x3c, 0xe3, 0x32, 0xbe, 0xbc, 0x36, 0xcd, 0xc8, 0x3d, + 0x70, 0xb8, 0x97, 0x3d, 0xd9, 0xc3, 0x28, 0xbd, 0x6c, 0xec, 0x7b, 0x3d, 0xbf, + 0x32, 0xc6, 0xbd, 0x98, 0x0d, 0x0f, 0xbe, 0x32, 0xaa, 0x95, 0x3d, 0x6e, 0x2c, + 0xfd, 0xbc, 0x10, 0x45, 0xc1, 0xbb, 0x4d, 0x8b, 0x03, 0x3d, 0xe4, 0x05, 0xde, + 0xbc, 0x0d, 0x7c, 0xbe, 0x3c, 0x07, 0x24, 0x77, 0x3d, 0x98, 0xb0, 0x2a, 0x3c, + 0x21, 0xc9, 0xa3, 0x3c, 0x1a, 0x6d, 0x69, 0x3d, 0x33, 0xf6, 0xeb, 0xbc, 0x40, + 0x77, 0x90, 0x3d, 0x6c, 0xf5, 0x99, 0x3c, 0x42, 0x69, 0x08, 
0x3d, 0x9b, 0x3f, + 0xde, 0xbc, 0xe0, 0x71, 0x04, 0xbd, 0x6a, 0xcd, 0xfe, 0xbb, 0x77, 0xd6, 0xb3, + 0x3d, 0xf9, 0xb4, 0xcc, 0x3b, 0x6a, 0x1c, 0x70, 0x3d, 0x10, 0x34, 0x15, 0xbc, + 0x82, 0x15, 0x3a, 0x3d, 0xa8, 0xa6, 0x02, 0x3d, 0x06, 0x03, 0xaa, 0x3d, 0x15, + 0x2c, 0xe6, 0xbc, 0xac, 0xf0, 0xdc, 0x3c, 0xa7, 0x3b, 0xef, 0xbc, 0x7a, 0xa7, + 0x93, 0x3d, 0xaf, 0x46, 0x87, 0x3c, 0xf9, 0x13, 0x76, 0xbb, 0x30, 0x99, 0x15, + 0xbd, 0x36, 0xd1, 0x8f, 0xbc, 0xc9, 0x26, 0xaf, 0x3d, 0xc0, 0xa3, 0x5b, 0x3c, + 0x69, 0x65, 0x84, 0xbd, 0x1e, 0x30, 0x81, 0x3d, 0xb4, 0xbc, 0x22, 0x3d, 0x16, + 0x60, 0x52, 0x3d, 0x5e, 0xfe, 0x6a, 0xbc, 0x16, 0x65, 0x34, 0xbd, 0xfe, 0xab, + 0xf0, 0x3c, 0xe1, 0xfd, 0x90, 0x3d, 0xd4, 0x61, 0x6a, 0xbd, 0x55, 0xd1, 0x85, + 0xbd, 0x87, 0x6f, 0x66, 0xbd, 0x29, 0x4a, 0x8d, 0x3a, 0xec, 0x8f, 0x91, 0x3d, + 0x07, 0x75, 0x5a, 0x3b, 0x95, 0x09, 0x27, 0x3b, 0x25, 0x10, 0xd3, 0x3d, 0xde, + 0xfe, 0x0b, 0xbd, 0xe8, 0xd4, 0xc4, 0x3c, 0x4e, 0xda, 0x7d, 0x3c, 0x54, 0xb5, + 0xe8, 0xba, 0x69, 0x46, 0x40, 0x3d, 0xd1, 0xd6, 0x48, 0x3c, 0xfa, 0xb9, 0x87, + 0x39, 0x5a, 0x17, 0x20, 0xbc, 0xd5, 0x9b, 0x66, 0x3d, 0x19, 0x23, 0xac, 0x3c, + 0x56, 0x76, 0x5a, 0xbd, 0x7e, 0x50, 0x3c, 0xbc, 0x02, 0x8b, 0x17, 0xbd, 0x42, + 0x85, 0xc6, 0xbd, 0x06, 0x12, 0x9f, 0x3d, 0xad, 0x96, 0xc7, 0xbb, 0xd9, 0xfc, + 0xff, 0xbb, 0xb9, 0x86, 0x71, 0x3c, 0xc7, 0xf6, 0x3f, 0xbd, 0xc2, 0x39, 0xf7, + 0x3a, 0x25, 0xcb, 0xf0, 0x3c, 0xfe, 0x25, 0xb0, 0xbb, 0xd3, 0x39, 0x02, 0x3d, + 0xf8, 0xa3, 0x08, 0xbd, 0xba, 0xf2, 0x4e, 0xbd, 0x53, 0x83, 0x46, 0xbd, 0xae, + 0x06, 0x06, 0x3d, 0x69, 0xf3, 0x8f, 0x3d, 0xd3, 0x57, 0x35, 0x3c, 0x05, 0x92, + 0xb9, 0x3c, 0x60, 0x8e, 0x5b, 0x3b, 0xab, 0x7a, 0x8d, 0xbc, 0xf6, 0xdf, 0x87, + 0xbd, 0x0d, 0xc5, 0x81, 0x3d, 0xec, 0x93, 0x5f, 0x3d, 0xf6, 0x54, 0x85, 0x3d, + 0x86, 0xb3, 0x16, 0xbc, 0x7d, 0x95, 0x97, 0x3d, 0xff, 0xd8, 0x0c, 0x3d, 0x21, + 0x38, 0x6e, 0xbd, 0x68, 0xfc, 0x83, 0x3d, 0x5c, 0x54, 0x1b, 0xbc, 0x26, 0x1d, + 0x03, 0x3d, 0xd8, 0xaa, 0x90, 0xbd, 0xa9, 0x58, 0x0b, 0x3b, 0x02, 0x4e, 0x40, + 0xbd, 0xdc, 0x76, 0xe0, 0xbb, 0x14, 0x2e, 0x24, 0x3d, 0xbb, 0x6b, 0xfe, 0x3b, + 0xfd, 0xb5, 0x99, 0xbd, 0x4b, 0x2b, 0x0e, 0xbd, 0x2f, 0xc8, 0x69, 0xbd, 0xff, + 0xf0, 0x04, 0x3d, 0x46, 0x9c, 0x13, 0x3c, 0x74, 0x89, 0x2e, 0x3d, 0xbe, 0x6e, + 0x52, 0xbd, 0x59, 0x23, 0x34, 0x3d, 0x72, 0x3a, 0x3e, 0xbd, 0xf8, 0x03, 0x7a, + 0x3d, 0x8e, 0xab, 0x74, 0x3c, 0x6e, 0x5e, 0x82, 0x3d, 0x16, 0x5b, 0x25, 0x3c, + 0x56, 0x2c, 0xe7, 0xbd, 0x19, 0x4d, 0xc0, 0x3d, 0x8a, 0xb3, 0xdb, 0xbd, 0x34, + 0xe5, 0x67, 0xbc, 0x0f, 0x5d, 0x35, 0x3d, 0xad, 0xad, 0x94, 0x3d, 0xa5, 0xc3, + 0xba, 0xba, 0xb4, 0x7f, 0x02, 0x3e, 0xde, 0xcd, 0x8d, 0x3d, 0xc3, 0xa4, 0xa4, + 0xbd, 0x7e, 0x1b, 0x37, 0x3d, 0xde, 0xb4, 0x91, 0xbd, 0x78, 0xf2, 0x62, 0xbd, + 0x25, 0x4f, 0x60, 0xbd, 0x4e, 0xd2, 0x25, 0xbd, 0xd3, 0xc3, 0xe8, 0xbb, 0x7f, + 0x00, 0x68, 0x3d, 0x7a, 0x9c, 0x1e, 0xbd, 0x17, 0x70, 0x81, 0x3c, 0xda, 0xb3, + 0x68, 0x3d, 0xab, 0xf3, 0xb4, 0xbc, 0x46, 0x70, 0x16, 0xbd, 0x22, 0xe5, 0x82, + 0x3d, 0x75, 0x02, 0x5a, 0x3d, 0xb5, 0xce, 0x86, 0xbd, 0x20, 0x29, 0xa8, 0xbb, + 0xe5, 0x29, 0x95, 0xbd, 0x63, 0x0c, 0x5f, 0xbd, 0x42, 0x39, 0x99, 0xbc, 0x27, + 0xd6, 0x82, 0xbb, 0x33, 0x1c, 0xda, 0xbc, 0x93, 0x96, 0x76, 0x3d, 0xd3, 0x8c, + 0xd3, 0xbd, 0x75, 0x39, 0xe1, 0x3d, 0x42, 0x5b, 0x98, 0xbd, 0x5a, 0xc4, 0x4f, + 0x3d, 0x3b, 0xb0, 0x14, 0xbd, 0xfc, 0x99, 0x4b, 0xbc, 0xd4, 0x88, 0x13, 0xbb, + 0x6c, 0xca, 0xc4, 0x3d, 0xd4, 0xdc, 0xb1, 0x3d, 0x62, 0x2a, 0x8d, 0x3c, 0xd8, + 0x1b, 0xb7, 0x3c, 0x0b, 0x8d, 0xba, 0xbb, 0x78, 0x25, 0x5c, 0xbd, 0xb9, 0xc6, + 0xbb, 0xba, 
0x26, 0x58, 0xc5, 0xbd, 0x5d, 0x48, 0xb7, 0xbd, 0x71, 0x0d, 0x0e, + 0x3d, 0xa8, 0xa7, 0x54, 0xbd, 0x88, 0xfe, 0x84, 0xbc, 0x0b, 0x64, 0x1b, 0xbc, + 0xba, 0xaa, 0x8e, 0x3c, 0x89, 0x54, 0xa5, 0xbc, 0xde, 0x32, 0x9c, 0x3c, 0x90, + 0x13, 0x66, 0xbd, 0xb2, 0x5e, 0x11, 0xbd, 0xd0, 0x5e, 0xfb, 0xbb, 0x2e, 0x6c, + 0x8c, 0xbd, 0x09, 0x4b, 0x2f, 0xbc, 0xa8, 0x5d, 0x27, 0xbd, 0xad, 0xd8, 0x2e, + 0x3d, 0x78, 0x5e, 0xf0, 0x3c, 0x8e, 0xc0, 0x12, 0x3d, 0x49, 0xb5, 0xca, 0xbd, + 0x1b, 0x2e, 0xb0, 0x3d, 0xeb, 0x3c, 0x8b, 0xbd, 0xe2, 0x4b, 0xd6, 0xbc, 0x14, + 0xdf, 0xc3, 0x3c, 0x42, 0x9c, 0x87, 0x3c, 0xb7, 0x90, 0x18, 0x3d, 0xcb, 0x8a, + 0xd8, 0x3d, 0xc1, 0x0c, 0x97, 0x3d, 0x35, 0xe8, 0xd3, 0x3c, 0xb1, 0x05, 0x28, + 0x3d, 0x03, 0xd2, 0xbc, 0x3d, 0x56, 0xce, 0x44, 0x3d, 0x9f, 0xbf, 0x24, 0x3d, + 0x21, 0x81, 0x81, 0xbd, 0xc0, 0xa2, 0xda, 0xbd, 0x50, 0x42, 0x27, 0x3d, 0x5f, + 0xb2, 0xb9, 0x3c, 0x04, 0x67, 0x6c, 0x3d, 0xce, 0x89, 0x2c, 0xbd, 0x08, 0x2d, + 0x4b, 0x3c, 0x88, 0x86, 0xf7, 0x3c, 0xcd, 0x8e, 0x94, 0x3d, 0x5a, 0x47, 0x6f, + 0x3d, 0x67, 0xf4, 0xa2, 0xbd, 0xe3, 0x50, 0x91, 0xbd, 0xde, 0x9e, 0x84, 0x3d, + 0xb3, 0x05, 0xbf, 0x3c, 0x10, 0x17, 0x34, 0x3d, 0xf4, 0x1f, 0x0e, 0xbd, 0x47, + 0xb9, 0x49, 0x3d, 0xb1, 0x61, 0x10, 0x3d, 0x2a, 0x64, 0x90, 0xbd, 0x1e, 0xc9, + 0xb8, 0x3c, 0x7d, 0x23, 0xb8, 0xbd, 0x19, 0x60, 0x85, 0x3d, 0x44, 0xb5, 0x4d, + 0xbd, 0x05, 0x79, 0xec, 0x3b, 0xea, 0x1e, 0x21, 0xbd, 0xeb, 0x34, 0x59, 0x3d, + 0x50, 0xa9, 0x00, 0x3d, 0x72, 0xf1, 0x4c, 0xb9, 0x98, 0x35, 0xc1, 0x3d, 0xbb, + 0x18, 0x36, 0x3d, 0x19, 0x70, 0x62, 0xbd, 0xc5, 0xae, 0x75, 0x3d, 0x27, 0x77, + 0xec, 0xbc, 0xab, 0x6d, 0xe1, 0xbd, 0x75, 0x4a, 0xae, 0x3c, 0x2d, 0xea, 0x18, + 0xbb, 0xdc, 0x0e, 0x7b, 0x3d, 0xb2, 0x28, 0x24, 0xbd, 0x69, 0xd2, 0x78, 0xbd, + 0xed, 0x29, 0x5f, 0xbc, 0xd9, 0x6e, 0x44, 0x3d, 0x3c, 0x6c, 0x87, 0xbd, 0xa5, + 0xdf, 0x96, 0xbc, 0x1c, 0x4c, 0x35, 0x3d, 0x54, 0x97, 0x57, 0xbd, 0xe9, 0x88, + 0x40, 0xbd, 0x6d, 0x9d, 0x71, 0x3c, 0x3f, 0x74, 0xaf, 0xbb, 0x41, 0xfa, 0x4b, + 0x3d, 0x20, 0xe8, 0x7a, 0xbc, 0xe4, 0x37, 0xbe, 0xbd, 0xfa, 0xa2, 0x44, 0xbc, + 0x2a, 0x3c, 0x61, 0xbd, 0xec, 0x0f, 0x0c, 0x3d, 0xd7, 0xef, 0x82, 0xbd, 0x0b, + 0xe4, 0xd2, 0xbc, 0xd2, 0x57, 0x04, 0x3c, 0xa8, 0x6e, 0xce, 0x3d, 0x3c, 0xd8, + 0xa4, 0x3b, 0x1d, 0x19, 0x45, 0xbd, 0xd6, 0x4d, 0x70, 0x3c, 0xed, 0x12, 0xf0, + 0xbc, 0x1f, 0xc6, 0x4c, 0x3c, 0xeb, 0x27, 0x8e, 0xbc, 0x6a, 0xf8, 0x4f, 0x3d, + 0xcf, 0x2c, 0xe3, 0xbd, 0x3b, 0xc9, 0x05, 0xbb, 0xe0, 0xfa, 0xfd, 0x3c, 0xfe, + 0xb8, 0xfb, 0xbc, 0x84, 0xd9, 0x8b, 0x3d, 0xad, 0x88, 0x00, 0x3d, 0x21, 0xfa, + 0x47, 0x3d, 0xf6, 0x17, 0x0d, 0xbd, 0xc5, 0x0c, 0xf1, 0x3c, 0xec, 0x3c, 0x13, + 0xbd, 0x1a, 0x06, 0x4b, 0xbd, 0x76, 0x04, 0xa4, 0xbc, 0x89, 0x87, 0x92, 0x3d, + 0xd2, 0xc6, 0xaf, 0x3d, 0xb1, 0xb1, 0x12, 0x3d, 0x99, 0xa4, 0x23, 0x3d, 0x25, + 0x73, 0x75, 0x3b, 0x18, 0x34, 0xa1, 0xbd, 0xc0, 0x90, 0xa5, 0x3d, 0xaa, 0xa8, + 0x14, 0xbd, 0x6c, 0xbc, 0xf3, 0x3c, 0x8a, 0x47, 0x51, 0xbc, 0xab, 0xfc, 0x2a, + 0x3d, 0xc8, 0xb7, 0x68, 0x3d, 0xff, 0xbf, 0x72, 0x3d, 0x38, 0x39, 0x95, 0x3d, + 0xdc, 0x49, 0x94, 0xbc, 0xbd, 0xce, 0x90, 0x3c, 0xcd, 0x13, 0x35, 0x3d, 0xd4, + 0xd9, 0x51, 0xbd, 0x16, 0xde, 0xfb, 0xbc, 0xc7, 0x00, 0xb9, 0xbd, 0x38, 0x8e, + 0x2e, 0xbc, 0xcb, 0xce, 0x5e, 0x3d, 0x44, 0x22, 0x7a, 0x3c, 0x70, 0x0a, 0x93, + 0x3d, 0x9c, 0x88, 0x81, 0x3a, 0x02, 0x89, 0x01, 0xbd, 0x52, 0x9b, 0x50, 0xbc, + 0xc7, 0x6f, 0x46, 0x3c, 0x41, 0xb4, 0x57, 0x3d, 0x79, 0x89, 0xd2, 0x3b, 0x20, + 0xab, 0x75, 0x3b, 0x40, 0xf2, 0xea, 0x3c, 0x8f, 0x29, 0x8c, 0x3d, 0xb0, 0x20, + 0x45, 0xbd, 0xf4, 0x67, 0x8c, 0x3d, 0xbf, 
0x3f, 0x9d, 0x3c, 0xa7, 0x71, 0x01, + 0xbd, 0x37, 0x6b, 0x02, 0xbc, 0x68, 0xc4, 0x2a, 0x3d, 0x43, 0x60, 0x9b, 0xbc, + 0x72, 0xb9, 0x73, 0xbd, 0x90, 0xc4, 0x13, 0x3c, 0xba, 0xbf, 0x50, 0xbb, 0x86, + 0x75, 0x78, 0xbd, 0x2e, 0xaf, 0x69, 0xbc, 0xdb, 0x89, 0xbc, 0x3d, 0x05, 0x7f, + 0xa8, 0xbd, 0x42, 0x5f, 0x02, 0x3d, 0xe1, 0x3c, 0x12, 0xbd, 0xfd, 0xdf, 0x41, + 0x3d, 0x2e, 0xda, 0xe3, 0xbb, 0x80, 0x3c, 0x5f, 0xbd, 0x26, 0x2b, 0x1f, 0xbd, + 0xa8, 0xed, 0xd5, 0x3c, 0xa6, 0x84, 0xf1, 0x3c, 0xbe, 0xd2, 0x9a, 0xbb, 0x5b, + 0x04, 0x61, 0x3d, 0x2b, 0xe5, 0x06, 0xbd, 0xc9, 0xb8, 0x85, 0x3c, 0x64, 0x7a, + 0xc7, 0x3d, 0x4c, 0x12, 0xc9, 0x3c, 0x69, 0x12, 0x63, 0xbd, 0x88, 0x73, 0xbf, + 0x3c, 0xfc, 0x66, 0x50, 0xbb, 0x64, 0x31, 0x9a, 0xbd, 0xeb, 0x81, 0x8d, 0x3d, + 0x7e, 0x4e, 0xc5, 0x3c, 0x15, 0x80, 0x96, 0x3d, 0xb9, 0x1f, 0x65, 0xbd, 0xe3, + 0x99, 0xda, 0xbd, 0x94, 0x02, 0x4a, 0x3c, 0xbf, 0x7b, 0x26, 0x3d, 0x20, 0xae, + 0x9d, 0xbb, 0x84, 0x49, 0x1e, 0x3d, 0x88, 0x11, 0x17, 0x3d, 0x45, 0x77, 0x73, + 0x3c, 0x76, 0x33, 0xaa, 0x3c, 0x28, 0x4d, 0x4b, 0x3d, 0x49, 0x89, 0x37, 0x3c, + 0x3f, 0xe6, 0x92, 0xbd, 0xc8, 0x39, 0xa0, 0x3c, 0xd6, 0xff, 0x0a, 0x3b, 0xb4, + 0xef, 0xad, 0xbd, 0xdb, 0x17, 0x19, 0x3c, 0x9a, 0x54, 0x7c, 0xbd, 0xe7, 0x50, + 0xcc, 0x3c, 0x91, 0xeb, 0x75, 0xbd, 0x9a, 0x45, 0xac, 0x3d, 0xd3, 0x80, 0x4d, + 0xbd, 0x17, 0x6c, 0x19, 0x3c, 0x47, 0xb1, 0x1f, 0xbd, 0xef, 0x17, 0x1d, 0xbd, + 0xa2, 0xc8, 0x58, 0xbc, 0xf9, 0xc6, 0x81, 0xbb, 0x70, 0xfc, 0xa1, 0x3b, 0x70, + 0x74, 0x38, 0x3d, 0xb9, 0x93, 0x6c, 0x3d, 0xb5, 0x22, 0x89, 0x3d, 0xa8, 0x15, + 0xed, 0xbb, 0xee, 0x0c, 0xac, 0xbc, 0xbf, 0xca, 0xbe, 0xbc, 0x8e, 0x0d, 0xbf, + 0xbd, 0xfb, 0x0c, 0x92, 0x3c, 0x3d, 0x1e, 0x61, 0xbd, 0xe1, 0xb2, 0x08, 0xbd, + 0xcd, 0xab, 0x75, 0xbb, 0xc5, 0x1a, 0x2f, 0x3d, 0x4f, 0x02, 0x92, 0x3c, 0x8f, + 0x47, 0x20, 0x3d, 0x33, 0xac, 0xc3, 0x3d, 0xc9, 0xdc, 0xbd, 0xbc, 0x68, 0x6e, + 0xb4, 0x3b, 0x32, 0x32, 0xdc, 0x3d, 0xd8, 0xff, 0x92, 0x3d, 0xb3, 0xa4, 0x6f, + 0xbd, 0xf0, 0xbe, 0x13, 0xbd, 0xff, 0xf5, 0xdf, 0xbd, 0x67, 0xeb, 0x94, 0x3c, + 0xb2, 0xe8, 0x57, 0xbb, 0x92, 0x3f, 0xdc, 0xbb, 0xe3, 0x5f, 0x6b, 0x3c, 0x02, + 0xcc, 0x6c, 0xbd, 0x25, 0xa1, 0x57, 0xbd, 0x22, 0x01, 0x82, 0x3d, 0xc3, 0xcf, + 0xb2, 0x3c, 0xed, 0x35, 0x56, 0xbb, 0xe3, 0xf0, 0x8c, 0x3d, 0xdb, 0xf1, 0xb1, + 0xbc, 0xaa, 0xe4, 0xc2, 0x3b, 0x53, 0x9c, 0xf6, 0xbc, 0x15, 0x86, 0x92, 0x3d, + 0xe4, 0xf9, 0x39, 0x3d, 0x09, 0xa5, 0xa8, 0xbc, 0x6e, 0x89, 0xd1, 0xbc, 0x47, + 0xd4, 0x7b, 0x3c, 0x7b, 0xff, 0xab, 0x3c, 0x15, 0x58, 0x8d, 0xbd, 0x7b, 0x21, + 0xac, 0x3c, 0xda, 0xe5, 0xad, 0xbc, 0x8b, 0xfc, 0xd8, 0xbc, 0x8c, 0xe1, 0x0e, + 0xbc, 0x36, 0x43, 0xc6, 0x3d, 0xfa, 0x15, 0x8b, 0xbc, 0xb8, 0xd0, 0x07, 0x3d, + 0xd9, 0x12, 0x9c, 0x3c, 0x81, 0x20, 0x4f, 0xbd, 0xd8, 0x7f, 0x18, 0x3b, 0x38, + 0xd4, 0x33, 0xbc, 0x00, 0x0f, 0xe2, 0xbd, 0x25, 0xa8, 0xf2, 0x3c, 0x87, 0xa6, + 0x96, 0xbd, 0x84, 0xc3, 0xa8, 0x3c, 0xf4, 0x7a, 0x8b, 0x3c, 0xfd, 0xbd, 0x55, + 0xbc, 0x45, 0x00, 0x97, 0xbd, 0x81, 0x3a, 0xbd, 0x3b, 0x21, 0x43, 0x30, 0xbd, + 0x94, 0x58, 0xa5, 0x3b, 0x30, 0x2f, 0x12, 0xbd, 0xcb, 0xd3, 0x32, 0x3d, 0x36, + 0xd2, 0x7c, 0xbd, 0xf2, 0x77, 0x49, 0x3d, 0x87, 0xdd, 0x87, 0xbc, 0x3d, 0x1a, + 0x02, 0x3d, 0x5a, 0x1b, 0xc1, 0x3c, 0x04, 0xaf, 0x33, 0xbd, 0x84, 0x02, 0x1d, + 0x3d, 0x47, 0x7d, 0x21, 0xbd, 0x46, 0xc4, 0x24, 0x3d, 0x8f, 0x16, 0x27, 0x3d, + 0xce, 0x48, 0x22, 0x3d, 0xd9, 0x6b, 0xa3, 0x3c, 0x31, 0x91, 0xbb, 0x3c, 0xef, + 0x24, 0x88, 0xbb, 0x1e, 0x6e, 0x41, 0xbd, 0x81, 0xea, 0x80, 0x3d, 0xa6, 0xa7, + 0xf2, 0x3d, 0x74, 0xcf, 0xd7, 0x3c, 0x4c, 0x85, 0xf6, 0xbc, 0x57, 0xac, 
0x0f, + 0x3c, 0x1c, 0x44, 0x53, 0xbd, 0x44, 0x55, 0x35, 0x3d, 0x14, 0x45, 0x11, 0x3d, + 0x0d, 0xfa, 0xff, 0xbc, 0xe0, 0xef, 0x32, 0x3d, 0x6c, 0x60, 0xac, 0x3b, 0xd2, + 0xe0, 0xab, 0xbb, 0x77, 0x02, 0x3f, 0xbd, 0xcd, 0x77, 0x44, 0x3d, 0x4f, 0x8c, + 0x3e, 0xbd, 0x74, 0xd6, 0x5a, 0xbd, 0x33, 0xb6, 0xf2, 0xbc, 0x94, 0xe4, 0x0e, + 0x3b, 0x6c, 0x9b, 0xa9, 0x3a, 0x61, 0xd7, 0xea, 0xbc, 0xf6, 0x70, 0xe9, 0x3c, + 0x06, 0x81, 0xeb, 0xbc, 0x51, 0x88, 0x47, 0xbb, 0x6c, 0xfb, 0x6d, 0x3d, 0x0a, + 0x9d, 0x29, 0xbb, 0xa0, 0x45, 0x36, 0x3c, 0xe5, 0xd9, 0xb8, 0x3c, 0x09, 0xf4, + 0x09, 0xbd, 0x2a, 0x13, 0x54, 0xbc, 0xad, 0xb0, 0xa3, 0x3d, 0x5a, 0x07, 0xff, + 0x3c, 0x18, 0x10, 0xc9, 0x3c, 0x15, 0xf6, 0x07, 0xbd, 0x05, 0x70, 0x60, 0x3d, + 0xb5, 0xbd, 0x50, 0x3d, 0xeb, 0xe1, 0x11, 0x3d, 0xdf, 0x70, 0x40, 0xbd, 0x51, + 0x6f, 0x67, 0xbd, 0x61, 0xbf, 0xd0, 0x3c, 0x39, 0x5e, 0x14, 0xbd, 0xae, 0x58, + 0xa1, 0x3d, 0xa2, 0x03, 0x88, 0x3d, 0x85, 0x40, 0x89, 0xbd, 0x3e, 0x4f, 0x21, + 0x3c, 0x8b, 0x40, 0xcf, 0x3c, 0xa8, 0x0d, 0x76, 0x3d, 0x2f, 0x57, 0xf4, 0x3b, + 0x78, 0x71, 0x8f, 0x3c, 0x15, 0x80, 0x72, 0x3d, 0x35, 0xc6, 0xe6, 0xbc, 0x1e, + 0xdb, 0x8d, 0x3d, 0xc1, 0x52, 0x58, 0x3d, 0x1e, 0x0c, 0x37, 0x3d, 0x68, 0xdd, + 0x25, 0x3d, 0x1a, 0x65, 0x59, 0xbc, 0x22, 0xe3, 0x8b, 0x3d, 0x29, 0xb2, 0x44, + 0xbd, 0x56, 0x71, 0x34, 0xbd, 0x1c, 0x3f, 0x7c, 0xbb, 0x88, 0x17, 0x72, 0xbc, + 0xbb, 0xb5, 0xae, 0x3c, 0xdd, 0x7b, 0xd5, 0x3c, 0xd3, 0x2f, 0x93, 0x3d, 0x07, + 0x46, 0x38, 0x3d, 0x55, 0x2b, 0x47, 0x3d, 0xd2, 0x5c, 0xda, 0x3d, 0xa4, 0x8e, + 0x80, 0x3d, 0xe6, 0xdb, 0xc9, 0x3c, 0xf3, 0x2d, 0x3f, 0xbd, 0x66, 0x10, 0xd1, + 0xbd, 0xde, 0xa5, 0xda, 0x3c, 0xab, 0x8c, 0xe4, 0x3c, 0x85, 0x1c, 0xc0, 0x3c, + 0xba, 0xe5, 0x95, 0xbd, 0x25, 0x50, 0x92, 0x3c, 0x25, 0x15, 0xc9, 0xba, 0x43, + 0xdc, 0x63, 0xbc, 0x65, 0xd6, 0x07, 0x3d, 0x87, 0x8c, 0x0e, 0xbc, 0x0d, 0x90, + 0x87, 0x3d, 0x9a, 0x0e, 0x4a, 0x3d, 0x67, 0x54, 0x4a, 0x3d, 0x63, 0x8b, 0x24, + 0xbd, 0x56, 0x2c, 0xcf, 0xbc, 0x28, 0x2a, 0x23, 0x3d, 0xc6, 0x80, 0xa3, 0xbc, + 0x66, 0xe5, 0x09, 0xbd, 0x69, 0xdb, 0x93, 0x3d, 0x00, 0xc7, 0x7e, 0xbd, 0xe0, + 0x18, 0x06, 0x3d, 0x02, 0xb9, 0x77, 0xbd, 0x43, 0x60, 0x55, 0x3c, 0x46, 0x45, + 0xa4, 0x3d, 0xb1, 0x0a, 0xac, 0x3c, 0x8a, 0xc5, 0x8e, 0x3d, 0xf6, 0x60, 0x31, + 0xbc, 0x9b, 0x2d, 0xb0, 0x3a, 0xc3, 0xc4, 0x4a, 0xbd, 0x96, 0x31, 0x82, 0xbd, + 0x4e, 0x50, 0x59, 0x3c, 0x2f, 0xf7, 0xd4, 0xbd, 0x18, 0xc1, 0x2b, 0xbd, 0xb8, + 0x26, 0x9d, 0x3c, 0xd6, 0x9c, 0x3b, 0xbd, 0xb6, 0xdd, 0x11, 0xbd, 0x4e, 0x51, + 0xd9, 0x3b, 0xbd, 0xfd, 0x3b, 0xbd, 0xe2, 0xe9, 0x35, 0xbc, 0x0d, 0xb1, 0x9c, + 0x3c, 0x02, 0x6e, 0xab, 0x3c, 0xc9, 0x70, 0x25, 0x3c, 0xae, 0xe4, 0x60, 0xbd, + 0x11, 0xc2, 0x49, 0x3d, 0x9b, 0x09, 0xaf, 0xbc, 0xbc, 0x74, 0x75, 0x3c, 0x38, + 0x61, 0x16, 0x3d, 0x0c, 0x99, 0x94, 0x3d, 0x01, 0x83, 0x03, 0xbb, 0xc5, 0x45, + 0x1b, 0x3d, 0x82, 0xab, 0x6f, 0x3c, 0xe1, 0x41, 0xce, 0x3c, 0x86, 0xd5, 0x79, + 0xbd, 0x0e, 0x6c, 0x69, 0x3d, 0xcf, 0xbb, 0x87, 0x3d, 0x65, 0x17, 0xb4, 0xbc, + 0xca, 0x64, 0x07, 0x3e, 0x7d, 0x34, 0xca, 0x3d, 0x40, 0x0d, 0xfb, 0x3c, 0x0e, + 0xea, 0xc2, 0x3c, 0x06, 0x26, 0x88, 0xbc, 0xed, 0x76, 0x84, 0x3d, 0xca, 0x92, + 0xa4, 0xbc, 0x4c, 0x98, 0x74, 0xbd, 0x62, 0x77, 0xdb, 0xbd, 0x97, 0xba, 0x87, + 0x3d, 0xe9, 0x05, 0x95, 0xbd, 0xcc, 0xfd, 0x99, 0x3d, 0x36, 0x01, 0x0b, 0xbd, + 0x23, 0x33, 0x7d, 0x3d, 0x2f, 0xba, 0x5c, 0x3d, 0xaa, 0xed, 0xb2, 0xbc, 0xfc, + 0xe7, 0x97, 0x3d, 0xaa, 0x40, 0x7d, 0x3d, 0x2a, 0x5f, 0x5e, 0x3d, 0x51, 0x91, + 0x7d, 0xbd, 0xc8, 0xf8, 0x2a, 0x3d, 0x7b, 0x8c, 0x2f, 0x3d, 0x35, 0xe0, 0xb9, + 0xbb, 0xc4, 0x0b, 0x56, 
0xbd, 0xcf, 0xd0, 0xb8, 0x3c, 0xf7, 0xef, 0x61, 0x3d, + 0xf5, 0x33, 0x9a, 0x3d, 0x07, 0xd8, 0xf0, 0xbc, 0x34, 0x49, 0x61, 0xbd, 0x7c, + 0x0c, 0x74, 0xbd, 0x0c, 0x85, 0xf7, 0xbc, 0xeb, 0x13, 0xdd, 0xbc, 0x70, 0x3a, + 0xd1, 0x3c, 0xd0, 0x31, 0xe1, 0x3d, 0xbf, 0xb4, 0x90, 0xbd, 0x6c, 0x8a, 0x4f, + 0xbc, 0x89, 0x66, 0x29, 0xbc, 0x5d, 0x8a, 0x18, 0xbd, 0xa4, 0x2b, 0x91, 0xbd, + 0x6a, 0x8d, 0x2b, 0xb9, 0x44, 0x9f, 0xf1, 0xbd, 0xe3, 0x9a, 0x87, 0x3c, 0x3c, + 0x77, 0x5c, 0x3d, 0x1b, 0x6f, 0x50, 0xbd, 0x43, 0x9e, 0x41, 0xbd, 0x13, 0x6f, + 0x5d, 0x3d, 0x44, 0x7f, 0x67, 0x3c, 0xf5, 0x9e, 0x31, 0x3c, 0xc0, 0x48, 0x8b, + 0x3d, 0x48, 0xc4, 0xd0, 0xbc, 0x80, 0x20, 0x17, 0x3a, 0x4c, 0x44, 0x42, 0x3b, + 0xcd, 0x50, 0x0e, 0x3d, 0xf8, 0xdd, 0x6a, 0x3d, 0xa7, 0xa4, 0x57, 0x3c, 0x5c, + 0x60, 0x94, 0x3c, 0xd4, 0x6e, 0x34, 0xbc, 0xa3, 0xa2, 0x8e, 0xbd, 0x88, 0xe0, + 0xad, 0x3d, 0xdb, 0xd6, 0x9f, 0xbd, 0x14, 0xcb, 0x61, 0xbd, 0x02, 0x50, 0x7f, + 0xbd, 0xb9, 0x4c, 0x9d, 0x3d, 0x0d, 0x5a, 0x88, 0x3d, 0x8b, 0x0a, 0x06, 0x3c, + 0xdf, 0x17, 0x8e, 0x3d, 0x75, 0x07, 0x0c, 0x3d, 0x5d, 0xd3, 0x52, 0xbd, 0x22, + 0x56, 0x0b, 0x3a, 0x62, 0x34, 0xcb, 0xbc, 0x55, 0x58, 0xaa, 0x3c, 0x72, 0x28, + 0xa3, 0xbd, 0x60, 0x8d, 0x3f, 0xbc, 0x5b, 0xaa, 0x51, 0xbb, 0xa8, 0x60, 0x31, + 0xbd, 0x8c, 0xc5, 0xfb, 0x3c, 0x90, 0x97, 0x3f, 0xbc, 0x94, 0x3a, 0x45, 0xbd, + 0xb5, 0xc1, 0x8d, 0xbd, 0x07, 0xd0, 0x08, 0x3d, 0x47, 0x05, 0xe2, 0xbb, 0x69, + 0x2e, 0x16, 0x3d, 0xd0, 0x2d, 0x50, 0xbd, 0xd3, 0x88, 0x9e, 0x3d, 0x2f, 0x19, + 0xbb, 0xbc, 0x20, 0x1f, 0xa4, 0x3d, 0x38, 0x4e, 0x9c, 0xbc, 0x71, 0x5a, 0x6e, + 0x3c, 0x47, 0x9a, 0x49, 0x3d, 0x7a, 0x7b, 0x07, 0x3a, 0x54, 0xf5, 0xcd, 0x3d, + 0x54, 0xb0, 0xde, 0x3c, 0xb0, 0xbd, 0x1b, 0x3c, 0x31, 0x85, 0x2c, 0xbd, 0xda, + 0x03, 0xe4, 0xbb, 0x9e, 0xf5, 0x87, 0x3d, 0xef, 0x15, 0x41, 0x3d, 0x82, 0x56, + 0xa3, 0x3d, 0xfa, 0x31, 0x5e, 0xbd, 0xf2, 0x5e, 0x5f, 0xbb, 0x1c, 0xda, 0x9f, + 0x3d, 0x45, 0x09, 0x71, 0xbc, 0x37, 0x80, 0x9a, 0x3b, 0x5a, 0x7a, 0xfd, 0xbc, + 0x37, 0x4f, 0x1a, 0xbe, 0xfa, 0x30, 0xeb, 0xbc, 0xa9, 0xd5, 0x74, 0xbd, 0x18, + 0xad, 0x9b, 0xbc, 0x00, 0xc4, 0xce, 0x3a, 0x98, 0x58, 0x19, 0x3c, 0xf0, 0x22, + 0xa1, 0x3b, 0x84, 0xfa, 0x08, 0xbd, 0x6f, 0xfe, 0x96, 0x3d, 0xe3, 0xc4, 0x90, + 0x3d, 0xa0, 0xc8, 0x5a, 0xbc, 0x97, 0x7f, 0xc2, 0xbc, 0xea, 0xcc, 0xcc, 0x3c, + 0xae, 0xb0, 0x9c, 0xbc, 0x49, 0xdf, 0x97, 0xbc, 0xdd, 0x01, 0x18, 0xbd, 0x66, + 0x26, 0xa7, 0xbc, 0x2a, 0x3d, 0x59, 0xbd, 0x93, 0x1b, 0x1a, 0x3d, 0xd9, 0x46, + 0xcc, 0x3c, 0x00, 0xf0, 0x34, 0x3a, 0x99, 0x3d, 0xc0, 0xbc, 0x08, 0xb1, 0x09, + 0x3c, 0xbe, 0xfb, 0x79, 0x3d, 0xa9, 0x90, 0x86, 0xbd, 0xa2, 0x17, 0x8f, 0xbd, + 0x30, 0x94, 0x8a, 0xbb, 0xd9, 0xd7, 0x82, 0x3d, 0xe4, 0xea, 0x2f, 0xbd, 0x7e, + 0x59, 0x73, 0xbd, 0x46, 0x73, 0xe2, 0xbc, 0xe0, 0xd4, 0x42, 0xbc, 0x3c, 0x6c, + 0xdf, 0x3c, 0x08, 0xce, 0xf9, 0x3c, 0xfc, 0xe4, 0x79, 0xbd, 0xac, 0x5c, 0x4f, + 0xbd, 0x60, 0x67, 0x12, 0xbb, 0xb2, 0xcf, 0xbf, 0xbc, 0xe2, 0x7c, 0x31, 0xbd, + 0xb6, 0xc7, 0x18, 0x3d, 0xdc, 0x89, 0x90, 0xbd, 0x0c, 0xf7, 0x99, 0xbc, 0xa0, + 0x2a, 0x3c, 0xbd, 0x92, 0x1b, 0x38, 0x3d, 0x34, 0xe9, 0x86, 0xbd, 0x69, 0x76, + 0x6d, 0xbd, 0x76, 0x2b, 0x6e, 0x3d, 0x70, 0x53, 0x3f, 0x3d, 0x22, 0xe5, 0x4c, + 0x3d, 0x52, 0x57, 0xfc, 0xbc, 0xf8, 0x6b, 0x31, 0xbd, 0xb4, 0xb1, 0xa3, 0x3c, + 0x10, 0x0c, 0x60, 0x3c, 0xbc, 0x80, 0x85, 0xbd, 0xe6, 0x9f, 0x78, 0xbd, 0x00, + 0x20, 0x90, 0xba, 0xbc, 0x54, 0x5d, 0xbd, 0x6c, 0xd7, 0xc5, 0xbc, 0x87, 0x6b, + 0x87, 0x3d, 0x0a, 0x34, 0x0c, 0x3d, 0x44, 0xe5, 0x47, 0xbd, 0xe0, 0xd3, 0x05, + 0x3b, 0x23, 0x83, 0x11, 0xbd, 0xab, 0x22, 0x8c, 0xbd, 
0x48, 0x17, 0xe9, 0x3c, + 0xbd, 0x8a, 0x89, 0x3d, 0xc0, 0x3a, 0x71, 0x3b, 0x08, 0x52, 0x61, 0x3c, 0x40, + 0xb4, 0x6d, 0x3c, 0xa0, 0x6a, 0xa0, 0x3b, 0x00, 0xc4, 0xb9, 0x39, 0x74, 0x71, + 0xa8, 0x3c, 0x13, 0xa7, 0x90, 0xbd, 0x04, 0xb5, 0xb4, 0xbc, 0x70, 0x36, 0x31, + 0x3c, 0x28, 0x25, 0x0f, 0x3c, 0xfc, 0x08, 0x46, 0xbd, 0x80, 0xa0, 0xa5, 0xba, + 0xe2, 0x11, 0x6f, 0xbd, 0x39, 0xf0, 0x31, 0xbd, 0xd8, 0xbe, 0x2f, 0xbd, 0x68, + 0x21, 0x4d, 0xbd, 0x64, 0x1b, 0x8e, 0xbd, 0x80, 0xd4, 0x78, 0xba, 0x92, 0x81, + 0x5a, 0xbd, 0xf4, 0xf9, 0x57, 0xbd, 0x80, 0x59, 0xa2, 0x3c, 0x22, 0xe6, 0xde, + 0xbc, 0x91, 0xdf, 0x87, 0xbd, 0x3a, 0xea, 0x22, 0xbd, 0xba, 0xf7, 0x75, 0x3d, + 0xba, 0x8a, 0x0c, 0x3d, 0x81, 0xa7, 0x8d, 0xbd, 0x90, 0xee, 0x50, 0xbd, 0x14, + 0xa3, 0x90, 0xbd, 0xdc, 0xdf, 0x81, 0x3c, 0x4a, 0xb5, 0x66, 0xbd, 0x10, 0xa0, + 0x94, 0x3b, 0x9a, 0x12, 0x2d, 0xbd, 0xda, 0x60, 0x42, 0xbd, 0xea, 0x9f, 0xb0, + 0xbc, 0x38, 0xfc, 0x02, 0x3d, 0xa6, 0x08, 0x04, 0x3d, 0x23, 0xf6, 0x03, 0xbd, + 0xa2, 0x7a, 0x63, 0x3d, 0x26, 0xca, 0x36, 0x3d, 0x96, 0xd3, 0x0d, 0x3d, 0x3f, + 0xfd, 0x89, 0x3d, 0x08, 0xa3, 0x24, 0xbd, 0x28, 0x10, 0x57, 0xbc, 0xbb, 0xb9, + 0x83, 0x3d, 0x50, 0x2b, 0xb5, 0x3b, 0x9c, 0x94, 0x19, 0xbc, 0xc4, 0x4d, 0x9a, + 0xbc, 0x91, 0xf8, 0x0d, 0xbd, 0x63, 0x13, 0x7d, 0xbd, 0xed, 0xd0, 0x02, 0xbd, + 0x1c, 0x10, 0x85, 0xbd, 0x00, 0xca, 0x36, 0x3c, 0xc8, 0x17, 0x7a, 0x3c, 0x24, + 0x32, 0xc7, 0xbc, 0x88, 0x75, 0xa5, 0x3c, 0x2e, 0x18, 0x39, 0xbd, 0xd4, 0xa9, + 0xfb, 0x3c, 0x8c, 0x61, 0x48, 0x3d, 0x40, 0x34, 0xb1, 0xba, 0xb7, 0xec, 0x83, + 0x3d, 0x7c, 0x1d, 0x5a, 0x3d, 0x30, 0x5c, 0x91, 0x3c, 0xcb, 0x9d, 0x85, 0x3d, + 0x74, 0xa8, 0x35, 0x3d, 0x93, 0x54, 0x76, 0xbd, 0xa3, 0xb8, 0x8c, 0xbd, 0xf3, + 0x38, 0x8d, 0xbd, 0x45, 0x41, 0x8d, 0xbd, 0xb0, 0x35, 0x2c, 0x3d, 0x79, 0x2f, + 0x91, 0x3d, 0x1c, 0xa0, 0xde, 0xbc, 0x26, 0xd7, 0x53, 0xbd, 0xec, 0x6e, 0x11, + 0x3d, 0x1c, 0x44, 0x8f, 0x3c, 0x2b, 0x97, 0x2b, 0xbd, 0x78, 0x4e, 0x62, 0xbc, + 0x4a, 0x20, 0xe3, 0xbc, 0x2e, 0x7e, 0xd5, 0xbc, 0x34, 0xe0, 0xcc, 0xbc, 0x00, + 0xd9, 0x05, 0x3d, 0x6e, 0xe3, 0xd8, 0xbc, 0x32, 0x01, 0x51, 0x3d, 0x57, 0x4a, + 0x83, 0x3d, 0x98, 0x90, 0x4c, 0xbd, 0x0d, 0x8e, 0x8b, 0x3d, 0x76, 0x2c, 0x32, + 0x3d, 0x6a, 0x76, 0x91, 0xbd, 0xc8, 0xf9, 0x85, 0x3c, 0x40, 0x2b, 0x80, 0x3a, + 0xe0, 0x00, 0xe3, 0xbb, 0x00, 0x06, 0x79, 0xb9, 0x27, 0xbd, 0x8f, 0x3d, 0xce, + 0x76, 0x2c, 0x3d, 0x56, 0x63, 0xd7, 0xbc, 0x30, 0x52, 0xf0, 0xbb, 0x69, 0x1f, + 0x85, 0xbd, 0x7e, 0xdb, 0x64, 0xbd, 0x85, 0xd6, 0x87, 0x3d, 0x92, 0xc0, 0x70, + 0x3d, 0x4c, 0x7a, 0x78, 0xbc, 0x6c, 0x7d, 0x2b, 0xbd, 0x6f, 0x2b, 0x85, 0x3d, + 0x98, 0x48, 0x39, 0xbd, 0x8c, 0x9d, 0xce, 0x3c, 0x08, 0xf9, 0x5c, 0xbc, 0xe8, + 0x5a, 0xcd, 0x3c, 0x88, 0xb0, 0x3c, 0x3d, 0xf8, 0x88, 0x4e, 0xbd, 0x30, 0x8f, + 0x38, 0x3c, 0xba, 0xa1, 0xc9, 0xbc, 0xba, 0xdc, 0x6d, 0x3d, 0xc0, 0x39, 0x5a, + 0xbb, 0xa6, 0x2d, 0x1d, 0x3d, 0x04, 0xde, 0xe4, 0x3c, 0x24, 0x67, 0x4f, 0xbd, + 0xde, 0xc0, 0x7c, 0x3d, 0x31, 0x68, 0x09, 0xbd, 0x01, 0x59, 0x80, 0xbd, 0x13, + 0x09, 0x91, 0x3d, 0xc8, 0xdd, 0x18, 0x3d, 0x2b, 0x88, 0x91, 0x3d, 0x50, 0xef, + 0x80, 0x3c, 0xec, 0x4a, 0x65, 0xbc, 0xb0, 0xca, 0x0a, 0x3d, 0x48, 0x1f, 0x29, + 0xbd, 0x56, 0xe9, 0x3a, 0x3d, 0xd0, 0x9c, 0x67, 0xbc, 0xe0, 0x47, 0xdb, 0xbc, + 0xd8, 0x70, 0x4a, 0xbd, 0x86, 0x63, 0x39, 0xbd, 0xfb, 0x2a, 0x10, 0xbd, 0xbc, + 0xfb, 0x42, 0xbd, 0xdc, 0x59, 0xe4, 0xbc, 0x2e, 0x08, 0x5f, 0xbd, 0x34, 0xb6, + 0xe1, 0x3c, 0x76, 0x68, 0x22, 0x3d, 0x18, 0x3d, 0x14, 0x3c, 0xa5, 0xa2, 0x8b, + 0xbd, 0x9c, 0x97, 0x87, 0xbd, 0xbd, 0x22, 0x87, 0x3d, 0x20, 0x18, 0x57, 0x3c, + 0xb6, 
0x45, 0x5e, 0x3d, 0xa4, 0x1e, 0x63, 0xbd, 0x88, 0x1f, 0x68, 0x3c, 0xe0, + 0x00, 0x4f, 0x3d, 0x34, 0xe0, 0x5a, 0xbc, 0xd4, 0xd3, 0x61, 0xbc, 0x40, 0x8f, + 0x14, 0xbb, 0xae, 0x4e, 0x94, 0xbc, 0x8d, 0x80, 0x61, 0xbd, 0x11, 0xcc, 0x85, + 0x3d, 0xb4, 0x7b, 0x24, 0xbd, 0x3e, 0x81, 0x15, 0x3d, 0xaa, 0xe5, 0x85, 0xbd, + 0xa0, 0xa4, 0x2c, 0xbb, 0x02, 0x5e, 0x25, 0x3d, 0x5d, 0x8b, 0x37, 0xbd, 0xa1, + 0xb0, 0x25, 0xbd, 0x4a, 0xa5, 0x6b, 0x3d, 0xd3, 0x4a, 0x92, 0x3d, 0x40, 0x57, + 0x06, 0x3d, 0x20, 0xdd, 0x30, 0x3b, 0xb0, 0x9e, 0xd3, 0x3c, 0x62, 0xb5, 0xd8, + 0xbc, 0xa0, 0xec, 0x93, 0xbb, 0x20, 0xc4, 0x7a, 0x3b, 0xc0, 0x64, 0xfe, 0x3b, + 0xcb, 0xb4, 0x90, 0x3d, 0x3f, 0x87, 0x8c, 0x3d, 0xfa, 0x94, 0x21, 0x3d, 0x9c, + 0xc3, 0x03, 0x3d, 0xc2, 0x4f, 0x8d, 0xbc, 0x22, 0x1e, 0xd2, 0xbc, 0xa0, 0xd5, + 0x66, 0xbc, 0xba, 0xf8, 0xcd, 0xbc, 0x7f, 0x26, 0x60, 0xbd, 0x6c, 0x27, 0x90, + 0x3c, 0xf4, 0xd5, 0x85, 0x3c, 0xc0, 0x88, 0x3c, 0xbb, 0x8e, 0x17, 0x9d, 0xbc, + 0x34, 0xb8, 0xef, 0x3c, 0x78, 0x16, 0xbd, 0x3c, 0x41, 0x5e, 0x90, 0xbd, 0x3e, + 0x1c, 0x40, 0x3d, 0xeb, 0xf2, 0x8c, 0x3d, 0xd4, 0xb2, 0xa8, 0xbc, 0x0a, 0xae, + 0x29, 0x3d, 0x40, 0x78, 0x1c, 0xbb, 0x60, 0xfb, 0xd1, 0x3c, 0x9d, 0xd0, 0x84, + 0x3d, 0x8a, 0xcc, 0x08, 0x3d, 0x72, 0x4d, 0x41, 0x3d, 0xa9, 0x49, 0x50, 0xbd, + 0x92, 0x44, 0x1c, 0x3d, 0xc8, 0x15, 0x5f, 0xbd, 0x1a, 0xda, 0xb6, 0xbc, 0xb4, + 0x03, 0xd1, 0x3c, 0xdc, 0x8e, 0xb0, 0x3c, 0x88, 0x61, 0x7a, 0xbc, 0xb0, 0xab, + 0xc4, 0xbb, 0xa2, 0x9f, 0x35, 0xbd, 0xac, 0xc1, 0x1e, 0xbd, 0x78, 0xd0, 0x54, + 0x3d, 0x22, 0x03, 0xa9, 0xbc, 0x00, 0x71, 0x30, 0xbb, 0x30, 0xaa, 0xc8, 0x3b, + 0xa9, 0x9c, 0x35, 0xbd, 0x00, 0xb3, 0x09, 0xbb, 0x40, 0x51, 0x2e, 0x3c, 0xc8, + 0xb4, 0x23, 0x3c, 0x6d, 0xf4, 0x06, 0xbd, 0xaa, 0x77, 0x6f, 0x3d, 0xce, 0xc4, + 0xb1, 0xbc, 0x6f, 0x91, 0x8b, 0x3d, 0x5f, 0xc4, 0x8a, 0x3d, 0xe4, 0x1f, 0xac, + 0x3c, 0x4c, 0xc1, 0x89, 0x3c, 0x4c, 0x09, 0x5d, 0xbd, 0x38, 0x91, 0x3e, 0x3c, + 0xe0, 0x15, 0x30, 0xbd, 0x60, 0x09, 0xd2, 0x3c, 0xe0, 0x4f, 0x35, 0xbb, 0xe8, + 0xf2, 0xdf, 0xbc, 0x40, 0xa5, 0xcc, 0xba, 0x28, 0xaa, 0x04, 0xbc, 0xb4, 0x3b, + 0x3d, 0xbc, 0xa8, 0xbc, 0x9d, 0x3c, 0x22, 0x77, 0x51, 0x3d, 0xd3, 0x53, 0x48, + 0xbd, 0x80, 0x2a, 0x2c, 0x3b, 0x4e, 0x95, 0x79, 0x3d, 0x9c, 0x2c, 0x52, 0xbd, + 0xac, 0x7e, 0xd9, 0x3c, 0x76, 0xd7, 0x78, 0x3d, 0x00, 0xe8, 0x78, 0xbd, 0x2e, + 0x63, 0x0f, 0x3d, 0xeb, 0x59, 0x14, 0xbd, 0x84, 0xd4, 0x1c, 0xbc, 0x1d, 0x54, + 0x1a, 0xbd, 0xe0, 0x16, 0x5c, 0xbb, 0x5c, 0xf1, 0x48, 0x3d, 0x94, 0x95, 0x59, + 0xbc, 0x48, 0x14, 0x37, 0xbd, 0x3e, 0x60, 0x76, 0x3d, 0xb4, 0x88, 0xdb, 0x3c, + 0x24, 0xf3, 0x8b, 0xbc, 0xb8, 0x6e, 0x0f, 0x3d, 0x00, 0x2c, 0xda, 0x3a, 0x79, + 0x80, 0x88, 0x3d, 0x58, 0xf7, 0x26, 0x3c, 0x10, 0x19, 0x45, 0x3d, 0xf9, 0xba, + 0x6a, 0xbd, 0x0e, 0x30, 0x43, 0x3d, 0xe0, 0x09, 0x68, 0x3b, 0x51, 0x84, 0x8f, + 0xbd, 0x6a, 0xa1, 0x7a, 0xbd, 0xbc, 0x1c, 0x72, 0xbd, 0x94, 0xf7, 0x75, 0xbd, + 0xc8, 0x32, 0x69, 0xbd, 0xf5, 0x29, 0x1e, 0xbd, 0x00, 0xe7, 0x59, 0x3a, 0x90, + 0x9c, 0x84, 0xbd, 0x5c, 0x5f, 0x2f, 0xbd, 0x50, 0x8c, 0x95, 0xbb, 0x00, 0x13, + 0x85, 0xbd, 0x26, 0xab, 0x7f, 0xbd, 0xc8, 0x91, 0x2a, 0xbc, 0x34, 0xda, 0xd2, + 0xbc, 0x2c, 0xb7, 0x4b, 0x3d, 0x73, 0xe4, 0x2b, 0xbd, 0x48, 0x46, 0x8f, 0xbd, + 0x0c, 0xa7, 0x36, 0xbd, 0x58, 0x23, 0x9f, 0x3c, 0xec, 0x5b, 0x2e, 0x3d, 0x28, + 0xde, 0x34, 0xbd, 0x00, 0xd5, 0x8e, 0x3b, 0x76, 0xa2, 0x76, 0x3d, 0x64, 0xe8, + 0x4d, 0x3d, 0x47, 0xc2, 0x82, 0xbd, 0x90, 0x0c, 0x8b, 0xbd, 0x9c, 0x98, 0x1a, + 0x3d, 0x74, 0xd4, 0xd1, 0xbc, 0xd6, 0x3b, 0x78, 0x3d, 0x88, 0xad, 0x04, 0xbd, + 0x5c, 0x4e, 0xbf, 0x3c, 0x20, 0xd8, 
0x5b, 0x3c, 0x68, 0x77, 0x0e, 0xbc, 0xc0, + 0x8a, 0xc8, 0x3b, 0x00, 0x68, 0x5d, 0xba, 0x4c, 0x05, 0x30, 0x3d, 0x20, 0xb7, + 0x56, 0x3d, 0xa0, 0x6e, 0xef, 0x3c, 0xb4, 0x50, 0x1c, 0x3d, 0x5c, 0x0f, 0x68, + 0xbd, 0xf7, 0x3c, 0x53, 0xbd, 0x96, 0xa5, 0x0c, 0x3d, 0x3a, 0x6c, 0x07, 0x3d, + 0xa0, 0x60, 0x2c, 0xbd, 0x20, 0xaf, 0xbf, 0xbc, 0x00, 0x2d, 0x05, 0xbb, 0xe0, + 0x97, 0x4b, 0x3b, 0x32, 0xdc, 0x37, 0x3d, 0xe2, 0x39, 0x54, 0xbd, 0x2a, 0xde, + 0xeb, 0xbc, 0x1e, 0x8b, 0x6d, 0x3d, 0x0c, 0x92, 0xd6, 0xbc, 0xec, 0x48, 0x19, + 0xbc, 0x23, 0xd9, 0x90, 0xbd, 0x84, 0x8b, 0x83, 0xbd, 0xc8, 0x8c, 0x7c, 0x3c, + 0xfe, 0xca, 0x7d, 0xbd, 0x06, 0xb7, 0x69, 0x3d, 0x34, 0x35, 0xb0, 0x3c, 0x52, + 0x14, 0x56, 0xbd, 0xf4, 0xf3, 0x43, 0xbd, 0x34, 0x5e, 0xbf, 0xbc, 0x9c, 0x32, + 0x1e, 0x3d, 0xa0, 0x4d, 0xe0, 0x3b, 0x00, 0x68, 0x5d, 0xb8, 0x9e, 0x47, 0x7b, + 0x3d, 0xe1, 0xcd, 0x8b, 0x3d, 0xb8, 0x10, 0x8f, 0xbc, 0xc8, 0x30, 0x28, 0x3c, + 0xec, 0x42, 0x28, 0x3d, 0xfe, 0xea, 0x8a, 0xbd, 0x36, 0x76, 0x1a, 0xbd, 0xfa, + 0x9c, 0xca, 0xbc, 0x10, 0xe9, 0x82, 0xbd, 0x72, 0x8b, 0x7b, 0x3d, 0x46, 0x75, + 0x1c, 0xbd, 0x5a, 0xb9, 0x06, 0xbd, 0x6c, 0xa7, 0x25, 0xbc, 0x6a, 0x37, 0xd3, + 0xbc, 0xbc, 0x78, 0x85, 0x3c, 0x98, 0xb7, 0x01, 0x3d, 0x3c, 0xb7, 0x0d, 0x3d, + 0x3c, 0x57, 0x21, 0xbc, 0x28, 0xfb, 0xa7, 0x3c, 0x18, 0x3f, 0x49, 0x3c, 0x81, + 0x34, 0x8d, 0xbd, 0xb4, 0xfb, 0x6e, 0xbd, 0x60, 0x97, 0x95, 0x3c, 0xac, 0xdd, + 0x86, 0xbc, 0xd8, 0x6e, 0xda, 0x3c, 0xd8, 0xd9, 0x3d, 0x3d, 0x90, 0xa6, 0xea, + 0x3c, 0x40, 0x67, 0x3f, 0x3d, 0x3a, 0x43, 0x69, 0x3d, 0x0a, 0x20, 0x5e, 0x3d, + 0x33, 0x91, 0x12, 0xbd, 0xb4, 0xc5, 0x31, 0xbd, 0x0e, 0x96, 0x45, 0x3d, 0xc6, + 0x22, 0x37, 0xbd, 0x7c, 0x12, 0x44, 0x3d, 0xc9, 0x61, 0x8a, 0x3d, 0x1c, 0x66, + 0x44, 0x3d, 0xa2, 0x51, 0x30, 0x3d, 0xc8, 0xdb, 0xd9, 0x3c, 0xd3, 0xfb, 0x8e, + 0xbd, 0x08, 0x6a, 0x91, 0xbd, 0xea, 0x2e, 0x48, 0xbd, 0x60, 0x5b, 0x22, 0xbb, + 0x06, 0x39, 0x53, 0x3d, 0x84, 0xb4, 0x0b, 0xbd, 0xa0, 0x77, 0xfa, 0x3b, 0x84, + 0xaf, 0xaa, 0x3c, 0x47, 0xd2, 0x86, 0xbd, 0xe3, 0xef, 0x43, 0xbd, 0x36, 0x8d, + 0x16, 0x3d, 0x85, 0xa6, 0x85, 0x3d, 0x8e, 0xda, 0xa0, 0xbc, 0xc3, 0x58, 0x80, + 0xbd, 0x93, 0x30, 0x0f, 0xbd, 0x0c, 0x85, 0xcf, 0xbc, 0xc0, 0x8c, 0x2a, 0x3c, + 0x02, 0xe2, 0x0d, 0xbd, 0xe9, 0xf8, 0x8c, 0xbd, 0x15, 0x8d, 0x8b, 0x3d, 0xf3, + 0x1f, 0x8b, 0xbd, 0x0f, 0xa0, 0x80, 0xbd, 0xee, 0x04, 0x63, 0x3d, 0xb4, 0x7a, + 0xf6, 0xbc, 0x60, 0x5b, 0x2e, 0xbc, 0x04, 0x6d, 0x42, 0x3d, 0x8a, 0xfc, 0x1c, + 0x3d, 0x52, 0xb0, 0x27, 0x3d, 0xe8, 0xf9, 0x35, 0xbd, 0xd4, 0xc2, 0x1b, 0x3d, + 0x00, 0x3a, 0x0b, 0xbb, 0x80, 0x7e, 0x4b, 0x3c, 0x06, 0xba, 0x3e, 0xbd, 0x70, + 0xc9, 0x35, 0xbd, 0xe0, 0x8b, 0x9d, 0xbb, 0x16, 0x05, 0x2f, 0xbd, 0xa0, 0xeb, + 0x03, 0x3c, 0x40, 0x3e, 0x95, 0xbc, 0xea, 0x76, 0x73, 0xbd, 0x90, 0xb0, 0xe8, + 0x3c, 0x3e, 0x61, 0x42, 0xbd, 0x17, 0x02, 0x8d, 0xbd, 0x42, 0x66, 0x1d, 0x3d, + 0xfe, 0x31, 0x68, 0x3d, 0x52, 0x8e, 0x30, 0xbd, 0x6b, 0xca, 0x10, 0xbd, 0xbd, + 0xcc, 0x80, 0xbd, 0x38, 0x91, 0x53, 0xbd, 0x90, 0xd7, 0xd3, 0x3c, 0x00, 0x0c, + 0xf4, 0x3b, 0x82, 0xf5, 0x3f, 0xbd, 0xb2, 0xa9, 0x04, 0x3d, 0x62, 0x67, 0x5c, + 0x3d, 0x86, 0xab, 0x91, 0xbc, 0xc2, 0x2b, 0xe8, 0xbc, 0x3a, 0x8a, 0x67, 0xbd, + 0xcc, 0x83, 0xdb, 0x3c, 0xf0, 0x8a, 0x03, 0x3c, 0x94, 0x78, 0x53, 0x3d, 0x9c, + 0x1b, 0xd4, 0x3c, 0xdb, 0xf9, 0x89, 0x3d, 0x40, 0xa5, 0x10, 0x3b, 0x89, 0xed, + 0x80, 0xbd, 0x6e, 0xb8, 0x57, 0xbd, 0x12, 0xc2, 0xcf, 0xbc, 0x44, 0x32, 0xb1, + 0x3c, 0xd5, 0xed, 0x34, 0xbd, 0x5e, 0x6c, 0x5c, 0xbd, 0x68, 0x69, 0x85, 0x3c, + 0x30, 0xdb, 0xb6, 0xbb, 0x00, 0x7f, 0xe0, 0x3c, 0x80, 0x24, 0x1e, 
0x3b, 0x78, + 0x6f, 0x81, 0xbc, 0x3a, 0x27, 0x1b, 0x3d, 0x7f, 0xb5, 0x8a, 0xbd, 0xbb, 0xc1, + 0x8e, 0x3d, 0xa8, 0x7e, 0x69, 0x3c, 0x00, 0x80, 0x47, 0xbb, 0x21, 0xb9, 0x15, + 0xbd, 0x14, 0x0b, 0x8e, 0x3c, 0xa2, 0x1b, 0x55, 0x3d, 0x28, 0xea, 0x5b, 0xbd, + 0x10, 0x9a, 0x43, 0x3d, 0x40, 0xf6, 0x8a, 0x3a, 0x58, 0xb1, 0x92, 0xbc, 0x5c, + 0x0a, 0x4e, 0xbd, 0x10, 0xec, 0x1f, 0xbd, 0xa8, 0x31, 0xa7, 0x3c, 0x60, 0xfa, + 0x9f, 0xbb, 0xf0, 0x04, 0xa3, 0xbb, 0xc4, 0xd8, 0x5f, 0xbd, 0xba, 0x5f, 0x66, + 0xbd, 0x52, 0x94, 0x97, 0xbc, 0x1a, 0x9b, 0x22, 0xbd, 0xaa, 0x28, 0x59, 0x3d, + 0xaa, 0x06, 0x64, 0xbd, 0xe7, 0xc2, 0x83, 0xbd, 0xd0, 0x3d, 0xd0, 0xbc, 0x00, + 0x8c, 0xa3, 0x39, 0xd0, 0x27, 0x0c, 0xbc, 0x40, 0x8f, 0x79, 0xbc, 0x9e, 0x32, + 0x7f, 0x3d, 0xac, 0x9b, 0xfd, 0xbc, 0xb1, 0x17, 0x91, 0x3d, 0xa8, 0xca, 0x4e, + 0x3d, 0x40, 0xc3, 0xb7, 0x3a, 0xc0, 0x8e, 0x78, 0xbb, 0x3f, 0x3c, 0x83, 0x3d, + 0x47, 0xdc, 0x81, 0xbd, 0x5b, 0xe6, 0x1c, 0xbd, 0x70, 0xe3, 0xc8, 0xbc, 0x70, + 0x12, 0xd6, 0xbb, 0x0c, 0xb6, 0xe3, 0x3c, 0x88, 0x2a, 0x22, 0x3c, 0xd6, 0xbf, + 0x8d, 0xbd, 0xde, 0x15, 0x20, 0x3d, 0x76, 0x83, 0x3e, 0xbd, 0x85, 0x35, 0x80, + 0x3d, 0xc1, 0x0b, 0x87, 0x3d, 0xbf, 0x64, 0x18, 0xbd, 0x80, 0x22, 0x68, 0x3b, + 0xc4, 0xb0, 0xb0, 0x3c, 0xa2, 0xf2, 0x4f, 0xbd, 0xb6, 0x63, 0x04, 0x3d, 0xc0, + 0x4a, 0xc9, 0x3c, 0x36, 0x66, 0xc0, 0xbc, 0x64, 0x7a, 0x4c, 0x3d, 0xc1, 0x5b, + 0x8c, 0x3d, 0xae, 0xa2, 0x41, 0x3d, 0x66, 0x93, 0x01, 0x3d, 0x6c, 0xb7, 0x37, + 0xbd, 0x8c, 0x03, 0x28, 0xbd, 0x7c, 0xf6, 0x69, 0xbd, 0xa2, 0xe7, 0x0d, 0xbd, + 0xb0, 0xf3, 0x41, 0x3d, 0xc0, 0xbf, 0xc4, 0x3b, 0xe2, 0x58, 0x46, 0xbd, 0x02, + 0xb4, 0x60, 0x3d, 0xa2, 0xf8, 0x29, 0x3d, 0x90, 0xf7, 0xc8, 0x3b, 0xee, 0xad, + 0x43, 0x3d, 0x1b, 0x51, 0x12, 0xbd, 0xee, 0xc3, 0x91, 0xbd, 0x20, 0xad, 0x58, + 0x3c, 0xc6, 0x54, 0x3a, 0x3d, 0xea, 0xba, 0x60, 0xbd, 0x7e, 0x31, 0x22, 0x3d, + 0x98, 0xe6, 0x80, 0xbd, 0x00, 0x41, 0x29, 0x3b, 0x85, 0xec, 0x8c, 0x3d, 0x7a, + 0x8e, 0x3e, 0x3d, 0x42, 0x31, 0xfc, 0xbc, 0x58, 0x3c, 0x08, 0x3c, 0xdc, 0x04, + 0xb5, 0xbc, 0x9e, 0xbf, 0x0f, 0xbd, 0x70, 0xad, 0x2a, 0xbc, 0x6c, 0x83, 0x8c, + 0xbc, 0x6a, 0xd4, 0x6c, 0xbd, 0x62, 0x1b, 0x8e, 0xbc, 0x94, 0x48, 0x1f, 0xbd, + 0x35, 0xe0, 0x3d, 0xbd, 0x60, 0x91, 0x88, 0x3b, 0x6c, 0x16, 0x07, 0x3d, 0x30, + 0xa0, 0x93, 0x3b, 0x3c, 0xec, 0x5e, 0xbc, 0x66, 0xbf, 0x51, 0xbd, 0xfc, 0x42, + 0x47, 0x3d, 0x78, 0x73, 0x71, 0x3c, 0x62, 0x96, 0x89, 0xbd, 0x50, 0x2b, 0xca, + 0x3c, 0x98, 0xc5, 0x21, 0x3c, 0xbb, 0x4b, 0x19, 0xbd, 0x36, 0x22, 0x75, 0x3d, + 0x44, 0x6e, 0x7d, 0xbd, 0xec, 0x88, 0x8d, 0x3c, 0xa8, 0x57, 0x0e, 0x3c, 0x96, + 0x97, 0x01, 0x3d, 0x1c, 0x9c, 0x59, 0x3d, 0xc4, 0x0b, 0x31, 0x3d, 0x60, 0xf0, + 0x6c, 0xbc, 0xb8, 0xa9, 0xb4, 0x3c, 0xd8, 0xbb, 0x33, 0xbc, 0x98, 0x35, 0x99, + 0x3c, 0xd2, 0x49, 0x3d, 0xbd, 0xe6, 0xc9, 0x5b, 0x3d, 0x42, 0xf7, 0x41, 0x3d, + 0xda, 0x13, 0x37, 0xbd, 0x96, 0x91, 0x94, 0xbc, 0xb8, 0xde, 0x89, 0x3c, 0xda, + 0x37, 0x08, 0xbd, 0x20, 0xda, 0x3e, 0x3c, 0xda, 0xe8, 0x61, 0xbd, 0x70, 0x8a, + 0x29, 0x3d, 0x18, 0xa4, 0x8f, 0xbd, 0x20, 0xee, 0x56, 0x3c, 0x70, 0xc3, 0xc8, + 0xbc, 0x5c, 0xf4, 0x99, 0x3c, 0x54, 0xd5, 0x4b, 0xbd, 0x88, 0xcf, 0x6a, 0x3c, + 0xa5, 0xc7, 0x1c, 0xbd, 0x10, 0x98, 0xb3, 0xbb, 0x9a, 0xe0, 0x86, 0xbd, 0x3e, + 0x34, 0x87, 0xbd, 0xfa, 0x36, 0x7d, 0x3d, 0x40, 0x64, 0xfe, 0xbc, 0xd0, 0x4f, + 0x67, 0xbd, 0x21, 0xda, 0x72, 0xbd, 0x2e, 0x02, 0x38, 0xbd, 0xc6, 0xd9, 0xff, + 0xbc, 0x1a, 0x30, 0xb9, 0xbc, 0x58, 0xea, 0x58, 0x3c, 0xb1, 0xb7, 0x03, 0xbd, + 0x80, 0x5b, 0xfc, 0x3a, 0x43, 0x60, 0x80, 0x3d, 0xa8, 0x67, 0x4a, 0xbd, 0x68, + 0xd8, 0x3e, 0x3c, 
0xf0, 0xe8, 0x2a, 0x3c, 0x68, 0x26, 0x3f, 0xbd, 0x28, 0x26, + 0x73, 0xbd, 0x38, 0xe5, 0x24, 0x3d, 0x00, 0xb0, 0xa1, 0xba, 0x7e, 0x0f, 0x18, + 0xbd, 0x35, 0x0d, 0x7c, 0xbd, 0x14, 0xa7, 0x3f, 0x3d, 0x16, 0x49, 0x0e, 0x3d, + 0x2e, 0xd8, 0x90, 0xbd, 0x50, 0xc3, 0x21, 0xbd, 0xd4, 0x13, 0x44, 0x3d, 0x70, + 0x10, 0xfd, 0x3b, 0x7b, 0x43, 0x87, 0x3d, 0x64, 0xb7, 0xf9, 0x3c, 0xd6, 0xc6, + 0xb7, 0xbc, 0x00, 0xd8, 0xbb, 0x3b, 0xe0, 0x1b, 0x42, 0xbb, 0x68, 0x5c, 0xcf, + 0xbc, 0xea, 0xfb, 0x8e, 0xbd, 0xdc, 0x09, 0x33, 0x3d, 0x80, 0xef, 0xb9, 0x3c, + 0x00, 0xde, 0x92, 0xb9, 0x31, 0x42, 0x08, 0xbd, 0x80, 0x6d, 0x40, 0x3b, 0x80, + 0xab, 0x20, 0x3d, 0xc0, 0x60, 0xc3, 0xba, 0x0b, 0xb6, 0x5e, 0xbd, 0xd4, 0x28, + 0x3e, 0xbd, 0x47, 0x7b, 0x87, 0x3d, 0x81, 0x52, 0x84, 0x3d, 0x90, 0x8e, 0xc2, + 0x3c, 0x04, 0x5b, 0xf3, 0xbc, 0x70, 0xa9, 0xea, 0x3c, 0x55, 0x55, 0x4d, 0xbd, + 0x52, 0x8b, 0x59, 0xbd, 0xf2, 0xeb, 0x56, 0x3d, 0x1e, 0xc7, 0x3f, 0x3d, 0xe0, + 0x52, 0xa3, 0x3b, 0x16, 0x93, 0x9d, 0xbc, 0x28, 0xeb, 0x36, 0x3d, 0x70, 0x4c, + 0x1d, 0x3d, 0x8d, 0x81, 0x14, 0xbd, 0xb0, 0x22, 0xa0, 0xbb, 0x50, 0xfa, 0x87, + 0x3c, 0x33, 0xc6, 0x2d, 0xbd, 0xd3, 0xd8, 0x85, 0x3d, 0xe8, 0xfd, 0x15, 0x3c, + 0x20, 0x79, 0xe4, 0x3b, 0xb0, 0xd4, 0x4f, 0xbd, 0x24, 0xe9, 0xb5, 0x3c, 0xba, + 0x47, 0x27, 0x3d, 0x23, 0xef, 0x02, 0xbd, 0xf0, 0xac, 0x31, 0x3d, 0x62, 0xde, + 0xdd, 0xbc, 0x2c, 0xa0, 0x29, 0x3d, 0xa5, 0xec, 0x85, 0x3d, 0xa9, 0x1b, 0x8d, + 0x3d, 0x2c, 0x6c, 0xa2, 0xbc, 0xf0, 0xc7, 0x37, 0xbc, 0x6c, 0xf7, 0xc5, 0xbc, + 0xf4, 0x1d, 0x1c, 0xbc, 0x20, 0x3c, 0xc9, 0x3b, 0x9d, 0xff, 0x0b, 0xbd, 0x10, + 0xa3, 0x53, 0x3d, 0x64, 0xbb, 0xc9, 0xbc, 0xfc, 0x8d, 0xe8, 0xbc, 0x20, 0x1f, + 0x5a, 0x3c, 0x11, 0xe2, 0x17, 0xbd, 0xe0, 0x37, 0x97, 0x3b, 0x88, 0x44, 0x2a, + 0xbd, 0x88, 0x79, 0x4c, 0xbd, 0xa8, 0x9e, 0x0d, 0x3c, 0x15, 0x54, 0x8c, 0x3d, + 0xcb, 0x9b, 0x87, 0x3d, 0x18, 0xdd, 0x07, 0xbd, 0x2b, 0x33, 0x81, 0xbd, 0xb2, + 0x57, 0x2e, 0xbd, 0x18, 0xc5, 0x2b, 0xbd, 0x88, 0x10, 0x91, 0xbd, 0x66, 0x69, + 0x15, 0x3d, 0x98, 0x6c, 0xf7, 0x3c, 0x10, 0x05, 0x07, 0xbc, 0x44, 0x3b, 0xc6, + 0xbc, 0x30, 0x43, 0xa8, 0x3b, 0x5b, 0xd8, 0x38, 0xbd, 0x66, 0x01, 0xe8, 0xbc, + 0x36, 0xef, 0xaf, 0xbc, 0x88, 0x76, 0x24, 0x3c, 0x3a, 0x71, 0x5d, 0x3d, 0x30, + 0xa0, 0x38, 0xbc, 0x04, 0x86, 0xf5, 0xbc, 0x30, 0xdc, 0x7c, 0x3c, 0x0c, 0x37, + 0x2f, 0xbd, 0x80, 0xa4, 0x1f, 0xba, 0x2c, 0xa1, 0x2f, 0xbd, 0xb0, 0xb7, 0xa0, + 0x3c, 0x37, 0xb1, 0x14, 0xbd, 0xb6, 0x07, 0x54, 0xbd, 0xb0, 0xbf, 0xd7, 0xbc, + 0x6c, 0xc8, 0x2c, 0x3d, 0x2c, 0x09, 0x31, 0x3d, 0x04, 0x69, 0xe4, 0xbc, 0xa0, + 0x5e, 0x7a, 0xbb, 0x90, 0x52, 0xb3, 0x3c, 0x4e, 0x6b, 0x84, 0xbd, 0xcc, 0x7e, + 0x25, 0x3d, 0x30, 0x08, 0x99, 0xbb, 0x00, 0x08, 0xfc, 0x3b, 0xaa, 0xf0, 0x66, + 0x3d, 0x13, 0xa5, 0x8a, 0x3d, 0xc8, 0x1c, 0xad, 0xbc, 0xf1, 0x48, 0x82, 0x3d, + 0x7d, 0x18, 0x80, 0xbd, 0x14, 0x52, 0xa6, 0x3c, 0x10, 0x21, 0x9c, 0xbb, 0xfc, + 0xda, 0x31, 0xbc, 0x0e, 0x65, 0xd2, 0xbc, 0x74, 0x2a, 0xcd, 0xbc, 0xb6, 0xb6, + 0x64, 0x3d, 0x24, 0x32, 0x55, 0x3d, 0x8e, 0xc7, 0xbc, 0xbc, 0x94, 0x15, 0x89, + 0x3c, 0x72, 0x1e, 0x3b, 0x3d, 0xb0, 0x0e, 0x25, 0x3c, 0xf8, 0x00, 0xad, 0x3c, + 0xc1, 0xb3, 0x92, 0xbd, 0xce, 0xcf, 0x33, 0x3d, 0xe8, 0xec, 0x6a, 0x3c, 0x9e, + 0x76, 0x9c, 0xbc, 0x4e, 0x5f, 0x29, 0xbd, 0x7c, 0xa7, 0x88, 0x3c, 0x00, 0xf3, + 0xbf, 0x3c, 0x10, 0x12, 0x26, 0x3c, 0xf4, 0x7c, 0x4b, 0x3d, 0x90, 0x83, 0xec, + 0xbb, 0xb6, 0x48, 0x92, 0xbd, 0x5c, 0x63, 0x47, 0x3d, 0x3f, 0xb2, 0x71, 0xbd, + 0x60, 0x1f, 0x7e, 0xbc, 0xbc, 0xff, 0x9a, 0xbc, 0x96, 0x17, 0xb2, 0xbc, 0x78, + 0x09, 0x0a, 0x3c, 0xa5, 0xbb, 0x8d, 0x3d, 0x80, 
0x7e, 0xbd, 0x3a, 0x8c, 0x61, + 0x8f, 0xbd, 0x70, 0x44, 0x19, 0x3d, 0xde, 0x63, 0x4b, 0x3d, 0x00, 0x61, 0x0b, + 0xbb, 0x36, 0x70, 0x32, 0xbd, 0xc6, 0x8f, 0x71, 0x3d, 0xf0, 0xf7, 0xa0, 0xbc, + 0x00, 0x80, 0x01, 0xb8, 0xe4, 0xc6, 0x93, 0x3c, 0x08, 0xd4, 0x3b, 0x3c, 0x96, + 0x32, 0x40, 0x3d, 0xb8, 0x22, 0x31, 0x3d, 0x4a, 0xd9, 0x6f, 0x3d, 0x28, 0x10, + 0x2c, 0xbc, 0x94, 0x4b, 0x9c, 0xbc, 0x90, 0x38, 0x57, 0x3d, 0xa4, 0x0d, 0x81, + 0xbc, 0x90, 0xa5, 0xb6, 0x3c, 0x9d, 0xfe, 0x78, 0xbd, 0x3c, 0x24, 0x19, 0x3d, + 0xa8, 0x56, 0x0c, 0x3d, 0x6b, 0xec, 0x54, 0xbd, 0x10, 0x49, 0x94, 0xbb, 0x80, + 0x25, 0xe9, 0x3c, 0xe4, 0xb5, 0xe2, 0xbc, 0x68, 0xb2, 0x10, 0x3d, 0x6a, 0x13, + 0xe0, 0xbc, 0x3a, 0x69, 0x44, 0xbd, 0x18, 0x3f, 0xfc, 0x3c, 0x6e, 0x08, 0x60, + 0x3d, 0x5e, 0x5b, 0xa2, 0xbc, 0x7c, 0xbd, 0x81, 0xbd, 0xf0, 0xf9, 0xd6, 0x3b, + 0xfa, 0x80, 0x14, 0xbd, 0xdb, 0xb0, 0x8d, 0xbd, 0xb0, 0x41, 0xe5, 0x3b, 0xe0, + 0x03, 0xe3, 0x3c, 0xf4, 0x88, 0x07, 0xbd, 0x52, 0x89, 0xd0, 0xbc, 0x90, 0x90, + 0x10, 0x3d, 0x9c, 0xc3, 0x3e, 0x3d, 0x2f, 0x07, 0x09, 0xbd, 0x7e, 0x67, 0xf6, + 0xbc, 0xde, 0x88, 0xe1, 0xbc, 0xbe, 0x4b, 0x08, 0xbd, 0xac, 0xc1, 0x24, 0x3d, + 0x5e, 0xd5, 0x3c, 0x3d, 0x80, 0x9e, 0x01, 0xbc, 0xa6, 0xdb, 0xc7, 0xbc, 0xbb, + 0x37, 0x83, 0xbd, 0x34, 0x71, 0x50, 0x3d, 0x10, 0x46, 0x2d, 0xbd, 0x71, 0x50, + 0x67, 0xbd, 0x20, 0x2e, 0x15, 0xbb, 0xaa, 0x05, 0x74, 0x3d, 0xc1, 0xb5, 0x79, + 0xbd, 0x21, 0xaa, 0x44, 0xbd, 0xda, 0xbd, 0x0c, 0xbd, 0xb1, 0xee, 0x8c, 0x3d, + 0x54, 0x83, 0x83, 0xbd, 0x5e, 0xe5, 0x75, 0x3d, 0x52, 0x3d, 0x73, 0x3d, 0x40, + 0xf3, 0xd4, 0x3c, 0x9a, 0x1a, 0x78, 0x3d, 0x85, 0x49, 0x62, 0xbd, 0x6b, 0x57, + 0x91, 0x3d, 0x30, 0xd7, 0x3f, 0x3d, 0xed, 0x16, 0x3f, 0xbd, 0xd0, 0xf4, 0x85, + 0xbb, 0x47, 0x5e, 0x1e, 0xbd, 0x70, 0xe9, 0x87, 0x3c, 0x87, 0x5d, 0x80, 0xbd, + 0xa0, 0x7a, 0xb6, 0xbb, 0x03, 0x86, 0x84, 0xbd, 0x50, 0x4c, 0x74, 0x3c, 0x85, + 0x86, 0x80, 0x3d, 0x00, 0xe2, 0x56, 0xbb, 0x7e, 0xb0, 0x16, 0xbd, 0x10, 0xa9, + 0x80, 0xbd, 0xe0, 0x8b, 0x47, 0x3d, 0x19, 0x07, 0x68, 0xbd, 0x4e, 0xd8, 0x70, + 0x3d, 0xa8, 0x10, 0x2a, 0x3d, 0x22, 0x23, 0x96, 0xbc, 0x92, 0xe3, 0x72, 0xbd, + 0xb8, 0x0f, 0x13, 0x3d, 0x16, 0xc3, 0x53, 0x3d, 0xa4, 0x95, 0x41, 0x3d, 0x02, + 0xc3, 0x6f, 0x3d, 0x48, 0x02, 0xac, 0xbc, 0x40, 0x53, 0x6d, 0x3b, 0xf4, 0x2a, + 0x19, 0xbc, 0x10, 0x1f, 0xc2, 0xbb, 0x21, 0xb8, 0x69, 0xbd, 0x97, 0x8c, 0x8a, + 0x3d, 0x38, 0x13, 0xb4, 0x3c, 0xf1, 0x0d, 0x8d, 0x3d, 0x00, 0x69, 0x30, 0x3d, + 0x38, 0x92, 0xf9, 0x3c, 0xb5, 0xff, 0x8a, 0x3d, 0x15, 0x27, 0x91, 0x3d, 0x96, + 0xd4, 0x00, 0x3d, 0x66, 0xde, 0x1c, 0x3d, 0x7c, 0x48, 0x40, 0x3d, 0x08, 0x06, + 0xf2, 0x3c, 0x8e, 0xfe, 0x71, 0x3d, 0x90, 0xa1, 0xc6, 0xbb, 0x88, 0x57, 0x05, + 0x3c, 0x80, 0x92, 0x6d, 0x3a, 0x80, 0x99, 0xc9, 0xba, 0x0f, 0x0f, 0x33, 0xbd, + 0x76, 0xfc, 0x31, 0x3d, 0xd8, 0x9f, 0x23, 0xbd, 0x8c, 0x07, 0x07, 0xbd, 0x68, + 0x38, 0x5e, 0x3c, 0xf0, 0x39, 0xbf, 0xbc, 0x6c, 0x16, 0xfc, 0x3c, 0x94, 0xf2, + 0xb4, 0xbc, 0x20, 0x52, 0xc4, 0xbb, 0xb7, 0x3f, 0x02, 0xbd, 0x78, 0x48, 0x61, + 0xbd, 0x48, 0xad, 0x6b, 0xbd, 0xcd, 0xb1, 0x8c, 0x3d, 0x20, 0x28, 0xcd, 0x3c, + 0xb4, 0x49, 0x53, 0x3d, 0x30, 0x59, 0x06, 0x3c, 0xda, 0xea, 0x83, 0xbd, 0xf8, + 0xe2, 0x16, 0xbd, 0x96, 0xc3, 0x77, 0x3d, 0x2c, 0x90, 0xf6, 0x3c, 0x94, 0x78, + 0x4d, 0xbc, 0x75, 0x0d, 0x2f, 0xbd, 0xa2, 0x00, 0xa7, 0xbc, 0x32, 0xec, 0x7c, + 0x3d, 0x6c, 0x7a, 0x5a, 0xbc, 0x7e, 0x59, 0x58, 0x3d, 0x60, 0x65, 0x91, 0x3b, + 0x28, 0x8b, 0x75, 0xbd, 0x22, 0xa7, 0x7b, 0x3d, 0xc4, 0xdd, 0x39, 0x3d, 0xe4, + 0x54, 0xa3, 0xbc, 0xb6, 0x39, 0x30, 0x3d, 0x38, 0x91, 0x35, 0x3c, 0xd0, 0xb9, + 
0x10, 0x3c, 0x4c, 0x8a, 0xab, 0x3c, 0x04, 0x8d, 0x0e, 0xbd, 0x20, 0xc2, 0xcb, + 0x3b, 0x32, 0xbe, 0x58, 0xbd, 0xec, 0x4e, 0x03, 0x3d, 0xf0, 0x59, 0xee, 0x3c, + 0x18, 0x48, 0x0d, 0xbc, 0xa0, 0xfd, 0xe6, 0xbb, 0x8c, 0x9c, 0x4b, 0x3d, 0xa8, + 0xe8, 0x13, 0x3c, 0x14, 0xb9, 0x4e, 0xbd, 0xe6, 0xbf, 0x03, 0x3d, 0xf0, 0x7a, + 0xdd, 0xbc, 0xc8, 0x1b, 0x91, 0xbc, 0x9b, 0x2a, 0x24, 0xbd, 0x98, 0x93, 0x01, + 0xbc, 0x1a, 0x0c, 0x34, 0x3d, 0xfe, 0xfa, 0xa3, 0xbc, 0x7c, 0x82, 0xbd, 0x3c, + 0x70, 0x96, 0xe8, 0x3c, 0xa6, 0x08, 0x67, 0x3d, 0x48, 0x11, 0x68, 0xbc, 0x90, + 0xfb, 0x58, 0xbd, 0x91, 0x9e, 0x8b, 0xbd, 0x4b, 0xd8, 0x87, 0xbd, 0x6a, 0x90, + 0x63, 0x3d, 0x36, 0xa5, 0x20, 0x3d, 0x30, 0x61, 0x3d, 0x3d, 0x56, 0x99, 0x11, + 0xbd, 0xce, 0xff, 0x70, 0x3d, 0xd5, 0x52, 0x3d, 0xbd, 0x44, 0x1e, 0x92, 0x3c, + 0x6e, 0xb4, 0x44, 0xbd, 0x42, 0xeb, 0xec, 0xbc, 0xa2, 0xea, 0x85, 0xbc, 0x40, + 0x48, 0x01, 0x3b, 0x52, 0xcd, 0x75, 0x3d, 0xe9, 0xa7, 0x08, 0xbd, 0x61, 0x2e, + 0x0c, 0xbd, 0x06, 0xda, 0x24, 0x3d, 0xce, 0xfc, 0xf7, 0xbc, 0x62, 0xab, 0x7d, + 0x3d, 0x2f, 0x02, 0x89, 0xbd, 0xea, 0x05, 0x48, 0xbd, 0xea, 0x7c, 0x7b, 0xbd, + 0x80, 0x05, 0x8c, 0xba, 0xba, 0x77, 0x3d, 0xbd, 0xfa, 0xee, 0x34, 0xbd, 0xd2, + 0x24, 0x28, 0x3d, 0x30, 0xb2, 0x40, 0xbd, 0x52, 0x8b, 0x18, 0x3d, 0xe3, 0xfc, + 0x8b, 0x3d, 0x58, 0x86, 0x65, 0xbc, 0x64, 0x1e, 0xa8, 0xbc, 0xba, 0xc7, 0x75, + 0x3d, 0xdb, 0xb4, 0x80, 0x3d, 0x07, 0x16, 0x67, 0xbd, 0x84, 0x95, 0x6d, 0xbc, + 0x11, 0xb3, 0x1e, 0xbd, 0x40, 0x9b, 0x56, 0xbb, 0x7e, 0x66, 0x57, 0x3d, 0xca, + 0x1c, 0x5e, 0x3d, 0x20, 0xef, 0xe5, 0x3b, 0xd3, 0x0f, 0x2e, 0xbd, 0x8a, 0xdf, + 0x81, 0xbd, 0x58, 0xc9, 0x0f, 0x3d, 0xbc, 0x54, 0x63, 0xbd, 0x60, 0x24, 0x85, + 0xbd, 0x5a, 0xa5, 0xda, 0xbc, 0x12, 0x87, 0x01, 0x3d, 0xf6, 0xc0, 0x96, 0xbc, + 0x78, 0x46, 0x1d, 0x3d, 0xb6, 0x90, 0x62, 0xbd, 0xc0, 0x43, 0x94, 0x3b, 0xf0, + 0xed, 0xce, 0xbb, 0xb8, 0x25, 0x14, 0xbc, 0xf4, 0x5c, 0x20, 0xbc, 0xd8, 0x5b, + 0x1c, 0x3d, 0x44, 0xcb, 0x4c, 0xbc, 0x2e, 0xf6, 0x36, 0x3d, 0x94, 0xa7, 0xe6, + 0xbc, 0xd8, 0xac, 0x4f, 0x3c, 0x06, 0x78, 0x11, 0x3d, 0xe6, 0x53, 0x14, 0x3d, + 0x3b, 0x4b, 0x25, 0xbd, 0x03, 0xb6, 0x88, 0xbd, 0xd0, 0xc2, 0x2b, 0x3c, 0xc5, + 0xf9, 0x12, 0xbd, 0x78, 0x6f, 0xf5, 0x3c, 0xc6, 0xc0, 0x63, 0x3d, 0x60, 0xd4, + 0xa9, 0x3c, 0x1b, 0x87, 0x92, 0x3d, 0x70, 0x70, 0x35, 0xbd, 0xb8, 0xaa, 0x17, + 0x3d, 0xec, 0x13, 0xde, 0xbc, 0x04, 0xc8, 0x8c, 0x3c, 0x3c, 0xcd, 0xf4, 0x3c, + 0x66, 0x81, 0x4b, 0x3d, 0x3e, 0x59, 0x8b, 0xbd, 0xb8, 0xab, 0x04, 0x3c, 0xdc, + 0x9a, 0xd8, 0x3c, 0x00, 0x22, 0x4d, 0x3d, 0x08, 0x10, 0x93, 0x3c, 0x64, 0x64, + 0x7e, 0xbc, 0x32, 0xd1, 0x00, 0x3d, 0xfc, 0x6a, 0x2a, 0xbd, 0x04, 0x05, 0xa8, + 0x3c, 0x4c, 0xb2, 0xc3, 0x3c, 0x57, 0x68, 0x0d, 0xbd, 0x18, 0x0f, 0x6e, 0xbd, + 0x31, 0x3c, 0x0d, 0xbd, 0xa0, 0xef, 0xe0, 0xbb, 0x5a, 0xa3, 0xf2, 0xbc, 0xb3, + 0xcd, 0x88, 0x3d, 0x0c, 0x86, 0x6e, 0xbc, 0x78, 0x6a, 0x14, 0xbc, 0x51, 0x9b, + 0x2e, 0xbd, 0x45, 0x0b, 0x22, 0xbd, 0xf0, 0x38, 0x9e, 0x3c, 0x53, 0x6c, 0x87, + 0x3d, 0x00, 0x20, 0x2d, 0x3a, 0x40, 0xea, 0xd2, 0xba, 0xcd, 0x35, 0x88, 0xbd, + 0xb2, 0xad, 0x62, 0x3d, 0xf6, 0x83, 0xb9, 0xbc, 0x92, 0xb4, 0x4b, 0x3d, 0xe6, + 0x0e, 0x86, 0xbc, 0x55, 0x4e, 0x85, 0x3d, 0x7e, 0x89, 0x05, 0x3d, 0xa1, 0xb1, + 0x83, 0x3d, 0x7c, 0x7c, 0xf5, 0x3c, 0xdb, 0x2e, 0x8c, 0xbd, 0x98, 0x94, 0x5c, + 0xbd, 0x0c, 0xfd, 0xb9, 0xbc, 0x40, 0x7e, 0xa5, 0x3c, 0xc0, 0x1e, 0xd6, 0x3a, + 0x88, 0x80, 0x1d, 0x3c, 0x48, 0x6f, 0xfe, 0x3c, 0x2a, 0x7a, 0xde, 0xbc, 0x9c, + 0x7d, 0x1a, 0xbd, 0x70, 0xd8, 0x1b, 0x3c, 0xa8, 0x27, 0x75, 0xbd, 0x92, 0x9a, + 0x53, 0x3d, 0xb3, 0x0a, 0x8b, 
0x3d, 0xd0, 0xe2, 0x10, 0x3c, 0xb0, 0x82, 0x9d, + 0x3b, 0x38, 0x23, 0x10, 0x3c, 0xc0, 0xfb, 0xab, 0xbb, 0x7a, 0xff, 0x77, 0xbd, + 0x3f, 0x50, 0x91, 0x3d, 0x30, 0x33, 0x01, 0x3c, 0x48, 0x28, 0x43, 0x3d, 0xd4, + 0x59, 0xac, 0xbc, 0xa3, 0xa9, 0x0d, 0xbd, 0x1c, 0x90, 0x52, 0xbd, 0x40, 0xa7, + 0x57, 0x3c, 0x94, 0x79, 0x28, 0xbd, 0xf0, 0x27, 0x9b, 0x3c, 0x02, 0x37, 0x7d, + 0x3d, 0x14, 0x5b, 0x94, 0xbc, 0xde, 0x3f, 0x2c, 0xbd, 0x06, 0xe5, 0x2b, 0xbd, + 0x58, 0x3a, 0x01, 0xbd, 0xda, 0x88, 0xa5, 0xbc, 0x27, 0x42, 0x08, 0xbd, 0x30, + 0x39, 0xd1, 0x3b, 0xdc, 0xf2, 0xb6, 0xbc, 0x78, 0xe4, 0xe9, 0x3c, 0x56, 0xdd, + 0x8c, 0xbc, 0x20, 0xbf, 0x17, 0x3d, 0x8a, 0x7a, 0x5e, 0xbd, 0x6a, 0x3e, 0xac, + 0xbc, 0xb2, 0x0d, 0x7b, 0x3d, 0x02, 0x11, 0xae, 0xbc, 0x8c, 0x5a, 0x14, 0x3d, + 0xba, 0x7e, 0xa6, 0xbc, 0xdc, 0x76, 0x0c, 0x3d, 0xfc, 0x09, 0x5a, 0x3d, 0x4e, + 0x8d, 0x8b, 0xbd, 0xd4, 0x0c, 0xa3, 0xbc, 0x7f, 0x0e, 0x8f, 0xbd, 0x20, 0x38, + 0x62, 0xbb, 0xe0, 0x57, 0xf8, 0xbb, 0x00, 0x7b, 0x12, 0xba, 0x5c, 0x6f, 0xbe, + 0x3c, 0x40, 0xc3, 0x2a, 0x3b, 0xf4, 0xe3, 0xb4, 0x3c, 0xda, 0x17, 0x4d, 0x3d, + 0xd0, 0xca, 0x1e, 0x3d, 0x80, 0x09, 0xaa, 0x3c, 0xce, 0x89, 0x5d, 0x3d, 0x24, + 0x5d, 0x0f, 0x3d, 0xa0, 0x6d, 0x44, 0x3c, 0x0e, 0x09, 0x92, 0xbc, 0x00, 0xde, + 0x57, 0x3c, 0x91, 0x01, 0x73, 0xbd, 0x5e, 0x90, 0x1a, 0x3d, 0x4c, 0xf8, 0xd6, + 0x3c, 0xf8, 0x9a, 0x91, 0xbd, 0xe2, 0x1c, 0x5d, 0xbd, 0x80, 0xde, 0x76, 0x3b, + 0xd6, 0x26, 0x2c, 0x3d, 0x00, 0xd0, 0x39, 0xbc, 0xfc, 0x5d, 0xee, 0xbc, 0x7a, + 0xdc, 0x83, 0xbc, 0x3b, 0x14, 0x81, 0x3d, 0x30, 0x85, 0xf3, 0x3c, 0x0e, 0x0d, + 0x85, 0xbd, 0x86, 0x9f, 0xcf, 0xbc, 0x32, 0xf9, 0xfa, 0xbc, 0xdc, 0x92, 0x8e, + 0xbd, 0xf0, 0xf2, 0x45, 0x3c, 0xb2, 0xcd, 0x31, 0xbd, 0x40, 0x13, 0xcc, 0xba, + 0x81, 0x90, 0x0b, 0xbd, 0xf5, 0xd9, 0x7d, 0xbd, 0x74, 0xf2, 0xc1, 0xbc, 0x8e, + 0xb9, 0x2b, 0x3d, 0xb0, 0xef, 0x7e, 0xbd, 0x00, 0x57, 0x81, 0x3c, 0xc2, 0x40, + 0x76, 0xbd, 0xaf, 0xe7, 0x08, 0xbd, 0x02, 0x79, 0x26, 0x3d, 0x77, 0x1f, 0x2f, + 0xbd, 0x20, 0x66, 0x1c, 0x3c, 0x28, 0x56, 0xc2, 0x3c, 0xe8, 0x78, 0x0e, 0x3c, + 0xb8, 0x4e, 0x2c, 0xbc, 0xd0, 0x97, 0x26, 0xbc, 0x5e, 0x8f, 0x3b, 0x3d, 0x30, + 0xff, 0x28, 0x3c, 0x91, 0x25, 0x92, 0x3d, 0x20, 0xd1, 0x20, 0xbc, 0x24, 0xb8, + 0x23, 0xbd, 0xfc, 0xca, 0x55, 0xbc, 0xf8, 0x46, 0xf0, 0x3c, 0xf7, 0x15, 0x88, + 0x3d, 0x96, 0x4a, 0x78, 0x3d, 0x40, 0xdb, 0xce, 0xba, 0x50, 0x38, 0xed, 0x3b, + 0x3a, 0xfd, 0x00, 0x3d, 0x40, 0x1d, 0x3d, 0xbb, 0x8a, 0xd6, 0xae, 0xbc, 0x10, + 0x55, 0x7a, 0xbd, 0x91, 0x66, 0x59, 0x3d, 0x40, 0x74, 0xd5, 0xbc, 0x76, 0x92, + 0xb9, 0xbc, 0xa0, 0x5c, 0x4d, 0x3d, 0x59, 0xd0, 0x4a, 0x3d, 0x65, 0xa7, 0x5e, + 0xbd, 0x45, 0x6b, 0xea, 0x3d, 0x2b, 0x08, 0xdf, 0x3c, 0xb3, 0x37, 0x6e, 0x3d, + 0xfa, 0xad, 0xe0, 0xbc, 0xc3, 0xd2, 0x01, 0xbe, 0x24, 0x15, 0x90, 0x3d, 0x42, + 0xd3, 0xc4, 0x3c, 0x2b, 0xd6, 0x00, 0x3c, 0x9b, 0xf7, 0xcc, 0x3d, 0x7c, 0xc1, + 0x37, 0x3d, 0x4c, 0x98, 0xb6, 0x3d, 0x65, 0xac, 0x04, 0x3d, 0xbe, 0x0d, 0xf6, + 0x3c, 0x0a, 0x47, 0xb9, 0xbd, 0xa0, 0x2d, 0x4f, 0x3b, 0x44, 0x5d, 0xd1, 0xbc, + 0x3c, 0x8b, 0x82, 0x3d, 0xf8, 0xf9, 0x02, 0xbd, 0x21, 0xa7, 0x39, 0xbd, 0xa2, + 0x22, 0x82, 0x3d, 0xda, 0x8a, 0xb9, 0xbd, 0x6c, 0x42, 0x95, 0xbc, 0x98, 0x7b, + 0x9a, 0x3d, 0x1d, 0x34, 0x40, 0xbd, 0x68, 0xfa, 0x6f, 0x3c, 0xd6, 0x23, 0xa0, + 0x3d, 0x5a, 0xe0, 0x71, 0x3d, 0xda, 0xb5, 0x20, 0xbd, 0x0d, 0x43, 0xe0, 0x3c, + 0x77, 0xeb, 0x0c, 0x3d, 0x97, 0x10, 0xf9, 0x3c, 0xdb, 0xd9, 0xe6, 0x3a, 0xcb, + 0xff, 0x63, 0xbd, 0x75, 0x4f, 0xbf, 0xb9, 0x69, 0x4a, 0x20, 0xbd, 0xa2, 0xbf, + 0x56, 0x3d, 0xcc, 0xfe, 0x0e, 0xbe, 0xbe, 0xe9, 0x2e, 0x3d, 
0x32, 0x25, 0x5d, + 0xbd, 0x77, 0x8a, 0x43, 0xbd, 0xc8, 0x8d, 0x4d, 0x3d, 0xd7, 0x87, 0xe4, 0x3c, + 0xc4, 0xf1, 0x50, 0x3d, 0x1a, 0xb6, 0x1a, 0x3d, 0x70, 0x13, 0x0f, 0x3c, 0xeb, + 0x1e, 0x6f, 0xbc, 0x4a, 0x22, 0x12, 0x3d, 0x7b, 0xe9, 0xcd, 0x3c, 0x1a, 0x2d, + 0x93, 0xbd, 0x21, 0xcd, 0x4b, 0xbd, 0x52, 0x94, 0x21, 0x3d, 0x1c, 0xb7, 0x0e, + 0xbd, 0x15, 0xea, 0x0c, 0xbd, 0x55, 0x60, 0xb0, 0x3b, 0xb4, 0x1d, 0xd0, 0x3d, + 0x43, 0xa2, 0x7b, 0xbd, 0xc9, 0x7b, 0x12, 0xbd, 0x64, 0x4f, 0x87, 0xbd, 0xea, + 0x0f, 0x8c, 0x3d, 0x07, 0x3a, 0xbb, 0xbd, 0xa8, 0xb6, 0x62, 0xbd, 0x74, 0xe8, + 0x84, 0x3d, 0xc2, 0x72, 0x6a, 0x3d, 0x58, 0xba, 0x67, 0xbb, 0x31, 0xf4, 0xb2, + 0x3d, 0x04, 0x0e, 0x92, 0xbd, 0xd4, 0x9f, 0x7a, 0x3d, 0x81, 0xd4, 0x89, 0xbc, + 0xe5, 0xe2, 0xe7, 0xbd, 0xb2, 0xd7, 0x51, 0xbd, 0x64, 0x57, 0x52, 0xbd, 0xb4, + 0x3f, 0x73, 0xbc, 0x22, 0x15, 0x4e, 0x3d, 0xe9, 0xf0, 0x4c, 0x3d, 0x05, 0x9b, + 0xfa, 0xbc, 0x28, 0xc4, 0xa1, 0x3d, 0xd2, 0x16, 0x51, 0x3d, 0xa0, 0x9f, 0x8f, + 0xbb, 0xc9, 0x02, 0x82, 0x3d, 0x13, 0x45, 0x84, 0x3c, 0x0a, 0x79, 0xc9, 0x3c, + 0xb9, 0x89, 0x19, 0xbd, 0x57, 0x1f, 0x86, 0xbb, 0xaa, 0xfa, 0xa0, 0x3d, 0x27, + 0x94, 0x00, 0xbd, 0x95, 0xf0, 0x86, 0xbd, 0x70, 0x37, 0x81, 0xbc, 0x0a, 0x32, + 0x09, 0x3d, 0x18, 0x6d, 0x18, 0xbd, 0x16, 0x40, 0x7e, 0x3d, 0x69, 0xfb, 0xaa, + 0xbc, 0x31, 0x93, 0x17, 0xbd, 0x3e, 0xc6, 0x59, 0xbc, 0x17, 0xc8, 0xe7, 0x3c, + 0x9e, 0x08, 0xc3, 0x3c, 0x79, 0x41, 0x12, 0x3d, 0xc8, 0xc2, 0x37, 0xbc, 0x3f, + 0xc1, 0x8f, 0xbd, 0xd9, 0x75, 0x94, 0xbd, 0x8c, 0xc3, 0x97, 0x3d, 0x36, 0xad, + 0x1b, 0xbe, 0x28, 0x9f, 0x80, 0xbc, 0x79, 0x5c, 0x84, 0xbc, 0x20, 0x29, 0x6b, + 0x3d, 0xe1, 0xad, 0xd1, 0xbb, 0xa4, 0x2c, 0x08, 0x3d, 0x6e, 0x13, 0x52, 0xbd, + 0x4c, 0x51, 0x60, 0x3d, 0xc0, 0xae, 0x92, 0x3d, 0xd3, 0x90, 0x35, 0xbd, 0x04, + 0x9e, 0x5f, 0xbd, 0x8c, 0xad, 0xee, 0xbc, 0x6f, 0x0b, 0x3e, 0x3d, 0xfb, 0x15, + 0x1c, 0x3c, 0x2f, 0x67, 0x98, 0xbb, 0x90, 0x7f, 0x9f, 0x3d, 0x21, 0x97, 0x2a, + 0xbc, 0xa0, 0x67, 0x9d, 0xbd, 0x5d, 0x64, 0x18, 0x3d, 0xaf, 0x36, 0xd9, 0x3b, + 0xe0, 0x06, 0xdc, 0x3c, 0xd0, 0x51, 0x8e, 0x3c, 0x48, 0x40, 0x56, 0x3d, 0xac, + 0x63, 0xb2, 0xbc, 0x63, 0x31, 0xf6, 0xbc, 0x48, 0x65, 0x07, 0x3d, 0x9c, 0x92, + 0x8d, 0xbd, 0x5c, 0xbb, 0x96, 0xbc, 0xa7, 0xdc, 0x07, 0x3c, 0xc4, 0xe5, 0xd8, + 0x3c, 0xb9, 0xea, 0x11, 0x3c, 0x10, 0x39, 0x13, 0x3a, 0x18, 0x34, 0x28, 0xbd, + 0xf4, 0x41, 0x6c, 0x3c, 0x25, 0x46, 0x12, 0xbd, 0xf9, 0x23, 0x3f, 0x3d, 0xfc, + 0x1d, 0xd9, 0x3d, 0x68, 0xc6, 0xa9, 0xbc, 0x97, 0x32, 0x1c, 0xbd, 0x3f, 0x51, + 0xbf, 0x3d, 0x7e, 0xd5, 0x3c, 0x3c, 0xda, 0x77, 0xcb, 0xbd, 0x10, 0x52, 0xb6, + 0xbc, 0xd8, 0xbd, 0x9b, 0x3d, 0x43, 0xd7, 0x7c, 0x3d, 0x4c, 0x78, 0xb2, 0xbc, + 0x7c, 0xda, 0xc9, 0xbc, 0x31, 0x8c, 0x4d, 0x3d, 0x82, 0x0e, 0xcb, 0xbc, 0xed, + 0xf9, 0xe8, 0x3b, 0xa8, 0x08, 0x4b, 0x3d, 0x38, 0x3c, 0x4a, 0xbd, 0x1d, 0xd9, + 0x0f, 0xbd, 0xd6, 0x17, 0x86, 0x3b, 0xa1, 0x90, 0xab, 0x3d, 0x91, 0xcc, 0x8f, + 0xbd, 0x07, 0xfa, 0x39, 0x3d, 0x11, 0x95, 0x03, 0x3d, 0x29, 0x0f, 0x31, 0xbc, + 0x87, 0xab, 0x3c, 0x3d, 0xc8, 0xe5, 0x5c, 0xb9, 0x44, 0x79, 0x44, 0xbd, 0x6d, + 0x4c, 0x90, 0xbc, 0x86, 0x90, 0xa5, 0xbc, 0x47, 0x61, 0x39, 0xbe, 0xf9, 0xeb, + 0x17, 0x3b, 0xea, 0x28, 0xe4, 0xbc, 0x79, 0x88, 0x12, 0xbc, 0x7a, 0x61, 0xdd, + 0x3d, 0x7f, 0xfe, 0x49, 0x3d, 0x78, 0x92, 0x5c, 0xbd, 0x6d, 0xe2, 0xa4, 0x3b, + 0x68, 0x57, 0x27, 0xbd, 0x61, 0x22, 0xaf, 0x3c, 0x02, 0x98, 0x6e, 0x3d, 0x74, + 0x02, 0xbb, 0x3d, 0x33, 0x4d, 0x24, 0xbd, 0x3e, 0x93, 0x81, 0xbc, 0xb2, 0x1e, + 0x1f, 0x3d, 0xb5, 0x79, 0x64, 0x3b, 0xbc, 0xfb, 0xf6, 0xbc, 0x61, 0x0c, 0xcd, + 0xbd, 0xc1, 
0x64, 0x08, 0x3c, 0x6f, 0x3d, 0x27, 0xbd, 0x10, 0xd3, 0xdb, 0xbc, + 0xe4, 0xb6, 0xd2, 0x3b, 0x51, 0x12, 0x81, 0x3d, 0x37, 0xee, 0x87, 0xbc, 0xdd, + 0x80, 0xaf, 0x39, 0x90, 0x85, 0xaf, 0x3d, 0x80, 0x5f, 0x12, 0xbc, 0xcb, 0x3c, + 0x63, 0xbd, 0x81, 0x3c, 0x85, 0x3d, 0x10, 0xe7, 0x54, 0xbc, 0xa6, 0xb7, 0x98, + 0xbc, 0x07, 0x98, 0x2f, 0x3d, 0x70, 0x80, 0x28, 0xbe, 0x7a, 0xe5, 0x77, 0x3d, + 0x0b, 0x81, 0x51, 0xbd, 0xb1, 0xdf, 0x35, 0xbc, 0xd2, 0xf7, 0x0b, 0x3d, 0xbe, + 0x9e, 0x02, 0xbd, 0xa2, 0xc0, 0x03, 0x3d, 0x97, 0xf5, 0x2f, 0xbb, 0xc6, 0x6b, + 0x13, 0xbd, 0x81, 0xbc, 0xe8, 0xbb, 0x2a, 0x57, 0x63, 0x3d, 0x49, 0x18, 0x51, + 0xbc, 0xd7, 0x9e, 0x44, 0xbd, 0x51, 0x59, 0xb8, 0x3b, 0x5b, 0x9b, 0x86, 0x3c, + 0x1d, 0x63, 0x8a, 0x3d, 0x15, 0xc7, 0x94, 0xbd, 0x43, 0xc8, 0x05, 0xbd, 0x7b, + 0xc8, 0x26, 0x3d, 0xdc, 0x03, 0xbd, 0x3c, 0xa0, 0x16, 0x2b, 0xbd, 0x33, 0x15, + 0xfa, 0x3c, 0xfe, 0xce, 0x91, 0xbc, 0x0f, 0x1e, 0xe3, 0x3b, 0x01, 0x19, 0x2b, + 0xbd, 0x26, 0xff, 0x53, 0x3c, 0x4f, 0x22, 0x91, 0xbb, 0xf6, 0x4f, 0x84, 0xbd, + 0xc5, 0xf6, 0x8a, 0x3d, 0x76, 0xcf, 0x90, 0xbd, 0x4d, 0x0e, 0xb7, 0x3d, 0x90, + 0x1f, 0xd0, 0xbc, 0xd8, 0xa6, 0x7c, 0xbd, 0x39, 0xa0, 0x70, 0x3c, 0x33, 0x14, + 0x91, 0xbd, 0xa4, 0x66, 0x12, 0xbb, 0xfd, 0x3b, 0x4e, 0x3d, 0x87, 0x72, 0x0c, + 0x3d, 0xa1, 0x1b, 0x7b, 0xbc, 0xe0, 0x0f, 0xb5, 0xbc, 0x74, 0x49, 0x42, 0xbd, + 0x61, 0x8f, 0x34, 0x3d, 0x40, 0x4a, 0xb0, 0xbc, 0x19, 0xf3, 0x14, 0x3d, 0x5c, + 0xd5, 0x8a, 0x3d, 0x4e, 0xd1, 0x54, 0x3d, 0xd8, 0x0b, 0x0d, 0x3d, 0x04, 0x61, + 0x85, 0x3d, 0x7e, 0x9e, 0x33, 0x3d, 0xd7, 0x75, 0xcb, 0x3b, 0x71, 0x7a, 0x89, + 0xbb, 0xb5, 0x56, 0x62, 0xbd, 0x00, 0xe5, 0x87, 0xbc, 0x84, 0x92, 0xca, 0xbc, + 0xf4, 0x15, 0xbb, 0xbc, 0xe7, 0xae, 0xc5, 0x3a, 0x8a, 0x96, 0x98, 0x3c, 0x55, + 0xb6, 0x9a, 0xbc, 0x59, 0x6f, 0x2c, 0x3d, 0x5b, 0x3b, 0x14, 0x3c, 0xd7, 0xb4, + 0xa6, 0x3b, 0x3f, 0x09, 0x21, 0x3d, 0x64, 0xfc, 0x54, 0x3c, 0x03, 0xd5, 0xf4, + 0xbc, 0x06, 0x74, 0xb6, 0xbd, 0xd5, 0x70, 0x0b, 0xbd, 0xa6, 0xf8, 0x4b, 0x3c, + 0xea, 0x46, 0x32, 0xbd, 0xb4, 0x06, 0x3b, 0x3c, 0xc2, 0xa8, 0x0d, 0xbb, 0x12, + 0x60, 0x6f, 0x3c, 0x20, 0xca, 0x10, 0x3c, 0x05, 0xcc, 0xa6, 0xbc, 0x7a, 0xdd, + 0xdf, 0xbb, 0xcc, 0x65, 0x9e, 0x3c, 0x02, 0x81, 0xe3, 0x3c, 0x58, 0x15, 0x90, + 0x3d, 0x80, 0x4a, 0xb2, 0xbd, 0xd3, 0x92, 0x8d, 0x3d, 0xc8, 0x03, 0xd9, 0xbc, + 0xc9, 0xce, 0x49, 0xbd, 0x57, 0xb1, 0x87, 0xbc, 0xf8, 0xc8, 0xb9, 0x3d, 0xb5, + 0x6a, 0x02, 0xbd, 0x60, 0xe3, 0x24, 0x3d, 0xb3, 0xdd, 0x4d, 0x3d, 0x87, 0x6d, + 0x0e, 0xbd, 0xea, 0x2d, 0x67, 0xbd, 0x62, 0x3b, 0xa9, 0xbc, 0xd1, 0x23, 0x79, + 0x3d, 0x27, 0x90, 0x1a, 0x3d, 0xfa, 0xf4, 0xa3, 0x3c, 0x88, 0xf8, 0x76, 0xbd, + 0x48, 0x27, 0x4e, 0xbd, 0xad, 0xe7, 0x6d, 0x3c, 0xbd, 0x3f, 0xba, 0x3d, 0x6a, + 0x30, 0xb8, 0xbd, 0x2e, 0x5c, 0xc7, 0xbb, 0x76, 0x8f, 0x85, 0xbc, 0x9d, 0x0f, + 0x48, 0x3d, 0xae, 0x8b, 0xa4, 0x3d, 0x72, 0xca, 0x36, 0x3d, 0xcd, 0xab, 0xad, + 0xbc, 0xf4, 0x68, 0x11, 0xbd, 0xe4, 0xf0, 0x20, 0x39, 0x85, 0x8d, 0x52, 0xbd, + 0x73, 0x80, 0x89, 0x3d, 0x3e, 0x97, 0x11, 0xbd, 0x44, 0xe7, 0x13, 0x3d, 0x25, + 0xc3, 0x68, 0x3d, 0x4f, 0x88, 0x1c, 0x3d, 0x51, 0x5f, 0x86, 0xbc, 0xce, 0x97, + 0xfb, 0xbc, 0x0e, 0x5c, 0x11, 0xbd, 0x00, 0x0f, 0x05, 0x3d, 0x8c, 0x5a, 0xe2, + 0x3c, 0xdb, 0x30, 0x8c, 0x3d, 0x69, 0xac, 0xd6, 0x3c, 0xb6, 0x26, 0x22, 0x3d, + 0x11, 0x74, 0x72, 0xbd, 0x85, 0xc5, 0x4e, 0x3b, 0x9c, 0x72, 0x9e, 0x3d, 0xa6, + 0x49, 0x25, 0xbd, 0x9e, 0x77, 0x23, 0x3c, 0x01, 0xbf, 0x35, 0xbc, 0xf9, 0x0a, + 0x06, 0xbd, 0x66, 0xc8, 0x70, 0xbd, 0xb9, 0x54, 0x80, 0x3d, 0x70, 0x83, 0xd1, + 0xbc, 0x7b, 0x7a, 0xd5, 0xbc, 0x72, 0x5e, 
0x1e, 0xbd, 0x7d, 0xb0, 0x24, 0x3d, + 0x88, 0x95, 0x3b, 0x3d, 0xb9, 0xc0, 0x4f, 0xbc, 0xf6, 0xf0, 0xcc, 0x3c, 0x6e, + 0x8d, 0x20, 0x3c, 0x0e, 0xe0, 0x8f, 0xbd, 0xfe, 0xd6, 0x2f, 0xbe, 0x40, 0x5e, + 0x05, 0x3c, 0x43, 0x3c, 0x1f, 0x3d, 0x2b, 0xfe, 0x63, 0xbd, 0xac, 0xfc, 0x78, + 0x3d, 0x89, 0xc7, 0x7b, 0xbd, 0xf8, 0x57, 0x38, 0xbd, 0x27, 0xf8, 0x9f, 0x3c, + 0xfe, 0xbe, 0x93, 0xbc, 0xa7, 0x0b, 0x52, 0xbc, 0xf9, 0xc1, 0xae, 0x3c, 0x84, + 0xf4, 0x6a, 0xbc, 0x3c, 0xcf, 0xf6, 0xba, 0x16, 0x08, 0x95, 0xbc, 0xcf, 0xf0, + 0x57, 0xbd, 0x5e, 0x93, 0x98, 0xbd, 0x84, 0x6a, 0xb4, 0x3d, 0xf6, 0x01, 0xe7, + 0xbc, 0x52, 0x9a, 0x85, 0xbc, 0x25, 0x22, 0x99, 0x3d, 0x00, 0xa0, 0x87, 0xbb, + 0xf8, 0xb5, 0x0e, 0xbc, 0xcd, 0xd6, 0x3d, 0x3d, 0x01, 0x80, 0x2d, 0xbe, 0xf5, + 0xcb, 0x94, 0x3d, 0x65, 0x93, 0x7f, 0xbc, 0x90, 0x42, 0x98, 0x3c, 0x1c, 0x10, + 0x13, 0x3d, 0xed, 0xb4, 0x8e, 0x3d, 0xdb, 0xd9, 0x01, 0xbd, 0x18, 0xe6, 0x8b, + 0x3c, 0x64, 0x69, 0x60, 0x3b, 0x63, 0x00, 0x1c, 0xbd, 0xe4, 0x57, 0x43, 0x3d, + 0xac, 0x16, 0xdc, 0x3d, 0x3d, 0x41, 0x3d, 0xbd, 0x18, 0xcb, 0x34, 0xbd, 0x28, + 0x93, 0x06, 0x3b, 0xf2, 0x17, 0x02, 0xbd, 0x2d, 0x29, 0x07, 0xbd, 0xde, 0xd1, + 0x88, 0xbc, 0xd8, 0x1e, 0x86, 0x3d, 0xda, 0xd2, 0xe3, 0xbb, 0xb6, 0xd8, 0x66, + 0xbd, 0xe9, 0xbd, 0x91, 0x3d, 0xd2, 0xf8, 0xa1, 0x3d, 0xce, 0x41, 0x1f, 0x3d, + 0x33, 0x84, 0xfa, 0xbc, 0xa7, 0x81, 0x8f, 0x3c, 0xe2, 0xf0, 0xda, 0xbc, 0x8d, + 0x67, 0x2a, 0x3d, 0xee, 0x5c, 0xef, 0x3d, 0x00, 0xf6, 0x3c, 0xbb, 0xcd, 0xa3, + 0x70, 0x3d, 0x3a, 0x58, 0x89, 0x3d, 0x03, 0xe3, 0x15, 0xbe, 0xfc, 0x75, 0x10, + 0x3c, 0xcc, 0xc4, 0x23, 0xbc, 0xd8, 0x48, 0x1f, 0x3c, 0xb2, 0x7c, 0xa1, 0x3a, + 0x7f, 0x0b, 0xda, 0x3d, 0x0d, 0xd0, 0x03, 0x3d, 0xf3, 0xca, 0xd9, 0x3b, 0x72, + 0x97, 0x1a, 0x3c, 0x5c, 0x19, 0xfa, 0xbd, 0xaa, 0x5d, 0x12, 0x3d, 0x75, 0xda, + 0x58, 0x3d, 0xec, 0x05, 0xb1, 0x3c, 0x6a, 0x21, 0xd9, 0xbc, 0x1d, 0x2c, 0x8c, + 0x3c, 0xfa, 0x2f, 0x1e, 0xbd, 0x93, 0x81, 0x98, 0xba, 0x42, 0x27, 0x62, 0xbd, + 0x1a, 0xe3, 0xa5, 0x3d, 0x17, 0x24, 0x18, 0xbc, 0x73, 0x8a, 0x24, 0xbd, 0xea, + 0x88, 0x92, 0xbc, 0x9d, 0x8d, 0xf7, 0xbc, 0xb4, 0xa6, 0xc8, 0xbd, 0xa0, 0xdd, + 0x8e, 0xbd, 0x4c, 0x81, 0x72, 0x3d, 0x59, 0x67, 0x48, 0xbd, 0x23, 0x21, 0xb3, + 0x3c, 0x6a, 0xc5, 0x43, 0x3d, 0x13, 0x50, 0x85, 0x3d, 0x0a, 0xd5, 0xb9, 0x3c, + 0xf3, 0xe6, 0x2b, 0xbd, 0x32, 0x6c, 0xe6, 0xbc, 0x11, 0x7c, 0x05, 0x3d, 0x99, + 0xeb, 0x48, 0xbc, 0x7d, 0x87, 0x35, 0xbd, 0x8b, 0x42, 0x5f, 0x3d, 0xae, 0x56, + 0x10, 0x3d, 0x02, 0x1e, 0x96, 0x3d, 0xf7, 0x64, 0xab, 0x3d, 0x66, 0xc3, 0xa2, + 0x3c, 0xe6, 0x36, 0xd8, 0xbc, 0x8c, 0xaa, 0x29, 0x3d, 0x52, 0x0b, 0x8b, 0xbc, + 0xce, 0x93, 0xef, 0xbc, 0xd9, 0x9b, 0x2c, 0xbd, 0x4a, 0x7a, 0xe6, 0x3c, 0xa1, + 0xdb, 0xaa, 0x3d, 0xfe, 0xac, 0x77, 0x3c, 0xd0, 0x02, 0xe2, 0xbc, 0x1c, 0xec, + 0xef, 0xbc, 0xe0, 0x92, 0xad, 0xbd, 0x46, 0xe8, 0x02, 0x3d, 0xd0, 0x99, 0x45, + 0x3b, 0x8a, 0xbc, 0x3f, 0xbd, 0x02, 0x86, 0x84, 0xbd, 0x34, 0xfb, 0xc3, 0xbd, + 0x71, 0xb4, 0xb7, 0x3d, 0xc0, 0x74, 0x42, 0xbb, 0xba, 0xef, 0x5d, 0xbc, 0x2b, + 0xd3, 0x21, 0x3c, 0x5a, 0xa2, 0xe4, 0xbc, 0x9f, 0xa9, 0x80, 0xbd, 0xa0, 0x48, + 0xb3, 0x3d, 0x39, 0xbb, 0xa4, 0xbd, 0xa9, 0x25, 0xb4, 0x3d, 0xb7, 0x12, 0xf3, + 0xbc, 0x25, 0x61, 0x37, 0xbd, 0xb9, 0x66, 0x80, 0x3d, 0xcd, 0xce, 0xcf, 0x3d, + 0x9f, 0xd0, 0x90, 0xbc, 0xd7, 0xbd, 0xf4, 0x3c, 0x20, 0x96, 0x8e, 0xbd, 0xd9, + 0xdf, 0x00, 0xbe, 0x8c, 0xf9, 0x5d, 0xbc, 0x58, 0xf0, 0x1e, 0x3d, 0xee, 0xec, + 0x2f, 0xbd, 0x32, 0x6b, 0x46, 0xbd, 0x72, 0x10, 0x2e, 0x3d, 0x33, 0x5a, 0x09, + 0xbd, 0x43, 0x78, 0x14, 0x3d, 0x33, 0xde, 0xa1, 0xbd, 0xcd, 0x6e, 0x35, 
0x3c, + 0x05, 0x48, 0x22, 0xbd, 0x5b, 0x57, 0x80, 0x3d, 0x66, 0x64, 0xd7, 0x3b, 0x26, + 0xf1, 0x1a, 0x3c, 0x81, 0x24, 0x8a, 0xbd, 0x00, 0x84, 0x5e, 0xbd, 0xbc, 0xc0, + 0xdc, 0x3b, 0x74, 0x77, 0xa3, 0x3d, 0x8a, 0x55, 0xe3, 0x3c, 0x84, 0x75, 0x2e, + 0x3d, 0x45, 0x17, 0x3c, 0x3d, 0xcf, 0xd9, 0x62, 0xbd, 0x6e, 0x1c, 0xd2, 0x3c, + 0x6e, 0xe1, 0x21, 0xbe, 0x36, 0xf2, 0x95, 0x3d, 0x44, 0x50, 0x00, 0xba, 0x87, + 0x5b, 0xc8, 0xbc, 0xeb, 0xe0, 0xbd, 0x3d, 0x92, 0x7c, 0xff, 0x3c, 0x34, 0x97, + 0x32, 0x3d, 0x8f, 0x57, 0x73, 0x3d, 0x70, 0xfe, 0x5b, 0x3c, 0xba, 0x43, 0xee, + 0xbc, 0xa8, 0x7b, 0x06, 0x3c, 0xfc, 0x87, 0x8f, 0x3d, 0xf2, 0xd6, 0x43, 0xbd, + 0x18, 0x3c, 0x11, 0xbc, 0x1e, 0xc3, 0x62, 0x3c, 0x46, 0x98, 0x9e, 0x3c, 0x5a, + 0x90, 0xc4, 0xbc, 0xe6, 0x6b, 0x72, 0xbd, 0xce, 0x30, 0xa7, 0x3d, 0x81, 0xa2, + 0x10, 0xbd, 0x4e, 0x75, 0x24, 0x3d, 0xff, 0x9d, 0xea, 0xbc, 0x25, 0x08, 0x92, + 0x3c, 0x50, 0x0a, 0xf0, 0xbb, 0xf0, 0x91, 0x8d, 0xbc, 0x4c, 0xd8, 0xc8, 0x3c, + 0x16, 0xbb, 0x5d, 0xbd, 0x24, 0x8d, 0x32, 0x3d, 0x75, 0x67, 0x64, 0x3d, 0xe0, + 0x67, 0x46, 0x3b, 0xbc, 0x93, 0xbb, 0x3c, 0xd2, 0x74, 0x17, 0xbd, 0x45, 0x88, + 0x21, 0xbe, 0x4d, 0x15, 0x95, 0x3d, 0x41, 0x5c, 0xe7, 0xbb, 0xc9, 0x97, 0xfd, + 0xbc, 0x3b, 0xe2, 0x0f, 0xbd, 0x57, 0x38, 0xab, 0x3d, 0x13, 0x12, 0xeb, 0x3c, + 0x92, 0x5d, 0x4f, 0x3d, 0xf0, 0x1f, 0xbf, 0xbc, 0x37, 0x63, 0xf7, 0xbc, 0xa8, + 0x76, 0x32, 0x3c, 0x97, 0xd3, 0xc9, 0xbc, 0x28, 0x83, 0x5b, 0x3d, 0xe2, 0x0f, + 0x90, 0xbd, 0x31, 0x0b, 0x8a, 0xbd, 0x04, 0x7c, 0xd5, 0xbc, 0x16, 0x5d, 0xa7, + 0x3a, 0x54, 0x36, 0x4f, 0xbd, 0x4d, 0xae, 0x64, 0x3d, 0xfd, 0x4c, 0x94, 0xbc, + 0x72, 0x3f, 0x96, 0xbc, 0x41, 0xd7, 0xfa, 0x3b, 0x52, 0x45, 0x03, 0xbc, 0x1f, + 0x50, 0xa6, 0xbd, 0x28, 0xb9, 0x78, 0x3c, 0x16, 0xa5, 0x77, 0x3c, 0xf2, 0x4e, + 0xa1, 0x3c, 0x84, 0xb6, 0x84, 0xbd, 0xc5, 0x78, 0xdc, 0x3c, 0xb4, 0xd1, 0x27, + 0xbd, 0x04, 0x20, 0x8d, 0xbd, 0xa0, 0x12, 0x36, 0x3c, 0xce, 0xb5, 0x31, 0xbe, + 0x4b, 0xfd, 0x44, 0xbc, 0xe3, 0x38, 0x00, 0xbd, 0xca, 0x35, 0x60, 0x3c, 0xc6, + 0xe4, 0x93, 0xb6, 0xc9, 0x84, 0xc0, 0x3a, 0xb3, 0x53, 0x88, 0x3d, 0x08, 0x37, + 0x0b, 0x3c, 0xd9, 0x6d, 0x00, 0xbb, 0x54, 0x22, 0xcc, 0xbb, 0x3c, 0x72, 0xa7, + 0xbc, 0x39, 0xbd, 0xc0, 0x3d, 0xc7, 0xb5, 0x0a, 0x3b, 0xe3, 0xbc, 0x38, 0xbc, + 0x0d, 0x1c, 0x1f, 0xbc, 0xbc, 0x5b, 0x42, 0xbc, 0xf3, 0x43, 0xb2, 0x3c, 0x5e, + 0x7e, 0xc3, 0xbc, 0x40, 0xbf, 0x47, 0x3c, 0xe7, 0x7d, 0x3e, 0xbc, 0x30, 0xf4, + 0x13, 0xbc, 0x5f, 0x8d, 0xd1, 0x3c, 0xe1, 0x93, 0xe7, 0xbc, 0x73, 0x12, 0x87, + 0xbc, 0x52, 0xb6, 0x9d, 0x3b, 0xf6, 0xda, 0x8d, 0x3d, 0x6b, 0xb8, 0x03, 0x3c, + 0x58, 0x8e, 0x25, 0xbd, 0x7b, 0xaa, 0x8a, 0xbc, 0x75, 0xd1, 0x84, 0x3d, 0x0e, + 0x90, 0xcd, 0xbc, 0x17, 0x0e, 0x8b, 0x3d, 0x87, 0x5e, 0x04, 0xbd, 0xe5, 0x99, + 0x9b, 0xbc, 0x0a, 0xdd, 0x3b, 0x3d, 0x22, 0xc9, 0x83, 0xbc, 0xb8, 0x42, 0x3f, + 0x3d, 0x86, 0x99, 0x90, 0x3d, 0x41, 0x4e, 0xa2, 0x3d, 0xf0, 0x89, 0x4f, 0xbd, + 0xa6, 0x28, 0x75, 0xbd, 0xea, 0xf1, 0x56, 0xbd, 0x96, 0xb0, 0x9b, 0xbc, 0x01, + 0x85, 0xb5, 0x3d, 0xcf, 0x71, 0x4c, 0x3d, 0x98, 0xf9, 0x6d, 0xbc, 0xc8, 0x59, + 0x38, 0xbd, 0x12, 0x6f, 0x7b, 0x3d, 0x61, 0xac, 0xf1, 0xbb, 0xd4, 0x32, 0x4a, + 0x3d, 0x92, 0x25, 0x45, 0x3d, 0x53, 0x88, 0x6d, 0xbd, 0xa0, 0x69, 0xda, 0xbb, + 0xf2, 0xf2, 0xda, 0x3b, 0xf3, 0x4d, 0x84, 0xbc, 0x61, 0x96, 0xda, 0x3c, 0xa3, + 0x9c, 0x9a, 0x3b, 0x70, 0x04, 0x93, 0xbb, 0x11, 0x0f, 0xe7, 0xbc, 0x06, 0x52, + 0x86, 0xbd, 0x0f, 0xf5, 0x6c, 0xbd, 0xe1, 0x4c, 0x8d, 0x3d, 0x59, 0x20, 0xa0, + 0xbd, 0xf8, 0x29, 0x94, 0x3d, 0x3f, 0x89, 0x86, 0xbd, 0x15, 0x66, 0x15, 0xbd, + 0xad, 0x80, 0xdf, 0x3c, 
0x5b, 0xd4, 0x6c, 0xbc, 0x2c, 0x5f, 0x60, 0x3c, 0x2b, + 0x82, 0xd5, 0x3c, 0x3f, 0x7e, 0x14, 0xbd, 0x6c, 0xe8, 0xaf, 0xbb, 0xee, 0x8b, + 0x27, 0xbd, 0xa0, 0xa8, 0x20, 0xbd, 0xe8, 0x39, 0x54, 0xbc, 0x9b, 0x57, 0xb7, + 0x3d, 0x6a, 0x42, 0x81, 0x3d, 0xd3, 0x09, 0x10, 0xbd, 0x95, 0xd4, 0x3a, 0x3d, + 0x48, 0xe1, 0xb8, 0xbc, 0xf4, 0x91, 0xa0, 0xbd, 0x8e, 0x67, 0x5e, 0xbd, 0x3b, + 0x3d, 0xa0, 0x3d, 0x82, 0x2e, 0x85, 0x3d, 0x10, 0x91, 0x8c, 0xbb, 0x63, 0xb7, + 0x75, 0xbd, 0xf5, 0xd8, 0x35, 0xbd, 0xea, 0x58, 0x11, 0xbb, 0xc4, 0x87, 0xe5, + 0xbc, 0xb4, 0x14, 0xce, 0x3d, 0x86, 0x00, 0x0b, 0x3c, 0x91, 0x4b, 0xb2, 0xbd, + 0xa9, 0x2e, 0x93, 0x3d, 0xc3, 0x3a, 0xc3, 0xbb, 0x7c, 0x8a, 0x83, 0xbd, 0xd2, + 0xb1, 0x2e, 0xbd, 0xbb, 0x27, 0xa9, 0xbd, 0xa7, 0x9f, 0x41, 0x3d, 0x0a, 0x47, + 0x15, 0xbd, 0xeb, 0x11, 0xca, 0x3c, 0xfe, 0x0d, 0xef, 0xbc, 0x71, 0x53, 0x52, + 0x3d, 0x0b, 0x4b, 0x44, 0x3c, 0x9d, 0xbf, 0x10, 0xbb, 0xf9, 0x31, 0xe6, 0x3c, + 0x97, 0x60, 0xbd, 0xbd, 0x8c, 0x40, 0x87, 0x3c, 0x30, 0x66, 0x18, 0x3d, 0x1a, + 0x2b, 0xcd, 0x3c, 0x52, 0x92, 0x7e, 0xbd, 0x58, 0xee, 0x02, 0x3d, 0x0a, 0x85, + 0xf7, 0xbc, 0x76, 0x75, 0x7f, 0xbd, 0xff, 0x11, 0xde, 0x3b, 0x5b, 0x43, 0x4b, + 0x3d, 0xa2, 0x53, 0x3f, 0xbd, 0x90, 0xf3, 0x42, 0xbd, 0x5b, 0xb9, 0x1e, 0x3d, + 0x43, 0x66, 0x46, 0xbc, 0x3e, 0x79, 0x7f, 0xbd, 0x24, 0xa8, 0xa0, 0xbd, 0xd5, + 0xb2, 0xd2, 0x3c, 0xf6, 0x82, 0x7d, 0x3b, 0x52, 0x09, 0x4e, 0xbd, 0x23, 0x30, + 0xfa, 0x3d, 0x62, 0xb4, 0x72, 0x3d, 0xa6, 0x3c, 0x98, 0x3c, 0x20, 0x3f, 0xdd, + 0xbb, 0xb0, 0xfa, 0x4f, 0xbd, 0x0f, 0x36, 0x24, 0xbb, 0x19, 0xbc, 0x7d, 0xbd, + 0x8d, 0xab, 0x2e, 0x3d, 0x1e, 0x67, 0x61, 0x3d, 0x8a, 0x39, 0x61, 0xbb, 0xb1, + 0xa0, 0x01, 0xbc, 0x0d, 0x75, 0x64, 0xbc, 0x89, 0xd7, 0x84, 0xbd, 0x1f, 0x26, + 0xa6, 0xbd, 0x7a, 0x67, 0x62, 0x3d, 0x3d, 0x4d, 0x06, 0xbb, 0xff, 0xe4, 0x92, + 0x3d, 0x32, 0x12, 0x95, 0xbc, 0x4b, 0x2e, 0x8b, 0xbc, 0x8b, 0x4a, 0x14, 0x3c, + 0xea, 0x08, 0x81, 0xbd, 0xb3, 0x3e, 0xb3, 0xbd, 0x96, 0x40, 0xef, 0x3c, 0xc6, + 0xf4, 0x83, 0xbd, 0x70, 0x8a, 0xad, 0xbc, 0x28, 0x6d, 0x26, 0xbd, 0x0e, 0x8f, + 0x89, 0x3a, 0xbc, 0x30, 0xc8, 0xbd, 0x81, 0x3c, 0x22, 0xbd, 0x19, 0x06, 0xb4, + 0x3d, 0x2a, 0xbf, 0x2a, 0x3d, 0xc9, 0xd4, 0x00, 0xbd, 0x74, 0x7d, 0x9b, 0x3b, + 0xc5, 0x7a, 0x13, 0xbd, 0xbf, 0x24, 0x18, 0xbc, 0x63, 0x21, 0xfd, 0x3c, 0x8f, + 0x45, 0xf6, 0xbd, 0xf6, 0xb7, 0x85, 0x3c, 0x49, 0xc7, 0xee, 0xbb, 0x31, 0x16, + 0x9c, 0x3d, 0x86, 0x9e, 0x44, 0x3d, 0x97, 0x25, 0x99, 0x3d, 0x33, 0x23, 0xa6, + 0x3d, 0x7f, 0x66, 0x2b, 0x3d, 0xbd, 0xe9, 0x43, 0x3d, 0x11, 0x56, 0x76, 0xbc, + 0x30, 0x7c, 0x87, 0xbb, 0xfe, 0xae, 0xfb, 0xb8, 0x4c, 0x48, 0x47, 0xbd, 0x74, + 0x13, 0x8b, 0xbd, 0x26, 0x22, 0x87, 0x3d, 0x22, 0xb0, 0x87, 0x3d, 0x9f, 0xc6, + 0x74, 0xbd, 0x7a, 0x47, 0x70, 0x3c, 0xe0, 0x41, 0x8b, 0x3d, 0xfb, 0xa2, 0x43, + 0xbc, 0x63, 0x0d, 0x21, 0xbd, 0x8a, 0x60, 0x36, 0xbb, 0x54, 0xe8, 0x59, 0x3c, + 0x21, 0xd4, 0xa9, 0x3b, 0x00, 0x5b, 0x20, 0x3d, 0x61, 0x25, 0x72, 0x3d, 0x39, + 0x8d, 0x3b, 0x3d, 0x5e, 0xcd, 0x4f, 0x3d, 0xa0, 0x47, 0x0c, 0xbd, 0x34, 0xc9, + 0x09, 0x3d, 0xb8, 0x59, 0xa2, 0xbc, 0x9a, 0xa3, 0x82, 0x3d, 0x1b, 0xd4, 0x1f, + 0xbe, 0xa4, 0x45, 0x9d, 0x3d, 0x9e, 0x03, 0xc6, 0x3c, 0x0c, 0x23, 0x30, 0x3d, + 0x9c, 0xb4, 0xec, 0xbb, 0xf8, 0x66, 0x9c, 0xbc, 0x6c, 0x32, 0x7e, 0x3d, 0x4b, + 0x32, 0x51, 0x3d, 0x64, 0x32, 0x75, 0x3d, 0x1b, 0xc9, 0xd1, 0x3c, 0x98, 0xac, + 0x05, 0x3d, 0x4a, 0x99, 0x74, 0x3b, 0x40, 0x86, 0x41, 0xbd, 0xf6, 0xa7, 0x03, + 0xbd, 0x95, 0x47, 0x23, 0x3c, 0x78, 0xf3, 0x0c, 0x3d, 0xf4, 0x66, 0xdc, 0x3b, + 0x4d, 0x45, 0xbf, 0xbb, 0x65, 0x4b, 0x73, 0xbc, 0x51, 
0x10, 0x8c, 0x3c, 0x5e, + 0x5a, 0x67, 0x3d, 0xd7, 0x47, 0x82, 0x3d, 0xdc, 0x32, 0x9c, 0xbc, 0xe4, 0xa5, + 0x87, 0xbd, 0xc2, 0xd2, 0xc4, 0xbd, 0x08, 0xbe, 0x6e, 0x3d, 0xa8, 0x8b, 0xf1, + 0x3c, 0x10, 0xc0, 0xb1, 0xbc, 0x12, 0x09, 0x88, 0x3d, 0x3f, 0x54, 0x25, 0x3d, + 0x11, 0x70, 0x26, 0x3b, 0xdd, 0x48, 0x18, 0x3c, 0x01, 0x3c, 0xee, 0xbd, 0x4f, + 0x63, 0x36, 0xbc, 0xea, 0x7e, 0x3f, 0x3d, 0x86, 0x4d, 0x45, 0x3d, 0x4b, 0x63, + 0x70, 0xbc, 0x32, 0xdf, 0xc0, 0x3d, 0x50, 0x3c, 0x13, 0x3c, 0x0e, 0x61, 0xa3, + 0x3d, 0xe8, 0xc5, 0x37, 0xbd, 0x3b, 0xd7, 0x01, 0xbd, 0x20, 0x1b, 0x89, 0xbc, + 0x70, 0x18, 0xee, 0xbc, 0x3e, 0xeb, 0xfa, 0xbb, 0x18, 0xda, 0xda, 0x3c, 0xd6, + 0x82, 0x19, 0xbd, 0xf1, 0x7e, 0x88, 0xbd, 0x39, 0x1d, 0xb8, 0xbb, 0x67, 0x98, + 0x1c, 0x3d, 0x72, 0x83, 0x90, 0x3d, 0xd3, 0x17, 0x6b, 0xbd, 0xcc, 0x55, 0xa8, + 0x3c, 0x18, 0x2e, 0x2c, 0xbd, 0x08, 0xc4, 0x34, 0x3c, 0xf8, 0x8f, 0x51, 0xbd, + 0x88, 0x62, 0xfe, 0x3c, 0xbc, 0xe0, 0xb1, 0xbc, 0x09, 0x93, 0x88, 0xbb, 0x95, + 0x9c, 0xda, 0x3c, 0x83, 0xda, 0x3a, 0xbd, 0xb8, 0x82, 0x81, 0x3c, 0x39, 0xa8, + 0x8a, 0xbd, 0x8b, 0xb0, 0x31, 0xbb, 0x4a, 0x2c, 0x07, 0xbe, 0xec, 0x84, 0x9b, + 0x3c, 0xc9, 0x97, 0x56, 0x3d, 0x3d, 0xce, 0x97, 0xbd, 0xa6, 0xe3, 0xbc, 0x3d, + 0x91, 0xc4, 0x0f, 0x3d, 0x35, 0xe9, 0xd1, 0xbc, 0x10, 0x48, 0x17, 0x3c, 0x9a, + 0x86, 0x86, 0xbd, 0x08, 0x63, 0xf9, 0xbc, 0xb0, 0xb0, 0x98, 0x3c, 0x3e, 0x7e, + 0x4e, 0x3d, 0xe0, 0x6f, 0x73, 0xbc, 0xa5, 0x9e, 0x03, 0xbd, 0x7c, 0x39, 0x53, + 0x39, 0x6d, 0x86, 0x40, 0xba, 0x1d, 0x71, 0x86, 0x3d, 0x62, 0xec, 0x9d, 0x3c, + 0x03, 0x1e, 0x29, 0x3d, 0xbd, 0xbf, 0xd2, 0xbd, 0xce, 0x1c, 0x0c, 0x3d, 0x7f, + 0xb3, 0x9c, 0x3d, 0x93, 0xa6, 0xa1, 0xbc, 0xb9, 0xf4, 0x6b, 0xbd, 0x17, 0xce, + 0x40, 0xbd, 0x33, 0x15, 0x00, 0x3d, 0xd3, 0x33, 0x9c, 0x3d, 0x01, 0xc6, 0xec, + 0x3c, 0x65, 0x42, 0xba, 0x3c, 0x33, 0x73, 0xec, 0xbc, 0x47, 0xf8, 0x00, 0x3d, + 0xd1, 0x1b, 0x66, 0x3d, 0x10, 0x9b, 0x0b, 0xbe, 0xe6, 0x45, 0x48, 0xbd, 0x90, + 0x46, 0xbd, 0x3c, 0x29, 0xe0, 0xb5, 0xbc, 0x50, 0x42, 0x6a, 0x3d, 0x00, 0x37, + 0x9e, 0x3d, 0xc1, 0x54, 0xa0, 0x3c, 0x00, 0x3c, 0x2f, 0xbb, 0x05, 0x4f, 0xa7, + 0xbc, 0x3d, 0x86, 0x68, 0xbd, 0x24, 0x65, 0x51, 0xbc, 0xff, 0x74, 0x21, 0x3d, + 0x81, 0x5d, 0x25, 0x3d, 0x5d, 0xd0, 0x7a, 0xbd, 0x37, 0xb1, 0x40, 0xbd, 0xf0, + 0xfd, 0x3d, 0x3d, 0x1e, 0xb2, 0x2a, 0xbc, 0x62, 0x35, 0x9e, 0xbd, 0xeb, 0x65, + 0x51, 0xbc, 0x6f, 0xf6, 0x9a, 0xbd, 0x82, 0x5b, 0x81, 0xbc, 0xd7, 0x8a, 0x29, + 0x3d, 0x5a, 0x89, 0x81, 0xbb, 0x6d, 0xf8, 0xe0, 0x3c, 0xa6, 0x56, 0x3c, 0x3d, + 0x9d, 0xc6, 0x49, 0xbc, 0xdf, 0x38, 0x79, 0x3c, 0x51, 0x74, 0x4e, 0x3d, 0x02, + 0xb4, 0x2e, 0xbd, 0x6e, 0x2c, 0x52, 0xbd, 0x98, 0x05, 0x96, 0x3c, 0x5e, 0xef, + 0x12, 0x3d, 0xa9, 0x44, 0x29, 0xbd, 0x29, 0xcf, 0x47, 0x3d, 0x08, 0x33, 0xa3, + 0xbd, 0xc7, 0xe5, 0x26, 0x3c, 0x16, 0xf0, 0xc7, 0xbc, 0x89, 0xde, 0xa2, 0x3a, + 0x57, 0x77, 0xb9, 0x3b, 0xa0, 0x30, 0x9d, 0x3c, 0xd9, 0xf8, 0x91, 0xbc, 0xdc, + 0xac, 0x41, 0x3c, 0xc9, 0xe5, 0x1a, 0xbd, 0x66, 0xcc, 0x89, 0x3d, 0xae, 0x83, + 0x95, 0xbd, 0xf6, 0x92, 0xd3, 0x3c, 0x6a, 0x9a, 0xf7, 0x3c, 0xb4, 0xf9, 0x7c, + 0xbb, 0x79, 0xd8, 0x99, 0xbc, 0x82, 0x88, 0xb6, 0xbc, 0xf7, 0xdf, 0xb3, 0x3d, + 0x57, 0xa6, 0xa7, 0xbd, 0x2e, 0x22, 0xd9, 0xbc, 0xd6, 0x67, 0x91, 0xbc, 0x54, + 0x25, 0x32, 0x3d, 0xc3, 0x91, 0x93, 0xbd, 0x1d, 0x77, 0x33, 0x3b, 0x56, 0xc9, + 0x8b, 0x3d, 0xbf, 0xe2, 0x21, 0x3c, 0xf5, 0x88, 0x80, 0xbd, 0xee, 0x4f, 0xd8, + 0xbc, 0xbf, 0x1c, 0x83, 0xbd, 0xa4, 0x91, 0x61, 0x3d, 0xdc, 0xc1, 0x74, 0x3d, + 0xb4, 0x4d, 0x90, 0xbd, 0x80, 0x3d, 0xbb, 0x3c, 0x27, 0x03, 0xa2, 0xbb, 0x7e, + 0x7e, 
0xd9, 0x3c, 0xf4, 0x18, 0x5f, 0xbc, 0xb1, 0xde, 0x83, 0x3d, 0xd5, 0xee, + 0x20, 0xbd, 0xbe, 0xa8, 0x7a, 0xbc, 0x01, 0x94, 0x03, 0xbd, 0x27, 0xa8, 0xfc, + 0xbd, 0x72, 0x14, 0x56, 0x3d, 0x79, 0x46, 0x0d, 0xbc, 0x69, 0x23, 0xd1, 0x3c, + 0x3b, 0x33, 0x49, 0x3d, 0x8d, 0xef, 0x18, 0x3b, 0xe9, 0xe1, 0x8f, 0xbd, 0x4f, + 0x45, 0x05, 0x3d, 0x28, 0x80, 0x49, 0x3c, 0xbd, 0x49, 0x18, 0x3d, 0xfd, 0xd4, + 0x86, 0x3c, 0xcc, 0x56, 0xa6, 0x3c, 0x37, 0x8e, 0xef, 0x3a, 0x57, 0x1e, 0x5f, + 0x3d, 0xc2, 0xef, 0x68, 0xbc, 0x24, 0xc0, 0xbe, 0xbd, 0x9c, 0xfd, 0xa0, 0x3b, + 0x48, 0x3b, 0x5d, 0x3d, 0xcf, 0xe0, 0x2c, 0xbd, 0x49, 0x51, 0xa7, 0x3d, 0x65, + 0xcf, 0x7a, 0xbc, 0x27, 0x68, 0x4c, 0xbd, 0x00, 0xed, 0x99, 0xbc, 0x2a, 0xac, + 0x5d, 0xbd, 0x6b, 0x5c, 0x9a, 0x3c, 0x71, 0xb7, 0x51, 0x3c, 0x1a, 0x04, 0x60, + 0xbd, 0x4b, 0xb8, 0x42, 0x3d, 0xf6, 0x92, 0x4f, 0x3d, 0xcb, 0x7a, 0xc4, 0x3c, + 0xc2, 0x1f, 0x85, 0x3d, 0xbf, 0x4c, 0x3b, 0x3b, 0x52, 0x04, 0x9a, 0xbd, 0x3a, + 0x5c, 0x29, 0x3d, 0x5f, 0x4e, 0xb1, 0x3d, 0xfc, 0x4e, 0x87, 0xbc, 0x59, 0x10, + 0xaa, 0x3d, 0x99, 0xff, 0x43, 0x3d, 0x20, 0x80, 0x8e, 0x3c, 0x79, 0x81, 0x3e, + 0xbd, 0xfe, 0x38, 0xab, 0xbd, 0x3d, 0x72, 0xad, 0x3d, 0x18, 0xa1, 0x64, 0xbd, + 0xa0, 0x6e, 0xb0, 0xbb, 0x19, 0x6b, 0x00, 0x3d, 0x6b, 0x7b, 0x15, 0xbc, 0x45, + 0xb5, 0xa6, 0xbd, 0xef, 0x81, 0x05, 0xbd, 0x9f, 0xe8, 0x37, 0x3d, 0x71, 0xbe, + 0xb6, 0xbc, 0x22, 0x55, 0xd6, 0xbc, 0x0d, 0x9b, 0xcf, 0x3c, 0x47, 0xa3, 0x92, + 0x3d, 0xfd, 0x13, 0x74, 0x3d, 0x4f, 0xef, 0x53, 0x3d, 0x8b, 0xeb, 0x0f, 0xbd, + 0xf9, 0x86, 0x00, 0x3d, 0xb8, 0xd1, 0x68, 0xbc, 0x68, 0xa4, 0x1c, 0xbd, 0x96, + 0x27, 0x01, 0x3d, 0x28, 0x65, 0x4a, 0x3d, 0xef, 0xa3, 0x41, 0xbd, 0xdd, 0xd4, + 0xac, 0x3c, 0x24, 0x42, 0x48, 0x3d, 0x55, 0x49, 0x99, 0x39, 0x7a, 0x2f, 0xde, + 0xbc, 0x7f, 0xff, 0x94, 0x3d, 0x76, 0x44, 0x14, 0xbd, 0xea, 0xa9, 0x05, 0x3d, + 0xd1, 0xa5, 0x2c, 0x3d, 0xfa, 0x4f, 0x0c, 0xbd, 0xda, 0x0a, 0x6d, 0xbd, 0x52, + 0x92, 0x47, 0x3d, 0x8b, 0x87, 0x8b, 0x3d, 0xd0, 0x89, 0x48, 0xbd, 0xaa, 0xbe, + 0x03, 0x3d, 0xa0, 0x14, 0x6d, 0xbd, 0x20, 0x3a, 0x80, 0x3d, 0x08, 0x2f, 0x86, + 0xbd, 0xf9, 0xfd, 0xa4, 0xbd, 0xde, 0xd5, 0x92, 0xbc, 0xcd, 0x8a, 0x64, 0x3d, + 0x48, 0xd0, 0x6c, 0x3d, 0x6a, 0xa3, 0xfa, 0xbc, 0xc3, 0xc7, 0x36, 0xbd, 0xb1, + 0x87, 0x2e, 0xbd, 0x3b, 0x6c, 0x9e, 0x3d, 0x56, 0x18, 0x1a, 0xbe, 0x9e, 0xd1, + 0xf5, 0x3c, 0xb9, 0xfe, 0xc3, 0xbc, 0x46, 0xbc, 0x40, 0xbd, 0x94, 0x3a, 0x48, + 0x3d, 0xbc, 0x4e, 0xbb, 0x3d, 0xa0, 0x7b, 0x94, 0xbc, 0xd8, 0xeb, 0x91, 0x3d, + 0x95, 0xa1, 0x99, 0xbd, 0xf4, 0x73, 0x9c, 0x3b, 0x23, 0x2d, 0x8e, 0x3d, 0x46, + 0x9c, 0xa5, 0xbb, 0x61, 0x13, 0x50, 0xbd, 0xad, 0x99, 0xf8, 0x3c, 0xd2, 0xac, + 0x7d, 0xbd, 0xc1, 0xb2, 0x6d, 0xbc, 0xf7, 0xde, 0x9f, 0xbd, 0x60, 0x72, 0x15, + 0x3d, 0x69, 0xaf, 0xa2, 0x3d, 0xfd, 0x72, 0x79, 0x3d, 0xd0, 0xc0, 0xa1, 0xbb, + 0x80, 0x21, 0x4f, 0x3d, 0xbc, 0x91, 0x0a, 0xbc, 0x23, 0xa3, 0xee, 0xbc, 0xd0, + 0x1a, 0xbb, 0xbd, 0x2a, 0x71, 0x35, 0x3d, 0x21, 0x26, 0x66, 0x3d, 0xb4, 0x17, + 0x89, 0xbb, 0x54, 0x4f, 0x80, 0xbc, 0x47, 0x10, 0xf3, 0xbc, 0x22, 0x75, 0x6c, + 0x3d, 0xb1, 0x75, 0x00, 0x3d, 0xe2, 0xf4, 0xf5, 0xbd, 0xbe, 0xbc, 0x7b, 0x3d, + 0xe3, 0x01, 0xc1, 0xbc, 0x05, 0x25, 0x82, 0xbb, 0x3f, 0x02, 0x5d, 0xbb, 0xa9, + 0xc1, 0x5a, 0x3d, 0xea, 0xe4, 0x5e, 0x3c, 0x96, 0xd6, 0xa5, 0x3c, 0xcb, 0x77, + 0xa4, 0x3c, 0xb2, 0x4f, 0x06, 0xbd, 0x84, 0xc3, 0x2c, 0xbd, 0x48, 0xdc, 0x9d, + 0x3b, 0xdb, 0xd6, 0xbb, 0xbc, 0xc8, 0xdf, 0x98, 0xbc, 0x29, 0x14, 0x31, 0x3d, + 0x6f, 0xfa, 0x4f, 0xbd, 0x7c, 0xb4, 0xaa, 0xbd, 0xe0, 0xeb, 0x2e, 0xbd, 0x53, + 0x3f, 0xc4, 0x3d, 0xbc, 0xcb, 0x38, 
0x3d, 0x30, 0x45, 0x30, 0x3c, 0xf0, 0xc1, + 0x0c, 0xbd, 0xb3, 0x20, 0x39, 0xbd, 0x80, 0xe2, 0x8b, 0x3b, 0x35, 0x31, 0x05, + 0xbd, 0xf5, 0xaa, 0x49, 0xbc, 0x7d, 0x08, 0x0a, 0x3d, 0xdd, 0x96, 0x84, 0xbc, + 0x0f, 0xb9, 0x4c, 0x3d, 0x49, 0xea, 0x86, 0x3d, 0xc9, 0xd0, 0x75, 0xbb, 0xcd, + 0x9b, 0xd1, 0x3d, 0x7a, 0x5e, 0x6f, 0xbd, 0x4a, 0x2e, 0xc0, 0xba, 0x3b, 0x7d, + 0x7d, 0xbd, 0x2b, 0x8f, 0xfe, 0xbb, 0x2a, 0xf4, 0xce, 0x3d, 0xf6, 0xfc, 0x06, + 0xbc, 0xdd, 0x02, 0x4a, 0x3c, 0x71, 0x3c, 0x03, 0xbd, 0x03, 0x9a, 0x90, 0xbd, + 0x76, 0xb7, 0xb3, 0xbd, 0xa2, 0xd1, 0x47, 0xbd, 0xc1, 0x56, 0x6e, 0x3d, 0xff, + 0x97, 0x57, 0x3d, 0x50, 0x57, 0xe6, 0xbc, 0x8f, 0xb3, 0x3d, 0xbd, 0x75, 0x8e, + 0x80, 0xbd, 0xc7, 0x6c, 0x43, 0xbc, 0xaa, 0xe3, 0x9d, 0xbd, 0x6f, 0xe4, 0x1d, + 0x3d, 0x3a, 0x57, 0x98, 0x3c, 0x6c, 0x08, 0x5c, 0x3d, 0xeb, 0xd2, 0xa5, 0xbb, + 0xf7, 0x60, 0x08, 0xbc, 0x72, 0x03, 0x3b, 0xbd, 0xe7, 0xc1, 0x8f, 0x3d, 0xb6, + 0x1f, 0x98, 0x3d, 0x59, 0xff, 0x88, 0x3d, 0x51, 0xe9, 0x73, 0xbc, 0x1f, 0x91, + 0xa5, 0x3d, 0x3b, 0x64, 0x17, 0xbd, 0x5b, 0xa5, 0x80, 0x3d, 0x03, 0x38, 0x85, + 0x3d, 0xbe, 0x27, 0x90, 0xbd, 0x4e, 0x87, 0xa3, 0xbc, 0xc1, 0xbb, 0x22, 0xbc, + 0x8b, 0x25, 0xd0, 0xbb, 0x6a, 0x2f, 0x1d, 0x3d, 0x0a, 0xdd, 0x48, 0x3d, 0x0b, + 0x37, 0x37, 0x3d, 0x2a, 0x68, 0x1a, 0x3d, 0xc8, 0x85, 0x4a, 0x3d, 0x0a, 0xa5, + 0x03, 0x3c, 0xd2, 0x41, 0x12, 0x3d, 0x25, 0xc3, 0x24, 0x3b, 0x1a, 0x95, 0x33, + 0x3d, 0xbf, 0xfd, 0xd7, 0x3c, 0xce, 0xff, 0x6e, 0xbc, 0x91, 0xc5, 0x0f, 0x3c, + 0x7e, 0x5f, 0x64, 0xbd, 0x64, 0x7d, 0x1c, 0xbd, 0x42, 0x2d, 0xba, 0x3d, 0x99, + 0x69, 0xa5, 0x3c, 0x39, 0x7d, 0x72, 0xbd, 0x6a, 0xbf, 0x8f, 0x3b, 0xaa, 0x43, + 0x02, 0x3d, 0xb7, 0xb7, 0x35, 0xbd, 0x97, 0xaf, 0x6c, 0x3c, 0x62, 0x39, 0xd6, + 0xbc, 0x33, 0xd6, 0x85, 0x3d, 0x4c, 0x50, 0x47, 0x3d, 0x26, 0x4b, 0x57, 0x3d, + 0xf8, 0x80, 0x15, 0x3c, 0x9e, 0x69, 0x05, 0xbc, 0xa4, 0x13, 0xb5, 0x3d, 0x41, + 0x17, 0xda, 0xbd, 0x48, 0x79, 0x2b, 0xbb, 0xb4, 0x86, 0xcc, 0xbb, 0xad, 0x20, + 0x95, 0xbd, 0x20, 0xf5, 0x01, 0x3e, 0x23, 0x9e, 0x9b, 0x3d, 0xdb, 0xfe, 0x38, + 0x3b, 0x23, 0x42, 0x57, 0x3b, 0x42, 0x99, 0x59, 0x3d, 0xf2, 0x9d, 0xba, 0xbd, + 0x92, 0xe5, 0x5d, 0x3d, 0x20, 0x17, 0x07, 0xbb, 0xf0, 0x57, 0x08, 0x3d, 0x7d, + 0xed, 0x91, 0xbc, 0x2e, 0xc4, 0x8d, 0xbd, 0xdb, 0x15, 0xc2, 0x3c, 0xaa, 0xc3, + 0xe6, 0xbb, 0x90, 0x5d, 0xb4, 0xbc, 0xee, 0xaa, 0x9a, 0x3d, 0x74, 0x6d, 0x22, + 0xbb, 0x00, 0x65, 0xc2, 0xb9, 0x37, 0x30, 0x07, 0xbd, 0x85, 0xbd, 0x60, 0xbb, + 0x2b, 0x40, 0xd7, 0x3c, 0xca, 0x82, 0x33, 0xbd, 0x29, 0xb2, 0x81, 0x3d, 0x08, + 0xee, 0xd5, 0x3c, 0x28, 0x34, 0xdf, 0x3c, 0x3d, 0x41, 0x67, 0xbd, 0x0c, 0x1e, + 0xf7, 0x3c, 0x9c, 0x86, 0xe4, 0x3c, 0x36, 0x7c, 0x07, 0x3d, 0xc7, 0x27, 0x04, + 0xbd, 0x45, 0xcb, 0x77, 0x3d, 0xcf, 0x66, 0x14, 0xbd, 0x29, 0xae, 0x3f, 0xbd, + 0x70, 0x86, 0x25, 0xbc, 0x08, 0xc9, 0xa6, 0x3c, 0x70, 0xa3, 0xa8, 0xbb, 0xbe, + 0x82, 0x49, 0x3d, 0x13, 0xa1, 0x73, 0xbd, 0xd5, 0x6c, 0x35, 0xbd, 0x98, 0xfa, + 0x3a, 0x3c, 0xff, 0x0c, 0xe2, 0xb9, 0x37, 0xe9, 0xf2, 0xbb, 0x78, 0x2d, 0x89, + 0xbd, 0xec, 0x2c, 0x88, 0xbc, 0x97, 0x7f, 0x2e, 0x3d, 0x9e, 0x32, 0x88, 0xbd, + 0x17, 0xdb, 0x20, 0xbd, 0xde, 0xbd, 0xc7, 0x3b, 0x30, 0x01, 0xf4, 0x3c, 0xf8, + 0x47, 0x05, 0xbd, 0xab, 0x0c, 0xdf, 0x3c, 0x8b, 0xdc, 0xa5, 0x3c, 0x62, 0x53, + 0x78, 0xbd, 0xf1, 0x6e, 0x56, 0x3d, 0x1e, 0xf2, 0x79, 0x3d, 0x0a, 0xce, 0x9b, + 0xbc, 0x18, 0xed, 0xaf, 0x3c, 0xd1, 0x1d, 0x8a, 0x3d, 0x78, 0xe8, 0x6e, 0x3c, + 0x1d, 0x2a, 0x84, 0x3d, 0x90, 0xb3, 0x80, 0x3d, 0x26, 0x1f, 0x74, 0x3d, 0x14, + 0xc6, 0x79, 0xbb, 0x37, 0x9d, 0x18, 0x3d, 0x1a, 0x28, 0x86, 0x3d, 
0x8b, 0x8e, + 0x0f, 0xbd, 0x50, 0x3e, 0x82, 0xbc, 0x6f, 0x35, 0x70, 0xbd, 0xa5, 0xa6, 0x88, + 0x3d, 0xb6, 0xe7, 0x2a, 0xbd, 0x57, 0x46, 0x0a, 0x3d, 0xd6, 0xba, 0x34, 0xbd, + 0xc2, 0xf8, 0xc1, 0xbc, 0x2e, 0xe5, 0x30, 0xbd, 0xd5, 0x76, 0x85, 0x3d, 0xb4, + 0xeb, 0x88, 0xbd, 0xb5, 0x44, 0x40, 0x3d, 0x08, 0x9a, 0x8f, 0xbd, 0xe4, 0xa2, + 0xdf, 0x3c, 0x40, 0x83, 0xaf, 0x3a, 0xe0, 0xfb, 0x20, 0x3b, 0x84, 0xc3, 0xf1, + 0x3c, 0x13, 0x24, 0x88, 0xbd, 0x03, 0x21, 0x4a, 0xbd, 0xd6, 0x14, 0x39, 0x3d, + 0x10, 0x2c, 0x84, 0xbd, 0x47, 0xe0, 0xed, 0xbc, 0x8e, 0xfd, 0x91, 0xbc, 0x0e, + 0x42, 0x93, 0xbc, 0xe4, 0x43, 0x6b, 0x3d, 0x96, 0xc7, 0x36, 0x3d, 0xb0, 0xc2, + 0xac, 0xbb, 0x28, 0x29, 0x74, 0x3d, 0xf0, 0x10, 0xb5, 0xbb, 0x09, 0x5e, 0x6c, + 0x3d, 0xc3, 0xa9, 0x97, 0x3c, 0x4f, 0xc1, 0x9c, 0x3c, 0x4e, 0xc4, 0xf0, 0x3c, + 0x4e, 0x42, 0xfa, 0xbc, 0x9a, 0x53, 0x79, 0x3c, 0x9e, 0xc3, 0xd8, 0xbc, 0xfe, + 0x1e, 0x57, 0x3c, 0xa2, 0xec, 0x3f, 0xba, 0xfa, 0x34, 0x12, 0x3d, 0x43, 0x1c, + 0xd4, 0x3c, 0xf3, 0x3f, 0xa5, 0x3a, 0xda, 0xa7, 0x96, 0xbd, 0x6a, 0x5f, 0x2a, + 0x3d, 0xbd, 0x83, 0xd3, 0xbb, 0xb8, 0x9c, 0x5b, 0xbd, 0x67, 0xbb, 0x2d, 0x3c, + 0x44, 0x9a, 0xb0, 0xbc, 0x5c, 0x1b, 0xe6, 0x3c, 0x10, 0xfd, 0x67, 0xbd, 0x3b, + 0x8e, 0x94, 0xbd, 0xf3, 0x97, 0xca, 0xbb, 0x3a, 0xae, 0x3f, 0x3c, 0xd2, 0xbe, + 0x81, 0x3d, 0xd7, 0x2c, 0x86, 0xbd, 0x48, 0xc8, 0xbf, 0xbc, 0x00, 0x15, 0x5e, + 0xbc, 0x43, 0x09, 0x1d, 0x3d, 0x3d, 0xe7, 0x75, 0xbd, 0x38, 0xe4, 0x5f, 0x3c, + 0x8f, 0xe1, 0x09, 0x3d, 0xab, 0xa4, 0x16, 0xbd, 0x69, 0x15, 0x35, 0x3d, 0x6d, + 0x6a, 0x20, 0xbd, 0xa1, 0xd2, 0x9b, 0xbb, 0x89, 0xfb, 0xd1, 0x3c, 0x91, 0x05, + 0x82, 0x3d, 0x5c, 0x10, 0x3c, 0xbd, 0x7e, 0x4d, 0x5d, 0x3d, 0x5a, 0xac, 0x44, + 0xbc, 0xe5, 0x82, 0xfd, 0xbc, 0xd7, 0xc2, 0x82, 0xbd, 0xe7, 0xd3, 0x5f, 0x3d, + 0x3e, 0x16, 0x1e, 0x3d, 0x72, 0xcf, 0x9c, 0xbd, 0xf9, 0x44, 0xa2, 0xbc, 0x1c, + 0x64, 0x69, 0xba, 0x9e, 0xc1, 0x01, 0x3c, 0x07, 0xc9, 0x81, 0xbd, 0x18, 0x75, + 0x25, 0xbd, 0x12, 0x0b, 0xfd, 0xbc, 0x00, 0x54, 0xd5, 0x38, 0x73, 0x47, 0x85, + 0xbd, 0xaa, 0x08, 0x68, 0x3d, 0xa5, 0xf5, 0xa8, 0xbc, 0xd7, 0xea, 0x16, 0x3d, + 0x38, 0x81, 0x2a, 0xbd, 0xb0, 0x44, 0x45, 0x3d, 0xe6, 0x66, 0x71, 0x3d, 0x39, + 0x4d, 0x58, 0xbc, 0x6c, 0xd5, 0xbc, 0xbc, 0x40, 0x65, 0xab, 0x3c, 0x92, 0x4f, + 0x83, 0x3d, 0x46, 0xb4, 0x83, 0x3d, 0xf3, 0x7b, 0x5e, 0xbd, 0x8f, 0x77, 0x98, + 0xbc, 0x28, 0xd3, 0xe2, 0xbc, 0xa8, 0x94, 0xdc, 0xbc, 0xdc, 0x3a, 0x03, 0x39, + 0x6e, 0xd2, 0x81, 0x3c, 0x49, 0x64, 0xb8, 0xbc, 0xdb, 0x96, 0x03, 0xbd, 0xeb, + 0x90, 0x4c, 0x3d, 0xcc, 0xc7, 0x45, 0xbc, 0xca, 0xbc, 0x4a, 0xbd, 0xcc, 0xf4, + 0x90, 0x3c, 0x1e, 0x78, 0x93, 0x3b, 0xe8, 0x46, 0x68, 0xbd, 0x02, 0xe7, 0x78, + 0xbc, 0x95, 0x12, 0x48, 0xbd, 0x36, 0xd3, 0x60, 0xbd, 0x0b, 0x6a, 0x1c, 0x3d, + 0x9c, 0xa6, 0xb4, 0x3c, 0x20, 0xe6, 0xca, 0x3c, 0x52, 0x5e, 0x97, 0xbd, 0xe8, + 0x0f, 0x10, 0xbd, 0x01, 0xe8, 0x51, 0xbd, 0xf1, 0x2a, 0x0e, 0xbd, 0x1d, 0x03, + 0x85, 0x3a, 0x00, 0x7f, 0x50, 0x3d, 0x5a, 0x91, 0xd7, 0xbc, 0xc5, 0x55, 0x3b, + 0x3d, 0xd6, 0x47, 0x8a, 0xbd, 0x2d, 0x40, 0x80, 0x3d, 0x49, 0x84, 0xd9, 0xbb, + 0x2c, 0x7d, 0x5a, 0x3d, 0x94, 0x2d, 0xcd, 0x3c, 0x84, 0xe9, 0x90, 0xbd, 0x67, + 0xf2, 0x95, 0xbd, 0xf6, 0x29, 0x12, 0xbd, 0x7b, 0x2e, 0x64, 0x3d, 0xf5, 0x42, + 0x01, 0xbd, 0x42, 0x57, 0x2b, 0x3d, 0x0d, 0xd5, 0x99, 0xbd, 0xdf, 0xd5, 0x4b, + 0xbd, 0xc4, 0x97, 0x4a, 0xbd, 0xb1, 0xb5, 0xa0, 0x3c, 0x97, 0xa5, 0x13, 0xbb, + 0xda, 0x02, 0x11, 0x3d, 0x6e, 0x22, 0xce, 0xbb, 0x9f, 0x3e, 0xf0, 0x3c, 0x92, + 0x5d, 0xb5, 0xbc, 0xda, 0x5e, 0x45, 0x3d, 0x53, 0x93, 0x0a, 0x3d, 0xa4, 0xf0, + 0x8b, 0x3c, 0x4a, 
0x4c, 0x04, 0x3d, 0x76, 0xc7, 0x8e, 0x3c, 0x55, 0xba, 0x39, + 0x3c, 0xa5, 0xed, 0x8c, 0xbd, 0x16, 0x33, 0x80, 0xbd, 0x32, 0xd7, 0x3b, 0x3d, + 0x07, 0xe9, 0x62, 0xbd, 0x6e, 0x01, 0x76, 0x3d, 0x42, 0x8b, 0x5e, 0xbd, 0x30, + 0x56, 0x07, 0x3d, 0x2c, 0x8b, 0xdb, 0xbc, 0xaf, 0xff, 0x8f, 0xbd, 0xf3, 0x4a, + 0x5d, 0xbd, 0xb0, 0x52, 0xb7, 0x3b, 0x29, 0x47, 0x9c, 0xbc, 0x5a, 0x8d, 0x30, + 0xbd, 0x71, 0xf8, 0x07, 0x3d, 0xc0, 0x46, 0x27, 0xbd, 0x93, 0x7d, 0x89, 0xbc, + 0xd2, 0x61, 0x39, 0x3d, 0x8d, 0x18, 0x69, 0x3c, 0x43, 0xd6, 0x18, 0xbc, 0x00, + 0x37, 0x0f, 0xba, 0x68, 0x4c, 0x4a, 0x3d, 0x4a, 0x6d, 0x6c, 0xbd, 0x63, 0x4a, + 0x7c, 0xbc, 0x0e, 0xed, 0x6b, 0xbd, 0x43, 0xc3, 0x97, 0xbd, 0xd0, 0x48, 0xa4, + 0xbb, 0xb4, 0x48, 0xa0, 0x3c, 0x89, 0x3c, 0x89, 0xbd, 0x00, 0xa7, 0xb4, 0x39, + 0xe2, 0xd3, 0x5e, 0x3d, 0x19, 0x2b, 0x10, 0xbc, 0x46, 0xef, 0x9a, 0xbd, 0x1c, + 0x32, 0xac, 0x3c, 0xe2, 0x57, 0x4b, 0x3d, 0xf7, 0x44, 0x41, 0x3d, 0x84, 0x06, + 0x89, 0xbc, 0x20, 0xf0, 0xb7, 0x3b, 0x3a, 0x7b, 0x50, 0x3d, 0xc0, 0xe4, 0x59, + 0xbd, 0x06, 0x58, 0x19, 0x3d, 0x80, 0x23, 0xe1, 0x3b, 0xe2, 0xdc, 0x8c, 0xbd, + 0xdc, 0x0a, 0x84, 0x3d, 0x96, 0xfe, 0x23, 0xbb, 0x45, 0x27, 0x40, 0xbd, 0x5d, + 0xc4, 0x0f, 0x3d, 0xcc, 0xe2, 0xab, 0xbc, 0x64, 0xec, 0xf8, 0xbc, 0x5e, 0x9d, + 0x1f, 0xbd, 0xa4, 0x84, 0x16, 0xbd, 0x26, 0x34, 0x99, 0xbd, 0xeb, 0x94, 0x91, + 0x3d, 0xae, 0x2b, 0x25, 0x3d, 0x7d, 0x8a, 0x2c, 0x3d, 0x65, 0xdb, 0xa1, 0xbc, + 0xb9, 0x5c, 0x2a, 0x3d, 0xe4, 0x06, 0x1d, 0xbb, 0xb6, 0xca, 0x17, 0x3d, 0xc8, + 0xd8, 0x12, 0x3d, 0x5c, 0xf3, 0x28, 0xbd, 0x44, 0x6b, 0x85, 0xbc, 0xa0, 0x1c, + 0x05, 0x3b, 0x1e, 0x13, 0x49, 0x3d, 0xd0, 0xbc, 0x07, 0x3d, 0xe4, 0xe8, 0x33, + 0x3c, 0xe1, 0xbe, 0x4c, 0x3d, 0xcf, 0xa9, 0x0d, 0x3c, 0x52, 0x61, 0x62, 0x3d, + 0x2e, 0x19, 0x63, 0x3d, 0xbe, 0x72, 0x86, 0x3d, 0x20, 0x7b, 0x34, 0x3c, 0xa0, + 0x1b, 0x6d, 0xbb, 0xbe, 0xdf, 0xd9, 0x3a, 0x6b, 0xae, 0x4e, 0x3d, 0x3b, 0x38, + 0x7d, 0xbd, 0xa1, 0xee, 0x3b, 0x3d, 0x51, 0x91, 0x37, 0x3b, 0x26, 0x34, 0xe4, + 0xbc, 0x13, 0x50, 0x8c, 0xbd, 0x5b, 0x2d, 0x52, 0xbd, 0xb3, 0xf6, 0x5d, 0xbc, + 0x82, 0x69, 0x3f, 0xbb, 0xf3, 0x6b, 0x14, 0x3d, 0xe8, 0x54, 0x9a, 0x3c, 0x42, + 0xa5, 0x35, 0x3d, 0x99, 0x10, 0x0b, 0xbc, 0x87, 0x55, 0x2d, 0xbd, 0x1f, 0x1a, + 0x16, 0xbd, 0x99, 0xaa, 0x16, 0xbc, 0x1a, 0x04, 0x3e, 0xbd, 0x62, 0x5f, 0x12, + 0x3d, 0xea, 0x90, 0x18, 0x3d, 0x32, 0x9f, 0x17, 0x3d, 0x1c, 0x6f, 0xba, 0x3c, + 0xce, 0xe2, 0x13, 0x3d, 0x47, 0xa2, 0xdb, 0xbc, 0xf7, 0x85, 0x4f, 0xbd, 0x24, + 0x60, 0xc8, 0xbc, 0xea, 0x00, 0x5e, 0xbd, 0x08, 0x73, 0x58, 0x3d, 0xf3, 0x42, + 0x85, 0xbd, 0x0e, 0xcd, 0x91, 0xbd, 0x3c, 0xba, 0xb1, 0xbc, 0x48, 0x41, 0x01, + 0x3d, 0xb1, 0xcf, 0x64, 0x3d, 0x6f, 0x25, 0x9a, 0xbc, 0xda, 0xaa, 0xce, 0x3c, + 0x22, 0x5f, 0x62, 0x3d, 0xf9, 0x36, 0x9b, 0xbd, 0x85, 0x6f, 0x81, 0x3d, 0x22, + 0xd8, 0x2e, 0xbd, 0x72, 0x49, 0x19, 0xbd, 0x21, 0x3c, 0xb9, 0xba, 0xc5, 0x69, + 0x8a, 0xbd, 0x68, 0xec, 0x08, 0xbd, 0xd9, 0x7e, 0x06, 0xbd, 0x0e, 0xa4, 0x36, + 0x3d, 0x9e, 0xbb, 0x65, 0xbd, 0xaf, 0x04, 0x81, 0x3d, 0x07, 0xa0, 0x7b, 0xbd, + 0xa7, 0x30, 0x51, 0xbd, 0x15, 0x8e, 0x05, 0x3c, 0xe0, 0x7a, 0x7c, 0x3c, 0x43, + 0x90, 0x04, 0x3d, 0x00, 0xf1, 0x4b, 0xbb, 0xe0, 0xe9, 0x29, 0x3b, 0x6f, 0x91, + 0x1d, 0xbd, 0xff, 0xc5, 0xd0, 0x3c, 0x6b, 0x02, 0xe3, 0x3c, 0xba, 0x1f, 0x53, + 0xbc, 0x0e, 0xd5, 0x7e, 0x3d, 0x54, 0xe0, 0x97, 0xbc, 0x00, 0x7a, 0xf2, 0xb9, + 0x66, 0x00, 0x84, 0x3d, 0x62, 0x17, 0x08, 0xbd, 0x5a, 0x30, 0x46, 0x3d, 0x75, + 0xb1, 0x37, 0xbd, 0x6f, 0x28, 0x55, 0x3c, 0xe0, 0xc4, 0x82, 0xbd, 0xfc, 0xf5, + 0xb2, 0xbc, 0x96, 0xdc, 0x0a, 0xbb, 0x83, 0x2a, 
0x91, 0x3c, 0x29, 0x21, 0x40, + 0x3d, 0xff, 0x1f, 0x9c, 0xbd, 0x82, 0xb2, 0x5d, 0x3d, 0x8e, 0x14, 0x2c, 0x3d, + 0xec, 0xb2, 0xed, 0xbc, 0xb8, 0xa0, 0x3a, 0xbc, 0x66, 0x70, 0x11, 0xbc, 0x49, + 0xa6, 0xd0, 0xbc, 0x55, 0x34, 0x14, 0xbc, 0xb4, 0x65, 0x80, 0x3d, 0x76, 0x98, + 0x87, 0xbd, 0x23, 0x3d, 0xa2, 0x3c, 0xaa, 0xc5, 0x7e, 0x3d, 0xb7, 0x41, 0x91, + 0xbd, 0x9f, 0xe6, 0x80, 0xbd, 0x20, 0x0a, 0x13, 0x3c, 0xc8, 0xa0, 0xf3, 0x3c, + 0x51, 0xf3, 0x04, 0x3d, 0x61, 0x7e, 0x0c, 0x3d, 0xbe, 0x25, 0x47, 0x3d, 0x25, + 0x2b, 0x2b, 0x3d, 0xa9, 0x7a, 0x3f, 0xbd, 0xc2, 0xd4, 0xe3, 0xbc, 0x67, 0xc5, + 0x79, 0x3d, 0x10, 0x4b, 0xb0, 0x3c, 0xb8, 0xd1, 0x87, 0x3c, 0xd3, 0x7b, 0x54, + 0xbd, 0x81, 0x81, 0xcc, 0x3c, 0x85, 0x81, 0x15, 0x3d, 0xaa, 0xa8, 0xb0, 0x3b, + 0x4b, 0x90, 0xae, 0x3c, 0xaa, 0x38, 0x0f, 0x3d, 0x92, 0x82, 0x0a, 0xbd, 0xfd, + 0x99, 0x51, 0x3d, 0x90, 0x87, 0x0b, 0xbd, 0xc6, 0x71, 0x58, 0xbd, 0x4f, 0x17, + 0x86, 0x38, 0x03, 0x9a, 0x00, 0xbd, 0xeb, 0xae, 0x34, 0xbd, 0xab, 0x28, 0x19, + 0x3b, 0xc5, 0x48, 0x6c, 0xbd, 0x4a, 0xa3, 0x7c, 0xbd, 0x1f, 0xe7, 0x00, 0x3c, + 0xf4, 0xd8, 0xd8, 0x3c, 0xbc, 0x01, 0x59, 0xbd, 0xa9, 0x77, 0xb5, 0xbb, 0x67, + 0xc3, 0x82, 0x3d, 0x37, 0xd8, 0x8c, 0x3d, 0xea, 0x92, 0x59, 0x3d, 0x30, 0x97, + 0x31, 0x3d, 0x36, 0xb9, 0x23, 0xbb, 0x98, 0x99, 0x7f, 0xbd, 0x0b, 0xfd, 0x8e, + 0xbc, 0x80, 0xc6, 0x5c, 0xbd, 0xb2, 0xf0, 0x76, 0x3d, 0x7e, 0x01, 0xe5, 0xbc, + 0x0a, 0x94, 0x08, 0x3d, 0xb2, 0x9b, 0x7b, 0xbd, 0xdc, 0x27, 0x6b, 0xbd, 0x32, + 0x1e, 0x41, 0x3d, 0x4b, 0xd8, 0x8a, 0xbd, 0xe6, 0xdc, 0xd5, 0x3c, 0x72, 0xfd, + 0x09, 0xbd, 0x33, 0x80, 0xc5, 0xba, 0xbc, 0xdd, 0xc0, 0x3b, 0xf4, 0x31, 0x9a, + 0xbd, 0x29, 0x45, 0xd9, 0x3c, 0x02, 0x33, 0xd8, 0xbc, 0x97, 0x48, 0x73, 0x3d, + 0x7f, 0x13, 0x88, 0xbd, 0x9b, 0xed, 0x40, 0xbd, 0xae, 0x86, 0x7d, 0xbd, 0xea, + 0xa5, 0x4a, 0x3b, 0x8d, 0xd4, 0xd8, 0x3c, 0x57, 0xc1, 0x28, 0xbc, 0x6a, 0xb8, + 0x15, 0x3d, 0x30, 0xb0, 0xdc, 0xbb, 0x71, 0x34, 0x05, 0xbd, 0x39, 0x9c, 0x8a, + 0x3d, 0x98, 0xdd, 0x45, 0xbc, 0xf1, 0xcc, 0xcb, 0xbc, 0xe1, 0xf6, 0xd8, 0x3c, + 0xae, 0xb9, 0x18, 0xbb, 0x67, 0x50, 0x82, 0x3d, 0x20, 0x71, 0x82, 0x3d, 0x0e, + 0x45, 0x4a, 0xbd, 0x30, 0x86, 0xbe, 0xbb, 0x60, 0xc7, 0x07, 0x3d, 0xdb, 0xf7, + 0x04, 0xbd, 0x9a, 0xc3, 0xb2, 0xbc, 0xe0, 0x58, 0xf5, 0xbc, 0x12, 0x0a, 0x48, + 0x3d, 0xf7, 0x85, 0x2e, 0x3d, 0xab, 0x2b, 0xe6, 0x3b, 0xed, 0x4c, 0x15, 0xbc, + 0x99, 0x4b, 0xb1, 0xbc, 0xa1, 0x82, 0x09, 0x3d, 0x8b, 0x84, 0x09, 0xbd, 0x85, + 0x5a, 0x38, 0xbb, 0x83, 0xc7, 0x80, 0xbd, 0xfe, 0xf3, 0x67, 0xbd, 0x6e, 0x25, + 0x6f, 0x3d, 0x00, 0xa4, 0xf8, 0xbc, 0x3a, 0x24, 0x17, 0xbc, 0xb2, 0x0d, 0x8a, + 0x3c, 0x87, 0xac, 0x69, 0x3d, 0xcd, 0x5f, 0x89, 0xbc, 0x9e, 0x08, 0x7d, 0xbd, + 0x4c, 0xa4, 0xa0, 0xbc, 0x63, 0x21, 0x2c, 0x3d, 0x5a, 0x78, 0x71, 0xbd, 0xa2, + 0xe8, 0x71, 0x3d, 0x2b, 0xc9, 0xc1, 0xbb, 0x6f, 0x4f, 0x78, 0xbd, 0xa9, 0xee, + 0xdf, 0x3c, 0x3c, 0xe2, 0xb3, 0xbc, 0x64, 0xa2, 0x7d, 0xbc, 0xcc, 0x2c, 0x35, + 0x3d, 0xfd, 0x8c, 0x86, 0x3d, 0xe9, 0x57, 0xf3, 0x3c, 0xc1, 0x84, 0x82, 0x3d, + 0x8e, 0x7a, 0x6c, 0xbd, 0xf1, 0x40, 0x04, 0x3d, 0x7e, 0x17, 0x5b, 0x3d, 0x74, + 0xba, 0x83, 0x3a, 0x6f, 0x01, 0x86, 0xbd, 0x62, 0x58, 0x69, 0xbd, 0x33, 0xcd, + 0x07, 0x3d, 0x6e, 0xc5, 0x8c, 0xbd, 0x5a, 0x4c, 0x99, 0x3c, 0x87, 0xb8, 0xf0, + 0x3c, 0xc1, 0x64, 0x8a, 0x3c, 0x4c, 0x69, 0x23, 0xbd, 0x93, 0x75, 0x80, 0x3d, + 0x54, 0x27, 0x87, 0xbd, 0xdc, 0x3e, 0x62, 0x3d, 0x9e, 0xdb, 0x43, 0xbc, 0x03, + 0xd4, 0x65, 0xbd, 0x4c, 0xb6, 0x59, 0x3d, 0xc4, 0xa1, 0xe8, 0xbc, 0xf3, 0xdc, + 0x87, 0x3d, 0xf5, 0x34, 0x82, 0xbc, 0x4e, 0x2d, 0xe2, 0x3b, 0xd6, 0x1e, 0x3d, + 
0xbd, 0xea, 0x0c, 0x83, 0x3d, 0x34, 0x3e, 0x20, 0xbd, 0xb6, 0x87, 0x77, 0x3c, + 0x9c, 0x9a, 0xe4, 0xba, 0x48, 0x21, 0xa5, 0xbc, 0xb3, 0x81, 0x89, 0x3d, 0xf4, + 0x2c, 0x49, 0x3d, 0x98, 0xb5, 0xd6, 0xbc, 0x88, 0xdb, 0x30, 0xbd, 0xa4, 0x2f, + 0x88, 0xbc, 0x67, 0xc1, 0xb6, 0xbc, 0x8e, 0xba, 0xb8, 0xbc, 0xdd, 0x22, 0xc2, + 0x3c, 0xaf, 0x08, 0x8f, 0x3b, 0xa5, 0x85, 0xcb, 0xbc, 0x26, 0x24, 0x2c, 0x3d, + 0x2c, 0x73, 0x35, 0x3c, 0xf9, 0xb2, 0xaf, 0xbb, 0xf2, 0x50, 0x2f, 0xbd, 0x15, + 0x10, 0x31, 0x3c, 0x75, 0xdb, 0x67, 0x3d, 0x5c, 0xe2, 0xfe, 0x3c, 0x51, 0xe0, + 0x8d, 0x3d, 0x1c, 0x25, 0xb9, 0x3c, 0xcf, 0x20, 0x80, 0x3d, 0x5c, 0x61, 0xdf, + 0x3c, 0x9a, 0x2e, 0x5d, 0x3d, 0x4d, 0x63, 0xd8, 0x3c, 0x23, 0x0e, 0x32, 0xbc, + 0x6a, 0xaa, 0x61, 0x3d, 0xa3, 0x74, 0x86, 0xbd, 0x60, 0x32, 0x73, 0x3b, 0xe3, + 0x8b, 0x73, 0xbc, 0x6d, 0x26, 0x40, 0x3d, 0x8c, 0xbb, 0xbf, 0xbb, 0x4f, 0x89, + 0xf9, 0x3c, 0x6a, 0xfe, 0x0b, 0x3d, 0x43, 0x89, 0x3f, 0xbd, 0xe6, 0x1f, 0xda, + 0xbc, 0xdf, 0x48, 0x36, 0xbd, 0xd8, 0x5a, 0x8f, 0xbd, 0x58, 0x20, 0xfc, 0x3c, + 0xec, 0xc0, 0x69, 0x3d, 0xc9, 0x17, 0x06, 0xbd, 0xc1, 0x2b, 0xd9, 0x3b, 0xba, + 0x7f, 0x73, 0x3a, 0xde, 0xd4, 0xbd, 0xbc, 0x9f, 0x94, 0xd6, 0x3c, 0xfe, 0xb3, + 0x56, 0x3c, 0xbd, 0xda, 0xd0, 0xbc, 0x9c, 0x13, 0x6c, 0xbc, 0x10, 0x12, 0xab, + 0x3c, 0x94, 0x9f, 0x1d, 0xbd, 0x78, 0xbb, 0x9d, 0x3c, 0x6c, 0xca, 0x00, 0xbd, + 0x4c, 0xb7, 0xb8, 0x3c, 0x09, 0x38, 0xd3, 0x3c, 0x4c, 0x70, 0x91, 0x3c, 0xe9, + 0x6b, 0x26, 0xbc, 0x57, 0x19, 0xa4, 0x3c, 0xd2, 0xf7, 0x54, 0x3d, 0x0f, 0x9a, + 0x48, 0x3d, 0xd0, 0xe2, 0x8f, 0x3b, 0x58, 0x63, 0x13, 0x3c, 0x81, 0xda, 0x1b, + 0xbd, 0x77, 0x24, 0x83, 0x3c, 0xd7, 0x64, 0xc7, 0x3b, 0xb0, 0xf6, 0x6b, 0xbc, + 0x8a, 0xaa, 0x62, 0x3d, 0xa4, 0x13, 0xbb, 0xbc, 0xe8, 0x06, 0xb3, 0x3c, 0xb1, + 0x41, 0x77, 0x3d, 0x1c, 0xac, 0xe0, 0x3c, 0x40, 0x0f, 0x25, 0x3c, 0x89, 0xc0, + 0x54, 0x3c, 0xec, 0x1d, 0x7a, 0x3d, 0x41, 0x1e, 0x31, 0x3d, 0x51, 0x3e, 0x26, + 0x3d, 0x00, 0x55, 0x39, 0xbd, 0x2e, 0x9d, 0x7f, 0x3d, 0x2f, 0xe9, 0x4d, 0xbd, + 0x46, 0x85, 0x35, 0xbd, 0xa2, 0x67, 0xf8, 0x3c, 0x16, 0x0f, 0x82, 0xbd, 0xcd, + 0x48, 0x9a, 0x3b, 0x62, 0xd9, 0x08, 0x3d, 0x67, 0x0f, 0x5a, 0xbc, 0xd0, 0x09, + 0x56, 0xbc, 0x31, 0x38, 0xda, 0xbc, 0x67, 0xf7, 0xa1, 0xbc, 0x8c, 0x2a, 0x79, + 0xbd, 0xb3, 0xf5, 0xb1, 0xbc, 0xe8, 0xf4, 0x8b, 0xbd, 0x5f, 0x45, 0x11, 0xbd, + 0x9f, 0x79, 0x1e, 0xbd, 0xf5, 0xbf, 0x86, 0x3d, 0x4e, 0xd8, 0xed, 0xbc, 0xcd, + 0x66, 0x5b, 0x3c, 0x4a, 0x74, 0x8f, 0x3b, 0xe3, 0x98, 0x4f, 0x3d, 0x0d, 0x54, + 0x91, 0xbb, 0x24, 0xb6, 0x1b, 0x3d, 0xd8, 0x0d, 0xb7, 0xbc, 0x04, 0x76, 0x31, + 0xbd, 0x10, 0x43, 0x11, 0xbd, 0x0e, 0xc2, 0x02, 0xbd, 0x88, 0x66, 0x43, 0x3c, + 0xb5, 0xda, 0x95, 0xbb, 0x07, 0x09, 0x28, 0xbd, 0x22, 0xcc, 0x19, 0xbd, 0xf0, + 0x47, 0xfe, 0x3c, 0x10, 0x43, 0xfb, 0xbc, 0x5f, 0x5f, 0x2c, 0x3d, 0xfb, 0xce, + 0x18, 0xbc, 0xcd, 0x87, 0x6a, 0x3d, 0xee, 0xf6, 0x61, 0xbd, 0x37, 0x86, 0x12, + 0x3d, 0x4c, 0x01, 0xb7, 0x3c, 0x8c, 0x44, 0x19, 0xbd, 0xc1, 0x3d, 0xa6, 0x3c, + 0xcd, 0xf1, 0x5e, 0xbb, 0x9e, 0xe0, 0x41, 0x3d, 0x8c, 0xfb, 0x95, 0xbd, 0xa7, + 0x04, 0xc1, 0xbb, 0xcc, 0xf0, 0x25, 0xbd, 0x1c, 0x72, 0x81, 0x3c, 0x76, 0xf2, + 0x6d, 0x3d, 0x3b, 0xf9, 0x86, 0x3d, 0xc2, 0xbe, 0x4a, 0x3d, 0x5d, 0x80, 0x5a, + 0xbd, 0x63, 0x28, 0x3b, 0xbd, 0xb4, 0xb7, 0x5e, 0x3d, 0x04, 0x5b, 0x57, 0x3d, + 0x64, 0xac, 0x56, 0xbd, 0xb6, 0x67, 0x35, 0xbd, 0xb1, 0xc7, 0x0b, 0x3d, 0x0c, + 0xae, 0x2d, 0x3d, 0xcc, 0x4c, 0x7d, 0xbc, 0x2f, 0x01, 0x34, 0x3d, 0xa8, 0x4e, + 0x63, 0x3d, 0xa3, 0xad, 0xb8, 0xbc, 0x32, 0x0c, 0x25, 0xbd, 0x66, 0x15, 0xab, + 0xbc, 0x8a, 0x1a, 0x10, 0x3d, 
0xca, 0xcb, 0x46, 0x3d, 0x4a, 0xe5, 0xfe, 0x3c, + 0x4a, 0xcc, 0xa6, 0x3c, 0x2e, 0x05, 0x4f, 0xbb, 0x31, 0xef, 0x62, 0xbc, 0xa0, + 0xeb, 0x7c, 0xbd, 0x49, 0x9b, 0x13, 0x3d, 0x07, 0x55, 0x82, 0x3d, 0xca, 0x81, + 0x1d, 0xbd, 0x67, 0xc0, 0x52, 0x3b, 0xae, 0xd6, 0x0d, 0x3d, 0x53, 0x79, 0x70, + 0xbd, 0x9c, 0x93, 0xa8, 0xbc, 0x5b, 0xbb, 0x58, 0x3d, 0x73, 0x1d, 0x0b, 0xbd, + 0xe8, 0xe9, 0x0f, 0x3d, 0x3b, 0xda, 0xbd, 0xbb, 0x66, 0x91, 0x80, 0x3d, 0x46, + 0xcc, 0xe8, 0xbc, 0x86, 0xe3, 0x32, 0x3d, 0x37, 0x9f, 0x5f, 0xbc, 0x9a, 0x06, + 0x19, 0xbd, 0xec, 0xb6, 0x78, 0xbd, 0xd9, 0xd5, 0x49, 0xbd, 0xe8, 0xf9, 0x59, + 0x3c, 0x48, 0x30, 0x8c, 0x3c, 0x03, 0x1d, 0x8a, 0x3d, 0x4d, 0x47, 0xc6, 0x3c, + 0x77, 0x88, 0x9d, 0xbd, 0x3e, 0xf0, 0x63, 0xbd, 0x83, 0x92, 0x2b, 0xbd, 0x9a, + 0xb0, 0x05, 0x3d, 0xee, 0x10, 0x86, 0x3c, 0xf1, 0xb2, 0x92, 0xbd, 0x2a, 0x0e, + 0x3f, 0xbd, 0x6c, 0xfc, 0xbb, 0xbb, 0x62, 0xee, 0x16, 0x3a, 0xf8, 0xdb, 0xa1, + 0x3c, 0x1c, 0xce, 0x43, 0xbd, 0xd3, 0xbf, 0x64, 0xbd, 0xe6, 0xb9, 0xc4, 0x3c, + 0x43, 0x6b, 0x63, 0x3c, 0xe8, 0xbd, 0x87, 0x3c, 0x95, 0x2d, 0x29, 0x3d, 0x10, + 0xbd, 0x7a, 0xbc, 0x26, 0xe3, 0x8e, 0xbd, 0xa1, 0x64, 0x70, 0xbd, 0xf7, 0x22, + 0x8f, 0x3d, 0x68, 0x73, 0x95, 0xbc, 0x33, 0x1c, 0xdb, 0xbc, 0x95, 0x44, 0x11, + 0x3d, 0xc5, 0x6c, 0x86, 0xbd, 0xf8, 0x9b, 0x8a, 0xbd, 0x48, 0xba, 0x13, 0x3c, + 0x6a, 0x54, 0x28, 0xbd, 0xd0, 0xaa, 0x15, 0xbd, 0x32, 0x4e, 0x56, 0x3d, 0x8e, + 0x65, 0x4b, 0x3d, 0x62, 0x4d, 0x76, 0xbc, 0x65, 0x5f, 0x05, 0x3d, 0x40, 0xb5, + 0xb5, 0xbb, 0x1a, 0xd6, 0x83, 0x3d, 0x9d, 0xea, 0xa7, 0x3b, 0x73, 0x19, 0x59, + 0x3c, 0xb2, 0x83, 0x25, 0xbd, 0x38, 0x93, 0x9e, 0x3c, 0x95, 0xe2, 0x7a, 0x3c, + 0xc6, 0x09, 0x95, 0xbd, 0xfe, 0x8a, 0x84, 0x3d, 0x09, 0x99, 0x8c, 0x3d, 0x3d, + 0xb5, 0x0e, 0xbd, 0x1e, 0x91, 0x8c, 0xbd, 0xc1, 0x52, 0xce, 0x3c, 0xc2, 0xa5, + 0x88, 0xbd, 0x9c, 0x3f, 0x97, 0xbd, 0x79, 0x5b, 0xd3, 0x3c, 0x20, 0xf6, 0xfd, + 0x3c, 0xcf, 0x37, 0x5f, 0x3c, 0x41, 0xc8, 0x6e, 0xbd, 0xa4, 0xde, 0xf8, 0x3c, + 0xe6, 0x88, 0x19, 0xbc, 0xe3, 0x00, 0x01, 0x3d, 0xa7, 0x4e, 0x1e, 0xbd, 0xb8, + 0xa1, 0x65, 0xbd, 0xbf, 0xfd, 0x81, 0xbd, 0xf0, 0x80, 0xe8, 0xbb, 0x3c, 0x62, + 0xdc, 0x3c, 0x02, 0x96, 0x70, 0x3d, 0x05, 0x55, 0x7d, 0xbd, 0x66, 0xb3, 0x15, + 0x3d, 0xa7, 0x8e, 0x16, 0xbd, 0xf5, 0xcf, 0x06, 0x3d, 0x5b, 0x78, 0xdf, 0xbc, + 0x54, 0xcc, 0x2c, 0xbd, 0xdc, 0x15, 0xc6, 0xbc, 0xeb, 0xaf, 0x87, 0x3d, 0x3b, + 0x65, 0x95, 0xbd, 0x52, 0x02, 0x65, 0x3d, 0x0a, 0x99, 0x0a, 0xbc, 0x6a, 0xfd, + 0x67, 0x3d, 0x00, 0x53, 0x3e, 0xbd, 0xa0, 0xbe, 0xe4, 0xbc, 0xaa, 0x76, 0xf4, + 0x3c, 0xd9, 0x22, 0x3c, 0xbd, 0x28, 0xa2, 0x3b, 0x3b, 0x44, 0x27, 0x7e, 0xbd, + 0xb3, 0xd4, 0xa8, 0x3c, 0xb3, 0x30, 0x29, 0x3b, 0xd0, 0x0f, 0x3b, 0x3b, 0x74, + 0x3e, 0x8a, 0xbd, 0x2f, 0x61, 0x1f, 0xbd, 0x58, 0x65, 0x4a, 0xbd, 0xd7, 0xb7, + 0xf8, 0xbc, 0xfd, 0x91, 0x25, 0xbd, 0xfd, 0xd2, 0x39, 0xbd, 0x49, 0xa6, 0x82, + 0x3d, 0xd8, 0x60, 0x04, 0x3d, 0xf8, 0x76, 0xac, 0x3c, 0x18, 0x61, 0x2d, 0xbc, + 0xd6, 0xf2, 0x0b, 0xbd, 0x18, 0x53, 0x01, 0x3c, 0xac, 0x10, 0xb7, 0x3c, 0x22, + 0xab, 0xd0, 0xbc, 0x40, 0x50, 0x3b, 0x3a, 0xf4, 0x70, 0x44, 0xbd, 0xb8, 0xaa, + 0x81, 0xbd, 0x09, 0x70, 0x8f, 0x3c, 0x51, 0x00, 0xc5, 0xbc, 0x41, 0x17, 0xb8, + 0xbc, 0xd2, 0xe1, 0x07, 0xbd, 0x58, 0xa0, 0x95, 0xbd, 0x7d, 0x24, 0x4b, 0xbd, + 0x47, 0x50, 0x5f, 0x3d, 0x4a, 0x41, 0x1e, 0x3d, 0xc1, 0x38, 0x21, 0xbd, 0xbd, + 0x82, 0x13, 0x3d, 0xdb, 0xe8, 0x4d, 0xbd, 0x76, 0x8d, 0x1d, 0xbc, 0x96, 0x2f, + 0x72, 0x3d, 0xa9, 0x4c, 0x56, 0xbd, 0xe3, 0x39, 0x79, 0x3d, 0xf2, 0xaa, 0x0e, + 0x3d, 0xee, 0xfa, 0x27, 0x3d, 0x70, 0x0c, 0x24, 0x3c, 0x3c, 
0xf8, 0x7e, 0xbd, + 0xc2, 0x3b, 0x55, 0xbb, 0x83, 0x9c, 0xcc, 0x3b, 0x52, 0x0f, 0x5d, 0x3d, 0x86, + 0x3f, 0x3a, 0xbc, 0xf0, 0xbb, 0xbc, 0xbb, 0xe0, 0xff, 0xaf, 0x3c, 0x12, 0xca, + 0x22, 0x3c, 0xd4, 0x78, 0x41, 0xbc, 0xc9, 0xaa, 0x1f, 0xbd, 0x7c, 0x59, 0x9e, + 0x3a, 0x1a, 0x15, 0x4d, 0xbc, 0x25, 0x53, 0xfa, 0xbc, 0x6e, 0xbb, 0x82, 0xbc, + 0xc2, 0x7d, 0x8d, 0x3c, 0xa8, 0x73, 0x19, 0xbd, 0x04, 0x34, 0x4c, 0xbc, 0xbb, + 0x37, 0x5e, 0x3d, 0xb8, 0xc0, 0x30, 0x3d, 0xac, 0x71, 0x9d, 0xbd, 0xf8, 0x58, + 0x2a, 0x3b, 0xd0, 0x94, 0xa4, 0x3b, 0xeb, 0x76, 0x5a, 0xbc, 0xcf, 0x43, 0x94, + 0x3c, 0x48, 0x10, 0x66, 0x3d, 0x35, 0xee, 0x78, 0xbc, 0x29, 0x9a, 0x64, 0x3c, + 0x39, 0x2a, 0x27, 0x3d, 0xab, 0x94, 0x8a, 0x3d, 0xb2, 0x3c, 0x0f, 0xbd, 0x76, + 0x7f, 0x46, 0xbd, 0x68, 0xb2, 0x96, 0xbc, 0x98, 0xa2, 0x61, 0x3d, 0x97, 0x72, + 0x92, 0xbd, 0xde, 0xac, 0x51, 0xbd, 0x03, 0xb8, 0x74, 0x3d, 0xb5, 0x3b, 0x8a, + 0xbc, 0x70, 0xbf, 0x42, 0xbd, 0xf0, 0x0f, 0xf9, 0x3b, 0xb6, 0x4d, 0xc5, 0x3c, + 0x16, 0xeb, 0x72, 0x3d, 0x90, 0x81, 0xcd, 0xbb, 0x00, 0x8b, 0x0b, 0xbc, 0xb1, + 0x02, 0xa5, 0x3c, 0xee, 0xa7, 0x7d, 0xbd, 0xf0, 0x26, 0x0e, 0xbd, 0x1c, 0xb0, + 0x52, 0xbd, 0x80, 0xdd, 0x2f, 0xbd, 0x43, 0xbb, 0xeb, 0xbc, 0xf9, 0xa6, 0xd1, + 0xbc, 0xb1, 0x67, 0x29, 0xbd, 0xaa, 0xee, 0xf4, 0x3b, 0xc4, 0xab, 0x59, 0xbd, + 0xb8, 0x83, 0x36, 0x3d, 0x20, 0xfc, 0x60, 0x3b, 0x28, 0xdd, 0x59, 0xbd, 0x5c, + 0x16, 0xd1, 0xbc, 0x00, 0xbc, 0xcb, 0xbc, 0x9f, 0x8e, 0x62, 0xbc, 0x8e, 0xde, + 0x53, 0xbd, 0xec, 0x4f, 0x26, 0x3d, 0xde, 0x94, 0x46, 0xbd, 0x50, 0x30, 0x0e, + 0x3c, 0x20, 0xef, 0x7b, 0xbd, 0x83, 0x86, 0x38, 0x3c, 0x5a, 0xff, 0x1f, 0xbd, + 0x61, 0x3e, 0xd5, 0xbc, 0x0b, 0xac, 0x65, 0x3c, 0xfd, 0x06, 0xa5, 0x3c, 0x2c, + 0x94, 0x47, 0xbd, 0xe2, 0xc3, 0x7e, 0x3d, 0x40, 0xac, 0x67, 0x3d, 0xa4, 0x7a, + 0x77, 0xbc, 0xfc, 0x13, 0xe7, 0x3c, 0x56, 0x69, 0x80, 0x3d, 0x27, 0x58, 0x18, + 0x3d, 0x1e, 0x95, 0x0e, 0x3d, 0x3f, 0xa8, 0x41, 0x3d, 0x0f, 0xbb, 0x16, 0xbd, + 0x45, 0x72, 0x89, 0xbd, 0xf1, 0xd2, 0xfb, 0x3c, 0x8f, 0x6b, 0x65, 0x3d, 0x50, + 0x8a, 0x05, 0x3c, 0x99, 0x24, 0x90, 0xbd, 0xc8, 0x4d, 0x4f, 0x3d, 0x80, 0xb8, + 0xd2, 0x3b, 0xe5, 0x51, 0xae, 0x3b, 0x25, 0x33, 0x2a, 0xbd, 0x05, 0x12, 0xd7, + 0x3c, 0xc2, 0x1b, 0x33, 0x3c, 0x5f, 0x8d, 0x07, 0xbc, 0x79, 0x60, 0x26, 0x3d, + 0xf7, 0x63, 0x83, 0x3d, 0x88, 0xb4, 0xc7, 0xbc, 0x40, 0x5d, 0xb0, 0xba, 0x6e, + 0xaf, 0x39, 0xbd, 0x50, 0x93, 0xf3, 0x3c, 0xc4, 0x3b, 0x53, 0x3c, 0xf9, 0x8b, + 0x60, 0xbd, 0x74, 0x4e, 0xbd, 0x3c, 0x40, 0xe6, 0xdd, 0x3c, 0x30, 0x78, 0x18, + 0x3d, 0xaa, 0xed, 0x76, 0x3d, 0xd7, 0x20, 0x4b, 0x3d, 0x30, 0x08, 0xd1, 0x3c, + 0x52, 0xf0, 0x61, 0x3d, 0x75, 0xea, 0x6a, 0x3d, 0x93, 0xef, 0xeb, 0x3c, 0x35, + 0xad, 0x96, 0xbd, 0xca, 0x41, 0x21, 0x3d, 0x59, 0x18, 0x1e, 0x3d, 0x2c, 0xa8, + 0x81, 0xbd, 0x7e, 0xdb, 0xd7, 0x3c, 0xfc, 0x7e, 0x1b, 0xbd, 0x26, 0x25, 0x86, + 0x3d, 0xa9, 0x58, 0x9b, 0xbd, 0x0a, 0xef, 0xfa, 0xbc, 0xfe, 0x74, 0x74, 0x3d, + 0xb0, 0x51, 0x80, 0xbd, 0x29, 0x42, 0x88, 0x3a, 0x56, 0xe7, 0x8c, 0xbb, 0x16, + 0x5f, 0x43, 0x3d, 0x5b, 0x1d, 0x4c, 0x3c, 0xae, 0x9d, 0xbd, 0xbb, 0xbc, 0xcf, + 0x44, 0xbc, 0x78, 0x8d, 0x6c, 0x3d, 0x30, 0x99, 0x2c, 0x3d, 0x52, 0x17, 0x9e, + 0xbc, 0x3d, 0x52, 0x18, 0xbd, 0xfa, 0xcc, 0xb4, 0x3c, 0x9d, 0x56, 0x8d, 0x3d, + 0x7e, 0xa0, 0x18, 0x3d, 0x88, 0x7b, 0x94, 0xbd, 0xe8, 0x02, 0xc7, 0xbc, 0x08, + 0x22, 0x37, 0x3c, 0x18, 0x3b, 0x5d, 0xbd, 0xa4, 0xbb, 0xb4, 0x3c, 0xb0, 0x8d, + 0x06, 0x3d, 0xe8, 0xf4, 0xb0, 0xbb, 0xb4, 0x8b, 0x31, 0xbc, 0xf8, 0xdf, 0xf4, + 0x3c, 0x29, 0x19, 0x80, 0xbb, 0x29, 0x4c, 0x60, 0x3c, 0x4b, 0x11, 0x93, 0xbd, + 0x4b, 0xbd, 
0x66, 0xbd, 0x62, 0x8e, 0x88, 0x3c, 0xfe, 0xa2, 0x37, 0x3d, 0x41, + 0xe1, 0x36, 0xbd, 0xbe, 0x7b, 0xc1, 0x3b, 0x6c, 0xff, 0xba, 0x3c, 0x8f, 0xae, + 0xab, 0xbc, 0x7b, 0x37, 0xd5, 0xbc, 0x0d, 0xac, 0x18, 0xbd, 0xf2, 0xcb, 0x1d, + 0x3d, 0xbb, 0xb0, 0x30, 0x3c, 0xbb, 0x1a, 0x41, 0x3b, 0x5b, 0x36, 0x11, 0xbd, + 0x96, 0xb3, 0x86, 0x3d, 0x0b, 0xcb, 0xf9, 0x3c, 0x5c, 0x23, 0x60, 0xbc, 0x62, + 0xe1, 0x33, 0xbd, 0x10, 0x91, 0x5e, 0x3d, 0xdf, 0xc8, 0x6c, 0xbd, 0xe7, 0x19, + 0x60, 0x3d, 0x87, 0xa0, 0x5b, 0x3c, 0x8a, 0xc5, 0x65, 0x3d, 0x6c, 0x2e, 0x31, + 0x3d, 0x99, 0xc7, 0x1a, 0x3d, 0xe8, 0xe6, 0x6f, 0x3c, 0x10, 0x95, 0xd9, 0x3b, + 0x1d, 0xdd, 0x19, 0xbd, 0xdc, 0xfe, 0x32, 0x3d, 0x83, 0x85, 0x05, 0x3d, 0xd8, + 0x24, 0x16, 0x3d, 0xf7, 0x73, 0x20, 0xbd, 0x77, 0x07, 0xc4, 0x3c, 0xdf, 0xd0, + 0x92, 0x3c, 0x1a, 0x7d, 0x2c, 0xba, 0xb0, 0x19, 0xe8, 0xbc, 0x9e, 0x97, 0xec, + 0xbb, 0x33, 0xb2, 0xb1, 0x3c, 0x89, 0xde, 0x81, 0xbd, 0x9d, 0xae, 0x57, 0xbc, + 0x31, 0xd9, 0xbb, 0x3c, 0xa0, 0x2d, 0x27, 0x3d, 0x00, 0x99, 0x43, 0x3c, 0x2e, + 0x32, 0x9d, 0xbc, 0xa2, 0x6d, 0x81, 0x3d, 0x38, 0xce, 0xc3, 0xbc, 0x8e, 0xd7, + 0x7a, 0x3d, 0x2a, 0x89, 0x00, 0xbc, 0x2e, 0x52, 0x9f, 0xbc, 0x20, 0x47, 0x4d, + 0xbd, 0xd9, 0x79, 0x5f, 0x3d, 0x09, 0x2c, 0x97, 0x3c, 0x9c, 0x28, 0x5f, 0x3b, + 0x9d, 0xd3, 0x65, 0x3d, 0x44, 0x63, 0xbb, 0xbc, 0x0c, 0xfe, 0xc0, 0x3c, 0x71, + 0xfa, 0x08, 0xbd, 0x40, 0x4a, 0xac, 0x3b, 0xca, 0x9d, 0x7a, 0x3d, 0xbd, 0x1c, + 0x52, 0xbd, 0xc8, 0x90, 0x0e, 0x3d, 0x6b, 0x89, 0xbd, 0xbc, 0xa0, 0x74, 0x77, + 0x3c, 0x8a, 0xe4, 0x44, 0xbd, 0x5f, 0x81, 0x56, 0x3c, 0x39, 0x9a, 0xc9, 0xbc, + 0x33, 0xf4, 0x07, 0xbd, 0x48, 0xe0, 0x94, 0xbd, 0x3f, 0xfc, 0xdf, 0xbc, 0x41, + 0x3e, 0xa9, 0x3c, 0x18, 0x06, 0x0e, 0x3c, 0xfb, 0xb9, 0xe2, 0x3c, 0x12, 0x14, + 0x26, 0xbc, 0x8b, 0x15, 0x97, 0xbd, 0x43, 0xc8, 0x23, 0xbd, 0x8e, 0x30, 0xf7, + 0x3a, 0x4c, 0xdc, 0x4f, 0xbd, 0x52, 0x50, 0x3c, 0xbc, 0xda, 0x70, 0x1b, 0x3d, + 0xfc, 0xbc, 0x3a, 0x3d, 0x76, 0x5a, 0x39, 0xbd, 0x48, 0xc3, 0x50, 0x3d, 0xf9, + 0xd3, 0x81, 0xbd, 0x1e, 0xdf, 0x09, 0xbd, 0xd3, 0xa3, 0x7a, 0x3d, 0x71, 0x42, + 0x6b, 0xbd, 0x7e, 0x3a, 0x4e, 0x3d, 0xd0, 0x26, 0xc5, 0xbb, 0xde, 0x7d, 0x2d, + 0x3d, 0xc0, 0xda, 0xd8, 0xba, 0x18, 0x43, 0x63, 0x3c, 0xb5, 0x93, 0xb6, 0x3c, + 0xc7, 0xee, 0x49, 0xbd, 0xb2, 0x73, 0x47, 0xbd, 0xa6, 0x66, 0x3b, 0x3d, 0xea, + 0xa2, 0x04, 0xbd, 0xde, 0x2b, 0x44, 0x3d, 0x41, 0x80, 0xee, 0x3c, 0x11, 0xbe, + 0x72, 0x3c, 0x46, 0xdf, 0x63, 0xbc, 0x4d, 0xc3, 0xfb, 0xbc, 0x3d, 0xbc, 0x86, + 0x3d, 0xf7, 0xad, 0x02, 0xbd, 0x7d, 0xb7, 0x0f, 0xbd, 0x99, 0x8c, 0x51, 0x3c, + 0x85, 0xce, 0x50, 0xbd, 0x0d, 0xe0, 0x41, 0x3d, 0x3a, 0xb3, 0x21, 0xbb, 0xd0, + 0x0b, 0xdd, 0xbb, 0x94, 0x62, 0x25, 0xbd, 0xc0, 0xab, 0xd1, 0xbc, 0xf0, 0xf6, + 0x89, 0xbb, 0xbe, 0x10, 0xb9, 0xbc, 0x68, 0x2e, 0x3a, 0x3c, 0x22, 0x34, 0x20, + 0xbd, 0x4d, 0xd9, 0x75, 0xbc, 0x74, 0x5d, 0x00, 0x3d, 0xf3, 0xd5, 0x5e, 0x3d, + 0x7c, 0x61, 0xcc, 0xbc, 0x56, 0x76, 0x13, 0x3d, 0xda, 0x68, 0xe3, 0x3b, 0xa3, + 0xa1, 0x89, 0x3d, 0xd0, 0xfa, 0x16, 0x3d, 0xf1, 0x86, 0x48, 0x3c, 0x71, 0x81, + 0x83, 0x3b, 0x31, 0x30, 0x2a, 0xbd, 0x4e, 0xc0, 0xd6, 0x3c, 0xe6, 0xf3, 0xfd, + 0xba, 0x6d, 0x46, 0x96, 0x3c, 0x60, 0xcc, 0x67, 0xbd, 0x11, 0x9c, 0xc6, 0x3c, + 0xa8, 0x63, 0x21, 0xbd, 0xdb, 0xb3, 0x70, 0xbc, 0x42, 0x46, 0x38, 0xbd, 0x88, + 0x73, 0x00, 0xbc, 0x48, 0x5e, 0x4e, 0x3d, 0x2d, 0x95, 0x26, 0xbd, 0xa0, 0x22, + 0xb3, 0x3c, 0x56, 0xfb, 0x91, 0xbd, 0x51, 0x13, 0x06, 0x3c, 0x85, 0x69, 0x8a, + 0x3d, 0x23, 0xf8, 0x89, 0xbd, 0x61, 0x24, 0xd3, 0xbc, 0x28, 0xd0, 0x0a, 0x3c, + 0xe9, 0x4e, 0x85, 0x3d, 0xde, 0x12, 0x93, 
0xbb, 0x18, 0x55, 0xdd, 0x3b, 0x57, + 0xc2, 0x22, 0xbd, 0x85, 0x3f, 0x0a, 0xbd, 0x9d, 0x49, 0x86, 0x3d, 0x50, 0x01, + 0x8f, 0x3b, 0x2c, 0xbf, 0xf5, 0xbc, 0x6b, 0xec, 0x04, 0x3c, 0x92, 0x0e, 0x9b, + 0xbc, 0xfc, 0xe0, 0x28, 0xbd, 0x16, 0xeb, 0x9d, 0xbb, 0x20, 0xde, 0xf9, 0x3c, + 0x58, 0x77, 0x06, 0xbd, 0x5c, 0x2a, 0x92, 0xbc, 0x62, 0x8d, 0xf6, 0xbc, 0x88, + 0xcc, 0xa3, 0xbb, 0x60, 0xbf, 0xdb, 0x3c, 0x2c, 0xcb, 0x69, 0xbd, 0xe3, 0xcf, + 0x89, 0xbb, 0x35, 0xad, 0x81, 0xbd, 0xf1, 0x3d, 0x3d, 0xbd, 0x05, 0x62, 0x81, + 0x3d, 0x4e, 0xbe, 0x4d, 0x3c, 0x7e, 0xbf, 0x85, 0x3d, 0xfb, 0xc4, 0x23, 0xbb, + 0xd8, 0x1b, 0x78, 0x3d, 0x1d, 0xd7, 0x9d, 0xbd, 0x5d, 0x69, 0x15, 0x3d, 0xb6, + 0x7a, 0x93, 0xbc, 0x8c, 0xf1, 0xdf, 0xbc, 0xec, 0xfa, 0x2b, 0x3d, 0x40, 0xda, + 0x86, 0x3a, 0x1c, 0x0e, 0x2f, 0xbd, 0x38, 0x71, 0x4c, 0x3d, 0x68, 0x87, 0x9a, + 0xbd, 0x12, 0x86, 0x91, 0xbd, 0x60, 0x8f, 0x95, 0xbd, 0xd0, 0xe1, 0xf4, 0xbc, + 0xa2, 0x77, 0x3f, 0x3d, 0xc0, 0xcd, 0xa1, 0x3c, 0xa2, 0x69, 0x6e, 0xbd, 0xba, + 0xc9, 0x79, 0x3d, 0x6d, 0x05, 0xec, 0xbc, 0xb0, 0x63, 0x57, 0x3d, 0xfa, 0x05, + 0xd4, 0xbc, 0xb2, 0xd2, 0x93, 0x3b, 0x7e, 0x40, 0x09, 0xbd, 0xf0, 0x2e, 0xd6, + 0x3c, 0x00, 0x7b, 0x69, 0xbd, 0x6e, 0x10, 0x29, 0xbd, 0x69, 0x91, 0x92, 0xbb, + 0x90, 0x9e, 0x38, 0x3d, 0x99, 0x1b, 0x69, 0xbd, 0x32, 0xd2, 0x49, 0x3d, 0x9d, + 0xa4, 0x5d, 0xbd, 0x8b, 0x8e, 0x20, 0xbd, 0xcf, 0x0b, 0x92, 0xbd, 0x3c, 0xb7, + 0xfb, 0x3c, 0xdf, 0xf9, 0x58, 0x3d, 0xa7, 0xf0, 0x3e, 0xbb, 0x6c, 0x7e, 0xbd, + 0x3c, 0x83, 0xdf, 0x12, 0x3d, 0x37, 0x97, 0x84, 0x3d, 0xe0, 0x4e, 0x36, 0x3d, + 0xf6, 0x06, 0x90, 0xbd, 0x07, 0xc0, 0xce, 0x3c, 0xb1, 0xc0, 0x49, 0x3d, 0x7b, + 0x76, 0x02, 0x3c, 0x29, 0x97, 0x93, 0x3b, 0x16, 0x46, 0x45, 0xbd, 0x10, 0xb1, + 0x92, 0x3b, 0x26, 0x69, 0x45, 0x3d, 0x1e, 0x1a, 0x6d, 0x3d, 0x60, 0x9f, 0xe3, + 0x3b, 0x07, 0xab, 0x5f, 0x3d, 0x65, 0xce, 0x35, 0xbd, 0x61, 0x0d, 0x43, 0xbd, + 0x56, 0xa7, 0x79, 0x3d, 0x61, 0x67, 0x37, 0x3d, 0x26, 0xf4, 0x90, 0xbd, 0x73, + 0x2e, 0x1b, 0x3d, 0x39, 0x48, 0xe2, 0xb9, 0x57, 0x1e, 0x32, 0x3d, 0xaa, 0x2d, + 0x16, 0x3c, 0xae, 0x6a, 0x94, 0xbc, 0xc1, 0x8b, 0x1e, 0xbd, 0xf1, 0x42, 0x4f, + 0xbd, 0x6d, 0x34, 0x66, 0x3d, 0xc2, 0x39, 0x6a, 0xbd, 0x6e, 0x02, 0xab, 0x3c, + 0xa8, 0x60, 0x3d, 0xbd, 0x69, 0x24, 0x93, 0xbd, 0xd2, 0x91, 0x8a, 0xbd, 0xfe, + 0xa0, 0x30, 0xbd, 0xbd, 0x15, 0x28, 0xbd, 0x00, 0x1c, 0x02, 0x3a, 0x2e, 0xe2, + 0x5b, 0xbb, 0xda, 0x90, 0x4d, 0x3d, 0x56, 0xc4, 0xd3, 0xbc, 0x25, 0xb8, 0x6d, + 0x3d, 0x89, 0xe0, 0x47, 0x3d, 0x60, 0x4b, 0x04, 0xbb, 0x00, 0xd5, 0xdc, 0x39, + 0x33, 0xc0, 0x7e, 0x3d, 0xce, 0x0c, 0x51, 0xbd, 0xb2, 0x49, 0xf0, 0xbc, 0xc8, + 0x62, 0xa2, 0xbc, 0xdc, 0x45, 0x2a, 0x3d, 0x5e, 0xe2, 0x1b, 0xbd, 0xa6, 0x02, + 0x9a, 0xbd, 0xe2, 0xf0, 0x89, 0xbd, 0xff, 0x15, 0xa8, 0xbc, 0xc2, 0x94, 0xb9, + 0x3c, 0x8a, 0x28, 0x8b, 0xbc, 0x27, 0x32, 0x7d, 0x3d, 0x2b, 0x24, 0x75, 0xbd, + 0xc1, 0x7f, 0x05, 0xbd, 0x8b, 0x7f, 0x28, 0xbd, 0xa4, 0xd9, 0x9a, 0xbc, 0x03, + 0xc7, 0x23, 0xbc, 0xac, 0xd5, 0x6d, 0xbc, 0xfb, 0xf5, 0x70, 0xbc, 0x5c, 0x28, + 0x5c, 0xbd, 0xf5, 0xa5, 0x54, 0x3d, 0xc4, 0x5f, 0x87, 0xbd, 0x28, 0x92, 0x51, + 0x3c, 0x10, 0xc1, 0x87, 0x3d, 0x00, 0xeb, 0x1c, 0x3c, 0x9a, 0x6a, 0x52, 0x3d, + 0x95, 0xc5, 0x1a, 0x3d, 0x9d, 0x84, 0x9b, 0x3c, 0x56, 0x33, 0xda, 0xbc, 0x28, + 0x01, 0x64, 0x3d, 0xb1, 0x80, 0x4f, 0xbd, 0x50, 0x61, 0x89, 0xbd, 0xe0, 0x1f, + 0x30, 0xbb, 0x63, 0x5a, 0x86, 0x3d, 0x06, 0x30, 0x56, 0x3d, 0xc6, 0x8e, 0x4e, + 0xbd, 0xd1, 0xb8, 0xc6, 0xbc, 0xc6, 0x6c, 0xf4, 0xbc, 0x6c, 0x6f, 0x21, 0x3d, + 0xea, 0x45, 0x86, 0x3c, 0xe7, 0x7b, 0x1c, 0xbd, 0xba, 0x38, 0x54, 0xbd, 
0xa4, + 0x78, 0x82, 0x3d, 0xdc, 0x98, 0x18, 0xbc, 0xa0, 0x85, 0x0d, 0x3d, 0x9e, 0xe7, + 0x55, 0xbd, 0x8e, 0x64, 0x30, 0x3d, 0xda, 0xf4, 0x48, 0x3d, 0x69, 0xdc, 0xe8, + 0x3c, 0x68, 0xc7, 0x0d, 0xbd, 0xdf, 0x7e, 0xb4, 0x3c, 0x3a, 0x30, 0x57, 0x3d, + 0xc5, 0x7a, 0x1a, 0xbc, 0x42, 0xa7, 0x8c, 0x3d, 0xb1, 0x9c, 0x4f, 0x3d, 0xa0, + 0x74, 0x36, 0xbc, 0x7e, 0x74, 0x25, 0x3d, 0xc8, 0x7c, 0x48, 0x3d, 0x7f, 0x68, + 0x55, 0x3c, 0xa6, 0x62, 0xf8, 0xbc, 0x16, 0x5b, 0x2d, 0x3d, 0x79, 0x57, 0x6a, + 0xbd, 0x86, 0xf0, 0x8b, 0xbc, 0x20, 0x1c, 0x3f, 0x3c, 0x92, 0x3d, 0x20, 0x3d, + 0x40, 0x29, 0x7b, 0xbd, 0x32, 0x88, 0x5b, 0x3d, 0x28, 0x79, 0x2c, 0x3c, 0xeb, + 0x80, 0xe3, 0x3c, 0xe5, 0x28, 0xa1, 0x3c, 0x95, 0xbb, 0x88, 0x3d, 0x1b, 0xa9, + 0x95, 0xbc, 0xb0, 0x35, 0x5b, 0x3d, 0x02, 0xbd, 0x8e, 0xbc, 0x62, 0xe7, 0x1d, + 0xbd, 0xad, 0xe5, 0xca, 0x3c, 0x6f, 0x93, 0x3f, 0xb9, 0x51, 0x7d, 0x48, 0xbd, + 0x06, 0x75, 0x68, 0x3d, 0xa7, 0x08, 0x7b, 0xbd, 0x5e, 0xeb, 0x73, 0xba, 0xa1, + 0x83, 0x31, 0x3d, 0xcd, 0x92, 0x55, 0x3c, 0x88, 0xdb, 0x3f, 0xbd, 0x67, 0x9c, + 0x35, 0x3d, 0xa9, 0x4b, 0x14, 0x3d, 0x94, 0x6b, 0x6c, 0xbc, 0x6c, 0xa8, 0xe7, + 0x3c, 0xc0, 0x02, 0xf7, 0xbb, 0xcb, 0xbc, 0x85, 0x3a, 0xf1, 0x91, 0xf0, 0xbc, + 0x72, 0x77, 0x83, 0x3d, 0x68, 0xab, 0x30, 0x3d, 0xa0, 0x17, 0x96, 0xbc, 0x7d, + 0xe6, 0x19, 0xbd, 0x18, 0x2c, 0x22, 0x3d, 0x88, 0x14, 0xaa, 0x3c, 0x40, 0x4d, + 0xb3, 0xbc, 0x4c, 0xc2, 0x7a, 0xbc, 0xf8, 0x68, 0x53, 0x3c, 0x16, 0x1d, 0xc6, + 0xbb, 0x2f, 0x2c, 0x71, 0xbd, 0xa3, 0x55, 0x80, 0x3d, 0x96, 0x18, 0x07, 0x3d, + 0x34, 0xa8, 0xa1, 0xbc, 0x2b, 0x39, 0x58, 0x3d, 0x23, 0xc6, 0x68, 0x3d, 0x46, + 0x84, 0x55, 0x3d, 0x0d, 0xd6, 0x3e, 0x3c, 0x2e, 0xc2, 0x0d, 0x3d, 0x88, 0x20, + 0x26, 0x3c, 0x44, 0x1b, 0x23, 0x3d, 0x7f, 0x54, 0x8b, 0xbd, 0xda, 0xa3, 0x54, + 0xbd, 0x9e, 0xad, 0x32, 0x3d, 0x17, 0x7c, 0x78, 0x3d, 0xcd, 0x11, 0x9f, 0xbc, + 0x2c, 0x53, 0x57, 0x3b, 0x1a, 0x5a, 0x0a, 0xbd, 0x6d, 0x40, 0x67, 0x3d, 0x52, + 0xb6, 0x56, 0x3d, 0x1c, 0x07, 0x96, 0xbd, 0xb0, 0x1c, 0x14, 0xbd, 0xc3, 0xda, + 0x2b, 0x3c, 0x7a, 0x02, 0x61, 0x3d, 0xbd, 0x9f, 0x2a, 0xbd, 0x72, 0xf9, 0xbf, + 0xbc, 0x79, 0xfe, 0xa3, 0x3c, 0xfc, 0x45, 0x43, 0xbd, 0x9e, 0xd3, 0x7b, 0x3d, + 0x70, 0x3a, 0x6e, 0xbd, 0x78, 0xdc, 0x30, 0x3c, 0x93, 0x36, 0x67, 0x3d, 0x63, + 0x08, 0x84, 0x3d, 0x5e, 0x4f, 0x40, 0x3a, 0xc5, 0xd9, 0xc1, 0x3c, 0xea, 0x6b, + 0x31, 0x3d, 0x1e, 0xf8, 0xdc, 0xbb, 0x0b, 0x30, 0xfd, 0xbc, 0xc6, 0xf2, 0x87, + 0x3d, 0xc5, 0xc9, 0xc7, 0x3c, 0x98, 0x0c, 0xba, 0x3b, 0xcf, 0x1a, 0x8d, 0xbd, + 0x90, 0xa5, 0xe1, 0xbb, 0x16, 0xc3, 0x64, 0x3d, 0x03, 0x3a, 0x95, 0x3c, 0xaa, + 0x98, 0x32, 0xbd, 0x95, 0xa5, 0x95, 0xbd, 0xde, 0x9e, 0x88, 0x3a, 0xbb, 0x39, + 0x8e, 0xbd, 0x3d, 0xf1, 0x30, 0x3d, 0x6e, 0x57, 0x8c, 0x3d, 0xf3, 0x90, 0x25, + 0xbd, 0xf8, 0x97, 0x2e, 0xbd, 0x21, 0xf3, 0x1b, 0x3d, 0x34, 0xd9, 0x5d, 0xbc, + 0x24, 0x60, 0x23, 0xbc, 0x32, 0x24, 0xa6, 0x3b, 0x01, 0xf1, 0x61, 0xbd, 0x69, + 0x3b, 0xaa, 0x3c, 0x54, 0xf0, 0x53, 0xbd, 0x40, 0x67, 0x64, 0x3b, 0x00, 0x84, + 0xa1, 0xbb, 0xda, 0xb5, 0x6e, 0x3d, 0x0f, 0xfb, 0x3d, 0xbc, 0xf9, 0xf3, 0x0c, + 0xbd, 0x5b, 0x52, 0xd1, 0xbb, 0x43, 0xf7, 0x04, 0xbd, 0xf9, 0x67, 0x7c, 0x3d, + 0x36, 0xed, 0x30, 0xbd, 0xcf, 0x53, 0x62, 0x3c, 0x03, 0xbb, 0x79, 0xbd, 0x6d, + 0xc8, 0x40, 0x3d, 0xc5, 0x5c, 0x19, 0x3d, 0x0e, 0xd5, 0x2d, 0xbd, 0x2d, 0x89, + 0x92, 0x3d, 0xf3, 0xcc, 0x15, 0x3d, 0xe2, 0x92, 0x9e, 0xbc, 0x44, 0x74, 0x8e, + 0xbd, 0x6b, 0x27, 0x96, 0xbd, 0x86, 0xcb, 0xe8, 0x3c, 0xab, 0xda, 0x99, 0xbb, + 0xf6, 0x99, 0x19, 0xbb, 0xe8, 0xb3, 0x49, 0x3d, 0xa4, 0x79, 0x85, 0x3c, 0x4f, + 0xb4, 0xf5, 0xbc, 0x5c, 
0x1a, 0xa9, 0xbc, 0xa7, 0x63, 0x1f, 0xbd, 0x33, 0xff, + 0x46, 0xbd, 0x39, 0x7f, 0x97, 0xbd, 0xd8, 0x75, 0x85, 0xbd, 0x55, 0x97, 0x94, + 0xbc, 0x3e, 0x73, 0xb0, 0x3c, 0xf8, 0xb8, 0xee, 0x3c, 0xa0, 0xe4, 0x6e, 0x3b, + 0x00, 0xde, 0x54, 0x3b, 0x3b, 0x2d, 0x90, 0xbc, 0xae, 0xd9, 0x89, 0xbd, 0x65, + 0x3d, 0xf9, 0x3c, 0x5f, 0x64, 0x8a, 0xbd, 0x88, 0x25, 0x7c, 0xbb, 0x8c, 0x64, + 0x35, 0xbc, 0x63, 0x28, 0x0c, 0x3d, 0x2d, 0x9c, 0xde, 0xbb, 0x62, 0x5c, 0x96, + 0xbc, 0x12, 0x3c, 0x35, 0x3d, 0x50, 0x11, 0xcc, 0x3b, 0x56, 0x1a, 0x80, 0xbd, + 0xd0, 0x1a, 0x98, 0xba, 0x88, 0xe4, 0x58, 0x3d, 0x09, 0xc2, 0x9e, 0x3b, 0xce, + 0xc4, 0x3c, 0xbc, 0x88, 0x46, 0x09, 0xbd, 0xea, 0xde, 0x04, 0x3c, 0xd4, 0x45, + 0x5d, 0xbd, 0x18, 0x90, 0x7e, 0x3d, 0x99, 0x67, 0x91, 0x3d, 0x8d, 0x01, 0xd7, + 0xbc, 0x61, 0xdc, 0x6b, 0x3d, 0x36, 0x17, 0x96, 0x3c, 0x7e, 0x27, 0x6f, 0x3d, + 0x52, 0xcb, 0xf7, 0x3c, 0xfc, 0x54, 0x75, 0xbc, 0x36, 0xbd, 0x25, 0x3d, 0x86, + 0xd1, 0x7b, 0xbd, 0x5c, 0x19, 0x12, 0x3d, 0xda, 0xfb, 0x03, 0x3d, 0xee, 0x5f, + 0x37, 0xbd, 0xd4, 0x39, 0x34, 0xbd, 0xb4, 0x2f, 0x8b, 0xbd, 0x29, 0xd4, 0x99, + 0xbd, 0x4e, 0x31, 0x4a, 0x3c, 0x3a, 0x73, 0x7b, 0x3d, 0x97, 0x99, 0xac, 0xbb, + 0x77, 0xe4, 0xac, 0xbc, 0x0c, 0x31, 0xc3, 0xbb, 0xd7, 0xdb, 0x85, 0x3d, 0x31, + 0x4d, 0xd5, 0xbb, 0xb8, 0x71, 0xda, 0x3c, 0x7c, 0x01, 0x5a, 0x3d, 0x32, 0xe9, + 0x57, 0x3d, 0x6f, 0xd9, 0x7a, 0x3d, 0x38, 0x6a, 0x77, 0xbc, 0x7b, 0x63, 0x5c, + 0xbd, 0x8c, 0xe0, 0x02, 0xbd, 0xf2, 0x35, 0x47, 0x3d, 0x93, 0x0e, 0x59, 0xbd, + 0xf8, 0xfa, 0x63, 0x3d, 0x1c, 0x59, 0x49, 0xbd, 0x48, 0x00, 0x3c, 0xbc, 0x52, + 0xd8, 0x14, 0x3d, 0xc3, 0x56, 0x42, 0x3c, 0x7d, 0x74, 0xa9, 0x3c, 0x15, 0x40, + 0x83, 0x3d, 0x9c, 0x8d, 0xe2, 0xbc, 0x47, 0xdb, 0x86, 0x3d, 0xcc, 0x7f, 0x2d, + 0xbd, 0x39, 0xdd, 0x8f, 0x3d, 0xe8, 0xe7, 0x0c, 0x3c, 0xc0, 0xc6, 0xfa, 0x3a, + 0x5e, 0x6c, 0x85, 0xbd, 0xae, 0x8d, 0x79, 0x3d, 0x29, 0x90, 0xd8, 0x3c, 0x09, + 0x17, 0x85, 0xbc, 0x4d, 0xf9, 0x71, 0xbd, 0x74, 0xa6, 0xf3, 0xbb, 0xf0, 0x65, + 0xee, 0xbc, 0x42, 0x45, 0x7b, 0x3d, 0xdc, 0x2b, 0x5e, 0xbd, 0x35, 0x5f, 0x3f, + 0x3d, 0x10, 0x00, 0xdd, 0x3b, 0xb8, 0xd0, 0x94, 0xbc, 0xe8, 0xb4, 0xcc, 0xbc, + 0xb3, 0x71, 0x2d, 0x3c, 0x00, 0x36, 0xc0, 0x3c, 0x3e, 0x20, 0x1e, 0xbd, 0x0e, + 0xdf, 0x62, 0x3c, 0x55, 0xdc, 0x44, 0x3d, 0x27, 0x0e, 0x3a, 0xbc, 0x6b, 0xd4, + 0x8c, 0x3c, 0xcc, 0xcc, 0x7f, 0xbd, 0xd4, 0x43, 0x3d, 0xbd, 0x5b, 0xac, 0x58, + 0x3c, 0xf0, 0x58, 0xd2, 0xbc, 0x49, 0x1d, 0x38, 0x3d, 0x09, 0x7c, 0x1d, 0xbd, + 0x7a, 0x5b, 0x00, 0xbd, 0xe4, 0x6e, 0xf0, 0x3c, 0x4a, 0xd3, 0x56, 0x3d, 0x28, + 0x12, 0x8d, 0xbc, 0xbe, 0x44, 0x65, 0x3d, 0x0a, 0xd4, 0x16, 0xbc, 0xb0, 0x96, + 0x16, 0xbd, 0xfa, 0xf1, 0x8d, 0x3d, 0x41, 0xd6, 0x74, 0x3d, 0xb5, 0x79, 0x85, + 0xbd, 0x5d, 0xfb, 0x8e, 0xbc, 0xd8, 0x46, 0x86, 0xba, 0x2f, 0xa2, 0x8b, 0xbd, + 0xd8, 0x91, 0x90, 0xbc, 0xf7, 0x73, 0xe6, 0xbc, 0x6c, 0x45, 0xac, 0x3c, 0xe4, + 0xbe, 0x60, 0xbc, 0x4b, 0x18, 0x7f, 0x3d, 0x1f, 0xb0, 0x39, 0x3c, 0xc0, 0x64, + 0x71, 0x3d, 0x2f, 0x99, 0x3e, 0xbd, 0xa8, 0x87, 0x2f, 0x3d, 0xdc, 0xb3, 0x94, + 0xbd, 0xfa, 0xe2, 0x8c, 0xbd, 0x28, 0xb5, 0x2a, 0x3c, 0xa3, 0x13, 0x31, 0xbd, + 0xe6, 0xae, 0xfc, 0xbc, 0x98, 0xb6, 0x68, 0xbd, 0x41, 0xdf, 0x66, 0x3b, 0xde, + 0xc5, 0x2e, 0xbd, 0x24, 0x8c, 0x4c, 0xbd, 0xdb, 0x77, 0xe8, 0x3b, 0xc0, 0x23, + 0xc1, 0xbc, 0x50, 0xcb, 0x98, 0xbc, 0x44, 0x4b, 0x32, 0x3d, 0xd0, 0xd5, 0xf9, + 0xbc, 0x40, 0x77, 0xea, 0x3b, 0xaf, 0x97, 0xbc, 0x3c, 0x9f, 0x07, 0x8d, 0x3d, + 0x26, 0xc4, 0x87, 0xbc, 0x48, 0xff, 0x1b, 0x3d, 0x90, 0x07, 0xc0, 0x3b, 0xa0, + 0xeb, 0x61, 0xbb, 0x61, 0x90, 0x8c, 0x3d, 0x46, 0x0b, 
0x89, 0xbd, 0x61, 0x99, + 0x09, 0xbd, 0x27, 0xb3, 0x3a, 0xbc, 0xad, 0x56, 0xff, 0xbc, 0xa6, 0xaf, 0x7f, + 0x3d, 0x50, 0x1d, 0x09, 0xbd, 0x82, 0xfd, 0xcd, 0xbc, 0x31, 0x6c, 0x4d, 0x3d, + 0x6d, 0xe8, 0x8c, 0x3c, 0x59, 0x5e, 0xb7, 0xbb, 0xa8, 0x14, 0x49, 0x3d, 0x86, + 0xe4, 0x89, 0xbc, 0x41, 0xc7, 0x0c, 0xbd, 0xf5, 0x84, 0x80, 0x3d, 0x31, 0x71, + 0x88, 0x3d, 0x3b, 0xcf, 0x84, 0xbd, 0x4f, 0xc3, 0x89, 0x3d, 0x24, 0x62, 0x21, + 0xbd, 0xb0, 0xc2, 0xdb, 0x3b, 0xf8, 0xc8, 0x46, 0xbd, 0xa5, 0xe0, 0x89, 0x3d, + 0x89, 0x41, 0x29, 0x3c, 0x90, 0xbd, 0xe7, 0x3c, 0x78, 0xc9, 0x42, 0xbc, 0x1f, + 0xd6, 0x82, 0x3d, 0xfb, 0xcd, 0x87, 0xbd, 0x2a, 0xd2, 0x24, 0xbd, 0x86, 0x49, + 0x6d, 0xbd, 0x62, 0x20, 0xc8, 0xba, 0xb0, 0xc4, 0xec, 0xbc, 0xdf, 0x68, 0xb4, + 0x3a, 0xe3, 0x0f, 0xe7, 0x3c, 0x41, 0xd5, 0x2e, 0xbd, 0xd4, 0xd6, 0x7c, 0xbd, + 0xb6, 0xd8, 0x2f, 0x3d, 0x2e, 0x95, 0xf2, 0xbc, 0x7c, 0xa4, 0xd0, 0xbc, 0x84, + 0x63, 0x61, 0x3d, 0xfe, 0x1c, 0x26, 0x3d, 0x29, 0x38, 0x6e, 0x3c, 0xff, 0xb9, + 0x12, 0xbd, 0xbc, 0xc6, 0x8d, 0x3d, 0xe1, 0xf5, 0x94, 0xbd, 0xd6, 0x91, 0x86, + 0xbd, 0x88, 0xb9, 0x58, 0xbc, 0x50, 0x18, 0xb0, 0xbb, 0x95, 0x6f, 0x84, 0x3d, + 0xd1, 0x02, 0x2c, 0xbd, 0xdd, 0xec, 0x00, 0x3d, 0x2c, 0x87, 0x33, 0x3c, 0x83, + 0xae, 0x83, 0xbd, 0xf9, 0xfc, 0xc7, 0x3b, 0x54, 0x47, 0x34, 0xbc, 0xdc, 0xeb, + 0x44, 0xbc, 0xc1, 0x33, 0x1f, 0xbd, 0x2e, 0xa0, 0xe7, 0xbc, 0x18, 0x92, 0x5b, + 0xbc, 0x75, 0xee, 0x48, 0x3d, 0xcf, 0xe5, 0x29, 0x3c, 0xdd, 0xfb, 0xcd, 0xbc, + 0x1e, 0xfe, 0x15, 0xbd, 0xfa, 0x83, 0x24, 0xbd, 0x74, 0xa7, 0x1b, 0x3d, 0x79, + 0x43, 0xf6, 0x3c, 0xc1, 0x09, 0xcc, 0xbb, 0x23, 0xce, 0x51, 0x3d, 0x90, 0xbd, + 0x6d, 0xbd, 0xd3, 0x87, 0xa9, 0x3c, 0xa6, 0x5c, 0x6b, 0x3d, 0x30, 0xbc, 0xd0, + 0xbb, 0x43, 0x24, 0x71, 0xbd, 0xf1, 0xc3, 0x69, 0xbc, 0xcc, 0x77, 0x5d, 0xbd, + 0xf5, 0x11, 0x95, 0xbd, 0x90, 0x17, 0xc7, 0xbc, 0x44, 0x6c, 0x85, 0xbd, 0xeb, + 0x43, 0xd6, 0x3c, 0xe3, 0x8d, 0x8b, 0x3d, 0xbf, 0x68, 0x3d, 0xbd, 0x6d, 0x69, + 0x86, 0xbd, 0xb5, 0x14, 0x8f, 0xbd, 0xe9, 0x70, 0x0c, 0xbc, 0x97, 0x30, 0x78, + 0x3d, 0xd2, 0x1f, 0x57, 0xbd, 0x08, 0xe4, 0x28, 0x3d, 0x34, 0x1f, 0xf3, 0xbc, + 0x18, 0xb7, 0x66, 0xbc, 0x00, 0x60, 0x30, 0x3c, 0xc1, 0x3d, 0x1f, 0xbd, 0x26, + 0x9a, 0x85, 0x3d, 0xc6, 0x32, 0x88, 0xbd, 0x36, 0x33, 0x5c, 0xbd, 0x81, 0xb7, + 0x89, 0xbd, 0x9f, 0x29, 0xeb, 0xbb, 0xe3, 0x50, 0x3d, 0x3d, 0x24, 0x66, 0x88, + 0xbd, 0xcc, 0xc0, 0x0d, 0x3d, 0xd2, 0xa9, 0x92, 0x3c, 0x54, 0x72, 0x02, 0x3d, + 0xd5, 0x3b, 0x90, 0xbb, 0x3d, 0x9f, 0x63, 0xbd, 0xed, 0xbe, 0x18, 0xbd, 0x59, + 0xec, 0x6e, 0x3b, 0x28, 0xf2, 0x29, 0xbc, 0xc7, 0xce, 0xab, 0x3c, 0xf4, 0xc8, + 0x79, 0xbd, 0x7c, 0x71, 0x30, 0x3d, 0x75, 0xbb, 0x80, 0xbc, 0x5c, 0xc6, 0x6b, + 0xbd, 0x61, 0x73, 0x3c, 0x3d, 0x74, 0x82, 0x33, 0xbd, 0xd2, 0x32, 0x79, 0x3c, + 0x9c, 0x80, 0xb6, 0xbb, 0xef, 0xee, 0x5f, 0x3d, 0xf8, 0x07, 0x30, 0xbd, 0xb1, + 0x7f, 0x2f, 0xbd, 0xc2, 0x76, 0x36, 0xbd, 0x9e, 0x38, 0xa3, 0x3c, 0x7c, 0x4e, + 0x47, 0xbc, 0x48, 0xce, 0x1a, 0x3d, 0xfc, 0xcd, 0xc2, 0x3c, 0x65, 0xb0, 0x07, + 0x3d, 0x51, 0x39, 0x1c, 0x3d, 0x27, 0x56, 0x87, 0x3d, 0x63, 0x07, 0xdd, 0x3c, + 0x2b, 0xd5, 0x82, 0x3d, 0xb0, 0x9d, 0x85, 0xbd, 0xc5, 0x43, 0xf0, 0x3c, 0x19, + 0x0c, 0x95, 0x3b, 0x28, 0x64, 0x6b, 0xbd, 0x8e, 0x23, 0x09, 0xbd, 0xfa, 0x58, + 0xfc, 0x3b, 0x40, 0xca, 0x5d, 0x3c, 0xa0, 0xbe, 0x58, 0xbd, 0xb1, 0x3b, 0x91, + 0xbd, 0xd1, 0x73, 0xf0, 0x3a, 0x1d, 0x07, 0x31, 0x3d, 0x7d, 0x80, 0x07, 0x3d, + 0xda, 0x52, 0x44, 0x3c, 0x78, 0x62, 0x58, 0x3c, 0x8d, 0x84, 0x01, 0x3d, 0x66, + 0x36, 0x76, 0xbd, 0x68, 0xd0, 0x03, 0xbc, 0x43, 0x54, 0x56, 0x3c, 0xae, 0xac, + 0x59, 
0x3d, 0x36, 0xce, 0x48, 0xbd, 0xd4, 0xc1, 0x65, 0xbc, 0xd9, 0xee, 0x34, + 0x3c, 0x80, 0x4c, 0x66, 0xba, 0x88, 0xe1, 0x3c, 0x3c, 0xc8, 0xb7, 0x04, 0x3d, + 0x90, 0xdf, 0xdf, 0x3c, 0x20, 0x76, 0x1c, 0x3b, 0xfb, 0x80, 0x1e, 0x3d, 0x7e, + 0xbd, 0x19, 0x3d, 0x1f, 0x28, 0x96, 0xbb, 0x19, 0xa6, 0x3c, 0x3c, 0x3f, 0xc7, + 0xf9, 0xbc, 0x4a, 0xc2, 0x1a, 0xbd, 0xd5, 0xa0, 0x86, 0xbd, 0x3a, 0xc8, 0xd6, + 0x3c, 0xc3, 0x1a, 0x5a, 0x3d, 0x1a, 0x8c, 0x91, 0xbd, 0xd0, 0x10, 0x67, 0x3d, + 0x42, 0x5b, 0x16, 0x3d, 0xa3, 0xd2, 0x5b, 0xbc, 0x6c, 0xa0, 0xb6, 0x3c, 0x65, + 0xe2, 0x1d, 0xbd, 0x9a, 0xdf, 0x0e, 0xbd, 0xc0, 0x74, 0xcf, 0x3b, 0x84, 0xe1, + 0xc1, 0x3c, 0x2a, 0xed, 0x60, 0x3d, 0xe3, 0x10, 0xe4, 0xbc, 0x3f, 0xcc, 0x8b, + 0xbd, 0x95, 0xa5, 0x8b, 0x3d, 0xd8, 0xc3, 0x00, 0xbd, 0x85, 0x56, 0x75, 0x3d, + 0xac, 0x3a, 0x5b, 0x3d, 0x6a, 0x5d, 0xed, 0xbb, 0xbb, 0xd3, 0xd5, 0x3c, 0xac, + 0xb0, 0x3f, 0x3d, 0x70, 0x1a, 0x6b, 0x3c, 0x70, 0xca, 0x28, 0x3c, 0xa2, 0x71, + 0xde, 0xbc, 0x00, 0x22, 0x77, 0x3a, 0x43, 0x45, 0x21, 0xbd, 0x17, 0xa9, 0x34, + 0x3d, 0x4d, 0x49, 0x2d, 0xbd, 0xb5, 0xd6, 0x8b, 0x3d, 0x84, 0xa5, 0xbd, 0xbc, + 0x9d, 0x7f, 0x02, 0xbd, 0x85, 0x08, 0x80, 0xbd, 0xff, 0x2d, 0x8f, 0xbc, 0x04, + 0x5f, 0x3b, 0xbd, 0xba, 0xce, 0x17, 0xbd, 0xf3, 0xfc, 0x80, 0x3d, 0xe1, 0x9c, + 0x8c, 0xbd, 0xaf, 0x1c, 0xc6, 0x3c, 0x77, 0x31, 0x12, 0x3d, 0xde, 0x28, 0x49, + 0xbd, 0x0d, 0xe3, 0x1f, 0xbd, 0x2a, 0x71, 0x30, 0xbc, 0x1e, 0x04, 0x35, 0x3d, + 0x08, 0x0a, 0xad, 0x3b, 0xe9, 0x97, 0x98, 0xbc, 0x26, 0xe3, 0x00, 0x3c, 0xbe, + 0xf9, 0xbb, 0xbc, 0x77, 0x23, 0x34, 0xbd, 0x55, 0x69, 0x61, 0x3d, 0xc4, 0xb9, + 0x8d, 0xbd, 0x5f, 0x82, 0x81, 0x3d, 0x68, 0xff, 0x16, 0xbc, 0x2c, 0xa2, 0x91, + 0xbc, 0x67, 0x62, 0x78, 0xbd, 0x76, 0x32, 0x13, 0x3d, 0x68, 0x26, 0x2b, 0x3d, + 0x1a, 0xbb, 0xdc, 0xbc, 0xae, 0x91, 0x84, 0x3d, 0xc0, 0xfe, 0x8d, 0xbd, 0xfe, + 0x28, 0x88, 0xbc, 0x02, 0x43, 0x0e, 0xbc, 0x0b, 0x35, 0x69, 0xbb, 0xb4, 0xf8, + 0x8b, 0xbd, 0xad, 0x86, 0x6e, 0xbd, 0x5c, 0x92, 0x19, 0xbd, 0x03, 0x18, 0x59, + 0xbd, 0x58, 0x48, 0x55, 0xbc, 0x2e, 0xaf, 0x4d, 0x3d, 0x70, 0x1a, 0x59, 0xbc, + 0x63, 0xf3, 0x3d, 0xbd, 0x97, 0xcd, 0x8f, 0xbd, 0x4b, 0x2b, 0x75, 0x3d, 0x78, + 0xf6, 0x78, 0xbd, 0x40, 0x84, 0x01, 0xbd, 0x04, 0xb6, 0x05, 0xbd, 0x21, 0xa7, + 0xf7, 0x3c, 0x9e, 0x08, 0xc5, 0x3c, 0x3b, 0xde, 0xa8, 0xbc, 0x04, 0x81, 0x85, + 0x3c, 0x7d, 0x36, 0xd2, 0x3c, 0x02, 0xf0, 0xd0, 0xbc, 0xcb, 0xe0, 0x68, 0x3d, + 0xb3, 0x19, 0x89, 0xbd, 0x39, 0xf7, 0x5f, 0x3d, 0x6a, 0x8f, 0x05, 0xbc, 0x7c, + 0xc8, 0x91, 0xbc, 0xec, 0xc4, 0x93, 0x3c, 0xa0, 0x62, 0x3a, 0xbb, 0x59, 0xfc, + 0x1a, 0xbd, 0xc9, 0xcd, 0x95, 0xbd, 0x57, 0xc3, 0x5b, 0xbb, 0x67, 0x2f, 0xe4, + 0x3c, 0x13, 0xcc, 0xa5, 0x3c, 0x1d, 0x6c, 0x39, 0xbc, 0x50, 0x64, 0x83, 0x3c, + 0x50, 0x6d, 0x5b, 0xbc, 0xda, 0x2a, 0xcd, 0x3c, 0x09, 0xb3, 0x96, 0xbd, 0x91, + 0x4f, 0x34, 0x3d, 0x33, 0xd0, 0x17, 0xbd, 0x1d, 0x22, 0x86, 0xbd, 0x9c, 0x1e, + 0x0d, 0xbd, 0xd4, 0x2b, 0x9c, 0xba, 0x67, 0xb5, 0xa7, 0xbc, 0x0f, 0xe2, 0x76, + 0xbd, 0x4b, 0xb9, 0x71, 0x3d, 0x69, 0xa9, 0x9c, 0xbc, 0x30, 0x44, 0x47, 0x3d, + 0xf0, 0xdc, 0x95, 0x3c, 0xe2, 0x1d, 0x22, 0xbd, 0xaa, 0xb5, 0x58, 0xbd, 0x9d, + 0x59, 0x7d, 0xbd, 0xa4, 0x92, 0x95, 0x3c, 0x40, 0xaa, 0x8d, 0xbd, 0xf0, 0x3e, + 0xb4, 0x3c, 0xc2, 0x03, 0x2a, 0xbd, 0xb0, 0xc5, 0x29, 0xbd, 0xc0, 0x7c, 0x42, + 0xbd, 0xea, 0x99, 0x7e, 0x3d, 0xd6, 0xbc, 0x15, 0x3d, 0xb9, 0xda, 0x37, 0xbd, + 0xd0, 0x21, 0x9e, 0x3c, 0x79, 0x2e, 0xab, 0xbb, 0x73, 0x17, 0xcd, 0xbc, 0x7c, + 0x01, 0xe3, 0x3c, 0xb7, 0xb8, 0xf2, 0x3c, 0x11, 0x4b, 0x45, 0x3d, 0x87, 0x86, + 0x9a, 0x3c, 0x2c, 0x70, 0x57, 0xbd, 
0x55, 0xdf, 0x1d, 0xbd, 0xf5, 0x86, 0xa6, + 0xbc, 0x21, 0x96, 0x49, 0xbd, 0x36, 0x4c, 0x75, 0xbd, 0xc9, 0x1c, 0xa0, 0x3c, + 0x5d, 0xba, 0x26, 0x3d, 0xd6, 0x56, 0x02, 0x3d, 0x69, 0x90, 0x12, 0xbc, 0x08, + 0x5b, 0x0f, 0xbd, 0x81, 0xce, 0x92, 0xbc, 0x3a, 0xb8, 0x5f, 0x3d, 0x7a, 0xaf, + 0xe7, 0x3c, 0x4d, 0x4b, 0x60, 0xbc, 0x78, 0xc0, 0x6c, 0xbd, 0x85, 0x6f, 0xe7, + 0x3c, 0xaa, 0xc1, 0xb3, 0x3c, 0x8b, 0xe4, 0xb7, 0x3c, 0xdd, 0xd0, 0x39, 0x3d, + 0x48, 0x49, 0x1b, 0x3d, 0xe2, 0x74, 0x28, 0xbd, 0x86, 0x4a, 0x47, 0x3d, 0x30, + 0x77, 0xad, 0x3b, 0xe0, 0xa8, 0x0e, 0xbc, 0xec, 0x36, 0xd1, 0x3c, 0xe3, 0x01, + 0x8f, 0xbd, 0x56, 0x6c, 0x34, 0xbd, 0x8a, 0x99, 0x20, 0xbb, 0xb1, 0x89, 0x12, + 0x3d, 0xea, 0x43, 0x39, 0xbd, 0x26, 0x16, 0xd2, 0x3c, 0xe2, 0x88, 0xc8, 0x3c, + 0x63, 0x15, 0xa0, 0x3c, 0x8d, 0x95, 0x3a, 0x3d, 0x86, 0x69, 0x26, 0xbd, 0x4c, + 0x38, 0xdb, 0x3b, 0xe0, 0xfa, 0x49, 0x3d, 0x62, 0xdf, 0xb4, 0xbc, 0x6a, 0xe4, + 0x89, 0xbc, 0x63, 0x50, 0x6d, 0x3d, 0xfa, 0x35, 0x46, 0xbd, 0xcb, 0xcb, 0x8c, + 0xbc, 0x46, 0x94, 0x66, 0x3d, 0xdd, 0xf8, 0xa2, 0xbc, 0x00, 0x34, 0x8c, 0x3d, + 0x0a, 0xa1, 0x05, 0x3d, 0x73, 0x92, 0x91, 0xbd, 0x64, 0x3e, 0xf4, 0xbc, 0xcd, + 0x5a, 0xa4, 0xbc, 0xe6, 0xce, 0x4b, 0x3d, 0x68, 0xb0, 0xcf, 0xbc, 0x38, 0xd3, + 0xe2, 0x3b, 0xfd, 0x03, 0x38, 0xbd, 0x11, 0xc0, 0x92, 0xbd, 0xa8, 0x82, 0x50, + 0x3d, 0x2a, 0x9a, 0xaf, 0xbc, 0x0e, 0xea, 0x7b, 0x3d, 0x11, 0xf4, 0x95, 0xbc, + 0x34, 0xed, 0xb6, 0x3c, 0x2b, 0x26, 0x6f, 0xbd, 0x15, 0xad, 0x7c, 0x3d, 0x19, + 0xc6, 0xed, 0x3c, 0x00, 0xf8, 0x81, 0xbd, 0x74, 0x82, 0x63, 0xbd, 0x62, 0x76, + 0x53, 0xbd, 0x48, 0x4f, 0x78, 0x3d, 0x76, 0x0e, 0x5c, 0xbb, 0x24, 0x30, 0x30, + 0xbd, 0x86, 0x0a, 0x14, 0x3d, 0x08, 0x29, 0xb3, 0xbc, 0xef, 0x7c, 0x2a, 0xbd, + 0x90, 0xb8, 0x09, 0x3d, 0x47, 0x45, 0x66, 0xbc, 0x30, 0x23, 0xb7, 0xbc, 0x8f, + 0xd2, 0x5e, 0x3d, 0x31, 0x72, 0x33, 0x3d, 0x26, 0xdc, 0x88, 0xbd, 0xeb, 0x0b, + 0x24, 0xbc, 0x14, 0x3c, 0xe9, 0xbc, 0x38, 0xc6, 0xd3, 0x3c, 0x55, 0xd6, 0x09, + 0xbd, 0xe5, 0xf7, 0x21, 0xbb, 0x7d, 0x03, 0x0d, 0x3d, 0xe9, 0x91, 0xd6, 0xbb, + 0x00, 0x90, 0xe4, 0x3a, 0x21, 0x2c, 0x1a, 0x3d, 0x0c, 0xe1, 0x82, 0x3c, 0x0a, + 0xb6, 0x38, 0x3d, 0x6c, 0x03, 0xe9, 0x3c, 0x83, 0x86, 0x05, 0x3d, 0x01, 0x6e, + 0x86, 0x3d, 0x99, 0xc2, 0x47, 0xbd, 0x27, 0x07, 0x57, 0x3d, 0xed, 0xd2, 0x59, + 0x3d, 0x0f, 0xa1, 0x0a, 0xbc, 0x12, 0x62, 0x6c, 0x3d, 0x16, 0x50, 0xf8, 0x3b, + 0x00, 0xf3, 0xdc, 0x3c, 0x5c, 0x4e, 0xa6, 0xbc, 0xfa, 0x73, 0x42, 0x3c, 0xd2, + 0x38, 0x8a, 0xbd, 0x35, 0x94, 0x8d, 0xbc, 0x69, 0x22, 0x3e, 0xbd, 0x83, 0xec, + 0x6f, 0xbc, 0xb6, 0x37, 0xb4, 0x3c, 0xf1, 0xa7, 0x83, 0x3d, 0x62, 0xbc, 0x82, + 0x3d, 0x88, 0x5d, 0xb8, 0xbc, 0xdd, 0x4d, 0x96, 0xbc, 0xaa, 0x38, 0x23, 0xbd, + 0x88, 0x3f, 0x4d, 0xbc, 0xc5, 0x2d, 0xfc, 0x3c, 0x78, 0x63, 0x20, 0x3d, 0xe5, + 0x87, 0x88, 0x3d, 0x08, 0xed, 0x77, 0xbc, 0x38, 0xef, 0x85, 0xbc, 0x19, 0xc5, + 0x90, 0x3d, 0xba, 0xc7, 0x4e, 0x3d, 0xe4, 0xc2, 0xd6, 0x3c, 0xac, 0x97, 0x22, + 0xbc, 0xa4, 0x4d, 0x55, 0xbd, 0x02, 0x71, 0x8b, 0xbd, 0xce, 0x55, 0x86, 0x3d, + 0xf9, 0x00, 0x9c, 0xbc, 0xbc, 0x84, 0x51, 0x3d, 0x3c, 0xaa, 0x21, 0xbd, 0xb3, + 0x0f, 0x43, 0xbd, 0x15, 0x2e, 0x90, 0xbd, 0xa9, 0x5c, 0x7a, 0x3d, 0x11, 0x1e, + 0x4b, 0x3d, 0xc7, 0x35, 0xc9, 0xbc, 0x86, 0x61, 0x77, 0xbd, 0x5c, 0xbb, 0x21, + 0xbc, 0x39, 0x3c, 0x6d, 0x3d, 0xaa, 0xde, 0xdd, 0x3a, 0xe5, 0xad, 0x0b, 0xbd, + 0xd5, 0x2c, 0x8f, 0xbd, 0x9b, 0xd2, 0x40, 0xbc, 0xae, 0xd1, 0x27, 0x3d, 0xa4, + 0x43, 0x61, 0x3c, 0x96, 0x2f, 0x26, 0xbd, 0x4c, 0xdb, 0x50, 0xbd, 0xd0, 0xee, + 0x55, 0xbc, 0xa9, 0xdf, 0x62, 0x3d, 0xa9, 0xc7, 0x14, 0xbd, 0x02, 
0x65, 0x41, + 0x3b, 0xdc, 0x7c, 0x20, 0x3c, 0xb5, 0xb9, 0x89, 0x3d, 0x43, 0xc8, 0x8f, 0xbd, + 0xe5, 0x6b, 0x3e, 0x3c, 0xcb, 0x96, 0x8d, 0xbd, 0xe8, 0x9b, 0x7d, 0xbd, 0xad, + 0x41, 0x91, 0x3d, 0x84, 0x7b, 0xc2, 0x3c, 0xe9, 0xf8, 0x8c, 0x3c, 0x6d, 0x06, + 0xf1, 0xbb, 0xac, 0xcc, 0x43, 0x3d, 0x11, 0xd2, 0xe3, 0x3c, 0x69, 0xb6, 0x76, + 0xbc, 0x19, 0x3b, 0x71, 0xbd, 0x82, 0x8a, 0xb9, 0xbc, 0x28, 0x56, 0x3a, 0x3d, + 0xf6, 0x2b, 0x3c, 0x3d, 0x0f, 0x6e, 0xe1, 0xbb, 0x96, 0x11, 0x84, 0xbc, 0xae, + 0xf7, 0x81, 0x3d, 0xd2, 0xd1, 0x80, 0x3d, 0x97, 0xc3, 0xe6, 0xbc, 0x89, 0xe2, + 0x57, 0x3c, 0x3d, 0x6e, 0x8e, 0xbc, 0xca, 0x02, 0x4d, 0xbd, 0x62, 0x3c, 0xc1, + 0xbc, 0x16, 0x10, 0xed, 0xba, 0x3f, 0xe1, 0xef, 0x3c, 0x0a, 0x5c, 0xab, 0xbc, + 0x21, 0xad, 0xd1, 0xbb, 0xbc, 0xfe, 0x32, 0x3c, 0xac, 0x6c, 0x71, 0xbd, 0x15, + 0x98, 0x14, 0x3d, 0xb6, 0xee, 0x3a, 0x3c, 0x35, 0x4c, 0x87, 0x3d, 0xb6, 0xcd, + 0x4c, 0x3d, 0x10, 0xf7, 0xcc, 0x3b, 0xdb, 0x8a, 0x19, 0xbd, 0x00, 0x38, 0xdb, + 0xb8, 0xb3, 0x1b, 0x8e, 0xbd, 0x50, 0xa8, 0x41, 0xbd, 0x64, 0x53, 0x85, 0xbd, + 0x46, 0xcf, 0xcd, 0xbb, 0x65, 0xaf, 0xa4, 0x3c, 0x78, 0x82, 0x22, 0xbd, 0xb1, + 0xb2, 0x19, 0xbd, 0xaa, 0x2b, 0xe5, 0xbc, 0xb8, 0x9c, 0x3d, 0x3d, 0x30, 0x82, + 0x8c, 0x3c, 0xd9, 0x2c, 0x89, 0xbd, 0x27, 0x33, 0x8f, 0x3d, 0x20, 0x09, 0x87, + 0x3d, 0x50, 0x15, 0x05, 0xbd, 0x4b, 0xc1, 0x96, 0xbd, 0x82, 0x2a, 0x33, 0x3d, + 0xc1, 0x9b, 0x6c, 0xbd, 0xac, 0x51, 0x0c, 0xbd, 0xd7, 0xbc, 0x59, 0xbd, 0x69, + 0x2b, 0x37, 0x3c, 0xc0, 0xef, 0x26, 0xbd, 0xc8, 0xba, 0x59, 0x3c, 0xda, 0x1b, + 0x18, 0xbd, 0x11, 0xfb, 0x8b, 0x3d, 0xbf, 0xc8, 0x3d, 0xbd, 0x52, 0x1b, 0x00, + 0x3d, 0xe8, 0x9d, 0x4d, 0xba, 0xe4, 0x9d, 0x44, 0x3d, 0x87, 0x63, 0x06, 0xbd, + 0x76, 0xc3, 0x83, 0x3d, 0x32, 0xe3, 0x84, 0xbd, 0x5a, 0x34, 0x11, 0x3d, 0xe0, + 0xb2, 0x0e, 0xbd, 0xa8, 0x02, 0x8a, 0xbd, 0x9c, 0x92, 0x10, 0x3d, 0x47, 0xfd, + 0x90, 0xbd, 0x24, 0x45, 0x3c, 0x3d, 0x67, 0x62, 0x96, 0xbd, 0xbb, 0x91, 0x79, + 0xbd, 0x80, 0x99, 0x5b, 0xbd, 0x93, 0x7f, 0x83, 0xbd, 0x75, 0x82, 0x10, 0xbd, + 0x07, 0xb0, 0xa7, 0xbb, 0x5b, 0x41, 0x66, 0xbd, 0x82, 0xeb, 0x7a, 0xbc, 0x52, + 0xca, 0x57, 0xbd, 0x7e, 0xe3, 0x66, 0x3c, 0xab, 0x22, 0x68, 0xbd, 0x51, 0x4b, + 0xa9, 0xbc, 0x5e, 0x13, 0xa7, 0xbc, 0xe3, 0x6b, 0x88, 0xbb, 0x80, 0x4c, 0x02, + 0x3d, 0xf3, 0x3c, 0x59, 0xbd, 0xb2, 0x10, 0x7e, 0x3d, 0x1a, 0x9d, 0x13, 0xbd, + 0x8d, 0xd0, 0x5b, 0x3d, 0xca, 0x7a, 0x74, 0x3d, 0x16, 0x53, 0x4b, 0x3d, 0xc9, + 0x0a, 0x89, 0xbd, 0x44, 0x7e, 0x1b, 0xbc, 0x11, 0xca, 0xb2, 0xbc, 0x09, 0xe0, + 0x27, 0xbd, 0xe4, 0xed, 0xfb, 0x3c, 0xe4, 0x1a, 0xf9, 0xbc, 0x50, 0x47, 0x2e, + 0x3d, 0x1b, 0xed, 0x4e, 0x3d, 0x6d, 0x7c, 0x81, 0xbd, 0x72, 0x2a, 0xdc, 0xbc, + 0x6f, 0xa7, 0x59, 0x3d, 0xc0, 0xbd, 0x1e, 0xbc, 0xb2, 0xaf, 0xb9, 0xbc, 0x07, + 0x39, 0xba, 0xbc, 0xf4, 0x63, 0x46, 0xbd, 0x45, 0x7b, 0x1a, 0x3d, 0x79, 0xe9, + 0xf7, 0x3c, 0x9e, 0xba, 0xf0, 0xbc, 0xc1, 0x09, 0xbb, 0x3c, 0x0e, 0x21, 0x52, + 0xbc, 0xed, 0x78, 0x43, 0x3b, 0x73, 0x07, 0x62, 0x3d, 0x71, 0x92, 0x84, 0x3d, + 0x7b, 0x59, 0xb2, 0xbc, 0xe0, 0xba, 0x34, 0xbc, 0x0c, 0x23, 0x14, 0xbd, 0x93, + 0x93, 0x1f, 0xbd, 0xb7, 0x20, 0x6b, 0xbd, 0x8e, 0x60, 0x8c, 0xbd, 0x00, 0xe9, + 0x8c, 0x3d, 0xdf, 0xb4, 0xe1, 0xbb, 0xa0, 0x1a, 0xbf, 0xbc, 0xf6, 0x4c, 0x80, + 0x3c, 0x74, 0xeb, 0x18, 0x3d, 0x28, 0x64, 0x8c, 0x3c, 0xba, 0xbd, 0xd3, 0xbc, + 0x56, 0xc0, 0x6f, 0x3d, 0x09, 0x02, 0x88, 0xbd, 0x02, 0xd5, 0x58, 0x3d, 0xc1, + 0x57, 0x31, 0x3d, 0xfc, 0x52, 0x48, 0x3d, 0x61, 0xdc, 0x64, 0xbd, 0xa7, 0xc3, + 0x2b, 0x3d, 0x3b, 0xea, 0x13, 0xbc, 0x0e, 0xac, 0x3c, 0xbd, 0x7e, 0x92, 0x86, + 0x3c, 0xbf, 0x14, 
0x29, 0xbc, 0xf3, 0x91, 0x7f, 0x3d, 0xf1, 0x9a, 0xac, 0x3c, + 0xf8, 0xf5, 0x76, 0x3c, 0xa2, 0x0f, 0x86, 0xbd, 0xc3, 0xeb, 0xb7, 0x3a, 0xff, + 0x56, 0x6c, 0x3d, 0x1c, 0xcc, 0x5a, 0xbd, 0x97, 0x3f, 0x78, 0x3d, 0x92, 0xea, + 0x9d, 0xbc, 0xbc, 0x51, 0x6a, 0x3d, 0xc5, 0x44, 0x65, 0x3c, 0xbc, 0x66, 0x30, + 0x3d, 0x70, 0xe2, 0x26, 0xbd, 0x2e, 0xbe, 0x19, 0x3d, 0x5e, 0xf3, 0x82, 0x3d, + 0x32, 0x2f, 0x86, 0xbd, 0x53, 0x73, 0x81, 0x3d, 0x86, 0xef, 0xa2, 0xbc, 0xdb, + 0xda, 0x62, 0xbd, 0x82, 0x4e, 0xd3, 0xbc, 0x80, 0xed, 0x93, 0xba, 0x50, 0xc2, + 0xd6, 0x3b, 0x82, 0x22, 0xf1, 0xbc, 0x49, 0xd7, 0x7a, 0xbc, 0xe9, 0x00, 0x85, + 0x3d, 0xb7, 0x12, 0x4c, 0xbd, 0x90, 0x25, 0x08, 0xb9, 0x2e, 0x76, 0xcb, 0xbc, + 0x47, 0x11, 0x97, 0xbd, 0x06, 0x96, 0x2f, 0x3d, 0x44, 0x62, 0x65, 0x3d, 0xe7, + 0xa5, 0x1f, 0x3d, 0x2e, 0x9e, 0xbf, 0xbc, 0x00, 0xd8, 0x6c, 0xbc, 0x20, 0xd1, + 0x44, 0xbb, 0x19, 0x61, 0x32, 0x3c, 0xf4, 0x7a, 0x30, 0x3d, 0x11, 0x7b, 0xe4, + 0xbc, 0x6e, 0x1c, 0x50, 0x3b, 0x9b, 0x64, 0x64, 0xbd, 0x89, 0x52, 0x1f, 0x3d, + 0x65, 0x20, 0x2c, 0x3d, 0xb9, 0x45, 0xd7, 0x3c, 0xe8, 0x37, 0x8e, 0x3d, 0x40, + 0x5e, 0x50, 0x3c, 0x7a, 0x66, 0x68, 0xbd, 0x45, 0x1b, 0x31, 0xbd, 0xcb, 0x31, + 0x47, 0x3d, 0x2f, 0x4a, 0xb3, 0x3c, 0x97, 0x3d, 0xbc, 0xbc, 0x55, 0x24, 0x80, + 0xbd, 0x85, 0x56, 0x69, 0xbc, 0x0e, 0x0a, 0x34, 0x3d, 0xec, 0xe8, 0x54, 0xbd, + 0xeb, 0x92, 0x6d, 0xbd, 0xe2, 0x61, 0x41, 0x3c, 0xf3, 0x3c, 0x93, 0xbd, 0x10, + 0xea, 0xbd, 0xb7, 0x42, 0xec, 0x3b, 0xbd, 0x66, 0xe6, 0x80, 0xbd, 0x84, 0xd9, + 0x85, 0x3d, 0x2c, 0xd8, 0xac, 0x3c, 0x72, 0x8e, 0x48, 0x3c, 0x11, 0xa8, 0x9c, + 0xbc, 0x08, 0x31, 0x39, 0x3d, 0x0f, 0x3c, 0x7c, 0x3d, 0x58, 0xba, 0x25, 0x3d, + 0xce, 0x5f, 0x27, 0x3c, 0x7c, 0x7b, 0x65, 0x3d, 0x96, 0xd6, 0x1e, 0x3d, 0x48, + 0x03, 0x73, 0xbd, 0x84, 0x7a, 0x26, 0xbd, 0x92, 0x82, 0x72, 0xbd, 0xeb, 0x8a, + 0x0c, 0xbd, 0x84, 0xe7, 0x5f, 0xbd, 0x0b, 0x83, 0xfc, 0x3c, 0xfb, 0xed, 0x8e, + 0xbd, 0x52, 0xe2, 0x65, 0x3d, 0xd1, 0xa1, 0x4e, 0xbb, 0x5f, 0x41, 0xce, 0xbc, + 0x4b, 0x3d, 0x15, 0xbb, 0x20, 0xc8, 0x90, 0xbd, 0x29, 0xfb, 0x28, 0xbd, 0x04, + 0x06, 0x8a, 0xbd, 0x8a, 0x65, 0x30, 0x3d, 0x00, 0x49, 0x93, 0x3a, 0x6e, 0xb0, + 0x61, 0x3d, 0x94, 0xcc, 0x87, 0xbc, 0x10, 0x13, 0x3a, 0x3d, 0x5a, 0x7e, 0x7f, + 0xbd, 0x4c, 0x1f, 0xd7, 0xbc, 0x82, 0xb3, 0x1e, 0x3d, 0x7e, 0xca, 0x00, 0xbc, + 0xe7, 0x69, 0xe4, 0xbb, 0xd5, 0xad, 0x1f, 0x3d, 0xb6, 0x02, 0x72, 0x3d, 0x4b, + 0x4f, 0x91, 0xbc, 0x69, 0xd1, 0xd2, 0xbc, 0xf4, 0x42, 0xce, 0x3c, 0xf9, 0x95, + 0x8f, 0x3d, 0x5f, 0xd1, 0x52, 0x3c, 0xec, 0xd5, 0x67, 0x3d, 0x79, 0x25, 0x84, + 0xba, 0xf3, 0x43, 0x5f, 0x3d, 0x39, 0xdc, 0x2b, 0x3d, 0xc6, 0x40, 0x67, 0xbd, + 0xbb, 0xfa, 0x02, 0xbd, 0xf6, 0x13, 0x31, 0xbc, 0x1a, 0x8a, 0x5b, 0x3d, 0x28, + 0x8c, 0x3d, 0xba, 0xbd, 0x41, 0x46, 0x3d, 0xc8, 0xb7, 0x80, 0xbb, 0xd7, 0xc5, + 0x71, 0x3b, 0x2a, 0x9d, 0x51, 0xbd, 0xfb, 0xe8, 0x66, 0xbd, 0x49, 0x55, 0xad, + 0xbc, 0x80, 0x74, 0x36, 0xbd, 0x00, 0x48, 0xc7, 0xbc, 0xec, 0x9e, 0xf8, 0x3c, + 0x2d, 0x31, 0x7e, 0x3d, 0x5d, 0xdd, 0x94, 0xbd, 0xfd, 0xce, 0x57, 0x3d, 0xe2, + 0x28, 0x0b, 0xbc, 0x00, 0xec, 0x38, 0x3d, 0x88, 0x2f, 0xc9, 0xbc, 0xe8, 0x5d, + 0x69, 0x3d, 0xd8, 0x1a, 0x04, 0xbc, 0xa5, 0x91, 0x78, 0x3d, 0x4f, 0x30, 0x06, + 0xbc, 0xdf, 0x59, 0x51, 0x3d, 0x00, 0xb6, 0x8f, 0x3a, 0x9f, 0x7e, 0x76, 0xbd, + 0x66, 0xc5, 0x1d, 0x3d, 0x99, 0x26, 0x91, 0xbd, 0x82, 0x51, 0x8e, 0xbd, 0xf6, + 0xf9, 0x81, 0xbc, 0x60, 0x4a, 0x9d, 0x3c, 0x40, 0xfa, 0xf8, 0xbb, 0x96, 0x7a, + 0xf4, 0xbb, 0x8d, 0xfb, 0x02, 0xbd, 0xf0, 0xf1, 0xa8, 0x3c, 0xc9, 0xa7, 0x38, + 0xbd, 0x85, 0xc8, 0x4b, 0xbc, 0xc8, 0x56, 0x13, 
0x3d, 0x61, 0x4d, 0x88, 0xbd, + 0x4e, 0xe1, 0x42, 0x3d, 0xec, 0x20, 0x7c, 0xbc, 0x49, 0x1c, 0x91, 0x3d, 0x40, + 0xea, 0x8d, 0xbd, 0x90, 0xa9, 0x5b, 0xbd, 0xe1, 0x98, 0x8e, 0xbd, 0x2f, 0x06, + 0xed, 0xbc, 0xa9, 0xa1, 0xe0, 0x3c, 0x54, 0xa1, 0x76, 0xbd, 0x21, 0x88, 0x70, + 0xbd, 0x16, 0x25, 0x23, 0xbd, 0xb6, 0xdf, 0x4f, 0x3d, 0xaf, 0x39, 0x57, 0x3d, + 0x3f, 0xfa, 0x2a, 0xbd, 0xda, 0x39, 0xcf, 0x3c, 0xf6, 0x8b, 0x5e, 0x3d, 0x49, + 0x9e, 0xec, 0xbc, 0x5c, 0x6b, 0x7f, 0x3d, 0x38, 0xf8, 0x8a, 0xbc, 0x15, 0xc8, + 0x8a, 0xbd, 0xc9, 0xb5, 0x3f, 0x3d, 0x1c, 0xcd, 0x97, 0xbd, 0x3c, 0xa4, 0xb0, + 0xba, 0x85, 0x05, 0x18, 0xbc, 0x0b, 0xf9, 0x81, 0xbd, 0xa7, 0x64, 0x84, 0xbc, + 0x17, 0xa4, 0x86, 0x3d, 0x74, 0xbc, 0x6d, 0xbd, 0xbe, 0xaa, 0xe0, 0x3c, 0x70, + 0x71, 0x01, 0x3d, 0x34, 0x7c, 0x3b, 0x3d, 0xf7, 0xe5, 0x4a, 0x3d, 0x0b, 0x8a, + 0xe2, 0x3c, 0x3a, 0xce, 0x8c, 0xbd, 0xc3, 0x45, 0x17, 0xbc, 0x06, 0x14, 0x40, + 0xbd, 0xc8, 0x4e, 0x2a, 0x3d, 0x1e, 0x87, 0x38, 0x3d, 0x12, 0xe6, 0x8e, 0x3d, + 0x5d, 0x26, 0x24, 0xbc, 0x96, 0x16, 0x0e, 0xbb, 0xbd, 0x7b, 0xe7, 0xbb, 0xee, + 0xf1, 0x86, 0xbc, 0x21, 0x44, 0xe1, 0xba, 0x34, 0xc7, 0x76, 0xbd, 0x84, 0x41, + 0x0f, 0xba, 0x79, 0x2a, 0x77, 0x3d, 0xe0, 0x52, 0xce, 0x3c, 0xd3, 0xbd, 0x0c, + 0x3d, 0xff, 0x57, 0x8b, 0x3d, 0xc6, 0x60, 0xed, 0x3b, 0xfc, 0x72, 0x7f, 0xbd, + 0x18, 0xaa, 0x20, 0x3c, 0xcd, 0x28, 0x0d, 0x3d, 0x18, 0xf7, 0xdb, 0x3a, 0xd6, + 0x93, 0x6a, 0x3d, 0x46, 0x48, 0x55, 0xbd, 0x01, 0x2f, 0x7c, 0x3d, 0x75, 0x2d, + 0x80, 0x3c, 0x4c, 0x22, 0xd0, 0x3c, 0x17, 0x6d, 0x8b, 0xbb, 0x34, 0x25, 0xec, + 0xbc, 0x04, 0x8e, 0x56, 0x3d, 0xd8, 0xab, 0x88, 0x3d, 0x20, 0x51, 0x88, 0xbc, + 0x71, 0xdb, 0xd4, 0x3c, 0x41, 0xe5, 0x03, 0xbd, 0x28, 0x8d, 0x0c, 0x3c, 0xa1, + 0xe2, 0x7d, 0xbd, 0x10, 0xb2, 0xcd, 0x3c, 0x3b, 0xa9, 0xdf, 0xbc, 0x2d, 0x71, + 0x73, 0x3d, 0xfa, 0xcb, 0xd3, 0x3c, 0xb4, 0x04, 0x10, 0xbb, 0xca, 0xec, 0x8c, + 0xbd, 0xd1, 0x28, 0x9a, 0x3c, 0x0f, 0x12, 0x2f, 0x3d, 0x93, 0x67, 0x2a, 0x3d, + 0x94, 0x98, 0xb7, 0x3c, 0x8e, 0x0f, 0xae, 0xbc, 0xc6, 0x7c, 0xd9, 0x3c, 0xa0, + 0x4d, 0x3b, 0xbb, 0x20, 0xf7, 0xd5, 0x3c, 0x7b, 0xa2, 0x72, 0xbd, 0xc5, 0xb9, + 0xbd, 0x3c, 0x59, 0x61, 0x1e, 0x3d, 0x8b, 0x95, 0x8c, 0xbd, 0xbe, 0xbf, 0x9b, + 0xbc, 0x0f, 0x63, 0x7b, 0x3d, 0x92, 0x1a, 0x66, 0x3c, 0x4f, 0xef, 0xa0, 0x38, + 0x8c, 0x24, 0xd9, 0xbc, 0x7d, 0xfa, 0xf8, 0xbc, 0xde, 0xe7, 0x85, 0x3d, 0xa2, + 0xd6, 0x13, 0xbd, 0x5e, 0x38, 0x3d, 0xbd, 0xe7, 0x7e, 0xb0, 0x3d, 0xc5, 0x86, + 0xba, 0xbc, 0x49, 0x12, 0x93, 0xbd, 0x8e, 0x9e, 0xea, 0x3d, 0x48, 0x93, 0x84, + 0xbd, 0x33, 0x48, 0xc7, 0xbc, 0x23, 0x1f, 0x5f, 0x3d, 0x51, 0x20, 0xb5, 0xbb, + 0x93, 0xfa, 0x90, 0x3d, 0x99, 0xe1, 0x31, 0xbd, 0x82, 0x3e, 0x89, 0xbd, 0x99, + 0x5e, 0xe0, 0xbc, 0x0c, 0xc2, 0x03, 0x3d, 0xe2, 0x69, 0xb2, 0x3c, 0x3d, 0xdb, + 0x6e, 0xbd, 0x37, 0xd2, 0x36, 0x3c, 0x89, 0x66, 0x1e, 0xbd, 0xeb, 0x8a, 0x88, + 0x3d, 0x1a, 0x34, 0x3d, 0x3d, 0x84, 0x3a, 0x24, 0x3d, 0x2f, 0xd2, 0x78, 0xbd, + 0x45, 0x13, 0x82, 0x3d, 0x70, 0x07, 0x94, 0x3d, 0xf9, 0xc5, 0x7f, 0xbd, 0x40, + 0x1b, 0x04, 0xbd, 0x74, 0x6f, 0x3a, 0x3d, 0xa0, 0x7d, 0xf8, 0xbc, 0x7e, 0x95, + 0x61, 0x3d, 0xc0, 0x56, 0x5d, 0x3b, 0x16, 0xa4, 0x06, 0x3d, 0x4b, 0x46, 0xbf, + 0xbd, 0x64, 0x97, 0xe8, 0xbc, 0x79, 0xbd, 0x75, 0x3a, 0x50, 0xb6, 0x6a, 0x3c, + 0x7b, 0xcc, 0x29, 0x3c, 0xa8, 0x8f, 0x17, 0x3d, 0xf0, 0xf6, 0xbc, 0x3b, 0x48, + 0x26, 0x78, 0xbd, 0x96, 0x9b, 0xe4, 0x3b, 0x87, 0xe5, 0x70, 0x3c, 0x88, 0xf2, + 0xac, 0xbb, 0x79, 0x75, 0x05, 0x3c, 0x06, 0x38, 0xa5, 0x3d, 0x8b, 0x4e, 0x0a, + 0x3d, 0xf9, 0x2d, 0x95, 0x3d, 0x08, 0xca, 0x7f, 0x3d, 0xc7, 0x5e, 0x1c, 0x3d, + 
0xf2, 0xbc, 0x57, 0xbc, 0xc6, 0xaf, 0x5a, 0xbd, 0x7f, 0xc5, 0xc7, 0x3c, 0x69, + 0x5c, 0x00, 0x3c, 0x69, 0xaf, 0x8a, 0x3d, 0x60, 0x07, 0x01, 0x3d, 0xc3, 0x8f, + 0xff, 0x3a, 0xd5, 0x44, 0x1d, 0x3d, 0x66, 0x63, 0x2a, 0xbd, 0xe9, 0xd3, 0x9a, + 0xbd, 0x50, 0xc0, 0x0a, 0xbd, 0x32, 0x2d, 0xc6, 0xbc, 0xf0, 0xb1, 0xd4, 0xbb, + 0x48, 0xcc, 0xdc, 0x3a, 0xcd, 0x33, 0x6f, 0x3d, 0xea, 0x34, 0x95, 0xbd, 0xb8, + 0x4b, 0x2f, 0xbc, 0xe0, 0xa1, 0x0f, 0xbc, 0x0f, 0xee, 0x01, 0x3c, 0x5e, 0x3d, + 0x35, 0x3d, 0x6e, 0x51, 0x81, 0xbd, 0xfa, 0x8d, 0x8b, 0x3c, 0x51, 0xc5, 0x0a, + 0x3d, 0x8a, 0xa8, 0xc4, 0xbc, 0x66, 0x86, 0x19, 0xbd, 0x50, 0x08, 0x8e, 0x3d, + 0x22, 0x74, 0xdd, 0x3b, 0xdb, 0xf4, 0xea, 0x3a, 0xa1, 0x2d, 0x68, 0x3d, 0x7e, + 0x82, 0xc6, 0x3d, 0xe6, 0x89, 0x16, 0xbd, 0xe2, 0x72, 0x78, 0xbd, 0x25, 0xe0, + 0x82, 0xbd, 0xc2, 0x61, 0x66, 0x3c, 0xb2, 0x57, 0x66, 0x3d, 0x47, 0xa3, 0x40, + 0xbc, 0xf7, 0x00, 0x3e, 0xbd, 0x78, 0x7e, 0x42, 0x3d, 0xc3, 0x09, 0x83, 0x3d, + 0x1d, 0xac, 0x09, 0x3d, 0x37, 0xc0, 0xd7, 0x3b, 0xae, 0xbb, 0x34, 0xbd, 0x12, + 0x34, 0x95, 0x3d, 0xf8, 0x3f, 0x20, 0x3d, 0xa8, 0x30, 0x0b, 0xbd, 0x09, 0x71, + 0x02, 0xbd, 0xb7, 0xbc, 0x80, 0x3d, 0x9e, 0x24, 0x48, 0x3d, 0xbb, 0xe7, 0xa6, + 0x3d, 0x59, 0xd4, 0x28, 0xbd, 0x98, 0x85, 0x14, 0xbc, 0x25, 0xbe, 0xae, 0x3c, + 0x1b, 0x82, 0x85, 0x3c, 0x6c, 0x23, 0xc3, 0x3c, 0x7a, 0xe2, 0x03, 0xbd, 0x75, + 0x65, 0x3a, 0x3d, 0x9e, 0x34, 0x76, 0x3b, 0xe1, 0x36, 0x05, 0x3d, 0xd6, 0x9a, + 0x37, 0xbd, 0x66, 0x1c, 0x99, 0x3c, 0x9d, 0x65, 0x2a, 0xbd, 0xc3, 0xdd, 0x60, + 0xbc, 0x6c, 0xa8, 0x06, 0xbd, 0xb8, 0xb4, 0x85, 0xbd, 0xca, 0x5d, 0x65, 0x3c, + 0xe2, 0xce, 0xfa, 0x3c, 0x18, 0xe2, 0x29, 0x3d, 0x4a, 0xd0, 0x31, 0xbc, 0x78, + 0xd4, 0x52, 0x3d, 0x7a, 0x03, 0x47, 0x3d, 0x0e, 0x3a, 0xde, 0xbc, 0xd1, 0x1c, + 0x72, 0xbd, 0x39, 0xb2, 0x8c, 0xbd, 0x1a, 0x1c, 0xba, 0xbd, 0x20, 0x30, 0x5e, + 0x3b, 0x4b, 0x1f, 0x40, 0xbc, 0x70, 0x8b, 0xbd, 0x3c, 0x02, 0x15, 0x12, 0xbd, + 0x92, 0x7d, 0x52, 0xbd, 0x98, 0x66, 0x78, 0xbc, 0x73, 0x75, 0x74, 0x3d, 0x91, + 0x42, 0x88, 0x3d, 0x8a, 0x00, 0x26, 0xbd, 0xca, 0xd7, 0x86, 0x3d, 0xea, 0xcb, + 0x66, 0xbd, 0xb8, 0x28, 0x26, 0x3c, 0xd5, 0x36, 0x90, 0xbd, 0xfa, 0x19, 0x5a, + 0x3d, 0xb2, 0x02, 0x81, 0xbd, 0xe3, 0x63, 0x8d, 0x3d, 0xad, 0x2e, 0x0e, 0x3d, + 0x01, 0x74, 0x4b, 0xbd, 0xa3, 0x91, 0x08, 0x3d, 0x6d, 0xa0, 0x23, 0xbd, 0x84, + 0xbd, 0x0a, 0xbd, 0x28, 0x54, 0x95, 0xba, 0x1c, 0x4a, 0x2f, 0x3d, 0xf0, 0x67, + 0xaf, 0xbc, 0xcc, 0x1e, 0x18, 0x3d, 0xd5, 0xf0, 0x29, 0x3d, 0xd9, 0x19, 0x0a, + 0xbc, 0x91, 0xf8, 0x1c, 0xbc, 0xf0, 0x4b, 0x1a, 0x3d, 0xc8, 0xdc, 0x52, 0xbc, + 0x65, 0x2b, 0x6c, 0xbd, 0x9f, 0x08, 0x9a, 0xbd, 0x11, 0xd4, 0x9e, 0xbc, 0xb0, + 0xa3, 0x0d, 0x3c, 0x20, 0x50, 0xd7, 0x3c, 0x65, 0xfc, 0xb7, 0xbc, 0x43, 0xf5, + 0x0d, 0xbd, 0xb9, 0x3c, 0x2a, 0x3d, 0x66, 0xb3, 0x5b, 0x3d, 0x6d, 0x26, 0xa0, + 0x3d, 0x3a, 0xc0, 0x15, 0xbb, 0x67, 0x1b, 0x0b, 0x3c, 0x20, 0x72, 0xa6, 0xbd, + 0xe2, 0x14, 0xa5, 0xbc, 0x37, 0x10, 0x92, 0x3d, 0x24, 0x2d, 0x1c, 0x3d, 0x47, + 0xbd, 0x2b, 0xbd, 0x68, 0x0f, 0xa5, 0x3d, 0x96, 0x58, 0x98, 0x3d, 0x25, 0x20, + 0xd3, 0x3b, 0xc2, 0x1b, 0xbd, 0x3d, 0x17, 0x2a, 0xa5, 0xbb, 0x34, 0x7e, 0x47, + 0x3d, 0x36, 0xb6, 0xd0, 0x3b, 0x6a, 0xba, 0xf3, 0x3c, 0x54, 0x95, 0x25, 0xbd, + 0x99, 0x51, 0x81, 0x3d, 0xe6, 0x1b, 0x20, 0xbc, 0x2e, 0xc2, 0x3b, 0xbd, 0xb8, + 0xa6, 0x17, 0xbd, 0x86, 0x1f, 0xd7, 0x3c, 0x60, 0x69, 0x8d, 0x3d, 0x00, 0x02, + 0x76, 0xbd, 0x86, 0xdb, 0x85, 0x3b, 0x52, 0xb1, 0xd7, 0x3d, 0x7c, 0xd1, 0x4f, + 0xbd, 0xb0, 0xe7, 0x13, 0xbd, 0xee, 0xe2, 0x0f, 0x3d, 0x2e, 0x0a, 0x11, 0xbd, + 0x59, 0x7e, 0x04, 0xbd, 0xf1, 
0xdf, 0x10, 0xbc, 0x9f, 0xfd, 0x90, 0xbc, 0x0a, + 0xec, 0x47, 0x3c, 0x9b, 0x06, 0x5a, 0x3d, 0x0e, 0xe3, 0xee, 0xbc, 0x3b, 0xbf, + 0xc7, 0x3b, 0x1e, 0xc7, 0x17, 0xbd, 0x65, 0x6d, 0x75, 0x3c, 0x81, 0x92, 0xc3, + 0x3c, 0xee, 0x48, 0x9e, 0x3c, 0x6d, 0x2e, 0x4f, 0xbd, 0x42, 0x85, 0x64, 0xbd, + 0xe9, 0x0a, 0xbb, 0xbc, 0x73, 0x3f, 0x40, 0xbd, 0xbd, 0x8c, 0xae, 0x3b, 0x4a, + 0xae, 0x31, 0x3d, 0x9e, 0x39, 0xfd, 0x3c, 0xd7, 0x4e, 0xe0, 0xbd, 0xf6, 0x05, + 0x05, 0xbd, 0xbf, 0x61, 0x31, 0x3c, 0xba, 0x2f, 0x51, 0x3d, 0x16, 0xef, 0xdd, + 0x3c, 0x23, 0x64, 0x18, 0x3c, 0x44, 0x4b, 0xce, 0xbc, 0x13, 0xbd, 0xd7, 0xbc, + 0xc8, 0xc8, 0xb8, 0xbc, 0x76, 0x69, 0x19, 0xbd, 0x76, 0x51, 0x9c, 0xbd, 0xbe, + 0xbc, 0x7d, 0x3d, 0xa3, 0xa2, 0x74, 0x3d, 0xfe, 0xad, 0x06, 0x3c, 0x74, 0xb4, + 0x0f, 0x3b, 0x9f, 0x83, 0x8d, 0x3d, 0xa5, 0x84, 0x70, 0x3d, 0x99, 0xa1, 0xe6, + 0xbc, 0xf2, 0xf1, 0xbd, 0xbc, 0x29, 0xd8, 0x42, 0xbc, 0x48, 0xb0, 0xa7, 0x3c, + 0xce, 0x31, 0x0b, 0xbd, 0x8b, 0xef, 0x39, 0x3d, 0xc5, 0x28, 0xa4, 0x3c, 0xcd, + 0x1b, 0xb7, 0x3c, 0x3f, 0x50, 0x55, 0xbd, 0xf4, 0xa8, 0x9d, 0x3d, 0xe3, 0xdb, + 0xac, 0x3c, 0x5c, 0xae, 0x68, 0xbc, 0x8e, 0xf1, 0x0f, 0xbc, 0x17, 0x29, 0x87, + 0x3c, 0x19, 0x45, 0x23, 0xbd, 0xf0, 0x0f, 0x12, 0xbd, 0x06, 0x74, 0x8b, 0xbd, + 0x10, 0x65, 0x00, 0x3d, 0xa3, 0x9d, 0x8a, 0x3d, 0x1e, 0xf4, 0x3d, 0x3d, 0x4e, + 0x40, 0x7b, 0x3c, 0xa0, 0xc8, 0xf7, 0xbb, 0x2e, 0x19, 0x1a, 0xbc, 0x37, 0x47, + 0x36, 0xbd, 0x8b, 0x65, 0x6d, 0x3d, 0xc0, 0xcd, 0x21, 0xbd, 0x60, 0xb6, 0xa3, + 0xbb, 0xa9, 0x58, 0x42, 0xbc, 0x94, 0x1c, 0x73, 0xbd, 0x82, 0xa5, 0xad, 0xbc, + 0x51, 0xe5, 0xb5, 0x3d, 0xbd, 0xa1, 0x59, 0x3d, 0x13, 0x5b, 0xdb, 0xbc, 0x44, + 0xdc, 0xd3, 0xbc, 0xc8, 0x3f, 0xa5, 0x3d, 0x5d, 0x7c, 0x68, 0x3d, 0xcd, 0xb4, + 0xa7, 0xbc, 0x58, 0x2b, 0x48, 0x3d, 0xe6, 0x22, 0xf6, 0xbc, 0xde, 0x4b, 0x0b, + 0xbd, 0x71, 0x8f, 0x44, 0xbd, 0x8d, 0xa0, 0x17, 0xbd, 0xd3, 0xd3, 0x36, 0x3d, + 0x40, 0x04, 0x3c, 0xbd, 0x4a, 0xdf, 0x82, 0x3b, 0x23, 0x72, 0x20, 0x3d, 0xf5, + 0x84, 0x80, 0xbd, 0xf9, 0x1c, 0xf3, 0xbc, 0x84, 0xd9, 0x86, 0xbd, 0x28, 0x42, + 0x48, 0xbd, 0x90, 0xd7, 0x32, 0x3d, 0x80, 0x98, 0x01, 0xbc, 0x7f, 0x7a, 0x82, + 0xbd, 0x59, 0x12, 0xf3, 0x3c, 0x9b, 0x63, 0xaa, 0xbc, 0x5e, 0x84, 0xb5, 0xbd, + 0x95, 0x77, 0x90, 0x3d, 0xad, 0x26, 0xb4, 0xbd, 0xda, 0xfb, 0x0a, 0xbd, 0x44, + 0x70, 0x73, 0x3d, 0x70, 0x45, 0x41, 0x3d, 0xe6, 0x6b, 0x73, 0x3c, 0x93, 0x01, + 0x78, 0xbd, 0xc3, 0xda, 0xa2, 0x3d, 0x46, 0x41, 0x83, 0x3d, 0x16, 0x40, 0x32, + 0x3d, 0xa7, 0xfb, 0xa7, 0xbd, 0xc0, 0x57, 0x28, 0x3b, 0xd0, 0x2b, 0x84, 0xbc, + 0x85, 0x89, 0x88, 0x3d, 0xc4, 0xa3, 0x8f, 0xbc, 0xbb, 0xc6, 0x96, 0xbd, 0x7c, + 0xae, 0x36, 0xbd, 0xf8, 0x8b, 0x85, 0x3d, 0xfa, 0x35, 0xf5, 0x3c, 0xad, 0x86, + 0x63, 0xbc, 0x7c, 0xc1, 0x54, 0x3d, 0xad, 0xfc, 0x09, 0xbd, 0x3a, 0x1f, 0xf2, + 0x3c, 0xf4, 0x35, 0x65, 0x3c, 0xd0, 0x53, 0x38, 0xbd, 0x99, 0xf8, 0x36, 0x3d, + 0x95, 0xaf, 0x67, 0x3d, 0xd2, 0x76, 0x44, 0x3d, 0x03, 0x46, 0x82, 0x3d, 0xdc, + 0xe2, 0x53, 0xbd, 0x49, 0x59, 0x7b, 0xbd, 0x1c, 0x8b, 0xaf, 0x3a, 0x80, 0x30, + 0x27, 0xbd, 0xdb, 0x9c, 0x87, 0xbd, 0x8e, 0x09, 0x5c, 0x3d, 0x5e, 0x5d, 0x5d, + 0x3d, 0xcc, 0x97, 0xaa, 0xbb, 0x81, 0xe0, 0xb9, 0xbc, 0x61, 0x3a, 0x9a, 0x3b, + 0xc9, 0x99, 0x9f, 0x3d, 0x2d, 0x52, 0x10, 0xbd, 0x90, 0x0b, 0xa1, 0x3c, 0xaf, + 0x88, 0x81, 0xbd, 0xf4, 0x7a, 0x89, 0xbc, 0xb3, 0xe1, 0xc5, 0xbc, 0x8e, 0xe5, + 0x8a, 0xbd, 0x6d, 0xd9, 0x70, 0x3b, 0xdd, 0x1b, 0xa1, 0x3c, 0xdd, 0xeb, 0x42, + 0xbd, 0x01, 0xcb, 0xf2, 0x3c, 0x8e, 0x4f, 0xff, 0xbc, 0x28, 0x5e, 0x6a, 0xbc, + 0x3f, 0xff, 0x26, 0x3d, 0xc4, 0xfa, 0x87, 0xbc, 0xcb, 0x5e, 
0x32, 0xbd, 0x1f, + 0xb7, 0xd1, 0xbd, 0x40, 0xb6, 0x8b, 0x3c, 0x22, 0xf5, 0xa5, 0xbc, 0x5e, 0xa1, + 0xf7, 0xbc, 0x1a, 0x43, 0x11, 0x3d, 0xc9, 0xfe, 0x18, 0xbd, 0x34, 0x8b, 0x2f, + 0x3d, 0x2f, 0xe3, 0x8d, 0x3d, 0xaf, 0x7b, 0x69, 0xbd, 0x63, 0x9d, 0xac, 0x3d, + 0xce, 0x45, 0x50, 0xbd, 0xe1, 0x8f, 0x6b, 0xbd, 0x6e, 0xc6, 0x07, 0xbd, 0x58, + 0x1e, 0x12, 0x3c, 0x79, 0xdd, 0x06, 0x3d, 0xea, 0x26, 0x83, 0xbd, 0xaa, 0x63, + 0xce, 0x3d, 0x3a, 0xb3, 0x81, 0x3b, 0x35, 0x9a, 0xc6, 0x3c, 0x27, 0xc4, 0x59, + 0xbd, 0x74, 0x21, 0x30, 0x3d, 0xfe, 0x21, 0x8f, 0xbc, 0xb2, 0x86, 0x78, 0xbc, + 0xbb, 0x4f, 0xd7, 0xbd, 0xda, 0xfe, 0x2c, 0xbd, 0x7b, 0x99, 0x21, 0x3b, 0x61, + 0xe4, 0x68, 0xbd, 0x66, 0xfd, 0xb2, 0xba, 0xbe, 0x3d, 0x53, 0x3d, 0x53, 0x3f, + 0x5c, 0xbd, 0x5b, 0xf9, 0xc4, 0x3c, 0x1c, 0xa3, 0x6c, 0x3d, 0x61, 0x44, 0xfa, + 0x3c, 0x35, 0xb8, 0xd9, 0x3c, 0x6d, 0x40, 0xc8, 0xbc, 0xbf, 0x20, 0x2a, 0x3d, + 0x84, 0xbd, 0x80, 0x3c, 0x19, 0x27, 0x1c, 0x3d, 0xc8, 0xf0, 0x56, 0x3c, 0x74, + 0x85, 0x29, 0x3c, 0xce, 0x5a, 0x91, 0xbc, 0x1f, 0xc3, 0x89, 0xbc, 0x8a, 0xec, + 0x62, 0x3d, 0xd0, 0xc0, 0xd2, 0xbb, 0x29, 0x30, 0x36, 0x3d, 0x71, 0xd4, 0xaf, + 0x3c, 0x29, 0x52, 0xb9, 0xbc, 0x33, 0xc8, 0x2c, 0x3a, 0x97, 0x8e, 0x18, 0xbb, + 0xda, 0xa7, 0x28, 0xbd, 0xaf, 0x8c, 0xc1, 0xbc, 0x62, 0xbb, 0xc7, 0x3b, 0xda, + 0x12, 0xbb, 0xbc, 0x7a, 0xfb, 0x3a, 0xbd, 0x04, 0xc0, 0xe3, 0x3c, 0x0f, 0x84, + 0xdd, 0xbd, 0xa4, 0x83, 0x87, 0x3d, 0x38, 0x8b, 0x5f, 0xbd, 0x60, 0xb4, 0x98, + 0x3c, 0x99, 0xef, 0x5d, 0x3b, 0xda, 0x0b, 0x83, 0x3d, 0x49, 0xf9, 0x93, 0x3d, + 0xe4, 0x29, 0x51, 0xbd, 0x5e, 0x33, 0x4b, 0xbd, 0x7a, 0xc5, 0xd5, 0x3b, 0xc2, + 0xbc, 0x67, 0x3d, 0x89, 0xa1, 0x55, 0xbd, 0x91, 0x0f, 0x55, 0x3d, 0xf8, 0x89, + 0x82, 0xbd, 0x4c, 0xdc, 0xc6, 0xbc, 0xc9, 0xb0, 0x3e, 0xbd, 0x7c, 0x95, 0x25, + 0x3d, 0xa2, 0x9f, 0xe1, 0x3b, 0x17, 0xcf, 0x90, 0xbb, 0xd6, 0x9c, 0x47, 0x3b, + 0xf6, 0x12, 0x74, 0x3d, 0xba, 0x2e, 0xde, 0x3c, 0x3e, 0x06, 0x74, 0x3d, 0x32, + 0x23, 0x5e, 0xbc, 0x02, 0xf3, 0x88, 0xbd, 0x16, 0x5d, 0xdd, 0xbc, 0x50, 0x9b, + 0x0a, 0xbd, 0x8e, 0x56, 0xb9, 0xbc, 0xc8, 0x8b, 0x18, 0x3d, 0xfd, 0x15, 0x80, + 0x3d, 0x4c, 0x97, 0x5a, 0xbc, 0xe2, 0x63, 0xa4, 0xbc, 0xc3, 0x3d, 0x84, 0xbc, + 0x7e, 0xa2, 0x83, 0x3b, 0x6e, 0x8b, 0x4e, 0x3c, 0x24, 0xb4, 0xb3, 0xbb, 0x03, + 0x9e, 0xfd, 0x3b, 0xa4, 0x8b, 0x53, 0x3d, 0xbc, 0x81, 0x61, 0xbd, 0x59, 0xde, + 0x48, 0x3d, 0x21, 0x16, 0x61, 0xbd, 0x31, 0xbc, 0x1c, 0xbd, 0xfc, 0xe8, 0xf4, + 0x3c, 0x88, 0x36, 0x59, 0x3d, 0x12, 0x10, 0xf8, 0xbb, 0xe4, 0x7b, 0x5f, 0xbc, + 0xf0, 0x9d, 0x9e, 0x3c, 0xfb, 0x94, 0xdb, 0xbc, 0x54, 0x67, 0x65, 0xbc, 0x5e, + 0x6e, 0x3b, 0xbd, 0x12, 0x92, 0x59, 0x3c, 0xf3, 0x69, 0x8b, 0x3b, 0x78, 0x99, + 0xdd, 0x3c, 0x85, 0x31, 0x21, 0x3d, 0xe4, 0x6c, 0x33, 0x3d, 0x9c, 0x58, 0x87, + 0xbd, 0xd9, 0xf5, 0x31, 0xbc, 0xce, 0xac, 0xb9, 0x3d, 0x0e, 0x2c, 0x5c, 0x3d, + 0x6a, 0x94, 0xa9, 0x3d, 0x0e, 0xca, 0x4d, 0xbc, 0x68, 0x0f, 0x4d, 0xbd, 0xd5, + 0x31, 0xa6, 0xbc, 0xf1, 0xdc, 0x9b, 0x3d, 0x71, 0x4d, 0xfd, 0xbc, 0xcc, 0x43, + 0x1a, 0x3d, 0x1f, 0x4f, 0x51, 0x3d, 0xf0, 0x07, 0xa4, 0x3b, 0x1a, 0x75, 0x40, + 0x3d, 0xf6, 0xef, 0x13, 0x3d, 0x58, 0x08, 0x04, 0xbd, 0xf3, 0x55, 0x58, 0x3d, + 0x55, 0x7e, 0x6d, 0xbd, 0x96, 0x39, 0x78, 0xbd, 0x19, 0x7d, 0x7f, 0xbd, 0xc3, + 0x4a, 0x9a, 0xbd, 0x64, 0xad, 0x24, 0x3d, 0xc8, 0xab, 0x10, 0x3b, 0xa2, 0x7f, + 0x76, 0xbd, 0xdd, 0xb6, 0x2e, 0x3d, 0xdb, 0xbf, 0x88, 0x3d, 0x49, 0x2e, 0xbd, + 0xbb, 0xdb, 0xdc, 0x86, 0x3d, 0x06, 0xf9, 0x85, 0xbd, 0x3c, 0x44, 0x39, 0xbc, + 0x8b, 0x1c, 0x32, 0x3d, 0xf6, 0x3c, 0x7a, 0x3d, 0x68, 0x1f, 0x13, 0xbd, 0x1d, + 0x1c, 0xed, 
0x3c, 0xa8, 0x9b, 0x08, 0xbc, 0xe4, 0x25, 0xf6, 0xbc, 0xf6, 0xd8, + 0x19, 0xbd, 0x24, 0x39, 0x2f, 0xbd, 0x59, 0x25, 0x86, 0xbd, 0xbf, 0xf8, 0x78, + 0xbd, 0x33, 0xec, 0x93, 0xbd, 0x65, 0xdd, 0x55, 0xbd, 0x9d, 0x16, 0x05, 0xbd, + 0x69, 0xe6, 0x79, 0x3d, 0x64, 0xfd, 0xf0, 0xbc, 0xf7, 0xa3, 0x63, 0xbc, 0xb4, + 0x5f, 0xdb, 0xbc, 0x72, 0x22, 0x13, 0x3d, 0x0e, 0x28, 0x03, 0xbd, 0x64, 0x4b, + 0xad, 0x3c, 0xcb, 0x9c, 0x15, 0xbd, 0x58, 0x24, 0x55, 0x3d, 0x85, 0x90, 0x18, + 0xbc, 0x87, 0xb7, 0x95, 0x3d, 0x5e, 0xd9, 0x78, 0xbd, 0xa6, 0x19, 0x80, 0x3d, + 0xd3, 0xf6, 0x08, 0x3d, 0x8c, 0x74, 0x43, 0xbd, 0x06, 0x77, 0x8f, 0xbd, 0x68, + 0xc4, 0x6f, 0xbd, 0x6f, 0x45, 0x03, 0x3b, 0xb4, 0xf9, 0x9c, 0x3c, 0xe2, 0x85, + 0x8f, 0x3c, 0x3a, 0x70, 0x92, 0x3d, 0x06, 0xaa, 0x28, 0xbd, 0x51, 0x46, 0xc2, + 0xbd, 0x39, 0xf2, 0x8f, 0x3d, 0xda, 0xbd, 0x4e, 0x3d, 0x68, 0x6d, 0x57, 0xbc, + 0xb3, 0x41, 0x8b, 0x3d, 0xa8, 0x83, 0xa3, 0xbc, 0x3a, 0x05, 0xbf, 0xbc, 0x5b, + 0x8d, 0x6e, 0x3d, 0xfa, 0x17, 0x8b, 0xbd, 0xff, 0x33, 0x03, 0x3c, 0x4e, 0x35, + 0x6d, 0xbb, 0xf5, 0x98, 0x31, 0xbd, 0xfe, 0x46, 0x20, 0x3c, 0xb7, 0x91, 0x5d, + 0x3d, 0xa9, 0x64, 0x97, 0x3c, 0xd8, 0x6a, 0x59, 0xbd, 0x0b, 0xfb, 0x7c, 0x3d, + 0x05, 0xf1, 0x26, 0xbd, 0xd4, 0xfd, 0x2a, 0x3d, 0x70, 0xca, 0x1d, 0x3d, 0x76, + 0x80, 0xc7, 0xbc, 0xfa, 0x43, 0x7e, 0x3d, 0x6e, 0xda, 0xb6, 0x3c, 0x63, 0x63, + 0x25, 0xbd, 0x39, 0xad, 0x9c, 0xbc, 0x89, 0xa0, 0xbf, 0xbd, 0xc7, 0xd6, 0x19, + 0x3d, 0x36, 0x1d, 0x22, 0x3c, 0x11, 0x87, 0x8b, 0xbd, 0xa8, 0x59, 0x39, 0xbd, + 0xe4, 0x1d, 0x02, 0x3c, 0xf1, 0x0d, 0xf7, 0xbd, 0x16, 0x10, 0xb8, 0x3b, 0x03, + 0xfc, 0xa4, 0x3c, 0x32, 0x06, 0x8f, 0xbc, 0x47, 0x59, 0xa3, 0xbc, 0xac, 0x7f, + 0xda, 0xbc, 0x4b, 0x26, 0x80, 0x3d, 0x73, 0x33, 0x31, 0xbc, 0x83, 0x75, 0x98, + 0xbd, 0xb7, 0x95, 0x65, 0xbd, 0x64, 0x01, 0x21, 0xbd, 0xb8, 0x86, 0x8a, 0x3b, + 0xe5, 0x85, 0x4a, 0xbd, 0xe5, 0xc1, 0x45, 0xbc, 0x97, 0x00, 0xab, 0x3c, 0xb6, + 0x55, 0x1b, 0xbd, 0x41, 0xcb, 0x01, 0x3d, 0x3c, 0x4e, 0x2f, 0xbc, 0x4c, 0x54, + 0xad, 0x3c, 0x70, 0xec, 0x58, 0x3c, 0x57, 0x6e, 0xf9, 0x3c, 0xac, 0xa8, 0x28, + 0xbd, 0xea, 0x4c, 0xce, 0xbb, 0x5f, 0x87, 0x1d, 0xbd, 0x0d, 0xe2, 0x5c, 0x3d, + 0x1d, 0x21, 0x31, 0xbd, 0xf5, 0x47, 0xd7, 0xbd, 0xb5, 0xd5, 0x0c, 0xbd, 0x81, + 0x2b, 0xff, 0x3c, 0x40, 0x81, 0xd2, 0x3c, 0xc3, 0x64, 0x77, 0x3c, 0xd6, 0xdd, + 0xc9, 0xbc, 0xee, 0x42, 0x9e, 0xbc, 0x4a, 0xdb, 0x3c, 0x3d, 0xc2, 0x58, 0x82, + 0x3d, 0xfa, 0x36, 0x24, 0xbd, 0x36, 0x2e, 0x86, 0x3d, 0x68, 0xee, 0x5e, 0xbd, + 0x3c, 0x29, 0x1e, 0xbc, 0x80, 0x1f, 0x88, 0xbd, 0x27, 0xab, 0xb7, 0xbc, 0xce, + 0x18, 0xa7, 0xbd, 0xf6, 0x96, 0xa7, 0xbc, 0xde, 0x1b, 0x0a, 0xbd, 0x15, 0x9b, + 0x1d, 0x3c, 0x2e, 0xb4, 0x9d, 0x3d, 0x61, 0xba, 0xbe, 0xbc, 0xb8, 0xc8, 0x6a, + 0x3d, 0xcc, 0x06, 0xa8, 0xbd, 0x83, 0xae, 0x13, 0xbc, 0x3d, 0xb4, 0x4c, 0xbd, + 0xcc, 0xb5, 0x65, 0xbc, 0x0d, 0xad, 0x8b, 0x3c, 0x0e, 0x2f, 0x91, 0x3c, 0x1a, + 0xfa, 0x1e, 0x3d, 0xbf, 0xe3, 0xf8, 0x3c, 0x21, 0x8d, 0x8c, 0xbc, 0x30, 0x1b, + 0xcb, 0xbc, 0x34, 0x68, 0xf2, 0x3a, 0xed, 0x13, 0x0f, 0xbd, 0x66, 0x39, 0x61, + 0xbd, 0xee, 0x87, 0x42, 0x3d, 0xc0, 0x58, 0x69, 0xbc, 0x3e, 0xe4, 0xd5, 0x3c, + 0x46, 0x68, 0x30, 0xbd, 0x6c, 0x68, 0xad, 0x3c, 0x36, 0x63, 0x13, 0x3d, 0x0c, + 0xf5, 0xf7, 0xbc, 0x56, 0x99, 0x71, 0x3d, 0x4a, 0xba, 0x10, 0x3d, 0xfc, 0xba, + 0x3e, 0x3d, 0x5a, 0xd8, 0x82, 0x3d, 0x70, 0x17, 0x92, 0xbd, 0x0f, 0x9b, 0x77, + 0xbd, 0x06, 0x4d, 0x78, 0x3d, 0xcb, 0x90, 0x96, 0x3d, 0xa5, 0x6d, 0x04, 0xbd, + 0x4a, 0x4f, 0x0f, 0xbc, 0x83, 0x77, 0x3a, 0x3d, 0xdf, 0x43, 0x39, 0x3d, 0x17, + 0x17, 0xf7, 0x3c, 0x3d, 0x1a, 0x44, 0xbd, 
0x42, 0x1b, 0xdb, 0xbc, 0x1f, 0x26, + 0x82, 0xbd, 0xfd, 0x51, 0xa5, 0x3d, 0xc5, 0x70, 0x45, 0x3d, 0x00, 0x17, 0xa1, + 0x3c, 0xe1, 0x5c, 0x56, 0xbd, 0x57, 0x8c, 0xe6, 0xbc, 0x87, 0x07, 0xef, 0x3b, + 0x9b, 0x41, 0xbf, 0xbd, 0xa1, 0x85, 0xd5, 0x3c, 0x07, 0x20, 0x0a, 0xbd, 0xc0, + 0x19, 0xf3, 0xbb, 0x1f, 0xb5, 0xba, 0x3b, 0xa0, 0x79, 0x86, 0xbc, 0x62, 0x56, + 0x40, 0xbd, 0x51, 0xf1, 0xa8, 0x3c, 0x83, 0x80, 0x86, 0x3c, 0x18, 0x2b, 0x2d, + 0x3d, 0x8d, 0x66, 0xb6, 0x3c, 0x1d, 0xac, 0x2e, 0xbd, 0x91, 0xbc, 0x3e, 0xbd, + 0xfb, 0x80, 0x75, 0x3d, 0x7d, 0xa1, 0x54, 0xba, 0x0f, 0xd1, 0x2f, 0xbd, 0xcb, + 0x3a, 0x14, 0xbd, 0x76, 0xd3, 0x82, 0xbc, 0x15, 0x06, 0xf5, 0x39, 0xa4, 0xdb, + 0x6e, 0x3d, 0x42, 0x46, 0xb7, 0x3c, 0xa3, 0x20, 0x00, 0x3d, 0xfc, 0x4f, 0x2b, + 0xbd, 0x06, 0xb1, 0x7e, 0x3d, 0xf8, 0x37, 0xc9, 0xbc, 0x0d, 0x90, 0xd7, 0xbc, + 0xb7, 0x8e, 0x0e, 0x3d, 0x68, 0xd8, 0x1d, 0xbc, 0x57, 0xb5, 0x11, 0x3d, 0x68, + 0x20, 0x0b, 0x3d, 0x85, 0xda, 0x1e, 0xbd, 0xe0, 0xc0, 0x6b, 0xbd, 0x44, 0x69, + 0x96, 0xbd, 0xec, 0xbd, 0x38, 0xbc, 0x09, 0x65, 0x85, 0xbd, 0xb4, 0xf4, 0x57, + 0xbd, 0x35, 0xe4, 0xb2, 0xbc, 0xf7, 0x90, 0xd0, 0x3c, 0x78, 0xd1, 0x83, 0xbd, + 0xe7, 0x8d, 0x1b, 0xbd, 0x49, 0xa3, 0x94, 0x3d, 0x56, 0xf3, 0x44, 0xbd, 0xb2, + 0xce, 0x5e, 0x3d, 0x42, 0x8e, 0x37, 0xbd, 0x22, 0x3e, 0x79, 0xbd, 0xa0, 0x71, + 0x6c, 0x3d, 0x23, 0x13, 0xb3, 0xbb, 0x0d, 0x32, 0x21, 0x3c, 0x35, 0x5e, 0xfd, + 0xba, 0x0d, 0x0c, 0xbd, 0x3b, 0xcb, 0x0c, 0xaa, 0xbb, 0x33, 0xe8, 0x08, 0xbd, + 0x43, 0x7a, 0xa5, 0xbc, 0x15, 0x50, 0x89, 0x3d, 0xd1, 0x86, 0x5b, 0x3d, 0x2a, + 0xd8, 0x4c, 0x3d, 0xe1, 0x63, 0x19, 0xbc, 0xee, 0xf0, 0x6f, 0x3d, 0xfa, 0xc2, + 0x44, 0x3d, 0x88, 0x3c, 0x6b, 0xbd, 0xe3, 0x24, 0xbb, 0xbc, 0x4c, 0xe6, 0x21, + 0x3b, 0x47, 0xf2, 0xa1, 0xbc, 0x46, 0x96, 0xfd, 0x3c, 0x4c, 0x21, 0x86, 0xbd, + 0x32, 0x28, 0x83, 0xbc, 0x70, 0x39, 0xa0, 0xbd, 0x80, 0xca, 0x4d, 0xbd, 0xc4, + 0x91, 0x8d, 0xbc, 0xab, 0xae, 0x08, 0x3c, 0x54, 0xff, 0xb5, 0xbb, 0x76, 0xae, + 0xbe, 0x3c, 0xd8, 0xd1, 0xa5, 0x3d, 0x03, 0x0c, 0x44, 0x3d, 0x92, 0x96, 0x40, + 0xbd, 0xd5, 0xc5, 0x1f, 0x3d, 0xdf, 0x09, 0xc0, 0x3c, 0xfb, 0x0d, 0x5f, 0x3d, + 0xfd, 0x07, 0x04, 0x3d, 0x1c, 0x43, 0x9a, 0xbd, 0xd7, 0x14, 0x72, 0xbd, 0x2d, + 0x50, 0x84, 0xbd, 0x6a, 0x16, 0x7d, 0x38, 0xa6, 0xff, 0x90, 0x3d, 0x44, 0xb7, + 0xcc, 0x3c, 0x5d, 0x5f, 0x69, 0xbd, 0x92, 0x8d, 0x6d, 0x3d, 0xf9, 0x02, 0x99, + 0xbc, 0xe5, 0x7a, 0xc5, 0xbd, 0xde, 0x5c, 0x69, 0x3d, 0xee, 0xbf, 0xf4, 0x3c, + 0x92, 0x19, 0x96, 0x3d, 0xf3, 0x5b, 0x35, 0xbd, 0xf3, 0x90, 0x3b, 0x3d, 0x90, + 0xe2, 0xc2, 0xbc, 0x98, 0x91, 0xf9, 0xbc, 0x3b, 0x3b, 0x82, 0xbd, 0xb0, 0x85, + 0x30, 0x3d, 0x14, 0x12, 0xea, 0xbc, 0x21, 0x84, 0x8c, 0x3d, 0x93, 0xcd, 0x65, + 0x3d, 0xc9, 0x26, 0xda, 0xbc, 0xd5, 0xc3, 0x4e, 0x3c, 0xcc, 0x6e, 0x0f, 0x3d, + 0x8d, 0xaf, 0x47, 0x3c, 0x9c, 0xfa, 0xe1, 0x3c, 0x3c, 0xe0, 0x4c, 0x3d, 0x79, + 0x22, 0xed, 0x3c, 0xf4, 0x05, 0x3a, 0x3d, 0x59, 0xc0, 0x22, 0xbd, 0x5e, 0xaa, + 0xf8, 0xbc, 0xc4, 0xda, 0x22, 0x3c, 0x76, 0x88, 0xaf, 0x3c, 0x1c, 0xf4, 0x3b, + 0x3d, 0x4e, 0x6a, 0x1b, 0x3d, 0x60, 0xc7, 0x85, 0x3c, 0xb2, 0xc7, 0x75, 0x3d, + 0xbd, 0xe4, 0xbe, 0xbc, 0x54, 0x8e, 0x82, 0x3d, 0x36, 0x27, 0x6a, 0xbc, 0x0d, + 0x99, 0x00, 0xbd, 0x38, 0x5e, 0x9f, 0xbc, 0x9d, 0x49, 0xd6, 0x3d, 0xbb, 0x1a, + 0x85, 0x3d, 0x6f, 0x89, 0x9f, 0x3c, 0xc5, 0x0b, 0xa7, 0xbc, 0x9e, 0x5a, 0xfa, + 0xbc, 0xd3, 0x59, 0x50, 0xba, 0x3f, 0xc6, 0xbc, 0xbd, 0xb3, 0x9c, 0x12, 0xbd, + 0x05, 0x39, 0xd6, 0x3b, 0x58, 0x14, 0x0d, 0x3d, 0x63, 0x0e, 0x19, 0x3d, 0x69, + 0x9b, 0xa2, 0x3d, 0x68, 0x4d, 0x13, 0x3c, 0x06, 0x73, 0x64, 0xbd, 0x28, 
0x79, + 0x3c, 0xbd, 0x26, 0x23, 0x28, 0xbc, 0xb5, 0xa2, 0xa5, 0xba, 0xf6, 0x5f, 0x89, + 0xbc, 0x66, 0x2e, 0x79, 0xbd, 0x90, 0xee, 0x54, 0xbc, 0x99, 0xf4, 0x4e, 0x3c, + 0xdb, 0xdc, 0xd0, 0xbc, 0x3f, 0xed, 0x43, 0xbd, 0x03, 0xdf, 0xf4, 0x3c, 0x7d, + 0x40, 0x2b, 0x3c, 0xfb, 0x1d, 0x64, 0x3d, 0xcd, 0x1f, 0xb8, 0x3d, 0xb1, 0xb2, + 0x0f, 0x3d, 0x30, 0xf6, 0x38, 0xbd, 0x54, 0xef, 0x84, 0xbc, 0x2f, 0x3f, 0xac, + 0xbd, 0xe0, 0xe1, 0xc4, 0xbc, 0x49, 0x0a, 0x03, 0xbd, 0xb8, 0x78, 0x43, 0xbc, + 0xbf, 0xbc, 0x80, 0x3a, 0x1a, 0x41, 0x39, 0x3d, 0xd0, 0x5d, 0x8c, 0x3d, 0x8d, + 0x8f, 0x5e, 0xbc, 0xfd, 0x1b, 0xed, 0xbd, 0x22, 0x7c, 0x99, 0xbc, 0x4c, 0xb3, + 0x1d, 0xbc, 0x10, 0xbb, 0x1c, 0x3c, 0x19, 0x89, 0xd3, 0xbc, 0x2a, 0x64, 0x37, + 0x3d, 0x11, 0x87, 0x00, 0x3c, 0x39, 0x0d, 0x1c, 0x3d, 0xb8, 0xeb, 0xde, 0xbc, + 0x26, 0x9d, 0x05, 0xbd, 0x51, 0xca, 0x0d, 0xbd, 0xa9, 0xe0, 0xbc, 0x3c, 0xd6, + 0x01, 0x2d, 0xbd, 0x72, 0x14, 0xd3, 0x3c, 0xf2, 0x07, 0x81, 0x3c, 0xe4, 0xbb, + 0x00, 0x3d, 0x0b, 0x42, 0x09, 0x3b, 0x0e, 0x99, 0x71, 0xbd, 0x32, 0x91, 0x10, + 0xbd, 0xa0, 0x0b, 0x05, 0xbd, 0x7f, 0xf8, 0xf6, 0x3c, 0xd4, 0x72, 0xbd, 0x3c, + 0xdf, 0xcc, 0x8a, 0x3d, 0x0e, 0x3d, 0x24, 0x3d, 0x71, 0x5a, 0x52, 0xbd, 0xb6, + 0x11, 0xda, 0xbc, 0x5b, 0xec, 0x9c, 0x3d, 0x4a, 0x73, 0xfd, 0xbc, 0xc1, 0x2b, + 0x9f, 0xbd, 0x06, 0xed, 0x2f, 0xbd, 0x38, 0x4c, 0x53, 0x3d, 0x36, 0x8d, 0xc1, + 0x3c, 0x14, 0x26, 0xa3, 0xbd, 0x2d, 0x2f, 0x0a, 0xbb, 0xfd, 0x7d, 0xa5, 0xbd, + 0x10, 0xbe, 0xe4, 0x3b, 0x77, 0x22, 0x6a, 0x3d, 0xdd, 0x33, 0xc3, 0x3c, 0x3e, + 0x8e, 0xbb, 0xbd, 0x60, 0x54, 0x81, 0x3d, 0x02, 0xcf, 0x15, 0x3d, 0x06, 0x28, + 0xd5, 0x3d, 0xda, 0xb6, 0x6f, 0xbd, 0xf6, 0x93, 0x86, 0xbc, 0x98, 0x16, 0x45, + 0x3d, 0xdc, 0x9e, 0x47, 0x3c, 0x8b, 0x3a, 0x82, 0xbd, 0x11, 0x05, 0xb6, 0xbd, + 0x0e, 0x26, 0xc1, 0xbc, 0xe2, 0xdc, 0xab, 0x3d, 0x10, 0x6e, 0x84, 0x3d, 0x49, + 0x2f, 0x1c, 0xbb, 0x0e, 0x73, 0x7a, 0x3c, 0x82, 0x17, 0x29, 0x3d, 0x88, 0x40, + 0x91, 0x3b, 0x2d, 0xcd, 0xf3, 0xbc, 0xcc, 0x39, 0x37, 0xbd, 0xb0, 0x03, 0x17, + 0x3d, 0xb8, 0xd0, 0x22, 0x3d, 0xc6, 0x69, 0x90, 0x3c, 0x09, 0x0f, 0xc2, 0x3b, + 0x7a, 0x64, 0xcc, 0xbc, 0x26, 0x93, 0x22, 0x3d, 0xa3, 0xe0, 0x4b, 0xbd, 0x7d, + 0xca, 0x2f, 0xbb, 0xda, 0x26, 0x19, 0x3d, 0xe7, 0x88, 0x47, 0xbc, 0x4e, 0x0f, + 0x3b, 0x3d, 0xf8, 0x1c, 0x1c, 0x3d, 0xb4, 0x23, 0x8e, 0x3d, 0xaf, 0xa6, 0x10, + 0xbd, 0xfc, 0x9a, 0x9c, 0x3c, 0x35, 0x69, 0x9f, 0x3d, 0xe4, 0x5f, 0x8f, 0xbd, + 0xc7, 0xe3, 0x98, 0x3d, 0xab, 0xb8, 0xcc, 0x3b, 0x6a, 0xa9, 0x0f, 0xbd, 0x0d, + 0x8a, 0x6a, 0xbd, 0x1e, 0xec, 0x10, 0x3d, 0xa0, 0x13, 0xe8, 0x3b, 0xc0, 0x77, + 0x93, 0x3c, 0x3f, 0x03, 0x0b, 0x3d, 0xde, 0x40, 0xb4, 0x3c, 0xfc, 0xdb, 0x06, + 0xbd, 0xc3, 0x86, 0x90, 0x3d, 0x54, 0x89, 0x37, 0x3d, 0x55, 0xd4, 0x8d, 0xbd, + 0x39, 0x31, 0xb7, 0xbc, 0xab, 0x31, 0xc0, 0xbc, 0x60, 0x17, 0xdb, 0xbb, 0x49, + 0xa9, 0x2f, 0xbc, 0xbf, 0xcb, 0xd6, 0x3b, 0x83, 0x93, 0x16, 0x3d, 0xba, 0xdd, + 0x1b, 0xbd, 0xd1, 0x6a, 0x17, 0x3d, 0x45, 0x0f, 0x1d, 0xbd, 0xa3, 0xc1, 0xb5, + 0xbd, 0x88, 0x0e, 0x6e, 0x3d, 0x41, 0x5d, 0x06, 0x3d, 0xd8, 0xeb, 0xb4, 0x3c, + 0xe5, 0xc8, 0x88, 0xbb, 0x48, 0x65, 0x47, 0x3d, 0xff, 0xe8, 0xa6, 0xbd, 0x12, + 0x2a, 0x10, 0xbd, 0xd0, 0x90, 0x8b, 0x3d, 0x17, 0x08, 0xfc, 0xbc, 0x8e, 0xb4, + 0x9a, 0xbc, 0x70, 0x79, 0x3f, 0x3d, 0xd8, 0xad, 0x06, 0x3c, 0xf8, 0x4e, 0x81, + 0xbd, 0x82, 0xf1, 0x71, 0xbd, 0x9f, 0x19, 0xcc, 0xbd, 0xaf, 0x6a, 0x45, 0x3d, + 0x4e, 0x39, 0x25, 0x3d, 0x17, 0x43, 0x74, 0x3d, 0x52, 0x51, 0x53, 0xbd, 0x53, + 0x10, 0x5f, 0xbd, 0x5f, 0x60, 0xf7, 0x3c, 0xf4, 0x07, 0x6d, 0x3d, 0x68, 0x1d, + 0x29, 0x3d, 0xd6, 0xf7, 
0xad, 0xbc, 0x09, 0x0d, 0x8f, 0xbd, 0x17, 0xae, 0xd7, + 0x3c, 0x63, 0xf2, 0xc7, 0xbc, 0x4e, 0xa0, 0x05, 0xbd, 0x53, 0x3b, 0xc5, 0xbc, + 0x81, 0xf4, 0x82, 0x3d, 0x5e, 0xc9, 0x56, 0xbd, 0x32, 0xb8, 0xbd, 0xbc, 0xf2, + 0x3e, 0xc7, 0xbc, 0x76, 0x7f, 0x76, 0xbd, 0x19, 0x45, 0x13, 0xbd, 0xb9, 0x17, + 0x88, 0x3d, 0xef, 0x15, 0x68, 0xbd, 0x7a, 0xb8, 0xf6, 0x3a, 0xa8, 0x56, 0x72, + 0xbb, 0x96, 0x68, 0xce, 0x3d, 0x13, 0x43, 0x0a, 0xbd, 0x87, 0x3f, 0x91, 0x3c, + 0xd7, 0x12, 0x8b, 0x3b, 0x2f, 0x85, 0xbf, 0xbc, 0x33, 0xfc, 0x62, 0xbc, 0x5f, + 0xb3, 0x8f, 0xbc, 0x9f, 0x1a, 0xf5, 0xbc, 0x3b, 0x75, 0x68, 0x3d, 0x58, 0xae, + 0x3c, 0x3d, 0xe3, 0x00, 0x5d, 0x3d, 0xcf, 0x69, 0x9c, 0x3d, 0xdb, 0x20, 0xb3, + 0x39, 0x31, 0x1a, 0x7a, 0xbc, 0x11, 0x37, 0xd0, 0x3c, 0x1d, 0x5d, 0x84, 0x3d, + 0xb2, 0x5d, 0xe9, 0xbc, 0x24, 0x74, 0xe5, 0xbc, 0x86, 0x1d, 0xea, 0xbb, 0x65, + 0x94, 0x76, 0x3d, 0x9a, 0xb2, 0xeb, 0x3c, 0x62, 0x9f, 0x44, 0xbb, 0xca, 0x35, + 0xa8, 0xbc, 0x25, 0x51, 0x23, 0x3d, 0xa9, 0xac, 0x00, 0xbd, 0xb9, 0x13, 0xa6, + 0x3d, 0x3e, 0x3e, 0x10, 0xbc, 0x5f, 0x40, 0x8b, 0x3d, 0x75, 0xef, 0x70, 0x3b, + 0xf8, 0x66, 0xa4, 0x3c, 0x69, 0x24, 0x84, 0x3c, 0x2a, 0xd2, 0x76, 0xbc, 0x67, + 0xef, 0x9f, 0xbc, 0xe1, 0x67, 0xcb, 0xbc, 0xe1, 0x4c, 0xa9, 0xbd, 0x18, 0xb6, + 0x96, 0x3d, 0x29, 0xaa, 0x84, 0xbd, 0x80, 0x0d, 0x5b, 0x3d, 0x35, 0xe7, 0x02, + 0x3d, 0xea, 0xf8, 0x46, 0xbd, 0xba, 0x63, 0x42, 0x3d, 0x3e, 0x6d, 0x83, 0x3d, + 0x0d, 0x47, 0x3c, 0xbd, 0x79, 0xe3, 0xa1, 0x3c, 0x7b, 0x77, 0x17, 0xbd, 0x4d, + 0x55, 0x53, 0x3d, 0xc3, 0x91, 0x7e, 0xbd, 0x9b, 0x6b, 0x49, 0x3d, 0x30, 0xad, + 0xc7, 0xbc, 0xc1, 0x27, 0x3e, 0xbd, 0xea, 0xaf, 0x51, 0x3d, 0x12, 0x3a, 0x94, + 0xbc, 0xf1, 0x36, 0xf1, 0x3c, 0x6a, 0x5a, 0x93, 0x3b, 0x88, 0x1e, 0xb1, 0xbc, + 0x3c, 0x43, 0x37, 0xbd, 0x74, 0xda, 0x9a, 0xbd, 0x53, 0x3d, 0x7b, 0x3d, 0xe7, + 0x18, 0xdd, 0xbc, 0xba, 0x1b, 0xd9, 0xbc, 0xe8, 0x9a, 0x64, 0xbd, 0xca, 0x36, + 0x2b, 0x3d, 0xc6, 0x99, 0xbc, 0x3c, 0xa6, 0x76, 0x72, 0x3d, 0x59, 0x8a, 0xb5, + 0x3c, 0x07, 0xf8, 0xd7, 0x3d, 0xdd, 0xaf, 0x2a, 0xb8, 0x77, 0xac, 0xb7, 0x3c, + 0x53, 0xd6, 0x12, 0xbd, 0x19, 0x6c, 0x63, 0x3c, 0xe0, 0xf5, 0x32, 0xbd, 0x72, + 0xc2, 0xae, 0xbd, 0x04, 0x6b, 0x12, 0x3c, 0xea, 0x76, 0x99, 0x3d, 0x5e, 0x14, + 0x25, 0xbd, 0x16, 0x01, 0x01, 0xbc, 0x6d, 0x0e, 0xb8, 0x3d, 0x78, 0x70, 0x85, + 0x3b, 0x7b, 0xb9, 0x55, 0xbb, 0x59, 0xa4, 0x2f, 0x3d, 0xbb, 0xf1, 0x4e, 0xbc, + 0x6e, 0x1e, 0x6f, 0x3d, 0x6d, 0xd0, 0x82, 0x3d, 0xa1, 0x2a, 0x38, 0xbd, 0x82, + 0x0e, 0x81, 0x3d, 0x51, 0x1a, 0xe8, 0x3c, 0x78, 0x0f, 0xb2, 0xbc, 0xdb, 0x4a, + 0x9f, 0x3d, 0xeb, 0xf7, 0x5f, 0x3b, 0xf0, 0x3e, 0xe2, 0xbc, 0x9c, 0x11, 0x91, + 0x3c, 0xb0, 0xbd, 0x1a, 0x3c, 0xce, 0x3f, 0x1c, 0xbb, 0x0e, 0xe3, 0x0b, 0x3d, + 0x2e, 0x44, 0x15, 0x3d, 0x90, 0x12, 0xe8, 0x3c, 0x84, 0xb7, 0x46, 0x3d, 0x4f, + 0x51, 0x90, 0x3c, 0x5f, 0xee, 0xe8, 0x3c, 0x8f, 0xa8, 0xd2, 0xbb, 0x86, 0x20, + 0x7c, 0x3d, 0xe8, 0x1f, 0x48, 0xbc, 0xbb, 0x7f, 0x59, 0x3d, 0x62, 0xf1, 0x8a, + 0xbc, 0x94, 0x28, 0x0c, 0x3c, 0xdd, 0x8f, 0x1a, 0xbd, 0xad, 0x5a, 0xa8, 0x39, + 0x4d, 0x0c, 0x71, 0x3d, 0x96, 0xa2, 0x91, 0x3d, 0xe7, 0x9c, 0x69, 0xbc, 0x1f, + 0x9d, 0x0c, 0xbd, 0x6e, 0xbe, 0xe7, 0x3c, 0x97, 0x28, 0x35, 0xbd, 0x11, 0xb7, + 0x8c, 0xbd, 0x3b, 0xc0, 0xc1, 0x3c, 0x02, 0x96, 0xd7, 0x3c, 0x79, 0x02, 0x4d, + 0xbc, 0x6c, 0xad, 0xb7, 0x3c, 0x9a, 0xef, 0x29, 0x3d, 0xe9, 0x73, 0x9b, 0x3d, + 0x58, 0xd3, 0x17, 0x3d, 0xea, 0xcc, 0x2d, 0xbd, 0x64, 0x3a, 0x9e, 0xbd, 0x9a, + 0x8b, 0x3c, 0xbd, 0x4f, 0x97, 0x88, 0xbc, 0x1b, 0x18, 0x27, 0xbc, 0x22, 0xdc, + 0xde, 0xbd, 0xb4, 0xbe, 0x94, 0xba, 0x5a, 0xc7, 0xe0, 
0x3b, 0xe9, 0xd7, 0x07, + 0x3c, 0xcb, 0x47, 0xf2, 0x3c, 0x04, 0xca, 0x2f, 0x3d, 0x25, 0x4d, 0xd9, 0x3c, + 0xc1, 0xb9, 0x37, 0xbd, 0xa1, 0x9a, 0x0c, 0x3d, 0x78, 0xae, 0x88, 0xbd, 0x02, + 0xb5, 0x98, 0x3d, 0x63, 0x8b, 0x79, 0xbd, 0xab, 0xe4, 0xaa, 0x3d, 0x5a, 0x1e, + 0x02, 0xbc, 0x16, 0x17, 0x68, 0x3b, 0xf8, 0x36, 0x0d, 0x3b, 0x1f, 0x67, 0x8c, + 0xbd, 0xbc, 0x52, 0xe2, 0xbc, 0x2f, 0xee, 0xe2, 0xbb, 0x46, 0x45, 0x08, 0x3d, + 0xd2, 0xea, 0xc9, 0x3c, 0x00, 0xcc, 0x5c, 0x3d, 0x1e, 0x1f, 0x54, 0x3c, 0x10, + 0x3e, 0x8e, 0x3c, 0x1e, 0x6d, 0x5f, 0xbd, 0xfb, 0xdb, 0x64, 0x3d, 0x62, 0x27, + 0xb5, 0xbd, 0x0a, 0x8c, 0x51, 0xbd, 0x5e, 0x4d, 0xae, 0xbd, 0xd4, 0xd2, 0x65, + 0x3d, 0x88, 0xc4, 0xc0, 0x3c, 0x25, 0x97, 0xb9, 0xbb, 0x6d, 0x7c, 0x5b, 0x3d, + 0x42, 0x2f, 0x0e, 0xbb, 0x42, 0xfc, 0xb3, 0xba, 0x38, 0x1c, 0xae, 0xbc, 0x4d, + 0xba, 0x7a, 0xbd, 0x15, 0xf7, 0x9d, 0x3d, 0x51, 0xc4, 0x82, 0x3d, 0x70, 0xa9, + 0x47, 0x3d, 0x68, 0x1c, 0xdf, 0x3c, 0xef, 0x44, 0x71, 0x3c, 0xdf, 0x7d, 0x80, + 0x3d, 0x6c, 0x6c, 0xcd, 0xbc, 0x9b, 0xf2, 0x68, 0x3d, 0x61, 0x10, 0x64, 0x3d, + 0x31, 0x19, 0xda, 0x3c, 0xc3, 0x1c, 0xdc, 0xbb, 0xe1, 0x30, 0x13, 0xbc, 0x4d, + 0xd5, 0xaf, 0xbb, 0x39, 0xaa, 0x43, 0xbd, 0x9a, 0x51, 0x75, 0xbd, 0xc3, 0x2b, + 0x5e, 0x3c, 0x2f, 0x60, 0xed, 0x3c, 0x2a, 0x8e, 0x87, 0x3d, 0x0e, 0x88, 0x08, + 0xbd, 0xcb, 0x1a, 0xc2, 0x3b, 0x86, 0xdb, 0x44, 0xbd, 0x3c, 0xb2, 0xd8, 0xbc, + 0xd8, 0x5c, 0x2a, 0x3d, 0xf9, 0xb9, 0x06, 0xbd, 0xf6, 0x2f, 0x52, 0x3d, 0xda, + 0x46, 0xe9, 0x3b, 0xeb, 0x10, 0xd5, 0x3c, 0x5a, 0x5a, 0x70, 0x3b, 0x58, 0xd3, + 0x30, 0x3c, 0xb3, 0x7e, 0x00, 0xbd, 0x81, 0x37, 0x56, 0xbd, 0x0a, 0x66, 0x12, + 0xbd, 0xd7, 0xca, 0x80, 0xbd, 0x89, 0x4c, 0x52, 0x3d, 0x42, 0x49, 0xab, 0x3c, + 0x79, 0xe8, 0xa6, 0xbd, 0xa2, 0x35, 0xd5, 0xbd, 0xa3, 0x0c, 0x0e, 0xbd, 0x4f, + 0x10, 0x8a, 0x3d, 0xd4, 0xbe, 0x64, 0x3d, 0x38, 0x13, 0xfd, 0x3d, 0x86, 0xc8, + 0x82, 0xbd, 0xd2, 0x11, 0x46, 0x3d, 0xcc, 0x13, 0x6a, 0x3d, 0x29, 0x91, 0xe2, + 0xbc, 0x9a, 0x59, 0xc8, 0xbc, 0x6d, 0xd3, 0x79, 0xbd, 0x00, 0x17, 0xbd, 0x3d, + 0x2f, 0x3d, 0x13, 0xbd, 0xf2, 0x5e, 0x5a, 0x3d, 0x91, 0xd3, 0x22, 0xbc, 0x8d, + 0x7d, 0xdd, 0x3c, 0xcb, 0xd3, 0x47, 0x3d, 0x51, 0x39, 0x43, 0x3d, 0x8e, 0xba, + 0xb3, 0x3c, 0xcf, 0xdc, 0x5d, 0xbc, 0xe8, 0xf4, 0x69, 0xbd, 0x75, 0xed, 0x4a, + 0xbd, 0x3e, 0xa3, 0x52, 0x3d, 0x55, 0xbe, 0x6e, 0xbd, 0x84, 0x86, 0xb3, 0xbc, + 0x7d, 0x3b, 0x4f, 0xbd, 0xd0, 0x9c, 0x8f, 0xbb, 0xe4, 0x9f, 0x39, 0x3d, 0x10, + 0x5c, 0xf0, 0xbb, 0x64, 0x15, 0x82, 0xbc, 0x12, 0xf8, 0x45, 0x3d, 0xf6, 0xfc, + 0x40, 0x3d, 0x64, 0x01, 0x84, 0xbc, 0x4e, 0x97, 0x28, 0x3d, 0xc0, 0xb8, 0x30, + 0x3d, 0xf8, 0x94, 0x71, 0xbd, 0x59, 0x5a, 0x61, 0xbd, 0x9e, 0x55, 0x8d, 0xbd, + 0x00, 0x77, 0xfa, 0xbc, 0x9c, 0xbf, 0x17, 0x3d, 0x94, 0x7a, 0x4f, 0xbd, 0xb1, + 0xa6, 0x8f, 0xbd, 0xad, 0xc3, 0x8a, 0x3d, 0xf0, 0xca, 0x8b, 0x3c, 0x2a, 0xe4, + 0x2b, 0xbd, 0x34, 0x81, 0x44, 0xbd, 0x48, 0x55, 0x52, 0xbd, 0x2e, 0x7e, 0x63, + 0x3d, 0x3a, 0x07, 0x4e, 0x3d, 0xb0, 0xb9, 0x7a, 0x3c, 0x18, 0x7d, 0x6e, 0xbc, + 0x7a, 0x0e, 0x3c, 0xbd, 0xdc, 0x81, 0x8c, 0xbd, 0xc8, 0xa4, 0x71, 0x3c, 0xca, + 0x20, 0x28, 0x3d, 0x28, 0x36, 0xf6, 0x3c, 0x28, 0xef, 0x3c, 0x3d, 0x88, 0x83, + 0x3e, 0x3c, 0x74, 0x45, 0x34, 0x3d, 0x80, 0x11, 0x06, 0xba, 0x8c, 0xd1, 0x79, + 0xbc, 0x84, 0x71, 0x26, 0xbd, 0x98, 0x15, 0x15, 0x3c, 0x4a, 0x0e, 0x92, 0xbc, + 0x75, 0x17, 0x83, 0x3d, 0xfc, 0x9c, 0xc1, 0xbc, 0x4c, 0xe3, 0xb5, 0x3c, 0x10, + 0xc9, 0x23, 0x3c, 0xd0, 0xde, 0x1a, 0x3c, 0x22, 0x15, 0x92, 0xbd, 0xe6, 0x39, + 0x48, 0xbd, 0x16, 0x40, 0x91, 0xbd, 0x5c, 0xf1, 0xb4, 0x3c, 0x4a, 0xf7, 0xbc, + 0xbc, 
0x80, 0x48, 0x44, 0x3c, 0xc8, 0x47, 0x15, 0xbc, 0xcb, 0x39, 0x4d, 0xbd, + 0x04, 0xe1, 0xc0, 0x3c, 0x86, 0x40, 0x43, 0xbd, 0x3f, 0x39, 0x6a, 0xbd, 0x00, + 0xfd, 0x30, 0xbb, 0x18, 0x14, 0x60, 0xbc, 0xf0, 0x88, 0x12, 0x3d, 0x21, 0xf7, + 0x90, 0x3d, 0xfc, 0xcc, 0xa1, 0x3c, 0xa6, 0x1f, 0x2d, 0x3d, 0x0a, 0x14, 0x46, + 0xbd, 0x37, 0x3c, 0x5f, 0xbd, 0x32, 0x53, 0x94, 0xbc, 0x58, 0x51, 0xb1, 0xbc, + 0xd7, 0x03, 0x89, 0x3d, 0xfe, 0x03, 0x37, 0xbd, 0x9e, 0x06, 0x89, 0xbd, 0xbc, + 0xf6, 0x41, 0x3d, 0xf0, 0x87, 0x32, 0x3d, 0xdc, 0x11, 0xeb, 0xbc, 0x4a, 0x89, + 0x3b, 0x3d, 0xd2, 0xf1, 0x2b, 0x3d, 0x78, 0xcb, 0x38, 0xbc, 0x46, 0xda, 0xff, + 0xbc, 0xee, 0x9c, 0x8d, 0xbd, 0x14, 0x8e, 0xcd, 0xbc, 0x08, 0x6f, 0x05, 0x3d, + 0x00, 0xac, 0x8e, 0xbd, 0x90, 0xa2, 0x84, 0xbb, 0x9b, 0x36, 0x32, 0xbd, 0x2b, + 0x3f, 0x89, 0x3d, 0x80, 0x9a, 0x03, 0xbb, 0x06, 0xac, 0x17, 0x3d, 0xf8, 0x22, + 0x3f, 0xbd, 0x75, 0xae, 0x90, 0xbd, 0x76, 0xdd, 0x3e, 0xbd, 0x7c, 0x72, 0x92, + 0x3c, 0x4c, 0x38, 0x44, 0xbd, 0xba, 0x8f, 0x21, 0x3d, 0x00, 0x88, 0x7e, 0xbb, + 0xdc, 0xd2, 0x92, 0x3c, 0x1a, 0x45, 0x77, 0x3d, 0x54, 0xa1, 0x50, 0xbc, 0x44, + 0xea, 0x2d, 0x3d, 0x8e, 0xbd, 0x1d, 0x3d, 0x1b, 0xb9, 0x88, 0x3d, 0x20, 0xc4, + 0x8b, 0xbd, 0x43, 0x9e, 0x05, 0xbd, 0x80, 0x93, 0x4a, 0x3d, 0x02, 0xb3, 0x8a, + 0xbd, 0x40, 0x5c, 0xbb, 0x3b, 0x54, 0x22, 0x37, 0xbd, 0x04, 0xd5, 0xed, 0xbc, + 0xae, 0xce, 0x87, 0xbd, 0x0c, 0x0f, 0xe3, 0xbc, 0xc1, 0x1f, 0x48, 0xbd, 0x68, + 0x6a, 0x9a, 0x3c, 0xd0, 0x0b, 0x8f, 0x3c, 0xc8, 0x5c, 0x00, 0x3d, 0x60, 0xf9, + 0xd5, 0xbb, 0x57, 0x9a, 0x88, 0xbd, 0xf2, 0x1a, 0x8d, 0xbd, 0x52, 0x69, 0x63, + 0x3d, 0xb8, 0x69, 0x89, 0x3c, 0x56, 0xfb, 0x0a, 0x3d, 0x00, 0xc3, 0x10, 0xba, + 0x0e, 0xcd, 0x56, 0xbd, 0x1a, 0xf7, 0x61, 0x3d, 0xf8, 0x95, 0x8b, 0xbd, 0x3c, + 0x34, 0x14, 0xbd, 0xed, 0xc6, 0x8f, 0x3d, 0xee, 0xc2, 0x1c, 0x3d, 0xa0, 0x9d, + 0x04, 0xbb, 0xfd, 0x06, 0x56, 0xbd, 0xa0, 0xe7, 0x12, 0x3b, 0xae, 0x01, 0xbd, + 0xbc, 0xb0, 0x52, 0x16, 0x3d, 0x00, 0x9e, 0x97, 0xba, 0x40, 0xaf, 0x58, 0x3d, + 0xa4, 0x80, 0x97, 0x3c, 0xa0, 0x07, 0x22, 0x3b, 0x59, 0x3b, 0x01, 0xbd, 0x83, + 0x64, 0x87, 0x3d, 0x0e, 0xfd, 0x96, 0xbc, 0x3a, 0xf8, 0x7b, 0xbd, 0x7d, 0x61, + 0x0a, 0xbd, 0xe2, 0x4c, 0x58, 0xbd, 0xc0, 0x1b, 0x81, 0xbb, 0x70, 0x48, 0x0b, + 0x3d, 0x5a, 0x4c, 0x94, 0xbc, 0x6a, 0x49, 0x5b, 0x3d, 0x58, 0x79, 0x7a, 0x3c, + 0x54, 0xe4, 0x10, 0xbd, 0x0f, 0x05, 0x8c, 0x3d, 0x00, 0x70, 0xb3, 0xba, 0xfe, + 0x52, 0xec, 0xbc, 0x80, 0x87, 0xe5, 0x3b, 0x76, 0x35, 0x7f, 0x3d, 0x20, 0x23, + 0x36, 0x3b, 0x48, 0xe0, 0x16, 0x3d, 0x0e, 0xdb, 0x53, 0x3d, 0x76, 0x7d, 0xcb, + 0xbc, 0x79, 0xf8, 0x5c, 0xbd, 0x8a, 0x7c, 0x39, 0x3d, 0x8c, 0x87, 0x1d, 0x3d, + 0x3a, 0x32, 0x08, 0xbd, 0x54, 0xa9, 0x6a, 0xbc, 0x22, 0xad, 0xad, 0xbc, 0xd2, + 0x4b, 0x68, 0x3d, 0x86, 0x89, 0xee, 0xbc, 0x42, 0xee, 0x7d, 0x3d, 0x56, 0x9e, + 0x46, 0x3d, 0x58, 0xcd, 0xd0, 0x3c, 0xb4, 0x6d, 0x9f, 0x3c, 0x0c, 0x5b, 0x20, + 0xbd, 0x40, 0xe8, 0x2c, 0x3b, 0x23, 0xd1, 0x80, 0x3d, 0xee, 0x0f, 0xc8, 0xbc, + 0x1c, 0x52, 0xd5, 0x3c, 0x68, 0x8d, 0x63, 0xbc, 0x9c, 0xb3, 0x37, 0xbd, 0x0c, + 0x04, 0xde, 0x3c, 0x50, 0x20, 0x93, 0x3b, 0xac, 0xef, 0xf6, 0x3c, 0xac, 0x6e, + 0x93, 0xbc, 0x92, 0x06, 0x64, 0x3d, 0x28, 0xdd, 0x74, 0x3c, 0xf7, 0x67, 0x86, + 0x3d, 0x2c, 0x86, 0x43, 0x3d, 0x30, 0x55, 0x89, 0xbd, 0xa0, 0xf0, 0xd7, 0xbb, + 0xe4, 0x7f, 0x05, 0x3d, 0x18, 0xf7, 0x3f, 0x3c, 0x46, 0xaf, 0xcb, 0xbc, 0x80, + 0xf0, 0xb3, 0x3b, 0xdc, 0xe9, 0x81, 0x3c, 0xef, 0x3f, 0x5c, 0xbd, 0xfe, 0xb8, + 0xa1, 0xbc, 0x90, 0x44, 0x41, 0x3c, 0x4e, 0xc8, 0x30, 0xbd, 0x63, 0x6e, 0x72, + 0xbd, 0xbc, 0x52, 0xbf, 0xbc, 0x7c, 
0x04, 0x47, 0xbd, 0x4c, 0xe3, 0x4e, 0xbd, + 0x34, 0x8b, 0x36, 0x3d, 0xd1, 0xf2, 0x33, 0xbd, 0x16, 0x48, 0x09, 0x3d, 0x8c, + 0x31, 0x00, 0xbd, 0xd9, 0x91, 0x8e, 0xbd, 0xf2, 0x8d, 0x64, 0xbd, 0x48, 0x20, + 0xbf, 0xbc, 0x60, 0x89, 0x53, 0x3b, 0x00, 0x96, 0x71, 0x3a, 0x44, 0x6e, 0x8c, + 0xbd, 0x90, 0x6b, 0x7d, 0xbd, 0x64, 0x71, 0xa6, 0x3c, 0x52, 0x23, 0x70, 0x3d, + 0xf3, 0x05, 0x80, 0x3d, 0xb4, 0xe2, 0x68, 0xbd, 0x20, 0x6f, 0xf9, 0x3b, 0x60, + 0x31, 0x2c, 0x3d, 0x30, 0x78, 0x4b, 0xbd, 0xd8, 0xae, 0x23, 0xbc, 0x40, 0xea, + 0xc5, 0x3a, 0xd0, 0xe7, 0x86, 0xbd, 0xa0, 0x57, 0x47, 0x3d, 0x70, 0x78, 0xab, + 0x3b, 0x1c, 0xab, 0xb1, 0xbc, 0x2a, 0x75, 0x5d, 0xbd, 0xd0, 0xd1, 0x26, 0xbd, + 0x90, 0x93, 0x3a, 0xbd, 0xb4, 0x8a, 0xe9, 0xbc, 0xac, 0xf1, 0xa5, 0xbc, 0x10, + 0xa3, 0xa7, 0xbb, 0x02, 0xb2, 0x73, 0xbd, 0x2e, 0x27, 0xb7, 0xbc, 0xd0, 0x0c, + 0x92, 0xbd, 0x0e, 0x8e, 0x77, 0x3d, 0x5a, 0x78, 0x0a, 0x3d, 0xf4, 0xa9, 0xc5, + 0x3c, 0x82, 0x8a, 0x15, 0x3d, 0x3d, 0x25, 0x13, 0xbd, 0x7e, 0x35, 0x12, 0xbd, + 0x2a, 0xd2, 0x6e, 0x3d, 0x78, 0x60, 0xcb, 0xbc, 0x70, 0x92, 0x81, 0xbd, 0xca, + 0x3f, 0x2f, 0xbd, 0x3b, 0x71, 0x67, 0xbd, 0x80, 0x79, 0x83, 0xba, 0xc6, 0x2a, + 0x47, 0x3d, 0x86, 0x99, 0x72, 0x3d, 0x6c, 0x59, 0x8f, 0x3c, 0x73, 0x59, 0x14, + 0xbd, 0x23, 0x83, 0x82, 0x3d, 0x94, 0x4d, 0x8b, 0xbd, 0x9c, 0x05, 0x2f, 0xbd, + 0x60, 0xae, 0x57, 0x3d, 0x95, 0x1c, 0x86, 0x3d, 0x26, 0xaf, 0x78, 0x3d, 0x47, + 0x4b, 0x4e, 0xbd, 0x96, 0xfd, 0x75, 0x3d, 0xb2, 0x63, 0x35, 0x3d, 0xc0, 0x00, + 0xa3, 0x3b, 0x12, 0x16, 0x3d, 0x3d, 0x8e, 0xd2, 0x56, 0xbd, 0x02, 0xff, 0xec, + 0xbc, 0x96, 0x20, 0xcc, 0xbc, 0xf4, 0x61, 0x0b, 0x3d, 0x20, 0x12, 0x58, 0x3b, + 0x5a, 0xa3, 0x4c, 0x3d, 0x80, 0x86, 0x64, 0x3b, 0x0e, 0x77, 0x70, 0x3d, 0xd0, + 0x7b, 0xe8, 0xbb, 0x92, 0x2d, 0x20, 0xbd, 0xc8, 0x33, 0x6f, 0xbc, 0xf8, 0x0f, + 0x76, 0x3c, 0x3a, 0xea, 0x36, 0x3d, 0xc0, 0x6c, 0x47, 0x3b, 0x00, 0x3b, 0x98, + 0xbc, 0x88, 0x52, 0x3b, 0x3c, 0xa8, 0x58, 0x54, 0x3c, 0x5a, 0xff, 0x4f, 0x3d, + 0xfe, 0x26, 0x5e, 0x3d, 0x7c, 0x39, 0x8e, 0xbc, 0x96, 0x37, 0x75, 0x3d, 0xbd, + 0x95, 0x86, 0xbd, 0x6b, 0x40, 0x91, 0x3d, 0x40, 0x14, 0x3a, 0xbb, 0xf0, 0xe0, + 0x0f, 0xbc, 0xeb, 0x23, 0x82, 0x3d, 0xe0, 0x7c, 0x8e, 0x3b, 0x60, 0x71, 0x11, + 0xbc, 0x3e, 0x89, 0x2c, 0xbd, 0x9a, 0x0a, 0x7f, 0xbd, 0xe8, 0x86, 0xcd, 0x3c, + 0xd4, 0x1d, 0xfe, 0x3c, 0xc6, 0x1f, 0x63, 0x3d, 0xe8, 0x6a, 0x2d, 0x3c, 0xec, + 0xb5, 0x02, 0x3d, 0x78, 0xcb, 0xe0, 0xbc, 0x74, 0x19, 0x64, 0xbc, 0xf0, 0xf7, + 0x69, 0xbc, 0x11, 0x97, 0x92, 0xbd, 0xe2, 0x89, 0x8b, 0xbd, 0x36, 0xe1, 0xa2, + 0xbc, 0x38, 0x7d, 0xb2, 0xbc, 0xf4, 0x26, 0x16, 0x3d, 0x70, 0x40, 0x90, 0xbd, + 0xe0, 0x0a, 0x70, 0x3c, 0x86, 0xb8, 0x35, 0x3d, 0x67, 0xd7, 0x8d, 0x3d, 0xd0, + 0xdc, 0x17, 0xbc, 0x10, 0xf7, 0xcd, 0xbb, 0xfe, 0x64, 0x59, 0x3d, 0x34, 0xf3, + 0x3c, 0xbd, 0x40, 0xfe, 0xae, 0xba, 0xd1, 0x87, 0x85, 0x3d, 0x10, 0x58, 0x65, + 0xbd, 0x66, 0xaf, 0x5d, 0xbd, 0x42, 0x56, 0x5d, 0x3d, 0x7c, 0xce, 0x5f, 0xbd, + 0xc0, 0x38, 0x96, 0x3a, 0x33, 0x59, 0x90, 0x3d, 0x06, 0x1a, 0xa6, 0xbc, 0xd4, + 0xb0, 0x83, 0x3c, 0xa8, 0xf4, 0x07, 0x3c, 0xa5, 0x8f, 0x90, 0x3d, 0x36, 0xd8, + 0xc0, 0xbc, 0xf0, 0xf5, 0x31, 0x3d, 0x30, 0x56, 0x88, 0xbd, 0x3c, 0x96, 0x05, + 0xbd, 0x89, 0xc2, 0x89, 0x3d, 0x19, 0x10, 0x06, 0xbd, 0xa2, 0xaa, 0x63, 0x3d, + 0x5e, 0x9b, 0x76, 0xbd, 0xa5, 0x57, 0x8c, 0x3d, 0x48, 0xe9, 0x2a, 0x3c, 0xe0, + 0xd9, 0x3a, 0x3b, 0xd3, 0x1c, 0x7f, 0xbd, 0x8c, 0x60, 0x21, 0xbc, 0x38, 0xc1, + 0x67, 0xbc, 0xf0, 0x83, 0x62, 0x3c, 0x58, 0xcb, 0x3f, 0x3d, 0xc7, 0xd9, 0x83, + 0x3d, 0x3e, 0xf5, 0x90, 0xbd, 0xeb, 0xb8, 0x8b, 0xbd, 0x0a, 0x86, 
0x05, 0x3d, + 0x61, 0xb6, 0x39, 0xbd, 0x56, 0x8f, 0x04, 0x3d, 0x19, 0xbd, 0x33, 0xbd, 0x24, + 0xd1, 0x50, 0x3d, 0xd0, 0x14, 0xf8, 0x3c, 0x2c, 0x43, 0x49, 0x3d, 0x98, 0xa1, + 0x53, 0xbc, 0xc2, 0x43, 0x26, 0x3d, 0x8e, 0xed, 0xff, 0xbc, 0xb7, 0x58, 0x75, + 0xbd, 0x00, 0xb7, 0x85, 0x3a, 0x8c, 0xb1, 0x83, 0xbc, 0x08, 0x40, 0x92, 0xbd, + 0x35, 0x28, 0x08, 0xbd, 0x30, 0x4f, 0x84, 0x3c, 0x34, 0x0b, 0x22, 0xbc, 0x30, + 0x1a, 0x07, 0x3c, 0xaa, 0xd6, 0x87, 0xbd, 0xa2, 0xfd, 0x7d, 0xbd, 0xfe, 0xa0, + 0xb7, 0xbc, 0xa2, 0x0a, 0x33, 0x3d, 0x10, 0x60, 0xe4, 0xbb, 0x64, 0x49, 0x10, + 0xbd, 0xf4, 0xd0, 0x48, 0xbc, 0x12, 0x7a, 0x38, 0x3d, 0x28, 0xb9, 0xee, 0xbc, + 0x05, 0xbe, 0x50, 0xbd, 0xce, 0x2f, 0xd5, 0xbc, 0x04, 0x8f, 0x39, 0xbd, 0xa8, + 0x16, 0x0c, 0xbd, 0x64, 0xe1, 0x79, 0xbc, 0xd4, 0x20, 0x8c, 0x3c, 0x28, 0x73, + 0x1c, 0x3d, 0x20, 0x66, 0x97, 0x3c, 0x66, 0x6e, 0xc1, 0xbc, 0x6d, 0xfc, 0x91, + 0xbd, 0xc5, 0x79, 0x89, 0xbd, 0xd0, 0x3c, 0x90, 0x3c, 0xfc, 0x19, 0x55, 0xbd, + 0x72, 0x96, 0x80, 0xbd, 0x80, 0x81, 0x46, 0x3d, 0xea, 0x10, 0x30, 0x3d, 0x00, + 0xdc, 0xe2, 0x3b, 0x44, 0x30, 0x78, 0xbc, 0x3a, 0x5b, 0x39, 0x3d, 0x00, 0x8d, + 0x8c, 0xbb, 0x70, 0x9f, 0x3b, 0xbc, 0x1c, 0xa9, 0x5c, 0xbc, 0x04, 0xa9, 0xe4, + 0xbc, 0x3a, 0xd9, 0x39, 0x3d, 0xa0, 0x11, 0xfd, 0x3c, 0x76, 0x3b, 0xf9, 0xbc, + 0xb9, 0xdd, 0x6f, 0xbd, 0xf5, 0xcb, 0x91, 0xbd, 0xee, 0x45, 0x5d, 0xbd, 0x13, + 0x1c, 0x8d, 0xbd, 0x10, 0xb7, 0xb6, 0x3b, 0x60, 0xc8, 0x77, 0x3b, 0x70, 0x4d, + 0xbf, 0xbb, 0x38, 0x4f, 0x80, 0xbd, 0xa9, 0x6b, 0x92, 0xbd, 0x78, 0x8e, 0x7e, + 0x3c, 0x70, 0xd1, 0x6e, 0x3c, 0x79, 0x4c, 0x85, 0xbd, 0xcc, 0xac, 0x2b, 0x3d, + 0x49, 0x46, 0x5f, 0xbd, 0x68, 0x60, 0x6d, 0xbc, 0x50, 0x53, 0xe4, 0x3b, 0x35, + 0x39, 0x81, 0x3d, 0xf0, 0x01, 0x12, 0x3c, 0x4c, 0x27, 0x8b, 0xbd, 0xce, 0x8d, + 0x71, 0x3d, 0xcc, 0x9a, 0x8e, 0xbd, 0x9e, 0x6f, 0xcd, 0xbc, 0xea, 0x23, 0x19, + 0x3d, 0xac, 0xed, 0x95, 0x3c, 0x76, 0x32, 0x68, 0x3d, 0x08, 0xcc, 0x58, 0x3c, + 0xc8, 0xe2, 0xcc, 0x3c, 0xf1, 0x85, 0x81, 0x3d, 0x06, 0xdc, 0x6b, 0x3d, 0x16, + 0x15, 0xf0, 0xbc, 0xda, 0x56, 0x4e, 0x3d, 0x58, 0x5c, 0x90, 0xbc, 0xe4, 0x79, + 0x37, 0xbd, 0x40, 0x1b, 0x6a, 0xbd, 0x00, 0x4e, 0x63, 0x3b, 0xbc, 0xfc, 0x35, + 0x3d, 0xe6, 0x87, 0xf9, 0xbc, 0xb0, 0xfc, 0x0c, 0x3d, 0x96, 0x7f, 0x53, 0xbd, + 0x1e, 0xe1, 0x04, 0x3d, 0x10, 0x11, 0x87, 0x3c, 0xce, 0xd1, 0x42, 0x3d, 0x1c, + 0x27, 0xca, 0xbc, 0xd8, 0x71, 0xfa, 0x3c, 0xea, 0xce, 0x76, 0x3d, 0x2c, 0x0e, + 0xbc, 0x3c, 0x9b, 0x96, 0x48, 0xbd, 0x60, 0x7b, 0x93, 0xbb, 0x8a, 0x69, 0xa8, + 0xbc, 0xc0, 0xcd, 0x79, 0x3c, 0xd0, 0xe0, 0x87, 0xbd, 0xe6, 0x91, 0x53, 0xbd, + 0x96, 0xe0, 0x03, 0x3d, 0x8b, 0x7a, 0x81, 0xbd, 0x16, 0x64, 0x80, 0xbd, 0x84, + 0xac, 0x87, 0x3c, 0xf8, 0xb7, 0xfc, 0xbc, 0x63, 0x2a, 0x38, 0xbd, 0x5a, 0x71, + 0x35, 0xbd, 0xda, 0xff, 0x49, 0xbd, 0x50, 0xcd, 0xdb, 0xbb, 0xc0, 0x85, 0x37, + 0xbb, 0x2a, 0x21, 0x35, 0x3d, 0xb6, 0x59, 0xcc, 0xbc, 0x10, 0x02, 0xe7, 0x3b, + 0x78, 0xf5, 0x54, 0xbc, 0xb0, 0x3c, 0x58, 0x3c, 0xf4, 0x96, 0x59, 0x3d, 0x10, + 0xd7, 0xd2, 0xbb, 0x1a, 0x0c, 0x79, 0x3d, 0x48, 0x2c, 0x6b, 0x3c, 0xc0, 0x44, + 0x89, 0xbb, 0x5c, 0xf0, 0xa3, 0x3c, 0xd0, 0x1c, 0x07, 0x3d, 0x02, 0xcd, 0x94, + 0xbc, 0xa8, 0x51, 0x99, 0xbc, 0xc0, 0xb9, 0x40, 0x3c, 0xe0, 0x85, 0x86, 0x3c, + 0x74, 0x77, 0x9f, 0x3c, 0x15, 0xe0, 0x71, 0xbd, 0x00, 0xf1, 0xfc, 0xb9, 0x50, + 0x39, 0x11, 0x3c, 0xb7, 0x13, 0x81, 0x3d, 0x60, 0x31, 0xe5, 0x3c, 0x8c, 0x42, + 0xf6, 0xbc, 0x4c, 0x34, 0x8a, 0xbc, 0xb8, 0x26, 0xe6, 0x3c, 0xf4, 0x56, 0x69, + 0xbc, 0xcc, 0xb4, 0xa1, 0x3c, 0xf0, 0x8e, 0x48, 0xbd, 0xcb, 0xab, 0x91, 0xbd, + 0x00, 0xc4, 0x5e, 
0xbb, 0xdd, 0xf5, 0x8c, 0x3d, 0xc8, 0x1a, 0x8a, 0x3c, 0x1c, + 0x9c, 0xda, 0xbc, 0x89, 0x6e, 0x83, 0x3d, 0x00, 0x6e, 0x3c, 0x39, 0x80, 0x82, + 0xd0, 0x3a, 0x00, 0x09, 0xc2, 0xb9, 0x04, 0x06, 0x38, 0xbc, 0x0a, 0x7a, 0xf7, + 0xbc, 0x50, 0xac, 0x1d, 0x3c, 0x9e, 0xd8, 0xfa, 0xbc, 0xea, 0xed, 0x71, 0xbd, + 0x7f, 0xf6, 0x0a, 0xbd, 0x20, 0x2d, 0x30, 0x3b, 0xd0, 0x7c, 0x96, 0x3b, 0x2e, + 0x61, 0x3f, 0x3d, 0xb0, 0x0a, 0x2d, 0x3d, 0x80, 0xac, 0x47, 0xbb, 0x7a, 0x9e, + 0xe6, 0xbc, 0x50, 0x90, 0x44, 0x3c, 0x0d, 0x23, 0x8e, 0xbd, 0x00, 0x3a, 0x59, + 0x3a, 0x12, 0xa5, 0x52, 0xbd, 0xbc, 0x90, 0xac, 0x3c, 0x00, 0x77, 0xe1, 0x3a, + 0x83, 0x27, 0x8a, 0xbd, 0x40, 0xcd, 0xb0, 0xbc, 0x6a, 0xf8, 0x22, 0x3d, 0xc0, + 0xfe, 0xc8, 0xbb, 0x52, 0x28, 0x63, 0x3d, 0xb2, 0xd2, 0xbe, 0xbc, 0x80, 0x68, + 0x42, 0xbc, 0xa4, 0x31, 0x58, 0xbc, 0xae, 0xda, 0x3a, 0xbd, 0xcb, 0xd7, 0x80, + 0xbd, 0x32, 0x43, 0x60, 0x3d, 0x52, 0xc1, 0xa9, 0xbc, 0x18, 0x3a, 0x2d, 0x3c, + 0x8e, 0x17, 0x5f, 0xbd, 0x9d, 0xcc, 0x85, 0x3d, 0x5c, 0x7c, 0x12, 0x3d, 0xde, + 0x24, 0x78, 0x3d, 0xec, 0xba, 0x16, 0x3d, 0xd1, 0xb1, 0x3d, 0xbd, 0xf0, 0x7f, + 0xe3, 0x3c, 0xe0, 0xf7, 0xef, 0xbb, 0x28, 0x65, 0x18, 0xbd, 0x7a, 0x38, 0x48, + 0x3d, 0xad, 0xff, 0x81, 0xbd, 0x72, 0xe6, 0x69, 0x3d, 0x98, 0x35, 0x08, 0xbd, + 0x16, 0xb5, 0x3a, 0xbd, 0x26, 0x18, 0x52, 0xbd, 0xc4, 0xb5, 0xc9, 0x3c, 0xbc, + 0xcc, 0x93, 0x3c, 0x6e, 0x74, 0xc9, 0xbc, 0xae, 0x05, 0x14, 0x3d, 0x96, 0x6c, + 0x78, 0x3d, 0x48, 0xe7, 0x7a, 0xbc, 0xe2, 0x8b, 0x65, 0xbd, 0xda, 0x9c, 0x97, + 0xbc, 0xbc, 0xc8, 0xab, 0x3c, 0xf0, 0xb1, 0x5f, 0xbd, 0xbe, 0x43, 0x3d, 0x3d, + 0xf8, 0xc7, 0x81, 0xbd, 0xd0, 0xc7, 0xcd, 0x3c, 0xfe, 0x77, 0x72, 0xbd, 0x32, + 0x3c, 0x7c, 0x3d, 0xfa, 0x2e, 0x84, 0xbc, 0x4c, 0xbc, 0x04, 0x3d, 0xc6, 0x29, + 0x8f, 0xbd, 0x4c, 0x07, 0xb8, 0x3c, 0x51, 0xb8, 0x45, 0xbd, 0x4c, 0x84, 0x7b, + 0xbd, 0x8e, 0x26, 0x3e, 0xbd, 0x48, 0xcc, 0x96, 0xbc, 0xb0, 0x59, 0x32, 0x3d, + 0xd6, 0x47, 0xba, 0xbc, 0xf9, 0x32, 0x81, 0x3d, 0xb0, 0xb8, 0x88, 0xbb, 0x80, + 0x93, 0xfd, 0x3a, 0x4a, 0x8d, 0x39, 0x3d, 0x88, 0x34, 0xa1, 0x3c, 0x20, 0x3b, + 0x53, 0x3b, 0x10, 0x26, 0x35, 0x3d, 0x50, 0xab, 0x77, 0xbc, 0x89, 0x68, 0x69, + 0xbd, 0x56, 0xd0, 0x15, 0x3d, 0x56, 0x3f, 0x3e, 0xbd, 0xa0, 0x94, 0xb5, 0x3c, + 0xa9, 0x10, 0x90, 0xbd, 0xfa, 0xe9, 0x48, 0xbd, 0x66, 0x62, 0x6a, 0x3d, 0xdc, + 0x51, 0xb0, 0x3c, 0x20, 0x13, 0x4d, 0xbd, 0x40, 0xbf, 0xe5, 0xba, 0x50, 0x61, + 0x9e, 0x3b, 0xa0, 0xbd, 0xeb, 0xbc, 0xd9, 0x55, 0x48, 0xbd, 0x4c, 0xbf, 0x0e, + 0xbd, 0x80, 0x28, 0x20, 0x3b, 0xea, 0x77, 0x72, 0x3d, 0x08, 0xd6, 0x02, 0x3d, + 0x7b, 0x14, 0x42, 0xbd, 0x8c, 0x7f, 0x91, 0x3c, 0x82, 0xe4, 0x16, 0xbd, 0x30, + 0x61, 0xaf, 0x3c, 0xd2, 0x5c, 0x5a, 0xbd, 0xc0, 0x16, 0x69, 0x3b, 0xe9, 0x5b, + 0x84, 0x3d, 0x49, 0xc3, 0x7e, 0xbd, 0x90, 0x7f, 0xf7, 0x3c, 0x3e, 0xd5, 0x85, + 0xbd, 0x38, 0xb7, 0x43, 0x3c, 0x4e, 0x4d, 0xc0, 0xbc, 0x00, 0x78, 0xea, 0x3a, + 0x32, 0xb2, 0x92, 0xbd, 0xb0, 0xc3, 0x1d, 0x3c, 0x90, 0xc2, 0x23, 0x3c, 0x80, + 0x14, 0xc5, 0x3b, 0x00, 0xf1, 0x87, 0xbc, 0x26, 0xf4, 0x8a, 0xbd, 0x10, 0xa6, + 0x9a, 0x3b, 0x78, 0x8b, 0x72, 0xbd, 0x85, 0xef, 0x12, 0xbd, 0xd8, 0x93, 0x02, + 0x3d, 0x80, 0x8b, 0xca, 0x3a, 0x18, 0x72, 0x17, 0xbc, 0x65, 0x2d, 0x83, 0x3d, + 0xfb, 0xe9, 0x81, 0x3d, 0x60, 0xf3, 0x46, 0xbd, 0xb4, 0xab, 0x1a, 0xbc, 0x30, + 0x0c, 0xf9, 0x3c, 0xb6, 0xc5, 0x63, 0xbd, 0x8e, 0x20, 0xdd, 0xbc, 0x5c, 0x18, + 0x97, 0xbc, 0x10, 0x42, 0x43, 0x3d, 0x11, 0xab, 0x84, 0x3d, 0xec, 0xcf, 0x30, + 0x3d, 0x38, 0x0e, 0x6a, 0x3c, 0x3e, 0x40, 0xd9, 0xbc, 0xce, 0x14, 0x14, 0x3d, + 0x5c, 0xe6, 0x71, 0xbc, 0xf8, 0xd8, 0xf2, 0x3c, 
0x98, 0x96, 0x21, 0xbc, 0xbe, + 0xdb, 0x18, 0xbd, 0xe6, 0x7f, 0x28, 0xbd, 0xab, 0x56, 0x23, 0xbd, 0xc2, 0x40, + 0x8e, 0xbd, 0x8c, 0x92, 0xc3, 0x3c, 0xd4, 0x0a, 0x13, 0xbd, 0xbe, 0x25, 0x05, + 0x3d, 0x12, 0x58, 0x0d, 0x3d, 0xd7, 0x65, 0x79, 0xbd, 0x9c, 0x54, 0x4e, 0x3d, + 0x02, 0x2a, 0x40, 0x3d, 0xef, 0xcd, 0x01, 0xbd, 0x11, 0x5c, 0x92, 0x3d, 0xb0, + 0x03, 0x95, 0x3c, 0xa0, 0x08, 0x19, 0x3b, 0x79, 0xad, 0x8c, 0x3d, 0x19, 0x93, + 0x7a, 0xbd, 0x40, 0xfa, 0xc6, 0xbb, 0x68, 0xb6, 0xa8, 0x3c, 0x45, 0x29, 0x8d, + 0xbd, 0x90, 0x3e, 0x13, 0xbc, 0x1a, 0x2d, 0x70, 0x3d, 0xc1, 0xdd, 0x6a, 0xbd, + 0x50, 0x75, 0x01, 0xbd, 0xc1, 0x8d, 0x91, 0xbd, 0xdd, 0x3f, 0x84, 0xbd, 0xa3, + 0xc6, 0x8d, 0x3d, 0xce, 0x23, 0x5b, 0x3d, 0x7e, 0xfb, 0x7d, 0x3d, 0xd5, 0xf4, + 0x23, 0xbd, 0x4c, 0x65, 0x8d, 0xbc, 0xb0, 0x76, 0x89, 0xbd, 0x28, 0xc4, 0x82, + 0xbd, 0x40, 0x70, 0x71, 0x3b, 0xfa, 0x55, 0x8e, 0xbc, 0x40, 0x08, 0xf0, 0x3a, + 0x02, 0x81, 0x56, 0x3d, 0xfe, 0x51, 0xf8, 0xbc, 0x1a, 0xcd, 0x91, 0xbd, 0xfb, + 0x66, 0x7b, 0xbd, 0xb0, 0xbb, 0xf2, 0xbc, 0xbb, 0x24, 0x23, 0xbd, 0x5c, 0x6c, + 0x6d, 0xbd, 0x08, 0xa0, 0x8b, 0x3c, 0xb7, 0x93, 0x1d, 0xbd, 0x74, 0x9f, 0x21, + 0x3d, 0x1c, 0x43, 0x33, 0xbd, 0x66, 0x2c, 0x1c, 0xbd, 0xfe, 0xf5, 0x11, 0xbd, + 0x10, 0x32, 0xef, 0xbc, 0x40, 0x70, 0x6f, 0xbb, 0xa1, 0xca, 0x8f, 0x3d, 0x12, + 0x42, 0x13, 0x3d, 0x38, 0x2e, 0xf3, 0x3c, 0x16, 0x69, 0x77, 0x3d, 0x6d, 0xa9, + 0x1e, 0xbd, 0xdc, 0xf5, 0xba, 0xbc, 0xc4, 0xe8, 0x1f, 0xbd, 0xfc, 0xc7, 0x08, + 0x3d, 0x8c, 0x9a, 0x28, 0x3d, 0x80, 0xbb, 0x14, 0x3b, 0xce, 0x47, 0x68, 0x3d, + 0xd3, 0x75, 0x10, 0xbd, 0x30, 0x9e, 0xb1, 0x3b, 0x48, 0x08, 0x80, 0x3c, 0x53, + 0xbe, 0x7e, 0xbd, 0x54, 0xdd, 0x5c, 0xbd, 0x89, 0x15, 0x77, 0xbd, 0x20, 0x13, + 0x00, 0x3b, 0xab, 0x6a, 0x15, 0xbd, 0x70, 0x62, 0x0b, 0xbc, 0xb6, 0x69, 0x44, + 0x3d, 0x9e, 0x71, 0x44, 0x3d, 0xfb, 0x84, 0x1e, 0xbd, 0xc8, 0x25, 0x3e, 0xbc, + 0xa8, 0x9e, 0xa6, 0x3c, 0xa0, 0x0c, 0x0b, 0x3d, 0x48, 0xe7, 0xb1, 0xbc, 0x2f, + 0xfc, 0x8a, 0x3d, 0xbc, 0x2a, 0x27, 0xbc, 0x80, 0x69, 0x38, 0x3c, 0xa0, 0x89, + 0xb4, 0xbb, 0x10, 0xb6, 0x56, 0xbc, 0x80, 0xaa, 0x37, 0x3b, 0xbd, 0x66, 0x1d, + 0xbd, 0xb9, 0x3e, 0x6c, 0xbd, 0x14, 0xc1, 0x1e, 0x3d, 0x10, 0xd3, 0xa5, 0x3b, + 0x1c, 0x9a, 0x43, 0xbc, 0xa0, 0xb3, 0xdd, 0xbc, 0xf8, 0x82, 0xb8, 0x3c, 0xc8, + 0x76, 0x1b, 0x3d, 0x7e, 0x2b, 0x5c, 0x3d, 0x20, 0xd8, 0x7f, 0xbd, 0x88, 0xe0, + 0xa0, 0x3c, 0x1c, 0x48, 0x26, 0x3d, 0x50, 0x53, 0x1e, 0x3c, 0xf0, 0x07, 0x54, + 0x3c, 0xc9, 0xde, 0x05, 0xbd, 0x2c, 0x34, 0x84, 0x3c, 0xa8, 0x30, 0x1b, 0x3c, + 0x6c, 0xa1, 0x3c, 0xbd, 0x00, 0x58, 0xc1, 0xb8, 0xf0, 0xd4, 0xf9, 0x3b, 0xf0, + 0xb3, 0x2e, 0x3d, 0x14, 0xe3, 0x4f, 0x3d, 0x70, 0x0b, 0x73, 0x3c, 0x8b, 0xca, + 0x89, 0xbd, 0x9c, 0xd8, 0x85, 0x3c, 0x9c, 0x34, 0x4b, 0xbc, 0xf5, 0x38, 0x71, + 0xbd, 0x01, 0xe5, 0x84, 0x3d, 0xd4, 0xde, 0x25, 0xbc, 0x80, 0xc0, 0xb1, 0xbb, + 0x80, 0xca, 0xfc, 0x3b, 0x78, 0xe0, 0x2d, 0xbd, 0xda, 0x90, 0x29, 0xbd, 0x3a, + 0xdb, 0x37, 0xbd, 0x00, 0x81, 0xa1, 0xbb, 0x3a, 0xcb, 0x71, 0xbd, 0x1c, 0x8e, + 0x29, 0xbc, 0x68, 0x0a, 0x5f, 0xbc, 0x0f, 0x86, 0x91, 0xbd, 0x98, 0x61, 0x62, + 0x3c, 0x82, 0x06, 0x4e, 0xbd, 0xa0, 0x7a, 0x35, 0x3b, 0xfa, 0xbc, 0x31, 0x3d, + 0xee, 0x18, 0x3a, 0x3d, 0xe0, 0xf0, 0x9d, 0xbb, 0x87, 0xba, 0x8f, 0x3d, 0x0e, + 0x75, 0x24, 0x3d, 0x92, 0xf6, 0x77, 0x3d, 0x78, 0xda, 0x72, 0xbc, 0xe4, 0x5c, + 0x55, 0xbc, 0xe3, 0xbf, 0x87, 0x3d, 0x74, 0x55, 0x5c, 0xbd, 0x88, 0x2b, 0x0b, + 0xbc, 0x68, 0xd5, 0x21, 0x3d, 0x0a, 0x05, 0x94, 0xbc, 0x5f, 0xb7, 0x8a, 0x3d, + 0x48, 0x83, 0x5c, 0x3c, 0x08, 0x83, 0x77, 0xbc, 0xc4, 0x31, 0xd6, 0x3c, 0xb8, + 
0x48, 0x52, 0x3c, 0x00, 0xcb, 0xda, 0x3b, 0x32, 0x6a, 0x5f, 0xbd, 0x76, 0x7f, + 0x8f, 0xbd, 0xc0, 0xb7, 0xb2, 0x3c, 0x91, 0x5e, 0x1d, 0xbd, 0x92, 0x5d, 0x62, + 0x3d, 0x9c, 0x2b, 0x65, 0xbd, 0x3e, 0xe5, 0x2a, 0x3d, 0x29, 0xb7, 0x81, 0xbd, + 0x74, 0xa2, 0xda, 0x3c, 0x1a, 0xcb, 0x15, 0x3d, 0x56, 0x35, 0x60, 0x3d, 0x50, + 0x4a, 0x4f, 0xbc, 0xb2, 0x3c, 0x73, 0x3d, 0x88, 0x39, 0x71, 0xbd, 0xa0, 0x73, + 0x7d, 0xbd, 0x18, 0x14, 0xac, 0x3c, 0xa8, 0x1a, 0x57, 0x3d, 0x00, 0x3a, 0x77, + 0xbc, 0x2a, 0xd5, 0x93, 0xbc, 0x7e, 0x27, 0x41, 0x3d, 0xa0, 0x96, 0x19, 0x3d, + 0x18, 0x3e, 0xe5, 0x3c, 0x56, 0xda, 0x0d, 0x3d, 0xb2, 0x5f, 0x1d, 0x3d, 0x0c, + 0x27, 0xd6, 0x3c, 0xc6, 0x34, 0x89, 0xbd, 0x84, 0xe7, 0x65, 0xbd, 0xfc, 0x87, + 0xba, 0x3c, 0xd6, 0x7b, 0x3b, 0xbd, 0xe8, 0xf4, 0x49, 0xbd, 0x70, 0x19, 0x0d, + 0x3c, 0x5a, 0x0c, 0x18, 0x3d, 0xe6, 0x0e, 0x26, 0x3d, 0x12, 0xa0, 0x61, 0xbd, + 0xec, 0xa3, 0x26, 0x3d, 0xf4, 0xef, 0xe0, 0x3c, 0xdd, 0xc0, 0x88, 0xbd, 0x08, + 0x87, 0x0e, 0x3d, 0x2b, 0xb7, 0x18, 0xbd, 0xe6, 0xd5, 0x1f, 0xbd, 0x38, 0xc1, + 0x37, 0x3c, 0x88, 0x9a, 0x74, 0xbd, 0x04, 0xce, 0x04, 0x3d, 0x00, 0x5c, 0xab, + 0xbc, 0xbd, 0x47, 0x4b, 0xbd, 0xf0, 0xc1, 0x33, 0xbc, 0x2c, 0x4d, 0xca, 0x3c, + 0x84, 0xfd, 0xed, 0xbc, 0x6c, 0xf2, 0x2c, 0x3d, 0x1b, 0x24, 0x87, 0x3d, 0x7a, + 0x67, 0x8f, 0xbc, 0x84, 0xab, 0x50, 0xbc, 0x84, 0xd2, 0x0b, 0x3d, 0x18, 0x03, + 0x03, 0x3d, 0x80, 0x54, 0x01, 0x3d, 0xbc, 0x41, 0xd8, 0x3c, 0x60, 0xe4, 0x34, + 0x3d, 0x3d, 0xfb, 0x26, 0xbd, 0xcc, 0x6f, 0x1f, 0x3d, 0xc0, 0xb0, 0x30, 0xbb, + 0x7f, 0xb2, 0x83, 0xbd, 0x8f, 0xed, 0x91, 0x3d, 0xa0, 0xe6, 0xe2, 0xbb, 0xfa, + 0x94, 0x67, 0x3d, 0x70, 0xd4, 0x69, 0xbd, 0x80, 0xba, 0xed, 0x3c, 0xce, 0x26, + 0xb8, 0xbc, 0xfe, 0xd9, 0x1c, 0x3d, 0xae, 0x09, 0x0e, 0x3d, 0x4f, 0x3d, 0x52, + 0xbd, 0x87, 0xde, 0x62, 0xbd, 0x02, 0x63, 0xff, 0xbc, 0x70, 0x60, 0xbd, 0x3b, + 0x3c, 0x3f, 0xe7, 0x3c, 0x9c, 0x9c, 0x34, 0xbd, 0x82, 0xcf, 0x82, 0xbd, 0xa2, + 0xdb, 0x39, 0x3d, 0x70, 0x89, 0xe8, 0x3c, 0xad, 0x61, 0x80, 0xbd, 0xd8, 0x58, + 0x34, 0xbd, 0xf6, 0x79, 0x5f, 0xbd, 0xd0, 0x9b, 0xc6, 0x3c, 0x02, 0x91, 0x0f, + 0x3d, 0x90, 0xe4, 0xc1, 0x3b, 0xff, 0xa7, 0x8e, 0x3d, 0x99, 0x07, 0x92, 0xbd, + 0x30, 0x36, 0xe4, 0x3b, 0xf0, 0xd6, 0x38, 0xbd, 0xea, 0x6d, 0x2d, 0xbd, 0x0e, + 0x11, 0xf6, 0xbc, 0x80, 0x5b, 0x53, 0x3b, 0x1c, 0x44, 0x41, 0x3d, 0xab, 0x98, + 0x7b, 0xbd, 0x20, 0x36, 0x71, 0x3b, 0x87, 0x93, 0x20, 0xbd, 0xb0, 0x35, 0x27, + 0xbd, 0xd2, 0x2b, 0x75, 0x3d, 0x90, 0x12, 0xdc, 0xbc, 0x06, 0x6c, 0x2b, 0x3d, + 0xe0, 0x86, 0x20, 0xbb, 0x9d, 0xdd, 0x88, 0x3d, 0xec, 0xe2, 0x19, 0x3d, 0x70, + 0x76, 0xb4, 0x3c, 0x0e, 0x49, 0x42, 0xbd, 0x34, 0x9c, 0xe3, 0x3c, 0xe0, 0x1d, + 0xf8, 0xbb, 0xfc, 0x83, 0xc2, 0xbc, 0xdc, 0xe1, 0x8d, 0xbc, 0x04, 0x9b, 0xa7, + 0x3c, 0x54, 0x5a, 0xfc, 0x3c, 0x80, 0x63, 0x14, 0xba, 0xcc, 0x46, 0x08, 0x3d, + 0x46, 0xf5, 0x2b, 0x3d, 0xe0, 0x8b, 0x48, 0x3d, 0xa0, 0x99, 0xfd, 0x3b, 0x41, + 0x57, 0x87, 0x3d, 0xe4, 0xcb, 0x56, 0xbd, 0x1f, 0xa4, 0x3f, 0xbd, 0xac, 0x66, + 0x85, 0x3c, 0xaa, 0x3a, 0x55, 0x3d, 0x32, 0x06, 0x29, 0x3d, 0x9a, 0xb8, 0x5a, + 0xbd, 0x00, 0xfc, 0xbb, 0xba, 0xd7, 0x80, 0x86, 0x3d, 0xb4, 0x7c, 0xf5, 0x3c, + 0xac, 0xf4, 0x36, 0x3d, 0x82, 0xef, 0x65, 0x3d, 0x49, 0x63, 0x5c, 0xbd, 0x66, + 0xe0, 0x8f, 0xbd, 0x42, 0x66, 0x28, 0x3d, 0xfc, 0xec, 0x08, 0x3d, 0x0a, 0x9c, + 0x1e, 0x3d, 0x65, 0x3c, 0x45, 0xbd, 0x73, 0x4f, 0x88, 0x3d, 0xec, 0x1e, 0xbf, + 0xbc, 0xee, 0xa7, 0x55, 0x3d, 0x10, 0x84, 0x57, 0x3c, 0xd4, 0x12, 0xdf, 0x3c, + 0xa8, 0x8f, 0x8f, 0xbd, 0x56, 0x80, 0x89, 0xbd, 0x08, 0xc5, 0x09, 0xbc, 0xfd, + 0x84, 0x22, 0xbd, 0xb2, 0x0a, 
0x66, 0x3d, 0x0a, 0x86, 0x61, 0x3d, 0x79, 0xf8, + 0x81, 0xbd, 0x7a, 0x81, 0x49, 0xbd, 0x88, 0x62, 0x7f, 0x3c, 0x8c, 0x81, 0x71, + 0xbd, 0x42, 0x9e, 0x86, 0xbd, 0x30, 0x5d, 0xf6, 0x3b, 0x6c, 0xc0, 0x29, 0xbc, + 0x88, 0x30, 0xdf, 0xbc, 0xda, 0xed, 0xf4, 0xbc, 0x98, 0x29, 0x34, 0xbd, 0xc0, + 0x10, 0xbe, 0x3a, 0x9b, 0x69, 0x8c, 0x3d, 0x40, 0x02, 0x98, 0xba, 0x2b, 0x85, + 0x76, 0xbd, 0x0c, 0xfd, 0xd3, 0x3c, 0x62, 0x37, 0x08, 0x3d, 0x0a, 0xe3, 0xe9, + 0xbc, 0x80, 0x1c, 0xc9, 0x3a, 0x54, 0x4b, 0x39, 0xbc, 0x28, 0xae, 0x7a, 0x3c, + 0x60, 0xd7, 0xe9, 0x3b, 0x08, 0xbe, 0x52, 0xbd, 0x04, 0x99, 0x3d, 0xbd, 0xd0, + 0xd2, 0x13, 0xbd, 0x1a, 0x86, 0x8e, 0xbc, 0xeb, 0xaa, 0x6a, 0xbd, 0x00, 0x23, + 0xa3, 0xb9, 0xc8, 0x76, 0x77, 0xbc, 0x36, 0x45, 0x72, 0xbd, 0xe4, 0xd7, 0x8a, + 0xbc, 0xfd, 0xfa, 0x8c, 0x3d, 0x2b, 0xc3, 0x07, 0xbd, 0x6d, 0xd0, 0x87, 0x3d, + 0xec, 0xa4, 0xde, 0x3c, 0x92, 0x4b, 0x65, 0x3d, 0x20, 0x6c, 0x2c, 0xbd, 0x00, + 0xb7, 0x0c, 0x3b, 0x96, 0x7f, 0x4b, 0x3d, 0xec, 0xe9, 0xdb, 0xbc, 0xaa, 0x06, + 0x3b, 0x3d, 0x20, 0x8c, 0x33, 0x3d, 0xe1, 0x03, 0x18, 0xbd, 0xe0, 0xa5, 0x0a, + 0xbc, 0x30, 0x1d, 0x5f, 0x3c, 0xfc, 0x28, 0x6d, 0xbd, 0x43, 0x41, 0x90, 0x3d, + 0x58, 0x87, 0x30, 0x3c, 0xdd, 0x8c, 0x60, 0xbd, 0xec, 0x2a, 0xba, 0xbc, 0xf2, + 0x9d, 0xa9, 0xbc, 0x30, 0xb0, 0x06, 0x3c, 0x68, 0x3e, 0x53, 0x3c, 0x78, 0xab, + 0xff, 0xbc, 0xa8, 0x34, 0x0d, 0xbc, 0x4e, 0x3f, 0x01, 0x3d, 0x00, 0x96, 0x44, + 0x3b, 0x2c, 0xa3, 0xda, 0x3c, 0xba, 0xc4, 0x2e, 0xbd, 0x72, 0xbd, 0x2f, 0x3d, + 0xfc, 0x1b, 0x7d, 0xbc, 0x9e, 0xbf, 0x7e, 0x3d, 0x02, 0x94, 0x19, 0x3d, 0x94, + 0x36, 0x4f, 0x3d, 0xf1, 0xee, 0x68, 0xbd, 0x54, 0x9c, 0x87, 0x3c, 0xfa, 0x3e, + 0x7e, 0x3d, 0x02, 0xec, 0x84, 0xbc, 0x12, 0xe7, 0x89, 0xbd, 0xa4, 0x90, 0xa6, + 0x3c, 0x3c, 0x7a, 0x89, 0xbc, 0x86, 0x5d, 0x54, 0x3d, 0xa4, 0xad, 0x53, 0xbc, + 0x32, 0xc5, 0x00, 0x3d, 0x1e, 0x53, 0x0b, 0x3d, 0xef, 0xae, 0x02, 0xbd, 0x7c, + 0xd8, 0x03, 0x3d, 0x38, 0x0e, 0xa5, 0xbc, 0x51, 0xc4, 0x83, 0x3d, 0x66, 0xcb, + 0x8f, 0xbd, 0xa6, 0xfe, 0xb6, 0xbc, 0xa4, 0xb1, 0x97, 0x3c, 0x00, 0xad, 0xb2, + 0x3a, 0x0f, 0xb7, 0x33, 0xbd, 0x37, 0x1f, 0x6f, 0xbd, 0x57, 0x39, 0x8c, 0x3d, + 0x54, 0xe4, 0xb7, 0xbc, 0x1e, 0x63, 0x52, 0xbd, 0x00, 0x3b, 0x43, 0xbd, 0x50, + 0x48, 0xf1, 0xbb, 0x18, 0x01, 0x81, 0xbd, 0x90, 0x1c, 0xaf, 0xbc, 0x06, 0xf8, + 0x7d, 0xbd, 0xf0, 0xe0, 0xa5, 0xbc, 0x08, 0x06, 0xc3, 0x3c, 0x22, 0xff, 0x83, + 0xbc, 0x4c, 0xef, 0x88, 0xbd, 0x36, 0xf2, 0x77, 0x3d, 0x54, 0x3b, 0xd4, 0xbc, + 0xa7, 0xa2, 0x8e, 0x3d, 0xac, 0xb2, 0x99, 0x3c, 0x10, 0x08, 0x88, 0xbb, 0x81, + 0x58, 0x8d, 0xbd, 0xf8, 0x25, 0x29, 0xbd, 0x1c, 0x0f, 0x26, 0xbd, 0x8e, 0x7a, + 0x81, 0xbd, 0x5c, 0x14, 0x8d, 0xbd, 0x81, 0xdd, 0x8f, 0xbd, 0xc8, 0xa2, 0x5f, + 0xbc, 0xc0, 0x48, 0xda, 0xba, 0xfe, 0x26, 0x14, 0x3d, 0xe2, 0x9a, 0x89, 0xbd, + 0x66, 0x8d, 0x59, 0x3d, 0xd8, 0xf8, 0x45, 0x3d, 0x0b, 0xb1, 0x04, 0xbd, 0x7a, + 0x32, 0xdd, 0xbc, 0x00, 0x01, 0x24, 0xbb, 0xc5, 0x97, 0x87, 0xbd, 0x7c, 0xea, + 0x46, 0x3d, 0x85, 0xc1, 0x81, 0x3d, 0xe8, 0x63, 0x24, 0x3d, 0x5d, 0xb3, 0x84, + 0xbd, 0xca, 0xa4, 0x04, 0x3d, 0xea, 0xe8, 0xf0, 0xbc, 0xdc, 0x41, 0x05, 0xbd, + 0xe8, 0x40, 0x4c, 0xbd, 0xb0, 0xb7, 0x2d, 0x3d, 0xa9, 0x0c, 0x1f, 0xbd, 0xd0, + 0x50, 0x97, 0x3b, 0x3f, 0x9c, 0x0f, 0xbd, 0xac, 0xa8, 0x59, 0xbd, 0xdb, 0x76, + 0x87, 0x3d, 0x08, 0xd7, 0x52, 0x3c, 0xc8, 0xf0, 0x1c, 0x3d, 0xec, 0xc1, 0x4a, + 0x3d, 0x44, 0x87, 0x81, 0x3c, 0xbe, 0x6f, 0x13, 0x3d, 0x80, 0x36, 0x49, 0x3c, + 0xae, 0xea, 0x73, 0x3d, 0x70, 0xd3, 0x2d, 0x3d, 0xde, 0xbb, 0x9d, 0xbc, 0xaa, + 0xba, 0x32, 0x3d, 0x7b, 0xc1, 0x3c, 0xbd, 0x42, 0x4e, 0x5f, 
0xbd, 0x9a, 0xd4, + 0x75, 0xbd, 0x52, 0x8d, 0x4a, 0x3d, 0xb4, 0x42, 0x8f, 0x3c, 0x20, 0x32, 0x92, + 0xbc, 0x39, 0x52, 0x0a, 0xbd, 0xd8, 0xf6, 0x21, 0xbd, 0x8b, 0x5e, 0x26, 0xbd, + 0x42, 0x45, 0x5b, 0xbd, 0x06, 0x86, 0x7f, 0xbd, 0x65, 0x5a, 0x57, 0xbd, 0x78, + 0x0a, 0x41, 0xbd, 0x5d, 0x12, 0x89, 0xbd, 0x40, 0x70, 0x34, 0xbc, 0xa0, 0x15, + 0x43, 0xbb, 0x76, 0xc5, 0x48, 0x3d, 0x40, 0x0b, 0x36, 0x3d, 0x40, 0x3a, 0x3f, + 0x3b, 0x58, 0xc4, 0xa3, 0x3c, 0x70, 0xdc, 0xdf, 0x3c, 0x50, 0x13, 0x1c, 0x3d, + 0xc0, 0x6d, 0xcc, 0xbb, 0x62, 0xc7, 0x32, 0xbd, 0x15, 0x3f, 0x8b, 0x3d, 0xb5, + 0x5b, 0x14, 0xbd, 0xf1, 0x00, 0x3f, 0xbd, 0x90, 0xe9, 0x53, 0x3c, 0xae, 0xa0, + 0x1f, 0xbd, 0x54, 0x4f, 0xc8, 0xbc, 0x7c, 0x0b, 0x3a, 0xbc, 0x96, 0x74, 0x38, + 0x3d, 0xa6, 0x9b, 0x3f, 0xbd, 0xf4, 0xfd, 0x88, 0xbc, 0x18, 0x1c, 0x97, 0xbc, + 0xc8, 0xcf, 0xea, 0x3c, 0xd9, 0x76, 0x8c, 0x3d, 0x3e, 0x07, 0x87, 0xbc, 0xa8, + 0xb5, 0x3f, 0x3c, 0x74, 0x96, 0x79, 0xbd, 0x30, 0xfc, 0x4e, 0x3c, 0x60, 0x75, + 0x25, 0x3d, 0x28, 0xd6, 0x7a, 0x3c, 0x38, 0xf6, 0x3e, 0x3c, 0x90, 0xd8, 0xf6, + 0xbc, 0x0a, 0x8b, 0x78, 0x3d, 0x94, 0x29, 0xc7, 0xbc, 0xa0, 0x3e, 0xe9, 0xbc, + 0x20, 0xfc, 0xa9, 0x3c, 0xde, 0xab, 0xd2, 0xbc, 0x97, 0x63, 0x8b, 0xbd, 0xa0, + 0xe7, 0x52, 0xbb, 0xa4, 0xf2, 0x36, 0xbc, 0x50, 0x49, 0xb9, 0xbb, 0x1f, 0x9e, + 0x88, 0x3d, 0x86, 0xea, 0x9d, 0xbc, 0x38, 0x1b, 0xf5, 0x3c, 0x46, 0xea, 0x1e, + 0xbd, 0x00, 0xad, 0x18, 0xba, 0x1e, 0x19, 0x6b, 0xbd, 0xa4, 0x1f, 0x90, 0x3c, + 0xf5, 0xb4, 0x42, 0xbd, 0x48, 0xf2, 0x1f, 0xbd, 0x26, 0x05, 0x12, 0x3d, 0x80, + 0x01, 0x58, 0xbd, 0xee, 0x98, 0x51, 0xbd, 0xb8, 0xcd, 0x96, 0xbc, 0x65, 0xbc, + 0x81, 0x3d, 0x90, 0x57, 0xcd, 0x3b, 0xa0, 0x9a, 0x30, 0x3c, 0xa6, 0xa4, 0x82, + 0xbd, 0x20, 0xa1, 0xc6, 0xbb, 0x95, 0x3a, 0x8c, 0xbd, 0x00, 0xa2, 0x72, 0x3c, + 0x00, 0xd6, 0x58, 0x3b, 0xc8, 0x1f, 0x7d, 0x3c, 0xf0, 0x98, 0xe1, 0xbb, 0x02, + 0x83, 0xe7, 0xbc, 0x9a, 0xc9, 0x67, 0x3d, 0xf5, 0x03, 0x90, 0xbd, 0x00, 0x9e, + 0x55, 0xba, 0x80, 0xa0, 0x05, 0x3b, 0x00, 0x53, 0x6d, 0x3c, 0x16, 0xc9, 0x6a, + 0x3d, 0x96, 0x11, 0x04, 0x3d, 0x10, 0x45, 0xff, 0xbb, 0xd2, 0x78, 0x2a, 0xbd, + 0xbb, 0xe1, 0x8d, 0xbd, 0x8c, 0x4a, 0xc7, 0xbc, 0x20, 0x1c, 0x23, 0x3d, 0x10, + 0xb3, 0xff, 0x3b, 0xd8, 0xec, 0x36, 0x3c, 0x64, 0xf1, 0xa7, 0x3d, 0x22, 0xd3, + 0xb0, 0xbd, 0xba, 0xd3, 0xc4, 0x3c, 0x7f, 0x35, 0x0a, 0x3d, 0xb1, 0xba, 0xc0, + 0x3d, 0x70, 0x6e, 0x10, 0x3c, 0x0b, 0x3f, 0x43, 0x3d, 0x75, 0x57, 0x4f, 0xbd, + 0xf7, 0xae, 0x5e, 0xbd, 0xd6, 0xc7, 0x9f, 0x3d, 0x15, 0x89, 0x08, 0x3d, 0x02, + 0x77, 0x49, 0x3c, 0x19, 0x3b, 0xc5, 0xbc, 0xa2, 0x8d, 0x43, 0xbd, 0x7b, 0x63, + 0x22, 0xbc, 0xb8, 0x4c, 0xbe, 0x3d, 0x98, 0x23, 0x2a, 0xbd, 0xd2, 0x49, 0x69, + 0xbd, 0x58, 0xae, 0x14, 0x3d, 0xdc, 0x52, 0x85, 0xbd, 0xd0, 0x91, 0xea, 0x3c, + 0x93, 0x04, 0x5c, 0x3d, 0xdf, 0xf9, 0x20, 0x3d, 0xd3, 0x87, 0x3f, 0xbd, 0xae, + 0xe4, 0x6a, 0x3c, 0xed, 0x34, 0x27, 0x3c, 0x79, 0x2d, 0x67, 0x3d, 0x63, 0xb8, + 0x57, 0xbc, 0x9f, 0x7f, 0x79, 0xbd, 0x44, 0x92, 0x9b, 0x3d, 0x60, 0x08, 0x40, + 0xbd, 0xde, 0x4c, 0x9c, 0x3c, 0xdd, 0x61, 0x21, 0x3c, 0x86, 0xd4, 0x15, 0xbd, + 0xf9, 0xd9, 0xe1, 0xbd, 0x40, 0xc7, 0x2f, 0x3d, 0xa7, 0x36, 0x89, 0x3d, 0x8a, + 0xdc, 0xa0, 0xbd, 0x5a, 0x12, 0x99, 0x3c, 0x8a, 0x63, 0xfa, 0xba, 0x77, 0x80, + 0xa2, 0xbd, 0x68, 0x8f, 0x19, 0xbc, 0x91, 0x17, 0xfc, 0x3c, 0xc7, 0x5f, 0xa0, + 0x3c, 0x21, 0x34, 0xf2, 0xbc, 0x09, 0x55, 0x1d, 0xbc, 0xcf, 0x87, 0x01, 0xbc, + 0xba, 0xe9, 0x8c, 0x3d, 0x07, 0xf7, 0x93, 0x3c, 0xe2, 0x86, 0x80, 0x3c, 0xd7, + 0xf7, 0x45, 0xbd, 0x8d, 0x5c, 0x55, 0x3d, 0x40, 0x89, 0x73, 0x3c, 0x7a, 0xe1, + 0x5c, 0x3c, 
0x6a, 0x34, 0xe7, 0xbc, 0x25, 0x79, 0xaa, 0x3a, 0x13, 0x23, 0xa1, + 0x3d, 0x4b, 0x1e, 0xe1, 0x3c, 0x49, 0xbb, 0xb5, 0xbc, 0xa6, 0x19, 0xa9, 0x3c, + 0x4e, 0xf1, 0x2a, 0x3d, 0x69, 0x81, 0xac, 0x3c, 0x00, 0x31, 0x46, 0x3c, 0x84, + 0x9b, 0x17, 0xbd, 0xa3, 0x50, 0x70, 0x3d, 0xf9, 0x6d, 0x91, 0xbd, 0x41, 0x1f, + 0xad, 0x3b, 0x9c, 0x7c, 0xa5, 0xbc, 0xd7, 0xa0, 0x8f, 0xbb, 0xfe, 0xeb, 0x05, + 0x3d, 0xc5, 0x31, 0xc5, 0x3a, 0x9a, 0x3c, 0x08, 0x3d, 0xc2, 0x6d, 0x27, 0xbd, + 0xa5, 0xc1, 0x7a, 0x3c, 0x4c, 0x25, 0x41, 0xbd, 0x3e, 0x6e, 0xd0, 0x3c, 0x6b, + 0x0e, 0x6d, 0x3d, 0xb4, 0x47, 0x86, 0x3c, 0x60, 0xc8, 0x03, 0x3d, 0x78, 0xb8, + 0xb3, 0x3d, 0xfb, 0x4b, 0x0d, 0x3d, 0x44, 0x4c, 0xc0, 0x3b, 0xd1, 0xa8, 0x33, + 0xbc, 0xf8, 0x4d, 0x8d, 0xbd, 0x3b, 0xeb, 0x15, 0xbd, 0x16, 0xef, 0x19, 0xbb, + 0x66, 0x45, 0x2c, 0xbd, 0x50, 0x0b, 0xab, 0xbb, 0x95, 0x0b, 0x06, 0xbd, 0x2c, + 0x1f, 0x33, 0xbd, 0xe4, 0xa5, 0xb7, 0x3a, 0xa0, 0xa0, 0xe4, 0xbc, 0x6c, 0x3b, + 0x65, 0x3d, 0x1e, 0xa8, 0x8b, 0x3b, 0xe0, 0xb7, 0x82, 0x3c, 0x3f, 0x77, 0x5b, + 0x3d, 0xd1, 0xd3, 0x0a, 0x3c, 0xdd, 0xbc, 0xaa, 0xbd, 0xb2, 0x81, 0x91, 0xbc, + 0x0f, 0xcb, 0x5d, 0x3d, 0x08, 0xa9, 0xf0, 0xbc, 0x9b, 0xc4, 0x0c, 0x3c, 0xf7, + 0x0d, 0x64, 0xbc, 0x1c, 0xa0, 0xa5, 0xbc, 0x5b, 0x1d, 0x2d, 0xbd, 0x03, 0x78, + 0x59, 0x3d, 0x1b, 0x8a, 0x13, 0x3d, 0xaa, 0x9c, 0x14, 0xbd, 0x57, 0xe2, 0xf1, + 0x3c, 0x5f, 0xaa, 0x58, 0x3d, 0x6c, 0x19, 0xb5, 0xbc, 0x20, 0xeb, 0x3c, 0x3d, + 0xe0, 0xda, 0xd5, 0x3c, 0x54, 0x6f, 0x6f, 0xbd, 0x91, 0x64, 0x82, 0x3d, 0xed, + 0xcd, 0x10, 0x3b, 0xec, 0x91, 0x1c, 0x3d, 0xad, 0xee, 0xc0, 0x3c, 0xb9, 0x84, + 0xb8, 0x3d, 0x67, 0xe4, 0x19, 0xba, 0xc5, 0xca, 0x00, 0x3b, 0xbc, 0x29, 0xcb, + 0xbc, 0xca, 0x3c, 0x20, 0xbd, 0x6e, 0xed, 0x2e, 0xbd, 0xd8, 0x47, 0x83, 0xbd, + 0x1f, 0x0b, 0x52, 0xbd, 0x10, 0x29, 0x29, 0x3c, 0xfa, 0x35, 0xd2, 0xbc, 0xbe, + 0x31, 0x1b, 0x3d, 0x9c, 0x28, 0xdc, 0xbc, 0xb7, 0x93, 0x70, 0xbb, 0x7b, 0xa8, + 0x83, 0xbc, 0xcb, 0xf0, 0x9a, 0x3c, 0x53, 0x7d, 0x31, 0xbd, 0x8a, 0x47, 0x4a, + 0x3c, 0xf2, 0xe7, 0x79, 0xbd, 0xe7, 0x10, 0x64, 0xbc, 0x69, 0xf1, 0xa9, 0xbc, + 0x5c, 0xfc, 0x9b, 0x3d, 0x5a, 0xcf, 0x14, 0x3d, 0xec, 0x08, 0x63, 0x3d, 0x69, + 0x0f, 0x99, 0xbd, 0x6a, 0x76, 0xeb, 0x3c, 0xbd, 0x2f, 0x8f, 0x3d, 0xa0, 0x54, + 0x8f, 0x3d, 0x7e, 0x08, 0x84, 0x3d, 0xba, 0x94, 0x42, 0x3d, 0x7c, 0xae, 0xf9, + 0xbd, 0x70, 0x32, 0x7f, 0x3c, 0x2f, 0xd3, 0x88, 0xbc, 0x9a, 0x1a, 0x49, 0x3d, + 0xf6, 0xed, 0x54, 0xbd, 0x7e, 0x15, 0x66, 0x3d, 0x81, 0x94, 0x7f, 0x3d, 0x4a, + 0xfb, 0x5f, 0x3c, 0xd7, 0x10, 0x3a, 0x3c, 0xf8, 0x02, 0x89, 0xbd, 0x9f, 0x9c, + 0xb9, 0xbc, 0x02, 0x4c, 0x5b, 0x3d, 0x80, 0xe7, 0x33, 0x3c, 0x55, 0x86, 0x99, + 0x3d, 0x9d, 0xa9, 0xad, 0xbd, 0x9e, 0x1b, 0x76, 0xbb, 0xb8, 0x62, 0x49, 0x3d, + 0x22, 0x21, 0x65, 0x3d, 0x22, 0x6d, 0x0f, 0x3d, 0x60, 0x23, 0x87, 0xbc, 0xc8, + 0xfc, 0x26, 0xbd, 0xc5, 0x47, 0x8c, 0xbd, 0x22, 0x6e, 0xe2, 0xbc, 0xf0, 0x78, + 0x2e, 0x3d, 0xa4, 0x7f, 0xa5, 0xbc, 0xf1, 0x41, 0xae, 0x3d, 0xa4, 0x08, 0x0b, + 0x3d, 0xe8, 0xbb, 0x1c, 0xbc, 0xf8, 0xdd, 0x85, 0xbc, 0x72, 0x87, 0xea, 0x3c, + 0x4a, 0xaa, 0x9a, 0x3d, 0x86, 0xdb, 0xb6, 0x3d, 0x0f, 0xb5, 0xd1, 0xba, 0xfc, + 0x88, 0x62, 0xbd, 0x08, 0x54, 0xfd, 0x3d, 0x35, 0xf8, 0x2e, 0xbd, 0x3b, 0xbb, + 0xc9, 0x3d, 0x9c, 0xb6, 0x57, 0x3d, 0x03, 0x65, 0x58, 0x3d, 0x13, 0xd0, 0x1d, + 0xbd, 0xbb, 0xb1, 0xbf, 0xbc, 0x78, 0x00, 0xde, 0xbc, 0x5c, 0xcb, 0x48, 0xbd, + 0xd3, 0xa1, 0x85, 0x3d, 0x08, 0x35, 0xf6, 0xbc, 0x4c, 0x66, 0x89, 0x3d, 0x09, + 0x92, 0xa6, 0xbc, 0x64, 0x99, 0x9e, 0xbd, 0xae, 0x80, 0x85, 0xbd, 0x99, 0xe0, + 0xe2, 0x3c, 0x8e, 0x75, 0x66, 0xbc, 0x1e, 
0x8c, 0xb9, 0xbd, 0x57, 0x43, 0xa8, + 0x3c, 0x31, 0x71, 0xac, 0xbc, 0xb5, 0x75, 0x01, 0x3d, 0x10, 0x39, 0x5c, 0xbd, + 0xa6, 0xf9, 0x7b, 0xbd, 0xf6, 0xea, 0x5d, 0x3d, 0xd3, 0x34, 0xc7, 0xbc, 0x4e, + 0xdc, 0x76, 0xbc, 0x7c, 0x98, 0x26, 0x3c, 0xfb, 0x7a, 0x27, 0xbd, 0x44, 0xe6, + 0x44, 0xbd, 0x26, 0xc5, 0xb2, 0x3d, 0xb1, 0x6e, 0xfa, 0xbd, 0x79, 0xcc, 0x29, + 0xbd, 0x08, 0xae, 0x46, 0xbc, 0x9d, 0x74, 0x67, 0x3d, 0xa3, 0xb6, 0x98, 0x3d, + 0x92, 0xae, 0x3f, 0xbc, 0xef, 0x8c, 0x90, 0x3d, 0xeb, 0x4c, 0x02, 0xbc, 0x21, + 0x7d, 0xe5, 0x3c, 0xd4, 0x6f, 0x47, 0xbd, 0x1a, 0xe8, 0x84, 0x3c, 0x0c, 0x96, + 0x85, 0xbd, 0xa9, 0x69, 0xa7, 0xbb, 0x8c, 0x1e, 0x82, 0xba, 0xff, 0x78, 0x04, + 0xbc, 0x25, 0xb9, 0xaa, 0xbd, 0x0b, 0x03, 0x48, 0xbc, 0xb3, 0xbb, 0x88, 0xbd, + 0x00, 0x26, 0xba, 0xbd, 0x82, 0x41, 0x81, 0x3d, 0xfa, 0x3d, 0xc7, 0x3c, 0x38, + 0x5c, 0x49, 0xbd, 0x0d, 0x4d, 0x3a, 0x3d, 0x67, 0x58, 0x0a, 0xbd, 0x7e, 0xf6, + 0x82, 0x3b, 0x1a, 0x7a, 0x7b, 0x3d, 0xba, 0xff, 0x84, 0x3c, 0x46, 0x87, 0x84, + 0x3c, 0xe8, 0x6c, 0x29, 0x3d, 0x8c, 0x6a, 0xac, 0xbc, 0x89, 0x34, 0x91, 0xbd, + 0xb9, 0xaf, 0xa6, 0x3c, 0xe0, 0x9e, 0xaf, 0xbc, 0xd2, 0x7a, 0x38, 0x3d, 0xac, + 0xbf, 0xc9, 0x3d, 0x73, 0xa1, 0x13, 0x3d, 0x7d, 0xe1, 0xf2, 0x3c, 0x73, 0xec, + 0xcf, 0x3b, 0xfd, 0x7b, 0x8e, 0x3d, 0x1e, 0xb2, 0xf3, 0xbc, 0xdc, 0x32, 0x03, + 0xbe, 0x5e, 0xfa, 0x1b, 0x3d, 0xdc, 0x1a, 0x25, 0x3d, 0x00, 0xcd, 0x48, 0xba, + 0x13, 0x9d, 0xbe, 0x3d, 0x2e, 0x05, 0x77, 0xbd, 0x17, 0x74, 0x9e, 0xbd, 0xae, + 0xc5, 0x62, 0x3c, 0x95, 0xf4, 0x59, 0x3d, 0x36, 0xd2, 0xa4, 0x3d, 0xab, 0x2b, + 0x84, 0xbc, 0x87, 0x89, 0x55, 0x3d, 0xd0, 0xde, 0x5d, 0xbc, 0xcd, 0xb0, 0xce, + 0xbc, 0x29, 0xa0, 0xc8, 0xbc, 0x8a, 0x0b, 0xf1, 0x3c, 0xb8, 0xce, 0x9c, 0x3c, + 0x14, 0xd1, 0x36, 0x3d, 0x50, 0x4b, 0x08, 0xbd, 0x85, 0x95, 0x4b, 0xbd, 0x31, + 0x9e, 0xcf, 0xbc, 0xff, 0x96, 0x83, 0x3d, 0x6c, 0x32, 0x15, 0x3c, 0x6d, 0xfd, + 0xb0, 0x3d, 0x05, 0xd8, 0x33, 0xbd, 0x1b, 0x74, 0x8d, 0xbd, 0xfb, 0x92, 0x21, + 0xbd, 0xde, 0x6c, 0x8f, 0xbc, 0xcc, 0x1e, 0x0f, 0xbd, 0xfa, 0xc4, 0xb8, 0xbb, + 0xc6, 0xe2, 0x1e, 0x3d, 0x9b, 0xd2, 0x99, 0xbb, 0x0f, 0x21, 0x5a, 0xbd, 0x32, + 0xb3, 0x8b, 0x3c, 0x08, 0x0c, 0x2e, 0x3b, 0x81, 0xda, 0x5f, 0xbd, 0x44, 0x42, + 0x81, 0x3c, 0x11, 0xf4, 0xb3, 0xbb, 0xf5, 0x91, 0xdd, 0xbd, 0x20, 0xdd, 0xb0, + 0x3b, 0x94, 0xc1, 0xe4, 0x3c, 0x7c, 0x2f, 0x5d, 0xbd, 0x8b, 0x1f, 0xf3, 0x3c, + 0xf7, 0xc1, 0xd1, 0xbd, 0x2e, 0x5f, 0x5d, 0xbd, 0x35, 0x2c, 0x92, 0x3b, 0x47, + 0x24, 0x34, 0x3d, 0x7f, 0x44, 0x71, 0x3d, 0x39, 0xd7, 0xfc, 0x3c, 0x60, 0x34, + 0x49, 0xbd, 0x70, 0xdc, 0x80, 0x3c, 0x3b, 0xe4, 0x5d, 0xbc, 0x7d, 0x7f, 0xe3, + 0x3c, 0x6d, 0x96, 0x2e, 0x3d, 0x7b, 0x5c, 0x15, 0x3d, 0xc3, 0x8f, 0x78, 0x3c, + 0x5b, 0x2f, 0x2d, 0xbc, 0x30, 0xfd, 0x3a, 0x3d, 0x79, 0x6a, 0xbb, 0x3d, 0x1a, + 0xb0, 0x4d, 0x3c, 0xe2, 0x91, 0x9a, 0x3b, 0x3c, 0x03, 0xa4, 0x3d, 0xa9, 0x2a, + 0x3a, 0xbd, 0xfc, 0xbb, 0x88, 0x3d, 0x16, 0x7f, 0x2a, 0x3c, 0xdd, 0xfc, 0x43, + 0x3d, 0x41, 0x34, 0x3f, 0x3d, 0x80, 0x68, 0x76, 0xbd, 0xbb, 0xab, 0xa9, 0x3d, + 0x4f, 0x4c, 0x17, 0x3d, 0xa3, 0x6e, 0x48, 0x3c, 0x24, 0xdf, 0xed, 0xbc, 0xa9, + 0xca, 0x8e, 0xbd, 0x28, 0x64, 0x51, 0x3d, 0x65, 0xea, 0x94, 0x3d, 0x80, 0xc3, + 0x08, 0x3b, 0xba, 0xc6, 0x38, 0x3d, 0xa3, 0x2f, 0x64, 0xba, 0x16, 0xc1, 0x28, + 0x3d, 0xfb, 0x5a, 0x4c, 0x3c, 0xd9, 0x21, 0x26, 0xbd, 0xb9, 0x19, 0xbd, 0x3d, + 0xba, 0x00, 0x59, 0x3c, 0xeb, 0x40, 0x14, 0xbc, 0x24, 0x37, 0xe9, 0xbc, 0x5e, + 0x99, 0xd0, 0xbc, 0x7c, 0xbc, 0x18, 0xbd, 0x71, 0x23, 0x56, 0x3d, 0xca, 0xa7, + 0x30, 0xbe, 0x37, 0x29, 0x5b, 0xbd, 0x73, 0xfa, 0x30, 0x3d, 0xb7, 0x67, 
0xcd, + 0xbc, 0x92, 0xa3, 0x54, 0x3c, 0xf8, 0x54, 0xaa, 0x3d, 0xba, 0x13, 0x8c, 0x3d, + 0x35, 0xa3, 0xa6, 0x3c, 0x11, 0x44, 0x1d, 0xbc, 0x56, 0xe4, 0x18, 0xbd, 0xd6, + 0x33, 0xab, 0x3c, 0x2c, 0x70, 0xa8, 0xbc, 0xa0, 0xd7, 0xc8, 0xb8, 0x56, 0xd9, + 0x69, 0x3d, 0xab, 0xaf, 0x5e, 0xbd, 0x09, 0xbf, 0xb1, 0xbd, 0xad, 0xf1, 0x50, + 0x3c, 0xe0, 0x69, 0x47, 0xbd, 0x21, 0x32, 0x2b, 0xbb, 0x66, 0x24, 0x90, 0xbd, + 0xf8, 0xca, 0xbf, 0xbc, 0x1f, 0x85, 0x02, 0xbd, 0xc9, 0x47, 0xa6, 0x3d, 0xaa, + 0xeb, 0x9b, 0xbc, 0xcf, 0x49, 0x88, 0xbd, 0x40, 0xf0, 0x4e, 0xbc, 0xe3, 0x45, + 0x16, 0x3d, 0xd4, 0x2e, 0xa4, 0xbc, 0xaf, 0xe6, 0x81, 0x3d, 0x62, 0xef, 0x2c, + 0xbc, 0x95, 0xea, 0x63, 0xbd, 0x33, 0x76, 0x9e, 0x3d, 0x16, 0xdf, 0xd6, 0xbd, + 0xa4, 0xb0, 0xde, 0x39, 0xee, 0xfc, 0x89, 0x3d, 0xbd, 0x48, 0xbe, 0x3b, 0xd1, + 0xbb, 0x31, 0xbc, 0x69, 0x1b, 0x26, 0xbd, 0xc1, 0x34, 0xec, 0x3c, 0x33, 0x47, + 0xd5, 0x3c, 0xd0, 0xfb, 0x5c, 0x3b, 0xec, 0x71, 0x27, 0xbc, 0x48, 0x88, 0x62, + 0x3c, 0x60, 0x89, 0x76, 0x3b, 0x4c, 0x07, 0xe8, 0x3c, 0xd5, 0xb4, 0x16, 0x3d, + 0x9d, 0x21, 0x9f, 0x3c, 0x9d, 0x78, 0xb3, 0xbd, 0xeb, 0x74, 0x21, 0xbd, 0xdb, + 0x5e, 0x75, 0xbd, 0x02, 0xf1, 0x9b, 0x3d, 0x50, 0x67, 0x30, 0xbc, 0xc4, 0xa7, + 0xe6, 0x3c, 0x77, 0x75, 0x6e, 0x3c, 0xfd, 0x7e, 0x9e, 0xbb, 0x79, 0xed, 0x77, + 0xbc, 0x18, 0x82, 0x40, 0x3d, 0x18, 0xd1, 0x93, 0x3d, 0x4a, 0xa2, 0x32, 0xbb, + 0x83, 0xd5, 0x51, 0x3c, 0xa1, 0x52, 0xd9, 0x38, 0x6a, 0x5e, 0xb4, 0x3d, 0x73, + 0xb2, 0x1f, 0xbd, 0x02, 0xe7, 0x06, 0xbd, 0x25, 0x20, 0x5c, 0xbd, 0x6a, 0x66, + 0x16, 0x3d, 0xef, 0x75, 0x7c, 0x3d, 0x4b, 0xa8, 0x89, 0x3d, 0x17, 0x5e, 0x82, + 0xbc, 0xd7, 0x41, 0x80, 0x3d, 0x67, 0x41, 0xaf, 0xbc, 0x93, 0x11, 0x9b, 0x3d, + 0x4a, 0x03, 0xb3, 0xbd, 0x0d, 0x82, 0x32, 0xbd, 0x39, 0x35, 0xee, 0xbc, 0x07, + 0x60, 0x87, 0xbd, 0x51, 0xb7, 0x4d, 0x3b, 0xe4, 0x6e, 0xbf, 0xbb, 0x24, 0x01, + 0x36, 0xbd, 0x24, 0x02, 0x10, 0xbd, 0xfe, 0x24, 0x4f, 0xbd, 0xaf, 0xc2, 0x34, + 0xbc, 0x21, 0x39, 0xd9, 0x3c, 0x80, 0x73, 0x88, 0x3c, 0x8e, 0xaf, 0x84, 0xbd, + 0x1e, 0x05, 0x8b, 0xbd, 0xd2, 0xa7, 0x0e, 0x3d, 0x53, 0xe6, 0x89, 0x3b, 0xf3, + 0xd7, 0xa7, 0x3d, 0x58, 0xf7, 0x29, 0x3d, 0xb1, 0x45, 0x9f, 0x3c, 0x3d, 0xf4, + 0x73, 0x3d, 0x73, 0xd2, 0x4d, 0xbd, 0x6f, 0x4a, 0x0f, 0x3d, 0xc1, 0x60, 0x95, + 0xbd, 0xf4, 0x0f, 0x8e, 0x3d, 0x83, 0x58, 0xed, 0xbd, 0x58, 0x39, 0x12, 0x3c, + 0x20, 0x58, 0x39, 0x3d, 0xf4, 0xc9, 0x14, 0x3d, 0x5f, 0xa1, 0x0a, 0x3d, 0xd0, + 0x80, 0x42, 0xbd, 0x2b, 0xc9, 0x35, 0xbd, 0xa5, 0xe0, 0xf9, 0xbc, 0x11, 0xe4, + 0x8b, 0x3c, 0x0f, 0x18, 0x33, 0xbd, 0xb7, 0x53, 0x8f, 0xbc, 0xa8, 0xfe, 0x4f, + 0xbd, 0x1f, 0x8d, 0xf9, 0x3b, 0x33, 0x31, 0xa6, 0x3d, 0xb7, 0x6d, 0x03, 0x3c, + 0x80, 0xaa, 0xda, 0xbd, 0x82, 0x6e, 0xc5, 0x3c, 0x22, 0xaa, 0xba, 0x3c, 0xfd, + 0xd9, 0xcd, 0x3c, 0x16, 0x60, 0x5a, 0x3c, 0x48, 0xdb, 0x36, 0x3d, 0x10, 0xf4, + 0x84, 0xbc, 0x78, 0xf4, 0x8c, 0x3d, 0x24, 0xd3, 0xf2, 0xbc, 0x8e, 0xac, 0x16, + 0xbd, 0x41, 0x7a, 0xf1, 0x3c, 0xd3, 0x25, 0x77, 0x3d, 0x26, 0xf2, 0x63, 0x3d, + 0x7a, 0xb2, 0xa0, 0x3d, 0x00, 0xbb, 0xa4, 0x3c, 0x11, 0xd2, 0xf7, 0xbc, 0x92, + 0x58, 0xa7, 0x3d, 0xa1, 0x9e, 0xaf, 0xbd, 0x38, 0xb3, 0x0b, 0x3c, 0xf3, 0xbb, + 0x62, 0x3c, 0x98, 0x07, 0x9c, 0x3d, 0xa3, 0x56, 0xba, 0xba, 0x1a, 0x8d, 0x95, + 0x3d, 0x13, 0x14, 0x7b, 0x3d, 0xfe, 0x05, 0xb3, 0x3d, 0xd2, 0x56, 0x01, 0x3c, + 0x9e, 0xad, 0x44, 0x3d, 0xc7, 0xd7, 0x98, 0x3c, 0x1e, 0xfb, 0x18, 0x3d, 0x58, + 0x4c, 0x53, 0xbc, 0xf2, 0x16, 0xf1, 0xbb, 0xae, 0x3a, 0xad, 0xbd, 0x3d, 0xdd, + 0x40, 0xbd, 0x9f, 0xa1, 0x9c, 0xbd, 0xb6, 0xb7, 0x09, 0xbc, 0x74, 0xc3, 0xbc, + 0xbd, 0x22, 0xf9, 0x61, 
0xbc, 0x71, 0x46, 0x80, 0xbc, 0x26, 0x48, 0x53, 0xbd, + 0x6a, 0xb7, 0x5d, 0x3d, 0xb9, 0xc9, 0x66, 0x3d, 0xaf, 0x27, 0x00, 0xbd, 0x24, + 0x28, 0xd3, 0x3a, 0x53, 0xfb, 0x5d, 0xbd, 0xf4, 0x8b, 0x8a, 0x3d, 0x80, 0x14, + 0x8e, 0xbd, 0x72, 0xcc, 0xa7, 0x3d, 0xd4, 0x5b, 0xff, 0xbc, 0xdf, 0x54, 0x43, + 0xbd, 0x6a, 0x25, 0xe1, 0x3b, 0xe2, 0xe9, 0x09, 0xbd, 0x55, 0xad, 0x63, 0xbd, + 0x14, 0xb6, 0xa9, 0x3b, 0x0c, 0xba, 0xd8, 0xbc, 0xc3, 0x6d, 0x53, 0xbd, 0x42, + 0xa5, 0x5f, 0xbd, 0x7b, 0x04, 0x22, 0xbd, 0x15, 0x56, 0x77, 0x3c, 0x53, 0x67, + 0xe6, 0xbc, 0x69, 0xe6, 0x89, 0x3c, 0x80, 0xcc, 0xbb, 0xbb, 0xea, 0x11, 0xb5, + 0x3d, 0x02, 0x35, 0xb6, 0x3b, 0x98, 0x78, 0x19, 0x3d, 0xae, 0x02, 0xdd, 0xbd, + 0x88, 0x78, 0x35, 0x3c, 0x30, 0x8b, 0x9d, 0xbd, 0xce, 0x4f, 0xad, 0xbd, 0x27, + 0xf3, 0xcf, 0x3c, 0xda, 0x15, 0x82, 0xbd, 0x50, 0x43, 0x86, 0x3c, 0xff, 0x0b, + 0xca, 0x3b, 0xec, 0x3f, 0xd1, 0xbc, 0x53, 0xc4, 0x15, 0x3d, 0x72, 0x9f, 0x12, + 0x3d, 0xcb, 0x3b, 0xcc, 0x3c, 0x90, 0xd2, 0x3a, 0x3d, 0x42, 0x53, 0x0d, 0xbc, + 0x46, 0x82, 0x93, 0x3d, 0xe9, 0x9a, 0xb1, 0xbd, 0x05, 0x99, 0x98, 0xbb, 0x52, + 0x17, 0x71, 0xbd, 0x6e, 0xb6, 0x8d, 0xbd, 0x0f, 0xe1, 0x66, 0xbd, 0x2b, 0x2f, + 0x1b, 0x3d, 0x97, 0x2f, 0xf4, 0xbc, 0xc0, 0xc0, 0x0f, 0x3d, 0xf3, 0x36, 0x6f, + 0x3d, 0x38, 0x99, 0x97, 0x3c, 0xca, 0x4a, 0xca, 0xbd, 0xe2, 0x66, 0x11, 0x3b, + 0xa8, 0xe8, 0x03, 0xbd, 0x60, 0xbf, 0x7e, 0xbb, 0x6d, 0x53, 0xb9, 0x3d, 0x50, + 0x02, 0x0c, 0x3c, 0xe3, 0x5f, 0xbb, 0xbd, 0xd1, 0xc0, 0xbd, 0xbc, 0x42, 0x35, + 0x89, 0x3d, 0x36, 0x8e, 0x9c, 0xbd, 0xac, 0x4a, 0x92, 0xbd, 0x7c, 0xb8, 0x65, + 0xbd, 0x77, 0xdd, 0x5e, 0xbd, 0x58, 0x55, 0x38, 0xbd, 0x2e, 0xa6, 0x67, 0x3c, + 0x7d, 0x81, 0x0b, 0xbd, 0x7b, 0xda, 0x92, 0x3d, 0x07, 0xec, 0x98, 0xbc, 0x6c, + 0x89, 0x35, 0xbd, 0x1b, 0x09, 0x0a, 0x3d, 0xca, 0x57, 0x27, 0x3c, 0xab, 0xff, + 0x2e, 0x3d, 0x97, 0xd7, 0x8d, 0xbd, 0xfa, 0x59, 0xb3, 0x3d, 0xb2, 0x38, 0x31, + 0x3d, 0xd2, 0x30, 0x2b, 0x3d, 0xa5, 0x8d, 0xa4, 0x3b, 0xc9, 0xca, 0xe4, 0x3c, + 0x0a, 0x75, 0x99, 0x3d, 0x3f, 0x85, 0x08, 0x3d, 0xff, 0x4e, 0x4e, 0x3d, 0x00, + 0xfb, 0x74, 0x3d, 0x90, 0x22, 0xb2, 0xbb, 0xed, 0xe6, 0x8c, 0xbb, 0x23, 0x48, + 0xe6, 0x3b, 0xfc, 0x6e, 0x62, 0xbd, 0xd5, 0x72, 0x58, 0x3d, 0xc8, 0x23, 0xce, + 0x3c, 0xf2, 0x1f, 0x3b, 0x3c, 0xd0, 0x69, 0xc6, 0x3b, 0x18, 0x15, 0x62, 0x3c, + 0xa8, 0x0a, 0x2b, 0x3d, 0x94, 0xed, 0x79, 0xbd, 0xf1, 0xff, 0x81, 0xbc, 0xb8, + 0x90, 0x3e, 0xbd, 0x4d, 0x8e, 0x25, 0x3d, 0x04, 0x91, 0xef, 0x3d, 0xb9, 0x57, + 0x17, 0x3d, 0x3a, 0xef, 0x01, 0xbd, 0xc4, 0x52, 0x59, 0xbc, 0x8a, 0x5e, 0x8e, + 0xbd, 0xe7, 0x23, 0xf5, 0xbc, 0x4f, 0xe7, 0x1f, 0xbd, 0x1f, 0x86, 0x82, 0xbc, + 0x1e, 0xf9, 0x53, 0x3d, 0xdf, 0x9c, 0x0a, 0x3c, 0xbf, 0xc9, 0xcc, 0x3c, 0xec, + 0xa1, 0x3e, 0xbc, 0x9c, 0x8e, 0x5e, 0x3a, 0xfd, 0xd8, 0x90, 0xbc, 0xe8, 0x4c, + 0xc7, 0xbc, 0xf2, 0x0f, 0x4b, 0x3a, 0x08, 0x9d, 0xbc, 0xbc, 0xab, 0x39, 0x4d, + 0x3d, 0xea, 0x3d, 0x6b, 0x3d, 0x5c, 0x84, 0x80, 0x3d, 0x7d, 0x95, 0xf8, 0xbc, + 0x70, 0xb2, 0x18, 0xbd, 0x2a, 0x02, 0x79, 0x3d, 0xe8, 0xd9, 0x3c, 0x3d, 0x67, + 0xaf, 0x29, 0x3d, 0x39, 0x45, 0x27, 0xbd, 0x0a, 0x7b, 0x12, 0xbd, 0xbb, 0xdc, + 0xe9, 0xbc, 0x73, 0x04, 0x83, 0xbd, 0x5d, 0xe4, 0x1c, 0xbd, 0xf0, 0x70, 0x29, + 0x3d, 0x87, 0x1e, 0x0d, 0xbd, 0x39, 0x86, 0xf0, 0x3c, 0xf5, 0x57, 0x3e, 0xbd, + 0xc8, 0x3c, 0x18, 0xbc, 0xf4, 0xa8, 0xa0, 0x3d, 0x5c, 0xa0, 0x6c, 0x3d, 0x02, + 0x7a, 0x7e, 0xbc, 0x0b, 0xb6, 0x6d, 0xbd, 0xb0, 0x9a, 0xa8, 0x3c, 0xee, 0x24, + 0x11, 0x3d, 0x54, 0x87, 0xf7, 0xbc, 0x57, 0x52, 0x70, 0xbd, 0x1e, 0x35, 0x46, + 0xbd, 0x38, 0x2d, 0x82, 0x3d, 0x9d, 0x1a, 0x3c, 0xbd, 
0x53, 0x7b, 0xa6, 0x3d, + 0x29, 0x4b, 0xab, 0x3d, 0x0c, 0x43, 0x2d, 0x3d, 0x1a, 0x12, 0x95, 0x3d, 0x3b, + 0xf1, 0x3e, 0x3d, 0x80, 0xf6, 0x8d, 0xbd, 0x1b, 0xb6, 0xb4, 0xbc, 0x98, 0x23, + 0x79, 0xbd, 0xb7, 0xf6, 0xc5, 0x3d, 0x10, 0xd5, 0x48, 0x3d, 0x58, 0x7c, 0x9f, + 0xbd, 0xa0, 0x5a, 0x16, 0xbd, 0x82, 0xfb, 0x8e, 0xbd, 0x0b, 0xec, 0xed, 0xbc, + 0x92, 0xb7, 0xa3, 0xbd, 0xd5, 0xfd, 0x85, 0xbd, 0x54, 0xc9, 0x20, 0x3d, 0xad, + 0xa1, 0x90, 0xbd, 0x83, 0xd6, 0xfb, 0xbc, 0xe2, 0x46, 0x43, 0x3b, 0xfe, 0xa6, + 0xbd, 0xb7, 0x8f, 0xd3, 0xaf, 0x3d, 0x75, 0xb9, 0x9d, 0x3d, 0xd5, 0xfc, 0x2a, + 0x3c, 0xc6, 0x7e, 0xd6, 0xbc, 0x08, 0xcd, 0x4c, 0xbd, 0xcf, 0x4f, 0x73, 0x3d, + 0x3e, 0x7f, 0xb7, 0xbc, 0xbc, 0xa9, 0xfd, 0xbc, 0xf4, 0x8b, 0xa6, 0xbc, 0x11, + 0x90, 0xd0, 0xbc, 0x47, 0xf7, 0x4d, 0x3c, 0xed, 0x09, 0x64, 0xbd, 0x61, 0x49, + 0x8d, 0xbc, 0xc8, 0xd3, 0x3c, 0x3d, 0x72, 0x23, 0x88, 0x3d, 0xc3, 0xa7, 0x2e, + 0x3d, 0x67, 0x01, 0x2d, 0xbd, 0xcc, 0x34, 0xa0, 0xbd, 0x7e, 0xc7, 0xf8, 0xbc, + 0x0c, 0xf5, 0xaf, 0xbb, 0x6e, 0xa6, 0x4f, 0x3d, 0xe2, 0xb9, 0x88, 0xbd, 0x87, + 0x6f, 0xf9, 0xbc, 0x82, 0x23, 0x16, 0x3c, 0x10, 0x0c, 0x69, 0x3b, 0xab, 0x02, + 0xe2, 0x3c, 0x57, 0x6a, 0x08, 0xba, 0x4e, 0xc7, 0x6a, 0x3d, 0x30, 0x86, 0x6d, + 0x3c, 0xee, 0xb3, 0x84, 0x3d, 0xf9, 0xc4, 0x3a, 0x3d, 0x6f, 0x21, 0x8d, 0xbb, + 0xef, 0x7e, 0xc1, 0x3b, 0x05, 0xca, 0x12, 0xbc, 0x8a, 0x77, 0x2b, 0xbd, 0x1e, + 0x23, 0x32, 0x3d, 0x32, 0x8b, 0x03, 0x3d, 0xd3, 0x33, 0x0a, 0xbd, 0x3f, 0xdd, + 0x59, 0xbd, 0x18, 0xfa, 0x00, 0x3d, 0x46, 0x0b, 0xdd, 0x3b, 0x96, 0x2b, 0x4c, + 0xbd, 0xc8, 0xcc, 0xa7, 0x3d, 0xe2, 0xad, 0x2e, 0x3d, 0xbc, 0x68, 0x54, 0x3d, + 0xcb, 0x88, 0xae, 0x3c, 0x00, 0xd8, 0x15, 0xbc, 0x18, 0x4b, 0xb5, 0xbd, 0x89, + 0x31, 0x93, 0xbd, 0x84, 0xd3, 0x57, 0x3d, 0x86, 0x2c, 0x6c, 0x3d, 0x18, 0x08, + 0xb1, 0x3d, 0x14, 0x61, 0xbc, 0xbc, 0x25, 0xa4, 0x27, 0xbd, 0xfa, 0xdd, 0xb7, + 0xbd, 0x81, 0xaf, 0x1d, 0xbc, 0x06, 0x91, 0x5d, 0x3d, 0x54, 0xfb, 0xc9, 0xbc, + 0x0b, 0x35, 0x9a, 0x3b, 0x48, 0x7f, 0x1c, 0xbd, 0xaa, 0x85, 0x54, 0x3d, 0x3e, + 0x43, 0xfe, 0xbb, 0xcb, 0xf9, 0xbf, 0x3b, 0x4b, 0x03, 0xed, 0x3c, 0xe0, 0x7f, + 0x85, 0x3d, 0xe2, 0x52, 0x82, 0x3d, 0x98, 0x11, 0x94, 0x3d, 0x39, 0x2d, 0x26, + 0x3c, 0xce, 0x96, 0x5e, 0xbd, 0x6c, 0x42, 0x31, 0xbd, 0xca, 0x90, 0xd4, 0x3b, + 0x66, 0xa9, 0xc0, 0xbd, 0x23, 0x2e, 0x8d, 0x3d, 0x26, 0xc8, 0x4a, 0xbc, 0x2a, + 0xbd, 0x09, 0xbd, 0x26, 0xa5, 0xe6, 0x3c, 0x1e, 0x7c, 0xaa, 0x3d, 0x1b, 0x52, + 0x15, 0x3d, 0xb2, 0xa4, 0x81, 0x3d, 0x73, 0x78, 0x8a, 0x3c, 0x60, 0x6d, 0x4a, + 0xbd, 0x60, 0xc1, 0x3b, 0xbc, 0x14, 0xc6, 0xfb, 0x3c, 0x48, 0x70, 0x05, 0xbd, + 0xc1, 0xa4, 0x98, 0x3d, 0x71, 0x0a, 0xc4, 0xbd, 0x25, 0xdd, 0x31, 0xbd, 0x99, + 0x3a, 0x94, 0xbd, 0xa1, 0x45, 0xbf, 0x3c, 0x54, 0x14, 0xbf, 0xbc, 0xfd, 0x98, + 0xd2, 0xbd, 0xca, 0x27, 0x87, 0xbd, 0x1a, 0x52, 0x3a, 0x3d, 0xc3, 0xcf, 0x42, + 0xbc, 0x4c, 0x2f, 0xe0, 0x3a, 0x96, 0x3f, 0x5e, 0x3b, 0xba, 0xc2, 0x1d, 0xbd, + 0xed, 0x26, 0x42, 0xbd, 0xf6, 0xe0, 0xb4, 0x3d, 0xbe, 0x39, 0x23, 0xbc, 0x05, + 0x9d, 0xba, 0x3c, 0xe9, 0x38, 0x2f, 0xbb, 0x15, 0x9c, 0xbb, 0x3d, 0x22, 0xca, + 0x66, 0x3c, 0x10, 0x16, 0xdb, 0xbc, 0x11, 0x3d, 0xda, 0x3d, 0xac, 0x48, 0x37, + 0xbd, 0xac, 0x3e, 0x08, 0xbd, 0x8b, 0xb1, 0x7f, 0x3d, 0xe7, 0x31, 0xa3, 0x3c, + 0xd5, 0xe9, 0xb6, 0x3d, 0x53, 0xc1, 0x19, 0xbd, 0x2f, 0xc2, 0x35, 0xbd, 0xf9, + 0xa6, 0xa2, 0xbd, 0x46, 0x22, 0x2b, 0x3d, 0x2a, 0x2c, 0x3b, 0xbd, 0xf3, 0x8e, + 0x07, 0x3c, 0xff, 0xb1, 0x09, 0xbd, 0xbd, 0x01, 0x0f, 0xbb, 0x04, 0x7f, 0x4a, + 0xbd, 0xb9, 0xca, 0x87, 0x3d, 0x4e, 0x96, 0x12, 0xbc, 0x7b, 0x9a, 0x7d, 0x3d, + 0x1b, 
0x48, 0x08, 0xbc, 0x1b, 0x36, 0x8a, 0x3d, 0xd1, 0x48, 0xe1, 0x3c, 0xb9, + 0xb0, 0x6f, 0x3d, 0x51, 0x6a, 0x83, 0xbb, 0xaa, 0xf0, 0xac, 0x3d, 0x61, 0xdb, + 0x43, 0xbd, 0x2e, 0xcf, 0xa2, 0x3d, 0xa6, 0x41, 0x89, 0x3d, 0x53, 0x86, 0xe1, + 0xbc, 0xda, 0x91, 0x9a, 0xbd, 0xba, 0xf7, 0x86, 0x3d, 0x8b, 0x8c, 0xab, 0xbd, + 0xa2, 0x2c, 0x6b, 0x3d, 0x31, 0x66, 0x83, 0x3c, 0xce, 0xd5, 0x0e, 0xbd, 0x35, + 0x29, 0x73, 0x3d, 0x9b, 0xf7, 0xb0, 0x3d, 0x51, 0x33, 0x21, 0x3d, 0x4c, 0xa1, + 0x4b, 0x3d, 0x58, 0xe3, 0xd5, 0xbc, 0x9f, 0xe4, 0x68, 0x3b, 0xed, 0x0b, 0x1e, + 0x3b, 0xc8, 0x06, 0x8c, 0x3c, 0x67, 0x47, 0x17, 0xbd, 0x63, 0xb4, 0xd1, 0xbc, + 0xf3, 0x34, 0x55, 0xbc, 0xde, 0x7b, 0x31, 0xbd, 0x17, 0x4e, 0x74, 0xba, 0x8b, + 0x65, 0x43, 0xbc, 0x01, 0xcc, 0xa0, 0x3d, 0xc7, 0x20, 0xa2, 0xbd, 0x63, 0x70, + 0x67, 0x3c, 0x65, 0xa0, 0x8d, 0x3d, 0xdf, 0xc9, 0x3d, 0xbc, 0x2f, 0xfa, 0x44, + 0x3b, 0xd2, 0xcf, 0x42, 0x3d, 0x9a, 0x40, 0x06, 0x3d, 0x67, 0x53, 0x4b, 0xbc, + 0x43, 0x50, 0x4a, 0x3c, 0x23, 0xb9, 0xa1, 0xbc, 0xad, 0x34, 0xe3, 0xbc, 0xac, + 0xc4, 0x4f, 0xbd, 0x4b, 0x40, 0xe5, 0xbb, 0xc3, 0xf1, 0x50, 0xbd, 0x98, 0x34, + 0x28, 0xbd, 0x28, 0xf8, 0xae, 0x3d, 0xd1, 0x27, 0x8f, 0x3c, 0xb4, 0x8c, 0x8b, + 0x3d, 0x73, 0xf2, 0x07, 0xbb, 0x65, 0x39, 0x61, 0xbd, 0x9a, 0x90, 0xcb, 0xbb, + 0x18, 0x2f, 0x8e, 0xbd, 0x65, 0xab, 0x4b, 0x3d, 0xd1, 0x40, 0x64, 0xbd, 0x10, + 0xdb, 0x83, 0xbd, 0x3b, 0x12, 0xa5, 0x3d, 0x31, 0x45, 0x78, 0x3d, 0xa4, 0xb1, + 0x26, 0x3d, 0xac, 0x10, 0x42, 0xbc, 0xbe, 0x62, 0xb3, 0xbd, 0x4e, 0x3d, 0x76, + 0x3c, 0x66, 0x0e, 0xde, 0xbc, 0x4f, 0x82, 0xd0, 0xbd, 0xf1, 0x86, 0x8e, 0xbd, + 0xf1, 0xe8, 0x37, 0x3c, 0xb7, 0xbb, 0x0e, 0x3d, 0x1c, 0xc4, 0x05, 0x3d, 0x15, + 0x50, 0x86, 0x3d, 0x81, 0x10, 0x92, 0x3b, 0x0a, 0xff, 0xed, 0x3c, 0x91, 0x9b, + 0xb3, 0xbb, 0xb5, 0xba, 0x26, 0xbc, 0x89, 0xef, 0x0f, 0x3d, 0x52, 0xde, 0x47, + 0x3d, 0x9d, 0x0f, 0x0c, 0x3d, 0x80, 0xee, 0xcb, 0xbd, 0xe2, 0xc7, 0x82, 0xbd, + 0x1a, 0xf6, 0x64, 0x3c, 0xaf, 0xa7, 0xbf, 0xbc, 0xfc, 0x41, 0x37, 0x3c, 0xf9, + 0x88, 0xfe, 0xbc, 0xdf, 0x47, 0x8d, 0xbc, 0x55, 0x09, 0x0b, 0xbd, 0x32, 0x50, + 0x00, 0xbd, 0x83, 0x62, 0xaf, 0xbc, 0xdc, 0xac, 0x5e, 0xbd, 0xb6, 0x22, 0x54, + 0xbd, 0x74, 0xd7, 0x00, 0x3c, 0xe3, 0x5a, 0xcb, 0xbc, 0xaa, 0x37, 0x25, 0xbd, + 0x64, 0x98, 0x5f, 0x3d, 0x81, 0xdf, 0x8b, 0x3c, 0x23, 0xef, 0x66, 0x3b, 0x84, + 0x67, 0x55, 0xbb, 0xd2, 0x11, 0x98, 0xbd, 0x2b, 0x15, 0x82, 0x3d, 0xeb, 0x1e, + 0xc6, 0x3c, 0x56, 0x83, 0xcb, 0xba, 0xd0, 0xc7, 0x2d, 0x3d, 0xd1, 0xcd, 0x0c, + 0x3d, 0xe4, 0x5c, 0x5a, 0xbc, 0x4a, 0xf3, 0x73, 0xbd, 0x43, 0xdc, 0xfe, 0x3c, + 0x00, 0xd6, 0x2f, 0x3d, 0x06, 0x22, 0x49, 0xbb, 0x4e, 0x45, 0x71, 0xbc, 0xb3, + 0x3c, 0x00, 0x3d, 0x1a, 0xae, 0x58, 0xbd, 0x15, 0x61, 0x92, 0x3d, 0x14, 0xb9, + 0xf8, 0xbc, 0x15, 0x2c, 0x1b, 0x3d, 0x31, 0x97, 0x3b, 0xbc, 0xe2, 0xe7, 0x18, + 0x3d, 0xcf, 0xf0, 0x1f, 0xbd, 0x7c, 0x1e, 0x0f, 0x3d, 0xb1, 0x27, 0x7f, 0xbd, + 0xb8, 0xdd, 0xb2, 0xbd, 0xcc, 0xc2, 0x44, 0x3d, 0x44, 0x5c, 0x06, 0xbd, 0x4f, + 0x6a, 0x4a, 0xbd, 0x43, 0x2c, 0x87, 0x3d, 0xb7, 0xe9, 0x48, 0xbd, 0x60, 0x01, + 0x07, 0xbd, 0x0b, 0xe4, 0x78, 0x3a, 0x92, 0x5d, 0x64, 0xbd, 0x7c, 0xcf, 0x81, + 0xbc, 0xe2, 0x59, 0xab, 0x3c, 0xf0, 0xbc, 0x68, 0xbc, 0xc3, 0x2d, 0x3d, 0x3d, + 0x27, 0xb2, 0xce, 0x3d, 0x44, 0x61, 0x0e, 0x3c, 0x94, 0x6d, 0x02, 0xbd, 0xe5, + 0x6f, 0xc2, 0x3c, 0x70, 0xab, 0x8a, 0x3a, 0x14, 0xab, 0x04, 0x3c, 0x9d, 0xd4, + 0xab, 0x3d, 0x0a, 0x7d, 0x64, 0x3c, 0x17, 0xb5, 0xce, 0x3b, 0x66, 0xbd, 0x24, + 0x3d, 0xed, 0xce, 0x77, 0xbd, 0xed, 0x6e, 0x7f, 0xbd, 0x70, 0xe8, 0x10, 0xbc, + 0x6a, 0x80, 0x37, 0x3d, 0x2d, 0x0b, 
0x83, 0x3d, 0x8e, 0x4b, 0x5e, 0xbd, 0xd6, + 0x38, 0x34, 0xbd, 0xce, 0xaf, 0x88, 0x3d, 0xef, 0x64, 0x10, 0xbc, 0xa0, 0x8b, + 0xac, 0xbd, 0x70, 0xa5, 0x50, 0x3c, 0x87, 0x3d, 0x83, 0x3d, 0x70, 0x63, 0x57, + 0xbd, 0xf3, 0x6a, 0x44, 0x3d, 0x3a, 0x49, 0xda, 0xbd, 0x1b, 0x74, 0xde, 0xbd, + 0x0d, 0xb2, 0x34, 0x3d, 0x04, 0x0f, 0x87, 0x3d, 0x04, 0xb1, 0x25, 0xbd, 0x5f, + 0x2c, 0x01, 0xbc, 0x9a, 0x55, 0x6b, 0x3b, 0xad, 0xdf, 0x5e, 0x3d, 0x7f, 0x85, + 0x2a, 0x3c, 0xfa, 0x88, 0xfa, 0xbc, 0x0d, 0x79, 0x8b, 0xbd, 0x01, 0x45, 0x73, + 0x3d, 0x11, 0xde, 0xb6, 0x3c, 0xcc, 0xb5, 0xa4, 0x3c, 0xe8, 0xc5, 0x67, 0xbc, + 0x66, 0x99, 0x92, 0x3d, 0x36, 0xb0, 0x79, 0xbd, 0x14, 0x41, 0xa7, 0x3d, 0xfe, + 0x98, 0xcf, 0x3c, 0x32, 0xf7, 0x0a, 0x3d, 0xa6, 0x4a, 0x45, 0x3d, 0x83, 0xa0, + 0x9e, 0x3d, 0x86, 0x2e, 0x71, 0x3d, 0x92, 0x9c, 0x4d, 0x3d, 0xed, 0x24, 0xeb, + 0xbc, 0x3e, 0xfe, 0xc0, 0xbc, 0xcd, 0x6e, 0x4f, 0x3c, 0x83, 0x86, 0xa5, 0xbd, + 0xa4, 0xd7, 0xa5, 0xbc, 0xe0, 0x9a, 0x38, 0x3d, 0xe2, 0x79, 0xcd, 0x3c, 0x4a, + 0xe2, 0xa1, 0x3c, 0x94, 0x66, 0xd1, 0xbc, 0xe6, 0xed, 0x9b, 0x3c, 0x68, 0xb1, + 0x41, 0x3b, 0x1b, 0x65, 0x0b, 0x3d, 0xdd, 0x50, 0xae, 0xbd, 0x29, 0xf9, 0xfc, + 0xbc, 0x33, 0xe6, 0x37, 0xbd, 0xb6, 0x53, 0xbb, 0x3c, 0x0c, 0x5e, 0xf6, 0x3d, + 0x75, 0xbb, 0xf6, 0xbc, 0xf8, 0xc6, 0x9a, 0x3d, 0x8f, 0xe5, 0xc4, 0x3c, 0x88, + 0xee, 0x33, 0xbc, 0x73, 0xb2, 0x87, 0x3c, 0xd4, 0xd8, 0x58, 0x3c, 0x15, 0x37, + 0x82, 0x3d, 0xc1, 0x4f, 0x38, 0xbc, 0xba, 0x8e, 0xf9, 0xbb, 0x7c, 0x56, 0xe0, + 0xbd, 0xca, 0x23, 0x94, 0xbc, 0x24, 0x41, 0xae, 0x3d, 0x89, 0x4e, 0x9a, 0x3c, + 0xcb, 0x28, 0xe3, 0x3c, 0xf1, 0xfa, 0x05, 0x3d, 0xe3, 0xa4, 0x80, 0xbd, 0x6f, + 0xda, 0x16, 0x3d, 0xc7, 0xee, 0x77, 0xbd, 0xa8, 0xe3, 0xb1, 0xbc, 0x6f, 0x70, + 0x90, 0xbc, 0x78, 0x35, 0x48, 0x3d, 0xac, 0xdb, 0x23, 0xbd, 0x4e, 0xbd, 0xe4, + 0xbb, 0x79, 0x88, 0xd0, 0xbb, 0xf2, 0xa9, 0xb6, 0xbd, 0x54, 0x46, 0x5d, 0xbd, + 0xc6, 0xb2, 0x95, 0x3d, 0xe6, 0x67, 0x52, 0x3d, 0xa6, 0x5d, 0x7f, 0xbd, 0x0b, + 0xe5, 0xad, 0x3b, 0x91, 0xf6, 0x0c, 0x3c, 0x33, 0x45, 0xab, 0xbc, 0xa7, 0x84, + 0xb3, 0xbc, 0xf5, 0xb0, 0x6c, 0x3c, 0x08, 0xc9, 0xb4, 0x3c, 0x61, 0x9d, 0x8b, + 0x3c, 0x0d, 0x19, 0x87, 0x3d, 0xaa, 0xbc, 0xd3, 0xbc, 0x85, 0x92, 0x8e, 0x3b, + 0xfc, 0x26, 0x49, 0xbd, 0x56, 0x7e, 0x7f, 0x3d, 0xf3, 0x85, 0x61, 0xbd, 0x8c, + 0x5b, 0xf0, 0x3c, 0x14, 0x09, 0x65, 0xbd, 0x66, 0x78, 0x38, 0xbb, 0x2c, 0x69, + 0x4d, 0xbd, 0x33, 0x31, 0x46, 0x3d, 0x6d, 0xb8, 0xa6, 0xbc, 0x69, 0x4e, 0xc3, + 0x3d, 0xc9, 0x54, 0x93, 0xbd, 0x1a, 0x80, 0x83, 0x3d, 0x06, 0x1b, 0xa8, 0x3c, + 0xf0, 0x64, 0x65, 0x3c, 0xae, 0xd7, 0xb2, 0x3d, 0x03, 0xc0, 0xf0, 0x3c, 0x9d, + 0xbf, 0x84, 0xbd, 0xa6, 0x60, 0xfd, 0xbd, 0x58, 0x27, 0x41, 0x3d, 0x3f, 0x70, + 0x9f, 0x3c, 0x13, 0x59, 0x37, 0xbd, 0x6b, 0x61, 0x4e, 0xbd, 0xb5, 0xf3, 0x26, + 0x39, 0x10, 0x99, 0xc5, 0x3c, 0x7c, 0xda, 0x28, 0x3d, 0x23, 0x7b, 0x78, 0x3b, + 0xa5, 0x5f, 0x1c, 0xbd, 0x8e, 0x82, 0xd0, 0x3c, 0x42, 0x5a, 0x29, 0x3d, 0x5c, + 0x7a, 0x1d, 0xb8, 0xf8, 0x4e, 0x3c, 0xbc, 0x24, 0xee, 0x52, 0x3b, 0x56, 0xfa, + 0x0b, 0x3d, 0xe2, 0xa4, 0xc4, 0x3b, 0xd1, 0x51, 0xe1, 0xbd, 0x22, 0xbb, 0x7f, + 0xbd, 0xd3, 0x54, 0x6d, 0x3d, 0x75, 0x61, 0xaa, 0x3d, 0x4a, 0xd4, 0x33, 0x3d, + 0x2d, 0x5f, 0x91, 0x3c, 0x38, 0xc6, 0xe3, 0xb9, 0x91, 0x94, 0x38, 0x3d, 0x87, + 0x92, 0xd5, 0x3c, 0xb3, 0x59, 0x34, 0xbd, 0x74, 0x48, 0x64, 0xbd, 0x90, 0xb1, + 0xba, 0x3c, 0xd1, 0x21, 0x97, 0x3c, 0xb9, 0x24, 0xa7, 0x3c, 0xa0, 0xe7, 0xe8, + 0xbd, 0xf1, 0xc5, 0x45, 0x3c, 0x93, 0x0e, 0x2e, 0x3d, 0x31, 0x84, 0xd5, 0xbc, + 0xd7, 0x86, 0xbf, 0x3c, 0x5b, 0xae, 0xb8, 0x3c, 0xc3, 0x7e, 0xf3, 
0xbc, 0xb1, + 0xd7, 0x0c, 0x3d, 0x2a, 0x33, 0xcc, 0x3d, 0x86, 0x09, 0x6b, 0x3d, 0xb6, 0xa4, + 0x97, 0x3d, 0x15, 0x03, 0x89, 0x3d, 0x5c, 0x5c, 0x85, 0x3d, 0x47, 0x39, 0x65, + 0x3d, 0xd2, 0x8b, 0x06, 0xbd, 0x6c, 0xed, 0x55, 0x3b, 0x30, 0xd5, 0x99, 0xbc, + 0x7d, 0x00, 0xb5, 0xbb, 0x54, 0xe8, 0x12, 0xbd, 0x8c, 0x6f, 0x3e, 0x3c, 0x07, + 0x15, 0x9a, 0x3d, 0xf2, 0x93, 0xa1, 0x3d, 0x0a, 0xf7, 0x7c, 0x3d, 0x89, 0xe9, + 0xc0, 0x3c, 0xc4, 0x63, 0x6d, 0x3d, 0x02, 0x6a, 0xa9, 0x3d, 0x85, 0x9b, 0x4b, + 0x3d, 0x20, 0x90, 0x99, 0x3c, 0xcd, 0xb5, 0x1f, 0x3d, 0x7f, 0x5e, 0x72, 0xbd, + 0x19, 0x42, 0x08, 0xbc, 0x4c, 0xd0, 0x60, 0xbd, 0x28, 0x45, 0x5d, 0xbd, 0x9f, + 0x9e, 0x95, 0xbd, 0xf8, 0x82, 0x82, 0xbd, 0x14, 0xd6, 0x3c, 0x3d, 0x55, 0x69, + 0x6e, 0x3d, 0x6e, 0xd1, 0x37, 0xbc, 0x6a, 0x72, 0x34, 0xbd, 0x67, 0x77, 0xa4, + 0xbc, 0xd0, 0xb2, 0xaa, 0x3d, 0xfa, 0xbb, 0x32, 0x3d, 0x5b, 0xfd, 0x1e, 0x3d, + 0x6b, 0x18, 0x8a, 0x3b, 0xd1, 0xe0, 0x3b, 0x3c, 0x0e, 0xaa, 0xb8, 0xbc, 0xd8, + 0x60, 0x73, 0x3d, 0x18, 0xea, 0xac, 0x3d, 0x0a, 0x98, 0x8c, 0xbd, 0xa8, 0xae, + 0x90, 0x3d, 0xa4, 0x92, 0x81, 0x3b, 0xfa, 0x7d, 0x67, 0x3d, 0xd1, 0x86, 0xad, + 0x3d, 0xa0, 0x03, 0x2e, 0xbc, 0xa7, 0x6d, 0xf7, 0x3c, 0x93, 0xfe, 0x81, 0x3d, + 0x55, 0x43, 0xdd, 0x3b, 0x9e, 0xc7, 0x19, 0x3d, 0xc1, 0x4e, 0x1e, 0x3d, 0x4a, + 0xb6, 0x3c, 0xbd, 0xae, 0x17, 0x16, 0xbd, 0xa1, 0xf5, 0x4d, 0xbd, 0x89, 0x2c, + 0x04, 0xbd, 0xd3, 0xeb, 0x93, 0x3d, 0x35, 0xae, 0x19, 0x3c, 0xf8, 0x48, 0xa5, + 0x3c, 0x94, 0x41, 0xf4, 0xbc, 0x67, 0x32, 0x41, 0xbd, 0x19, 0x2d, 0x38, 0x3d, + 0x57, 0x90, 0x6f, 0xbc, 0xea, 0xb3, 0x89, 0xbc, 0x73, 0x19, 0x5b, 0x3d, 0x9d, + 0x72, 0xae, 0x3d, 0xb9, 0x8b, 0x23, 0xbd, 0xa4, 0x13, 0x43, 0xbc, 0xd0, 0x4d, + 0x12, 0x3d, 0xd7, 0xa3, 0x38, 0xbd, 0xc9, 0xb4, 0xd5, 0x3d, 0x4b, 0x93, 0x24, + 0x3c, 0xd2, 0xfa, 0xe8, 0xbc, 0xdb, 0xa3, 0x0b, 0xbd, 0xc2, 0xdd, 0x5e, 0x3d, + 0x4c, 0x2c, 0xa5, 0xbd, 0xd2, 0x24, 0x77, 0xbd, 0x50, 0xd3, 0xa1, 0x3d, 0xca, + 0xe7, 0x00, 0x3a, 0xbf, 0x15, 0xed, 0xbc, 0x83, 0xc3, 0x60, 0x3d, 0xba, 0x44, + 0x82, 0x3d, 0xa4, 0x8d, 0x93, 0x3d, 0x7a, 0xdf, 0x92, 0xbd, 0x2e, 0x60, 0xcd, + 0x3b, 0x8a, 0xc9, 0x67, 0x3d, 0xbc, 0x59, 0x2e, 0xbd, 0xd6, 0x96, 0xb0, 0x3d, + 0x89, 0x2f, 0xd1, 0xbc, 0x18, 0xd2, 0x0c, 0xbc, 0xc4, 0xf8, 0x84, 0x3d, 0x50, + 0xc8, 0x52, 0xbd, 0xa8, 0xc1, 0x58, 0xbd, 0xa3, 0xe1, 0x26, 0x3d, 0x61, 0x05, + 0x00, 0x3d, 0x5d, 0xe9, 0x84, 0x3d, 0xc2, 0x44, 0x37, 0x3d, 0xfb, 0xf3, 0xb0, + 0xbc, 0x69, 0x4b, 0x6c, 0xbd, 0xa9, 0x6b, 0xa4, 0xbc, 0x77, 0x53, 0x84, 0x3c, + 0x12, 0x21, 0x0c, 0xbd, 0x0d, 0x59, 0x08, 0xbc, 0x44, 0xb6, 0x11, 0xbd, 0xaa, + 0xef, 0x8e, 0x3d, 0x4e, 0x39, 0x32, 0x3d, 0x40, 0x7f, 0x7a, 0xbd, 0xa8, 0x2d, + 0xbf, 0xbc, 0x3a, 0xff, 0x30, 0x3d, 0xff, 0x61, 0xbb, 0x3b, 0xc3, 0xdf, 0x96, + 0xbc, 0x22, 0x74, 0x53, 0xbd, 0x69, 0x07, 0x8a, 0xbd, 0x46, 0x58, 0xe0, 0x3c, + 0x91, 0x62, 0x31, 0xbd, 0x38, 0x57, 0x01, 0xbc, 0x09, 0x74, 0x93, 0xbc, 0x3e, + 0xb2, 0x8a, 0x3c, 0xd8, 0x12, 0x1d, 0xbd, 0xd7, 0xf6, 0xc2, 0xbc, 0x86, 0x55, + 0x11, 0x3c, 0x28, 0x0d, 0x70, 0x3d, 0x98, 0xa3, 0x8a, 0x3d, 0x7b, 0xf0, 0x93, + 0xbd, 0xc2, 0x7c, 0x0b, 0xbd, 0xfa, 0x05, 0xcc, 0x3c, 0x5f, 0x77, 0x19, 0x3d, + 0xe0, 0x09, 0xb3, 0x3c, 0x13, 0x77, 0x8a, 0xbc, 0x1f, 0x76, 0x36, 0x3c, 0xfb, + 0x4f, 0x97, 0x3d, 0x1f, 0xec, 0x31, 0x3d, 0xf9, 0x14, 0x79, 0x3d, 0x50, 0xab, + 0x92, 0xbd, 0xda, 0x3c, 0xf3, 0xba, 0x2f, 0x4d, 0x72, 0xbc, 0x0f, 0x3a, 0xc6, + 0x3c, 0x7e, 0xf5, 0x40, 0xbd, 0x0f, 0xf2, 0x87, 0xbd, 0xc9, 0x6e, 0xef, 0xbc, + 0x06, 0xec, 0xce, 0xbc, 0x3d, 0x26, 0x2b, 0xbd, 0x4a, 0x6a, 0x53, 0x3d, 0x1b, + 0x90, 0x1a, 0xbb, 
0x39, 0xb6, 0x23, 0x3d, 0xa2, 0xbd, 0x88, 0xbd, 0xd7, 0x0d, + 0x2a, 0xbc, 0xf5, 0xf6, 0x94, 0xbd, 0xf0, 0xd7, 0x52, 0xbc, 0x85, 0x99, 0x83, + 0xbd, 0xdd, 0xc4, 0x8c, 0xbd, 0xaa, 0x19, 0x4a, 0x3d, 0x26, 0x21, 0xec, 0x3c, + 0x0f, 0xe7, 0x1b, 0xbc, 0x39, 0x8e, 0xea, 0xbc, 0x03, 0xdc, 0x2f, 0xbd, 0x03, + 0x8c, 0x8c, 0x3d, 0xe4, 0xcb, 0x7f, 0xbc, 0xc6, 0xb9, 0xfd, 0x3b, 0x78, 0x5b, + 0x44, 0xbd, 0xd0, 0x3d, 0x89, 0xbc, 0xe0, 0xdb, 0xc2, 0xbc, 0x84, 0x8d, 0x39, + 0xbd, 0x9a, 0x7b, 0x9a, 0x3b, 0x5d, 0xb4, 0x88, 0xbc, 0xf3, 0xf0, 0x8e, 0xbd, + 0x27, 0x0c, 0x41, 0x3d, 0xe7, 0x60, 0xa0, 0x3c, 0x86, 0xb6, 0xa9, 0xbc, 0x15, + 0x55, 0x4f, 0xbd, 0xf4, 0x53, 0xfb, 0xbc, 0xdf, 0x4d, 0x0d, 0x3d, 0x06, 0x46, + 0x7d, 0xbd, 0x37, 0x4d, 0xb0, 0xbc, 0x7d, 0x65, 0x1e, 0xbd, 0x30, 0x1a, 0x00, + 0xbb, 0x16, 0x56, 0x28, 0xbd, 0xb4, 0xef, 0xdd, 0xbc, 0xcc, 0xbc, 0x40, 0xbd, + 0x95, 0xce, 0x84, 0xbd, 0x97, 0x26, 0x98, 0xbd, 0x86, 0x1f, 0x80, 0xbd, 0x64, + 0x16, 0x97, 0x3c, 0x9b, 0xd0, 0x22, 0x3c, 0x05, 0x08, 0x52, 0xbb, 0xd2, 0x11, + 0x8e, 0xbd, 0x3c, 0xa3, 0x8c, 0x3d, 0x4c, 0xdb, 0xa0, 0xbd, 0x24, 0xe2, 0x0a, + 0xbd, 0x24, 0x87, 0x69, 0x3c, 0x7c, 0x72, 0xb2, 0x3c, 0xda, 0xcd, 0x0c, 0x3d, + 0xd1, 0x51, 0x4c, 0x3d, 0xb6, 0xaf, 0x30, 0xbd, 0x07, 0xa0, 0x64, 0x3d, 0x09, + 0x30, 0x59, 0x3d, 0x68, 0xb3, 0x06, 0xbd, 0x01, 0x85, 0xe4, 0xbc, 0x10, 0x9f, + 0x2a, 0xbd, 0xe0, 0x85, 0x93, 0x3d, 0x71, 0xe0, 0x13, 0xbd, 0x28, 0x8b, 0x8e, + 0x3c, 0x53, 0x74, 0x71, 0xbc, 0x6a, 0x6d, 0xad, 0x3d, 0x88, 0xf7, 0x32, 0x3c, + 0xfb, 0xde, 0x41, 0x3c, 0x90, 0x33, 0x4c, 0xba, 0x89, 0xe4, 0x1d, 0x3c, 0x47, + 0x26, 0xb5, 0xbc, 0x5c, 0x9c, 0x9d, 0xbd, 0xd4, 0xe8, 0xdb, 0x3b, 0x7f, 0x88, + 0x99, 0x3d, 0x79, 0xd9, 0xb8, 0xbc, 0x76, 0x00, 0xb9, 0x3d, 0x74, 0x04, 0xb9, + 0xbc, 0xde, 0x84, 0x38, 0x3d, 0x5c, 0x38, 0x91, 0x3d, 0x80, 0x37, 0x04, 0xbd, + 0xfa, 0x1a, 0x34, 0x3d, 0x36, 0x16, 0x11, 0x3d, 0xf3, 0x66, 0x86, 0x3d, 0x84, + 0x83, 0x16, 0xbd, 0xec, 0x1a, 0x43, 0xbd, 0x06, 0xf8, 0x64, 0x3d, 0x96, 0x19, + 0x31, 0x3b, 0x75, 0x30, 0x9e, 0x3d, 0xf5, 0xfa, 0xd1, 0xbb, 0x96, 0xf3, 0xc8, + 0xbc, 0x84, 0x0f, 0x6d, 0xbd, 0xd1, 0x3e, 0x77, 0x3c, 0xbb, 0xb8, 0xf1, 0xbc, + 0x49, 0xf5, 0x70, 0x3d, 0x33, 0x33, 0x44, 0xbd, 0xc9, 0xca, 0xf5, 0x3c, 0x5d, + 0xe3, 0x2c, 0xbc, 0x06, 0x48, 0xb8, 0x3d, 0xfe, 0xac, 0x12, 0x3d, 0x1d, 0xd6, + 0x86, 0x3d, 0x54, 0xa5, 0x39, 0x3d, 0x4d, 0x88, 0xeb, 0x3c, 0x14, 0xe2, 0x3e, + 0x3c, 0xb5, 0xe9, 0xd3, 0xbc, 0x97, 0xe0, 0x7e, 0x3c, 0x9b, 0xa2, 0x5a, 0xbc, + 0x14, 0xab, 0x89, 0x3d, 0x4a, 0xdc, 0x93, 0x3d, 0xe8, 0xee, 0xb5, 0xbc, 0x5f, + 0x9a, 0x9b, 0x3b, 0x26, 0x69, 0x55, 0x3c, 0x7d, 0x50, 0x89, 0xbc, 0xe0, 0x93, + 0x8c, 0x3b, 0x44, 0xbc, 0x23, 0xbd, 0x47, 0x76, 0x85, 0x3d, 0xfd, 0x6a, 0x25, + 0x39, 0x3e, 0x57, 0x9c, 0x3d, 0x70, 0xdd, 0xd0, 0x3b, 0x40, 0xdf, 0x3b, 0x3d, + 0x47, 0x5c, 0xbd, 0xbc, 0x90, 0x3d, 0x33, 0xbd, 0xd8, 0xc6, 0x76, 0xbd, 0xf2, + 0xd8, 0x51, 0x3d, 0x17, 0x60, 0x9c, 0xbd, 0x32, 0x78, 0x1b, 0xbd, 0xb4, 0xef, + 0x70, 0x3d, 0xfa, 0x9d, 0xb6, 0x3b, 0x88, 0x5c, 0xe0, 0x3a, 0x47, 0x1b, 0xf8, + 0xbc, 0x3b, 0x66, 0xcb, 0xba, 0x30, 0xe1, 0x04, 0xbd, 0x58, 0xbe, 0x87, 0xbd, + 0xc2, 0xa5, 0x10, 0xbc, 0x48, 0x34, 0xa3, 0x3d, 0x44, 0xa4, 0x77, 0x3d, 0x7d, + 0xe5, 0x94, 0xba, 0x23, 0xd9, 0xa3, 0xbc, 0xf6, 0xf6, 0xc6, 0xbc, 0xea, 0xd8, + 0x31, 0xbd, 0x9f, 0x50, 0x24, 0x3d, 0xc8, 0x2a, 0x37, 0x3d, 0xaf, 0xe4, 0x82, + 0x3d, 0x28, 0x20, 0x70, 0x3d, 0xa3, 0x27, 0x52, 0x3d, 0xbd, 0x34, 0x8a, 0x3c, + 0x8c, 0x2c, 0xde, 0x3c, 0x35, 0xf4, 0x70, 0xbd, 0x35, 0x89, 0x19, 0x3d, 0x54, + 0x59, 0x46, 0xb9, 0xa6, 0xfb, 0xc0, 0xbc, 0x56, 
0x95, 0x8d, 0x3d, 0xd1, 0x4f, + 0x71, 0x3d, 0xe1, 0xe3, 0x9f, 0x3d, 0x05, 0xe2, 0x82, 0xbd, 0xb7, 0xcf, 0x06, + 0x3d, 0x02, 0x28, 0xa3, 0xbc, 0xd0, 0xcf, 0x48, 0x3d, 0x8e, 0x69, 0x3b, 0xbc, + 0x1e, 0x83, 0x14, 0xbb, 0x72, 0x67, 0x82, 0x3b, 0x64, 0x7d, 0xeb, 0xbc, 0x2a, + 0x76, 0xe5, 0xba, 0x6a, 0xd8, 0x3c, 0xbd, 0x10, 0xc0, 0x4c, 0x3d, 0x64, 0x44, + 0x64, 0x3d, 0xbe, 0xb4, 0x31, 0xbd, 0x0c, 0x43, 0x09, 0xbd, 0xa4, 0x6d, 0x8d, + 0xbd, 0xd0, 0xbf, 0x4a, 0x3d, 0x09, 0x76, 0x90, 0xbd, 0x29, 0x9c, 0x0b, 0x3d, + 0x7c, 0x61, 0x74, 0xbd, 0xb9, 0x1c, 0x1c, 0xbd, 0x09, 0x6d, 0xad, 0x3b, 0x3e, + 0xb4, 0x93, 0xbc, 0x1f, 0x5a, 0xa4, 0x3c, 0xe2, 0x7a, 0x89, 0xbd, 0x1c, 0x1d, + 0x49, 0x3c, 0x0c, 0xc3, 0x06, 0xbd, 0xf9, 0xe2, 0xd6, 0x3c, 0x1a, 0x44, 0x57, + 0xbd, 0x7a, 0xac, 0x50, 0x3d, 0x39, 0xe4, 0xc4, 0x3c, 0xfb, 0x1e, 0x04, 0x3d, + 0x8a, 0xf6, 0x53, 0xbd, 0xfc, 0xac, 0x62, 0xbc, 0x44, 0xcc, 0x20, 0x3d, 0xf6, + 0x5e, 0xa0, 0x3c, 0x88, 0x20, 0xcd, 0xba, 0x6b, 0xc7, 0x1c, 0xbd, 0x66, 0xd2, + 0x16, 0xbb, 0x8b, 0x02, 0x58, 0xbd, 0x17, 0x15, 0x83, 0x3d, 0xef, 0x6a, 0x84, + 0x3d, 0x00, 0x91, 0xd1, 0xba, 0x9a, 0xa6, 0x83, 0x3d, 0x6e, 0x12, 0x9c, 0xbd, + 0x4c, 0x00, 0x46, 0x3d, 0x08, 0x8e, 0xcf, 0x3b, 0x53, 0x98, 0xb9, 0xbc, 0x5c, + 0x33, 0x43, 0x3d, 0x05, 0x7b, 0x03, 0xbd, 0x82, 0x26, 0x35, 0xbd, 0xbf, 0x76, + 0x75, 0xbd, 0x08, 0x78, 0x49, 0xbd, 0xe1, 0x7e, 0x53, 0xbc, 0xf0, 0x64, 0xf2, + 0x3c, 0x56, 0xaf, 0x1a, 0x3d, 0x1c, 0x8f, 0x08, 0x3d, 0x11, 0xac, 0x91, 0xbd, + 0xe8, 0x21, 0x06, 0x3d, 0xf5, 0xbb, 0xdb, 0xbc, 0x0c, 0xc9, 0x81, 0xbd, 0x74, + 0x76, 0x83, 0xbd, 0x5e, 0xf3, 0x40, 0xbd, 0xd6, 0xbb, 0x98, 0x3d, 0x4b, 0x9a, + 0x93, 0x3c, 0x25, 0x64, 0x9d, 0xbd, 0xf4, 0xf4, 0x9e, 0xbc, 0x66, 0xbe, 0x2b, + 0xbb, 0xad, 0xa4, 0x82, 0x3c, 0x76, 0x08, 0x5d, 0xbd, 0x2c, 0xf4, 0x2f, 0xbd, + 0xb3, 0x5e, 0x84, 0x3d, 0x62, 0xad, 0x06, 0x3d, 0x6a, 0xe5, 0xea, 0xbc, 0xd8, + 0x06, 0x23, 0x3d, 0x85, 0x25, 0xeb, 0xbc, 0xa9, 0x01, 0xab, 0xbb, 0x28, 0xe4, + 0xf3, 0x3c, 0x9f, 0x9e, 0x8e, 0xbd, 0x3f, 0xe2, 0x2c, 0xbc, 0xe0, 0xfd, 0xc1, + 0x3c, 0x84, 0x67, 0xa7, 0xbb, 0xc5, 0x1d, 0xfc, 0xbc, 0xee, 0x05, 0x6b, 0xbd, + 0x9a, 0x29, 0xc9, 0xbc, 0x35, 0x9c, 0x0f, 0x3d, 0xff, 0xd3, 0x1c, 0xbd, 0x60, + 0x5c, 0x3d, 0xbd, 0x85, 0xf0, 0x81, 0x3d, 0xe6, 0x58, 0x0f, 0xbc, 0xda, 0x46, + 0x01, 0xbd, 0xe4, 0xae, 0x88, 0xbd, 0xe2, 0x4a, 0x47, 0xbd, 0x51, 0xf0, 0x7e, + 0xbd, 0x18, 0xc7, 0x82, 0x3d, 0x85, 0xf7, 0x26, 0x3d, 0x7f, 0xe0, 0xc0, 0xbc, + 0x28, 0xa7, 0x56, 0x3b, 0x86, 0xe9, 0x17, 0xbb, 0x75, 0xc7, 0x81, 0x3d, 0x0c, + 0x95, 0x19, 0xbc, 0x27, 0x0d, 0x62, 0xbd, 0xae, 0x2f, 0x14, 0x3b, 0xcf, 0x26, + 0x47, 0xbd, 0x75, 0xe8, 0x26, 0x3d, 0x99, 0x94, 0x48, 0x3d, 0xac, 0xe6, 0x3f, + 0x3d, 0x50, 0xa8, 0xee, 0x3c, 0x25, 0x3e, 0xef, 0xbc, 0x98, 0xfe, 0x37, 0xbc, + 0x05, 0x4b, 0x28, 0x3d, 0xa5, 0x42, 0xfc, 0x3c, 0x40, 0xda, 0x68, 0x3d, 0xf7, + 0x91, 0x35, 0x3d, 0xae, 0xa1, 0x1a, 0x3d, 0xeb, 0xc7, 0x1b, 0xbd, 0x98, 0x7d, + 0xb1, 0x3c, 0xf7, 0xe7, 0x0b, 0xbd, 0x72, 0x31, 0x47, 0x3d, 0x47, 0xeb, 0x85, + 0xbd, 0x4f, 0x71, 0x1f, 0xbc, 0xae, 0x19, 0x1b, 0xbd, 0x30, 0xc5, 0xd7, 0xbb, + 0x94, 0xbe, 0x05, 0x3d, 0x39, 0x66, 0x94, 0x3c, 0x68, 0xab, 0x65, 0xbc, 0x4a, + 0x43, 0xd3, 0xbc, 0x66, 0x6e, 0x22, 0x3d, 0x2c, 0xb6, 0x45, 0x3d, 0xec, 0xf0, + 0x09, 0xbd, 0x15, 0x84, 0xd6, 0x3c, 0x67, 0xb6, 0x5e, 0xbd, 0x48, 0xb9, 0x1b, + 0x3d, 0xef, 0x6b, 0x36, 0x3d, 0xfa, 0x9f, 0x60, 0x3c, 0xfb, 0x49, 0x8c, 0x3d, + 0x50, 0x0b, 0xfd, 0x3c, 0x43, 0x24, 0xf5, 0x3c, 0x48, 0xf5, 0x1c, 0x3d, 0x24, + 0xed, 0x55, 0xbd, 0x12, 0x2a, 0x33, 0xbd, 0x6f, 0x59, 0x3b, 0xbb, 0xeb, 0x66, + 
0xe0, 0xbc, 0x7b, 0x67, 0x60, 0xbb, 0x19, 0x8c, 0x85, 0x3c, 0x72, 0x71, 0x22, + 0x3b, 0x7f, 0xa1, 0x22, 0xbd, 0x9e, 0xcd, 0x04, 0x3d, 0x00, 0xf6, 0xff, 0xb9, + 0xdf, 0x8b, 0x16, 0xbd, 0xc1, 0x0c, 0xfd, 0x3c, 0x9b, 0xf9, 0x5b, 0xbd, 0x71, + 0x73, 0x8c, 0x3d, 0x0f, 0x55, 0x63, 0x3d, 0x20, 0xbf, 0xb9, 0x3c, 0xa3, 0xc5, + 0x85, 0x3d, 0xfd, 0x98, 0x2e, 0xbd, 0xb4, 0x02, 0x2e, 0xbc, 0xe2, 0x12, 0x46, + 0xbc, 0x90, 0x41, 0x6f, 0xbd, 0x0d, 0xc7, 0x68, 0x3d, 0x4e, 0x58, 0x4f, 0x3c, + 0xc0, 0xeb, 0x1d, 0xbb, 0x3d, 0xcb, 0x9f, 0xbd, 0x29, 0x0c, 0x7f, 0x3d, 0x8a, + 0x62, 0x4d, 0xbc, 0x01, 0x3c, 0x7b, 0x3d, 0x3c, 0x41, 0xb8, 0x3c, 0xa9, 0x70, + 0x53, 0x3d, 0x32, 0x94, 0xab, 0x3d, 0xdc, 0x75, 0x4c, 0x3d, 0xab, 0x5d, 0xd6, + 0xbc, 0xae, 0x74, 0x0a, 0xbd, 0x7f, 0xf5, 0xec, 0x3c, 0xff, 0x6e, 0x4c, 0xbd, + 0x0c, 0x65, 0x16, 0xbc, 0x4f, 0x2a, 0x58, 0x3c, 0xe2, 0x17, 0xa0, 0x3d, 0x6a, + 0x10, 0x83, 0xbc, 0xfc, 0x40, 0xc0, 0x3d, 0xbc, 0xa0, 0xad, 0xbc, 0xde, 0xdc, + 0x98, 0x3d, 0xaf, 0x54, 0x84, 0xbb, 0x64, 0xcd, 0xdf, 0x3c, 0xab, 0x93, 0x2c, + 0xbc, 0x44, 0x5c, 0x29, 0x3c, 0xac, 0x7f, 0x27, 0x3d, 0xb2, 0x34, 0xee, 0x3c, + 0x66, 0xf2, 0xd9, 0x3c, 0x4d, 0xaf, 0x86, 0x3d, 0xee, 0x79, 0x10, 0xbd, 0xa2, + 0x84, 0x31, 0xbd, 0xe2, 0xf9, 0x43, 0x3d, 0x26, 0x87, 0xf1, 0x3b, 0xf0, 0x3a, + 0x8f, 0xbd, 0x3e, 0x23, 0x5d, 0xbd, 0x75, 0x0a, 0x7c, 0x3d, 0x15, 0xe4, 0x5a, + 0xbd, 0x45, 0xb3, 0xb2, 0x3c, 0xe3, 0xc4, 0x36, 0x3d, 0x7d, 0x89, 0x9f, 0x3c, + 0x9e, 0x54, 0xaa, 0xbb, 0x89, 0x2e, 0x88, 0xbd, 0xad, 0xe0, 0x89, 0xbc, 0x69, + 0xe9, 0x66, 0xbd, 0x94, 0xa9, 0xf4, 0xbc, 0xb3, 0xde, 0x21, 0xbd, 0x0b, 0x5a, + 0x82, 0xbd, 0x55, 0x78, 0x00, 0x3d, 0x1f, 0x1d, 0xa2, 0xbd, 0x5c, 0xe4, 0x4b, + 0xbd, 0x63, 0x9e, 0xa6, 0xbd, 0x44, 0xdb, 0x75, 0xbd, 0x6a, 0xe7, 0xf3, 0xbc, + 0xdc, 0xa5, 0x2c, 0xbd, 0xc7, 0xcd, 0x8d, 0x3c, 0xd4, 0x97, 0x85, 0x3c, 0xc5, + 0x19, 0x4a, 0xbc, 0x48, 0x7d, 0x09, 0xbc, 0xd6, 0x74, 0x2c, 0xbd, 0x94, 0xb6, + 0xf9, 0x3c, 0xfd, 0x54, 0x8d, 0x3d, 0xdf, 0x85, 0x57, 0x3d, 0x82, 0x58, 0x67, + 0x3d, 0x67, 0x4a, 0xe8, 0xba, 0xec, 0xb0, 0xe9, 0x3c, 0x9a, 0xf0, 0x1f, 0x3d, + 0x80, 0xbc, 0x7e, 0xbd, 0x15, 0xe3, 0x16, 0x3d, 0x49, 0xb7, 0x33, 0xbc, 0x03, + 0xbe, 0x65, 0xbd, 0x6c, 0x41, 0x8b, 0x3d, 0x93, 0x68, 0x85, 0xbc, 0x50, 0x1a, + 0x50, 0xbd, 0x10, 0xbe, 0x7f, 0xbc, 0x15, 0x0c, 0x58, 0xbc, 0x48, 0xe9, 0x92, + 0xbd, 0x48, 0x67, 0x3e, 0xbc, 0x38, 0x60, 0x66, 0xbd, 0x76, 0xac, 0x9e, 0xbd, + 0x4d, 0xc9, 0x61, 0x3d, 0x0b, 0xa6, 0x9f, 0xbd, 0x8f, 0x08, 0xcb, 0x3c, 0x60, + 0x17, 0x35, 0x3d, 0x60, 0x75, 0x7a, 0x3c, 0x24, 0x97, 0x48, 0x3a, 0x64, 0x78, + 0x90, 0xbc, 0xf3, 0x93, 0xb8, 0xbb, 0x46, 0x84, 0x69, 0xbd, 0xd6, 0x71, 0x43, + 0x3d, 0xb4, 0x2b, 0x62, 0xbc, 0x47, 0x6b, 0x08, 0x3c, 0x0e, 0x23, 0xeb, 0xbc, + 0xf4, 0xc8, 0xb0, 0xbc, 0x3f, 0x17, 0xbe, 0xbc, 0x11, 0xc5, 0x99, 0x3d, 0x50, + 0x81, 0x15, 0x3d, 0x8e, 0xd8, 0x7d, 0x3d, 0xfd, 0x07, 0x8d, 0xbb, 0x7a, 0x46, + 0xea, 0x3c, 0x7d, 0xc9, 0x2c, 0x3d, 0x1e, 0x27, 0x2f, 0x3d, 0x67, 0x04, 0x05, + 0xbc, 0x8f, 0x0a, 0x71, 0xbc, 0x44, 0xcb, 0x78, 0xbc, 0x3b, 0x8e, 0x17, 0x3d, + 0x8c, 0x61, 0xf6, 0x3c, 0xdf, 0x7a, 0x54, 0x3d, 0x93, 0xe6, 0xaa, 0xbc, 0xef, + 0x19, 0xd2, 0xbc, 0xb8, 0xec, 0x13, 0x3d, 0xed, 0x16, 0x39, 0x3d, 0x7c, 0xb2, + 0xdc, 0x3c, 0x03, 0xf9, 0x84, 0xb9, 0xe7, 0xbd, 0x70, 0xbc, 0xea, 0x33, 0x77, + 0x3d, 0xa8, 0xd3, 0x55, 0x3c, 0x3b, 0x55, 0x04, 0x3c, 0x72, 0x75, 0x67, 0xbc, + 0xde, 0x63, 0x4b, 0xbc, 0x73, 0xc5, 0x01, 0xbd, 0x2e, 0x1b, 0x01, 0x3c, 0xb2, + 0xeb, 0x57, 0x3d, 0x81, 0xaa, 0x2d, 0xbd, 0x68, 0x5f, 0x1c, 0xbd, 0x0e, 0x36, + 0x77, 0x3d, 0xd9, 0xb5, 0x27, 
0x3c, 0x99, 0x74, 0x27, 0x3d, 0xae, 0x86, 0x74, + 0xbd, 0x57, 0x12, 0x0e, 0xbd, 0x37, 0x30, 0x2a, 0x3d, 0x5e, 0xf5, 0x3b, 0x3d, + 0x37, 0x81, 0x6f, 0x3d, 0xd3, 0xe7, 0x4b, 0xbd, 0x4a, 0x7f, 0x85, 0x3d, 0xce, + 0x31, 0x21, 0x3d, 0xda, 0xf8, 0x86, 0xbc, 0x5e, 0x6d, 0x1f, 0x3c, 0x80, 0x1b, + 0x06, 0x3b, 0xd7, 0x82, 0x5f, 0x3d, 0x74, 0xc0, 0x26, 0xbd, 0x1d, 0x0e, 0x8d, + 0xbc, 0x00, 0xfe, 0x06, 0x3d, 0x5f, 0x91, 0x79, 0xbd, 0x53, 0x7a, 0xee, 0xbc, + 0x64, 0x03, 0x41, 0x3d, 0x66, 0xa9, 0xfa, 0xba, 0x67, 0x37, 0x40, 0xbd, 0xd8, + 0x7f, 0x23, 0xbd, 0x1a, 0x9f, 0x03, 0xbc, 0x93, 0x26, 0x03, 0xbd, 0xeb, 0xf7, + 0x58, 0xbc, 0x04, 0xe4, 0xdc, 0xb9, 0xb6, 0xbb, 0x9b, 0x3b, 0x9e, 0x4b, 0x14, + 0x3d, 0x5a, 0x9a, 0xd4, 0xba, 0x59, 0xcd, 0x21, 0xbd, 0x00, 0xc3, 0x85, 0x3c, + 0xec, 0xbf, 0xf2, 0xbc, 0x0e, 0x59, 0x3a, 0xbd, 0xa7, 0x8f, 0x81, 0x3d, 0x11, + 0x2d, 0x63, 0xbd, 0x55, 0x42, 0xe8, 0xbc, 0x6b, 0x6e, 0x8c, 0x3c, 0xa3, 0x84, + 0x1d, 0xbd, 0x8c, 0xda, 0x4f, 0x3c, 0xb2, 0x36, 0xd1, 0x3c, 0x4f, 0x27, 0x71, + 0x3d, 0xf8, 0x32, 0x8c, 0x3c, 0x5c, 0xe8, 0x69, 0xbc, 0x42, 0xcb, 0x24, 0x3d, + 0x8f, 0xd8, 0x6b, 0xbd, 0x87, 0xd2, 0x9c, 0xbd, 0xc5, 0x3f, 0xb5, 0x3c, 0x08, + 0xfc, 0xf9, 0x3c, 0x5b, 0x21, 0x7e, 0x3d, 0xef, 0x06, 0x65, 0xbc, 0xda, 0x92, + 0x02, 0x3c, 0xb1, 0xf0, 0x99, 0xbc, 0x2e, 0x72, 0xe7, 0xbc, 0x32, 0x44, 0x6a, + 0xbd, 0xdd, 0xbb, 0x20, 0x3b, 0xa1, 0xbf, 0xa3, 0x3c, 0xd2, 0x4f, 0x9b, 0x3c, + 0xf8, 0x55, 0xbe, 0x3c, 0x35, 0xe3, 0x0a, 0x3d, 0xf0, 0x8a, 0x89, 0xbc, 0xd7, + 0xd7, 0x6f, 0x3d, 0x96, 0xd9, 0x70, 0xbd, 0x00, 0x50, 0x20, 0x39, 0x1f, 0xa7, + 0x17, 0x3d, 0x4f, 0x4f, 0xc3, 0xbb, 0xf6, 0x99, 0x40, 0xbd, 0x87, 0xd4, 0x2a, + 0xbd, 0x09, 0x54, 0x06, 0x3d, 0x87, 0x46, 0xf4, 0xbb, 0x9c, 0x12, 0x12, 0x3c, + 0x2f, 0xc9, 0xd1, 0x3c, 0x4c, 0x47, 0x4e, 0x3d, 0xf9, 0x77, 0x64, 0xbd, 0xd1, + 0xa5, 0x17, 0xbd, 0xf3, 0x5b, 0xdb, 0x3c, 0x98, 0x30, 0x55, 0x3d, 0x3f, 0x3d, + 0x37, 0xbd, 0x54, 0x12, 0xed, 0xbc, 0x30, 0x26, 0x1d, 0x3d, 0x72, 0x80, 0x8a, + 0x3d, 0xf1, 0xd7, 0x4c, 0xbd, 0xa9, 0xc7, 0x83, 0x3d, 0x86, 0xba, 0x93, 0xbd, + 0x6b, 0x0a, 0x90, 0xbd, 0x96, 0x8c, 0x64, 0xbd, 0x40, 0x70, 0xf1, 0x3a, 0xc0, + 0x39, 0x79, 0x3d, 0x27, 0xda, 0x24, 0xbc, 0x36, 0x2e, 0x3c, 0x3d, 0xb0, 0xbe, + 0x90, 0xbd, 0x20, 0x68, 0x14, 0xbc, 0x00, 0xa4, 0x3e, 0xbc, 0x85, 0xb9, 0x44, + 0xbd, 0xa2, 0x06, 0x52, 0xbd, 0x6e, 0xae, 0x4a, 0xbd, 0xbe, 0x73, 0x6c, 0xbd, + 0x49, 0xee, 0x3e, 0xbd, 0x36, 0x8a, 0xe0, 0x3c, 0x7f, 0x94, 0x8a, 0xbd, 0x19, + 0x1d, 0x11, 0xbd, 0x15, 0x3e, 0x55, 0xbd, 0x4b, 0xcd, 0x7b, 0x3d, 0x63, 0xd7, + 0x9f, 0xba, 0x83, 0xcb, 0x37, 0xbd, 0xa4, 0x4f, 0x21, 0xbd, 0xa5, 0xaf, 0xec, + 0xbc, 0xcd, 0x46, 0xae, 0xbd, 0xe8, 0x66, 0x9d, 0x3c, 0x7c, 0x84, 0xa6, 0xbc, + 0x85, 0xcc, 0x7f, 0x3d, 0xa5, 0x28, 0xa6, 0xbd, 0x2f, 0x3a, 0x55, 0xbc, 0xb4, + 0x8b, 0xc8, 0xbc, 0xd3, 0x90, 0x5e, 0x3d, 0x49, 0x79, 0x81, 0xbd, 0x50, 0xc3, + 0x79, 0xbc, 0x90, 0x04, 0x9b, 0xbd, 0x1e, 0xdb, 0x73, 0x3d, 0x97, 0x15, 0x7e, + 0x3c, 0x5f, 0xf6, 0x83, 0x3d, 0x1d, 0x20, 0x32, 0x3c, 0xda, 0x32, 0x7a, 0xbd, + 0x8f, 0xa0, 0x69, 0x3c, 0x20, 0xe0, 0x87, 0xbd, 0x08, 0xb7, 0x2f, 0x3d, 0x5e, + 0x6c, 0x26, 0xbd, 0xba, 0xa8, 0xbe, 0xbc, 0xb3, 0x9b, 0xb7, 0xbc, 0xc1, 0x3e, + 0x8e, 0x3d, 0x45, 0x90, 0x3f, 0xbd, 0x82, 0xee, 0x0c, 0x3d, 0x62, 0xe1, 0x38, + 0xbc, 0x30, 0x95, 0x8b, 0x3c, 0xc6, 0x6b, 0x58, 0x3d, 0x7c, 0xca, 0x06, 0xbd, + 0x03, 0xa3, 0x7b, 0x3d, 0x77, 0xef, 0x83, 0x3c, 0x24, 0xc7, 0x69, 0x3d, 0xf6, + 0xed, 0x35, 0xbd, 0xaa, 0x2d, 0x33, 0x3d, 0x71, 0x69, 0x72, 0x3c, 0xed, 0x0d, + 0x80, 0x3c, 0x02, 0x0d, 0x47, 0x3d, 0x30, 0x51, 0x86, 0xbc, 
0x0a, 0xad, 0x8d, + 0xbc, 0x80, 0xab, 0x1c, 0x3d, 0x68, 0x17, 0x3d, 0x3d, 0x47, 0x3c, 0x36, 0xbd, + 0x32, 0x58, 0xfb, 0x3c, 0x27, 0x47, 0x82, 0x3d, 0xb8, 0x9c, 0x92, 0xbc, 0xab, + 0xa8, 0xaf, 0xbb, 0x97, 0xb4, 0x7b, 0x3d, 0xdb, 0x16, 0xad, 0xbc, 0xa8, 0x50, + 0x8b, 0xbd, 0x50, 0x91, 0x4d, 0x3c, 0xe1, 0x69, 0x73, 0x3c, 0x62, 0x4f, 0x30, + 0xbd, 0x00, 0x70, 0x6a, 0x3c, 0x57, 0xbb, 0x8f, 0x3d, 0xe6, 0x60, 0x44, 0xbd, + 0x33, 0x5a, 0xc2, 0xbc, 0xe6, 0xae, 0x82, 0xbd, 0x1e, 0xad, 0x6e, 0xbd, 0xc9, + 0x43, 0x30, 0x3d, 0x30, 0x4a, 0x65, 0x3c, 0x79, 0x1d, 0xc7, 0x3c, 0x97, 0xab, + 0x1e, 0x3b, 0x95, 0x60, 0xd7, 0xbc, 0xcc, 0xed, 0xa1, 0xbc, 0xa3, 0x6d, 0x6b, + 0xbd, 0xd8, 0xc4, 0x30, 0x3c, 0xcf, 0x3e, 0x8b, 0xbc, 0x82, 0xd9, 0x0d, 0xbc, + 0x6b, 0x1f, 0xdb, 0xbc, 0xb7, 0x65, 0x76, 0xbd, 0x19, 0x3a, 0xfb, 0x3c, 0xe8, + 0x08, 0x08, 0xbd, 0x0b, 0xdb, 0x00, 0xbd, 0x4c, 0x51, 0x19, 0xbd, 0x2e, 0x6c, + 0x37, 0x3d, 0xc0, 0xdf, 0x1e, 0x3b, 0x64, 0x10, 0x49, 0x3d, 0x77, 0x9b, 0xca, + 0xbc, 0xca, 0x17, 0xfb, 0xbc, 0xe6, 0xa4, 0x92, 0x3d, 0xfd, 0x90, 0x77, 0x3d, + 0x82, 0x5e, 0x6b, 0x3d, 0xe5, 0x15, 0x3c, 0x3d, 0xc3, 0x45, 0xf9, 0xbb, 0x0c, + 0x61, 0x88, 0xbd, 0x26, 0xa1, 0x68, 0xbd, 0x67, 0x2c, 0x1e, 0xbd, 0x2b, 0xfe, + 0x3e, 0xbd, 0xb9, 0x45, 0x0b, 0xbd, 0x8e, 0x79, 0x09, 0xbd, 0x16, 0xdf, 0x45, + 0xbd, 0x52, 0xbb, 0x24, 0xbc, 0x84, 0x55, 0x78, 0xbd, 0xb7, 0x6d, 0x55, 0x3d, + 0xb8, 0xe4, 0x8a, 0x3d, 0xcc, 0x8e, 0x2d, 0xbd, 0xf8, 0x0a, 0x13, 0x3c, 0xda, + 0x22, 0x23, 0x3d, 0xee, 0x07, 0x1e, 0x3d, 0xee, 0x5c, 0x38, 0xbd, 0x1b, 0xfa, + 0xc1, 0xbc, 0x62, 0x88, 0x82, 0xbc, 0x9e, 0x6c, 0x39, 0xbd, 0xe8, 0xc8, 0x90, + 0xbd, 0xb2, 0xaf, 0x0e, 0xbd, 0x87, 0xc1, 0x61, 0xbc, 0x91, 0xcf, 0x21, 0x3b, + 0xaa, 0x52, 0x88, 0xbd, 0x2b, 0xcb, 0x8e, 0xbd, 0x42, 0x58, 0xb0, 0x3c, 0x72, + 0x3e, 0x9a, 0x3c, 0x1e, 0x92, 0x09, 0x3d, 0xc6, 0x67, 0x9a, 0xbd, 0xa0, 0xb0, + 0x29, 0x3b, 0x51, 0x6e, 0x0c, 0xbd, 0x88, 0x0d, 0x4d, 0xbd, 0x1c, 0xc3, 0xee, + 0x3c, 0x43, 0xfc, 0x61, 0x3d, 0x74, 0x13, 0x84, 0x3c, 0x10, 0xbc, 0xd4, 0x3c, + 0x8a, 0x20, 0x9d, 0x39, 0x0a, 0x33, 0xdd, 0x3b, 0xee, 0x75, 0x96, 0xbd, 0x77, + 0x4f, 0xa2, 0x3c, 0x1a, 0x55, 0xe4, 0xbc, 0x17, 0x4b, 0x5c, 0xbc, 0xe8, 0x22, + 0x5a, 0xbd, 0xcf, 0xa8, 0x46, 0x3c, 0x2e, 0x1d, 0x2c, 0xbd, 0x7c, 0x53, 0x62, + 0xbc, 0x4e, 0xdc, 0x25, 0x3d, 0x3c, 0x94, 0x4e, 0xbd, 0xba, 0x9a, 0x3b, 0xbd, + 0x32, 0x01, 0x02, 0x3d, 0x57, 0xd2, 0x80, 0x3d, 0x88, 0x7d, 0xb4, 0xbc, 0x81, + 0xbf, 0x7f, 0xbd, 0xf7, 0xbb, 0x89, 0x3d, 0xa0, 0xba, 0x30, 0x3d, 0x13, 0xd5, + 0x91, 0x3d, 0xc7, 0x59, 0x37, 0x3d, 0x3c, 0xc1, 0x95, 0xbd, 0x41, 0x62, 0x94, + 0xbc, 0x09, 0x66, 0x25, 0xbc, 0x4a, 0x10, 0x84, 0xbd, 0xf0, 0x61, 0x09, 0x3d, + 0x7c, 0xba, 0x6d, 0x3d, 0x43, 0x44, 0x60, 0x3d, 0xbc, 0x42, 0x2d, 0x3d, 0x09, + 0x6d, 0x2d, 0x3d, 0x3b, 0x61, 0xb1, 0x3c, 0xd7, 0xb2, 0x36, 0xbc, 0x10, 0xe9, + 0x06, 0xbd, 0xd4, 0x30, 0x64, 0x3d, 0x4e, 0xb2, 0x8d, 0xbc, 0x54, 0x0d, 0x24, + 0xbd, 0xb6, 0x13, 0xe8, 0x3c, 0xe1, 0xd2, 0xd3, 0x3c, 0xd2, 0xc8, 0x99, 0xbc, + 0x5c, 0x05, 0x75, 0x3d, 0x58, 0x19, 0x91, 0x3d, 0x66, 0x5b, 0x03, 0xbd, 0xf4, + 0x88, 0xbd, 0xbc, 0xff, 0x51, 0x93, 0xbc, 0xaa, 0xc8, 0x3e, 0x3d, 0x57, 0x16, + 0xbc, 0xba, 0xf4, 0xe1, 0xa0, 0xbd, 0x3a, 0x82, 0x94, 0xbd, 0x77, 0xfa, 0x86, + 0xbd, 0xa6, 0xfd, 0x84, 0xbb, 0x91, 0x28, 0xeb, 0xbb, 0x86, 0xfd, 0xca, 0xbc, + 0x7f, 0xd4, 0x10, 0xbc, 0xea, 0x09, 0x08, 0xbd, 0xbe, 0x9e, 0x23, 0xbc, 0x5a, + 0x6a, 0x4f, 0xbd, 0x00, 0xf1, 0x54, 0x3d, 0xf4, 0x72, 0xb8, 0xbc, 0x0a, 0xde, + 0x0f, 0x3d, 0x27, 0x61, 0x1b, 0x3d, 0xed, 0xb6, 0x49, 0xbd, 0x11, 0x6d, 0xfb, + 0x3c, 0x51, 
0x41, 0x75, 0x3d, 0x0b, 0x3b, 0x68, 0x3d, 0x1e, 0xb2, 0x6c, 0xbd, + 0xd0, 0x5a, 0xfe, 0x3c, 0x3d, 0xa0, 0x30, 0xbd, 0xc8, 0xf9, 0x89, 0x3c, 0x10, + 0x06, 0x72, 0x3d, 0xed, 0x61, 0xe1, 0x3a, 0x35, 0x65, 0x7e, 0x3d, 0x16, 0x6c, + 0x4d, 0x3d, 0x8a, 0xf6, 0x5a, 0x3d, 0x3e, 0x18, 0x64, 0x3d, 0x36, 0x9a, 0xbe, + 0x3c, 0x14, 0xa7, 0xba, 0xbc, 0x93, 0x98, 0xe3, 0x3c, 0x14, 0x13, 0x30, 0x3d, + 0xa8, 0x9a, 0x71, 0xbc, 0xd0, 0x9e, 0xfd, 0xbc, 0x10, 0x8b, 0xa7, 0xbd, 0xb9, + 0x47, 0x2f, 0x3d, 0x44, 0xff, 0x9c, 0xbd, 0x5b, 0x84, 0x3e, 0xbd, 0xc6, 0xa4, + 0xaa, 0x3c, 0x5b, 0xa9, 0x0e, 0xbd, 0x6b, 0xa6, 0x33, 0x3d, 0x65, 0x26, 0x46, + 0x3d, 0x8e, 0x5d, 0xdc, 0xbc, 0x62, 0xcf, 0x43, 0xbd, 0xfd, 0x0e, 0x86, 0x3d, + 0x52, 0xd5, 0xf3, 0x3c, 0x10, 0x00, 0x50, 0xbc, 0x55, 0xec, 0x6c, 0xbd, 0x9b, + 0x21, 0x46, 0x3d, 0xb3, 0xe4, 0x80, 0xbc, 0xa1, 0xf7, 0x84, 0xbd, 0x64, 0x01, + 0x4e, 0xbd, 0x01, 0xfb, 0x3e, 0xbc, 0x28, 0xfc, 0xac, 0xbc, 0x84, 0xf6, 0x17, + 0x3c, 0x69, 0x7c, 0xd9, 0xbc, 0x30, 0xb8, 0xfe, 0xbc, 0x0e, 0x3a, 0x87, 0xbd, + 0x88, 0xad, 0x93, 0xbd, 0xe1, 0x85, 0x8d, 0xbd, 0x42, 0x8c, 0x12, 0x3d, 0x41, + 0x59, 0x84, 0xbd, 0x1c, 0x0e, 0x70, 0xbb, 0xb0, 0x9e, 0xd3, 0xbc, 0x3c, 0x03, + 0xdb, 0xbb, 0xf4, 0x19, 0x01, 0x3d, 0x6f, 0x20, 0xc6, 0x3c, 0x77, 0xc0, 0xb4, + 0x3c, 0x4a, 0xa0, 0xa7, 0x3c, 0x1c, 0xaa, 0x2a, 0xbd, 0x49, 0x9b, 0x60, 0xbd, + 0x30, 0xff, 0xf9, 0xbc, 0x2f, 0x70, 0xc9, 0xbb, 0x72, 0x4b, 0x8f, 0xbd, 0x47, + 0xc6, 0x34, 0x3d, 0x18, 0x49, 0x21, 0x3c, 0x04, 0x19, 0x30, 0x3d, 0x74, 0xbe, + 0x7b, 0xbb, 0xbc, 0x92, 0x43, 0xbc, 0x6f, 0xb6, 0xdf, 0xbc, 0x20, 0xdb, 0x90, + 0x3c, 0x45, 0x29, 0x95, 0xbc, 0x4c, 0x9c, 0xa6, 0x3c, 0x2b, 0xbf, 0xe4, 0xbc, + 0xa9, 0x41, 0xff, 0xbc, 0x62, 0x15, 0xd4, 0x3c, 0x29, 0x60, 0x8e, 0xbd, 0x8d, + 0xce, 0x56, 0xbc, 0x84, 0x09, 0x41, 0x3d, 0x16, 0xb8, 0x35, 0x3d, 0x03, 0x5c, + 0x09, 0xbd, 0x82, 0xfe, 0x64, 0x3d, 0x16, 0x2e, 0x6d, 0xbd, 0xbf, 0x4b, 0x05, + 0xbd, 0x15, 0x9a, 0x28, 0xbd, 0x1d, 0x3d, 0x4f, 0xbd, 0x7c, 0x8a, 0x99, 0x3b, + 0xf9, 0x8c, 0x35, 0xbd, 0xef, 0xc2, 0x2a, 0xbd, 0xe6, 0xea, 0x85, 0xbc, 0xfd, + 0xf1, 0xde, 0x3b, 0xce, 0xb3, 0x5f, 0x3d, 0x2f, 0x4a, 0x30, 0xbc, 0xc5, 0xa1, + 0x09, 0xbd, 0x63, 0x5f, 0x5e, 0xbd, 0x44, 0xc9, 0xc2, 0xbc, 0xb6, 0x2a, 0xf8, + 0xbc, 0x58, 0x39, 0x34, 0x3d, 0x49, 0xbe, 0x5c, 0xbd, 0x45, 0xad, 0x1d, 0x3c, + 0x3f, 0x9f, 0x19, 0xbd, 0xfb, 0xef, 0x2e, 0x3c, 0xd5, 0xe8, 0x88, 0x3c, 0x13, + 0x36, 0x5c, 0xbd, 0x04, 0xeb, 0x78, 0x3c, 0x6e, 0x39, 0x64, 0x3d, 0xdc, 0x1e, + 0x70, 0x3d, 0x79, 0x43, 0x4d, 0x3d, 0xfd, 0x0f, 0x30, 0xbd, 0xd2, 0x88, 0x18, + 0x3d, 0x87, 0x62, 0xcc, 0x3c, 0x00, 0x39, 0x30, 0x3d, 0xba, 0xa0, 0xfa, 0xbc, + 0x00, 0x3d, 0x41, 0x3d, 0xed, 0xfa, 0x73, 0xbd, 0x0c, 0x09, 0x54, 0xbd, 0x77, + 0x2f, 0x5f, 0xbd, 0x01, 0x38, 0x7f, 0xbd, 0x98, 0x08, 0xee, 0xbc, 0x53, 0x34, + 0x48, 0xbc, 0x8a, 0x25, 0x72, 0xbc, 0xf3, 0x71, 0x70, 0xbd, 0x44, 0xdf, 0x1b, + 0x3d, 0xd8, 0x6e, 0x6f, 0xbd, 0xdf, 0x4d, 0x23, 0x3c, 0x9c, 0xfb, 0x21, 0x3d, + 0x72, 0xe1, 0xa4, 0xbc, 0x74, 0xc3, 0x2e, 0xbd, 0x63, 0x0c, 0x8a, 0xbc, 0x24, + 0x09, 0x6e, 0xbd, 0xbb, 0x68, 0x68, 0xbd, 0x7d, 0xd7, 0x6c, 0x3d, 0xd8, 0x63, + 0x63, 0x3c, 0x1a, 0x16, 0xdb, 0xbb, 0x86, 0x5e, 0x40, 0xbd, 0x50, 0x6d, 0x31, + 0xbb, 0xdd, 0xb6, 0x96, 0xbd, 0x19, 0x27, 0x56, 0xbd, 0xf3, 0xd5, 0x11, 0x3d, + 0x91, 0x8e, 0x68, 0x3d, 0xea, 0xed, 0x86, 0xbd, 0xd6, 0x51, 0x87, 0xbc, 0xfb, + 0x6c, 0x76, 0xbd, 0x50, 0x6f, 0x38, 0x3d, 0x9b, 0xa5, 0x71, 0xbd, 0x9b, 0x1f, + 0x16, 0xbd, 0x25, 0xee, 0x93, 0x3d, 0xa9, 0x05, 0xca, 0xbc, 0x9f, 0xee, 0x36, + 0xbd, 0x5c, 0x03, 0x28, 0x3d, 0x52, 0x3b, 
0xb1, 0x3c, 0xe3, 0x45, 0x13, 0x3d, + 0x38, 0xec, 0x82, 0xbd, 0xba, 0xc6, 0x5f, 0x3d, 0x18, 0xf7, 0x59, 0x3d, 0xc4, + 0x2f, 0x89, 0x3c, 0x3c, 0x23, 0xd1, 0xbc, 0x39, 0xa7, 0x28, 0x3d, 0x07, 0x78, + 0x17, 0xbc, 0x72, 0xe3, 0xaf, 0xbc, 0x15, 0x2e, 0x2d, 0x3d, 0x2c, 0x3d, 0xa3, + 0x3c, 0x33, 0x96, 0x18, 0xbd, 0xee, 0x47, 0x30, 0xbd, 0x56, 0xc0, 0x0e, 0xbd, + 0xae, 0x3b, 0x74, 0x3c, 0x79, 0x3e, 0x94, 0x3d, 0xee, 0x19, 0x3d, 0xbd, 0x8d, + 0x14, 0x7a, 0xbd, 0x49, 0xfa, 0x2e, 0x3d, 0x9a, 0x0e, 0x8e, 0xbd, 0x41, 0x87, + 0x45, 0x3c, 0x3b, 0x28, 0x66, 0xbd, 0x3d, 0xbd, 0x20, 0x3d, 0x60, 0x4e, 0x80, + 0xbd, 0x7a, 0x3c, 0x50, 0xbd, 0xaa, 0x0f, 0x9e, 0xbd, 0xa2, 0x81, 0x57, 0xbd, + 0x69, 0xf7, 0x27, 0x3d, 0x62, 0x88, 0x17, 0xbc, 0x47, 0x5d, 0xac, 0x3c, 0xe7, + 0x41, 0x31, 0xbd, 0xde, 0xec, 0x85, 0xbd, 0x74, 0xa1, 0x48, 0xbd, 0x80, 0x0d, + 0x2a, 0xbd, 0x5e, 0x67, 0x7e, 0x3c, 0x35, 0xa5, 0xc6, 0x3c, 0xc4, 0xeb, 0x89, + 0xbc, 0xcb, 0xa7, 0x97, 0x3c, 0x0f, 0xca, 0x68, 0x3c, 0xeb, 0x57, 0xea, 0xbc, + 0x88, 0xf8, 0xb3, 0x3c, 0x44, 0x92, 0xee, 0x3c, 0x89, 0xa1, 0x92, 0x3d, 0x61, + 0xa5, 0x23, 0x3a, 0x1e, 0x6c, 0x28, 0xbd, 0x18, 0x89, 0xa4, 0x3c, 0xd1, 0x26, + 0x47, 0x3b, 0x4a, 0x06, 0x80, 0x3c, 0x3a, 0x5f, 0x58, 0xbd, 0x6e, 0x1d, 0x77, + 0xbd, 0xe1, 0x43, 0x89, 0x3a, 0x41, 0xd0, 0x71, 0xbc, 0x90, 0x43, 0x40, 0xbd, + 0xa5, 0xc3, 0x3a, 0x3c, 0xc2, 0x45, 0xb1, 0xbb, 0xf1, 0x81, 0x32, 0x3d, 0x80, + 0x8e, 0x20, 0x3d, 0x0a, 0xbd, 0x14, 0x3d, 0xbb, 0x93, 0x3e, 0xbd, 0x50, 0x1f, + 0x5b, 0x3d, 0xb7, 0xd1, 0x99, 0xbd, 0xbe, 0x77, 0x4b, 0x3d, 0x5f, 0xd4, 0x58, + 0x3d, 0xdc, 0xab, 0xa4, 0x3c, 0x41, 0x6c, 0x78, 0xbd, 0xbd, 0x11, 0x71, 0x3c, + 0xc9, 0x97, 0x50, 0xbd, 0x93, 0xca, 0xe9, 0x3b, 0xec, 0x1b, 0xb4, 0xbc, 0xcf, + 0xb1, 0x48, 0x3c, 0x26, 0xd1, 0x99, 0x3c, 0x9b, 0xca, 0x26, 0xbd, 0xe0, 0xaf, + 0x2f, 0xbc, 0xef, 0x23, 0x84, 0xbd, 0x10, 0x75, 0xe1, 0x3b, 0xe6, 0x8c, 0x3c, + 0x3d, 0xad, 0x1a, 0x48, 0x3d, 0xfe, 0x04, 0x3f, 0x3d, 0xf2, 0x2f, 0xe0, 0xbc, + 0x98, 0x58, 0xe3, 0xbb, 0xe2, 0x78, 0x84, 0x3d, 0xde, 0x9e, 0x97, 0x3b, 0xe3, + 0x90, 0x35, 0xbd, 0xb9, 0xf5, 0x57, 0x3c, 0x29, 0x97, 0x18, 0x3c, 0xa7, 0xe6, + 0x02, 0x3d, 0x6e, 0xd3, 0x0b, 0x3d, 0x09, 0x9f, 0x51, 0xbd, 0xca, 0x5b, 0xac, + 0x3a, 0x38, 0xd9, 0x55, 0xbd, 0xc0, 0x50, 0x0b, 0x3d, 0x63, 0xe8, 0x69, 0xbd, + 0x96, 0xeb, 0x86, 0xbd, 0x43, 0x18, 0x26, 0x3d, 0x76, 0xab, 0xd8, 0x3a, 0xe3, + 0x0e, 0xb9, 0xbc, 0xed, 0xb2, 0x33, 0x3c, 0x67, 0x1d, 0x7c, 0xbd, 0x13, 0x39, + 0xa8, 0x3b, 0x4b, 0xa3, 0x39, 0xbd, 0x17, 0xb9, 0x44, 0xbd, 0x88, 0x76, 0x43, + 0xbd, 0xdd, 0x31, 0x61, 0xbd, 0x2d, 0x7d, 0xae, 0xbc, 0xe9, 0xb8, 0x05, 0x3d, + 0xdd, 0x80, 0x2a, 0xbd, 0x55, 0x66, 0x08, 0xbd, 0xea, 0x09, 0x8a, 0xbd, 0x13, + 0xd8, 0x0d, 0xbd, 0x7e, 0x9d, 0x5a, 0x3d, 0x08, 0x68, 0x8d, 0x3c, 0x02, 0x87, + 0xdc, 0x3c, 0xfb, 0x55, 0xda, 0xb9, 0xc4, 0x69, 0x71, 0xbd, 0xd1, 0x02, 0xf6, + 0xbc, 0x92, 0x01, 0x0c, 0x3d, 0xbb, 0x2c, 0x40, 0xbd, 0x82, 0x69, 0x97, 0x3d, + 0x2b, 0xda, 0x57, 0xbd, 0x7b, 0x9b, 0xe0, 0x3b, 0xff, 0xfd, 0x4b, 0xbd, 0x5c, + 0xa6, 0x2e, 0x3d, 0x40, 0xec, 0x85, 0xbd, 0x3b, 0x5d, 0x17, 0xbd, 0x52, 0x04, + 0x2c, 0xbd, 0x61, 0x00, 0x20, 0x3c, 0x65, 0x33, 0x28, 0xbc, 0x77, 0x76, 0x07, + 0x3d, 0x7a, 0xff, 0x32, 0x3b, 0xb9, 0x96, 0x59, 0xbd, 0xe0, 0xe1, 0x43, 0xbd, + 0x17, 0xa7, 0x6b, 0xbd, 0xf8, 0xa6, 0x4d, 0xbd, 0x4f, 0xc3, 0x9d, 0xbb, 0xfa, + 0x3a, 0x39, 0xbd, 0xe3, 0x59, 0x9a, 0xbd, 0xbd, 0xb9, 0x43, 0xbc, 0x21, 0xc4, + 0x0c, 0x3c, 0x3e, 0x70, 0x47, 0xbd, 0x42, 0xcf, 0x93, 0x3b, 0x9b, 0xe0, 0x34, + 0x3d, 0x00, 0x5d, 0xeb, 0x39, 0x5f, 0x65, 0x80, 0xbd, 0x37, 0x8a, 0x65, 
0x3d, + 0x0e, 0x1b, 0x67, 0xbc, 0xa0, 0x0a, 0x68, 0x3c, 0xc5, 0x6d, 0xf7, 0x3c, 0xe1, + 0x9d, 0x85, 0x3d, 0xa8, 0xe7, 0x69, 0xbd, 0x30, 0x9c, 0x36, 0xbd, 0xcf, 0x55, + 0xdf, 0x3c, 0x85, 0xe9, 0x4c, 0x3d, 0x3e, 0x03, 0x8a, 0xbd, 0x19, 0xe1, 0x86, + 0xbb, 0xa0, 0x51, 0xec, 0x3c, 0x11, 0xc9, 0x84, 0x3d, 0x48, 0xa9, 0x1d, 0x3d, + 0x1c, 0xd6, 0xee, 0x3b, 0x82, 0x07, 0x96, 0xbc, 0x33, 0x6b, 0xd0, 0x3c, 0x62, + 0x62, 0xb6, 0x3c, 0x4a, 0x35, 0x62, 0x3d, 0x10, 0x85, 0x66, 0xbd, 0xc9, 0xf5, + 0x53, 0xbc, 0x70, 0x4a, 0xfa, 0x3b, 0xa5, 0x21, 0x33, 0xbd, 0xe7, 0x07, 0x40, + 0x3b, 0x6d, 0xe3, 0x16, 0x3d, 0x11, 0xa2, 0xa7, 0x3a, 0x01, 0x73, 0x95, 0xbc, + 0x5c, 0xd1, 0x2e, 0xbd, 0x5c, 0x41, 0x00, 0xbd, 0x02, 0x40, 0x8a, 0x3d, 0x66, + 0xcf, 0x2b, 0x3d, 0x3d, 0x54, 0x8b, 0xbc, 0x1b, 0x25, 0x44, 0x3d, 0x56, 0xda, + 0x15, 0xbd, 0xfc, 0x0c, 0xc1, 0xbc, 0x4d, 0xcd, 0x5e, 0xbd, 0x40, 0x55, 0x2c, + 0x3d, 0xb9, 0xe6, 0xc5, 0xbc, 0x6b, 0x0d, 0xd2, 0xba, 0xd0, 0x10, 0x28, 0x3c, + 0x6b, 0xd8, 0x63, 0xbd, 0xf7, 0xed, 0xca, 0x3c, 0xa3, 0x63, 0x5a, 0x3b, 0x45, + 0x41, 0x8e, 0x3d, 0x48, 0x23, 0xd7, 0x3c, 0x71, 0xbb, 0xa8, 0x3c, 0xe2, 0x55, + 0x98, 0x3c, 0x27, 0xae, 0x5e, 0xbc, 0x06, 0x79, 0xb4, 0xbb, 0x8c, 0xdb, 0x13, + 0xbd, 0x7b, 0x59, 0x18, 0x3d, 0xbb, 0x91, 0xfc, 0xbc, 0x4b, 0x7d, 0x80, 0xbd, + 0x58, 0x76, 0x8a, 0x3c, 0x5f, 0x71, 0xa8, 0x3c, 0xb3, 0x8f, 0x89, 0xbd, 0xb4, + 0x4c, 0x64, 0xbd, 0xf9, 0x1a, 0x81, 0x3d, 0x8f, 0xa5, 0x90, 0xbd, 0x24, 0x93, + 0xbf, 0x3c, 0x1c, 0x73, 0x68, 0x3d, 0xa5, 0x53, 0x4a, 0xbd, 0xec, 0x40, 0x34, + 0xbd, 0xb2, 0x5f, 0x90, 0x3d, 0x0d, 0xe3, 0x11, 0x3d, 0x5b, 0x77, 0x91, 0x3d, + 0xe4, 0x5b, 0x8b, 0x3d, 0x99, 0x6e, 0x6a, 0xbd, 0x05, 0xcb, 0x99, 0xbd, 0xb5, + 0x26, 0x1f, 0xbd, 0xfd, 0xc3, 0x2f, 0xbd, 0xd2, 0x82, 0x96, 0x3d, 0x06, 0xf6, + 0x78, 0xbd, 0x8e, 0x08, 0x30, 0x3d, 0x16, 0x22, 0x6d, 0xbd, 0xda, 0x25, 0x4b, + 0x3d, 0xf7, 0x44, 0x43, 0xbc, 0xba, 0x20, 0xbc, 0xbc, 0x41, 0xd7, 0x04, 0xbc, + 0xe1, 0x62, 0x0d, 0xbd, 0x93, 0x78, 0x2f, 0xbd, 0x2a, 0xad, 0xd5, 0xbc, 0x13, + 0xd3, 0x6f, 0xbd, 0x88, 0xc4, 0x12, 0xbd, 0x49, 0x73, 0x84, 0xbd, 0xd6, 0x50, + 0x2c, 0x3d, 0xa9, 0xb7, 0x7d, 0xbd, 0x9a, 0xfe, 0xff, 0xff, 0x04, 0x00, 0x00, + 0x00, 0x08, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x80, 0x04, 0x00, 0x00, + 0xae, 0xfe, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0xc0, + 0x02, 0x74, 0xbb, 0xc6, 0x58, 0x47, 0x39, 0x07, 0x36, 0x4d, 0x3c, 0xf5, 0x20, + 0xc5, 0x3c, 0xce, 0x88, 0x6c, 0x3a, 0xd2, 0x40, 0x7d, 0xbc, 0x2f, 0x7e, 0xf5, + 0x3a, 0x3d, 0xe1, 0x3e, 0xbc, 0xda, 0xfe, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, + 0x40, 0x00, 0x00, 0x00, 0x1d, 0xe1, 0xa3, 0xbc, 0xe7, 0x98, 0x88, 0x3c, 0xe4, + 0xc0, 0x49, 0x3b, 0xa6, 0x49, 0x38, 0x3c, 0x0e, 0x65, 0xbc, 0xbc, 0xd8, 0x59, + 0x73, 0xbc, 0x15, 0x66, 0x0a, 0xbd, 0x7c, 0x75, 0x24, 0xba, 0x37, 0xc4, 0x65, + 0x3c, 0x94, 0x0d, 0x84, 0x3c, 0x26, 0xcc, 0x87, 0x3c, 0x59, 0xea, 0x03, 0xbd, + 0x33, 0x39, 0x48, 0xbc, 0xac, 0x3e, 0x6d, 0x3c, 0xc7, 0x46, 0xb1, 0xbb, 0xcf, + 0xee, 0x07, 0x3d, 0x26, 0xff, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x80, 0x00, + 0x00, 0x00, 0x7c, 0xe9, 0x43, 0x3c, 0xd3, 0x16, 0xd7, 0xbc, 0x15, 0x37, 0x4a, + 0xba, 0xa4, 0xad, 0x1c, 0x3c, 0x20, 0x66, 0x3b, 0xbb, 0x22, 0x84, 0x97, 0x3a, + 0xa5, 0x65, 0x86, 0x3c, 0x68, 0x0b, 0xf7, 0xbb, 0x52, 0xaf, 0x8c, 0x3b, 0xe1, + 0x81, 0x00, 0x3d, 0x3c, 0xf9, 0xd9, 0x3c, 0x96, 0xa8, 0x80, 0x3c, 0x94, 0xdf, + 0x21, 0x3c, 0xc7, 0x26, 0xd7, 0x3a, 0x96, 0xb2, 0x8c, 0x3c, 0x17, 0x29, 0x20, + 0x3c, 0xfa, 0xe0, 0x59, 0x3c, 0xf7, 0x08, 0x14, 0x3c, 0xad, 0x71, 0x61, 0x3c, + 0x2e, 0x73, 0x1a, 0xbc, 
0x0f, 0xd0, 0x55, 0xbb, 0xa8, 0xde, 0x68, 0x3c, 0xd9, + 0x86, 0x44, 0x3c, 0x54, 0x22, 0x05, 0xbc, 0x3c, 0x7a, 0x92, 0x3c, 0x70, 0x16, + 0x01, 0x3c, 0x69, 0x1e, 0xaf, 0xbb, 0xe8, 0x4b, 0xc5, 0xbc, 0x8b, 0xfd, 0x23, + 0x3c, 0xb8, 0x1e, 0xfd, 0xbc, 0x49, 0x11, 0x50, 0xbb, 0x2a, 0x7b, 0x9c, 0x3c, + 0xb2, 0xff, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x6e, + 0x5f, 0x06, 0xba, 0xca, 0x9c, 0x99, 0xbb, 0x00, 0x00, 0x00, 0x00, 0xa4, 0x8a, + 0xfe, 0xba, 0x12, 0xed, 0xa7, 0x3c, 0xc0, 0x7d, 0x37, 0xbb, 0xa3, 0x8a, 0x30, + 0xbb, 0xd0, 0x95, 0x99, 0xbc, 0x00, 0x00, 0x00, 0x00, 0x81, 0x9c, 0x1c, 0x3d, + 0x5c, 0x2a, 0x8e, 0xbb, 0x8c, 0xc0, 0x1a, 0xbb, 0x5b, 0xa1, 0xe5, 0x3b, 0x00, + 0x00, 0x00, 0x00, 0x6a, 0x50, 0xef, 0x3c, 0xdc, 0xbc, 0x9a, 0x3a, 0x00, 0x00, + 0x06, 0x00, 0x08, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, + 0x00, 0x08, 0x00, 0x00, 0x00, 0x6e, 0x6b, 0xdf, 0xbb, 0x54, 0xe6, 0xe6, 0x3c, + 0xd0, 0xf4, 0xff, 0xff, 0xd4, 0xf4, 0xff, 0xff, 0x0f, 0x00, 0x00, 0x00, 0x4d, + 0x4c, 0x49, 0x52, 0x20, 0x43, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74, 0x65, 0x64, + 0x2e, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, + 0x00, 0x18, 0x00, 0x14, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x04, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0xa0, + 0x02, 0x00, 0x00, 0xa4, 0x02, 0x00, 0x00, 0xa8, 0x02, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, + 0x00, 0x38, 0x02, 0x00, 0x00, 0xd4, 0x01, 0x00, 0x00, 0x80, 0x01, 0x00, 0x00, + 0x3c, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, 0x8c, + 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x5a, 0xfe, + 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x10, 0x00, 0x00, + 0x00, 0x14, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x68, 0xf5, 0xff, 0xff, + 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x13, + 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x8e, 0xfe, + 0xff, 0xff, 0x1c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x1c, 0x00, 0x00, + 0x00, 0x20, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x08, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, + 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x12, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, + 0x00, 0x10, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x04, 0x00, 0x0a, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x11, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0xfe, 0xfe, 0xff, 0xff, 0x14, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x24, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0xee, 0xfe, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0x02, + 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, + 0x00, 0x10, 0x00, 0x00, 0x00, 0xde, 0xfe, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01, 0x1c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0xd0, + 0xfe, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, + 0x00, 0x03, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x7e, 0xff, 0xff, 0xff, 0x14, 
0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x05, 0x24, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x6e, 0xff, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, + 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, + 0x01, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0e, + 0x00, 0x00, 0x00, 0x5e, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x1c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x50, 0xff, 0xff, + 0xff, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x03, + 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x1a, 0x00, 0x14, 0x00, 0x10, 0x00, 0x0c, + 0x00, 0x0b, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x05, 0x34, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x18, 0x00, 0x17, 0x00, 0x10, 0x00, + 0x0c, 0x00, 0x08, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, + 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x14, 0x00, + 0x00, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x0b, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, + 0x00, 0x1c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x28, 0x00, 0x00, 0x00, + 0x2c, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x14, 0x00, 0x13, 0x00, 0x0c, 0x00, 0x08, + 0x00, 0x07, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, + 0x00, 0x0c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, + 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x15, 0x00, + 0x00, 0x00, 0x10, 0x08, 0x00, 0x00, 0xc4, 0x07, 0x00, 0x00, 0x7c, 0x07, 0x00, + 0x00, 0x44, 0x07, 0x00, 0x00, 0x0c, 0x07, 0x00, 0x00, 0xd4, 0x06, 0x00, 0x00, + 0x88, 0x06, 0x00, 0x00, 0x2c, 0x06, 0x00, 0x00, 0xe0, 0x05, 0x00, 0x00, 0x8c, + 0x05, 0x00, 0x00, 0x38, 0x05, 0x00, 0x00, 0xe4, 0x04, 0x00, 0x00, 0x28, 0x04, + 0x00, 0x00, 0xb4, 0x03, 0x00, 0x00, 0xf8, 0x02, 0x00, 0x00, 0x84, 0x02, 0x00, + 0x00, 0xc8, 0x01, 0x00, 0x00, 0x54, 0x01, 0x00, 0x00, 0xf4, 0x00, 0x00, 0x00, + 0x5c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x58, 0xf8, 0xff, 0xff, 0x14, + 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x15, 0x00, + 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, + 0xff, 0x02, 0x00, 0x00, 0x00, 0x3c, 0xf8, 0xff, 0xff, 0x19, 0x00, 0x00, 0x00, + 0x53, 0x74, 0x61, 0x74, 0x65, 0x66, 0x75, 0x6c, 0x50, 0x61, 0x72, 0x74, 0x69, + 0x74, 0x69, 0x6f, 0x6e, 0x65, 0x64, 0x43, 0x61, 0x6c, 0x6c, 0x3a, 0x30, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, + 0x00, 0xac, 0xf8, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, 0x02, + 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x90, 0xf8, + 0xff, 0xff, 0x5b, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, + 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f, + 0x31, 0x36, 0x33, 0x2f, 0x4d, 0x61, 0x74, 0x4d, 0x75, 0x6c, 0x3b, 0x73, 0x65, + 0x71, 
0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x64, + 0x65, 0x6e, 0x73, 0x65, 0x5f, 0x31, 0x36, 0x33, 0x2f, 0x52, 0x65, 0x6c, 0x75, + 0x3b, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, + 0x33, 0x2f, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f, 0x31, 0x36, 0x33, 0x2f, 0x42, + 0x69, 0x61, 0x73, 0x41, 0x64, 0x64, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x40, 0xf9, 0xff, 0xff, 0x14, 0x00, 0x00, + 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, + 0x3c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x80, + 0x04, 0x00, 0x00, 0x24, 0xf9, 0xff, 0xff, 0x20, 0x00, 0x00, 0x00, 0x73, 0x65, + 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x66, + 0x6c, 0x61, 0x74, 0x74, 0x65, 0x6e, 0x5f, 0x37, 0x32, 0x2f, 0x52, 0x65, 0x73, + 0x68, 0x61, 0x70, 0x65, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x80, 0x04, 0x00, 0x00, 0x9c, 0xf9, 0xff, 0xff, 0x14, 0x00, + 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, + 0x00, 0x48, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, + 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x88, + 0xf9, 0xff, 0xff, 0x27, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, + 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x6d, 0x61, 0x78, 0x5f, 0x70, + 0x6f, 0x6f, 0x6c, 0x69, 0x6e, 0x67, 0x32, 0x64, 0x5f, 0x31, 0x39, 0x38, 0x2f, + 0x4d, 0x61, 0x78, 0x50, 0x6f, 0x6f, 0x6c, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, + 0x00, 0x00, 0x0c, 0xfa, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, + 0x00, 0x24, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0c, 0x00, 0x00, 0x00, 0x0c, + 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0xf8, 0xf9, 0xff, 0xff, 0x6e, 0x00, + 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, + 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x33, + 0x2f, 0x52, 0x65, 0x6c, 0x75, 0x3b, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, + 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, + 0x5f, 0x32, 0x34, 0x33, 0x2f, 0x42, 0x69, 0x61, 0x73, 0x41, 0x64, 0x64, 0x3b, + 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, + 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x33, 0x2f, 0x43, + 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x3b, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, + 0x32, 0x34, 0x33, 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x00, 0x04, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x00, 0x00, 0xc4, 0xfa, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x24, + 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x48, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0e, 0x00, 0x00, + 0x00, 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0xb0, 0xfa, 0xff, 0xff, + 0x27, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, + 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x6d, 0x61, 0x78, 0x5f, 0x70, 0x6f, 0x6f, 0x6c, + 0x69, 0x6e, 0x67, 0x32, 0x64, 0x5f, 0x31, 0x39, 0x37, 0x2f, 0x4d, 0x61, 0x78, + 0x50, 0x6f, 0x6f, 0x6c, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x34, + 0xfb, 0xff, 0xff, 0x14, 0x00, 0x00, 
0x00, 0x24, 0x00, 0x00, 0x00, 0x24, 0x00, + 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, + 0x00, 0xff, 0xff, 0xff, 0xff, 0x1d, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x20, 0xfb, 0xff, 0xff, 0x6e, 0x00, 0x00, 0x00, 0x73, + 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, + 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x32, 0x2f, 0x52, 0x65, + 0x6c, 0x75, 0x3b, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, + 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, + 0x32, 0x2f, 0x42, 0x69, 0x61, 0x73, 0x41, 0x64, 0x64, 0x3b, 0x73, 0x65, 0x71, + 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, + 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x32, 0x2f, 0x43, 0x6f, 0x6e, 0x76, + 0x32, 0x44, 0x3b, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x32, + 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, + 0x00, 0xec, 0xfb, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x24, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x04, + 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x1f, 0x00, 0x00, 0x00, 0x1f, 0x00, + 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0xd8, 0xfb, 0xff, 0xff, 0x27, 0x00, 0x00, + 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, + 0x33, 0x2f, 0x6d, 0x61, 0x78, 0x5f, 0x70, 0x6f, 0x6f, 0x6c, 0x69, 0x6e, 0x67, + 0x32, 0x64, 0x5f, 0x31, 0x39, 0x36, 0x2f, 0x4d, 0x61, 0x78, 0x50, 0x6f, 0x6f, + 0x6c, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00, + 0x00, 0x1f, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x5c, 0xfc, 0xff, 0xff, + 0x14, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x0d, + 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xff, 0xff, + 0xff, 0xff, 0x3e, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, + 0x00, 0x48, 0xfc, 0xff, 0xff, 0x6e, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, + 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, + 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x31, 0x2f, 0x52, 0x65, 0x6c, 0x75, 0x3b, + 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, + 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x31, 0x2f, 0x42, + 0x69, 0x61, 0x73, 0x41, 0x64, 0x64, 0x3b, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, + 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, + 0x64, 0x5f, 0x32, 0x34, 0x31, 0x2f, 0x43, 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x3b, + 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x31, 0x2f, 0x62, 0x69, + 0x61, 0x73, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x3e, + 0x00, 0x00, 0x00, 0x3e, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x56, 0xfd, + 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, + 0x00, 0x2c, 0x00, 0x00, 0x00, 0xe8, 0xfc, 0xff, 0xff, 0x1f, 0x00, 0x00, 0x00, + 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, + 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x33, 0x2f, 0x43, + 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x00, 0x04, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, + 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0xa6, 0xfd, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x0b, + 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x38, 0xfd, 0xff, 0xff, 
0x1f, 0x00, + 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, + 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x32, + 0x2f, 0x43, 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x00, 0x04, 0x00, 0x00, 0x00, 0x10, + 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x08, 0x00, + 0x00, 0x00, 0xf6, 0xfd, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, + 0x00, 0x0a, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x88, 0xfd, 0xff, 0xff, + 0x1f, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, + 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, + 0x34, 0x31, 0x2f, 0x43, 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x00, 0x04, 0x00, 0x00, + 0x00, 0x08, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x46, 0xfe, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, + 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0xd8, 0xfd, + 0xff, 0xff, 0x1e, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, + 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f, + 0x31, 0x36, 0x34, 0x2f, 0x4d, 0x61, 0x74, 0x4d, 0x75, 0x6c, 0x00, 0x00, 0x02, + 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x8e, 0xfe, + 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, + 0x00, 0x2c, 0x00, 0x00, 0x00, 0x20, 0xfe, 0xff, 0xff, 0x1e, 0x00, 0x00, 0x00, + 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, + 0x2f, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f, 0x31, 0x36, 0x33, 0x2f, 0x4d, 0x61, + 0x74, 0x4d, 0x75, 0x6c, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, + 0x00, 0x80, 0x04, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x18, 0x00, 0x14, 0x00, + 0x13, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x14, + 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x02, 0x2c, 0x00, 0x00, 0x00, 0x7c, 0xfe, 0xff, 0xff, 0x1e, 0x00, 0x00, + 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, + 0x33, 0x2f, 0x66, 0x6c, 0x61, 0x74, 0x74, 0x65, 0x6e, 0x5f, 0x37, 0x32, 0x2f, + 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x2e, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, + 0x00, 0x06, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0xc0, 0xfe, 0xff, 0xff, + 0x0f, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, + 0x31, 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x01, 0x00, 0x00, 0x00, 0x08, 0x00, + 0x00, 0x00, 0x62, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, + 0x00, 0x05, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0xf4, 0xfe, 0xff, 0xff, + 0x0f, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, + 0x32, 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x01, 0x00, 0x00, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x96, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, + 0x00, 0x04, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x28, 0xff, 0xff, 0xff, + 0x0f, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, + 0x33, 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x01, 0x00, 0x00, 0x00, 0x20, 0x00, + 0x00, 0x00, 0xca, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, + 0x00, 0x03, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x5c, 0xff, 0xff, 0xff, + 0x0e, 0x00, 0x00, 0x00, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f, 0x31, 0x36, 0x33, + 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 
0x00, 0x0e, 0x00, 0x14, 0x00, 0x10, 0x00, 0x00, 0x00, 0x0c, + 0x00, 0x08, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0xa0, + 0xff, 0xff, 0xff, 0x0e, 0x00, 0x00, 0x00, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f, + 0x31, 0x36, 0x34, 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x00, 0x01, 0x00, 0x00, + 0x00, 0x02, 0x00, 0x00, 0x00, 0x14, 0x00, 0x18, 0x00, 0x14, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x14, + 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x28, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, + 0x00, 0xff, 0xff, 0xff, 0xff, 0x40, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, 0x22, + 0x00, 0x00, 0x00, 0x73, 0x65, 0x72, 0x76, 0x69, 0x6e, 0x67, 0x5f, 0x64, 0x65, + 0x66, 0x61, 0x75, 0x6c, 0x74, 0x5f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, + 0x32, 0x34, 0x31, 0x5f, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3a, 0x30, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x40, + 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x40, 0x00, + 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, + 0x00, 0xdc, 0xff, 0xff, 0xff, 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x09, + 0xe8, 0xff, 0xff, 0xff, 0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x16, 0xf4, + 0xff, 0xff, 0xff, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x0c, 0x00, + 0x0c, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00, + 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03 +}; diff --git a/third_party/aom/av1/encoder/dwt.c b/third_party/aom/av1/encoder/dwt.c new file mode 100644 index 0000000000..2fab99dd8b --- /dev/null +++ b/third_party/aom/av1/encoder/dwt.c @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/dwt.h"
+
+// Note: block length must be even for this implementation
+static void analysis_53_row(int length, tran_low_t *x, tran_low_t *lowpass,
+                            tran_low_t *highpass) {
+  int n;
+  tran_low_t r, *a, *b;
+
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  while (--n) {
+    *a++ = (r = *x++) * 2;
+    *b++ = *x - ((r + x[1] + 1) >> 1);
+    x++;
+  }
+  *a = (r = *x++) * 2;
+  *b = *x - r;
+
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  r = *highpass;
+  while (n--) {
+    *a++ += (r + (*b) + 1) >> 1;
+    r = *b++;
+  }
+}
+
+static void analysis_53_col(int length, tran_low_t *x, tran_low_t *lowpass,
+                            tran_low_t *highpass) {
+  int n;
+  tran_low_t r, *a, *b;
+
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  while (--n) {
+    *a++ = (r = *x++);
+    *b++ = (((*x) * 2) - (r + x[1]) + 2) >> 2;
+    x++;
+  }
+  *a = (r = *x++);
+  *b = (*x - r + 1) >> 1;
+
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  r = *highpass;
+  while (n--) {
+    *a++ += (r + (*b) + 1) >> 1;
+    r = *b++;
+  }
+}
+
+static void dyadic_analyze_53_uint8_input(int levels, int width, int height,
+                                          const uint8_t *x, int pitch_x,
+                                          tran_low_t *c, int pitch_c,
+                                          int dwt_scale_bits, int hbd) {
+  int lv, i, j, nh, nw, hh = height, hw = width;
+  tran_low_t buffer[2 * DWT_MAX_LENGTH];
+
+  if (hbd) {
+    const uint16_t *x16 = CONVERT_TO_SHORTPTR(x);
+    for (i = 0; i < height; i++) {
+      for (j = 0; j < width; j++) {
+        c[i * pitch_c + j] = x16[i * pitch_x + j] << dwt_scale_bits;
+      }
+    }
+  } else {
+    for (i = 0; i < height; i++) {
+      for (j = 0; j < width; j++) {
+        c[i * pitch_c + j] = x[i * pitch_x + j] << dwt_scale_bits;
+      }
+    }
+  }
+
+  for (lv = 0; lv < levels; lv++) {
+    nh = hh;
+    hh = (hh + 1) >> 1;
+    nw = hw;
+    hw = (hw + 1) >> 1;
+    if ((nh < 2) || (nw < 2)) return;
+    for (i = 0; i < nh; i++) {
+      memcpy(buffer, &c[i * pitch_c], nw * sizeof(tran_low_t));
+      analysis_53_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw);
+    }
+    for (j = 0; j < nw; j++) {
+      for (i = 0; i < nh; i++) buffer[i + nh] = c[i * pitch_c + j];
+      analysis_53_col(nh, buffer + nh, buffer, buffer + hh);
+      for (i = 0; i < nh; i++) c[i * pitch_c + j] = buffer[i];
+    }
+  }
+}
+
+void av1_fdwt8x8_uint8_input_c(const uint8_t *input, tran_low_t *output,
+                               int stride, int hbd) {
+  dyadic_analyze_53_uint8_input(4, 8, 8, input, stride, output, 8, 2, hbd);
+}
+
+static int haar_ac_sad(const tran_low_t *output, int bw, int bh, int stride) {
+  int acsad = 0;
+
+  for (int r = 0; r < bh; ++r)
+    for (int c = 0; c < bw; ++c) {
+      if (r >= bh / 2 || c >= bw / 2) acsad += abs(output[r * stride + c]);
+    }
+  return acsad;
+}
+
+static int haar_ac_sad_8x8_uint8_input(const uint8_t *input, int stride,
+                                       int hbd) {
+  tran_low_t output[64];
+
+  av1_fdwt8x8_uint8_input_c(input, output, stride, hbd);
+  return haar_ac_sad(output, 8, 8, 8);
+}
+
+int64_t av1_haar_ac_sad_mxn_uint8_input(const uint8_t *input, int stride,
+                                        int hbd, int num_8x8_rows,
+                                        int num_8x8_cols) {
+  int64_t wavelet_energy = 0;
+  for (int r8 = 0; r8 < num_8x8_rows; ++r8) {
+    for (int c8 = 0; c8 < num_8x8_cols; ++c8) {
+      wavelet_energy += haar_ac_sad_8x8_uint8_input(
+          input + c8 * 8 + r8 * 8 * stride, stride, hbd);
+    }
+  }
+  return wavelet_energy;
+}
diff --git a/third_party/aom/av1/encoder/dwt.h b/third_party/aom/av1/encoder/dwt.h
new file mode 100644
index 0000000000..443b6bc12c
--- /dev/null
+++ b/third_party/aom/av1/encoder/dwt.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media.
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_DWT_H_ +#define AOM_AV1_ENCODER_DWT_H_ + +#include "av1/common/common.h" +#include "av1/common/enums.h" + +#define DWT_MAX_LENGTH 64 + +void av1_fdwt8x8_uint8_input_c(const uint8_t *input, tran_low_t *output, + int stride, int hbd); + +int64_t av1_haar_ac_sad_mxn_uint8_input(const uint8_t *input, int stride, + int hbd, int num_8x8_rows, + int num_8x8_cols); + +#endif // AOM_AV1_ENCODER_DWT_H_ diff --git a/third_party/aom/av1/encoder/enc_enums.h b/third_party/aom/av1/encoder/enc_enums.h new file mode 100644 index 0000000000..20cefa16a5 --- /dev/null +++ b/third_party/aom/av1/encoder/enc_enums.h @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_ENC_ENUMS_H_ +#define AOM_AV1_ENCODER_ENC_ENUMS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +// This enumerator type needs to be kept aligned with the mode order in +// const MODE_DEFINITION av1_mode_defs[MAX_MODES] used in the rd code. 
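+// (Note added in editing, inferred from the identifiers below rather than
+// stated upstream: each THR_* entry names a prediction mode plus the
+// reference(s) it is paired with, using L, L2, L3, G, B, A2 and A for
+// LAST_FRAME, LAST2_FRAME, LAST3_FRAME, GOLDEN_FRAME, BWDREF_FRAME,
+// ALTREF2_FRAME and ALTREF_FRAME. For example, THR_NEARESTL2 is NEARESTMV
+// predicted from LAST2_FRAME, and THR_COMP_NEAR_NEWLA is the compound
+// NEAR_NEWMV mode over the LAST_FRAME/ALTREF_FRAME pair.)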
+enum { + THR_NEARESTMV, + THR_NEARESTL2, + THR_NEARESTL3, + THR_NEARESTB, + THR_NEARESTA2, + THR_NEARESTA, + THR_NEARESTG, + + THR_NEWMV, + THR_NEWL2, + THR_NEWL3, + THR_NEWB, + THR_NEWA2, + THR_NEWA, + THR_NEWG, + + THR_NEARMV, + THR_NEARL2, + THR_NEARL3, + THR_NEARB, + THR_NEARA2, + THR_NEARA, + THR_NEARG, + + THR_GLOBALMV, + THR_GLOBALL2, + THR_GLOBALL3, + THR_GLOBALB, + THR_GLOBALA2, + THR_GLOBALA, + THR_GLOBALG, + + THR_COMP_NEAREST_NEARESTLA, + THR_COMP_NEAREST_NEARESTL2A, + THR_COMP_NEAREST_NEARESTL3A, + THR_COMP_NEAREST_NEARESTGA, + THR_COMP_NEAREST_NEARESTLB, + THR_COMP_NEAREST_NEARESTL2B, + THR_COMP_NEAREST_NEARESTL3B, + THR_COMP_NEAREST_NEARESTGB, + THR_COMP_NEAREST_NEARESTLA2, + THR_COMP_NEAREST_NEARESTL2A2, + THR_COMP_NEAREST_NEARESTL3A2, + THR_COMP_NEAREST_NEARESTGA2, + THR_COMP_NEAREST_NEARESTLL2, + THR_COMP_NEAREST_NEARESTLL3, + THR_COMP_NEAREST_NEARESTLG, + THR_COMP_NEAREST_NEARESTBA, + + THR_COMP_NEAR_NEARLB, + THR_COMP_NEW_NEWLB, + THR_COMP_NEW_NEARESTLB, + THR_COMP_NEAREST_NEWLB, + THR_COMP_NEW_NEARLB, + THR_COMP_NEAR_NEWLB, + THR_COMP_GLOBAL_GLOBALLB, + + THR_COMP_NEAR_NEARLA, + THR_COMP_NEW_NEWLA, + THR_COMP_NEW_NEARESTLA, + THR_COMP_NEAREST_NEWLA, + THR_COMP_NEW_NEARLA, + THR_COMP_NEAR_NEWLA, + THR_COMP_GLOBAL_GLOBALLA, + + THR_COMP_NEAR_NEARL2A, + THR_COMP_NEW_NEWL2A, + THR_COMP_NEW_NEARESTL2A, + THR_COMP_NEAREST_NEWL2A, + THR_COMP_NEW_NEARL2A, + THR_COMP_NEAR_NEWL2A, + THR_COMP_GLOBAL_GLOBALL2A, + + THR_COMP_NEAR_NEARL3A, + THR_COMP_NEW_NEWL3A, + THR_COMP_NEW_NEARESTL3A, + THR_COMP_NEAREST_NEWL3A, + THR_COMP_NEW_NEARL3A, + THR_COMP_NEAR_NEWL3A, + THR_COMP_GLOBAL_GLOBALL3A, + + THR_COMP_NEAR_NEARGA, + THR_COMP_NEW_NEWGA, + THR_COMP_NEW_NEARESTGA, + THR_COMP_NEAREST_NEWGA, + THR_COMP_NEW_NEARGA, + THR_COMP_NEAR_NEWGA, + THR_COMP_GLOBAL_GLOBALGA, + + THR_COMP_NEAR_NEARL2B, + THR_COMP_NEW_NEWL2B, + THR_COMP_NEW_NEARESTL2B, + THR_COMP_NEAREST_NEWL2B, + THR_COMP_NEW_NEARL2B, + THR_COMP_NEAR_NEWL2B, + THR_COMP_GLOBAL_GLOBALL2B, + + THR_COMP_NEAR_NEARL3B, + THR_COMP_NEW_NEWL3B, + THR_COMP_NEW_NEARESTL3B, + THR_COMP_NEAREST_NEWL3B, + THR_COMP_NEW_NEARL3B, + THR_COMP_NEAR_NEWL3B, + THR_COMP_GLOBAL_GLOBALL3B, + + THR_COMP_NEAR_NEARGB, + THR_COMP_NEW_NEWGB, + THR_COMP_NEW_NEARESTGB, + THR_COMP_NEAREST_NEWGB, + THR_COMP_NEW_NEARGB, + THR_COMP_NEAR_NEWGB, + THR_COMP_GLOBAL_GLOBALGB, + + THR_COMP_NEAR_NEARLA2, + THR_COMP_NEW_NEWLA2, + THR_COMP_NEW_NEARESTLA2, + THR_COMP_NEAREST_NEWLA2, + THR_COMP_NEW_NEARLA2, + THR_COMP_NEAR_NEWLA2, + THR_COMP_GLOBAL_GLOBALLA2, + + THR_COMP_NEAR_NEARL2A2, + THR_COMP_NEW_NEWL2A2, + THR_COMP_NEW_NEARESTL2A2, + THR_COMP_NEAREST_NEWL2A2, + THR_COMP_NEW_NEARL2A2, + THR_COMP_NEAR_NEWL2A2, + THR_COMP_GLOBAL_GLOBALL2A2, + + THR_COMP_NEAR_NEARL3A2, + THR_COMP_NEW_NEWL3A2, + THR_COMP_NEW_NEARESTL3A2, + THR_COMP_NEAREST_NEWL3A2, + THR_COMP_NEW_NEARL3A2, + THR_COMP_NEAR_NEWL3A2, + THR_COMP_GLOBAL_GLOBALL3A2, + + THR_COMP_NEAR_NEARGA2, + THR_COMP_NEW_NEWGA2, + THR_COMP_NEW_NEARESTGA2, + THR_COMP_NEAREST_NEWGA2, + THR_COMP_NEW_NEARGA2, + THR_COMP_NEAR_NEWGA2, + THR_COMP_GLOBAL_GLOBALGA2, + + THR_COMP_NEAR_NEARLL2, + THR_COMP_NEW_NEWLL2, + THR_COMP_NEW_NEARESTLL2, + THR_COMP_NEAREST_NEWLL2, + THR_COMP_NEW_NEARLL2, + THR_COMP_NEAR_NEWLL2, + THR_COMP_GLOBAL_GLOBALLL2, + + THR_COMP_NEAR_NEARLL3, + THR_COMP_NEW_NEWLL3, + THR_COMP_NEW_NEARESTLL3, + THR_COMP_NEAREST_NEWLL3, + THR_COMP_NEW_NEARLL3, + THR_COMP_NEAR_NEWLL3, + THR_COMP_GLOBAL_GLOBALLL3, + + THR_COMP_NEAR_NEARLG, + THR_COMP_NEW_NEWLG, + THR_COMP_NEW_NEARESTLG, + THR_COMP_NEAREST_NEWLG, + 
THR_COMP_NEW_NEARLG,
+  THR_COMP_NEAR_NEWLG,
+  THR_COMP_GLOBAL_GLOBALLG,
+
+  THR_COMP_NEAR_NEARBA,
+  THR_COMP_NEW_NEWBA,
+  THR_COMP_NEW_NEARESTBA,
+  THR_COMP_NEAREST_NEWBA,
+  THR_COMP_NEW_NEARBA,
+  THR_COMP_NEAR_NEWBA,
+  THR_COMP_GLOBAL_GLOBALBA,
+
+  THR_DC,
+  THR_PAETH,
+  THR_SMOOTH,
+  THR_SMOOTH_V,
+  THR_SMOOTH_H,
+  THR_H_PRED,
+  THR_V_PRED,
+  THR_D135_PRED,
+  THR_D203_PRED,
+  THR_D157_PRED,
+  THR_D67_PRED,
+  THR_D113_PRED,
+  THR_D45_PRED,
+
+  MAX_MODES,
+  SINGLE_REF_MODE_START = THR_NEARESTMV,
+  SINGLE_REF_MODE_END = THR_COMP_NEAREST_NEARESTLA,
+  NUM_SINGLE_REF_MODES = SINGLE_REF_MODE_END - SINGLE_REF_MODE_START,
+  THR_MODE_START = THR_NEARESTMV,
+  THR_MODE_END = MAX_MODES,
+  THR_INTER_MODE_START = THR_MODE_START,
+  THR_INTER_MODE_END = THR_DC,
+  THR_INVALID = 255
+} UENUM1BYTE(THR_MODES);
+
+enum {
+  THR_LAST,
+  THR_LAST2,
+  THR_LAST3,
+  THR_BWDR,
+  THR_ALTR2,
+  THR_GOLD,
+  THR_ALTR,
+
+  THR_COMP_LA,
+  THR_COMP_L2A,
+  THR_COMP_L3A,
+  THR_COMP_GA,
+
+  THR_COMP_LB,
+  THR_COMP_L2B,
+  THR_COMP_L3B,
+  THR_COMP_GB,
+
+  THR_COMP_LA2,
+  THR_COMP_L2A2,
+  THR_COMP_L3A2,
+  THR_COMP_GA2,
+
+  THR_INTRA,
+
+  MAX_REFS
+} UENUM1BYTE(THR_MODES_SUB8X8);
+
+enum {
+  FULL_TXFM_RD,
+  LOW_TXFM_RD,
+} UENUM1BYTE(TXFM_RD_MODEL);
+
+enum {
+  USE_FULL_RD = 0,
+  USE_FAST_RD,
+  USE_LARGESTALL,
+} UENUM1BYTE(TX_SIZE_SEARCH_METHOD);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_ENC_ENUMS_H_
diff --git a/third_party/aom/av1/encoder/encode_strategy.c b/third_party/aom/av1/encoder/encode_strategy.c
new file mode 100644
index 0000000000..35ca83c3f4
--- /dev/null
+++ b/third_party/aom/av1/encoder/encode_strategy.c
@@ -0,0 +1,1767 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+
+#include "av1/common/blockd.h"
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_encoder.h"
+
+#if CONFIG_MISMATCH_DEBUG
+#include "aom_util/debug_util.h"
+#endif  // CONFIG_MISMATCH_DEBUG
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/reconinter.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encode_strategy.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encoder_alloc.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/gop_structure.h"
+#include "av1/encoder/pass2_strategy.h"
+#include "av1/encoder/temporal_filter.h"
+#if CONFIG_THREE_PASS
+#include "av1/encoder/thirdpass.h"
+#endif  // CONFIG_THREE_PASS
+#include "av1/encoder/tpl_model.h"
+
+#if CONFIG_TUNE_VMAF
+#include "av1/encoder/tune_vmaf.h"
+#endif
+
+#define TEMPORAL_FILTER_KEY_FRAME (CONFIG_REALTIME_ONLY ?
0 : 1) + +static INLINE void set_refresh_frame_flags( + RefreshFrameInfo *const refresh_frame, bool refresh_gf, bool refresh_bwdref, + bool refresh_arf) { + refresh_frame->golden_frame = refresh_gf; + refresh_frame->bwd_ref_frame = refresh_bwdref; + refresh_frame->alt_ref_frame = refresh_arf; +} + +void av1_configure_buffer_updates(AV1_COMP *const cpi, + RefreshFrameInfo *const refresh_frame, + const FRAME_UPDATE_TYPE type, + const REFBUF_STATE refbuf_state, + int force_refresh_all) { + // NOTE(weitinglin): Should we define another function to take care of + // cpi->rc.is_$Source_Type to make this function as it is in the comment? + const ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags = + &cpi->ext_flags.refresh_frame; + cpi->rc.is_src_frame_alt_ref = 0; + + switch (type) { + case KF_UPDATE: + set_refresh_frame_flags(refresh_frame, true, true, true); + break; + + case LF_UPDATE: + set_refresh_frame_flags(refresh_frame, false, false, false); + break; + + case GF_UPDATE: + set_refresh_frame_flags(refresh_frame, true, false, false); + break; + + case OVERLAY_UPDATE: + if (refbuf_state == REFBUF_RESET) + set_refresh_frame_flags(refresh_frame, true, true, true); + else + set_refresh_frame_flags(refresh_frame, true, false, false); + + cpi->rc.is_src_frame_alt_ref = 1; + break; + + case ARF_UPDATE: + // NOTE: BWDREF does not get updated along with ALTREF_FRAME. + if (refbuf_state == REFBUF_RESET) + set_refresh_frame_flags(refresh_frame, true, true, true); + else + set_refresh_frame_flags(refresh_frame, false, false, true); + + break; + + case INTNL_OVERLAY_UPDATE: + set_refresh_frame_flags(refresh_frame, false, false, false); + cpi->rc.is_src_frame_alt_ref = 1; + break; + + case INTNL_ARF_UPDATE: + set_refresh_frame_flags(refresh_frame, false, true, false); + break; + + default: assert(0); break; + } + + if (ext_refresh_frame_flags->update_pending && + (!is_stat_generation_stage(cpi))) { + set_refresh_frame_flags(refresh_frame, + ext_refresh_frame_flags->golden_frame, + ext_refresh_frame_flags->bwd_ref_frame, + ext_refresh_frame_flags->alt_ref_frame); + GF_GROUP *gf_group = &cpi->ppi->gf_group; + if (ext_refresh_frame_flags->golden_frame) + gf_group->update_type[cpi->gf_frame_index] = GF_UPDATE; + if (ext_refresh_frame_flags->alt_ref_frame) + gf_group->update_type[cpi->gf_frame_index] = ARF_UPDATE; + if (ext_refresh_frame_flags->bwd_ref_frame) + gf_group->update_type[cpi->gf_frame_index] = INTNL_ARF_UPDATE; + } + + if (force_refresh_all) + set_refresh_frame_flags(refresh_frame, true, true, true); +} + +static void set_additional_frame_flags(const AV1_COMMON *const cm, + unsigned int *const frame_flags) { + if (frame_is_intra_only(cm)) { + *frame_flags |= FRAMEFLAGS_INTRAONLY; + } + if (frame_is_sframe(cm)) { + *frame_flags |= FRAMEFLAGS_SWITCH; + } + if (cm->features.error_resilient_mode) { + *frame_flags |= FRAMEFLAGS_ERROR_RESILIENT; + } +} + +static void set_ext_overrides(AV1_COMMON *const cm, + EncodeFrameParams *const frame_params, + ExternalFlags *const ext_flags) { + // Overrides the defaults with the externally supplied values with + // av1_update_reference() and av1_update_entropy() calls + // Note: The overrides are valid only for the next frame passed + // to av1_encode_lowlevel() + + if (ext_flags->use_s_frame) { + frame_params->frame_type = S_FRAME; + } + + if (ext_flags->refresh_frame_context_pending) { + cm->features.refresh_frame_context = ext_flags->refresh_frame_context; + ext_flags->refresh_frame_context_pending = 0; + } + cm->features.allow_ref_frame_mvs = 
ext_flags->use_ref_frame_mvs; + + frame_params->error_resilient_mode = ext_flags->use_error_resilient; + // A keyframe is already error resilient and keyframes with + // error_resilient_mode interferes with the use of show_existing_frame + // when forward reference keyframes are enabled. + frame_params->error_resilient_mode &= frame_params->frame_type != KEY_FRAME; + // For bitstream conformance, s-frames must be error-resilient + frame_params->error_resilient_mode |= frame_params->frame_type == S_FRAME; +} + +static int choose_primary_ref_frame( + AV1_COMP *const cpi, const EncodeFrameParams *const frame_params) { + const AV1_COMMON *const cm = &cpi->common; + + const int intra_only = frame_params->frame_type == KEY_FRAME || + frame_params->frame_type == INTRA_ONLY_FRAME; + if (intra_only || frame_params->error_resilient_mode || + cpi->ext_flags.use_primary_ref_none) { + return PRIMARY_REF_NONE; + } + +#if !CONFIG_REALTIME_ONLY + if (cpi->use_ducky_encode) { + int wanted_fb = cpi->ppi->gf_group.primary_ref_idx[cpi->gf_frame_index]; + for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { + if (get_ref_frame_map_idx(cm, ref_frame) == wanted_fb) + return ref_frame - LAST_FRAME; + } + + return PRIMARY_REF_NONE; + } +#endif // !CONFIG_REALTIME_ONLY + + // In large scale case, always use Last frame's frame contexts. + // Note(yunqing): In other cases, primary_ref_frame is chosen based on + // cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index], which also controls + // frame bit allocation. + if (cm->tiles.large_scale) return (LAST_FRAME - LAST_FRAME); + + if (cpi->ppi->use_svc || cpi->ppi->rtc_ref.set_ref_frame_config) + return av1_svc_primary_ref_frame(cpi); + + // Find the most recent reference frame with the same reference type as the + // current frame + const int current_ref_type = get_current_frame_ref_type(cpi); + int wanted_fb = cpi->ppi->fb_of_context_type[current_ref_type]; +#if CONFIG_FPMT_TEST + if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) { + GF_GROUP *const gf_group = &cpi->ppi->gf_group; + if (gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE) { + int frame_level = gf_group->frame_parallel_level[cpi->gf_frame_index]; + // Book keep wanted_fb of frame_parallel_level 1 frame in an FP2 set. + if (frame_level == 1) { + cpi->wanted_fb = wanted_fb; + } + // Use the wanted_fb of level 1 frame in an FP2 for a level 2 frame in the + // set. + if (frame_level == 2 && + gf_group->update_type[cpi->gf_frame_index - 1] == INTNL_ARF_UPDATE) { + assert(gf_group->frame_parallel_level[cpi->gf_frame_index - 1] == 1); + wanted_fb = cpi->wanted_fb; + } + } + } +#endif // CONFIG_FPMT_TEST + int primary_ref_frame = PRIMARY_REF_NONE; + for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { + if (get_ref_frame_map_idx(cm, ref_frame) == wanted_fb) { + primary_ref_frame = ref_frame - LAST_FRAME; + } + } + + return primary_ref_frame; +} + +static void adjust_frame_rate(AV1_COMP *cpi, int64_t ts_start, int64_t ts_end) { + TimeStamps *time_stamps = &cpi->time_stamps; + int64_t this_duration; + int step = 0; + + // Clear down mmx registers + + if (cpi->ppi->use_svc && cpi->ppi->rtc_ref.set_ref_frame_config && + cpi->svc.number_spatial_layers > 1) { + // ts_start is the timestamp for the current frame and ts_end is the + // expected next timestamp given the duration passed into codec_encode(). 
+ // See the setting in encoder_encode() in av1_cx_iface.c: + // ts_start = timebase_units_to_ticks(cpi_data.timestamp_ratio, ptsvol), + // ts_end = timebase_units_to_ticks(cpi_data.timestamp_ratio, ptsvol + + // duration). So the difference ts_end - ts_start is the duration passed + // in by the user. For spatial layers SVC set the framerate based directly + // on the duration, and bypass the adjustments below. + this_duration = ts_end - ts_start; + if (this_duration > 0) { + cpi->new_framerate = 10000000.0 / this_duration; + av1_new_framerate(cpi, cpi->new_framerate); + time_stamps->prev_ts_start = ts_start; + time_stamps->prev_ts_end = ts_end; + return; + } + } + + if (ts_start == time_stamps->first_ts_start) { + this_duration = ts_end - ts_start; + step = 1; + } else { + int64_t last_duration = + time_stamps->prev_ts_end - time_stamps->prev_ts_start; + + this_duration = ts_end - time_stamps->prev_ts_end; + + // do a step update if the duration changes by 10% + if (last_duration) + step = (int)((this_duration - last_duration) * 10 / last_duration); + } + + if (this_duration) { + if (step) { + cpi->new_framerate = 10000000.0 / this_duration; + av1_new_framerate(cpi, cpi->new_framerate); + } else { + // Average this frame's rate into the last second's average + // frame rate. If we haven't seen 1 second yet, then average + // over the whole interval seen. + const double interval = + AOMMIN((double)(ts_end - time_stamps->first_ts_start), 10000000.0); + double avg_duration = 10000000.0 / cpi->framerate; + avg_duration *= (interval - avg_duration + this_duration); + avg_duration /= interval; + cpi->new_framerate = (10000000.0 / avg_duration); + // For parallel frames update cpi->framerate with new_framerate + // during av1_post_encode_updates() + double framerate = + (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) + ? cpi->framerate + : cpi->new_framerate; + av1_new_framerate(cpi, framerate); + } + } + + time_stamps->prev_ts_start = ts_start; + time_stamps->prev_ts_end = ts_end; +} + +// Determine whether there is a forced keyframe pending in the lookahead buffer +int is_forced_keyframe_pending(struct lookahead_ctx *lookahead, + const int up_to_index, + const COMPRESSOR_STAGE compressor_stage) { + for (int i = 0; i <= up_to_index; i++) { + const struct lookahead_entry *e = + av1_lookahead_peek(lookahead, i, compressor_stage); + if (e == NULL) { + // We have reached the end of the lookahead buffer and not early-returned + // so there isn't a forced key-frame pending. + return -1; + } else if (e->flags == AOM_EFLAG_FORCE_KF) { + return i; + } else { + continue; + } + } + return -1; // Never reached +} + +// Check if we should encode an ARF or internal ARF. If not, try a LAST +// Do some setup associated with the chosen source +// temporal_filtered, flush, and frame_update_type are outputs. +// Return the frame source, or NULL if we couldn't find one +static struct lookahead_entry *choose_frame_source( + AV1_COMP *const cpi, int *const flush, int *pop_lookahead, + struct lookahead_entry **last_source, int *const show_frame) { + AV1_COMMON *const cm = &cpi->common; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + struct lookahead_entry *source = NULL; + + // Source index in lookahead buffer. 
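+  // (Editorial note, a reading of the logic below rather than upstream text:
+  // arf_src_offset gives how far beyond the head of the lookahead buffer the
+  // ARF's source frame sits, so src_index == 0 simply means "encode the
+  // frame at the head of the buffer".)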
+ int src_index = gf_group->arf_src_offset[cpi->gf_frame_index]; + + // TODO(Aasaipriya): Forced key frames need to be fixed when rc_mode != AOM_Q + if (src_index && + (is_forced_keyframe_pending(cpi->ppi->lookahead, src_index, + cpi->compressor_stage) != -1) && + cpi->oxcf.rc_cfg.mode != AOM_Q && !is_stat_generation_stage(cpi)) { + src_index = 0; + *flush = 1; + } + + // If the current frame is arf, then we should not pop from the lookahead + // buffer. If the current frame is not arf, then pop it. This assumes the + // first frame in the GF group is not arf. May need to change if it is not + // true. + *pop_lookahead = (src_index == 0); + // If this is a key frame and keyframe filtering is enabled with overlay, + // then do not pop. + if (*pop_lookahead && cpi->oxcf.kf_cfg.enable_keyframe_filtering > 1 && + gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE && + !is_stat_generation_stage(cpi) && cpi->ppi->lookahead) { + if (cpi->ppi->lookahead->read_ctxs[cpi->compressor_stage].sz && + (*flush || + cpi->ppi->lookahead->read_ctxs[cpi->compressor_stage].sz == + cpi->ppi->lookahead->read_ctxs[cpi->compressor_stage].pop_sz)) { + *pop_lookahead = 0; + } + } + + // LAP stage does not have ARFs or forward key-frames, + // hence, always pop_lookahead here. + if (is_stat_generation_stage(cpi)) { + *pop_lookahead = 1; + src_index = 0; + } + + *show_frame = *pop_lookahead; + +#if CONFIG_FPMT_TEST + if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_ENCODE) { +#else + { +#endif // CONFIG_FPMT_TEST + // Future frame in parallel encode set + if (gf_group->src_offset[cpi->gf_frame_index] != 0 && + !is_stat_generation_stage(cpi)) + src_index = gf_group->src_offset[cpi->gf_frame_index]; + } + if (*show_frame) { + // show frame, pop from buffer + // Get last frame source. + if (cm->current_frame.frame_number > 0) { + *last_source = av1_lookahead_peek(cpi->ppi->lookahead, src_index - 1, + cpi->compressor_stage); + } + // Read in the source frame. + source = av1_lookahead_peek(cpi->ppi->lookahead, src_index, + cpi->compressor_stage); + } else { + // no show frames are arf frames + source = av1_lookahead_peek(cpi->ppi->lookahead, src_index, + cpi->compressor_stage); + if (source != NULL) { + cm->showable_frame = 1; + } + } + return source; +} + +// Don't allow a show_existing_frame to coincide with an error resilient or +// S-Frame. An exception can be made in the case of a keyframe, since it does +// not depend on any previous frames. +static int allow_show_existing(const AV1_COMP *const cpi, + unsigned int frame_flags) { + if (cpi->common.current_frame.frame_number == 0) return 0; + + const struct lookahead_entry *lookahead_src = + av1_lookahead_peek(cpi->ppi->lookahead, 0, cpi->compressor_stage); + if (lookahead_src == NULL) return 1; + + const int is_error_resilient = + cpi->oxcf.tool_cfg.error_resilient_mode || + (lookahead_src->flags & AOM_EFLAG_ERROR_RESILIENT); + const int is_s_frame = cpi->oxcf.kf_cfg.enable_sframe || + (lookahead_src->flags & AOM_EFLAG_SET_S_FRAME); + const int is_key_frame = + (cpi->rc.frames_to_key == 0) || (frame_flags & FRAMEFLAGS_KEY); + return !(is_error_resilient || is_s_frame) || is_key_frame; +} + +// Update frame_flags to tell the encoder's caller what sort of frame was +// encoded. 
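+// (Editorial note: the FRAMEFLAGS_GOLDEN/ALTREF/BWDREF bits mirror the
+// refresh decisions recorded in refresh_frame, FRAMEFLAGS_KEY follows the
+// coded frame type, and a show_existing_frame clears all four below.)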
+static void update_frame_flags(const AV1_COMMON *const cm, + const RefreshFrameInfo *const refresh_frame, + unsigned int *frame_flags) { + if (encode_show_existing_frame(cm)) { + *frame_flags &= ~(uint32_t)FRAMEFLAGS_GOLDEN; + *frame_flags &= ~(uint32_t)FRAMEFLAGS_BWDREF; + *frame_flags &= ~(uint32_t)FRAMEFLAGS_ALTREF; + *frame_flags &= ~(uint32_t)FRAMEFLAGS_KEY; + return; + } + + if (refresh_frame->golden_frame) { + *frame_flags |= FRAMEFLAGS_GOLDEN; + } else { + *frame_flags &= ~(uint32_t)FRAMEFLAGS_GOLDEN; + } + + if (refresh_frame->alt_ref_frame) { + *frame_flags |= FRAMEFLAGS_ALTREF; + } else { + *frame_flags &= ~(uint32_t)FRAMEFLAGS_ALTREF; + } + + if (refresh_frame->bwd_ref_frame) { + *frame_flags |= FRAMEFLAGS_BWDREF; + } else { + *frame_flags &= ~(uint32_t)FRAMEFLAGS_BWDREF; + } + + if (cm->current_frame.frame_type == KEY_FRAME) { + *frame_flags |= FRAMEFLAGS_KEY; + } else { + *frame_flags &= ~(uint32_t)FRAMEFLAGS_KEY; + } +} + +#define DUMP_REF_FRAME_IMAGES 0 + +#if DUMP_REF_FRAME_IMAGES == 1 +static int dump_one_image(AV1_COMMON *cm, + const YV12_BUFFER_CONFIG *const ref_buf, + char *file_name) { + int h; + FILE *f_ref = NULL; + + if (ref_buf == NULL) { + printf("Frame data buffer is NULL.\n"); + return AOM_CODEC_MEM_ERROR; + } + + if ((f_ref = fopen(file_name, "wb")) == NULL) { + printf("Unable to open file %s to write.\n", file_name); + return AOM_CODEC_MEM_ERROR; + } + + // --- Y --- + for (h = 0; h < cm->height; ++h) { + fwrite(&ref_buf->y_buffer[h * ref_buf->y_stride], 1, cm->width, f_ref); + } + // --- U --- + for (h = 0; h < (cm->height >> 1); ++h) { + fwrite(&ref_buf->u_buffer[h * ref_buf->uv_stride], 1, (cm->width >> 1), + f_ref); + } + // --- V --- + for (h = 0; h < (cm->height >> 1); ++h) { + fwrite(&ref_buf->v_buffer[h * ref_buf->uv_stride], 1, (cm->width >> 1), + f_ref); + } + + fclose(f_ref); + + return AOM_CODEC_OK; +} + +static void dump_ref_frame_images(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + MV_REFERENCE_FRAME ref_frame; + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + char file_name[256] = ""; + snprintf(file_name, sizeof(file_name), "/tmp/enc_F%d_ref_%d.yuv", + cm->current_frame.frame_number, ref_frame); + dump_one_image(cm, get_ref_frame_yv12_buf(cpi, ref_frame), file_name); + } +} +#endif // DUMP_REF_FRAME_IMAGES == 1 + +int av1_get_refresh_ref_frame_map(int refresh_frame_flags) { + int ref_map_index; + + for (ref_map_index = 0; ref_map_index < REF_FRAMES; ++ref_map_index) + if ((refresh_frame_flags >> ref_map_index) & 1) break; + + if (ref_map_index == REF_FRAMES) ref_map_index = INVALID_IDX; + return ref_map_index; +} + +static int get_free_ref_map_index(RefFrameMapPair ref_map_pairs[REF_FRAMES]) { + for (int idx = 0; idx < REF_FRAMES; ++idx) + if (ref_map_pairs[idx].disp_order == -1) return idx; + return INVALID_IDX; +} + +static int get_refresh_idx(RefFrameMapPair ref_frame_map_pairs[REF_FRAMES], + int update_arf, GF_GROUP *gf_group, int gf_index, + int enable_refresh_skip, int cur_frame_disp) { + int arf_count = 0; + int oldest_arf_order = INT32_MAX; + int oldest_arf_idx = -1; + + int oldest_frame_order = INT32_MAX; + int oldest_idx = -1; + + for (int map_idx = 0; map_idx < REF_FRAMES; map_idx++) { + RefFrameMapPair ref_pair = ref_frame_map_pairs[map_idx]; + if (ref_pair.disp_order == -1) continue; + const int frame_order = ref_pair.disp_order; + const int reference_frame_level = ref_pair.pyr_level; + // Keep future frames and three closest previous frames in output order. 
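+    // (Editorial worked example: with cur_frame_disp == 16 the guard below
+    // skips -- i.e. protects from refresh -- every buffer whose disp_order
+    // exceeds 13, so display positions 14 and up, including all future
+    // frames, never become refresh candidates.)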
+ if (frame_order > cur_frame_disp - 3) continue; + + if (enable_refresh_skip) { + int skip_frame = 0; + // Prevent refreshing a frame in gf_group->skip_frame_refresh. + for (int i = 0; i < REF_FRAMES; i++) { + int frame_to_skip = gf_group->skip_frame_refresh[gf_index][i]; + if (frame_to_skip == INVALID_IDX) break; + if (frame_order == frame_to_skip) { + skip_frame = 1; + break; + } + } + if (skip_frame) continue; + } + + // Keep track of the oldest level 1 frame if the current frame is also level + // 1. + if (reference_frame_level == 1) { + // If there are more than 2 level 1 frames in the reference list, + // discard the oldest. + if (frame_order < oldest_arf_order) { + oldest_arf_order = frame_order; + oldest_arf_idx = map_idx; + } + arf_count++; + continue; + } + + // Update the overall oldest reference frame. + if (frame_order < oldest_frame_order) { + oldest_frame_order = frame_order; + oldest_idx = map_idx; + } + } + if (update_arf && arf_count > 2) return oldest_arf_idx; + if (oldest_idx >= 0) return oldest_idx; + if (oldest_arf_idx >= 0) return oldest_arf_idx; + if (oldest_idx == -1) { + assert(arf_count > 2 && enable_refresh_skip); + return oldest_arf_idx; + } + assert(0 && "No valid refresh index found"); + return -1; +} + +// Computes the reference refresh index for INTNL_ARF_UPDATE frame. +int av1_calc_refresh_idx_for_intnl_arf( + AV1_COMP *cpi, RefFrameMapPair ref_frame_map_pairs[REF_FRAMES], + int gf_index) { + GF_GROUP *const gf_group = &cpi->ppi->gf_group; + + // Search for the open slot to store the current frame. + int free_fb_index = get_free_ref_map_index(ref_frame_map_pairs); + + // Use a free slot if available. + if (free_fb_index != INVALID_IDX) { + return free_fb_index; + } else { + int enable_refresh_skip = !is_one_pass_rt_params(cpi); + int refresh_idx = + get_refresh_idx(ref_frame_map_pairs, 0, gf_group, gf_index, + enable_refresh_skip, gf_group->display_idx[gf_index]); + return refresh_idx; + } +} + +int av1_get_refresh_frame_flags( + const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params, + FRAME_UPDATE_TYPE frame_update_type, int gf_index, int cur_disp_order, + RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]) { + const AV1_COMMON *const cm = &cpi->common; + const ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags = + &cpi->ext_flags.refresh_frame; + + GF_GROUP *gf_group = &cpi->ppi->gf_group; + if (gf_group->refbuf_state[gf_index] == REFBUF_RESET) + return SELECT_ALL_BUF_SLOTS; + + // TODO(jingning): Deprecate the following operations. + // Switch frames and shown key-frames overwrite all reference slots + if (frame_params->frame_type == S_FRAME) return SELECT_ALL_BUF_SLOTS; + + // show_existing_frames don't actually send refresh_frame_flags so set the + // flags to 0 to keep things consistent. 
+ if (frame_params->show_existing_frame) return 0; + + const RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref; + if (is_frame_droppable(rtc_ref, ext_refresh_frame_flags)) return 0; + +#if !CONFIG_REALTIME_ONLY + if (cpi->use_ducky_encode && + cpi->ducky_encode_info.frame_info.gop_mode == DUCKY_ENCODE_GOP_MODE_RCL) { + int new_fb_map_idx = cpi->ppi->gf_group.update_ref_idx[gf_index]; + if (new_fb_map_idx == INVALID_IDX) return 0; + return 1 << new_fb_map_idx; + } +#endif // !CONFIG_REALTIME_ONLY + + int refresh_mask = 0; + if (ext_refresh_frame_flags->update_pending) { + if (rtc_ref->set_ref_frame_config || + use_rtc_reference_structure_one_layer(cpi)) { + for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) { + int ref_frame_map_idx = rtc_ref->ref_idx[i]; + refresh_mask |= rtc_ref->refresh[ref_frame_map_idx] + << ref_frame_map_idx; + } + return refresh_mask; + } + // Unfortunately the encoder interface reflects the old refresh_*_frame + // flags so we have to replicate the old refresh_frame_flags logic here in + // order to preserve the behaviour of the flag overrides. + int ref_frame_map_idx = get_ref_frame_map_idx(cm, LAST_FRAME); + if (ref_frame_map_idx != INVALID_IDX) + refresh_mask |= ext_refresh_frame_flags->last_frame << ref_frame_map_idx; + + ref_frame_map_idx = get_ref_frame_map_idx(cm, EXTREF_FRAME); + if (ref_frame_map_idx != INVALID_IDX) + refresh_mask |= ext_refresh_frame_flags->bwd_ref_frame + << ref_frame_map_idx; + + ref_frame_map_idx = get_ref_frame_map_idx(cm, ALTREF2_FRAME); + if (ref_frame_map_idx != INVALID_IDX) + refresh_mask |= ext_refresh_frame_flags->alt2_ref_frame + << ref_frame_map_idx; + + if (frame_update_type == OVERLAY_UPDATE) { + ref_frame_map_idx = get_ref_frame_map_idx(cm, ALTREF_FRAME); + if (ref_frame_map_idx != INVALID_IDX) + refresh_mask |= ext_refresh_frame_flags->golden_frame + << ref_frame_map_idx; + } else { + ref_frame_map_idx = get_ref_frame_map_idx(cm, GOLDEN_FRAME); + if (ref_frame_map_idx != INVALID_IDX) + refresh_mask |= ext_refresh_frame_flags->golden_frame + << ref_frame_map_idx; + + ref_frame_map_idx = get_ref_frame_map_idx(cm, ALTREF_FRAME); + if (ref_frame_map_idx != INVALID_IDX) + refresh_mask |= ext_refresh_frame_flags->alt_ref_frame + << ref_frame_map_idx; + } + return refresh_mask; + } + + // Search for the open slot to store the current frame. + int free_fb_index = get_free_ref_map_index(ref_frame_map_pairs); + + // No refresh necessary for these frame types. + if (frame_update_type == OVERLAY_UPDATE || + frame_update_type == INTNL_OVERLAY_UPDATE) + return refresh_mask; + + // If there is an open slot, refresh that one instead of replacing a + // reference. 
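+  // (Editorial note: the returned value is a per-physical-buffer bitmask, so
+  // a free slot at map index 3 yields refresh_mask = 1 << 3 = 0x08 and only
+  // buffer 3 is overwritten.)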
+ if (free_fb_index != INVALID_IDX) { + refresh_mask = 1 << free_fb_index; + return refresh_mask; + } + const int enable_refresh_skip = !is_one_pass_rt_params(cpi); + const int update_arf = frame_update_type == ARF_UPDATE; + const int refresh_idx = + get_refresh_idx(ref_frame_map_pairs, update_arf, &cpi->ppi->gf_group, + gf_index, enable_refresh_skip, cur_disp_order); + return 1 << refresh_idx; +} + +#if !CONFIG_REALTIME_ONLY +void setup_mi(AV1_COMP *const cpi, YV12_BUFFER_CONFIG *src) { + AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + + av1_setup_src_planes(x, src, 0, 0, num_planes, cm->seq_params->sb_size); + + av1_setup_block_planes(xd, cm->seq_params->subsampling_x, + cm->seq_params->subsampling_y, num_planes); + + set_mi_offsets(&cm->mi_params, xd, 0, 0); +} + +// Apply temporal filtering to source frames and encode the filtered frame. +// If the current frame does not require filtering, this function is identical +// to av1_encode() except that tpl is not performed. +static int denoise_and_encode(AV1_COMP *const cpi, uint8_t *const dest, + EncodeFrameInput *const frame_input, + const EncodeFrameParams *const frame_params, + EncodeFrameResults *const frame_results) { +#if CONFIG_COLLECT_COMPONENT_TIMING + if (cpi->oxcf.pass == 2) start_timing(cpi, denoise_and_encode_time); +#endif + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + AV1_COMMON *const cm = &cpi->common; + + GF_GROUP *const gf_group = &cpi->ppi->gf_group; + FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + const int is_second_arf = + av1_gop_is_second_arf(gf_group, cpi->gf_frame_index); + + // Decide whether to apply temporal filtering to the source frame. + int apply_filtering = + av1_is_temporal_filter_on(oxcf) && !is_stat_generation_stage(cpi); + if (update_type != KF_UPDATE && update_type != ARF_UPDATE && !is_second_arf) { + apply_filtering = 0; + } + if (apply_filtering) { + if (frame_params->frame_type == KEY_FRAME) { + // TODO(angiebird): Move the noise level check to av1_tf_info_filtering. + // Decide whether it is allowed to perform key frame filtering + int allow_kf_filtering = oxcf->kf_cfg.enable_keyframe_filtering && + !frame_params->show_existing_frame && + !is_lossless_requested(&oxcf->rc_cfg); + if (allow_kf_filtering) { + double y_noise_level = 0.0; + av1_estimate_noise_level( + frame_input->source, &y_noise_level, AOM_PLANE_Y, AOM_PLANE_Y, + cm->seq_params->bit_depth, NOISE_ESTIMATION_EDGE_THRESHOLD); + apply_filtering = y_noise_level > 0; + } else { + apply_filtering = 0; + } + // If we are doing kf filtering, set up a few things. + if (apply_filtering) { + av1_setup_past_independence(cm); + } + } else if (is_second_arf) { + apply_filtering = cpi->sf.hl_sf.second_alt_ref_filtering; + } + } + +#if CONFIG_COLLECT_COMPONENT_TIMING + if (cpi->oxcf.pass == 2) start_timing(cpi, apply_filtering_time); +#endif + // Save the pointer to the original source image. + YV12_BUFFER_CONFIG *source_buffer = frame_input->source; + // apply filtering to frame + if (apply_filtering) { + int show_existing_alt_ref = 0; + FRAME_DIFF frame_diff; + int top_index = 0; + int bottom_index = 0; + const int q_index = av1_rc_pick_q_and_bounds( + cpi, cpi->oxcf.frm_dim_cfg.width, cpi->oxcf.frm_dim_cfg.height, + cpi->gf_frame_index, &bottom_index, &top_index); + + // TODO(bohanli): figure out why we need frame_type in cm here. 
+ cm->current_frame.frame_type = frame_params->frame_type; + if (update_type == KF_UPDATE || update_type == ARF_UPDATE) { + YV12_BUFFER_CONFIG *tf_buf = av1_tf_info_get_filtered_buf( + &cpi->ppi->tf_info, cpi->gf_frame_index, &frame_diff); + if (tf_buf != NULL) { + frame_input->source = tf_buf; + show_existing_alt_ref = av1_check_show_filtered_frame( + tf_buf, &frame_diff, q_index, cm->seq_params->bit_depth); + if (show_existing_alt_ref) { + cpi->common.showable_frame |= 1; + } else { + cpi->common.showable_frame = 0; + } + } + if (gf_group->frame_type[cpi->gf_frame_index] != KEY_FRAME) { + cpi->ppi->show_existing_alt_ref = show_existing_alt_ref; + } + } + + if (is_second_arf) { + // Allocate the memory for tf_buf_second_arf buffer, only when it is + // required. + int ret = aom_realloc_frame_buffer( + &cpi->ppi->tf_info.tf_buf_second_arf, oxcf->frm_dim_cfg.width, + oxcf->frm_dim_cfg.height, cm->seq_params->subsampling_x, + cm->seq_params->subsampling_y, cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL, + NULL, cpi->image_pyramid_levels, 0); + if (ret) + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate tf_buf_second_arf"); + + YV12_BUFFER_CONFIG *tf_buf_second_arf = + &cpi->ppi->tf_info.tf_buf_second_arf; + // We didn't apply temporal filtering for second arf ahead in + // av1_tf_info_filtering(). + const int arf_src_index = gf_group->arf_src_offset[cpi->gf_frame_index]; + // Right now, we are still using tf_buf_second_arf due to + // implementation complexity. + // TODO(angiebird): Reuse tf_info->tf_buf here. + av1_temporal_filter(cpi, arf_src_index, cpi->gf_frame_index, &frame_diff, + tf_buf_second_arf); + show_existing_alt_ref = av1_check_show_filtered_frame( + tf_buf_second_arf, &frame_diff, q_index, cm->seq_params->bit_depth); + if (show_existing_alt_ref) { + aom_extend_frame_borders(tf_buf_second_arf, av1_num_planes(cm)); + frame_input->source = tf_buf_second_arf; + } + // Currently INTNL_ARF_UPDATE only do show_existing. + cpi->common.showable_frame |= 1; + } + + // Copy source metadata to the temporal filtered frame + if (source_buffer->metadata && + aom_copy_metadata_to_frame_buffer(frame_input->source, + source_buffer->metadata)) { + aom_internal_error( + cm->error, AOM_CODEC_MEM_ERROR, + "Failed to copy source metadata to the temporal filtered frame"); + } + } +#if CONFIG_COLLECT_COMPONENT_TIMING + if (cpi->oxcf.pass == 2) end_timing(cpi, apply_filtering_time); +#endif + + int set_mv_params = frame_params->frame_type == KEY_FRAME || + update_type == ARF_UPDATE || update_type == GF_UPDATE; + cm->show_frame = frame_params->show_frame; + cm->current_frame.frame_type = frame_params->frame_type; + // TODO(bohanli): Why is this? what part of it is necessary? + av1_set_frame_size(cpi, cm->width, cm->height); + if (set_mv_params) av1_set_mv_search_params(cpi); + +#if CONFIG_RD_COMMAND + if (frame_params->frame_type == KEY_FRAME) { + char filepath[] = "rd_command.txt"; + av1_read_rd_command(filepath, &cpi->rd_command); + } +#endif // CONFIG_RD_COMMAND + if (cpi->gf_frame_index == 0 && !is_stat_generation_stage(cpi)) { + // perform tpl after filtering + int allow_tpl = + oxcf->gf_cfg.lag_in_frames > 1 && oxcf->algo_cfg.enable_tpl_model; + if (gf_group->size > MAX_LENGTH_TPL_FRAME_STATS) { + allow_tpl = 0; + } + if (frame_params->frame_type != KEY_FRAME) { + // In rare case, it's possible to have non ARF/GF update_type here. 
+ // We should set allow_tpl to zero in the situation + allow_tpl = + allow_tpl && (update_type == ARF_UPDATE || update_type == GF_UPDATE || + (cpi->use_ducky_encode && + cpi->ducky_encode_info.frame_info.gop_mode == + DUCKY_ENCODE_GOP_MODE_RCL)); + } + + if (allow_tpl) { + if (!cpi->skip_tpl_setup_stats) { + av1_tpl_preload_rc_estimate(cpi, frame_params); + av1_tpl_setup_stats(cpi, 0, frame_params); +#if CONFIG_BITRATE_ACCURACY && !CONFIG_THREE_PASS + assert(cpi->gf_frame_index == 0); + av1_vbr_rc_update_q_index_list(&cpi->vbr_rc_info, &cpi->ppi->tpl_data, + gf_group, cm->seq_params->bit_depth); +#endif + } + } else { + av1_init_tpl_stats(&cpi->ppi->tpl_data); + } +#if CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS + if (cpi->oxcf.pass == AOM_RC_SECOND_PASS && + cpi->second_pass_log_stream != NULL) { + TPL_INFO *tpl_info; + AOM_CHECK_MEM_ERROR(cm->error, tpl_info, aom_malloc(sizeof(*tpl_info))); + av1_pack_tpl_info(tpl_info, gf_group, &cpi->ppi->tpl_data); + av1_write_tpl_info(tpl_info, cpi->second_pass_log_stream, + cpi->common.error); + aom_free(tpl_info); + } +#endif // CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS + } + + if (av1_encode(cpi, dest, frame_input, frame_params, frame_results) != + AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } + + // Set frame_input source to true source for psnr calculation. + if (apply_filtering && is_psnr_calc_enabled(cpi)) { + cpi->source = av1_realloc_and_scale_if_required( + cm, source_buffer, &cpi->scaled_source, cm->features.interp_filter, 0, + false, true, cpi->oxcf.border_in_pixels, cpi->image_pyramid_levels); + cpi->unscaled_source = source_buffer; + } +#if CONFIG_COLLECT_COMPONENT_TIMING + if (cpi->oxcf.pass == 2) end_timing(cpi, denoise_and_encode_time); +#endif + return AOM_CODEC_OK; +} +#endif // !CONFIG_REALTIME_ONLY + +/*!\cond */ +// Struct to keep track of relevant reference frame data. +typedef struct { + int map_idx; + int disp_order; + int pyr_level; + int used; +} RefBufMapData; +/*!\endcond */ + +// Comparison function to sort reference frames in ascending display order. +static int compare_map_idx_pair_asc(const void *a, const void *b) { + if (((RefBufMapData *)a)->disp_order == ((RefBufMapData *)b)->disp_order) { + return 0; + } else if (((const RefBufMapData *)a)->disp_order > + ((const RefBufMapData *)b)->disp_order) { + return 1; + } else { + return -1; + } +} + +// Checks to see if a particular reference frame is already in the reference +// frame map. +static int is_in_ref_map(RefBufMapData *map, int disp_order, int n_frames) { + for (int i = 0; i < n_frames; i++) { + if (disp_order == map[i].disp_order) return 1; + } + return 0; +} + +// Add a reference buffer index to a named reference slot. +static void add_ref_to_slot(RefBufMapData *ref, int *const remapped_ref_idx, + int frame) { + remapped_ref_idx[frame - LAST_FRAME] = ref->map_idx; + ref->used = 1; +} + +// Threshold dictating when we are allowed to start considering +// leaving lowest level frames unmapped. +#define LOW_LEVEL_FRAMES_TR 5 + +// Find which reference buffer should be left out of the named mapping. +// This is because there are 8 reference buffers and only 7 named slots. 
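+// (Editorial note: the REF_FRAMES (8) physical buffers must be squeezed into
+// the seven named slots LAST_FRAME..ALTREF_FRAME, so one buffer is
+// deliberately marked "used" and left unmapped each frame; the heuristic
+// below picks the not-yet-mapped buffer whose display order is furthest from
+// the current frame, avoiding lowest-pyramid-level frames while fewer than
+// LOW_LEVEL_FRAMES_TR of them exist.)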
+static void set_unmapped_ref(RefBufMapData *buffer_map, int n_bufs, + int n_min_level_refs, int min_level, + int cur_frame_disp) { + int max_dist = 0; + int unmapped_idx = -1; + if (n_bufs <= ALTREF_FRAME) return; + for (int i = 0; i < n_bufs; i++) { + if (buffer_map[i].used) continue; + if (buffer_map[i].pyr_level != min_level || + n_min_level_refs >= LOW_LEVEL_FRAMES_TR) { + int dist = abs(cur_frame_disp - buffer_map[i].disp_order); + if (dist > max_dist) { + max_dist = dist; + unmapped_idx = i; + } + } + } + assert(unmapped_idx >= 0 && "Unmapped reference not found"); + buffer_map[unmapped_idx].used = 1; +} + +void av1_get_ref_frames(RefFrameMapPair ref_frame_map_pairs[REF_FRAMES], + int cur_frame_disp, const AV1_COMP *cpi, int gf_index, + int is_parallel_encode, + int remapped_ref_idx[REF_FRAMES]) { + int buf_map_idx = 0; + + // Initialize reference frame mappings. + for (int i = 0; i < REF_FRAMES; ++i) remapped_ref_idx[i] = INVALID_IDX; + +#if !CONFIG_REALTIME_ONLY + if (cpi->use_ducky_encode && + cpi->ducky_encode_info.frame_info.gop_mode == DUCKY_ENCODE_GOP_MODE_RCL) { + for (int rf = LAST_FRAME; rf < REF_FRAMES; ++rf) { + if (cpi->ppi->gf_group.ref_frame_list[gf_index][rf] != INVALID_IDX) { + remapped_ref_idx[rf - LAST_FRAME] = + cpi->ppi->gf_group.ref_frame_list[gf_index][rf]; + } + } + + int valid_rf_idx = 0; + static const int ref_frame_type_order[REF_FRAMES - LAST_FRAME] = { + GOLDEN_FRAME, ALTREF_FRAME, LAST_FRAME, BWDREF_FRAME, + ALTREF2_FRAME, LAST2_FRAME, LAST3_FRAME + }; + for (int i = 0; i < REF_FRAMES - LAST_FRAME; i++) { + int rf = ref_frame_type_order[i]; + if (remapped_ref_idx[rf - LAST_FRAME] != INVALID_IDX) { + valid_rf_idx = remapped_ref_idx[rf - LAST_FRAME]; + break; + } + } + + for (int i = 0; i < REF_FRAMES; ++i) { + if (remapped_ref_idx[i] == INVALID_IDX) { + remapped_ref_idx[i] = valid_rf_idx; + } + } + + return; + } +#endif // !CONFIG_REALTIME_ONLY + + RefBufMapData buffer_map[REF_FRAMES]; + int n_bufs = 0; + memset(buffer_map, 0, REF_FRAMES * sizeof(buffer_map[0])); + int min_level = MAX_ARF_LAYERS; + int max_level = 0; + GF_GROUP *gf_group = &cpi->ppi->gf_group; + int skip_ref_unmapping = 0; + int is_one_pass_rt = is_one_pass_rt_params(cpi); + + // Go through current reference buffers and store display order, pyr level, + // and map index. + for (int map_idx = 0; map_idx < REF_FRAMES; map_idx++) { + // Get reference frame buffer. + RefFrameMapPair ref_pair = ref_frame_map_pairs[map_idx]; + if (ref_pair.disp_order == -1) continue; + const int frame_order = ref_pair.disp_order; + // Avoid duplicates. + if (is_in_ref_map(buffer_map, frame_order, n_bufs)) continue; + const int reference_frame_level = ref_pair.pyr_level; + + // Keep track of the lowest and highest levels that currently exist. + if (reference_frame_level < min_level) min_level = reference_frame_level; + if (reference_frame_level > max_level) max_level = reference_frame_level; + + buffer_map[n_bufs].map_idx = map_idx; + buffer_map[n_bufs].disp_order = frame_order; + buffer_map[n_bufs].pyr_level = reference_frame_level; + buffer_map[n_bufs].used = 0; + n_bufs++; + } + + // Sort frames in ascending display order. + qsort(buffer_map, n_bufs, sizeof(buffer_map[0]), compare_map_idx_pair_asc); + + int n_min_level_refs = 0; + int closest_past_ref = -1; + int golden_idx = -1; + int altref_idx = -1; + + // Find the GOLDEN_FRAME and BWDREF_FRAME. + // Also collect various stats about the reference frames for the remaining + // mappings. 
+  for (int i = n_bufs - 1; i >= 0; i--) {
+    if (buffer_map[i].pyr_level == min_level) {
+      // Keep track of the number of lowest level frames.
+      n_min_level_refs++;
+      if (buffer_map[i].disp_order < cur_frame_disp && golden_idx == -1 &&
+          remapped_ref_idx[GOLDEN_FRAME - LAST_FRAME] == INVALID_IDX) {
+        // Save index for GOLDEN.
+        golden_idx = i;
+      } else if (buffer_map[i].disp_order > cur_frame_disp &&
+                 altref_idx == -1 &&
+                 remapped_ref_idx[ALTREF_FRAME - LAST_FRAME] == INVALID_IDX) {
+        // Save index for ALTREF.
+        altref_idx = i;
+      }
+    } else if (buffer_map[i].disp_order == cur_frame_disp) {
+      // Map the BWDREF_FRAME if this is the show_existing_frame.
+      add_ref_to_slot(&buffer_map[i], remapped_ref_idx, BWDREF_FRAME);
+    }
+
+    // During parallel encodes of lower layer frames, exclude the first frame
+    // (frame_parallel_level 1) from being used for the reference assignment of
+    // the second frame (frame_parallel_level 2).
+    if (!is_one_pass_rt && gf_group->frame_parallel_level[gf_index] == 2 &&
+        gf_group->frame_parallel_level[gf_index - 1] == 1 &&
+        gf_group->update_type[gf_index - 1] == INTNL_ARF_UPDATE) {
+      assert(gf_group->update_type[gf_index] == INTNL_ARF_UPDATE);
+#if CONFIG_FPMT_TEST
+      is_parallel_encode = (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_ENCODE)
+                               ? is_parallel_encode
+                               : 0;
+#endif  // CONFIG_FPMT_TEST
+      // If parallel cpis are active, use ref_idx_to_skip, else, use display
+      // index.
+      assert(IMPLIES(is_parallel_encode, cpi->ref_idx_to_skip != INVALID_IDX));
+      assert(IMPLIES(!is_parallel_encode,
+                     gf_group->skip_frame_as_ref[gf_index] != INVALID_IDX));
+      buffer_map[i].used = is_parallel_encode
+                               ? (buffer_map[i].map_idx == cpi->ref_idx_to_skip)
+                               : (buffer_map[i].disp_order ==
+                                  gf_group->skip_frame_as_ref[gf_index]);
+      // In case a ref frame is excluded from being used during assignment,
+      // skip the call to set_unmapped_ref(). Applicable in steady state.
+      if (buffer_map[i].used) skip_ref_unmapping = 1;
+    }
+
+    // Keep track of where the frames change from being past frames to future
+    // frames.
+    if (buffer_map[i].disp_order < cur_frame_disp && closest_past_ref < 0)
+      closest_past_ref = i;
+  }
+
+  // Do not map GOLDEN and ALTREF based on their pyramid level if all reference
+  // frames have the same level.
+  if (n_min_level_refs <= n_bufs) {
+    // Map the GOLDEN_FRAME.
+    if (golden_idx > -1)
+      add_ref_to_slot(&buffer_map[golden_idx], remapped_ref_idx, GOLDEN_FRAME);
+    // Map the ALTREF_FRAME.
+    if (altref_idx > -1)
+      add_ref_to_slot(&buffer_map[altref_idx], remapped_ref_idx, ALTREF_FRAME);
+  }
+
+  // Find the buffer to be excluded from the mapping.
+  if (!skip_ref_unmapping)
+    set_unmapped_ref(buffer_map, n_bufs, n_min_level_refs, min_level,
+                     cur_frame_disp);
+
+  // Place past frames in LAST_FRAME, LAST2_FRAME, and LAST3_FRAME.
+  for (int frame = LAST_FRAME; frame < GOLDEN_FRAME; frame++) {
+    // Continue if the current ref slot is already full.
+    if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue;
+    // Find the next unmapped reference buffer
+    // in decreasing output order relative to current picture.
+    int next_buf_max = 0;
+    int next_disp_order = INT_MIN;
+    for (buf_map_idx = n_bufs - 1; buf_map_idx >= 0; buf_map_idx--) {
+      if (!buffer_map[buf_map_idx].used &&
+          buffer_map[buf_map_idx].disp_order < cur_frame_disp &&
+          buffer_map[buf_map_idx].disp_order > next_disp_order) {
+        next_disp_order = buffer_map[buf_map_idx].disp_order;
+        next_buf_max = buf_map_idx;
+      }
+    }
+    buf_map_idx = next_buf_max;
+    if (buf_map_idx < 0) break;
+    if (buffer_map[buf_map_idx].used) break;
+    add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame);
+  }
+
+  // Place future frames (if there are any) in BWDREF_FRAME and ALTREF2_FRAME.
+  for (int frame = BWDREF_FRAME; frame < REF_FRAMES; frame++) {
+    // Continue if the current ref slot is already full.
+    if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue;
+    // Find the next unmapped reference buffer
+    // in increasing output order relative to current picture.
+    int next_buf_max = 0;
+    int next_disp_order = INT_MAX;
+    for (buf_map_idx = n_bufs - 1; buf_map_idx >= 0; buf_map_idx--) {
+      if (!buffer_map[buf_map_idx].used &&
+          buffer_map[buf_map_idx].disp_order > cur_frame_disp &&
+          buffer_map[buf_map_idx].disp_order < next_disp_order) {
+        next_disp_order = buffer_map[buf_map_idx].disp_order;
+        next_buf_max = buf_map_idx;
+      }
+    }
+    buf_map_idx = next_buf_max;
+    if (buf_map_idx < 0) break;
+    if (buffer_map[buf_map_idx].used) break;
+    add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame);
+  }
+
+  // Place remaining past frames.
+  buf_map_idx = closest_past_ref;
+  for (int frame = LAST_FRAME; frame < REF_FRAMES; frame++) {
+    // Continue if the current ref slot is already full.
+    if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue;
+    // Find the next unmapped reference buffer.
+    for (; buf_map_idx >= 0; buf_map_idx--) {
+      if (!buffer_map[buf_map_idx].used) break;
+    }
+    if (buf_map_idx < 0) break;
+    if (buffer_map[buf_map_idx].used) break;
+    add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame);
+  }
+
+  // Place remaining future frames.
+  buf_map_idx = n_bufs - 1;
+  for (int frame = ALTREF_FRAME; frame >= LAST_FRAME; frame--) {
+    // Continue if the current ref slot is already full.
+    if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue;
+    // Find the next unmapped reference buffer.
+    for (; buf_map_idx > closest_past_ref; buf_map_idx--) {
+      if (!buffer_map[buf_map_idx].used) break;
+    }
+    if (buf_map_idx < 0) break;
+    if (buffer_map[buf_map_idx].used) break;
+    add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame);
+  }
+
+  // Fill any slots that are empty (should only happen for the first 7 frames).
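+  // E.g. at the very start of a sequence only one reference buffer may exist,
+  // in which case every remaining slot falls back to map index 0.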
+ for (int i = 0; i < REF_FRAMES; ++i) + if (remapped_ref_idx[i] == INVALID_IDX) remapped_ref_idx[i] = 0; +} + +int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size, + uint8_t *const dest, unsigned int *frame_flags, + int64_t *const time_stamp, int64_t *const time_end, + const aom_rational64_t *const timestamp_ratio, + int *const pop_lookahead, int flush) { + AV1EncoderConfig *const oxcf = &cpi->oxcf; + AV1_COMMON *const cm = &cpi->common; + GF_GROUP *gf_group = &cpi->ppi->gf_group; + ExternalFlags *const ext_flags = &cpi->ext_flags; + GFConfig *const gf_cfg = &oxcf->gf_cfg; + + EncodeFrameInput frame_input; + EncodeFrameParams frame_params; + EncodeFrameResults frame_results; + memset(&frame_input, 0, sizeof(frame_input)); + memset(&frame_params, 0, sizeof(frame_params)); + memset(&frame_results, 0, sizeof(frame_results)); + +#if CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS + VBR_RATECTRL_INFO *vbr_rc_info = &cpi->vbr_rc_info; + if (oxcf->pass == AOM_RC_THIRD_PASS && vbr_rc_info->ready == 0) { + THIRD_PASS_FRAME_INFO frame_info[MAX_THIRD_PASS_BUF]; + av1_open_second_pass_log(cpi, 1); + FILE *second_pass_log_stream = cpi->second_pass_log_stream; + fseek(second_pass_log_stream, 0, SEEK_END); + size_t file_size = ftell(second_pass_log_stream); + rewind(second_pass_log_stream); + size_t read_size = 0; + while (read_size < file_size) { + THIRD_PASS_GOP_INFO gop_info; + struct aom_internal_error_info *error = cpi->common.error; + // Read in GOP information from the second pass file. + av1_read_second_pass_gop_info(second_pass_log_stream, &gop_info, error); + TPL_INFO *tpl_info; + AOM_CHECK_MEM_ERROR(cm->error, tpl_info, aom_malloc(sizeof(*tpl_info))); + av1_read_tpl_info(tpl_info, second_pass_log_stream, error); + // Read in per-frame info from second-pass encoding + av1_read_second_pass_per_frame_info(second_pass_log_stream, frame_info, + gop_info.num_frames, error); + av1_vbr_rc_append_tpl_info(vbr_rc_info, tpl_info); + read_size = ftell(second_pass_log_stream); + aom_free(tpl_info); + } + av1_close_second_pass_log(cpi); + if (cpi->oxcf.rc_cfg.mode == AOM_Q) { + vbr_rc_info->base_q_index = cpi->oxcf.rc_cfg.cq_level; + av1_vbr_rc_compute_q_indices( + vbr_rc_info->base_q_index, vbr_rc_info->total_frame_count, + vbr_rc_info->qstep_ratio_list, cm->seq_params->bit_depth, + vbr_rc_info->q_index_list); + } else { + vbr_rc_info->base_q_index = av1_vbr_rc_info_estimate_base_q( + vbr_rc_info->total_bit_budget, cm->seq_params->bit_depth, + vbr_rc_info->scale_factors, vbr_rc_info->total_frame_count, + vbr_rc_info->update_type_list, vbr_rc_info->qstep_ratio_list, + vbr_rc_info->txfm_stats_list, vbr_rc_info->q_index_list, NULL); + } + vbr_rc_info->ready = 1; +#if CONFIG_RATECTRL_LOG + rc_log_record_chunk_info(&cpi->rc_log, vbr_rc_info->base_q_index, + vbr_rc_info->total_frame_count); +#endif // CONFIG_RATECTRL_LOG + } +#endif // CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS + + // Check if we need to stuff more src frames + if (flush == 0) { + int srcbuf_size = + av1_lookahead_depth(cpi->ppi->lookahead, cpi->compressor_stage); + int pop_size = + av1_lookahead_pop_sz(cpi->ppi->lookahead, cpi->compressor_stage); + + // Continue buffering look ahead buffer. 
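+    // (Returning -1 below is not an error; it simply asks the caller for more
+    // input until the lookahead is deep enough to pop a frame.)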
+ if (srcbuf_size < pop_size) return -1; + } + + if (!av1_lookahead_peek(cpi->ppi->lookahead, 0, cpi->compressor_stage)) { +#if !CONFIG_REALTIME_ONLY + if (flush && oxcf->pass == AOM_RC_FIRST_PASS && + !cpi->ppi->twopass.first_pass_done) { + av1_end_first_pass(cpi); /* get last stats packet */ + cpi->ppi->twopass.first_pass_done = 1; + } +#endif + return -1; + } + + // TODO(sarahparker) finish bit allocation for one pass pyramid + if (has_no_stats_stage(cpi)) { + gf_cfg->gf_max_pyr_height = + AOMMIN(gf_cfg->gf_max_pyr_height, USE_ALTREF_FOR_ONE_PASS); + gf_cfg->gf_min_pyr_height = + AOMMIN(gf_cfg->gf_min_pyr_height, gf_cfg->gf_max_pyr_height); + } + + // Allocation of mi buffers. + alloc_mb_mode_info_buffers(cpi); + + cpi->skip_tpl_setup_stats = 0; +#if !CONFIG_REALTIME_ONLY + if (oxcf->pass != AOM_RC_FIRST_PASS) { + TplParams *const tpl_data = &cpi->ppi->tpl_data; + if (tpl_data->tpl_stats_pool[0] == NULL) { + av1_setup_tpl_buffers(cpi->ppi, &cm->mi_params, oxcf->frm_dim_cfg.width, + oxcf->frm_dim_cfg.height, 0, + oxcf->gf_cfg.lag_in_frames); + } + } + cpi->twopass_frame.this_frame = NULL; + const int use_one_pass_rt_params = is_one_pass_rt_params(cpi); + if (!use_one_pass_rt_params && !is_stat_generation_stage(cpi)) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, av1_get_second_pass_params_time); +#endif + + // Initialise frame_level_rate_correction_factors with value previous + // to the parallel frames. + if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) { + for (int i = 0; i < RATE_FACTOR_LEVELS; i++) { + cpi->rc.frame_level_rate_correction_factors[i] = +#if CONFIG_FPMT_TEST + (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) + ? cpi->ppi->p_rc.temp_rate_correction_factors[i] + : +#endif // CONFIG_FPMT_TEST + cpi->ppi->p_rc.rate_correction_factors[i]; + } + } + + // copy mv_stats from ppi to frame_level cpi. + cpi->mv_stats = cpi->ppi->mv_stats; + av1_get_second_pass_params(cpi, &frame_params, *frame_flags); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, av1_get_second_pass_params_time); +#endif + } +#endif + + if (!is_stat_generation_stage(cpi)) { + // TODO(jingning): fwd key frame always uses show existing frame? + if (gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE && + gf_group->refbuf_state[cpi->gf_frame_index] == REFBUF_RESET) { + frame_params.show_existing_frame = 1; + } else { + frame_params.show_existing_frame = + (cpi->ppi->show_existing_alt_ref && + gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE) || + gf_group->update_type[cpi->gf_frame_index] == INTNL_OVERLAY_UPDATE; + } + frame_params.show_existing_frame &= allow_show_existing(cpi, *frame_flags); + + // Special handling to reset 'show_existing_frame' in case of dropped + // frames. + if (oxcf->rc_cfg.drop_frames_water_mark && + (gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE || + gf_group->update_type[cpi->gf_frame_index] == INTNL_OVERLAY_UPDATE)) { + // During the encode of an OVERLAY_UPDATE/INTNL_OVERLAY_UPDATE frame, loop + // over the gf group to check if the corresponding + // ARF_UPDATE/INTNL_ARF_UPDATE frame was dropped. 
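+      // (The match below relies on an overlay frame sharing its display_idx
+      // with the ARF it overlays; the asserts that follow double-check this.)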
+      int cur_disp_idx = gf_group->display_idx[cpi->gf_frame_index];
+      for (int idx = 0; idx < cpi->gf_frame_index; idx++) {
+        if (cur_disp_idx == gf_group->display_idx[idx]) {
+          assert(IMPLIES(
+              gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE,
+              gf_group->update_type[idx] == ARF_UPDATE));
+          assert(IMPLIES(gf_group->update_type[cpi->gf_frame_index] ==
+                             INTNL_OVERLAY_UPDATE,
+                         gf_group->update_type[idx] == INTNL_ARF_UPDATE));
+          // Reset show_existing_frame and set cpi->is_dropped_frame to true if
+          // the frame was dropped during its first encode.
+          if (gf_group->is_frame_dropped[idx]) {
+            frame_params.show_existing_frame = 0;
+            assert(!cpi->is_dropped_frame);
+            cpi->is_dropped_frame = true;
+          }
+          break;
+        }
+      }
+    }
+
+    // Reset show_existing_alt_ref decision to 0 after it is used.
+    if (gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE) {
+      cpi->ppi->show_existing_alt_ref = 0;
+    }
+  } else {
+    frame_params.show_existing_frame = 0;
+  }
+
+  struct lookahead_entry *source = NULL;
+  struct lookahead_entry *last_source = NULL;
+  if (frame_params.show_existing_frame) {
+    source = av1_lookahead_peek(cpi->ppi->lookahead, 0, cpi->compressor_stage);
+    *pop_lookahead = 1;
+    frame_params.show_frame = 1;
+  } else {
+    source = choose_frame_source(cpi, &flush, pop_lookahead, &last_source,
+                                 &frame_params.show_frame);
+  }
+
+  if (source == NULL) {  // If no source was found, we can't encode a frame.
+#if !CONFIG_REALTIME_ONLY
+    if (flush && oxcf->pass == AOM_RC_FIRST_PASS &&
+        !cpi->ppi->twopass.first_pass_done) {
+      av1_end_first_pass(cpi); /* get last stats packet */
+      cpi->ppi->twopass.first_pass_done = 1;
+    }
+#endif
+    return -1;
+  }
+
+  // Reset src_offset to allow the actual encode call for this frame to get
+  // its source.
+  gf_group->src_offset[cpi->gf_frame_index] = 0;
+
+  // Source may be changed if temporally filtered later.
+  frame_input.source = &source->img;
+  if ((cpi->ppi->use_svc || cpi->rc.prev_frame_is_dropped) &&
+      last_source != NULL)
+    av1_svc_set_last_source(cpi, &frame_input, &last_source->img);
+  else
+    frame_input.last_source = last_source != NULL ? &last_source->img : NULL;
+  frame_input.ts_duration = source->ts_end - source->ts_start;
+  // Save the unfiltered source. It is used in av1_get_second_pass_params().
+  cpi->unfiltered_source = frame_input.source;
+
+  *time_stamp = source->ts_start;
+  *time_end = source->ts_end;
+  if (source->ts_start < cpi->time_stamps.first_ts_start) {
+    cpi->time_stamps.first_ts_start = source->ts_start;
+    cpi->time_stamps.prev_ts_end = source->ts_start;
+  }
+
+  av1_apply_encoding_flags(cpi, source->flags);
+  *frame_flags =
+      (source->flags & AOM_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
+
+#if CONFIG_FPMT_TEST
+  if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+    if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+      cpi->framerate = cpi->temp_framerate;
+    }
+  }
+#endif  // CONFIG_FPMT_TEST
+
+  // Shown frames and arf-overlay frames need frame-rate consideration.
+  if (frame_params.show_frame)
+    adjust_frame_rate(cpi, source->ts_start, source->ts_end);
+
+  if (!frame_params.show_existing_frame) {
+    if (cpi->film_grain_table) {
+      cm->cur_frame->film_grain_params_present = aom_film_grain_table_lookup(
+          cpi->film_grain_table, *time_stamp, *time_end, 0 /* =erase */,
+          &cm->film_grain_params);
+    } else {
+      cm->cur_frame->film_grain_params_present =
+          cm->seq_params->film_grain_params_present;
+    }
+    // Only one operating point is supported now.
+    const int64_t pts64 =
+        ticks_to_timebase_units(timestamp_ratio, *time_stamp);
+    if (pts64 < 0 || pts64 > UINT32_MAX) return AOM_CODEC_ERROR;
+
+    cm->frame_presentation_time = (uint32_t)pts64;
+  }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, av1_get_one_pass_rt_params_time);
+#endif
+#if CONFIG_REALTIME_ONLY
+  av1_get_one_pass_rt_params(cpi, &frame_params.frame_type, &frame_input,
+                             *frame_flags);
+  if (use_rtc_reference_structure_one_layer(cpi))
+    av1_set_rtc_reference_structure_one_layer(cpi, cpi->gf_frame_index == 0);
+#else
+  if (use_one_pass_rt_params) {
+    av1_get_one_pass_rt_params(cpi, &frame_params.frame_type, &frame_input,
+                               *frame_flags);
+    if (use_rtc_reference_structure_one_layer(cpi))
+      av1_set_rtc_reference_structure_one_layer(cpi, cpi->gf_frame_index == 0);
+  }
+#endif
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, av1_get_one_pass_rt_params_time);
+#endif
+
+  FRAME_UPDATE_TYPE frame_update_type =
+      get_frame_update_type(gf_group, cpi->gf_frame_index);
+
+  if (frame_params.show_existing_frame &&
+      frame_params.frame_type != KEY_FRAME) {
+    // Force show-existing frames to be INTER, except forward keyframes.
+    frame_params.frame_type = INTER_FRAME;
+  }
+
+  // Per-frame encode speed. In theory this can vary, but things may have
+  // been written assuming the speed level will not change within a sequence,
+  // so this parameter should be used with caution.
+  frame_params.speed = oxcf->speed;
+
+#if !CONFIG_REALTIME_ONLY
+  // Set forced key frames when necessary. For two-pass encoding / lap mode,
+  // this is already handled by av1_get_second_pass_params. However, when no
+  // stats are available, we still need to check if the new frame is a
+  // keyframe. For one-pass rt, this is already checked in
+  // av1_get_one_pass_rt_params.
+  if (!use_one_pass_rt_params &&
+      (is_stat_generation_stage(cpi) || has_no_stats_stage(cpi))) {
+    // The current frame is coded as a key frame in any of the following cases:
+    // 1) First frame of a video
+    // 2) For all-intra frame encoding
+    // 3) When a key frame is forced
+    const int kf_requested =
+        (cm->current_frame.frame_number == 0 ||
+         oxcf->kf_cfg.key_freq_max == 0 || (*frame_flags & FRAMEFLAGS_KEY));
+    if (kf_requested && frame_update_type != OVERLAY_UPDATE &&
+        frame_update_type != INTNL_OVERLAY_UPDATE) {
+      frame_params.frame_type = KEY_FRAME;
+    } else if (is_stat_generation_stage(cpi)) {
+      // For stats generation, set the frame type to inter here.
+ frame_params.frame_type = INTER_FRAME; + } + } +#endif + + // Work out some encoding parameters specific to the pass: + if (has_no_stats_stage(cpi) && oxcf->q_cfg.aq_mode == CYCLIC_REFRESH_AQ) { + av1_cyclic_refresh_update_parameters(cpi); + } else if (is_stat_generation_stage(cpi)) { + cpi->td.mb.e_mbd.lossless[0] = is_lossless_requested(&oxcf->rc_cfg); + } else if (is_stat_consumption_stage(cpi)) { +#if CONFIG_MISMATCH_DEBUG + mismatch_move_frame_idx_w(); +#endif +#if TXCOEFF_COST_TIMER + cm->txcoeff_cost_timer = 0; + cm->txcoeff_cost_count = 0; +#endif + } + + if (!is_stat_generation_stage(cpi)) + set_ext_overrides(cm, &frame_params, ext_flags); + + // Shown keyframes and S frames refresh all reference buffers + const int force_refresh_all = + ((frame_params.frame_type == KEY_FRAME && frame_params.show_frame) || + frame_params.frame_type == S_FRAME) && + !frame_params.show_existing_frame; + + av1_configure_buffer_updates( + cpi, &frame_params.refresh_frame, frame_update_type, + gf_group->refbuf_state[cpi->gf_frame_index], force_refresh_all); + + if (!is_stat_generation_stage(cpi)) { + const YV12_BUFFER_CONFIG *ref_frame_buf[INTER_REFS_PER_FRAME]; + + RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]; + init_ref_map_pair(cpi, ref_frame_map_pairs); + const int order_offset = gf_group->arf_src_offset[cpi->gf_frame_index]; + const int cur_frame_disp = + cpi->common.current_frame.frame_number + order_offset; + + int get_ref_frames = 0; +#if CONFIG_FPMT_TEST + get_ref_frames = + (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 1 : 0; +#endif // CONFIG_FPMT_TEST + if (get_ref_frames || + gf_group->frame_parallel_level[cpi->gf_frame_index] == 0) { + if (!ext_flags->refresh_frame.update_pending) { + av1_get_ref_frames(ref_frame_map_pairs, cur_frame_disp, cpi, + cpi->gf_frame_index, 1, cm->remapped_ref_idx); + } else if (cpi->ppi->rtc_ref.set_ref_frame_config || + use_rtc_reference_structure_one_layer(cpi)) { + for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) + cm->remapped_ref_idx[i] = cpi->ppi->rtc_ref.ref_idx[i]; + } + } + + // Get the reference frames + bool has_ref_frames = false; + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + const RefCntBuffer *ref_frame = + get_ref_frame_buf(cm, ref_frame_priority_order[i]); + ref_frame_buf[i] = ref_frame != NULL ? &ref_frame->buf : NULL; + if (ref_frame != NULL) has_ref_frames = true; + } + if (!has_ref_frames && (frame_params.frame_type == INTER_FRAME || + frame_params.frame_type == S_FRAME)) { + return AOM_CODEC_ERROR; + } + + // Work out which reference frame slots may be used. + frame_params.ref_frame_flags = + get_ref_frame_flags(&cpi->sf, is_one_pass_rt_params(cpi), ref_frame_buf, + ext_flags->ref_frame_flags); + + // Set primary_ref_frame of non-reference frames as PRIMARY_REF_NONE. + if (cpi->ppi->gf_group.is_frame_non_ref[cpi->gf_frame_index]) { + frame_params.primary_ref_frame = PRIMARY_REF_NONE; + } else { + frame_params.primary_ref_frame = + choose_primary_ref_frame(cpi, &frame_params); + } + + frame_params.order_offset = gf_group->arf_src_offset[cpi->gf_frame_index]; + + // Call av1_get_refresh_frame_flags() if refresh index not available. 
+ if (!cpi->refresh_idx_available) { + frame_params.refresh_frame_flags = av1_get_refresh_frame_flags( + cpi, &frame_params, frame_update_type, cpi->gf_frame_index, + cur_frame_disp, ref_frame_map_pairs); + } else { + assert(cpi->ref_refresh_index != INVALID_IDX); + frame_params.refresh_frame_flags = (1 << cpi->ref_refresh_index); + } + + // Make the frames marked as is_frame_non_ref to non-reference frames. + if (gf_group->is_frame_non_ref[cpi->gf_frame_index]) + frame_params.refresh_frame_flags = 0; + + frame_params.existing_fb_idx_to_show = INVALID_IDX; + // Find the frame buffer to show based on display order. + if (frame_params.show_existing_frame) { + for (int frame = 0; frame < REF_FRAMES; frame++) { + const RefCntBuffer *const buf = cm->ref_frame_map[frame]; + if (buf == NULL) continue; + const int frame_order = (int)buf->display_order_hint; + if (frame_order == cur_frame_disp) + frame_params.existing_fb_idx_to_show = frame; + } + } + } + + // The way frame_params->remapped_ref_idx is setup is a placeholder. + // Currently, reference buffer assignment is done by update_ref_frame_map() + // which is called by high-level strategy AFTER encoding a frame. It + // modifies cm->remapped_ref_idx. If you want to use an alternative method + // to determine reference buffer assignment, just put your assignments into + // frame_params->remapped_ref_idx here and they will be used when encoding + // this frame. If frame_params->remapped_ref_idx is setup independently of + // cm->remapped_ref_idx then update_ref_frame_map() will have no effect. + memcpy(frame_params.remapped_ref_idx, cm->remapped_ref_idx, + REF_FRAMES * sizeof(*cm->remapped_ref_idx)); + + cpi->td.mb.rdmult_delta_qindex = cpi->td.mb.delta_qindex = 0; + + if (!frame_params.show_existing_frame) { + cm->quant_params.using_qmatrix = oxcf->q_cfg.using_qm; + } + + const int is_intra_frame = frame_params.frame_type == KEY_FRAME || + frame_params.frame_type == INTRA_ONLY_FRAME; + FeatureFlags *const features = &cm->features; + if (!is_stat_generation_stage(cpi) && + (oxcf->pass == AOM_RC_ONE_PASS || oxcf->pass >= AOM_RC_SECOND_PASS) && + is_intra_frame) { + av1_set_screen_content_options(cpi, features); + } + +#if CONFIG_REALTIME_ONLY + if (av1_encode(cpi, dest, &frame_input, &frame_params, &frame_results) != + AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } +#else + if (has_no_stats_stage(cpi) && oxcf->mode == REALTIME && + gf_cfg->lag_in_frames == 0) { + if (av1_encode(cpi, dest, &frame_input, &frame_params, &frame_results) != + AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } + } else if (denoise_and_encode(cpi, dest, &frame_input, &frame_params, + &frame_results) != AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } +#endif // CONFIG_REALTIME_ONLY + + // This is used in rtc temporal filter case. Use true source in the PSNR + // calculation. 
+ if (is_psnr_calc_enabled(cpi) && cpi->sf.rt_sf.use_rtc_tf && + cpi->common.current_frame.frame_type != KEY_FRAME) { + assert(cpi->orig_source.buffer_alloc_sz > 0); + cpi->source = &cpi->orig_source; + } + + if (!is_stat_generation_stage(cpi)) { + // First pass doesn't modify reference buffer assignment or produce frame + // flags + update_frame_flags(&cpi->common, &cpi->refresh_frame, frame_flags); + set_additional_frame_flags(cm, frame_flags); + } + +#if !CONFIG_REALTIME_ONLY +#if TXCOEFF_COST_TIMER + if (!is_stat_generation_stage(cpi)) { + cm->cum_txcoeff_cost_timer += cm->txcoeff_cost_timer; + fprintf(stderr, + "\ntxb coeff cost block number: %ld, frame time: %ld, cum time %ld " + "in us\n", + cm->txcoeff_cost_count, cm->txcoeff_cost_timer, + cm->cum_txcoeff_cost_timer); + } +#endif +#endif // !CONFIG_REALTIME_ONLY + +#if CONFIG_TUNE_VMAF + if (!is_stat_generation_stage(cpi) && + (oxcf->tune_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING && + oxcf->tune_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN)) { + av1_update_vmaf_curve(cpi); + } +#endif + + // Unpack frame_results: + *size = frame_results.size; + + // Leave a signal for a higher level caller about if this frame is droppable + if (*size > 0) { + cpi->droppable = + is_frame_droppable(&cpi->ppi->rtc_ref, &ext_flags->refresh_frame); + } + + // For SVC, or when frame-dropper is enabled: + // keep track of the (unscaled) source corresponding to the refresh of LAST + // reference (base temporal layer - TL0). Copy only for the + // top spatial enhancement layer so all spatial layers of the next + // superframe have last_source to be aligned with previous TL0 superframe. + // Avoid cases where resolution changes for unscaled source (top spatial + // layer). Only needs to be done for frame that are encoded (size > 0). + if (*size > 0 && + (cpi->ppi->use_svc || cpi->oxcf.rc_cfg.drop_frames_water_mark > 0) && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1 && + cpi->svc.temporal_layer_id == 0 && + cpi->unscaled_source->y_width == cpi->svc.source_last_TL0.y_width && + cpi->unscaled_source->y_height == cpi->svc.source_last_TL0.y_height) { + aom_yv12_copy_y(cpi->unscaled_source, &cpi->svc.source_last_TL0); + aom_yv12_copy_u(cpi->unscaled_source, &cpi->svc.source_last_TL0); + aom_yv12_copy_v(cpi->unscaled_source, &cpi->svc.source_last_TL0); + } + + return AOM_CODEC_OK; +} diff --git a/third_party/aom/av1/encoder/encode_strategy.h b/third_party/aom/av1/encoder/encode_strategy.h new file mode 100644 index 0000000000..c1d14d134c --- /dev/null +++ b/third_party/aom/av1/encoder/encode_strategy.h @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*!\file + * \brief Declares frame encoding functions. 
+ */
+#ifndef AOM_AV1_ENCODER_ENCODE_STRATEGY_H_
+#define AOM_AV1_ENCODER_ENCODE_STRATEGY_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+
+#include "aom/aom_encoder.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/firstpass.h"
+
+/*!\brief Implement high-level encode strategy
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ * \callergraph
+ * This function implements the high-level encode strategy, choosing frame
+ * type, frame placement, etc. It populates an EncodeFrameParams struct with
+ * the results of these decisions and then encodes the frame. The caller
+ * should use the output parameters *time_stamp and *time_end only when this
+ * function returns AOM_CODEC_OK.
+ *
+ * \param[in]  cpi             Top-level encoder structure
+ * \param[in]  size            Bitstream size
+ * \param[in]  dest            Bitstream output
+ * \param[in]  frame_flags     Flags to decide how to encode the frame
+ * \param[out] time_stamp      Time stamp of the frame
+ * \param[out] time_end        Time end
+ * \param[in]  timestamp_ratio Time base
+ * \param[in]  pop_lookahead   Whether to pop the source frame from the queue
+ * \param[in]  flush           Whether to encode one frame or the rest of the
+ *                             frames
+ *
+ * \return Returns a value to indicate whether the encoding was successful.
+ * \retval #AOM_CODEC_OK
+ * \retval -1
+ * \retval #AOM_CODEC_ERROR
+ */
+int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
+                        uint8_t *const dest, unsigned int *frame_flags,
+                        int64_t *const time_stamp, int64_t *const time_end,
+                        const aom_rational64_t *const timestamp_ratio,
+                        int *const pop_lookahead, int flush);
+
+/*!\cond */
+// Set individual buffer update flags based on frame reference type.
+// force_refresh_all is used when we have a KEY_FRAME or S_FRAME. It forces all
+// refresh_*_frame flags to be set, because we refresh all buffers in this case.
+void av1_configure_buffer_updates(AV1_COMP *const cpi,
+                                  RefreshFrameInfo *const refresh_frame,
+                                  const FRAME_UPDATE_TYPE type,
+                                  const REFBUF_STATE refbuf_state,
+                                  int force_refresh_all);
+
+int av1_get_refresh_frame_flags(
+    const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params,
+    FRAME_UPDATE_TYPE frame_update_type, int gf_index, int cur_disp_order,
+    RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]);
+
+int av1_get_refresh_ref_frame_map(int refresh_frame_flags);
+
+/*!\brief Obtain indices of reference frames in ref_frame_map
+ *
+ * \callgraph
+ * \callergraph
+ *
+ * \param[out] remapped_ref_idx An array for storing indices of reference
+ *                              frames. The index is used to retrieve a
+ *                              reference frame buffer from ref_frame_map
+ *                              in AV1Common.
+ */
+void av1_get_ref_frames(RefFrameMapPair ref_frame_map_pairs[REF_FRAMES],
+                        int cur_frame_disp, const AV1_COMP *cpi, int gf_index,
+                        int is_parallel_encode,
+                        int remapped_ref_idx[REF_FRAMES]);
+
+int is_forced_keyframe_pending(struct lookahead_ctx *lookahead,
+                               const int up_to_index,
+                               const COMPRESSOR_STAGE compressor_stage);
+
+static AOM_INLINE int is_frame_droppable(
+    const RTC_REF *const rtc_ref,
+    const ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags) {
+  // Droppable frame is only used by external refresh flags. VoD setting won't
+  // trigger its use case.
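+  // (In other words: under rtc_ref-style frame configs the decision comes
+  // straight from non_reference_frame, while under externally updated refresh
+  // flags a frame is droppable only when it refreshes no reference buffer at
+  // all.)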
+ if (rtc_ref->set_ref_frame_config) + return rtc_ref->non_reference_frame; + else if (ext_refresh_frame_flags->update_pending) + return !(ext_refresh_frame_flags->alt_ref_frame || + ext_refresh_frame_flags->alt2_ref_frame || + ext_refresh_frame_flags->bwd_ref_frame || + ext_refresh_frame_flags->golden_frame || + ext_refresh_frame_flags->last_frame); + else + return 0; +} + +static AOM_INLINE int get_current_frame_ref_type(const AV1_COMP *const cpi) { + // We choose the reference "type" of this frame from the flags which indicate + // which reference frames will be refreshed by it. More than one of these + // flags may be set, so the order here implies an order of precedence. This is + // just used to choose the primary_ref_frame (as the most recent reference + // buffer of the same reference-type as the current frame). + + switch (cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index]) { + case 0: return 0; + case 1: return 1; + case MAX_ARF_LAYERS: + case MAX_ARF_LAYERS + 1: return 4; + default: return 7; + } +} + +int av1_calc_refresh_idx_for_intnl_arf( + AV1_COMP *cpi, RefFrameMapPair ref_frame_map_pairs[REF_FRAMES], + int gf_index); +/*!\endcond */ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_ENCODE_STRATEGY_H_ diff --git a/third_party/aom/av1/encoder/encodeframe.c b/third_party/aom/av1/encoder/encodeframe.c new file mode 100644 index 0000000000..e2213a8355 --- /dev/null +++ b/third_party/aom/av1/encoder/encodeframe.c @@ -0,0 +1,2408 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/aom_timer.h"
+
+#if CONFIG_MISMATCH_DEBUG
+#include "aom_util/debug_util.h"
+#endif  // CONFIG_MISMATCH_DEBUG
+
+#include "av1/common/cfl.h"
+#include "av1/common/common.h"
+#include "av1/common/common_data.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/idct.h"
+#include "av1/common/mv.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/tile_common.h"
+#include "av1/common/warped_motion.h"
+
+#include "av1/encoder/allintra_vis.h"
+#include "av1/encoder/aq_complexity.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/global_motion_facade.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodeframe_utils.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/intra_mode_search_utils.h"
+#include "av1/encoder/ml.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/partition_strategy.h"
+#if !CONFIG_REALTIME_ONLY
+#include "av1/encoder/partition_model_weights.h"
+#endif
+#include "av1/encoder/partition_search.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/tokenize.h"
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/var_based_part.h"
+
+#if CONFIG_TUNE_VMAF
+#include "av1/encoder/tune_vmaf.h"
+#endif
+
+/*!\cond */
+// This is used as a reference when computing the source variance for the
+// purposes of activity masking.
+// Eventually this should be replaced by custom no-reference routines,
+// which will be faster.
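+// For example, av1_get_perpixel_variance() below computes
+//   vf(ref->buf, ref->stride, get_var_offs(use_hbd, bd), 0, &sse)
+// i.e. the variance of the source block against a flat mid-grey (128) block,
+// then normalizes the result by the pixel count of the block.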
+static const uint8_t AV1_VAR_OFFS[MAX_SB_SIZE] = { + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128 +}; + +#if CONFIG_AV1_HIGHBITDEPTH +static const uint16_t AV1_HIGH_VAR_OFFS_8[MAX_SB_SIZE] = { + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128 +}; + +static const uint16_t AV1_HIGH_VAR_OFFS_10[MAX_SB_SIZE] = { + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4 +}; + +static const uint16_t AV1_HIGH_VAR_OFFS_12[MAX_SB_SIZE] = { + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 
* 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16 +}; +#endif // CONFIG_AV1_HIGHBITDEPTH +/*!\endcond */ + +// For the given bit depth, returns a constant array used to assist the +// calculation of source block variance, which will then be used to decide +// adaptive quantizers. +static const uint8_t *get_var_offs(int use_hbd, int bd) { +#if CONFIG_AV1_HIGHBITDEPTH + if (use_hbd) { + assert(bd == 8 || bd == 10 || bd == 12); + const int off_index = (bd - 8) >> 1; + static const uint16_t *high_var_offs[3] = { AV1_HIGH_VAR_OFFS_8, + AV1_HIGH_VAR_OFFS_10, + AV1_HIGH_VAR_OFFS_12 }; + return CONVERT_TO_BYTEPTR(high_var_offs[off_index]); + } +#else + (void)use_hbd; + (void)bd; + assert(!use_hbd); +#endif + assert(bd == 8); + return AV1_VAR_OFFS; +} + +void av1_init_rtc_counters(MACROBLOCK *const x) { + av1_init_cyclic_refresh_counters(x); + x->cnt_zeromv = 0; +} + +void av1_accumulate_rtc_counters(AV1_COMP *cpi, const MACROBLOCK *const x) { + if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ) + av1_accumulate_cyclic_refresh_counters(cpi->cyclic_refresh, x); + cpi->rc.cnt_zeromv += x->cnt_zeromv; +} + +unsigned int av1_get_perpixel_variance(const AV1_COMP *cpi, + const MACROBLOCKD *xd, + const struct buf_2d *ref, + BLOCK_SIZE bsize, int plane, + int use_hbd) { + const int subsampling_x = xd->plane[plane].subsampling_x; + const int subsampling_y = xd->plane[plane].subsampling_y; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, subsampling_x, subsampling_y); + unsigned int sse; + const unsigned int var = cpi->ppi->fn_ptr[plane_bsize].vf( + ref->buf, ref->stride, get_var_offs(use_hbd, xd->bd), 0, &sse); + return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[plane_bsize]); +} + +unsigned int av1_get_perpixel_variance_facade(const AV1_COMP *cpi, + const MACROBLOCKD *xd, + const struct buf_2d *ref, + BLOCK_SIZE bsize, int plane) { + const int use_hbd = is_cur_buf_hbd(xd); + return av1_get_perpixel_variance(cpi, xd, ref, bsize, plane, use_hbd); +} + +void av1_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src, + int mi_row, int mi_col, const int num_planes, + BLOCK_SIZE bsize) { + // Set current frame pointer. + x->e_mbd.cur_buf = src; + + // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet + // the static analysis warnings. + for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); i++) { + const int is_uv = i > 0; + setup_pred_plane( + &x->plane[i].src, bsize, src->buffers[i], src->crop_widths[is_uv], + src->crop_heights[is_uv], src->strides[is_uv], mi_row, mi_col, NULL, + x->e_mbd.plane[i].subsampling_x, x->e_mbd.plane[i].subsampling_y); + } +} + +#if !CONFIG_REALTIME_ONLY +/*!\brief Assigns different quantization parameters to each super + * block based on its TPL weight. + * + * \ingroup tpl_modelling + * + * \param[in] cpi Top level encoder instance structure + * \param[in,out] td Thread data structure + * \param[in,out] x Macro block level data for this block. 
+ * \param[in]    tile_info  Tile information / identification
+ * \param[in]    mi_row     Block row (in "MI_SIZE" units) index
+ * \param[in]    mi_col     Block column (in "MI_SIZE" units) index
+ * \param[in]    num_planes Number of image planes (e.g. Y,U,V)
+ *
+ * \remark No return value but updates macroblock and thread data
+ * related to the q / q delta to be used.
+ */
+static AOM_INLINE void setup_delta_q(AV1_COMP *const cpi, ThreadData *td,
+                                     MACROBLOCK *const x,
+                                     const TileInfo *const tile_info,
+                                     int mi_row, int mi_col, int num_planes) {
+  AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+  assert(delta_q_info->delta_q_present_flag);
+
+  const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+  // Delta-q modulation based on variance
+  av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, sb_size);
+
+  const int delta_q_res = delta_q_info->delta_q_res;
+  int current_qindex = cm->quant_params.base_qindex;
+  if (cpi->use_ducky_encode && cpi->ducky_encode_info.frame_info.qp_mode ==
+                                   DUCKY_ENCODE_FRAME_MODE_QINDEX) {
+    const int sb_row = mi_row >> cm->seq_params->mib_size_log2;
+    const int sb_col = mi_col >> cm->seq_params->mib_size_log2;
+    const int sb_cols =
+        CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, cm->seq_params->mib_size_log2);
+    const int sb_index = sb_row * sb_cols + sb_col;
+    current_qindex =
+        cpi->ducky_encode_info.frame_info.superblock_encode_qindex[sb_index];
+  } else if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL) {
+    if (DELTA_Q_PERCEPTUAL_MODULATION == 1) {
+      const int block_wavelet_energy_level =
+          av1_block_wavelet_energy_level(cpi, x, sb_size);
+      x->sb_energy_level = block_wavelet_energy_level;
+      current_qindex = av1_compute_q_from_energy_level_deltaq_mode(
+          cpi, block_wavelet_energy_level);
+    } else {
+      const int block_var_level = av1_log_block_var(cpi, x, sb_size);
+      x->sb_energy_level = block_var_level;
+      current_qindex =
+          av1_compute_q_from_energy_level_deltaq_mode(cpi, block_var_level);
+    }
+  } else if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_OBJECTIVE &&
+             cpi->oxcf.algo_cfg.enable_tpl_model) {
+    // Set up deltaq based on tpl stats.
+    current_qindex =
+        av1_get_q_for_deltaq_objective(cpi, td, NULL, sb_size, mi_row, mi_col);
+  } else if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL_AI) {
+    current_qindex = av1_get_sbq_perceptual_ai(cpi, sb_size, mi_row, mi_col);
+  } else if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_USER_RATING_BASED) {
+    current_qindex = av1_get_sbq_user_rating_based(cpi, mi_row, mi_col);
+  } else if (cpi->oxcf.q_cfg.enable_hdr_deltaq) {
+    current_qindex = av1_get_q_for_hdr(cpi, x, sb_size, mi_row, mi_col);
+  }
+
+  x->rdmult_cur_qindex = current_qindex;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int adjusted_qindex = av1_adjust_q_from_delta_q_res(
+      delta_q_res, xd->current_base_qindex, current_qindex);
+  if (cpi->use_ducky_encode) {
+    assert(adjusted_qindex == current_qindex);
+  }
+  current_qindex = adjusted_qindex;
+
+  x->delta_qindex = current_qindex - cm->quant_params.base_qindex;
+  x->rdmult_delta_qindex = x->delta_qindex;
+
+  av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+  xd->mi[0]->current_qindex = current_qindex;
+  av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id, 0);
+
+  // Keep track of any non-zero delta-q used.
+  td->deltaq_used |= (x->delta_qindex != 0);
+
+  if (cpi->oxcf.tool_cfg.enable_deltalf_mode) {
+    const int delta_lf_res = delta_q_info->delta_lf_res;
+    const int lfmask = ~(delta_lf_res -
1); + const int delta_lf_from_base = + ((x->delta_qindex / 4 + delta_lf_res / 2) & lfmask); + const int8_t delta_lf = + (int8_t)clamp(delta_lf_from_base, -MAX_LOOP_FILTER, MAX_LOOP_FILTER); + const int frame_lf_count = + av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + const int mib_size = cm->seq_params->mib_size; + + // pre-set the delta lf for loop filter. Note that this value is set + // before mi is assigned for each block in current superblock + for (int j = 0; j < AOMMIN(mib_size, mi_params->mi_rows - mi_row); j++) { + for (int k = 0; k < AOMMIN(mib_size, mi_params->mi_cols - mi_col); k++) { + const int grid_idx = get_mi_grid_idx(mi_params, mi_row + j, mi_col + k); + mi_params->mi_alloc[grid_idx].delta_lf_from_base = delta_lf; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { + mi_params->mi_alloc[grid_idx].delta_lf[lf_id] = delta_lf; + } + } + } + } +} + +static void init_ref_frame_space(AV1_COMP *cpi, ThreadData *td, int mi_row, + int mi_col) { + const AV1_COMMON *cm = &cpi->common; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + MACROBLOCK *x = &td->mb; + const int frame_idx = cpi->gf_frame_index; + TplParams *const tpl_data = &cpi->ppi->tpl_data; + const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2; + + av1_zero(x->tpl_keep_ref_frame); + + if (!av1_tpl_stats_ready(tpl_data, frame_idx)) return; + if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) return; + if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return; + + const int is_overlay = + cpi->ppi->gf_group.update_type[frame_idx] == OVERLAY_UPDATE; + if (is_overlay) { + memset(x->tpl_keep_ref_frame, 1, sizeof(x->tpl_keep_ref_frame)); + return; + } + + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + const int tpl_stride = tpl_frame->stride; + int64_t inter_cost[INTER_REFS_PER_FRAME] = { 0 }; + const int step = 1 << block_mis_log2; + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; + + const int mi_row_end = + AOMMIN(mi_size_high[sb_size] + mi_row, mi_params->mi_rows); + const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); + const int mi_col_sr = + coded_to_superres_mi(mi_col, cm->superres_scale_denominator); + const int mi_col_end_sr = + AOMMIN(coded_to_superres_mi(mi_col + mi_size_wide[sb_size], + cm->superres_scale_denominator), + mi_cols_sr); + const int row_step = step; + const int col_step_sr = + coded_to_superres_mi(step, cm->superres_scale_denominator); + for (int row = mi_row; row < mi_row_end; row += row_step) { + for (int col = mi_col_sr; col < mi_col_end_sr; col += col_step_sr) { + const TplDepStats *this_stats = + &tpl_stats[av1_tpl_ptr_pos(row, col, tpl_stride, block_mis_log2)]; + int64_t tpl_pred_error[INTER_REFS_PER_FRAME] = { 0 }; + // Find the winner ref frame idx for the current block + int64_t best_inter_cost = this_stats->pred_error[0]; + int best_rf_idx = 0; + for (int idx = 1; idx < INTER_REFS_PER_FRAME; ++idx) { + if ((this_stats->pred_error[idx] < best_inter_cost) && + (this_stats->pred_error[idx] != 0)) { + best_inter_cost = this_stats->pred_error[idx]; + best_rf_idx = idx; + } + } + // tpl_pred_error is the pred_error reduction of best_ref w.r.t. + // LAST_FRAME. 
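+      // By construction pred_error[best_rf_idx] is the minimum, so this
+      // difference is <= 0; the more negative the accumulated inter_cost of a
+      // reference becomes below, the more useful that reference is.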
+ tpl_pred_error[best_rf_idx] = this_stats->pred_error[best_rf_idx] - + this_stats->pred_error[LAST_FRAME - 1]; + + for (int rf_idx = 1; rf_idx < INTER_REFS_PER_FRAME; ++rf_idx) + inter_cost[rf_idx] += tpl_pred_error[rf_idx]; + } + } + + int rank_index[INTER_REFS_PER_FRAME - 1]; + for (int idx = 0; idx < INTER_REFS_PER_FRAME - 1; ++idx) { + rank_index[idx] = idx + 1; + for (int i = idx; i > 0; --i) { + if (inter_cost[rank_index[i - 1]] > inter_cost[rank_index[i]]) { + const int tmp = rank_index[i - 1]; + rank_index[i - 1] = rank_index[i]; + rank_index[i] = tmp; + } + } + } + + x->tpl_keep_ref_frame[INTRA_FRAME] = 1; + x->tpl_keep_ref_frame[LAST_FRAME] = 1; + + int cutoff_ref = 0; + for (int idx = 0; idx < INTER_REFS_PER_FRAME - 1; ++idx) { + x->tpl_keep_ref_frame[rank_index[idx] + LAST_FRAME] = 1; + if (idx > 2) { + if (!cutoff_ref) { + // If the predictive coding gains are smaller than the previous more + // relevant frame over certain amount, discard this frame and all the + // frames afterwards. + if (llabs(inter_cost[rank_index[idx]]) < + llabs(inter_cost[rank_index[idx - 1]]) / 8 || + inter_cost[rank_index[idx]] == 0) + cutoff_ref = 1; + } + + if (cutoff_ref) x->tpl_keep_ref_frame[rank_index[idx] + LAST_FRAME] = 0; + } + } +} + +static AOM_INLINE void adjust_rdmult_tpl_model(AV1_COMP *cpi, MACROBLOCK *x, + int mi_row, int mi_col) { + const BLOCK_SIZE sb_size = cpi->common.seq_params->sb_size; + const int orig_rdmult = cpi->rd.RDMULT; + + assert(IMPLIES(cpi->ppi->gf_group.size > 0, + cpi->gf_frame_index < cpi->ppi->gf_group.size)); + const int gf_group_index = cpi->gf_frame_index; + if (cpi->oxcf.algo_cfg.enable_tpl_model && cpi->oxcf.q_cfg.aq_mode == NO_AQ && + cpi->oxcf.q_cfg.deltaq_mode == NO_DELTA_Q && gf_group_index > 0 && + cpi->ppi->gf_group.update_type[gf_group_index] == ARF_UPDATE) { + const int dr = + av1_get_rdmult_delta(cpi, sb_size, mi_row, mi_col, orig_rdmult); + x->rdmult = dr; + } +} +#endif // !CONFIG_REALTIME_ONLY + +#if CONFIG_RT_ML_PARTITIONING +// Get a prediction(stored in x->est_pred) for the whole superblock. 
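+// Rough sketch of the behaviour: inter frames reuse the LAST_FRAME
+// reconstruction through a zero-mv BILINEAR predictor, while intra-only
+// frames fall back to a flat mid-grey block (128, scaled up for 10- and
+// 12-bit depths).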
+static void get_estimated_pred(AV1_COMP *cpi, const TileInfo *const tile,
+                               MACROBLOCK *x, int mi_row, int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int is_key_frame = frame_is_intra_only(cm);
+  MACROBLOCKD *xd = &x->e_mbd;
+
+  // TODO(kyslov) Extend to 128x128
+  assert(cm->seq_params->sb_size == BLOCK_64X64);
+
+  av1_set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64);
+
+  if (!is_key_frame) {
+    MB_MODE_INFO *mi = xd->mi[0];
+    const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME);
+
+    assert(yv12 != NULL);
+
+    av1_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
+                         get_ref_scale_factors(cm, LAST_FRAME), 1);
+    mi->ref_frame[0] = LAST_FRAME;
+    mi->ref_frame[1] = NONE_FRAME;
+    mi->bsize = BLOCK_64X64;
+    mi->mv[0].as_int = 0;
+    mi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
+
+    set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
+
+    xd->plane[0].dst.buf = x->est_pred;
+    xd->plane[0].dst.stride = 64;
+    av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
+  } else {
+#if CONFIG_AV1_HIGHBITDEPTH
+    switch (xd->bd) {
+      case 8: memset(x->est_pred, 128, 64 * 64 * sizeof(x->est_pred[0])); break;
+      case 10:
+        memset(x->est_pred, 128 * 4, 64 * 64 * sizeof(x->est_pred[0]));
+        break;
+      case 12:
+        memset(x->est_pred, 128 * 16, 64 * 64 * sizeof(x->est_pred[0]));
+        break;
+    }
+#else
+    memset(x->est_pred, 128, 64 * 64 * sizeof(x->est_pred[0]));
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+  }
+}
+#endif  // CONFIG_RT_ML_PARTITIONING
+
+#define AVG_CDF_WEIGHT_LEFT 3
+#define AVG_CDF_WEIGHT_TOP_RIGHT 1
+
+/*!\brief Encode a superblock (minimal RD search involved)
+ *
+ * \ingroup partition_search
+ * Encodes the superblock by a pre-determined partition pattern; only minor
+ * rd-based searches are allowed to adjust the initial pattern. It is only used
+ * by realtime encoding.
+ */
+static AOM_INLINE void encode_nonrd_sb(AV1_COMP *cpi, ThreadData *td,
+                                       TileDataEnc *tile_data, TokenExtra **tp,
+                                       const int mi_row, const int mi_col,
+                                       const int seg_skip) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  const TileInfo *const tile_info = &tile_data->tile_info;
+  MB_MODE_INFO **mi = cm->mi_params.mi_grid_base +
+                      get_mi_grid_idx(&cm->mi_params, mi_row, mi_col);
+  const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+  PC_TREE *const pc_root = td->pc_root;
+
+#if CONFIG_RT_ML_PARTITIONING
+  if (sf->part_sf.partition_search_type == ML_BASED_PARTITION) {
+    RD_STATS dummy_rdc;
+    get_estimated_pred(cpi, tile_info, x, mi_row, mi_col);
+    av1_nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col,
+                             BLOCK_64X64, &dummy_rdc, 1, INT64_MAX, pc_root);
+    return;
+  }
+#endif
+  // Set the partition
+  if (sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip ||
+      (sf->rt_sf.use_fast_fixed_part &&
+       x->content_state_sb.source_sad_nonrd < kMedSad)) {
+    // set a fixed-size partition
+    av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+    BLOCK_SIZE bsize_select = sf->part_sf.fixed_partition_size;
+    if (sf->rt_sf.use_fast_fixed_part &&
+        x->content_state_sb.source_sad_nonrd < kLowSad) {
+      bsize_select = BLOCK_64X64;
+    }
+    const BLOCK_SIZE bsize = seg_skip ?
sb_size : bsize_select; + av1_set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); + } else if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION) { + // set a variance-based partition + av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size); + av1_choose_var_based_partitioning(cpi, tile_info, td, x, mi_row, mi_col); + } + assert(sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip || + sf->part_sf.partition_search_type == VAR_BASED_PARTITION); + set_cb_offsets(td->mb.cb_offset, 0, 0); + + // Initialize the flag to skip cdef to 1. + if (sf->rt_sf.skip_cdef_sb) { + const int block64_in_sb = (sb_size == BLOCK_128X128) ? 2 : 1; + // If 128x128 block is used, we need to set the flag for all 4 64x64 sub + // "blocks". + for (int r = 0; r < block64_in_sb; ++r) { + for (int c = 0; c < block64_in_sb; ++c) { + const int idx_in_sb = + r * MI_SIZE_64X64 * cm->mi_params.mi_stride + c * MI_SIZE_64X64; + if (mi[idx_in_sb]) mi[idx_in_sb]->cdef_strength = 1; + } + } + } + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, nonrd_use_partition_time); +#endif + av1_nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size, + pc_root); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, nonrd_use_partition_time); +#endif +} + +// This function initializes the stats for encode_rd_sb. +static INLINE void init_encode_rd_sb(AV1_COMP *cpi, ThreadData *td, + const TileDataEnc *tile_data, + SIMPLE_MOTION_DATA_TREE *sms_root, + RD_STATS *rd_cost, int mi_row, int mi_col, + int gather_tpl_data) { + const AV1_COMMON *cm = &cpi->common; + const TileInfo *tile_info = &tile_data->tile_info; + MACROBLOCK *x = &td->mb; + + const SPEED_FEATURES *sf = &cpi->sf; + const int use_simple_motion_search = + (sf->part_sf.simple_motion_search_split || + sf->part_sf.simple_motion_search_prune_rect || + sf->part_sf.simple_motion_search_early_term_none || + sf->part_sf.ml_early_term_after_part_split_level) && + !frame_is_intra_only(cm); + if (use_simple_motion_search) { + av1_init_simple_motion_search_mvs_for_sb(cpi, tile_info, x, sms_root, + mi_row, mi_col); + } + +#if !CONFIG_REALTIME_ONLY + if (!(has_no_stats_stage(cpi) && cpi->oxcf.mode == REALTIME && + cpi->oxcf.gf_cfg.lag_in_frames == 0)) { + init_ref_frame_space(cpi, td, mi_row, mi_col); + x->sb_energy_level = 0; + x->part_search_info.cnn_output_valid = 0; + if (gather_tpl_data) { + if (cm->delta_q_info.delta_q_present_flag) { + const int num_planes = av1_num_planes(cm); + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; + setup_delta_q(cpi, td, x, tile_info, mi_row, mi_col, num_planes); + av1_tpl_rdmult_setup_sb(cpi, x, sb_size, mi_row, mi_col); + } + + // TODO(jingning): revisit this function. 
+ if (cpi->oxcf.algo_cfg.enable_tpl_model && (0)) { + adjust_rdmult_tpl_model(cpi, x, mi_row, mi_col); + } + } + } +#else + (void)tile_info; + (void)mi_row; + (void)mi_col; + (void)gather_tpl_data; +#endif + + x->reuse_inter_pred = false; + x->txfm_search_params.mode_eval_type = DEFAULT_EVAL; + reset_mb_rd_record(x->txfm_search_info.mb_rd_record); + av1_zero(x->picked_ref_frames_mask); + av1_invalid_rd_stats(rd_cost); +} + +#if !CONFIG_REALTIME_ONLY +static void sb_qp_sweep_init_quantizers(AV1_COMP *cpi, ThreadData *td, + const TileDataEnc *tile_data, + SIMPLE_MOTION_DATA_TREE *sms_tree, + RD_STATS *rd_cost, int mi_row, + int mi_col, int delta_qp_ofs) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &td->mb; + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; + const TileInfo *tile_info = &tile_data->tile_info; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const DeltaQInfo *const delta_q_info = &cm->delta_q_info; + assert(delta_q_info->delta_q_present_flag); + const int delta_q_res = delta_q_info->delta_q_res; + + const SPEED_FEATURES *sf = &cpi->sf; + const int use_simple_motion_search = + (sf->part_sf.simple_motion_search_split || + sf->part_sf.simple_motion_search_prune_rect || + sf->part_sf.simple_motion_search_early_term_none || + sf->part_sf.ml_early_term_after_part_split_level) && + !frame_is_intra_only(cm); + if (use_simple_motion_search) { + av1_init_simple_motion_search_mvs_for_sb(cpi, tile_info, x, sms_tree, + mi_row, mi_col); + } + + int current_qindex = x->rdmult_cur_qindex + delta_qp_ofs; + + MACROBLOCKD *const xd = &x->e_mbd; + current_qindex = av1_adjust_q_from_delta_q_res( + delta_q_res, xd->current_base_qindex, current_qindex); + + x->delta_qindex = current_qindex - cm->quant_params.base_qindex; + + av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size); + xd->mi[0]->current_qindex = current_qindex; + av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id, 0); + + // keep track of any non-zero delta-q used + td->deltaq_used |= (x->delta_qindex != 0); + + if (cpi->oxcf.tool_cfg.enable_deltalf_mode) { + const int delta_lf_res = delta_q_info->delta_lf_res; + const int lfmask = ~(delta_lf_res - 1); + const int delta_lf_from_base = + ((x->delta_qindex / 4 + delta_lf_res / 2) & lfmask); + const int8_t delta_lf = + (int8_t)clamp(delta_lf_from_base, -MAX_LOOP_FILTER, MAX_LOOP_FILTER); + const int frame_lf_count = + av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + const int mib_size = cm->seq_params->mib_size; + + // pre-set the delta lf for loop filter. 
Note that this value is set + // before mi is assigned for each block in current superblock + for (int j = 0; j < AOMMIN(mib_size, mi_params->mi_rows - mi_row); j++) { + for (int k = 0; k < AOMMIN(mib_size, mi_params->mi_cols - mi_col); k++) { + const int grid_idx = get_mi_grid_idx(mi_params, mi_row + j, mi_col + k); + mi_params->mi_alloc[grid_idx].delta_lf_from_base = delta_lf; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { + mi_params->mi_alloc[grid_idx].delta_lf[lf_id] = delta_lf; + } + } + } + } + + x->reuse_inter_pred = false; + x->txfm_search_params.mode_eval_type = DEFAULT_EVAL; + reset_mb_rd_record(x->txfm_search_info.mb_rd_record); + av1_zero(x->picked_ref_frames_mask); + av1_invalid_rd_stats(rd_cost); +} + +static int sb_qp_sweep(AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, TokenExtra **tp, int mi_row, + int mi_col, BLOCK_SIZE bsize, + SIMPLE_MOTION_DATA_TREE *sms_tree, + SB_FIRST_PASS_STATS *sb_org_stats) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &td->mb; + RD_STATS rdc_winner, cur_rdc; + av1_invalid_rd_stats(&rdc_winner); + + int best_qindex = td->mb.rdmult_delta_qindex; + const int start = cm->current_frame.frame_type == KEY_FRAME ? -20 : -12; + const int end = cm->current_frame.frame_type == KEY_FRAME ? 20 : 12; + const int step = cm->delta_q_info.delta_q_res; + + for (int sweep_qp_delta = start; sweep_qp_delta <= end; + sweep_qp_delta += step) { + sb_qp_sweep_init_quantizers(cpi, td, tile_data, sms_tree, &cur_rdc, mi_row, + mi_col, sweep_qp_delta); + + const int alloc_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col); + const int backup_current_qindex = + cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex; + + av1_reset_mbmi(&cm->mi_params, bsize, mi_row, mi_col); + av1_restore_sb_state(sb_org_stats, cpi, td, tile_data, mi_row, mi_col); + cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex = backup_current_qindex; + + td->pc_root = av1_alloc_pc_tree_node(bsize); + if (!td->pc_root) + aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PC_TREE"); + av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize, + &cur_rdc, cur_rdc, td->pc_root, sms_tree, NULL, + SB_DRY_PASS, NULL); + + if ((rdc_winner.rdcost > cur_rdc.rdcost) || + (abs(sweep_qp_delta) < abs(best_qindex - x->rdmult_delta_qindex) && + rdc_winner.rdcost == cur_rdc.rdcost)) { + rdc_winner = cur_rdc; + best_qindex = x->rdmult_delta_qindex + sweep_qp_delta; + } + } + + return best_qindex; +} +#endif //! CONFIG_REALTIME_ONLY + +/*!\brief Encode a superblock (RD-search-based) + * + * \ingroup partition_search + * Conducts partition search for a superblock, based on rate-distortion costs, + * from scratch or adjusting from a pre-calculated partition pattern. 
+ */ +static AOM_INLINE void encode_rd_sb(AV1_COMP *cpi, ThreadData *td, + TileDataEnc *tile_data, TokenExtra **tp, + const int mi_row, const int mi_col, + const int seg_skip) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const SPEED_FEATURES *const sf = &cpi->sf; + const TileInfo *const tile_info = &tile_data->tile_info; + MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + + get_mi_grid_idx(&cm->mi_params, mi_row, mi_col); + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; + const int num_planes = av1_num_planes(cm); + int dummy_rate; + int64_t dummy_dist; + RD_STATS dummy_rdc; + SIMPLE_MOTION_DATA_TREE *const sms_root = td->sms_root; + +#if CONFIG_REALTIME_ONLY + (void)seg_skip; +#endif // CONFIG_REALTIME_ONLY + + init_encode_rd_sb(cpi, td, tile_data, sms_root, &dummy_rdc, mi_row, mi_col, + 1); + + // Encode the superblock + if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION) { + // partition search starting from a variance-based partition + av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size); + av1_choose_var_based_partitioning(cpi, tile_info, td, x, mi_row, mi_col); + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, rd_use_partition_time); +#endif + td->pc_root = av1_alloc_pc_tree_node(sb_size); + if (!td->pc_root) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PC_TREE"); + av1_rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size, + &dummy_rate, &dummy_dist, 1, td->pc_root); + av1_free_pc_tree_recursive(td->pc_root, num_planes, 0, 0, + sf->part_sf.partition_search_type); + td->pc_root = NULL; +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, rd_use_partition_time); +#endif + } +#if !CONFIG_REALTIME_ONLY + else if (sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip) { + // partition search by adjusting a fixed-size partition + av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size); + const BLOCK_SIZE bsize = + seg_skip ? sb_size : sf->part_sf.fixed_partition_size; + av1_set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); + td->pc_root = av1_alloc_pc_tree_node(sb_size); + if (!td->pc_root) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PC_TREE"); + av1_rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size, + &dummy_rate, &dummy_dist, 1, td->pc_root); + av1_free_pc_tree_recursive(td->pc_root, num_planes, 0, 0, + sf->part_sf.partition_search_type); + td->pc_root = NULL; + } else { + // The most exhaustive recursive partition search + SuperBlockEnc *sb_enc = &x->sb_enc; + // No stats for overlay frames. Exclude key frame. + av1_get_tpl_stats_sb(cpi, sb_size, mi_row, mi_col, sb_enc); + + // Reset the tree for simple motion search data + av1_reset_simple_motion_tree_partition(sms_root, sb_size); + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, rd_pick_partition_time); +#endif + + // Estimate the maximum square partition block size, which will be used + // as the starting block size for partitioning the sb + set_max_min_partition_size(sb_enc, cpi, x, sf, sb_size, mi_row, mi_col); + + // The superblock can be searched only once, or twice consecutively for + // better quality. Note that the meaning of passes here is different from + // the general concept of 1-pass/2-pass encoders. + const int num_passes = + cpi->oxcf.unit_test_cfg.sb_multipass_unit_test ? 
2 : 1; + + if (cpi->oxcf.sb_qp_sweep && + !(has_no_stats_stage(cpi) && cpi->oxcf.mode == REALTIME && + cpi->oxcf.gf_cfg.lag_in_frames == 0) && + cm->delta_q_info.delta_q_present_flag) { + AOM_CHECK_MEM_ERROR( + x->e_mbd.error_info, td->mb.sb_stats_cache, + (SB_FIRST_PASS_STATS *)aom_malloc(sizeof(*td->mb.sb_stats_cache))); + av1_backup_sb_state(td->mb.sb_stats_cache, cpi, td, tile_data, mi_row, + mi_col); + assert(x->rdmult_delta_qindex == x->delta_qindex); + + const int best_qp_diff = + sb_qp_sweep(cpi, td, tile_data, tp, mi_row, mi_col, sb_size, sms_root, + td->mb.sb_stats_cache) - + x->rdmult_delta_qindex; + + sb_qp_sweep_init_quantizers(cpi, td, tile_data, sms_root, &dummy_rdc, + mi_row, mi_col, best_qp_diff); + + const int alloc_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col); + const int backup_current_qindex = + cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex; + + av1_reset_mbmi(&cm->mi_params, sb_size, mi_row, mi_col); + av1_restore_sb_state(td->mb.sb_stats_cache, cpi, td, tile_data, mi_row, + mi_col); + + cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex = + backup_current_qindex; + aom_free(td->mb.sb_stats_cache); + td->mb.sb_stats_cache = NULL; + } + if (num_passes == 1) { +#if CONFIG_PARTITION_SEARCH_ORDER + if (cpi->ext_part_controller.ready && !frame_is_intra_only(cm)) { + av1_reset_part_sf(&cpi->sf.part_sf); + av1_reset_sf_for_ext_part(cpi); + RD_STATS this_rdc; + av1_rd_partition_search(cpi, td, tile_data, tp, sms_root, mi_row, + mi_col, sb_size, &this_rdc); + } else { + td->pc_root = av1_alloc_pc_tree_node(sb_size); + if (!td->pc_root) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PC_TREE"); + av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size, + &dummy_rdc, dummy_rdc, td->pc_root, sms_root, + NULL, SB_SINGLE_PASS, NULL); + } +#else + td->pc_root = av1_alloc_pc_tree_node(sb_size); + if (!td->pc_root) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PC_TREE"); + av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size, + &dummy_rdc, dummy_rdc, td->pc_root, sms_root, NULL, + SB_SINGLE_PASS, NULL); +#endif // CONFIG_PARTITION_SEARCH_ORDER + } else { + // First pass + AOM_CHECK_MEM_ERROR( + x->e_mbd.error_info, td->mb.sb_fp_stats, + (SB_FIRST_PASS_STATS *)aom_malloc(sizeof(*td->mb.sb_fp_stats))); + av1_backup_sb_state(td->mb.sb_fp_stats, cpi, td, tile_data, mi_row, + mi_col); + td->pc_root = av1_alloc_pc_tree_node(sb_size); + if (!td->pc_root) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PC_TREE"); + av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size, + &dummy_rdc, dummy_rdc, td->pc_root, sms_root, NULL, + SB_DRY_PASS, NULL); + + // Second pass + init_encode_rd_sb(cpi, td, tile_data, sms_root, &dummy_rdc, mi_row, + mi_col, 0); + av1_reset_mbmi(&cm->mi_params, sb_size, mi_row, mi_col); + av1_reset_simple_motion_tree_partition(sms_root, sb_size); + + av1_restore_sb_state(td->mb.sb_fp_stats, cpi, td, tile_data, mi_row, + mi_col); + + td->pc_root = av1_alloc_pc_tree_node(sb_size); + if (!td->pc_root) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PC_TREE"); + av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size, + &dummy_rdc, dummy_rdc, td->pc_root, sms_root, NULL, + SB_WET_PASS, NULL); + aom_free(td->mb.sb_fp_stats); + td->mb.sb_fp_stats = NULL; + } + + // Reset to 0 so that it wouldn't be used elsewhere mistakenly. 
+ sb_enc->tpl_data_count = 0;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, rd_pick_partition_time);
+#endif
+ }
+#endif // !CONFIG_REALTIME_ONLY
+
+ // Update the inter rd model
+ // TODO(angiebird): Let inter_mode_rd_model_estimation support multi-tile.
+ if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1 &&
+ cm->tiles.cols == 1 && cm->tiles.rows == 1) {
+ av1_inter_mode_data_fit(tile_data, x->rdmult);
+ }
+}
+
+// Check if the cost update levels of the mode, coeff and dv symbols are
+// tile-level or off.
+static AOM_INLINE int is_mode_coeff_dv_upd_freq_tile_or_off(
+ const AV1_COMP *const cpi) {
+ const INTER_MODE_SPEED_FEATURES *const inter_sf = &cpi->sf.inter_sf;
+
+ return (inter_sf->coeff_cost_upd_level <= INTERNAL_COST_UPD_TILE &&
+ inter_sf->mode_cost_upd_level <= INTERNAL_COST_UPD_TILE &&
+ cpi->sf.intra_sf.dv_cost_upd_level <= INTERNAL_COST_UPD_TILE);
+}
+
+// When row-mt is enabled and the cost update frequencies are set to off/tile,
+// processing of the current SB can start even before processing of the
+// top-right SB is finished. This function checks if it is sufficient to wait
+// only for the top SB to finish before the current SB starts processing.
+static AOM_INLINE int delay_wait_for_top_right_sb(const AV1_COMP *const cpi) {
+ const MODE mode = cpi->oxcf.mode;
+ if (mode == GOOD) return 0;
+
+ if (mode == ALLINTRA)
+ return is_mode_coeff_dv_upd_freq_tile_or_off(cpi);
+ else if (mode == REALTIME)
+ return (is_mode_coeff_dv_upd_freq_tile_or_off(cpi) &&
+ cpi->sf.inter_sf.mv_cost_upd_level <= INTERNAL_COST_UPD_TILE);
+ else
+ return 0;
+}
+
+/*!\brief Calculate source SAD at superblock level using 64x64 block source SAD
+ *
+ * \ingroup partition_search
+ * \callgraph
+ * \callergraph
+ */
+static AOM_INLINE uint64_t get_sb_source_sad(const AV1_COMP *cpi, int mi_row,
+ int mi_col) {
+ if (cpi->src_sad_blk_64x64 == NULL) return UINT64_MAX;
+
+ const AV1_COMMON *const cm = &cpi->common;
+ const int blk_64x64_in_mis = (cm->seq_params->sb_size == BLOCK_128X128)
+ ?
(cm->seq_params->mib_size >> 1)
+ : cm->seq_params->mib_size;
+ const int num_blk_64x64_cols =
+ (cm->mi_params.mi_cols + blk_64x64_in_mis - 1) / blk_64x64_in_mis;
+ const int num_blk_64x64_rows =
+ (cm->mi_params.mi_rows + blk_64x64_in_mis - 1) / blk_64x64_in_mis;
+ const int blk_64x64_col_index = mi_col / blk_64x64_in_mis;
+ const int blk_64x64_row_index = mi_row / blk_64x64_in_mis;
+ uint64_t curr_sb_sad = UINT64_MAX;
+ const uint64_t *const src_sad_blk_64x64_data =
+ &cpi->src_sad_blk_64x64[blk_64x64_col_index +
+ blk_64x64_row_index * num_blk_64x64_cols];
+ if (cm->seq_params->sb_size == BLOCK_128X128 &&
+ blk_64x64_col_index + 1 < num_blk_64x64_cols &&
+ blk_64x64_row_index + 1 < num_blk_64x64_rows) {
+ // Calculate the SB source SAD by accumulating the source SAD of the four
+ // 64x64 blocks in the superblock
+ curr_sb_sad = src_sad_blk_64x64_data[0] + src_sad_blk_64x64_data[1] +
+ src_sad_blk_64x64_data[num_blk_64x64_cols] +
+ src_sad_blk_64x64_data[num_blk_64x64_cols + 1];
+ } else if (cm->seq_params->sb_size == BLOCK_64X64) {
+ curr_sb_sad = src_sad_blk_64x64_data[0];
+ }
+ return curr_sb_sad;
+}
+
+/*!\brief Determine whether content grading can be skipped based on SAD
+ * statistics
+ *
+ * \ingroup partition_search
+ * \callgraph
+ * \callergraph
+ */
+static AOM_INLINE bool is_calc_src_content_needed(AV1_COMP *cpi,
+ MACROBLOCK *const x,
+ int mi_row, int mi_col) {
+ if (cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1)
+ return true;
+ const uint64_t curr_sb_sad = get_sb_source_sad(cpi, mi_row, mi_col);
+ if (curr_sb_sad == UINT64_MAX) return true;
+ if (curr_sb_sad == 0) {
+ x->content_state_sb.source_sad_nonrd = kZeroSad;
+ return false;
+ }
+ AV1_COMMON *const cm = &cpi->common;
+ bool do_calc_src_content = true;
+
+ if (cpi->oxcf.speed < 9) return do_calc_src_content;
+
+ // TODO(yunqing): Tune/validate the thresholds for 128x128 SB size.
+ if (AOMMIN(cm->width, cm->height) < 360) {
+ // Derive the average 64x64 block source SAD from the SB source SAD
+ const uint64_t avg_64x64_blk_sad =
+ (cm->seq_params->sb_size == BLOCK_128X128) ? ((curr_sb_sad + 2) >> 2)
+ : curr_sb_sad;
+
+ // The thresholds are determined based on the kLowSad and kHighSad
+ // thresholds and on test results.
+ const uint64_t thresh_low = 15000;
+ const uint64_t thresh_high = 40000;
+
+ if (avg_64x64_blk_sad > thresh_low && avg_64x64_blk_sad < thresh_high) {
+ do_calc_src_content = false;
+ // Note: set x->content_state_sb.source_sad_rd as well if this is extended
+ // to RTC rd path.
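+ // When the SAD falls strictly between the two thresholds, the block is
+ // classified as medium temporal change right away and the full grading
+ // pass is skipped.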
+ x->content_state_sb.source_sad_nonrd = kMedSad;
+ }
+ }
+
+ return do_calc_src_content;
+}
+
+/*!\brief Determine whether content grading is needed based on speed features
+ * and frame statistics
+ *
+ * \ingroup partition_search
+ * \callgraph
+ * \callergraph
+ */
+// TODO(any): consolidate sfs to make interface cleaner
+static AOM_INLINE void grade_source_content_sb(AV1_COMP *cpi,
+ MACROBLOCK *const x,
+ TileDataEnc *tile_data,
+ int mi_row, int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ if (cm->current_frame.frame_type == KEY_FRAME ||
+ (cpi->ppi->use_svc &&
+ cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)) {
+ assert(x->content_state_sb.source_sad_nonrd == kMedSad);
+ assert(x->content_state_sb.source_sad_rd == kMedSad);
+ return;
+ }
+ bool calc_src_content = false;
+
+ if (cpi->sf.rt_sf.source_metrics_sb_nonrd) {
+ if (!cpi->sf.rt_sf.check_scene_detection || cpi->rc.frame_source_sad > 0) {
+ calc_src_content = is_calc_src_content_needed(cpi, x, mi_row, mi_col);
+ } else {
+ x->content_state_sb.source_sad_nonrd = kZeroSad;
+ }
+ } else if ((cpi->sf.rt_sf.var_part_based_on_qidx >= 1) &&
+ (cm->width * cm->height <= 352 * 288)) {
+ if (cpi->rc.frame_source_sad > 0)
+ calc_src_content = true;
+ else
+ x->content_state_sb.source_sad_rd = kZeroSad;
+ }
+ if (calc_src_content)
+ av1_source_content_sb(cpi, x, tile_data, mi_row, mi_col);
+}
+
+/*!\brief Encode a superblock row by breaking it into superblocks
+ *
+ * \ingroup partition_search
+ * \callgraph
+ * \callergraph
+ * Do partition and mode search for an sb row: one row of superblocks filling up
+ * the width of the current tile.
+ */
+static AOM_INLINE void encode_sb_row(AV1_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, int mi_row,
+ TokenExtra **tp) {
+ AV1_COMMON *const cm = &cpi->common;
+ const TileInfo *const tile_info = &tile_data->tile_info;
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+ AV1EncRowMultiThreadSync *const row_mt_sync = &tile_data->row_mt_sync;
+ bool row_mt_enabled = mt_info->row_mt_enabled;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_info);
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+ const int mib_size = cm->seq_params->mib_size;
+ const int mib_size_log2 = cm->seq_params->mib_size_log2;
+ const int sb_row = (mi_row - tile_info->mi_row_start) >> mib_size_log2;
+ const int use_nonrd_mode = cpi->sf.rt_sf.use_nonrd_pick_mode;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, encode_sb_row_time);
+#endif
+
+ // Initialize the left context for the new SB row
+ av1_zero_left_context(xd);
+
+ // Reset deltas for the quantizer and loop filters at the beginning of every
+ // tile
+ if (mi_row == tile_info->mi_row_start || row_mt_enabled) {
+ if (cm->delta_q_info.delta_q_present_flag)
+ xd->current_base_qindex = cm->quant_params.base_qindex;
+ if (cm->delta_q_info.delta_lf_present_flag) {
+ av1_reset_loop_filter_delta(xd, av1_num_planes(cm));
+ }
+ }
+
+ reset_thresh_freq_fact(x);
+
+ // Code each SB in the row
+ for (int mi_col = tile_info->mi_col_start, sb_col_in_tile = 0;
+ mi_col < tile_info->mi_col_end; mi_col += mib_size, sb_col_in_tile++) {
+ // In realtime/allintra mode and when the frequency of cost updates is
+ // off/tile, wait for the top superblock to finish encoding. Otherwise,
+ // wait for the top-right superblock to finish encoding.
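+ // When delay_wait_for_top_right_sb() returns 1, the column passed to
+ // sync_read below is shifted one SB to the left, so only the top
+ // superblock (instead of the top-right one) must have finished.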
+ enc_row_mt->sync_read_ptr( + row_mt_sync, sb_row, sb_col_in_tile - delay_wait_for_top_right_sb(cpi)); + +#if CONFIG_MULTITHREAD + if (row_mt_enabled) { + pthread_mutex_lock(enc_row_mt->mutex_); + const bool row_mt_exit = enc_row_mt->row_mt_exit; + pthread_mutex_unlock(enc_row_mt->mutex_); + // Exit in case any worker has encountered an error. + if (row_mt_exit) return; + } +#endif + + const int update_cdf = tile_data->allow_update_cdf && row_mt_enabled; + if (update_cdf && (tile_info->mi_row_start != mi_row)) { + if ((tile_info->mi_col_start == mi_col)) { + // restore frame context at the 1st column sb + memcpy(xd->tile_ctx, x->row_ctx, sizeof(*xd->tile_ctx)); + } else { + // update context + int wt_left = AVG_CDF_WEIGHT_LEFT; + int wt_tr = AVG_CDF_WEIGHT_TOP_RIGHT; + if (tile_info->mi_col_end > (mi_col + mib_size)) + av1_avg_cdf_symbols(xd->tile_ctx, x->row_ctx + sb_col_in_tile, + wt_left, wt_tr); + else + av1_avg_cdf_symbols(xd->tile_ctx, x->row_ctx + sb_col_in_tile - 1, + wt_left, wt_tr); + } + } + + // Update the rate cost tables for some symbols + av1_set_cost_upd_freq(cpi, td, tile_info, mi_row, mi_col); + + // Reset color coding related parameters + av1_zero(x->color_sensitivity_sb); + av1_zero(x->color_sensitivity_sb_g); + av1_zero(x->color_sensitivity_sb_alt); + av1_zero(x->color_sensitivity); + x->content_state_sb.source_sad_nonrd = kMedSad; + x->content_state_sb.source_sad_rd = kMedSad; + x->content_state_sb.lighting_change = 0; + x->content_state_sb.low_sumdiff = 0; + x->force_zeromv_skip_for_sb = 0; + x->sb_me_block = 0; + x->sb_me_partition = 0; + x->sb_me_mv.as_int = 0; + + if (cpi->oxcf.mode == ALLINTRA) { + x->intra_sb_rdmult_modifier = 128; + } + + xd->cur_frame_force_integer_mv = cm->features.cur_frame_force_integer_mv; + x->source_variance = UINT_MAX; + td->mb.cb_coef_buff = av1_get_cb_coeff_buffer(cpi, mi_row, mi_col); + + // Get segment id and skip flag + const struct segmentation *const seg = &cm->seg; + int seg_skip = 0; + if (seg->enabled) { + const uint8_t *const map = + seg->update_map ? cpi->enc_seg.map : cm->last_frame_seg_map; + const uint8_t segment_id = + map ? 
get_segment_id(&cm->mi_params, map, sb_size, mi_row, mi_col) + : 0; + seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP); + } + + produce_gradients_for_sb(cpi, x, sb_size, mi_row, mi_col); + + init_src_var_info_of_4x4_sub_blocks(cpi, x->src_var_info_of_4x4_sub_blocks, + sb_size); + + // Grade the temporal variation of the sb, the grade will be used to decide + // fast mode search strategy for coding blocks + grade_source_content_sb(cpi, x, tile_data, mi_row, mi_col); + + // encode the superblock + if (use_nonrd_mode) { + encode_nonrd_sb(cpi, td, tile_data, tp, mi_row, mi_col, seg_skip); + } else { + encode_rd_sb(cpi, td, tile_data, tp, mi_row, mi_col, seg_skip); + } + + // Update the top-right context in row_mt coding + if (update_cdf && (tile_info->mi_row_end > (mi_row + mib_size))) { + if (sb_cols_in_tile == 1) + memcpy(x->row_ctx, xd->tile_ctx, sizeof(*xd->tile_ctx)); + else if (sb_col_in_tile >= 1) + memcpy(x->row_ctx + sb_col_in_tile - 1, xd->tile_ctx, + sizeof(*xd->tile_ctx)); + } + enc_row_mt->sync_write_ptr(row_mt_sync, sb_row, sb_col_in_tile, + sb_cols_in_tile); + } + +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, encode_sb_row_time); +#endif +} + +static AOM_INLINE void init_encode_frame_mb_context(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + + // Copy data over into macro block data structures. + av1_setup_src_planes(x, cpi->source, 0, 0, num_planes, + cm->seq_params->sb_size); + + av1_setup_block_planes(xd, cm->seq_params->subsampling_x, + cm->seq_params->subsampling_y, num_planes); +} + +void av1_alloc_tile_data(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + + av1_row_mt_mem_dealloc(cpi); + + aom_free(cpi->tile_data); + cpi->allocated_tiles = 0; + enc_row_mt->allocated_tile_cols = 0; + enc_row_mt->allocated_tile_rows = 0; + + CHECK_MEM_ERROR( + cm, cpi->tile_data, + aom_memalign(32, tile_cols * tile_rows * sizeof(*cpi->tile_data))); + + cpi->allocated_tiles = tile_cols * tile_rows; + enc_row_mt->allocated_tile_cols = tile_cols; + enc_row_mt->allocated_tile_rows = tile_rows; + for (int tile_row = 0; tile_row < tile_rows; ++tile_row) { + for (int tile_col = 0; tile_col < tile_cols; ++tile_col) { + const int tile_index = tile_row * tile_cols + tile_col; + TileDataEnc *const this_tile = &cpi->tile_data[tile_index]; + av1_zero(this_tile->row_mt_sync); + this_tile->row_ctx = NULL; + } + } +} + +void av1_init_tile_data(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + int tile_col, tile_row; + TokenInfo *const token_info = &cpi->token_info; + TokenExtra *pre_tok = token_info->tile_tok[0][0]; + TokenList *tplist = token_info->tplist[0][0]; + unsigned int tile_tok = 0; + int tplist_count = 0; + + if (!is_stat_generation_stage(cpi) && + cm->features.allow_screen_content_tools) { + // Number of tokens for which token info needs to be allocated. + unsigned int tokens_required = + get_token_alloc(cm->mi_params.mb_rows, cm->mi_params.mb_cols, + MAX_SB_SIZE_LOG2, num_planes); + // Allocate/reallocate memory for token related info if the number of tokens + // required is more than the number of tokens already allocated. 
This could + // occur in case of the following: + // 1) If the memory is not yet allocated + // 2) If the frame dimensions have changed + const bool realloc_tokens = tokens_required > token_info->tokens_allocated; + if (realloc_tokens) { + free_token_info(token_info); + alloc_token_info(cm, token_info, tokens_required); + pre_tok = token_info->tile_tok[0][0]; + tplist = token_info->tplist[0][0]; + } + } + + for (tile_row = 0; tile_row < tile_rows; ++tile_row) { + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + TileDataEnc *const tile_data = + &cpi->tile_data[tile_row * tile_cols + tile_col]; + TileInfo *const tile_info = &tile_data->tile_info; + av1_tile_init(tile_info, cm, tile_row, tile_col); + tile_data->firstpass_top_mv = kZeroMv; + tile_data->abs_sum_level = 0; + + if (is_token_info_allocated(token_info)) { + token_info->tile_tok[tile_row][tile_col] = pre_tok + tile_tok; + pre_tok = token_info->tile_tok[tile_row][tile_col]; + tile_tok = allocated_tokens( + tile_info, cm->seq_params->mib_size_log2 + MI_SIZE_LOG2, + num_planes); + token_info->tplist[tile_row][tile_col] = tplist + tplist_count; + tplist = token_info->tplist[tile_row][tile_col]; + tplist_count = av1_get_sb_rows_in_tile(cm, tile_info); + } + tile_data->allow_update_cdf = !cm->tiles.large_scale; + tile_data->allow_update_cdf = tile_data->allow_update_cdf && + !cm->features.disable_cdf_update && + !delay_wait_for_top_right_sb(cpi); + tile_data->tctx = *cm->fc; + } + } +} + +// Populate the start palette token info prior to encoding an SB row. +static AOM_INLINE void get_token_start(AV1_COMP *cpi, const TileInfo *tile_info, + int tile_row, int tile_col, int mi_row, + TokenExtra **tp) { + const TokenInfo *token_info = &cpi->token_info; + if (!is_token_info_allocated(token_info)) return; + + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + TokenList *const tplist = cpi->token_info.tplist[tile_row][tile_col]; + const int sb_row_in_tile = + (mi_row - tile_info->mi_row_start) >> cm->seq_params->mib_size_log2; + + get_start_tok(cpi, tile_row, tile_col, mi_row, tp, + cm->seq_params->mib_size_log2 + MI_SIZE_LOG2, num_planes); + assert(tplist != NULL); + tplist[sb_row_in_tile].start = *tp; +} + +// Populate the token count after encoding an SB row. 
+static AOM_INLINE void populate_token_count(AV1_COMP *cpi, + const TileInfo *tile_info, + int tile_row, int tile_col, + int mi_row, TokenExtra *tok) { + const TokenInfo *token_info = &cpi->token_info; + if (!is_token_info_allocated(token_info)) return; + + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + TokenList *const tplist = token_info->tplist[tile_row][tile_col]; + const int sb_row_in_tile = + (mi_row - tile_info->mi_row_start) >> cm->seq_params->mib_size_log2; + const int tile_mb_cols = + (tile_info->mi_col_end - tile_info->mi_col_start + 2) >> 2; + const int num_mb_rows_in_sb = + ((1 << (cm->seq_params->mib_size_log2 + MI_SIZE_LOG2)) + 8) >> 4; + tplist[sb_row_in_tile].count = + (unsigned int)(tok - tplist[sb_row_in_tile].start); + + assert((unsigned int)(tok - tplist[sb_row_in_tile].start) <= + get_token_alloc(num_mb_rows_in_sb, tile_mb_cols, + cm->seq_params->mib_size_log2 + MI_SIZE_LOG2, + num_planes)); + + (void)num_planes; + (void)tile_mb_cols; + (void)num_mb_rows_in_sb; +} + +/*!\brief Encode a superblock row + * + * \ingroup partition_search + */ +void av1_encode_sb_row(AV1_COMP *cpi, ThreadData *td, int tile_row, + int tile_col, int mi_row) { + AV1_COMMON *const cm = &cpi->common; + const int tile_cols = cm->tiles.cols; + TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + const TileInfo *const tile_info = &this_tile->tile_info; + TokenExtra *tok = NULL; + + get_token_start(cpi, tile_info, tile_row, tile_col, mi_row, &tok); + + encode_sb_row(cpi, td, this_tile, mi_row, &tok); + + populate_token_count(cpi, tile_info, tile_row, tile_col, mi_row, tok); +} + +/*!\brief Encode a tile + * + * \ingroup partition_search + */ +void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row, + int tile_col) { + AV1_COMMON *const cm = &cpi->common; + TileDataEnc *const this_tile = + &cpi->tile_data[tile_row * cm->tiles.cols + tile_col]; + const TileInfo *const tile_info = &this_tile->tile_info; + + if (!cpi->sf.rt_sf.use_nonrd_pick_mode) av1_inter_mode_data_init(this_tile); + + av1_zero_above_context(cm, &td->mb.e_mbd, tile_info->mi_col_start, + tile_info->mi_col_end, tile_row); + av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row, + &td->mb.e_mbd); + + if (cpi->oxcf.intra_mode_cfg.enable_cfl_intra) + cfl_init(&td->mb.e_mbd.cfl, cm->seq_params); + + if (td->mb.txfm_search_info.mb_rd_record != NULL) { + av1_crc32c_calculator_init( + &td->mb.txfm_search_info.mb_rd_record->crc_calculator); + } + + for (int mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end; + mi_row += cm->seq_params->mib_size) { + av1_encode_sb_row(cpi, td, tile_row, tile_col, mi_row); + } + this_tile->abs_sum_level = td->abs_sum_level; +} + +/*!\brief Break one frame into tiles and encode the tiles + * + * \ingroup partition_search + * + * \param[in] cpi Top-level encoder structure + */ +static AOM_INLINE void encode_tiles(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + int tile_col, tile_row; + + MACROBLOCK *const mb = &cpi->td.mb; + assert(IMPLIES(cpi->tile_data == NULL, + cpi->allocated_tiles < tile_cols * tile_rows)); + if (cpi->allocated_tiles < tile_cols * tile_rows) av1_alloc_tile_data(cpi); + + av1_init_tile_data(cpi); + av1_alloc_mb_data(cpi, mb); + + for (tile_row = 0; tile_row < tile_rows; ++tile_row) { + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + TileDataEnc *const this_tile = + &cpi->tile_data[tile_row * 
cm->tiles.cols + tile_col]; + cpi->td.intrabc_used = 0; + cpi->td.deltaq_used = 0; + cpi->td.abs_sum_level = 0; + cpi->td.rd_counts.seg_tmp_pred_cost[0] = 0; + cpi->td.rd_counts.seg_tmp_pred_cost[1] = 0; + cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx; + cpi->td.mb.tile_pb_ctx = &this_tile->tctx; + av1_init_rtc_counters(&cpi->td.mb); + cpi->td.mb.palette_pixels = 0; + av1_encode_tile(cpi, &cpi->td, tile_row, tile_col); + if (!frame_is_intra_only(&cpi->common)) + av1_accumulate_rtc_counters(cpi, &cpi->td.mb); + cpi->palette_pixel_num += cpi->td.mb.palette_pixels; + cpi->intrabc_used |= cpi->td.intrabc_used; + cpi->deltaq_used |= cpi->td.deltaq_used; + } + } + + av1_dealloc_mb_data(mb, av1_num_planes(cm)); +} + +// Set the relative distance of a reference frame w.r.t. current frame +static AOM_INLINE void set_rel_frame_dist( + const AV1_COMMON *const cm, RefFrameDistanceInfo *const ref_frame_dist_info, + const int ref_frame_flags) { + MV_REFERENCE_FRAME ref_frame; + int min_past_dist = INT32_MAX, min_future_dist = INT32_MAX; + ref_frame_dist_info->nearest_past_ref = NONE_FRAME; + ref_frame_dist_info->nearest_future_ref = NONE_FRAME; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + ref_frame_dist_info->ref_relative_dist[ref_frame - LAST_FRAME] = 0; + if (ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) { + int dist = av1_encoder_get_relative_dist( + cm->cur_frame->ref_display_order_hint[ref_frame - LAST_FRAME], + cm->current_frame.display_order_hint); + ref_frame_dist_info->ref_relative_dist[ref_frame - LAST_FRAME] = dist; + // Get the nearest ref_frame in the past + if (abs(dist) < min_past_dist && dist < 0) { + ref_frame_dist_info->nearest_past_ref = ref_frame; + min_past_dist = abs(dist); + } + // Get the nearest ref_frame in the future + if (dist < min_future_dist && dist > 0) { + ref_frame_dist_info->nearest_future_ref = ref_frame; + min_future_dist = dist; + } + } + } +} + +static INLINE int refs_are_one_sided(const AV1_COMMON *cm) { + assert(!frame_is_intra_only(cm)); + + int one_sided_refs = 1; + const int cur_display_order_hint = cm->current_frame.display_order_hint; + for (int ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref); + if (buf == NULL) continue; + if (av1_encoder_get_relative_dist(buf->display_order_hint, + cur_display_order_hint) > 0) { + one_sided_refs = 0; // bwd reference + break; + } + } + return one_sided_refs; +} + +static INLINE void get_skip_mode_ref_offsets(const AV1_COMMON *cm, + int ref_order_hint[2]) { + const SkipModeInfo *const skip_mode_info = &cm->current_frame.skip_mode_info; + ref_order_hint[0] = ref_order_hint[1] = 0; + if (!skip_mode_info->skip_mode_allowed) return; + + const RefCntBuffer *const buf_0 = + get_ref_frame_buf(cm, LAST_FRAME + skip_mode_info->ref_frame_idx_0); + const RefCntBuffer *const buf_1 = + get_ref_frame_buf(cm, LAST_FRAME + skip_mode_info->ref_frame_idx_1); + assert(buf_0 != NULL && buf_1 != NULL); + + ref_order_hint[0] = buf_0->order_hint; + ref_order_hint[1] = buf_1->order_hint; +} + +static int check_skip_mode_enabled(AV1_COMP *const cpi) { + AV1_COMMON *const cm = &cpi->common; + + av1_setup_skip_mode_allowed(cm); + if (!cm->current_frame.skip_mode_info.skip_mode_allowed) return 0; + + // Turn off skip mode if the temporal distances of the reference pair to the + // current frame are different by more than 1 frame. 
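+ // For example, references one frame behind and one frame ahead give
+ // distances 1 and 1 (difference 0), so skip mode stays enabled; one frame
+ // behind and three ahead give |1 - 3| = 2, which disables it.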
+ const int cur_offset = (int)cm->current_frame.order_hint; + int ref_offset[2]; + get_skip_mode_ref_offsets(cm, ref_offset); + const int cur_to_ref0 = get_relative_dist(&cm->seq_params->order_hint_info, + cur_offset, ref_offset[0]); + const int cur_to_ref1 = abs(get_relative_dist( + &cm->seq_params->order_hint_info, cur_offset, ref_offset[1])); + if (abs(cur_to_ref0 - cur_to_ref1) > 1) return 0; + + // High Latency: Turn off skip mode if all refs are fwd. + if (cpi->all_one_sided_refs && cpi->oxcf.gf_cfg.lag_in_frames > 0) return 0; + + const int ref_frame[2] = { + cm->current_frame.skip_mode_info.ref_frame_idx_0 + LAST_FRAME, + cm->current_frame.skip_mode_info.ref_frame_idx_1 + LAST_FRAME + }; + if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame[0]]) || + !(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame[1]])) + return 0; + + return 1; +} + +static AOM_INLINE void set_default_interp_skip_flags( + const AV1_COMMON *cm, InterpSearchFlags *interp_search_flags) { + const int num_planes = av1_num_planes(cm); + interp_search_flags->default_interp_skip_flags = + (num_planes == 1) ? INTERP_SKIP_LUMA_EVAL_CHROMA + : INTERP_SKIP_LUMA_SKIP_CHROMA; +} + +static AOM_INLINE void setup_prune_ref_frame_mask(AV1_COMP *cpi) { + if ((!cpi->oxcf.ref_frm_cfg.enable_onesided_comp || + cpi->sf.inter_sf.disable_onesided_comp) && + cpi->all_one_sided_refs) { + // Disable all compound references + cpi->prune_ref_frame_mask = (1 << MODE_CTX_REF_FRAMES) - (1 << REF_FRAMES); + } else if (!cpi->sf.rt_sf.use_nonrd_pick_mode && + cpi->sf.inter_sf.selective_ref_frame >= 2) { + AV1_COMMON *const cm = &cpi->common; + const int cur_frame_display_order_hint = + cm->current_frame.display_order_hint; + unsigned int *ref_display_order_hint = + cm->cur_frame->ref_display_order_hint; + const int arf2_dist = av1_encoder_get_relative_dist( + ref_display_order_hint[ALTREF2_FRAME - LAST_FRAME], + cur_frame_display_order_hint); + const int bwd_dist = av1_encoder_get_relative_dist( + ref_display_order_hint[BWDREF_FRAME - LAST_FRAME], + cur_frame_display_order_hint); + + for (int ref_idx = REF_FRAMES; ref_idx < MODE_CTX_REF_FRAMES; ++ref_idx) { + MV_REFERENCE_FRAME rf[2]; + av1_set_ref_frame(rf, ref_idx); + if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[0]]) || + !(cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[1]])) { + continue; + } + + if (!cpi->all_one_sided_refs) { + int ref_dist[2]; + for (int i = 0; i < 2; ++i) { + ref_dist[i] = av1_encoder_get_relative_dist( + ref_display_order_hint[rf[i] - LAST_FRAME], + cur_frame_display_order_hint); + } + + // One-sided compound is used only when all reference frames are + // one-sided. + if ((ref_dist[0] > 0) == (ref_dist[1] > 0)) { + cpi->prune_ref_frame_mask |= 1 << ref_idx; + } + } + + if (cpi->sf.inter_sf.selective_ref_frame >= 4 && + (rf[0] == ALTREF2_FRAME || rf[1] == ALTREF2_FRAME) && + (cpi->ref_frame_flags & av1_ref_frame_flag_list[BWDREF_FRAME])) { + // Check if both ALTREF2_FRAME and BWDREF_FRAME are future references. 
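+ // (A positive value from av1_encoder_get_relative_dist() means that the
+ // reference lies after the current frame in display order.)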
+ if (arf2_dist > 0 && bwd_dist > 0 && bwd_dist <= arf2_dist) {
+ // Drop ALTREF2_FRAME as a reference if BWDREF_FRAME is a closer
+ // reference to the current frame than ALTREF2_FRAME
+ cpi->prune_ref_frame_mask |= 1 << ref_idx;
+ }
+ }
+ }
+ }
+}
+
+static int allow_deltaq_mode(AV1_COMP *cpi) {
+#if !CONFIG_REALTIME_ONLY
+ AV1_COMMON *const cm = &cpi->common;
+ BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+ int sbs_wide = mi_size_wide[sb_size];
+ int sbs_high = mi_size_high[sb_size];
+
+ int64_t delta_rdcost = 0;
+ for (int mi_row = 0; mi_row < cm->mi_params.mi_rows; mi_row += sbs_high) {
+ for (int mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += sbs_wide) {
+ int64_t this_delta_rdcost = 0;
+ av1_get_q_for_deltaq_objective(cpi, &cpi->td, &this_delta_rdcost, sb_size,
+ mi_row, mi_col);
+ delta_rdcost += this_delta_rdcost;
+ }
+ }
+ return delta_rdcost < 0;
+#else
+ (void)cpi;
+ return 1;
+#endif // !CONFIG_REALTIME_ONLY
+}
+
+#define FORCE_ZMV_SKIP_128X128_BLK_DIFF 10000
+#define FORCE_ZMV_SKIP_MAX_PER_PIXEL_DIFF 4
+
+// Populates the block level thresholds for the force zeromv-skip decision
+static void populate_thresh_to_force_zeromv_skip(AV1_COMP *cpi) {
+ if (cpi->sf.rt_sf.part_early_exit_zeromv == 0) return;
+
+ // The threshold for forcing the zeromv-skip decision is as below:
+ // For 128x128 blocks, the threshold is 10000 and the per pixel threshold is
+ // 0.6103. For 64x64 blocks, the threshold is 5000 and the per pixel
+ // threshold is 1.221, allowing slightly higher error for smaller blocks.
+ // The block level thresholds scale with the square root of the block area:
+ //   thresh(64x64) / thresh(128x128)
+ //       = sqrt(area(64x64) / area(128x128)) = sqrt(1/4) = 1/2,
+ // so the per pixel threshold doubles each time the block area is quartered
+ // (0.6103 -> 1.221).
+ // Thus, the per pixel thresholds for blocks of size 32x32, 16x16, ... can be
+ // chosen as 2.442, 4.884, .... As the per pixel error tends to be higher for
+ // small blocks, the per pixel threshold is clipped to 4.
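+ // For example, for BLOCK_64X64 (4096 pixels) the loop below computes
+ // 10000 * sqrt(4096 / 16384) + 0.5, which rounds to 5000; that is well
+ // below the per pixel cap of 4 * 4096 = 16384, so the cap only binds for
+ // the small block sizes.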
+ const unsigned int thresh_exit_128x128_part = FORCE_ZMV_SKIP_128X128_BLK_DIFF;
+ const int num_128x128_pix =
+ block_size_wide[BLOCK_128X128] * block_size_high[BLOCK_128X128];
+
+ for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES_ALL; bsize++) {
+ const int num_block_pix = block_size_wide[bsize] * block_size_high[bsize];
+
+ // Calculate the threshold for the zeromv-skip decision based on the area
+ // of the partition
+ unsigned int thresh_exit_part_blk =
+ (unsigned int)(thresh_exit_128x128_part *
+ sqrt((double)num_block_pix / num_128x128_pix) +
+ 0.5);
+ thresh_exit_part_blk = AOMMIN(
+ thresh_exit_part_blk,
+ (unsigned int)(FORCE_ZMV_SKIP_MAX_PER_PIXEL_DIFF * num_block_pix));
+ cpi->zeromv_skip_thresh_exit_part[bsize] = thresh_exit_part_blk;
+ }
+}
+
+static void free_block_hash_buffers(uint32_t *block_hash_values[2][2],
+ int8_t *is_block_same[2][3]) {
+ for (int k = 0; k < 2; ++k) {
+ for (int j = 0; j < 2; ++j) {
+ aom_free(block_hash_values[k][j]);
+ }
+
+ for (int j = 0; j < 3; ++j) {
+ aom_free(is_block_same[k][j]);
+ }
+ }
+}
+
+/*!\brief Encoder setup (only for the current frame), encoding, and
+ * reconstruction for a single frame
+ *
+ * \ingroup high_level_algo
+ */
+static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) {
+ ThreadData *const td = &cpi->td;
+ MACROBLOCK *const x = &td->mb;
+ AV1_COMMON *const cm = &cpi->common;
+ CommonModeInfoParams *const mi_params = &cm->mi_params;
+ FeatureFlags *const features = &cm->features;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ RD_COUNTS *const rdc = &cpi->td.rd_counts;
+#if CONFIG_FPMT_TEST
+ FrameProbInfo *const temp_frame_probs = &cpi->ppi->temp_frame_probs;
+ FrameProbInfo *const temp_frame_probs_simulation =
+ &cpi->ppi->temp_frame_probs_simulation;
+#endif
+ FrameProbInfo *const frame_probs = &cpi->ppi->frame_probs;
+ IntraBCHashInfo *const intrabc_hash_info = &x->intrabc_hash_info;
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const DELTAQ_MODE deltaq_mode = oxcf->q_cfg.deltaq_mode;
+ int i;
+
+ if (!cpi->sf.rt_sf.use_nonrd_pick_mode) {
+ mi_params->setup_mi(mi_params);
+ }
+
+ set_mi_offsets(mi_params, xd, 0, 0);
+
+ av1_zero(*td->counts);
+ av1_zero(rdc->tx_type_used);
+ av1_zero(rdc->obmc_used);
+ av1_zero(rdc->warped_used);
+ av1_zero(rdc->seg_tmp_pred_cost);
+
+ // Reset the flag.
+ cpi->intrabc_used = 0;
+ // Need to disable intrabc when superres is selected
+ if (av1_superres_scaled(cm)) {
+ features->allow_intrabc = 0;
+ }
+
+ features->allow_intrabc &= (oxcf->kf_cfg.enable_intrabc);
+
+ if (features->allow_warped_motion &&
+ cpi->sf.inter_sf.prune_warped_prob_thresh > 0) {
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ int warped_probability =
+#if CONFIG_FPMT_TEST
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE
+ ? temp_frame_probs->warped_probs[update_type]
+ :
+#endif // CONFIG_FPMT_TEST
+ frame_probs->warped_probs[update_type];
+ if (warped_probability < cpi->sf.inter_sf.prune_warped_prob_thresh)
+ features->allow_warped_motion = 0;
+ }
+
+ int hash_table_created = 0;
+ if (!is_stat_generation_stage(cpi) && av1_use_hash_me(cpi) &&
+ !cpi->sf.rt_sf.use_nonrd_pick_mode) {
+ // TODO(any): move this outside of the recoding loop to avoid recalculating
+ // the hash table.
+ // add to hash table + const int pic_width = cpi->source->y_crop_width; + const int pic_height = cpi->source->y_crop_height; + uint32_t *block_hash_values[2][2] = { { NULL } }; + int8_t *is_block_same[2][3] = { { NULL } }; + int k, j; + bool error = false; + + for (k = 0; k < 2 && !error; ++k) { + for (j = 0; j < 2; ++j) { + block_hash_values[k][j] = (uint32_t *)aom_malloc( + sizeof(*block_hash_values[0][0]) * pic_width * pic_height); + if (!block_hash_values[k][j]) { + error = true; + break; + } + } + + for (j = 0; j < 3 && !error; ++j) { + is_block_same[k][j] = (int8_t *)aom_malloc( + sizeof(*is_block_same[0][0]) * pic_width * pic_height); + if (!is_block_same[k][j]) error = true; + } + } + + av1_hash_table_init(intrabc_hash_info); + if (error || + !av1_hash_table_create(&intrabc_hash_info->intrabc_hash_table)) { + free_block_hash_buffers(block_hash_values, is_block_same); + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Error allocating intrabc_hash_table and buffers"); + } + hash_table_created = 1; + av1_generate_block_2x2_hash_value(intrabc_hash_info, cpi->source, + block_hash_values[0], is_block_same[0]); + // Hash data generated for screen contents is used for intraBC ME + const int min_alloc_size = block_size_wide[mi_params->mi_alloc_bsize]; + const int max_sb_size = + (1 << (cm->seq_params->mib_size_log2 + MI_SIZE_LOG2)); + int src_idx = 0; + for (int size = 4; size <= max_sb_size; size *= 2, src_idx = !src_idx) { + const int dst_idx = !src_idx; + av1_generate_block_hash_value( + intrabc_hash_info, cpi->source, size, block_hash_values[src_idx], + block_hash_values[dst_idx], is_block_same[src_idx], + is_block_same[dst_idx]); + if (size >= min_alloc_size) { + if (!av1_add_to_hash_map_by_row_with_precal_data( + &intrabc_hash_info->intrabc_hash_table, + block_hash_values[dst_idx], is_block_same[dst_idx][2], + pic_width, pic_height, size)) { + error = true; + break; + } + } + } + + free_block_hash_buffers(block_hash_values, is_block_same); + + if (error) { + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Error adding data to intrabc_hash_table"); + } + } + + const CommonQuantParams *quant_params = &cm->quant_params; + for (i = 0; i < MAX_SEGMENTS; ++i) { + const int qindex = + cm->seg.enabled ? 
av1_get_qindex(&cm->seg, i, quant_params->base_qindex) + : quant_params->base_qindex; + xd->lossless[i] = + qindex == 0 && quant_params->y_dc_delta_q == 0 && + quant_params->u_dc_delta_q == 0 && quant_params->u_ac_delta_q == 0 && + quant_params->v_dc_delta_q == 0 && quant_params->v_ac_delta_q == 0; + if (xd->lossless[i]) cpi->enc_seg.has_lossless_segment = 1; + xd->qindex[i] = qindex; + if (xd->lossless[i]) { + cpi->optimize_seg_arr[i] = NO_TRELLIS_OPT; + } else { + cpi->optimize_seg_arr[i] = cpi->sf.rd_sf.optimize_coefficients; + } + } + features->coded_lossless = is_coded_lossless(cm, xd); + features->all_lossless = features->coded_lossless && !av1_superres_scaled(cm); + + // Fix delta q resolution for the moment + + cm->delta_q_info.delta_q_res = 0; + if (cpi->use_ducky_encode) { + cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_DUCKY_ENCODE; + } else if (cpi->oxcf.q_cfg.aq_mode != CYCLIC_REFRESH_AQ) { + if (deltaq_mode == DELTA_Q_OBJECTIVE) + cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_OBJECTIVE; + else if (deltaq_mode == DELTA_Q_PERCEPTUAL) + cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL; + else if (deltaq_mode == DELTA_Q_PERCEPTUAL_AI) + cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL; + else if (deltaq_mode == DELTA_Q_USER_RATING_BASED) + cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL; + else if (deltaq_mode == DELTA_Q_HDR) + cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL; + // Set delta_q_present_flag before it is used for the first time + cm->delta_q_info.delta_lf_res = DEFAULT_DELTA_LF_RES; + cm->delta_q_info.delta_q_present_flag = deltaq_mode != NO_DELTA_Q; + + // Turn off cm->delta_q_info.delta_q_present_flag if objective delta_q + // is used for ineligible frames. That effectively will turn off row_mt + // usage. Note objective delta_q and tpl eligible frames are only altref + // frames currently. 
+ const GF_GROUP *gf_group = &cpi->ppi->gf_group; + if (cm->delta_q_info.delta_q_present_flag) { + if (deltaq_mode == DELTA_Q_OBJECTIVE && + gf_group->update_type[cpi->gf_frame_index] == LF_UPDATE) + cm->delta_q_info.delta_q_present_flag = 0; + + if (deltaq_mode == DELTA_Q_OBJECTIVE && + cm->delta_q_info.delta_q_present_flag) { + cm->delta_q_info.delta_q_present_flag &= allow_deltaq_mode(cpi); + } + } + + // Reset delta_q_used flag + cpi->deltaq_used = 0; + + cm->delta_q_info.delta_lf_present_flag = + cm->delta_q_info.delta_q_present_flag && + oxcf->tool_cfg.enable_deltalf_mode; + cm->delta_q_info.delta_lf_multi = DEFAULT_DELTA_LF_MULTI; + + // update delta_q_present_flag and delta_lf_present_flag based on + // base_qindex + cm->delta_q_info.delta_q_present_flag &= quant_params->base_qindex > 0; + cm->delta_q_info.delta_lf_present_flag &= quant_params->base_qindex > 0; + } else if (cpi->cyclic_refresh->apply_cyclic_refresh || + cpi->svc.number_temporal_layers == 1) { + cpi->cyclic_refresh->actual_num_seg1_blocks = 0; + cpi->cyclic_refresh->actual_num_seg2_blocks = 0; + } + cpi->rc.cnt_zeromv = 0; + + av1_frame_init_quantizer(cpi); + init_encode_frame_mb_context(cpi); + set_default_interp_skip_flags(cm, &cpi->interp_search_flags); + + if (cm->prev_frame && cm->prev_frame->seg.enabled) + cm->last_frame_seg_map = cm->prev_frame->seg_map; + else + cm->last_frame_seg_map = NULL; + if (features->allow_intrabc || features->coded_lossless) { + av1_set_default_ref_deltas(cm->lf.ref_deltas); + av1_set_default_mode_deltas(cm->lf.mode_deltas); + } else if (cm->prev_frame) { + memcpy(cm->lf.ref_deltas, cm->prev_frame->ref_deltas, REF_FRAMES); + memcpy(cm->lf.mode_deltas, cm->prev_frame->mode_deltas, MAX_MODE_LF_DELTAS); + } + memcpy(cm->cur_frame->ref_deltas, cm->lf.ref_deltas, REF_FRAMES); + memcpy(cm->cur_frame->mode_deltas, cm->lf.mode_deltas, MAX_MODE_LF_DELTAS); + + cpi->all_one_sided_refs = + frame_is_intra_only(cm) ? 0 : refs_are_one_sided(cm); + + cpi->prune_ref_frame_mask = 0; + // Figure out which ref frames can be skipped at frame level. + setup_prune_ref_frame_mask(cpi); + + x->txfm_search_info.txb_split_count = 0; +#if CONFIG_SPEED_STATS + x->txfm_search_info.tx_search_count = 0; +#endif // CONFIG_SPEED_STATS + +#if !CONFIG_REALTIME_ONLY +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, av1_compute_global_motion_time); +#endif + av1_compute_global_motion_facade(cpi); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, av1_compute_global_motion_time); +#endif +#endif // !CONFIG_REALTIME_ONLY + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, av1_setup_motion_field_time); +#endif + av1_calculate_ref_frame_side(cm); + if (features->allow_ref_frame_mvs) av1_setup_motion_field(cm); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, av1_setup_motion_field_time); +#endif + + cm->current_frame.skip_mode_info.skip_mode_flag = + check_skip_mode_enabled(cpi); + + // Initialization of skip mode cost depends on the value of + // 'skip_mode_flag'. This initialization happens in the function + // av1_fill_mode_rates(), which is in turn called in + // av1_initialize_rd_consts(). Thus, av1_initialize_rd_consts() + // has to be called after 'skip_mode_flag' is initialized. 
+ av1_initialize_rd_consts(cpi); + av1_set_sad_per_bit(cpi, &x->sadperbit, quant_params->base_qindex); + populate_thresh_to_force_zeromv_skip(cpi); + + enc_row_mt->sync_read_ptr = av1_row_mt_sync_read_dummy; + enc_row_mt->sync_write_ptr = av1_row_mt_sync_write_dummy; + mt_info->row_mt_enabled = 0; + mt_info->pack_bs_mt_enabled = AOMMIN(mt_info->num_mod_workers[MOD_PACK_BS], + cm->tiles.cols * cm->tiles.rows) > 1; + + if (oxcf->row_mt && (mt_info->num_workers > 1)) { + mt_info->row_mt_enabled = 1; + enc_row_mt->sync_read_ptr = av1_row_mt_sync_read; + enc_row_mt->sync_write_ptr = av1_row_mt_sync_write; + av1_encode_tiles_row_mt(cpi); + } else { + if (AOMMIN(mt_info->num_workers, cm->tiles.cols * cm->tiles.rows) > 1) { + av1_encode_tiles_mt(cpi); + } else { + // Preallocate the pc_tree for realtime coding to reduce the cost of + // memory allocation. + const int use_nonrd_mode = cpi->sf.rt_sf.use_nonrd_pick_mode; + if (use_nonrd_mode) { + td->pc_root = av1_alloc_pc_tree_node(cm->seq_params->sb_size); + if (!td->pc_root) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PC_TREE"); + } else { + td->pc_root = NULL; + } + + encode_tiles(cpi); + av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0, + cpi->sf.part_sf.partition_search_type); + td->pc_root = NULL; + } + } + + // If intrabc is allowed but never selected, reset the allow_intrabc flag. + if (features->allow_intrabc && !cpi->intrabc_used) { + features->allow_intrabc = 0; + } + if (features->allow_intrabc) { + cm->delta_q_info.delta_lf_present_flag = 0; + } + + if (cm->delta_q_info.delta_q_present_flag && cpi->deltaq_used == 0) { + cm->delta_q_info.delta_q_present_flag = 0; + } + + // Set the transform size appropriately before bitstream creation + const MODE_EVAL_TYPE eval_type = + cpi->sf.winner_mode_sf.enable_winner_mode_for_tx_size_srch + ? WINNER_MODE_EVAL + : DEFAULT_EVAL; + const TX_SIZE_SEARCH_METHOD tx_search_type = + cpi->winner_mode_params.tx_size_search_methods[eval_type]; + assert(oxcf->txfm_cfg.enable_tx64 || tx_search_type != USE_LARGESTALL); + features->tx_mode = select_tx_mode(cm, tx_search_type); + + // Retain the frame level probability update conditions for parallel frames. + // These conditions will be consumed during postencode stage to update the + // probability. 
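+ // (For non-parallel frames, the probabilities are instead updated in place
+ // below with a running average of the form p = (p_old + p_new) >> 1, with
+ // multi-symbol contexts renormalized to a fixed total.)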
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) { + cpi->do_update_frame_probs_txtype[cpi->num_frame_recode] = + cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats; + cpi->do_update_frame_probs_obmc[cpi->num_frame_recode] = + (cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 && + cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX); + cpi->do_update_frame_probs_warp[cpi->num_frame_recode] = + (features->allow_warped_motion && + cpi->sf.inter_sf.prune_warped_prob_thresh > 0); + cpi->do_update_frame_probs_interpfilter[cpi->num_frame_recode] = + (cm->current_frame.frame_type != KEY_FRAME && + cpi->sf.interp_sf.adaptive_interp_filter_search == 2 && + features->interp_filter == SWITCHABLE); + } + + if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats || + ((cpi->sf.tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh != + INT_MAX) && + (cpi->sf.tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh != 0))) { + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + for (i = 0; i < TX_SIZES_ALL; i++) { + int sum = 0; + int j; + int left = MAX_TX_TYPE_PROB; + + for (j = 0; j < TX_TYPES; j++) + sum += cpi->td.rd_counts.tx_type_used[i][j]; + + for (j = TX_TYPES - 1; j >= 0; j--) { + int update_txtype_frameprobs = 1; + const int new_prob = + sum ? MAX_TX_TYPE_PROB * cpi->td.rd_counts.tx_type_used[i][j] / sum + : (j ? 0 : MAX_TX_TYPE_PROB); +#if CONFIG_FPMT_TEST + if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) { + if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == + 0) { + int prob = + (temp_frame_probs_simulation->tx_type_probs[update_type][i][j] + + new_prob) >> + 1; + left -= prob; + if (j == 0) prob += left; + temp_frame_probs_simulation->tx_type_probs[update_type][i][j] = + prob; + // Copy temp_frame_probs_simulation to temp_frame_probs + for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES; + update_type_idx++) { + temp_frame_probs->tx_type_probs[update_type_idx][i][j] = + temp_frame_probs_simulation + ->tx_type_probs[update_type_idx][i][j]; + } + } + update_txtype_frameprobs = 0; + } +#endif // CONFIG_FPMT_TEST + // Track the frame probabilities of parallel encode frames to update + // during postencode stage. + if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) { + update_txtype_frameprobs = 0; + cpi->frame_new_probs[cpi->num_frame_recode] + .tx_type_probs[update_type][i][j] = new_prob; + } + if (update_txtype_frameprobs) { + int prob = + (frame_probs->tx_type_probs[update_type][i][j] + new_prob) >> 1; + left -= prob; + if (j == 0) prob += left; + frame_probs->tx_type_probs[update_type][i][j] = prob; + } + } + } + } + + if (cm->seg.enabled) { + cm->seg.temporal_update = 1; + if (rdc->seg_tmp_pred_cost[0] < rdc->seg_tmp_pred_cost[1]) + cm->seg.temporal_update = 0; + } + + if (cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 && + cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX) { + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + + for (i = 0; i < BLOCK_SIZES_ALL; i++) { + int sum = 0; + int update_obmc_frameprobs = 1; + for (int j = 0; j < 2; j++) sum += cpi->td.rd_counts.obmc_used[i][j]; + + const int new_prob = + sum ? 
128 * cpi->td.rd_counts.obmc_used[i][1] / sum : 0; +#if CONFIG_FPMT_TEST + if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) { + if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) { + temp_frame_probs_simulation->obmc_probs[update_type][i] = + (temp_frame_probs_simulation->obmc_probs[update_type][i] + + new_prob) >> + 1; + // Copy temp_frame_probs_simulation to temp_frame_probs + for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES; + update_type_idx++) { + temp_frame_probs->obmc_probs[update_type_idx][i] = + temp_frame_probs_simulation->obmc_probs[update_type_idx][i]; + } + } + update_obmc_frameprobs = 0; + } +#endif // CONFIG_FPMT_TEST + // Track the frame probabilities of parallel encode frames to update + // during postencode stage. + if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) { + update_obmc_frameprobs = 0; + cpi->frame_new_probs[cpi->num_frame_recode].obmc_probs[update_type][i] = + new_prob; + } + if (update_obmc_frameprobs) { + frame_probs->obmc_probs[update_type][i] = + (frame_probs->obmc_probs[update_type][i] + new_prob) >> 1; + } + } + } + + if (features->allow_warped_motion && + cpi->sf.inter_sf.prune_warped_prob_thresh > 0) { + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + int update_warp_frameprobs = 1; + int sum = 0; + for (i = 0; i < 2; i++) sum += cpi->td.rd_counts.warped_used[i]; + const int new_prob = sum ? 128 * cpi->td.rd_counts.warped_used[1] / sum : 0; +#if CONFIG_FPMT_TEST + if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) { + if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) { + temp_frame_probs_simulation->warped_probs[update_type] = + (temp_frame_probs_simulation->warped_probs[update_type] + + new_prob) >> + 1; + // Copy temp_frame_probs_simulation to temp_frame_probs + for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES; + update_type_idx++) { + temp_frame_probs->warped_probs[update_type_idx] = + temp_frame_probs_simulation->warped_probs[update_type_idx]; + } + } + update_warp_frameprobs = 0; + } +#endif // CONFIG_FPMT_TEST + // Track the frame probabilities of parallel encode frames to update + // during postencode stage. + if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) { + update_warp_frameprobs = 0; + cpi->frame_new_probs[cpi->num_frame_recode].warped_probs[update_type] = + new_prob; + } + if (update_warp_frameprobs) { + frame_probs->warped_probs[update_type] = + (frame_probs->warped_probs[update_type] + new_prob) >> 1; + } + } + + if (cm->current_frame.frame_type != KEY_FRAME && + cpi->sf.interp_sf.adaptive_interp_filter_search == 2 && + features->interp_filter == SWITCHABLE) { + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { + int sum = 0; + int j; + int left = 1536; + + for (j = 0; j < SWITCHABLE_FILTERS; j++) { + sum += cpi->td.counts->switchable_interp[i][j]; + } + + for (j = SWITCHABLE_FILTERS - 1; j >= 0; j--) { + int update_interpfilter_frameprobs = 1; + const int new_prob = + sum ? 1536 * cpi->td.counts->switchable_interp[i][j] / sum + : (j ? 
0 : 1536); +#if CONFIG_FPMT_TEST + if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) { + if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == + 0) { + int prob = (temp_frame_probs_simulation + ->switchable_interp_probs[update_type][i][j] + + new_prob) >> + 1; + left -= prob; + if (j == 0) prob += left; + temp_frame_probs_simulation + ->switchable_interp_probs[update_type][i][j] = prob; + // Copy temp_frame_probs_simulation to temp_frame_probs + for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES; + update_type_idx++) { + temp_frame_probs->switchable_interp_probs[update_type_idx][i][j] = + temp_frame_probs_simulation + ->switchable_interp_probs[update_type_idx][i][j]; + } + } + update_interpfilter_frameprobs = 0; + } +#endif // CONFIG_FPMT_TEST + // Track the frame probabilities of parallel encode frames to update + // during postencode stage. + if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) { + update_interpfilter_frameprobs = 0; + cpi->frame_new_probs[cpi->num_frame_recode] + .switchable_interp_probs[update_type][i][j] = new_prob; + } + if (update_interpfilter_frameprobs) { + int prob = (frame_probs->switchable_interp_probs[update_type][i][j] + + new_prob) >> + 1; + left -= prob; + if (j == 0) prob += left; + frame_probs->switchable_interp_probs[update_type][i][j] = prob; + } + } + } + } + if (hash_table_created) { + av1_hash_table_destroy(&intrabc_hash_info->intrabc_hash_table); + } +} + +/*!\brief Setup reference frame buffers and encode a frame + * + * \ingroup high_level_algo + * \callgraph + * \callergraph + * + * \param[in] cpi Top-level encoder structure + */ +void av1_encode_frame(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + CurrentFrame *const current_frame = &cm->current_frame; + FeatureFlags *const features = &cm->features; + RD_COUNTS *const rdc = &cpi->td.rd_counts; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + // Indicates whether or not to use a default reduced set for ext-tx + // rather than the potential full set of 16 transforms + features->reduced_tx_set_used = oxcf->txfm_cfg.reduced_tx_type_set; + + // Make sure segment_id is no larger than last_active_segid. + if (cm->seg.enabled && cm->seg.update_map) { + const int mi_rows = cm->mi_params.mi_rows; + const int mi_cols = cm->mi_params.mi_cols; + const int last_active_segid = cm->seg.last_active_segid; + uint8_t *map = cpi->enc_seg.map; + for (int mi_row = 0; mi_row < mi_rows; ++mi_row) { + for (int mi_col = 0; mi_col < mi_cols; ++mi_col) { + map[mi_col] = AOMMIN(map[mi_col], last_active_segid); + } + map += mi_cols; + } + } + + av1_setup_frame_buf_refs(cm); + enforce_max_ref_frames(cpi, &cpi->ref_frame_flags, + cm->cur_frame->ref_display_order_hint, + cm->current_frame.display_order_hint); + set_rel_frame_dist(&cpi->common, &cpi->ref_frame_dist_info, + cpi->ref_frame_flags); + av1_setup_frame_sign_bias(cm); + + // If global motion is enabled, then every buffer which is used as either + // a source or a ref frame should have an image pyramid allocated. 
+  // Check here so that issues can be caught early in debug mode
+#if !defined(NDEBUG) && !CONFIG_REALTIME_ONLY
+  if (cpi->image_pyramid_levels > 0) {
+    assert(cpi->source->y_pyramid);
+    for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+      const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+      if (buf != NULL) {
+        assert(buf->buf.y_pyramid);
+      }
+    }
+  }
+#endif  // !defined(NDEBUG) && !CONFIG_REALTIME_ONLY
+
+#if CONFIG_MISMATCH_DEBUG
+  mismatch_reset_frame(av1_num_planes(cm));
+#endif
+
+  rdc->newmv_or_intra_blocks = 0;
+  cpi->palette_pixel_num = 0;
+
+  if (cpi->sf.hl_sf.frame_parameter_update ||
+      cpi->sf.rt_sf.use_comp_ref_nonrd) {
+    if (frame_is_intra_only(cm))
+      current_frame->reference_mode = SINGLE_REFERENCE;
+    else
+      current_frame->reference_mode = REFERENCE_MODE_SELECT;
+
+    features->interp_filter = SWITCHABLE;
+    if (cm->tiles.large_scale) features->interp_filter = EIGHTTAP_REGULAR;
+
+    features->switchable_motion_mode = is_switchable_motion_mode_allowed(
+        features->allow_warped_motion, oxcf->motion_mode_cfg.enable_obmc);
+
+    rdc->compound_ref_used_flag = 0;
+    rdc->skip_mode_used_flag = 0;
+
+    encode_frame_internal(cpi);
+
+    if (current_frame->reference_mode == REFERENCE_MODE_SELECT) {
+      // Use a flag that includes 4x4 blocks
+      if (rdc->compound_ref_used_flag == 0) {
+        current_frame->reference_mode = SINGLE_REFERENCE;
+#if CONFIG_ENTROPY_STATS
+        av1_zero(cpi->td.counts->comp_inter);
+#endif  // CONFIG_ENTROPY_STATS
+      }
+    }
+    // Re-check the skip mode status, as the reference mode may have been
+    // changed.
+    SkipModeInfo *const skip_mode_info = &current_frame->skip_mode_info;
+    if (frame_is_intra_only(cm) ||
+        current_frame->reference_mode == SINGLE_REFERENCE) {
+      skip_mode_info->skip_mode_allowed = 0;
+      skip_mode_info->skip_mode_flag = 0;
+    }
+    if (skip_mode_info->skip_mode_flag && rdc->skip_mode_used_flag == 0)
+      skip_mode_info->skip_mode_flag = 0;
+
+    if (!cm->tiles.large_scale) {
+      if (features->tx_mode == TX_MODE_SELECT &&
+          cpi->td.mb.txfm_search_info.txb_split_count == 0)
+        features->tx_mode = TX_MODE_LARGEST;
+    }
+  } else {
+    // This is needed if the real-time speed setting is changed on the fly
+    // from one using compound prediction to one using single reference.
+    if (current_frame->reference_mode == REFERENCE_MODE_SELECT)
+      current_frame->reference_mode = SINGLE_REFERENCE;
+    encode_frame_internal(cpi);
+  }
+}
diff --git a/third_party/aom/av1/encoder/encodeframe.h b/third_party/aom/av1/encoder/encodeframe.h
new file mode 100644
index 0000000000..ce32fb47e6
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodeframe.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#ifndef AOM_AV1_ENCODER_ENCODEFRAME_H_ +#define AOM_AV1_ENCODER_ENCODEFRAME_H_ + +#include "aom/aom_integer.h" +#include "av1/common/blockd.h" +#include "av1/common/enums.h" + +#include "av1/encoder/global_motion.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define DELTA_Q_PERCEPTUAL_MODULATION \ + 1 // 0: variance based + // 1: wavelet AC energy based + +struct macroblock; +struct yv12_buffer_config; +struct AV1_COMP; +struct ThreadData; + +void av1_init_rtc_counters(struct macroblock *const x); + +void av1_accumulate_rtc_counters(struct AV1_COMP *cpi, + const struct macroblock *const x); + +void av1_setup_src_planes(struct macroblock *x, + const struct yv12_buffer_config *src, int mi_row, + int mi_col, const int num_planes, BLOCK_SIZE bsize); + +void av1_encode_frame(struct AV1_COMP *cpi); + +void av1_alloc_tile_data(struct AV1_COMP *cpi); +void av1_init_tile_data(struct AV1_COMP *cpi); +void av1_encode_tile(struct AV1_COMP *cpi, struct ThreadData *td, int tile_row, + int tile_col); +void av1_encode_sb_row(struct AV1_COMP *cpi, struct ThreadData *td, + int tile_row, int tile_col, int mi_row); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_ENCODEFRAME_H_ diff --git a/third_party/aom/av1/encoder/encodeframe_utils.c b/third_party/aom/av1/encoder/encodeframe_utils.c new file mode 100644 index 0000000000..949837184a --- /dev/null +++ b/third_party/aom/av1/encoder/encodeframe_utils.c @@ -0,0 +1,1775 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/common_data.h" +#include "av1/common/quant_common.h" +#include "av1/common/reconintra.h" + +#include "av1/encoder/encoder.h" +#include "av1/encoder/encodeframe_utils.h" +#include "av1/encoder/rdopt.h" + +void av1_set_ssim_rdmult(const AV1_COMP *const cpi, int *errorperbit, + const BLOCK_SIZE bsize, const int mi_row, + const int mi_col, int *const rdmult) { + const AV1_COMMON *const cm = &cpi->common; + + const BLOCK_SIZE bsize_base = BLOCK_16X16; + const int num_mi_w = mi_size_wide[bsize_base]; + const int num_mi_h = mi_size_high[bsize_base]; + const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w; + const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h; + const int num_bcols = (mi_size_wide[bsize] + num_mi_w - 1) / num_mi_w; + const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h; + int row, col; + double num_of_mi = 0.0; + double geom_mean_of_scale = 1.0; + + // To avoid overflow of 'geom_mean_of_scale', bsize_base must be at least + // BLOCK_8X8. + // + // For bsize=BLOCK_128X128 and bsize_base=BLOCK_8X8, the loop below would + // iterate 256 times. Considering the maximum value of + // cpi->ssim_rdmult_scaling_factors (see av1_set_mb_ssim_rdmult_scaling()), + // geom_mean_of_scale can go up to 4.8323^256, which is within DBL_MAX + // (maximum value a double data type can hold). If bsize_base is modified to + // BLOCK_4X4 (minimum possible block size), geom_mean_of_scale can go up + // to 4.8323^1024 and exceed DBL_MAX, resulting in data overflow. 
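+  // Illustrative sanity check of the bound above (not part of the encode
+  // logic): log10(4.8323) is about 0.684, so 4.8323^256 is roughly 10^175,
+  // comfortably below DBL_MAX (~1.8 * 10^308), whereas 4.8323^1024 is
+  // roughly 10^700 and would overflow.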
+ assert(bsize_base >= BLOCK_8X8); + assert(cpi->oxcf.tune_cfg.tuning == AOM_TUNE_SSIM); + + for (row = mi_row / num_mi_w; + row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) { + for (col = mi_col / num_mi_h; + col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) { + const int index = row * num_cols + col; + assert(cpi->ssim_rdmult_scaling_factors[index] != 0.0); + geom_mean_of_scale *= cpi->ssim_rdmult_scaling_factors[index]; + num_of_mi += 1.0; + } + } + geom_mean_of_scale = pow(geom_mean_of_scale, (1.0 / num_of_mi)); + + *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale + 0.5); + *rdmult = AOMMAX(*rdmult, 0); + av1_set_error_per_bit(errorperbit, *rdmult); +} + +#if CONFIG_SALIENCY_MAP +void av1_set_saliency_map_vmaf_rdmult(const AV1_COMP *const cpi, + int *errorperbit, const BLOCK_SIZE bsize, + const int mi_row, const int mi_col, + int *const rdmult) { + const AV1_COMMON *const cm = &cpi->common; + const int num_mi_w = mi_size_wide[bsize]; + const int num_mi_h = mi_size_high[bsize]; + const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w; + + *rdmult = + (int)(*rdmult * cpi->sm_scaling_factor[(mi_row / num_mi_h) * num_cols + + (mi_col / num_mi_w)]); + + *rdmult = AOMMAX(*rdmult, 0); + av1_set_error_per_bit(errorperbit, *rdmult); +} +#endif + +// TODO(angiebird): Move these function to tpl_model.c +#if !CONFIG_REALTIME_ONLY +// Return the end column for the current superblock, in unit of TPL blocks. +static int get_superblock_tpl_column_end(const AV1_COMMON *const cm, int mi_col, + int num_mi_w) { + // Find the start column of this superblock. + const int sb_mi_col_start = (mi_col >> cm->seq_params->mib_size_log2) + << cm->seq_params->mib_size_log2; + // Same but in superres upscaled dimension. + const int sb_mi_col_start_sr = + coded_to_superres_mi(sb_mi_col_start, cm->superres_scale_denominator); + // Width of this superblock in mi units. + const int sb_mi_width = mi_size_wide[cm->seq_params->sb_size]; + // Same but in superres upscaled dimension. + const int sb_mi_width_sr = + coded_to_superres_mi(sb_mi_width, cm->superres_scale_denominator); + // Superblock end in mi units. + const int sb_mi_end = sb_mi_col_start_sr + sb_mi_width_sr; + // Superblock end in TPL units. 
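+  // The division below rounds up; e.g. sb_mi_end = 35 with num_mi_w = 4
+  // (16x16 TPL blocks) gives (35 + 3) / 4 = 9 TPL columns.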
+ return (sb_mi_end + num_mi_w - 1) / num_mi_w; +} + +int av1_get_cb_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, + const BLOCK_SIZE bsize, const int mi_row, + const int mi_col) { + const AV1_COMMON *const cm = &cpi->common; + assert(IMPLIES(cpi->ppi->gf_group.size > 0, + cpi->gf_frame_index < cpi->ppi->gf_group.size)); + const int tpl_idx = cpi->gf_frame_index; + int deltaq_rdmult = set_rdmult(cpi, x, -1); + if (!av1_tpl_stats_ready(&cpi->ppi->tpl_data, tpl_idx)) return deltaq_rdmult; + if (cm->superres_scale_denominator != SCALE_NUMERATOR) return deltaq_rdmult; + if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return deltaq_rdmult; + if (x->rb == 0) return deltaq_rdmult; + + TplParams *const tpl_data = &cpi->ppi->tpl_data; + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + + const int mi_wide = mi_size_wide[bsize]; + const int mi_high = mi_size_high[bsize]; + + int tpl_stride = tpl_frame->stride; + double intra_cost_base = 0; + double mc_dep_cost_base = 0; + double cbcmp_base = 0; + const int step = 1 << tpl_data->tpl_stats_block_mis_log2; + + for (int row = mi_row; row < mi_row + mi_high; row += step) { + for (int col = mi_col; col < mi_col + mi_wide; col += step) { + if (row >= cm->mi_params.mi_rows || col >= cm->mi_params.mi_cols) + continue; + + TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos( + row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)]; + + double cbcmp = (double)this_stats->srcrf_dist; + int64_t mc_dep_delta = + RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate, + this_stats->mc_dep_dist); + double dist_scaled = (double)(this_stats->recrf_dist << RDDIV_BITS); + intra_cost_base += log(dist_scaled) * cbcmp; + mc_dep_cost_base += log(3 * dist_scaled + mc_dep_delta) * cbcmp; + cbcmp_base += cbcmp; + } + } + + if (cbcmp_base == 0) return deltaq_rdmult; + + double rk = exp((intra_cost_base - mc_dep_cost_base) / cbcmp_base); + deltaq_rdmult = (int)(deltaq_rdmult * (rk / x->rb)); + + return AOMMAX(deltaq_rdmult, 1); +} + +int av1_get_hier_tpl_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, + const BLOCK_SIZE bsize, const int mi_row, + const int mi_col, int orig_rdmult) { + const AV1_COMMON *const cm = &cpi->common; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + assert(IMPLIES(cpi->ppi->gf_group.size > 0, + cpi->gf_frame_index < cpi->ppi->gf_group.size)); + const int tpl_idx = cpi->gf_frame_index; + const int deltaq_rdmult = set_rdmult(cpi, x, -1); + if (!av1_tpl_stats_ready(&cpi->ppi->tpl_data, tpl_idx)) return deltaq_rdmult; + if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) + return deltaq_rdmult; + if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return deltaq_rdmult; + + const int mi_col_sr = + coded_to_superres_mi(mi_col, cm->superres_scale_denominator); + const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); + const int block_mi_width_sr = + coded_to_superres_mi(mi_size_wide[bsize], cm->superres_scale_denominator); + + const BLOCK_SIZE bsize_base = BLOCK_16X16; + const int num_mi_w = mi_size_wide[bsize_base]; + const int num_mi_h = mi_size_high[bsize_base]; + const int num_cols = (mi_cols_sr + num_mi_w - 1) / num_mi_w; + const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h; + const int num_bcols = (block_mi_width_sr + num_mi_w - 1) / num_mi_w; + const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h; + // This is required because the end col of superblock may be off by 1 in case + // of superres. 
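+  // Illustrative example (assuming the usual SCALE_NUMERATOR of 8): with a
+  // superres denominator of 9, a 64-mi-wide superblock maps to roughly
+  // 64 * 9 / 8 = 72 upscaled mi, which need not be a multiple of num_mi_w,
+  // hence the explicit clamp to sb_bcol_end in the loop below.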
+ const int sb_bcol_end = get_superblock_tpl_column_end(cm, mi_col, num_mi_w); + int row, col; + double base_block_count = 0.0; + double geom_mean_of_scale = 0.0; + for (row = mi_row / num_mi_w; + row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) { + for (col = mi_col_sr / num_mi_h; + col < num_cols && col < mi_col_sr / num_mi_h + num_bcols && + col < sb_bcol_end; + ++col) { + const int index = row * num_cols + col; + geom_mean_of_scale += log(cpi->ppi->tpl_sb_rdmult_scaling_factors[index]); + base_block_count += 1.0; + } + } + geom_mean_of_scale = exp(geom_mean_of_scale / base_block_count); + int rdmult = (int)((double)orig_rdmult * geom_mean_of_scale + 0.5); + rdmult = AOMMAX(rdmult, 0); + av1_set_error_per_bit(&x->errorperbit, rdmult); +#if !CONFIG_RD_COMMAND + if (bsize == cm->seq_params->sb_size) { + const int rdmult_sb = set_rdmult(cpi, x, -1); + assert(rdmult_sb == rdmult); + (void)rdmult_sb; + } +#endif // !CONFIG_RD_COMMAND + return rdmult; +} +#endif // !CONFIG_REALTIME_ONLY + +static AOM_INLINE void update_filter_type_count(FRAME_COUNTS *counts, + const MACROBLOCKD *xd, + const MB_MODE_INFO *mbmi) { + int dir; + for (dir = 0; dir < 2; ++dir) { + const int ctx = av1_get_pred_context_switchable_interp(xd, dir); + InterpFilter filter = av1_extract_interp_filter(mbmi->interp_filters, dir); + + // Only allow the 3 valid SWITCHABLE_FILTERS. + assert(filter < SWITCHABLE_FILTERS); + ++counts->switchable_interp[ctx][filter]; + } +} + +// This function will copy the best reference mode information from +// MB_MODE_INFO_EXT_FRAME to MB_MODE_INFO_EXT. +static INLINE void copy_mbmi_ext_frame_to_mbmi_ext( + MB_MODE_INFO_EXT *mbmi_ext, + const MB_MODE_INFO_EXT_FRAME *const mbmi_ext_best, uint8_t ref_frame_type) { + memcpy(mbmi_ext->ref_mv_stack[ref_frame_type], mbmi_ext_best->ref_mv_stack, + sizeof(mbmi_ext->ref_mv_stack[USABLE_REF_MV_STACK_SIZE])); + memcpy(mbmi_ext->weight[ref_frame_type], mbmi_ext_best->weight, + sizeof(mbmi_ext->weight[USABLE_REF_MV_STACK_SIZE])); + mbmi_ext->mode_context[ref_frame_type] = mbmi_ext_best->mode_context; + mbmi_ext->ref_mv_count[ref_frame_type] = mbmi_ext_best->ref_mv_count; + memcpy(mbmi_ext->global_mvs, mbmi_ext_best->global_mvs, + sizeof(mbmi_ext->global_mvs)); +} + +void av1_update_state(const AV1_COMP *const cpi, ThreadData *td, + const PICK_MODE_CONTEXT *const ctx, int mi_row, + int mi_col, BLOCK_SIZE bsize, RUN_TYPE dry_run) { + int i, x_idx, y; + const AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int num_planes = av1_num_planes(cm); + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = x->plane; + struct macroblockd_plane *const pd = xd->plane; + const MB_MODE_INFO *const mi = &ctx->mic; + MB_MODE_INFO *const mi_addr = xd->mi[0]; + const struct segmentation *const seg = &cm->seg; + assert(bsize < BLOCK_SIZES_ALL); + const int bw = mi_size_wide[mi->bsize]; + const int bh = mi_size_high[mi->bsize]; + const int mis = mi_params->mi_stride; + const int mi_width = mi_size_wide[bsize]; + const int mi_height = mi_size_high[bsize]; + TxfmSearchInfo *txfm_info = &x->txfm_search_info; + + assert(mi->bsize == bsize); + + *mi_addr = *mi; + copy_mbmi_ext_frame_to_mbmi_ext(&x->mbmi_ext, &ctx->mbmi_ext_best, + av1_ref_frame_type(ctx->mic.ref_frame)); + + memcpy(txfm_info->blk_skip, ctx->blk_skip, + sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk); + + txfm_info->skip_txfm = ctx->rd_stats.skip_txfm; + + xd->tx_type_map = ctx->tx_type_map; 
+ xd->tx_type_map_stride = mi_size_wide[bsize]; + // If not dry_run, copy the transform type data into the frame level buffer. + // Encoder will fetch tx types when writing bitstream. + if (!dry_run) { + const int grid_idx = get_mi_grid_idx(mi_params, mi_row, mi_col); + uint8_t *const tx_type_map = mi_params->tx_type_map + grid_idx; + const int mi_stride = mi_params->mi_stride; + for (int blk_row = 0; blk_row < bh; ++blk_row) { + av1_copy_array(tx_type_map + blk_row * mi_stride, + xd->tx_type_map + blk_row * xd->tx_type_map_stride, bw); + } + xd->tx_type_map = tx_type_map; + xd->tx_type_map_stride = mi_stride; + } + + // If segmentation in use + if (seg->enabled) { + // For in frame complexity AQ copy the segment id from the segment map. + if (cpi->oxcf.q_cfg.aq_mode == COMPLEXITY_AQ) { + const uint8_t *const map = + seg->update_map ? cpi->enc_seg.map : cm->last_frame_seg_map; + mi_addr->segment_id = + map ? get_segment_id(mi_params, map, bsize, mi_row, mi_col) : 0; + } + // Else for cyclic refresh mode update the segment map, set the segment id + // and then update the quantizer. + if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && + !cpi->rc.rtc_external_ratectrl) { + av1_cyclic_refresh_update_segment(cpi, x, mi_row, mi_col, bsize, + ctx->rd_stats.rate, ctx->rd_stats.dist, + txfm_info->skip_txfm, dry_run); + } + if (mi_addr->uv_mode == UV_CFL_PRED && !is_cfl_allowed(xd)) + mi_addr->uv_mode = UV_DC_PRED; + + if (!dry_run && !mi_addr->skip_txfm) { + int cdf_num; + const uint8_t spatial_pred = av1_get_spatial_seg_pred( + cm, xd, &cdf_num, cpi->cyclic_refresh->skip_over4x4); + const uint8_t coded_id = av1_neg_interleave( + mi_addr->segment_id, spatial_pred, seg->last_active_segid + 1); + int64_t spatial_cost = x->mode_costs.spatial_pred_cost[cdf_num][coded_id]; + td->rd_counts.seg_tmp_pred_cost[0] += spatial_cost; + + const int pred_segment_id = + cm->last_frame_seg_map + ? get_segment_id(mi_params, cm->last_frame_seg_map, bsize, mi_row, + mi_col) + : 0; + const int use_tmp_pred = pred_segment_id == mi_addr->segment_id; + const uint8_t tmp_pred_ctx = av1_get_pred_context_seg_id(xd); + td->rd_counts.seg_tmp_pred_cost[1] += + x->mode_costs.tmp_pred_cost[tmp_pred_ctx][use_tmp_pred]; + if (!use_tmp_pred) { + td->rd_counts.seg_tmp_pred_cost[1] += spatial_cost; + } + } + } + + // Count zero motion vector. + if (!dry_run && !frame_is_intra_only(cm)) { + const MV mv = mi->mv[0].as_mv; + if (is_inter_block(mi) && mi->ref_frame[0] == LAST_FRAME && + abs(mv.row) < 8 && abs(mv.col) < 8) { + const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, bh); + // Accumulate low_content_frame. 
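+      // Illustrative: mvs are stored in 1/8-pel units, so the |mv| < 8 test
+      // above selects blocks that moved by less than one full pixel. The
+      // loop below steps mi_y by 2 and adds bw << 1 per step, accumulating
+      // roughly bw * ymis 4x4 units, i.e. the block area in mi units.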
+      for (int mi_y = 0; mi_y < ymis; mi_y += 2) x->cnt_zeromv += bw << 1;
+    }
+  }
+
+  for (i = 0; i < num_planes; ++i) {
+    p[i].coeff = ctx->coeff[i];
+    p[i].qcoeff = ctx->qcoeff[i];
+    p[i].dqcoeff = ctx->dqcoeff[i];
+    p[i].eobs = ctx->eobs[i];
+    p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+  }
+  for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
+  // Restore the coding context of the MB to the one that was in place
+  // when the mode was picked for it.
+
+  const int cols =
+      AOMMIN((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width, mi_width);
+  const int rows = AOMMIN(
+      (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height, mi_height);
+  for (y = 0; y < rows; y++) {
+    for (x_idx = 0; x_idx < cols; x_idx++) xd->mi[x_idx + y * mis] = mi_addr;
+  }
+
+  if (cpi->oxcf.q_cfg.aq_mode)
+    av1_init_plane_quantizers(cpi, x, mi_addr->segment_id, 0);
+
+  if (dry_run) return;
+
+#if CONFIG_INTERNAL_STATS
+  {
+    unsigned int *const mode_chosen_counts =
+        (unsigned int *)cpi->mode_chosen_counts;  // Cast const away.
+    if (frame_is_intra_only(cm)) {
+      static const int kf_mode_index[] = {
+        THR_DC /*DC_PRED*/,
+        THR_V_PRED /*V_PRED*/,
+        THR_H_PRED /*H_PRED*/,
+        THR_D45_PRED /*D45_PRED*/,
+        THR_D135_PRED /*D135_PRED*/,
+        THR_D113_PRED /*D113_PRED*/,
+        THR_D157_PRED /*D157_PRED*/,
+        THR_D203_PRED /*D203_PRED*/,
+        THR_D67_PRED /*D67_PRED*/,
+        THR_SMOOTH, /*SMOOTH_PRED*/
+        THR_SMOOTH_V, /*SMOOTH_V_PRED*/
+        THR_SMOOTH_H, /*SMOOTH_H_PRED*/
+        THR_PAETH /*PAETH_PRED*/,
+      };
+      ++mode_chosen_counts[kf_mode_index[mi_addr->mode]];
+    } else {
+      // Note how often each mode is chosen as best.
+      ++mode_chosen_counts[ctx->best_mode_index];
+    }
+  }
+#endif
+  if (!frame_is_intra_only(cm)) {
+    if (is_inter_block(mi) && cm->features.interp_filter == SWITCHABLE) {
+      // When the frame interp filter is SWITCHABLE, several cases that always
+      // use the default type (EIGHTTAP_REGULAR) are described in
+      // av1_is_interp_needed(). Here, we should keep the counts for all
+      // applicable blocks, so the frame filter resetting decision in
+      // fix_interp_filter() is made correctly.
+ update_filter_type_count(td->counts, xd, mi_addr); + } + } + + const int x_mis = AOMMIN(bw, mi_params->mi_cols - mi_col); + const int y_mis = AOMMIN(bh, mi_params->mi_rows - mi_row); + if (cm->seq_params->order_hint_info.enable_ref_frame_mvs) + av1_copy_frame_mvs(cm, mi, mi_row, mi_col, x_mis, y_mis); +} + +void av1_update_inter_mode_stats(FRAME_CONTEXT *fc, FRAME_COUNTS *counts, + PREDICTION_MODE mode, int16_t mode_context) { + (void)counts; + + int16_t mode_ctx = mode_context & NEWMV_CTX_MASK; + if (mode == NEWMV) { +#if CONFIG_ENTROPY_STATS + ++counts->newmv_mode[mode_ctx][0]; +#endif + update_cdf(fc->newmv_cdf[mode_ctx], 0, 2); + return; + } + +#if CONFIG_ENTROPY_STATS + ++counts->newmv_mode[mode_ctx][1]; +#endif + update_cdf(fc->newmv_cdf[mode_ctx], 1, 2); + + mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; + if (mode == GLOBALMV) { +#if CONFIG_ENTROPY_STATS + ++counts->zeromv_mode[mode_ctx][0]; +#endif + update_cdf(fc->zeromv_cdf[mode_ctx], 0, 2); + return; + } + +#if CONFIG_ENTROPY_STATS + ++counts->zeromv_mode[mode_ctx][1]; +#endif + update_cdf(fc->zeromv_cdf[mode_ctx], 1, 2); + + mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK; +#if CONFIG_ENTROPY_STATS + ++counts->refmv_mode[mode_ctx][mode != NEARESTMV]; +#endif + update_cdf(fc->refmv_cdf[mode_ctx], mode != NEARESTMV, 2); +} + +static void update_palette_cdf(MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi, + FRAME_COUNTS *counts) { + FRAME_CONTEXT *fc = xd->tile_ctx; + const BLOCK_SIZE bsize = mbmi->bsize; + const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const int palette_bsize_ctx = av1_get_palette_bsize_ctx(bsize); + + (void)counts; + + if (mbmi->mode == DC_PRED) { + const int n = pmi->palette_size[0]; + const int palette_mode_ctx = av1_get_palette_mode_ctx(xd); + +#if CONFIG_ENTROPY_STATS + ++counts->palette_y_mode[palette_bsize_ctx][palette_mode_ctx][n > 0]; +#endif + update_cdf(fc->palette_y_mode_cdf[palette_bsize_ctx][palette_mode_ctx], + n > 0, 2); + if (n > 0) { +#if CONFIG_ENTROPY_STATS + ++counts->palette_y_size[palette_bsize_ctx][n - PALETTE_MIN_SIZE]; +#endif + update_cdf(fc->palette_y_size_cdf[palette_bsize_ctx], + n - PALETTE_MIN_SIZE, PALETTE_SIZES); + } + } + + if (mbmi->uv_mode == UV_DC_PRED) { + const int n = pmi->palette_size[1]; + const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0); + +#if CONFIG_ENTROPY_STATS + ++counts->palette_uv_mode[palette_uv_mode_ctx][n > 0]; +#endif + update_cdf(fc->palette_uv_mode_cdf[palette_uv_mode_ctx], n > 0, 2); + + if (n > 0) { +#if CONFIG_ENTROPY_STATS + ++counts->palette_uv_size[palette_bsize_ctx][n - PALETTE_MIN_SIZE]; +#endif + update_cdf(fc->palette_uv_size_cdf[palette_bsize_ctx], + n - PALETTE_MIN_SIZE, PALETTE_SIZES); + } + } +} + +void av1_sum_intra_stats(const AV1_COMMON *const cm, FRAME_COUNTS *counts, + MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi, + const MB_MODE_INFO *above_mi, + const MB_MODE_INFO *left_mi, const int intraonly) { + FRAME_CONTEXT *fc = xd->tile_ctx; + const PREDICTION_MODE y_mode = mbmi->mode; + (void)counts; + const BLOCK_SIZE bsize = mbmi->bsize; + + if (intraonly) { +#if CONFIG_ENTROPY_STATS + const PREDICTION_MODE above = av1_above_block_mode(above_mi); + const PREDICTION_MODE left = av1_left_block_mode(left_mi); + const int above_ctx = intra_mode_context[above]; + const int left_ctx = intra_mode_context[left]; + ++counts->kf_y_mode[above_ctx][left_ctx][y_mode]; +#endif // CONFIG_ENTROPY_STATS + update_cdf(get_y_mode_cdf(fc, above_mi, left_mi), y_mode, INTRA_MODES); + } else { +#if 
CONFIG_ENTROPY_STATS + ++counts->y_mode[size_group_lookup[bsize]][y_mode]; +#endif // CONFIG_ENTROPY_STATS + update_cdf(fc->y_mode_cdf[size_group_lookup[bsize]], y_mode, INTRA_MODES); + } + + if (av1_filter_intra_allowed(cm, mbmi)) { + const int use_filter_intra_mode = + mbmi->filter_intra_mode_info.use_filter_intra; +#if CONFIG_ENTROPY_STATS + ++counts->filter_intra[mbmi->bsize][use_filter_intra_mode]; + if (use_filter_intra_mode) { + ++counts + ->filter_intra_mode[mbmi->filter_intra_mode_info.filter_intra_mode]; + } +#endif // CONFIG_ENTROPY_STATS + update_cdf(fc->filter_intra_cdfs[mbmi->bsize], use_filter_intra_mode, 2); + if (use_filter_intra_mode) { + update_cdf(fc->filter_intra_mode_cdf, + mbmi->filter_intra_mode_info.filter_intra_mode, + FILTER_INTRA_MODES); + } + } + if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) { +#if CONFIG_ENTROPY_STATS + ++counts->angle_delta[mbmi->mode - V_PRED] + [mbmi->angle_delta[PLANE_TYPE_Y] + MAX_ANGLE_DELTA]; +#endif + update_cdf(fc->angle_delta_cdf[mbmi->mode - V_PRED], + mbmi->angle_delta[PLANE_TYPE_Y] + MAX_ANGLE_DELTA, + 2 * MAX_ANGLE_DELTA + 1); + } + + if (!xd->is_chroma_ref) return; + + const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode; + const CFL_ALLOWED_TYPE cfl_allowed = is_cfl_allowed(xd); +#if CONFIG_ENTROPY_STATS + ++counts->uv_mode[cfl_allowed][y_mode][uv_mode]; +#endif // CONFIG_ENTROPY_STATS + update_cdf(fc->uv_mode_cdf[cfl_allowed][y_mode], uv_mode, + UV_INTRA_MODES - !cfl_allowed); + if (uv_mode == UV_CFL_PRED) { + const int8_t joint_sign = mbmi->cfl_alpha_signs; + const uint8_t idx = mbmi->cfl_alpha_idx; + +#if CONFIG_ENTROPY_STATS + ++counts->cfl_sign[joint_sign]; +#endif + update_cdf(fc->cfl_sign_cdf, joint_sign, CFL_JOINT_SIGNS); + if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) { + aom_cdf_prob *cdf_u = fc->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)]; + +#if CONFIG_ENTROPY_STATS + ++counts->cfl_alpha[CFL_CONTEXT_U(joint_sign)][CFL_IDX_U(idx)]; +#endif + update_cdf(cdf_u, CFL_IDX_U(idx), CFL_ALPHABET_SIZE); + } + if (CFL_SIGN_V(joint_sign) != CFL_SIGN_ZERO) { + aom_cdf_prob *cdf_v = fc->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)]; + +#if CONFIG_ENTROPY_STATS + ++counts->cfl_alpha[CFL_CONTEXT_V(joint_sign)][CFL_IDX_V(idx)]; +#endif + update_cdf(cdf_v, CFL_IDX_V(idx), CFL_ALPHABET_SIZE); + } + } + const PREDICTION_MODE intra_mode = get_uv_mode(uv_mode); + if (av1_is_directional_mode(intra_mode) && av1_use_angle_delta(bsize)) { +#if CONFIG_ENTROPY_STATS + ++counts->angle_delta[intra_mode - V_PRED] + [mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA]; +#endif + update_cdf(fc->angle_delta_cdf[intra_mode - V_PRED], + mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA, + 2 * MAX_ANGLE_DELTA + 1); + } + if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) { + update_palette_cdf(xd, mbmi, counts); + } +} + +void av1_restore_context(MACROBLOCK *x, const RD_SEARCH_MACROBLOCK_CONTEXT *ctx, + int mi_row, int mi_col, BLOCK_SIZE bsize, + const int num_planes) { + MACROBLOCKD *xd = &x->e_mbd; + int p; + const int num_4x4_blocks_wide = mi_size_wide[bsize]; + const int num_4x4_blocks_high = mi_size_high[bsize]; + int mi_width = mi_size_wide[bsize]; + int mi_height = mi_size_high[bsize]; + for (p = 0; p < num_planes; p++) { + int tx_col = mi_col; + int tx_row = mi_row & MAX_MIB_MASK; + memcpy( + xd->above_entropy_context[p] + (tx_col >> xd->plane[p].subsampling_x), + ctx->a + num_4x4_blocks_wide * p, + (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >> + xd->plane[p].subsampling_x); + 
memcpy(xd->left_entropy_context[p] + (tx_row >> xd->plane[p].subsampling_y), + ctx->l + num_4x4_blocks_high * p, + (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >> + xd->plane[p].subsampling_y); + } + memcpy(xd->above_partition_context + mi_col, ctx->sa, + sizeof(*xd->above_partition_context) * mi_width); + memcpy(xd->left_partition_context + (mi_row & MAX_MIB_MASK), ctx->sl, + sizeof(xd->left_partition_context[0]) * mi_height); + xd->above_txfm_context = ctx->p_ta; + xd->left_txfm_context = ctx->p_tl; + memcpy(xd->above_txfm_context, ctx->ta, + sizeof(*xd->above_txfm_context) * mi_width); + memcpy(xd->left_txfm_context, ctx->tl, + sizeof(*xd->left_txfm_context) * mi_height); +} + +void av1_save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx, + int mi_row, int mi_col, BLOCK_SIZE bsize, + const int num_planes) { + const MACROBLOCKD *xd = &x->e_mbd; + int p; + int mi_width = mi_size_wide[bsize]; + int mi_height = mi_size_high[bsize]; + + // buffer the above/left context information of the block in search. + for (p = 0; p < num_planes; ++p) { + int tx_col = mi_col; + int tx_row = mi_row & MAX_MIB_MASK; + memcpy( + ctx->a + mi_width * p, + xd->above_entropy_context[p] + (tx_col >> xd->plane[p].subsampling_x), + (sizeof(ENTROPY_CONTEXT) * mi_width) >> xd->plane[p].subsampling_x); + memcpy(ctx->l + mi_height * p, + xd->left_entropy_context[p] + (tx_row >> xd->plane[p].subsampling_y), + (sizeof(ENTROPY_CONTEXT) * mi_height) >> xd->plane[p].subsampling_y); + } + memcpy(ctx->sa, xd->above_partition_context + mi_col, + sizeof(*xd->above_partition_context) * mi_width); + memcpy(ctx->sl, xd->left_partition_context + (mi_row & MAX_MIB_MASK), + sizeof(xd->left_partition_context[0]) * mi_height); + memcpy(ctx->ta, xd->above_txfm_context, + sizeof(*xd->above_txfm_context) * mi_width); + memcpy(ctx->tl, xd->left_txfm_context, + sizeof(*xd->left_txfm_context) * mi_height); + ctx->p_ta = xd->above_txfm_context; + ctx->p_tl = xd->left_txfm_context; +} + +static void set_partial_sb_partition(const AV1_COMMON *const cm, + MB_MODE_INFO *mi, int bh_in, int bw_in, + int mi_rows_remaining, + int mi_cols_remaining, BLOCK_SIZE bsize, + MB_MODE_INFO **mib) { + int bh = bh_in; + int r, c; + for (r = 0; r < cm->seq_params->mib_size; r += bh) { + int bw = bw_in; + for (c = 0; c < cm->seq_params->mib_size; c += bw) { + const int grid_index = get_mi_grid_idx(&cm->mi_params, r, c); + const int mi_index = get_alloc_mi_idx(&cm->mi_params, r, c); + mib[grid_index] = mi + mi_index; + mib[grid_index]->bsize = find_partition_size( + bsize, mi_rows_remaining - r, mi_cols_remaining - c, &bh, &bw); + } + } +} + +// This function attempts to set all mode info entries in a given superblock +// to the same block partition size. +// However, at the bottom and right borders of the image the requested size +// may not be allowed in which case this code attempts to choose the largest +// allowable partition. 
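+// For example, if only 8 mi rows (32 luma pixels) remain at the bottom of
+// the frame, a requested 64x64 partition would be shrunk by
+// find_partition_size() (e.g. to 32x32 blocks) so that every block stays
+// inside the image.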
+void av1_set_fixed_partitioning(AV1_COMP *cpi, const TileInfo *const tile, + MB_MODE_INFO **mib, int mi_row, int mi_col, + BLOCK_SIZE bsize) { + AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int mi_rows_remaining = tile->mi_row_end - mi_row; + const int mi_cols_remaining = tile->mi_col_end - mi_col; + MB_MODE_INFO *const mi_upper_left = + mi_params->mi_alloc + get_alloc_mi_idx(mi_params, mi_row, mi_col); + int bh = mi_size_high[bsize]; + int bw = mi_size_wide[bsize]; + + assert(bsize >= mi_params->mi_alloc_bsize && + "Attempted to use bsize < mi_params->mi_alloc_bsize"); + assert((mi_rows_remaining > 0) && (mi_cols_remaining > 0)); + + // Apply the requested partition size to the SB if it is all "in image" + if ((mi_cols_remaining >= cm->seq_params->mib_size) && + (mi_rows_remaining >= cm->seq_params->mib_size)) { + for (int block_row = 0; block_row < cm->seq_params->mib_size; + block_row += bh) { + for (int block_col = 0; block_col < cm->seq_params->mib_size; + block_col += bw) { + const int grid_index = get_mi_grid_idx(mi_params, block_row, block_col); + const int mi_index = get_alloc_mi_idx(mi_params, block_row, block_col); + mib[grid_index] = mi_upper_left + mi_index; + mib[grid_index]->bsize = bsize; + } + } + } else { + // Else this is a partial SB. + set_partial_sb_partition(cm, mi_upper_left, bh, bw, mi_rows_remaining, + mi_cols_remaining, bsize, mib); + } +} + +int av1_is_leaf_split_partition(AV1_COMMON *cm, int mi_row, int mi_col, + BLOCK_SIZE bsize) { + const int bs = mi_size_wide[bsize]; + const int hbs = bs / 2; + assert(bsize >= BLOCK_8X8); + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + + for (int i = 0; i < 4; i++) { + int x_idx = (i & 1) * hbs; + int y_idx = (i >> 1) * hbs; + if ((mi_row + y_idx >= cm->mi_params.mi_rows) || + (mi_col + x_idx >= cm->mi_params.mi_cols)) + return 0; + if (get_partition(cm, mi_row + y_idx, mi_col + x_idx, subsize) != + PARTITION_NONE && + subsize != BLOCK_8X8) + return 0; + } + return 1; +} + +#if !CONFIG_REALTIME_ONLY +int av1_get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col, int orig_rdmult) { + AV1_COMMON *const cm = &cpi->common; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + assert(IMPLIES(cpi->ppi->gf_group.size > 0, + cpi->gf_frame_index < cpi->ppi->gf_group.size)); + const int tpl_idx = cpi->gf_frame_index; + TplParams *const tpl_data = &cpi->ppi->tpl_data; + const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2; + int64_t intra_cost = 0; + int64_t mc_dep_cost = 0; + const int mi_wide = mi_size_wide[bsize]; + const int mi_high = mi_size_high[bsize]; + + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + int tpl_stride = tpl_frame->stride; + + if (!av1_tpl_stats_ready(&cpi->ppi->tpl_data, cpi->gf_frame_index)) { + return orig_rdmult; + } + if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) { + return orig_rdmult; + } + +#ifndef NDEBUG + int mi_count = 0; +#endif + const int mi_col_sr = + coded_to_superres_mi(mi_col, cm->superres_scale_denominator); + const int mi_col_end_sr = + coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator); + const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); + const int step = 1 << block_mis_log2; + const int row_step = step; + const int col_step_sr = + coded_to_superres_mi(step, cm->superres_scale_denominator); + for (int row = mi_row; row < mi_row + mi_high; row += 
row_step) {
+    for (int col = mi_col_sr; col < mi_col_end_sr; col += col_step_sr) {
+      if (row >= cm->mi_params.mi_rows || col >= mi_cols_sr) continue;
+      TplDepStats *this_stats =
+          &tpl_stats[av1_tpl_ptr_pos(row, col, tpl_stride, block_mis_log2)];
+      int64_t mc_dep_delta =
+          RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+                 this_stats->mc_dep_dist);
+      intra_cost += this_stats->recrf_dist << RDDIV_BITS;
+      mc_dep_cost += (this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta;
+#ifndef NDEBUG
+      mi_count++;
+#endif
+    }
+  }
+  assert(mi_count <= MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB);
+
+  double beta = 1.0;
+  if (mc_dep_cost > 0 && intra_cost > 0) {
+    const double r0 = cpi->rd.r0;
+    const double rk = (double)intra_cost / mc_dep_cost;
+    beta = (r0 / rk);
+  }
+
+  int rdmult = av1_get_adaptive_rdmult(cpi, beta);
+
+  rdmult = AOMMIN(rdmult, orig_rdmult * 3 / 2);
+  rdmult = AOMMAX(rdmult, orig_rdmult * 1 / 2);
+
+  rdmult = AOMMAX(1, rdmult);
+
+  return rdmult;
+}
+
+// Checks to see if a super block is on a horizontal image edge.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+int av1_active_h_edge(const AV1_COMP *cpi, int mi_row, int mi_step) {
+  int top_edge = 0;
+  int bottom_edge = cpi->common.mi_params.mi_rows;
+  int is_active_h_edge = 0;
+
+  // For two pass, account for any formatting bars detected.
+  if (is_stat_consumption_stage_twopass(cpi)) {
+    const AV1_COMMON *const cm = &cpi->common;
+    const FIRSTPASS_STATS *const this_frame_stats = read_one_frame_stats(
+        &cpi->ppi->twopass, cm->current_frame.display_order_hint);
+    if (this_frame_stats == NULL) return AOM_CODEC_ERROR;
+
+    // The inactive region is specified in MBs not mi units.
+    // The image edge is in the following MB row.
+    top_edge += (int)(this_frame_stats->inactive_zone_rows * 4);
+
+    bottom_edge -= (int)(this_frame_stats->inactive_zone_rows * 4);
+    bottom_edge = AOMMAX(top_edge, bottom_edge);
+  }
+
+  if (((top_edge >= mi_row) && (top_edge < (mi_row + mi_step))) ||
+      ((bottom_edge >= mi_row) && (bottom_edge < (mi_row + mi_step)))) {
+    is_active_h_edge = 1;
+  }
+  return is_active_h_edge;
+}
+
+// Checks to see if a super block is on a vertical image edge.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+int av1_active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step) {
+  int left_edge = 0;
+  int right_edge = cpi->common.mi_params.mi_cols;
+  int is_active_v_edge = 0;
+
+  // For two pass, account for any formatting bars detected.
+  if (is_stat_consumption_stage_twopass(cpi)) {
+    const AV1_COMMON *const cm = &cpi->common;
+    const FIRSTPASS_STATS *const this_frame_stats = read_one_frame_stats(
+        &cpi->ppi->twopass, cm->current_frame.display_order_hint);
+    if (this_frame_stats == NULL) return AOM_CODEC_ERROR;
+
+    // The inactive region is specified in MBs not mi units.
+    // The image edge is in the following MB column.
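+    // (An MB is 16 pixels, i.e. 4 mi units of 4 pixels each, hence the
+    // multiply by 4 when converting to mi units below.)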
+ left_edge += (int)(this_frame_stats->inactive_zone_cols * 4); + + right_edge -= (int)(this_frame_stats->inactive_zone_cols * 4); + right_edge = AOMMAX(left_edge, right_edge); + } + + if (((left_edge >= mi_col) && (left_edge < (mi_col + mi_step))) || + ((right_edge >= mi_col) && (right_edge < (mi_col + mi_step)))) { + is_active_v_edge = 1; + } + return is_active_v_edge; +} + +void av1_get_tpl_stats_sb(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col, SuperBlockEnc *sb_enc) { + sb_enc->tpl_data_count = 0; + + if (!cpi->oxcf.algo_cfg.enable_tpl_model) return; + if (cpi->common.current_frame.frame_type == KEY_FRAME) return; + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + if (update_type == INTNL_OVERLAY_UPDATE || update_type == OVERLAY_UPDATE) + return; + assert(IMPLIES(cpi->ppi->gf_group.size > 0, + cpi->gf_frame_index < cpi->ppi->gf_group.size)); + + AV1_COMMON *const cm = &cpi->common; + const int gf_group_index = cpi->gf_frame_index; + TplParams *const tpl_data = &cpi->ppi->tpl_data; + if (!av1_tpl_stats_ready(tpl_data, gf_group_index)) return; + const int mi_wide = mi_size_wide[bsize]; + const int mi_high = mi_size_high[bsize]; + + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_group_index]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + int tpl_stride = tpl_frame->stride; + + int mi_count = 0; + int count = 0; + const int mi_col_sr = + coded_to_superres_mi(mi_col, cm->superres_scale_denominator); + const int mi_col_end_sr = + coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator); + // mi_cols_sr is mi_cols at superres case. + const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); + + // TPL store unit size is not the same as the motion estimation unit size. + // Here always use motion estimation size to avoid getting repetitive inter/ + // intra cost. + const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(tpl_data->tpl_bsize_1d); + assert(mi_size_wide[tpl_bsize] == mi_size_high[tpl_bsize]); + const int row_step = mi_size_high[tpl_bsize]; + const int col_step_sr = coded_to_superres_mi(mi_size_wide[tpl_bsize], + cm->superres_scale_denominator); + + // Stride is only based on SB size, and we fill in values for every 16x16 + // block in a SB. + sb_enc->tpl_stride = (mi_col_end_sr - mi_col_sr) / col_step_sr; + + for (int row = mi_row; row < mi_row + mi_high; row += row_step) { + for (int col = mi_col_sr; col < mi_col_end_sr; col += col_step_sr) { + assert(count < MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB); + // Handle partial SB, so that no invalid values are used later. 
+ if (row >= cm->mi_params.mi_rows || col >= mi_cols_sr) { + sb_enc->tpl_inter_cost[count] = INT64_MAX; + sb_enc->tpl_intra_cost[count] = INT64_MAX; + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + sb_enc->tpl_mv[count][i].as_int = INVALID_MV; + } + count++; + continue; + } + + TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos( + row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)]; + sb_enc->tpl_inter_cost[count] = this_stats->inter_cost + << TPL_DEP_COST_SCALE_LOG2; + sb_enc->tpl_intra_cost[count] = this_stats->intra_cost + << TPL_DEP_COST_SCALE_LOG2; + memcpy(sb_enc->tpl_mv[count], this_stats->mv, sizeof(this_stats->mv)); + mi_count++; + count++; + } + } + + assert(mi_count <= MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB); + sb_enc->tpl_data_count = mi_count; +} + +// analysis_type 0: Use mc_dep_cost and intra_cost +// analysis_type 1: Use count of best inter predictor chosen +// analysis_type 2: Use cost reduction from intra to inter for best inter +// predictor chosen +int av1_get_q_for_deltaq_objective(AV1_COMP *const cpi, ThreadData *td, + int64_t *delta_dist, BLOCK_SIZE bsize, + int mi_row, int mi_col) { + AV1_COMMON *const cm = &cpi->common; + assert(IMPLIES(cpi->ppi->gf_group.size > 0, + cpi->gf_frame_index < cpi->ppi->gf_group.size)); + const int tpl_idx = cpi->gf_frame_index; + TplParams *const tpl_data = &cpi->ppi->tpl_data; + const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2; + double intra_cost = 0; + double mc_dep_reg = 0; + double mc_dep_cost = 0; + double cbcmp_base = 1; + double srcrf_dist = 0; + double srcrf_sse = 0; + double srcrf_rate = 0; + const int mi_wide = mi_size_wide[bsize]; + const int mi_high = mi_size_high[bsize]; + const int base_qindex = cm->quant_params.base_qindex; + + if (tpl_idx >= MAX_TPL_FRAME_IDX) return base_qindex; + + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + int tpl_stride = tpl_frame->stride; + if (!tpl_frame->is_valid) return base_qindex; + +#ifndef NDEBUG + int mi_count = 0; +#endif + const int mi_col_sr = + coded_to_superres_mi(mi_col, cm->superres_scale_denominator); + const int mi_col_end_sr = + coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator); + const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); + const int step = 1 << block_mis_log2; + const int row_step = step; + const int col_step_sr = + coded_to_superres_mi(step, cm->superres_scale_denominator); + for (int row = mi_row; row < mi_row + mi_high; row += row_step) { + for (int col = mi_col_sr; col < mi_col_end_sr; col += col_step_sr) { + if (row >= cm->mi_params.mi_rows || col >= mi_cols_sr) continue; + TplDepStats *this_stats = + &tpl_stats[av1_tpl_ptr_pos(row, col, tpl_stride, block_mis_log2)]; + double cbcmp = (double)this_stats->srcrf_dist; + int64_t mc_dep_delta = + RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate, + this_stats->mc_dep_dist); + double dist_scaled = (double)(this_stats->recrf_dist << RDDIV_BITS); + intra_cost += log(dist_scaled) * cbcmp; + mc_dep_cost += log(dist_scaled + mc_dep_delta) * cbcmp; + mc_dep_reg += log(3 * dist_scaled + mc_dep_delta) * cbcmp; + srcrf_dist += (double)(this_stats->srcrf_dist << RDDIV_BITS); + srcrf_sse += (double)(this_stats->srcrf_sse << RDDIV_BITS); + srcrf_rate += (double)(this_stats->srcrf_rate << TPL_DEP_COST_SCALE_LOG2); +#ifndef NDEBUG + mi_count++; +#endif + cbcmp_base += cbcmp; + } + } + assert(mi_count <= MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB); + + int offset = 0; + double beta = 1.0; + double 
rk; + if (mc_dep_cost > 0 && intra_cost > 0) { + const double r0 = cpi->rd.r0; + rk = exp((intra_cost - mc_dep_cost) / cbcmp_base); + td->mb.rb = exp((intra_cost - mc_dep_reg) / cbcmp_base); + beta = (r0 / rk); + assert(beta > 0.0); + } else { + return base_qindex; + } + offset = av1_get_deltaq_offset(cm->seq_params->bit_depth, base_qindex, beta); + + const DeltaQInfo *const delta_q_info = &cm->delta_q_info; + offset = AOMMIN(offset, delta_q_info->delta_q_res * 9 - 1); + offset = AOMMAX(offset, -delta_q_info->delta_q_res * 9 + 1); + int qindex = cm->quant_params.base_qindex + offset; + qindex = AOMMIN(qindex, MAXQ); + qindex = AOMMAX(qindex, MINQ); + + int frm_qstep = av1_dc_quant_QTX(base_qindex, 0, cm->seq_params->bit_depth); + int sbs_qstep = + av1_dc_quant_QTX(base_qindex, offset, cm->seq_params->bit_depth); + + if (delta_dist) { + double sbs_dist = srcrf_dist * pow((double)sbs_qstep / frm_qstep, 2.0); + double sbs_rate = srcrf_rate * ((double)frm_qstep / sbs_qstep); + sbs_dist = AOMMIN(sbs_dist, srcrf_sse); + *delta_dist = (int64_t)((sbs_dist - srcrf_dist) / rk); + *delta_dist += RDCOST(tpl_frame->base_rdmult, 4 * 256, 0); + *delta_dist += RDCOST(tpl_frame->base_rdmult, sbs_rate - srcrf_rate, 0); + } + return qindex; +} + +#if !DISABLE_HDR_LUMA_DELTAQ +// offset table defined in Table3 of T-REC-H.Sup15 document. +static const int hdr_thres[HDR_QP_LEVELS + 1] = { 0, 301, 367, 434, 501, 567, + 634, 701, 767, 834, 1024 }; + +static const int hdr10_qp_offset[HDR_QP_LEVELS] = { 3, 2, 1, 0, -1, + -2, -3, -4, -5, -6 }; +#endif + +int av1_get_q_for_hdr(AV1_COMP *const cpi, MACROBLOCK *const x, + BLOCK_SIZE bsize, int mi_row, int mi_col) { + AV1_COMMON *const cm = &cpi->common; + assert(cm->seq_params->bit_depth == AOM_BITS_10); + +#if DISABLE_HDR_LUMA_DELTAQ + (void)x; + (void)bsize; + (void)mi_row; + (void)mi_col; + return cm->quant_params.base_qindex; +#else + // calculate pixel average + const int block_luma_avg = av1_log_block_avg(cpi, x, bsize, mi_row, mi_col); + // adjust offset based on average of the pixel block + int offset = 0; + for (int i = 0; i < HDR_QP_LEVELS; i++) { + if (block_luma_avg >= hdr_thres[i] && block_luma_avg < hdr_thres[i + 1]) { + offset = (int)(hdr10_qp_offset[i] * QP_SCALE_FACTOR); + break; + } + } + + const DeltaQInfo *const delta_q_info = &cm->delta_q_info; + offset = AOMMIN(offset, delta_q_info->delta_q_res * 9 - 1); + offset = AOMMAX(offset, -delta_q_info->delta_q_res * 9 + 1); + int qindex = cm->quant_params.base_qindex + offset; + qindex = AOMMIN(qindex, MAXQ); + qindex = AOMMAX(qindex, MINQ); + + return qindex; +#endif +} +#endif // !CONFIG_REALTIME_ONLY + +void av1_reset_simple_motion_tree_partition(SIMPLE_MOTION_DATA_TREE *sms_tree, + BLOCK_SIZE bsize) { + if (sms_tree == NULL) return; + sms_tree->partitioning = PARTITION_NONE; + + if (bsize >= BLOCK_8X8) { + BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + for (int idx = 0; idx < 4; ++idx) + av1_reset_simple_motion_tree_partition(sms_tree->split[idx], subsize); + } +} + +// Record the ref frames that have been selected by square partition blocks. 
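+// For example, assuming LAST_FRAME == 1, a 16x16 block sets bit 1 of
+// picked_ref_frames_mask[] for each of its 4x4 mi positions, which are
+// addressed as row * 32 + col within the 32-wide per-superblock grid.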
+void av1_update_picked_ref_frames_mask(MACROBLOCK *const x, int ref_type, + BLOCK_SIZE bsize, int mib_size, + int mi_row, int mi_col) { + assert(mi_size_wide[bsize] == mi_size_high[bsize]); + const int sb_size_mask = mib_size - 1; + const int mi_row_in_sb = mi_row & sb_size_mask; + const int mi_col_in_sb = mi_col & sb_size_mask; + const int mi_size = mi_size_wide[bsize]; + for (int i = mi_row_in_sb; i < mi_row_in_sb + mi_size; ++i) { + for (int j = mi_col_in_sb; j < mi_col_in_sb + mi_size; ++j) { + x->picked_ref_frames_mask[i * 32 + j] |= 1 << ref_type; + } + } +} + +static void avg_cdf_symbol(aom_cdf_prob *cdf_ptr_left, aom_cdf_prob *cdf_ptr_tr, + int num_cdfs, int cdf_stride, int nsymbs, + int wt_left, int wt_tr) { + for (int i = 0; i < num_cdfs; i++) { + for (int j = 0; j <= nsymbs; j++) { + cdf_ptr_left[i * cdf_stride + j] = + (aom_cdf_prob)(((int)cdf_ptr_left[i * cdf_stride + j] * wt_left + + (int)cdf_ptr_tr[i * cdf_stride + j] * wt_tr + + ((wt_left + wt_tr) / 2)) / + (wt_left + wt_tr)); + assert(cdf_ptr_left[i * cdf_stride + j] >= 0 && + cdf_ptr_left[i * cdf_stride + j] < CDF_PROB_TOP); + } + } +} + +#define AVERAGE_CDF(cname_left, cname_tr, nsymbs) \ + AVG_CDF_STRIDE(cname_left, cname_tr, nsymbs, CDF_SIZE(nsymbs)) + +#define AVG_CDF_STRIDE(cname_left, cname_tr, nsymbs, cdf_stride) \ + do { \ + aom_cdf_prob *cdf_ptr_left = (aom_cdf_prob *)cname_left; \ + aom_cdf_prob *cdf_ptr_tr = (aom_cdf_prob *)cname_tr; \ + int array_size = (int)sizeof(cname_left) / sizeof(aom_cdf_prob); \ + int num_cdfs = array_size / cdf_stride; \ + avg_cdf_symbol(cdf_ptr_left, cdf_ptr_tr, num_cdfs, cdf_stride, nsymbs, \ + wt_left, wt_tr); \ + } while (0) + +static void avg_nmv(nmv_context *nmv_left, nmv_context *nmv_tr, int wt_left, + int wt_tr) { + AVERAGE_CDF(nmv_left->joints_cdf, nmv_tr->joints_cdf, 4); + for (int i = 0; i < 2; i++) { + AVERAGE_CDF(nmv_left->comps[i].classes_cdf, nmv_tr->comps[i].classes_cdf, + MV_CLASSES); + AVERAGE_CDF(nmv_left->comps[i].class0_fp_cdf, + nmv_tr->comps[i].class0_fp_cdf, MV_FP_SIZE); + AVERAGE_CDF(nmv_left->comps[i].fp_cdf, nmv_tr->comps[i].fp_cdf, MV_FP_SIZE); + AVERAGE_CDF(nmv_left->comps[i].sign_cdf, nmv_tr->comps[i].sign_cdf, 2); + AVERAGE_CDF(nmv_left->comps[i].class0_hp_cdf, + nmv_tr->comps[i].class0_hp_cdf, 2); + AVERAGE_CDF(nmv_left->comps[i].hp_cdf, nmv_tr->comps[i].hp_cdf, 2); + AVERAGE_CDF(nmv_left->comps[i].class0_cdf, nmv_tr->comps[i].class0_cdf, + CLASS0_SIZE); + AVERAGE_CDF(nmv_left->comps[i].bits_cdf, nmv_tr->comps[i].bits_cdf, 2); + } +} + +// In case of row-based multi-threading of encoder, since we always +// keep a top - right sync, we can average the top - right SB's CDFs and +// the left SB's CDFs and use the same for current SB's encoding to +// improve the performance. This function facilitates the averaging +// of CDF and used only when row-mt is enabled in encoder. 
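+// Illustrative arithmetic (assuming the weights the encoder typically
+// passes, wt_left = 3 and wt_tr = 1): a left CDF entry of 16000 and a
+// top-right entry of 8000 combine to (16000 * 3 + 8000 * 1 + 2) / 4 =
+// 14000, the +2 term providing round-to-nearest.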
+void av1_avg_cdf_symbols(FRAME_CONTEXT *ctx_left, FRAME_CONTEXT *ctx_tr, + int wt_left, int wt_tr) { + AVERAGE_CDF(ctx_left->txb_skip_cdf, ctx_tr->txb_skip_cdf, 2); + AVERAGE_CDF(ctx_left->eob_extra_cdf, ctx_tr->eob_extra_cdf, 2); + AVERAGE_CDF(ctx_left->dc_sign_cdf, ctx_tr->dc_sign_cdf, 2); + AVERAGE_CDF(ctx_left->eob_flag_cdf16, ctx_tr->eob_flag_cdf16, 5); + AVERAGE_CDF(ctx_left->eob_flag_cdf32, ctx_tr->eob_flag_cdf32, 6); + AVERAGE_CDF(ctx_left->eob_flag_cdf64, ctx_tr->eob_flag_cdf64, 7); + AVERAGE_CDF(ctx_left->eob_flag_cdf128, ctx_tr->eob_flag_cdf128, 8); + AVERAGE_CDF(ctx_left->eob_flag_cdf256, ctx_tr->eob_flag_cdf256, 9); + AVERAGE_CDF(ctx_left->eob_flag_cdf512, ctx_tr->eob_flag_cdf512, 10); + AVERAGE_CDF(ctx_left->eob_flag_cdf1024, ctx_tr->eob_flag_cdf1024, 11); + AVERAGE_CDF(ctx_left->coeff_base_eob_cdf, ctx_tr->coeff_base_eob_cdf, 3); + AVERAGE_CDF(ctx_left->coeff_base_cdf, ctx_tr->coeff_base_cdf, 4); + AVERAGE_CDF(ctx_left->coeff_br_cdf, ctx_tr->coeff_br_cdf, BR_CDF_SIZE); + AVERAGE_CDF(ctx_left->newmv_cdf, ctx_tr->newmv_cdf, 2); + AVERAGE_CDF(ctx_left->zeromv_cdf, ctx_tr->zeromv_cdf, 2); + AVERAGE_CDF(ctx_left->refmv_cdf, ctx_tr->refmv_cdf, 2); + AVERAGE_CDF(ctx_left->drl_cdf, ctx_tr->drl_cdf, 2); + AVERAGE_CDF(ctx_left->inter_compound_mode_cdf, + ctx_tr->inter_compound_mode_cdf, INTER_COMPOUND_MODES); + AVERAGE_CDF(ctx_left->compound_type_cdf, ctx_tr->compound_type_cdf, + MASKED_COMPOUND_TYPES); + AVERAGE_CDF(ctx_left->wedge_idx_cdf, ctx_tr->wedge_idx_cdf, 16); + AVERAGE_CDF(ctx_left->interintra_cdf, ctx_tr->interintra_cdf, 2); + AVERAGE_CDF(ctx_left->wedge_interintra_cdf, ctx_tr->wedge_interintra_cdf, 2); + AVERAGE_CDF(ctx_left->interintra_mode_cdf, ctx_tr->interintra_mode_cdf, + INTERINTRA_MODES); + AVERAGE_CDF(ctx_left->motion_mode_cdf, ctx_tr->motion_mode_cdf, MOTION_MODES); + AVERAGE_CDF(ctx_left->obmc_cdf, ctx_tr->obmc_cdf, 2); + AVERAGE_CDF(ctx_left->palette_y_size_cdf, ctx_tr->palette_y_size_cdf, + PALETTE_SIZES); + AVERAGE_CDF(ctx_left->palette_uv_size_cdf, ctx_tr->palette_uv_size_cdf, + PALETTE_SIZES); + for (int j = 0; j < PALETTE_SIZES; j++) { + int nsymbs = j + PALETTE_MIN_SIZE; + AVG_CDF_STRIDE(ctx_left->palette_y_color_index_cdf[j], + ctx_tr->palette_y_color_index_cdf[j], nsymbs, + CDF_SIZE(PALETTE_COLORS)); + AVG_CDF_STRIDE(ctx_left->palette_uv_color_index_cdf[j], + ctx_tr->palette_uv_color_index_cdf[j], nsymbs, + CDF_SIZE(PALETTE_COLORS)); + } + AVERAGE_CDF(ctx_left->palette_y_mode_cdf, ctx_tr->palette_y_mode_cdf, 2); + AVERAGE_CDF(ctx_left->palette_uv_mode_cdf, ctx_tr->palette_uv_mode_cdf, 2); + AVERAGE_CDF(ctx_left->comp_inter_cdf, ctx_tr->comp_inter_cdf, 2); + AVERAGE_CDF(ctx_left->single_ref_cdf, ctx_tr->single_ref_cdf, 2); + AVERAGE_CDF(ctx_left->comp_ref_type_cdf, ctx_tr->comp_ref_type_cdf, 2); + AVERAGE_CDF(ctx_left->uni_comp_ref_cdf, ctx_tr->uni_comp_ref_cdf, 2); + AVERAGE_CDF(ctx_left->comp_ref_cdf, ctx_tr->comp_ref_cdf, 2); + AVERAGE_CDF(ctx_left->comp_bwdref_cdf, ctx_tr->comp_bwdref_cdf, 2); + AVERAGE_CDF(ctx_left->txfm_partition_cdf, ctx_tr->txfm_partition_cdf, 2); + AVERAGE_CDF(ctx_left->compound_index_cdf, ctx_tr->compound_index_cdf, 2); + AVERAGE_CDF(ctx_left->comp_group_idx_cdf, ctx_tr->comp_group_idx_cdf, 2); + AVERAGE_CDF(ctx_left->skip_mode_cdfs, ctx_tr->skip_mode_cdfs, 2); + AVERAGE_CDF(ctx_left->skip_txfm_cdfs, ctx_tr->skip_txfm_cdfs, 2); + AVERAGE_CDF(ctx_left->intra_inter_cdf, ctx_tr->intra_inter_cdf, 2); + avg_nmv(&ctx_left->nmvc, &ctx_tr->nmvc, wt_left, wt_tr); + avg_nmv(&ctx_left->ndvc, &ctx_tr->ndvc, wt_left, wt_tr); + 
AVERAGE_CDF(ctx_left->intrabc_cdf, ctx_tr->intrabc_cdf, 2); + AVERAGE_CDF(ctx_left->seg.pred_cdf, ctx_tr->seg.pred_cdf, 2); + AVERAGE_CDF(ctx_left->seg.spatial_pred_seg_cdf, + ctx_tr->seg.spatial_pred_seg_cdf, MAX_SEGMENTS); + AVERAGE_CDF(ctx_left->filter_intra_cdfs, ctx_tr->filter_intra_cdfs, 2); + AVERAGE_CDF(ctx_left->filter_intra_mode_cdf, ctx_tr->filter_intra_mode_cdf, + FILTER_INTRA_MODES); + AVERAGE_CDF(ctx_left->switchable_restore_cdf, ctx_tr->switchable_restore_cdf, + RESTORE_SWITCHABLE_TYPES); + AVERAGE_CDF(ctx_left->wiener_restore_cdf, ctx_tr->wiener_restore_cdf, 2); + AVERAGE_CDF(ctx_left->sgrproj_restore_cdf, ctx_tr->sgrproj_restore_cdf, 2); + AVERAGE_CDF(ctx_left->y_mode_cdf, ctx_tr->y_mode_cdf, INTRA_MODES); + AVG_CDF_STRIDE(ctx_left->uv_mode_cdf[0], ctx_tr->uv_mode_cdf[0], + UV_INTRA_MODES - 1, CDF_SIZE(UV_INTRA_MODES)); + AVERAGE_CDF(ctx_left->uv_mode_cdf[1], ctx_tr->uv_mode_cdf[1], UV_INTRA_MODES); + for (int i = 0; i < PARTITION_CONTEXTS; i++) { + if (i < 4) { + AVG_CDF_STRIDE(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 4, + CDF_SIZE(10)); + } else if (i < 16) { + AVERAGE_CDF(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 10); + } else { + AVG_CDF_STRIDE(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 8, + CDF_SIZE(10)); + } + } + AVERAGE_CDF(ctx_left->switchable_interp_cdf, ctx_tr->switchable_interp_cdf, + SWITCHABLE_FILTERS); + AVERAGE_CDF(ctx_left->kf_y_cdf, ctx_tr->kf_y_cdf, INTRA_MODES); + AVERAGE_CDF(ctx_left->angle_delta_cdf, ctx_tr->angle_delta_cdf, + 2 * MAX_ANGLE_DELTA + 1); + AVG_CDF_STRIDE(ctx_left->tx_size_cdf[0], ctx_tr->tx_size_cdf[0], MAX_TX_DEPTH, + CDF_SIZE(MAX_TX_DEPTH + 1)); + AVERAGE_CDF(ctx_left->tx_size_cdf[1], ctx_tr->tx_size_cdf[1], + MAX_TX_DEPTH + 1); + AVERAGE_CDF(ctx_left->tx_size_cdf[2], ctx_tr->tx_size_cdf[2], + MAX_TX_DEPTH + 1); + AVERAGE_CDF(ctx_left->tx_size_cdf[3], ctx_tr->tx_size_cdf[3], + MAX_TX_DEPTH + 1); + AVERAGE_CDF(ctx_left->delta_q_cdf, ctx_tr->delta_q_cdf, DELTA_Q_PROBS + 1); + AVERAGE_CDF(ctx_left->delta_lf_cdf, ctx_tr->delta_lf_cdf, DELTA_LF_PROBS + 1); + for (int i = 0; i < FRAME_LF_COUNT; i++) { + AVERAGE_CDF(ctx_left->delta_lf_multi_cdf[i], ctx_tr->delta_lf_multi_cdf[i], + DELTA_LF_PROBS + 1); + } + AVG_CDF_STRIDE(ctx_left->intra_ext_tx_cdf[1], ctx_tr->intra_ext_tx_cdf[1], 7, + CDF_SIZE(TX_TYPES)); + AVG_CDF_STRIDE(ctx_left->intra_ext_tx_cdf[2], ctx_tr->intra_ext_tx_cdf[2], 5, + CDF_SIZE(TX_TYPES)); + AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[1], ctx_tr->inter_ext_tx_cdf[1], 16, + CDF_SIZE(TX_TYPES)); + AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[2], ctx_tr->inter_ext_tx_cdf[2], 12, + CDF_SIZE(TX_TYPES)); + AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[3], ctx_tr->inter_ext_tx_cdf[3], 2, + CDF_SIZE(TX_TYPES)); + AVERAGE_CDF(ctx_left->cfl_sign_cdf, ctx_tr->cfl_sign_cdf, CFL_JOINT_SIGNS); + AVERAGE_CDF(ctx_left->cfl_alpha_cdf, ctx_tr->cfl_alpha_cdf, + CFL_ALPHABET_SIZE); +} + +// Check neighbor blocks' motion information. +static int check_neighbor_blocks(MB_MODE_INFO **mi, int mi_stride, + const TileInfo *const tile_info, int mi_row, + int mi_col) { + int is_above_low_motion = 1; + int is_left_low_motion = 1; + const int thr = 24; + + // Check above block. + if (mi_row > tile_info->mi_row_start) { + const MB_MODE_INFO *above_mbmi = mi[-mi_stride]; + const int_mv above_mv = above_mbmi->mv[0]; + if (above_mbmi->mode >= INTRA_MODE_END && + (abs(above_mv.as_mv.row) > thr || abs(above_mv.as_mv.col) > thr)) + is_above_low_motion = 0; + } + + // Check left block. 
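+  // (thr = 24 is in 1/8-pel units, so both checks treat anything moving by
+  // more than 3 full pixels as non-low-motion.)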
+ if (mi_col > tile_info->mi_col_start) { + const MB_MODE_INFO *left_mbmi = mi[-1]; + const int_mv left_mv = left_mbmi->mv[0]; + if (left_mbmi->mode >= INTRA_MODE_END && + (abs(left_mv.as_mv.row) > thr || abs(left_mv.as_mv.col) > thr)) + is_left_low_motion = 0; + } + + return (is_above_low_motion && is_left_low_motion); +} + +// Check this block's motion in a fast way. +static int fast_detect_non_zero_motion(AV1_COMP *cpi, const uint8_t *src_y, + int src_ystride, + const uint8_t *last_src_y, + int last_src_ystride, int mi_row, + int mi_col) { + AV1_COMMON *const cm = &cpi->common; + const BLOCK_SIZE bsize = cm->seq_params->sb_size; + unsigned int blk_sad = INT_MAX; + if (cpi->src_sad_blk_64x64 != NULL) { + const int sb_size_by_mb = (bsize == BLOCK_128X128) + ? (cm->seq_params->mib_size >> 1) + : cm->seq_params->mib_size; + const int sb_cols = + (cm->mi_params.mi_cols + sb_size_by_mb - 1) / sb_size_by_mb; + const int sbi_col = mi_col / sb_size_by_mb; + const int sbi_row = mi_row / sb_size_by_mb; + blk_sad = (unsigned int)cpi->src_sad_blk_64x64[sbi_col + sbi_row * sb_cols]; + } else { + blk_sad = cpi->ppi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y, + last_src_ystride); + } + + // Search 4 1-away points. + const uint8_t *const search_pos[4] = { + last_src_y - last_src_ystride, + last_src_y - 1, + last_src_y + 1, + last_src_y + last_src_ystride, + }; + unsigned int sad_arr[4]; + cpi->ppi->fn_ptr[bsize].sdx4df(src_y, src_ystride, search_pos, + last_src_ystride, sad_arr); + + blk_sad = (blk_sad * 5) >> 3; + return (blk_sad < sad_arr[0] && blk_sad < sad_arr[1] && + blk_sad < sad_arr[2] && blk_sad < sad_arr[3]); +} + +// Grade the temporal variation of the source by comparing the current sb and +// its collocated block in the last frame. +void av1_source_content_sb(AV1_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, + int mi_row, int mi_col) { + if (cpi->last_source->y_width != cpi->source->y_width || + cpi->last_source->y_height != cpi->source->y_height) + return; +#if CONFIG_AV1_HIGHBITDEPTH + if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) return; +#endif + + unsigned int tmp_sse; + unsigned int tmp_variance; + const BLOCK_SIZE bsize = cpi->common.seq_params->sb_size; + uint8_t *src_y = cpi->source->y_buffer; + const int src_ystride = cpi->source->y_stride; + const int src_offset = src_ystride * (mi_row << 2) + (mi_col << 2); + uint8_t *last_src_y = cpi->last_source->y_buffer; + const int last_src_ystride = cpi->last_source->y_stride; + const int last_src_offset = last_src_ystride * (mi_row << 2) + (mi_col << 2); + uint64_t avg_source_sse_threshold_verylow = 10000; // ~1.5*1.5*(64*64) + uint64_t avg_source_sse_threshold_low[2] = { 100000, // ~5*5*(64*64) + 36000 }; // ~3*3*(64*64) + + uint64_t avg_source_sse_threshold_high = 1000000; // ~15*15*(64*64) + if (cpi->sf.rt_sf.increase_source_sad_thresh) { + avg_source_sse_threshold_high = avg_source_sse_threshold_high << 1; + avg_source_sse_threshold_low[0] = avg_source_sse_threshold_low[0] << 1; + avg_source_sse_threshold_verylow = avg_source_sse_threshold_verylow << 1; + } + uint64_t sum_sq_thresh = 10000; // sum = sqrt(thresh / 64*64)) ~1.5 + src_y += src_offset; + last_src_y += last_src_offset; + tmp_variance = cpi->ppi->fn_ptr[bsize].vf(src_y, src_ystride, last_src_y, + last_src_ystride, &tmp_sse); + // rd thresholds + if (tmp_sse < avg_source_sse_threshold_low[1]) + x->content_state_sb.source_sad_rd = kLowSad; + + // nonrd thresholds + if (tmp_sse == 0) { + x->content_state_sb.source_sad_nonrd = kZeroSad; + return; + } + if 
(tmp_sse < avg_source_sse_threshold_verylow) + x->content_state_sb.source_sad_nonrd = kVeryLowSad; + else if (tmp_sse < avg_source_sse_threshold_low[0]) + x->content_state_sb.source_sad_nonrd = kLowSad; + else if (tmp_sse > avg_source_sse_threshold_high) + x->content_state_sb.source_sad_nonrd = kHighSad; + + // Detect large lighting change. + // Note: tmp_sse - tmp_variance = ((sum * sum) >> 12) + if (tmp_variance < (tmp_sse >> 1) && (tmp_sse - tmp_variance) > sum_sq_thresh) + x->content_state_sb.lighting_change = 1; + if ((tmp_sse - tmp_variance) < (sum_sq_thresh >> 1)) + x->content_state_sb.low_sumdiff = 1; + + if (!cpi->sf.rt_sf.use_rtc_tf || cpi->rc.high_source_sad || + cpi->rc.frame_source_sad > 20000 || cpi->svc.number_spatial_layers > 1) + return; + + // In-place temporal filter. If psnr calculation is enabled, we store the + // source for that. + AV1_COMMON *const cm = &cpi->common; + // Calculate n*mean^2 + const unsigned int nmean2 = tmp_sse - tmp_variance; + const int ac_q_step = av1_ac_quant_QTX(cm->quant_params.base_qindex, 0, + cm->seq_params->bit_depth); + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const int avg_q_step = av1_ac_quant_QTX(p_rc->avg_frame_qindex[INTER_FRAME], + 0, cm->seq_params->bit_depth); + + const unsigned int threshold = + (cpi->sf.rt_sf.use_rtc_tf == 1) + ? (clamp(avg_q_step, 250, 1000)) * ac_q_step + : 250 * ac_q_step; + + // TODO(yunqing): use a weighted sum instead of averaging in filtering. + if (tmp_variance <= threshold && nmean2 <= 15) { + // Check neighbor blocks. If neighbor blocks aren't low-motion blocks, + // skip temporal filtering for this block. + MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + + get_mi_grid_idx(&cm->mi_params, mi_row, mi_col); + const TileInfo *const tile_info = &tile_data->tile_info; + const int is_neighbor_blocks_low_motion = check_neighbor_blocks( + mi, cm->mi_params.mi_stride, tile_info, mi_row, mi_col); + if (!is_neighbor_blocks_low_motion) return; + + // Only consider 64x64 SB for now. Need to extend to 128x128 for large SB + // size. + // Test several nearby points. If non-zero mv exists, don't do temporal + // filtering. 
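+    // (fast_detect_non_zero_motion() below scales the zero-mv SAD by 5/8,
+    // i.e. blk_sad = (blk_sad * 5) >> 3, and requires the scaled value to
+    // beat the SAD at each of the four 1-pixel-away positions. Illustrative
+    // numbers: a zero-mv SAD of 8000 is scaled to 5000 and must stay below
+    // all four neighbor SADs, so the zero-mv match has to be clearly better
+    // than any 1-pel shift before filtering is allowed.)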
+ const int is_this_blk_low_motion = fast_detect_non_zero_motion( + cpi, src_y, src_ystride, last_src_y, last_src_ystride, mi_row, mi_col); + + if (!is_this_blk_low_motion) return; + + const int shift_x[2] = { 0, cpi->source->subsampling_x }; + const int shift_y[2] = { 0, cpi->source->subsampling_y }; + const uint8_t h = block_size_high[bsize]; + const uint8_t w = block_size_wide[bsize]; + + for (int plane = 0; plane < av1_num_planes(cm); ++plane) { + uint8_t *src = cpi->source->buffers[plane]; + const int src_stride = cpi->source->strides[plane != 0]; + uint8_t *last_src = cpi->last_source->buffers[plane]; + const int last_src_stride = cpi->last_source->strides[plane != 0]; + src += src_stride * (mi_row << (2 - shift_y[plane != 0])) + + (mi_col << (2 - shift_x[plane != 0])); + last_src += last_src_stride * (mi_row << (2 - shift_y[plane != 0])) + + (mi_col << (2 - shift_x[plane != 0])); + + for (int i = 0; i < (h >> shift_y[plane != 0]); ++i) { + for (int j = 0; j < (w >> shift_x[plane != 0]); ++j) { + src[j] = (last_src[j] + src[j]) >> 1; + } + src += src_stride; + last_src += last_src_stride; + } + } + } +} + +// Memset the mbmis at the current superblock to 0 +void av1_reset_mbmi(CommonModeInfoParams *const mi_params, BLOCK_SIZE sb_size, + int mi_row, int mi_col) { + // size of sb in unit of mi (BLOCK_4X4) + const int sb_size_mi = mi_size_wide[sb_size]; + const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize]; + // size of sb in unit of allocated mi size + const int sb_size_alloc_mi = mi_size_wide[sb_size] / mi_alloc_size_1d; + assert(mi_params->mi_alloc_stride % sb_size_alloc_mi == 0 && + "mi is not allocated as a multiple of sb!"); + assert(mi_params->mi_stride % sb_size_mi == 0 && + "mi_grid_base is not allocated as a multiple of sb!"); + + const int mi_rows = mi_size_high[sb_size]; + for (int cur_mi_row = 0; cur_mi_row < mi_rows; cur_mi_row++) { + assert(get_mi_grid_idx(mi_params, 0, mi_col + mi_alloc_size_1d) < + mi_params->mi_stride); + const int mi_grid_idx = + get_mi_grid_idx(mi_params, mi_row + cur_mi_row, mi_col); + const int alloc_mi_idx = + get_alloc_mi_idx(mi_params, mi_row + cur_mi_row, mi_col); + memset(&mi_params->mi_grid_base[mi_grid_idx], 0, + sb_size_mi * sizeof(*mi_params->mi_grid_base)); + memset(&mi_params->tx_type_map[mi_grid_idx], 0, + sb_size_mi * sizeof(*mi_params->tx_type_map)); + if (cur_mi_row % mi_alloc_size_1d == 0) { + memset(&mi_params->mi_alloc[alloc_mi_idx], 0, + sb_size_alloc_mi * sizeof(*mi_params->mi_alloc)); + } + } +} + +void av1_backup_sb_state(SB_FIRST_PASS_STATS *sb_fp_stats, const AV1_COMP *cpi, + ThreadData *td, const TileDataEnc *tile_data, + int mi_row, int mi_col) { + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + const TileInfo *tile_info = &tile_data->tile_info; + + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; + + xd->above_txfm_context = + cm->above_contexts.txfm[tile_info->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + av1_save_context(x, &sb_fp_stats->x_ctx, mi_row, mi_col, sb_size, num_planes); + + sb_fp_stats->rd_count = td->rd_counts; + sb_fp_stats->split_count = x->txfm_search_info.txb_split_count; + + sb_fp_stats->fc = *td->counts; + + // Don't copy in row_mt case, otherwise run into data race. No behavior change + // in row_mt case. 
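+  // (tile_data->inter_mode_rd_models is shared by all SB rows of a tile, so
+  // an unguarded copy could race with row-mt workers that are updating the
+  // models concurrently; per the note above, the check below has no effect
+  // on row-mt behavior.)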
+  if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) {
+    memcpy(sb_fp_stats->inter_mode_rd_models, tile_data->inter_mode_rd_models,
+           sizeof(sb_fp_stats->inter_mode_rd_models));
+  }
+
+  memcpy(sb_fp_stats->thresh_freq_fact, x->thresh_freq_fact,
+         sizeof(sb_fp_stats->thresh_freq_fact));
+
+  const int alloc_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col);
+  sb_fp_stats->current_qindex =
+      cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex;
+
+#if CONFIG_INTERNAL_STATS
+  memcpy(sb_fp_stats->mode_chosen_counts, cpi->mode_chosen_counts,
+         sizeof(sb_fp_stats->mode_chosen_counts));
+#endif  // CONFIG_INTERNAL_STATS
+}
+
+void av1_restore_sb_state(const SB_FIRST_PASS_STATS *sb_fp_stats, AV1_COMP *cpi,
+                          ThreadData *td, TileDataEnc *tile_data, int mi_row,
+                          int mi_col) {
+  MACROBLOCK *x = &td->mb;
+
+  const AV1_COMMON *cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+
+  av1_restore_context(x, &sb_fp_stats->x_ctx, mi_row, mi_col, sb_size,
+                      num_planes);
+
+  td->rd_counts = sb_fp_stats->rd_count;
+  x->txfm_search_info.txb_split_count = sb_fp_stats->split_count;
+
+  *td->counts = sb_fp_stats->fc;
+
+  if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) {
+    memcpy(tile_data->inter_mode_rd_models, sb_fp_stats->inter_mode_rd_models,
+           sizeof(sb_fp_stats->inter_mode_rd_models));
+  }
+
+  memcpy(x->thresh_freq_fact, sb_fp_stats->thresh_freq_fact,
+         sizeof(sb_fp_stats->thresh_freq_fact));
+
+  const int alloc_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col);
+  cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex =
+      sb_fp_stats->current_qindex;
+
+#if CONFIG_INTERNAL_STATS
+  memcpy(cpi->mode_chosen_counts, sb_fp_stats->mode_chosen_counts,
+         sizeof(sb_fp_stats->mode_chosen_counts));
+#endif  // CONFIG_INTERNAL_STATS
+}
+
+/*! Checks whether to skip updating the entropy cost based on tile info.
+ *
+ * This function contains the common code used to skip the cost update of coeff,
+ * mode, mv and dv symbols.
+ */
+static int skip_cost_update(const SequenceHeader *seq_params,
+                            const TileInfo *const tile_info, const int mi_row,
+                            const int mi_col,
+                            INTERNAL_COST_UPDATE_TYPE upd_level) {
+  if (upd_level == INTERNAL_COST_UPD_SB) return 0;
+  if (upd_level == INTERNAL_COST_UPD_OFF) return 1;
+
+  // upd_level is at most as frequent as each sb_row in a tile.
+  if (mi_col != tile_info->mi_col_start) return 1;
+
+  if (upd_level == INTERNAL_COST_UPD_SBROW_SET) {
+    const int mib_size_log2 = seq_params->mib_size_log2;
+    const int sb_row = (mi_row - tile_info->mi_row_start) >> mib_size_log2;
+    const int sb_size = seq_params->mib_size * MI_SIZE;
+    const int tile_height =
+        (tile_info->mi_row_end - tile_info->mi_row_start) * MI_SIZE;
+    // When upd_level is INTERNAL_COST_UPD_SBROW_SET, the cost update happens
+    // once every 2 SB rows for 128x128 superblocks and once every 4 SB rows
+    // for 64x64 superblocks. Since that raw interval may not divide the tile
+    // height evenly at smaller resolutions, the computation below re-derives
+    // the number of SB rows between updates so that the updates end up
+    // equally spaced within the tile.
+    const int sb_size_update_freq_map[2] = { 2, 4 };
+    const int update_freq_sb_rows =
+        sb_size_update_freq_map[sb_size != MAX_SB_SIZE];
+    const int update_freq_num_rows = sb_size * update_freq_sb_rows;
+    // Round-up the division result to next integer.
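+    // Worked example (illustrative): a 1080p-high tile with 64x64
+    // superblocks has tile_height = 270 * MI_SIZE = 1080 and
+    // update_freq_num_rows = 64 * 4 = 256, so the rounded-up divisions below
+    // give num_updates_per_tile = 5, num_rows_update_per_tile = 320, and
+    // num_sb_rows_per_update = 4: the costs refresh on every 4th SB row.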
+ const int num_updates_per_tile = + (tile_height + update_freq_num_rows - 1) / update_freq_num_rows; + const int num_rows_update_per_tile = num_updates_per_tile * sb_size; + // Round-up the division result to next integer. + const int num_sb_rows_per_update = + (tile_height + num_rows_update_per_tile - 1) / num_rows_update_per_tile; + if ((sb_row % num_sb_rows_per_update) != 0) return 1; + } + return 0; +} + +// Checks for skip status of mv cost update. +static int skip_mv_cost_update(AV1_COMP *cpi, const TileInfo *const tile_info, + const int mi_row, const int mi_col) { + const AV1_COMMON *cm = &cpi->common; + // For intra frames, mv cdfs are not updated during the encode. Hence, the mv + // cost calculation is skipped in this case. + if (frame_is_intra_only(cm)) return 1; + + return skip_cost_update(cm->seq_params, tile_info, mi_row, mi_col, + cpi->sf.inter_sf.mv_cost_upd_level); +} + +// Checks for skip status of dv cost update. +static int skip_dv_cost_update(AV1_COMP *cpi, const TileInfo *const tile_info, + const int mi_row, const int mi_col) { + const AV1_COMMON *cm = &cpi->common; + // Intrabc is only applicable to intra frames. So skip if intrabc is not + // allowed. + if (!av1_allow_intrabc(cm) || is_stat_generation_stage(cpi)) { + return 1; + } + + return skip_cost_update(cm->seq_params, tile_info, mi_row, mi_col, + cpi->sf.intra_sf.dv_cost_upd_level); +} + +// Update the rate costs of some symbols according to the frequency directed +// by speed features +void av1_set_cost_upd_freq(AV1_COMP *cpi, ThreadData *td, + const TileInfo *const tile_info, const int mi_row, + const int mi_col) { + AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + + if (cm->features.disable_cdf_update) { + return; + } + + switch (cpi->sf.inter_sf.coeff_cost_upd_level) { + case INTERNAL_COST_UPD_OFF: + case INTERNAL_COST_UPD_TILE: // Tile level + break; + case INTERNAL_COST_UPD_SBROW_SET: // SB row set level in tile + case INTERNAL_COST_UPD_SBROW: // SB row level in tile + case INTERNAL_COST_UPD_SB: // SB level + if (skip_cost_update(cm->seq_params, tile_info, mi_row, mi_col, + cpi->sf.inter_sf.coeff_cost_upd_level)) + break; + av1_fill_coeff_costs(&x->coeff_costs, xd->tile_ctx, num_planes); + break; + default: assert(0); + } + + switch (cpi->sf.inter_sf.mode_cost_upd_level) { + case INTERNAL_COST_UPD_OFF: + case INTERNAL_COST_UPD_TILE: // Tile level + break; + case INTERNAL_COST_UPD_SBROW_SET: // SB row set level in tile + case INTERNAL_COST_UPD_SBROW: // SB row level in tile + case INTERNAL_COST_UPD_SB: // SB level + if (skip_cost_update(cm->seq_params, tile_info, mi_row, mi_col, + cpi->sf.inter_sf.mode_cost_upd_level)) + break; + av1_fill_mode_rates(cm, &x->mode_costs, xd->tile_ctx); + break; + default: assert(0); + } + + switch (cpi->sf.inter_sf.mv_cost_upd_level) { + case INTERNAL_COST_UPD_OFF: + case INTERNAL_COST_UPD_TILE: // Tile level + break; + case INTERNAL_COST_UPD_SBROW_SET: // SB row set level in tile + case INTERNAL_COST_UPD_SBROW: // SB row level in tile + case INTERNAL_COST_UPD_SB: // SB level + // Checks for skip status of mv cost update. 
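+      // (x->mv_costs is derived from the tile's nmv CDFs, which adapt as
+      // symbols are coded; refilling the table at the configured granularity
+      // keeps the RD mv-rate estimates in step with the adapted CDFs.)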
+ if (skip_mv_cost_update(cpi, tile_info, mi_row, mi_col)) break; + av1_fill_mv_costs(&xd->tile_ctx->nmvc, + cm->features.cur_frame_force_integer_mv, + cm->features.allow_high_precision_mv, x->mv_costs); + break; + default: assert(0); + } + + switch (cpi->sf.intra_sf.dv_cost_upd_level) { + case INTERNAL_COST_UPD_OFF: + case INTERNAL_COST_UPD_TILE: // Tile level + break; + case INTERNAL_COST_UPD_SBROW_SET: // SB row set level in tile + case INTERNAL_COST_UPD_SBROW: // SB row level in tile + case INTERNAL_COST_UPD_SB: // SB level + // Checks for skip status of dv cost update. + if (skip_dv_cost_update(cpi, tile_info, mi_row, mi_col)) break; + av1_fill_dv_costs(&xd->tile_ctx->ndvc, x->dv_costs); + break; + default: assert(0); + } +} + +void av1_dealloc_src_diff_buf(struct macroblock *mb, int num_planes) { + for (int plane = 0; plane < num_planes; ++plane) { + aom_free(mb->plane[plane].src_diff); + mb->plane[plane].src_diff = NULL; + } +} + +void av1_alloc_src_diff_buf(const struct AV1Common *cm, struct macroblock *mb) { + const int num_planes = av1_num_planes(cm); +#ifndef NDEBUG + for (int plane = 0; plane < num_planes; ++plane) { + assert(!mb->plane[plane].src_diff); + } +#endif + for (int plane = 0; plane < num_planes; ++plane) { + const int subsampling_xy = + plane ? cm->seq_params->subsampling_x + cm->seq_params->subsampling_y + : 0; + const int sb_size = MAX_SB_SQUARE >> subsampling_xy; + CHECK_MEM_ERROR(cm, mb->plane[plane].src_diff, + (int16_t *)aom_memalign( + 32, sizeof(*mb->plane[plane].src_diff) * sb_size)); + } +} diff --git a/third_party/aom/av1/encoder/encodeframe_utils.h b/third_party/aom/av1/encoder/encodeframe_utils.h new file mode 100644 index 0000000000..14c71b8802 --- /dev/null +++ b/third_party/aom/av1/encoder/encodeframe_utils.h @@ -0,0 +1,595 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_ENCODEFRAME_UTILS_H_ +#define AOM_AV1_ENCODER_ENCODEFRAME_UTILS_H_ + +#include "aom_ports/aom_timer.h" + +#include "av1/common/reconinter.h" + +#include "av1/encoder/encoder.h" +#include "av1/encoder/rdopt.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define WRITE_FEATURE_TO_FILE 0 + +#define FEATURE_SIZE_SMS_SPLIT_FAST 6 +#define FEATURE_SIZE_SMS_SPLIT 17 +#define FEATURE_SIZE_SMS_PRUNE_PART 25 +#define FEATURE_SIZE_SMS_TERM_NONE 28 +#define FEATURE_SIZE_FP_SMS_TERM_NONE 20 +#define FEATURE_SIZE_MAX_MIN_PART_PRED 13 +#define MAX_NUM_CLASSES_MAX_MIN_PART_PRED 4 + +#define FEATURE_SMS_NONE_FLAG 1 +#define FEATURE_SMS_SPLIT_FLAG (1 << 1) +#define FEATURE_SMS_RECT_FLAG (1 << 2) + +#define FEATURE_SMS_PRUNE_PART_FLAG \ + (FEATURE_SMS_NONE_FLAG | FEATURE_SMS_SPLIT_FLAG | FEATURE_SMS_RECT_FLAG) +#define FEATURE_SMS_SPLIT_MODEL_FLAG \ + (FEATURE_SMS_NONE_FLAG | FEATURE_SMS_SPLIT_FLAG) + +// Number of sub-partitions in rectangular partition types. +#define SUB_PARTITIONS_RECT 2 + +// Number of sub-partitions in split partition type. +#define SUB_PARTITIONS_SPLIT 4 + +// Number of sub-partitions in AB partition types. 
+#define SUB_PARTITIONS_AB 3 + +// Number of sub-partitions in 4-way partition types. +#define SUB_PARTITIONS_PART4 4 + +// 4part partition types. +enum { HORZ4 = 0, VERT4, NUM_PART4_TYPES } UENUM1BYTE(PART4_TYPES); + +// AB partition types. +enum { + HORZ_A = 0, + HORZ_B, + VERT_A, + VERT_B, + NUM_AB_PARTS +} UENUM1BYTE(AB_PART_TYPE); + +// Rectangular partition types. +enum { HORZ = 0, VERT, NUM_RECT_PARTS } UENUM1BYTE(RECT_PART_TYPE); + +// Structure to keep win flags for HORZ and VERT partition evaluations. +typedef struct { + int rect_part_win[NUM_RECT_PARTS]; +} RD_RECT_PART_WIN_INFO; + +enum { PICK_MODE_RD = 0, PICK_MODE_NONRD }; + +enum { + SB_SINGLE_PASS, // Single pass encoding: all ctxs get updated normally + SB_DRY_PASS, // First pass of multi-pass: does not update the ctxs + SB_WET_PASS // Second pass of multi-pass: finalize and update the ctx +} UENUM1BYTE(SB_MULTI_PASS_MODE); + +typedef struct { + ENTROPY_CONTEXT a[MAX_MIB_SIZE * MAX_MB_PLANE]; + ENTROPY_CONTEXT l[MAX_MIB_SIZE * MAX_MB_PLANE]; + PARTITION_CONTEXT sa[MAX_MIB_SIZE]; + PARTITION_CONTEXT sl[MAX_MIB_SIZE]; + TXFM_CONTEXT *p_ta; + TXFM_CONTEXT *p_tl; + TXFM_CONTEXT ta[MAX_MIB_SIZE]; + TXFM_CONTEXT tl[MAX_MIB_SIZE]; +} RD_SEARCH_MACROBLOCK_CONTEXT; + +// This struct is used to store the statistics used by sb-level multi-pass +// encoding. Currently, this is only used to make a copy of the state before we +// perform the first pass +typedef struct SB_FIRST_PASS_STATS { + RD_SEARCH_MACROBLOCK_CONTEXT x_ctx; + RD_COUNTS rd_count; + + int split_count; + FRAME_COUNTS fc; + InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL]; + int thresh_freq_fact[BLOCK_SIZES_ALL][MAX_MODES]; + int current_qindex; + +#if CONFIG_INTERNAL_STATS + unsigned int mode_chosen_counts[MAX_MODES]; +#endif // CONFIG_INTERNAL_STATS +} SB_FIRST_PASS_STATS; + +// This structure contains block size related +// variables for use in rd_pick_partition(). +typedef struct { + // Half of block width to determine block edge. + int mi_step; + + // Block row and column indices. + int mi_row; + int mi_col; + + // Block edge row and column indices. + int mi_row_edge; + int mi_col_edge; + + // Block width of current partition block. + int width; + + // Block width of minimum partition size allowed. + int min_partition_size_1d; + + // Flag to indicate if partition is 8x8 or higher size. + int bsize_at_least_8x8; + + // Indicates edge blocks in frame. + int has_rows; + int has_cols; + + // Block size of current partition. + BLOCK_SIZE bsize; + + // Size of current sub-partition. + BLOCK_SIZE subsize; + + // Size of split partition. + BLOCK_SIZE split_bsize2; +} PartitionBlkParams; + +#if CONFIG_COLLECT_PARTITION_STATS +typedef struct PartitionTimingStats { + // Tracks the number of partition decision used in the current call to \ref + // av1_rd_pick_partition + int partition_decisions[EXT_PARTITION_TYPES]; + // Tracks the number of partition_block searched in the current call to \ref + // av1_rd_pick_partition + int partition_attempts[EXT_PARTITION_TYPES]; + // Tracks the time spent on each partition search in the current call to \ref + // av1_rd_pick_partition + int64_t partition_times[EXT_PARTITION_TYPES]; + // Tracks the rdcost spent on each partition search in the current call to + // \ref av1_rd_pick_partition + int64_t partition_rdcost[EXT_PARTITION_TYPES]; + // Timer used to time the partitions. 
+  struct aom_usec_timer timer;
+  // Whether the timer is on
+  int timer_is_on;
+} PartitionTimingStats;
+#endif  // CONFIG_COLLECT_PARTITION_STATS
+
+// Structure holding state variables for partition search.
+typedef struct {
+  // Intra partitioning related info.
+  PartitionSearchInfo *intra_part_info;
+
+  // Parameters related to partition block size.
+  PartitionBlkParams part_blk_params;
+
+  // Win flags for HORZ and VERT partition evaluations.
+  RD_RECT_PART_WIN_INFO split_part_rect_win[SUB_PARTITIONS_SPLIT];
+
+  // RD cost for the current block of given partition type.
+  RD_STATS this_rdc;
+
+  // RD cost summed across all blocks of partition type.
+  RD_STATS sum_rdc;
+
+  // Array holding partition type cost.
+  int tmp_partition_cost[PARTITION_TYPES];
+
+  // Pointer to partition cost buffer.
+  int *partition_cost;
+
+  // RD costs for different partition types.
+  int64_t none_rd;
+  int64_t split_rd[SUB_PARTITIONS_SPLIT];
+  // RD costs for rectangular partitions.
+  // rect_part_rd[0][i] is the RD cost of ith partition index of PARTITION_HORZ.
+  // rect_part_rd[1][i] is the RD cost of ith partition index of PARTITION_VERT.
+  int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT];
+
+  // Flags indicating if the corresponding partition was winner or not.
+  // Used to bypass similar blocks during AB partition evaluation.
+  int is_split_ctx_is_ready[2];
+  int is_rect_ctx_is_ready[NUM_RECT_PARTS];
+
+  // If true, skips the rest of partition evaluation at the current bsize level.
+  int terminate_partition_search;
+
+  // If false, skips rdopt on PARTITION_NONE.
+  int partition_none_allowed;
+
+  // If partition_rect_allowed[HORZ] is false, skips searching PARTITION_HORZ,
+  // PARTITION_HORZ_A, PARTITION_HORZ_B, PARTITION_HORZ_4. Same holds for VERT.
+  int partition_rect_allowed[NUM_RECT_PARTS];
+
+  // If false, skips searching rectangular partition unless some logic related
+  // to edge detection holds.
+  int do_rectangular_split;
+
+  // If false, skips searching PARTITION_SPLIT.
+  int do_square_split;
+
+  // If true, prunes the corresponding PARTITION_HORZ/PARTITION_VERT. Note that
+  // this does not directly affect the extended partitions, so this can be used
+  // to prune out PARTITION_HORZ/PARTITION_VERT while still allowing rdopt of
+  // PARTITION_HORZ_AB4, etc.
+  int prune_rect_part[NUM_RECT_PARTS];
+
+  // Chroma subsampling in x and y directions.
+  int ss_x;
+  int ss_y;
+
+  // Partition plane context index.
+  int pl_ctx_idx;
+
+  // This flag will be set if best partition is found from the search.
+  bool found_best_partition;
+
+#if CONFIG_COLLECT_PARTITION_STATS
+  PartitionTimingStats part_timing_stats;
+#endif  // CONFIG_COLLECT_PARTITION_STATS
+} PartitionSearchState;
+
+static AOM_INLINE void av1_disable_square_split_partition(
+    PartitionSearchState *part_state) {
+  part_state->do_square_split = 0;
+}
+
+// Disables all possible rectangular splits. This includes PARTITION_AB4 as they
+// depend on the corresponding partition_rect_allowed.
+static AOM_INLINE void av1_disable_rect_partitions(
+    PartitionSearchState *part_state) {
+  part_state->do_rectangular_split = 0;
+  part_state->partition_rect_allowed[HORZ] = 0;
+  part_state->partition_rect_allowed[VERT] = 0;
+}
+
+// Disables all possible splits so that only PARTITION_NONE *might* be allowed.
+static AOM_INLINE void av1_disable_all_splits( + PartitionSearchState *part_state) { + av1_disable_square_split_partition(part_state); + av1_disable_rect_partitions(part_state); +} + +static AOM_INLINE void av1_set_square_split_only( + PartitionSearchState *part_state) { + part_state->partition_none_allowed = 0; + part_state->do_square_split = 1; + av1_disable_rect_partitions(part_state); +} + +static AOM_INLINE bool av1_blk_has_rows_and_cols( + const PartitionBlkParams *blk_params) { + return blk_params->has_rows && blk_params->has_cols; +} + +static AOM_INLINE bool av1_is_whole_blk_in_frame( + const PartitionBlkParams *blk_params, + const CommonModeInfoParams *mi_params) { + const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col; + const BLOCK_SIZE bsize = blk_params->bsize; + return mi_row + mi_size_high[bsize] <= mi_params->mi_rows && + mi_col + mi_size_wide[bsize] <= mi_params->mi_cols; +} + +static AOM_INLINE void update_filter_type_cdf(const MACROBLOCKD *xd, + const MB_MODE_INFO *mbmi, + int dual_filter) { + for (int dir = 0; dir < 2; ++dir) { + if (dir && !dual_filter) break; + const int ctx = av1_get_pred_context_switchable_interp(xd, dir); + InterpFilter filter = av1_extract_interp_filter(mbmi->interp_filters, dir); + update_cdf(xd->tile_ctx->switchable_interp_cdf[ctx], filter, + SWITCHABLE_FILTERS); + } +} + +static AOM_INLINE int set_rdmult(const AV1_COMP *const cpi, + const MACROBLOCK *const x, int segment_id) { + const AV1_COMMON *const cm = &cpi->common; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const CommonQuantParams *quant_params = &cm->quant_params; + const aom_bit_depth_t bit_depth = cm->seq_params->bit_depth; + const FRAME_UPDATE_TYPE update_type = + cpi->ppi->gf_group.update_type[cpi->gf_frame_index]; + const FRAME_TYPE frame_type = cm->current_frame.frame_type; + const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100)); + const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6); + + int qindex; + if (segment_id >= 0) { + qindex = av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex); + } else { + qindex = quant_params->base_qindex + x->rdmult_delta_qindex + + quant_params->y_dc_delta_q; + } + + return av1_compute_rd_mult( + qindex, bit_depth, update_type, layer_depth, boost_index, frame_type, + cpi->oxcf.q_cfg.use_fixed_qp_offsets, is_stat_consumption_stage(cpi)); +} + +static AOM_INLINE int do_split_check(BLOCK_SIZE bsize) { + return (bsize == BLOCK_16X16 || bsize == BLOCK_32X32); +} + +#if !CONFIG_REALTIME_ONLY +static AOM_INLINE const FIRSTPASS_STATS *read_one_frame_stats(const TWO_PASS *p, + int frm) { + assert(frm >= 0); + if (frm < 0 || + p->stats_buf_ctx->stats_in_start + frm > p->stats_buf_ctx->stats_in_end) { + return NULL; + } + + return &p->stats_buf_ctx->stats_in_start[frm]; +} + +int av1_get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col, int orig_rdmult); + +int av1_active_h_edge(const AV1_COMP *cpi, int mi_row, int mi_step); + +int av1_active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step); + +void av1_get_tpl_stats_sb(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col, SuperBlockEnc *sb_enc); + +int av1_get_q_for_deltaq_objective(AV1_COMP *const cpi, ThreadData *td, + int64_t *delta_dist, BLOCK_SIZE bsize, + int mi_row, int mi_col); + +int av1_get_q_for_hdr(AV1_COMP *const cpi, MACROBLOCK *const x, + BLOCK_SIZE bsize, int mi_row, int mi_col); + +int av1_get_cb_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, + const BLOCK_SIZE bsize, 
const int mi_row, + const int mi_col); + +int av1_get_hier_tpl_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, + const BLOCK_SIZE bsize, const int mi_row, + const int mi_col, int orig_rdmult); +#endif // !CONFIG_REALTIME_ONLY + +void av1_set_ssim_rdmult(const AV1_COMP *const cpi, int *errorperbit, + const BLOCK_SIZE bsize, const int mi_row, + const int mi_col, int *const rdmult); + +#if CONFIG_SALIENCY_MAP +void av1_set_saliency_map_vmaf_rdmult(const AV1_COMP *const cpi, + int *errorperbit, const BLOCK_SIZE bsize, + const int mi_row, const int mi_col, + int *const rdmult); +#endif + +void av1_update_state(const AV1_COMP *const cpi, ThreadData *td, + const PICK_MODE_CONTEXT *const ctx, int mi_row, + int mi_col, BLOCK_SIZE bsize, RUN_TYPE dry_run); + +void av1_update_inter_mode_stats(FRAME_CONTEXT *fc, FRAME_COUNTS *counts, + PREDICTION_MODE mode, int16_t mode_context); + +void av1_sum_intra_stats(const AV1_COMMON *const cm, FRAME_COUNTS *counts, + MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi, + const MB_MODE_INFO *above_mi, + const MB_MODE_INFO *left_mi, const int intraonly); + +void av1_restore_context(MACROBLOCK *x, const RD_SEARCH_MACROBLOCK_CONTEXT *ctx, + int mi_row, int mi_col, BLOCK_SIZE bsize, + const int num_planes); + +void av1_save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx, + int mi_row, int mi_col, BLOCK_SIZE bsize, + const int num_planes); + +void av1_set_fixed_partitioning(AV1_COMP *cpi, const TileInfo *const tile, + MB_MODE_INFO **mib, int mi_row, int mi_col, + BLOCK_SIZE bsize); + +int av1_is_leaf_split_partition(AV1_COMMON *cm, int mi_row, int mi_col, + BLOCK_SIZE bsize); + +void av1_reset_simple_motion_tree_partition(SIMPLE_MOTION_DATA_TREE *sms_tree, + BLOCK_SIZE bsize); + +void av1_update_picked_ref_frames_mask(MACROBLOCK *const x, int ref_type, + BLOCK_SIZE bsize, int mib_size, + int mi_row, int mi_col); + +void av1_avg_cdf_symbols(FRAME_CONTEXT *ctx_left, FRAME_CONTEXT *ctx_tr, + int wt_left, int wt_tr); + +void av1_source_content_sb(AV1_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, + int mi_row, int mi_col); + +void av1_reset_mbmi(CommonModeInfoParams *const mi_params, BLOCK_SIZE sb_size, + int mi_row, int mi_col); + +void av1_backup_sb_state(SB_FIRST_PASS_STATS *sb_fp_stats, const AV1_COMP *cpi, + ThreadData *td, const TileDataEnc *tile_data, + int mi_row, int mi_col); + +void av1_restore_sb_state(const SB_FIRST_PASS_STATS *sb_fp_stats, AV1_COMP *cpi, + ThreadData *td, TileDataEnc *tile_data, int mi_row, + int mi_col); + +void av1_set_cost_upd_freq(AV1_COMP *cpi, ThreadData *td, + const TileInfo *const tile_info, const int mi_row, + const int mi_col); + +void av1_dealloc_src_diff_buf(struct macroblock *mb, int num_planes); + +static AOM_INLINE void av1_dealloc_mb_data(struct macroblock *mb, + int num_planes) { + aom_free(mb->txfm_search_info.mb_rd_record); + mb->txfm_search_info.mb_rd_record = NULL; + + aom_free(mb->inter_modes_info); + mb->inter_modes_info = NULL; + + av1_dealloc_src_diff_buf(mb, num_planes); + + aom_free(mb->e_mbd.seg_mask); + mb->e_mbd.seg_mask = NULL; + + aom_free(mb->winner_mode_stats); + mb->winner_mode_stats = NULL; + + aom_free(mb->dqcoeff_buf); + mb->dqcoeff_buf = NULL; +} + +static AOM_INLINE void allocate_winner_mode_stats(const AV1_COMP *cpi, + struct macroblock *mb) { + const SPEED_FEATURES *sf = &cpi->sf; + // The winner_mode_stats buffer is not required in these cases. 
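+  // (When the buffer is needed, the allocation below reserves one
+  // WinnerModeStats entry per candidate allowed by multi_winner_mode_type;
+  // with multi-winner mode off, only the single best mode is tracked, so no
+  // side buffer is required.)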
+ if (is_stat_generation_stage(cpi) || + (sf->rt_sf.use_nonrd_pick_mode && !sf->rt_sf.hybrid_intra_pickmode) || + (sf->winner_mode_sf.multi_winner_mode_type == MULTI_WINNER_MODE_OFF)) + return; + + const AV1_COMMON *cm = &cpi->common; + const int winner_mode_count = + winner_mode_count_allowed[sf->winner_mode_sf.multi_winner_mode_type]; + CHECK_MEM_ERROR(cm, mb->winner_mode_stats, + (WinnerModeStats *)aom_malloc( + winner_mode_count * sizeof(mb->winner_mode_stats[0]))); +} + +void av1_alloc_src_diff_buf(const struct AV1Common *cm, struct macroblock *mb); + +static AOM_INLINE void av1_alloc_mb_data(const AV1_COMP *cpi, + struct macroblock *mb) { + const AV1_COMMON *cm = &cpi->common; + const SPEED_FEATURES *sf = &cpi->sf; + if (!sf->rt_sf.use_nonrd_pick_mode) { + // Memory for mb_rd_record is allocated only when use_mb_rd_hash sf is + // enabled. + if (sf->rd_sf.use_mb_rd_hash) + CHECK_MEM_ERROR(cm, mb->txfm_search_info.mb_rd_record, + (MB_RD_RECORD *)aom_malloc(sizeof(MB_RD_RECORD))); + if (!frame_is_intra_only(cm)) + CHECK_MEM_ERROR( + cm, mb->inter_modes_info, + (InterModesInfo *)aom_malloc(sizeof(*mb->inter_modes_info))); + } + + av1_alloc_src_diff_buf(cm, mb); + + CHECK_MEM_ERROR(cm, mb->e_mbd.seg_mask, + (uint8_t *)aom_memalign( + 16, 2 * MAX_SB_SQUARE * sizeof(mb->e_mbd.seg_mask[0]))); + + allocate_winner_mode_stats(cpi, mb); + + const int max_sb_square_y = 1 + << num_pels_log2_lookup[cm->seq_params->sb_size]; + CHECK_MEM_ERROR( + cm, mb->dqcoeff_buf, + (tran_low_t *)aom_memalign(32, max_sb_square_y * sizeof(tran_low_t))); +} + +// This function will compute the number of reference frames to be disabled +// based on selective_ref_frame speed feature. +static AOM_INLINE unsigned int get_num_refs_to_disable( + const AV1_COMP *cpi, const int *ref_frame_flags, + const unsigned int *ref_display_order_hint, + unsigned int cur_frame_display_index) { + unsigned int num_refs_to_disable = 0; + if (cpi->sf.inter_sf.selective_ref_frame >= 3) { + num_refs_to_disable++; + if (cpi->sf.inter_sf.selective_ref_frame >= 6) { + // Disable LAST2_FRAME and ALTREF2_FRAME + num_refs_to_disable += 2; + } else if (cpi->sf.inter_sf.selective_ref_frame == 5 && + *ref_frame_flags & av1_ref_frame_flag_list[LAST2_FRAME]) { + const int last2_frame_dist = av1_encoder_get_relative_dist( + ref_display_order_hint[LAST2_FRAME - LAST_FRAME], + cur_frame_display_index); + // Disable LAST2_FRAME if it is a temporally distant frame + if (abs(last2_frame_dist) > 2) { + num_refs_to_disable++; + } +#if !CONFIG_REALTIME_ONLY + else if (is_stat_consumption_stage_twopass(cpi)) { + const FIRSTPASS_STATS *const this_frame_stats = + read_one_frame_stats(&cpi->ppi->twopass, cur_frame_display_index); + const double coded_error_per_mb = this_frame_stats->coded_error; + // Disable LAST2_FRAME if the coded error of the current frame based on + // first pass stats is very low. 
+        if (coded_error_per_mb < 100.0) num_refs_to_disable++;
+      }
+#endif  // !CONFIG_REALTIME_ONLY
+    }
+  }
+  return num_refs_to_disable;
+}
+
+static INLINE int get_max_allowed_ref_frames(
+    const AV1_COMP *cpi, const int *ref_frame_flags,
+    const unsigned int *ref_display_order_hint,
+    unsigned int cur_frame_display_index) {
+  const unsigned int max_reference_frames =
+      cpi->oxcf.ref_frm_cfg.max_reference_frames;
+  const unsigned int num_refs_to_disable = get_num_refs_to_disable(
+      cpi, ref_frame_flags, ref_display_order_hint, cur_frame_display_index);
+  const unsigned int max_allowed_refs_for_given_speed =
+      INTER_REFS_PER_FRAME - num_refs_to_disable;
+  return AOMMIN(max_allowed_refs_for_given_speed, max_reference_frames);
+}
+
+// Enforce the number of references for each arbitrary frame based on user
+// options and speed.
+static AOM_INLINE void enforce_max_ref_frames(
+    AV1_COMP *cpi, int *ref_frame_flags,
+    const unsigned int *ref_display_order_hint,
+    unsigned int cur_frame_display_index) {
+  MV_REFERENCE_FRAME ref_frame;
+  int total_valid_refs = 0;
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    if (*ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
+      total_valid_refs++;
+    }
+  }
+
+  const int max_allowed_refs = get_max_allowed_ref_frames(
+      cpi, ref_frame_flags, ref_display_order_hint, cur_frame_display_index);
+
+  for (int i = 0; i < 4 && total_valid_refs > max_allowed_refs; ++i) {
+    const MV_REFERENCE_FRAME ref_frame_to_disable = disable_order[i];
+
+    if (!(*ref_frame_flags & av1_ref_frame_flag_list[ref_frame_to_disable])) {
+      continue;
+    }
+
+    switch (ref_frame_to_disable) {
+      case LAST3_FRAME: *ref_frame_flags &= ~AOM_LAST3_FLAG; break;
+      case LAST2_FRAME: *ref_frame_flags &= ~AOM_LAST2_FLAG; break;
+      case ALTREF2_FRAME: *ref_frame_flags &= ~AOM_ALT2_FLAG; break;
+      case GOLDEN_FRAME: *ref_frame_flags &= ~AOM_GOLD_FLAG; break;
+      default: assert(0);
+    }
+    --total_valid_refs;
+  }
+  assert(total_valid_refs <= max_allowed_refs);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_ENCODEFRAME_UTILS_H_
diff --git a/third_party/aom/av1/encoder/encodemb.c b/third_party/aom/av1/encoder/encodemb.c
new file mode 100644
index 0000000000..c78761dd98
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodemb.c
@@ -0,0 +1,866 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/bitwriter.h" +#include "aom_dsp/quantize.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" + +#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG +#include "aom_util/debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG + +#include "av1/common/cfl.h" +#include "av1/common/idct.h" +#include "av1/common/reconinter.h" +#include "av1/common/reconintra.h" +#include "av1/common/scan.h" + +#include "av1/encoder/av1_quantize.h" +#include "av1/encoder/encodemb.h" +#include "av1/encoder/hybrid_fwd_txfm.h" +#include "av1/encoder/txb_rdopt.h" +#include "av1/encoder/rd.h" +#include "av1/encoder/rdopt.h" + +void av1_subtract_block(BitDepthInfo bd_info, int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src8, + ptrdiff_t src_stride, const uint8_t *pred8, + ptrdiff_t pred_stride) { + assert(rows >= 4 && cols >= 4); +#if CONFIG_AV1_HIGHBITDEPTH + if (bd_info.use_highbitdepth_buf) { + aom_highbd_subtract_block(rows, cols, diff, diff_stride, src8, src_stride, + pred8, pred_stride); + return; + } +#endif + (void)bd_info; + aom_subtract_block(rows, cols, diff, diff_stride, src8, src_stride, pred8, + pred_stride); +} + +void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, + int blk_col, int blk_row, TX_SIZE tx_size) { + MACROBLOCKD *const xd = &x->e_mbd; + const BitDepthInfo bd_info = get_bit_depth_info(xd); + struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane]; + const int diff_stride = block_size_wide[plane_bsize]; + const int src_stride = p->src.stride; + const int dst_stride = pd->dst.stride; + const int tx1d_width = tx_size_wide[tx_size]; + const int tx1d_height = tx_size_high[tx_size]; + uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2]; + uint8_t *src = &p->src.buf[(blk_row * src_stride + blk_col) << MI_SIZE_LOG2]; + int16_t *src_diff = + &p->src_diff[(blk_row * diff_stride + blk_col) << MI_SIZE_LOG2]; + av1_subtract_block(bd_info, tx1d_height, tx1d_width, src_diff, diff_stride, + src, src_stride, dst, dst_stride); +} + +void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE plane_bsize, int plane) { + struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane]; + assert(plane_bsize < BLOCK_SIZES_ALL); + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + const MACROBLOCKD *xd = &x->e_mbd; + const BitDepthInfo bd_info = get_bit_depth_info(xd); + + av1_subtract_block(bd_info, bh, bw, p->src_diff, bw, p->src.buf, + p->src.stride, pd->dst.buf, pd->dst.stride); +} + +int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane, + int block, TX_SIZE tx_size, TX_TYPE tx_type, + const TXB_CTX *const txb_ctx, int *rate_cost) { + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = &x->plane[plane]; + const int eob = p->eobs[block]; + const int segment_id = xd->mi[0]->segment_id; + + if (eob == 0 || !cpi->optimize_seg_arr[segment_id] || + xd->lossless[segment_id]) { + *rate_cost = av1_cost_skip_txb(&x->coeff_costs, txb_ctx, plane, tx_size); + return eob; + } + + return av1_optimize_txb(cpi, x, plane, block, tx_size, tx_type, txb_ctx, + rate_cost, cpi->oxcf.algo_cfg.sharpness); +} + +// Hyper-parameters for dropout optimization, based on following logics. 
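+// As a concrete feel for how the constants below combine (illustrative
+// numbers): with qindex = 64 and a 16x16 transform, av1_dropout_qcoeff()
+// derives multiplier = CLIP(64 / 32, 2, 8) = 2 and base = CLIP(16, 16, 32) =
+// 16, so a run of small coefficients is dropped only when at least
+// 2 * 16 = 32 zeros precede it and at least 32 zeros follow it.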
+// TODO(yjshen): These settings are tuned by experiments. They may still be +// optimized for better performance. +// (1) Coefficients which are large enough will ALWAYS be kept. +const tran_low_t DROPOUT_COEFF_MAX = 2; // Max dropout-able coefficient. +// (2) Continuous coefficients will ALWAYS be kept. Here rigorous continuity is +// NOT required. For example, `5 0 0 0 7` is treated as two continuous +// coefficients if three zeros do not fulfill the dropout condition. +const int DROPOUT_CONTINUITY_MAX = 2; // Max dropout-able continuous coeff. +// (3) Dropout operation is NOT applicable to blocks with large or small +// quantization index. +const int DROPOUT_Q_MAX = 128; +const int DROPOUT_Q_MIN = 16; +// (4) Recall that dropout optimization will forcibly set some quantized +// coefficients to zero. The key logic on determining whether a coefficient +// should be dropped is to check the number of continuous zeros before AND +// after this coefficient. The exact number of zeros for judgement depends +// on block size and quantization index. More concretely, block size +// determines the base number of zeros, while quantization index determines +// the multiplier. Intuitively, larger block requires more zeros and larger +// quantization index also requires more zeros (more information is lost +// when using larger quantization index). +const int DROPOUT_BEFORE_BASE_MAX = 32; // Max base number for leading zeros. +const int DROPOUT_BEFORE_BASE_MIN = 16; // Min base number for leading zeros. +const int DROPOUT_AFTER_BASE_MAX = 32; // Max base number for trailing zeros. +const int DROPOUT_AFTER_BASE_MIN = 16; // Min base number for trailing zeros. +const int DROPOUT_MULTIPLIER_MAX = 8; // Max multiplier on number of zeros. +const int DROPOUT_MULTIPLIER_MIN = 2; // Min multiplier on number of zeros. +const int DROPOUT_MULTIPLIER_Q_BASE = 32; // Base Q to compute multiplier. + +void av1_dropout_qcoeff(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, + TX_TYPE tx_type, int qindex) { + const int tx_width = tx_size_wide[tx_size]; + const int tx_height = tx_size_high[tx_size]; + + // Early return if `qindex` is out of range. + if (qindex > DROPOUT_Q_MAX || qindex < DROPOUT_Q_MIN) { + return; + } + + // Compute number of zeros used for dropout judgement. + const int base_size = AOMMAX(tx_width, tx_height); + const int multiplier = CLIP(qindex / DROPOUT_MULTIPLIER_Q_BASE, + DROPOUT_MULTIPLIER_MIN, DROPOUT_MULTIPLIER_MAX); + const int dropout_num_before = + multiplier * + CLIP(base_size, DROPOUT_BEFORE_BASE_MIN, DROPOUT_BEFORE_BASE_MAX); + const int dropout_num_after = + multiplier * + CLIP(base_size, DROPOUT_AFTER_BASE_MIN, DROPOUT_AFTER_BASE_MAX); + + av1_dropout_qcoeff_num(mb, plane, block, tx_size, tx_type, dropout_num_before, + dropout_num_after); +} + +void av1_dropout_qcoeff_num(MACROBLOCK *mb, int plane, int block, + TX_SIZE tx_size, TX_TYPE tx_type, + int dropout_num_before, int dropout_num_after) { + const struct macroblock_plane *const p = &mb->plane[plane]; + tran_low_t *const qcoeff = p->qcoeff + BLOCK_OFFSET(block); + tran_low_t *const dqcoeff = p->dqcoeff + BLOCK_OFFSET(block); + const int max_eob = av1_get_max_eob(tx_size); + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); + + // Early return if there are not enough non-zero coefficients. 
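+  // (Illustrative trace, taking dropout_num_before = dropout_num_after = 4
+  // for brevity: for the quantized scan 0 0 0 0 2 1 with max_eob = 16, the
+  // run {2, 1} has magnitudes <= DROPOUT_COEFF_MAX, is preceded by 4 zeros,
+  // and is followed by 10 implicit trailing zeros, so both coefficients are
+  // zeroed and the block ends up with eob = 0.)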
+ if (p->eobs[block] == 0 || p->eobs[block] <= dropout_num_before || + max_eob <= dropout_num_before + dropout_num_after) { + return; + } + + int count_zeros_before = 0; + int count_zeros_after = 0; + int count_nonzeros = 0; + // Index of the first non-zero coefficient after sufficient number of + // continuous zeros. If equals to `-1`, it means number of leading zeros + // hasn't reach `dropout_num_before`. + int idx = -1; + int eob = 0; // New end of block. + + for (int i = 0; i < p->eobs[block]; ++i) { + const int scan_idx = scan_order->scan[i]; + if (abs(qcoeff[scan_idx]) > DROPOUT_COEFF_MAX) { + // Keep large coefficients. + count_zeros_before = 0; + count_zeros_after = 0; + idx = -1; + eob = i + 1; + } else if (qcoeff[scan_idx] == 0) { // Count zeros. + if (idx == -1) { + ++count_zeros_before; + } else { + ++count_zeros_after; + } + } else { // Count non-zeros. + if (count_zeros_before >= dropout_num_before) { + idx = (idx == -1) ? i : idx; + ++count_nonzeros; + } else { + count_zeros_before = 0; + eob = i + 1; + } + } + + // Handle continuity. + if (count_nonzeros > DROPOUT_CONTINUITY_MAX) { + count_zeros_before = 0; + count_zeros_after = 0; + count_nonzeros = 0; + idx = -1; + eob = i + 1; + } + + // Handle the trailing zeros after original end of block. + if (idx != -1 && i == p->eobs[block] - 1) { + count_zeros_after += (max_eob - p->eobs[block]); + } + + // Set redundant coefficients to zeros if needed. + if (count_zeros_after >= dropout_num_after) { + for (int j = idx; j <= i; ++j) { + qcoeff[scan_order->scan[j]] = 0; + dqcoeff[scan_order->scan[j]] = 0; + } + count_zeros_before += (i - idx + 1); + count_zeros_after = 0; + count_nonzeros = 0; + } else if (i == p->eobs[block] - 1) { + eob = i + 1; + } + } + + if (eob != p->eobs[block]) { + p->eobs[block] = eob; + p->txb_entropy_ctx[block] = + av1_get_txb_entropy_context(qcoeff, scan_order, eob); + } +} + +// Settings for optimization type. NOTE: To set optimization type for all intra +// frames, both `KEY_BLOCK_OPT_TYPE` and `INTRA_BLOCK_OPT_TYPE` should be set. +// TODO(yjshen): These settings are hard-coded and look okay for now. They +// should be made configurable later. +// Blocks of key frames ONLY. +const OPT_TYPE KEY_BLOCK_OPT_TYPE = TRELLIS_DROPOUT_OPT; +// Blocks of intra frames (key frames EXCLUSIVE). +const OPT_TYPE INTRA_BLOCK_OPT_TYPE = TRELLIS_DROPOUT_OPT; +// Blocks of inter frames. (NOTE: Dropout optimization is DISABLED by default +// if trellis optimization is on for inter frames.) 
+const OPT_TYPE INTER_BLOCK_OPT_TYPE = TRELLIS_DROPOUT_OPT; + +enum { + QUANT_FUNC_LOWBD = 0, + QUANT_FUNC_HIGHBD = 1, + QUANT_FUNC_TYPES = 2 +} UENUM1BYTE(QUANT_FUNC); + +#if CONFIG_AV1_HIGHBITDEPTH +static AV1_QUANT_FACADE + quant_func_list[AV1_XFORM_QUANT_TYPES][QUANT_FUNC_TYPES] = { + { av1_quantize_fp_facade, av1_highbd_quantize_fp_facade }, + { av1_quantize_b_facade, av1_highbd_quantize_b_facade }, + { av1_quantize_dc_facade, av1_highbd_quantize_dc_facade }, + { NULL, NULL } + }; +#else +static AV1_QUANT_FACADE quant_func_list[AV1_XFORM_QUANT_TYPES] = { + av1_quantize_fp_facade, av1_quantize_b_facade, av1_quantize_dc_facade, NULL +}; +#endif + +// Computes the transform for DC only blocks +void av1_xform_dc_only(MACROBLOCK *x, int plane, int block, + TxfmParam *txfm_param, int64_t per_px_mean) { + assert(per_px_mean != INT64_MAX); + const struct macroblock_plane *const p = &x->plane[plane]; + const int block_offset = BLOCK_OFFSET(block); + tran_low_t *const coeff = p->coeff + block_offset; + const int n_coeffs = av1_get_max_eob(txfm_param->tx_size); + memset(coeff, 0, sizeof(*coeff) * n_coeffs); + coeff[0] = + (tran_low_t)((per_px_mean * dc_coeff_scale[txfm_param->tx_size]) >> 12); +} + +void av1_xform_quant(MACROBLOCK *x, int plane, int block, int blk_row, + int blk_col, BLOCK_SIZE plane_bsize, TxfmParam *txfm_param, + const QUANT_PARAM *qparam) { + av1_xform(x, plane, block, blk_row, blk_col, plane_bsize, txfm_param); + av1_quant(x, plane, block, txfm_param, qparam); +} + +void av1_xform(MACROBLOCK *x, int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TxfmParam *txfm_param) { + const struct macroblock_plane *const p = &x->plane[plane]; + const int block_offset = BLOCK_OFFSET(block); + tran_low_t *const coeff = p->coeff + block_offset; + const int diff_stride = block_size_wide[plane_bsize]; + + const int src_offset = (blk_row * diff_stride + blk_col); + const int16_t *src_diff = &p->src_diff[src_offset << MI_SIZE_LOG2]; + + av1_fwd_txfm(src_diff, coeff, diff_stride, txfm_param); +} + +void av1_quant(MACROBLOCK *x, int plane, int block, TxfmParam *txfm_param, + const QUANT_PARAM *qparam) { + const struct macroblock_plane *const p = &x->plane[plane]; + const SCAN_ORDER *const scan_order = + get_scan(txfm_param->tx_size, txfm_param->tx_type); + const int block_offset = BLOCK_OFFSET(block); + tran_low_t *const coeff = p->coeff + block_offset; + tran_low_t *const qcoeff = p->qcoeff + block_offset; + tran_low_t *const dqcoeff = p->dqcoeff + block_offset; + uint16_t *const eob = &p->eobs[block]; + + if (qparam->xform_quant_idx != AV1_XFORM_QUANT_SKIP_QUANT) { + const int n_coeffs = av1_get_max_eob(txfm_param->tx_size); + if (LIKELY(!x->seg_skip_block)) { +#if CONFIG_AV1_HIGHBITDEPTH + quant_func_list[qparam->xform_quant_idx][txfm_param->is_hbd]( + coeff, n_coeffs, p, qcoeff, dqcoeff, eob, scan_order, qparam); +#else + quant_func_list[qparam->xform_quant_idx]( + coeff, n_coeffs, p, qcoeff, dqcoeff, eob, scan_order, qparam); +#endif + } else { + av1_quantize_skip(n_coeffs, qcoeff, dqcoeff, eob); + } + } + // use_optimize_b is true means av1_optimze_b will be called, + // thus cannot update entropy ctx now (performed in optimize_b) + if (qparam->use_optimize_b) { + p->txb_entropy_ctx[block] = 0; + } else { + p->txb_entropy_ctx[block] = + av1_get_txb_entropy_context(qcoeff, scan_order, *eob); + } +} + +void av1_setup_xform(const AV1_COMMON *cm, MACROBLOCK *x, TX_SIZE tx_size, + TX_TYPE tx_type, TxfmParam *txfm_param) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO 
*const mbmi = xd->mi[0]; + + txfm_param->tx_type = tx_type; + txfm_param->tx_size = tx_size; + txfm_param->lossless = xd->lossless[mbmi->segment_id]; + txfm_param->tx_set_type = av1_get_ext_tx_set_type( + tx_size, is_inter_block(mbmi), cm->features.reduced_tx_set_used); + + txfm_param->bd = xd->bd; + txfm_param->is_hbd = is_cur_buf_hbd(xd); +} +void av1_setup_quant(TX_SIZE tx_size, int use_optimize_b, int xform_quant_idx, + int use_quant_b_adapt, QUANT_PARAM *qparam) { + qparam->log_scale = av1_get_tx_scale(tx_size); + qparam->tx_size = tx_size; + + qparam->use_quant_b_adapt = use_quant_b_adapt; + + // TODO(bohanli): optimize_b and quantization idx has relationship, + // but is kind of buried and complicated in different encoding stages. + // Should have a unified function to derive quant_idx, rather than + // determine and pass in the quant_idx + qparam->use_optimize_b = use_optimize_b; + qparam->xform_quant_idx = xform_quant_idx; + + qparam->qmatrix = NULL; + qparam->iqmatrix = NULL; +} +void av1_setup_qmatrix(const CommonQuantParams *quant_params, + const MACROBLOCKD *xd, int plane, TX_SIZE tx_size, + TX_TYPE tx_type, QUANT_PARAM *qparam) { + qparam->qmatrix = av1_get_qmatrix(quant_params, xd, plane, tx_size, tx_type); + qparam->iqmatrix = + av1_get_iqmatrix(quant_params, xd, plane, tx_size, tx_type); +} + +static void encode_block(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg, + RUN_TYPE dry_run) { + (void)dry_run; + struct encode_b_args *const args = arg; + const AV1_COMP *const cpi = args->cpi; + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + tran_low_t *const dqcoeff = p->dqcoeff + BLOCK_OFFSET(block); + uint8_t *dst; + ENTROPY_CONTEXT *a, *l; + int dummy_rate_cost = 0; + + const int bw = mi_size_wide[plane_bsize]; + dst = &pd->dst.buf[(blk_row * pd->dst.stride + blk_col) << MI_SIZE_LOG2]; + + a = &args->ta[blk_col]; + l = &args->tl[blk_row]; + + TX_TYPE tx_type = DCT_DCT; + const int blk_skip_idx = blk_row * bw + blk_col; + if (!is_blk_skip(x->txfm_search_info.blk_skip, plane, blk_skip_idx) && + !mbmi->skip_mode) { + tx_type = av1_get_tx_type(xd, pd->plane_type, blk_row, blk_col, tx_size, + cm->features.reduced_tx_set_used); + TxfmParam txfm_param; + QUANT_PARAM quant_param; + const int use_trellis = is_trellis_used(args->enable_optimize_b, dry_run); + int quant_idx; + if (use_trellis) + quant_idx = AV1_XFORM_QUANT_FP; + else + quant_idx = + USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP; + av1_setup_xform(cm, x, tx_size, tx_type, &txfm_param); + av1_setup_quant(tx_size, use_trellis, quant_idx, + cpi->oxcf.q_cfg.quant_b_adapt, &quant_param); + av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type, + &quant_param); + av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param, + &quant_param); + + // Whether trellis or dropout optimization is required for inter frames. 
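+    // (encode_block() is reached via the inter coding path, so only
+    // INTER_BLOCK_OPT_TYPE is consulted here; the key-frame and intra-frame
+    // settings above take effect on the intra encode path instead.)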
+ const bool do_trellis = INTER_BLOCK_OPT_TYPE == TRELLIS_OPT || + INTER_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT; + const bool do_dropout = INTER_BLOCK_OPT_TYPE == DROPOUT_OPT || + INTER_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT; + + if (quant_param.use_optimize_b && do_trellis) { + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx); + av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx, + &dummy_rate_cost); + } + if (!quant_param.use_optimize_b && do_dropout) { + av1_dropout_qcoeff(x, plane, block, tx_size, tx_type, + cm->quant_params.base_qindex); + } + } else { + p->eobs[block] = 0; + p->txb_entropy_ctx[block] = 0; + } + + av1_set_txb_context(x, plane, block, tx_size, a, l); + + if (p->eobs[block]) { + // As long as any YUV plane has non-zero quantized transform coefficients, + // mbmi->skip_txfm flag is set to 0. + mbmi->skip_txfm = 0; + av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst, + pd->dst.stride, p->eobs[block], + cm->features.reduced_tx_set_used); + } else { + // Only when YUV planes all have zero quantized transform coefficients, + // mbmi->skip_txfm flag is set to 1. + mbmi->skip_txfm &= 1; + } + + // TODO(debargha, jingning): Temporarily disable txk_type check for eob=0 + // case. It is possible that certain collision in hash index would cause + // the assertion failure. To further optimize the rate-distortion + // performance, we need to re-visit this part and enable this assert + // again. + if (p->eobs[block] == 0 && plane == 0) { +#if 0 + if (args->cpi->oxcf.q_cfg.aq_mode == NO_AQ && + args->cpi->oxcf.q_cfg.deltaq_mode == NO_DELTA_Q) { + // TODO(jingning,angiebird,huisu@google.com): enable txk_check when + // enable_optimize_b is true to detect potential RD bug. + const uint8_t disable_txk_check = args->enable_optimize_b; + if (!disable_txk_check) { + assert(xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col)] == + DCT_DCT); + } + } +#endif + update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT); + } + +#if CONFIG_MISMATCH_DEBUG + if (dry_run == OUTPUT_ENABLED) { + int pixel_c, pixel_r; + BLOCK_SIZE bsize = txsize_to_bsize[tx_size]; + int blk_w = block_size_wide[bsize]; + int blk_h = block_size_high[bsize]; + mi_to_pixel_loc(&pixel_c, &pixel_r, xd->mi_col, xd->mi_row, blk_col, + blk_row, pd->subsampling_x, pd->subsampling_y); + mismatch_record_block_tx(dst, pd->dst.stride, cm->current_frame.order_hint, + plane, pixel_c, pixel_r, blk_w, blk_h, + xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); + } +#endif +} + +static void encode_block_inter(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + void *arg, RUN_TYPE dry_run) { + struct encode_b_args *const args = arg; + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int max_blocks_high = max_block_high(xd, plane_bsize, plane); + const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); + + if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; + + const TX_SIZE plane_tx_size = + plane ? 
av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x, + pd->subsampling_y) + : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row, + blk_col)]; + if (!plane) { + assert(tx_size_wide[tx_size] >= tx_size_wide[plane_tx_size] && + tx_size_high[tx_size] >= tx_size_high[plane_tx_size]); + } + + if (tx_size == plane_tx_size || plane) { + encode_block(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg, + dry_run); + } else { + assert(tx_size < TX_SIZES_ALL); + const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + assert(IMPLIES(tx_size <= TX_4X4, sub_txs == tx_size)); + assert(IMPLIES(tx_size > TX_4X4, sub_txs < tx_size)); + // This is the square transform block partition entry point. + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; + const int step = bsh * bsw; + const int row_end = + AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row); + const int col_end = + AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col); + assert(bsw > 0 && bsh > 0); + + for (int row = 0; row < row_end; row += bsh) { + const int offsetr = blk_row + row; + for (int col = 0; col < col_end; col += bsw) { + const int offsetc = blk_col + col; + + encode_block_inter(plane, block, offsetr, offsetc, plane_bsize, sub_txs, + arg, dry_run); + block += step; + } + } + } +} + +void av1_foreach_transformed_block_in_plane( + const MACROBLOCKD *const xd, BLOCK_SIZE plane_bsize, int plane, + foreach_transformed_block_visitor visit, void *arg) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + // block and transform sizes, in number of 4x4 blocks log 2 ("*_b") + // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8 + // transform size varies per plane, look it up in a common way. + const TX_SIZE tx_size = av1_get_tx_size(plane, xd); + const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; + // Call visit() directly with zero offsets if the current block size is the + // same as the transform block size. + if (plane_bsize == tx_bsize) { + visit(plane, 0, 0, 0, plane_bsize, tx_size, arg); + return; + } + const uint8_t txw_unit = tx_size_wide_unit[tx_size]; + const uint8_t txh_unit = tx_size_high_unit[tx_size]; + const int step = txw_unit * txh_unit; + + // If mb_to_right_edge is < 0 we are in a situation in which + // the current block size extends into the UMV and we won't + // visit the sub blocks that are wholly within the UMV. + const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); + const int max_blocks_high = max_block_high(xd, plane_bsize, plane); + const BLOCK_SIZE max_unit_bsize = + get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y); + const int mu_blocks_wide = + AOMMIN(mi_size_wide[max_unit_bsize], max_blocks_wide); + const int mu_blocks_high = + AOMMIN(mi_size_high[max_unit_bsize], max_blocks_high); + + // Keep track of the row and column of the blocks we use so that we know + // if we are in the unrestricted motion border. + int i = 0; + for (int r = 0; r < max_blocks_high; r += mu_blocks_high) { + const int unit_height = AOMMIN(mu_blocks_high + r, max_blocks_high); + // Skip visiting the sub blocks that are wholly within the UMV. 
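+    // (Illustrative walk: for a 128x128 luma plane fully inside the frame
+    // with TX_32X32, txw_unit = txh_unit = 8 and step = 64, so visit() fires
+    // four times per 64x64 unit and 16 times in total, with `i` advancing by
+    // 64 on each call.)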
+    for (int c = 0; c < max_blocks_wide; c += mu_blocks_wide) {
+      const int unit_width = AOMMIN(mu_blocks_wide + c, max_blocks_wide);
+      for (int blk_row = r; blk_row < unit_height; blk_row += txh_unit) {
+        for (int blk_col = c; blk_col < unit_width; blk_col += txw_unit) {
+          visit(plane, i, blk_row, blk_col, plane_bsize, tx_size, arg);
+          i += step;
+        }
+      }
+    }
+  }
+  // Check that visit() is invoked at least once.
+  assert(i >= 1);
+}
+
+typedef struct encode_block_pass1_args {
+  AV1_COMP *cpi;
+  MACROBLOCK *x;
+} encode_block_pass1_args;
+
+static void encode_block_pass1(int plane, int block, int blk_row, int blk_col,
+                               BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                               void *arg) {
+  encode_block_pass1_args *args = (encode_block_pass1_args *)arg;
+  AV1_COMP *cpi = args->cpi;
+  AV1_COMMON *cm = &cpi->common;
+  MACROBLOCK *const x = args->x;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *const p = &x->plane[plane];
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  tran_low_t *const dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
+
+  uint8_t *dst;
+  dst = &pd->dst.buf[(blk_row * pd->dst.stride + blk_col) << MI_SIZE_LOG2];
+
+  TxfmParam txfm_param;
+  QUANT_PARAM quant_param;
+
+  av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param);
+  av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_B, cpi->oxcf.q_cfg.quant_b_adapt,
+                  &quant_param);
+  av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, DCT_DCT,
+                    &quant_param);
+
+  av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
+                  &quant_param);
+
+  if (p->eobs[block] > 0) {
+    txfm_param.eob = p->eobs[block];
+    if (txfm_param.is_hbd) {
+      av1_highbd_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &txfm_param);
+      return;
+    }
+    av1_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &txfm_param);
+  }
+}
+
+void av1_encode_sby_pass1(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize) {
+  encode_block_pass1_args args = { cpi, x };
+  av1_subtract_plane(x, bsize, 0);
+  av1_foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0,
+                                         encode_block_pass1, &args);
+}
+
+void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+                   RUN_TYPE dry_run) {
+  assert(bsize < BLOCK_SIZES_ALL);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  // In the current encoder implementation, for inter blocks,
+  // mbmi->skip_txfm is set to 1 only when all YUV planes have zero quantized
+  // transform coefficients.
+  // For intra blocks, this flag is set to 0 since skipped blocks are so rare
+  // that transmitting skip_txfm = 1 is very expensive.
+  // mbmi->skip_txfm is initialized to 1 and will be modified in
+  // encode_block() based on transform, quantization, and (if enabled)
+  // trellis optimization.
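+  // (E.g. an inter superblock whose planes all quantize to zero keeps
+  // skip_txfm == 1 and can be signaled as a skip block.)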
+ mbmi->skip_txfm = 1; + if (x->txfm_search_info.skip_txfm) return; + + struct optimize_ctx ctx; + struct encode_b_args arg = { + cpi, x, &ctx, NULL, NULL, dry_run, cpi->optimize_seg_arr[mbmi->segment_id] + }; + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int subsampling_x = pd->subsampling_x; + const int subsampling_y = pd->subsampling_y; + if (plane && !xd->is_chroma_ref) break; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, subsampling_x, subsampling_y); + assert(plane_bsize < BLOCK_SIZES_ALL); + const int mi_width = mi_size_wide[plane_bsize]; + const int mi_height = mi_size_high[plane_bsize]; + const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane); + const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size]; + const int bw = mi_size_wide[txb_size]; + const int bh = mi_size_high[txb_size]; + int block = 0; + const int step = + tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size]; + av1_get_entropy_contexts(plane_bsize, pd, ctx.ta[plane], ctx.tl[plane]); + av1_subtract_plane(x, plane_bsize, plane); + arg.ta = ctx.ta[plane]; + arg.tl = ctx.tl[plane]; + const BLOCK_SIZE max_unit_bsize = + get_plane_block_size(BLOCK_64X64, subsampling_x, subsampling_y); + int mu_blocks_wide = mi_size_wide[max_unit_bsize]; + int mu_blocks_high = mi_size_high[max_unit_bsize]; + mu_blocks_wide = AOMMIN(mi_width, mu_blocks_wide); + mu_blocks_high = AOMMIN(mi_height, mu_blocks_high); + + for (int idy = 0; idy < mi_height; idy += mu_blocks_high) { + for (int idx = 0; idx < mi_width; idx += mu_blocks_wide) { + int blk_row, blk_col; + const int unit_height = AOMMIN(mu_blocks_high + idy, mi_height); + const int unit_width = AOMMIN(mu_blocks_wide + idx, mi_width); + for (blk_row = idy; blk_row < unit_height; blk_row += bh) { + for (blk_col = idx; blk_col < unit_width; blk_col += bw) { + encode_block_inter(plane, block, blk_row, blk_col, plane_bsize, + max_tx_size, &arg, dry_run); + block += step; + } + } + } + } + } +} + +static void encode_block_intra_and_set_context(int plane, int block, + int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { + av1_encode_block_intra(plane, block, blk_row, blk_col, plane_bsize, tx_size, + arg); + + struct encode_b_args *const args = arg; + MACROBLOCK *x = args->x; + ENTROPY_CONTEXT *a = &args->ta[blk_col]; + ENTROPY_CONTEXT *l = &args->tl[blk_row]; + av1_set_txb_context(x, plane, block, tx_size, a, l); +} + +void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + void *arg) { + struct encode_b_args *const args = arg; + const AV1_COMP *const cpi = args->cpi; + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + tran_low_t *dqcoeff = p->dqcoeff + BLOCK_OFFSET(block); + PLANE_TYPE plane_type = get_plane_type(plane); + uint16_t *eob = &p->eobs[block]; + const int dst_stride = pd->dst.stride; + uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2]; + int dummy_rate_cost = 0; + + av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size); + + TX_TYPE tx_type = DCT_DCT; + const int bw = mi_size_wide[plane_bsize]; + if (plane == 0 && 
+                   is_blk_skip(x->txfm_search_info.blk_skip, plane,
+                               blk_row * bw + blk_col)) {
+    *eob = 0;
+    p->txb_entropy_ctx[block] = 0;
+  } else {
+    av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
+
+    const ENTROPY_CONTEXT *a = &args->ta[blk_col];
+    const ENTROPY_CONTEXT *l = &args->tl[blk_row];
+    tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size,
+                              cm->features.reduced_tx_set_used);
+    TxfmParam txfm_param;
+    QUANT_PARAM quant_param;
+    const int use_trellis =
+        is_trellis_used(args->enable_optimize_b, args->dry_run);
+    int quant_idx;
+    if (use_trellis)
+      quant_idx = AV1_XFORM_QUANT_FP;
+    else
+      quant_idx =
+          USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP;
+
+    av1_setup_xform(cm, x, tx_size, tx_type, &txfm_param);
+    av1_setup_quant(tx_size, use_trellis, quant_idx,
+                    cpi->oxcf.q_cfg.quant_b_adapt, &quant_param);
+    av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
+                      &quant_param);
+
+    av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize,
+                    &txfm_param, &quant_param);
+
+    // Decide whether trellis and/or dropout optimization is applied:
+    // KEY_BLOCK_OPT_TYPE governs intra-only frames, while
+    // INTRA_BLOCK_OPT_TYPE governs intra blocks in inter frames.
+    const bool do_trellis = (frame_is_intra_only(cm) &&
+                             (KEY_BLOCK_OPT_TYPE == TRELLIS_OPT ||
+                              KEY_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT)) ||
+                            (!frame_is_intra_only(cm) &&
+                             (INTRA_BLOCK_OPT_TYPE == TRELLIS_OPT ||
+                              INTRA_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT));
+    const bool do_dropout = (frame_is_intra_only(cm) &&
+                             (KEY_BLOCK_OPT_TYPE == DROPOUT_OPT ||
+                              KEY_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT)) ||
+                            (!frame_is_intra_only(cm) &&
+                             (INTRA_BLOCK_OPT_TYPE == DROPOUT_OPT ||
+                              INTRA_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT));
+
+    if (quant_param.use_optimize_b && do_trellis) {
+      TXB_CTX txb_ctx;
+      get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
+      av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx,
+                     &dummy_rate_cost);
+    }
+    if (do_dropout) {
+      av1_dropout_qcoeff(x, plane, block, tx_size, tx_type,
+                         cm->quant_params.base_qindex);
+    }
+  }
+
+  if (*eob) {
+    av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst,
+                                dst_stride, *eob,
+                                cm->features.reduced_tx_set_used);
+  }
+
+  // TODO(jingning): Temporarily disable the txk_type check for the eob=0
+  // case. It is possible that a hash-index collision could cause the
+  // assertion to fail. To further optimize the rate-distortion performance,
+  // we need to re-visit this part and enable this assert again.
+  if (*eob == 0 && plane == 0) {
+#if 0
+    if (args->cpi->oxcf.q_cfg.aq_mode == NO_AQ &&
+        args->cpi->oxcf.q_cfg.deltaq_mode == NO_DELTA_Q) {
+      assert(xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col] ==
+             DCT_DCT);
+    }
+#endif
+    update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT);
+  }
+
+  // For intra mode, skipped blocks are so rare that transmitting
+  // skip_txfm = 1 is very expensive.
+  mbmi->skip_txfm = 0;
+
+  if (plane == AOM_PLANE_Y && xd->cfl.store_y) {
+    cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize);
+  }
+}
+
+void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x,
+                                  BLOCK_SIZE bsize, int plane, RUN_TYPE dry_run,
+                                  TRELLIS_OPT_TYPE enable_optimize_b) {
+  assert(bsize < BLOCK_SIZES_ALL);
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  if (plane && !xd->is_chroma_ref) return;
+
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int ss_x = pd->subsampling_x;
+  const int ss_y = pd->subsampling_y;
+  ENTROPY_CONTEXT ta[MAX_MIB_SIZE] = { 0 };
+  ENTROPY_CONTEXT tl[MAX_MIB_SIZE] = { 0 };
+  struct encode_b_args arg = {
+    cpi, x, NULL, ta, tl, dry_run, enable_optimize_b
+  };
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
+  if (enable_optimize_b) {
+    av1_get_entropy_contexts(plane_bsize, pd, ta, tl);
+  }
+  av1_foreach_transformed_block_in_plane(
+      xd, plane_bsize, plane, encode_block_intra_and_set_context, &arg);
+}
diff --git a/third_party/aom/av1/encoder/encodemb.h b/third_party/aom/av1/encoder/encodemb.h
new file mode 100644
index 0000000000..f97bf8f517
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodemb.h
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODEMB_H_
+#define AOM_AV1_ENCODER_ENCODEMB_H_
+
+#include "config/aom_config.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/txb_common.h"
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/tokenize.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum {
+  AV1_XFORM_QUANT_FP = 0,
+  AV1_XFORM_QUANT_B = 1,
+  AV1_XFORM_QUANT_DC = 2,
+  AV1_XFORM_QUANT_SKIP_QUANT,
+  AV1_XFORM_QUANT_TYPES,
+} UENUM1BYTE(AV1_XFORM_QUANT);
+
+// TODO(any): Merge OPT_TYPE and TRELLIS_OPT_TYPE
+// Available optimization types to optimize the quantized coefficients.
+enum {
+  NONE_OPT = 0,            // No optimization.
+  TRELLIS_OPT = 1,         // Trellis optimization. See `av1_optimize_b()`.
+  DROPOUT_OPT = 2,         // Dropout optimization. See `av1_dropout_qcoeff()`.
+  TRELLIS_DROPOUT_OPT = 3  // Perform dropout after trellis optimization.
+} UENUM1BYTE(OPT_TYPE);
+
+enum {
+  NO_TRELLIS_OPT,          // No trellis optimization
+  FULL_TRELLIS_OPT,        // Trellis optimization in all stages
+  FINAL_PASS_TRELLIS_OPT,  // Trellis optimization in only the final encode
+                           // pass
+  NO_ESTIMATE_YRD_TRELLIS_OPT  // Disable trellis in estimate_yrd_for_sb
+} UENUM1BYTE(TRELLIS_OPT_TYPE);
+
+struct optimize_ctx {
+  ENTROPY_CONTEXT ta[MAX_MB_PLANE][MAX_MIB_SIZE];
+  ENTROPY_CONTEXT tl[MAX_MB_PLANE][MAX_MIB_SIZE];
+};
+
+struct encode_b_args {
+  const struct AV1_COMP *cpi;
+  MACROBLOCK *x;
+  struct optimize_ctx *ctx;
+  ENTROPY_CONTEXT *ta;
+  ENTROPY_CONTEXT *tl;
+  RUN_TYPE dry_run;
+  TRELLIS_OPT_TYPE enable_optimize_b;
+};
+
+void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+                   RUN_TYPE dry_run);
+
+void av1_foreach_transformed_block_in_plane(
+    const MACROBLOCKD *const xd, BLOCK_SIZE plane_bsize, int plane,
+    foreach_transformed_block_visitor visit, void *arg);
+
+void av1_encode_sby_pass1(struct AV1_COMP *cpi, MACROBLOCK *x,
+                          BLOCK_SIZE bsize);
+
+void av1_setup_xform(const AV1_COMMON *cm, MACROBLOCK *x, TX_SIZE tx_size,
+                     TX_TYPE tx_type, TxfmParam *txfm_param);
+void av1_setup_quant(TX_SIZE tx_size, int use_optimize_b, int xform_quant_idx,
+                     int use_quant_b_adapt, QUANT_PARAM *qparam);
+void av1_setup_qmatrix(const CommonQuantParams *quant_params,
+                       const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
+                       TX_TYPE tx_type, QUANT_PARAM *qparam);
+
+void av1_xform_dc_only(MACROBLOCK *x, int plane, int block,
+                       TxfmParam *txfm_param, int64_t per_px_mean);
+
+void av1_xform_quant(MACROBLOCK *x, int plane, int block, int blk_row,
+                     int blk_col, BLOCK_SIZE plane_bsize,
+                     TxfmParam *txfm_param, const QUANT_PARAM *qparam);
+
+void av1_xform(MACROBLOCK *x, int plane, int block, int blk_row, int blk_col,
+               BLOCK_SIZE plane_bsize, TxfmParam *txfm_param);
+
+void av1_quant(MACROBLOCK *x, int plane, int block, TxfmParam *txfm_param,
+               const QUANT_PARAM *qparam);
+
+int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *mb, int plane,
+                   int block, TX_SIZE tx_size, TX_TYPE tx_type,
+                   const TXB_CTX *const txb_ctx, int *rate_cost);
+
+// This function can be used as (i) a further optimization to reduce the
+// redundancy of quantized coefficients (a.k.a. `qcoeff`) after trellis
+// optimization, or (ii) an alternative to trellis optimization in high-speed
+// compression mode (e.g., real-time mode under speed-6) due to its low time
+// complexity. The rationale is to drop out quantized coefficients that are
+// likely redundant because they sit among long runs of zeros. NOTE: This
+// algorithm is not as accurate as trellis optimization, since the
+// hyper-parameters are hard-coded rather than searched dynamically. More
+// adaptive logic may improve the performance. This function can be applied
+// to all or part of a block's coefficients.
+// Inputs:
+//   mb: Pointer to the MACROBLOCK to perform dropout on.
+//   plane: Index of the plane to which the target block belongs.
+//   block: Index of the target block.
+//   tx_size: Transform size of the target block.
+//   tx_type: Transform type of the target block. This field is particularly
+//            used to find out the scan order of the block.
+//   qindex: Quantization index used for the target block. In general, all
+//           blocks in the same plane share the same quantization index. This
+//           field is particularly used to determine how many zeros should be
+//           used to drop out a coefficient.
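+//   (Illustration: with a high qindex, an isolated small coefficient
+//   surrounded by sufficiently long zero runs is dropped, since coding it
+//   would likely cost more bits than the distortion it saves.)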
+// Returns: +// Nothing will be returned, but `qcoeff`, `dqcoeff`, `eob`, as well as +// `txb_entropy_ctx`, which `mb` points to, may be modified by this function. +void av1_dropout_qcoeff(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, + TX_TYPE tx_type, int qindex); +// Same as above, with the number of zeroes needed before/after a coeff to drop +// it explicitly passed in, instead of being derived from qindex. +void av1_dropout_qcoeff_num(MACROBLOCK *mb, int plane, int block, + TX_SIZE tx_size, TX_TYPE tx_type, + int dropout_num_before, int dropout_num_after); + +void av1_subtract_block(BitDepthInfo bd_info, int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src8, + ptrdiff_t src_stride, const uint8_t *pred8, + ptrdiff_t pred_stride); + +void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, + int blk_col, int blk_row, TX_SIZE tx_size); + +void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE plane_bsize, int plane); + +static INLINE void av1_set_txb_context(MACROBLOCK *x, int plane, int block, + TX_SIZE tx_size, ENTROPY_CONTEXT *a, + ENTROPY_CONTEXT *l) { + const uint8_t ctx = x->plane[plane].txb_entropy_ctx[block]; + memset(a, ctx, tx_size_wide_unit[tx_size] * sizeof(*a)); + memset(l, ctx, tx_size_high_unit[tx_size] * sizeof(*l)); +} + +void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg); + +void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int plane, RUN_TYPE dry_run, + TRELLIS_OPT_TYPE enable_optimize_b); + +static INLINE int is_trellis_used(TRELLIS_OPT_TYPE optimize_b, + RUN_TYPE dry_run) { + if (optimize_b == NO_TRELLIS_OPT) return false; + if (optimize_b == FINAL_PASS_TRELLIS_OPT && dry_run != OUTPUT_ENABLED) + return false; + return true; +} + +// Scaling terms (precision of 12 bits) to perform tx-size specific +// normalization that is used in DCT_DCT forward transform. +// For transform blocks of 1:2 and 2:1 - sqrt(2) normalization is used +// For transform blocks of 1:4 and 4:1 - factor of 2 is used +// For transform blocks TX_8x8 and below - an additional factor of 2 is used +// For transform blocks max(width,height)=64 - currently not supported + +static const uint16_t dc_coeff_scale[TX_SIZES_ALL] = { + 1024, 2048, 4096, 4096, 0, 1448, 1448, 2896, 2896, 2896, + 2896, 0, 0, 2048, 2048, 4096, 4096, 0, 0 +}; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_ENCODEMB_H_ diff --git a/third_party/aom/av1/encoder/encodemv.c b/third_party/aom/av1/encoder/encodemv.c new file mode 100644 index 0000000000..7cae72c159 --- /dev/null +++ b/third_party/aom/av1/encoder/encodemv.c @@ -0,0 +1,345 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <math.h>
+
+#include "av1/common/common.h"
+#include "av1/common/entropymode.h"
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodemv.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/bitops.h"
+
+static void update_mv_component_stats(int comp, nmv_component *mvcomp,
+                                      MvSubpelPrecision precision) {
+  assert(comp != 0);
+  int offset;
+  const int sign = comp < 0;
+  const int mag = sign ? -comp : comp;
+  const int mv_class = av1_get_mv_class(mag - 1, &offset);
+  const int d = offset >> 3;         // int mv data
+  const int fr = (offset >> 1) & 3;  // fractional mv data
+  const int hp = offset & 1;         // high precision mv data
+
+  // Sign
+  update_cdf(mvcomp->sign_cdf, sign, 2);
+
+  // Class
+  update_cdf(mvcomp->classes_cdf, mv_class, MV_CLASSES);
+
+  // Integer bits
+  if (mv_class == MV_CLASS_0) {
+    update_cdf(mvcomp->class0_cdf, d, CLASS0_SIZE);
+  } else {
+    const int n = mv_class + CLASS0_BITS - 1;  // number of bits
+    for (int i = 0; i < n; ++i)
+      update_cdf(mvcomp->bits_cdf[i], (d >> i) & 1, 2);
+  }
+  // Fractional bits
+  if (precision > MV_SUBPEL_NONE) {
+    aom_cdf_prob *fp_cdf =
+        mv_class == MV_CLASS_0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf;
+    update_cdf(fp_cdf, fr, MV_FP_SIZE);
+  }
+
+  // High precision bit
+  if (precision > MV_SUBPEL_LOW_PRECISION) {
+    aom_cdf_prob *hp_cdf =
+        mv_class == MV_CLASS_0 ? mvcomp->class0_hp_cdf : mvcomp->hp_cdf;
+    update_cdf(hp_cdf, hp, 2);
+  }
+}
+
+void av1_update_mv_stats(const MV *mv, const MV *ref, nmv_context *mvctx,
+                         MvSubpelPrecision precision) {
+  const MV diff = { mv->row - ref->row, mv->col - ref->col };
+  const MV_JOINT_TYPE j = av1_get_mv_joint(&diff);
+
+  update_cdf(mvctx->joints_cdf, j, MV_JOINTS);
+
+  if (mv_joint_vertical(j))
+    update_mv_component_stats(diff.row, &mvctx->comps[0], precision);
+
+  if (mv_joint_horizontal(j))
+    update_mv_component_stats(diff.col, &mvctx->comps[1], precision);
+}
+
+static void encode_mv_component(aom_writer *w, int comp, nmv_component *mvcomp,
+                                MvSubpelPrecision precision) {
+  assert(comp != 0);
+  int offset;
+  const int sign = comp < 0;
+  const int mag = sign ? -comp : comp;
+  const int mv_class = av1_get_mv_class(mag - 1, &offset);
+  const int d = offset >> 3;         // int mv data
+  const int fr = (offset >> 1) & 3;  // fractional mv data
+  const int hp = offset & 1;         // high precision mv data
+
+  // Sign
+  aom_write_symbol(w, sign, mvcomp->sign_cdf, 2);
+
+  // Class
+  aom_write_symbol(w, mv_class, mvcomp->classes_cdf, MV_CLASSES);
+
+  // Integer bits
+  if (mv_class == MV_CLASS_0) {
+    aom_write_symbol(w, d, mvcomp->class0_cdf, CLASS0_SIZE);
+  } else {
+    int i;
+    const int n = mv_class + CLASS0_BITS - 1;  // number of bits
+    for (i = 0; i < n; ++i)
+      aom_write_symbol(w, (d >> i) & 1, mvcomp->bits_cdf[i], 2);
+  }
+  // Fractional bits
+  if (precision > MV_SUBPEL_NONE) {
+    aom_write_symbol(
+        w, fr,
+        mv_class == MV_CLASS_0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf,
+        MV_FP_SIZE);
+  }
+
+  // High precision bit
+  if (precision > MV_SUBPEL_LOW_PRECISION)
+    aom_write_symbol(
+        w, hp, mv_class == MV_CLASS_0 ? mvcomp->class0_hp_cdf : mvcomp->hp_cdf,
+        2);
+}
+
+/* TODO(siekyleb@amazon.com): This function writes MV_VALS ints, or 128 KiB.
+ * That is more than most L1D caches and a significant chunk of L2. Write
+ * SIMD that uses streaming writes to avoid loading all of that into L1, or
+ * just don't update the larger component costs every time this is called
+ * (or both).
+ */
+void av1_build_nmv_component_cost_table(int *mvcost,
+                                        const nmv_component *const mvcomp,
+                                        MvSubpelPrecision precision) {
+  int i, j, v, o, mantissa;
+  int sign_cost[2], class_cost[MV_CLASSES], class0_cost[CLASS0_SIZE];
+  int bits_cost[MV_OFFSET_BITS][2];
+  int class0_fp_cost[CLASS0_SIZE][MV_FP_SIZE] = { 0 },
+      fp_cost[MV_FP_SIZE] = { 0 };
+  int class0_hp_cost[2] = { 0 }, hp_cost[2] = { 0 };
+
+  av1_cost_tokens_from_cdf(sign_cost, mvcomp->sign_cdf, NULL);
+  av1_cost_tokens_from_cdf(class_cost, mvcomp->classes_cdf, NULL);
+  av1_cost_tokens_from_cdf(class0_cost, mvcomp->class0_cdf, NULL);
+  for (i = 0; i < MV_OFFSET_BITS; ++i) {
+    av1_cost_tokens_from_cdf(bits_cost[i], mvcomp->bits_cdf[i], NULL);
+  }
+
+  if (precision > MV_SUBPEL_NONE) {
+    for (i = 0; i < CLASS0_SIZE; ++i)
+      av1_cost_tokens_from_cdf(class0_fp_cost[i], mvcomp->class0_fp_cdf[i],
+                               NULL);
+    av1_cost_tokens_from_cdf(fp_cost, mvcomp->fp_cdf, NULL);
+  }
+
+  if (precision > MV_SUBPEL_LOW_PRECISION) {
+    av1_cost_tokens_from_cdf(class0_hp_cost, mvcomp->class0_hp_cdf, NULL);
+    av1_cost_tokens_from_cdf(hp_cost, mvcomp->hp_cdf, NULL);
+  }
+
+  // Instead of accumulating the cost of each vector component's bits
+  // individually, compute the costs based on smaller vectors. Costs for
+  // [2^exp, 2 * 2^exp - 1] are calculated from the costs already computed
+  // for [0, 2^exp - 1]. Offsets are maintained to swap 1) the class cost of
+  // the complete vector component against the cost of its highest set bit
+  // when treated as a mantissa (significand), and 2) leading zeros to
+  // account for the current exponent.
+
+  // Cost offsets
+  int cost_swap[MV_OFFSET_BITS] = { 0 };
+  // Delta to convert positive vector to negative vector costs
+  int negate_sign = sign_cost[1] - sign_cost[0];
+
+  // Initialize with offsets to swap the class costs with the costs of the
+  // highest set bit.
+  for (i = 1; i < MV_OFFSET_BITS; ++i) {
+    cost_swap[i] = bits_cost[i - 1][1];
+    if (i > CLASS0_BITS) cost_swap[i] -= class_cost[i - CLASS0_BITS];
+  }
+
+  // Seed the fractional costs onto the output (overwritten later).
+  for (o = 0; o < MV_FP_SIZE; ++o) {
+    int hp;
+    for (hp = 0; hp < 2; ++hp) {
+      v = 2 * o + hp + 1;
+      mvcost[v] = fp_cost[o] + hp_cost[hp] + sign_cost[0];
+    }
+  }
+
+  mvcost[0] = 0;
+  // Fill the costs for each exponent's vectors, using the costs set in the
+  // previous exponents.
+  for (i = 0; i < MV_OFFSET_BITS; ++i) {
+    const int exponent = (2 * MV_FP_SIZE) << i;
+
+    int class = 0;
+    if (i >= CLASS0_BITS) {
+      class = class_cost[i - CLASS0_BITS + 1];
+    }
+
+    // Iterate through mantissas, keeping track of the location
+    // of the highest set bit for the mantissa.
+    // To be clear: in the outer loop, the position of the highest set bit
+    // (exponent) is tracked and, in this loop, the highest set bit of the
+    // mantissa is tracked.
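+    // Illustration: iteration i fills in the costs for magnitudes in
+    // ((2 * MV_FP_SIZE) << i, (2 * MV_FP_SIZE) << (i + 1)], reusing the
+    // costs already stored for smaller magnitudes via mvcost[mantissa + 1].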
+ mantissa = 0; + for (j = 0; j <= i; ++j) { + for (; mantissa < (2 * MV_FP_SIZE) << j; ++mantissa) { + int cost = mvcost[mantissa + 1] + class + cost_swap[j]; + v = exponent + mantissa + 1; + mvcost[v] = cost; + mvcost[-v] = cost + negate_sign; + } + cost_swap[j] += bits_cost[i][0]; + } + } + + // Special case to avoid buffer overrun + { + int exponent = (2 * MV_FP_SIZE) << MV_OFFSET_BITS; + int class = class_cost[MV_CLASSES - 1]; + mantissa = 0; + for (j = 0; j < MV_OFFSET_BITS; ++j) { + for (; mantissa < (2 * MV_FP_SIZE) << j; ++mantissa) { + int cost = mvcost[mantissa + 1] + class + cost_swap[j]; + v = exponent + mantissa + 1; + mvcost[v] = cost; + mvcost[-v] = cost + negate_sign; + } + } + // At this point: mantissa = exponent >> 1 + + // Manually calculate the final cost offset + int cost_swap_hi = + bits_cost[MV_OFFSET_BITS - 1][1] - class_cost[MV_CLASSES - 2]; + for (; mantissa < exponent - 1; ++mantissa) { + int cost = mvcost[mantissa + 1] + class + cost_swap_hi; + v = exponent + mantissa + 1; + mvcost[v] = cost; + mvcost[-v] = cost + negate_sign; + } + } + + // Fill costs for class0 vectors, overwriting previous placeholder values + // used for calculating the costs of the larger vectors. + for (i = 0; i < CLASS0_SIZE; ++i) { + const int top = i * 2 * MV_FP_SIZE; + for (o = 0; o < MV_FP_SIZE; ++o) { + int hp; + int cost = class0_fp_cost[i][o] + class_cost[0] + class0_cost[i]; + for (hp = 0; hp < 2; ++hp) { + v = top + 2 * o + hp + 1; + mvcost[v] = cost + class0_hp_cost[hp] + sign_cost[0]; + mvcost[-v] = cost + class0_hp_cost[hp] + sign_cost[1]; + } + } + } +} + +void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, ThreadData *td, const MV *mv, + const MV *ref, nmv_context *mvctx, int usehp) { + const MV diff = { mv->row - ref->row, mv->col - ref->col }; + const MV_JOINT_TYPE j = av1_get_mv_joint(&diff); + // If the mv_diff is zero, then we should have used near or nearest instead. + assert(j != MV_JOINT_ZERO); + if (cpi->common.features.cur_frame_force_integer_mv) { + usehp = MV_SUBPEL_NONE; + } + aom_write_symbol(w, j, mvctx->joints_cdf, MV_JOINTS); + if (mv_joint_vertical(j)) + encode_mv_component(w, diff.row, &mvctx->comps[0], usehp); + + if (mv_joint_horizontal(j)) + encode_mv_component(w, diff.col, &mvctx->comps[1], usehp); + + // If auto_mv_step_size is enabled then keep track of the largest + // motion vector component used. + if (cpi->sf.mv_sf.auto_mv_step_size) { + int maxv = AOMMAX(abs(mv->row), abs(mv->col)) >> 3; + td->max_mv_magnitude = AOMMAX(maxv, td->max_mv_magnitude); + } +} + +void av1_encode_dv(aom_writer *w, const MV *mv, const MV *ref, + nmv_context *mvctx) { + // DV and ref DV should not have sub-pel. 
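+  // MVs are stored in 1/8-pel units, so a full-pel displacement has its low
+  // three bits clear; e.g. a DV of { .row = -8, .col = 16 } means one pixel
+  // up and two pixels right.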
+ assert((mv->col & 7) == 0); + assert((mv->row & 7) == 0); + assert((ref->col & 7) == 0); + assert((ref->row & 7) == 0); + const MV diff = { mv->row - ref->row, mv->col - ref->col }; + const MV_JOINT_TYPE j = av1_get_mv_joint(&diff); + + aom_write_symbol(w, j, mvctx->joints_cdf, MV_JOINTS); + if (mv_joint_vertical(j)) + encode_mv_component(w, diff.row, &mvctx->comps[0], MV_SUBPEL_NONE); + + if (mv_joint_horizontal(j)) + encode_mv_component(w, diff.col, &mvctx->comps[1], MV_SUBPEL_NONE); +} + +void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2], + const nmv_context *ctx, + MvSubpelPrecision precision) { + av1_cost_tokens_from_cdf(mvjoint, ctx->joints_cdf, NULL); + av1_build_nmv_component_cost_table(mvcost[0], &ctx->comps[0], precision); + av1_build_nmv_component_cost_table(mvcost[1], &ctx->comps[1], precision); +} + +int_mv av1_get_ref_mv_from_stack(int ref_idx, + const MV_REFERENCE_FRAME *ref_frame, + int ref_mv_idx, + const MB_MODE_INFO_EXT *mbmi_ext) { + const int8_t ref_frame_type = av1_ref_frame_type(ref_frame); + const CANDIDATE_MV *curr_ref_mv_stack = + mbmi_ext->ref_mv_stack[ref_frame_type]; + + if (ref_frame[1] > INTRA_FRAME) { + assert(ref_idx == 0 || ref_idx == 1); + return ref_idx ? curr_ref_mv_stack[ref_mv_idx].comp_mv + : curr_ref_mv_stack[ref_mv_idx].this_mv; + } + + assert(ref_idx == 0); + return ref_mv_idx < mbmi_ext->ref_mv_count[ref_frame_type] + ? curr_ref_mv_stack[ref_mv_idx].this_mv + : mbmi_ext->global_mvs[ref_frame_type]; +} + +int_mv av1_get_ref_mv(const MACROBLOCK *x, int ref_idx) { + const MACROBLOCKD *xd = &x->e_mbd; + const MB_MODE_INFO *mbmi = xd->mi[0]; + int ref_mv_idx = mbmi->ref_mv_idx; + if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) { + assert(has_second_ref(mbmi)); + ref_mv_idx += 1; + } + return av1_get_ref_mv_from_stack(ref_idx, mbmi->ref_frame, ref_mv_idx, + &x->mbmi_ext); +} + +void av1_find_best_ref_mvs_from_stack(int allow_hp, + const MB_MODE_INFO_EXT *mbmi_ext, + MV_REFERENCE_FRAME ref_frame, + int_mv *nearest_mv, int_mv *near_mv, + int is_integer) { + const int ref_idx = 0; + MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, NONE_FRAME }; + *nearest_mv = av1_get_ref_mv_from_stack(ref_idx, ref_frames, 0, mbmi_ext); + lower_mv_precision(&nearest_mv->as_mv, allow_hp, is_integer); + *near_mv = av1_get_ref_mv_from_stack(ref_idx, ref_frames, 1, mbmi_ext); + lower_mv_precision(&near_mv->as_mv, allow_hp, is_integer); +} diff --git a/third_party/aom/av1/encoder/encodemv.h b/third_party/aom/av1/encoder/encodemv.h new file mode 100644 index 0000000000..c39001a5a2 --- /dev/null +++ b/third_party/aom/av1/encoder/encodemv.h @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_ENCODEMV_H_ +#define AOM_AV1_ENCODER_ENCODEMV_H_ + +#include "av1/encoder/encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, ThreadData *td, const MV *mv, + const MV *ref, nmv_context *mvctx, int usehp); + +void av1_update_mv_stats(const MV *mv, const MV *ref, nmv_context *mvctx, + MvSubpelPrecision precision); + +void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2], + const nmv_context *mvctx, + MvSubpelPrecision precision); +void av1_build_nmv_component_cost_table(int *mvcost, + const nmv_component *const mvcomp, + MvSubpelPrecision precision); + +void av1_update_mv_count(ThreadData *td); + +void av1_encode_dv(aom_writer *w, const MV *mv, const MV *ref, + nmv_context *mvctx); +int_mv av1_get_ref_mv(const MACROBLOCK *x, int ref_idx); +int_mv av1_get_ref_mv_from_stack(int ref_idx, + const MV_REFERENCE_FRAME *ref_frame, + int ref_mv_idx, + const MB_MODE_INFO_EXT *mbmi_ext); +void av1_find_best_ref_mvs_from_stack(int allow_hp, + const MB_MODE_INFO_EXT *mbmi_ext, + MV_REFERENCE_FRAME ref_frame, + int_mv *nearest_mv, int_mv *near_mv, + int is_integer); + +static INLINE MV_JOINT_TYPE av1_get_mv_joint(const MV *mv) { + // row: Z col: Z | MV_JOINT_ZERO (0) + // row: Z col: NZ | MV_JOINT_HNZVZ (1) + // row: NZ col: Z | MV_JOINT_HZVNZ (2) + // row: NZ col: NZ | MV_JOINT_HNZVNZ (3) + return (!!mv->col) | ((!!mv->row) << 1); +} + +static INLINE int av1_mv_class_base(MV_CLASS_TYPE c) { + return c ? CLASS0_SIZE << (c + 2) : 0; +} + +// If n != 0, returns the floor of log base 2 of n. If n == 0, returns 0. +static INLINE uint8_t av1_log_in_base_2(unsigned int n) { + // get_msb() is only valid when n != 0. + return n == 0 ? 0 : get_msb(n); +} + +static INLINE MV_CLASS_TYPE av1_get_mv_class(int z, int *offset) { + assert(z >= 0); + const MV_CLASS_TYPE c = (MV_CLASS_TYPE)av1_log_in_base_2(z >> 3); + assert(c <= MV_CLASS_10); + if (offset) *offset = z - av1_mv_class_base(c); + return c; +} + +static INLINE int av1_check_newmv_joint_nonzero(const AV1_COMMON *cm, + MACROBLOCK *const x) { + (void)cm; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + const PREDICTION_MODE this_mode = mbmi->mode; + if (this_mode == NEW_NEWMV) { + const int_mv ref_mv_0 = av1_get_ref_mv(x, 0); + const int_mv ref_mv_1 = av1_get_ref_mv(x, 1); + if (mbmi->mv[0].as_int == ref_mv_0.as_int || + mbmi->mv[1].as_int == ref_mv_1.as_int) { + return 0; + } + } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) { + const int_mv ref_mv_1 = av1_get_ref_mv(x, 1); + if (mbmi->mv[1].as_int == ref_mv_1.as_int) { + return 0; + } + } else if (this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV) { + const int_mv ref_mv_0 = av1_get_ref_mv(x, 0); + if (mbmi->mv[0].as_int == ref_mv_0.as_int) { + return 0; + } + } else if (this_mode == NEWMV) { + const int_mv ref_mv_0 = av1_get_ref_mv(x, 0); + if (mbmi->mv[0].as_int == ref_mv_0.as_int) { + return 0; + } + } + return 1; +} +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_ENCODEMV_H_ diff --git a/third_party/aom/av1/encoder/encoder.c b/third_party/aom/av1/encoder/encoder.c new file mode 100644 index 0000000000..4732ad435b --- /dev/null +++ b/third_party/aom/av1/encoder/encoder.c @@ -0,0 +1,5409 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <float.h>
+#include <limits.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "av1/common/scale.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aomcx.h"
+
+#if CONFIG_DENOISE
+#include "aom_dsp/grain_table.h"
+#include "aom_dsp/noise_util.h"
+#include "aom_dsp/noise_model.h"
+#endif
+#include "aom_dsp/flow_estimation/corner_detect.h"
+#include "aom_dsp/psnr.h"
+#if CONFIG_INTERNAL_STATS
+#include "aom_dsp/ssim.h"
+#endif
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "aom_scale/aom_scale.h"
+#if CONFIG_BITSTREAM_DEBUG
+#include "aom_util/debug_util.h"
+#endif  // CONFIG_BITSTREAM_DEBUG
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/filter.h"
+#include "av1/common/idct.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/resize.h"
+#include "av1/common/tile_common.h"
+
+#include "av1/encoder/allintra_vis.h"
+#include "av1/encoder/aq_complexity.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/bitstream.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/dwt.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encode_strategy.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encoder_alloc.h"
+#include "av1/encoder/encoder_utils.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/hash_motion.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/encoder/intra_mode_search.h"
+#include "av1/encoder/mv_prec.h"
+#include "av1/encoder/pass2_strategy.h"
+#include "av1/encoder/pickcdef.h"
+#include "av1/encoder/picklpf.h"
+#include "av1/encoder/pickrst.h"
+#include "av1/encoder/random.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rc_utils.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/rdopt.h"
+#if CONFIG_SALIENCY_MAP
+#include "av1/encoder/saliency_map.h"
+#endif
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/speed_features.h"
+#include "av1/encoder/superres_scale.h"
+#include "av1/encoder/thirdpass.h"
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/var_based_part.h"
+
+#define DEFAULT_EXPLICIT_ORDER_HINT_BITS 7
+
+// #define OUTPUT_YUV_REC
+#ifdef OUTPUT_YUV_REC
+FILE *yuv_rec_file;
+#define FILE_NAME_LEN 100
+#endif
+
+#ifdef OUTPUT_YUV_DENOISED
+FILE *yuv_denoised_file = NULL;
+#endif
+
+static INLINE void Scale2Ratio(AOM_SCALING_MODE mode, int *hr, int *hs) {
+  switch (mode) {
+    case AOME_NORMAL:
+      *hr = 1;
+      *hs = 1;
+      break;
+    case AOME_FOURFIVE:
+      *hr = 4;
+      *hs = 5;
+      break;
+    case AOME_THREEFIVE:
+      *hr = 3;
+      *hs = 5;
+      break;
+    case AOME_THREEFOUR:
+      *hr = 3;
+      *hs = 4;
+      break;
+    case AOME_ONEFOUR:
+      *hr = 1;
+      *hs = 4;
+      break;
+    case AOME_ONEEIGHT:
+      *hr = 1;
+      *hs = 8;
+      break;
+    case AOME_ONETWO:
+      *hr = 1;
+      *hs = 2;
+      break;
+    case AOME_TWOTHREE:
+      *hr = 2;
+      *hs = 3;
+      break;
+    case AOME_ONETHREE:
+      *hr = 1;
+      *hs = 3;
+      break;
+    default:
+      *hr = 1;
+      *hs = 1;
+      assert(0);
+      break;
+  }
+}
+
+int av1_set_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows,
+                       int cols) {
+  const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+  if (rows == mi_params->mb_rows && cols == mi_params->mb_cols) {
+    unsigned char *const active_map_4x4 = cpi->active_map.map;
+    const int mi_rows = mi_params->mi_rows;
+    const int mi_cols = mi_params->mi_cols;
+    const int row_scale = mi_size_high_log2[BLOCK_16X16];
+    const int col_scale = mi_size_wide_log2[BLOCK_16X16];
+    cpi->active_map.update = 0;
+    assert(mi_rows % 2 == 0);
+    assert(mi_cols % 2 == 0);
+    if (new_map_16x16) {
+      for (int r = 0; r < (mi_rows >> row_scale); ++r) {
+        for (int c = 0; c < (mi_cols >> col_scale); ++c) {
+          const uint8_t val = new_map_16x16[r * cols + c]
+                                  ? AM_SEGMENT_ID_ACTIVE
+                                  : AM_SEGMENT_ID_INACTIVE;
+          // Write at the same offsets that av1_get_active_map() reads below.
+          active_map_4x4[(2 * r + 0) * mi_cols + (2 * c + 0)] = val;
+          active_map_4x4[(2 * r + 0) * mi_cols + (2 * c + 1)] = val;
+          active_map_4x4[(2 * r + 1) * mi_cols + (2 * c + 0)] = val;
+          active_map_4x4[(2 * r + 1) * mi_cols + (2 * c + 1)] = val;
+        }
+      }
+      cpi->active_map.enabled = 1;
+    }
+    return 0;
+  }
+
+  return -1;
+}
+
+int av1_get_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows,
+                       int cols) {
+  const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+  if (rows == mi_params->mb_rows && cols == mi_params->mb_cols &&
+      new_map_16x16) {
+    unsigned char *const seg_map_8x8 = cpi->enc_seg.map;
+    const int mi_rows = mi_params->mi_rows;
+    const int mi_cols = mi_params->mi_cols;
+    const int row_scale = mi_size_high_log2[BLOCK_16X16];
+    const int col_scale = mi_size_wide_log2[BLOCK_16X16];
+    assert(mi_rows % 2 == 0);
+    assert(mi_cols % 2 == 0);
+
+    memset(new_map_16x16, !cpi->active_map.enabled, rows * cols);
+    if (cpi->active_map.enabled) {
+      for (int r = 0; r < (mi_rows >> row_scale); ++r) {
+        for (int c = 0; c < (mi_cols >> col_scale); ++c) {
+          // Cyclic refresh segments are considered active despite not having
+          // AM_SEGMENT_ID_ACTIVE
+          uint8_t temp = 0;
+          temp |= seg_map_8x8[(2 * r + 0) * mi_cols + (2 * c + 0)] !=
+                  AM_SEGMENT_ID_INACTIVE;
+          temp |= seg_map_8x8[(2 * r + 0) * mi_cols + (2 * c + 1)] !=
+                  AM_SEGMENT_ID_INACTIVE;
+          temp |= seg_map_8x8[(2 * r + 1) * mi_cols + (2 * c + 0)] !=
+                  AM_SEGMENT_ID_INACTIVE;
+          temp |= seg_map_8x8[(2 * r + 1) * mi_cols + (2 * c + 1)] !=
+                  AM_SEGMENT_ID_INACTIVE;
+          new_map_16x16[r * cols + c] |= temp;
+        }
+      }
+    }
+    return 0;
+  }
+
+  return -1;
+}
+
+void av1_initialize_enc(unsigned int usage, enum aom_rc_mode end_usage) {
+  bool is_allintra = usage == ALLINTRA;
+
+  av1_rtcd();
+  aom_dsp_rtcd();
+  aom_scale_rtcd();
+  av1_init_intra_predictors();
+  av1_init_me_luts();
+  if (!is_allintra) av1_init_wedge_masks();
+  if (!is_allintra || end_usage != AOM_Q) av1_rc_init_minq_luts();
+}
+
+void av1_new_framerate(AV1_COMP *cpi, double framerate) {
+  cpi->framerate = framerate < 0.1 ? 30 : framerate;
+  av1_rc_update_framerate(cpi, cpi->common.width, cpi->common.height);
+}
+
+double av1_get_compression_ratio(const AV1_COMMON *const cm,
+                                 size_t encoded_frame_size) {
+  const int upscaled_width = cm->superres_upscaled_width;
+  const int height = cm->height;
+  const int64_t luma_pic_size = (int64_t)upscaled_width * height;
+  const SequenceHeader *const seq_params = cm->seq_params;
+  const BITSTREAM_PROFILE profile = seq_params->profile;
+  const int pic_size_profile_factor =
+      profile == PROFILE_0 ? 15 : (profile == PROFILE_1 ? 30 : 36);
+  encoded_frame_size =
+      (encoded_frame_size > 129 ?
encoded_frame_size - 128 : 1); + const int64_t uncompressed_frame_size = + (luma_pic_size * pic_size_profile_factor) >> 3; + return (double)uncompressed_frame_size / encoded_frame_size; +} + +static void auto_tile_size_balancing(AV1_COMMON *const cm, int num_sbs, + int num_tiles_lg, int tile_col_row) { + CommonTileParams *const tiles = &cm->tiles; + int i, start_sb; + int size_sb = num_sbs >> num_tiles_lg; + int res_sbs = num_sbs - (size_sb << num_tiles_lg); + int num_tiles = 1 << num_tiles_lg; + int inc_index = num_tiles - res_sbs; + + tiles->uniform_spacing = 0; + + for (i = 0, start_sb = 0; start_sb < num_sbs && i < MAX_TILE_COLS; ++i) { + if (i == inc_index) ++size_sb; + if (tile_col_row) + tiles->col_start_sb[i] = start_sb; + else + tiles->row_start_sb[i] = start_sb; + + start_sb += AOMMIN(size_sb, tiles->max_width_sb); + } + + if (tile_col_row) { + tiles->cols = i; + tiles->col_start_sb[i] = num_sbs; + } else { + tiles->rows = i; + tiles->row_start_sb[i] = num_sbs; + } +} + +static void set_tile_info(AV1_COMMON *const cm, + const TileConfig *const tile_cfg) { + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const SequenceHeader *const seq_params = cm->seq_params; + CommonTileParams *const tiles = &cm->tiles; + int i, start_sb; + + av1_get_tile_limits(cm); + + int sb_cols = + CEIL_POWER_OF_TWO(mi_params->mi_cols, seq_params->mib_size_log2); + // configure tile columns + if (tile_cfg->tile_width_count == 0 || tile_cfg->tile_height_count == 0) { + tiles->uniform_spacing = 1; + tiles->log2_cols = AOMMAX(tile_cfg->tile_columns, tiles->min_log2_cols); + // Add a special case to handle super resolution + sb_cols = coded_to_superres_mi(sb_cols, cm->superres_scale_denominator); + int min_log2_cols = 0; + for (; (tiles->max_width_sb << min_log2_cols) <= sb_cols; ++min_log2_cols) { + } + tiles->log2_cols = AOMMAX(tiles->log2_cols, min_log2_cols); + + tiles->log2_cols = AOMMIN(tiles->log2_cols, tiles->max_log2_cols); + } else if (tile_cfg->tile_widths[0] < 0) { + auto_tile_size_balancing(cm, sb_cols, tile_cfg->tile_columns, 1); + } else { + int size_sb, j = 0; + tiles->uniform_spacing = 0; + for (i = 0, start_sb = 0; start_sb < sb_cols && i < MAX_TILE_COLS; i++) { + tiles->col_start_sb[i] = start_sb; + size_sb = tile_cfg->tile_widths[j++]; + if (j >= tile_cfg->tile_width_count) j = 0; + start_sb += AOMMIN(size_sb, tiles->max_width_sb); + } + tiles->cols = i; + tiles->col_start_sb[i] = sb_cols; + } + av1_calculate_tile_cols(seq_params, mi_params->mi_rows, mi_params->mi_cols, + tiles); + + // configure tile rows + int sb_rows = + CEIL_POWER_OF_TWO(mi_params->mi_rows, seq_params->mib_size_log2); + if (tiles->uniform_spacing) { + tiles->log2_rows = AOMMAX(tile_cfg->tile_rows, tiles->min_log2_rows); + tiles->log2_rows = AOMMIN(tiles->log2_rows, tiles->max_log2_rows); + } else if (tile_cfg->tile_heights[0] < 0) { + auto_tile_size_balancing(cm, sb_rows, tile_cfg->tile_rows, 0); + } else { + int size_sb, j = 0; + for (i = 0, start_sb = 0; start_sb < sb_rows && i < MAX_TILE_ROWS; i++) { + tiles->row_start_sb[i] = start_sb; + size_sb = tile_cfg->tile_heights[j++]; + if (j >= tile_cfg->tile_height_count) j = 0; + start_sb += AOMMIN(size_sb, tiles->max_height_sb); + } + tiles->rows = i; + tiles->row_start_sb[i] = sb_rows; + } + av1_calculate_tile_rows(seq_params, mi_params->mi_rows, tiles); +} + +void av1_update_frame_size(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + + // Setup mi_params here in case we need more mi's. 
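+  // ("mi" units are 4x4 luma samples, so e.g. a 1920x1080 frame has
+  // mi_cols = 480 and mi_rows = 270.)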
+ CommonModeInfoParams *const mi_params = &cm->mi_params; + mi_params->set_mb_mi(mi_params, cm->width, cm->height, + cpi->sf.part_sf.default_min_partition_size); + + av1_init_macroblockd(cm, xd); + + if (!cpi->ppi->seq_params_locked) + set_sb_size(cm->seq_params, + av1_select_sb_size(&cpi->oxcf, cm->width, cm->height, + cpi->ppi->number_spatial_layers)); + + set_tile_info(cm, &cpi->oxcf.tile_cfg); +} + +static INLINE int does_level_match(int width, int height, double fps, + int lvl_width, int lvl_height, + double lvl_fps, int lvl_dim_mult) { + const int64_t lvl_luma_pels = (int64_t)lvl_width * lvl_height; + const double lvl_display_sample_rate = lvl_luma_pels * lvl_fps; + const int64_t luma_pels = (int64_t)width * height; + const double display_sample_rate = luma_pels * fps; + return luma_pels <= lvl_luma_pels && + display_sample_rate <= lvl_display_sample_rate && + width <= lvl_width * lvl_dim_mult && + height <= lvl_height * lvl_dim_mult; +} + +static void set_bitstream_level_tier(AV1_PRIMARY *const ppi, int width, + int height, double init_framerate) { + SequenceHeader *const seq_params = &ppi->seq_params; + const AV1LevelParams *const level_params = &ppi->level_params; + // TODO(any): This is a placeholder function that only addresses dimensions + // and max display sample rates. + // Need to add checks for max bit rate, max decoded luma sample rate, header + // rate, etc. that are not covered by this function. + AV1_LEVEL level = SEQ_LEVEL_MAX; + if (does_level_match(width, height, init_framerate, 512, 288, 30.0, 4)) { + level = SEQ_LEVEL_2_0; + } else if (does_level_match(width, height, init_framerate, 704, 396, 30.0, + 4)) { + level = SEQ_LEVEL_2_1; + } else if (does_level_match(width, height, init_framerate, 1088, 612, 30.0, + 4)) { + level = SEQ_LEVEL_3_0; + } else if (does_level_match(width, height, init_framerate, 1376, 774, 30.0, + 4)) { + level = SEQ_LEVEL_3_1; + } else if (does_level_match(width, height, init_framerate, 2048, 1152, 30.0, + 3)) { + level = SEQ_LEVEL_4_0; + } else if (does_level_match(width, height, init_framerate, 2048, 1152, 60.0, + 3)) { + level = SEQ_LEVEL_4_1; + } else if (does_level_match(width, height, init_framerate, 4096, 2176, 30.0, + 2)) { + level = SEQ_LEVEL_5_0; + } else if (does_level_match(width, height, init_framerate, 4096, 2176, 60.0, + 2)) { + level = SEQ_LEVEL_5_1; + } else if (does_level_match(width, height, init_framerate, 4096, 2176, 120.0, + 2)) { + level = SEQ_LEVEL_5_2; + } else if (does_level_match(width, height, init_framerate, 8192, 4352, 30.0, + 2)) { + level = SEQ_LEVEL_6_0; + } else if (does_level_match(width, height, init_framerate, 8192, 4352, 60.0, + 2)) { + level = SEQ_LEVEL_6_1; + } else if (does_level_match(width, height, init_framerate, 8192, 4352, 120.0, + 2)) { + level = SEQ_LEVEL_6_2; + } +#if CONFIG_CWG_C013 + // TODO(bohanli): currently target level is only working for the 0th operating + // point, so scalable coding is not supported. + else if (level_params->target_seq_level_idx[0] >= SEQ_LEVEL_7_0 && + level_params->target_seq_level_idx[0] <= SEQ_LEVEL_8_3) { + // Only use level 7.x to 8.x when explicitly asked to. 
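+    // (Levels 7.x and 8.x correspond to the 16K (16384x8704) and 32K
+    // (32768x17408) dimension checks below.)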
+ if (does_level_match(width, height, init_framerate, 16384, 8704, 30.0, 2)) { + level = SEQ_LEVEL_7_0; + } else if (does_level_match(width, height, init_framerate, 16384, 8704, + 60.0, 2)) { + level = SEQ_LEVEL_7_1; + } else if (does_level_match(width, height, init_framerate, 16384, 8704, + 120.0, 2)) { + level = SEQ_LEVEL_7_2; + } else if (does_level_match(width, height, init_framerate, 32768, 17408, + 30.0, 2)) { + level = SEQ_LEVEL_8_0; + } else if (does_level_match(width, height, init_framerate, 32768, 17408, + 60.0, 2)) { + level = SEQ_LEVEL_8_1; + } else if (does_level_match(width, height, init_framerate, 32768, 17408, + 120.0, 2)) { + level = SEQ_LEVEL_8_2; + } + } +#endif + + for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) { + assert(is_valid_seq_level_idx(level_params->target_seq_level_idx[i]) || + level_params->target_seq_level_idx[i] == SEQ_LEVEL_KEEP_STATS); + // If a higher target level is specified, it is then used rather than the + // inferred one from resolution and framerate. + seq_params->seq_level_idx[i] = + level_params->target_seq_level_idx[i] < SEQ_LEVELS && + level_params->target_seq_level_idx[i] > level + ? level_params->target_seq_level_idx[i] + : level; + // Set the maximum parameters for bitrate and buffer size for this profile, + // level, and tier + seq_params->op_params[i].bitrate = av1_max_level_bitrate( + seq_params->profile, seq_params->seq_level_idx[i], seq_params->tier[i]); + // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass the + // check + if (seq_params->op_params[i].bitrate == 0) + aom_internal_error( + &ppi->error, AOM_CODEC_UNSUP_BITSTREAM, + "AV1 does not support this combination of profile, level, and tier."); + // Buffer size in bits/s is bitrate in bits/s * 1 s + seq_params->op_params[i].buffer_size = seq_params->op_params[i].bitrate; + } +} + +void av1_init_seq_coding_tools(AV1_PRIMARY *const ppi, + const AV1EncoderConfig *oxcf, + int disable_frame_id_numbers) { + SequenceHeader *const seq = &ppi->seq_params; + const FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg; + const ToolCfg *const tool_cfg = &oxcf->tool_cfg; + + seq->still_picture = + !tool_cfg->force_video_mode && (oxcf->input_cfg.limit == 1); + seq->reduced_still_picture_hdr = + seq->still_picture && !tool_cfg->full_still_picture_hdr; + seq->force_screen_content_tools = 2; + seq->force_integer_mv = 2; + seq->order_hint_info.enable_order_hint = tool_cfg->enable_order_hint; + seq->frame_id_numbers_present_flag = + !seq->reduced_still_picture_hdr && + !oxcf->tile_cfg.enable_large_scale_tile && + tool_cfg->error_resilient_mode && !disable_frame_id_numbers; + if (seq->reduced_still_picture_hdr) { + seq->order_hint_info.enable_order_hint = 0; + seq->force_screen_content_tools = 2; + seq->force_integer_mv = 2; + } + seq->order_hint_info.order_hint_bits_minus_1 = + seq->order_hint_info.enable_order_hint + ? DEFAULT_EXPLICIT_ORDER_HINT_BITS - 1 + : -1; + + seq->max_frame_width = frm_dim_cfg->forced_max_frame_width + ? frm_dim_cfg->forced_max_frame_width + : frm_dim_cfg->width; + seq->max_frame_height = frm_dim_cfg->forced_max_frame_height + ? frm_dim_cfg->forced_max_frame_height + : frm_dim_cfg->height; + seq->num_bits_width = + (seq->max_frame_width > 1) ? get_msb(seq->max_frame_width - 1) + 1 : 1; + seq->num_bits_height = + (seq->max_frame_height > 1) ? 
get_msb(seq->max_frame_height - 1) + 1 : 1;
+  assert(seq->num_bits_width <= 16);
+  assert(seq->num_bits_height <= 16);
+
+  seq->frame_id_length = FRAME_ID_LENGTH;
+  seq->delta_frame_id_length = DELTA_FRAME_ID_LENGTH;
+
+  seq->enable_dual_filter = tool_cfg->enable_dual_filter;
+  seq->order_hint_info.enable_dist_wtd_comp =
+      oxcf->comp_type_cfg.enable_dist_wtd_comp;
+  seq->order_hint_info.enable_dist_wtd_comp &=
+      seq->order_hint_info.enable_order_hint;
+  seq->order_hint_info.enable_ref_frame_mvs = tool_cfg->ref_frame_mvs_present;
+  seq->order_hint_info.enable_ref_frame_mvs &=
+      seq->order_hint_info.enable_order_hint;
+  seq->enable_superres = oxcf->superres_cfg.enable_superres;
+  seq->enable_cdef = tool_cfg->cdef_control != CDEF_NONE ? 1 : 0;
+  seq->enable_restoration = tool_cfg->enable_restoration;
+  seq->enable_warped_motion = oxcf->motion_mode_cfg.enable_warped_motion;
+  seq->enable_interintra_compound = tool_cfg->enable_interintra_comp;
+  seq->enable_masked_compound = oxcf->comp_type_cfg.enable_masked_comp;
+  seq->enable_intra_edge_filter = oxcf->intra_mode_cfg.enable_intra_edge_filter;
+  seq->enable_filter_intra = oxcf->intra_mode_cfg.enable_filter_intra;
+
+  set_bitstream_level_tier(ppi, frm_dim_cfg->width, frm_dim_cfg->height,
+                           oxcf->input_cfg.init_framerate);
+
+  if (seq->operating_points_cnt_minus_1 == 0) {
+    seq->operating_point_idc[0] = 0;
+  } else {
+    // Set operating_point_idc[] such that the i=0 point corresponds to the
+    // highest quality operating point (all layers), and subsequent
+    // operating points (i > 0) are lower quality, corresponding to skipping
+    // the decoding of enhancement layers (temporal first).
+    int i = 0;
+    assert(seq->operating_points_cnt_minus_1 ==
+           (int)(ppi->number_spatial_layers * ppi->number_temporal_layers -
+                 1));
+    for (unsigned int sl = 0; sl < ppi->number_spatial_layers; sl++) {
+      for (unsigned int tl = 0; tl < ppi->number_temporal_layers; tl++) {
+        seq->operating_point_idc[i] =
+            (~(~0u << (ppi->number_spatial_layers - sl)) << 8) |
+            ~(~0u << (ppi->number_temporal_layers - tl));
+        i++;
+      }
+    }
+  }
+}
+
+static void init_config_sequence(struct AV1_PRIMARY *ppi,
+                                 const AV1EncoderConfig *oxcf) {
+  SequenceHeader *const seq_params = &ppi->seq_params;
+  const DecoderModelCfg *const dec_model_cfg = &oxcf->dec_model_cfg;
+  const ColorCfg *const color_cfg = &oxcf->color_cfg;
+
+  ppi->use_svc = 0;
+  ppi->number_spatial_layers = 1;
+  ppi->number_temporal_layers = 1;
+
+  seq_params->profile = oxcf->profile;
+  seq_params->bit_depth = oxcf->tool_cfg.bit_depth;
+  seq_params->use_highbitdepth = oxcf->use_highbitdepth;
+  seq_params->color_primaries = color_cfg->color_primaries;
+  seq_params->transfer_characteristics = color_cfg->transfer_characteristics;
+  seq_params->matrix_coefficients = color_cfg->matrix_coefficients;
+  seq_params->monochrome = oxcf->tool_cfg.enable_monochrome;
+  seq_params->chroma_sample_position = color_cfg->chroma_sample_position;
+  seq_params->color_range = color_cfg->color_range;
+  seq_params->timing_info_present = dec_model_cfg->timing_info_present;
+  seq_params->timing_info.num_units_in_display_tick =
+      dec_model_cfg->timing_info.num_units_in_display_tick;
+  seq_params->timing_info.time_scale = dec_model_cfg->timing_info.time_scale;
+  seq_params->timing_info.equal_picture_interval =
+      dec_model_cfg->timing_info.equal_picture_interval;
+  seq_params->timing_info.num_ticks_per_picture =
+      dec_model_cfg->timing_info.num_ticks_per_picture;
+
+  seq_params->display_model_info_present_flag =
+      dec_model_cfg->display_model_info_present_flag;
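+  // Example (illustrative): with num_units_in_display_tick = 1001 and
+  // time_scale = 30000, one display tick lasts 1001/30000 s, which is the
+  // familiar 29.97 fps cadence when equal_picture_interval is set.
+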
seq_params->decoder_model_info_present_flag = + dec_model_cfg->decoder_model_info_present_flag; + if (dec_model_cfg->decoder_model_info_present_flag) { + // set the decoder model parameters in schedule mode + seq_params->decoder_model_info.num_units_in_decoding_tick = + dec_model_cfg->num_units_in_decoding_tick; + ppi->buffer_removal_time_present = 1; + av1_set_aom_dec_model_info(&seq_params->decoder_model_info); + av1_set_dec_model_op_parameters(&seq_params->op_params[0]); + } else if (seq_params->timing_info_present && + seq_params->timing_info.equal_picture_interval && + !seq_params->decoder_model_info_present_flag) { + // set the decoder model parameters in resource availability mode + av1_set_resource_availability_parameters(&seq_params->op_params[0]); + } else { + seq_params->op_params[0].initial_display_delay = + 10; // Default value (not signaled) + } + + if (seq_params->monochrome) { + seq_params->subsampling_x = 1; + seq_params->subsampling_y = 1; + } else if (seq_params->color_primaries == AOM_CICP_CP_BT_709 && + seq_params->transfer_characteristics == AOM_CICP_TC_SRGB && + seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) { + seq_params->subsampling_x = 0; + seq_params->subsampling_y = 0; + } else { + if (seq_params->profile == 0) { + seq_params->subsampling_x = 1; + seq_params->subsampling_y = 1; + } else if (seq_params->profile == 1) { + seq_params->subsampling_x = 0; + seq_params->subsampling_y = 0; + } else { + if (seq_params->bit_depth == AOM_BITS_12) { + seq_params->subsampling_x = oxcf->input_cfg.chroma_subsampling_x; + seq_params->subsampling_y = oxcf->input_cfg.chroma_subsampling_y; + } else { + seq_params->subsampling_x = 1; + seq_params->subsampling_y = 0; + } + } + } + av1_change_config_seq(ppi, oxcf, NULL); +} + +static void init_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { + AV1_COMMON *const cm = &cpi->common; + ResizePendingParams *resize_pending_params = &cpi->resize_pending_params; + + cpi->oxcf = *oxcf; + cpi->framerate = oxcf->input_cfg.init_framerate; + + cm->width = oxcf->frm_dim_cfg.width; + cm->height = oxcf->frm_dim_cfg.height; + cpi->is_dropped_frame = false; + + alloc_compressor_data(cpi); + + cpi->data_alloc_width = cm->width; + cpi->data_alloc_height = cm->height; + cpi->frame_size_related_setup_done = false; + + // Single thread case: use counts in common. + cpi->td.counts = &cpi->counts; + + // Init SVC parameters. + cpi->svc.number_spatial_layers = 1; + cpi->svc.number_temporal_layers = 1; + cm->spatial_layer_id = 0; + cm->temporal_layer_id = 0; + // Init rtc_ref parameters. 
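+  // rtc_ref holds the reference structure that RTC callers may set
+  // explicitly per frame (e.g. via the SVC ref-frame-config controls);
+  // cleared here until the application provides one.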
+ cpi->ppi->rtc_ref.set_ref_frame_config = 0; + cpi->ppi->rtc_ref.non_reference_frame = 0; + cpi->ppi->rtc_ref.ref_frame_comp[0] = 0; + cpi->ppi->rtc_ref.ref_frame_comp[1] = 0; + cpi->ppi->rtc_ref.ref_frame_comp[2] = 0; + + // This call covers all of the configuration work that initial setup and + // later config changes have in common. + av1_change_config(cpi, oxcf, false); + + cpi->ref_frame_flags = 0; + + // Reset resize pending flags. + resize_pending_params->width = 0; + resize_pending_params->height = 0; + + // Set up the identity scale factor. + av1_setup_scale_factors_for_frame(&cm->sf_identity, 1, 1, 1, 1); + + init_buffer_indices(&cpi->force_intpel_info, cm->remapped_ref_idx); + + av1_noise_estimate_init(&cpi->noise_estimate, cm->width, cm->height); +} + +void av1_change_config_seq(struct AV1_PRIMARY *ppi, + const AV1EncoderConfig *oxcf, + bool *is_sb_size_changed) { + SequenceHeader *const seq_params = &ppi->seq_params; + const FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg; + const DecoderModelCfg *const dec_model_cfg = &oxcf->dec_model_cfg; + const ColorCfg *const color_cfg = &oxcf->color_cfg; + + if (seq_params->profile != oxcf->profile) seq_params->profile = oxcf->profile; + seq_params->bit_depth = oxcf->tool_cfg.bit_depth; + seq_params->color_primaries = color_cfg->color_primaries; + seq_params->transfer_characteristics = color_cfg->transfer_characteristics; + seq_params->matrix_coefficients = color_cfg->matrix_coefficients; + seq_params->monochrome = oxcf->tool_cfg.enable_monochrome; + seq_params->chroma_sample_position = color_cfg->chroma_sample_position; + seq_params->color_range = color_cfg->color_range; + + assert(IMPLIES(seq_params->profile <= PROFILE_1, + seq_params->bit_depth <= AOM_BITS_10)); + + seq_params->timing_info_present = dec_model_cfg->timing_info_present; + seq_params->timing_info.num_units_in_display_tick = + dec_model_cfg->timing_info.num_units_in_display_tick; + seq_params->timing_info.time_scale = dec_model_cfg->timing_info.time_scale; + seq_params->timing_info.equal_picture_interval = + dec_model_cfg->timing_info.equal_picture_interval; + seq_params->timing_info.num_ticks_per_picture = + dec_model_cfg->timing_info.num_ticks_per_picture; + + seq_params->display_model_info_present_flag = + dec_model_cfg->display_model_info_present_flag; + seq_params->decoder_model_info_present_flag = + dec_model_cfg->decoder_model_info_present_flag; + if (dec_model_cfg->decoder_model_info_present_flag) { + // set the decoder model parameters in schedule mode + seq_params->decoder_model_info.num_units_in_decoding_tick = + dec_model_cfg->num_units_in_decoding_tick; + ppi->buffer_removal_time_present = 1; + av1_set_aom_dec_model_info(&seq_params->decoder_model_info); + av1_set_dec_model_op_parameters(&seq_params->op_params[0]); + } else if (seq_params->timing_info_present && + seq_params->timing_info.equal_picture_interval && + !seq_params->decoder_model_info_present_flag) { + // set the decoder model parameters in resource availability mode + av1_set_resource_availability_parameters(&seq_params->op_params[0]); + } else { + seq_params->op_params[0].initial_display_delay = + 10; // Default value (not signaled) + } + + av1_update_film_grain_parameters_seq(ppi, oxcf); + + int sb_size = seq_params->sb_size; + // Superblock size should not be updated after the first key frame.
+ if (!ppi->seq_params_locked) { + set_sb_size(seq_params, av1_select_sb_size(oxcf, frm_dim_cfg->width, + frm_dim_cfg->height, + ppi->number_spatial_layers)); + for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) + seq_params->tier[i] = (oxcf->tier_mask >> i) & 1; + } + if (is_sb_size_changed != NULL && sb_size != seq_params->sb_size) + *is_sb_size_changed = true; + + // Init sequence-level coding tools. + // This should not be called after the first key frame. + if (!ppi->seq_params_locked) { + seq_params->operating_points_cnt_minus_1 = + (ppi->number_spatial_layers > 1 || ppi->number_temporal_layers > 1) + ? ppi->number_spatial_layers * ppi->number_temporal_layers - 1 + : 0; + av1_init_seq_coding_tools( + ppi, oxcf, ppi->use_svc || ppi->rtc_ref.set_ref_frame_config); + } + seq_params->timing_info_present &= !seq_params->reduced_still_picture_hdr; + +#if CONFIG_AV1_HIGHBITDEPTH + highbd_set_var_fns(ppi); +#endif + + set_primary_rc_buffer_sizes(oxcf, ppi); +} + +void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf, + bool is_sb_size_changed) { + AV1_COMMON *const cm = &cpi->common; + SequenceHeader *const seq_params = cm->seq_params; + RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + MACROBLOCK *const x = &cpi->td.mb; + AV1LevelParams *const level_params = &cpi->ppi->level_params; + RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame; + const FrameDimensionCfg *const frm_dim_cfg = &cpi->oxcf.frm_dim_cfg; + const RateControlCfg *const rc_cfg = &oxcf->rc_cfg; + FeatureFlags *const features = &cm->features; + + // In case of LAP, lag in frames is set according to the number of LAP + // buffers calculated at init time. This stores and restores LAP's lag in + // frames to prevent it from being overridden by the new config. + int lap_lag_in_frames = -1; + if (cpi->ppi->lap_enabled && cpi->compressor_stage == LAP_STAGE) { + lap_lag_in_frames = cpi->oxcf.gf_cfg.lag_in_frames; + } + + cpi->oxcf = *oxcf; + + av1_update_film_grain_parameters(cpi, oxcf); + + // When the user provides superres_mode = AOM_SUPERRES_AUTO, we still + // initialize the superres mode for the current encoding to + // AOM_SUPERRES_NONE. This ensures that any analysis (e.g. TPL) happening + // outside the main encoding loop still happens at full resolution. + // The value is set appropriately just before the main encoding loop. + cpi->superres_mode = oxcf->superres_cfg.superres_mode == AOM_SUPERRES_AUTO + ? AOM_SUPERRES_NONE + : oxcf->superres_cfg.superres_mode; // default + x->e_mbd.bd = (int)seq_params->bit_depth; + x->e_mbd.global_motion = cm->global_motion; + + memcpy(level_params->target_seq_level_idx, cpi->oxcf.target_seq_level_idx, + sizeof(level_params->target_seq_level_idx)); + level_params->keep_level_stats = 0; + for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) { + if (level_params->target_seq_level_idx[i] < SEQ_LEVELS || + level_params->target_seq_level_idx[i] == SEQ_LEVEL_KEEP_STATS) { + level_params->keep_level_stats |= 1u << i; + if (!level_params->level_info[i]) { + CHECK_MEM_ERROR(cm, level_params->level_info[i], + aom_calloc(1, sizeof(*level_params->level_info[i]))); + } + } + } + + // TODO(huisu@): level targeting currently only works for the 0th operating + // point, so scalable coding is not supported yet. + if (level_params->target_seq_level_idx[0] < SEQ_LEVELS) { + // Adjust encoder config in order to meet target level.
+ config_target_level(cpi, level_params->target_seq_level_idx[0], + seq_params->tier[0]); + } + + if (has_no_stats_stage(cpi) && (rc_cfg->mode == AOM_Q)) { + p_rc->baseline_gf_interval = FIXED_GF_INTERVAL; + } else if (!is_one_pass_rt_params(cpi) || + cm->current_frame.frame_number == 0) { + // For rtc mode: logic for setting the baseline_gf_interval is done + // in av1_get_one_pass_rt_params(), and it should not be reset here in + // change_config(), unless after init_config (first frame). + p_rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2; + } + + refresh_frame->golden_frame = false; + refresh_frame->bwd_ref_frame = false; + + features->refresh_frame_context = + (oxcf->tool_cfg.frame_parallel_decoding_mode) + ? REFRESH_FRAME_CONTEXT_DISABLED + : REFRESH_FRAME_CONTEXT_BACKWARD; + if (oxcf->tile_cfg.enable_large_scale_tile) + features->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED; + + if (x->palette_buffer == NULL) { + CHECK_MEM_ERROR(cm, x->palette_buffer, + aom_memalign(16, sizeof(*x->palette_buffer))); + } + + if (x->tmp_conv_dst == NULL) { + CHECK_MEM_ERROR( + cm, x->tmp_conv_dst, + aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * sizeof(*x->tmp_conv_dst))); + x->e_mbd.tmp_conv_dst = x->tmp_conv_dst; + } + // The buffers 'tmp_pred_bufs[]' and 'comp_rd_buffer' are used in inter frames + // to store intermediate inter mode prediction results and are not required + // for allintra encoding mode. Hence, the memory allocations for these buffers + // are avoided for allintra encoding mode. + if (cpi->oxcf.kf_cfg.key_freq_max != 0) { + if (x->comp_rd_buffer.pred0 == NULL) + alloc_compound_type_rd_buffers(cm->error, &x->comp_rd_buffer); + + for (int i = 0; i < 2; ++i) { + if (x->tmp_pred_bufs[i] == NULL) { + CHECK_MEM_ERROR(cm, x->tmp_pred_bufs[i], + aom_memalign(32, 2 * MAX_MB_PLANE * MAX_SB_SQUARE * + sizeof(*x->tmp_pred_bufs[i]))); + x->e_mbd.tmp_obmc_bufs[i] = x->tmp_pred_bufs[i]; + } + } + } + + av1_reset_segment_features(cm); + + av1_set_high_precision_mv(cpi, 1, 0); + + // Under a configuration change, where maximum_buffer_size may change, + // keep buffer level clipped to the maximum allowed buffer size. + p_rc->bits_off_target = + AOMMIN(p_rc->bits_off_target, p_rc->maximum_buffer_size); + p_rc->buffer_level = AOMMIN(p_rc->buffer_level, p_rc->maximum_buffer_size); + + // Set up frame rate and related parameters rate control values. + av1_new_framerate(cpi, cpi->framerate); + + // Set absolute upper and lower quality limits + rc->worst_quality = rc_cfg->worst_allowed_q; + rc->best_quality = rc_cfg->best_allowed_q; + + // If lossless has been requested make sure average Q accumulators are reset. + if (is_lossless_requested(&cpi->oxcf.rc_cfg)) { + int i; + for (i = 0; i < FRAME_TYPES; ++i) { + p_rc->avg_frame_qindex[i] = 0; + } + } + + features->interp_filter = + oxcf->tile_cfg.enable_large_scale_tile ? 
EIGHTTAP_REGULAR : SWITCHABLE; + features->switchable_motion_mode = is_switchable_motion_mode_allowed( + features->allow_warped_motion, oxcf->motion_mode_cfg.enable_obmc); + + if (frm_dim_cfg->render_width > 0 && frm_dim_cfg->render_height > 0) { + cm->render_width = frm_dim_cfg->render_width; + cm->render_height = frm_dim_cfg->render_height; + } else { + cm->render_width = frm_dim_cfg->width; + cm->render_height = frm_dim_cfg->height; + } + cm->width = frm_dim_cfg->width; + cm->height = frm_dim_cfg->height; + + if (cm->width > cpi->data_alloc_width || + cm->height > cpi->data_alloc_height || is_sb_size_changed) { + av1_free_context_buffers(cm); + av1_free_shared_coeff_buffer(&cpi->td.shared_coeff_buf); + av1_free_sms_tree(&cpi->td); + av1_free_pmc(cpi->td.firstpass_ctx, av1_num_planes(cm)); + cpi->td.firstpass_ctx = NULL; + alloc_compressor_data(cpi); + realloc_segmentation_maps(cpi); + cpi->data_alloc_width = cm->width; + cpi->data_alloc_height = cm->height; + cpi->frame_size_related_setup_done = false; + } + av1_update_frame_size(cpi); + + rc->is_src_frame_alt_ref = 0; + + if (!cpi->ppi->rtc_ref.set_ref_frame_config) + cpi->ext_flags.refresh_frame.update_pending = 0; + cpi->ext_flags.refresh_frame_context_pending = 0; + + if (cpi->ppi->use_svc) + av1_update_layer_context_change_config(cpi, rc_cfg->target_bandwidth); + + check_reset_rc_flag(cpi); + + // Restore the value of lag_in_frames for the LAP stage. + if (lap_lag_in_frames != -1) { + cpi->oxcf.gf_cfg.lag_in_frames = lap_lag_in_frames; + } + +#if CONFIG_REALTIME_ONLY + assert(!oxcf->tool_cfg.enable_global_motion); + cpi->image_pyramid_levels = 0; +#else + if (oxcf->tool_cfg.enable_global_motion) { + cpi->image_pyramid_levels = + global_motion_pyr_levels[default_global_motion_method]; + } else { + cpi->image_pyramid_levels = 0; + } +#endif // CONFIG_REALTIME_ONLY +} + +static INLINE void init_frame_info(FRAME_INFO *frame_info, + const AV1_COMMON *const cm) { + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const SequenceHeader *const seq_params = cm->seq_params; + frame_info->frame_width = cm->width; + frame_info->frame_height = cm->height; + frame_info->mi_cols = mi_params->mi_cols; + frame_info->mi_rows = mi_params->mi_rows; + frame_info->mb_cols = mi_params->mb_cols; + frame_info->mb_rows = mi_params->mb_rows; + frame_info->num_mbs = mi_params->MBs; + frame_info->bit_depth = seq_params->bit_depth; + frame_info->subsampling_x = seq_params->subsampling_x; + frame_info->subsampling_y = seq_params->subsampling_y; +} + +static INLINE void init_frame_index_set(FRAME_INDEX_SET *frame_index_set) { + frame_index_set->show_frame_count = 0; +} + +static INLINE void update_counters_for_show_frame(AV1_COMP *const cpi) { + assert(cpi->common.show_frame); + cpi->frame_index_set.show_frame_count++; + cpi->common.current_frame.frame_number++; +} + +AV1_PRIMARY *av1_create_primary_compressor( + struct aom_codec_pkt_list *pkt_list_head, int num_lap_buffers, + const AV1EncoderConfig *oxcf) { + AV1_PRIMARY *volatile const ppi = aom_memalign(32, sizeof(AV1_PRIMARY)); + if (!ppi) return NULL; + av1_zero(*ppi); + + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns.
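+ // (Any aom_internal_error() raised while 'setjmp' is nonzero longjmps + // back to the branch below, which tears down the partially created + // AV1_PRIMARY via av1_remove_primary_compressor().)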
+ if (setjmp(ppi->error.jmp)) { + ppi->error.setjmp = 0; + av1_remove_primary_compressor(ppi); + return 0; + } + ppi->error.setjmp = 1; + + ppi->seq_params_locked = 0; + ppi->lap_enabled = num_lap_buffers > 0; + ppi->output_pkt_list = pkt_list_head; + ppi->b_calculate_psnr = CONFIG_INTERNAL_STATS; + ppi->frames_left = oxcf->input_cfg.limit; + ppi->num_fp_contexts = 1; + + init_config_sequence(ppi, oxcf); + +#if CONFIG_ENTROPY_STATS + av1_zero(ppi->aggregate_fc); +#endif // CONFIG_ENTROPY_STATS + + av1_primary_rc_init(oxcf, &ppi->p_rc); + + // For two pass and lag_in_frames > 33 in LAP. + ppi->p_rc.enable_scenecut_detection = ENABLE_SCENECUT_MODE_2; + if (ppi->lap_enabled) { + if ((num_lap_buffers < + (MAX_GF_LENGTH_LAP + SCENE_CUT_KEY_TEST_INTERVAL + 1)) && + num_lap_buffers >= (MAX_GF_LENGTH_LAP + 3)) { + /* + * For lag in frames >= 19 and <33, enable scenecut + * with limited future frame prediction. + */ + ppi->p_rc.enable_scenecut_detection = ENABLE_SCENECUT_MODE_1; + } else if (num_lap_buffers < (MAX_GF_LENGTH_LAP + 3)) { + // Disable scenecut when lag_in_frames < 19. + ppi->p_rc.enable_scenecut_detection = DISABLE_SCENECUT; + } + } + +#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, SDX3DF, JSDAF, JSVAF) \ + ppi->fn_ptr[BT].sdf = SDF; \ + ppi->fn_ptr[BT].sdaf = SDAF; \ + ppi->fn_ptr[BT].vf = VF; \ + ppi->fn_ptr[BT].svf = SVF; \ + ppi->fn_ptr[BT].svaf = SVAF; \ + ppi->fn_ptr[BT].sdx4df = SDX4DF; \ + ppi->fn_ptr[BT].jsdaf = JSDAF; \ + ppi->fn_ptr[BT].jsvaf = JSVAF; \ + ppi->fn_ptr[BT].sdx3df = SDX3DF; + +// Realtime mode doesn't use 4x rectangular blocks. +#if !CONFIG_REALTIME_ONLY + BFP(BLOCK_4X16, aom_sad4x16, aom_sad4x16_avg, aom_variance4x16, + aom_sub_pixel_variance4x16, aom_sub_pixel_avg_variance4x16, + aom_sad4x16x4d, aom_sad4x16x3d, aom_dist_wtd_sad4x16_avg, + aom_dist_wtd_sub_pixel_avg_variance4x16) + + BFP(BLOCK_16X4, aom_sad16x4, aom_sad16x4_avg, aom_variance16x4, + aom_sub_pixel_variance16x4, aom_sub_pixel_avg_variance16x4, + aom_sad16x4x4d, aom_sad16x4x3d, aom_dist_wtd_sad16x4_avg, + aom_dist_wtd_sub_pixel_avg_variance16x4) + + BFP(BLOCK_8X32, aom_sad8x32, aom_sad8x32_avg, aom_variance8x32, + aom_sub_pixel_variance8x32, aom_sub_pixel_avg_variance8x32, + aom_sad8x32x4d, aom_sad8x32x3d, aom_dist_wtd_sad8x32_avg, + aom_dist_wtd_sub_pixel_avg_variance8x32) + + BFP(BLOCK_32X8, aom_sad32x8, aom_sad32x8_avg, aom_variance32x8, + aom_sub_pixel_variance32x8, aom_sub_pixel_avg_variance32x8, + aom_sad32x8x4d, aom_sad32x8x3d, aom_dist_wtd_sad32x8_avg, + aom_dist_wtd_sub_pixel_avg_variance32x8) + + BFP(BLOCK_16X64, aom_sad16x64, aom_sad16x64_avg, aom_variance16x64, + aom_sub_pixel_variance16x64, aom_sub_pixel_avg_variance16x64, + aom_sad16x64x4d, aom_sad16x64x3d, aom_dist_wtd_sad16x64_avg, + aom_dist_wtd_sub_pixel_avg_variance16x64) + + BFP(BLOCK_64X16, aom_sad64x16, aom_sad64x16_avg, aom_variance64x16, + aom_sub_pixel_variance64x16, aom_sub_pixel_avg_variance64x16, + aom_sad64x16x4d, aom_sad64x16x3d, aom_dist_wtd_sad64x16_avg, + aom_dist_wtd_sub_pixel_avg_variance64x16) +#endif // !CONFIG_REALTIME_ONLY + + BFP(BLOCK_128X128, aom_sad128x128, aom_sad128x128_avg, aom_variance128x128, + aom_sub_pixel_variance128x128, aom_sub_pixel_avg_variance128x128, + aom_sad128x128x4d, aom_sad128x128x3d, aom_dist_wtd_sad128x128_avg, + aom_dist_wtd_sub_pixel_avg_variance128x128) + + BFP(BLOCK_128X64, aom_sad128x64, aom_sad128x64_avg, aom_variance128x64, + aom_sub_pixel_variance128x64, aom_sub_pixel_avg_variance128x64, + aom_sad128x64x4d, aom_sad128x64x3d, aom_dist_wtd_sad128x64_avg, + 
aom_dist_wtd_sub_pixel_avg_variance128x64) + + BFP(BLOCK_64X128, aom_sad64x128, aom_sad64x128_avg, aom_variance64x128, + aom_sub_pixel_variance64x128, aom_sub_pixel_avg_variance64x128, + aom_sad64x128x4d, aom_sad64x128x3d, aom_dist_wtd_sad64x128_avg, + aom_dist_wtd_sub_pixel_avg_variance64x128) + + BFP(BLOCK_32X16, aom_sad32x16, aom_sad32x16_avg, aom_variance32x16, + aom_sub_pixel_variance32x16, aom_sub_pixel_avg_variance32x16, + aom_sad32x16x4d, aom_sad32x16x3d, aom_dist_wtd_sad32x16_avg, + aom_dist_wtd_sub_pixel_avg_variance32x16) + + BFP(BLOCK_16X32, aom_sad16x32, aom_sad16x32_avg, aom_variance16x32, + aom_sub_pixel_variance16x32, aom_sub_pixel_avg_variance16x32, + aom_sad16x32x4d, aom_sad16x32x3d, aom_dist_wtd_sad16x32_avg, + aom_dist_wtd_sub_pixel_avg_variance16x32) + + BFP(BLOCK_64X32, aom_sad64x32, aom_sad64x32_avg, aom_variance64x32, + aom_sub_pixel_variance64x32, aom_sub_pixel_avg_variance64x32, + aom_sad64x32x4d, aom_sad64x32x3d, aom_dist_wtd_sad64x32_avg, + aom_dist_wtd_sub_pixel_avg_variance64x32) + + BFP(BLOCK_32X64, aom_sad32x64, aom_sad32x64_avg, aom_variance32x64, + aom_sub_pixel_variance32x64, aom_sub_pixel_avg_variance32x64, + aom_sad32x64x4d, aom_sad32x64x3d, aom_dist_wtd_sad32x64_avg, + aom_dist_wtd_sub_pixel_avg_variance32x64) + + BFP(BLOCK_32X32, aom_sad32x32, aom_sad32x32_avg, aom_variance32x32, + aom_sub_pixel_variance32x32, aom_sub_pixel_avg_variance32x32, + aom_sad32x32x4d, aom_sad32x32x3d, aom_dist_wtd_sad32x32_avg, + aom_dist_wtd_sub_pixel_avg_variance32x32) + + BFP(BLOCK_64X64, aom_sad64x64, aom_sad64x64_avg, aom_variance64x64, + aom_sub_pixel_variance64x64, aom_sub_pixel_avg_variance64x64, + aom_sad64x64x4d, aom_sad64x64x3d, aom_dist_wtd_sad64x64_avg, + aom_dist_wtd_sub_pixel_avg_variance64x64) + + BFP(BLOCK_16X16, aom_sad16x16, aom_sad16x16_avg, aom_variance16x16, + aom_sub_pixel_variance16x16, aom_sub_pixel_avg_variance16x16, + aom_sad16x16x4d, aom_sad16x16x3d, aom_dist_wtd_sad16x16_avg, + aom_dist_wtd_sub_pixel_avg_variance16x16) + + BFP(BLOCK_16X8, aom_sad16x8, aom_sad16x8_avg, aom_variance16x8, + aom_sub_pixel_variance16x8, aom_sub_pixel_avg_variance16x8, + aom_sad16x8x4d, aom_sad16x8x3d, aom_dist_wtd_sad16x8_avg, + aom_dist_wtd_sub_pixel_avg_variance16x8) + + BFP(BLOCK_8X16, aom_sad8x16, aom_sad8x16_avg, aom_variance8x16, + aom_sub_pixel_variance8x16, aom_sub_pixel_avg_variance8x16, + aom_sad8x16x4d, aom_sad8x16x3d, aom_dist_wtd_sad8x16_avg, + aom_dist_wtd_sub_pixel_avg_variance8x16) + + BFP(BLOCK_8X8, aom_sad8x8, aom_sad8x8_avg, aom_variance8x8, + aom_sub_pixel_variance8x8, aom_sub_pixel_avg_variance8x8, aom_sad8x8x4d, + aom_sad8x8x3d, aom_dist_wtd_sad8x8_avg, + aom_dist_wtd_sub_pixel_avg_variance8x8) + + BFP(BLOCK_8X4, aom_sad8x4, aom_sad8x4_avg, aom_variance8x4, + aom_sub_pixel_variance8x4, aom_sub_pixel_avg_variance8x4, aom_sad8x4x4d, + aom_sad8x4x3d, aom_dist_wtd_sad8x4_avg, + aom_dist_wtd_sub_pixel_avg_variance8x4) + + BFP(BLOCK_4X8, aom_sad4x8, aom_sad4x8_avg, aom_variance4x8, + aom_sub_pixel_variance4x8, aom_sub_pixel_avg_variance4x8, aom_sad4x8x4d, + aom_sad4x8x3d, aom_dist_wtd_sad4x8_avg, + aom_dist_wtd_sub_pixel_avg_variance4x8) + + BFP(BLOCK_4X4, aom_sad4x4, aom_sad4x4_avg, aom_variance4x4, + aom_sub_pixel_variance4x4, aom_sub_pixel_avg_variance4x4, aom_sad4x4x4d, + aom_sad4x4x3d, aom_dist_wtd_sad4x4_avg, + aom_dist_wtd_sub_pixel_avg_variance4x4) + +#if !CONFIG_REALTIME_ONLY +#define OBFP(BT, OSDF, OVF, OSVF) \ + ppi->fn_ptr[BT].osdf = OSDF; \ + ppi->fn_ptr[BT].ovf = OVF; \ + ppi->fn_ptr[BT].osvf = OSVF; + + OBFP(BLOCK_128X128, 
aom_obmc_sad128x128, aom_obmc_variance128x128, + aom_obmc_sub_pixel_variance128x128) + OBFP(BLOCK_128X64, aom_obmc_sad128x64, aom_obmc_variance128x64, + aom_obmc_sub_pixel_variance128x64) + OBFP(BLOCK_64X128, aom_obmc_sad64x128, aom_obmc_variance64x128, + aom_obmc_sub_pixel_variance64x128) + OBFP(BLOCK_64X64, aom_obmc_sad64x64, aom_obmc_variance64x64, + aom_obmc_sub_pixel_variance64x64) + OBFP(BLOCK_64X32, aom_obmc_sad64x32, aom_obmc_variance64x32, + aom_obmc_sub_pixel_variance64x32) + OBFP(BLOCK_32X64, aom_obmc_sad32x64, aom_obmc_variance32x64, + aom_obmc_sub_pixel_variance32x64) + OBFP(BLOCK_32X32, aom_obmc_sad32x32, aom_obmc_variance32x32, + aom_obmc_sub_pixel_variance32x32) + OBFP(BLOCK_32X16, aom_obmc_sad32x16, aom_obmc_variance32x16, + aom_obmc_sub_pixel_variance32x16) + OBFP(BLOCK_16X32, aom_obmc_sad16x32, aom_obmc_variance16x32, + aom_obmc_sub_pixel_variance16x32) + OBFP(BLOCK_16X16, aom_obmc_sad16x16, aom_obmc_variance16x16, + aom_obmc_sub_pixel_variance16x16) + OBFP(BLOCK_16X8, aom_obmc_sad16x8, aom_obmc_variance16x8, + aom_obmc_sub_pixel_variance16x8) + OBFP(BLOCK_8X16, aom_obmc_sad8x16, aom_obmc_variance8x16, + aom_obmc_sub_pixel_variance8x16) + OBFP(BLOCK_8X8, aom_obmc_sad8x8, aom_obmc_variance8x8, + aom_obmc_sub_pixel_variance8x8) + OBFP(BLOCK_4X8, aom_obmc_sad4x8, aom_obmc_variance4x8, + aom_obmc_sub_pixel_variance4x8) + OBFP(BLOCK_8X4, aom_obmc_sad8x4, aom_obmc_variance8x4, + aom_obmc_sub_pixel_variance8x4) + OBFP(BLOCK_4X4, aom_obmc_sad4x4, aom_obmc_variance4x4, + aom_obmc_sub_pixel_variance4x4) + OBFP(BLOCK_4X16, aom_obmc_sad4x16, aom_obmc_variance4x16, + aom_obmc_sub_pixel_variance4x16) + OBFP(BLOCK_16X4, aom_obmc_sad16x4, aom_obmc_variance16x4, + aom_obmc_sub_pixel_variance16x4) + OBFP(BLOCK_8X32, aom_obmc_sad8x32, aom_obmc_variance8x32, + aom_obmc_sub_pixel_variance8x32) + OBFP(BLOCK_32X8, aom_obmc_sad32x8, aom_obmc_variance32x8, + aom_obmc_sub_pixel_variance32x8) + OBFP(BLOCK_16X64, aom_obmc_sad16x64, aom_obmc_variance16x64, + aom_obmc_sub_pixel_variance16x64) + OBFP(BLOCK_64X16, aom_obmc_sad64x16, aom_obmc_variance64x16, + aom_obmc_sub_pixel_variance64x16) +#endif // !CONFIG_REALTIME_ONLY + +#define MBFP(BT, MCSDF, MCSVF) \ + ppi->fn_ptr[BT].msdf = MCSDF; \ + ppi->fn_ptr[BT].msvf = MCSVF; + + MBFP(BLOCK_128X128, aom_masked_sad128x128, + aom_masked_sub_pixel_variance128x128) + MBFP(BLOCK_128X64, aom_masked_sad128x64, aom_masked_sub_pixel_variance128x64) + MBFP(BLOCK_64X128, aom_masked_sad64x128, aom_masked_sub_pixel_variance64x128) + MBFP(BLOCK_64X64, aom_masked_sad64x64, aom_masked_sub_pixel_variance64x64) + MBFP(BLOCK_64X32, aom_masked_sad64x32, aom_masked_sub_pixel_variance64x32) + MBFP(BLOCK_32X64, aom_masked_sad32x64, aom_masked_sub_pixel_variance32x64) + MBFP(BLOCK_32X32, aom_masked_sad32x32, aom_masked_sub_pixel_variance32x32) + MBFP(BLOCK_32X16, aom_masked_sad32x16, aom_masked_sub_pixel_variance32x16) + MBFP(BLOCK_16X32, aom_masked_sad16x32, aom_masked_sub_pixel_variance16x32) + MBFP(BLOCK_16X16, aom_masked_sad16x16, aom_masked_sub_pixel_variance16x16) + MBFP(BLOCK_16X8, aom_masked_sad16x8, aom_masked_sub_pixel_variance16x8) + MBFP(BLOCK_8X16, aom_masked_sad8x16, aom_masked_sub_pixel_variance8x16) + MBFP(BLOCK_8X8, aom_masked_sad8x8, aom_masked_sub_pixel_variance8x8) + MBFP(BLOCK_4X8, aom_masked_sad4x8, aom_masked_sub_pixel_variance4x8) + MBFP(BLOCK_8X4, aom_masked_sad8x4, aom_masked_sub_pixel_variance8x4) + MBFP(BLOCK_4X4, aom_masked_sad4x4, aom_masked_sub_pixel_variance4x4) + +#if !CONFIG_REALTIME_ONLY + MBFP(BLOCK_4X16, aom_masked_sad4x16, 
aom_masked_sub_pixel_variance4x16) + MBFP(BLOCK_16X4, aom_masked_sad16x4, aom_masked_sub_pixel_variance16x4) + MBFP(BLOCK_8X32, aom_masked_sad8x32, aom_masked_sub_pixel_variance8x32) + MBFP(BLOCK_32X8, aom_masked_sad32x8, aom_masked_sub_pixel_variance32x8) + MBFP(BLOCK_16X64, aom_masked_sad16x64, aom_masked_sub_pixel_variance16x64) + MBFP(BLOCK_64X16, aom_masked_sad64x16, aom_masked_sub_pixel_variance64x16) +#endif + +#define SDSFP(BT, SDSF, SDSX4DF) \ + ppi->fn_ptr[BT].sdsf = SDSF; \ + ppi->fn_ptr[BT].sdsx4df = SDSX4DF; + + SDSFP(BLOCK_128X128, aom_sad_skip_128x128, aom_sad_skip_128x128x4d) + SDSFP(BLOCK_128X64, aom_sad_skip_128x64, aom_sad_skip_128x64x4d) + SDSFP(BLOCK_64X128, aom_sad_skip_64x128, aom_sad_skip_64x128x4d) + SDSFP(BLOCK_64X64, aom_sad_skip_64x64, aom_sad_skip_64x64x4d) + SDSFP(BLOCK_64X32, aom_sad_skip_64x32, aom_sad_skip_64x32x4d) + + SDSFP(BLOCK_32X64, aom_sad_skip_32x64, aom_sad_skip_32x64x4d) + SDSFP(BLOCK_32X32, aom_sad_skip_32x32, aom_sad_skip_32x32x4d) + SDSFP(BLOCK_32X16, aom_sad_skip_32x16, aom_sad_skip_32x16x4d) + + SDSFP(BLOCK_16X32, aom_sad_skip_16x32, aom_sad_skip_16x32x4d) + SDSFP(BLOCK_16X16, aom_sad_skip_16x16, aom_sad_skip_16x16x4d) + SDSFP(BLOCK_16X8, aom_sad_skip_16x8, aom_sad_skip_16x8x4d) + SDSFP(BLOCK_8X16, aom_sad_skip_8x16, aom_sad_skip_8x16x4d) + SDSFP(BLOCK_8X8, aom_sad_skip_8x8, aom_sad_skip_8x8x4d) + + SDSFP(BLOCK_4X8, aom_sad_skip_4x8, aom_sad_skip_4x8x4d) + +#if !CONFIG_REALTIME_ONLY + SDSFP(BLOCK_64X16, aom_sad_skip_64x16, aom_sad_skip_64x16x4d) + SDSFP(BLOCK_16X64, aom_sad_skip_16x64, aom_sad_skip_16x64x4d) + SDSFP(BLOCK_32X8, aom_sad_skip_32x8, aom_sad_skip_32x8x4d) + SDSFP(BLOCK_8X32, aom_sad_skip_8x32, aom_sad_skip_8x32x4d) + SDSFP(BLOCK_4X16, aom_sad_skip_4x16, aom_sad_skip_4x16x4d) +#endif +#undef SDSFP + +#if CONFIG_AV1_HIGHBITDEPTH + highbd_set_var_fns(ppi); +#endif + + { + // As cm->mi_params is a part of the frame level context (cpi), it is + // unavailable at this point. mi_params is created as a local temporary + // variable, to be passed into the functions used for allocating tpl + // buffers. The values in this variable are populated according to initial + // width and height of the frame. 
+ CommonModeInfoParams mi_params; + enc_set_mb_mi(&mi_params, oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height, + BLOCK_4X4); + + const BLOCK_SIZE bsize = BLOCK_16X16; + const int w = mi_size_wide[bsize]; + const int h = mi_size_high[bsize]; + const int num_cols = (mi_params.mi_cols + w - 1) / w; + const int num_rows = (mi_params.mi_rows + h - 1) / h; + AOM_CHECK_MEM_ERROR( + &ppi->error, ppi->tpl_sb_rdmult_scaling_factors, + aom_calloc(num_rows * num_cols, + sizeof(*ppi->tpl_sb_rdmult_scaling_factors))); + +#if CONFIG_INTERNAL_STATS + ppi->b_calculate_blockiness = 1; + ppi->b_calculate_consistency = 1; + + for (int i = 0; i <= STAT_ALL; i++) { + ppi->psnr[0].stat[i] = 0; + ppi->psnr[1].stat[i] = 0; + + ppi->fastssim.stat[i] = 0; + ppi->psnrhvs.stat[i] = 0; + } + + ppi->psnr[0].worst = 100.0; + ppi->psnr[1].worst = 100.0; + ppi->worst_ssim = 100.0; + ppi->worst_ssim_hbd = 100.0; + + ppi->count[0] = 0; + ppi->count[1] = 0; + ppi->total_bytes = 0; + + if (ppi->b_calculate_psnr) { + ppi->total_sq_error[0] = 0; + ppi->total_samples[0] = 0; + ppi->total_sq_error[1] = 0; + ppi->total_samples[1] = 0; + ppi->total_recode_hits = 0; + ppi->summed_quality = 0; + ppi->summed_weights = 0; + ppi->summed_quality_hbd = 0; + ppi->summed_weights_hbd = 0; + } + + ppi->fastssim.worst = 100.0; + ppi->psnrhvs.worst = 100.0; + + if (ppi->b_calculate_blockiness) { + ppi->total_blockiness = 0; + ppi->worst_blockiness = 0.0; + } + + ppi->total_inconsistency = 0; + ppi->worst_consistency = 100.0; + if (ppi->b_calculate_consistency) { + AOM_CHECK_MEM_ERROR(&ppi->error, ppi->ssim_vars, + aom_malloc(sizeof(*ppi->ssim_vars) * 4 * + mi_params.mi_rows * mi_params.mi_cols)); + } +#endif + } + + ppi->error.setjmp = 0; + return ppi; +} + +AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi, const AV1EncoderConfig *oxcf, + BufferPool *const pool, COMPRESSOR_STAGE stage, + int lap_lag_in_frames) { + AV1_COMP *volatile const cpi = aom_memalign(32, sizeof(AV1_COMP)); + + if (!cpi) return NULL; + + av1_zero(*cpi); + + cpi->ppi = ppi; + + AV1_COMMON *volatile const cm = &cpi->common; + cm->seq_params = &ppi->seq_params; + cm->error = + (struct aom_internal_error_info *)aom_calloc(1, sizeof(*cm->error)); + if (!cm->error) { + aom_free(cpi); + return NULL; + } + + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. + if (setjmp(cm->error->jmp)) { + cm->error->setjmp = 0; + av1_remove_compressor(cpi); + return NULL; + } + + cm->error->setjmp = 1; + cpi->compressor_stage = stage; + + cpi->do_frame_data_update = true; + + CommonModeInfoParams *const mi_params = &cm->mi_params; + mi_params->free_mi = enc_free_mi; + mi_params->setup_mi = enc_setup_mi; + mi_params->set_mb_mi = + (oxcf->pass == AOM_RC_FIRST_PASS || cpi->compressor_stage == LAP_STAGE) + ? 
stat_stage_set_mb_mi + : enc_set_mb_mi; + + mi_params->mi_alloc_bsize = BLOCK_4X4; + + CHECK_MEM_ERROR(cm, cm->fc, + (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc))); + CHECK_MEM_ERROR( + cm, cm->default_frame_context, + (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->default_frame_context))); + memset(cm->fc, 0, sizeof(*cm->fc)); + memset(cm->default_frame_context, 0, sizeof(*cm->default_frame_context)); + + cpi->common.buffer_pool = pool; + + init_config(cpi, oxcf); + if (cpi->compressor_stage == LAP_STAGE) { + cpi->oxcf.gf_cfg.lag_in_frames = lap_lag_in_frames; + } + + av1_rc_init(&cpi->oxcf, &cpi->rc); + + init_frame_info(&cpi->frame_info, cm); + init_frame_index_set(&cpi->frame_index_set); + + cm->current_frame.frame_number = 0; + cpi->rc.frame_number_encoded = 0; + cpi->rc.prev_frame_is_dropped = 0; + cpi->rc.max_consec_drop = INT_MAX; + cpi->rc.drop_count_consec = 0; + cm->current_frame_id = -1; + cpi->tile_data = NULL; + cpi->last_show_frame_buf = NULL; + realloc_segmentation_maps(cpi); + + cpi->refresh_frame.alt_ref_frame = false; + +#if CONFIG_SPEED_STATS + cpi->tx_search_count = 0; +#endif // CONFIG_SPEED_STATS + + cpi->time_stamps.first_ts_start = INT64_MAX; + +#ifdef OUTPUT_YUV_REC + yuv_rec_file = fopen("rec.yuv", "wb"); +#endif +#ifdef OUTPUT_YUV_DENOISED + yuv_denoised_file = fopen("denoised.yuv", "wb"); +#endif + +#if !CONFIG_REALTIME_ONLY + if (is_stat_consumption_stage(cpi)) { + const size_t packet_sz = sizeof(FIRSTPASS_STATS); + const int packets = (int)(oxcf->twopass_stats_in.sz / packet_sz); + + if (!cpi->ppi->lap_enabled) { + /*Re-initialize to stats buffer, populated by application in the case of + * two pass*/ + cpi->ppi->twopass.stats_buf_ctx->stats_in_start = + oxcf->twopass_stats_in.buf; + cpi->twopass_frame.stats_in = + cpi->ppi->twopass.stats_buf_ctx->stats_in_start; + cpi->ppi->twopass.stats_buf_ctx->stats_in_end = + &cpi->ppi->twopass.stats_buf_ctx->stats_in_start[packets - 1]; + + // The buffer size is packets - 1 because the last packet is total_stats. + av1_firstpass_info_init(&cpi->ppi->twopass.firstpass_info, + oxcf->twopass_stats_in.buf, packets - 1); + av1_init_second_pass(cpi); + } else { + av1_firstpass_info_init(&cpi->ppi->twopass.firstpass_info, NULL, 0); + av1_init_single_pass_lap(cpi); + } + } +#endif + + // The buffer "obmc_buffer" is used in inter frames for fast obmc search. + // Hence, the memory allocation for the same is avoided for allintra encoding + // mode. 
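+ // (A key_freq_max of 0 configures allintra encoding, i.e. every frame is + // a key frame, so the allocation below is skipped in that case.)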
+ if (cpi->oxcf.kf_cfg.key_freq_max != 0) + alloc_obmc_buffers(&cpi->td.mb.obmc_buffer, cm->error); + + for (int x = 0; x < 2; x++) + for (int y = 0; y < 2; y++) + CHECK_MEM_ERROR( + cm, cpi->td.mb.intrabc_hash_info.hash_value_buffer[x][y], + (uint32_t *)aom_malloc( + AOM_BUFFER_SIZE_FOR_BLOCK_HASH * + sizeof(*cpi->td.mb.intrabc_hash_info.hash_value_buffer[0][0]))); + + cpi->td.mb.intrabc_hash_info.g_crc_initialized = 0; + + av1_set_speed_features_framesize_independent(cpi, oxcf->speed); + av1_set_speed_features_framesize_dependent(cpi, oxcf->speed); + + int max_mi_cols = mi_params->mi_cols; + int max_mi_rows = mi_params->mi_rows; + if (oxcf->frm_dim_cfg.forced_max_frame_width) { + max_mi_cols = size_in_mi(oxcf->frm_dim_cfg.forced_max_frame_width); + } + if (oxcf->frm_dim_cfg.forced_max_frame_height) { + max_mi_rows = size_in_mi(oxcf->frm_dim_cfg.forced_max_frame_height); + } + + const int consec_zero_mv_alloc_size = (max_mi_rows * max_mi_cols) >> 2; + CHECK_MEM_ERROR( + cm, cpi->consec_zero_mv, + aom_calloc(consec_zero_mv_alloc_size, sizeof(*cpi->consec_zero_mv))); + cpi->consec_zero_mv_alloc_size = consec_zero_mv_alloc_size; + + cpi->mb_weber_stats = NULL; + cpi->mb_delta_q = NULL; + cpi->palette_pixel_num = 0; + cpi->scaled_last_source_available = 0; + + { + const BLOCK_SIZE bsize = BLOCK_16X16; + const int w = mi_size_wide[bsize]; + const int h = mi_size_high[bsize]; + const int num_cols = (max_mi_cols + w - 1) / w; + const int num_rows = (max_mi_rows + h - 1) / h; + CHECK_MEM_ERROR(cm, cpi->ssim_rdmult_scaling_factors, + aom_calloc(num_rows * num_cols, + sizeof(*cpi->ssim_rdmult_scaling_factors))); + CHECK_MEM_ERROR(cm, cpi->tpl_rdmult_scaling_factors, + aom_calloc(num_rows * num_cols, + sizeof(*cpi->tpl_rdmult_scaling_factors))); + } + +#if CONFIG_TUNE_VMAF + { + const BLOCK_SIZE bsize = BLOCK_64X64; + const int w = mi_size_wide[bsize]; + const int h = mi_size_high[bsize]; + const int num_cols = (mi_params->mi_cols + w - 1) / w; + const int num_rows = (mi_params->mi_rows + h - 1) / h; + CHECK_MEM_ERROR(cm, cpi->vmaf_info.rdmult_scaling_factors, + aom_calloc(num_rows * num_cols, + sizeof(*cpi->vmaf_info.rdmult_scaling_factors))); + for (int i = 0; i < MAX_ARF_LAYERS; i++) { + cpi->vmaf_info.last_frame_unsharp_amount[i] = -1.0; + cpi->vmaf_info.last_frame_ysse[i] = -1.0; + cpi->vmaf_info.last_frame_vmaf[i] = -1.0; + } + cpi->vmaf_info.original_qindex = -1; + cpi->vmaf_info.vmaf_model = NULL; + } +#endif + +#if CONFIG_TUNE_BUTTERAUGLI + { + const int w = mi_size_wide[butteraugli_rdo_bsize]; + const int h = mi_size_high[butteraugli_rdo_bsize]; + const int num_cols = (mi_params->mi_cols + w - 1) / w; + const int num_rows = (mi_params->mi_rows + h - 1) / h; + CHECK_MEM_ERROR( + cm, cpi->butteraugli_info.rdmult_scaling_factors, + aom_malloc(num_rows * num_cols * + sizeof(*cpi->butteraugli_info.rdmult_scaling_factors))); + memset(&cpi->butteraugli_info.source, 0, + sizeof(cpi->butteraugli_info.source)); + memset(&cpi->butteraugli_info.resized_source, 0, + sizeof(cpi->butteraugli_info.resized_source)); + cpi->butteraugli_info.recon_set = false; + } +#endif + +#if CONFIG_SALIENCY_MAP + { + CHECK_MEM_ERROR(cm, cpi->saliency_map, + (uint8_t *)aom_calloc(cm->height * cm->width, + sizeof(*cpi->saliency_map))); + // Buffer initialization based on MIN_MIB_SIZE_LOG2 to ensure that + // cpi->sm_scaling_factor buffer is allocated big enough, since we have no + // idea of the actual superblock size we are going to use yet. 
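+ // (Sizing for the minimum superblock size yields the maximum possible + // superblock count, so the allocation below is an upper bound for + // whichever superblock size is selected later.)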
+ const int min_mi_w_sb = (1 << MIN_MIB_SIZE_LOG2); + const int min_mi_h_sb = (1 << MIN_MIB_SIZE_LOG2); + const int max_sb_cols = + (cm->mi_params.mi_cols + min_mi_w_sb - 1) / min_mi_w_sb; + const int max_sb_rows = + (cm->mi_params.mi_rows + min_mi_h_sb - 1) / min_mi_h_sb; + CHECK_MEM_ERROR(cm, cpi->sm_scaling_factor, + (double *)aom_calloc(max_sb_rows * max_sb_cols, + sizeof(*cpi->sm_scaling_factor))); + } +#endif + +#if CONFIG_COLLECT_PARTITION_STATS + av1_zero(cpi->partition_stats); +#endif // CONFIG_COLLECT_PARTITION_STATS + + // Initialize the members of DeltaQuantParams with INT_MAX to ensure that + // the quantizer tables are correctly initialized using the default deltaq + // parameters when av1_init_quantizer is called for the first time. + DeltaQuantParams *const prev_deltaq_params = + &cpi->enc_quant_dequant_params.prev_deltaq_params; + prev_deltaq_params->y_dc_delta_q = INT_MAX; + prev_deltaq_params->u_dc_delta_q = INT_MAX; + prev_deltaq_params->v_dc_delta_q = INT_MAX; + prev_deltaq_params->u_ac_delta_q = INT_MAX; + prev_deltaq_params->v_ac_delta_q = INT_MAX; + + av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params, + cm->seq_params->bit_depth); + av1_qm_init(&cm->quant_params, av1_num_planes(cm)); + + av1_loop_filter_init(cm); + cm->superres_scale_denominator = SCALE_NUMERATOR; + cm->superres_upscaled_width = oxcf->frm_dim_cfg.width; + cm->superres_upscaled_height = oxcf->frm_dim_cfg.height; +#if !CONFIG_REALTIME_ONLY + av1_loop_restoration_precal(); +#endif + + cpi->third_pass_ctx = NULL; + if (cpi->oxcf.pass == AOM_RC_THIRD_PASS) { + av1_init_thirdpass_ctx(cm, &cpi->third_pass_ctx, NULL); + } + + cpi->second_pass_log_stream = NULL; + cpi->use_ducky_encode = 0; + + cm->error->setjmp = 0; + return cpi; +} + +#if CONFIG_INTERNAL_STATS +#define SNPRINT(H, T) snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T)) + +#define SNPRINT2(H, T, V) \ + snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T), (V)) +#endif // CONFIG_INTERNAL_STATS + +void av1_remove_primary_compressor(AV1_PRIMARY *ppi) { + if (!ppi) return; +#if !CONFIG_REALTIME_ONLY + av1_tf_info_free(&ppi->tf_info); +#endif // !CONFIG_REALTIME_ONLY + + for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) { + aom_free(ppi->level_params.level_info[i]); + } + av1_lookahead_destroy(ppi->lookahead); + + aom_free(ppi->tpl_sb_rdmult_scaling_factors); + ppi->tpl_sb_rdmult_scaling_factors = NULL; + + TplParams *const tpl_data = &ppi->tpl_data; + aom_free(tpl_data->txfm_stats_list); + + for (int frame = 0; frame < MAX_LAG_BUFFERS; ++frame) { + aom_free(tpl_data->tpl_stats_pool[frame]); + aom_free_frame_buffer(&tpl_data->tpl_rec_pool[frame]); + tpl_data->tpl_stats_pool[frame] = NULL; + } + +#if !CONFIG_REALTIME_ONLY + av1_tpl_dealloc(&tpl_data->tpl_mt_sync); +#endif + + av1_terminate_workers(ppi); + free_thread_data(ppi); + + aom_free(ppi->p_mt_info.tile_thr_data); + ppi->p_mt_info.tile_thr_data = NULL; + aom_free(ppi->p_mt_info.workers); + ppi->p_mt_info.workers = NULL; + ppi->p_mt_info.num_workers = 0; + + aom_free(ppi); +} + +void av1_remove_compressor(AV1_COMP *cpi) { + if (!cpi) return; +#if CONFIG_RATECTRL_LOG + if (cpi->oxcf.pass == 3) { + rc_log_show(&cpi->rc_log); + } +#endif // CONFIG_RATECTRL_LOG + + AV1_COMMON *cm = &cpi->common; + if (cm->current_frame.frame_number > 0) { +#if CONFIG_SPEED_STATS + if (!is_stat_generation_stage(cpi)) { + fprintf(stdout, "tx_search_count = %d\n", cpi->tx_search_count); + } +#endif // CONFIG_SPEED_STATS + +#if CONFIG_COLLECT_PARTITION_STATS == 2 + if 
(!is_stat_generation_stage(cpi)) { + av1_print_fr_partition_timing_stats(&cpi->partition_stats, + "fr_part_timing_data.csv"); + } +#endif + } + +#if CONFIG_AV1_TEMPORAL_DENOISING + av1_denoiser_free(&(cpi->denoiser)); +#endif + + if (cm->error) { + // Help detect use after free of the error detail string. + memset(cm->error->detail, 'A', sizeof(cm->error->detail) - 1); + cm->error->detail[sizeof(cm->error->detail) - 1] = '\0'; + aom_free(cm->error); + } + aom_free(cpi->td.tctx); + MultiThreadInfo *const mt_info = &cpi->mt_info; +#if CONFIG_MULTITHREAD + pthread_mutex_t *const enc_row_mt_mutex_ = mt_info->enc_row_mt.mutex_; + pthread_cond_t *const enc_row_mt_cond_ = mt_info->enc_row_mt.cond_; + pthread_mutex_t *const gm_mt_mutex_ = mt_info->gm_sync.mutex_; + pthread_mutex_t *const tpl_error_mutex_ = mt_info->tpl_row_mt.mutex_; + pthread_mutex_t *const pack_bs_mt_mutex_ = mt_info->pack_bs_sync.mutex_; + if (enc_row_mt_mutex_ != NULL) { + pthread_mutex_destroy(enc_row_mt_mutex_); + aom_free(enc_row_mt_mutex_); + } + if (enc_row_mt_cond_ != NULL) { + pthread_cond_destroy(enc_row_mt_cond_); + aom_free(enc_row_mt_cond_); + } + if (gm_mt_mutex_ != NULL) { + pthread_mutex_destroy(gm_mt_mutex_); + aom_free(gm_mt_mutex_); + } + if (tpl_error_mutex_ != NULL) { + pthread_mutex_destroy(tpl_error_mutex_); + aom_free(tpl_error_mutex_); + } + if (pack_bs_mt_mutex_ != NULL) { + pthread_mutex_destroy(pack_bs_mt_mutex_); + aom_free(pack_bs_mt_mutex_); + } +#endif + av1_row_mt_mem_dealloc(cpi); + + if (mt_info->num_workers > 1) { + av1_row_mt_sync_mem_dealloc(&cpi->ppi->intra_row_mt_sync); + av1_loop_filter_dealloc(&mt_info->lf_row_sync); + av1_cdef_mt_dealloc(&mt_info->cdef_sync); +#if !CONFIG_REALTIME_ONLY + av1_loop_restoration_dealloc(&mt_info->lr_row_sync); + av1_tf_mt_dealloc(&mt_info->tf_sync); +#endif + } + + av1_free_thirdpass_ctx(cpi->third_pass_ctx); + + av1_close_second_pass_log(cpi); + + dealloc_compressor_data(cpi); + + av1_ext_part_delete(&cpi->ext_part_controller); + + av1_remove_common(cm); + + aom_free(cpi); + +#ifdef OUTPUT_YUV_REC + fclose(yuv_rec_file); +#endif + +#ifdef OUTPUT_YUV_DENOISED + fclose(yuv_denoised_file); +#endif +} + +static void generate_psnr_packet(AV1_COMP *cpi) { + struct aom_codec_cx_pkt pkt; + int i; + PSNR_STATS psnr; +#if CONFIG_AV1_HIGHBITDEPTH + const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth; + const uint32_t bit_depth = cpi->td.mb.e_mbd.bd; + aom_calc_highbd_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr, + bit_depth, in_bit_depth); +#else + aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr); +#endif + + for (i = 0; i < 4; ++i) { + pkt.data.psnr.samples[i] = psnr.samples[i]; + pkt.data.psnr.sse[i] = psnr.sse[i]; + pkt.data.psnr.psnr[i] = psnr.psnr[i]; + } + +#if CONFIG_AV1_HIGHBITDEPTH + if ((cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) && + (in_bit_depth < bit_depth)) { + for (i = 0; i < 4; ++i) { + pkt.data.psnr.samples_hbd[i] = psnr.samples_hbd[i]; + pkt.data.psnr.sse_hbd[i] = psnr.sse_hbd[i]; + pkt.data.psnr.psnr_hbd[i] = psnr.psnr_hbd[i]; + } + } +#endif + + pkt.kind = AOM_CODEC_PSNR_PKT; + aom_codec_pkt_list_add(cpi->ppi->output_pkt_list, &pkt); +} + +int av1_use_as_reference(int *ext_ref_frame_flags, int ref_frame_flags) { + if (ref_frame_flags > ((1 << INTER_REFS_PER_FRAME) - 1)) return -1; + + *ext_ref_frame_flags = ref_frame_flags; + return 0; +} + +int av1_copy_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd) { + AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + 
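+ // Look up the reference buffer assigned to slot 'idx'; the copy below + // fails with -1 when the slot is empty.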
YV12_BUFFER_CONFIG *cfg = get_ref_frame(cm, idx); + if (cfg) { + aom_yv12_copy_frame(cfg, sd, num_planes); + return 0; + } else { + return -1; + } +} + +int av1_set_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd) { + AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + YV12_BUFFER_CONFIG *cfg = get_ref_frame(cm, idx); + if (cfg) { + aom_yv12_copy_frame(sd, cfg, num_planes); + return 0; + } else { + return -1; + } +} + +#ifdef OUTPUT_YUV_REC +void aom_write_one_yuv_frame(AV1_COMMON *cm, YV12_BUFFER_CONFIG *s) { + uint8_t *src = s->y_buffer; + int h = cm->height; + if (yuv_rec_file == NULL) return; + if (s->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *src16 = CONVERT_TO_SHORTPTR(s->y_buffer); + + do { + fwrite(src16, s->y_width, 2, yuv_rec_file); + src16 += s->y_stride; + } while (--h); + + src16 = CONVERT_TO_SHORTPTR(s->u_buffer); + h = s->uv_height; + + do { + fwrite(src16, s->uv_width, 2, yuv_rec_file); + src16 += s->uv_stride; + } while (--h); + + src16 = CONVERT_TO_SHORTPTR(s->v_buffer); + h = s->uv_height; + + do { + fwrite(src16, s->uv_width, 2, yuv_rec_file); + src16 += s->uv_stride; + } while (--h); + + fflush(yuv_rec_file); + return; + } + + do { + fwrite(src, s->y_width, 1, yuv_rec_file); + src += s->y_stride; + } while (--h); + + src = s->u_buffer; + h = s->uv_height; + + do { + fwrite(src, s->uv_width, 1, yuv_rec_file); + src += s->uv_stride; + } while (--h); + + src = s->v_buffer; + h = s->uv_height; + + do { + fwrite(src, s->uv_width, 1, yuv_rec_file); + src += s->uv_stride; + } while (--h); + + fflush(yuv_rec_file); +} +#endif // OUTPUT_YUV_REC + +void av1_set_mv_search_params(AV1_COMP *cpi) { + const AV1_COMMON *const cm = &cpi->common; + MotionVectorSearchParams *const mv_search_params = &cpi->mv_search_params; + const int max_mv_def = AOMMAX(cm->width, cm->height); + + // Default based on max resolution. + mv_search_params->mv_step_param = av1_init_search_range(max_mv_def); + + if (cpi->sf.mv_sf.auto_mv_step_size) { + if (frame_is_intra_only(cm)) { + // Initialize max_mv_magnitude for use in the first INTER frame + // after a key/intra-only frame. + mv_search_params->max_mv_magnitude = max_mv_def; + } else { + // Use adaptive mv steps based on previous frame stats for show frames and + // internal arfs. + FRAME_UPDATE_TYPE cur_update_type = + cpi->ppi->gf_group.update_type[cpi->gf_frame_index]; + int use_auto_mv_step = + (cm->show_frame || cur_update_type == INTNL_ARF_UPDATE) && + mv_search_params->max_mv_magnitude != -1 && + cpi->sf.mv_sf.auto_mv_step_size >= 2; + if (use_auto_mv_step) { + // Allow mv_steps to correspond to twice the max mv magnitude found + // in the previous frame, capped by the default max_mv_magnitude based + // on resolution. + mv_search_params->mv_step_param = av1_init_search_range( + AOMMIN(max_mv_def, 2 * mv_search_params->max_mv_magnitude)); + } + // Reset max_mv_magnitude based on update flag. + if (cpi->do_frame_data_update) mv_search_params->max_mv_magnitude = -1; + } + } +} + +void av1_set_screen_content_options(AV1_COMP *cpi, FeatureFlags *features) { + const AV1_COMMON *const cm = &cpi->common; + const MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + + if (cm->seq_params->force_screen_content_tools != 2) { + features->allow_screen_content_tools = features->allow_intrabc = + cm->seq_params->force_screen_content_tools; + return; + } + + if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) { + features->allow_screen_content_tools = 1; + features->allow_intrabc = cpi->oxcf.mode == REALTIME ? 
0 : 1; + cpi->is_screen_content_type = 1; + cpi->use_screen_content_tools = 1; + return; + } + + if (cpi->oxcf.mode == REALTIME) { + features->allow_screen_content_tools = features->allow_intrabc = 0; + return; + } + + // Screen content tools are not evaluated in non-RD encoding mode when the + // content type has not been set explicitly, i.e., when + // cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN, use_nonrd_pick_mode = 1 + // and hybrid_intra_pickmode = 0. Hence, screen content detection is + // disabled. + if (cpi->sf.rt_sf.use_nonrd_pick_mode && + !cpi->sf.rt_sf.hybrid_intra_pickmode) { + features->allow_screen_content_tools = features->allow_intrabc = 0; + return; + } + + // Estimate if the source frame is screen content, based on the portion of + // blocks that have few luma colors. + const uint8_t *src = cpi->unfiltered_source->y_buffer; + assert(src != NULL); + const int use_hbd = cpi->unfiltered_source->flags & YV12_FLAG_HIGHBITDEPTH; + const int stride = cpi->unfiltered_source->y_stride; + const int width = cpi->unfiltered_source->y_width; + const int height = cpi->unfiltered_source->y_height; + const int64_t area = (int64_t)width * height; + const int bd = cm->seq_params->bit_depth; + const int blk_w = 16; + const int blk_h = 16; + // These threshold values are selected experimentally. + const int color_thresh = 4; + const unsigned int var_thresh = 0; + // Counts of blocks with no more than color_thresh colors. + int64_t counts_1 = 0; + // Counts of blocks with no more than color_thresh colors and variance larger + // than var_thresh. + int64_t counts_2 = 0; + + for (int r = 0; r + blk_h <= height; r += blk_h) { + for (int c = 0; c + blk_w <= width; c += blk_w) { + int count_buf[1 << 8]; // Maximum (1 << 8) bins for hbd path. + const uint8_t *const this_src = src + r * stride + c; + int n_colors; + if (use_hbd) + av1_count_colors_highbd(this_src, stride, blk_w, blk_h, bd, NULL, + count_buf, &n_colors, NULL); + else + av1_count_colors(this_src, stride, blk_w, blk_h, count_buf, &n_colors); + if (n_colors > 1 && n_colors <= color_thresh) { + ++counts_1; + struct buf_2d buf; + buf.stride = stride; + buf.buf = (uint8_t *)this_src; + const unsigned int var = av1_get_perpixel_variance( + cpi, xd, &buf, BLOCK_16X16, AOM_PLANE_Y, use_hbd); + if (var > var_thresh) ++counts_2; + } + } + } + + // The threshold values are selected experimentally. + features->allow_screen_content_tools = counts_1 * blk_h * blk_w * 10 > area; + // IntraBC would force loop filters off, so we use stricter rules that also + // require the block to have high variance. + features->allow_intrabc = features->allow_screen_content_tools && + counts_2 * blk_h * blk_w * 12 > area; + cpi->use_screen_content_tools = features->allow_screen_content_tools; + cpi->is_screen_content_type = + features->allow_intrabc || (counts_1 * blk_h * blk_w * 10 > area * 4 && + counts_2 * blk_h * blk_w * 30 > area); +} + +static void init_motion_estimation(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + MotionVectorSearchParams *const mv_search_params = &cpi->mv_search_params; + const int aligned_width = (cm->width + 7) & ~7; + const int y_stride = + aom_calc_y_stride(aligned_width, cpi->oxcf.border_in_pixels); + const int y_stride_src = ((cpi->oxcf.frm_dim_cfg.width != cm->width || + cpi->oxcf.frm_dim_cfg.height != cm->height) || + av1_superres_scaled(cm)) + ? y_stride + : cpi->ppi->lookahead->buf->img.y_stride; + int fpf_y_stride = + cm->cur_frame != NULL ?
cm->cur_frame->buf.y_stride : y_stride; + + // Update if search_site_cfg is uninitialized or the current frame has a new + // stride. + const int should_update = + !mv_search_params->search_site_cfg[SS_CFG_SRC][DIAMOND].stride || + !mv_search_params->search_site_cfg[SS_CFG_LOOKAHEAD][DIAMOND].stride || + (y_stride != + mv_search_params->search_site_cfg[SS_CFG_SRC][DIAMOND].stride); + + if (!should_update) { + return; + } + + // Initialization of search_site_cfg for NUM_DISTINCT_SEARCH_METHODS. + for (SEARCH_METHODS i = DIAMOND; i < NUM_DISTINCT_SEARCH_METHODS; i++) { + const int level = ((i == NSTEP_8PT) || (i == CLAMPED_DIAMOND)) ? 1 : 0; + av1_init_motion_compensation[i]( + &mv_search_params->search_site_cfg[SS_CFG_SRC][i], y_stride, level); + av1_init_motion_compensation[i]( + &mv_search_params->search_site_cfg[SS_CFG_LOOKAHEAD][i], y_stride_src, + level); + } + + // First pass search site config initialization. + av1_init_motion_fpf(&mv_search_params->search_site_cfg[SS_CFG_FPF][DIAMOND], + fpf_y_stride); + for (SEARCH_METHODS i = NSTEP; i < NUM_DISTINCT_SEARCH_METHODS; i++) { + memcpy(&mv_search_params->search_site_cfg[SS_CFG_FPF][i], + &mv_search_params->search_site_cfg[SS_CFG_FPF][DIAMOND], + sizeof(search_site_config)); + } +} + +static void init_ref_frame_bufs(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + int i; + if (cm->cur_frame) { + cm->cur_frame->ref_count--; + cm->cur_frame = NULL; + } + for (i = 0; i < REF_FRAMES; ++i) { + if (cm->ref_frame_map[i]) { + cm->ref_frame_map[i]->ref_count--; + cm->ref_frame_map[i] = NULL; + } + } +#ifndef NDEBUG + BufferPool *const pool = cm->buffer_pool; + for (i = 0; i < pool->num_frame_bufs; ++i) { + assert(pool->frame_bufs[i].ref_count == 0); + } +#endif +} + +// TODO(chengchen): consider renaming this function as it is necessary +// for the encoder to set up critical parameters, and it does not +// deal with initial width any longer. +aom_codec_err_t av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth, + int subsampling_x, int subsampling_y) { + AV1_COMMON *const cm = &cpi->common; + SequenceHeader *const seq_params = cm->seq_params; + + if (!cpi->frame_size_related_setup_done || + seq_params->use_highbitdepth != use_highbitdepth || + seq_params->subsampling_x != subsampling_x || + seq_params->subsampling_y != subsampling_y) { + seq_params->subsampling_x = subsampling_x; + seq_params->subsampling_y = subsampling_y; + seq_params->use_highbitdepth = use_highbitdepth; + + av1_set_speed_features_framesize_independent(cpi, cpi->oxcf.speed); + av1_set_speed_features_framesize_dependent(cpi, cpi->oxcf.speed); + + if (!is_stat_generation_stage(cpi)) { +#if !CONFIG_REALTIME_ONLY + if (!av1_tf_info_alloc(&cpi->ppi->tf_info, cpi)) + return AOM_CODEC_MEM_ERROR; +#endif // !CONFIG_REALTIME_ONLY + } + init_ref_frame_bufs(cpi); + + init_motion_estimation(cpi); // TODO(agrange) This can be removed.
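+ // (The remaining steps cache the MB count of the initial frame size and + // mark the frame-size-dependent setup as complete; the MB count is used, + // e.g., to normalize first-pass stats.)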
+ + cpi->initial_mbs = cm->mi_params.MBs; + cpi->frame_size_related_setup_done = true; + } + return AOM_CODEC_OK; +} + +#if CONFIG_AV1_TEMPORAL_DENOISING +static void setup_denoiser_buffer(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + if (cpi->oxcf.noise_sensitivity > 0 && + !cpi->denoiser.frame_buffer_initialized) { + if (av1_denoiser_alloc( + cm, &cpi->svc, &cpi->denoiser, cpi->ppi->use_svc, + cpi->oxcf.noise_sensitivity, cm->width, cm->height, + cm->seq_params->subsampling_x, cm->seq_params->subsampling_y, + cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS)) + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate denoiser"); + } +} +#endif + +// Returns 1 if the assigned width or height was <= 0. +static int set_size_literal(AV1_COMP *cpi, int width, int height) { + AV1_COMMON *cm = &cpi->common; + aom_codec_err_t err = av1_check_initial_width( + cpi, cm->seq_params->use_highbitdepth, cm->seq_params->subsampling_x, + cm->seq_params->subsampling_y); + if (err != AOM_CODEC_OK) { + aom_internal_error(cm->error, err, "av1_check_initial_width() failed"); + } + + if (width <= 0 || height <= 0) return 1; + + cm->width = width; + cm->height = height; + +#if CONFIG_AV1_TEMPORAL_DENOISING + setup_denoiser_buffer(cpi); +#endif + + if (cm->width > cpi->data_alloc_width || + cm->height > cpi->data_alloc_height) { + av1_free_context_buffers(cm); + av1_free_shared_coeff_buffer(&cpi->td.shared_coeff_buf); + av1_free_sms_tree(&cpi->td); + av1_free_pmc(cpi->td.firstpass_ctx, av1_num_planes(cm)); + cpi->td.firstpass_ctx = NULL; + alloc_compressor_data(cpi); + realloc_segmentation_maps(cpi); + cpi->data_alloc_width = cm->width; + cpi->data_alloc_height = cm->height; + cpi->frame_size_related_setup_done = false; + } + alloc_mb_mode_info_buffers(cpi); + av1_update_frame_size(cpi); + + return 0; +} + +void av1_set_frame_size(AV1_COMP *cpi, int width, int height) { + AV1_COMMON *const cm = &cpi->common; + const SequenceHeader *const seq_params = cm->seq_params; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + int ref_frame; + + if (width != cm->width || height != cm->height) { + // There has been a change in the encoded frame size + set_size_literal(cpi, width, height); + // Recalculate 'all_lossless' in case super-resolution was (un)selected. + cm->features.all_lossless = + cm->features.coded_lossless && !av1_superres_scaled(cm); + + av1_noise_estimate_init(&cpi->noise_estimate, cm->width, cm->height); +#if CONFIG_AV1_TEMPORAL_DENOISING + // Reset the denoiser on the resized frame. 
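+ // (Free the existing denoiser buffers and re-allocate them at the new + // frame dimensions.)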
+ if (cpi->oxcf.noise_sensitivity > 0) { + av1_denoiser_free(&(cpi->denoiser)); + setup_denoiser_buffer(cpi); + } +#endif + } + if (is_stat_consumption_stage(cpi)) { + av1_set_target_rate(cpi, cm->width, cm->height); + } + + alloc_frame_mvs(cm, cm->cur_frame); + + // Allocate above context buffers + CommonContexts *const above_contexts = &cm->above_contexts; + if (above_contexts->num_planes < av1_num_planes(cm) || + above_contexts->num_mi_cols < cm->mi_params.mi_cols || + above_contexts->num_tile_rows < cm->tiles.rows) { + av1_free_above_context_buffers(above_contexts); + if (av1_alloc_above_context_buffers(above_contexts, cm->tiles.rows, + cm->mi_params.mi_cols, + av1_num_planes(cm))) + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate context buffers"); + } + + AV1EncoderConfig *oxcf = &cpi->oxcf; + oxcf->border_in_pixels = av1_get_enc_border_size( + av1_is_resize_needed(oxcf), oxcf->kf_cfg.key_freq_max == 0, + cm->seq_params->sb_size); + + // Reset the frame pointers to the current frame size. + if (aom_realloc_frame_buffer( + &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x, + seq_params->subsampling_y, seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL, + NULL, cpi->image_pyramid_levels, 0)) + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + + if (!is_stat_generation_stage(cpi)) av1_init_cdef_worker(cpi); + +#if !CONFIG_REALTIME_ONLY + if (is_restoration_used(cm)) { + for (int i = 0; i < num_planes; ++i) + cm->rst_info[i].frame_restoration_type = RESTORE_NONE; + + const bool is_sgr_enabled = !cpi->sf.lpf_sf.disable_sgr_filter; + av1_alloc_restoration_buffers(cm, is_sgr_enabled); + // Store the allocated restoration buffers in MT object. + if (cpi->ppi->p_mt_info.num_workers > 1) { + av1_init_lr_mt_buffers(cpi); + } + } +#endif + + init_motion_estimation(cpi); + + int has_valid_ref_frame = 0; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); + if (buf != NULL) { + struct scale_factors *sf = get_ref_scale_factors(cm, ref_frame); + av1_setup_scale_factors_for_frame(sf, buf->buf.y_crop_width, + buf->buf.y_crop_height, cm->width, + cm->height); + has_valid_ref_frame |= av1_is_valid_scale(sf); + if (av1_is_scaled(sf)) aom_extend_frame_borders(&buf->buf, num_planes); + } + } + if (!frame_is_intra_only(cm) && !has_valid_ref_frame) { + aom_internal_error( + cm->error, AOM_CODEC_CORRUPT_FRAME, + "Can't find at least one reference frame with valid size"); + } + + av1_setup_scale_factors_for_frame(&cm->sf_identity, cm->width, cm->height, + cm->width, cm->height); + + set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME); +} + +static INLINE int extend_borders_mt(const AV1_COMP *cpi, + MULTI_THREADED_MODULES stage, int plane) { + const AV1_COMMON *const cm = &cpi->common; + if (cpi->mt_info.num_mod_workers[stage] < 2) return 0; + switch (stage) { + // TODO(deepa.kg@ittiam.com): When cdef and loop-restoration are disabled, + // multi-thread frame border extension along with loop filter frame. + // As loop-filtering of a superblock row modifies the pixels of the + // above superblock row, border extension requires that loop filtering + // of the current and above superblock row is complete. 
+ case MOD_LPF: return 0; + case MOD_CDEF: + return is_cdef_used(cm) && !cpi->ppi->rtc_ref.non_reference_frame && + !is_restoration_used(cm) && !av1_superres_scaled(cm); + case MOD_LR: + return is_restoration_used(cm) && + (cm->rst_info[plane].frame_restoration_type != RESTORE_NONE); + default: assert(0); + } + return 0; +} + +/*!\brief Select and apply cdef filters and switchable restoration filters + * + * \ingroup high_level_algo + */ +static void cdef_restoration_frame(AV1_COMP *cpi, AV1_COMMON *cm, + MACROBLOCKD *xd, int use_restoration, + int use_cdef, + unsigned int skip_apply_postproc_filters) { +#if !CONFIG_REALTIME_ONLY + if (use_restoration) + av1_loop_restoration_save_boundary_lines(&cm->cur_frame->buf, cm, 0); +#else + (void)use_restoration; +#endif + + if (use_cdef) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, cdef_time); +#endif + const int num_workers = cpi->mt_info.num_mod_workers[MOD_CDEF]; + // Find CDEF parameters + av1_cdef_search(cpi); + + // Apply the filter + if ((skip_apply_postproc_filters & SKIP_APPLY_CDEF) == 0) { + assert(!cpi->ppi->rtc_ref.non_reference_frame); + if (num_workers > 1) { + // Extension of frame borders is multi-threaded along with cdef. + const int do_extend_border = + extend_borders_mt(cpi, MOD_CDEF, /* plane */ 0); + av1_cdef_frame_mt(cm, xd, cpi->mt_info.cdef_worker, + cpi->mt_info.workers, &cpi->mt_info.cdef_sync, + num_workers, av1_cdef_init_fb_row_mt, + do_extend_border); + } else { + av1_cdef_frame(&cm->cur_frame->buf, cm, xd, av1_cdef_init_fb_row); + } + } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, cdef_time); +#endif + } + + const int use_superres = av1_superres_scaled(cm); + if (use_superres) { + if ((skip_apply_postproc_filters & SKIP_APPLY_SUPERRES) == 0) { + av1_superres_post_encode(cpi); + } + } + +#if !CONFIG_REALTIME_ONLY +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, loop_restoration_time); +#endif + if (use_restoration) { + MultiThreadInfo *const mt_info = &cpi->mt_info; + const int num_workers = mt_info->num_mod_workers[MOD_LR]; + av1_loop_restoration_save_boundary_lines(&cm->cur_frame->buf, cm, 1); + av1_pick_filter_restoration(cpi->source, cpi); + if ((skip_apply_postproc_filters & SKIP_APPLY_RESTORATION) == 0 && + (cm->rst_info[0].frame_restoration_type != RESTORE_NONE || + cm->rst_info[1].frame_restoration_type != RESTORE_NONE || + cm->rst_info[2].frame_restoration_type != RESTORE_NONE)) { + if (num_workers > 1) { + // Extension of frame borders is multi-threaded along with loop + // restoration filter. 
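+        // (No gating comparable to the cdef case is needed here: loop
+        // restoration is the last in-loop filter, so borders can always be
+        // extended together with it.)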
+ const int do_extend_border = 1; + av1_loop_restoration_filter_frame_mt( + &cm->cur_frame->buf, cm, 0, mt_info->workers, num_workers, + &mt_info->lr_row_sync, &cpi->lr_ctxt, do_extend_border); + } else { + av1_loop_restoration_filter_frame(&cm->cur_frame->buf, cm, 0, + &cpi->lr_ctxt); + } + } + } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, loop_restoration_time); +#endif +#endif // !CONFIG_REALTIME_ONLY +} + +static void extend_frame_borders(AV1_COMP *cpi) { + const AV1_COMMON *const cm = &cpi->common; + // TODO(debargha): Fix mv search range on encoder side + for (int plane = 0; plane < av1_num_planes(cm); ++plane) { + const bool extend_border_done = extend_borders_mt(cpi, MOD_CDEF, plane) || + extend_borders_mt(cpi, MOD_LR, plane); + if (!extend_border_done) { + const YV12_BUFFER_CONFIG *const ybf = &cm->cur_frame->buf; + aom_extend_frame_borders_plane_row(ybf, plane, 0, + ybf->crop_heights[plane > 0]); + } + } +} + +/*!\brief Select and apply deblocking filters, cdef filters, and restoration + * filters. + * + * \ingroup high_level_algo + */ +static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) { + MultiThreadInfo *const mt_info = &cpi->mt_info; + const int num_workers = mt_info->num_mod_workers[MOD_LPF]; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *xd = &cpi->td.mb.e_mbd; + cpi->td.mb.rdmult = cpi->rd.RDMULT; + + assert(IMPLIES(is_lossless_requested(&cpi->oxcf.rc_cfg), + cm->features.coded_lossless && cm->features.all_lossless)); + + const int use_loopfilter = + is_loopfilter_used(cm) && !cpi->mt_info.pipeline_lpf_mt_with_enc; + const int use_cdef = is_cdef_used(cm); + const int use_superres = av1_superres_scaled(cm); + const int use_restoration = is_restoration_used(cm); + + const unsigned int skip_apply_postproc_filters = + derive_skip_apply_postproc_filters(cpi, use_loopfilter, use_cdef, + use_superres, use_restoration); + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, loop_filter_time); +#endif + if (use_loopfilter) { + av1_pick_filter_level(cpi->source, cpi, cpi->sf.lpf_sf.lpf_pick); + struct loopfilter *lf = &cm->lf; + if ((lf->filter_level[0] || lf->filter_level[1]) && + (skip_apply_postproc_filters & SKIP_APPLY_LOOPFILTER) == 0) { + assert(!cpi->ppi->rtc_ref.non_reference_frame); + // lpf_opt_level = 1 : Enables dual/quad loop-filtering. + // lpf_opt_level is set to 1 if transform size search depth in inter + // blocks is limited to one as quad loop filtering assumes that all the + // transform blocks within a 16x8/8x16/16x16 prediction block are of the + // same size. lpf_opt_level = 2 : Filters both chroma planes together, in + // addition to enabling dual/quad loop-filtering. This is enabled when lpf + // pick method is LPF_PICK_FROM_Q as u and v plane filter levels are + // equal. 
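+      // In summary:
+      //   lpf_opt_level 0: neither optimization.
+      //   lpf_opt_level 1: dual/quad loop-filtering (uniform transform sizes
+      //                    within a prediction block assumed).
+      //   lpf_opt_level 2: level 1, plus both chroma planes filtered
+      //                    together (u and v filter levels are equal).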
+ int lpf_opt_level = get_lpf_opt_level(&cpi->sf); + av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, xd, 0, num_planes, 0, + mt_info->workers, num_workers, + &mt_info->lf_row_sync, lpf_opt_level); + } + } + +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, loop_filter_time); +#endif + + cdef_restoration_frame(cpi, cm, xd, use_restoration, use_cdef, + skip_apply_postproc_filters); +} + +static void update_motion_stat(AV1_COMP *const cpi) { + AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + RATE_CONTROL *const rc = &cpi->rc; + SVC *const svc = &cpi->svc; + const int avg_cnt_zeromv = + 100 * cpi->rc.cnt_zeromv / (mi_params->mi_rows * mi_params->mi_cols); + if (!cpi->ppi->use_svc || + (cpi->ppi->use_svc && + !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) { + rc->avg_frame_low_motion = + (rc->avg_frame_low_motion == 0) + ? avg_cnt_zeromv + : (3 * rc->avg_frame_low_motion + avg_cnt_zeromv) / 4; + // For SVC: set avg_frame_low_motion (only computed on top spatial layer) + // to all lower spatial layers. + if (cpi->ppi->use_svc && + svc->spatial_layer_id == svc->number_spatial_layers - 1) { + for (int i = 0; i < svc->number_spatial_layers - 1; ++i) { + const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id, + svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + RATE_CONTROL *const lrc = &lc->rc; + lrc->avg_frame_low_motion = rc->avg_frame_low_motion; + } + } + } +} + +/*!\brief Encode a frame without the recode loop, usually used in one-pass + * encoding and realtime coding. + * + * \ingroup high_level_algo + * + * \param[in] cpi Top-level encoder structure + * + * \return Returns a value to indicate if the encoding is done successfully. + * \retval #AOM_CODEC_OK + * \retval #AOM_CODEC_ERROR + */ +static int encode_without_recode(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const QuantizationCfg *const q_cfg = &cpi->oxcf.q_cfg; + SVC *const svc = &cpi->svc; + const int resize_pending = is_frame_resize_pending(cpi); + int top_index = 0, bottom_index = 0, q = 0; + YV12_BUFFER_CONFIG *unscaled = cpi->unscaled_source; + InterpFilter filter_scaler = + cpi->ppi->use_svc ? svc->downsample_filter_type[svc->spatial_layer_id] + : EIGHTTAP_SMOOTH; + int phase_scaler = cpi->ppi->use_svc + ? svc->downsample_filter_phase[svc->spatial_layer_id] + : 0; + + set_size_independent_vars(cpi); + av1_setup_frame_size(cpi); + cm->prev_frame = get_primary_ref_frame_buf(cm); + av1_set_size_dependent_vars(cpi, &q, &bottom_index, &top_index); + av1_set_mv_search_params(cpi); + + if (cm->current_frame.frame_number == 0 && + (cpi->ppi->use_svc || cpi->oxcf.rc_cfg.drop_frames_water_mark > 0) && + cpi->svc.temporal_layer_id == 0) { + const SequenceHeader *seq_params = cm->seq_params; + if (aom_alloc_frame_buffer( + &cpi->svc.source_last_TL0, cpi->oxcf.frm_dim_cfg.width, + cpi->oxcf.frm_dim_cfg.height, seq_params->subsampling_x, + seq_params->subsampling_y, seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0)) { + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate buffer for source_last_TL0"); + } + } + + if (!cpi->ppi->use_svc) { + phase_scaler = 8; + // 2:1 scaling. + if ((cm->width << 1) == unscaled->y_crop_width && + (cm->height << 1) == unscaled->y_crop_height) { + filter_scaler = BILINEAR; + // For lower resolutions use eighttap_smooth. 
+      if (cm->width * cm->height <= 320 * 180) filter_scaler = EIGHTTAP_SMOOTH;
+    } else if ((cm->width << 2) == unscaled->y_crop_width &&
+               (cm->height << 2) == unscaled->y_crop_height) {
+      // 4:1 scaling.
+      filter_scaler = EIGHTTAP_SMOOTH;
+    } else if ((cm->width << 2) == 3 * unscaled->y_crop_width &&
+               (cm->height << 2) == 3 * unscaled->y_crop_height) {
+      // 4:3 scaling.
+      filter_scaler = EIGHTTAP_REGULAR;
+    }
+  }
+
+  allocate_gradient_info_for_hog(cpi);
+
+  allocate_src_var_of_4x4_sub_block_buf(cpi);
+
+  const SPEED_FEATURES *sf = &cpi->sf;
+  if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION)
+    variance_partition_alloc(cpi);
+
+  if (cm->current_frame.frame_type == KEY_FRAME ||
+      ((sf->inter_sf.extra_prune_warped && cpi->refresh_frame.golden_frame)))
+    copy_frame_prob_info(cpi);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  printf("\n Encoding a frame: \n");
+#endif
+
+#if CONFIG_TUNE_BUTTERAUGLI
+  if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI) {
+    av1_setup_butteraugli_rdmult(cpi);
+  }
+#endif
+
+  cpi->source = av1_realloc_and_scale_if_required(
+      cm, unscaled, &cpi->scaled_source, filter_scaler, phase_scaler, true,
+      false, cpi->oxcf.border_in_pixels, cpi->image_pyramid_levels);
+  if (frame_is_intra_only(cm) || resize_pending != 0) {
+    const int current_size =
+        (cm->mi_params.mi_rows * cm->mi_params.mi_cols) >> 2;
+    if (cpi->consec_zero_mv &&
+        (cpi->consec_zero_mv_alloc_size < current_size)) {
+      aom_free(cpi->consec_zero_mv);
+      cpi->consec_zero_mv_alloc_size = 0;
+      CHECK_MEM_ERROR(cm, cpi->consec_zero_mv,
+                      aom_malloc(current_size * sizeof(*cpi->consec_zero_mv)));
+      cpi->consec_zero_mv_alloc_size = current_size;
+    }
+    assert(cpi->consec_zero_mv != NULL);
+    memset(cpi->consec_zero_mv, 0, current_size * sizeof(*cpi->consec_zero_mv));
+  }
+
+  if (cpi->scaled_last_source_available) {
+    cpi->last_source = &cpi->scaled_last_source;
+    cpi->scaled_last_source_available = 0;
+  } else if (cpi->unscaled_last_source != NULL) {
+    cpi->last_source = av1_realloc_and_scale_if_required(
+        cm, cpi->unscaled_last_source, &cpi->scaled_last_source, filter_scaler,
+        phase_scaler, true, false, cpi->oxcf.border_in_pixels,
+        cpi->image_pyramid_levels);
+  }
+
+  if (cpi->sf.rt_sf.use_temporal_noise_estimate) {
+    av1_update_noise_estimate(cpi);
+  }
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+  if (cpi->oxcf.noise_sensitivity > 0 && cpi->ppi->use_svc)
+    av1_denoiser_reset_on_first_frame(cpi);
+#endif
+
+  // For 1 spatial layer encoding: if the (non-LAST) reference has a different
+  // resolution from the source then disable that reference. This is to avoid
+  // a significant increase in encode time from scaling the references in
+  // av1_scale_references. Note GOLDEN is forced to update on the
+  // (first/trigger) resized frame and ALTREF will be refreshed ~4 frames
+  // later, so both references become available again after a few frames.
+  // For superres: don't disable golden reference.
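+  // In the checks below, a reference whose flag is known to be set in
+  // ref_frame_flags is disabled by XOR-ing that flag, which simply clears
+  // the bit.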
+ if (svc->number_spatial_layers == 1) { + if (!cpi->oxcf.superres_cfg.enable_superres) { + if (cpi->ref_frame_flags & av1_ref_frame_flag_list[GOLDEN_FRAME]) { + const YV12_BUFFER_CONFIG *const ref = + get_ref_frame_yv12_buf(cm, GOLDEN_FRAME); + if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) + cpi->ref_frame_flags ^= AOM_GOLD_FLAG; + } + } + if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]) { + const YV12_BUFFER_CONFIG *const ref = + get_ref_frame_yv12_buf(cm, ALTREF_FRAME); + if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) + cpi->ref_frame_flags ^= AOM_ALT_FLAG; + } + } + + int scale_references = 0; +#if CONFIG_FPMT_TEST + scale_references = + cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE ? 1 : 0; +#endif // CONFIG_FPMT_TEST + if (scale_references || + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) { + if (!frame_is_intra_only(cm)) { + av1_scale_references(cpi, filter_scaler, phase_scaler, 1); + } + } + + av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q, + q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq); + av1_set_speed_features_qindex_dependent(cpi, cpi->oxcf.speed); + av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params, + cm->seq_params->bit_depth); + av1_set_variance_partition_thresholds(cpi, q, 0); + av1_setup_frame(cpi); + + // Check if this high_source_sad (scene/slide change) frame should be + // encoded at high/max QP, and if so, set the q and adjust some rate + // control parameters. + if (cpi->sf.rt_sf.overshoot_detection_cbr == FAST_DETECTION_MAXQ && + cpi->rc.high_source_sad) { + if (av1_encodedframe_overshoot_cbr(cpi, &q)) { + av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q, + q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq); + av1_set_speed_features_qindex_dependent(cpi, cpi->oxcf.speed); + av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params, + cm->seq_params->bit_depth); + av1_set_variance_partition_thresholds(cpi, q, 0); + if (frame_is_intra_only(cm) || cm->features.error_resilient_mode || + cm->features.primary_ref_frame == PRIMARY_REF_NONE) + av1_setup_frame(cpi); + } + } + + if (q_cfg->aq_mode == CYCLIC_REFRESH_AQ) { + suppress_active_map(cpi); + av1_cyclic_refresh_setup(cpi); + } + av1_apply_active_map(cpi); + if (cm->seg.enabled) { + if (!cm->seg.update_data && cm->prev_frame) { + segfeatures_copy(&cm->seg, &cm->prev_frame->seg); + cm->seg.enabled = cm->prev_frame->seg.enabled; + } else { + av1_calculate_segdata(&cm->seg); + } + } else { + memset(&cm->seg, 0, sizeof(cm->seg)); + } + segfeatures_copy(&cm->cur_frame->seg, &cm->seg); + cm->cur_frame->seg.enabled = cm->seg.enabled; + + // This is for rtc temporal filtering case. + if (is_psnr_calc_enabled(cpi) && cpi->sf.rt_sf.use_rtc_tf && + cm->current_frame.frame_type != KEY_FRAME) { + const SequenceHeader *seq_params = cm->seq_params; + + if (cpi->orig_source.buffer_alloc_sz == 0 || + cpi->last_source->y_width != cpi->source->y_width || + cpi->last_source->y_height != cpi->source->y_height) { + // Allocate a source buffer to store the true source for psnr calculation. 
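+      // (The rtc temporal filter path can modify cpi->source in place, so an
+      // unfiltered copy is kept here and used as the reference for the PSNR
+      // computation.)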
+      if (aom_alloc_frame_buffer(
+              &cpi->orig_source, cpi->oxcf.frm_dim_cfg.width,
+              cpi->oxcf.frm_dim_cfg.height, seq_params->subsampling_x,
+              seq_params->subsampling_y, seq_params->use_highbitdepth,
+              cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0))
+        aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                           "Failed to allocate buffer for orig_source");
+    }
+
+    aom_yv12_copy_y(cpi->source, &cpi->orig_source);
+    aom_yv12_copy_u(cpi->source, &cpi->orig_source);
+    aom_yv12_copy_v(cpi->source, &cpi->orig_source);
+  }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, av1_encode_frame_time);
+#endif
+
+  // Set the motion vector precision based on mv stats from the last coded
+  // frame.
+  if (!frame_is_intra_only(cm)) av1_pick_and_set_high_precision_mv(cpi, q);
+
+  // Transform / motion compensation: build the reconstruction frame.
+  av1_encode_frame(cpi);
+
+  if (!cpi->rc.rtc_external_ratectrl && !frame_is_intra_only(cm))
+    update_motion_stat(cpi);
+
+  // Adjust the refresh of the golden (longer-term) reference based on QP
+  // selected for this frame. This is for CBR with 1 layer/non-svc RTC mode.
+  if (!frame_is_intra_only(cm) && cpi->oxcf.rc_cfg.mode == AOM_CBR &&
+      cpi->oxcf.mode == REALTIME && svc->number_spatial_layers == 1 &&
+      svc->number_temporal_layers == 1 && !cpi->rc.rtc_external_ratectrl &&
+      sf->rt_sf.gf_refresh_based_on_qp)
+    av1_adjust_gf_refresh_qp_one_pass_rt(cpi);
+
+  // For non-svc: if scaling is required, copy scaled_source
+  // into scaled_last_source.
+  if (cm->current_frame.frame_number > 1 && !cpi->ppi->use_svc &&
+      cpi->scaled_source.y_buffer != NULL &&
+      cpi->scaled_last_source.y_buffer != NULL &&
+      cpi->scaled_source.y_crop_width == cpi->scaled_last_source.y_crop_width &&
+      cpi->scaled_source.y_crop_height ==
+          cpi->scaled_last_source.y_crop_height &&
+      (cm->width != cpi->unscaled_source->y_crop_width ||
+       cm->height != cpi->unscaled_source->y_crop_height)) {
+    cpi->scaled_last_source_available = 1;
+    aom_yv12_copy_y(&cpi->scaled_source, &cpi->scaled_last_source);
+    aom_yv12_copy_u(&cpi->scaled_source, &cpi->scaled_last_source);
+    aom_yv12_copy_v(&cpi->scaled_source, &cpi->scaled_last_source);
+  }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, av1_encode_frame_time);
+#endif
+#if CONFIG_INTERNAL_STATS
+  ++cpi->frame_recode_hits;
+#endif
+
+  return AOM_CODEC_OK;
+}
+
+#if !CONFIG_REALTIME_ONLY
+
+/*!\brief Recode loop for encoding one frame. The purpose of encoding one
+ * frame multiple times can be to approach a target bitrate or to adjust the
+ * usage of global motion.
+ *
+ * \ingroup high_level_algo
+ *
+ * \param[in]    cpi             Top-level encoder structure
+ * \param[in]    size            Bitstream size
+ * \param[in]    dest            Bitstream output
+ *
+ * \return Returns a value to indicate if the encoding is done successfully.
+ * \retval #AOM_CODEC_OK
+ * \retval -1
+ * \retval #AOM_CODEC_ERROR
+ */
+static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) {
+  AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  GlobalMotionInfo *const gm_info = &cpi->gm_info;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const QuantizationCfg *const q_cfg = &oxcf->q_cfg;
+  const int allow_recode = (cpi->sf.hl_sf.recode_loop != DISALLOW_RECODE);
+  // Must allow recode if minimum compression ratio is set.
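+  // IMPLIES(a, b) is logical implication, i.e. (!(a) || (b)); the assert
+  // below can only fire when min_cr > 0 while recoding is disallowed.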
+  assert(IMPLIES(oxcf->rc_cfg.min_cr > 0, allow_recode));
+
+  set_size_independent_vars(cpi);
+  if (is_stat_consumption_stage_twopass(cpi) &&
+      cpi->sf.interp_sf.adaptive_interp_filter_search)
+    cpi->interp_search_flags.interp_filter_search_mask =
+        av1_setup_interp_filter_search_mask(cpi);
+
+  av1_setup_frame_size(cpi);
+
+  if (av1_superres_in_recode_allowed(cpi) &&
+      cpi->superres_mode != AOM_SUPERRES_NONE &&
+      cm->superres_scale_denominator == SCALE_NUMERATOR) {
+    // Superres mode is currently enabled, but the denominator selected will
+    // disable superres. So no need to continue, as we will go through another
+    // recode loop for full-resolution after this anyway.
+    return -1;
+  }
+
+  int top_index = 0, bottom_index = 0;
+  int q = 0, q_low = 0, q_high = 0;
+  av1_set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
+  q_low = bottom_index;
+  q_high = top_index;
+
+  av1_set_mv_search_params(cpi);
+
+  allocate_gradient_info_for_hog(cpi);
+
+  allocate_src_var_of_4x4_sub_block_buf(cpi);
+
+  if (cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION)
+    variance_partition_alloc(cpi);
+
+  if (cm->current_frame.frame_type == KEY_FRAME) copy_frame_prob_info(cpi);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  printf("\n Encoding a frame: \n");
+#endif
+
+#if !CONFIG_RD_COMMAND
+  // Determine whether to use screen content tools, using two fast encoding
+  // passes.
+  if (!cpi->sf.hl_sf.disable_extra_sc_testing && !cpi->use_ducky_encode)
+    av1_determine_sc_tools_with_encoding(cpi, q);
+#endif  // !CONFIG_RD_COMMAND
+
+#if CONFIG_TUNE_VMAF
+  if (oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) {
+    av1_vmaf_neg_preprocessing(cpi, cpi->unscaled_source);
+  }
+#endif
+
+#if CONFIG_TUNE_BUTTERAUGLI
+  cpi->butteraugli_info.recon_set = false;
+  int original_q = 0;
+#endif
+
+  cpi->num_frame_recode = 0;
+
+  // Loop variables
+  int loop = 0;
+  int loop_count = 0;
+  int overshoot_seen = 0;
+  int undershoot_seen = 0;
+  int low_cr_seen = 0;
+  int last_loop_allow_hp = 0;
+
+  do {
+    loop = 0;
+    int do_mv_stats_collection = 1;
+
+    // If the frame was scaled, redo the global motion search if it has
+    // already been done.
+    if (loop_count > 0 && cpi->source && gm_info->search_done) {
+      if (cpi->source->y_crop_width != cm->width ||
+          cpi->source->y_crop_height != cm->height) {
+        gm_info->search_done = 0;
+      }
+    }
+    cpi->source = av1_realloc_and_scale_if_required(
+        cm, cpi->unscaled_source, &cpi->scaled_source, EIGHTTAP_REGULAR, 0,
+        false, false, cpi->oxcf.border_in_pixels, cpi->image_pyramid_levels);
+
+#if CONFIG_TUNE_BUTTERAUGLI
+    if (oxcf->tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI) {
+      if (loop_count == 0) {
+        original_q = q;
+        // TODO(sdeng): different q here does not make a big difference. Use a
+        // faster pass instead.
+        q = 96;
+        av1_setup_butteraugli_source(cpi);
+      } else {
+        q = original_q;
+      }
+    }
+#endif
+
+    if (cpi->unscaled_last_source != NULL) {
+      cpi->last_source = av1_realloc_and_scale_if_required(
+          cm, cpi->unscaled_last_source, &cpi->scaled_last_source,
+          EIGHTTAP_REGULAR, 0, false, false, cpi->oxcf.border_in_pixels,
+          cpi->image_pyramid_levels);
+    }
+
+    int scale_references = 0;
+#if CONFIG_FPMT_TEST
+    scale_references =
+        cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE ?
1 : 0; +#endif // CONFIG_FPMT_TEST + if (scale_references || + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) { + if (!frame_is_intra_only(cm)) { + if (loop_count > 0) { + release_scaled_references(cpi); + } + av1_scale_references(cpi, EIGHTTAP_REGULAR, 0, 0); + } + } + +#if CONFIG_TUNE_VMAF + if (oxcf->tune_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING && + oxcf->tune_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN) { + cpi->vmaf_info.original_qindex = q; + q = av1_get_vmaf_base_qindex(cpi, q); + } +#endif + +#if CONFIG_RD_COMMAND + RD_COMMAND *rd_command = &cpi->rd_command; + RD_OPTION option = rd_command->option_ls[rd_command->frame_index]; + if (option == RD_OPTION_SET_Q || option == RD_OPTION_SET_Q_RDMULT) { + q = rd_command->q_index_ls[rd_command->frame_index]; + } +#endif // CONFIG_RD_COMMAND + +#if CONFIG_BITRATE_ACCURACY +#if CONFIG_THREE_PASS + if (oxcf->pass == AOM_RC_THIRD_PASS && cpi->vbr_rc_info.ready == 1) { + int frame_coding_idx = + av1_vbr_rc_frame_coding_idx(&cpi->vbr_rc_info, cpi->gf_frame_index); + if (frame_coding_idx < cpi->vbr_rc_info.total_frame_count) { + q = cpi->vbr_rc_info.q_index_list[frame_coding_idx]; + } else { + // TODO(angiebird): Investigate why sometimes there is an extra frame + // after the last GOP. + q = cpi->vbr_rc_info.base_q_index; + } + } +#else + if (cpi->vbr_rc_info.q_index_list_ready) { + q = cpi->vbr_rc_info.q_index_list[cpi->gf_frame_index]; + } +#endif // CONFIG_THREE_PASS +#endif // CONFIG_BITRATE_ACCURACY + +#if CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY + // TODO(angiebird): Move this into a function. + if (oxcf->pass == AOM_RC_THIRD_PASS) { + int frame_coding_idx = + av1_vbr_rc_frame_coding_idx(&cpi->vbr_rc_info, cpi->gf_frame_index); + double qstep_ratio = cpi->vbr_rc_info.qstep_ratio_list[frame_coding_idx]; + FRAME_UPDATE_TYPE update_type = + cpi->vbr_rc_info.update_type_list[frame_coding_idx]; + rc_log_frame_encode_param(&cpi->rc_log, frame_coding_idx, qstep_ratio, q, + update_type); + } +#endif // CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY + + if (cpi->use_ducky_encode) { + const DuckyEncodeFrameInfo *frame_info = + &cpi->ducky_encode_info.frame_info; + if (frame_info->qp_mode == DUCKY_ENCODE_FRAME_MODE_QINDEX) { + q = frame_info->q_index; + cm->delta_q_info.delta_q_present_flag = frame_info->delta_q_enabled; + } + } + + av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q, + q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq); + av1_set_speed_features_qindex_dependent(cpi, oxcf->speed); + av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params, + cm->seq_params->bit_depth); + + av1_set_variance_partition_thresholds(cpi, q, 0); + + // printf("Frame %d/%d: q = %d, frame_type = %d superres_denom = %d\n", + // cm->current_frame.frame_number, cm->show_frame, q, + // cm->current_frame.frame_type, cm->superres_scale_denominator); + + if (loop_count == 0) { + av1_setup_frame(cpi); + } else if (get_primary_ref_frame_buf(cm) == NULL) { + // Base q-index may have changed, so we need to assign proper default coef + // probs before every iteration. 
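+      // (The default coefficient CDFs come in several q-index dependent
+      // sets, so they must be re-seeded here where no primary reference
+      // frame is available to inherit context from.)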
+      av1_default_coef_probs(cm);
+      av1_setup_frame_contexts(cm);
+    }
+
+    if (q_cfg->aq_mode == VARIANCE_AQ) {
+      av1_vaq_frame_setup(cpi);
+    } else if (q_cfg->aq_mode == COMPLEXITY_AQ) {
+      av1_setup_in_frame_q_adj(cpi);
+    }
+
+    if (cm->seg.enabled) {
+      if (!cm->seg.update_data && cm->prev_frame) {
+        segfeatures_copy(&cm->seg, &cm->prev_frame->seg);
+        cm->seg.enabled = cm->prev_frame->seg.enabled;
+      } else {
+        av1_calculate_segdata(&cm->seg);
+      }
+    } else {
+      memset(&cm->seg, 0, sizeof(cm->seg));
+    }
+    segfeatures_copy(&cm->cur_frame->seg, &cm->seg);
+    cm->cur_frame->seg.enabled = cm->seg.enabled;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, av1_encode_frame_time);
+#endif
+    // Set the motion vector precision based on mv stats from the last coded
+    // frame.
+    if (!frame_is_intra_only(cm)) {
+      av1_pick_and_set_high_precision_mv(cpi, q);
+
+      // If the precision has changed during different iterations of the loop,
+      // then we need to reset the global motion vectors.
+      if (loop_count > 0 &&
+          cm->features.allow_high_precision_mv != last_loop_allow_hp) {
+        gm_info->search_done = 0;
+      }
+      last_loop_allow_hp = cm->features.allow_high_precision_mv;
+    }
+
+    // Transform / motion compensation: build the reconstruction frame.
+    av1_encode_frame(cpi);
+
+    // Disable mv_stats collection for parallel frames based on update flag.
+    if (!cpi->do_frame_data_update) do_mv_stats_collection = 0;
+
+    // Reset the mv_stats in case we are interrupted by an intraframe or an
+    // overlay frame.
+    if (cpi->mv_stats.valid && do_mv_stats_collection) av1_zero(cpi->mv_stats);
+
+    // Gather the mv_stats for the next frame
+    if (cpi->sf.hl_sf.high_precision_mv_usage == LAST_MV_DATA &&
+        av1_frame_allows_smart_mv(cpi) && do_mv_stats_collection) {
+      av1_collect_mv_stats(cpi, q);
+    }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, av1_encode_frame_time);
+#endif
+
+#if CONFIG_BITRATE_ACCURACY || CONFIG_RD_COMMAND
+    const int do_dummy_pack = 1;
+#else  // CONFIG_BITRATE_ACCURACY || CONFIG_RD_COMMAND
+    // Dummy pack of the bitstream using up to date stats to get an
+    // accurate estimate of output frame size to determine if we need
+    // to recode.
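+    // The output of this dummy pack is only used for the size estimate; the
+    // frame is packed again after the in-loop filters once the loop settles.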
+ const int do_dummy_pack = + (cpi->sf.hl_sf.recode_loop >= ALLOW_RECODE_KFARFGF && + oxcf->rc_cfg.mode != AOM_Q) || + oxcf->rc_cfg.min_cr > 0; +#endif // CONFIG_BITRATE_ACCURACY + if (do_dummy_pack) { + av1_finalize_encoded_frame(cpi); + int largest_tile_id = 0; // Output from bitstream: unused here + rc->coefficient_size = 0; + if (av1_pack_bitstream(cpi, dest, size, &largest_tile_id) != + AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } + + // bits used for this frame + rc->projected_frame_size = (int)(*size) << 3; +#if CONFIG_RD_COMMAND + PSNR_STATS psnr; + aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr); + printf("q %d rdmult %d rate %d dist %" PRIu64 "\n", q, cpi->rd.RDMULT, + rc->projected_frame_size, psnr.sse[0]); + ++rd_command->frame_index; + if (rd_command->frame_index == rd_command->frame_count) { + return AOM_CODEC_ERROR; + } +#endif // CONFIG_RD_COMMAND + +#if CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY + if (oxcf->pass == AOM_RC_THIRD_PASS) { + int frame_coding_idx = + av1_vbr_rc_frame_coding_idx(&cpi->vbr_rc_info, cpi->gf_frame_index); + rc_log_frame_entropy(&cpi->rc_log, frame_coding_idx, + rc->projected_frame_size, rc->coefficient_size); + } +#endif // CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY + } + +#if CONFIG_TUNE_VMAF + if (oxcf->tune_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING && + oxcf->tune_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN) { + q = cpi->vmaf_info.original_qindex; + } +#endif + if (allow_recode) { + // Update q and decide whether to do a recode loop + recode_loop_update_q(cpi, &loop, &q, &q_low, &q_high, top_index, + bottom_index, &undershoot_seen, &overshoot_seen, + &low_cr_seen, loop_count); + } + +#if CONFIG_TUNE_BUTTERAUGLI + if (loop_count == 0 && oxcf->tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI) { + loop = 1; + av1_setup_butteraugli_rdmult_and_restore_source(cpi, 0.4); + } +#endif + + if (cpi->use_ducky_encode) { + // Ducky encode currently does not support recode loop. + loop = 0; + } +#if CONFIG_BITRATE_ACCURACY || CONFIG_RD_COMMAND + loop = 0; // turn off recode loop when CONFIG_BITRATE_ACCURACY is on +#endif // CONFIG_BITRATE_ACCURACY || CONFIG_RD_COMMAND + + if (loop) { + ++loop_count; + cpi->num_frame_recode = + (cpi->num_frame_recode < (NUM_RECODES_PER_FRAME - 1)) + ? (cpi->num_frame_recode + 1) + : (NUM_RECODES_PER_FRAME - 1); +#if CONFIG_INTERNAL_STATS + ++cpi->frame_recode_hits; +#endif + } +#if CONFIG_COLLECT_COMPONENT_TIMING + if (loop) printf("\n Recoding:"); +#endif + } while (loop); + + return AOM_CODEC_OK; +} +#endif // !CONFIG_REALTIME_ONLY + +// TODO(jingning, paulwilkins): Set up high grain level to test +// hardware decoders. Need to adapt the actual noise variance +// according to the difference between reconstructed frame and the +// source signal. 
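+// The parameters below describe the simplest possible synthetic grain: one
+// scaling point per plane (strength 100 at pixel value 128) and an AR
+// coefficient lag of 0, i.e. spatially uncorrelated noise of fixed strength.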
+static void set_grain_syn_params(AV1_COMMON *cm) { + aom_film_grain_t *film_grain_params = &cm->film_grain_params; + film_grain_params->apply_grain = 1; + film_grain_params->update_parameters = 1; + film_grain_params->random_seed = rand() & 0xffff; + + film_grain_params->num_y_points = 1; + film_grain_params->scaling_points_y[0][0] = 128; + film_grain_params->scaling_points_y[0][1] = 100; + + if (!cm->seq_params->monochrome) { + film_grain_params->num_cb_points = 1; + film_grain_params->scaling_points_cb[0][0] = 128; + film_grain_params->scaling_points_cb[0][1] = 100; + + film_grain_params->num_cr_points = 1; + film_grain_params->scaling_points_cr[0][0] = 128; + film_grain_params->scaling_points_cr[0][1] = 100; + } else { + film_grain_params->num_cb_points = 0; + film_grain_params->num_cr_points = 0; + } + + film_grain_params->chroma_scaling_from_luma = 0; + + film_grain_params->scaling_shift = 1; + film_grain_params->ar_coeff_lag = 0; + film_grain_params->ar_coeff_shift = 1; + film_grain_params->overlap_flag = 1; + film_grain_params->grain_scale_shift = 0; +} + +/*!\brief Recode loop or a single loop for encoding one frame, followed by + * in-loop deblocking filters, CDEF filters, and restoration filters. + * + * \ingroup high_level_algo + * \callgraph + * \callergraph + * + * \param[in] cpi Top-level encoder structure + * \param[in] size Bitstream size + * \param[in] dest Bitstream output + * \param[in] sse Total distortion of the frame + * \param[in] rate Total rate of the frame + * \param[in] largest_tile_id Tile id of the last tile + * + * \return Returns a value to indicate if the encoding is done successfully. + * \retval #AOM_CODEC_OK + * \retval #AOM_CODEC_ERROR + */ +static int encode_with_recode_loop_and_filter(AV1_COMP *cpi, size_t *size, + uint8_t *dest, int64_t *sse, + int64_t *rate, + int *largest_tile_id) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, encode_with_or_without_recode_time); +#endif + for (int i = 0; i < NUM_RECODES_PER_FRAME; i++) { + cpi->do_update_frame_probs_txtype[i] = 0; + cpi->do_update_frame_probs_obmc[i] = 0; + cpi->do_update_frame_probs_warp[i] = 0; + cpi->do_update_frame_probs_interpfilter[i] = 0; + } + + cpi->do_update_vbr_bits_off_target_fast = 0; + int err; +#if CONFIG_REALTIME_ONLY + err = encode_without_recode(cpi); +#else + if (cpi->sf.hl_sf.recode_loop == DISALLOW_RECODE) + err = encode_without_recode(cpi); + else + err = encode_with_recode_loop(cpi, size, dest); +#endif +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, encode_with_or_without_recode_time); +#endif + if (err != AOM_CODEC_OK) { + if (err == -1) { + // special case as described in encode_with_recode_loop(). + // Encoding was skipped. + err = AOM_CODEC_OK; + if (sse != NULL) *sse = INT64_MAX; + if (rate != NULL) *rate = INT64_MAX; + *largest_tile_id = 0; + } + return err; + } + +#ifdef OUTPUT_YUV_DENOISED + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + if (oxcf->noise_sensitivity > 0 && denoise_svc(cpi)) { + aom_write_yuv_frame(yuv_denoised_file, + &cpi->denoiser.running_avg_y[INTRA_FRAME]); + } +#endif + + AV1_COMMON *const cm = &cpi->common; + SequenceHeader *const seq_params = cm->seq_params; + + // Special case code to reduce pulsing when key frames are forced at a + // fixed interval. 
Note the reconstruction error if this is the frame before
+  // the forced key frame.
+  if (cpi->ppi->p_rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) {
+#if CONFIG_AV1_HIGHBITDEPTH
+    if (seq_params->use_highbitdepth) {
+      cpi->ambient_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf);
+    } else {
+      cpi->ambient_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+    }
+#else
+    cpi->ambient_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+#endif
+  }
+
+  cm->cur_frame->buf.color_primaries = seq_params->color_primaries;
+  cm->cur_frame->buf.transfer_characteristics =
+      seq_params->transfer_characteristics;
+  cm->cur_frame->buf.matrix_coefficients = seq_params->matrix_coefficients;
+  cm->cur_frame->buf.monochrome = seq_params->monochrome;
+  cm->cur_frame->buf.chroma_sample_position =
+      seq_params->chroma_sample_position;
+  cm->cur_frame->buf.color_range = seq_params->color_range;
+  cm->cur_frame->buf.render_width = cm->render_width;
+  cm->cur_frame->buf.render_height = cm->render_height;
+
+  if (!cpi->mt_info.pipeline_lpf_mt_with_enc)
+    set_postproc_filter_default_params(&cpi->common);
+
+  if (!cm->features.allow_intrabc) {
+    loopfilter_frame(cpi, cm);
+  }
+
+  if (cpi->oxcf.mode != ALLINTRA && !cpi->ppi->rtc_ref.non_reference_frame) {
+    extend_frame_borders(cpi);
+  }
+
+#ifdef OUTPUT_YUV_REC
+  aom_write_one_yuv_frame(cm, &cm->cur_frame->buf);
+#endif
+
+  if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_FILM) {
+    set_grain_syn_params(cm);
+  }
+
+  av1_finalize_encoded_frame(cpi);
+  // Build the bitstream
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, av1_pack_bitstream_final_time);
+#endif
+  cpi->rc.coefficient_size = 0;
+  if (av1_pack_bitstream(cpi, dest, size, largest_tile_id) != AOM_CODEC_OK)
+    return AOM_CODEC_ERROR;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, av1_pack_bitstream_final_time);
+#endif
+
+  // Compute sse and rate.
+  if (sse != NULL) {
+#if CONFIG_AV1_HIGHBITDEPTH
+    *sse = (seq_params->use_highbitdepth)
+               ? aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf)
+               : aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+#else
+    *sse = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+#endif
+  }
+  if (rate != NULL) {
+    const int64_t bits = (*size << 3);
+    *rate = (bits << 5);  // To match scale.
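+    // For example, a 1000-byte frame gives bits = 8000 and *rate = 256000;
+    // the shift by 5 keeps the value in the same fixed-point scale as the
+    // rates fed to RDCOST_DBL_WITH_NATIVE_BD_DIST() by the callers.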
+ } + +#if !CONFIG_REALTIME_ONLY + if (cpi->use_ducky_encode) { + PSNR_STATS psnr; + aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr); + DuckyEncodeFrameResult *frame_result = &cpi->ducky_encode_info.frame_result; + frame_result->global_order_idx = cm->cur_frame->display_order_hint; + frame_result->q_index = cm->quant_params.base_qindex; + frame_result->rdmult = cpi->rd.RDMULT; + frame_result->rate = (int)(*size) * 8; + frame_result->dist = psnr.sse[0]; + frame_result->psnr = psnr.psnr[0]; + } +#endif // !CONFIG_REALTIME_ONLY + + return AOM_CODEC_OK; +} + +static int encode_with_and_without_superres(AV1_COMP *cpi, size_t *size, + uint8_t *dest, + int *largest_tile_id) { + const AV1_COMMON *const cm = &cpi->common; + assert(cm->seq_params->enable_superres); + assert(av1_superres_in_recode_allowed(cpi)); + aom_codec_err_t err = AOM_CODEC_OK; + av1_save_all_coding_context(cpi); + + int64_t sse1 = INT64_MAX; + int64_t rate1 = INT64_MAX; + int largest_tile_id1 = 0; + int64_t sse2 = INT64_MAX; + int64_t rate2 = INT64_MAX; + int largest_tile_id2; + double proj_rdcost1 = DBL_MAX; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const FRAME_UPDATE_TYPE update_type = + gf_group->update_type[cpi->gf_frame_index]; + const aom_bit_depth_t bit_depth = cm->seq_params->bit_depth; + + // Encode with superres. + if (cpi->sf.hl_sf.superres_auto_search_type == SUPERRES_AUTO_ALL) { + SuperResCfg *const superres_cfg = &cpi->oxcf.superres_cfg; + int64_t superres_sses[SCALE_NUMERATOR]; + int64_t superres_rates[SCALE_NUMERATOR]; + int superres_largest_tile_ids[SCALE_NUMERATOR]; + // Use superres for Key-frames and Alt-ref frames only. + if (update_type != OVERLAY_UPDATE && update_type != INTNL_OVERLAY_UPDATE) { + for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR; + ++denom) { + superres_cfg->superres_scale_denominator = denom; + superres_cfg->superres_kf_scale_denominator = denom; + const int this_index = denom - (SCALE_NUMERATOR + 1); + + cpi->superres_mode = AOM_SUPERRES_AUTO; // Super-res on for this loop. + err = encode_with_recode_loop_and_filter( + cpi, size, dest, &superres_sses[this_index], + &superres_rates[this_index], + &superres_largest_tile_ids[this_index]); + cpi->superres_mode = AOM_SUPERRES_NONE; // Reset to default (full-res). + if (err != AOM_CODEC_OK) return err; + restore_all_coding_context(cpi); + } + // Reset. + superres_cfg->superres_scale_denominator = SCALE_NUMERATOR; + superres_cfg->superres_kf_scale_denominator = SCALE_NUMERATOR; + } else { + for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR; + ++denom) { + const int this_index = denom - (SCALE_NUMERATOR + 1); + superres_sses[this_index] = INT64_MAX; + superres_rates[this_index] = INT64_MAX; + } + } + // Encode without superres. + assert(cpi->superres_mode == AOM_SUPERRES_NONE); + err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse2, &rate2, + &largest_tile_id2); + if (err != AOM_CODEC_OK) return err; + + // Note: Both use common rdmult based on base qindex of fullres. + const int64_t rdmult = av1_compute_rd_mult_based_on_qindex( + bit_depth, update_type, cm->quant_params.base_qindex); + + // Find the best rdcost among all superres denoms. 
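+    // With SCALE_NUMERATOR == 8, the loop below scans denominators 9..16
+    // (scaling factors 8/9 down to 8/16), which this_index maps to array
+    // slots 0..7.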
+ int best_denom = -1; + for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR; + ++denom) { + const int this_index = denom - (SCALE_NUMERATOR + 1); + const int64_t this_sse = superres_sses[this_index]; + const int64_t this_rate = superres_rates[this_index]; + const int this_largest_tile_id = superres_largest_tile_ids[this_index]; + const double this_rdcost = RDCOST_DBL_WITH_NATIVE_BD_DIST( + rdmult, this_rate, this_sse, bit_depth); + if (this_rdcost < proj_rdcost1) { + sse1 = this_sse; + rate1 = this_rate; + largest_tile_id1 = this_largest_tile_id; + proj_rdcost1 = this_rdcost; + best_denom = denom; + } + } + const double proj_rdcost2 = + RDCOST_DBL_WITH_NATIVE_BD_DIST(rdmult, rate2, sse2, bit_depth); + // Re-encode with superres if it's better. + if (proj_rdcost1 < proj_rdcost2) { + restore_all_coding_context(cpi); + // TODO(urvang): We should avoid rerunning the recode loop by saving + // previous output+state, or running encode only for the selected 'q' in + // previous step. + // Again, temporarily force the best denom. + superres_cfg->superres_scale_denominator = best_denom; + superres_cfg->superres_kf_scale_denominator = best_denom; + int64_t sse3 = INT64_MAX; + int64_t rate3 = INT64_MAX; + cpi->superres_mode = + AOM_SUPERRES_AUTO; // Super-res on for this recode loop. + err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse3, &rate3, + largest_tile_id); + cpi->superres_mode = AOM_SUPERRES_NONE; // Reset to default (full-res). + assert(sse1 == sse3); + assert(rate1 == rate3); + assert(largest_tile_id1 == *largest_tile_id); + // Reset. + superres_cfg->superres_scale_denominator = SCALE_NUMERATOR; + superres_cfg->superres_kf_scale_denominator = SCALE_NUMERATOR; + } else { + *largest_tile_id = largest_tile_id2; + } + } else { + assert(cpi->sf.hl_sf.superres_auto_search_type == SUPERRES_AUTO_DUAL); + cpi->superres_mode = + AOM_SUPERRES_AUTO; // Super-res on for this recode loop. + err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse1, &rate1, + &largest_tile_id1); + cpi->superres_mode = AOM_SUPERRES_NONE; // Reset to default (full-res). + if (err != AOM_CODEC_OK) return err; + restore_all_coding_context(cpi); + // Encode without superres. + assert(cpi->superres_mode == AOM_SUPERRES_NONE); + err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse2, &rate2, + &largest_tile_id2); + if (err != AOM_CODEC_OK) return err; + + // Note: Both use common rdmult based on base qindex of fullres. + const int64_t rdmult = av1_compute_rd_mult_based_on_qindex( + bit_depth, update_type, cm->quant_params.base_qindex); + proj_rdcost1 = + RDCOST_DBL_WITH_NATIVE_BD_DIST(rdmult, rate1, sse1, bit_depth); + const double proj_rdcost2 = + RDCOST_DBL_WITH_NATIVE_BD_DIST(rdmult, rate2, sse2, bit_depth); + // Re-encode with superres if it's better. + if (proj_rdcost1 < proj_rdcost2) { + restore_all_coding_context(cpi); + // TODO(urvang): We should avoid rerunning the recode loop by saving + // previous output+state, or running encode only for the selected 'q' in + // previous step. + int64_t sse3 = INT64_MAX; + int64_t rate3 = INT64_MAX; + cpi->superres_mode = + AOM_SUPERRES_AUTO; // Super-res on for this recode loop. + err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse3, &rate3, + largest_tile_id); + cpi->superres_mode = AOM_SUPERRES_NONE; // Reset to default (full-res). 
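+      // The re-encode with the winning settings is expected to reproduce
+      // the earlier pass exactly; the asserts below verify this via sse,
+      // rate and the largest tile id.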
+      assert(sse1 == sse3);
+      assert(rate1 == rate3);
+      assert(largest_tile_id1 == *largest_tile_id);
+    } else {
+      *largest_tile_id = largest_tile_id2;
+    }
+  }
+
+  return err;
+}
+
+// Conditions to disable cdf_update mode in selective mode for real-time.
+// Handles the cases of layers, scene change, and resizing.
+static AOM_INLINE int selective_disable_cdf_rtc(const AV1_COMP *cpi) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  // For single layer.
+  if (cpi->svc.number_spatial_layers == 1 &&
+      cpi->svc.number_temporal_layers == 1) {
+    // Don't disable on intra_only, scene change (high_source_sad = 1),
+    // or resized frame. To avoid quality loss, force-enable the update for
+    // ~30 frames after a key or scene/slide change, and after 8 frames since
+    // the last update if frame_source_sad > 0.
+    if (frame_is_intra_only(cm) || is_frame_resize_pending(cpi) ||
+        rc->high_source_sad || rc->frames_since_key < 30 ||
+        (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
+         cpi->cyclic_refresh->counter_encode_maxq_scene_change < 30) ||
+        (cpi->frames_since_last_update > 8 && cpi->rc.frame_source_sad > 0))
+      return 0;
+    else
+      return 1;
+  } else if (cpi->svc.number_temporal_layers > 1) {
+    // Disable only on top temporal enhancement layer for now.
+    return cpi->svc.temporal_layer_id == cpi->svc.number_temporal_layers - 1;
+  }
+  return 1;
+}
+
+#if !CONFIG_REALTIME_ONLY
+static void subtract_stats(FIRSTPASS_STATS *section,
+                           const FIRSTPASS_STATS *frame) {
+  section->frame -= frame->frame;
+  section->weight -= frame->weight;
+  section->intra_error -= frame->intra_error;
+  section->frame_avg_wavelet_energy -= frame->frame_avg_wavelet_energy;
+  section->coded_error -= frame->coded_error;
+  section->sr_coded_error -= frame->sr_coded_error;
+  section->pcnt_inter -= frame->pcnt_inter;
+  section->pcnt_motion -= frame->pcnt_motion;
+  section->pcnt_second_ref -= frame->pcnt_second_ref;
+  section->pcnt_neutral -= frame->pcnt_neutral;
+  section->intra_skip_pct -= frame->intra_skip_pct;
+  section->inactive_zone_rows -= frame->inactive_zone_rows;
+  section->inactive_zone_cols -= frame->inactive_zone_cols;
+  section->MVr -= frame->MVr;
+  section->mvr_abs -= frame->mvr_abs;
+  section->MVc -= frame->MVc;
+  section->mvc_abs -= frame->mvc_abs;
+  section->MVrv -= frame->MVrv;
+  section->MVcv -= frame->MVcv;
+  section->mv_in_out_count -= frame->mv_in_out_count;
+  section->new_mv_count -= frame->new_mv_count;
+  section->count -= frame->count;
+  section->duration -= frame->duration;
+}
+
+static void calculate_frame_avg_haar_energy(AV1_COMP *cpi) {
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
+  const FIRSTPASS_STATS *const total_stats =
+      twopass->stats_buf_ctx->total_stats;
+
+  if (is_one_pass_rt_params(cpi) ||
+      (cpi->oxcf.q_cfg.deltaq_mode != DELTA_Q_PERCEPTUAL) ||
+      (is_fp_wavelet_energy_invalid(total_stats) == 0))
+    return;
+
+  const int num_mbs = (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE)
+                          ?
cpi->initial_mbs + : cpi->common.mi_params.MBs; + const YV12_BUFFER_CONFIG *const unfiltered_source = cpi->unfiltered_source; + const uint8_t *const src = unfiltered_source->y_buffer; + const int hbd = unfiltered_source->flags & YV12_FLAG_HIGHBITDEPTH; + const int stride = unfiltered_source->y_stride; + const BLOCK_SIZE fp_block_size = + get_fp_block_size(cpi->is_screen_content_type); + const int fp_block_size_width = block_size_wide[fp_block_size]; + const int fp_block_size_height = block_size_high[fp_block_size]; + const int num_unit_cols = + get_num_blocks(unfiltered_source->y_crop_width, fp_block_size_width); + const int num_unit_rows = + get_num_blocks(unfiltered_source->y_crop_height, fp_block_size_height); + const int num_8x8_cols = num_unit_cols * (fp_block_size_width / 8); + const int num_8x8_rows = num_unit_rows * (fp_block_size_height / 8); + int64_t frame_avg_wavelet_energy = av1_haar_ac_sad_mxn_uint8_input( + src, stride, hbd, num_8x8_rows, num_8x8_cols); + + cpi->twopass_frame.frame_avg_haar_energy = + log1p((double)frame_avg_wavelet_energy / num_mbs); +} +#endif + +extern void av1_print_frame_contexts(const FRAME_CONTEXT *fc, + const char *filename); + +/*!\brief Run the final pass encoding for 1-pass/2-pass encoding mode, and pack + * the bitstream + * + * \ingroup high_level_algo + * \callgraph + * \callergraph + * + * \param[in] cpi Top-level encoder structure + * \param[in] size Bitstream size + * \param[in] dest Bitstream output + * + * \return Returns a value to indicate if the encoding is done successfully. + * \retval #AOM_CODEC_OK + * \retval #AOM_CODEC_ERROR + */ +static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, + uint8_t *dest) { + AV1_COMMON *const cm = &cpi->common; + SequenceHeader *const seq_params = cm->seq_params; + CurrentFrame *const current_frame = &cm->current_frame; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + struct segmentation *const seg = &cm->seg; + FeatureFlags *const features = &cm->features; + const TileConfig *const tile_cfg = &oxcf->tile_cfg; + assert(cpi->source != NULL); + cpi->td.mb.e_mbd.cur_buf = cpi->source; + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, encode_frame_to_data_rate_time); +#endif + +#if !CONFIG_REALTIME_ONLY + calculate_frame_avg_haar_energy(cpi); +#endif + + // frame type has been decided outside of this function call + cm->cur_frame->frame_type = current_frame->frame_type; + + cm->tiles.large_scale = tile_cfg->enable_large_scale_tile; + cm->tiles.single_tile_decoding = tile_cfg->enable_single_tile_decoding; + + features->allow_ref_frame_mvs &= frame_might_allow_ref_frame_mvs(cm); + // features->allow_ref_frame_mvs needs to be written into the frame header + // while cm->tiles.large_scale is 1, therefore, "cm->tiles.large_scale=1" case + // is separated from frame_might_allow_ref_frame_mvs(). + features->allow_ref_frame_mvs &= !cm->tiles.large_scale; + + features->allow_warped_motion = oxcf->motion_mode_cfg.allow_warped_motion && + frame_might_allow_warped_motion(cm); + + cpi->last_frame_type = current_frame->frame_type; + + if (frame_is_intra_only(cm)) { + cpi->frames_since_last_update = 0; + } + + if (frame_is_sframe(cm)) { + GF_GROUP *gf_group = &cpi->ppi->gf_group; + // S frame will wipe out any previously encoded altref so we cannot place + // an overlay frame + gf_group->update_type[gf_group->size] = GF_UPDATE; + } + + if (encode_show_existing_frame(cm)) { +#if CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY + // TODO(angiebird): Move this into a function. 
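+    // A show_existing_frame codes no new residual, so it is logged with a
+    // neutral qstep_ratio of 1 and the maximum q index of 255 as
+    // placeholders.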
+ if (oxcf->pass == AOM_RC_THIRD_PASS) { + int frame_coding_idx = + av1_vbr_rc_frame_coding_idx(&cpi->vbr_rc_info, cpi->gf_frame_index); + rc_log_frame_encode_param( + &cpi->rc_log, frame_coding_idx, 1, 255, + cpi->ppi->gf_group.update_type[cpi->gf_frame_index]); + } +#endif + av1_finalize_encoded_frame(cpi); + // Build the bitstream + int largest_tile_id = 0; // Output from bitstream: unused here + cpi->rc.coefficient_size = 0; + if (av1_pack_bitstream(cpi, dest, size, &largest_tile_id) != AOM_CODEC_OK) + return AOM_CODEC_ERROR; + + if (seq_params->frame_id_numbers_present_flag && + current_frame->frame_type == KEY_FRAME) { + // Displaying a forward key-frame, so reset the ref buffer IDs + int display_frame_id = cm->ref_frame_id[cpi->existing_fb_idx_to_show]; + for (int i = 0; i < REF_FRAMES; i++) + cm->ref_frame_id[i] = display_frame_id; + } + +#if DUMP_RECON_FRAMES == 1 + // NOTE(zoeliu): For debug - Output the filtered reconstructed video. + av1_dump_filtered_recon_frames(cpi); +#endif // DUMP_RECON_FRAMES + + // NOTE: Save the new show frame buffer index for --test-code=warn, i.e., + // for the purpose to verify no mismatch between encoder and decoder. + if (cm->show_frame) cpi->last_show_frame_buf = cm->cur_frame; + +#if CONFIG_AV1_TEMPORAL_DENOISING + av1_denoiser_update_ref_frame(cpi); +#endif + + // Since we allocate a spot for the OVERLAY frame in the gf group, we need + // to do post-encoding update accordingly. + av1_set_target_rate(cpi, cm->width, cm->height); + + if (is_psnr_calc_enabled(cpi)) { + cpi->source = + realloc_and_scale_source(cpi, cm->cur_frame->buf.y_crop_width, + cm->cur_frame->buf.y_crop_height); + } + +#if !CONFIG_REALTIME_ONLY + if (cpi->use_ducky_encode) { + PSNR_STATS psnr; + aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr); + DuckyEncodeFrameResult *frame_result = + &cpi->ducky_encode_info.frame_result; + frame_result->global_order_idx = cm->cur_frame->display_order_hint; + frame_result->q_index = cm->quant_params.base_qindex; + frame_result->rdmult = cpi->rd.RDMULT; + frame_result->rate = (int)(*size) * 8; + frame_result->dist = psnr.sse[0]; + frame_result->psnr = psnr.psnr[0]; + } +#endif // !CONFIG_REALTIME_ONLY + + update_counters_for_show_frame(cpi); + return AOM_CODEC_OK; + } + + // Work out whether to force_integer_mv this frame + if (!is_stat_generation_stage(cpi) && + cpi->common.features.allow_screen_content_tools && + !frame_is_intra_only(cm) && !cpi->sf.rt_sf.use_nonrd_pick_mode) { + if (cpi->common.seq_params->force_integer_mv == 2) { + // Adaptive mode: see what previous frame encoded did + if (cpi->unscaled_last_source != NULL) { + features->cur_frame_force_integer_mv = av1_is_integer_mv( + cpi->source, cpi->unscaled_last_source, &cpi->force_intpel_info); + } else { + cpi->common.features.cur_frame_force_integer_mv = 0; + } + } else { + cpi->common.features.cur_frame_force_integer_mv = + cpi->common.seq_params->force_integer_mv; + } + } else { + cpi->common.features.cur_frame_force_integer_mv = 0; + } + + // This is used by av1_pack_bitstream. So this needs to be set in case of + // row-mt where the encoding code will use a temporary structure. + cpi->td.mb.e_mbd.cur_frame_force_integer_mv = + cpi->common.features.cur_frame_force_integer_mv; + + // Set default state for segment based loop filter update flags. + cm->lf.mode_ref_delta_update = 0; + + // Set various flags etc to special state if it is a key frame. + if (frame_is_intra_only(cm) || frame_is_sframe(cm)) { + // Reset the loop filter deltas and segmentation map. 
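+    // (Intra-only frames and S-frames act as stream access/switch points,
+    // so they must not rely on state carried over from earlier frames.)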
+    av1_reset_segment_features(cm);
+
+    // If segmentation is enabled force a map update for key frames.
+    if (seg->enabled) {
+      seg->update_map = 1;
+      seg->update_data = 1;
+    }
+  }
+  if (tile_cfg->mtu == 0) {
+    cpi->num_tg = tile_cfg->num_tile_groups;
+  } else {
+    // Use a default value for the purposes of weighting costs in probability
+    // updates.
+    cpi->num_tg = DEFAULT_MAX_NUM_TG;
+  }
+
+  // For 1 pass CBR mode: check if we are dropping this frame.
+  if (has_no_stats_stage(cpi) && oxcf->rc_cfg.mode == AOM_CBR) {
+    // Always drop for spatial enhancement layer if layer bandwidth is 0.
+    // Otherwise check for frame-dropping based on buffer level in
+    // av1_rc_drop_frame().
+    if ((cpi->svc.spatial_layer_id > 0 &&
+         cpi->oxcf.rc_cfg.target_bandwidth == 0) ||
+        av1_rc_drop_frame(cpi)) {
+      cpi->is_dropped_frame = true;
+    }
+    if (cpi->is_dropped_frame) {
+      av1_setup_frame_size(cpi);
+      av1_set_mv_search_params(cpi);
+      av1_rc_postencode_update_drop_frame(cpi);
+      release_scaled_references(cpi);
+      cpi->ppi->gf_group.is_frame_dropped[cpi->gf_frame_index] = true;
+      // A dropped frame might not be shown but it always takes a slot in the
+      // gf group. Therefore, even when it is not shown, we still need to
+      // update the relevant frame counters.
+      if (cm->show_frame) {
+        update_counters_for_show_frame(cpi);
+      }
+      return AOM_CODEC_OK;
+    }
+  }
+
+  if (oxcf->tune_cfg.tuning == AOM_TUNE_SSIM) {
+    av1_set_mb_ssim_rdmult_scaling(cpi);
+  }
+#if CONFIG_SALIENCY_MAP
+  else if (oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_SALIENCY_MAP &&
+           !(cpi->source->flags & YV12_FLAG_HIGHBITDEPTH)) {
+    if (av1_set_saliency_map(cpi) == 0) {
+      return AOM_CODEC_MEM_ERROR;
+    }
+#if !CONFIG_REALTIME_ONLY
+    double motion_ratio = av1_setup_motion_ratio(cpi);
+#else
+    double motion_ratio = 1.0;
+#endif
+    if (av1_setup_sm_rdmult_scaling_factor(cpi, motion_ratio) == 0) {
+      return AOM_CODEC_MEM_ERROR;
+    }
+  }
+#endif
+#if CONFIG_TUNE_VMAF
+  else if (oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING ||
+           oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_MAX_GAIN ||
+           oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) {
+    av1_set_mb_vmaf_rdmult_scaling(cpi);
+  }
+#endif
+
+  if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL_AI &&
+      cpi->sf.rt_sf.use_nonrd_pick_mode == 0) {
+    av1_init_mb_wiener_var_buffer(cpi);
+    av1_set_mb_wiener_variance(cpi);
+  }
+
+  if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_USER_RATING_BASED) {
+    av1_init_mb_ur_var_buffer(cpi);
+    av1_set_mb_ur_variance(cpi);
+  }
+
+#if CONFIG_INTERNAL_STATS
+  memset(cpi->mode_chosen_counts, 0,
+         MAX_MODES * sizeof(*cpi->mode_chosen_counts));
+#endif
+
+  if (seq_params->frame_id_numbers_present_flag) {
+    /* Non-normative definition of current_frame_id ("frame counter" with
+     * wraparound) */
+    if (cm->current_frame_id == -1) {
+      int lsb, msb;
+      /* quasi-random initialization of current_frame_id for a key frame */
+      if (cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) {
+        lsb = CONVERT_TO_SHORTPTR(cpi->source->y_buffer)[0] & 0xff;
+        msb = CONVERT_TO_SHORTPTR(cpi->source->y_buffer)[1] & 0xff;
+      } else {
+        lsb = cpi->source->y_buffer[0] & 0xff;
+        msb = cpi->source->y_buffer[1] & 0xff;
+      }
+      cm->current_frame_id =
+          ((msb << 8) + lsb) % (1 << seq_params->frame_id_length);
+
+      // S_frame is meant for stitching different streams of different
+      // resolutions together, so current_frame_id must be the same across
+      // the different streams of the same content; it should be consistent
+      // rather than random. 0x37 is an arbitrarily chosen starting value.
+      if (oxcf->kf_cfg.sframe_dist != 0) cm->current_frame_id = 0x37;
+    } else {
+      cm->current_frame_id =
+          (cm->current_frame_id + 1 + (1 << seq_params->frame_id_length)) %
+          (1 << seq_params->frame_id_length);
+    }
+  }
+
+  switch (oxcf->algo_cfg.cdf_update_mode) {
+    case 0:  // No CDF update for any frames (4~6% compression loss).
+      features->disable_cdf_update = 1;
+      break;
+    case 1:  // Enable CDF update for all frames.
+      if (cpi->sf.rt_sf.disable_cdf_update_non_reference_frame &&
+          cpi->ppi->rtc_ref.non_reference_frame && cpi->rc.frames_since_key > 2)
+        features->disable_cdf_update = 1;
+      else if (cpi->sf.rt_sf.selective_cdf_update)
+        features->disable_cdf_update = selective_disable_cdf_rtc(cpi);
+      else
+        features->disable_cdf_update = 0;
+      break;
+    case 2:
+      // Strategically determine at which frames to do CDF update.
+      // Currently only enable CDF update for all-intra and no-show frames
+      // (1.5% compression loss) for good quality or allintra mode.
+      if (oxcf->mode == GOOD || oxcf->mode == ALLINTRA) {
+        features->disable_cdf_update =
+            (frame_is_intra_only(cm) || !cm->show_frame) ? 0 : 1;
+      } else {
+        features->disable_cdf_update = selective_disable_cdf_rtc(cpi);
+      }
+      break;
+  }
+
+  // Disable cdf update for the INTNL_ARF_UPDATE frame with
+  // frame_parallel_level 1.
+  if (!cpi->do_frame_data_update &&
+      cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE) {
+    assert(cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 1);
+    features->disable_cdf_update = 1;
+  }
+
+#if !CONFIG_REALTIME_ONLY
+  if (cpi->oxcf.tool_cfg.enable_global_motion && !frame_is_intra_only(cm)) {
+    // Flush any stale global motion information, which may be left over
+    // from a previous frame
+    aom_invalidate_pyramid(cpi->source->y_pyramid);
+    av1_invalidate_corner_list(cpi->source->corners);
+  }
+#endif  // !CONFIG_REALTIME_ONLY
+
+  int largest_tile_id = 0;
+  if (av1_superres_in_recode_allowed(cpi)) {
+    if (encode_with_and_without_superres(cpi, size, dest, &largest_tile_id) !=
+        AOM_CODEC_OK) {
+      return AOM_CODEC_ERROR;
+    }
+  } else {
+    const aom_superres_mode orig_superres_mode = cpi->superres_mode;  // save
+    cpi->superres_mode = cpi->oxcf.superres_cfg.superres_mode;
+    if (encode_with_recode_loop_and_filter(cpi, size, dest, NULL, NULL,
+                                           &largest_tile_id) != AOM_CODEC_OK) {
+      return AOM_CODEC_ERROR;
+    }
+    cpi->superres_mode = orig_superres_mode;  // restore
+  }
+
+  // Update reference frame ids for reference frames this frame will overwrite
+  if (seq_params->frame_id_numbers_present_flag) {
+    for (int i = 0; i < REF_FRAMES; i++) {
+      if ((current_frame->refresh_frame_flags >> i) & 1) {
+        cm->ref_frame_id[i] = cm->current_frame_id;
+      }
+    }
+  }
+
+  if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)
+    cpi->svc.num_encoded_top_layer++;
+
+#if DUMP_RECON_FRAMES == 1
+  // NOTE(zoeliu): For debug - Output the filtered reconstructed video.
+  av1_dump_filtered_recon_frames(cpi);
+#endif  // DUMP_RECON_FRAMES
+
+  if (cm->seg.enabled) {
+    if (cm->seg.update_map == 0 && cm->last_frame_seg_map) {
+      memcpy(cm->cur_frame->seg_map, cm->last_frame_seg_map,
+             cm->cur_frame->mi_cols * cm->cur_frame->mi_rows *
+                 sizeof(*cm->cur_frame->seg_map));
+    }
+  }
+
+  int release_scaled_refs = 0;
+#if CONFIG_FPMT_TEST
+  release_scaled_refs =
+      (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ?
1 : 0; +#endif // CONFIG_FPMT_TEST + if (release_scaled_refs || + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) { + if (frame_is_intra_only(cm) == 0) { + release_scaled_references(cpi); + } + } +#if CONFIG_AV1_TEMPORAL_DENOISING + av1_denoiser_update_ref_frame(cpi); +#endif + + // NOTE: Save the new show frame buffer index for --test-code=warn, i.e., + // for the purpose to verify no mismatch between encoder and decoder. + if (cm->show_frame) cpi->last_show_frame_buf = cm->cur_frame; + + if (features->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) { + *cm->fc = cpi->tile_data[largest_tile_id].tctx; + av1_reset_cdf_symbol_counters(cm->fc); + } + if (!cm->tiles.large_scale) { + cm->cur_frame->frame_context = *cm->fc; + } + + if (tile_cfg->enable_ext_tile_debug) { + // (yunqing) This test ensures the correctness of large scale tile coding. + if (cm->tiles.large_scale && is_stat_consumption_stage(cpi)) { + char fn[20] = "./fc"; + fn[4] = current_frame->frame_number / 100 + '0'; + fn[5] = (current_frame->frame_number % 100) / 10 + '0'; + fn[6] = (current_frame->frame_number % 10) + '0'; + fn[7] = '\0'; + av1_print_frame_contexts(cm->fc, fn); + } + } + + cpi->last_frame_type = current_frame->frame_type; + + if (cm->features.disable_cdf_update) { + cpi->frames_since_last_update++; + } else { + cpi->frames_since_last_update = 1; + } + + if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) + cpi->svc.prev_number_spatial_layers = cpi->svc.number_spatial_layers; + + // Clear the one shot update flags for segmentation map and mode/ref loop + // filter deltas. + cm->seg.update_map = 0; + cm->seg.update_data = 0; + cm->lf.mode_ref_delta_update = 0; + + if (cm->show_frame) { + update_counters_for_show_frame(cpi); + } + +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, encode_frame_to_data_rate_time); +#endif + + return AOM_CODEC_OK; +} + +int av1_encode(AV1_COMP *const cpi, uint8_t *const dest, + const EncodeFrameInput *const frame_input, + const EncodeFrameParams *const frame_params, + EncodeFrameResults *const frame_results) { + AV1_COMMON *const cm = &cpi->common; + CurrentFrame *const current_frame = &cm->current_frame; + + cpi->unscaled_source = frame_input->source; + cpi->source = frame_input->source; + cpi->unscaled_last_source = frame_input->last_source; + + current_frame->refresh_frame_flags = frame_params->refresh_frame_flags; + cm->features.error_resilient_mode = frame_params->error_resilient_mode; + cm->features.primary_ref_frame = frame_params->primary_ref_frame; + cm->current_frame.frame_type = frame_params->frame_type; + cm->show_frame = frame_params->show_frame; + cpi->ref_frame_flags = frame_params->ref_frame_flags; + cpi->speed = frame_params->speed; + cm->show_existing_frame = frame_params->show_existing_frame; + cpi->existing_fb_idx_to_show = frame_params->existing_fb_idx_to_show; + + memcpy(cm->remapped_ref_idx, frame_params->remapped_ref_idx, + REF_FRAMES * sizeof(*cm->remapped_ref_idx)); + + memcpy(&cpi->refresh_frame, &frame_params->refresh_frame, + sizeof(cpi->refresh_frame)); + + if (current_frame->frame_type == KEY_FRAME && + cpi->ppi->gf_group.refbuf_state[cpi->gf_frame_index] == REFBUF_RESET) { + current_frame->frame_number = 0; + } + + current_frame->order_hint = + current_frame->frame_number + frame_params->order_offset; + + current_frame->display_order_hint = current_frame->order_hint; + current_frame->order_hint %= + (1 << (cm->seq_params->order_hint_info.order_hint_bits_minus_1 + 1)); + + current_frame->pyramid_level = 
get_true_pyr_level(
+      cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index],
+      current_frame->display_order_hint, cpi->ppi->gf_group.max_layer_depth);
+
+  if (is_stat_generation_stage(cpi)) {
+#if !CONFIG_REALTIME_ONLY
+    if (cpi->oxcf.q_cfg.use_fixed_qp_offsets)
+      av1_noop_first_pass_frame(cpi, frame_input->ts_duration);
+    else
+      av1_first_pass(cpi, frame_input->ts_duration);
+#endif
+  } else if (cpi->oxcf.pass == AOM_RC_ONE_PASS ||
+             cpi->oxcf.pass >= AOM_RC_SECOND_PASS) {
+    if (encode_frame_to_data_rate(cpi, &frame_results->size, dest) !=
+        AOM_CODEC_OK) {
+      return AOM_CODEC_ERROR;
+    }
+  } else {
+    return AOM_CODEC_ERROR;
+  }
+
+  return AOM_CODEC_OK;
+}
+
+#if CONFIG_DENOISE
+static int apply_denoise_2d(AV1_COMP *cpi, YV12_BUFFER_CONFIG *sd,
+                            int block_size, float noise_level,
+                            int64_t time_stamp, int64_t end_time) {
+  AV1_COMMON *const cm = &cpi->common;
+  if (!cpi->denoise_and_model) {
+    cpi->denoise_and_model = aom_denoise_and_model_alloc(
+        cm->seq_params->bit_depth, block_size, noise_level);
+    if (!cpi->denoise_and_model) {
+      aom_set_error(cm->error, AOM_CODEC_MEM_ERROR,
+                    "Error allocating denoise and model");
+      return -1;
+    }
+  }
+  if (!cpi->film_grain_table) {
+    cpi->film_grain_table = aom_malloc(sizeof(*cpi->film_grain_table));
+    if (!cpi->film_grain_table) {
+      aom_set_error(cm->error, AOM_CODEC_MEM_ERROR,
+                    "Error allocating grain table");
+      return -1;
+    }
+    memset(cpi->film_grain_table, 0, sizeof(*cpi->film_grain_table));
+  }
+  if (aom_denoise_and_model_run(cpi->denoise_and_model, sd,
+                                &cm->film_grain_params,
+                                cpi->oxcf.enable_dnl_denoising)) {
+    if (cm->film_grain_params.apply_grain) {
+      aom_film_grain_table_append(cpi->film_grain_table, time_stamp, end_time,
+                                  &cm->film_grain_params);
+    }
+  }
+  return 0;
+}
+#endif
+
+int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
+                          YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
+                          int64_t end_time) {
+  AV1_COMMON *const cm = &cpi->common;
+  const SequenceHeader *const seq_params = cm->seq_params;
+  int res = 0;
+  const int subsampling_x = sd->subsampling_x;
+  const int subsampling_y = sd->subsampling_y;
+  const int use_highbitdepth = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+
+#if CONFIG_TUNE_VMAF
+  if (!is_stat_generation_stage(cpi) &&
+      cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_WITH_PREPROCESSING) {
+    av1_vmaf_frame_preprocessing(cpi, sd);
+  }
+  if (!is_stat_generation_stage(cpi) &&
+      cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_MAX_GAIN) {
+    av1_vmaf_blk_preprocessing(cpi, sd);
+  }
+#endif
+
+#if CONFIG_INTERNAL_STATS
+  struct aom_usec_timer timer;
+  aom_usec_timer_start(&timer);
+#endif
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+  setup_denoiser_buffer(cpi);
+#endif
+
+#if CONFIG_DENOISE
+  // Even if denoise_noise_level is > 0, we do not need to denoise on pass 1
+  // of 2 if enable_dnl_denoising is disabled, since the 2nd pass will be
+  // encoding the original (non-denoised) frame.
+  if (cpi->oxcf.noise_level > 0 && !(cpi->oxcf.pass == AOM_RC_FIRST_PASS &&
+                                     !cpi->oxcf.enable_dnl_denoising)) {
+#if !CONFIG_REALTIME_ONLY
+    // Choose a synthetic noise level for still images for enhanced perceptual
+    // quality based on an estimated noise level in the source, but only if
+    // the noise level is set on the command line to > 0.
+    if (cpi->oxcf.mode == ALLINTRA) {
+      // No noise synthesis if the source is very clean.
+      // Uses a low edge threshold to focus on smooth areas.
+      // Increase output noise setting a little compared to measured value.
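+      // As a worked example of the adjustment below: a measured luma noise
+      // level of 1.0 becomes max(0, 1.0 - 0.1) = 0.9, then 0.9 + 0.5 = 1.4,
+      // and the result is finally capped at 5.0.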
+ double y_noise_level = 0.0; + av1_estimate_noise_level(sd, &y_noise_level, AOM_PLANE_Y, AOM_PLANE_Y, + cm->seq_params->bit_depth, 16); + cpi->oxcf.noise_level = (float)(y_noise_level - 0.1); + cpi->oxcf.noise_level = (float)AOMMAX(0.0, cpi->oxcf.noise_level); + if (cpi->oxcf.noise_level > 0.0) { + cpi->oxcf.noise_level += (float)0.5; + } + cpi->oxcf.noise_level = (float)AOMMIN(5.0, cpi->oxcf.noise_level); + } +#endif + + if (apply_denoise_2d(cpi, sd, cpi->oxcf.noise_block_size, + cpi->oxcf.noise_level, time_stamp, end_time) < 0) + res = -1; + } +#endif // CONFIG_DENOISE + + if (av1_lookahead_push(cpi->ppi->lookahead, sd, time_stamp, end_time, + use_highbitdepth, cpi->image_pyramid_levels, + frame_flags)) { + aom_set_error(cm->error, AOM_CODEC_ERROR, "av1_lookahead_push() failed"); + res = -1; + } +#if CONFIG_INTERNAL_STATS + aom_usec_timer_mark(&timer); + cpi->ppi->total_time_receive_data += aom_usec_timer_elapsed(&timer); +#endif + + // Note: Regarding profile setting, the following checks are added to help + // choose a proper profile for the input video. The criterion is that all + // bitstreams must be designated as the lowest profile that match its content. + // E.G. A bitstream that contains 4:4:4 video must be designated as High + // Profile in the seq header, and likewise a bitstream that contains 4:2:2 + // bitstream must be designated as Professional Profile in the sequence + // header. + if ((seq_params->profile == PROFILE_0) && !seq_params->monochrome && + (subsampling_x != 1 || subsampling_y != 1)) { + aom_set_error(cm->error, AOM_CODEC_INVALID_PARAM, + "Non-4:2:0 color format requires profile 1 or 2"); + res = -1; + } + if ((seq_params->profile == PROFILE_1) && + !(subsampling_x == 0 && subsampling_y == 0)) { + aom_set_error(cm->error, AOM_CODEC_INVALID_PARAM, + "Profile 1 requires 4:4:4 color format"); + res = -1; + } + if ((seq_params->profile == PROFILE_2) && + (seq_params->bit_depth <= AOM_BITS_10) && + !(subsampling_x == 1 && subsampling_y == 0)) { + aom_set_error(cm->error, AOM_CODEC_INVALID_PARAM, + "Profile 2 bit-depth <= 10 requires 4:2:2 color format"); + res = -1; + } + + return res; +} + +#if CONFIG_ENTROPY_STATS +void print_entropy_stats(AV1_PRIMARY *const ppi) { + if (!ppi->cpi) return; + + if (ppi->cpi->oxcf.pass != 1 && + ppi->cpi->common.current_frame.frame_number > 0) { + fprintf(stderr, "Writing counts.stt\n"); + FILE *f = fopen("counts.stt", "wb"); + fwrite(&ppi->aggregate_fc, sizeof(ppi->aggregate_fc), 1, f); + fclose(f); + } +} +#endif // CONFIG_ENTROPY_STATS + +#if CONFIG_INTERNAL_STATS +extern double av1_get_blockiness(const unsigned char *img1, int img1_pitch, + const unsigned char *img2, int img2_pitch, + int width, int height); + +static void adjust_image_stat(double y, double u, double v, double all, + ImageStat *s) { + s->stat[STAT_Y] += y; + s->stat[STAT_U] += u; + s->stat[STAT_V] += v; + s->stat[STAT_ALL] += all; + s->worst = AOMMIN(s->worst, all); +} + +static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) { + AV1_PRIMARY *const ppi = cpi->ppi; + AV1_COMMON *const cm = &cpi->common; + double samples = 0.0; + const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth; + const uint32_t bit_depth = cpi->td.mb.e_mbd.bd; + + if (cpi->ppi->use_svc && + cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1) + return; + +#if CONFIG_INTER_STATS_ONLY + if (cm->current_frame.frame_type == KEY_FRAME) return; // skip key frame +#endif + cpi->bytes += frame_bytes; + if (cm->show_frame) { + const YV12_BUFFER_CONFIG *orig = 
cpi->source; + const YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf; + double y, u, v, frame_all; + + ppi->count[0]++; + ppi->count[1]++; + if (cpi->ppi->b_calculate_psnr) { + PSNR_STATS psnr; + double weight[2] = { 0.0, 0.0 }; + double frame_ssim2[2] = { 0.0, 0.0 }; +#if CONFIG_AV1_HIGHBITDEPTH + aom_calc_highbd_psnr(orig, recon, &psnr, bit_depth, in_bit_depth); +#else + aom_calc_psnr(orig, recon, &psnr); +#endif + adjust_image_stat(psnr.psnr[1], psnr.psnr[2], psnr.psnr[3], psnr.psnr[0], + &(ppi->psnr[0])); + ppi->total_sq_error[0] += psnr.sse[0]; + ppi->total_samples[0] += psnr.samples[0]; + samples = psnr.samples[0]; + + aom_calc_ssim(orig, recon, bit_depth, in_bit_depth, + cm->seq_params->use_highbitdepth, weight, frame_ssim2); + + ppi->worst_ssim = AOMMIN(ppi->worst_ssim, frame_ssim2[0]); + ppi->summed_quality += frame_ssim2[0] * weight[0]; + ppi->summed_weights += weight[0]; + +#if CONFIG_AV1_HIGHBITDEPTH + // Compute PSNR based on stream bit depth + if ((cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) && + (in_bit_depth < bit_depth)) { + adjust_image_stat(psnr.psnr_hbd[1], psnr.psnr_hbd[2], psnr.psnr_hbd[3], + psnr.psnr_hbd[0], &ppi->psnr[1]); + ppi->total_sq_error[1] += psnr.sse_hbd[0]; + ppi->total_samples[1] += psnr.samples_hbd[0]; + + ppi->worst_ssim_hbd = AOMMIN(ppi->worst_ssim_hbd, frame_ssim2[1]); + ppi->summed_quality_hbd += frame_ssim2[1] * weight[1]; + ppi->summed_weights_hbd += weight[1]; + } +#endif + +#if 0 + { + FILE *f = fopen("q_used.stt", "a"); + double y2 = psnr.psnr[1]; + double u2 = psnr.psnr[2]; + double v2 = psnr.psnr[3]; + double frame_psnr2 = psnr.psnr[0]; + fprintf(f, "%5d : Y%f7.3:U%f7.3:V%f7.3:F%f7.3:S%7.3f\n", + cm->current_frame.frame_number, y2, u2, v2, + frame_psnr2, frame_ssim2); + fclose(f); + } +#endif + } + if (ppi->b_calculate_blockiness) { + if (!cm->seq_params->use_highbitdepth) { + const double frame_blockiness = + av1_get_blockiness(orig->y_buffer, orig->y_stride, recon->y_buffer, + recon->y_stride, orig->y_width, orig->y_height); + ppi->worst_blockiness = AOMMAX(ppi->worst_blockiness, frame_blockiness); + ppi->total_blockiness += frame_blockiness; + } + + if (ppi->b_calculate_consistency) { + if (!cm->seq_params->use_highbitdepth) { + const double this_inconsistency = aom_get_ssim_metrics( + orig->y_buffer, orig->y_stride, recon->y_buffer, recon->y_stride, + orig->y_width, orig->y_height, ppi->ssim_vars, &ppi->metrics, 1); + + const double peak = (double)((1 << in_bit_depth) - 1); + const double consistency = + aom_sse_to_psnr(samples, peak, ppi->total_inconsistency); + if (consistency > 0.0) + ppi->worst_consistency = + AOMMIN(ppi->worst_consistency, consistency); + ppi->total_inconsistency += this_inconsistency; + } + } + } + + frame_all = + aom_calc_fastssim(orig, recon, &y, &u, &v, bit_depth, in_bit_depth); + adjust_image_stat(y, u, v, frame_all, &ppi->fastssim); + frame_all = aom_psnrhvs(orig, recon, &y, &u, &v, bit_depth, in_bit_depth); + adjust_image_stat(y, u, v, frame_all, &ppi->psnrhvs); + } +} + +void print_internal_stats(AV1_PRIMARY *ppi) { + if (!ppi->cpi) return; + AV1_COMP *const cpi = ppi->cpi; + + if (ppi->cpi->oxcf.pass != 1 && + ppi->cpi->common.current_frame.frame_number > 0) { + char headings[512] = { 0 }; + char results[512] = { 0 }; + FILE *f = fopen("opsnr.stt", "a"); + double time_encoded = + (cpi->time_stamps.prev_ts_end - cpi->time_stamps.first_ts_start) / + 10000000.000; + double total_encode_time = + (ppi->total_time_receive_data + ppi->total_time_compress_data) / + 1000.000; + const double dr = + 
(double)ppi->total_bytes * (double)8 / (double)1000 / time_encoded; + const double peak = + (double)((1 << ppi->cpi->oxcf.input_cfg.input_bit_depth) - 1); + const double target_rate = + (double)ppi->cpi->oxcf.rc_cfg.target_bandwidth / 1000; + const double rate_err = ((100.0 * (dr - target_rate)) / target_rate); + + if (ppi->b_calculate_psnr) { + const double total_psnr = aom_sse_to_psnr( + (double)ppi->total_samples[0], peak, (double)ppi->total_sq_error[0]); + const double total_ssim = + 100 * pow(ppi->summed_quality / ppi->summed_weights, 8.0); + snprintf(headings, sizeof(headings), + "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t" + "AOMSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t" + "WstPsnr\tWstSsim\tWstFast\tWstHVS\t" + "AVPsrnY\tAPsnrCb\tAPsnrCr"); + snprintf(results, sizeof(results), + "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t" + "%7.3f\t%7.3f\t%7.3f\t%7.3f\t" + "%7.3f\t%7.3f\t%7.3f\t%7.3f\t" + "%7.3f\t%7.3f\t%7.3f", + dr, ppi->psnr[0].stat[STAT_ALL] / ppi->count[0], total_psnr, + ppi->psnr[0].stat[STAT_ALL] / ppi->count[0], total_psnr, + total_ssim, total_ssim, + ppi->fastssim.stat[STAT_ALL] / ppi->count[0], + ppi->psnrhvs.stat[STAT_ALL] / ppi->count[0], ppi->psnr[0].worst, + ppi->worst_ssim, ppi->fastssim.worst, ppi->psnrhvs.worst, + ppi->psnr[0].stat[STAT_Y] / ppi->count[0], + ppi->psnr[0].stat[STAT_U] / ppi->count[0], + ppi->psnr[0].stat[STAT_V] / ppi->count[0]); + + if (ppi->b_calculate_blockiness) { + SNPRINT(headings, "\t Block\tWstBlck"); + SNPRINT2(results, "\t%7.3f", ppi->total_blockiness / ppi->count[0]); + SNPRINT2(results, "\t%7.3f", ppi->worst_blockiness); + } + + if (ppi->b_calculate_consistency) { + double consistency = + aom_sse_to_psnr((double)ppi->total_samples[0], peak, + (double)ppi->total_inconsistency); + + SNPRINT(headings, "\tConsist\tWstCons"); + SNPRINT2(results, "\t%7.3f", consistency); + SNPRINT2(results, "\t%7.3f", ppi->worst_consistency); + } + + SNPRINT(headings, "\t Time\tRcErr\tAbsErr"); + SNPRINT2(results, "\t%8.0f", total_encode_time); + SNPRINT2(results, " %7.2f", rate_err); + SNPRINT2(results, " %7.2f", fabs(rate_err)); + + SNPRINT(headings, "\tAPsnr611"); + SNPRINT2(results, " %7.3f", + (6 * ppi->psnr[0].stat[STAT_Y] + ppi->psnr[0].stat[STAT_U] + + ppi->psnr[0].stat[STAT_V]) / + (ppi->count[0] * 8)); + +#if CONFIG_AV1_HIGHBITDEPTH + const uint32_t in_bit_depth = ppi->cpi->oxcf.input_cfg.input_bit_depth; + const uint32_t bit_depth = ppi->seq_params.bit_depth; + // Since cpi->source->flags is not available here, but total_samples[1] + // will be non-zero if cpi->source->flags & YV12_FLAG_HIGHBITDEPTH was + // true in compute_internal_stats + if ((ppi->total_samples[1] > 0) && (in_bit_depth < bit_depth)) { + const double peak_hbd = (double)((1 << bit_depth) - 1); + const double total_psnr_hbd = + aom_sse_to_psnr((double)ppi->total_samples[1], peak_hbd, + (double)ppi->total_sq_error[1]); + const double total_ssim_hbd = + 100 * pow(ppi->summed_quality_hbd / ppi->summed_weights_hbd, 8.0); + SNPRINT(headings, + "\t AVGPsnrH GLBPsnrH AVPsnrPH GLPsnrPH" + " AVPsnrYH APsnrCbH APsnrCrH WstPsnrH" + " AOMSSIMH VPSSIMPH WstSsimH"); + SNPRINT2(results, "\t%7.3f", + ppi->psnr[1].stat[STAT_ALL] / ppi->count[1]); + SNPRINT2(results, " %7.3f", total_psnr_hbd); + SNPRINT2(results, " %7.3f", + ppi->psnr[1].stat[STAT_ALL] / ppi->count[1]); + SNPRINT2(results, " %7.3f", total_psnr_hbd); + SNPRINT2(results, " %7.3f", ppi->psnr[1].stat[STAT_Y] / ppi->count[1]); + SNPRINT2(results, " %7.3f", ppi->psnr[1].stat[STAT_U] / ppi->count[1]); + SNPRINT2(results, " %7.3f", 
ppi->psnr[1].stat[STAT_V] / ppi->count[1]);
+        SNPRINT2(results, " %7.3f", ppi->psnr[1].worst);
+        SNPRINT2(results, " %7.3f", total_ssim_hbd);
+        SNPRINT2(results, " %7.3f", total_ssim_hbd);
+        SNPRINT2(results, " %7.3f", ppi->worst_ssim_hbd);
+      }
+#endif
+      fprintf(f, "%s\n", headings);
+      fprintf(f, "%s\n", results);
+    }
+
+    fclose(f);
+
+    aom_free(ppi->ssim_vars);
+    ppi->ssim_vars = NULL;
+  }
+}
+#endif  // CONFIG_INTERNAL_STATS
+
+static AOM_INLINE void update_keyframe_counters(AV1_COMP *cpi) {
+  if (cpi->common.show_frame && cpi->rc.frames_to_key) {
+#if !CONFIG_REALTIME_ONLY
+    FIRSTPASS_INFO *firstpass_info = &cpi->ppi->twopass.firstpass_info;
+    if (firstpass_info->past_stats_count > FIRSTPASS_INFO_STATS_PAST_MIN) {
+      av1_firstpass_info_move_cur_index_and_pop(firstpass_info);
+    } else {
+      // When there are not enough past stats, move the current index
+      // without popping the past stats.
+      av1_firstpass_info_move_cur_index(firstpass_info);
+    }
+#endif
+    if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) {
+      cpi->rc.frames_since_key++;
+      cpi->rc.frames_to_key--;
+      cpi->rc.frames_to_fwd_kf--;
+    }
+  }
+}
+
+static AOM_INLINE void update_frames_till_gf_update(AV1_COMP *cpi) {
+  // TODO(weitinglin): Updating this counter for is_frame_droppable is a
+  // workaround to handle the condition when a frame is dropped. We should
+  // fix the cpi->common.show_frame flag instead of checking the other
+  // condition to update the counter properly.
+  if (cpi->common.show_frame ||
+      is_frame_droppable(&cpi->ppi->rtc_ref, &cpi->ext_flags.refresh_frame)) {
+    // Decrement the countdown till the next gf.
+    if (cpi->rc.frames_till_gf_update_due > 0)
+      cpi->rc.frames_till_gf_update_due--;
+  }
+}
+
+static AOM_INLINE void update_gf_group_index(AV1_COMP *cpi) {
+  // Increment the gf group index, ready for the next frame.
+  if (is_one_pass_rt_params(cpi) &&
+      cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) {
+    ++cpi->gf_frame_index;
+    // Reset gf_frame_index in case it reaches MAX_STATIC_GF_GROUP_LENGTH
+    // for real time encoding.
+    if (cpi->gf_frame_index == MAX_STATIC_GF_GROUP_LENGTH)
+      cpi->gf_frame_index = 0;
+  } else {
+    ++cpi->gf_frame_index;
+  }
+}
+
+static void update_fb_of_context_type(const AV1_COMP *const cpi,
+                                      int *const fb_of_context_type) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int current_frame_ref_type = get_current_frame_ref_type(cpi);
+
+  if (frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
+      cpi->ext_flags.use_primary_ref_none) {
+    for (int i = 0; i < REF_FRAMES; i++) {
+      fb_of_context_type[i] = -1;
+    }
+    fb_of_context_type[current_frame_ref_type] =
+        cm->show_frame ? get_ref_frame_map_idx(cm, GOLDEN_FRAME)
+                       : get_ref_frame_map_idx(cm, ALTREF_FRAME);
+  }
+
+  if (!encode_show_existing_frame(cm)) {
+    // Refresh fb_of_context_type[]: see encoder.h for an explanation.
+    if (cm->current_frame.frame_type == KEY_FRAME) {
+      // All ref frames are refreshed; pick one that will live long enough.
+      fb_of_context_type[current_frame_ref_type] = 0;
+    } else {
+      // If more than one frame is refreshed, it doesn't matter which one we
+      // pick, so pick the first. LST sometimes refreshes no frame at all:
+      // this is OK.
+      for (int i = 0; i < REF_FRAMES; i++) {
+        if (cm->current_frame.refresh_frame_flags & (1 << i)) {
+          fb_of_context_type[current_frame_ref_type] = i;
+          break;
+        }
+      }
+    }
+  }
+}
+
+static void update_rc_counts(AV1_COMP *cpi) {
+  update_keyframe_counters(cpi);
+  update_frames_till_gf_update(cpi);
+  update_gf_group_index(cpi);
+}
+
+static void update_end_of_frame_stats(AV1_COMP *cpi) {
+  if (cpi->do_frame_data_update) {
+    // Store current frame loopfilter levels in ppi, if the update flag is
+    // set.
+    if (!cpi->common.show_existing_frame) {
+      AV1_COMMON *const cm = &cpi->common;
+      struct loopfilter *const lf = &cm->lf;
+      cpi->ppi->filter_level[0] = lf->filter_level[0];
+      cpi->ppi->filter_level[1] = lf->filter_level[1];
+      cpi->ppi->filter_level_u = lf->filter_level_u;
+      cpi->ppi->filter_level_v = lf->filter_level_v;
+    }
+  }
+  // Store frame level mv_stats from cpi to ppi.
+  cpi->ppi->mv_stats = cpi->mv_stats;
+}
+
+// Updates frame level stats related to global motion.
+static AOM_INLINE void update_gm_stats(AV1_COMP *cpi) {
+  FRAME_UPDATE_TYPE update_type =
+      cpi->ppi->gf_group.update_type[cpi->gf_frame_index];
+  int i, is_gm_present = 0;
+
+  // Check if the current frame has any valid global motion model across its
+  // reference frames.
+  for (i = 0; i < REF_FRAMES; i++) {
+    if (cpi->common.global_motion[i].wmtype != IDENTITY) {
+      is_gm_present = 1;
+      break;
+    }
+  }
+  int update_actual_stats = 1;
+#if CONFIG_FPMT_TEST
+  update_actual_stats =
+      (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 0 : 1;
+  if (!update_actual_stats) {
+    if (cpi->ppi->temp_valid_gm_model_found[update_type] == INT32_MAX) {
+      cpi->ppi->temp_valid_gm_model_found[update_type] = is_gm_present;
+    } else {
+      cpi->ppi->temp_valid_gm_model_found[update_type] |= is_gm_present;
+    }
+    int show_existing_between_parallel_frames =
+        (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] ==
+             INTNL_OVERLAY_UPDATE &&
+         cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index + 1] == 2);
+    if (cpi->do_frame_data_update == 1 &&
+        !show_existing_between_parallel_frames) {
+      for (i = 0; i < FRAME_UPDATE_TYPES; i++) {
+        cpi->ppi->valid_gm_model_found[i] =
+            cpi->ppi->temp_valid_gm_model_found[i];
+      }
+    }
+  }
+#endif
+  if (update_actual_stats) {
+    if (cpi->ppi->valid_gm_model_found[update_type] == INT32_MAX) {
+      cpi->ppi->valid_gm_model_found[update_type] = is_gm_present;
+    } else {
+      cpi->ppi->valid_gm_model_found[update_type] |= is_gm_present;
+    }
+  }
+}
+
+void av1_post_encode_updates(AV1_COMP *const cpi,
+                             const AV1_COMP_DATA *const cpi_data) {
+  AV1_PRIMARY *const ppi = cpi->ppi;
+  AV1_COMMON *const cm = &cpi->common;
+
+  update_gm_stats(cpi);
+
+#if !CONFIG_REALTIME_ONLY
+  // Update the total stats remaining structure.
+  if (cpi->twopass_frame.this_frame != NULL &&
+      ppi->twopass.stats_buf_ctx->total_left_stats) {
+    subtract_stats(ppi->twopass.stats_buf_ctx->total_left_stats,
+                   cpi->twopass_frame.this_frame);
+  }
+#endif
+
+#if CONFIG_OUTPUT_FRAME_SIZE
+  FILE *f = fopen("frame_sizes.csv", "a");
+  fprintf(f, "%d,", 8 * (int)cpi_data->frame_size);
+  fprintf(f, "%d\n", cm->quant_params.base_qindex);
+  fclose(f);
+#endif  // CONFIG_OUTPUT_FRAME_SIZE
+
+  if (!is_stat_generation_stage(cpi) && !cpi->is_dropped_frame) {
+    // Before calling refresh_reference_frames(), copy ppi->ref_frame_map_copy
+    // to cm->ref_frame_map for the frame_parallel_level 2 frame in a parallel
+    // encode set of lower layer frames.
+    // TODO(Remya): Move ref_frame_map from AV1_COMMON to AV1_PRIMARY to avoid
+    // the copy.
+    if (ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 2 &&
+        ppi->gf_group.frame_parallel_level[cpi->gf_frame_index - 1] == 1 &&
+        ppi->gf_group.update_type[cpi->gf_frame_index - 1] ==
+            INTNL_ARF_UPDATE) {
+      memcpy(cm->ref_frame_map, ppi->ref_frame_map_copy,
+             sizeof(cm->ref_frame_map));
+    }
+    refresh_reference_frames(cpi);
+    // For frame_parallel_level 1 frame in a parallel encode set of lower layer
+    // frames, store the updated cm->ref_frame_map in ppi->ref_frame_map_copy.
+    if (ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 1 &&
+        ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE) {
+      memcpy(ppi->ref_frame_map_copy, cm->ref_frame_map,
+             sizeof(cm->ref_frame_map));
+    }
+    av1_rc_postencode_update(cpi, cpi_data->frame_size);
+  }
+
+  if (cpi_data->pop_lookahead == 1) {
+    av1_lookahead_pop(cpi->ppi->lookahead, cpi_data->flush,
+                      cpi->compressor_stage);
+  }
+  if (cpi->common.show_frame) {
+    cpi->ppi->ts_start_last_show_frame = cpi_data->ts_frame_start;
+    cpi->ppi->ts_end_last_show_frame = cpi_data->ts_frame_end;
+  }
+  if (ppi->level_params.keep_level_stats && !is_stat_generation_stage(cpi)) {
+    // Initialize level info at the beginning of each sequence.
+    if (cm->current_frame.frame_type == KEY_FRAME &&
+        ppi->gf_group.refbuf_state[cpi->gf_frame_index] == REFBUF_RESET) {
+      av1_init_level_info(cpi);
+    }
+    av1_update_level_info(cpi, cpi_data->frame_size, cpi_data->ts_frame_start,
+                          cpi_data->ts_frame_end);
+  }
+
+  if (!is_stat_generation_stage(cpi)) {
+#if !CONFIG_REALTIME_ONLY
+    if (!has_no_stats_stage(cpi)) av1_twopass_postencode_update(cpi);
+#endif
+    update_fb_of_context_type(cpi, ppi->fb_of_context_type);
+    update_rc_counts(cpi);
+    update_end_of_frame_stats(cpi);
+  }
+
+  if (cpi->oxcf.pass == AOM_RC_THIRD_PASS && cpi->third_pass_ctx) {
+    av1_pop_third_pass_info(cpi->third_pass_ctx);
+  }
+
+  if (ppi->rtc_ref.set_ref_frame_config) {
+    av1_svc_update_buffer_slot_refreshed(cpi);
+    av1_svc_set_reference_was_previous(cpi);
+  }
+
+  if (ppi->use_svc) av1_save_layer_context(cpi);
+
+  // Note: *size = 0 indicates a dropped frame, for which psnr is not
+  // calculated.
+  if (ppi->b_calculate_psnr && cpi_data->frame_size > 0) {
+    if (cm->show_existing_frame ||
+        (!is_stat_generation_stage(cpi) && cm->show_frame)) {
+      generate_psnr_packet(cpi);
+    }
+  }
+
+#if CONFIG_INTERNAL_STATS
+  if (!is_stat_generation_stage(cpi)) {
+    compute_internal_stats(cpi, (int)cpi_data->frame_size);
+  }
+#endif  // CONFIG_INTERNAL_STATS
+
+  // Write frame info. Subtract 1 from the frame index since it was
+  // incremented in update_rc_counts.
+  av1_write_second_pass_per_frame_info(cpi, cpi->gf_frame_index - 1);
+}
+
+int av1_get_compressed_data(AV1_COMP *cpi, AV1_COMP_DATA *const cpi_data) {
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  AV1_COMMON *const cm = &cpi->common;
+
+  // The jmp_buf is valid only for the duration of the function that calls
+  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+  // before it returns.
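+  // On the initial call setjmp() returns 0 and encoding proceeds below; if
+  // an internal error later longjmp()s back here, setjmp() returns non-zero
+  // and the error code recorded by the error handler is returned instead.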
+ if (setjmp(cm->error->jmp)) { + cm->error->setjmp = 0; + return cm->error->error_code; + } + cm->error->setjmp = 1; + +#if CONFIG_INTERNAL_STATS + cpi->frame_recode_hits = 0; + cpi->time_compress_data = 0; + cpi->bytes = 0; +#endif +#if CONFIG_ENTROPY_STATS + if (cpi->compressor_stage == ENCODE_STAGE) { + av1_zero(cpi->counts); + } +#endif + +#if CONFIG_BITSTREAM_DEBUG + assert(cpi->oxcf.max_threads <= 1 && + "bitstream debug tool does not support multithreading"); + bitstream_queue_record_write(); + + if (cm->seq_params->order_hint_info.enable_order_hint) { + aom_bitstream_queue_set_frame_write(cm->current_frame.order_hint * 2 + + cm->show_frame); + } else { + // This is currently used in RTC encoding. cm->show_frame is always 1. + aom_bitstream_queue_set_frame_write(cm->current_frame.frame_number); + } +#endif + if (cpi->ppi->use_svc) { + av1_one_pass_cbr_svc_start_layer(cpi); + } + + cpi->is_dropped_frame = false; + cm->showable_frame = 0; + cpi_data->frame_size = 0; + cpi->available_bs_size = cpi_data->cx_data_sz; +#if CONFIG_INTERNAL_STATS + struct aom_usec_timer cmptimer; + aom_usec_timer_start(&cmptimer); +#endif + av1_set_high_precision_mv(cpi, 1, 0); + + // Normal defaults + cm->features.refresh_frame_context = + oxcf->tool_cfg.frame_parallel_decoding_mode + ? REFRESH_FRAME_CONTEXT_DISABLED + : REFRESH_FRAME_CONTEXT_BACKWARD; + if (oxcf->tile_cfg.enable_large_scale_tile) + cm->features.refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED; + + if (assign_cur_frame_new_fb(cm) == NULL) { + aom_internal_error(cpi->common.error, AOM_CODEC_ERROR, + "Failed to allocate new cur_frame"); + } + +#if CONFIG_COLLECT_COMPONENT_TIMING + // Accumulate 2nd pass time in 2-pass case or 1 pass time in 1-pass case. + if (cpi->oxcf.pass == 2 || cpi->oxcf.pass == 0) + start_timing(cpi, av1_encode_strategy_time); +#endif + + const int result = av1_encode_strategy( + cpi, &cpi_data->frame_size, cpi_data->cx_data, &cpi_data->lib_flags, + &cpi_data->ts_frame_start, &cpi_data->ts_frame_end, + cpi_data->timestamp_ratio, &cpi_data->pop_lookahead, cpi_data->flush); + +#if CONFIG_COLLECT_COMPONENT_TIMING + if (cpi->oxcf.pass == 2 || cpi->oxcf.pass == 0) + end_timing(cpi, av1_encode_strategy_time); + + // Print out timing information. + // Note: Use "cpi->frame_component_time[0] > 100 us" to avoid showing of + // show_existing_frame and lag-in-frames. + if ((cpi->oxcf.pass == 2 || cpi->oxcf.pass == 0) && + cpi->frame_component_time[0] > 100) { + int i; + uint64_t frame_total = 0, total = 0; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + FRAME_UPDATE_TYPE frame_update_type = + get_frame_update_type(gf_group, cpi->gf_frame_index); + + fprintf(stderr, + "\n Frame number: %d, Frame type: %s, Show Frame: %d, Frame Update " + "Type: %d, Q: %d\n", + cm->current_frame.frame_number, + get_frame_type_enum(cm->current_frame.frame_type), cm->show_frame, + frame_update_type, cm->quant_params.base_qindex); + for (i = 0; i < kTimingComponents; i++) { + cpi->component_time[i] += cpi->frame_component_time[i]; + // Use av1_encode_strategy_time (i = 0) as the total time. 
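+      // Component 0 therefore provides the denominators for the per-component
+      // percentages printed below.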
+      if (i == 0) {
+        frame_total = cpi->frame_component_time[0];
+        total = cpi->component_time[0];
+      }
+      fprintf(stderr,
+              " %50s: %15" PRId64 " us [%6.2f%%] (total: %15" PRId64
+              " us [%6.2f%%])\n",
+              get_component_name(i), cpi->frame_component_time[i],
+              (float)((float)cpi->frame_component_time[i] * 100.0 /
+                      (float)frame_total),
+              cpi->component_time[i],
+              (float)((float)cpi->component_time[i] * 100.0 / (float)total));
+      cpi->frame_component_time[i] = 0;
+    }
+  }
+#endif
+
+  // Reset the flag to 0 after encoding.
+  cpi->rc.use_external_qp_one_pass = 0;
+
+  if (result == -1) {
+    cm->error->setjmp = 0;
+    // Returning -1 indicates that no frame was encoded; more input is
+    // required.
+    return -1;
+  }
+  if (result != AOM_CODEC_OK) {
+    aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+                       "Failed to encode frame");
+  }
+#if CONFIG_INTERNAL_STATS
+  aom_usec_timer_mark(&cmptimer);
+  cpi->time_compress_data += aom_usec_timer_elapsed(&cmptimer);
+#endif  // CONFIG_INTERNAL_STATS
+
+#if CONFIG_SPEED_STATS
+  if (!is_stat_generation_stage(cpi) && !cm->show_existing_frame) {
+    cpi->tx_search_count += cpi->td.mb.txfm_search_info.tx_search_count;
+    cpi->td.mb.txfm_search_info.tx_search_count = 0;
+  }
+#endif  // CONFIG_SPEED_STATS
+
+  cm->error->setjmp = 0;
+  return AOM_CODEC_OK;
+}
+
+// Populates cpi->scaled_ref_buf corresponding to frames in a parallel encode
+// set. Also sets the bitmask 'ref_buffers_used_map'.
+void av1_scale_references_fpmt(AV1_COMP *cpi, int *ref_buffers_used_map) {
+  AV1_COMMON *cm = &cpi->common;
+  MV_REFERENCE_FRAME ref_frame;
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    // Need to convert from AOM_REFFRAME to an index into ref_mask
+    // (subtract 1).
+    if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
+      const YV12_BUFFER_CONFIG *const ref =
+          get_ref_frame_yv12_buf(cm, ref_frame);
+
+      if (ref == NULL) {
+        cpi->scaled_ref_buf[ref_frame - 1] = NULL;
+        continue;
+      }
+
+      // FPMT does not support scaling yet.
+      assert(ref->y_crop_width == cm->width &&
+             ref->y_crop_height == cm->height);
+
+      RefCntBuffer *buf = get_ref_frame_buf(cm, ref_frame);
+      cpi->scaled_ref_buf[ref_frame - 1] = buf;
+      for (int i = 0; i < cm->buffer_pool->num_frame_bufs; ++i) {
+        if (&cm->buffer_pool->frame_bufs[i] == buf) {
+          *ref_buffers_used_map |= (1 << i);
+        }
+      }
+    } else {
+      if (!has_no_stats_stage(cpi)) cpi->scaled_ref_buf[ref_frame - 1] = NULL;
+    }
+  }
+}
+
+// Increments the ref_count of frame buffers referenced by cpi->scaled_ref_buf
+// corresponding to frames in a parallel encode set.
+void av1_increment_scaled_ref_counts_fpmt(BufferPool *buffer_pool,
+                                          int ref_buffers_used_map) {
+  for (int i = 0; i < buffer_pool->num_frame_bufs; ++i) {
+    if (ref_buffers_used_map & (1 << i)) {
+      ++buffer_pool->frame_bufs[i].ref_count;
+    }
+  }
+}
+
+// Releases cpi->scaled_ref_buf corresponding to frames in a parallel encode
+// set.
+void av1_release_scaled_references_fpmt(AV1_COMP *cpi) {
+  // TODO(isbs): only refresh the necessary frames, rather than all of them
+  for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+    RefCntBuffer *const buf = cpi->scaled_ref_buf[i];
+    if (buf != NULL) {
+      cpi->scaled_ref_buf[i] = NULL;
+    }
+  }
+}
+
+// Decrements the ref_count of frame buffers referenced by cpi->scaled_ref_buf
+// corresponding to frames in a parallel encode set.
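+// For example, a ref_buffers_used_map of 0b0101 decrements the ref_count of
+// frame_bufs[0] and frame_bufs[2], undoing
+// av1_increment_scaled_ref_counts_fpmt() above.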
+void av1_decrement_ref_counts_fpmt(BufferPool *buffer_pool, + int ref_buffers_used_map) { + for (int i = 0; i < buffer_pool->num_frame_bufs; ++i) { + if (ref_buffers_used_map & (1 << i)) { + --buffer_pool->frame_bufs[i].ref_count; + } + } +} + +// Initialize parallel frame contexts with screen content decisions. +void av1_init_sc_decisions(AV1_PRIMARY *const ppi) { + AV1_COMP *const first_cpi = ppi->cpi; + for (int i = 1; i < ppi->num_fp_contexts; ++i) { + AV1_COMP *cur_cpi = ppi->parallel_cpi[i]; + cur_cpi->common.features.allow_screen_content_tools = + first_cpi->common.features.allow_screen_content_tools; + cur_cpi->common.features.allow_intrabc = + first_cpi->common.features.allow_intrabc; + cur_cpi->use_screen_content_tools = first_cpi->use_screen_content_tools; + cur_cpi->is_screen_content_type = first_cpi->is_screen_content_type; + } +} + +AV1_COMP *av1_get_parallel_frame_enc_data(AV1_PRIMARY *const ppi, + AV1_COMP_DATA *const first_cpi_data) { + int cpi_idx = 0; + + // Loop over parallel_cpi to find the cpi that processed the current + // gf_frame_index ahead of time. + for (int i = 1; i < ppi->num_fp_contexts; i++) { + if (ppi->cpi->gf_frame_index == ppi->parallel_cpi[i]->gf_frame_index) { + cpi_idx = i; + break; + } + } + + assert(cpi_idx > 0); + assert(!ppi->parallel_cpi[cpi_idx]->common.show_existing_frame); + + // Release the previously-used frame-buffer. + if (ppi->cpi->common.cur_frame != NULL) { + --ppi->cpi->common.cur_frame->ref_count; + ppi->cpi->common.cur_frame = NULL; + } + + // Swap the appropriate parallel_cpi with the parallel_cpi[0]. + ppi->cpi = ppi->parallel_cpi[cpi_idx]; + ppi->parallel_cpi[cpi_idx] = ppi->parallel_cpi[0]; + ppi->parallel_cpi[0] = ppi->cpi; + + // Copy appropriate parallel_frames_data to local data. + { + AV1_COMP_DATA *data = &ppi->parallel_frames_data[cpi_idx - 1]; + assert(data->frame_size > 0); + assert(first_cpi_data->cx_data_sz > data->frame_size); + + first_cpi_data->lib_flags = data->lib_flags; + first_cpi_data->ts_frame_start = data->ts_frame_start; + first_cpi_data->ts_frame_end = data->ts_frame_end; + memcpy(first_cpi_data->cx_data, data->cx_data, data->frame_size); + first_cpi_data->frame_size = data->frame_size; + if (ppi->cpi->common.show_frame) { + first_cpi_data->pop_lookahead = 1; + } + } + + return ppi->cpi; +} + +// Initialises frames belonging to a parallel encode set. +int av1_init_parallel_frame_context(const AV1_COMP_DATA *const first_cpi_data, + AV1_PRIMARY *const ppi, + int *ref_buffers_used_map) { + AV1_COMP *const first_cpi = ppi->cpi; + GF_GROUP *const gf_group = &ppi->gf_group; + int gf_index_start = first_cpi->gf_frame_index; + assert(gf_group->frame_parallel_level[gf_index_start] == 1); + int parallel_frame_count = 0; + int cur_frame_num = first_cpi->common.current_frame.frame_number; + int show_frame_count = first_cpi->frame_index_set.show_frame_count; + int frames_since_key = first_cpi->rc.frames_since_key; + int frames_to_key = first_cpi->rc.frames_to_key; + int frames_to_fwd_kf = first_cpi->rc.frames_to_fwd_kf; + int cur_frame_disp = cur_frame_num + gf_group->arf_src_offset[gf_index_start]; + const FIRSTPASS_STATS *stats_in = first_cpi->twopass_frame.stats_in; + + assert(*ref_buffers_used_map == 0); + + // Release the previously used frame-buffer by a frame_parallel_level 1 frame. 
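+  // Dropping this reference allows the buffer pool to recycle the slot once
+  // no other frame still holds it.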
+ if (first_cpi->common.cur_frame != NULL) { + --first_cpi->common.cur_frame->ref_count; + first_cpi->common.cur_frame = NULL; + } + + RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]; + RefFrameMapPair first_ref_frame_map_pairs[REF_FRAMES]; + init_ref_map_pair(first_cpi, first_ref_frame_map_pairs); + memcpy(ref_frame_map_pairs, first_ref_frame_map_pairs, + sizeof(RefFrameMapPair) * REF_FRAMES); + + // Store the reference refresh index of frame_parallel_level 1 frame in a + // parallel encode set of lower layer frames. + if (gf_group->update_type[gf_index_start] == INTNL_ARF_UPDATE) { + first_cpi->ref_refresh_index = av1_calc_refresh_idx_for_intnl_arf( + first_cpi, ref_frame_map_pairs, gf_index_start); + assert(first_cpi->ref_refresh_index != INVALID_IDX && + first_cpi->ref_refresh_index < REF_FRAMES); + first_cpi->refresh_idx_available = true; + // Update ref_frame_map_pairs. + ref_frame_map_pairs[first_cpi->ref_refresh_index].disp_order = + gf_group->display_idx[gf_index_start]; + ref_frame_map_pairs[first_cpi->ref_refresh_index].pyr_level = + gf_group->layer_depth[gf_index_start]; + } + + // Set do_frame_data_update flag as false for frame_parallel_level 1 frame. + first_cpi->do_frame_data_update = false; + if (gf_group->arf_src_offset[gf_index_start] == 0) { + first_cpi->time_stamps.prev_ts_start = ppi->ts_start_last_show_frame; + first_cpi->time_stamps.prev_ts_end = ppi->ts_end_last_show_frame; + } + + av1_get_ref_frames(first_ref_frame_map_pairs, cur_frame_disp, first_cpi, + gf_index_start, 1, first_cpi->common.remapped_ref_idx); + + av1_scale_references_fpmt(first_cpi, ref_buffers_used_map); + parallel_frame_count++; + + // Iterate through the GF_GROUP to find the remaining frame_parallel_level 2 + // frames which are part of the current parallel encode set and initialize the + // required cpi elements. + for (int i = gf_index_start + 1; i < gf_group->size; i++) { + // Update frame counters if previous frame was show frame or show existing + // frame. 
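+    // A zero arf_src_offset marks such a frame, so the display-side counters
+    // (frame number, show count, key-frame distances) advance by one before
+    // this GF-group slot is processed.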
+ if (gf_group->arf_src_offset[i - 1] == 0) { + cur_frame_num++; + show_frame_count++; + if (frames_to_fwd_kf <= 0) + frames_to_fwd_kf = first_cpi->oxcf.kf_cfg.fwd_kf_dist; + if (frames_to_key) { + frames_since_key++; + frames_to_key--; + frames_to_fwd_kf--; + } + stats_in++; + } + cur_frame_disp = cur_frame_num + gf_group->arf_src_offset[i]; + if (gf_group->frame_parallel_level[i] == 2) { + AV1_COMP *cur_cpi = ppi->parallel_cpi[parallel_frame_count]; + AV1_COMP_DATA *cur_cpi_data = + &ppi->parallel_frames_data[parallel_frame_count - 1]; + cur_cpi->gf_frame_index = i; + cur_cpi->framerate = first_cpi->framerate; + cur_cpi->common.current_frame.frame_number = cur_frame_num; + cur_cpi->common.current_frame.frame_type = gf_group->frame_type[i]; + cur_cpi->frame_index_set.show_frame_count = show_frame_count; + cur_cpi->rc.frames_since_key = frames_since_key; + cur_cpi->rc.frames_to_key = frames_to_key; + cur_cpi->rc.frames_to_fwd_kf = frames_to_fwd_kf; + cur_cpi->rc.active_worst_quality = first_cpi->rc.active_worst_quality; + cur_cpi->rc.avg_frame_bandwidth = first_cpi->rc.avg_frame_bandwidth; + cur_cpi->rc.max_frame_bandwidth = first_cpi->rc.max_frame_bandwidth; + cur_cpi->rc.min_frame_bandwidth = first_cpi->rc.min_frame_bandwidth; + cur_cpi->rc.intervals_till_gf_calculate_due = + first_cpi->rc.intervals_till_gf_calculate_due; + cur_cpi->mv_search_params.max_mv_magnitude = + first_cpi->mv_search_params.max_mv_magnitude; + if (gf_group->update_type[cur_cpi->gf_frame_index] == INTNL_ARF_UPDATE) { + cur_cpi->common.lf.mode_ref_delta_enabled = 1; + } + cur_cpi->do_frame_data_update = false; + // Initialize prev_ts_start and prev_ts_end for show frame(s) and show + // existing frame(s). + if (gf_group->arf_src_offset[i] == 0) { + // Choose source of prev frame. + int src_index = gf_group->src_offset[i]; + struct lookahead_entry *prev_source = av1_lookahead_peek( + ppi->lookahead, src_index - 1, cur_cpi->compressor_stage); + // Save timestamps of prev frame. + cur_cpi->time_stamps.prev_ts_start = prev_source->ts_start; + cur_cpi->time_stamps.prev_ts_end = prev_source->ts_end; + } + cur_cpi->time_stamps.first_ts_start = + first_cpi->time_stamps.first_ts_start; + + memcpy(cur_cpi->common.ref_frame_map, first_cpi->common.ref_frame_map, + sizeof(first_cpi->common.ref_frame_map)); + cur_cpi_data->lib_flags = 0; + cur_cpi_data->timestamp_ratio = first_cpi_data->timestamp_ratio; + cur_cpi_data->flush = first_cpi_data->flush; + cur_cpi_data->frame_size = 0; + if (gf_group->update_type[gf_index_start] == INTNL_ARF_UPDATE) { + // If the first frame in a parallel encode set is INTNL_ARF_UPDATE + // frame, initialize lib_flags of frame_parallel_level 2 frame in the + // set with that of frame_parallel_level 1 frame. + cur_cpi_data->lib_flags = first_cpi_data->lib_flags; + // Store the reference refresh index of frame_parallel_level 2 frame in + // a parallel encode set of lower layer frames. + cur_cpi->ref_refresh_index = + av1_calc_refresh_idx_for_intnl_arf(cur_cpi, ref_frame_map_pairs, i); + cur_cpi->refresh_idx_available = true; + // Skip the reference frame which will be refreshed by + // frame_parallel_level 1 frame in a parallel encode set of lower layer + // frames. 
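+        // That slot is still being written by the concurrently encoded
+        // frame, so its contents cannot be relied upon here.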
+ cur_cpi->ref_idx_to_skip = first_cpi->ref_refresh_index; + } else { + cur_cpi->ref_idx_to_skip = INVALID_IDX; + cur_cpi->ref_refresh_index = INVALID_IDX; + cur_cpi->refresh_idx_available = false; + } + cur_cpi->twopass_frame.stats_in = stats_in; + + av1_get_ref_frames(first_ref_frame_map_pairs, cur_frame_disp, cur_cpi, i, + 1, cur_cpi->common.remapped_ref_idx); + av1_scale_references_fpmt(cur_cpi, ref_buffers_used_map); + parallel_frame_count++; + } + + // Set do_frame_data_update to true for the last frame_parallel_level 2 + // frame in the current parallel encode set. + if (i == (gf_group->size - 1) || + (gf_group->frame_parallel_level[i + 1] == 0 && + (gf_group->update_type[i + 1] == ARF_UPDATE || + gf_group->update_type[i + 1] == INTNL_ARF_UPDATE)) || + gf_group->frame_parallel_level[i + 1] == 1) { + ppi->parallel_cpi[parallel_frame_count - 1]->do_frame_data_update = true; + break; + } + } + + av1_increment_scaled_ref_counts_fpmt(first_cpi->common.buffer_pool, + *ref_buffers_used_map); + + // Return the number of frames in the parallel encode set. + return parallel_frame_count; +} + +int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest) { + AV1_COMMON *cm = &cpi->common; + if (!cm->show_frame) { + return -1; + } else { + int ret; + if (cm->cur_frame != NULL && !cpi->oxcf.algo_cfg.skip_postproc_filtering) { + *dest = cm->cur_frame->buf; + dest->y_width = cm->width; + dest->y_height = cm->height; + dest->uv_width = cm->width >> cm->seq_params->subsampling_x; + dest->uv_height = cm->height >> cm->seq_params->subsampling_y; + ret = 0; + } else { + ret = -1; + } + return ret; + } +} + +int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame) { + if (cpi->last_show_frame_buf == NULL || + cpi->oxcf.algo_cfg.skip_postproc_filtering) + return -1; + + *frame = cpi->last_show_frame_buf->buf; + return 0; +} + +aom_codec_err_t av1_copy_new_frame_enc(AV1_COMMON *cm, + YV12_BUFFER_CONFIG *new_frame, + YV12_BUFFER_CONFIG *sd) { + const int num_planes = av1_num_planes(cm); + if (!equal_dimensions_and_border(new_frame, sd)) + aom_internal_error(cm->error, AOM_CODEC_ERROR, + "Incorrect buffer dimensions"); + else + aom_yv12_copy_frame(new_frame, sd, num_planes); + + return cm->error->error_code; +} + +int av1_set_internal_size(AV1EncoderConfig *const oxcf, + ResizePendingParams *resize_pending_params, + AOM_SCALING_MODE horiz_mode, + AOM_SCALING_MODE vert_mode) { + int hr = 0, hs = 0, vr = 0, vs = 0; + + // Checks for invalid AOM_SCALING_MODE values. 
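+  // Scale2Ratio() maps each mode to a ratio hr/hs (vr/vs), and the pending
+  // dimensions below round up: e.g. a 353-pixel-wide source at one-half
+  // scale gives (2 - 1 + 353 * 1) / 2 = 177.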
+ if (horiz_mode > AOME_ONETHREE || vert_mode > AOME_ONETHREE) return -1; + + Scale2Ratio(horiz_mode, &hr, &hs); + Scale2Ratio(vert_mode, &vr, &vs); + + // always go to the next whole number + resize_pending_params->width = (hs - 1 + oxcf->frm_dim_cfg.width * hr) / hs; + resize_pending_params->height = (vs - 1 + oxcf->frm_dim_cfg.height * vr) / vs; + + if (horiz_mode != AOME_NORMAL || vert_mode != AOME_NORMAL) { + oxcf->resize_cfg.resize_mode = RESIZE_FIXED; + oxcf->algo_cfg.enable_tpl_model = 0; + } + return 0; +} + +int av1_get_quantizer(AV1_COMP *cpi) { + return cpi->common.quant_params.base_qindex; +} + +int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t *frame_size) { + size_t output_size = 0; + size_t total_bytes_read = 0; + size_t remaining_size = *frame_size; + uint8_t *buff_ptr = buffer; + + // go through each OBUs + while (total_bytes_read < *frame_size) { + uint8_t saved_obu_header[2]; + uint64_t obu_payload_size; + size_t length_of_payload_size; + size_t length_of_obu_size; + uint32_t obu_header_size = (buff_ptr[0] >> 2) & 0x1 ? 2 : 1; + size_t obu_bytes_read = obu_header_size; // bytes read for current obu + + // save the obu header (1 or 2 bytes) + memmove(saved_obu_header, buff_ptr, obu_header_size); + // clear the obu_has_size_field + saved_obu_header[0] = saved_obu_header[0] & (~0x2); + + // get the payload_size and length of payload_size + if (aom_uleb_decode(buff_ptr + obu_header_size, remaining_size, + &obu_payload_size, &length_of_payload_size) != 0) { + return AOM_CODEC_ERROR; + } + obu_bytes_read += length_of_payload_size; + + // calculate the length of size of the obu header plus payload + length_of_obu_size = + aom_uleb_size_in_bytes((uint64_t)(obu_header_size + obu_payload_size)); + + // move the rest of data to new location + memmove(buff_ptr + length_of_obu_size + obu_header_size, + buff_ptr + obu_bytes_read, remaining_size - obu_bytes_read); + obu_bytes_read += (size_t)obu_payload_size; + + // write the new obu size + const uint64_t obu_size = obu_header_size + obu_payload_size; + size_t coded_obu_size; + if (aom_uleb_encode(obu_size, sizeof(obu_size), buff_ptr, + &coded_obu_size) != 0) { + return AOM_CODEC_ERROR; + } + + // write the saved (modified) obu_header following obu size + memmove(buff_ptr + length_of_obu_size, saved_obu_header, obu_header_size); + + total_bytes_read += obu_bytes_read; + remaining_size -= obu_bytes_read; + buff_ptr += length_of_obu_size + obu_size; + output_size += length_of_obu_size + (size_t)obu_size; + } + + *frame_size = output_size; + return AOM_CODEC_OK; +} + +static void rtc_set_updates_ref_frame_config( + ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags, + RTC_REF *const rtc_ref) { + ext_refresh_frame_flags->update_pending = 1; + ext_refresh_frame_flags->last_frame = rtc_ref->refresh[rtc_ref->ref_idx[0]]; + ext_refresh_frame_flags->golden_frame = rtc_ref->refresh[rtc_ref->ref_idx[3]]; + ext_refresh_frame_flags->bwd_ref_frame = + rtc_ref->refresh[rtc_ref->ref_idx[4]]; + ext_refresh_frame_flags->alt2_ref_frame = + rtc_ref->refresh[rtc_ref->ref_idx[5]]; + ext_refresh_frame_flags->alt_ref_frame = + rtc_ref->refresh[rtc_ref->ref_idx[6]]; + rtc_ref->non_reference_frame = 1; + for (int i = 0; i < REF_FRAMES; i++) { + if (rtc_ref->refresh[i] == 1) { + rtc_ref->non_reference_frame = 0; + break; + } + } +} + +static int rtc_set_references_external_ref_frame_config(AV1_COMP *cpi) { + // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3), + // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6). 
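+  // Start from the full mask and clear the bit of every frame that is not
+  // marked as a reference; e.g. keeping only LAST and GOLDEN yields
+  // AOM_LAST_FLAG | AOM_GOLD_FLAG.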
+ int ref = AOM_REFFRAME_ALL; + for (int i = 0; i < INTER_REFS_PER_FRAME; i++) { + if (!cpi->ppi->rtc_ref.reference[i]) ref ^= (1 << i); + } + return ref; +} + +void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags) { + // TODO(yunqingwang): For what references to use, external encoding flags + // should be consistent with internal reference frame selection. Need to + // ensure that there is not conflict between the two. In AV1 encoder, the + // priority rank for 7 reference frames are: LAST, ALTREF, LAST2, LAST3, + // GOLDEN, BWDREF, ALTREF2. + + ExternalFlags *const ext_flags = &cpi->ext_flags; + ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags = + &ext_flags->refresh_frame; + ext_flags->ref_frame_flags = AOM_REFFRAME_ALL; + if (flags & + (AOM_EFLAG_NO_REF_LAST | AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 | + AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | AOM_EFLAG_NO_REF_BWD | + AOM_EFLAG_NO_REF_ARF2)) { + int ref = AOM_REFFRAME_ALL; + + if (flags & AOM_EFLAG_NO_REF_LAST) ref ^= AOM_LAST_FLAG; + if (flags & AOM_EFLAG_NO_REF_LAST2) ref ^= AOM_LAST2_FLAG; + if (flags & AOM_EFLAG_NO_REF_LAST3) ref ^= AOM_LAST3_FLAG; + + if (flags & AOM_EFLAG_NO_REF_GF) ref ^= AOM_GOLD_FLAG; + + if (flags & AOM_EFLAG_NO_REF_ARF) { + ref ^= AOM_ALT_FLAG; + ref ^= AOM_BWD_FLAG; + ref ^= AOM_ALT2_FLAG; + } else { + if (flags & AOM_EFLAG_NO_REF_BWD) ref ^= AOM_BWD_FLAG; + if (flags & AOM_EFLAG_NO_REF_ARF2) ref ^= AOM_ALT2_FLAG; + } + + av1_use_as_reference(&ext_flags->ref_frame_flags, ref); + } else { + if (cpi->ppi->rtc_ref.set_ref_frame_config) { + int ref = rtc_set_references_external_ref_frame_config(cpi); + av1_use_as_reference(&ext_flags->ref_frame_flags, ref); + } + } + + if (flags & + (AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF)) { + int upd = AOM_REFFRAME_ALL; + + // Refreshing LAST/LAST2/LAST3 is handled by 1 common flag. 
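+    // e.g. passing AOM_EFLAG_NO_UPD_LAST alone leaves upd with the GOLDEN,
+    // ALTREF, BWDREF and ALTREF2 bits still set.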
+ if (flags & AOM_EFLAG_NO_UPD_LAST) upd ^= AOM_LAST_FLAG; + + if (flags & AOM_EFLAG_NO_UPD_GF) upd ^= AOM_GOLD_FLAG; + + if (flags & AOM_EFLAG_NO_UPD_ARF) { + upd ^= AOM_ALT_FLAG; + upd ^= AOM_BWD_FLAG; + upd ^= AOM_ALT2_FLAG; + } + + ext_refresh_frame_flags->last_frame = (upd & AOM_LAST_FLAG) != 0; + ext_refresh_frame_flags->golden_frame = (upd & AOM_GOLD_FLAG) != 0; + ext_refresh_frame_flags->alt_ref_frame = (upd & AOM_ALT_FLAG) != 0; + ext_refresh_frame_flags->bwd_ref_frame = (upd & AOM_BWD_FLAG) != 0; + ext_refresh_frame_flags->alt2_ref_frame = (upd & AOM_ALT2_FLAG) != 0; + ext_refresh_frame_flags->update_pending = 1; + } else { + if (cpi->ppi->rtc_ref.set_ref_frame_config) + rtc_set_updates_ref_frame_config(ext_refresh_frame_flags, + &cpi->ppi->rtc_ref); + else + ext_refresh_frame_flags->update_pending = 0; + } + + ext_flags->use_ref_frame_mvs = cpi->oxcf.tool_cfg.enable_ref_frame_mvs & + ((flags & AOM_EFLAG_NO_REF_FRAME_MVS) == 0); + ext_flags->use_error_resilient = cpi->oxcf.tool_cfg.error_resilient_mode | + ((flags & AOM_EFLAG_ERROR_RESILIENT) != 0); + ext_flags->use_s_frame = + cpi->oxcf.kf_cfg.enable_sframe | ((flags & AOM_EFLAG_SET_S_FRAME) != 0); + ext_flags->use_primary_ref_none = + (flags & AOM_EFLAG_SET_PRIMARY_REF_NONE) != 0; + + if (flags & AOM_EFLAG_NO_UPD_ENTROPY) { + update_entropy(&ext_flags->refresh_frame_context, + &ext_flags->refresh_frame_context_pending, 0); + } +} + +aom_fixed_buf_t *av1_get_global_headers(AV1_PRIMARY *ppi) { + if (!ppi) return NULL; + + uint8_t header_buf[512] = { 0 }; + const uint32_t sequence_header_size = + av1_write_sequence_header_obu(&ppi->seq_params, &header_buf[0]); + assert(sequence_header_size <= sizeof(header_buf)); + if (sequence_header_size == 0) return NULL; + + const size_t obu_header_size = 1; + const size_t size_field_size = aom_uleb_size_in_bytes(sequence_header_size); + const size_t payload_offset = obu_header_size + size_field_size; + + if (payload_offset + sequence_header_size > sizeof(header_buf)) return NULL; + memmove(&header_buf[payload_offset], &header_buf[0], sequence_header_size); + + if (av1_write_obu_header(&ppi->level_params, &ppi->cpi->frame_header_count, + OBU_SEQUENCE_HEADER, 0, + &header_buf[0]) != obu_header_size) { + return NULL; + } + + size_t coded_size_field_size = 0; + if (aom_uleb_encode(sequence_header_size, size_field_size, + &header_buf[obu_header_size], + &coded_size_field_size) != 0) { + return NULL; + } + assert(coded_size_field_size == size_field_size); + + aom_fixed_buf_t *global_headers = + (aom_fixed_buf_t *)malloc(sizeof(*global_headers)); + if (!global_headers) return NULL; + + const size_t global_header_buf_size = + obu_header_size + size_field_size + sequence_header_size; + + global_headers->buf = malloc(global_header_buf_size); + if (!global_headers->buf) { + free(global_headers); + return NULL; + } + + memcpy(global_headers->buf, &header_buf[0], global_header_buf_size); + global_headers->sz = global_header_buf_size; + return global_headers; +} diff --git a/third_party/aom/av1/encoder/encoder.h b/third_party/aom/av1/encoder/encoder.h new file mode 100644 index 0000000000..5f6f67eda8 --- /dev/null +++ b/third_party/aom/av1/encoder/encoder.h @@ -0,0 +1,4512 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Declares top-level encoder structures and functions.
+ */
+#ifndef AOM_AV1_ENCODER_ENCODER_H_
+#define AOM_AV1_ENCODER_ENCODER_H_
+
+#include <stdbool.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aomcx.h"
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/enums.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/resize.h"
+#include "av1/common/thread_common.h"
+#include "av1/common/timing.h"
+
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/external_partition.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/global_motion.h"
+#include "av1/encoder/level.h"
+#include "av1/encoder/lookahead.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/pickcdef.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/speed_features.h"
+#include "av1/encoder/svc_layercontext.h"
+#include "av1/encoder/temporal_filter.h"
+#include "av1/encoder/thirdpass.h"
+#include "av1/encoder/tokenize.h"
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/av1_noise_estimate.h"
+#include "av1/encoder/bitstream.h"
+
+#if CONFIG_INTERNAL_STATS
+#include "aom_dsp/ssim.h"
+#endif
+#include "aom_dsp/variance.h"
+#if CONFIG_DENOISE
+#include "aom_dsp/noise_model.h"
+#endif
+#if CONFIG_TUNE_VMAF
+#include "av1/encoder/tune_vmaf.h"
+#endif
+#if CONFIG_AV1_TEMPORAL_DENOISING
+#include "av1/encoder/av1_temporal_denoiser.h"
+#endif
+#if CONFIG_TUNE_BUTTERAUGLI
+#include "av1/encoder/tune_butteraugli.h"
+#endif
+
+#include "aom/internal/aom_codec_internal.h"
+#include "aom_util/aom_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// TODO(yunqing, any): Added suppression tag to quiet Doxygen warnings. Need to
+// adjust it while we work on documentation.
+/*!\cond */
+// Number of frames required to test for scene cut detection.
+#define SCENE_CUT_KEY_TEST_INTERVAL 16
+
+// Lookahead index threshold to enable temporal filtering for second arf.
+#define TF_LOOKAHEAD_IDX_THR 7
+
+#define HDR_QP_LEVELS 10
+#define CHROMA_CB_QP_SCALE 1.04
+#define CHROMA_CR_QP_SCALE 1.04
+#define CHROMA_QP_SCALE -0.46
+#define CHROMA_QP_OFFSET 9.26
+#define QP_SCALE_FACTOR 2.0
+#define DISABLE_HDR_LUMA_DELTAQ 1
+
+// Rational number with an int64 numerator.
+// This structure holds a fractional value.
+typedef struct aom_rational64 {
+  int64_t num;  // fraction numerator
+  int den;      // fraction denominator
+} aom_rational64_t;  // alias for struct aom_rational
+
+enum {
+  // Good Quality Fast Encoding. The encoder balances quality with the amount
+  // of time it takes to encode the output. The speed setting controls how
+  // fast.
+  GOOD,
+  // Realtime Fast Encoding. Will force some restrictions on bitrate
+  // constraints.
+  REALTIME,
+  // All intra mode. All the frames are coded as intra frames.
+  ALLINTRA
+} UENUM1BYTE(MODE);
+
+enum {
+  FRAMEFLAGS_KEY = 1 << 0,
+  FRAMEFLAGS_GOLDEN = 1 << 1,
+  FRAMEFLAGS_BWDREF = 1 << 2,
+  // TODO(zoeliu): To determine whether a frame flag is needed for ALTREF2_FRAME
+  FRAMEFLAGS_ALTREF = 1 << 3,
+  FRAMEFLAGS_INTRAONLY = 1 << 4,
+  FRAMEFLAGS_SWITCH = 1 << 5,
+  FRAMEFLAGS_ERROR_RESILIENT = 1 << 6,
+} UENUM1BYTE(FRAMETYPE_FLAGS);
+
+#if CONFIG_FPMT_TEST
+enum {
+  PARALLEL_ENCODE = 0,
+  PARALLEL_SIMULATION_ENCODE,
+  NUM_FPMT_TEST_ENCODES
+} UENUM1BYTE(FPMT_TEST_ENC_CFG);
+#endif  // CONFIG_FPMT_TEST
+// 0 level frames are sometimes used for rate control purposes, but for
+// reference mapping purposes, the minimum level should be 1.
+#define MIN_PYR_LEVEL 1
+static INLINE int get_true_pyr_level(int frame_level, int frame_order,
+                                     int max_layer_depth) {
+  if (frame_order == 0) {
+    // Keyframe case
+    return MIN_PYR_LEVEL;
+  } else if (frame_level == MAX_ARF_LAYERS) {
+    // Leaves
+    return max_layer_depth;
+  } else if (frame_level == (MAX_ARF_LAYERS + 1)) {
+    // Altrefs
+    return MIN_PYR_LEVEL;
+  }
+  return AOMMAX(MIN_PYR_LEVEL, frame_level);
+}
+
+enum {
+  NO_AQ = 0,
+  VARIANCE_AQ = 1,
+  COMPLEXITY_AQ = 2,
+  CYCLIC_REFRESH_AQ = 3,
+  AQ_MODE_COUNT  // This should always be the last member of the enum
+} UENUM1BYTE(AQ_MODE);
+enum {
+  NO_DELTA_Q = 0,
+  DELTA_Q_OBJECTIVE = 1,          // Modulation to improve objective quality
+  DELTA_Q_PERCEPTUAL = 2,         // Modulation to improve video perceptual quality
+  DELTA_Q_PERCEPTUAL_AI = 3,      // Perceptual quality opt for all intra mode
+  DELTA_Q_USER_RATING_BASED = 4,  // User rating based delta q mode
+  DELTA_Q_HDR = 5,                // QP adjustment based on HDR block pixel average
+  DELTA_Q_MODE_COUNT              // This should always be the last member of the enum
+} UENUM1BYTE(DELTAQ_MODE);
+
+enum {
+  RESIZE_NONE = 0,     // No frame resizing allowed.
+  RESIZE_FIXED = 1,    // All frames are coded at the specified scale.
+  RESIZE_RANDOM = 2,   // All frames are coded at a random scale.
+  RESIZE_DYNAMIC = 3,  // Frames coded at lower scale based on rate control.
+  RESIZE_MODES
+} UENUM1BYTE(RESIZE_MODE);
+
+enum {
+  SS_CFG_SRC = 0,
+  SS_CFG_LOOKAHEAD = 1,
+  SS_CFG_FPF = 2,
+  SS_CFG_TOTAL = 3
+} UENUM1BYTE(SS_CFG_OFFSET);
+
+enum {
+  DISABLE_SCENECUT,        // For LAP, lag_in_frames < 19
+  ENABLE_SCENECUT_MODE_1,  // For LAP, lag_in_frames >=19 and < 33
+  ENABLE_SCENECUT_MODE_2   // For twopass and LAP - lag_in_frames >=33
+} UENUM1BYTE(SCENECUT_MODE);
+
+#define MAX_VBR_CORPUS_COMPLEXITY 10000
+
+typedef enum {
+  MOD_FP,           // First pass
+  MOD_TF,           // Temporal filtering
+  MOD_TPL,          // TPL
+  MOD_GME,          // Global motion estimation
+  MOD_ENC,          // Encode stage
+  MOD_LPF,          // Deblocking loop filter
+  MOD_CDEF_SEARCH,  // CDEF search
+  MOD_CDEF,         // CDEF frame
+  MOD_LR,           // Loop restoration filtering
+  MOD_PACK_BS,      // Pack bitstream
+  MOD_FRAME_ENC,    // Frame Parallel encode
+  MOD_AI,           // All intra
+  NUM_MT_MODULES
+} MULTI_THREADED_MODULES;
+
+/*!\endcond */
+
+/*!\enum COST_UPDATE_TYPE
+ * \brief This enum controls how often the entropy costs should be updated.
+ * \warning In case of any modifications/additions done to the enum
+ * COST_UPDATE_TYPE, the enum INTERNAL_COST_UPDATE_TYPE needs to be updated as
+ * well.
+ */
+typedef enum {
+  COST_UPD_SB,            /*!< Update every sb. */
+  COST_UPD_SBROW,         /*!< Update every sb row inside a tile. */
+  COST_UPD_TILE,          /*!< Update every tile. */
+  COST_UPD_OFF,           /*!< Turn off cost updates. */
+  NUM_COST_UPDATE_TYPES,  /*!< Number of cost update types. */
+} COST_UPDATE_TYPE;
+
+/*!\enum LOOPFILTER_CONTROL
+ * \brief This enum controls to which frames loopfilter is applied.
+ */
+typedef enum {
+  LOOPFILTER_NONE = 0,       /*!< Disable loopfilter on all frames. */
+  LOOPFILTER_ALL = 1,        /*!< Enable loopfilter for all frames. */
+  LOOPFILTER_REFERENCE = 2,  /*!< Disable loopfilter on non reference frames. */
+  LOOPFILTER_SELECTIVELY =
+      3, /*!< Disable loopfilter on frames with low motion. */
+} LOOPFILTER_CONTROL;
+
+/*!\enum SKIP_APPLY_POSTPROC_FILTER
+ * \brief This enum controls the application of post-processing filters on a
+ * reconstructed frame.
+ */
+typedef enum {
+  SKIP_APPLY_RESTORATION = 1 << 0,
+  SKIP_APPLY_SUPERRES = 1 << 1,
+  SKIP_APPLY_CDEF = 1 << 2,
+  SKIP_APPLY_LOOPFILTER = 1 << 3,
+} SKIP_APPLY_POSTPROC_FILTER;
+
+/*!
+ * \brief Encoder config related to resize.
+ */
+typedef struct {
+  /*!
+   * Indicates the frame resize mode to be used by the encoder.
+   */
+  RESIZE_MODE resize_mode;
+  /*!
+   * Indicates the denominator for resize of inter frames, assuming 8 as the
+   * numerator. Its value ranges between 8-16.
+   */
+  uint8_t resize_scale_denominator;
+  /*!
+   * Indicates the denominator for resize of key frames, assuming 8 as the
+   * numerator. Its value ranges between 8-16.
+   */
+  uint8_t resize_kf_scale_denominator;
+} ResizeCfg;
+
+/*!
+ * \brief Encoder config for coding block partitioning.
+ */
+typedef struct {
+  /*!
+   * Flag to indicate if rectangular partitions should be enabled.
+   */
+  bool enable_rect_partitions;
+  /*!
+   * Flag to indicate if AB partitions should be enabled.
+   */
+  bool enable_ab_partitions;
+  /*!
+   * Flag to indicate if 1:4 / 4:1 partitions should be enabled.
+   */
+  bool enable_1to4_partitions;
+  /*!
+   * Indicates the minimum partition size that should be allowed. Both width and
+   * height of a partition cannot be smaller than the min_partition_size.
+   */
+  BLOCK_SIZE min_partition_size;
+  /*!
+   * Indicates the maximum partition size that should be allowed. Both width and
+   * height of a partition cannot be larger than the max_partition_size.
+   */
+  BLOCK_SIZE max_partition_size;
+} PartitionCfg;
+
+/*!
+ * \brief Encoder flags for intra prediction.
+ */
+typedef struct {
+  /*!
+   * Flag to indicate if intra edge filtering process should be enabled.
+   */
+  bool enable_intra_edge_filter;
+  /*!
+   * Flag to indicate if recursive filtering based intra prediction should be
+   * enabled.
+   */
+  bool enable_filter_intra;
+  /*!
+   * Flag to indicate if smooth intra prediction modes should be enabled.
+   */
+  bool enable_smooth_intra;
+  /*!
+   * Flag to indicate if PAETH intra prediction mode should be enabled.
+   */
+  bool enable_paeth_intra;
+  /*!
+   * Flag to indicate if CFL uv intra mode should be enabled.
+   */
+  bool enable_cfl_intra;
+  /*!
+   * Flag to indicate if directional modes should be enabled.
+   */
+  bool enable_directional_intra;
+  /*!
+   * Flag to indicate if the subset of directional modes from D45 to D203 intra
+   * should be enabled. Has no effect if directional modes are disabled.
+   */
+  bool enable_diagonal_intra;
+  /*!
+   * Flag to indicate if delta angles for directional intra prediction should be
+   * enabled.
+   */
+  bool enable_angle_delta;
+  /*!
+   * Flag to indicate whether to automatically turn off several intra coding
+   * tools.
+   * This flag is only used when "--deltaq-mode=3" is true.
+   * When set to 1, the encoder will analyze the reconstruction quality
+   * as compared to the source image in the preprocessing pass.
+   * If the reconstruction quality is considered high enough, we disable
+   * the following intra coding tools, for better encoding speed:
+   * "--enable_smooth_intra",
+   * "--enable_paeth_intra",
+   * "--enable_cfl_intra",
+   * "--enable_diagonal_intra".
+   */
+  bool auto_intra_tools_off;
+} IntraModeCfg;
+
+/*!
+ * \brief Encoder flags for transform sizes and types.
+ */
+typedef struct {
+  /*!
+   * Flag to indicate if 64-pt transform should be enabled.
+   */
+  bool enable_tx64;
+  /*!
+   * Flag to indicate if flip and identity transform types should be enabled.
+   */
+  bool enable_flip_idtx;
+  /*!
+   * Flag to indicate if rectangular transform should be enabled.
+   */
+  bool enable_rect_tx;
+  /*!
+   * Flag to indicate whether or not to use a default reduced set for ext-tx
+   * rather than the potential full set of 16 transforms.
+   */
+  bool reduced_tx_type_set;
+  /*!
+   * Flag to indicate if transform type for intra blocks should be limited to
+   * DCT_DCT.
+   */
+  bool use_intra_dct_only;
+  /*!
+   * Flag to indicate if transform type for inter blocks should be limited to
+   * DCT_DCT.
+   */
+  bool use_inter_dct_only;
+  /*!
+   * Flag to indicate if intra blocks should use default transform type
+   * (mode-dependent) only.
+   */
+  bool use_intra_default_tx_only;
+  /*!
+   * Flag to indicate if transform size search should be enabled.
+   */
+  bool enable_tx_size_search;
+} TxfmSizeTypeCfg;
+
+/*!
+ * \brief Encoder flags for compound prediction modes.
+ */
+typedef struct {
+  /*!
+   * Flag to indicate if distance-weighted compound type should be enabled.
+   */
+  bool enable_dist_wtd_comp;
+  /*!
+   * Flag to indicate if masked (wedge/diff-wtd) compound type should be
+   * enabled.
+   */
+  bool enable_masked_comp;
+  /*!
+   * Flag to indicate if smooth interintra mode should be enabled.
+   */
+  bool enable_smooth_interintra;
+  /*!
+   * Flag to indicate if difference-weighted compound type should be enabled.
+   */
+  bool enable_diff_wtd_comp;
+  /*!
+   * Flag to indicate if inter-inter wedge compound type should be enabled.
+   */
+  bool enable_interinter_wedge;
+  /*!
+   * Flag to indicate if inter-intra wedge compound type should be enabled.
+   */
+  bool enable_interintra_wedge;
+} CompoundTypeCfg;
+
+/*!
+ * \brief Encoder config related to frame super-resolution.
+ */
+typedef struct {
+  /*!
+   * Indicates the qindex based threshold to be used when AOM_SUPERRES_QTHRESH
+   * mode is used for inter frames.
+   */
+  int superres_qthresh;
+  /*!
+   * Indicates the qindex based threshold to be used when AOM_SUPERRES_QTHRESH
+   * mode is used for key frames.
+   */
+  int superres_kf_qthresh;
+  /*!
+   * Indicates the denominator of the fraction that specifies the ratio between
+   * the superblock width before and after upscaling for inter frames. The
+   * numerator of this fraction is equal to the constant SCALE_NUMERATOR.
+   */
+  uint8_t superres_scale_denominator;
+  /*!
+   * Indicates the denominator of the fraction that specifies the ratio between
+   * the superblock width before and after upscaling for key frames. The
+   * numerator of this fraction is equal to the constant SCALE_NUMERATOR.
+   */
+  uint8_t superres_kf_scale_denominator;
+  /*!
+   * Indicates the Super-resolution mode to be used by the encoder.
+   */
+  aom_superres_mode superres_mode;
+  /*!
+   * Flag to indicate if super-resolution should be enabled for the sequence.
+   */
+  bool enable_superres;
+} SuperResCfg;
+
+/*!
+ * \brief Encoder config related to the coding of key frames.
+ */
+typedef struct {
+  /*!
+   * Indicates the minimum distance to a key frame.
+   */
+  int key_freq_min;
+
+  /*!
+   * Indicates the maximum distance to a key frame.
+   */
+  int key_freq_max;
+
+  /*!
+   * Indicates if temporal filtering should be applied on keyframes.
+   */
+  int enable_keyframe_filtering;
+
+  /*!
+   * Indicates the number of frames after which a frame may be coded as an
+   * S-Frame.
+   */
+  int sframe_dist;
+
+  /*!
+   * Indicates how an S-Frame should be inserted.
+   * 1: the considered frame will be made into an S-Frame only if it is an
+   * altref frame. 2: the next altref frame will be made into an S-Frame.
+   */
+  int sframe_mode;
+
+  /*!
+   * Indicates if encoder should autodetect cut scenes and set the keyframes.
+   */
+  bool auto_key;
+
+  /*!
+   * Indicates the forward key frame distance.
+   */
+  int fwd_kf_dist;
+
+  /*!
+   * Indicates if forward keyframe reference should be enabled.
+   */
+  bool fwd_kf_enabled;
+
+  /*!
+   * Indicates if S-Frames should be enabled for the sequence.
+   */
+  bool enable_sframe;
+
+  /*!
+   * Indicates if intra block copy prediction mode should be enabled or not.
+   */
+  bool enable_intrabc;
+} KeyFrameCfg;
+
+/*!
+ * \brief Encoder rate control configuration parameters
+ */
+typedef struct {
+  /*!\cond */
+  // BUFFERING PARAMETERS
+  /*!\endcond */
+  /*!
+   * Indicates the amount of data that will be buffered by the decoding
+   * application prior to beginning playback, and is expressed in units of
+   * time (milliseconds).
+   */
+  int64_t starting_buffer_level_ms;
+  /*!
+   * Indicates the amount of data that the encoder should try to maintain in the
+   * decoder's buffer, and is expressed in units of time (milliseconds).
+   */
+  int64_t optimal_buffer_level_ms;
+  /*!
+   * Indicates the maximum amount of data that may be buffered by the decoding
+   * application, and is expressed in units of time (milliseconds).
+   */
+  int64_t maximum_buffer_size_ms;
+
+  /*!
+   * Indicates the bandwidth to be used in bits per second.
+   */
+  int64_t target_bandwidth;
+
+  /*!
+   * Indicates average complexity of the corpus in single pass vbr based on
+   * LAP. 0 indicates that corpus complexity vbr mode is disabled.
+   */
+  unsigned int vbr_corpus_complexity_lap;
+  /*!
+   * Indicates the maximum allowed bitrate for any intra frame as % of bitrate
+   * target.
+   */
+  unsigned int max_intra_bitrate_pct;
+  /*!
+   * Indicates the maximum allowed bitrate for any inter frame as % of bitrate
+   * target.
+   */
+  unsigned int max_inter_bitrate_pct;
+  /*!
+   * Indicates the percentage of rate boost for golden frame in CBR mode.
+   */
+  unsigned int gf_cbr_boost_pct;
+  /*!
+   * min_cr / 100 indicates the target minimum compression ratio for each
+   * frame.
+   */
+  unsigned int min_cr;
+  /*!
+   * Indicates the frame drop threshold.
+   */
+  int drop_frames_water_mark;
+  /*!
+   * under_shoot_pct indicates the tolerance of the VBR algorithm to
+   * undershoot and is used as a trigger threshold for more aggressive
+   * adaptation of Q. Its value can range from 0-100.
+   */
+  int under_shoot_pct;
+  /*!
+   * over_shoot_pct indicates the tolerance of the VBR algorithm to overshoot
+   * and is used as a trigger threshold for more aggressive adaptation of Q.
+   * Its value can range from 0-1000.
+   */
+  int over_shoot_pct;
+  /*!
+   * Indicates the maximum qindex that can be used by the quantizer i.e. the
+   * worst quality qindex.
+   */
+  int worst_allowed_q;
+  /*!
+   * Indicates the minimum qindex that can be used by the quantizer i.e. the
+   * best quality qindex.
+   */
+  int best_allowed_q;
+  /*!
+   * Indicates the Constant/Constrained Quality level.
+   */
+  int cq_level;
+  /*!
+   * Indicates if the encoding mode is vbr, cbr, constrained quality or
+   * constant quality.
+   */
+  enum aom_rc_mode mode;
+  /*!
+   * Indicates the bias (expressed on a scale of 0 to 100) for determining
+   * target size for the current frame. The value 0 indicates the optimal CBR
+   * mode value should be used, and 100 indicates the optimal VBR mode value
+   * should be used.
+   */
+  int vbrbias;
+  /*!
+   * Indicates the minimum bitrate to be used for a single frame as a percentage
+   * of the target bitrate.
+   */
+  int vbrmin_section;
+  /*!
+   * Indicates the maximum bitrate to be used for a single frame as a percentage
+   * of the target bitrate.
+   */
+  int vbrmax_section;
+} RateControlCfg;
+
+/*!\cond */
+typedef struct {
+  // Indicates the number of frames lag before encoding is started.
+  int lag_in_frames;
+  // Indicates the minimum gf/arf interval to be used.
+  int min_gf_interval;
+  // Indicates the maximum gf/arf interval to be used.
+  int max_gf_interval;
+  // Indicates the minimum height for GF group pyramid structure to be used.
+  int gf_min_pyr_height;
+  // Indicates the maximum height for GF group pyramid structure to be used.
+  int gf_max_pyr_height;
+  // Indicates if automatic set and use of altref frames should be enabled.
+  bool enable_auto_arf;
+  // Indicates if automatic set and use of (b)ackward (r)ef (f)rames should be
+  // enabled.
+  bool enable_auto_brf;
+} GFConfig;
+
+typedef struct {
+  // Indicates the number of tile groups.
+  unsigned int num_tile_groups;
+  // Indicates the MTU size for a tile group. If mtu is non-zero,
+  // num_tile_groups is set to DEFAULT_MAX_NUM_TG.
+  unsigned int mtu;
+  // Indicates the number of tile columns in log2.
+  int tile_columns;
+  // Indicates the number of tile rows in log2.
+  int tile_rows;
+  // Indicates the number of widths in the tile_widths[] array.
+  int tile_width_count;
+  // Indicates the number of heights in the tile_heights[] array.
+  int tile_height_count;
+  // Indicates the tile widths, and may be empty.
+  int tile_widths[MAX_TILE_COLS];
+  // Indicates the tile heights, and may be empty.
+  int tile_heights[MAX_TILE_ROWS];
+  // Indicates if large scale tile coding should be used.
+  bool enable_large_scale_tile;
+  // Indicates if single tile decoding mode should be enabled.
+  bool enable_single_tile_decoding;
+  // Indicates if EXT_TILE_DEBUG should be enabled.
+  bool enable_ext_tile_debug;
+} TileConfig;
+
+typedef struct {
+  // Indicates the width of the input frame.
+  int width;
+  // Indicates the height of the input frame.
+  int height;
+  // If forced_max_frame_width is non-zero then it is used to force the maximum
+  // frame width written in write_sequence_header().
+  int forced_max_frame_width;
+  // If forced_max_frame_height is non-zero then it is used to force the maximum
+  // frame height written in write_sequence_header().
+  int forced_max_frame_height;
+  // Indicates the frame width after applying both super-resolution and resize
+  // to the coded frame.
+  int render_width;
+  // Indicates the frame height after applying both super-resolution and resize
+  // to the coded frame.
+  int render_height;
+} FrameDimensionCfg;
+
+typedef struct {
+  // Indicates if warped motion should be enabled.
+  bool enable_warped_motion;
+  // Indicates if warped motion should be evaluated or not.
+  bool allow_warped_motion;
+  // Indicates if OBMC motion should be enabled.
+  bool enable_obmc;
+} MotionModeCfg;
+
+typedef struct {
+  // Timing info for each frame.
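+  // (aom_timing_info_t carries num_units_in_display_tick, time_scale,
+  // equal_picture_interval and num_ticks_per_picture, as written into the
+  // sequence header.)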
+  aom_timing_info_t timing_info;
+  // Indicates the number of time units of a decoding clock.
+  uint32_t num_units_in_decoding_tick;
+  // Indicates if decoder model information is present in the coded sequence
+  // header.
+  bool decoder_model_info_present_flag;
+  // Indicates if display model information is present in the coded sequence
+  // header.
+  bool display_model_info_present_flag;
+  // Indicates if timing info for each frame is present.
+  bool timing_info_present;
+} DecoderModelCfg;
+
+typedef struct {
+  // Indicates the update frequency for coeff costs.
+  COST_UPDATE_TYPE coeff;
+  // Indicates the update frequency for mode costs.
+  COST_UPDATE_TYPE mode;
+  // Indicates the update frequency for mv costs.
+  COST_UPDATE_TYPE mv;
+  // Indicates the update frequency for dv costs.
+  COST_UPDATE_TYPE dv;
+} CostUpdateFreq;
+
+typedef struct {
+  // Indicates the maximum number of reference frames allowed per frame.
+  unsigned int max_reference_frames;
+  // Indicates if the reduced set of references should be enabled.
+  bool enable_reduced_reference_set;
+  // Indicates if one-sided compound should be enabled.
+  bool enable_onesided_comp;
+} RefFrameCfg;
+
+typedef struct {
+  // Indicates the color space that should be used.
+  aom_color_primaries_t color_primaries;
+  // Indicates the characteristics of transfer function to be used.
+  aom_transfer_characteristics_t transfer_characteristics;
+  // Indicates the matrix coefficients to be used for the transfer function.
+  aom_matrix_coefficients_t matrix_coefficients;
+  // Indicates the chroma 4:2:0 sample position info.
+  aom_chroma_sample_position_t chroma_sample_position;
+  // Indicates if a limited color range or full color range should be used.
+  aom_color_range_t color_range;
+} ColorCfg;
+
+typedef struct {
+  // Indicates if extreme motion vector unit test should be enabled or not.
+  unsigned int motion_vector_unit_test;
+  // Indicates if superblock multipass unit test should be enabled or not.
+  unsigned int sb_multipass_unit_test;
+} UnitTestCfg;
+
+typedef struct {
+  // Indicates the file path to the VMAF model.
+  const char *vmaf_model_path;
+  // Indicates the path to the film grain parameters.
+  const char *film_grain_table_filename;
+  // Indicates the visual tuning metric.
+  aom_tune_metric tuning;
+  // Indicates if the current content is screen or default type.
+  aom_tune_content content;
+  // Indicates the film grain parameters.
+  int film_grain_test_vector;
+  // Indicates the in-block distortion metric to use.
+  aom_dist_metric dist_metric;
+} TuneCfg;
+
+typedef struct {
+  // Indicates the framerate of the input video.
+  double init_framerate;
+  // Indicates the bit-depth of the input video.
+  unsigned int input_bit_depth;
+  // Indicates the maximum number of frames to be encoded.
+  unsigned int limit;
+  // Indicates the chroma subsampling x value.
+  unsigned int chroma_subsampling_x;
+  // Indicates the chroma subsampling y value.
+  unsigned int chroma_subsampling_y;
+} InputCfg;
+
+typedef struct {
+  // If true, encoder will use fixed QP offsets, that are either:
+  // - Given by the user, and stored in 'fixed_qp_offsets' array, OR
+  // - Picked automatically from cq_level.
+  int use_fixed_qp_offsets;
+  // Indicates the minimum flatness of the quantization matrix.
+  int qm_minlevel;
+  // Indicates the maximum flatness of the quantization matrix.
+  int qm_maxlevel;
+  // Indicates if adaptive quantize_b should be enabled.
+  int quant_b_adapt;
+  // Indicates the Adaptive Quantization mode to be used.
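+  // (One of NO_AQ, VARIANCE_AQ, COMPLEXITY_AQ or CYCLIC_REFRESH_AQ; see the
+  // AQ_MODE enum above.)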
+  AQ_MODE aq_mode;
+  // Indicates the delta q mode to be used.
+  DELTAQ_MODE deltaq_mode;
+  // Indicates the delta q mode strength.
+  DELTAQ_MODE deltaq_strength;
+  // Indicates if delta quantization should be enabled in chroma planes.
+  bool enable_chroma_deltaq;
+  // Indicates if delta quantization should be enabled for hdr video
+  bool enable_hdr_deltaq;
+  // Indicates if encoding with quantization matrices should be enabled.
+  bool using_qm;
+} QuantizationCfg;
+
+/*!\endcond */
+/*!
+ * \brief Algorithm configuration parameters.
+ */
+typedef struct {
+  /*!
+   * Controls the level at which rate-distortion optimization of transform
+   * coefficients favours sharpness in the block. Has no impact on RD when set
+   * to zero (default). For values 1-7, eob and skip block optimization are
+   * avoided and rdmult is adjusted in favour of block sharpness.
+   */
+  int sharpness;
+
+  /*!
+   * Indicates the trellis optimization mode of quantized coefficients.
+   * 0: disabled
+   * 1: enabled
+   * 2: enabled for rd search
+   * 3: true for estimate yrd search
+   */
+  int disable_trellis_quant;
+
+  /*!
+   * The maximum number of frames used to create an arf.
+   */
+  int arnr_max_frames;
+
+  /*!
+   * The temporal filter strength for arf used when creating ARFs.
+   */
+  int arnr_strength;
+
+  /*!
+   * Indicates the CDF update mode
+   * 0: no update
+   * 1: update on every frame (default)
+   * 2: selectively update
+   */
+  uint8_t cdf_update_mode;
+
+  /*!
+   * Indicates if RDO based on frame temporal dependency should be enabled.
+   */
+  bool enable_tpl_model;
+
+  /*!
+   * Indicates if coding of overlay frames for filtered ALTREF frames is
+   * enabled.
+   */
+  bool enable_overlay;
+
+  /*!
+   * Controls loop filtering
+   * 0: Loop filter is disabled for all frames
+   * 1: Loop filter is enabled for all frames
+   * 2: Loop filter is disabled for non-reference frames
+   * 3: Loop filter is disabled for the frames with low motion
+   */
+  LOOPFILTER_CONTROL loopfilter_control;
+
+  /*!
+   * Indicates if the application of post-processing filters should be skipped
+   * on reconstructed frame.
+   */
+  bool skip_postproc_filtering;
+} AlgoCfg;
+/*!\cond */
+
+typedef struct {
+  // Indicates the codec bit-depth.
+  aom_bit_depth_t bit_depth;
+  // Indicates the superblock size that should be used by the encoder.
+  aom_superblock_size_t superblock_size;
+  // Indicates if loopfilter modulation should be enabled.
+  bool enable_deltalf_mode;
+  // Indicates how CDEF should be applied.
+  CDEF_CONTROL cdef_control;
+  // Indicates if loop restoration filter should be enabled.
+  bool enable_restoration;
+  // When enabled, video mode should be used even for single frame input.
+  bool force_video_mode;
+  // Indicates if the error resiliency features should be enabled.
+  bool error_resilient_mode;
+  // Indicates if frame parallel decoding feature should be enabled.
+  bool frame_parallel_decoding_mode;
+  // Indicates if the input should be encoded as monochrome.
+  bool enable_monochrome;
+  // When enabled, the encoder will use a full header even for still pictures.
+  // When disabled, a reduced header is used for still pictures.
+  bool full_still_picture_hdr;
+  // Indicates if dual interpolation filters should be enabled.
+  bool enable_dual_filter;
+  // Indicates if frame order hint should be enabled or not.
+  bool enable_order_hint;
+  // Indicates if ref_frame_mvs should be enabled at the sequence level.
+  bool ref_frame_mvs_present;
+  // Indicates if ref_frame_mvs should be enabled at the frame level.
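+  // (At run time this flag is further combined with the external
+  // AOM_EFLAG_NO_REF_FRAME_MVS flag when setting ext_flags->use_ref_frame_mvs;
+  // see the handling in encoder.c earlier in this patch.)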
+  bool enable_ref_frame_mvs;
+  // Indicates if interintra compound mode is enabled.
+  bool enable_interintra_comp;
+  // Indicates if global motion should be enabled.
+  bool enable_global_motion;
+  // Indicates if palette should be enabled.
+  bool enable_palette;
+} ToolCfg;
+
+/*!\endcond */
+/*!
+ * \brief Main encoder configuration data structure.
+ */
+typedef struct AV1EncoderConfig {
+  /*!\cond */
+  // Configuration related to the input video.
+  InputCfg input_cfg;
+
+  // Configuration related to frame-dimensions.
+  FrameDimensionCfg frm_dim_cfg;
+
+  /*!\endcond */
+  /*!
+   * Encoder algorithm configuration.
+   */
+  AlgoCfg algo_cfg;
+
+  /*!
+   * Configuration related to key-frames.
+   */
+  KeyFrameCfg kf_cfg;
+
+  /*!
+   * Rate control configuration
+   */
+  RateControlCfg rc_cfg;
+  /*!\cond */
+
+  // Configuration related to Quantization.
+  QuantizationCfg q_cfg;
+
+  // Internal frame size scaling.
+  ResizeCfg resize_cfg;
+
+  // Frame Super-Resolution size scaling.
+  SuperResCfg superres_cfg;
+
+  /*!\endcond */
+  /*!
+   * stats_in buffer contains all of the stats packets produced in the first
+   * pass, concatenated.
+   */
+  aom_fixed_buf_t twopass_stats_in;
+  /*!\cond */
+
+  // Configuration related to encoder toolsets.
+  ToolCfg tool_cfg;
+
+  // Configuration related to Group of frames.
+  GFConfig gf_cfg;
+
+  // Tile related configuration parameters.
+  TileConfig tile_cfg;
+
+  // Configuration related to Tune.
+  TuneCfg tune_cfg;
+
+  // Configuration related to color.
+  ColorCfg color_cfg;
+
+  // Configuration related to decoder model.
+  DecoderModelCfg dec_model_cfg;
+
+  // Configuration related to reference frames.
+  RefFrameCfg ref_frm_cfg;
+
+  // Configuration related to unit tests.
+  UnitTestCfg unit_test_cfg;
+
+  // Flags related to motion mode.
+  MotionModeCfg motion_mode_cfg;
+
+  // Flags related to intra mode search.
+  IntraModeCfg intra_mode_cfg;
+
+  // Flags related to transform size/type.
+  TxfmSizeTypeCfg txfm_cfg;
+
+  // Flags related to compound type.
+  CompoundTypeCfg comp_type_cfg;
+
+  // Partition related information.
+  PartitionCfg part_cfg;
+
+  // Configuration related to frequency of cost update.
+  CostUpdateFreq cost_upd_freq;
+
+#if CONFIG_DENOISE
+  // Indicates the noise level.
+  float noise_level;
+  // Indicates the denoiser's block size.
+  int noise_block_size;
+  // Indicates whether to apply denoising to the frame to be encoded.
+  int enable_dnl_denoising;
+#endif
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+  // Noise sensitivity.
+  int noise_sensitivity;
+#endif
+  // Bit mask to specify which tier each of the 32 possible operating points
+  // conforms to.
+  unsigned int tier_mask;
+
+  // Indicates the number of pixels off the edge of a reference frame we're
+  // allowed to go when forming an inter prediction.
+  int border_in_pixels;
+
+  // Indicates the maximum number of threads that may be used by the encoder.
+  int max_threads;
+
+  // Indicates the speed preset to be used.
+  int speed;
+
+  // Indicates the target sequence level index for each operating point (OP).
+  AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS];
+
+  // Indicates the bitstream profile to be used.
+  BITSTREAM_PROFILE profile;
+
+  /*!\endcond */
+  /*!
+   * Indicates the current encoder pass :
+   * AOM_RC_ONE_PASS = One pass encode,
+   * AOM_RC_FIRST_PASS = First pass of multiple-pass
+   * AOM_RC_SECOND_PASS = Second pass of multiple-pass
+   * AOM_RC_THIRD_PASS = Third pass of multiple-pass
+   */
+  enum aom_enc_pass pass;
+  /*!\cond */
+
+  // Total number of encoding passes.
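+  // (1 for a single-pass encode, 2 for a standard two-pass encode, and 3 when
+  // a third pass is enabled, in which case the two_pass_output and
+  // second_pass_log paths below apply.)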
+ int passes; + + // the name of the second pass output file when passes > 2 + const char *two_pass_output; + + // the name of the second pass log file when passes > 2 + const char *second_pass_log; + + // Indicates if the encoding is GOOD or REALTIME. + MODE mode; + + // Indicates if row-based multi-threading should be enabled or not. + bool row_mt; + + // Indicates if frame parallel multi-threading should be enabled or not. + bool fp_mt; + + // Indicates if 16bit frame buffers are to be used i.e., the content is > + // 8-bit. + bool use_highbitdepth; + + // Indicates the bitstream syntax mode. 0 indicates bitstream is saved as + // Section 5 bitstream, while 1 indicates the bitstream is saved in Annex - B + // format. + bool save_as_annexb; + + // The path for partition stats reading and writing, used in the experiment + // CONFIG_PARTITION_SEARCH_ORDER. + const char *partition_info_path; + + // The flag that indicates whether we use an external rate distribution to + // guide adaptive quantization. It requires --deltaq-mode=3. The rate + // distribution map file name is stored in |rate_distribution_info|. + unsigned int enable_rate_guide_deltaq; + + // The input file of rate distribution information used in all intra mode + // to determine delta quantization. + const char *rate_distribution_info; + + // Exit the encoder when it fails to encode to a given level. + int strict_level_conformance; + + // Max depth for the GOP after a key frame + int kf_max_pyr_height; + + // A flag to control if we enable the superblock qp sweep for a given lambda + int sb_qp_sweep; + /*!\endcond */ +} AV1EncoderConfig; + +/*!\cond */ +static INLINE int is_lossless_requested(const RateControlCfg *const rc_cfg) { + return rc_cfg->best_allowed_q == 0 && rc_cfg->worst_allowed_q == 0; +} +/*!\endcond */ + +/*! + * \brief Encoder-side probabilities for pruning of various AV1 tools + */ +typedef struct { + /*! + * obmc_probs[i][j] is the probability of OBMC being the best motion mode for + * jth block size and ith frame update type, averaged over past frames. If + * obmc_probs[i][j] < thresh, then OBMC search is pruned. + */ + int obmc_probs[FRAME_UPDATE_TYPES][BLOCK_SIZES_ALL]; + + /*! + * warped_probs[i] is the probability of warped motion being the best motion + * mode for ith frame update type, averaged over past frames. If + * warped_probs[i] < thresh, then warped motion search is pruned. + */ + int warped_probs[FRAME_UPDATE_TYPES]; + + /*! + * tx_type_probs[i][j][k] is the probability of kth tx_type being the best + * for jth transform size and ith frame update type, averaged over past + * frames. If tx_type_probs[i][j][k] < thresh, then transform search for that + * type is pruned. + */ + int tx_type_probs[FRAME_UPDATE_TYPES][TX_SIZES_ALL][TX_TYPES]; + + /*! + * switchable_interp_probs[i][j][k] is the probability of kth interpolation + * filter being the best for jth filter context and ith frame update type, + * averaged over past frames. If switchable_interp_probs[i][j][k] < thresh, + * then interpolation filter search is pruned for that case. 
+ */ + int switchable_interp_probs[FRAME_UPDATE_TYPES][SWITCHABLE_FILTER_CONTEXTS] + [SWITCHABLE_FILTERS]; +} FrameProbInfo; + +/*!\cond */ + +typedef struct FRAME_COUNTS { +// Note: This structure should only contain 'unsigned int' fields, or +// aggregates built solely from 'unsigned int' fields/elements +#if CONFIG_ENTROPY_STATS + unsigned int kf_y_mode[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS][INTRA_MODES]; + unsigned int angle_delta[DIRECTIONAL_MODES][2 * MAX_ANGLE_DELTA + 1]; + unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES]; + unsigned int uv_mode[CFL_ALLOWED_TYPES][INTRA_MODES][UV_INTRA_MODES]; + unsigned int cfl_sign[CFL_JOINT_SIGNS]; + unsigned int cfl_alpha[CFL_ALPHA_CONTEXTS][CFL_ALPHABET_SIZE]; + unsigned int palette_y_mode[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][2]; + unsigned int palette_uv_mode[PALETTE_UV_MODE_CONTEXTS][2]; + unsigned int palette_y_size[PALATTE_BSIZE_CTXS][PALETTE_SIZES]; + unsigned int palette_uv_size[PALATTE_BSIZE_CTXS][PALETTE_SIZES]; + unsigned int palette_y_color_index[PALETTE_SIZES] + [PALETTE_COLOR_INDEX_CONTEXTS] + [PALETTE_COLORS]; + unsigned int palette_uv_color_index[PALETTE_SIZES] + [PALETTE_COLOR_INDEX_CONTEXTS] + [PALETTE_COLORS]; + unsigned int partition[PARTITION_CONTEXTS][EXT_PARTITION_TYPES]; + unsigned int txb_skip[TOKEN_CDF_Q_CTXS][TX_SIZES][TXB_SKIP_CONTEXTS][2]; + unsigned int eob_extra[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES] + [EOB_COEF_CONTEXTS][2]; + unsigned int dc_sign[PLANE_TYPES][DC_SIGN_CONTEXTS][2]; + unsigned int coeff_lps[TX_SIZES][PLANE_TYPES][BR_CDF_SIZE - 1][LEVEL_CONTEXTS] + [2]; + unsigned int eob_flag[TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS][2]; + unsigned int eob_multi16[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][5]; + unsigned int eob_multi32[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][6]; + unsigned int eob_multi64[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][7]; + unsigned int eob_multi128[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][8]; + unsigned int eob_multi256[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][9]; + unsigned int eob_multi512[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][10]; + unsigned int eob_multi1024[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][11]; + unsigned int coeff_lps_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES] + [LEVEL_CONTEXTS][BR_CDF_SIZE]; + unsigned int coeff_base_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES] + [SIG_COEF_CONTEXTS][NUM_BASE_LEVELS + 2]; + unsigned int coeff_base_eob_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES] + [SIG_COEF_CONTEXTS_EOB][NUM_BASE_LEVELS + 1]; + unsigned int newmv_mode[NEWMV_MODE_CONTEXTS][2]; + unsigned int zeromv_mode[GLOBALMV_MODE_CONTEXTS][2]; + unsigned int refmv_mode[REFMV_MODE_CONTEXTS][2]; + unsigned int drl_mode[DRL_MODE_CONTEXTS][2]; + unsigned int inter_compound_mode[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES]; + unsigned int wedge_idx[BLOCK_SIZES_ALL][16]; + unsigned int interintra[BLOCK_SIZE_GROUPS][2]; + unsigned int interintra_mode[BLOCK_SIZE_GROUPS][INTERINTRA_MODES]; + unsigned int wedge_interintra[BLOCK_SIZES_ALL][2]; + unsigned int compound_type[BLOCK_SIZES_ALL][MASKED_COMPOUND_TYPES]; + unsigned int motion_mode[BLOCK_SIZES_ALL][MOTION_MODES]; + unsigned int obmc[BLOCK_SIZES_ALL][2]; + unsigned int intra_inter[INTRA_INTER_CONTEXTS][2]; + unsigned int comp_inter[COMP_INTER_CONTEXTS][2]; + unsigned int comp_ref_type[COMP_REF_TYPE_CONTEXTS][2]; + unsigned int uni_comp_ref[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1][2]; + unsigned int single_ref[REF_CONTEXTS][SINGLE_REFS - 1][2]; + unsigned int comp_ref[REF_CONTEXTS][FWD_REFS - 1][2]; + unsigned int comp_bwdref[REF_CONTEXTS][BWD_REFS - 1][2]; + unsigned 
int intrabc[2];
+
+  unsigned int txfm_partition[TXFM_PARTITION_CONTEXTS][2];
+  unsigned int intra_tx_size[MAX_TX_CATS][TX_SIZE_CONTEXTS][MAX_TX_DEPTH + 1];
+  unsigned int skip_mode[SKIP_MODE_CONTEXTS][2];
+  unsigned int skip_txfm[SKIP_CONTEXTS][2];
+  unsigned int compound_index[COMP_INDEX_CONTEXTS][2];
+  unsigned int comp_group_idx[COMP_GROUP_IDX_CONTEXTS][2];
+  unsigned int delta_q[DELTA_Q_PROBS][2];
+  unsigned int delta_lf_multi[FRAME_LF_COUNT][DELTA_LF_PROBS][2];
+  unsigned int delta_lf[DELTA_LF_PROBS][2];
+
+  unsigned int inter_ext_tx[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES];
+  unsigned int intra_ext_tx[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
+                           [TX_TYPES];
+  unsigned int filter_intra_mode[FILTER_INTRA_MODES];
+  unsigned int filter_intra[BLOCK_SIZES_ALL][2];
+  unsigned int switchable_restore[RESTORE_SWITCHABLE_TYPES];
+  unsigned int wiener_restore[2];
+  unsigned int sgrproj_restore[2];
+#endif  // CONFIG_ENTROPY_STATS
+
+  unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS]
+                                [SWITCHABLE_FILTERS];
+} FRAME_COUNTS;
+
+#define INTER_MODE_RD_DATA_OVERALL_SIZE 6400
+
+typedef struct {
+  int ready;
+  double a;
+  double b;
+  double dist_mean;
+  double ld_mean;
+  double sse_mean;
+  double sse_sse_mean;
+  double sse_ld_mean;
+  int num;
+  double dist_sum;
+  double ld_sum;
+  double sse_sum;
+  double sse_sse_sum;
+  double sse_ld_sum;
+} InterModeRdModel;
+
+typedef struct {
+  int idx;
+  int64_t rd;
+} RdIdxPair;
+// TODO(angiebird): This is an estimated size. We still need to figure out the
+// maximum number of modes.
+#define MAX_INTER_MODES 1024
+// TODO(any): rename this struct to something else. There is already another
+// struct called inter_mode_info, which makes this terribly confusing.
+/*!\endcond */
+/*!
+ * \brief Struct used to hold inter mode data for fast tx search.
+ *
+ * This struct is used to perform a full transform search only on winning
+ * candidates searched with an estimate for transform coding RD.
+ */
+typedef struct inter_modes_info {
+  /*!
+   * The number of inter modes for which data was stored in each of the
+   * following arrays.
+   */
+  int num;
+  /*!
+   * Mode info struct for each of the candidate modes.
+   */
+  MB_MODE_INFO mbmi_arr[MAX_INTER_MODES];
+  /*!
+   * The rate for each of the candidate modes.
+   */
+  int mode_rate_arr[MAX_INTER_MODES];
+  /*!
+   * The sse of the predictor for each of the candidate modes.
+   */
+  int64_t sse_arr[MAX_INTER_MODES];
+  /*!
+   * The estimated rd of the predictor for each of the candidate modes.
+   */
+  int64_t est_rd_arr[MAX_INTER_MODES];
+  /*!
+   * The rd and mode index for each of the candidate modes.
+   */
+  RdIdxPair rd_idx_pair_arr[MAX_INTER_MODES];
+  /*!
+   * The full rd stats for each of the candidate modes.
+   */
+  RD_STATS rd_cost_arr[MAX_INTER_MODES];
+  /*!
+   * The full rd stats of luma only for each of the candidate modes.
+   */
+  RD_STATS rd_cost_y_arr[MAX_INTER_MODES];
+  /*!
+   * The full rd stats of chroma only for each of the candidate modes.
+   */
+  RD_STATS rd_cost_uv_arr[MAX_INTER_MODES];
+} InterModesInfo;
+
+/*!\cond */
+typedef struct {
+  // TODO(kyslov): consider changing to 64bit
+
+  // This struct is used for computing variance in choose_partitioning(), where
+  // the max number of samples within a superblock is 32x32 (with 4x4 avg).
+  // With 8bit bitdepth, uint32_t is enough for sum_square_error (2^8 * 2^8 * 32
+  // * 32 = 2^26).
For high bitdepth we need to consider changing this to 64 bit + uint32_t sum_square_error; + int32_t sum_error; + int log2_count; + int variance; +} VPartVar; + +typedef struct { + VPartVar none; + VPartVar horz[2]; + VPartVar vert[2]; +} VPVariance; + +typedef struct { + VPVariance part_variances; + VPartVar split[4]; +} VP4x4; + +typedef struct { + VPVariance part_variances; + VP4x4 split[4]; +} VP8x8; + +typedef struct { + VPVariance part_variances; + VP8x8 split[4]; +} VP16x16; + +typedef struct { + VPVariance part_variances; + VP16x16 split[4]; +} VP32x32; + +typedef struct { + VPVariance part_variances; + VP32x32 split[4]; +} VP64x64; + +typedef struct { + VPVariance part_variances; + VP64x64 *split; +} VP128x128; + +/*!\endcond */ + +/*! + * \brief Thresholds for variance based partitioning. + */ +typedef struct { + /*! + * If block variance > threshold, then that block is forced to split. + * thresholds[0] - threshold for 128x128; + * thresholds[1] - threshold for 64x64; + * thresholds[2] - threshold for 32x32; + * thresholds[3] - threshold for 16x16; + * thresholds[4] - threshold for 8x8; + */ + int64_t thresholds[5]; + + /*! + * MinMax variance threshold for 8x8 sub blocks of a 16x16 block. If actual + * minmax > threshold_minmax, the 16x16 is forced to split. + */ + int64_t threshold_minmax; +} VarBasedPartitionInfo; + +/*! + * \brief Encoder parameters for synchronization of row based multi-threading + */ +typedef struct { +#if CONFIG_MULTITHREAD + /** + * \name Synchronization objects for top-right dependency. + */ + /**@{*/ + pthread_mutex_t *mutex_; /*!< Mutex lock object */ + pthread_cond_t *cond_; /*!< Condition variable */ + /**@}*/ +#endif // CONFIG_MULTITHREAD + /*! + * Buffer to store the superblock whose encoding is complete. + * num_finished_cols[i] stores the number of superblocks which finished + * encoding in the ith superblock row. + */ + int *num_finished_cols; + /*! + * Denotes the superblock interval at which conditional signalling should + * happen. Also denotes the minimum number of extra superblocks of the top row + * to be complete to start encoding the current superblock. A value of 1 + * indicates top-right dependency. + */ + int sync_range; + /*! + * Denotes the additional number of superblocks in the previous row to be + * complete to start encoding the current superblock when intraBC tool is + * enabled. This additional top-right delay is required to satisfy the + * hardware constraints for intraBC tool when row multithreading is enabled. + */ + int intrabc_extra_top_right_sb_delay; + /*! + * Number of superblock rows. + */ + int rows; + /*! + * The superblock row (in units of MI blocks) to be processed next. + */ + int next_mi_row; + /*! + * Number of threads processing the current tile. + */ + int num_threads_working; +} AV1EncRowMultiThreadSync; + +/*!\cond */ + +// TODO(jingning) All spatially adaptive variables should go to TileDataEnc. 
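+// A minimal sketch of how the top-right dependency in AV1EncRowMultiThreadSync
+// (above) is intended to be exercised per superblock row. This is illustrative
+// only: `sync_read`, `sync_write`, `encode_superblock`, `r` and `sb_cols` are
+// hypothetical placeholder names; the real reader/writer are installed through
+// the sync_read_ptr/sync_write_ptr members declared further below.
+//
+//   for (int c = 0; c < sb_cols; ++c) {
+//     // Block until the row above has finished at least c + sync_range
+//     // superblocks (plus intrabc_extra_top_right_sb_delay when intraBC is
+//     // enabled).
+//     sync_read(&row_mt_sync, r, c);
+//     encode_superblock(r, c);
+//     // Publish progress via num_finished_cols[r] and signal waiting threads
+//     // so that row r + 1 may proceed.
+//     sync_write(&row_mt_sync, r, c, sb_cols);
+//   }
+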
+typedef struct TileDataEnc { + TileInfo tile_info; + DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx); + FRAME_CONTEXT *row_ctx; + uint64_t abs_sum_level; + uint8_t allow_update_cdf; + InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL]; + AV1EncRowMultiThreadSync row_mt_sync; + MV firstpass_top_mv; +} TileDataEnc; + +typedef struct RD_COUNTS { + int compound_ref_used_flag; + int skip_mode_used_flag; + int tx_type_used[TX_SIZES_ALL][TX_TYPES]; + int obmc_used[BLOCK_SIZES_ALL][2]; + int warped_used[2]; + int newmv_or_intra_blocks; + uint64_t seg_tmp_pred_cost[2]; +} RD_COUNTS; + +typedef struct ThreadData { + MACROBLOCK mb; + MvCosts *mv_costs_alloc; + IntraBCMVCosts *dv_costs_alloc; + RD_COUNTS rd_counts; + FRAME_COUNTS *counts; + PC_TREE_SHARED_BUFFERS shared_coeff_buf; + SIMPLE_MOTION_DATA_TREE *sms_tree; + SIMPLE_MOTION_DATA_TREE *sms_root; + uint32_t *hash_value_buffer[2][2]; + OBMCBuffer obmc_buffer; + PALETTE_BUFFER *palette_buffer; + CompoundTypeRdBuffers comp_rd_buffer; + CONV_BUF_TYPE *tmp_conv_dst; + uint64_t abs_sum_level; + uint8_t *tmp_pred_bufs[2]; + uint8_t *wiener_tmp_pred_buf; + int intrabc_used; + int deltaq_used; + int coefficient_size; + int max_mv_magnitude; + int interp_filter_selected[SWITCHABLE]; + FRAME_CONTEXT *tctx; + VP64x64 *vt64x64; + int32_t num_64x64_blocks; + PICK_MODE_CONTEXT *firstpass_ctx; + TemporalFilterData tf_data; + TplBuffers tpl_tmp_buffers; + TplTxfmStats tpl_txfm_stats; + GlobalMotionData gm_data; + // Pointer to the array of structures to store gradient information of each + // pixel in a superblock. The buffer constitutes of MAX_SB_SQUARE pixel level + // structures for each of the plane types (PLANE_TYPE_Y and PLANE_TYPE_UV). + PixelLevelGradientInfo *pixel_gradient_info; + // Pointer to the array of structures to store source variance information of + // each 4x4 sub-block in a superblock. Block4x4VarInfo structure is used to + // store source variance and log of source variance of each 4x4 sub-block + // for subsequent retrieval. + Block4x4VarInfo *src_var_info_of_4x4_sub_blocks; + // Pointer to pc tree root. + PC_TREE *pc_root; +} ThreadData; + +struct EncWorkerData; + +/*!\endcond */ + +/*! + * \brief Encoder data related to row-based multi-threading + */ +typedef struct { + /*! + * Number of tile rows for which row synchronization memory is allocated. + */ + int allocated_tile_rows; + /*! + * Number of tile cols for which row synchronization memory is allocated. + */ + int allocated_tile_cols; + /*! + * Number of rows for which row synchronization memory is allocated + * per tile. During first-pass/look-ahead stage this equals the + * maximum number of macroblock rows in a tile. During encode stage, + * this equals the maximum number of superblock rows in a tile. + */ + int allocated_rows; + /*! + * Number of columns for which entropy context memory is allocated + * per tile. During encode stage, this equals the maximum number of + * superblock columns in a tile minus 1. The entropy context memory + * is not allocated during first-pass/look-ahead stage. + */ + int allocated_cols; + + /*! + * thread_id_to_tile_id[i] indicates the tile id assigned to the ith thread. + */ + int thread_id_to_tile_id[MAX_NUM_THREADS]; + + /*! + * num_tile_cols_done[i] indicates the number of tile columns whose encoding + * is complete in the ith superblock row. + */ + int *num_tile_cols_done; + + /*! + * Number of superblock rows in a frame for which 'num_tile_cols_done' is + * allocated. + */ + int allocated_sb_rows; + + /*! 
+   * Initialized to false, set to true by the worker thread that encounters an
+   * error in order to abort the processing of other worker threads.
+   */
+  bool row_mt_exit;
+
+  /*!
+   * Initialized to false, set to true during first pass encoding by the worker
+   * thread that encounters an error in order to abort the processing of other
+   * worker threads.
+   */
+  bool firstpass_mt_exit;
+
+  /*!
+   * Initialized to false, set to true in cal_mb_wiener_var_hook() by the worker
+   * thread that encounters an error in order to abort the processing of other
+   * worker threads.
+   */
+  bool mb_wiener_mt_exit;
+
+#if CONFIG_MULTITHREAD
+  /*!
+   * Mutex lock used while dispatching jobs.
+   */
+  pthread_mutex_t *mutex_;
+  /*!
+   * Condition variable used to dispatch jobs.
+   */
+  pthread_cond_t *cond_;
+#endif
+
+  /**
+   * \name Row synchronization related function pointers.
+   */
+  /**@{*/
+  /*!
+   * Reader.
+   */
+  void (*sync_read_ptr)(AV1EncRowMultiThreadSync *const, int, int);
+  /*!
+   * Writer.
+   */
+  void (*sync_write_ptr)(AV1EncRowMultiThreadSync *const, int, int, int);
+  /**@}*/
+} AV1EncRowMultiThreadInfo;
+
+/*!
+ * \brief Encoder data related to multi-threading for allintra deltaq-mode=3
+ */
+typedef struct {
+#if CONFIG_MULTITHREAD
+  /*!
+   * Mutex lock used while dispatching jobs.
+   */
+  pthread_mutex_t *mutex_;
+  /*!
+   * Condition variable used to dispatch jobs.
+   */
+  pthread_cond_t *cond_;
+#endif
+
+  /**
+   * \name Row synchronization related function pointers for all intra mode
+   */
+  /**@{*/
+  /*!
+   * Reader.
+   */
+  void (*intra_sync_read_ptr)(AV1EncRowMultiThreadSync *const, int, int);
+  /*!
+   * Writer.
+   */
+  void (*intra_sync_write_ptr)(AV1EncRowMultiThreadSync *const, int, int, int);
+  /**@}*/
+} AV1EncAllIntraMultiThreadInfo;
+
+/*!
+ * \brief Max number of recodes used to track the frame probabilities.
+ */
+#define NUM_RECODES_PER_FRAME 10
+
+/*!
+ * \brief Max number of frames that can be encoded in a parallel encode set.
+ */
+#define MAX_PARALLEL_FRAMES 4
+
+/*!
+ * \brief Buffers to be backed up during parallel encode set to be restored
+ * later.
+ */
+typedef struct RestoreStateBuffers {
+  /*!
+   * Backup of original CDEF srcbuf.
+   */
+  uint16_t *cdef_srcbuf;
+
+  /*!
+   * Backup of original CDEF colbuf.
+   */
+  uint16_t *cdef_colbuf[MAX_MB_PLANE];
+
+  /*!
+   * Backup of original LR rst_tmpbuf.
+   */
+  int32_t *rst_tmpbuf;
+
+  /*!
+   * Backup of original LR rlbs.
+   */
+  RestorationLineBuffers *rlbs;
+} RestoreStateBuffers;
+
+/*!
+ * \brief Parameters related to restoration types.
+ */
+typedef struct {
+  /*!
+   * Stores the best coefficients for Wiener restoration.
+   */
+  WienerInfo wiener;
+
+  /*!
+   * Stores the best coefficients for Sgrproj restoration.
+   */
+  SgrprojInfo sgrproj;
+
+  /*!
+   * The rtype to use for this unit given a frame rtype as index. Indices:
+   * WIENER, SGRPROJ, SWITCHABLE.
+   */
+  RestorationType best_rtype[RESTORE_TYPES - 1];
+} RestUnitSearchInfo;
+
+/*!
+ * \brief Structure to hold search parameter per restoration unit and
+ * intermediate buffer of Wiener filter used in pick filter stage of Loop
+ * restoration.
+ */
+typedef struct {
+  /*!
+   * Array of pointers to 'RestUnitSearchInfo' which holds data related to
+   * restoration types.
+   */
+  RestUnitSearchInfo *rusi[MAX_MB_PLANE];
+
+  /*!
+   * Buffer used to hold dgd-avg data during SIMD call of Wiener filter.
+   */
+  int16_t *dgd_avg;
+} AV1LrPickStruct;
+
+/*!
+ * \brief Primary Encoder parameters related to multi-threading.
+ */
+typedef struct PrimaryMultiThreadInfo {
+  /*!
+ * Number of workers created for multi-threading. + */ + int num_workers; + + /*! + * Number of workers used for different MT modules. + */ + int num_mod_workers[NUM_MT_MODULES]; + + /*! + * Synchronization object used to launch job in the worker thread. + */ + AVxWorker *workers; + + /*! + * Data specific to each worker in encoder multi-threading. + * tile_thr_data[i] stores the worker data of the ith thread. + */ + struct EncWorkerData *tile_thr_data; + + /*! + * CDEF row multi-threading data. + */ + AV1CdefWorkerData *cdef_worker; + + /*! + * Primary(Level 1) Synchronization object used to launch job in the worker + * thread. + */ + AVxWorker *p_workers[MAX_PARALLEL_FRAMES]; + + /*! + * Number of primary workers created for multi-threading. + */ + int p_num_workers; + + /*! + * Tracks the number of workers in encode stage multi-threading. + */ + int prev_num_enc_workers; +} PrimaryMultiThreadInfo; + +/*! + * \brief Encoder parameters related to multi-threading. + */ +typedef struct MultiThreadInfo { + /*! + * Number of workers created for multi-threading. + */ + int num_workers; + + /*! + * Number of workers used for different MT modules. + */ + int num_mod_workers[NUM_MT_MODULES]; + + /*! + * Synchronization object used to launch job in the worker thread. + */ + AVxWorker *workers; + + /*! + * Data specific to each worker in encoder multi-threading. + * tile_thr_data[i] stores the worker data of the ith thread. + */ + struct EncWorkerData *tile_thr_data; + + /*! + * When set, indicates that row based multi-threading of the encoder is + * enabled. + */ + bool row_mt_enabled; + + /*! + * When set, indicates that multi-threading for bitstream packing is enabled. + */ + bool pack_bs_mt_enabled; + + /*! + * Encoder row multi-threading data. + */ + AV1EncRowMultiThreadInfo enc_row_mt; + + /*! + * Encoder multi-threading data for allintra mode in the preprocessing stage + * when --deltaq-mode=3. + */ + AV1EncAllIntraMultiThreadInfo intra_mt; + + /*! + * Tpl row multi-threading data. + */ + AV1TplRowMultiThreadInfo tpl_row_mt; + + /*! + * Loop Filter multi-threading object. + */ + AV1LfSync lf_row_sync; + + /*! + * Loop Restoration multi-threading object. + */ + AV1LrSync lr_row_sync; + + /*! + * Pack bitstream multi-threading object. + */ + AV1EncPackBSSync pack_bs_sync; + + /*! + * Global Motion multi-threading object. + */ + AV1GlobalMotionSync gm_sync; + + /*! + * Temporal Filter multi-threading object. + */ + AV1TemporalFilterSync tf_sync; + + /*! + * CDEF search multi-threading object. + */ + AV1CdefSync cdef_sync; + + /*! + * Pointer to CDEF row multi-threading data for the frame. + */ + AV1CdefWorkerData *cdef_worker; + + /*! + * Buffers to be stored/restored before/after parallel encode. + */ + RestoreStateBuffers restore_state_buf; + + /*! + * In multi-threaded realtime encoding with row-mt enabled, pipeline + * loop-filtering after encoding. + */ + int pipeline_lpf_mt_with_enc; +} MultiThreadInfo; + +/*!\cond */ + +typedef struct ActiveMap { + int enabled; + int update; + unsigned char *map; +} ActiveMap; + +/*!\endcond */ + +/*! + * \brief Encoder info used for decision on forcing integer motion vectors. + */ +typedef struct { + /*! + * cs_rate_array[i] is the fraction of blocks in a frame which either match + * with the collocated block or are smooth, where i is the rate_index. + */ + double cs_rate_array[32]; + /*! + * rate_index is used to index cs_rate_array. + */ + int rate_index; + /*! + * rate_size is the total number of entries populated in cs_rate_array. 
+ */ + int rate_size; +} ForceIntegerMVInfo; + +/*!\cond */ + +#if CONFIG_INTERNAL_STATS +// types of stats +enum { + STAT_Y, + STAT_U, + STAT_V, + STAT_ALL, + NUM_STAT_TYPES // This should always be the last member of the enum +} UENUM1BYTE(StatType); + +typedef struct IMAGE_STAT { + double stat[NUM_STAT_TYPES]; + double worst; +} ImageStat; +#endif // CONFIG_INTERNAL_STATS + +typedef struct { + int ref_count; + YV12_BUFFER_CONFIG buf; +} EncRefCntBuffer; + +/*!\endcond */ + +/*! + * \brief Buffer to store mode information at mi_alloc_bsize (4x4 or 8x8) level + * + * This is used for bitstream preparation. + */ +typedef struct { + /*! + * frame_base[mi_row * stride + mi_col] stores the mode information of + * block (mi_row,mi_col). + */ + MB_MODE_INFO_EXT_FRAME *frame_base; + /*! + * Size of frame_base buffer. + */ + int alloc_size; + /*! + * Stride of frame_base buffer. + */ + int stride; +} MBMIExtFrameBufferInfo; + +/*!\cond */ + +#if CONFIG_COLLECT_PARTITION_STATS +typedef struct FramePartitionTimingStats { + int partition_decisions[6][EXT_PARTITION_TYPES]; + int partition_attempts[6][EXT_PARTITION_TYPES]; + int64_t partition_times[6][EXT_PARTITION_TYPES]; + + int partition_redo; +} FramePartitionTimingStats; +#endif // CONFIG_COLLECT_PARTITION_STATS + +#if CONFIG_COLLECT_COMPONENT_TIMING +#include "aom_ports/aom_timer.h" +// Adjust the following to add new components. +enum { + av1_encode_strategy_time, + av1_get_one_pass_rt_params_time, + av1_get_second_pass_params_time, + denoise_and_encode_time, + apply_filtering_time, + av1_tpl_setup_stats_time, + encode_frame_to_data_rate_time, + encode_with_or_without_recode_time, + loop_filter_time, + cdef_time, + loop_restoration_time, + av1_pack_bitstream_final_time, + av1_encode_frame_time, + av1_compute_global_motion_time, + av1_setup_motion_field_time, + encode_sb_row_time, + + rd_pick_partition_time, + rd_use_partition_time, + choose_var_based_partitioning_time, + av1_prune_partitions_time, + none_partition_search_time, + split_partition_search_time, + rectangular_partition_search_time, + ab_partitions_search_time, + rd_pick_4partition_time, + encode_sb_time, + + rd_pick_sb_modes_time, + av1_rd_pick_intra_mode_sb_time, + av1_rd_pick_inter_mode_sb_time, + set_params_rd_pick_inter_mode_time, + skip_inter_mode_time, + handle_inter_mode_time, + evaluate_motion_mode_for_winner_candidates_time, + do_tx_search_time, + handle_intra_mode_time, + refine_winner_mode_tx_time, + av1_search_palette_mode_time, + handle_newmv_time, + compound_type_rd_time, + interpolation_filter_search_time, + motion_mode_rd_time, + + nonrd_use_partition_time, + pick_sb_modes_nonrd_time, + hybrid_intra_mode_search_time, + nonrd_pick_inter_mode_sb_time, + encode_b_nonrd_time, + + kTimingComponents, +} UENUM1BYTE(TIMING_COMPONENT); + +static INLINE char const *get_component_name(int index) { + switch (index) { + case av1_encode_strategy_time: return "av1_encode_strategy_time"; + case av1_get_one_pass_rt_params_time: + return "av1_get_one_pass_rt_params_time"; + case av1_get_second_pass_params_time: + return "av1_get_second_pass_params_time"; + case denoise_and_encode_time: return "denoise_and_encode_time"; + case apply_filtering_time: return "apply_filtering_time"; + case av1_tpl_setup_stats_time: return "av1_tpl_setup_stats_time"; + case encode_frame_to_data_rate_time: + return "encode_frame_to_data_rate_time"; + case encode_with_or_without_recode_time: + return "encode_with_or_without_recode_time"; + case loop_filter_time: return "loop_filter_time"; + case cdef_time: 
return "cdef_time"; + case loop_restoration_time: return "loop_restoration_time"; + case av1_pack_bitstream_final_time: return "av1_pack_bitstream_final_time"; + case av1_encode_frame_time: return "av1_encode_frame_time"; + case av1_compute_global_motion_time: + return "av1_compute_global_motion_time"; + case av1_setup_motion_field_time: return "av1_setup_motion_field_time"; + case encode_sb_row_time: return "encode_sb_row_time"; + + case rd_pick_partition_time: return "rd_pick_partition_time"; + case rd_use_partition_time: return "rd_use_partition_time"; + case choose_var_based_partitioning_time: + return "choose_var_based_partitioning_time"; + case av1_prune_partitions_time: return "av1_prune_partitions_time"; + case none_partition_search_time: return "none_partition_search_time"; + case split_partition_search_time: return "split_partition_search_time"; + case rectangular_partition_search_time: + return "rectangular_partition_search_time"; + case ab_partitions_search_time: return "ab_partitions_search_time"; + case rd_pick_4partition_time: return "rd_pick_4partition_time"; + case encode_sb_time: return "encode_sb_time"; + + case rd_pick_sb_modes_time: return "rd_pick_sb_modes_time"; + case av1_rd_pick_intra_mode_sb_time: + return "av1_rd_pick_intra_mode_sb_time"; + case av1_rd_pick_inter_mode_sb_time: + return "av1_rd_pick_inter_mode_sb_time"; + case set_params_rd_pick_inter_mode_time: + return "set_params_rd_pick_inter_mode_time"; + case skip_inter_mode_time: return "skip_inter_mode_time"; + case handle_inter_mode_time: return "handle_inter_mode_time"; + case evaluate_motion_mode_for_winner_candidates_time: + return "evaluate_motion_mode_for_winner_candidates_time"; + case do_tx_search_time: return "do_tx_search_time"; + case handle_intra_mode_time: return "handle_intra_mode_time"; + case refine_winner_mode_tx_time: return "refine_winner_mode_tx_time"; + case av1_search_palette_mode_time: return "av1_search_palette_mode_time"; + case handle_newmv_time: return "handle_newmv_time"; + case compound_type_rd_time: return "compound_type_rd_time"; + case interpolation_filter_search_time: + return "interpolation_filter_search_time"; + case motion_mode_rd_time: return "motion_mode_rd_time"; + + case nonrd_use_partition_time: return "nonrd_use_partition_time"; + case pick_sb_modes_nonrd_time: return "pick_sb_modes_nonrd_time"; + case hybrid_intra_mode_search_time: return "hybrid_intra_mode_search_time"; + case nonrd_pick_inter_mode_sb_time: return "nonrd_pick_inter_mode_sb_time"; + case encode_b_nonrd_time: return "encode_b_nonrd_time"; + + default: assert(0); + } + return "error"; +} +#endif + +// The maximum number of internal ARFs except ALTREF_FRAME +#define MAX_INTERNAL_ARFS (REF_FRAMES - BWDREF_FRAME - 1) + +/*!\endcond */ + +/*! + * \brief Parameters related to global motion search + */ +typedef struct { + /*! + * Flag to indicate if global motion search needs to be rerun. + */ + bool search_done; + + /*! + * Array of pointers to the frame buffers holding the reference frames. + * ref_buf[i] stores the pointer to the reference frame of the ith + * reference frame type. + */ + YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES]; + + /*! + * Holds the number of valid reference frames in past and future directions + * w.r.t. the current frame. num_ref_frames[i] stores the total number of + * valid reference frames in 'i' direction. + */ + int num_ref_frames[MAX_DIRECTIONS]; + + /*! 
+   * Array of structures which store the valid reference frames in past and
+   * future directions and their corresponding distance from the source frame.
+   * reference_frames[i][j] holds the jth valid reference frame type in the
+   * direction 'i' and its temporal distance from the source frame.
+   */
+  FrameDistPair reference_frames[MAX_DIRECTIONS][REF_FRAMES - 1];
+
+  /**
+   * \name Dimensions for which segment map is allocated.
+   */
+  /**@{*/
+  int segment_map_w; /*!< segment map width */
+  int segment_map_h; /*!< segment map height */
+  /**@}*/
+} GlobalMotionInfo;
+
+/*!
+ * \brief Flags related to interpolation filter search
+ */
+typedef struct {
+  /*!
+   * Stores the default value of the skip flag depending on chroma format.
+   * Set as 1 for monochrome and 3 for other color formats.
+   */
+  int default_interp_skip_flags;
+  /*!
+   * Filter mask to allow certain interp_filter types.
+   */
+  uint16_t interp_filter_search_mask;
+} InterpSearchFlags;
+
+/*!
+ * \brief Parameters for motion vector search process
+ */
+typedef struct {
+  /*!
+   * Largest MV component used in a frame.
+   * The value from the previous frame is used to set the full pixel search
+   * range for the current frame.
+   */
+  int max_mv_magnitude;
+  /*!
+   * Parameter indicating initial search window to be used in full-pixel
+   * search. Range [0, MAX_MVSEARCH_STEPS-2]. Lower value indicates larger
+   * window.
+   */
+  int mv_step_param;
+  /*!
+   * Pointer to sub-pixel search function.
+   * In encoder: av1_find_best_sub_pixel_tree
+   *             av1_find_best_sub_pixel_tree_pruned
+   *             av1_find_best_sub_pixel_tree_pruned_more
+   * In MV unit test: av1_return_max_sub_pixel_mv
+   *                  av1_return_min_sub_pixel_mv
+   */
+  fractional_mv_step_fp *find_fractional_mv_step;
+  /*!
+   * Search site configuration for full-pel MV search.
+   * search_site_cfg[SS_CFG_SRC]: Used in tpl, rd/non-rd inter mode loop and
+   * simple motion search.
+   * search_site_cfg[SS_CFG_LOOKAHEAD]: Used in intraBC and temporal filter.
+   * search_site_cfg[SS_CFG_FPF]: Used during first pass and lookahead.
+   */
+  search_site_config search_site_cfg[SS_CFG_TOTAL][NUM_DISTINCT_SEARCH_METHODS];
+} MotionVectorSearchParams;
+
+/*!
+ * \brief Refresh frame flags for different type of frames.
+ *
+ * If the refresh flag is true for a particular reference frame, after the
+ * current frame is encoded, the reference frame gets refreshed (updated) to
+ * be the current frame. Note: Usually at most one flag will be set to true at
+ * a time. But, for key-frames, all flags are set to true at once.
+ */
+typedef struct {
+  bool golden_frame;  /*!< Refresh flag for golden frame */
+  bool bwd_ref_frame; /*!< Refresh flag for bwd-ref frame */
+  bool alt_ref_frame; /*!< Refresh flag for alt-ref frame */
+} RefreshFrameInfo;
+
+/*!
+ * \brief Desired dimensions for an externally triggered resize.
+ *
+ * When resize is triggered externally, the desired dimensions are stored in
+ * this struct until used in the next frame to be coded. These values are
+ * effective only for one frame and are reset after they are used.
+ */
+typedef struct {
+  int width;  /*!< Desired resized width */
+  int height; /*!< Desired resized height */
+} ResizePendingParams;
+
+/*!
+ * \brief Reference frame distance related variables.
+ */
+typedef struct {
+  /*!
+   * True relative distance of reference frames w.r.t. the current frame.
+   */
+  int ref_relative_dist[INTER_REFS_PER_FRAME];
+  /*!
+   * The nearest reference w.r.t. current frame in the past.
+   */
+  int8_t nearest_past_ref;
+  /*!
+   * The nearest reference w.r.t. current frame in the future.
+   */
+  int8_t nearest_future_ref;
+} RefFrameDistanceInfo;
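+
+/* Illustrative sketch (not upstream code): one way a caller could recover
+ * the nearest past reference from ref_relative_dist, assuming negative
+ * values denote past references and a smaller magnitude means temporally
+ * nearer. The 'info' pointer and the 1-based reference type mapping are
+ * hypothetical.
+ *
+ *   int best = INT_MIN;
+ *   int8_t nearest_past = -1;
+ *   for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ *     const int d = info->ref_relative_dist[i];
+ *     if (d < 0 && d > best) {
+ *       best = d;
+ *       nearest_past = (int8_t)(i + 1);  // hypothetical ref type mapping
+ *     }
+ *   }
+ */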
+
+/*!
+ * \brief Parameters used for winner mode processing.
+ *
+ * This is a basic two pass approach: in the first pass, we reduce the number
+ * of transform searches based on some thresholds during the rdopt process to
+ * find the "winner mode". In the second pass, we perform a more thorough tx
+ * search on the winner mode.
+ * There are some arrays in the struct, and their indices are used in the
+ * following manner:
+ * Index 0: Default mode evaluation, winner mode processing is not applicable
+ * (e.g., IntraBC).
+ * Index 1: Mode evaluation.
+ * Index 2: Winner mode evaluation.
+ * Indices 1 and 2 are only used when the respective speed feature is on.
+ */
+typedef struct {
+  /*!
+   * Threshold to determine if trellis optimization is to be enabled,
+   * based on:
+   * 0 : dist threshold
+   * 1 : satd threshold
+   * Corresponds to enable_winner_mode_for_coeff_opt speed feature.
+   */
+  unsigned int coeff_opt_thresholds[MODE_EVAL_TYPES][2];
+
+  /*!
+   * Determines the tx size search method during rdopt.
+   * Corresponds to enable_winner_mode_for_tx_size_srch speed feature.
+   */
+  TX_SIZE_SEARCH_METHOD tx_size_search_methods[MODE_EVAL_TYPES];
+
+  /*!
+   * Controls how often we should approximate prediction error with tx
+   * coefficients. If it's 0, then never. If 1, then it's during the tx_type
+   * search only. If 2, then always.
+   * Corresponds to tx_domain_dist_level speed feature.
+   */
+  unsigned int use_transform_domain_distortion[MODE_EVAL_TYPES];
+
+  /*!
+   * Threshold to approximate pixel domain distortion with transform domain
+   * distortion. This is only used if use_transform_domain_distortion is on.
+   * Corresponds to enable_winner_mode_for_use_tx_domain_dist speed feature.
+   */
+  unsigned int tx_domain_dist_threshold[MODE_EVAL_TYPES];
+
+  /*!
+   * Controls how often we should try to skip the transform process based on
+   * the result from the DCT.
+   * Corresponds to use_skip_flag_prediction speed feature.
+   */
+  unsigned int skip_txfm_level[MODE_EVAL_TYPES];
+
+  /*!
+   * Predict DC only txfm blocks for default, mode and winner mode evaluation.
+   * Index 0: Default mode evaluation, winner mode processing is not
+   * applicable. Index 1: Mode evaluation, Index 2: Winner mode evaluation.
+   */
+  unsigned int predict_dc_level[MODE_EVAL_TYPES];
+} WinnerModeParams;
+
+/*!
+ * \brief Frame refresh flags set by the external interface.
+ *
+ * Flags set by external interface to determine which reference buffers are
+ * refreshed by this frame. When set, the encoder will update the particular
+ * reference frame buffer with the contents of the current frame.
+ */
+typedef struct {
+  bool last_frame;     /*!< Refresh flag for last frame */
+  bool golden_frame;   /*!< Refresh flag for golden frame */
+  bool bwd_ref_frame;  /*!< Refresh flag for bwd-ref frame */
+  bool alt2_ref_frame; /*!< Refresh flag for alt2-ref frame */
+  bool alt_ref_frame;  /*!< Refresh flag for alt-ref frame */
+  /*!
+   * Flag indicating if the update of refresh frame flags is pending.
+   */
+  bool update_pending;
+} ExtRefreshFrameFlagsInfo;
+
+/*!
+ * \brief Flags signalled by the external interface at frame level.
+ */
+typedef struct {
+  /*!
+   * Bit mask to disable certain reference frame types.
+   */
+  int ref_frame_flags;
+
+  /*!
+   * Frame refresh flags set by the external interface.
+   */
+  ExtRefreshFrameFlagsInfo refresh_frame;
+
+  /*!
+   * Flag to enable the update of frame contexts at the end of a frame decode.
+   */
+  bool refresh_frame_context;
+
+  /*!
+ * Flag to indicate that update of refresh_frame_context from external + * interface is pending. + */ + bool refresh_frame_context_pending; + + /*! + * Flag to enable temporal MV prediction. + */ + bool use_ref_frame_mvs; + + /*! + * Indicates whether the current frame is to be coded as error resilient. + */ + bool use_error_resilient; + + /*! + * Indicates whether the current frame is to be coded as s-frame. + */ + bool use_s_frame; + + /*! + * Indicates whether the current frame's primary_ref_frame is set to + * PRIMARY_REF_NONE. + */ + bool use_primary_ref_none; +} ExternalFlags; + +/*!\cond */ + +typedef struct { + // Some misc info + int high_prec; + int q; + int order; + + // MV counters + int inter_count; + int intra_count; + int default_mvs; + int mv_joint_count[4]; + int last_bit_zero; + int last_bit_nonzero; + + // Keep track of the rates + int total_mv_rate; + int hp_total_mv_rate; + int lp_total_mv_rate; + + // Texture info + int horz_text; + int vert_text; + int diag_text; + + // Whether the current struct contains valid data + int valid; +} MV_STATS; + +typedef struct WeberStats { + int64_t mb_wiener_variance; + int64_t src_variance; + int64_t rec_variance; + int16_t src_pix_max; + int16_t rec_pix_max; + int64_t distortion; + int64_t satd; + double max_scale; +} WeberStats; + +typedef struct { + struct loopfilter lf; + CdefInfo cdef_info; + YV12_BUFFER_CONFIG copy_buffer; + RATE_CONTROL rc; + MV_STATS mv_stats; +} CODING_CONTEXT; + +typedef struct { + int frame_width; + int frame_height; + int mi_rows; + int mi_cols; + int mb_rows; + int mb_cols; + int num_mbs; + aom_bit_depth_t bit_depth; + int subsampling_x; + int subsampling_y; +} FRAME_INFO; + +/*! + * \brief This structure stores different types of frame indices. + */ +typedef struct { + int show_frame_count; +} FRAME_INDEX_SET; + +/*!\endcond */ + +/*! + * \brief Segmentation related information for the current frame. + */ +typedef struct { + /*! + * 3-bit number containing the segment affiliation for each 4x4 block in the + * frame. map[y * stride + x] contains the segment id of the 4x4 block at + * (x,y) position. + */ + uint8_t *map; + /*! + * Flag to indicate if current frame has lossless segments or not. + * 1: frame has at least one lossless segment. + * 0: frame has no lossless segments. + */ + bool has_lossless_segment; +} EncSegmentationInfo; + +/*! + * \brief Frame time stamps. + */ +typedef struct { + /*! + * Start time stamp of the previous frame + */ + int64_t prev_ts_start; + /*! + * End time stamp of the previous frame + */ + int64_t prev_ts_end; + /*! + * Start time stamp of the first frame + */ + int64_t first_ts_start; +} TimeStamps; + +/*! + * Pointers to the memory allocated for frame level transform coeff related + * info. + */ +typedef struct { + /*! + * Pointer to the transformed coefficients buffer. + */ + tran_low_t *tcoeff; + /*! + * Pointer to the eobs buffer. + */ + uint16_t *eobs; + /*! + * Pointer to the entropy_ctx buffer. 
+ */ + uint8_t *entropy_ctx; +} CoeffBufferPool; + +#if !CONFIG_REALTIME_ONLY +/*!\cond */ +// DUCKY_ENCODE_FRAME_MODE is c version of EncodeFrameMode +enum { + DUCKY_ENCODE_FRAME_MODE_NONE, // Let native AV1 determine q index and rdmult + DUCKY_ENCODE_FRAME_MODE_QINDEX, // DuckyEncode determines q index and AV1 + // determines rdmult + DUCKY_ENCODE_FRAME_MODE_QINDEX_RDMULT, // DuckyEncode determines q index and + // rdmult +} UENUM1BYTE(DUCKY_ENCODE_FRAME_MODE); + +enum { + DUCKY_ENCODE_GOP_MODE_NONE, // native AV1 decides GOP + DUCKY_ENCODE_GOP_MODE_RCL, // rate control lib decides GOP +} UENUM1BYTE(DUCKY_ENCODE_GOP_MODE); + +typedef struct DuckyEncodeFrameInfo { + DUCKY_ENCODE_FRAME_MODE qp_mode; + DUCKY_ENCODE_GOP_MODE gop_mode; + int q_index; + int rdmult; + // These two arrays are equivalent to std::vector + int *superblock_encode_qindex; + int *superblock_encode_rdmult; + int delta_q_enabled; +} DuckyEncodeFrameInfo; + +typedef struct DuckyEncodeFrameResult { + int global_order_idx; + int q_index; + int rdmult; + int rate; + int64_t dist; + double psnr; +} DuckyEncodeFrameResult; + +typedef struct DuckyEncodeInfo { + DuckyEncodeFrameInfo frame_info; + DuckyEncodeFrameResult frame_result; +} DuckyEncodeInfo; +/*!\endcond */ +#endif + +/*!\cond */ +typedef struct RTC_REF { + /*! + * LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3), + * BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6). + */ + int reference[INTER_REFS_PER_FRAME]; + int ref_idx[INTER_REFS_PER_FRAME]; + int refresh[REF_FRAMES]; + int set_ref_frame_config; + int non_reference_frame; + int ref_frame_comp[3]; + int gld_idx_1layer; + /*! + * Frame number of the last frame that refreshed the buffer slot. + */ + unsigned int buffer_time_index[REF_FRAMES]; + /*! + * Spatial layer id of the last frame that refreshed the buffer slot. + */ + unsigned char buffer_spatial_layer[REF_FRAMES]; + /*! + * Flag to indicate whether closest reference was the previous frame. + */ + bool reference_was_previous_frame; + /*! + * Flag to indicate this frame is based on longer term reference only, + * for recovery from past loss, and it should be biased for improved coding. + */ + bool bias_recovery_frame; +} RTC_REF; +/*!\endcond */ + +/*! + * \brief Structure to hold data corresponding to an encoded frame. + */ +typedef struct AV1_COMP_DATA { + /*! + * Buffer to store packed bitstream data of a frame. + */ + unsigned char *cx_data; + + /*! + * Allocated size of the cx_data buffer. + */ + size_t cx_data_sz; + + /*! + * Size of data written in the cx_data buffer. + */ + size_t frame_size; + + /*! + * Flags for the frame. + */ + unsigned int lib_flags; + + /*! + * Time stamp for start of frame. + */ + int64_t ts_frame_start; + + /*! + * Time stamp for end of frame. + */ + int64_t ts_frame_end; + + /*! + * Flag to indicate flush call. + */ + int flush; + + /*! + * Time base for sequence. + */ + const aom_rational64_t *timestamp_ratio; + + /*! + * Decide to pop the source for this frame from input buffer queue. + */ + int pop_lookahead; + + /*! + * Display order hint of frame whose packed data is in cx_data buffer. + */ + int frame_display_order_hint; +} AV1_COMP_DATA; + +/*! + * \brief Top level primary encoder structure + */ +typedef struct AV1_PRIMARY { + /*! + * Array of frame level encoder stage top level structures + */ + struct AV1_COMP *parallel_cpi[MAX_PARALLEL_FRAMES]; + + /*! + * Array of structures to hold data of frames encoded in a given parallel + * encode set. 
+ */ + struct AV1_COMP_DATA parallel_frames_data[MAX_PARALLEL_FRAMES - 1]; +#if CONFIG_FPMT_TEST + /*! + * Flag which enables/disables simulation path for fpmt unit test. + * 0 - FPMT integration + * 1 - FPMT simulation + */ + FPMT_TEST_ENC_CFG fpmt_unit_test_cfg; + + /*! + * Temporary variable simulating the delayed frame_probability update. + */ + FrameProbInfo temp_frame_probs; + + /*! + * Temporary variable holding the updated frame probability across + * frames. Copy its value to temp_frame_probs for frame_parallel_level 0 + * frames or last frame in parallel encode set. + */ + FrameProbInfo temp_frame_probs_simulation; + + /*! + * Temporary variable simulating the delayed update of valid global motion + * model across frames. + */ + int temp_valid_gm_model_found[FRAME_UPDATE_TYPES]; +#endif // CONFIG_FPMT_TEST + /*! + * Copy of cm->ref_frame_map maintained to facilitate sequential update of + * ref_frame_map by lower layer depth frames encoded ahead of time in a + * parallel encode set. + */ + RefCntBuffer *ref_frame_map_copy[REF_FRAMES]; + + /*! + * Start time stamp of the last encoded show frame + */ + int64_t ts_start_last_show_frame; + + /*! + * End time stamp of the last encoded show frame + */ + int64_t ts_end_last_show_frame; + + /*! + * Number of frame level contexts(cpis) + */ + int num_fp_contexts; + + /*! + * Loopfilter levels of the previous encoded frame. + */ + int filter_level[2]; + + /*! + * Chrominance component loopfilter level of the previous encoded frame. + */ + int filter_level_u; + + /*! + * Chrominance component loopfilter level of the previous encoded frame. + */ + int filter_level_v; + + /*! + * Encode stage top level structure + * During frame parallel encode, this is the same as parallel_cpi[0] + */ + struct AV1_COMP *cpi; + + /*! + * Lookahead processing stage top level structure + */ + struct AV1_COMP *cpi_lap; + + /*! + * Look-ahead context. + */ + struct lookahead_ctx *lookahead; + + /*! + * Sequence parameters have been transmitted already and locked + * or not. Once locked av1_change_config cannot change the seq + * parameters. + */ + int seq_params_locked; + + /*! + * Pointer to internal utility functions that manipulate aom_codec_* data + * structures. + */ + struct aom_codec_pkt_list *output_pkt_list; + + /*! + * When set, indicates that internal ARFs are enabled. + */ + int internal_altref_allowed; + + /*! + * Tell if OVERLAY frame shows existing alt_ref frame. + */ + int show_existing_alt_ref; + + /*! + * Information related to a gf group. + */ + GF_GROUP gf_group; + + /*! + * Track prior gf group state. + */ + GF_STATE gf_state; + + /*! + * Flag indicating whether look ahead processing (LAP) is enabled. + */ + int lap_enabled; + + /*! + * Parameters for AV1 bitstream levels. + */ + AV1LevelParams level_params; + + /*! + * Calculates PSNR on each frame when set to 1. + */ + int b_calculate_psnr; + + /*! + * Number of frames left to be encoded, is 0 if limit is not set. + */ + int frames_left; + + /*! + * Information related to two pass encoding. + */ + TWO_PASS twopass; + + /*! + * Rate control related parameters. + */ + PRIMARY_RATE_CONTROL p_rc; + + /*! + * Info and resources used by temporal filtering. + */ + TEMPORAL_FILTER_INFO tf_info; + /*! + * Elements part of the sequence header, that are applicable for all the + * frames in the video. + */ + SequenceHeader seq_params; + + /*! + * Indicates whether to use SVC. + */ + int use_svc; + + /*! + * If true, buffer removal times are present. + */ + bool buffer_removal_time_present; + + /*! 
+   * Number of temporal layers: may be > 1 for SVC (scalable video coding).
+   */
+  unsigned int number_temporal_layers;
+
+  /*!
+   * Number of spatial layers: may be > 1 for SVC (scalable video coding).
+   */
+  unsigned int number_spatial_layers;
+
+  /*!
+   * Code and details about current error status.
+   */
+  struct aom_internal_error_info error;
+
+  /*!
+   * Function pointers to variants of sse/sad/variance computation functions.
+   * fn_ptr[i] indicates the list of function pointers corresponding to block
+   * size i.
+   */
+  aom_variance_fn_ptr_t fn_ptr[BLOCK_SIZES_ALL];
+
+  /*!
+   * tpl_sb_rdmult_scaling_factors[i] stores the RD multiplier scaling factor
+   * of the ith 16 x 16 block in raster scan order.
+   */
+  double *tpl_sb_rdmult_scaling_factors;
+
+  /*!
+   * Parameters related to tpl.
+   */
+  TplParams tpl_data;
+
+  /*!
+   * Motion vector stats of the previous encoded frame.
+   */
+  MV_STATS mv_stats;
+
+#if CONFIG_INTERNAL_STATS
+  /*!\cond */
+  uint64_t total_time_receive_data;
+  uint64_t total_time_compress_data;
+
+  unsigned int total_mode_chosen_counts[MAX_MODES];
+
+  int count[2];
+  uint64_t total_sq_error[2];
+  uint64_t total_samples[2];
+  ImageStat psnr[2];
+
+  double total_blockiness;
+  double worst_blockiness;
+
+  int total_bytes;
+  double summed_quality;
+  double summed_weights;
+  double summed_quality_hbd;
+  double summed_weights_hbd;
+  unsigned int total_recode_hits;
+  double worst_ssim;
+  double worst_ssim_hbd;
+
+  ImageStat fastssim;
+  ImageStat psnrhvs;
+
+  int b_calculate_blockiness;
+  int b_calculate_consistency;
+
+  double total_inconsistency;
+  double worst_consistency;
+  Ssimv *ssim_vars;
+  Metrics metrics;
+  /*!\endcond */
+#endif
+
+#if CONFIG_ENTROPY_STATS
+  /*!
+   * Aggregates frame counts for the sequence.
+   */
+  FRAME_COUNTS aggregate_fc;
+#endif  // CONFIG_ENTROPY_STATS
+
+  /*!
+   * For each type of reference frame, this contains the index of a reference
+   * frame buffer for a reference frame of the same type. We use this to
+   * choose our primary reference frame (which is the most recent reference
+   * frame of the same type as the current frame).
+   */
+  int fb_of_context_type[REF_FRAMES];
+
+  /*!
+   * Primary multi-threading parameters.
+   */
+  PrimaryMultiThreadInfo p_mt_info;
+
+  /*!
+   * Probabilities for pruning of various AV1 tools.
+   */
+  FrameProbInfo frame_probs;
+
+  /*!
+   * Indicates if a valid global motion model has been found in the different
+   * frame update types of a GF group.
+   * valid_gm_model_found[i] indicates if a valid global motion model has been
+   * found in the frame update type with enum value equal to i.
+   */
+  int valid_gm_model_found[FRAME_UPDATE_TYPES];
+
+  /*!
+   * Struct for the reference structure for RTC.
+   */
+  RTC_REF rtc_ref;
+
+  /*!
+   * Struct for all intra mode row multi threading in the preprocess stage
+   * when --deltaq-mode=3.
+   */
+  AV1EncRowMultiThreadSync intra_row_mt_sync;
+} AV1_PRIMARY;
+
+/*!
+ * \brief Top level encoder structure.
+ */
+typedef struct AV1_COMP {
+  /*!
+   * Pointer to top level primary encoder structure
+   */
+  AV1_PRIMARY *ppi;
+
+  /*!
+   * Quantization and dequantization parameters for internal quantizer setup
+   * in the encoder.
+   */
+  EncQuantDequantParams enc_quant_dequant_params;
+
+  /*!
+   * Structure holding thread specific variables.
+   */
+  ThreadData td;
+
+  /*!
+   * Statistics collected at frame level.
+   */
+  FRAME_COUNTS counts;
+
+  /*!
+   * Holds buffer storing mode information at 4x4/8x8 level.
+   */
+  MBMIExtFrameBufferInfo mbmi_ext_info;
+
+  /*!
+ * Buffer holding the transform block related information. + * coeff_buffer_base[i] stores the transform block related information of the + * ith superblock in raster scan order. + */ + CB_COEFF_BUFFER *coeff_buffer_base; + + /*! + * Structure holding pointers to frame level memory allocated for transform + * block related information. + */ + CoeffBufferPool coeff_buffer_pool; + + /*! + * Structure holding variables common to encoder and decoder. + */ + AV1_COMMON common; + + /*! + * Encoder configuration related parameters. + */ + AV1EncoderConfig oxcf; + + /*! + * Stores the trellis optimization type at segment level. + * optimize_seg_arr[i] stores the trellis opt type for ith segment. + */ + TRELLIS_OPT_TYPE optimize_seg_arr[MAX_SEGMENTS]; + + /*! + * Pointer to the frame buffer holding the source frame to be used during the + * current stage of encoding. It can be the raw input, temporally filtered + * input or scaled input. + */ + YV12_BUFFER_CONFIG *source; + + /*! + * Pointer to the frame buffer holding the last raw source frame. + * last_source is NULL for the following cases: + * 1) First frame + * 2) Alt-ref frames + * 3) All frames for all-intra frame encoding. + */ + YV12_BUFFER_CONFIG *last_source; + + /*! + * Pointer to the frame buffer holding the unscaled source frame. + * It can be either the raw input or temporally filtered input. + */ + YV12_BUFFER_CONFIG *unscaled_source; + + /*! + * Frame buffer holding the resized source frame (cropping / superres). + */ + YV12_BUFFER_CONFIG scaled_source; + + /*! + * Pointer to the frame buffer holding the unscaled last source frame. + */ + YV12_BUFFER_CONFIG *unscaled_last_source; + + /*! + * Frame buffer holding the resized last source frame. + */ + YV12_BUFFER_CONFIG scaled_last_source; + + /*! + * Pointer to the original source frame. This is used to determine if the + * content is screen. + */ + YV12_BUFFER_CONFIG *unfiltered_source; + + /*! + * Frame buffer holding the orig source frame for PSNR calculation in rtc tf + * case. + */ + YV12_BUFFER_CONFIG orig_source; + + /*! + * Skip tpl setup when tpl data from gop length decision can be reused. + */ + int skip_tpl_setup_stats; + + /*! + * Scaling factors used in the RD multiplier modulation. + * TODO(sdeng): consider merge the following arrays. + * tpl_rdmult_scaling_factors is a temporary buffer used to store the + * intermediate scaling factors which are used in the calculation of + * tpl_sb_rdmult_scaling_factors. tpl_rdmult_scaling_factors[i] stores the + * intermediate scaling factor of the ith 16 x 16 block in raster scan order. + */ + double *tpl_rdmult_scaling_factors; + + /*! + * Temporal filter context. + */ + TemporalFilterCtx tf_ctx; + + /*! + * Pointer to CDEF search context. + */ + CdefSearchCtx *cdef_search_ctx; + + /*! + * Variables related to forcing integer mv decisions for the current frame. + */ + ForceIntegerMVInfo force_intpel_info; + + /*! + * Pointer to the buffer holding the scaled reference frames. + * scaled_ref_buf[i] holds the scaled reference frame of type i. + */ + RefCntBuffer *scaled_ref_buf[INTER_REFS_PER_FRAME]; + + /*! + * Pointer to the buffer holding the last show frame. + */ + RefCntBuffer *last_show_frame_buf; + + /*! + * Refresh frame flags for golden, bwd-ref and alt-ref frames. + */ + RefreshFrameInfo refresh_frame; + + /*! + * Flag to reduce the number of reference frame buffers used in rt. + */ + int rt_reduce_num_ref_buffers; + + /*! + * Flags signalled by the external interface at frame level. 
+   */
+  ExternalFlags ext_flags;
+
+  /*!
+   * Temporary frame buffer used to store the non-loop filtered reconstructed
+   * frame during the search of loop filter level.
+   */
+  YV12_BUFFER_CONFIG last_frame_uf;
+
+  /*!
+   * Temporary frame buffer used to store the loop restored frame during loop
+   * restoration search.
+   */
+  YV12_BUFFER_CONFIG trial_frame_rst;
+
+  /*!
+   * Ambient reconstruction error target for forced key frames.
+   */
+  int64_t ambient_err;
+
+  /*!
+   * Parameters related to rate distortion optimization.
+   */
+  RD_OPT rd;
+
+  /*!
+   * Temporary coding context used to save and restore when encoding with and
+   * without super-resolution.
+   */
+  CODING_CONTEXT coding_context;
+
+  /*!
+   * Parameters related to global motion search.
+   */
+  GlobalMotionInfo gm_info;
+
+  /*!
+   * Parameters related to winner mode processing.
+   */
+  WinnerModeParams winner_mode_params;
+
+  /*!
+   * Frame time stamps.
+   */
+  TimeStamps time_stamps;
+
+  /*!
+   * Rate control related parameters.
+   */
+  RATE_CONTROL rc;
+
+  /*!
+   * Frame rate of the video.
+   */
+  double framerate;
+
+  /*!
+   * Bitmask indicating which reference buffers may be referenced by this
+   * frame.
+   */
+  int ref_frame_flags;
+
+  /*!
+   * speed is passed as a per-frame parameter into the encoder.
+   */
+  int speed;
+
+  /*!
+   * sf contains fine-grained config set internally based on speed.
+   */
+  SPEED_FEATURES sf;
+
+  /*!
+   * Parameters for motion vector search process.
+   */
+  MotionVectorSearchParams mv_search_params;
+
+  /*!
+   * When set, indicates that all reference frames are forward references,
+   * i.e., all the reference frames are output before the current frame.
+   */
+  int all_one_sided_refs;
+
+  /*!
+   * Segmentation related information for current frame.
+   */
+  EncSegmentationInfo enc_seg;
+
+  /*!
+   * Parameters related to cyclic refresh aq-mode.
+   */
+  CYCLIC_REFRESH *cyclic_refresh;
+  /*!
+   * Parameters related to active map. Active maps indicate
+   * if there is any activity on a 4x4 block basis.
+   */
+  ActiveMap active_map;
+
+  /*!
+   * The frame processing order within a GOP.
+   */
+  unsigned char gf_frame_index;
+
+#if CONFIG_INTERNAL_STATS
+  /*!\cond */
+  uint64_t time_compress_data;
+
+  unsigned int mode_chosen_counts[MAX_MODES];
+  int bytes;
+  unsigned int frame_recode_hits;
+  /*!\endcond */
+#endif
+
+#if CONFIG_SPEED_STATS
+  /*!
+   * For debugging: number of transform searches we have performed.
+   */
+  unsigned int tx_search_count;
+#endif  // CONFIG_SPEED_STATS
+
+  /*!
+   * When set, indicates that the frame is droppable, i.e., this frame
+   * does not update any reference buffers.
+   */
+  int droppable;
+
+  /*!
+   * Stores the frame parameters during encoder initialization.
+   */
+  FRAME_INFO frame_info;
+
+  /*!
+   * Stores different types of frame indices.
+   */
+  FRAME_INDEX_SET frame_index_set;
+
+  /*!
+   * Stores the cm->width from the last call of alloc_compressor_data(). Helps
+   * determine whether compressor data should be reallocated when cm->width
+   * changes.
+   */
+  int data_alloc_width;
+
+  /*!
+   * Stores the cm->height from the last call of alloc_compressor_data().
+   * Helps determine whether compressor data should be reallocated when
+   * cm->height changes.
+   */
+  int data_alloc_height;
+
+  /*!
+   * Number of MBs in the full-size frame; to be used to
+   * normalize the firstpass stats. This will differ from the
+   * number of MBs in the current frame when the frame is
+   * scaled.
+   */
+  int initial_mbs;
+
+  /*!
+   * Flag to indicate whether the frame size information has been
+   * set up and propagated to associated allocations.
+   */
+  bool frame_size_related_setup_done;
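+
+  /* Illustrative sketch (not upstream code) of how the bookkeeping fields
+   * above are meant to be used: compressor data is reallocated only when the
+   * coded dimensions move away from those recorded at the last allocation.
+   * The helper name is hypothetical.
+   *
+   *   if (cm->width != cpi->data_alloc_width ||
+   *       cm->height != cpi->data_alloc_height) {
+   *     realloc_compressor_data(cpi);  // hypothetical helper
+   *     cpi->data_alloc_width = cm->width;
+   *     cpi->data_alloc_height = cm->height;
+   *     cpi->frame_size_related_setup_done = false;
+   *   }
+   */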
+
+  /*!
+   * The width of the most recently encoded frame.
+   * It is updated in encoder_encode().
+   */
+  int last_coded_width;
+
+  /*!
+   * The height of the most recently encoded frame.
+   * It is updated in encoder_encode().
+   */
+  int last_coded_height;
+
+  /*!
+   * Resize related parameters.
+   */
+  ResizePendingParams resize_pending_params;
+
+  /*!
+   * Pointer to struct holding adaptive data/contexts/models for the tile
+   * during encoding.
+   */
+  TileDataEnc *tile_data;
+  /*!
+   * Number of tiles for which memory has been allocated for tile_data.
+   */
+  int allocated_tiles;
+
+  /*!
+   * Structure to store the palette token related information.
+   */
+  TokenInfo token_info;
+
+  /*!
+   * VARIANCE_AQ segment map refresh.
+   */
+  int vaq_refresh;
+
+  /*!
+   * Thresholds for variance based partitioning.
+   */
+  VarBasedPartitionInfo vbp_info;
+
+  /*!
+   * Number of recodes in the frame.
+   */
+  int num_frame_recode;
+
+  /*!
+   * Current frame probability of parallel frames, across recodes.
+   */
+  FrameProbInfo frame_new_probs[NUM_RECODES_PER_FRAME];
+
+  /*!
+   * Retain condition for transform type frame_probability calculation.
+   */
+  int do_update_frame_probs_txtype[NUM_RECODES_PER_FRAME];
+
+  /*!
+   * Retain condition for obmc frame_probability calculation.
+   */
+  int do_update_frame_probs_obmc[NUM_RECODES_PER_FRAME];
+
+  /*!
+   * Retain condition for warped motion frame_probability calculation.
+   */
+  int do_update_frame_probs_warp[NUM_RECODES_PER_FRAME];
+
+  /*!
+   * Retain condition for interpolation filter frame_probability calculation.
+   */
+  int do_update_frame_probs_interpfilter[NUM_RECODES_PER_FRAME];
+
+#if CONFIG_FPMT_TEST
+  /*!
+   * Temporary variable for simulation.
+   * Previous frame's framerate.
+   */
+  double temp_framerate;
+#endif
+  /*!
+   * Updated framerate for the current parallel frame.
+   * cpi->framerate is updated with new_framerate during
+   * post encode updates for parallel frames.
+   */
+  double new_framerate;
+
+  /*!
+   * Retain condition for fast_extra_bits calculation.
+   */
+  int do_update_vbr_bits_off_target_fast;
+
+  /*!
+   * Multi-threading parameters.
+   */
+  MultiThreadInfo mt_info;
+
+  /*!
+   * Specifies the frame to be output. It is valid only if show_existing_frame
+   * is 1. When show_existing_frame is 0, existing_fb_idx_to_show is set to
+   * INVALID_IDX.
+   */
+  int existing_fb_idx_to_show;
+
+  /*!
+   * A flag to indicate if intrabc is ever used in the current frame.
+   */
+  int intrabc_used;
+
+  /*!
+   * Marks which ref frames can be skipped when encoding the current frame
+   * during RDO.
+   */
+  int prune_ref_frame_mask;
+
+  /*!
+   * Loop Restoration context.
+   */
+  AV1LrStruct lr_ctxt;
+
+  /*!
+   * Loop Restoration context used during pick stage.
+   */
+  AV1LrPickStruct pick_lr_ctxt;
+
+  /*!
+   * Pointer to list of tables with film grain parameters.
+   */
+  aom_film_grain_table_t *film_grain_table;
+
+#if CONFIG_DENOISE
+  /*!
+   * Pointer to structure holding the denoised image buffers and the helper
+   * noise models.
+   */
+  struct aom_denoise_and_model_t *denoise_and_model;
+#endif
+
+  /*!
+   * Flags related to interpolation filter search.
+   */
+  InterpSearchFlags interp_search_flags;
+
+  /*!
+   * Flag to turn on screen content tools.
+   * Note that some videos are not screen content videos, but
+   * screen content tools could still improve their coding efficiency.
+   * For example, videos with large flat regions or gaming videos that look
+   * like natural videos.
+   */
+  int use_screen_content_tools;
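+
+  /* Illustrative sketch (not upstream code): the intended relationship
+   * between the two screen content flags, per the comments above and below.
+   *
+   *   assert(!cpi->is_screen_content_type || cpi->use_screen_content_tools);
+   */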
+
+  /*!
+   * A flag to indicate "real" screen content videos,
+   * for example, screen shares or screen editing.
+   * When this flag is true, |use_screen_content_tools| must also be true.
+   * In addition, rate control strategy is adjusted when this flag is true.
+   */
+  int is_screen_content_type;
+
+#if CONFIG_COLLECT_PARTITION_STATS
+  /*!
+   * Accumulates the partition timing stat over the whole frame.
+   */
+  FramePartitionTimingStats partition_stats;
+#endif  // CONFIG_COLLECT_PARTITION_STATS
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  /*!
+   * component_time[] entries are initialized to zero when the encoder starts.
+   */
+  uint64_t component_time[kTimingComponents];
+  /*!
+   * Stores timing for individual components between calls of start_timing()
+   * and end_timing().
+   */
+  struct aom_usec_timer component_timer[kTimingComponents];
+  /*!
+   * frame_component_time[] entries are initialized to zero at the beginning
+   * of each frame.
+   */
+  uint64_t frame_component_time[kTimingComponents];
+#endif
+
+  /*!
+   * Count the number of OBU_FRAME and OBU_FRAME_HEADER for level calculation.
+   */
+  int frame_header_count;
+
+  /*!
+   * Whether any non-zero delta_q was actually used.
+   */
+  int deltaq_used;
+
+  /*!
+   * Reference frame distance related variables.
+   */
+  RefFrameDistanceInfo ref_frame_dist_info;
+
+  /*!
+   * ssim_rdmult_scaling_factors[i] stores the RD multiplier scaling factor of
+   * the ith 16 x 16 block in raster scan order. This scaling factor is used
+   * for RD multiplier modulation when SSIM tuning is enabled.
+   */
+  double *ssim_rdmult_scaling_factors;
+
+#if CONFIG_TUNE_VMAF
+  /*!
+   * Parameters for VMAF tuning.
+   */
+  TuneVMAFInfo vmaf_info;
+#endif
+
+#if CONFIG_TUNE_BUTTERAUGLI
+  /*!
+   * Parameters for Butteraugli tuning.
+   */
+  TuneButteraugliInfo butteraugli_info;
+#endif
+
+  /*!
+   * Parameters for scalable video coding.
+   */
+  SVC svc;
+
+  /*!
+   * Indicates whether current processing stage is encode stage or LAP stage.
+   */
+  COMPRESSOR_STAGE compressor_stage;
+
+  /*!
+   * Frame type of the last frame. May be used in some heuristics for speeding
+   * up the encoding.
+   */
+  FRAME_TYPE last_frame_type;
+
+  /*!
+   * Number of tile-groups.
+   */
+  int num_tg;
+
+  /*!
+   * Super-resolution mode currently being used by the encoder.
+   * This may or may not be the same as the user-supplied mode in
+   * oxcf->superres_mode (e.g., when we are recoding to try multiple options).
+   */
+  aom_superres_mode superres_mode;
+
+  /*!
+   * First pass related data.
+   */
+  FirstPassData firstpass_data;
+
+  /*!
+   * Temporal Noise Estimate
+   */
+  NOISE_ESTIMATE noise_estimate;
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+  /*!
+   * Temporal Denoiser
+   */
+  AV1_DENOISER denoiser;
+#endif
+
+  /*!
+   * Counts how many consecutive times a block uses small/zero MV for
+   * encoding, on an 8x8 block basis.
+   */
+  uint8_t *consec_zero_mv;
+
+  /*!
+   * Allocated memory size for |consec_zero_mv|.
+   */
+  int consec_zero_mv_alloc_size;
+
+  /*!
+   * Block size of first pass encoding
+   */
+  BLOCK_SIZE fp_block_size;
+
+  /*!
+   * The counter of encoded superblocks, used to differentiate block names.
+   * This number starts from 0 and increases whenever a superblock is encoded.
+   */
+  int sb_counter;
+
+  /*!
+   * Available bitstream buffer size in bytes
+   */
+  size_t available_bs_size;
+
+  /*!
+   * The controller of the external partition model.
+   * It is used to do partition type selection based on external models.
+   */
+  ExtPartController ext_part_controller;
+
+  /*!
+   * Motion vector stats of the current encoded frame, used to update the
+   * ppi->mv_stats during postencode.
+   */
+  MV_STATS mv_stats;
+  /*!
+   * Stores the reference refresh index for the current frame.
+   */
+  int ref_refresh_index;
+
+  /*!
+   * A flag to indicate if the reference refresh index is available for the
+   * current frame.
+   */
+  bool refresh_idx_available;
+
+  /*!
+   * Reference frame index corresponding to the frame to be excluded from
+   * being used as a reference by a frame_parallel_level 2 frame in a parallel
+   * encode set of lower layer frames.
+   */
+  int ref_idx_to_skip;
+#if CONFIG_FPMT_TEST
+  /*!
+   * Stores the wanted frame buffer index for choosing primary ref frame by a
+   * frame_parallel_level 2 frame in a parallel encode set of lower layer
+   * frames.
+   */
+  int wanted_fb;
+#endif  // CONFIG_FPMT_TEST
+
+  /*!
+   * A flag to indicate frames that will update their data to the primary
+   * context at the end of the encode. It is set for non-parallel frames and
+   * the last frame in encode order in a given parallel encode set.
+   */
+  bool do_frame_data_update;
+
+#if CONFIG_RD_COMMAND
+  /*!
+   * A structure for assigning external q_index / rdmult for experiments.
+   */
+  RD_COMMAND rd_command;
+#endif  // CONFIG_RD_COMMAND
+
+  /*!
+   * Buffer to store MB variance after Wiener filter.
+   */
+  WeberStats *mb_weber_stats;
+
+  /*!
+   * Buffer to store rate cost estimates for each macro block (8x8) in the
+   * preprocessing stage used in allintra mode.
+   */
+  int *prep_rate_estimates;
+
+  /*!
+   * Buffer to store rate cost estimates for each 16x16 block read
+   * from an external file, used in allintra mode.
+   */
+  double *ext_rate_distribution;
+
+  /*!
+   * The scale that equals sum_rate_uniform_quantizer / sum_ext_rate.
+   */
+  double ext_rate_scale;
+
+  /*!
+   * Block size used when computing the MB Wiener variance stats.
+   */
+  BLOCK_SIZE weber_bsize;
+
+  /*!
+   * Frame level Wiener filter normalization.
+   */
+  int64_t norm_wiener_variance;
+
+  /*!
+   * Buffer to store delta-q values for delta-q mode 4.
+   */
+  int *mb_delta_q;
+
+  /*!
+   * Flag to indicate that current frame is dropped.
+   */
+  bool is_dropped_frame;
+
+#if CONFIG_BITRATE_ACCURACY
+  /*!
+   * Structure stores information needed for bitrate accuracy experiment.
+   */
+  VBR_RATECTRL_INFO vbr_rc_info;
+#endif
+
+#if CONFIG_RATECTRL_LOG
+  /*!
+   * Structure stores information of rate control decisions.
+   */
+  RATECTRL_LOG rc_log;
+#endif  // CONFIG_RATECTRL_LOG
+
+  /*!
+   * Frame level twopass status and control data
+   */
+  TWO_PASS_FRAME twopass_frame;
+
+  /*!
+   * Context needed for third pass encoding.
+   */
+  THIRD_PASS_DEC_CTX *third_pass_ctx;
+
+  /*!
+   * File pointer to second pass log
+   */
+  FILE *second_pass_log_stream;
+
+  /*!
+   * Buffer to store 64x64 SAD
+   */
+  uint64_t *src_sad_blk_64x64;
+
+  /*!
+   * SSE between the current frame and the reconstructed last frame.
+   * It is only used for CBR mode.
+   * It is not used if the reference frame has a different frame size.
+   */
+  uint64_t rec_sse;
+
+  /*!
+   * A flag to indicate whether the encoder is controlled by DuckyEncode or
+   * not. 1: yes, 0: no.
+   */
+  int use_ducky_encode;
+
+#if !CONFIG_REALTIME_ONLY
+  /*! A structure that facilitates the communication between DuckyEncode and
+   * AV1 encoder.
+   */
+  DuckyEncodeInfo ducky_encode_info;
+#endif  // !CONFIG_REALTIME_ONLY
+
+  /*!
+   * Frames since last frame with cdf update.
+   */
+  int frames_since_last_update;
+
+  /*!
+   * Block level thresholds to force zeromv-skip at partition level.
+   */
+  unsigned int zeromv_skip_thresh_exit_part[BLOCK_SIZES_ALL];
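+
+  /* Illustrative sketch (not upstream code): how a per-block-size threshold
+   * like the one above is typically consulted; 'block_sad' is a hypothetical
+   * source SAD value for the block being considered.
+   *
+   *   if (block_sad < cpi->zeromv_skip_thresh_exit_part[bsize]) {
+   *     // Force zeromv-skip and exit partition search early for this block.
+   *   }
+   */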
+
+  /*!
+   * Number of downsampling pyramid levels to allocate for each frame.
+   * This is currently only used for global motion.
+   */
+  int image_pyramid_levels;
+
+#if CONFIG_SALIENCY_MAP
+  /*!
+   * Pixel level saliency map for each frame.
+   */
+  uint8_t *saliency_map;
+
+  /*!
+   * Superblock level rdmult scaling factor driven by saliency map.
+   */
+  double *sm_scaling_factor;
+#endif
+
+  /*!
+   * Number of pixels that choose palette mode for luma in the
+   * fast encoding pass in av1_determine_sc_tools_with_encoding().
+   */
+  int palette_pixel_num;
+
+  /*!
+   * Flag to indicate scaled_last_source is available,
+   * so scaling is not needed for last_source.
+   */
+  int scaled_last_source_available;
+} AV1_COMP;
+
+/*!
+ * \brief Input frames and last input frame
+ */
+typedef struct EncodeFrameInput {
+  /*!\cond */
+  YV12_BUFFER_CONFIG *source;
+  YV12_BUFFER_CONFIG *last_source;
+  int64_t ts_duration;
+  /*!\endcond */
+} EncodeFrameInput;
+
+/*!
+ * \brief Contains per-frame encoding parameters decided upon by
+ * av1_encode_strategy() and passed down to av1_encode().
+ */
+typedef struct EncodeFrameParams {
+  /*!
+   * Whether error resilient mode is enabled.
+   */
+  int error_resilient_mode;
+  /*!
+   * Frame type (e.g., KF vs inter frame, etc.).
+   */
+  FRAME_TYPE frame_type;
+
+  /*!\cond */
+  int primary_ref_frame;
+  int order_offset;
+
+  /*!\endcond */
+  /*!
+   * Whether the current frame should be displayed after being decoded.
+   */
+  int show_frame;
+
+  /*!\cond */
+  int refresh_frame_flags;
+
+  int show_existing_frame;
+  int existing_fb_idx_to_show;
+
+  /*!\endcond */
+  /*!
+   * Bitmask of which reference buffers may be referenced by this frame.
+   */
+  int ref_frame_flags;
+
+  /*!
+   * Reference buffer assignment for this frame.
+   */
+  int remapped_ref_idx[REF_FRAMES];
+
+  /*!
+   * Flags which determine which reference buffers are refreshed by this
+   * frame.
+   */
+  RefreshFrameInfo refresh_frame;
+
+  /*!
+   * Speed level to use for this frame: bigger number means faster.
+   */
+  int speed;
+} EncodeFrameParams;
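+
+/* Illustrative sketch (not upstream code): a caller in the encode-strategy
+ * layer might fill the parameter block along these lines; the field values
+ * are made up for the example.
+ *
+ *   EncodeFrameParams frame_params = { 0 };
+ *   frame_params.frame_type = INTER_FRAME;
+ *   frame_params.show_frame = 1;
+ *   frame_params.speed = cpi->speed;
+ *   frame_params.ref_frame_flags = cpi->ref_frame_flags;
+ */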
+
+/*!\cond */
+
+// EncodeFrameResults contains information about the result of encoding a
+// single frame.
+typedef struct {
+  size_t size;  // Size of resulting bitstream
+} EncodeFrameResults;
+
+void av1_initialize_enc(unsigned int usage, enum aom_rc_mode end_usage);
+
+struct AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi,
+                                       const AV1EncoderConfig *oxcf,
+                                       BufferPool *const pool,
+                                       COMPRESSOR_STAGE stage,
+                                       int lap_lag_in_frames);
+
+struct AV1_PRIMARY *av1_create_primary_compressor(
+    struct aom_codec_pkt_list *pkt_list_head, int num_lap_buffers,
+    const AV1EncoderConfig *oxcf);
+
+void av1_remove_compressor(AV1_COMP *cpi);
+
+void av1_remove_primary_compressor(AV1_PRIMARY *ppi);
+
+#if CONFIG_ENTROPY_STATS
+void print_entropy_stats(AV1_PRIMARY *const ppi);
+#endif
+#if CONFIG_INTERNAL_STATS
+void print_internal_stats(AV1_PRIMARY *ppi);
+#endif
+
+void av1_change_config_seq(AV1_PRIMARY *ppi, const AV1EncoderConfig *oxcf,
+                           bool *sb_size_changed);
+
+void av1_change_config(AV1_COMP *cpi, const AV1EncoderConfig *oxcf,
+                       bool sb_size_changed);
+
+aom_codec_err_t av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth,
+                                        int subsampling_x, int subsampling_y);
+
+void av1_init_seq_coding_tools(AV1_PRIMARY *const ppi,
+                               const AV1EncoderConfig *oxcf, int use_svc);
+
+void av1_post_encode_updates(AV1_COMP *const cpi,
+                             const AV1_COMP_DATA *const cpi_data);
+
+void av1_scale_references_fpmt(AV1_COMP *cpi, int *ref_buffers_used_map);
+
+void av1_increment_scaled_ref_counts_fpmt(BufferPool *buffer_pool,
+                                          int ref_buffers_used_map);
+
+void av1_release_scaled_references_fpmt(AV1_COMP *cpi);
+
+void av1_decrement_ref_counts_fpmt(BufferPool *buffer_pool,
+                                   int ref_buffers_used_map);
+
+void av1_init_sc_decisions(AV1_PRIMARY *const ppi);
+
+AV1_COMP *av1_get_parallel_frame_enc_data(AV1_PRIMARY *const ppi,
+                                          AV1_COMP_DATA *const first_cpi_data);
+
+int av1_init_parallel_frame_context(const AV1_COMP_DATA *const first_cpi_data,
+                                    AV1_PRIMARY *const ppi,
+                                    int *ref_buffers_used_map);
+/*!\endcond */
+
+/*!\brief Obtain the raw frame data
+ *
+ * \ingroup high_level_algo
+ * This function receives the raw frame data from input.
+ *
+ * \param[in]     cpi            Top-level encoder structure
+ * \param[in]     frame_flags    Flags to decide how to encode the frame
+ * \param[in,out] sd             Contains raw frame data
+ * \param[in]     time_stamp     Time stamp of the frame
+ * \param[in]     end_time_stamp End time stamp
+ *
+ * \return Returns a value to indicate if the frame data is received
+ * successfully.
+ * \note The caller can assume that a copy of this frame is made and not just a
+ * copy of the pointer.
+ */
+int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
+                          YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
+                          int64_t end_time_stamp);
+
+/*!\brief Encode a frame
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ * \callergraph
+ * This function encodes the raw frame data, and outputs the frame bit stream
+ * to the designated buffer. The caller should use the output parameters
+ * cpi_data->ts_frame_start and cpi_data->ts_frame_end only when this function
+ * returns AOM_CODEC_OK.
+ *
+ * \param[in]     cpi         Top-level encoder structure
+ * \param[in,out] cpi_data    Data corresponding to a frame encode
+ *
+ * \return Returns a value to indicate if the encoding is done successfully.
+ * \retval #AOM_CODEC_OK
+ * \retval -1
+ *     No frame encoded; more input is required.
+ * \retval "A nonzero (positive) aom_codec_err_t code"
+ *     The encoding failed with the error. Sets the error code and error
+ *     message in \c cpi->common.error.
+ */
+int av1_get_compressed_data(AV1_COMP *cpi, AV1_COMP_DATA *const cpi_data);
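+
+/* Illustrative sketch (not upstream code): the basic push/pull sequence the
+ * two entry points above imply. Error handling and flushing are omitted;
+ * raw_frame, out_buf and the timestamp values are assumed to come from the
+ * caller.
+ *
+ *   if (av1_receive_raw_frame(cpi, flags, raw_frame, ts_start, ts_end) == 0) {
+ *     AV1_COMP_DATA cpi_data = { 0 };
+ *     cpi_data.cx_data = out_buf;        // caller-provided buffer
+ *     cpi_data.cx_data_sz = out_buf_sz;
+ *     cpi_data.timestamp_ratio = &timestamp_ratio;
+ *     const int ret = av1_get_compressed_data(cpi, &cpi_data);
+ *     if (ret == AOM_CODEC_OK) {
+ *       // cpi_data.frame_size bytes of bitstream are in cpi_data.cx_data.
+ *     }  // ret == -1 means no frame was produced; feed more input.
+ *   }
+ */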
+
+/*!\brief Run 1-pass/2-pass encoding
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ * \callergraph
+ */
+int av1_encode(AV1_COMP *const cpi, uint8_t *const dest,
+               const EncodeFrameInput *const frame_input,
+               const EncodeFrameParams *const frame_params,
+               EncodeFrameResults *const frame_results);
+
+/*!\cond */
+int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest);
+
+int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame);
+
+aom_codec_err_t av1_copy_new_frame_enc(AV1_COMMON *cm,
+                                       YV12_BUFFER_CONFIG *new_frame,
+                                       YV12_BUFFER_CONFIG *sd);
+
+int av1_use_as_reference(int *ext_ref_frame_flags, int ref_frame_flags);
+
+int av1_copy_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd);
+
+int av1_set_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd);
+
+void av1_set_frame_size(AV1_COMP *cpi, int width, int height);
+
+void av1_set_mv_search_params(AV1_COMP *cpi);
+
+int av1_set_active_map(AV1_COMP *cpi, unsigned char *map, int rows, int cols);
+
+int av1_get_active_map(AV1_COMP *cpi, unsigned char *map, int rows, int cols);
+
+int av1_set_internal_size(AV1EncoderConfig *const oxcf,
+                          ResizePendingParams *resize_pending_params,
+                          AOM_SCALING_MODE horiz_mode,
+                          AOM_SCALING_MODE vert_mode);
+
+int av1_get_quantizer(struct AV1_COMP *cpi);
+
+int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t *input_size);
+
+void av1_alloc_mb_wiener_var_pred_buf(AV1_COMMON *cm, ThreadData *td);
+
+void av1_dealloc_mb_wiener_var_pred_buf(ThreadData *td);
+
+// Set screen content options.
+// This function estimates whether to use screen content tools, by counting
+// the portion of blocks that have few luma colors.
+// Modifies:
+//   cpi->common.features.allow_screen_content_tools
+//   cpi->common.features.allow_intrabc
+//   cpi->use_screen_content_tools
+//   cpi->is_screen_content_type
+// However, the estimation is not accurate and may misclassify videos.
+// A slower but more accurate approach that determines whether to use screen
+// content tools is employed later. See av1_determine_sc_tools_with_encoding().
+void av1_set_screen_content_options(struct AV1_COMP *cpi,
+                                    FeatureFlags *features);
+
+void av1_update_frame_size(AV1_COMP *cpi);
+
+typedef struct {
+  int pyr_level;
+  int disp_order;
+} RefFrameMapPair;
+
+static INLINE void init_ref_map_pair(
+    AV1_COMP *cpi, RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]) {
+  if (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == KF_UPDATE) {
+    memset(ref_frame_map_pairs, -1, sizeof(*ref_frame_map_pairs) * REF_FRAMES);
+    return;
+  }
+  memset(ref_frame_map_pairs, 0, sizeof(*ref_frame_map_pairs) * REF_FRAMES);
+  for (int map_idx = 0; map_idx < REF_FRAMES; map_idx++) {
+    // Get reference frame buffer.
+    const RefCntBuffer *const buf = cpi->common.ref_frame_map[map_idx];
+    if (ref_frame_map_pairs[map_idx].disp_order == -1) continue;
+    if (buf == NULL) {
+      ref_frame_map_pairs[map_idx].disp_order = -1;
+      ref_frame_map_pairs[map_idx].pyr_level = -1;
+      continue;
+    } else if (buf->ref_count > 1) {
+      // Once the keyframe is coded, the slots in ref_frame_map will all
+      // point to the same frame. In that case, all subsequent pointers
+      // matching the current are considered "free" slots. This will find
+      // the next occurrence of the current pointer if ref_count indicates
+      // there are multiple instances of it and mark it as free.
+      for (int idx2 = map_idx + 1; idx2 < REF_FRAMES; ++idx2) {
+        const RefCntBuffer *const buf2 = cpi->common.ref_frame_map[idx2];
+        if (buf2 == buf) {
+          ref_frame_map_pairs[idx2].disp_order = -1;
+          ref_frame_map_pairs[idx2].pyr_level = -1;
+        }
+      }
+    }
+    ref_frame_map_pairs[map_idx].disp_order = (int)buf->display_order_hint;
+    ref_frame_map_pairs[map_idx].pyr_level = buf->pyramid_level;
+  }
+}
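+
+/* Illustrative sketch (not upstream code): typical use of the helper above,
+ * e.g. scanning the reference map for the valid slot with the lowest pyramid
+ * level.
+ *
+ *   RefFrameMapPair pairs[REF_FRAMES];
+ *   init_ref_map_pair(cpi, pairs);
+ *   int best_idx = -1, best_level = INT_MAX;
+ *   for (int i = 0; i < REF_FRAMES; ++i) {
+ *     if (pairs[i].disp_order == -1) continue;  // free or invalid slot
+ *     if (pairs[i].pyr_level < best_level) {
+ *       best_level = pairs[i].pyr_level;
+ *       best_idx = i;
+ *     }
+ *   }
+ */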
+
+#if CONFIG_FPMT_TEST
+static AOM_INLINE void calc_frame_data_update_flag(
+    GF_GROUP *const gf_group, int gf_frame_index,
+    bool *const do_frame_data_update) {
+  *do_frame_data_update = true;
+  // Set the flag to false for all frames in a given parallel encode set except
+  // the last frame in the set with frame_parallel_level = 2.
+  if (gf_group->frame_parallel_level[gf_frame_index] == 1) {
+    *do_frame_data_update = false;
+  } else if (gf_group->frame_parallel_level[gf_frame_index] == 2) {
+    // Check if this is the last frame in the set with frame_parallel_level 2.
+    for (int i = gf_frame_index + 1; i < gf_group->size; i++) {
+      if ((gf_group->frame_parallel_level[i] == 0 &&
+           (gf_group->update_type[i] == ARF_UPDATE ||
+            gf_group->update_type[i] == INTNL_ARF_UPDATE)) ||
+          gf_group->frame_parallel_level[i] == 1) {
+        break;
+      } else if (gf_group->frame_parallel_level[i] == 2) {
+        *do_frame_data_update = false;
+        break;
+      }
+    }
+  }
+}
+#endif
+
+// av1 uses 10,000,000 ticks/second as its time stamp unit.
+#define TICKS_PER_SEC 10000000LL
+
+static INLINE int64_t
+timebase_units_to_ticks(const aom_rational64_t *timestamp_ratio, int64_t n) {
+  return n * timestamp_ratio->num / timestamp_ratio->den;
+}
+
+static INLINE int64_t
+ticks_to_timebase_units(const aom_rational64_t *timestamp_ratio, int64_t n) {
+  int64_t round = timestamp_ratio->num / 2;
+  if (round > 0) --round;
+  return (n * timestamp_ratio->den + round) / timestamp_ratio->num;
+}
+
+static INLINE int frame_is_kf_gf_arf(const AV1_COMP *cpi) {
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  const FRAME_UPDATE_TYPE update_type =
+      gf_group->update_type[cpi->gf_frame_index];
+
+  return frame_is_intra_only(&cpi->common) || update_type == ARF_UPDATE ||
+         update_type == GF_UPDATE;
+}
+
+// TODO(huisu@google.com, youzhou@microsoft.com): enable hash-me for HBD.
+static INLINE int av1_use_hash_me(const AV1_COMP *const cpi) {
+  return (cpi->common.features.allow_screen_content_tools &&
+          cpi->common.features.allow_intrabc &&
+          frame_is_intra_only(&cpi->common));
+}
+
+static INLINE const YV12_BUFFER_CONFIG *get_ref_frame_yv12_buf(
+    const AV1_COMMON *const cm, MV_REFERENCE_FRAME ref_frame) {
+  const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+  return buf != NULL ? &buf->buf : NULL;
+}
+
+static INLINE void alloc_frame_mvs(AV1_COMMON *const cm, RefCntBuffer *buf) {
+  assert(buf != NULL);
+  ensure_mv_buffer(buf, cm);
+  buf->width = cm->width;
+  buf->height = cm->height;
+}
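+
+/* Worked example (illustrative): with a hypothetical timestamp_ratio of
+ * { num = 10000000, den = 30000 }, the conversions above round-trip:
+ *
+ *   int64_t ticks = timebase_units_to_ticks(&timestamp_ratio, 3);
+ *   // ticks == 3 * 10000000 / 30000 == 1000
+ *   int64_t units = ticks_to_timebase_units(&timestamp_ratio, ticks);
+ *   // units == (1000 * 30000 + 4999999) / 10000000 == 3
+ */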
+
+// Get the allocated token size for a tile. It does the same calculation as in
+// the frame token allocation.
+static INLINE unsigned int allocated_tokens(const TileInfo *tile,
+                                            int sb_size_log2, int num_planes) {
+  int tile_mb_rows =
+      ROUND_POWER_OF_TWO(tile->mi_row_end - tile->mi_row_start, 2);
+  int tile_mb_cols =
+      ROUND_POWER_OF_TWO(tile->mi_col_end - tile->mi_col_start, 2);
+
+  return get_token_alloc(tile_mb_rows, tile_mb_cols, sb_size_log2, num_planes);
+}
+
+static INLINE void get_start_tok(AV1_COMP *cpi, int tile_row, int tile_col,
+                                 int mi_row, TokenExtra **tok, int sb_size_log2,
+                                 int num_planes) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int tile_cols = cm->tiles.cols;
+  TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+  const TileInfo *const tile_info = &this_tile->tile_info;
+
+  const int tile_mb_cols =
+      (tile_info->mi_col_end - tile_info->mi_col_start + 2) >> 2;
+  const int tile_mb_row = (mi_row - tile_info->mi_row_start + 2) >> 2;
+
+  *tok = cpi->token_info.tile_tok[tile_row][tile_col] +
+         get_token_alloc(tile_mb_row, tile_mb_cols, sb_size_log2, num_planes);
+}
+
+void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags);
+
+#define ALT_MIN_LAG 3
+static INLINE int is_altref_enabled(int lag_in_frames, bool enable_auto_arf) {
+  return lag_in_frames >= ALT_MIN_LAG && enable_auto_arf;
+}
+
+static AOM_INLINE int can_disable_altref(const GFConfig *gf_cfg) {
+  return is_altref_enabled(gf_cfg->lag_in_frames, gf_cfg->enable_auto_arf) &&
+         (gf_cfg->gf_min_pyr_height == 0);
+}
+
+// Helper function to compute the number of blocks along one dimension of the
+// frame.
+static INLINE int get_num_blocks(const int frame_length, const int mb_length) {
+  return (frame_length + mb_length - 1) / mb_length;
+}
+
+// Check if this is the statistics generation stage.
+static INLINE int is_stat_generation_stage(const AV1_COMP *const cpi) {
+  assert(IMPLIES(cpi->compressor_stage == LAP_STAGE,
+                 cpi->oxcf.pass == AOM_RC_ONE_PASS && cpi->ppi->lap_enabled));
+  return (cpi->oxcf.pass == AOM_RC_FIRST_PASS ||
+          (cpi->compressor_stage == LAP_STAGE));
+}
+// Check if this is the statistics consumption stage of a two-pass encode.
+static INLINE int is_stat_consumption_stage_twopass(
+    const AV1_COMP *const cpi) {
+  return (cpi->oxcf.pass >= AOM_RC_SECOND_PASS);
+}
+
+// Check if this is a statistics consumption stage.
+static INLINE int is_stat_consumption_stage(const AV1_COMP *const cpi) {
+  return (is_stat_consumption_stage_twopass(cpi) ||
+          (cpi->oxcf.pass == AOM_RC_ONE_PASS &&
+           (cpi->compressor_stage == ENCODE_STAGE) && cpi->ppi->lap_enabled));
+}
+
+// Decide whether 'dv_costs' needs to be allocated/stored during the encoding.
+static AOM_INLINE bool av1_need_dv_costs(const AV1_COMP *const cpi) {
+  return !cpi->sf.rt_sf.use_nonrd_pick_mode &&
+         av1_allow_intrabc(&cpi->common) && !is_stat_generation_stage(cpi);
+}
+
+/*!\endcond */
+/*!\brief Check if the current stage has no statistics
+ *
+ *\ingroup two_pass_algo
+ *
+ * \param[in]    cpi     Top-level encoder instance structure
+ *
+ * \return 1 if the current stage has no stats, else 0
+ */
+static INLINE int has_no_stats_stage(const AV1_COMP *const cpi) {
+  assert(
+      IMPLIES(!cpi->ppi->lap_enabled, cpi->compressor_stage == ENCODE_STAGE));
+  return (cpi->oxcf.pass == AOM_RC_ONE_PASS && !cpi->ppi->lap_enabled);
+}
+
+/*!\cond */
+
+static INLINE int is_one_pass_rt_params(const AV1_COMP *cpi) {
+  return has_no_stats_stage(cpi) && cpi->oxcf.mode == REALTIME &&
+         cpi->oxcf.gf_cfg.lag_in_frames == 0;
+}
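+
+/* Illustrative sketch (not upstream code): typical dispatch on the stage
+ * predicates defined above.
+ *
+ *   if (is_stat_generation_stage(cpi)) {
+ *     // First pass or LAP stage: gather stats only.
+ *   } else if (is_stat_consumption_stage(cpi)) {
+ *     // Second pass (or LAP-assisted one pass): stats drive rate control.
+ *   } else {
+ *     // One-pass encode with no lookahead stats.
+ *   }
+ */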
+
+// Use default/internal reference structure for single-layer RTC.
+static INLINE int use_rtc_reference_structure_one_layer(const AV1_COMP *cpi) {
+  return is_one_pass_rt_params(cpi) && cpi->ppi->number_spatial_layers == 1 &&
+         cpi->ppi->number_temporal_layers == 1 &&
+         !cpi->ppi->rtc_ref.set_ref_frame_config;
+}
+
+// Returns the size of the frame stats buffer.
+static INLINE int get_stats_buf_size(int num_lap_buffer, int num_lag_buffer) {
+  /* if lookahead is enabled return num_lap_buffers else num_lag_buffers */
+  return (num_lap_buffer > 0 ? num_lap_buffer + 1 : num_lag_buffer);
+}
+
+// TODO(zoeliu): To set up cpi->oxcf.gf_cfg.enable_auto_brf
+
+static INLINE void set_ref_ptrs(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                MV_REFERENCE_FRAME ref0,
+                                MV_REFERENCE_FRAME ref1) {
+  xd->block_ref_scale_factors[0] =
+      get_ref_scale_factors_const(cm, ref0 >= LAST_FRAME ? ref0 : 1);
+  xd->block_ref_scale_factors[1] =
+      get_ref_scale_factors_const(cm, ref1 >= LAST_FRAME ? ref1 : 1);
+}
+
+static INLINE int get_chessboard_index(int frame_index) {
+  return frame_index & 0x1;
+}
+
+static INLINE const int *cond_cost_list_const(const struct AV1_COMP *cpi,
+                                              const int *cost_list) {
+  const int use_cost_list = cpi->sf.mv_sf.subpel_search_method != SUBPEL_TREE &&
+                            cpi->sf.mv_sf.use_fullpel_costlist;
+  return use_cost_list ? cost_list : NULL;
+}
+
+static INLINE int *cond_cost_list(const struct AV1_COMP *cpi, int *cost_list) {
+  const int use_cost_list = cpi->sf.mv_sf.subpel_search_method != SUBPEL_TREE &&
+                            cpi->sf.mv_sf.use_fullpel_costlist;
+  return use_cost_list ? cost_list : NULL;
+}
+
+// Compression ratio of current frame.
+double av1_get_compression_ratio(const AV1_COMMON *const cm,
+                                 size_t encoded_frame_size);
+
+void av1_new_framerate(AV1_COMP *cpi, double framerate);
+
+void av1_setup_frame_size(AV1_COMP *cpi);
+
+#define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl))
+
+// Returns 1 if a frame is scaled and 0 otherwise.
+static INLINE int av1_resize_scaled(const AV1_COMMON *cm) {
+  return cm->superres_upscaled_width != cm->render_width ||
+         cm->superres_upscaled_height != cm->render_height;
+}
+
+static INLINE int av1_frame_scaled(const AV1_COMMON *cm) {
+  return av1_superres_scaled(cm) || av1_resize_scaled(cm);
+}
+
+// Don't allow a show_existing_frame to coincide with an error resilient
+// frame. An exception can be made for a forward keyframe since it has no
+// previous dependencies.
+static INLINE int encode_show_existing_frame(const AV1_COMMON *cm) {
+  return cm->show_existing_frame && (!cm->features.error_resilient_mode ||
+                                     cm->current_frame.frame_type == KEY_FRAME);
+}
+
+// Get index into the 'cpi->mbmi_ext_info.frame_base' array for the given
+// 'mi_row' and 'mi_col'.
+static INLINE int get_mi_ext_idx(const int mi_row, const int mi_col,
+                                 const BLOCK_SIZE mi_alloc_bsize,
+                                 const int mbmi_ext_stride) {
+  const int mi_ext_size_1d = mi_size_wide[mi_alloc_bsize];
+  const int mi_ext_row = mi_row / mi_ext_size_1d;
+  const int mi_ext_col = mi_col / mi_ext_size_1d;
+  return mi_ext_row * mbmi_ext_stride + mi_ext_col;
+}
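+
+/* Worked example (illustrative): with mi_alloc_bsize = BLOCK_16X16,
+ * mi_size_wide[BLOCK_16X16] == 4, so mi_row and mi_col are both divided by 4
+ * before indexing. For mi_row = 35, mi_col = 70, mbmi_ext_stride = 32:
+ *
+ *   // mi_ext_row = 35 / 4 = 8, mi_ext_col = 70 / 4 = 17
+ *   // -> index = 8 * 32 + 17 = 273
+ *   const int idx = get_mi_ext_idx(35, 70, BLOCK_16X16, 32);
+ */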
+
+// Lighter version of set_offsets that only sets the mode info
+// pointers.
+static INLINE void set_mode_info_offsets(
+    const CommonModeInfoParams *const mi_params,
+    const MBMIExtFrameBufferInfo *const mbmi_ext_info, MACROBLOCK *const x,
+    MACROBLOCKD *const xd, int mi_row, int mi_col) {
+  set_mi_offsets(mi_params, xd, mi_row, mi_col);
+  const int ext_idx = get_mi_ext_idx(mi_row, mi_col, mi_params->mi_alloc_bsize,
+                                     mbmi_ext_info->stride);
+  x->mbmi_ext_frame = mbmi_ext_info->frame_base + ext_idx;
+}
+
+// Check to see if the given partition size is allowed for a specified number
+// of mi block rows and columns remaining in the image.
+// If not, then return the largest allowed partition size.
+static INLINE BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, int rows_left,
+                                             int cols_left, int *bh, int *bw) {
+  int int_size = (int)bsize;
+  if (rows_left <= 0 || cols_left <= 0) {
+    return AOMMIN(bsize, BLOCK_8X8);
+  } else {
+    for (; int_size > 0; int_size -= 3) {
+      *bh = mi_size_high[int_size];
+      *bw = mi_size_wide[int_size];
+      if ((*bh <= rows_left) && (*bw <= cols_left)) {
+        break;
+      }
+    }
+  }
+  return (BLOCK_SIZE)int_size;
+}
+
+static const uint8_t av1_ref_frame_flag_list[REF_FRAMES] = { 0,
+                                                             AOM_LAST_FLAG,
+                                                             AOM_LAST2_FLAG,
+                                                             AOM_LAST3_FLAG,
+                                                             AOM_GOLD_FLAG,
+                                                             AOM_BWD_FLAG,
+                                                             AOM_ALT2_FLAG,
+                                                             AOM_ALT_FLAG };
+
+// When more than 'max_allowed_refs' are available, we reduce the number of
+// reference frames one at a time based on this order.
+static const MV_REFERENCE_FRAME disable_order[] = {
+  LAST3_FRAME,
+  LAST2_FRAME,
+  ALTREF2_FRAME,
+  BWDREF_FRAME,
+};
+
+static const MV_REFERENCE_FRAME
+    ref_frame_priority_order[INTER_REFS_PER_FRAME] = {
+      LAST_FRAME,    ALTREF_FRAME, BWDREF_FRAME, GOLDEN_FRAME,
+      ALTREF2_FRAME, LAST2_FRAME,  LAST3_FRAME,
+    };
+
+static INLINE int get_ref_frame_flags(const SPEED_FEATURES *const sf,
+                                      const int use_one_pass_rt_params,
+                                      const YV12_BUFFER_CONFIG **ref_frames,
+                                      const int ext_ref_frame_flags) {
+  // cpi->ext_flags.ref_frame_flags allows certain reference types to be
+  // disabled by the external interface. These are set by
+  // av1_apply_encoding_flags(). Start with what the external interface allows,
+  // then suppress any reference types which we have found to be duplicates.
+  int flags = ext_ref_frame_flags;
+
+  for (int i = 1; i < INTER_REFS_PER_FRAME; ++i) {
+    const YV12_BUFFER_CONFIG *const this_ref = ref_frames[i];
+    // If this_ref has appeared before, mark the corresponding ref frame as
+    // invalid. For one_pass_rt mode, only disable GOLDEN_FRAME if it's the
+    // same as LAST_FRAME or ALTREF_FRAME (if ALTREF is being used in nonrd).
+    int index =
+        (use_one_pass_rt_params && ref_frame_priority_order[i] == GOLDEN_FRAME)
+            ? (1 + sf->rt_sf.use_nonrd_altref_frame)
+            : i;
+    for (int j = 0; j < index; ++j) {
+      // If this_ref has appeared before (same as the reference corresponding
+      // to lower index j), remove it as a reference only if that reference
+      // (for index j) is actually used as a reference.
+      if (this_ref == ref_frames[j] &&
+          (flags & (1 << (ref_frame_priority_order[j] - 1)))) {
+        flags &= ~(1 << (ref_frame_priority_order[i] - 1));
+        break;
+      }
+    }
+  }
+  return flags;
+}
+
+// Returns a Sequence Header OBU stored in an aom_fixed_buf_t, or NULL upon
+// failure. When a non-NULL aom_fixed_buf_t pointer is returned by this
+// function, the memory must be freed by the caller. Both the buf member of
+// the aom_fixed_buf_t and the aom_fixed_buf_t pointer itself must be freed
+// via calls to free().
+//
+// Note: The OBU returned is in Low Overhead Bitstream Format.
Specifically, +// the obu_has_size_field bit is set, and the buffer contains the obu_size +// field. +aom_fixed_buf_t *av1_get_global_headers(AV1_PRIMARY *ppi); + +#define MAX_GFUBOOST_FACTOR 10.0 +#define MIN_GFUBOOST_FACTOR 4.0 + +static INLINE int is_frame_tpl_eligible(const GF_GROUP *const gf_group, + uint8_t index) { + const FRAME_UPDATE_TYPE update_type = gf_group->update_type[index]; + return update_type == ARF_UPDATE || update_type == GF_UPDATE || + update_type == KF_UPDATE; +} + +static INLINE int is_frame_eligible_for_ref_pruning(const GF_GROUP *gf_group, + int selective_ref_frame, + int prune_ref_frames, + int gf_index) { + return (selective_ref_frame > 0) && (prune_ref_frames > 0) && + !is_frame_tpl_eligible(gf_group, gf_index); +} + +// Get update type of the current frame. +static INLINE FRAME_UPDATE_TYPE get_frame_update_type(const GF_GROUP *gf_group, + int gf_frame_index) { + return gf_group->update_type[gf_frame_index]; +} + +static INLINE int av1_pixels_to_mi(int pixels) { + return ALIGN_POWER_OF_TWO(pixels, 3) >> MI_SIZE_LOG2; +} + +static AOM_INLINE int is_psnr_calc_enabled(const AV1_COMP *cpi) { + const AV1_COMMON *const cm = &cpi->common; + + return cpi->ppi->b_calculate_psnr && !is_stat_generation_stage(cpi) && + cm->show_frame; +} + +static INLINE int is_frame_resize_pending(const AV1_COMP *const cpi) { + const ResizePendingParams *const resize_pending_params = + &cpi->resize_pending_params; + return (resize_pending_params->width && resize_pending_params->height && + (cpi->common.width != resize_pending_params->width || + cpi->common.height != resize_pending_params->height)); +} + +// Check if loop filter is used. +static INLINE int is_loopfilter_used(const AV1_COMMON *const cm) { + return !cm->features.coded_lossless && !cm->tiles.large_scale; +} + +// Check if CDEF is used. +static INLINE int is_cdef_used(const AV1_COMMON *const cm) { + return cm->seq_params->enable_cdef && !cm->features.coded_lossless && + !cm->tiles.large_scale; +} + +// Check if loop restoration filter is used. +static INLINE int is_restoration_used(const AV1_COMMON *const cm) { + return cm->seq_params->enable_restoration && !cm->features.all_lossless && + !cm->tiles.large_scale; +} + +// Checks if post-processing filters need to be applied. +// NOTE: This function decides if the application of different post-processing +// filters on the reconstructed frame can be skipped at the encoder side. +// However the computation of different filter parameters that are signaled in +// the bitstream is still required. +static INLINE unsigned int derive_skip_apply_postproc_filters( + const AV1_COMP *cpi, int use_loopfilter, int use_cdef, int use_superres, + int use_restoration) { + // Though CDEF parameter selection should be dependent on + // deblocked/loop-filtered pixels for cdef_pick_method <= + // CDEF_FAST_SEARCH_LVL5, CDEF strength values are calculated based on the + // pixel values that are not loop-filtered in svc real-time encoding mode. + // Hence this case is handled separately using the condition below. + if (cpi->ppi->rtc_ref.non_reference_frame) + return (SKIP_APPLY_LOOPFILTER | SKIP_APPLY_CDEF); + + if (!cpi->oxcf.algo_cfg.skip_postproc_filtering || cpi->ppi->b_calculate_psnr) + return 0; + assert(cpi->oxcf.mode == ALLINTRA); + + // The post-processing filters are applied one after the other in the + // following order: deblocking->cdef->superres->restoration. In case of + // ALLINTRA encoding, the reconstructed frame is not used as a reference + // frame. 
Hence, the application of these filters can be skipped when + // 1. filter parameters of the subsequent stages are not dependent on the + // filtered output of the current stage or + // 2. subsequent filtering stages are disabled + if (use_restoration) return SKIP_APPLY_RESTORATION; + if (use_superres) return SKIP_APPLY_SUPERRES; + if (use_cdef) { + // CDEF parameter selection is not dependent on the deblocked frame if + // cdef_pick_method is CDEF_PICK_FROM_Q. Hence the application of deblocking + // filters and cdef filters can be skipped in this case. + return (cpi->sf.lpf_sf.cdef_pick_method == CDEF_PICK_FROM_Q && + use_loopfilter) + ? (SKIP_APPLY_LOOPFILTER | SKIP_APPLY_CDEF) + : SKIP_APPLY_CDEF; + } + if (use_loopfilter) return SKIP_APPLY_LOOPFILTER; + + // If we reach here, all post-processing stages are disabled, so none need to + // be skipped. + return 0; +} + +static INLINE void set_postproc_filter_default_params(AV1_COMMON *cm) { + struct loopfilter *const lf = &cm->lf; + CdefInfo *const cdef_info = &cm->cdef_info; + RestorationInfo *const rst_info = cm->rst_info; + + lf->filter_level[0] = 0; + lf->filter_level[1] = 0; + cdef_info->cdef_bits = 0; + cdef_info->cdef_strengths[0] = 0; + cdef_info->nb_cdef_strengths = 1; + cdef_info->cdef_uv_strengths[0] = 0; + rst_info[0].frame_restoration_type = RESTORE_NONE; + rst_info[1].frame_restoration_type = RESTORE_NONE; + rst_info[2].frame_restoration_type = RESTORE_NONE; +} + +static INLINE int is_inter_tx_size_search_level_one( + const TX_SPEED_FEATURES *tx_sf) { + return (tx_sf->inter_tx_size_search_init_depth_rect >= 1 && + tx_sf->inter_tx_size_search_init_depth_sqr >= 1); +} + +static INLINE int get_lpf_opt_level(const SPEED_FEATURES *sf) { + int lpf_opt_level = 0; + if (is_inter_tx_size_search_level_one(&sf->tx_sf)) + lpf_opt_level = (sf->lpf_sf.lpf_pick == LPF_PICK_FROM_Q) ? 
2 : 1; + return lpf_opt_level; +} + +// Enable switchable motion mode only if warp and OBMC tools are allowed +static INLINE bool is_switchable_motion_mode_allowed(bool allow_warped_motion, + bool enable_obmc) { + return (allow_warped_motion || enable_obmc); +} + +#if CONFIG_AV1_TEMPORAL_DENOISING +static INLINE int denoise_svc(const struct AV1_COMP *const cpi) { + return (!cpi->ppi->use_svc || + (cpi->ppi->use_svc && + cpi->svc.spatial_layer_id >= cpi->svc.first_layer_denoise)); +} +#endif + +#if CONFIG_COLLECT_PARTITION_STATS == 2 +static INLINE void av1_print_fr_partition_timing_stats( + const FramePartitionTimingStats *part_stats, const char *filename) { + FILE *f = fopen(filename, "w"); + if (!f) { + return; + } + + fprintf(f, "bsize,redo,"); + for (int part = 0; part < EXT_PARTITION_TYPES; part++) { + fprintf(f, "decision_%d,", part); + } + for (int part = 0; part < EXT_PARTITION_TYPES; part++) { + fprintf(f, "attempt_%d,", part); + } + for (int part = 0; part < EXT_PARTITION_TYPES; part++) { + fprintf(f, "time_%d,", part); + } + fprintf(f, "\n"); + + static const int bsizes[6] = { 128, 64, 32, 16, 8, 4 }; + + for (int bsize_idx = 0; bsize_idx < 6; bsize_idx++) { + fprintf(f, "%d,%d,", bsizes[bsize_idx], part_stats->partition_redo); + for (int part = 0; part < EXT_PARTITION_TYPES; part++) { + fprintf(f, "%d,", part_stats->partition_decisions[bsize_idx][part]); + } + for (int part = 0; part < EXT_PARTITION_TYPES; part++) { + fprintf(f, "%d,", part_stats->partition_attempts[bsize_idx][part]); + } + for (int part = 0; part < EXT_PARTITION_TYPES; part++) { + fprintf(f, "%ld,", part_stats->partition_times[bsize_idx][part]); + } + fprintf(f, "\n"); + } + fclose(f); +} +#endif // CONFIG_COLLECT_PARTITION_STATS == 2 + +#if CONFIG_COLLECT_PARTITION_STATS +static INLINE int av1_get_bsize_idx_for_part_stats(BLOCK_SIZE bsize) { + assert(bsize == BLOCK_128X128 || bsize == BLOCK_64X64 || + bsize == BLOCK_32X32 || bsize == BLOCK_16X16 || bsize == BLOCK_8X8 || + bsize == BLOCK_4X4); + switch (bsize) { + case BLOCK_128X128: return 0; + case BLOCK_64X64: return 1; + case BLOCK_32X32: return 2; + case BLOCK_16X16: return 3; + case BLOCK_8X8: return 4; + case BLOCK_4X4: return 5; + default: assert(0 && "Invalid bsize for partition_stats."); return -1; + } +} +#endif // CONFIG_COLLECT_PARTITION_STATS + +#if CONFIG_COLLECT_COMPONENT_TIMING +static INLINE void start_timing(AV1_COMP *cpi, int component) { + aom_usec_timer_start(&cpi->component_timer[component]); +} +static INLINE void end_timing(AV1_COMP *cpi, int component) { + aom_usec_timer_mark(&cpi->component_timer[component]); + cpi->frame_component_time[component] += + aom_usec_timer_elapsed(&cpi->component_timer[component]); +} +static INLINE char const *get_frame_type_enum(int type) { + switch (type) { + case 0: return "KEY_FRAME"; + case 1: return "INTER_FRAME"; + case 2: return "INTRA_ONLY_FRAME"; + case 3: return "S_FRAME"; + default: assert(0); + } + return "error"; +} +#endif + +/*!\endcond */ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_ENCODER_H_ diff --git a/third_party/aom/av1/encoder/encoder_alloc.h b/third_party/aom/av1/encoder/encoder_alloc.h new file mode 100644 index 0000000000..ce48496d48 --- /dev/null +++ b/third_party/aom/av1/encoder/encoder_alloc.h @@ -0,0 +1,531 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_ENCODER_ALLOC_H_ +#define AOM_AV1_ENCODER_ENCODER_ALLOC_H_ + +#include "av1/encoder/block.h" +#include "av1/encoder/encodeframe_utils.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/encodetxb.h" +#include "av1/encoder/ethread.h" +#include "av1/encoder/global_motion_facade.h" +#include "av1/encoder/intra_mode_search_utils.h" +#include "av1/encoder/pickcdef.h" + +#ifdef __cplusplus +extern "C" { +#endif + +static AOM_INLINE void dealloc_context_buffers_ext( + MBMIExtFrameBufferInfo *mbmi_ext_info) { + aom_free(mbmi_ext_info->frame_base); + mbmi_ext_info->frame_base = NULL; + mbmi_ext_info->alloc_size = 0; +} + +static AOM_INLINE void alloc_context_buffers_ext( + AV1_COMMON *cm, MBMIExtFrameBufferInfo *mbmi_ext_info) { + const CommonModeInfoParams *const mi_params = &cm->mi_params; + + const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize]; + const int mi_alloc_rows = + (mi_params->mi_rows + mi_alloc_size_1d - 1) / mi_alloc_size_1d; + const int mi_alloc_cols = + (mi_params->mi_cols + mi_alloc_size_1d - 1) / mi_alloc_size_1d; + const int new_ext_mi_size = mi_alloc_rows * mi_alloc_cols; + + if (new_ext_mi_size > mbmi_ext_info->alloc_size) { + dealloc_context_buffers_ext(mbmi_ext_info); + CHECK_MEM_ERROR( + cm, mbmi_ext_info->frame_base, + aom_malloc(new_ext_mi_size * sizeof(*mbmi_ext_info->frame_base))); + mbmi_ext_info->alloc_size = new_ext_mi_size; + } + // The stride needs to be updated regardless of whether new allocation + // happened or not. + mbmi_ext_info->stride = mi_alloc_cols; +} + +static AOM_INLINE void alloc_compressor_data(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + CommonModeInfoParams *const mi_params = &cm->mi_params; + + // Setup mi_params + mi_params->set_mb_mi(mi_params, cm->width, cm->height, + cpi->sf.part_sf.default_min_partition_size); + + if (!is_stat_generation_stage(cpi)) av1_alloc_txb_buf(cpi); + + aom_free(cpi->td.mv_costs_alloc); + cpi->td.mv_costs_alloc = NULL; + // Avoid the memory allocation of 'mv_costs_alloc' for allintra encoding + // mode. + if (cpi->oxcf.kf_cfg.key_freq_max != 0) { + CHECK_MEM_ERROR(cm, cpi->td.mv_costs_alloc, + (MvCosts *)aom_calloc(1, sizeof(*cpi->td.mv_costs_alloc))); + cpi->td.mb.mv_costs = cpi->td.mv_costs_alloc; + } + + av1_setup_shared_coeff_buffer(cm->seq_params, &cpi->td.shared_coeff_buf, + cm->error); + if (av1_setup_sms_tree(cpi, &cpi->td)) { + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate SMS tree"); + } + cpi->td.firstpass_ctx = + av1_alloc_pmc(cpi, BLOCK_16X16, &cpi->td.shared_coeff_buf); + if (!cpi->td.firstpass_ctx) + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate PICK_MODE_CONTEXT"); +} + +// Allocate mbmi buffers which are used to store mode information at block +// level. 
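+// For example (illustrative arithmetic only, assuming mi_alloc_bsize ==
+// BLOCK_16X16, i.e. mi_alloc_size_1d == 4): a 1920x1080 frame spans 480x270
+// 4x4 mi units, so alloc_context_buffers_ext() above sizes
+// mbmi_ext_info->frame_base as 68 rows x 120 cols == 8160 entries.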
+static AOM_INLINE void alloc_mb_mode_info_buffers(AV1_COMP *const cpi) { + AV1_COMMON *const cm = &cpi->common; + if (av1_alloc_context_buffers(cm, cm->width, cm->height, + cpi->sf.part_sf.default_min_partition_size)) { + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate context buffers"); + } + + if (!is_stat_generation_stage(cpi)) + alloc_context_buffers_ext(cm, &cpi->mbmi_ext_info); +} + +static AOM_INLINE void realloc_segmentation_maps(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + CommonModeInfoParams *const mi_params = &cm->mi_params; + + // Create the encoder segmentation map and set all entries to 0 + aom_free(cpi->enc_seg.map); + CHECK_MEM_ERROR(cm, cpi->enc_seg.map, + aom_calloc(mi_params->mi_rows * mi_params->mi_cols, 1)); + + // Create a map used for cyclic background refresh. + if (cpi->cyclic_refresh) av1_cyclic_refresh_free(cpi->cyclic_refresh); + CHECK_MEM_ERROR( + cm, cpi->cyclic_refresh, + av1_cyclic_refresh_alloc(mi_params->mi_rows, mi_params->mi_cols)); + + // Create a map used to mark inactive areas. + aom_free(cpi->active_map.map); + CHECK_MEM_ERROR(cm, cpi->active_map.map, + aom_calloc(mi_params->mi_rows * mi_params->mi_cols, 1)); +} + +static AOM_INLINE void alloc_obmc_buffers( + OBMCBuffer *obmc_buffer, struct aom_internal_error_info *error) { + AOM_CHECK_MEM_ERROR( + error, obmc_buffer->wsrc, + (int32_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*obmc_buffer->wsrc))); + AOM_CHECK_MEM_ERROR( + error, obmc_buffer->mask, + (int32_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*obmc_buffer->mask))); + AOM_CHECK_MEM_ERROR( + error, obmc_buffer->above_pred, + (uint8_t *)aom_memalign( + 16, MAX_MB_PLANE * MAX_SB_SQUARE * sizeof(*obmc_buffer->above_pred))); + AOM_CHECK_MEM_ERROR( + error, obmc_buffer->left_pred, + (uint8_t *)aom_memalign( + 16, MAX_MB_PLANE * MAX_SB_SQUARE * sizeof(*obmc_buffer->left_pred))); +} + +static AOM_INLINE void release_obmc_buffers(OBMCBuffer *obmc_buffer) { + aom_free(obmc_buffer->mask); + aom_free(obmc_buffer->above_pred); + aom_free(obmc_buffer->left_pred); + aom_free(obmc_buffer->wsrc); + + obmc_buffer->mask = NULL; + obmc_buffer->above_pred = NULL; + obmc_buffer->left_pred = NULL; + obmc_buffer->wsrc = NULL; +} + +static AOM_INLINE void alloc_compound_type_rd_buffers( + struct aom_internal_error_info *error, CompoundTypeRdBuffers *const bufs) { + AOM_CHECK_MEM_ERROR( + error, bufs->pred0, + (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred0))); + AOM_CHECK_MEM_ERROR( + error, bufs->pred1, + (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred1))); + AOM_CHECK_MEM_ERROR( + error, bufs->residual1, + (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->residual1))); + AOM_CHECK_MEM_ERROR( + error, bufs->diff10, + (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->diff10))); + AOM_CHECK_MEM_ERROR(error, bufs->tmp_best_mask_buf, + (uint8_t *)aom_malloc(2 * MAX_SB_SQUARE * + sizeof(*bufs->tmp_best_mask_buf))); +} + +static AOM_INLINE void release_compound_type_rd_buffers( + CompoundTypeRdBuffers *const bufs) { + aom_free(bufs->pred0); + aom_free(bufs->pred1); + aom_free(bufs->residual1); + aom_free(bufs->diff10); + aom_free(bufs->tmp_best_mask_buf); + av1_zero(*bufs); // Set all pointers to NULL for safety. 
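+  // (aom_free() is NULL-tolerant, so calling this function again after the
+  // av1_zero() above would be a harmless no-op.)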
+}
+
+static AOM_INLINE void dealloc_compressor_data(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  TokenInfo *token_info = &cpi->token_info;
+  AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+  const int num_planes = av1_num_planes(cm);
+  dealloc_context_buffers_ext(&cpi->mbmi_ext_info);
+
+  aom_free(cpi->tile_data);
+  cpi->tile_data = NULL;
+  cpi->allocated_tiles = 0;
+  enc_row_mt->allocated_tile_cols = 0;
+  enc_row_mt->allocated_tile_rows = 0;
+
+  // Delete the segmentation map
+  aom_free(cpi->enc_seg.map);
+  cpi->enc_seg.map = NULL;
+
+  av1_cyclic_refresh_free(cpi->cyclic_refresh);
+  cpi->cyclic_refresh = NULL;
+
+  aom_free(cpi->active_map.map);
+  cpi->active_map.map = NULL;
+
+  aom_free(cpi->ssim_rdmult_scaling_factors);
+  cpi->ssim_rdmult_scaling_factors = NULL;
+
+  aom_free(cpi->tpl_rdmult_scaling_factors);
+  cpi->tpl_rdmult_scaling_factors = NULL;
+
+#if CONFIG_TUNE_VMAF
+  aom_free(cpi->vmaf_info.rdmult_scaling_factors);
+  cpi->vmaf_info.rdmult_scaling_factors = NULL;
+  aom_close_vmaf_model(cpi->vmaf_info.vmaf_model);
+#endif
+
+#if CONFIG_TUNE_BUTTERAUGLI
+  aom_free(cpi->butteraugli_info.rdmult_scaling_factors);
+  cpi->butteraugli_info.rdmult_scaling_factors = NULL;
+  aom_free_frame_buffer(&cpi->butteraugli_info.source);
+  aom_free_frame_buffer(&cpi->butteraugli_info.resized_source);
+#endif
+
+#if CONFIG_SALIENCY_MAP
+  aom_free(cpi->saliency_map);
+  aom_free(cpi->sm_scaling_factor);
+#endif
+
+  release_obmc_buffers(&cpi->td.mb.obmc_buffer);
+
+  aom_free(cpi->td.mv_costs_alloc);
+  cpi->td.mv_costs_alloc = NULL;
+  aom_free(cpi->td.dv_costs_alloc);
+  cpi->td.dv_costs_alloc = NULL;
+
+  aom_free(cpi->td.mb.sb_stats_cache);
+  cpi->td.mb.sb_stats_cache = NULL;
+
+  aom_free(cpi->td.mb.sb_fp_stats);
+  cpi->td.mb.sb_fp_stats = NULL;
+
+#if CONFIG_PARTITION_SEARCH_ORDER
+  aom_free(cpi->td.mb.rdcost);
+  cpi->td.mb.rdcost = NULL;
+#endif
+
+  av1_free_pc_tree_recursive(cpi->td.pc_root, num_planes, 0, 0,
+                             cpi->sf.part_sf.partition_search_type);
+  cpi->td.pc_root = NULL;
+
+  for (int i = 0; i < 2; i++)
+    for (int j = 0; j < 2; j++) {
+      aom_free(cpi->td.mb.intrabc_hash_info.hash_value_buffer[i][j]);
+      cpi->td.mb.intrabc_hash_info.hash_value_buffer[i][j] = NULL;
+    }
+
+  av1_hash_table_destroy(&cpi->td.mb.intrabc_hash_info.intrabc_hash_table);
+
+  aom_free(cm->tpl_mvs);
+  cm->tpl_mvs = NULL;
+
+  aom_free(cpi->td.pixel_gradient_info);
+  cpi->td.pixel_gradient_info = NULL;
+
+  aom_free(cpi->td.src_var_info_of_4x4_sub_blocks);
+  cpi->td.src_var_info_of_4x4_sub_blocks = NULL;
+
+  aom_free(cpi->td.vt64x64);
+  cpi->td.vt64x64 = NULL;
+
+  av1_free_pmc(cpi->td.firstpass_ctx, num_planes);
+  cpi->td.firstpass_ctx = NULL;
+
+  const int is_highbitdepth = cpi->tf_ctx.is_highbitdepth;
+  // This call ensures that the buffers allocated by tf_alloc_and_reset_data()
+  // in av1_temporal_filter() for single-threaded encode are freed in case an
+  // error is encountered during temporal filtering (on early termination,
+  // tf_dealloc_data() in av1_temporal_filter() would not be invoked).
+  tf_dealloc_data(&cpi->td.tf_data, is_highbitdepth);
+
+  // This call ensures that tpl_tmp_buffers for single-threaded encode are
+  // freed in case of an error during tpl.
+  tpl_dealloc_temp_buffers(&cpi->td.tpl_tmp_buffers);
+
+  // This call ensures that the global motion (gm) data buffers for
+  // single-threaded encode are freed in case of an error during gm.
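+  // (Like the temporal filter and TPL teardown above, this assumes
+  // gm_dealloc_data() is safe to call even when nothing was allocated.)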
+ gm_dealloc_data(&cpi->td.gm_data); + + // This call ensures that CDEF search context buffers are deallocated in case + // of an error during cdef search. + av1_cdef_dealloc_data(cpi->cdef_search_ctx); + aom_free(cpi->cdef_search_ctx); + cpi->cdef_search_ctx = NULL; + + av1_dealloc_mb_data(&cpi->td.mb, num_planes); + + av1_dealloc_mb_wiener_var_pred_buf(&cpi->td); + + av1_free_txb_buf(cpi); + av1_free_context_buffers(cm); + + aom_free_frame_buffer(&cpi->last_frame_uf); +#if !CONFIG_REALTIME_ONLY + av1_free_restoration_buffers(cm); + av1_free_firstpass_data(&cpi->firstpass_data); +#endif + + if (!is_stat_generation_stage(cpi)) { + av1_free_cdef_buffers(cm, &cpi->ppi->p_mt_info.cdef_worker, + &cpi->mt_info.cdef_sync); + } + + for (int plane = 0; plane < num_planes; plane++) { + aom_free(cpi->pick_lr_ctxt.rusi[plane]); + cpi->pick_lr_ctxt.rusi[plane] = NULL; + } + aom_free(cpi->pick_lr_ctxt.dgd_avg); + cpi->pick_lr_ctxt.dgd_avg = NULL; + + aom_free_frame_buffer(&cpi->trial_frame_rst); + aom_free_frame_buffer(&cpi->scaled_source); + aom_free_frame_buffer(&cpi->scaled_last_source); + aom_free_frame_buffer(&cpi->orig_source); + aom_free_frame_buffer(&cpi->svc.source_last_TL0); + + free_token_info(token_info); + + av1_free_shared_coeff_buffer(&cpi->td.shared_coeff_buf); + av1_free_sms_tree(&cpi->td); + + aom_free(cpi->td.mb.palette_buffer); + release_compound_type_rd_buffers(&cpi->td.mb.comp_rd_buffer); + aom_free(cpi->td.mb.tmp_conv_dst); + for (int j = 0; j < 2; ++j) { + aom_free(cpi->td.mb.tmp_pred_bufs[j]); + } + +#if CONFIG_DENOISE + if (cpi->denoise_and_model) { + aom_denoise_and_model_free(cpi->denoise_and_model); + cpi->denoise_and_model = NULL; + } +#endif + if (cpi->film_grain_table) { + aom_film_grain_table_free(cpi->film_grain_table); + aom_free(cpi->film_grain_table); + cpi->film_grain_table = NULL; + } + + if (cpi->ppi->use_svc) av1_free_svc_cyclic_refresh(cpi); + aom_free(cpi->svc.layer_context); + cpi->svc.layer_context = NULL; + + aom_free(cpi->consec_zero_mv); + cpi->consec_zero_mv = NULL; + cpi->consec_zero_mv_alloc_size = 0; + + aom_free(cpi->src_sad_blk_64x64); + cpi->src_sad_blk_64x64 = NULL; + + aom_free(cpi->mb_weber_stats); + cpi->mb_weber_stats = NULL; + + if (cpi->oxcf.enable_rate_guide_deltaq) { + aom_free(cpi->prep_rate_estimates); + cpi->prep_rate_estimates = NULL; + + aom_free(cpi->ext_rate_distribution); + cpi->ext_rate_distribution = NULL; + } + + aom_free(cpi->mb_delta_q); + cpi->mb_delta_q = NULL; +} + +static AOM_INLINE void allocate_gradient_info_for_hog(AV1_COMP *cpi) { + if (!is_gradient_caching_for_hog_enabled(cpi)) return; + + PixelLevelGradientInfo *pixel_gradient_info = cpi->td.pixel_gradient_info; + if (!pixel_gradient_info) { + const AV1_COMMON *const cm = &cpi->common; + const int plane_types = PLANE_TYPES >> cm->seq_params->monochrome; + CHECK_MEM_ERROR( + cm, pixel_gradient_info, + aom_malloc(sizeof(*pixel_gradient_info) * plane_types * MAX_SB_SQUARE)); + cpi->td.pixel_gradient_info = pixel_gradient_info; + } + + cpi->td.mb.pixel_gradient_info = pixel_gradient_info; +} + +static AOM_INLINE void allocate_src_var_of_4x4_sub_block_buf(AV1_COMP *cpi) { + if (!is_src_var_for_4x4_sub_blocks_caching_enabled(cpi)) return; + + Block4x4VarInfo *source_variance_info = + cpi->td.src_var_info_of_4x4_sub_blocks; + if (!source_variance_info) { + const AV1_COMMON *const cm = &cpi->common; + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; + const int mi_count_in_sb = mi_size_wide[sb_size] * mi_size_high[sb_size]; + CHECK_MEM_ERROR(cm, source_variance_info, + 
aom_malloc(sizeof(*source_variance_info) * mi_count_in_sb)); + cpi->td.src_var_info_of_4x4_sub_blocks = source_variance_info; + } + + cpi->td.mb.src_var_info_of_4x4_sub_blocks = source_variance_info; +} + +static AOM_INLINE void variance_partition_alloc(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const int num_64x64_blocks = (cm->seq_params->sb_size == BLOCK_64X64) ? 1 : 4; + if (cpi->td.vt64x64) { + if (num_64x64_blocks != cpi->td.num_64x64_blocks) { + aom_free(cpi->td.vt64x64); + cpi->td.vt64x64 = NULL; + } + } + if (!cpi->td.vt64x64) { + CHECK_MEM_ERROR(cm, cpi->td.vt64x64, + aom_malloc(sizeof(*cpi->td.vt64x64) * num_64x64_blocks)); + cpi->td.num_64x64_blocks = num_64x64_blocks; + } +} + +static AOM_INLINE YV12_BUFFER_CONFIG *realloc_and_scale_source( + AV1_COMP *cpi, int scaled_width, int scaled_height) { + AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + + if (scaled_width == cpi->unscaled_source->y_crop_width && + scaled_height == cpi->unscaled_source->y_crop_height) { + return cpi->unscaled_source; + } + + if (aom_realloc_frame_buffer( + &cpi->scaled_source, scaled_width, scaled_height, + cm->seq_params->subsampling_x, cm->seq_params->subsampling_y, + cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, + cm->features.byte_alignment, NULL, NULL, NULL, + cpi->image_pyramid_levels, 0)) + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to reallocate scaled source buffer"); + assert(cpi->scaled_source.y_crop_width == scaled_width); + assert(cpi->scaled_source.y_crop_height == scaled_height); + if (!av1_resize_and_extend_frame_nonnormative( + cpi->unscaled_source, &cpi->scaled_source, + (int)cm->seq_params->bit_depth, num_planes)) + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to reallocate buffers during resize"); + return &cpi->scaled_source; +} + +// Deallocate allocated thread_data. +static AOM_INLINE void free_thread_data(AV1_PRIMARY *ppi) { + PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info; + const int num_tf_workers = + AOMMIN(p_mt_info->num_mod_workers[MOD_TF], p_mt_info->num_workers); + const int num_tpl_workers = + AOMMIN(p_mt_info->num_mod_workers[MOD_TPL], p_mt_info->num_workers); + const int is_highbitdepth = ppi->seq_params.use_highbitdepth; + const int num_planes = ppi->seq_params.monochrome ? 
1 : MAX_MB_PLANE; + for (int t = 1; t < p_mt_info->num_workers; ++t) { + EncWorkerData *const thread_data = &p_mt_info->tile_thr_data[t]; + thread_data->td = thread_data->original_td; + ThreadData *const td = thread_data->td; + if (!td) continue; + aom_free(td->tctx); + aom_free(td->palette_buffer); + aom_free(td->tmp_conv_dst); + release_compound_type_rd_buffers(&td->comp_rd_buffer); + for (int j = 0; j < 2; ++j) { + aom_free(td->tmp_pred_bufs[j]); + } + aom_free(td->pixel_gradient_info); + aom_free(td->src_var_info_of_4x4_sub_blocks); + release_obmc_buffers(&td->obmc_buffer); + aom_free(td->vt64x64); + + for (int x = 0; x < 2; x++) { + for (int y = 0; y < 2; y++) { + aom_free(td->hash_value_buffer[x][y]); + td->hash_value_buffer[x][y] = NULL; + } + } + aom_free(td->mv_costs_alloc); + td->mv_costs_alloc = NULL; + aom_free(td->dv_costs_alloc); + td->dv_costs_alloc = NULL; + aom_free(td->counts); + av1_free_pmc(td->firstpass_ctx, num_planes); + td->firstpass_ctx = NULL; + av1_free_shared_coeff_buffer(&td->shared_coeff_buf); + av1_free_sms_tree(td); + // This call ensures that the buffers allocated by tf_alloc_and_reset_data() + // in prepare_tf_workers() for MT encode are freed in case an error is + // encountered during temporal filtering (due to early termination + // tf_dealloc_thread_data() in av1_tf_do_filtering_mt() would not be + // invoked). + if (t < num_tf_workers) tf_dealloc_data(&td->tf_data, is_highbitdepth); + // This call ensures that tpl_tmp_buffers for MT encode are freed in case of + // an error during tpl. + if (t < num_tpl_workers) tpl_dealloc_temp_buffers(&td->tpl_tmp_buffers); + // This call ensures that the buffers in gm_data for MT encode are freed in + // case of an error during gm. + gm_dealloc_data(&td->gm_data); + av1_dealloc_mb_data(&td->mb, num_planes); + aom_free(td->mb.sb_stats_cache); + td->mb.sb_stats_cache = NULL; + aom_free(td->mb.sb_fp_stats); + td->mb.sb_fp_stats = NULL; +#if CONFIG_PARTITION_SEARCH_ORDER + aom_free(td->mb.rdcost); + td->mb.rdcost = NULL; +#endif + av1_free_pc_tree_recursive(td->pc_root, num_planes, 0, 0, SEARCH_PARTITION); + td->pc_root = NULL; + av1_dealloc_mb_wiener_var_pred_buf(td); + aom_free(td); + thread_data->td = NULL; + thread_data->original_td = NULL; + } +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_ENCODER_ALLOC_H_ diff --git a/third_party/aom/av1/encoder/encoder_utils.c b/third_party/aom/av1/encoder/encoder_utils.c new file mode 100644 index 0000000000..c35873d207 --- /dev/null +++ b/third_party/aom/av1/encoder/encoder_utils.c @@ -0,0 +1,1503 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom/aomcx.h" + +#include "av1/encoder/bitstream.h" +#include "av1/encoder/encodeframe.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/encoder_alloc.h" +#include "av1/encoder/encodetxb.h" +#include "av1/encoder/encoder_utils.h" +#include "av1/encoder/grain_test_vectors.h" +#include "av1/encoder/mv_prec.h" +#include "av1/encoder/rc_utils.h" +#include "av1/encoder/rdopt.h" +#include "av1/encoder/segmentation.h" +#include "av1/encoder/superres_scale.h" +#include "av1/encoder/tpl_model.h" +#include "av1/encoder/var_based_part.h" + +#if CONFIG_TUNE_VMAF +#include "av1/encoder/tune_vmaf.h" +#endif + +#define MIN_BOOST_COMBINE_FACTOR 4.0 +#define MAX_BOOST_COMBINE_FACTOR 12.0 + +const int default_tx_type_probs[FRAME_UPDATE_TYPES][TX_SIZES_ALL][TX_TYPES] = { + { { 221, 189, 214, 292, 0, 0, 0, 0, 0, 2, 38, 68, 0, 0, 0, 0 }, + { 262, 203, 216, 239, 0, 0, 0, 0, 0, 1, 37, 66, 0, 0, 0, 0 }, + { 315, 231, 239, 226, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 222, 188, 214, 287, 0, 0, 0, 0, 0, 2, 50, 61, 0, 0, 0, 0 }, + { 256, 182, 205, 282, 0, 0, 0, 0, 0, 2, 21, 76, 0, 0, 0, 0 }, + { 281, 214, 217, 222, 0, 0, 0, 0, 0, 1, 48, 41, 0, 0, 0, 0 }, + { 263, 194, 225, 225, 0, 0, 0, 0, 0, 2, 15, 100, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 170, 192, 242, 293, 0, 0, 0, 0, 0, 1, 68, 58, 0, 0, 0, 0 }, + { 199, 210, 213, 291, 0, 0, 0, 0, 0, 1, 14, 96, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { { 106, 69, 107, 278, 9, 15, 20, 45, 49, 23, 23, 88, 36, 74, 25, 57 }, + { 105, 72, 81, 98, 45, 49, 47, 50, 56, 72, 30, 81, 33, 95, 27, 83 }, + { 211, 105, 109, 120, 57, 62, 43, 49, 52, 58, 42, 116, 0, 0, 0, 0 }, + { 1008, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 131, 57, 98, 172, 19, 40, 37, 64, 69, 22, 41, 52, 51, 77, 35, 59 }, + { 176, 83, 93, 202, 22, 24, 28, 47, 50, 16, 12, 93, 26, 76, 17, 59 }, + { 136, 72, 89, 95, 46, 59, 47, 56, 61, 68, 35, 51, 32, 82, 26, 69 }, + { 122, 80, 87, 105, 49, 47, 46, 46, 57, 52, 13, 90, 19, 103, 15, 93 }, + { 1009, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0 }, + { 1011, 0, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 202, 20, 84, 114, 14, 60, 41, 79, 99, 21, 41, 15, 50, 84, 34, 66 }, + { 196, 44, 23, 72, 30, 22, 28, 57, 67, 13, 4, 165, 15, 148, 9, 131 }, + { 882, 0, 0, 0, 0, 0, 0, 0, 0, 142, 0, 0, 0, 0, 0, 0 }, + { 840, 0, 0, 0, 0, 0, 0, 0, 0, 184, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 
64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 } }, + { { 213, 110, 141, 269, 12, 16, 15, 19, 21, 11, 38, 68, 22, 29, 16, 24 }, + { 216, 119, 128, 143, 38, 41, 26, 30, 31, 30, 42, 70, 23, 36, 19, 32 }, + { 367, 149, 154, 154, 38, 35, 17, 21, 21, 10, 22, 36, 0, 0, 0, 0 }, + { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 219, 96, 127, 191, 21, 40, 25, 32, 34, 18, 45, 45, 33, 39, 26, 33 }, + { 296, 99, 122, 198, 23, 21, 19, 24, 25, 13, 20, 64, 23, 32, 18, 27 }, + { 275, 128, 142, 143, 35, 48, 23, 30, 29, 18, 42, 36, 18, 23, 14, 20 }, + { 239, 132, 166, 175, 36, 27, 19, 21, 24, 14, 13, 85, 9, 31, 8, 25 }, + { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 }, + { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 309, 25, 79, 59, 25, 80, 34, 53, 61, 25, 49, 23, 43, 64, 36, 59 }, + { 270, 57, 40, 54, 50, 42, 41, 53, 56, 28, 17, 81, 45, 86, 34, 70 }, + { 1005, 0, 0, 0, 0, 0, 0, 0, 0, 19, 0, 0, 0, 0, 0, 0 }, + { 992, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { { 133, 63, 55, 83, 57, 87, 58, 72, 68, 16, 24, 35, 29, 105, 25, 114 }, + { 131, 75, 74, 60, 71, 77, 65, 66, 73, 33, 21, 79, 20, 83, 18, 78 }, + { 276, 95, 82, 58, 86, 93, 63, 60, 64, 17, 38, 92, 0, 0, 0, 0 }, + { 1006, 0, 0, 0, 0, 0, 0, 0, 0, 18, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 147, 49, 75, 78, 50, 97, 60, 67, 76, 17, 42, 35, 31, 93, 27, 80 }, + { 157, 49, 58, 75, 61, 52, 56, 67, 69, 12, 15, 79, 24, 119, 11, 120 }, + { 178, 69, 83, 77, 69, 85, 72, 77, 77, 20, 35, 40, 25, 48, 23, 46 }, + { 174, 55, 64, 57, 73, 68, 62, 61, 75, 15, 12, 90, 17, 99, 16, 86 }, + { 1008, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0 }, + { 1018, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 266, 31, 63, 64, 21, 52, 39, 54, 63, 30, 52, 31, 48, 89, 46, 75 }, + { 272, 26, 32, 44, 29, 31, 32, 53, 51, 13, 13, 88, 22, 153, 16, 149 }, + { 923, 0, 0, 0, 0, 0, 0, 0, 0, 101, 0, 0, 0, 0, 0, 0 }, + { 969, 0, 0, 0, 0, 0, 0, 0, 0, 55, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 } }, + { { 158, 92, 125, 298, 12, 15, 20, 29, 31, 12, 29, 67, 34, 44, 23, 35 }, + { 147, 94, 103, 123, 45, 48, 38, 41, 46, 48, 37, 78, 33, 63, 27, 53 }, + { 268, 126, 125, 136, 54, 53, 31, 38, 38, 33, 35, 87, 0, 0, 0, 0 }, + { 1018, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 159, 72, 103, 194, 20, 35, 37, 50, 56, 21, 39, 40, 51, 61, 38, 48 }, + { 259, 86, 95, 188, 32, 20, 25, 34, 37, 13, 12, 85, 25, 53, 17, 43 }, + { 189, 99, 113, 123, 45, 59, 37, 46, 48, 44, 39, 41, 31, 47, 26, 37 }, + { 175, 110, 113, 128, 58, 38, 33, 33, 43, 29, 13, 100, 14, 68, 12, 57 }, + { 1017, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0 }, + { 1019, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 208, 22, 84, 101, 21, 59, 44, 70, 90, 25, 59, 13, 64, 67, 49, 48 }, + { 277, 52, 32, 63, 43, 26, 33, 48, 54, 11, 6, 130, 18, 119, 11, 101 }, + { 963, 0, 0, 0, 0, 0, 0, 0, 0, 61, 0, 0, 0, 0, 0, 0 }, + { 979, 0, 0, 0, 0, 0, 0, 0, 0, 45, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } +}; + +const int default_obmc_probs[FRAME_UPDATE_TYPES][BLOCK_SIZES_ALL] = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 106, 90, 90, 97, 67, 59, 70, 28, + 30, 38, 16, 16, 16, 0, 0, 44, 50, 26, 25 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 98, 93, 97, 68, 82, 85, 33, 30, + 33, 16, 16, 16, 16, 0, 0, 43, 37, 26, 16 }, + { 0, 0, 0, 91, 80, 76, 78, 55, 49, 24, 16, + 16, 16, 16, 16, 16, 0, 0, 29, 45, 16, 38 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 103, 89, 89, 89, 62, 63, 76, 34, + 35, 32, 19, 16, 16, 0, 0, 49, 55, 29, 19 } +}; + +const int default_warped_probs[FRAME_UPDATE_TYPES] = { 64, 64, 64, 64, + 64, 64, 64 }; + +// TODO(yunqing): the default probs can be trained later from better +// performance. 
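+// All entries below start at the neutral value 512 (presumably half of the
+// same 1024-based scale used by the tables above), so no switchable filter is
+// favored for any context until the encoder adapts the values per frame
+// update type. Illustrative lookup (with a hypothetical context index 'ctx'):
+//   const int p =
+//       default_switchable_interp_probs[KF_UPDATE][ctx][EIGHTTAP_REGULAR];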
+const int default_switchable_interp_probs[FRAME_UPDATE_TYPES] + [SWITCHABLE_FILTER_CONTEXTS] + [SWITCHABLE_FILTERS] = { + { { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 } }, + { { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 } }, + { { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 } }, + { { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 } }, + { { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 } }, + { { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 } }, + { { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 } } + }; + +static void configure_static_seg_features(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + struct segmentation *const seg = &cm->seg; + + double avg_q; +#if CONFIG_FPMT_TEST + avg_q = ((cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) && + (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE)) + ? cpi->ppi->p_rc.temp_avg_q + : cpi->ppi->p_rc.avg_q; +#else + avg_q = cpi->ppi->p_rc.avg_q; +#endif + + int high_q = (int)(avg_q > 48.0); + int qi_delta; + + // Disable and clear down for KF + if (cm->current_frame.frame_type == KEY_FRAME) { + // Clear down the global segmentation map + memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols); + seg->update_map = 0; + seg->update_data = 0; + + // Disable segmentation + av1_disable_segmentation(seg); + + // Clear down the segment features. 
+ av1_clearall_segfeatures(seg); + } else if (cpi->refresh_frame.alt_ref_frame) { + // If this is an alt ref frame + // Clear down the global segmentation map + memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols); + seg->update_map = 0; + seg->update_data = 0; + + // Disable segmentation and individual segment features by default + av1_disable_segmentation(seg); + av1_clearall_segfeatures(seg); + + // If segmentation was enabled set those features needed for the + // arf itself. + if (seg->enabled) { + seg->update_map = 1; + seg->update_data = 1; + + qi_delta = av1_compute_qdelta(rc, avg_q, avg_q * 0.875, + cm->seq_params->bit_depth); + av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta - 2); + av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_H, -2); + av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_V, -2); + av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_U, -2); + av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_V, -2); + + av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_H); + av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_V); + av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_U); + av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_V); + + av1_enable_segfeature(seg, 1, SEG_LVL_ALT_Q); + } + } else if (seg->enabled) { + // All other frames if segmentation has been enabled + + // First normal frame in a valid gf or alt ref group + if (rc->frames_since_golden == 0) { + // Set up segment features for normal frames in an arf group + // Disable segmentation and clear down features if alt ref + // is not active for this group + + av1_disable_segmentation(seg); + + memset(cpi->enc_seg.map, 0, + cm->mi_params.mi_rows * cm->mi_params.mi_cols); + + seg->update_map = 0; + seg->update_data = 0; + + av1_clearall_segfeatures(seg); + } else if (rc->is_src_frame_alt_ref) { + // Special case where we are coding over the top of a previous + // alt ref frame. + // Segment coding disabled for compred testing + + // Enable ref frame features for segment 0 as well + av1_enable_segfeature(seg, 0, SEG_LVL_REF_FRAME); + av1_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME); + + // All mbs should use ALTREF_FRAME + av1_clear_segdata(seg, 0, SEG_LVL_REF_FRAME); + av1_set_segdata(seg, 0, SEG_LVL_REF_FRAME, ALTREF_FRAME); + av1_clear_segdata(seg, 1, SEG_LVL_REF_FRAME); + av1_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME); + + // Skip all MBs if high Q (0,0 mv and skip coeffs) + if (high_q) { + av1_enable_segfeature(seg, 0, SEG_LVL_SKIP); + av1_enable_segfeature(seg, 1, SEG_LVL_SKIP); + } + // Enable data update + seg->update_data = 1; + } else { + // All other frames. + + // No updates.. leave things as they are. 
+ seg->update_map = 0; + seg->update_data = 0; + } + } +} + +void av1_apply_active_map(AV1_COMP *cpi) { + struct segmentation *const seg = &cpi->common.seg; + unsigned char *const seg_map = cpi->enc_seg.map; + const unsigned char *const active_map = cpi->active_map.map; + int i; + + assert(AM_SEGMENT_ID_ACTIVE == CR_SEGMENT_ID_BASE); + + if (frame_is_intra_only(&cpi->common)) { + cpi->active_map.enabled = 0; + cpi->active_map.update = 1; + } + + if (cpi->active_map.update) { + if (cpi->active_map.enabled) { + const int num_mis = + cpi->common.mi_params.mi_rows * cpi->common.mi_params.mi_cols; + for (i = 0; i < num_mis; ++i) + if (seg_map[i] == AM_SEGMENT_ID_ACTIVE) seg_map[i] = active_map[i]; + av1_enable_segmentation(seg); + av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP); + av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H); + av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V); + av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U); + av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V); + + av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H, + -MAX_LOOP_FILTER); + av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V, + -MAX_LOOP_FILTER); + av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U, + -MAX_LOOP_FILTER); + av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V, + -MAX_LOOP_FILTER); + } else { + av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP); + av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H); + av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V); + av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U); + av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V); + if (seg->enabled) { + seg->update_data = 1; + seg->update_map = 1; + } + } + cpi->active_map.update = 0; + } +} + +#if !CONFIG_REALTIME_ONLY +static void process_tpl_stats_frame(AV1_COMP *cpi) { + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + AV1_COMMON *const cm = &cpi->common; + + assert(IMPLIES(gf_group->size > 0, cpi->gf_frame_index < gf_group->size)); + + const int tpl_idx = cpi->gf_frame_index; + TplParams *const tpl_data = &cpi->ppi->tpl_data; + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + + if (tpl_frame->is_valid) { + int tpl_stride = tpl_frame->stride; + double intra_cost_base = 0; + double mc_dep_cost_base = 0; + double cbcmp_base = 1; + const int step = 1 << tpl_data->tpl_stats_block_mis_log2; + const int row_step = step; + const int col_step_sr = + coded_to_superres_mi(step, cm->superres_scale_denominator); + const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); + + for (int row = 0; row < cm->mi_params.mi_rows; row += row_step) { + for (int col = 0; col < mi_cols_sr; col += col_step_sr) { + TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos( + row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)]; + double cbcmp = (double)(this_stats->srcrf_dist); + int64_t mc_dep_delta = + RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate, + this_stats->mc_dep_dist); + double dist_scaled = (double)(this_stats->recrf_dist << RDDIV_BITS); + intra_cost_base += log(dist_scaled) * cbcmp; + mc_dep_cost_base += log(dist_scaled + mc_dep_delta) * cbcmp; + cbcmp_base += cbcmp; + } + } + + if (mc_dep_cost_base == 0) { + tpl_frame->is_valid = 0; + } else { + cpi->rd.r0 = exp((intra_cost_base - mc_dep_cost_base) / 
cbcmp_base);
+      if (is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) {
+        if (cpi->ppi->lap_enabled) {
+          double min_boost_factor = sqrt(cpi->ppi->p_rc.baseline_gf_interval);
+          const int gfu_boost = get_gfu_boost_from_r0_lap(
+              min_boost_factor, MAX_GFUBOOST_FACTOR, cpi->rd.r0,
+              cpi->ppi->p_rc.num_stats_required_for_gfu_boost);
+          // printf("old boost %d new boost %d\n", cpi->rc.gfu_boost,
+          // gfu_boost);
+          cpi->ppi->p_rc.gfu_boost = combine_prior_with_tpl_boost(
+              min_boost_factor, MAX_BOOST_COMBINE_FACTOR,
+              cpi->ppi->p_rc.gfu_boost, gfu_boost,
+              cpi->ppi->p_rc.num_stats_used_for_gfu_boost);
+        } else {
+          // TPL may only look at a subset of frames in the gf group when the
+          // speed feature 'reduce_num_frames' is on, which affects the r0
+          // calculation. Thus, to compensate for TPL not using all frames, a
+          // factor is used to adjust r0.
+          const int gfu_boost =
+              (int)(200.0 * cpi->ppi->tpl_data.r0_adjust_factor / cpi->rd.r0);
+          cpi->ppi->p_rc.gfu_boost = combine_prior_with_tpl_boost(
+              MIN_BOOST_COMBINE_FACTOR, MAX_BOOST_COMBINE_FACTOR,
+              cpi->ppi->p_rc.gfu_boost, gfu_boost, cpi->rc.frames_to_key);
+        }
+      }
+    }
+  }
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+void av1_set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index,
+                                 int *top_index) {
+  AV1_COMMON *const cm = &cpi->common;
+
+  // Setup variables that depend on the dimensions of the frame.
+  av1_set_speed_features_framesize_dependent(cpi, cpi->speed);
+
+#if !CONFIG_REALTIME_ONLY
+  GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  if (cpi->oxcf.algo_cfg.enable_tpl_model &&
+      av1_tpl_stats_ready(&cpi->ppi->tpl_data, cpi->gf_frame_index)) {
+    process_tpl_stats_frame(cpi);
+    av1_tpl_rdmult_setup(cpi);
+  }
+#endif
+
+  // Decide q and q bounds.
+  *q = av1_rc_pick_q_and_bounds(cpi, cm->width, cm->height, cpi->gf_frame_index,
+                                bottom_index, top_index);
+
+#if !CONFIG_REALTIME_ONLY
+  if (cpi->oxcf.rc_cfg.mode == AOM_Q &&
+      cpi->ppi->tpl_data.tpl_frame[cpi->gf_frame_index].is_valid &&
+      !is_lossless_requested(&cpi->oxcf.rc_cfg)) {
+    const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg;
+    const int tpl_q = av1_tpl_get_q_index(
+        &cpi->ppi->tpl_data, cpi->gf_frame_index, cpi->rc.active_worst_quality,
+        cm->seq_params->bit_depth);
+    *q = clamp(tpl_q, rc_cfg->best_allowed_q, rc_cfg->worst_allowed_q);
+    *top_index = *bottom_index = *q;
+    if (gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE)
+      cpi->ppi->p_rc.arf_q = *q;
+  }
+
+  if (cpi->oxcf.q_cfg.use_fixed_qp_offsets && cpi->oxcf.rc_cfg.mode == AOM_Q) {
+    if (is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) {
+      const double qratio_grad =
+          cpi->ppi->p_rc.baseline_gf_interval > 20 ?
0.2 : 0.3; + const double qstep_ratio = + 0.2 + + (1.0 - (double)cpi->rc.active_worst_quality / MAXQ) * qratio_grad; + *q = av1_get_q_index_from_qstep_ratio( + cpi->rc.active_worst_quality, qstep_ratio, cm->seq_params->bit_depth); + *top_index = *bottom_index = *q; + if (gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE || + gf_group->update_type[cpi->gf_frame_index] == KF_UPDATE || + gf_group->update_type[cpi->gf_frame_index] == GF_UPDATE) + cpi->ppi->p_rc.arf_q = *q; + } else if (gf_group->layer_depth[cpi->gf_frame_index] < + gf_group->max_layer_depth) { + int this_height = gf_group->layer_depth[cpi->gf_frame_index]; + int arf_q = cpi->ppi->p_rc.arf_q; + while (this_height > 1) { + arf_q = (arf_q + cpi->oxcf.rc_cfg.cq_level + 1) / 2; + --this_height; + } + *top_index = *bottom_index = *q = arf_q; + } + } +#endif + + // Configure experimental use of segmentation for enhanced coding of + // static regions if indicated. + // Only allowed in the second pass of a two pass encode, as it requires + // lagged coding, and if the relevant speed feature flag is set. + if (is_stat_consumption_stage_twopass(cpi) && + cpi->sf.hl_sf.static_segmentation) + configure_static_seg_features(cpi); +} + +static void reset_film_grain_chroma_params(aom_film_grain_t *pars) { + pars->num_cr_points = 0; + pars->cr_mult = 0; + pars->cr_luma_mult = 0; + memset(pars->scaling_points_cr, 0, sizeof(pars->scaling_points_cr)); + memset(pars->ar_coeffs_cr, 0, sizeof(pars->ar_coeffs_cr)); + pars->num_cb_points = 0; + pars->cb_mult = 0; + pars->cb_luma_mult = 0; + pars->chroma_scaling_from_luma = 0; + memset(pars->scaling_points_cb, 0, sizeof(pars->scaling_points_cb)); + memset(pars->ar_coeffs_cb, 0, sizeof(pars->ar_coeffs_cb)); +} + +void av1_update_film_grain_parameters_seq(struct AV1_PRIMARY *ppi, + const AV1EncoderConfig *oxcf) { + SequenceHeader *const seq_params = &ppi->seq_params; + const TuneCfg *const tune_cfg = &oxcf->tune_cfg; + + if (tune_cfg->film_grain_test_vector || tune_cfg->film_grain_table_filename || + tune_cfg->content == AOM_CONTENT_FILM) { + seq_params->film_grain_params_present = 1; + } else { +#if CONFIG_DENOISE + seq_params->film_grain_params_present = (oxcf->noise_level > 0); +#else + seq_params->film_grain_params_present = 0; +#endif + } +} + +void av1_update_film_grain_parameters(struct AV1_COMP *cpi, + const AV1EncoderConfig *oxcf) { + AV1_COMMON *const cm = &cpi->common; + const TuneCfg *const tune_cfg = &oxcf->tune_cfg; + + if (cpi->film_grain_table) { + aom_film_grain_table_free(cpi->film_grain_table); + aom_free(cpi->film_grain_table); + cpi->film_grain_table = NULL; + } + + if (tune_cfg->film_grain_test_vector) { + if (cm->current_frame.frame_type == KEY_FRAME) { + memcpy(&cm->film_grain_params, + film_grain_test_vectors + tune_cfg->film_grain_test_vector - 1, + sizeof(cm->film_grain_params)); + if (oxcf->tool_cfg.enable_monochrome) + reset_film_grain_chroma_params(&cm->film_grain_params); + cm->film_grain_params.bit_depth = cm->seq_params->bit_depth; + if (cm->seq_params->color_range == AOM_CR_FULL_RANGE) { + cm->film_grain_params.clip_to_restricted_range = 0; + } + } + } else if (tune_cfg->film_grain_table_filename) { + CHECK_MEM_ERROR(cm, cpi->film_grain_table, + aom_calloc(1, sizeof(*cpi->film_grain_table))); + + aom_film_grain_table_read(cpi->film_grain_table, + tune_cfg->film_grain_table_filename, cm->error); + } else if (tune_cfg->content == AOM_CONTENT_FILM) { + cm->film_grain_params.bit_depth = cm->seq_params->bit_depth; + if (oxcf->tool_cfg.enable_monochrome) + 
reset_film_grain_chroma_params(&cm->film_grain_params); + if (cm->seq_params->color_range == AOM_CR_FULL_RANGE) + cm->film_grain_params.clip_to_restricted_range = 0; + } else { + memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params)); + } +} + +void av1_scale_references(AV1_COMP *cpi, const InterpFilter filter, + const int phase, const int use_optimized_scaler) { + AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MV_REFERENCE_FRAME ref_frame; + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + // Need to convert from AOM_REFFRAME to index into ref_mask (subtract 1). + if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) { + BufferPool *const pool = cm->buffer_pool; + const YV12_BUFFER_CONFIG *const ref = + get_ref_frame_yv12_buf(cm, ref_frame); + + if (ref == NULL) { + cpi->scaled_ref_buf[ref_frame - 1] = NULL; + continue; + } + + // For RTC-SVC: if force_zero_mode_spatial_ref is enabled, check if the + // motion search can be skipped for the references: last, golden, altref. + // If so, we can skip scaling that reference. + if (cpi->ppi->use_svc && cpi->svc.force_zero_mode_spatial_ref && + cpi->ppi->rtc_ref.set_ref_frame_config) { + if (ref_frame == LAST_FRAME && cpi->svc.skip_mvsearch_last) continue; + if (ref_frame == GOLDEN_FRAME && cpi->svc.skip_mvsearch_gf) continue; + if (ref_frame == ALTREF_FRAME && cpi->svc.skip_mvsearch_altref) + continue; + } + // For RTC with superres on: golden reference only needs to be scaled + // if it was refreshed in previous frame. + if (is_one_pass_rt_params(cpi) && + cpi->oxcf.superres_cfg.enable_superres && ref_frame == GOLDEN_FRAME && + cpi->rc.frame_num_last_gf_refresh < + (int)cm->current_frame.frame_number - 1) { + continue; + } + + if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) { + // Replace the reference buffer with a copy having a thicker border, + // if the reference buffer is higher resolution than the current + // frame, and the border is thin. + if ((ref->y_crop_width > cm->width || + ref->y_crop_height > cm->height) && + ref->border < AOM_BORDER_IN_PIXELS) { + RefCntBuffer *ref_fb = get_ref_frame_buf(cm, ref_frame); + if (aom_yv12_realloc_with_new_border( + &ref_fb->buf, AOM_BORDER_IN_PIXELS, + cm->features.byte_alignment, cpi->image_pyramid_levels, + num_planes) != 0) { + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + } + } + int force_scaling = 0; + RefCntBuffer *new_fb = cpi->scaled_ref_buf[ref_frame - 1]; + if (new_fb == NULL) { + const int new_fb_idx = get_free_fb(cm); + if (new_fb_idx == INVALID_IDX) { + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Unable to find free frame buffer"); + } + force_scaling = 1; + new_fb = &pool->frame_bufs[new_fb_idx]; + } + + if (force_scaling || new_fb->buf.y_crop_width != cm->width || + new_fb->buf.y_crop_height != cm->height) { + if (aom_realloc_frame_buffer( + &new_fb->buf, cm->width, cm->height, + cm->seq_params->subsampling_x, cm->seq_params->subsampling_y, + cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, + cm->features.byte_alignment, NULL, NULL, NULL, 0, 0)) { + if (force_scaling) { + // Release the reference acquired in the get_free_fb() call above. 
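+            // (get_free_fb() initializes the buffer's ref_count to 1, so this
+            // single decrement returns the buffer to the free pool.)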
+ --new_fb->ref_count; + } + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + } + bool has_optimized_scaler = av1_has_optimized_scaler( + ref->y_crop_width, ref->y_crop_height, new_fb->buf.y_crop_width, + new_fb->buf.y_crop_height); + if (num_planes > 1) { + has_optimized_scaler = + has_optimized_scaler && + av1_has_optimized_scaler( + ref->uv_crop_width, ref->uv_crop_height, + new_fb->buf.uv_crop_width, new_fb->buf.uv_crop_height); + } +#if CONFIG_AV1_HIGHBITDEPTH + if (use_optimized_scaler && has_optimized_scaler && + cm->seq_params->bit_depth == AOM_BITS_8) { + av1_resize_and_extend_frame(ref, &new_fb->buf, filter, phase, + num_planes); + } else if (!av1_resize_and_extend_frame_nonnormative( + ref, &new_fb->buf, (int)cm->seq_params->bit_depth, + num_planes)) { + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate buffer during resize"); + } +#else + if (use_optimized_scaler && has_optimized_scaler) { + av1_resize_and_extend_frame(ref, &new_fb->buf, filter, phase, + num_planes); + } else if (!av1_resize_and_extend_frame_nonnormative( + ref, &new_fb->buf, (int)cm->seq_params->bit_depth, + num_planes)) { + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate buffer during resize"); + } +#endif + cpi->scaled_ref_buf[ref_frame - 1] = new_fb; + alloc_frame_mvs(cm, new_fb); + } + } else { + RefCntBuffer *buf = get_ref_frame_buf(cm, ref_frame); + buf->buf.y_crop_width = ref->y_crop_width; + buf->buf.y_crop_height = ref->y_crop_height; + cpi->scaled_ref_buf[ref_frame - 1] = buf; + ++buf->ref_count; + } + } else { + if (!has_no_stats_stage(cpi)) cpi->scaled_ref_buf[ref_frame - 1] = NULL; + } + } +} + +BLOCK_SIZE av1_select_sb_size(const AV1EncoderConfig *const oxcf, int width, + int height, int number_spatial_layers) { + if (oxcf->tool_cfg.superblock_size == AOM_SUPERBLOCK_SIZE_64X64) { + return BLOCK_64X64; + } + if (oxcf->tool_cfg.superblock_size == AOM_SUPERBLOCK_SIZE_128X128) { + return BLOCK_128X128; + } +#if CONFIG_TFLITE + if (oxcf->q_cfg.deltaq_mode == DELTA_Q_USER_RATING_BASED) return BLOCK_64X64; +#endif + // Force 64x64 superblock size to increase resolution in perceptual + // AQ mode. + if (oxcf->mode == ALLINTRA && + (oxcf->q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL_AI || + oxcf->q_cfg.deltaq_mode == DELTA_Q_USER_RATING_BASED)) { + return BLOCK_64X64; + } + assert(oxcf->tool_cfg.superblock_size == AOM_SUPERBLOCK_SIZE_DYNAMIC); + + if (number_spatial_layers > 1 || + oxcf->resize_cfg.resize_mode != RESIZE_NONE) { + // Use the configured size (top resolution) for spatial layers or + // on resize. + return AOMMIN(oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height) > 720 + ? BLOCK_128X128 + : BLOCK_64X64; + } else if (oxcf->mode == REALTIME) { + if (oxcf->tune_cfg.content == AOM_CONTENT_SCREEN) { + const TileConfig *const tile_cfg = &oxcf->tile_cfg; + const int num_tiles = + (1 << tile_cfg->tile_columns) * (1 << tile_cfg->tile_rows); + // For multi-thread encode: if the number of (128x128) superblocks + // per tile is low use 64X64 superblock. + if (oxcf->row_mt == 1 && oxcf->max_threads >= 4 && + oxcf->max_threads >= num_tiles && AOMMIN(width, height) > 720 && + (width * height) / (128 * 128 * num_tiles) <= 38) + return BLOCK_64X64; + else + return AOMMIN(width, height) >= 720 ? BLOCK_128X128 : BLOCK_64X64; + } else { + return AOMMIN(width, height) > 720 ? BLOCK_128X128 : BLOCK_64X64; + } + } + + // TODO(any): Possibly could improve this with a heuristic. 
+ // When superres / resize is on, 'cm->width / height' can change between + // calls, so we don't apply this heuristic there. + // Things break if superblock size changes between the first pass and second + // pass encoding, which is why this heuristic is not configured as a + // speed-feature. + if (oxcf->superres_cfg.superres_mode == AOM_SUPERRES_NONE && + oxcf->resize_cfg.resize_mode == RESIZE_NONE) { + int is_480p_or_lesser = AOMMIN(width, height) <= 480; + if (oxcf->speed >= 1 && is_480p_or_lesser) return BLOCK_64X64; + + // For 1080p and lower resolutions, choose SB size adaptively based on + // resolution and speed level for multi-thread encode. + int is_1080p_or_lesser = AOMMIN(width, height) <= 1080; + if (!is_480p_or_lesser && is_1080p_or_lesser && oxcf->mode == GOOD && + oxcf->row_mt == 1 && oxcf->max_threads > 1 && oxcf->speed >= 5) + return BLOCK_64X64; + + // For allintra encode, since the maximum partition size is set to 32X32 for + // speed>=6, superblock size is set to 64X64 instead of 128X128. This + // improves the multithread performance due to reduction in top right delay + // and thread sync wastage. Currently, this setting is selectively enabled + // only for speed>=9 and resolutions less than 4k since cost update + // frequency is set to INTERNAL_COST_UPD_OFF in these cases. + const int is_4k_or_larger = AOMMIN(width, height) >= 2160; + if (oxcf->mode == ALLINTRA && oxcf->speed >= 9 && !is_4k_or_larger) + return BLOCK_64X64; + } + return BLOCK_128X128; +} + +void av1_setup_frame(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + // Set up entropy context depending on frame type. The decoder mandates + // the use of the default context, index 0, for keyframes and inter + // frames where the error_resilient_mode or intra_only flag is set. For + // other inter-frames the encoder currently uses only two contexts; + // context 1 for ALTREF frames and context 0 for the others. 
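+  // (Illustrative note, not from upstream: in bitstream terms, using the
+  // default context corresponds to signaling primary_ref_frame as
+  // PRIMARY_REF_NONE, which tells the decoder to load the default CDFs
+  // instead of the frame context saved by a previous reference frame.)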
+ + if (frame_is_intra_only(cm) || cm->features.error_resilient_mode || + cpi->ext_flags.use_primary_ref_none) { + av1_setup_past_independence(cm); + } + + if ((cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) || + frame_is_sframe(cm)) { + if (!cpi->ppi->seq_params_locked) { + set_sb_size(cm->seq_params, + av1_select_sb_size(&cpi->oxcf, cm->width, cm->height, + cpi->ppi->number_spatial_layers)); + } + } else { + const RefCntBuffer *const primary_ref_buf = get_primary_ref_frame_buf(cm); + if (primary_ref_buf == NULL) { + av1_setup_past_independence(cm); + cm->seg.update_map = 1; + cm->seg.update_data = 1; + } else { + *cm->fc = primary_ref_buf->frame_context; + } + } + + av1_zero(cm->cur_frame->interp_filter_selected); + cm->prev_frame = get_primary_ref_frame_buf(cm); + cpi->vaq_refresh = 0; +} + +#if !CONFIG_REALTIME_ONLY +static int get_interp_filter_selected(const AV1_COMMON *const cm, + MV_REFERENCE_FRAME ref, + InterpFilter ifilter) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref); + if (buf == NULL) return 0; + return buf->interp_filter_selected[ifilter]; +} + +uint16_t av1_setup_interp_filter_search_mask(AV1_COMP *cpi) { + const AV1_COMMON *const cm = &cpi->common; + int ref_total[REF_FRAMES] = { 0 }; + uint16_t mask = ALLOW_ALL_INTERP_FILT_MASK; + + if (cpi->last_frame_type == KEY_FRAME || cpi->refresh_frame.alt_ref_frame) + return mask; + + for (MV_REFERENCE_FRAME ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) { + for (InterpFilter ifilter = EIGHTTAP_REGULAR; ifilter <= MULTITAP_SHARP; + ++ifilter) { + ref_total[ref] += get_interp_filter_selected(cm, ref, ifilter); + } + } + int ref_total_total = (ref_total[LAST2_FRAME] + ref_total[LAST3_FRAME] + + ref_total[GOLDEN_FRAME] + ref_total[BWDREF_FRAME] + + ref_total[ALTREF2_FRAME] + ref_total[ALTREF_FRAME]); + + for (InterpFilter ifilter = EIGHTTAP_REGULAR; ifilter <= MULTITAP_SHARP; + ++ifilter) { + int last_score = get_interp_filter_selected(cm, LAST_FRAME, ifilter) * 30; + if (ref_total[LAST_FRAME] && last_score <= ref_total[LAST_FRAME]) { + int filter_score = + get_interp_filter_selected(cm, LAST2_FRAME, ifilter) * 20 + + get_interp_filter_selected(cm, LAST3_FRAME, ifilter) * 20 + + get_interp_filter_selected(cm, GOLDEN_FRAME, ifilter) * 20 + + get_interp_filter_selected(cm, BWDREF_FRAME, ifilter) * 10 + + get_interp_filter_selected(cm, ALTREF2_FRAME, ifilter) * 10 + + get_interp_filter_selected(cm, ALTREF_FRAME, ifilter) * 10; + if (filter_score < ref_total_total) { + DUAL_FILTER_TYPE filt_type = ifilter + SWITCHABLE_FILTERS * ifilter; + reset_interp_filter_allowed_mask(&mask, filt_type); + } + } + } + return mask; +} + +#define STRICT_PSNR_DIFF_THRESH 0.9 +// Encode key frame with/without screen content tools to determine whether +// screen content tools should be enabled for this key frame group or not. +// The first encoding is without screen content tools. +// The second encoding is with screen content tools. +// We compare the psnr and frame size to make the decision. 
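+// An illustrative worked example (editorial, not upstream): with
+// STRICT_PSNR_DIFF_THRESH = 0.9, a pass-1 (tools on) luma PSNR of 40.3 dB
+// against a pass-0 (tools off) PSNR of 39.2 dB gives psnr_diff = 1.1 > 0.9,
+// so screen content tools are kept. With psnr_diff = 0.5 and half the pixels
+// coded in palette mode (palette_ratio = 0.5), the ratio test computes
+// 0.5 / 0.5 = 1 <= 4 and also rejects: that test only fires when a small
+// palette share already buys a disproportionately large PSNR gain.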
+static void screen_content_tools_determination(
+    AV1_COMP *cpi, const int allow_screen_content_tools_orig_decision,
+    const int allow_intrabc_orig_decision,
+    const int use_screen_content_tools_orig_decision,
+    const int is_screen_content_type_orig_decision, const int pass,
+    int *projected_size_pass, PSNR_STATS *psnr) {
+  AV1_COMMON *const cm = &cpi->common;
+  FeatureFlags *const features = &cm->features;
+
+#if CONFIG_FPMT_TEST
+  projected_size_pass[pass] =
+      ((cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) &&
+       (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE))
+          ? cpi->ppi->p_rc.temp_projected_frame_size
+          : cpi->rc.projected_frame_size;
+#else
+  projected_size_pass[pass] = cpi->rc.projected_frame_size;
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth;
+  const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
+  aom_calc_highbd_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr[pass],
+                       bit_depth, in_bit_depth);
+#else
+  aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr[pass]);
+#endif
+  if (pass != 1) return;
+
+  const double psnr_diff = psnr[1].psnr[0] - psnr[0].psnr[0];
+  // Calculate % of palette mode to be chosen in a frame from mode decision.
+  const double palette_ratio =
+      (double)cpi->palette_pixel_num / (double)(cm->height * cm->width);
+  const int psnr_diff_is_large = (psnr_diff > STRICT_PSNR_DIFF_THRESH);
+  const int ratio_is_large =
+      ((palette_ratio >= 0.0001) && ((psnr_diff / palette_ratio) > 4));
+  const int is_sc_encoding_much_better = (psnr_diff_is_large || ratio_is_large);
+  if (is_sc_encoding_much_better) {
+    // Use screen content tools, if we get coding gain.
+    features->allow_screen_content_tools = 1;
+    features->allow_intrabc = cpi->intrabc_used;
+    cpi->use_screen_content_tools = 1;
+    cpi->is_screen_content_type = 1;
+  } else {
+    // Use original screen content decision.
+    features->allow_screen_content_tools =
+        allow_screen_content_tools_orig_decision;
+    features->allow_intrabc = allow_intrabc_orig_decision;
+    cpi->use_screen_content_tools = use_screen_content_tools_orig_decision;
+    cpi->is_screen_content_type = is_screen_content_type_orig_decision;
+  }
+}
+
+// Set some encoding parameters to make the encoding process fast.
+// A fixed block partition size and a large q are used.
+static void set_encoding_params_for_screen_content(AV1_COMP *cpi,
+                                                   const int pass) {
+  AV1_COMMON *const cm = &cpi->common;
+  if (pass == 0) {
+    // In the first pass, encode without screen content tools.
+    // Use a high q, and a fixed block size for fast encoding.
+    cm->features.allow_screen_content_tools = 0;
+    cm->features.allow_intrabc = 0;
+    cpi->use_screen_content_tools = 0;
+    cpi->sf.part_sf.partition_search_type = FIXED_PARTITION;
+    cpi->sf.part_sf.fixed_partition_size = BLOCK_32X32;
+    return;
+  }
+  assert(pass == 1);
+  // In the second pass, encode with screen content tools.
+  // Use a high q, and a fixed block size for fast encoding.
+  cm->features.allow_screen_content_tools = 1;
+  // TODO(chengchen): turning intrabc on could lead to a data race issue.
+  // cm->allow_intrabc = 1;
+  cpi->use_screen_content_tools = 1;
+  cpi->sf.part_sf.partition_search_type = FIXED_PARTITION;
+  cpi->sf.part_sf.fixed_partition_size = BLOCK_32X32;
+}
+
+// Determines whether to use screen content tools for the key frame group.
+// This function modifies "cm->features.allow_screen_content_tools",
+// "cm->features.allow_intrabc" and "cpi->use_screen_content_tools".
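+// A minimal usage sketch (editorial, not upstream; `q` stands for the qindex
+// the caller has already picked). The early returns below restrict the trial
+// to key frames on non-realtime, non-superres paths:
+//
+//   if (cm->current_frame.frame_type == KEY_FRAME)
+//     av1_determine_sc_tools_with_encoding(cpi, q);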
+void av1_determine_sc_tools_with_encoding(AV1_COMP *cpi, const int q_orig) { + AV1_COMMON *const cm = &cpi->common; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + const QuantizationCfg *const q_cfg = &oxcf->q_cfg; + // Variables to help determine if we should allow screen content tools. + int projected_size_pass[3] = { 0 }; + PSNR_STATS psnr[3]; + const int is_key_frame = cm->current_frame.frame_type == KEY_FRAME; + const int allow_screen_content_tools_orig_decision = + cm->features.allow_screen_content_tools; + const int allow_intrabc_orig_decision = cm->features.allow_intrabc; + const int use_screen_content_tools_orig_decision = + cpi->use_screen_content_tools; + const int is_screen_content_type_orig_decision = cpi->is_screen_content_type; + // Turn off the encoding trial for forward key frame and superres. + if (cpi->sf.rt_sf.use_nonrd_pick_mode || oxcf->kf_cfg.fwd_kf_enabled || + cpi->superres_mode != AOM_SUPERRES_NONE || oxcf->mode == REALTIME || + use_screen_content_tools_orig_decision || !is_key_frame) { + return; + } + + // TODO(chengchen): multiple encoding for the lossless mode is time consuming. + // Find a better way to determine whether screen content tools should be used + // for lossless coding. + // Use a high q and a fixed partition to do quick encoding. + const int q_for_screen_content_quick_run = + is_lossless_requested(&oxcf->rc_cfg) ? q_orig : AOMMAX(q_orig, 244); + const int partition_search_type_orig = cpi->sf.part_sf.partition_search_type; + const BLOCK_SIZE fixed_partition_block_size_orig = + cpi->sf.part_sf.fixed_partition_size; + + // Setup necessary params for encoding, including frame source, etc. + + cpi->source = av1_realloc_and_scale_if_required( + cm, cpi->unscaled_source, &cpi->scaled_source, cm->features.interp_filter, + 0, false, false, cpi->oxcf.border_in_pixels, cpi->image_pyramid_levels); + if (cpi->unscaled_last_source != NULL) { + cpi->last_source = av1_realloc_and_scale_if_required( + cm, cpi->unscaled_last_source, &cpi->scaled_last_source, + cm->features.interp_filter, 0, false, false, cpi->oxcf.border_in_pixels, + cpi->image_pyramid_levels); + } + + av1_setup_frame(cpi); + + if (cm->seg.enabled) { + if (!cm->seg.update_data && cm->prev_frame) { + segfeatures_copy(&cm->seg, &cm->prev_frame->seg); + cm->seg.enabled = cm->prev_frame->seg.enabled; + } else { + av1_calculate_segdata(&cm->seg); + } + } else { + memset(&cm->seg, 0, sizeof(cm->seg)); + } + segfeatures_copy(&cm->cur_frame->seg, &cm->seg); + cm->cur_frame->seg.enabled = cm->seg.enabled; + + // The two encoding passes aim to help determine whether to use screen + // content tools, with a high q and fixed partition. 
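+  // (Illustrative arithmetic, editorial: for a typical key-frame qindex of,
+  // say, 110, the quick run uses AOMMAX(110, 244) = 244, near the top of the
+  // 0..255 qindex range, so the two trial encodes stay cheap; a lossless
+  // request keeps q_orig so the trial itself remains lossless.)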
+  for (int pass = 0; pass < 2; ++pass) {
+    set_encoding_params_for_screen_content(cpi, pass);
+    av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel,
+                      q_for_screen_content_quick_run,
+                      q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq);
+    av1_set_speed_features_qindex_dependent(cpi, oxcf->speed);
+    av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
+                       cm->seq_params->bit_depth);
+
+    av1_set_variance_partition_thresholds(cpi, q_for_screen_content_quick_run,
+                                          0);
+    // transform / motion compensation build reconstruction frame
+    av1_encode_frame(cpi);
+    // Screen content decision
+    screen_content_tools_determination(
+        cpi, allow_screen_content_tools_orig_decision,
+        allow_intrabc_orig_decision, use_screen_content_tools_orig_decision,
+        is_screen_content_type_orig_decision, pass, projected_size_pass, psnr);
+  }
+
+  // Set partition speed feature back.
+  cpi->sf.part_sf.partition_search_type = partition_search_type_orig;
+  cpi->sf.part_sf.fixed_partition_size = fixed_partition_block_size_orig;
+
+  // Free token related info if screen content coding tools are not enabled.
+  if (!cm->features.allow_screen_content_tools)
+    free_token_info(&cpi->token_info);
+}
+#endif  // CONFIG_REALTIME_ONLY
+
+static void fix_interp_filter(InterpFilter *const interp_filter,
+                              const FRAME_COUNTS *const counts) {
+  if (*interp_filter == SWITCHABLE) {
+    // Check to see if only one of the filters is actually used
+    int count[SWITCHABLE_FILTERS] = { 0 };
+    int num_filters_used = 0;
+    for (int i = 0; i < SWITCHABLE_FILTERS; ++i) {
+      for (int j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
+        count[i] += counts->switchable_interp[j][i];
+      num_filters_used += (count[i] > 0);
+    }
+    if (num_filters_used == 1) {
+      // Only one filter is used. So set the filter at frame level
+      for (int i = 0; i < SWITCHABLE_FILTERS; ++i) {
+        if (count[i]) {
+          *interp_filter = i;
+          break;
+        }
+      }
+    }
+  }
+}
+
+void av1_finalize_encoded_frame(AV1_COMP *const cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  CurrentFrame *const current_frame = &cm->current_frame;
+
+  if (!cm->seq_params->reduced_still_picture_hdr &&
+      encode_show_existing_frame(cm)) {
+    RefCntBuffer *const frame_to_show =
+        cm->ref_frame_map[cpi->existing_fb_idx_to_show];
+
+    if (frame_to_show == NULL) {
+      aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+                         "Buffer does not contain a reconstructed frame");
+    }
+    assert(frame_to_show->ref_count > 0);
+    assign_frame_buffer_p(&cm->cur_frame, frame_to_show);
+  }
+
+  if (!encode_show_existing_frame(cm) &&
+      cm->seq_params->film_grain_params_present &&
+      (cm->show_frame || cm->showable_frame)) {
+    // Copy the current frame's film grain params to its corresponding
+    // RefCntBuffer slot.
+    cm->cur_frame->film_grain_params = cm->film_grain_params;
+
+    // We must update the parameters if this is not an INTER_FRAME
+    if (current_frame->frame_type != INTER_FRAME)
+      cm->cur_frame->film_grain_params.update_parameters = 1;
+
+    // Iterate the random seed for the next frame.
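+    // (Editorial assumption, not stated upstream: the fixed increment and the
+    // 0 -> 7391 remap keep the 16-bit grain seed from reaching 0, which would
+    // leave the grain synthesis shift-register PRNG stuck in its all-zero
+    // state.)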
+ cm->film_grain_params.random_seed += 3381; + if (cm->film_grain_params.random_seed == 0) + cm->film_grain_params.random_seed = 7391; + } + + // Initialise all tiles' contexts from the global frame context + for (int tile_col = 0; tile_col < cm->tiles.cols; tile_col++) { + for (int tile_row = 0; tile_row < cm->tiles.rows; tile_row++) { + const int tile_idx = tile_row * cm->tiles.cols + tile_col; + cpi->tile_data[tile_idx].tctx = *cm->fc; + } + } + + if (!frame_is_intra_only(cm)) + fix_interp_filter(&cm->features.interp_filter, cpi->td.counts); +} + +int av1_is_integer_mv(const YV12_BUFFER_CONFIG *cur_picture, + const YV12_BUFFER_CONFIG *last_picture, + ForceIntegerMVInfo *const force_intpel_info) { + // check use hash ME + int k; + + const int block_size = FORCE_INT_MV_DECISION_BLOCK_SIZE; + const double threshold_current = 0.8; + const double threshold_average = 0.95; + const int max_history_size = 32; + int T = 0; // total block + int C = 0; // match with collocated block + int S = 0; // smooth region but not match with collocated block + + const int pic_width = cur_picture->y_width; + const int pic_height = cur_picture->y_height; + for (int i = 0; i + block_size <= pic_height; i += block_size) { + for (int j = 0; j + block_size <= pic_width; j += block_size) { + const int x_pos = j; + const int y_pos = i; + int match = 1; + T++; + + // check whether collocated block match with current + uint8_t *p_cur = cur_picture->y_buffer; + uint8_t *p_ref = last_picture->y_buffer; + int stride_cur = cur_picture->y_stride; + int stride_ref = last_picture->y_stride; + p_cur += (y_pos * stride_cur + x_pos); + p_ref += (y_pos * stride_ref + x_pos); + + if (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *p16_cur = CONVERT_TO_SHORTPTR(p_cur); + uint16_t *p16_ref = CONVERT_TO_SHORTPTR(p_ref); + for (int tmpY = 0; tmpY < block_size && match; tmpY++) { + for (int tmpX = 0; tmpX < block_size && match; tmpX++) { + if (p16_cur[tmpX] != p16_ref[tmpX]) { + match = 0; + } + } + p16_cur += stride_cur; + p16_ref += stride_ref; + } + } else { + for (int tmpY = 0; tmpY < block_size && match; tmpY++) { + for (int tmpX = 0; tmpX < block_size && match; tmpX++) { + if (p_cur[tmpX] != p_ref[tmpX]) { + match = 0; + } + } + p_cur += stride_cur; + p_ref += stride_ref; + } + } + + if (match) { + C++; + continue; + } + + if (av1_hash_is_horizontal_perfect(cur_picture, block_size, x_pos, + y_pos) || + av1_hash_is_vertical_perfect(cur_picture, block_size, x_pos, y_pos)) { + S++; + continue; + } + } + } + + assert(T > 0); + double cs_rate = ((double)(C + S)) / ((double)(T)); + + force_intpel_info->cs_rate_array[force_intpel_info->rate_index] = cs_rate; + + force_intpel_info->rate_index = + (force_intpel_info->rate_index + 1) % max_history_size; + force_intpel_info->rate_size++; + force_intpel_info->rate_size = + AOMMIN(force_intpel_info->rate_size, max_history_size); + + if (cs_rate < threshold_current) { + return 0; + } + + if (C == T) { + return 1; + } + + double cs_average = 0.0; + + for (k = 0; k < force_intpel_info->rate_size; k++) { + cs_average += force_intpel_info->cs_rate_array[k]; + } + cs_average /= force_intpel_info->rate_size; + + if (cs_average < threshold_average) { + return 0; + } + + if ((T - C - S) < 0) { + return 1; + } + + if (cs_average > 1.01) { + return 1; + } + + return 0; +} + +void av1_set_mb_ssim_rdmult_scaling(AV1_COMP *cpi) { + const CommonModeInfoParams *const mi_params = &cpi->common.mi_params; + const MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + uint8_t *y_buffer = cpi->source->y_buffer; 
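+  // An illustrative worked equation (editorial, not upstream): the fitted
+  // model applied below, f(v) = 67.035434 * (1 - exp(-0.0021489 * v)) +
+  // 17.492222, maps an average 8x8 variance of v = 0 to 17.49 and saturates
+  // toward 84.53 for large v (e.g. f(1000) ~= 67.0 * 0.883 + 17.5 ~= 76.7),
+  // which is why the factors lie in [17.49, 84.53] before the geometric-mean
+  // normalization at the end of this function.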
+  const int y_stride = cpi->source->y_stride;
+  const int block_size = BLOCK_16X16;
+
+  const int num_mi_w = mi_size_wide[block_size];
+  const int num_mi_h = mi_size_high[block_size];
+  const int num_cols = (mi_params->mi_cols + num_mi_w - 1) / num_mi_w;
+  const int num_rows = (mi_params->mi_rows + num_mi_h - 1) / num_mi_h;
+  double log_sum = 0.0;
+
+  // Loop through each 16x16 block.
+  for (int row = 0; row < num_rows; ++row) {
+    for (int col = 0; col < num_cols; ++col) {
+      double var = 0.0, num_of_var = 0.0;
+      const int index = row * num_cols + col;
+
+      // Loop through each 8x8 block.
+      for (int mi_row = row * num_mi_h;
+           mi_row < mi_params->mi_rows && mi_row < (row + 1) * num_mi_h;
+           mi_row += 2) {
+        for (int mi_col = col * num_mi_w;
+             mi_col < mi_params->mi_cols && mi_col < (col + 1) * num_mi_w;
+             mi_col += 2) {
+          struct buf_2d buf;
+          const int row_offset_y = mi_row << 2;
+          const int col_offset_y = mi_col << 2;
+
+          buf.buf = y_buffer + row_offset_y * y_stride + col_offset_y;
+          buf.stride = y_stride;
+
+          var += av1_get_perpixel_variance_facade(cpi, xd, &buf, BLOCK_8X8,
+                                                  AOM_PLANE_Y);
+          num_of_var += 1.0;
+        }
+      }
+      var = var / num_of_var;
+
+      // Curve fitting with an exponential model on all 16x16 blocks from the
+      // midres dataset.
+      var = 67.035434 * (1 - exp(-0.0021489 * var)) + 17.492222;
+
+      // As per the above computation, var will be in the range of
+      // [17.492222, 84.527656], assuming the data type is of infinite
+      // precision. The following assert conservatively checks if var is in the
+      // range of [17.0, 85.0] to avoid any issues due to the precision of the
+      // relevant data type.
+      assert(var > 17.0 && var < 85.0);
+      cpi->ssim_rdmult_scaling_factors[index] = var;
+      log_sum += log(var);
+    }
+  }
+
+  // As log_sum holds the geometric mean, it will be in the range
+  // [17.492222, 84.527656]. Hence, in the below loop, the value of
+  // cpi->ssim_rdmult_scaling_factors[index] would be in the range
+  // [0.2069, 4.8323].
+  log_sum = exp(log_sum / (double)(num_rows * num_cols));
+
+  for (int row = 0; row < num_rows; ++row) {
+    for (int col = 0; col < num_cols; ++col) {
+      const int index = row * num_cols + col;
+      cpi->ssim_rdmult_scaling_factors[index] /= log_sum;
+    }
+  }
+}
+
+// Coding context that only needs to be saved when recode loop includes
+// filtering (deblocking, CDEF, superres post-encode upscale and/or loop
+// restoration).
+static void save_extra_coding_context(AV1_COMP *cpi) {
+  CODING_CONTEXT *const cc = &cpi->coding_context;
+  AV1_COMMON *cm = &cpi->common;
+
+  cc->lf = cm->lf;
+  cc->cdef_info = cm->cdef_info;
+  cc->rc = cpi->rc;
+  cc->mv_stats = cpi->ppi->mv_stats;
+}
+
+void av1_save_all_coding_context(AV1_COMP *cpi) {
+  save_extra_coding_context(cpi);
+  if (!frame_is_intra_only(&cpi->common)) release_scaled_references(cpi);
+}
+
+#if DUMP_RECON_FRAMES == 1
+
+// NOTE(zoeliu): For debug - Output the filtered reconstructed video.
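+// An illustrative companion sketch (editorial, not upstream; assumes the
+// 8-bit 4:2:0 case this dump writes). Each shown frame is appended as packed
+// I420, so frame k starts at byte offset k * (w * h + 2 * (w / 2) * (h / 2))
+// and can be read back with, e.g.:
+//
+//   fseek(f_recon, (long)k * (w * h + 2 * (w / 2) * (h / 2)), SEEK_SET);
+//   fread(y_plane, 1, w * h, f_recon);  // then U and V at (w / 2) x (h / 2)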
+void av1_dump_filtered_recon_frames(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const CurrentFrame *const current_frame = &cm->current_frame; + const YV12_BUFFER_CONFIG *recon_buf = &cm->cur_frame->buf; + + if (recon_buf == NULL) { + printf("Frame %d is not ready.\n", current_frame->frame_number); + return; + } + + static const int flag_list[REF_FRAMES] = { 0, + AOM_LAST_FLAG, + AOM_LAST2_FLAG, + AOM_LAST3_FLAG, + AOM_GOLD_FLAG, + AOM_BWD_FLAG, + AOM_ALT2_FLAG, + AOM_ALT_FLAG }; + printf( + "\n***Frame=%d (frame_offset=%d, show_frame=%d, " + "show_existing_frame=%d) " + "[LAST LAST2 LAST3 GOLDEN BWD ALT2 ALT]=[", + current_frame->frame_number, current_frame->order_hint, cm->show_frame, + cm->show_existing_frame); + for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); + const int ref_offset = buf != NULL ? (int)buf->order_hint : -1; + printf(" %d(%c)", ref_offset, + (cpi->ref_frame_flags & flag_list[ref_frame]) ? 'Y' : 'N'); + } + printf(" ]\n"); + + if (!cm->show_frame) { + printf("Frame %d is a no show frame, so no image dump.\n", + current_frame->frame_number); + return; + } + + int h; + char file_name[256] = "/tmp/enc_filtered_recon.yuv"; + FILE *f_recon = NULL; + + if (current_frame->frame_number == 0) { + if ((f_recon = fopen(file_name, "wb")) == NULL) { + printf("Unable to open file %s to write.\n", file_name); + return; + } + } else { + if ((f_recon = fopen(file_name, "ab")) == NULL) { + printf("Unable to open file %s to append.\n", file_name); + return; + } + } + printf( + "\nFrame=%5d, encode_update_type[%5d]=%1d, frame_offset=%d, " + "show_frame=%d, show_existing_frame=%d, source_alt_ref_active=%d, " + "refresh_alt_ref_frame=%d, " + "y_stride=%4d, uv_stride=%4d, cm->width=%4d, cm->height=%4d\n\n", + current_frame->frame_number, cpi->gf_frame_index, + cpi->ppi->gf_group.update_type[cpi->gf_frame_index], + current_frame->order_hint, cm->show_frame, cm->show_existing_frame, + cpi->rc.source_alt_ref_active, cpi->refresh_frame.alt_ref_frame, + recon_buf->y_stride, recon_buf->uv_stride, cm->width, cm->height); +#if 0 + int ref_frame; + printf("get_ref_frame_map_idx: ["); + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) + printf(" %d", get_ref_frame_map_idx(cm, ref_frame)); + printf(" ]\n"); +#endif // 0 + + // --- Y --- + for (h = 0; h < cm->height; ++h) { + fwrite(&recon_buf->y_buffer[h * recon_buf->y_stride], 1, cm->width, + f_recon); + } + // --- U --- + for (h = 0; h < (cm->height >> 1); ++h) { + fwrite(&recon_buf->u_buffer[h * recon_buf->uv_stride], 1, (cm->width >> 1), + f_recon); + } + // --- V --- + for (h = 0; h < (cm->height >> 1); ++h) { + fwrite(&recon_buf->v_buffer[h * recon_buf->uv_stride], 1, (cm->width >> 1), + f_recon); + } + + fclose(f_recon); +} +#endif // DUMP_RECON_FRAMES diff --git a/third_party/aom/av1/encoder/encoder_utils.h b/third_party/aom/av1/encoder/encoder_utils.h new file mode 100644 index 0000000000..113f62aa59 --- /dev/null +++ b/third_party/aom/av1/encoder/encoder_utils.h @@ -0,0 +1,1141 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODER_UTILS_H_
+#define AOM_AV1_ENCODER_ENCODER_UTILS_H_
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define AM_SEGMENT_ID_INACTIVE 7
+#define AM_SEGMENT_ID_ACTIVE 0
+#define DUMP_RECON_FRAMES 0
+
+extern const int default_tx_type_probs[FRAME_UPDATE_TYPES][TX_SIZES_ALL]
+                                      [TX_TYPES];
+
+extern const int default_obmc_probs[FRAME_UPDATE_TYPES][BLOCK_SIZES_ALL];
+
+extern const int default_warped_probs[FRAME_UPDATE_TYPES];
+
+extern const int default_switchable_interp_probs[FRAME_UPDATE_TYPES]
+                                                [SWITCHABLE_FILTER_CONTEXTS]
+                                                [SWITCHABLE_FILTERS];
+
+// Mark all inactive blocks as active. Other segmentation features may be set,
+// so memset cannot be used; instead, only inactive blocks should be reset.
+static AOM_INLINE void suppress_active_map(AV1_COMP *cpi) {
+  unsigned char *const seg_map = cpi->enc_seg.map;
+  int i;
+  const int num_mis =
+      cpi->common.mi_params.mi_rows * cpi->common.mi_params.mi_cols;
+  if (cpi->active_map.enabled || cpi->active_map.update)
+    for (i = 0; i < num_mis; ++i)
+      if (seg_map[i] == AM_SEGMENT_ID_INACTIVE)
+        seg_map[i] = AM_SEGMENT_ID_ACTIVE;
+}
+
+// Returns 'size' in the number of Mode Info (MI) units. 'size' is either the
+// width or height.
+static AOM_INLINE int size_in_mi(int size) {
+  // Ensure that the decoded width and height are both multiples of
+  // 8 luma pixels (note: this may only be a multiple of 4 chroma pixels if
+  // subsampling is used).
+  // This simplifies the implementation of various experiments,
+  // e.g. cdef, which operates on units of 8x8 luma pixels.
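+  // An illustrative worked example (editorial, not upstream): an MI unit is 4
+  // luma pixels (MI_SIZE_LOG2 == 2), so size_in_mi(1920) = 1920 >> 2 = 480,
+  // while an odd size such as 1081 is first aligned up to 1088 and then gives
+  // 1088 >> 2 = 272 MI units.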
+ const int aligned_size = ALIGN_POWER_OF_TWO(size, 3); + return aligned_size >> MI_SIZE_LOG2; +} + +static AOM_INLINE void set_mb_mi(CommonModeInfoParams *mi_params, int width, + int height) { + mi_params->mi_cols = size_in_mi(width); + mi_params->mi_rows = size_in_mi(height); + mi_params->mi_stride = calc_mi_size(mi_params->mi_cols); + + mi_params->mb_cols = ROUND_POWER_OF_TWO(mi_params->mi_cols, 2); + mi_params->mb_rows = ROUND_POWER_OF_TWO(mi_params->mi_rows, 2); + mi_params->MBs = mi_params->mb_rows * mi_params->mb_cols; + + const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize]; + mi_params->mi_alloc_stride = + (mi_params->mi_stride + mi_alloc_size_1d - 1) / mi_alloc_size_1d; + + assert(mi_size_wide[mi_params->mi_alloc_bsize] == + mi_size_high[mi_params->mi_alloc_bsize]); +} + +static AOM_INLINE void enc_free_mi(CommonModeInfoParams *mi_params) { + aom_free(mi_params->mi_alloc); + mi_params->mi_alloc = NULL; + mi_params->mi_alloc_size = 0; + aom_free(mi_params->mi_grid_base); + mi_params->mi_grid_base = NULL; + mi_params->mi_grid_size = 0; + aom_free(mi_params->tx_type_map); + mi_params->tx_type_map = NULL; +} + +static AOM_INLINE void enc_set_mb_mi(CommonModeInfoParams *mi_params, int width, + int height, + BLOCK_SIZE min_partition_size) { + mi_params->mi_alloc_bsize = min_partition_size; + + set_mb_mi(mi_params, width, height); +} + +static AOM_INLINE void stat_stage_set_mb_mi(CommonModeInfoParams *mi_params, + int width, int height, + BLOCK_SIZE min_partition_size) { + (void)min_partition_size; + mi_params->mi_alloc_bsize = BLOCK_16X16; + + set_mb_mi(mi_params, width, height); +} + +static AOM_INLINE void enc_setup_mi(CommonModeInfoParams *mi_params) { + const int mi_grid_size = + mi_params->mi_stride * calc_mi_size(mi_params->mi_rows); + memset(mi_params->mi_alloc, 0, + mi_params->mi_alloc_size * sizeof(*mi_params->mi_alloc)); + memset(mi_params->mi_grid_base, 0, + mi_grid_size * sizeof(*mi_params->mi_grid_base)); + memset(mi_params->tx_type_map, 0, + mi_grid_size * sizeof(*mi_params->tx_type_map)); +} + +static AOM_INLINE void init_buffer_indices( + ForceIntegerMVInfo *const force_intpel_info, int *const remapped_ref_idx) { + int fb_idx; + for (fb_idx = 0; fb_idx < REF_FRAMES; ++fb_idx) + remapped_ref_idx[fb_idx] = fb_idx; + force_intpel_info->rate_index = 0; + force_intpel_info->rate_size = 0; +} + +#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, SDX3DF, JSDAF, JSVAF) \ + ppi->fn_ptr[BT].sdf = SDF; \ + ppi->fn_ptr[BT].sdaf = SDAF; \ + ppi->fn_ptr[BT].vf = VF; \ + ppi->fn_ptr[BT].svf = SVF; \ + ppi->fn_ptr[BT].svaf = SVAF; \ + ppi->fn_ptr[BT].sdx4df = SDX4DF; \ + ppi->fn_ptr[BT].sdx3df = SDX3DF; \ + ppi->fn_ptr[BT].jsdaf = JSDAF; \ + ppi->fn_ptr[BT].jsvaf = JSVAF; + +#define HIGHBD_BFP_WRAPPER(WIDTH, HEIGHT, BD) \ + HIGHBD_BFP( \ + BLOCK_##WIDTH##X##HEIGHT, aom_highbd_sad##WIDTH##x##HEIGHT##_bits##BD, \ + aom_highbd_sad##WIDTH##x##HEIGHT##_avg_bits##BD, \ + aom_highbd_##BD##_variance##WIDTH##x##HEIGHT, \ + aom_highbd_##BD##_sub_pixel_variance##WIDTH##x##HEIGHT, \ + aom_highbd_##BD##_sub_pixel_avg_variance##WIDTH##x##HEIGHT, \ + aom_highbd_sad##WIDTH##x##HEIGHT##x4d_bits##BD, \ + aom_highbd_sad##WIDTH##x##HEIGHT##x3d_bits##BD, \ + aom_highbd_dist_wtd_sad##WIDTH##x##HEIGHT##_avg_bits##BD, \ + aom_highbd_##BD##_dist_wtd_sub_pixel_avg_variance##WIDTH##x##HEIGHT) + +#define MAKE_BFP_SAD_WRAPPER(fnname) \ + static unsigned int fnname##_bits8(const uint8_t *src_ptr, \ + int source_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + return fnname(src_ptr, 
source_stride, ref_ptr, ref_stride); \ + } \ + static unsigned int fnname##_bits10( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 2; \ + } \ + static unsigned int fnname##_bits12( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 4; \ + } + +#define MAKE_BFP_SADAVG_WRAPPER(fnname) \ + static unsigned int fnname##_bits8( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred); \ + } \ + static unsigned int fnname##_bits10( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred) >> \ + 2; \ + } \ + static unsigned int fnname##_bits12( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred) >> \ + 4; \ + } + +#define MAKE_BFP_SAD4D_WRAPPER(fnname) \ + static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \ + const uint8_t *const ref_ptr[], int ref_stride, \ + unsigned int *sad_array) { \ + fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ + } \ + static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \ + const uint8_t *const ref_ptr[], int ref_stride, \ + unsigned int *sad_array) { \ + int i; \ + fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ + for (i = 0; i < 4; i++) sad_array[i] >>= 2; \ + } \ + static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \ + const uint8_t *const ref_ptr[], int ref_stride, \ + unsigned int *sad_array) { \ + int i; \ + fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ + for (i = 0; i < 4; i++) sad_array[i] >>= 4; \ + } + +#define MAKE_BFP_JSADAVG_WRAPPER(fnname) \ + static unsigned int fnname##_bits8( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred, \ + const DIST_WTD_COMP_PARAMS *jcp_param) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \ + jcp_param); \ + } \ + static unsigned int fnname##_bits10( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred, \ + const DIST_WTD_COMP_PARAMS *jcp_param) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \ + jcp_param) >> \ + 2; \ + } \ + static unsigned int fnname##_bits12( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred, \ + const DIST_WTD_COMP_PARAMS *jcp_param) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \ + jcp_param) >> \ + 4; \ + } + +#if CONFIG_AV1_HIGHBITDEPTH +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x128) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x128_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x128x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x128x3d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x64) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x64_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x64x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x64x3d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x128) 
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x128_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x128x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x128x3d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x16) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x16_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x16x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x16x3d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x32) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x32_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x32x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x32x3d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x32) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x32_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x32x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x32x3d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x64) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x64_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x64x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x64x3d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x32) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x32_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x32x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x32x3d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x64) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x64_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x64x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x64x3d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x16) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x16_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x16x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x16x3d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x8) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x8_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x8x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x8x3d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x16) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x16_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x16x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x16x3d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x8) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x8_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x8x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x8x3d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x4) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x4_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x4x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x4x3d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x8) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x8_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x8x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x8x3d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x4) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x4_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x4x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x4x3d) + +#if !CONFIG_REALTIME_ONLY +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x16) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x16_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x16x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x16x3d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x4) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x4_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x4x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x4x3d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x32) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x32_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x32x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x32x3d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x8) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x8_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x8x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x8x3d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x64) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x64_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x64x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x64x3d) 
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x16) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x16_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x16x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x16x3d) +#endif + +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad128x128_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad128x64_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x128_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x16_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x32_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x32_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x64_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x32_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x64_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x16_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x8_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x16_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x8_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x4_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x8_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x4_avg) +#if !CONFIG_REALTIME_ONLY +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x16_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x4_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x32_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x8_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x64_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x16_avg) +#endif +#endif // CONFIG_AV1_HIGHBITDEPTH + +#define HIGHBD_MBFP(BT, MCSDF, MCSVF) \ + ppi->fn_ptr[BT].msdf = MCSDF; \ + ppi->fn_ptr[BT].msvf = MCSVF; + +#define HIGHBD_MBFP_WRAPPER(WIDTH, HEIGHT, BD) \ + HIGHBD_MBFP(BLOCK_##WIDTH##X##HEIGHT, \ + aom_highbd_masked_sad##WIDTH##x##HEIGHT##_bits##BD, \ + aom_highbd_##BD##_masked_sub_pixel_variance##WIDTH##x##HEIGHT) + +#define MAKE_MBFP_COMPOUND_SAD_WRAPPER(fnname) \ + static unsigned int fnname##_bits8( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m, \ + int m_stride, int invert_mask) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \ + second_pred_ptr, m, m_stride, invert_mask); \ + } \ + static unsigned int fnname##_bits10( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m, \ + int m_stride, int invert_mask) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \ + second_pred_ptr, m, m_stride, invert_mask) >> \ + 2; \ + } \ + static unsigned int fnname##_bits12( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m, \ + int m_stride, int invert_mask) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \ + second_pred_ptr, m, m_stride, invert_mask) >> \ + 4; \ + } + +#if CONFIG_AV1_HIGHBITDEPTH +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x128) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x64) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x128) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x64) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x32) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x64) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x32) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x16) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x32) 
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x16) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x8) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x16) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x8) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x4) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x8) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x4) +#if !CONFIG_REALTIME_ONLY +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x16) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x4) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x32) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x8) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x64) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x16) +#endif +#endif + +#define HIGHBD_SDSFP(BT, SDSF, SDSX4DF) \ + ppi->fn_ptr[BT].sdsf = SDSF; \ + ppi->fn_ptr[BT].sdsx4df = SDSX4DF; + +#define HIGHBD_SDSFP_WRAPPER(WIDTH, HEIGHT, BD) \ + HIGHBD_SDSFP(BLOCK_##WIDTH##X##HEIGHT, \ + aom_highbd_sad_skip_##WIDTH##x##HEIGHT##_bits##BD, \ + aom_highbd_sad_skip_##WIDTH##x##HEIGHT##x4d##_bits##BD) + +#define MAKE_SDSF_SKIP_SAD_WRAPPER(fnname) \ + static unsigned int fnname##_bits8(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return fnname(src, src_stride, ref, ref_stride); \ + } \ + static unsigned int fnname##_bits10(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return fnname(src, src_stride, ref, ref_stride) >> 2; \ + } \ + static unsigned int fnname##_bits12(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return fnname(src, src_stride, ref, ref_stride) >> 4; \ + } + +#define MAKE_SDSF_SKIP_SAD_4D_WRAPPER(fnname) \ + static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \ + const uint8_t *const ref_ptr[], int ref_stride, \ + unsigned int *sad_array) { \ + fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ + } \ + static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \ + const uint8_t *const ref_ptr[], int ref_stride, \ + unsigned int *sad_array) { \ + int i; \ + fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ + for (i = 0; i < 4; i++) sad_array[i] >>= 2; \ + } \ + static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \ + const uint8_t *const ref_ptr[], int ref_stride, \ + unsigned int *sad_array) { \ + int i; \ + fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ + for (i = 0; i < 4; i++) sad_array[i] >>= 4; \ + } + +#if CONFIG_AV1_HIGHBITDEPTH +MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_128x128) +MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_128x64) +MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_64x128) +MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_64x64) +MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_64x32) +MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_32x64) +MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_32x32) +MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_32x16) +MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_16x32) +MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_16x16) +MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_16x8) +MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_8x16) +MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_8x8) +MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_4x8) + +#if !CONFIG_REALTIME_ONLY +MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_64x16) +MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_32x8) 
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_16x64) +MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_4x16) +MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_8x32) +#endif + +MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_128x128x4d) +MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_128x64x4d) +MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_64x128x4d) +MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_64x64x4d) +MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_64x32x4d) +MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_32x64x4d) +MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_32x32x4d) +MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_32x16x4d) +MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_16x32x4d) +MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_16x16x4d) +MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_16x8x4d) +MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_8x16x4d) +MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_8x8x4d) +MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_4x8x4d) + +#if !CONFIG_REALTIME_ONLY +MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_64x16x4d) +MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_32x8x4d) +MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_16x64x4d) +MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_4x16x4d) +MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_8x32x4d) +#endif +#endif + +#if !CONFIG_REALTIME_ONLY + +#if CONFIG_AV1_HIGHBITDEPTH +#define HIGHBD_OBFP_WRAPPER_8(WIDTH, HEIGHT) \ + HIGHBD_OBFP(BLOCK_##WIDTH##X##HEIGHT, \ + aom_highbd_obmc_sad##WIDTH##x##HEIGHT##_bits8, \ + aom_highbd_8_obmc_variance##WIDTH##x##HEIGHT, \ + aom_highbd_8_obmc_sub_pixel_variance##WIDTH##x##HEIGHT) + +#define HIGHBD_OBFP(BT, OSDF, OVF, OSVF) \ + ppi->fn_ptr[BT].osdf = OSDF; \ + ppi->fn_ptr[BT].ovf = OVF; \ + ppi->fn_ptr[BT].osvf = OSVF; + +#define HIGHBD_OBFP_WRAPPER(WIDTH, HEIGHT, BD) \ + HIGHBD_OBFP(BLOCK_##WIDTH##X##HEIGHT, \ + aom_highbd_obmc_sad##WIDTH##x##HEIGHT##_bits##BD, \ + aom_highbd_##BD##_obmc_variance##WIDTH##x##HEIGHT, \ + aom_highbd_##BD##_obmc_sub_pixel_variance##WIDTH##x##HEIGHT) + +#define MAKE_OBFP_SAD_WRAPPER(fnname) \ + static unsigned int fnname##_bits8(const uint8_t *ref, int ref_stride, \ + const int32_t *wsrc, \ + const int32_t *msk) { \ + return fnname(ref, ref_stride, wsrc, msk); \ + } \ + static unsigned int fnname##_bits10(const uint8_t *ref, int ref_stride, \ + const int32_t *wsrc, \ + const int32_t *msk) { \ + return fnname(ref, ref_stride, wsrc, msk) >> 2; \ + } \ + static unsigned int fnname##_bits12(const uint8_t *ref, int ref_stride, \ + const int32_t *wsrc, \ + const int32_t *msk) { \ + return fnname(ref, ref_stride, wsrc, msk) >> 4; \ + } +#endif // CONFIG_AV1_HIGHBITDEPTH +#endif // !CONFIG_REALTIME_ONLY + +#if CONFIG_AV1_HIGHBITDEPTH +#if !CONFIG_REALTIME_ONLY +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x128) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x64) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x128) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x64) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x32) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x64) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x32) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x16) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x32) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x16) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x8) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x16) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x8) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x4) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x8) 
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x4) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x16) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x4) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x32) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x8) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x64) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x16) +#endif + +static AOM_INLINE void highbd_set_var_fns(AV1_PRIMARY *const ppi) { + SequenceHeader *const seq_params = &ppi->seq_params; + if (seq_params->use_highbitdepth) { + switch (seq_params->bit_depth) { + case AOM_BITS_8: +#if !CONFIG_REALTIME_ONLY + HIGHBD_BFP_WRAPPER(64, 16, 8) + HIGHBD_BFP_WRAPPER(16, 64, 8) + HIGHBD_BFP_WRAPPER(32, 8, 8) + HIGHBD_BFP_WRAPPER(8, 32, 8) + HIGHBD_BFP_WRAPPER(16, 4, 8) + HIGHBD_BFP_WRAPPER(4, 16, 8) +#endif + HIGHBD_BFP_WRAPPER(32, 16, 8) + HIGHBD_BFP_WRAPPER(16, 32, 8) + HIGHBD_BFP_WRAPPER(64, 32, 8) + HIGHBD_BFP_WRAPPER(32, 64, 8) + HIGHBD_BFP_WRAPPER(32, 32, 8) + HIGHBD_BFP_WRAPPER(64, 64, 8) + HIGHBD_BFP_WRAPPER(16, 16, 8) + HIGHBD_BFP_WRAPPER(16, 8, 8) + HIGHBD_BFP_WRAPPER(8, 16, 8) + HIGHBD_BFP_WRAPPER(8, 8, 8) + HIGHBD_BFP_WRAPPER(8, 4, 8) + HIGHBD_BFP_WRAPPER(4, 8, 8) + HIGHBD_BFP_WRAPPER(4, 4, 8) + HIGHBD_BFP_WRAPPER(128, 128, 8) + HIGHBD_BFP_WRAPPER(128, 64, 8) + HIGHBD_BFP_WRAPPER(64, 128, 8) + + HIGHBD_MBFP_WRAPPER(128, 128, 8) + HIGHBD_MBFP_WRAPPER(128, 64, 8) + HIGHBD_MBFP_WRAPPER(64, 128, 8) + HIGHBD_MBFP_WRAPPER(64, 64, 8) + HIGHBD_MBFP_WRAPPER(64, 32, 8) + HIGHBD_MBFP_WRAPPER(32, 64, 8) + HIGHBD_MBFP_WRAPPER(32, 32, 8) + HIGHBD_MBFP_WRAPPER(32, 16, 8) + HIGHBD_MBFP_WRAPPER(16, 32, 8) + HIGHBD_MBFP_WRAPPER(16, 16, 8) + HIGHBD_MBFP_WRAPPER(8, 16, 8) + HIGHBD_MBFP_WRAPPER(16, 8, 8) + HIGHBD_MBFP_WRAPPER(8, 8, 8) + HIGHBD_MBFP_WRAPPER(4, 8, 8) + HIGHBD_MBFP_WRAPPER(8, 4, 8) + HIGHBD_MBFP_WRAPPER(4, 4, 8) +#if !CONFIG_REALTIME_ONLY + HIGHBD_MBFP_WRAPPER(64, 16, 8) + HIGHBD_MBFP_WRAPPER(16, 64, 8) + HIGHBD_MBFP_WRAPPER(32, 8, 8) + HIGHBD_MBFP_WRAPPER(8, 32, 8) + HIGHBD_MBFP_WRAPPER(16, 4, 8) + HIGHBD_MBFP_WRAPPER(4, 16, 8) +#endif + +// OBMC excluded from realtime only build. 
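+// (Illustrative note, not from upstream, on the _bits8/_bits10/_bits12 tables
+// wired up here: the MAKE_*_WRAPPER macros above normalize high-bit-depth
+// SADs back to an 8-bit scale, shifting right by 2 for 10-bit and by 4 for
+// 12-bit input. E.g. a flat per-pixel difference of 512 at 10-bit corresponds
+// to 128 at 8-bit, and 512 >> 2 = 128, so distortion thresholds tuned for
+// 8-bit remain valid.)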
+#if !CONFIG_REALTIME_ONLY + HIGHBD_OBFP_WRAPPER_8(128, 128) + HIGHBD_OBFP_WRAPPER_8(128, 64) + HIGHBD_OBFP_WRAPPER_8(64, 128) + HIGHBD_OBFP_WRAPPER_8(64, 64) + HIGHBD_OBFP_WRAPPER_8(64, 32) + HIGHBD_OBFP_WRAPPER_8(32, 64) + HIGHBD_OBFP_WRAPPER_8(32, 32) + HIGHBD_OBFP_WRAPPER_8(32, 16) + HIGHBD_OBFP_WRAPPER_8(16, 32) + HIGHBD_OBFP_WRAPPER_8(16, 16) + HIGHBD_OBFP_WRAPPER_8(8, 16) + HIGHBD_OBFP_WRAPPER_8(16, 8) + HIGHBD_OBFP_WRAPPER_8(8, 8) + HIGHBD_OBFP_WRAPPER_8(4, 8) + HIGHBD_OBFP_WRAPPER_8(8, 4) + HIGHBD_OBFP_WRAPPER_8(4, 4) + HIGHBD_OBFP_WRAPPER_8(64, 16) + HIGHBD_OBFP_WRAPPER_8(16, 64) + HIGHBD_OBFP_WRAPPER_8(32, 8) + HIGHBD_OBFP_WRAPPER_8(8, 32) + HIGHBD_OBFP_WRAPPER_8(16, 4) + HIGHBD_OBFP_WRAPPER_8(4, 16) +#endif + + HIGHBD_SDSFP_WRAPPER(128, 128, 8) + HIGHBD_SDSFP_WRAPPER(128, 64, 8) + HIGHBD_SDSFP_WRAPPER(64, 128, 8) + HIGHBD_SDSFP_WRAPPER(64, 64, 8) + HIGHBD_SDSFP_WRAPPER(64, 32, 8) + HIGHBD_SDSFP_WRAPPER(32, 64, 8) + HIGHBD_SDSFP_WRAPPER(32, 32, 8) + HIGHBD_SDSFP_WRAPPER(32, 16, 8) + HIGHBD_SDSFP_WRAPPER(16, 32, 8) + HIGHBD_SDSFP_WRAPPER(16, 16, 8) + HIGHBD_SDSFP_WRAPPER(16, 8, 8) + HIGHBD_SDSFP_WRAPPER(8, 16, 8) + HIGHBD_SDSFP_WRAPPER(8, 8, 8) + HIGHBD_SDSFP_WRAPPER(4, 8, 8) +#if !CONFIG_REALTIME_ONLY + HIGHBD_SDSFP_WRAPPER(64, 16, 8) + HIGHBD_SDSFP_WRAPPER(32, 8, 8) + HIGHBD_SDSFP_WRAPPER(16, 64, 8) + HIGHBD_SDSFP_WRAPPER(8, 32, 8) + HIGHBD_SDSFP_WRAPPER(4, 16, 8) +#endif + break; + + case AOM_BITS_10: +#if !CONFIG_REALTIME_ONLY + HIGHBD_BFP_WRAPPER(64, 16, 10) + HIGHBD_BFP_WRAPPER(16, 64, 10) + HIGHBD_BFP_WRAPPER(32, 8, 10) + HIGHBD_BFP_WRAPPER(8, 32, 10) + HIGHBD_BFP_WRAPPER(16, 4, 10) + HIGHBD_BFP_WRAPPER(4, 16, 10) +#endif + HIGHBD_BFP_WRAPPER(32, 16, 10) + HIGHBD_BFP_WRAPPER(16, 32, 10) + HIGHBD_BFP_WRAPPER(64, 32, 10) + HIGHBD_BFP_WRAPPER(32, 64, 10) + HIGHBD_BFP_WRAPPER(32, 32, 10) + HIGHBD_BFP_WRAPPER(64, 64, 10) + HIGHBD_BFP_WRAPPER(16, 16, 10) + HIGHBD_BFP_WRAPPER(16, 8, 10) + HIGHBD_BFP_WRAPPER(8, 16, 10) + HIGHBD_BFP_WRAPPER(8, 8, 10) + HIGHBD_BFP_WRAPPER(8, 4, 10) + HIGHBD_BFP_WRAPPER(4, 8, 10) + HIGHBD_BFP_WRAPPER(4, 4, 10) + HIGHBD_BFP_WRAPPER(128, 128, 10) + HIGHBD_BFP_WRAPPER(128, 64, 10) + HIGHBD_BFP_WRAPPER(64, 128, 10) + + HIGHBD_MBFP_WRAPPER(128, 128, 10) + HIGHBD_MBFP_WRAPPER(128, 64, 10) + HIGHBD_MBFP_WRAPPER(64, 128, 10) + HIGHBD_MBFP_WRAPPER(64, 64, 10) + HIGHBD_MBFP_WRAPPER(64, 32, 10) + HIGHBD_MBFP_WRAPPER(32, 64, 10) + HIGHBD_MBFP_WRAPPER(32, 32, 10) + HIGHBD_MBFP_WRAPPER(32, 16, 10) + HIGHBD_MBFP_WRAPPER(16, 32, 10) + HIGHBD_MBFP_WRAPPER(16, 16, 10) + HIGHBD_MBFP_WRAPPER(8, 16, 10) + HIGHBD_MBFP_WRAPPER(16, 8, 10) + HIGHBD_MBFP_WRAPPER(8, 8, 10) + HIGHBD_MBFP_WRAPPER(4, 8, 10) + HIGHBD_MBFP_WRAPPER(8, 4, 10) + HIGHBD_MBFP_WRAPPER(4, 4, 10) +#if !CONFIG_REALTIME_ONLY + HIGHBD_MBFP_WRAPPER(64, 16, 10) + HIGHBD_MBFP_WRAPPER(16, 64, 10) + HIGHBD_MBFP_WRAPPER(32, 8, 10) + HIGHBD_MBFP_WRAPPER(8, 32, 10) + HIGHBD_MBFP_WRAPPER(16, 4, 10) + HIGHBD_MBFP_WRAPPER(4, 16, 10) +#endif + +// OBMC excluded from realtime only build. 
+#if !CONFIG_REALTIME_ONLY + HIGHBD_OBFP_WRAPPER(128, 128, 10) + HIGHBD_OBFP_WRAPPER(128, 64, 10) + HIGHBD_OBFP_WRAPPER(64, 128, 10) + HIGHBD_OBFP_WRAPPER(64, 64, 10) + HIGHBD_OBFP_WRAPPER(64, 32, 10) + HIGHBD_OBFP_WRAPPER(32, 64, 10) + HIGHBD_OBFP_WRAPPER(32, 32, 10) + HIGHBD_OBFP_WRAPPER(32, 16, 10) + HIGHBD_OBFP_WRAPPER(16, 32, 10) + HIGHBD_OBFP_WRAPPER(16, 16, 10) + HIGHBD_OBFP_WRAPPER(8, 16, 10) + HIGHBD_OBFP_WRAPPER(16, 8, 10) + HIGHBD_OBFP_WRAPPER(8, 8, 10) + HIGHBD_OBFP_WRAPPER(4, 8, 10) + HIGHBD_OBFP_WRAPPER(8, 4, 10) + HIGHBD_OBFP_WRAPPER(4, 4, 10) + HIGHBD_OBFP_WRAPPER(64, 16, 10) + HIGHBD_OBFP_WRAPPER(16, 64, 10) + HIGHBD_OBFP_WRAPPER(32, 8, 10) + HIGHBD_OBFP_WRAPPER(8, 32, 10) + HIGHBD_OBFP_WRAPPER(16, 4, 10) + HIGHBD_OBFP_WRAPPER(4, 16, 10) +#endif + + HIGHBD_SDSFP_WRAPPER(128, 128, 10) + HIGHBD_SDSFP_WRAPPER(128, 64, 10) + HIGHBD_SDSFP_WRAPPER(64, 128, 10) + HIGHBD_SDSFP_WRAPPER(64, 64, 10) + HIGHBD_SDSFP_WRAPPER(64, 32, 10) + HIGHBD_SDSFP_WRAPPER(32, 64, 10) + HIGHBD_SDSFP_WRAPPER(32, 32, 10) + HIGHBD_SDSFP_WRAPPER(32, 16, 10) + HIGHBD_SDSFP_WRAPPER(16, 32, 10) + HIGHBD_SDSFP_WRAPPER(16, 16, 10) + HIGHBD_SDSFP_WRAPPER(16, 8, 10) + HIGHBD_SDSFP_WRAPPER(8, 16, 10) + HIGHBD_SDSFP_WRAPPER(8, 8, 10) + HIGHBD_SDSFP_WRAPPER(4, 8, 10) + +#if !CONFIG_REALTIME_ONLY + HIGHBD_SDSFP_WRAPPER(64, 16, 10) + HIGHBD_SDSFP_WRAPPER(32, 8, 10) + HIGHBD_SDSFP_WRAPPER(16, 64, 10) + HIGHBD_SDSFP_WRAPPER(8, 32, 10) + HIGHBD_SDSFP_WRAPPER(4, 16, 10) +#endif + break; + + case AOM_BITS_12: +#if !CONFIG_REALTIME_ONLY + HIGHBD_BFP_WRAPPER(64, 16, 12) + HIGHBD_BFP_WRAPPER(16, 64, 12) + HIGHBD_BFP_WRAPPER(32, 8, 12) + HIGHBD_BFP_WRAPPER(8, 32, 12) + HIGHBD_BFP_WRAPPER(16, 4, 12) + HIGHBD_BFP_WRAPPER(4, 16, 12) +#endif + HIGHBD_BFP_WRAPPER(32, 16, 12) + HIGHBD_BFP_WRAPPER(16, 32, 12) + HIGHBD_BFP_WRAPPER(64, 32, 12) + HIGHBD_BFP_WRAPPER(32, 64, 12) + HIGHBD_BFP_WRAPPER(32, 32, 12) + HIGHBD_BFP_WRAPPER(64, 64, 12) + HIGHBD_BFP_WRAPPER(16, 16, 12) + HIGHBD_BFP_WRAPPER(16, 8, 12) + HIGHBD_BFP_WRAPPER(8, 16, 12) + HIGHBD_BFP_WRAPPER(8, 8, 12) + HIGHBD_BFP_WRAPPER(8, 4, 12) + HIGHBD_BFP_WRAPPER(4, 8, 12) + HIGHBD_BFP_WRAPPER(4, 4, 12) + HIGHBD_BFP_WRAPPER(128, 128, 12) + HIGHBD_BFP_WRAPPER(128, 64, 12) + HIGHBD_BFP_WRAPPER(64, 128, 12) + + HIGHBD_MBFP_WRAPPER(128, 128, 12) + HIGHBD_MBFP_WRAPPER(128, 64, 12) + HIGHBD_MBFP_WRAPPER(64, 128, 12) + HIGHBD_MBFP_WRAPPER(64, 64, 12) + HIGHBD_MBFP_WRAPPER(64, 32, 12) + HIGHBD_MBFP_WRAPPER(32, 64, 12) + HIGHBD_MBFP_WRAPPER(32, 32, 12) + HIGHBD_MBFP_WRAPPER(32, 16, 12) + HIGHBD_MBFP_WRAPPER(16, 32, 12) + HIGHBD_MBFP_WRAPPER(16, 16, 12) + HIGHBD_MBFP_WRAPPER(8, 16, 12) + HIGHBD_MBFP_WRAPPER(16, 8, 12) + HIGHBD_MBFP_WRAPPER(8, 8, 12) + HIGHBD_MBFP_WRAPPER(4, 8, 12) + HIGHBD_MBFP_WRAPPER(8, 4, 12) + HIGHBD_MBFP_WRAPPER(4, 4, 12) +#if !CONFIG_REALTIME_ONLY + HIGHBD_MBFP_WRAPPER(64, 16, 12) + HIGHBD_MBFP_WRAPPER(16, 64, 12) + HIGHBD_MBFP_WRAPPER(32, 8, 12) + HIGHBD_MBFP_WRAPPER(8, 32, 12) + HIGHBD_MBFP_WRAPPER(16, 4, 12) + HIGHBD_MBFP_WRAPPER(4, 16, 12) +#endif + +// OBMC excluded from realtime only build. 
+#if !CONFIG_REALTIME_ONLY + HIGHBD_OBFP_WRAPPER(128, 128, 12) + HIGHBD_OBFP_WRAPPER(128, 64, 12) + HIGHBD_OBFP_WRAPPER(64, 128, 12) + HIGHBD_OBFP_WRAPPER(64, 64, 12) + HIGHBD_OBFP_WRAPPER(64, 32, 12) + HIGHBD_OBFP_WRAPPER(32, 64, 12) + HIGHBD_OBFP_WRAPPER(32, 32, 12) + HIGHBD_OBFP_WRAPPER(32, 16, 12) + HIGHBD_OBFP_WRAPPER(16, 32, 12) + HIGHBD_OBFP_WRAPPER(16, 16, 12) + HIGHBD_OBFP_WRAPPER(8, 16, 12) + HIGHBD_OBFP_WRAPPER(16, 8, 12) + HIGHBD_OBFP_WRAPPER(8, 8, 12) + HIGHBD_OBFP_WRAPPER(4, 8, 12) + HIGHBD_OBFP_WRAPPER(8, 4, 12) + HIGHBD_OBFP_WRAPPER(4, 4, 12) + HIGHBD_OBFP_WRAPPER(64, 16, 12) + HIGHBD_OBFP_WRAPPER(16, 64, 12) + HIGHBD_OBFP_WRAPPER(32, 8, 12) + HIGHBD_OBFP_WRAPPER(8, 32, 12) + HIGHBD_OBFP_WRAPPER(16, 4, 12) + HIGHBD_OBFP_WRAPPER(4, 16, 12) +#endif + + HIGHBD_SDSFP_WRAPPER(128, 128, 12) + HIGHBD_SDSFP_WRAPPER(128, 64, 12) + HIGHBD_SDSFP_WRAPPER(64, 128, 12) + HIGHBD_SDSFP_WRAPPER(64, 64, 12) + HIGHBD_SDSFP_WRAPPER(64, 32, 12) + HIGHBD_SDSFP_WRAPPER(32, 64, 12) + HIGHBD_SDSFP_WRAPPER(32, 32, 12) + HIGHBD_SDSFP_WRAPPER(32, 16, 12) + HIGHBD_SDSFP_WRAPPER(16, 32, 12) + HIGHBD_SDSFP_WRAPPER(16, 16, 12) + HIGHBD_SDSFP_WRAPPER(16, 8, 12) + HIGHBD_SDSFP_WRAPPER(8, 16, 12) + HIGHBD_SDSFP_WRAPPER(8, 8, 12) + HIGHBD_SDSFP_WRAPPER(4, 8, 12) + +#if !CONFIG_REALTIME_ONLY + HIGHBD_SDSFP_WRAPPER(64, 16, 12) + HIGHBD_SDSFP_WRAPPER(32, 8, 12) + HIGHBD_SDSFP_WRAPPER(16, 64, 12) + HIGHBD_SDSFP_WRAPPER(8, 32, 12) + HIGHBD_SDSFP_WRAPPER(4, 16, 12) +#endif + break; + + default: + assert(0 && + "cm->seq_params->bit_depth should be AOM_BITS_8, " + "AOM_BITS_10 or AOM_BITS_12"); + } + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static AOM_INLINE void copy_frame_prob_info(AV1_COMP *cpi) { + FrameProbInfo *const frame_probs = &cpi->ppi->frame_probs; + if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) { + av1_copy(frame_probs->tx_type_probs, default_tx_type_probs); + } + if (cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 && + cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX) { + av1_copy(frame_probs->obmc_probs, default_obmc_probs); + } + if (cpi->sf.inter_sf.prune_warped_prob_thresh > 0) { + av1_copy(frame_probs->warped_probs, default_warped_probs); + } + if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) { + av1_copy(frame_probs->switchable_interp_probs, + default_switchable_interp_probs); + } + +#if CONFIG_FPMT_TEST + if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) { + FrameProbInfo *const temp_frame_probs = &cpi->ppi->temp_frame_probs; + if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) { + av1_copy(temp_frame_probs->tx_type_probs, default_tx_type_probs); + } + if (cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 && + cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX) { + av1_copy(temp_frame_probs->obmc_probs, default_obmc_probs); + } + if (cpi->sf.inter_sf.prune_warped_prob_thresh > 0) { + av1_copy(temp_frame_probs->warped_probs, default_warped_probs); + } + if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) { + av1_copy(temp_frame_probs->switchable_interp_probs, + default_switchable_interp_probs); + } + + FrameProbInfo *const temp_frame_probs_simulation = + &cpi->ppi->temp_frame_probs_simulation; + if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) { + av1_copy(temp_frame_probs_simulation->tx_type_probs, + default_tx_type_probs); + } + if (cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 && + cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX) { + av1_copy(temp_frame_probs_simulation->obmc_probs, default_obmc_probs); + } + if 
(cpi->sf.inter_sf.prune_warped_prob_thresh > 0) {
+      av1_copy(temp_frame_probs_simulation->warped_probs, default_warped_probs);
+    }
+    if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) {
+      av1_copy(temp_frame_probs_simulation->switchable_interp_probs,
+               default_switchable_interp_probs);
+    }
+  }
+#endif
+}
+
+static AOM_INLINE void restore_cdef_coding_context(CdefInfo *const dst,
+                                                   const CdefInfo *const src) {
+  dst->cdef_bits = src->cdef_bits;
+  dst->cdef_damping = src->cdef_damping;
+  av1_copy(dst->cdef_strengths, src->cdef_strengths);
+  av1_copy(dst->cdef_uv_strengths, src->cdef_uv_strengths);
+  dst->nb_cdef_strengths = src->nb_cdef_strengths;
+}
+
+// Coding context that only needs to be restored when the recode loop includes
+// filtering (deblocking, CDEF, superres post-encode upscale and/or loop
+// restoration).
+static AOM_INLINE void restore_extra_coding_context(AV1_COMP *cpi) {
+  CODING_CONTEXT *const cc = &cpi->coding_context;
+  AV1_COMMON *cm = &cpi->common;
+  cm->lf = cc->lf;
+  restore_cdef_coding_context(&cm->cdef_info, &cc->cdef_info);
+  cpi->rc = cc->rc;
+  cpi->ppi->mv_stats = cc->mv_stats;
+}
+
+static AOM_INLINE int equal_dimensions_and_border(const YV12_BUFFER_CONFIG *a,
+                                                  const YV12_BUFFER_CONFIG *b) {
+  return a->y_height == b->y_height && a->y_width == b->y_width &&
+         a->uv_height == b->uv_height && a->uv_width == b->uv_width &&
+         a->y_stride == b->y_stride && a->uv_stride == b->uv_stride &&
+         a->border == b->border &&
+         (a->flags & YV12_FLAG_HIGHBITDEPTH) ==
+             (b->flags & YV12_FLAG_HIGHBITDEPTH);
+}
+
+static AOM_INLINE int update_entropy(bool *ext_refresh_frame_context,
+                                     bool *ext_refresh_frame_context_pending,
+                                     bool update) {
+  *ext_refresh_frame_context = update;
+  *ext_refresh_frame_context_pending = 1;
+  return 0;
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Blends a prior boost value with a TPL-derived boost value; the weight given
+// to the prior grows with sqrt(frames_to_key), clamped to
+// [min_factor, max_factor].
+static AOM_INLINE int combine_prior_with_tpl_boost(double min_factor,
+                                                   double max_factor,
+                                                   int prior_boost,
+                                                   int tpl_boost,
+                                                   int frames_to_key) {
+  double factor = sqrt((double)frames_to_key);
+  double range = max_factor - min_factor;
+  factor = AOMMIN(factor, max_factor);
+  factor = AOMMAX(factor, min_factor);
+  factor -= min_factor;
+  int boost =
+      (int)((factor * prior_boost + (range - factor) * tpl_boost) / range);
+  return boost;
+}
+#endif
+
+static AOM_INLINE void set_size_independent_vars(AV1_COMP *cpi) {
+  int i;
+  AV1_COMMON *const cm = &cpi->common;
+  FeatureFlags *const features = &cm->features;
+  for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+    cm->global_motion[i] = default_warp_params;
+  }
+  cpi->gm_info.search_done = 0;
+
+  av1_set_speed_features_framesize_independent(cpi, cpi->speed);
+  av1_set_rd_speed_thresholds(cpi);
+  features->interp_filter = SWITCHABLE;
+  features->switchable_motion_mode = is_switchable_motion_mode_allowed(
+      features->allow_warped_motion, cpi->oxcf.motion_mode_cfg.enable_obmc);
+}
+
+static AOM_INLINE void release_scaled_references(AV1_COMP *cpi) {
+  // Scaled references only need to be released under certain conditions: if
+  // the reference will be updated, or if the scaled reference has the same
+  // resolution. For now, only apply this to Golden for non-SVC RTC mode.
+  AV1_COMMON *const cm = &cpi->common;
+  const bool refresh_golden = (cpi->refresh_frame.golden_frame) ?
1 : 0; + bool release_golden = true; + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + RefCntBuffer *const buf = cpi->scaled_ref_buf[i]; + const int golden_ref = (i == GOLDEN_FRAME - 1); + if (golden_ref && is_one_pass_rt_params(cpi) && !cpi->ppi->use_svc && + buf != NULL) { + const RefCntBuffer *const ref = get_ref_frame_buf(cm, GOLDEN_FRAME); + const bool same_resoln = buf->buf.y_crop_width == ref->buf.y_crop_width && + buf->buf.y_crop_height == ref->buf.y_crop_height; + release_golden = refresh_golden || same_resoln; + } + if (buf != NULL && (!golden_ref || (golden_ref && release_golden))) { + --buf->ref_count; + cpi->scaled_ref_buf[i] = NULL; + } + } +} + +static AOM_INLINE void restore_all_coding_context(AV1_COMP *cpi) { + restore_extra_coding_context(cpi); + if (!frame_is_intra_only(&cpi->common)) release_scaled_references(cpi); +} + +static AOM_INLINE int reduce_num_ref_buffers(const AV1_COMP *cpi) { + const SequenceHeader *const seq_params = cpi->common.seq_params; + return is_one_pass_rt_params(cpi) && + use_rtc_reference_structure_one_layer(cpi) && + (seq_params->order_hint_info.enable_order_hint == 0) && + cpi->rt_reduce_num_ref_buffers; +} + +// Refresh reference frame buffers according to refresh_frame_flags. +static AOM_INLINE void refresh_reference_frames(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + // All buffers are refreshed for shown keyframes and S-frames. + // In case of RT, golden frame refreshes the 6th slot and other reference + // frames refresh slots 0 to 5. Slot 7 is not refreshed by any reference + // frame. Thus, only 7 buffers are refreshed for keyframes and S-frames + // instead of 8. + int num_ref_buffers = REF_FRAMES; + if (reduce_num_ref_buffers(cpi)) { + const int refresh_all_bufs = + (cpi->ppi->gf_group.refbuf_state[cpi->gf_frame_index] == REFBUF_RESET || + frame_is_sframe(cm)); + assert(IMPLIES(((cm->current_frame.refresh_frame_flags >> 7) & 1) == 1, + refresh_all_bufs)); + (void)refresh_all_bufs; + num_ref_buffers--; + } + + for (int ref_frame = 0; ref_frame < num_ref_buffers; ref_frame++) { + if (((cm->current_frame.refresh_frame_flags >> ref_frame) & 1) == 1) { + assign_frame_buffer_p(&cm->ref_frame_map[ref_frame], cm->cur_frame); + } + } +} + +void av1_update_film_grain_parameters_seq(struct AV1_PRIMARY *ppi, + const AV1EncoderConfig *oxcf); +void av1_update_film_grain_parameters(struct AV1_COMP *cpi, + const AV1EncoderConfig *oxcf); + +void av1_scale_references(AV1_COMP *cpi, const InterpFilter filter, + const int phase, const int use_optimized_scaler); + +void av1_setup_frame(AV1_COMP *cpi); + +BLOCK_SIZE av1_select_sb_size(const AV1EncoderConfig *const oxcf, int width, + int height, int number_spatial_layers); + +void av1_apply_active_map(AV1_COMP *cpi); + +#if !CONFIG_REALTIME_ONLY +uint16_t av1_setup_interp_filter_search_mask(AV1_COMP *cpi); + +void av1_determine_sc_tools_with_encoding(AV1_COMP *cpi, const int q_orig); +#endif + +void av1_set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index, + int *top_index); + +void av1_finalize_encoded_frame(AV1_COMP *const cpi); + +int av1_is_integer_mv(const YV12_BUFFER_CONFIG *cur_picture, + const YV12_BUFFER_CONFIG *last_picture, + ForceIntegerMVInfo *const force_intpel_info); + +void av1_set_mb_ssim_rdmult_scaling(AV1_COMP *cpi); + +void av1_save_all_coding_context(AV1_COMP *cpi); + +#if DUMP_RECON_FRAMES == 1 +void av1_dump_filtered_recon_frames(AV1_COMP *cpi); +#endif + +static AOM_INLINE int av1_get_enc_border_size(bool resize, bool all_intra, + BLOCK_SIZE sb_size) { + // 
For allintra encoding mode, inter-frame motion search is not applicable and + // the intraBC motion vectors are restricted within the tile boundaries. Hence + // a smaller frame border size (AOM_ENC_ALLINTRA_BORDER) is used in this case. + if (resize) { + return AOM_BORDER_IN_PIXELS; + } + if (all_intra) { + return AOM_ENC_ALLINTRA_BORDER; + } + return block_size_wide[sb_size] + 32; +} + +static AOM_INLINE bool av1_is_resize_needed(const AV1EncoderConfig *oxcf) { + const ResizeCfg *resize_cfg = &oxcf->resize_cfg; + const SuperResCfg *superres_cfg = &oxcf->superres_cfg; + return resize_cfg->resize_mode || superres_cfg->superres_mode; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_ENCODER_UTILS_H_ diff --git a/third_party/aom/av1/encoder/encodetxb.c b/third_party/aom/av1/encoder/encodetxb.c new file mode 100644 index 0000000000..5fe2a497c7 --- /dev/null +++ b/third_party/aom/av1/encoder/encodetxb.c @@ -0,0 +1,886 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/encoder/encodetxb.h" + +#include "aom_ports/mem.h" +#include "av1/common/blockd.h" +#include "av1/common/idct.h" +#include "av1/common/pred_common.h" +#include "av1/common/scan.h" +#include "av1/encoder/bitstream.h" +#include "av1/encoder/cost.h" +#include "av1/encoder/encodeframe.h" +#include "av1/encoder/hash.h" +#include "av1/encoder/rdopt.h" +#include "av1/encoder/tokenize.h" + +void av1_alloc_txb_buf(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + CoeffBufferPool *coeff_buf_pool = &cpi->coeff_buffer_pool; + const int num_sb_rows = + CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params->mib_size_log2); + const int num_sb_cols = + CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, cm->seq_params->mib_size_log2); + const int size = num_sb_rows * num_sb_cols; + const int num_planes = av1_num_planes(cm); + const int subsampling_x = cm->seq_params->subsampling_x; + const int subsampling_y = cm->seq_params->subsampling_y; + const int luma_max_sb_square = + 1 << num_pels_log2_lookup[cm->seq_params->sb_size]; + const int chroma_max_sb_square = + luma_max_sb_square >> (subsampling_x + subsampling_y); + const int num_tcoeffs = + size * (luma_max_sb_square + (num_planes - 1) * chroma_max_sb_square); + const int txb_unit_size = TX_SIZE_W_MIN * TX_SIZE_H_MIN; + + av1_free_txb_buf(cpi); + // TODO(jingning): This should be further reduced. 
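+  // Worked sizing example (a sketch, assuming 128x128 superblocks, three
+  // planes, 4:2:0 subsampling and TX_SIZE_W_MIN == TX_SIZE_H_MIN == 4):
+  // luma_max_sb_square = 1 << 14 = 16384, chroma_max_sb_square =
+  // 16384 >> (1 + 1) = 4096, so num_tcoeffs is size * (16384 + 2 * 4096) =
+  // size * 24576, while the eobs and entropy_ctx pools get one entry per 4x4
+  // transform unit, i.e. 24576 / (4 * 4) = 1536 entries per superblock.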
+  CHECK_MEM_ERROR(cm, cpi->coeff_buffer_base,
+                  aom_malloc(sizeof(*cpi->coeff_buffer_base) * size));
+  CHECK_MEM_ERROR(
+      cm, coeff_buf_pool->tcoeff,
+      aom_memalign(32, sizeof(*coeff_buf_pool->tcoeff) * num_tcoeffs));
+  CHECK_MEM_ERROR(
+      cm, coeff_buf_pool->eobs,
+      aom_malloc(sizeof(*coeff_buf_pool->eobs) * num_tcoeffs / txb_unit_size));
+  CHECK_MEM_ERROR(cm, coeff_buf_pool->entropy_ctx,
+                  aom_malloc(sizeof(*coeff_buf_pool->entropy_ctx) *
+                             num_tcoeffs / txb_unit_size));
+
+  tran_low_t *tcoeff_ptr = coeff_buf_pool->tcoeff;
+  uint16_t *eob_ptr = coeff_buf_pool->eobs;
+  uint8_t *entropy_ctx_ptr = coeff_buf_pool->entropy_ctx;
+  for (int i = 0; i < size; i++) {
+    for (int plane = 0; plane < num_planes; plane++) {
+      const int max_sb_square =
+          (plane == AOM_PLANE_Y) ? luma_max_sb_square : chroma_max_sb_square;
+      cpi->coeff_buffer_base[i].tcoeff[plane] = tcoeff_ptr;
+      cpi->coeff_buffer_base[i].eobs[plane] = eob_ptr;
+      cpi->coeff_buffer_base[i].entropy_ctx[plane] = entropy_ctx_ptr;
+      tcoeff_ptr += max_sb_square;
+      eob_ptr += max_sb_square / txb_unit_size;
+      entropy_ctx_ptr += max_sb_square / txb_unit_size;
+    }
+  }
+}
+
+void av1_free_txb_buf(AV1_COMP *cpi) {
+  CoeffBufferPool *coeff_buf_pool = &cpi->coeff_buffer_pool;
+  aom_free(cpi->coeff_buffer_base);
+  cpi->coeff_buffer_base = NULL;
+  aom_free(coeff_buf_pool->tcoeff);
+  coeff_buf_pool->tcoeff = NULL;
+  aom_free(coeff_buf_pool->eobs);
+  coeff_buf_pool->eobs = NULL;
+  aom_free(coeff_buf_pool->entropy_ctx);
+  coeff_buf_pool->entropy_ctx = NULL;
+}
+
+// Writes level + 1 as an Exp-Golomb code, e.g. level 2 -> x = 3 -> "011".
+static void write_golomb(aom_writer *w, int level) {
+  int x = level + 1;
+  int i = x;
+  int length = 0;
+
+  while (i) {
+    i >>= 1;
+    ++length;
+  }
+  assert(length > 0);
+
+  for (i = 0; i < length - 1; ++i) aom_write_bit(w, 0);
+
+  for (i = length - 1; i >= 0; --i) aom_write_bit(w, (x >> i) & 0x01);
+}
+
+static const int8_t eob_to_pos_small[33] = {
+  0, 1, 2,                                        // 0-2
+  3, 3,                                           // 3-4
+  4, 4, 4, 4,                                     // 5-8
+  5, 5, 5, 5, 5, 5, 5, 5,                         // 9-16
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6  // 17-32
+};
+
+static const int8_t eob_to_pos_large[17] = {
+  6,                               // placeholder
+  7,                               // 33-64
+  8,  8,                           // 65-128
+  9,  9,  9,  9,                   // 129-256
+  10, 10, 10, 10, 10, 10, 10, 10,  // 257-512
+  11                               // 513-
+};
+
+// Maps eob to a group token plus an offset within the group, e.g. eob = 20
+// falls in the 17-32 group, so the token is 6 and *extra is 20 - 17 = 3.
+int av1_get_eob_pos_token(const int eob, int *const extra) {
+  int t;
+
+  if (eob < 33) {
+    t = eob_to_pos_small[eob];
+  } else {
+    const int e = AOMMIN((eob - 1) >> 5, 16);
+    t = eob_to_pos_large[e];
+  }
+
+  *extra = eob - av1_eob_group_start[t];
+
+  return t;
+}
+
+#if CONFIG_ENTROPY_STATS
+void av1_update_eob_context(int cdf_idx, int eob, TX_SIZE tx_size,
+                            TX_CLASS tx_class, PLANE_TYPE plane,
+                            FRAME_CONTEXT *ec_ctx, FRAME_COUNTS *counts,
+                            uint8_t allow_update_cdf) {
+#else
+void av1_update_eob_context(int eob, TX_SIZE tx_size, TX_CLASS tx_class,
+                            PLANE_TYPE plane, FRAME_CONTEXT *ec_ctx,
+                            uint8_t allow_update_cdf) {
+#endif
+  int eob_extra;
+  const int eob_pt = av1_get_eob_pos_token(eob, &eob_extra);
+  TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+
+  const int eob_multi_size = txsize_log2_minus4[tx_size];
+  const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ?
0 : 1; + + switch (eob_multi_size) { + case 0: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi16[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; +#endif + if (allow_update_cdf) + update_cdf(ec_ctx->eob_flag_cdf16[plane][eob_multi_ctx], eob_pt - 1, 5); + break; + case 1: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi32[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; +#endif + if (allow_update_cdf) + update_cdf(ec_ctx->eob_flag_cdf32[plane][eob_multi_ctx], eob_pt - 1, 6); + break; + case 2: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi64[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; +#endif + if (allow_update_cdf) + update_cdf(ec_ctx->eob_flag_cdf64[plane][eob_multi_ctx], eob_pt - 1, 7); + break; + case 3: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi128[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; +#endif + if (allow_update_cdf) { + update_cdf(ec_ctx->eob_flag_cdf128[plane][eob_multi_ctx], eob_pt - 1, + 8); + } + break; + case 4: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi256[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; +#endif + if (allow_update_cdf) { + update_cdf(ec_ctx->eob_flag_cdf256[plane][eob_multi_ctx], eob_pt - 1, + 9); + } + break; + case 5: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi512[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; +#endif + if (allow_update_cdf) { + update_cdf(ec_ctx->eob_flag_cdf512[plane][eob_multi_ctx], eob_pt - 1, + 10); + } + break; + case 6: + default: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi1024[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; +#endif + if (allow_update_cdf) { + update_cdf(ec_ctx->eob_flag_cdf1024[plane][eob_multi_ctx], eob_pt - 1, + 11); + } + break; + } + + if (av1_eob_offset_bits[eob_pt] > 0) { + int eob_ctx = eob_pt - 3; + int eob_shift = av1_eob_offset_bits[eob_pt] - 1; + int bit = (eob_extra & (1 << eob_shift)) ? 
1 : 0; +#if CONFIG_ENTROPY_STATS + counts->eob_extra[cdf_idx][txs_ctx][plane][eob_pt][bit]++; +#endif // CONFIG_ENTROPY_STATS + if (allow_update_cdf) + update_cdf(ec_ctx->eob_extra_cdf[txs_ctx][plane][eob_ctx], bit, 2); + } +} + +static INLINE int get_nz_map_ctx(const uint8_t *const levels, + const int coeff_idx, const int bhl, + const int width, const int scan_idx, + const int is_eob, const TX_SIZE tx_size, + const TX_CLASS tx_class) { + if (is_eob) { + if (scan_idx == 0) return 0; + if (scan_idx <= (width << bhl) / 8) return 1; + if (scan_idx <= (width << bhl) / 4) return 2; + return 3; + } + const int stats = + get_nz_mag(levels + get_padded_idx(coeff_idx, bhl), bhl, tx_class); + return get_nz_map_ctx_from_stats(stats, coeff_idx, bhl, tx_size, tx_class); +} + +void av1_txb_init_levels_c(const tran_low_t *const coeff, const int width, + const int height, uint8_t *const levels) { + const int stride = height + TX_PAD_HOR; + uint8_t *ls = levels; + + memset(levels + stride * width, 0, + sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END)); + + for (int i = 0; i < width; i++) { + for (int j = 0; j < height; j++) { + *ls++ = (uint8_t)clamp(abs(coeff[i * height + j]), 0, INT8_MAX); + } + for (int j = 0; j < TX_PAD_HOR; j++) { + *ls++ = 0; + } + } +} + +void av1_get_nz_map_contexts_c(const uint8_t *const levels, + const int16_t *const scan, const uint16_t eob, + const TX_SIZE tx_size, const TX_CLASS tx_class, + int8_t *const coeff_contexts) { + const int bhl = get_txb_bhl(tx_size); + const int width = get_txb_wide(tx_size); + for (int i = 0; i < eob; ++i) { + const int pos = scan[i]; + coeff_contexts[pos] = get_nz_map_ctx(levels, pos, bhl, width, i, + i == eob - 1, tx_size, tx_class); + } +} + +void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCK *const x, + aom_writer *w, int blk_row, int blk_col, int plane, + int block, TX_SIZE tx_size) { + MACROBLOCKD *xd = &x->e_mbd; + const CB_COEFF_BUFFER *cb_coef_buff = x->cb_coef_buff; + const PLANE_TYPE plane_type = get_plane_type(plane); + const int txb_offset = x->mbmi_ext_frame->cb_offset[plane_type] / + (TX_SIZE_W_MIN * TX_SIZE_H_MIN); + const uint16_t *eob_txb = cb_coef_buff->eobs[plane] + txb_offset; + const uint16_t eob = eob_txb[block]; + const uint8_t *entropy_ctx = cb_coef_buff->entropy_ctx[plane] + txb_offset; + const int txb_skip_ctx = entropy_ctx[block] & TXB_SKIP_CTX_MASK; + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + aom_write_symbol(w, eob == 0, ec_ctx->txb_skip_cdf[txs_ctx][txb_skip_ctx], 2); + if (eob == 0) return; + + const TX_TYPE tx_type = + av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size, + cm->features.reduced_tx_set_used); + // Only y plane's tx_type is transmitted + if (plane == 0) { + av1_write_tx_type(cm, xd, tx_type, tx_size, w); + } + + int eob_extra; + const int eob_pt = av1_get_eob_pos_token(eob, &eob_extra); + const int eob_multi_size = txsize_log2_minus4[tx_size]; + const TX_CLASS tx_class = tx_type_to_class[tx_type]; + const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 
0 : 1; + switch (eob_multi_size) { + case 0: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf16[plane_type][eob_multi_ctx], 5); + break; + case 1: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf32[plane_type][eob_multi_ctx], 6); + break; + case 2: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf64[plane_type][eob_multi_ctx], 7); + break; + case 3: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf128[plane_type][eob_multi_ctx], 8); + break; + case 4: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf256[plane_type][eob_multi_ctx], 9); + break; + case 5: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf512[plane_type][eob_multi_ctx], 10); + break; + default: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf1024[plane_type][eob_multi_ctx], 11); + break; + } + + const int eob_offset_bits = av1_eob_offset_bits[eob_pt]; + if (eob_offset_bits > 0) { + const int eob_ctx = eob_pt - 3; + int eob_shift = eob_offset_bits - 1; + int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0; + aom_write_symbol(w, bit, + ec_ctx->eob_extra_cdf[txs_ctx][plane_type][eob_ctx], 2); + for (int i = 1; i < eob_offset_bits; i++) { + eob_shift = eob_offset_bits - 1 - i; + bit = (eob_extra & (1 << eob_shift)) ? 1 : 0; + aom_write_bit(w, bit); + } + } + + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); + uint8_t levels_buf[TX_PAD_2D]; + uint8_t *const levels = set_levels(levels_buf, height); + const tran_low_t *tcoeff_txb = + cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset[plane_type]; + const tran_low_t *tcoeff = tcoeff_txb + BLOCK_OFFSET(block); + av1_txb_init_levels(tcoeff, width, height, levels); + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); + const int16_t *const scan = scan_order->scan; + DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]); + av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts); + + const int bhl = get_txb_bhl(tx_size); + for (int c = eob - 1; c >= 0; --c) { + const int pos = scan[c]; + const int coeff_ctx = coeff_contexts[pos]; + const tran_low_t v = tcoeff[pos]; + const tran_low_t level = abs(v); + + if (c == eob - 1) { + aom_write_symbol( + w, AOMMIN(level, 3) - 1, + ec_ctx->coeff_base_eob_cdf[txs_ctx][plane_type][coeff_ctx], 3); + } else { + aom_write_symbol(w, AOMMIN(level, 3), + ec_ctx->coeff_base_cdf[txs_ctx][plane_type][coeff_ctx], + 4); + } + if (level > NUM_BASE_LEVELS) { + // level is above 1. + const int base_range = level - 1 - NUM_BASE_LEVELS; + const int br_ctx = get_br_ctx(levels, pos, bhl, tx_class); + aom_cdf_prob *cdf = + ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx]; + for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) { + const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1); + aom_write_symbol(w, k, cdf, BR_CDF_SIZE); + if (k < BR_CDF_SIZE - 1) break; + } + } + } + + // Loop to code all signs in the transform block, + // starting with the sign of DC (if applicable) + for (int c = 0; c < eob; ++c) { + const tran_low_t v = tcoeff[scan[c]]; + const tran_low_t level = abs(v); + const int sign = (v < 0) ? 
1 : 0; + if (level) { + if (c == 0) { + const int dc_sign_ctx = + (entropy_ctx[block] >> DC_SIGN_CTX_SHIFT) & DC_SIGN_CTX_MASK; + aom_write_symbol(w, sign, ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], + 2); + } else { + aom_write_bit(w, sign); + } + if (level > COEFF_BASE_RANGE + NUM_BASE_LEVELS) + write_golomb(w, level - COEFF_BASE_RANGE - 1 - NUM_BASE_LEVELS); + } + } +} + +void av1_write_intra_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x, + aom_writer *w, BLOCK_SIZE bsize) { + MACROBLOCKD *xd = &x->e_mbd; + const int num_planes = av1_num_planes(cm); + int block[MAX_MB_PLANE] = { 0 }; + int row, col; + assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x, + xd->plane[0].subsampling_y)); + const int max_blocks_wide = max_block_wide(xd, bsize, 0); + const int max_blocks_high = max_block_high(xd, bsize, 0); + const BLOCK_SIZE max_unit_bsize = BLOCK_64X64; + int mu_blocks_wide = mi_size_wide[max_unit_bsize]; + int mu_blocks_high = mi_size_high[max_unit_bsize]; + mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide); + mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high); + + for (row = 0; row < max_blocks_high; row += mu_blocks_high) { + for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) { + for (int plane = 0; plane < num_planes; ++plane) { + if (plane && !xd->is_chroma_ref) break; + const TX_SIZE tx_size = av1_get_tx_size(plane, xd); + const int stepr = tx_size_high_unit[tx_size]; + const int stepc = tx_size_wide_unit[tx_size]; + const int step = stepr * stepc; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int unit_height = ROUND_POWER_OF_TWO( + AOMMIN(mu_blocks_high + row, max_blocks_high), pd->subsampling_y); + const int unit_width = ROUND_POWER_OF_TWO( + AOMMIN(mu_blocks_wide + col, max_blocks_wide), pd->subsampling_x); + for (int blk_row = row >> pd->subsampling_y; blk_row < unit_height; + blk_row += stepr) { + for (int blk_col = col >> pd->subsampling_x; blk_col < unit_width; + blk_col += stepc) { + av1_write_coeffs_txb(cm, x, w, blk_row, blk_col, plane, + block[plane], tx_size); + block[plane] += step; + } + } + } + } + } +} + +uint8_t av1_get_txb_entropy_context(const tran_low_t *qcoeff, + const SCAN_ORDER *scan_order, int eob) { + const int16_t *const scan = scan_order->scan; + int cul_level = 0; + int c; + + if (eob == 0) return 0; + for (c = 0; c < eob; ++c) { + cul_level += abs(qcoeff[scan[c]]); + if (cul_level > COEFF_CONTEXT_MASK) break; + } + + cul_level = AOMMIN(COEFF_CONTEXT_MASK, cul_level); + set_dc_sign(&cul_level, qcoeff[0]); + + return (uint8_t)cul_level; +} + +static void update_tx_type_count(const AV1_COMP *cpi, const AV1_COMMON *cm, + MACROBLOCKD *xd, int blk_row, int blk_col, + int plane, TX_SIZE tx_size, + FRAME_COUNTS *counts, + uint8_t allow_update_cdf) { + MB_MODE_INFO *mbmi = xd->mi[0]; + int is_inter = is_inter_block(mbmi); + const int reduced_tx_set_used = cm->features.reduced_tx_set_used; + FRAME_CONTEXT *fc = xd->tile_ctx; +#if !CONFIG_ENTROPY_STATS + (void)counts; +#endif // !CONFIG_ENTROPY_STATS + + // Only y plane's tx_type is updated + if (plane > 0) return; + const TX_TYPE tx_type = av1_get_tx_type(xd, PLANE_TYPE_Y, blk_row, blk_col, + tx_size, reduced_tx_set_used); + if (is_inter) { + if (cpi->oxcf.txfm_cfg.use_inter_dct_only) { + assert(tx_type == DCT_DCT); + } + } else { + if (cpi->oxcf.txfm_cfg.use_intra_dct_only) { + assert(tx_type == DCT_DCT); + } else if (cpi->oxcf.txfm_cfg.use_intra_default_tx_only) { + const TX_TYPE default_type = get_default_tx_type( + PLANE_TYPE_Y, xd, 
tx_size, cpi->use_screen_content_tools);
+      (void)default_type;
+      // TODO(kyslov): We don't always respect the use_intra_default_tx_only
+      // flag in the NonRD and REALTIME cases. Specifically, we ignore it in
+      // hybrid intra mode search, when picking the intra mode in nonRD inter
+      // mode search, and in RD REALTIME mode when we limit TX type usage.
+      // We need to fix the txfm cfg for these cases; until then, the assert
+      // is relaxed accordingly.
+      assert(tx_type == default_type || cpi->sf.rt_sf.use_nonrd_pick_mode ||
+             cpi->oxcf.mode == REALTIME);
+    }
+  }
+
+  if (get_ext_tx_types(tx_size, is_inter, reduced_tx_set_used) > 1 &&
+      cm->quant_params.base_qindex > 0 && !mbmi->skip_txfm &&
+      !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+    const int eset = get_ext_tx_set(tx_size, is_inter, reduced_tx_set_used);
+    if (eset > 0) {
+      const TxSetType tx_set_type =
+          av1_get_ext_tx_set_type(tx_size, is_inter, reduced_tx_set_used);
+      if (is_inter) {
+        if (allow_update_cdf) {
+          update_cdf(fc->inter_ext_tx_cdf[eset][txsize_sqr_map[tx_size]],
+                     av1_ext_tx_ind[tx_set_type][tx_type],
+                     av1_num_ext_tx_set[tx_set_type]);
+        }
+#if CONFIG_ENTROPY_STATS
+        ++counts->inter_ext_tx[eset][txsize_sqr_map[tx_size]]
+                              [av1_ext_tx_ind[tx_set_type][tx_type]];
+#endif  // CONFIG_ENTROPY_STATS
+      } else {
+        PREDICTION_MODE intra_dir;
+        if (mbmi->filter_intra_mode_info.use_filter_intra)
+          intra_dir = fimode_to_intradir[mbmi->filter_intra_mode_info
+                                             .filter_intra_mode];
+        else
+          intra_dir = mbmi->mode;
+#if CONFIG_ENTROPY_STATS
+        ++counts->intra_ext_tx[eset][txsize_sqr_map[tx_size]][intra_dir]
+                              [av1_ext_tx_ind[tx_set_type][tx_type]];
+#endif  // CONFIG_ENTROPY_STATS
+        if (allow_update_cdf) {
+          update_cdf(
+              fc->intra_ext_tx_cdf[eset][txsize_sqr_map[tx_size]][intra_dir],
+              av1_ext_tx_ind[tx_set_type][tx_type],
+              av1_num_ext_tx_set[tx_set_type]);
+        }
+      }
+    }
+  }
+}
+
+void av1_update_and_record_txb_context(int plane, int block, int blk_row,
+                                       int blk_col, BLOCK_SIZE plane_bsize,
+                                       TX_SIZE tx_size, void *arg) {
+  struct tokenize_b_args *const args = arg;
+  const AV1_COMP *cpi = args->cpi;
+  const AV1_COMMON *cm = &cpi->common;
+  ThreadData *const td = args->td;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *p = &x->plane[plane];
+  struct macroblockd_plane *pd = &xd->plane[plane];
+  const int eob = p->eobs[block];
+  const int block_offset = BLOCK_OFFSET(block);
+  tran_low_t *qcoeff = p->qcoeff + block_offset;
+  const PLANE_TYPE plane_type = pd->plane_type;
+  const TX_TYPE tx_type =
+      av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size,
+                      cm->features.reduced_tx_set_used);
+  const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+  tran_low_t *tcoeff;
+  assert(args->dry_run != DRY_RUN_COSTCOEFFS);
+  if (args->dry_run == OUTPUT_ENABLED) {
+    MB_MODE_INFO *mbmi = xd->mi[0];
+    TXB_CTX txb_ctx;
+    get_txb_ctx(plane_bsize, tx_size, plane,
+                pd->above_entropy_context + blk_col,
+                pd->left_entropy_context + blk_row, &txb_ctx);
+    const int bhl = get_txb_bhl(tx_size);
+    const int width = get_txb_wide(tx_size);
+    const int height = get_txb_high(tx_size);
+    const uint8_t allow_update_cdf = args->allow_update_cdf;
+    const TX_SIZE txsize_ctx = get_txsize_entropy_ctx(tx_size);
+    FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+#if CONFIG_ENTROPY_STATS
+    int cdf_idx = cm->coef_cdf_category;
+    ++td->counts->txb_skip[cdf_idx][txsize_ctx][txb_ctx.txb_skip_ctx][eob == 0];
+#endif  // CONFIG_ENTROPY_STATS
+    if (allow_update_cdf) {
+      update_cdf(ec_ctx->txb_skip_cdf[txsize_ctx][txb_ctx.txb_skip_ctx],
+                 eob ==
0, 2); + } + + CB_COEFF_BUFFER *cb_coef_buff = x->cb_coef_buff; + const int txb_offset = x->mbmi_ext_frame->cb_offset[plane_type] / + (TX_SIZE_W_MIN * TX_SIZE_H_MIN); + uint16_t *eob_txb = cb_coef_buff->eobs[plane] + txb_offset; + uint8_t *const entropy_ctx = cb_coef_buff->entropy_ctx[plane] + txb_offset; + entropy_ctx[block] = txb_ctx.txb_skip_ctx; + eob_txb[block] = eob; + + if (eob == 0) { + av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, 0, blk_col, + blk_row); + return; + } + const int segment_id = mbmi->segment_id; + const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size); + tran_low_t *tcoeff_txb = + cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset[plane_type]; + tcoeff = tcoeff_txb + block_offset; + memcpy(tcoeff, qcoeff, sizeof(*tcoeff) * seg_eob); + + uint8_t levels_buf[TX_PAD_2D]; + uint8_t *const levels = set_levels(levels_buf, height); + av1_txb_init_levels(tcoeff, width, height, levels); + update_tx_type_count(cpi, cm, xd, blk_row, blk_col, plane, tx_size, + td->counts, allow_update_cdf); + + const TX_CLASS tx_class = tx_type_to_class[tx_type]; + const int16_t *const scan = scan_order->scan; + + // record tx type usage + td->rd_counts.tx_type_used[tx_size][tx_type]++; + +#if CONFIG_ENTROPY_STATS + av1_update_eob_context(cdf_idx, eob, tx_size, tx_class, plane_type, ec_ctx, + td->counts, allow_update_cdf); +#else + av1_update_eob_context(eob, tx_size, tx_class, plane_type, ec_ctx, + allow_update_cdf); +#endif + + DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]); + av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, + coeff_contexts); + + for (int c = eob - 1; c >= 0; --c) { + const int pos = scan[c]; + const int coeff_ctx = coeff_contexts[pos]; + const tran_low_t v = qcoeff[pos]; + const tran_low_t level = abs(v); + /* abs_sum_level is needed to decide the job scheduling order of + * pack bitstream multi-threading. This data is not needed if + * multi-threading is disabled. 
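+       * As an illustrative reading of the line below: blocks carrying a
+       * larger total absolute level will usually cost more bits to pack, so
+       * the accumulated sum lets the heavier pack-bitstream jobs be
+       * scheduled first.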
*/ + if (cpi->mt_info.pack_bs_mt_enabled) td->abs_sum_level += level; + + if (allow_update_cdf) { + if (c == eob - 1) { + assert(coeff_ctx < 4); + update_cdf( + ec_ctx->coeff_base_eob_cdf[txsize_ctx][plane_type][coeff_ctx], + AOMMIN(level, 3) - 1, 3); + } else { + update_cdf(ec_ctx->coeff_base_cdf[txsize_ctx][plane_type][coeff_ctx], + AOMMIN(level, 3), 4); + } + } + if (c == eob - 1) { + assert(coeff_ctx < 4); +#if CONFIG_ENTROPY_STATS + ++td->counts->coeff_base_eob_multi[cdf_idx][txsize_ctx][plane_type] + [coeff_ctx][AOMMIN(level, 3) - 1]; + } else { + ++td->counts->coeff_base_multi[cdf_idx][txsize_ctx][plane_type] + [coeff_ctx][AOMMIN(level, 3)]; +#endif + } + if (level > NUM_BASE_LEVELS) { + const int base_range = level - 1 - NUM_BASE_LEVELS; + const int br_ctx = get_br_ctx(levels, pos, bhl, tx_class); + for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) { + const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1); + if (allow_update_cdf) { + update_cdf(ec_ctx->coeff_br_cdf[AOMMIN(txsize_ctx, TX_32X32)] + [plane_type][br_ctx], + k, BR_CDF_SIZE); + } + for (int lps = 0; lps < BR_CDF_SIZE - 1; lps++) { +#if CONFIG_ENTROPY_STATS + ++td->counts->coeff_lps[AOMMIN(txsize_ctx, TX_32X32)][plane_type] + [lps][br_ctx][lps == k]; +#endif // CONFIG_ENTROPY_STATS + if (lps == k) break; + } +#if CONFIG_ENTROPY_STATS + ++td->counts->coeff_lps_multi[cdf_idx][AOMMIN(txsize_ctx, TX_32X32)] + [plane_type][br_ctx][k]; +#endif + if (k < BR_CDF_SIZE - 1) break; + } + } + } + // Update the context needed to code the DC sign (if applicable) + if (tcoeff[0] != 0) { + const int dc_sign = (tcoeff[0] < 0) ? 1 : 0; + const int dc_sign_ctx = txb_ctx.dc_sign_ctx; +#if CONFIG_ENTROPY_STATS + ++td->counts->dc_sign[plane_type][dc_sign_ctx][dc_sign]; +#endif // CONFIG_ENTROPY_STATS + if (allow_update_cdf) + update_cdf(ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], dc_sign, 2); + entropy_ctx[block] |= dc_sign_ctx << DC_SIGN_CTX_SHIFT; + } + } else { + tcoeff = qcoeff; + } + const uint8_t cul_level = + av1_get_txb_entropy_context(tcoeff, scan_order, eob); + av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, + blk_col, blk_row); +} + +void av1_record_txb_context(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + void *arg) { + struct tokenize_b_args *const args = arg; + const AV1_COMP *cpi = args->cpi; + const AV1_COMMON *cm = &cpi->common; + ThreadData *const td = args->td; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *p = &x->plane[plane]; + struct macroblockd_plane *pd = &xd->plane[plane]; + const int eob = p->eobs[block]; + const int block_offset = BLOCK_OFFSET(block); + tran_low_t *qcoeff = p->qcoeff + block_offset; + const PLANE_TYPE plane_type = pd->plane_type; + const TX_TYPE tx_type = + av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size, + cm->features.reduced_tx_set_used); + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); + tran_low_t *tcoeff; + assert(args->dry_run != DRY_RUN_COSTCOEFFS); + if (args->dry_run == OUTPUT_ENABLED) { + MB_MODE_INFO *mbmi = xd->mi[0]; + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, plane, + pd->above_entropy_context + blk_col, + pd->left_entropy_context + blk_row, &txb_ctx); +#if CONFIG_ENTROPY_STATS + const TX_SIZE txsize_ctx = get_txsize_entropy_ctx(tx_size); + const int bhl = get_txb_bhl(tx_size); + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); + int cdf_idx = cm->coef_cdf_category; + 
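+    // Entropy-stats builds only: accumulate raw symbol counts here (assumed
+    // to be consumed offline, e.g. for retraining the default probability
+    // tables); regular builds compile this counter out.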
++td->counts->txb_skip[cdf_idx][txsize_ctx][txb_ctx.txb_skip_ctx][eob == 0]; +#endif // CONFIG_ENTROPY_STATS + + CB_COEFF_BUFFER *cb_coef_buff = x->cb_coef_buff; + const int txb_offset = x->mbmi_ext_frame->cb_offset[plane_type] / + (TX_SIZE_W_MIN * TX_SIZE_H_MIN); + uint16_t *eob_txb = cb_coef_buff->eobs[plane] + txb_offset; + uint8_t *const entropy_ctx = cb_coef_buff->entropy_ctx[plane] + txb_offset; + entropy_ctx[block] = txb_ctx.txb_skip_ctx; + eob_txb[block] = eob; + + if (eob == 0) { + av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, 0, blk_col, + blk_row); + return; + } + const int segment_id = mbmi->segment_id; + const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size); + tran_low_t *tcoeff_txb = + cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset[plane_type]; + tcoeff = tcoeff_txb + block_offset; + memcpy(tcoeff, qcoeff, sizeof(*tcoeff) * seg_eob); + +#if CONFIG_ENTROPY_STATS + uint8_t levels_buf[TX_PAD_2D]; + uint8_t *const levels = set_levels(levels_buf, height); + av1_txb_init_levels(tcoeff, width, height, levels); + update_tx_type_count(cpi, cm, xd, blk_row, blk_col, plane, tx_size, + td->counts, 0 /*allow_update_cdf*/); + + const TX_CLASS tx_class = tx_type_to_class[tx_type]; + const bool do_coeff_scan = true; +#else + const bool do_coeff_scan = cpi->mt_info.pack_bs_mt_enabled; +#endif + const int16_t *const scan = scan_order->scan; + + // record tx type usage + td->rd_counts.tx_type_used[tx_size][tx_type]++; + +#if CONFIG_ENTROPY_STATS + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + av1_update_eob_context(cdf_idx, eob, tx_size, tx_class, plane_type, ec_ctx, + td->counts, 0 /*allow_update_cdf*/); + + DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]); + av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, + coeff_contexts); +#endif + + for (int c = eob - 1; (c >= 0) && do_coeff_scan; --c) { + const int pos = scan[c]; + const tran_low_t v = qcoeff[pos]; + const tran_low_t level = abs(v); + /* abs_sum_level is needed to decide the job scheduling order of + * pack bitstream multi-threading. This data is not needed if + * multi-threading is disabled. */ + if (cpi->mt_info.pack_bs_mt_enabled) td->abs_sum_level += level; + +#if CONFIG_ENTROPY_STATS + const int coeff_ctx = coeff_contexts[pos]; + if (c == eob - 1) { + assert(coeff_ctx < 4); + ++td->counts->coeff_base_eob_multi[cdf_idx][txsize_ctx][plane_type] + [coeff_ctx][AOMMIN(level, 3) - 1]; + } else { + ++td->counts->coeff_base_multi[cdf_idx][txsize_ctx][plane_type] + [coeff_ctx][AOMMIN(level, 3)]; + } + if (level > NUM_BASE_LEVELS) { + const int base_range = level - 1 - NUM_BASE_LEVELS; + const int br_ctx = get_br_ctx(levels, pos, bhl, tx_class); + for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) { + const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1); + for (int lps = 0; lps < BR_CDF_SIZE - 1; lps++) { + ++td->counts->coeff_lps[AOMMIN(txsize_ctx, TX_32X32)][plane_type] + [lps][br_ctx][lps == k]; + if (lps == k) break; + } + ++td->counts->coeff_lps_multi[cdf_idx][AOMMIN(txsize_ctx, TX_32X32)] + [plane_type][br_ctx][k]; + if (k < BR_CDF_SIZE - 1) break; + } + } +#endif + } + // Update the context needed to code the DC sign (if applicable) + if (tcoeff[0] != 0) { + const int dc_sign_ctx = txb_ctx.dc_sign_ctx; +#if CONFIG_ENTROPY_STATS + const int dc_sign = (tcoeff[0] < 0) ? 
1 : 0; + ++td->counts->dc_sign[plane_type][dc_sign_ctx][dc_sign]; +#endif // CONFIG_ENTROPY_STATS + entropy_ctx[block] |= dc_sign_ctx << DC_SIGN_CTX_SHIFT; + } + } else { + tcoeff = qcoeff; + } + const uint8_t cul_level = + av1_get_txb_entropy_context(tcoeff, scan_order, eob); + av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, + blk_col, blk_row); +} + +void av1_update_intra_mb_txb_context(const AV1_COMP *cpi, ThreadData *td, + RUN_TYPE dry_run, BLOCK_SIZE bsize, + uint8_t allow_update_cdf) { + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + struct tokenize_b_args arg = { cpi, td, 0, allow_update_cdf, dry_run }; + if (mbmi->skip_txfm) { + av1_reset_entropy_context(xd, bsize, num_planes); + return; + } + const foreach_transformed_block_visitor visit = + allow_update_cdf ? av1_update_and_record_txb_context + : av1_record_txb_context; + + for (int plane = 0; plane < num_planes; ++plane) { + if (plane && !xd->is_chroma_ref) break; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y); + av1_foreach_transformed_block_in_plane(xd, plane_bsize, plane, visit, &arg); + } +} + +CB_COEFF_BUFFER *av1_get_cb_coeff_buffer(const struct AV1_COMP *cpi, int mi_row, + int mi_col) { + const AV1_COMMON *const cm = &cpi->common; + const int mib_size_log2 = cm->seq_params->mib_size_log2; + const int stride = + CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, cm->seq_params->mib_size_log2); + const int offset = + (mi_row >> mib_size_log2) * stride + (mi_col >> mib_size_log2); + return cpi->coeff_buffer_base + offset; +} diff --git a/third_party/aom/av1/encoder/encodetxb.h b/third_party/aom/av1/encoder/encodetxb.h new file mode 100644 index 0000000000..67b94046b4 --- /dev/null +++ b/third_party/aom/av1/encoder/encodetxb.h @@ -0,0 +1,276 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_ENCODETXB_H_ +#define AOM_AV1_ENCODER_ENCODETXB_H_ + +#include "config/aom_config.h" + +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" +#include "av1/common/txb_common.h" +#include "av1/encoder/block.h" +#include "av1/encoder/encoder.h" +#include "aom_dsp/bitwriter.h" +#ifdef __cplusplus +extern "C" { +#endif + +/*!\cond */ +#define TXB_SKIP_CTX_MASK 15 +#define DC_SIGN_CTX_SHIFT 4 +#define DC_SIGN_CTX_MASK 3 + +int av1_get_eob_pos_token(const int eob, int *const extra); + +/*!\endcond */ +/*!\brief Allocate the memory resources for all the macro blocks in the current + * coding frame. + * \ingroup coefficient_coding + * + * Each macro block will need a \ref CB_COEFF_BUFFER to store information for + * rate-distortion optimization and entropy coding of transform coefficients. 
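+ * In practice that information is three per-plane arrays, the quantized
+ * coefficients, the end-of-block positions and the packed entropy contexts,
+ * all carved out of shared pools allocated by this function.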
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ */
+void av1_alloc_txb_buf(AV1_COMP *cpi);
+/*!\brief Free the memory resources for all the macro blocks in the current
+ * coding frame.
+ * \ingroup coefficient_coding
+ *
+ * See \ref av1_alloc_txb_buf and \ref CB_COEFF_BUFFER for more details.
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ */
+void av1_free_txb_buf(AV1_COMP *cpi);
+
+/*!\brief Write the quantized coefficients in a transform block into the
+ * bitstream using entropy coding.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function writes the quantized coefficients in a transform block into
+ * the bitstream using entropy coding.
+ *
+ * The coding steps are as follows.
+ *
+ * 1) Code the end of block position "eob", which is the scan index of the
+ * last non-zero coefficient plus one.
+ *
+ * 2) Code the lower magnitude level (<= COEFF_BASE_RANGE + NUM_BASE_LEVELS)
+ * for each coefficient in reversed scan order.
+ *
+ * 3) Code the sign and higher magnitude level
+ * (> COEFF_BASE_RANGE + NUM_BASE_LEVELS) in forward scan order.
+ *
+ * For example, with NUM_BASE_LEVELS == 2 and COEFF_BASE_RANGE == 12, a
+ * coefficient of magnitude 20 has its first 14 levels coded in step 2) and
+ * the remainder, 20 - 14 - 1 = 5, coded as an Exp-Golomb suffix in step 3).
+ *
+ * \param[in]    cm             Top-level structure shared by encoder and
+ * decoder
+ * \param[in]    x              Pointer to structure holding the data for the
+ * current encoding macroblock
+ * \param[in]    w              Entropy coding write pointer
+ * \param[in]    blk_row        The row index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in the y plane
+ * \param[in]    blk_col        The col index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in the y plane
+ * \param[in]    plane          The index of the current plane
+ * \param[in]    block          The index of the current transform block in the
+ * macroblock. It's defined by the number of 4x4 units that have been coded
+ * before the current transform block
+ * \param[in]    tx_size        The given transform size
+ */
+void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCK *const x,
+                          aom_writer *w, int blk_row, int blk_col, int plane,
+                          int block, TX_SIZE tx_size);
+
+/*!\brief Write the quantized coefficients of all transform blocks in an intra
+ * macroblock into the bitstream using entropy coding.
+ *
+ * \ingroup coefficient_coding
+ *
+ * All transform blocks in the intra macroblock share the same transform size.
+ *
+ * This function uses \ref av1_write_coeffs_txb() to code each transform block
+ * in raster order.
+ *
+ * \param[in]    cm             Top-level structure shared by encoder and
+ * decoder
+ * \param[in]    x              Pointer to structure holding the data for the
+ * current encoding macroblock
+ * \param[in]    w              Entropy coding write pointer
+ * \param[in]    bsize          Block size of the current macroblock
+ */
+void av1_write_intra_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x,
+                               aom_writer *w, BLOCK_SIZE bsize);
+
+/*!\brief Pack the context info of the current transform block into a uint8_t.
+ * \ingroup coefficient_coding
+ *
+ * This context info will be collected and consolidated by its neighboring
+ * transform blocks for coding the transform block skip flag (tx_skip) and
+ * the sign of the DC coefficient (dc_sign).
+ *
+ * \param[in]    qcoeff         Buffer of quantized coefficients
+ * \param[in]    scan_order     Coding order of coefficients in the transform
+ * block
+ * \param[in]    eob            The scan index of the last non-zero coefficient
+ * plus one
+ */
+uint8_t av1_get_txb_entropy_context(const tran_low_t *qcoeff,
+                                    const SCAN_ORDER *scan_order, int eob);
+
+/*!\brief Update the probability model (cdf) and the entropy context related to
+ * coefficient coding for all transform blocks in the intra macroblock.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function will go through each transform block in the intra macroblock
+ * and call \ref av1_update_and_record_txb_context (or
+ * \ref av1_record_txb_context when cdf updates are disallowed) to update the
+ * probability model and entropy context properly.
+ *
+ * \param[in]    cpi               Top-level encoder structure
+ * \param[in]    td                Top-level multithreading structure
+ * \param[in]    dry_run           Whether this is a dry run.
+ * \param[in]    bsize             Block size of the current macroblock
+ * \param[in]    allow_update_cdf  Whether updating the probability model (cdf)
+ * is allowed.
+ */
+void av1_update_intra_mb_txb_context(const AV1_COMP *cpi, ThreadData *td,
+                                     RUN_TYPE dry_run, BLOCK_SIZE bsize,
+                                     uint8_t allow_update_cdf);
+
+/*!\brief Update the probability model (cdf) and the entropy context related to
+ * coefficient coding for a transform block.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function has a regular mode and a dry-run mode.
+ *
+ * Regular mode:
+ *
+ * The probability model (cdf) for each coding symbol in the
+ * transform block will be updated.
+ *
+ * The entropy context of this transform block will be updated.
+ *
+ * Dry run:
+ *
+ * The probability model update will be skipped.
+ *
+ * The entropy context of this transform block will be updated.
+ *
+ * \param[in]    plane        The index of the current plane.
+ * \param[in]    block        The index of the current transform block in the
+ * macroblock. It's defined by the number of 4x4 units that have been coded
+ * before the current transform block.
+ * \param[in]    blk_row      The row index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in the y plane.
+ * \param[in]    blk_col      The col index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in the y plane.
+ * \param[in]    plane_bsize  Block size for this plane. When the video source
+ * uses chroma subsampling, the block size of the UV planes will be smaller
+ * than that of the Y plane.
+ * \param[in]    tx_size      The given transform size.
+ * \param[in]    arg          This parameter will be translated into
+ * tokenize_b_args, in which RUN_TYPE indicates using regular mode or dry run.
+ */
+void av1_update_and_record_txb_context(int plane, int block, int blk_row,
+                                       int blk_col, BLOCK_SIZE plane_bsize,
+                                       TX_SIZE tx_size, void *arg);
+
+/*!\brief Update the entropy context related to coefficient coding for a
+ * transform block.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function has a regular mode and a dry-run mode.
+ *
+ * Regular mode:
+ *
+ * The entropy context of this transform block will be updated.
+ *
+ * Dry run:
+ *
+ * The probability model update will be skipped.
+ *
+ * The entropy context of this transform block will be updated.
+ *
+ * \param[in]    plane        The index of the current plane.
+ * \param[in]    block        The index of the current transform block in the
+ * macroblock. It's defined by the number of 4x4 units that have been coded
+ * before the current transform block.
+ * \param[in]    blk_row      The row index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in the y plane.
+ * \param[in]    blk_col      The col index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in the y plane.
+ * \param[in]    plane_bsize  Block size for this plane. When the video source
+ * uses chroma subsampling, the block size of the UV planes will be smaller
+ * than that of the Y plane.
+ * \param[in]    tx_size      The given transform size.
+ * \param[in]    arg          This parameter will be translated into
+ * tokenize_b_args, in which RUN_TYPE indicates using regular mode or dry run.
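+ *
+ * Unlike \ref av1_update_and_record_txb_context, this variant leaves the
+ * probability model (cdf) untouched, which is why
+ * \ref av1_update_intra_mb_txb_context selects it when cdf updates are not
+ * allowed.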
+ */ +void av1_record_txb_context(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg); + +/*!\brief Get the corresponding \ref CB_COEFF_BUFFER of the current macro block. + * + * \ingroup coefficient_coding + * + * The macroblock's location is described by mi_row and mi_col, row and column + * mi indexes in the coding frame. + * + * Each mi unit is a 4x4 pixel block. + * + * \param[in] cpi Top-level encoder structure. + * \param[in] mi_row Row mi index of the current transform block + * in the frame. + * \param[in] mi_col Column mi index of the current transform + * block in the frame. + * \return CB_COEFF_BUFFER* Pointer of \ref CB_COEFF_BUFFER associated + * to this macroblock. + */ +CB_COEFF_BUFFER *av1_get_cb_coeff_buffer(const struct AV1_COMP *cpi, int mi_row, + int mi_col); + +/*!\brief Returns the entropy cost associated with skipping the current + * transform block. + * + * \ingroup coefficient_coding + * + * \param[in] coeff_costs Table of entropy cost for coefficient coding. + * \param[in] txb_ctx Context info for entropy coding transform block + * skip flag (tx_skip) and the sign of DC coefficient (dc_sign). + * \param[in] plane The index of the current plane + * \param[in] tx_size The transform size + */ +static INLINE int av1_cost_skip_txb(const CoeffCosts *coeff_costs, + const TXB_CTX *const txb_ctx, int plane, + TX_SIZE tx_size) { + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + const PLANE_TYPE plane_type = get_plane_type(plane); + const LV_MAP_COEFF_COST *const coeff_costs_ = + &coeff_costs->coeff_costs[txs_ctx][plane_type]; + return coeff_costs_->txb_skip_cost[txb_ctx->txb_skip_ctx][1]; +} + +/*!\cond */ +// These numbers are empirically obtained. +static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = { + { 17, 13 }, + { 16, 10 }, +}; +/*!\endcond */ + +#ifdef __cplusplus +} +#endif + +#endif // AOM_AV1_ENCODER_ENCODETXB_H_ diff --git a/third_party/aom/av1/encoder/ethread.c b/third_party/aom/av1/encoder/ethread.c new file mode 100644 index 0000000000..d6a806d504 --- /dev/null +++ b/third_party/aom/av1/encoder/ethread.c @@ -0,0 +1,3469 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "av1/common/warped_motion.h" +#include "av1/common/thread_common.h" + +#include "av1/encoder/allintra_vis.h" +#include "av1/encoder/bitstream.h" +#include "av1/encoder/encodeframe.h" +#include "av1/encoder/encodeframe_utils.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/encoder_alloc.h" +#include "av1/encoder/ethread.h" +#if !CONFIG_REALTIME_ONLY +#include "av1/encoder/firstpass.h" +#endif +#include "av1/encoder/global_motion.h" +#include "av1/encoder/global_motion_facade.h" +#include "av1/encoder/intra_mode_search_utils.h" +#include "av1/encoder/picklpf.h" +#include "av1/encoder/rdopt.h" +#include "aom_dsp/aom_dsp_common.h" +#include "av1/encoder/temporal_filter.h" +#include "av1/encoder/tpl_model.h" + +static AOM_INLINE void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) { + td->rd_counts.compound_ref_used_flag |= + td_t->rd_counts.compound_ref_used_flag; + td->rd_counts.skip_mode_used_flag |= td_t->rd_counts.skip_mode_used_flag; + + for (int i = 0; i < TX_SIZES_ALL; i++) { + for (int j = 0; j < TX_TYPES; j++) + td->rd_counts.tx_type_used[i][j] += td_t->rd_counts.tx_type_used[i][j]; + } + + for (int i = 0; i < BLOCK_SIZES_ALL; i++) { + for (int j = 0; j < 2; j++) { + td->rd_counts.obmc_used[i][j] += td_t->rd_counts.obmc_used[i][j]; + } + } + + for (int i = 0; i < 2; i++) { + td->rd_counts.warped_used[i] += td_t->rd_counts.warped_used[i]; + } + + td->rd_counts.seg_tmp_pred_cost[0] += td_t->rd_counts.seg_tmp_pred_cost[0]; + td->rd_counts.seg_tmp_pred_cost[1] += td_t->rd_counts.seg_tmp_pred_cost[1]; + + td->rd_counts.newmv_or_intra_blocks += td_t->rd_counts.newmv_or_intra_blocks; +} + +static AOM_INLINE void update_delta_lf_for_row_mt(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &cpi->td.mb.e_mbd; + const int mib_size = cm->seq_params->mib_size; + const int frame_lf_count = + av1_num_planes(cm) > 1 ? 
FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + for (int row = 0; row < cm->tiles.rows; row++) { + for (int col = 0; col < cm->tiles.cols; col++) { + TileDataEnc *tile_data = &cpi->tile_data[row * cm->tiles.cols + col]; + const TileInfo *const tile_info = &tile_data->tile_info; + for (int mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end; + mi_row += mib_size) { + if (mi_row == tile_info->mi_row_start) + av1_reset_loop_filter_delta(xd, av1_num_planes(cm)); + for (int mi_col = tile_info->mi_col_start; + mi_col < tile_info->mi_col_end; mi_col += mib_size) { + const int idx_str = cm->mi_params.mi_stride * mi_row + mi_col; + MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + idx_str; + MB_MODE_INFO *mbmi = mi[0]; + if (mbmi->skip_txfm == 1 && + (mbmi->bsize == cm->seq_params->sb_size)) { + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) + mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id]; + mbmi->delta_lf_from_base = xd->delta_lf_from_base; + } else { + if (cm->delta_q_info.delta_lf_multi) { + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) + xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id]; + } else { + xd->delta_lf_from_base = mbmi->delta_lf_from_base; + } + } + } + } + } + } +} + +void av1_row_mt_sync_read_dummy(AV1EncRowMultiThreadSync *row_mt_sync, int r, + int c) { + (void)row_mt_sync; + (void)r; + (void)c; +} + +void av1_row_mt_sync_write_dummy(AV1EncRowMultiThreadSync *row_mt_sync, int r, + int c, int cols) { + (void)row_mt_sync; + (void)r; + (void)c; + (void)cols; +} + +void av1_row_mt_sync_read(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c) { +#if CONFIG_MULTITHREAD + const int nsync = row_mt_sync->sync_range; + + if (r) { + pthread_mutex_t *const mutex = &row_mt_sync->mutex_[r - 1]; + pthread_mutex_lock(mutex); + + while (c > row_mt_sync->num_finished_cols[r - 1] - nsync - + row_mt_sync->intrabc_extra_top_right_sb_delay) { + pthread_cond_wait(&row_mt_sync->cond_[r - 1], mutex); + } + pthread_mutex_unlock(mutex); + } +#else + (void)row_mt_sync; + (void)r; + (void)c; +#endif // CONFIG_MULTITHREAD +} + +void av1_row_mt_sync_write(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c, + int cols) { +#if CONFIG_MULTITHREAD + const int nsync = row_mt_sync->sync_range; + int cur; + // Only signal when there are enough encoded blocks for next row to run. + int sig = 1; + + if (c < cols - 1) { + cur = c; + if (c % nsync) sig = 0; + } else { + cur = cols + nsync + row_mt_sync->intrabc_extra_top_right_sb_delay; + } + + if (sig) { + pthread_mutex_lock(&row_mt_sync->mutex_[r]); + + // When a thread encounters an error, num_finished_cols[r] is set to maximum + // column number. In this case, the AOMMAX operation here ensures that + // num_finished_cols[r] is not overwritten with a smaller value thus + // preventing the infinite waiting of threads in the relevant sync_read() + // function. 
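+    // Worked example (a sketch, assuming sync_range == 1 and no intrabc
+    // delay): the thread coding row r publishes cur == c after every
+    // completed column, so a thread blocked in av1_row_mt_sync_read() on
+    // column c of row r + 1 resumes once num_finished_cols[r] reaches c + 1,
+    // i.e. once its top-right neighboring superblock is done.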
+ row_mt_sync->num_finished_cols[r] = + AOMMAX(row_mt_sync->num_finished_cols[r], cur); + + pthread_cond_signal(&row_mt_sync->cond_[r]); + pthread_mutex_unlock(&row_mt_sync->mutex_[r]); + } +#else + (void)row_mt_sync; + (void)r; + (void)c; + (void)cols; +#endif // CONFIG_MULTITHREAD +} + +// Allocate memory for row synchronization +static void row_mt_sync_mem_alloc(AV1EncRowMultiThreadSync *row_mt_sync, + AV1_COMMON *cm, int rows) { +#if CONFIG_MULTITHREAD + int i; + + CHECK_MEM_ERROR(cm, row_mt_sync->mutex_, + aom_malloc(sizeof(*row_mt_sync->mutex_) * rows)); + if (row_mt_sync->mutex_) { + for (i = 0; i < rows; ++i) { + pthread_mutex_init(&row_mt_sync->mutex_[i], NULL); + } + } + + CHECK_MEM_ERROR(cm, row_mt_sync->cond_, + aom_malloc(sizeof(*row_mt_sync->cond_) * rows)); + if (row_mt_sync->cond_) { + for (i = 0; i < rows; ++i) { + pthread_cond_init(&row_mt_sync->cond_[i], NULL); + } + } +#endif // CONFIG_MULTITHREAD + + CHECK_MEM_ERROR(cm, row_mt_sync->num_finished_cols, + aom_malloc(sizeof(*row_mt_sync->num_finished_cols) * rows)); + + row_mt_sync->rows = rows; + // Set up nsync. + row_mt_sync->sync_range = 1; +} + +// Deallocate row based multi-threading synchronization related mutex and data +void av1_row_mt_sync_mem_dealloc(AV1EncRowMultiThreadSync *row_mt_sync) { + if (row_mt_sync != NULL) { +#if CONFIG_MULTITHREAD + int i; + + if (row_mt_sync->mutex_ != NULL) { + for (i = 0; i < row_mt_sync->rows; ++i) { + pthread_mutex_destroy(&row_mt_sync->mutex_[i]); + } + aom_free(row_mt_sync->mutex_); + } + if (row_mt_sync->cond_ != NULL) { + for (i = 0; i < row_mt_sync->rows; ++i) { + pthread_cond_destroy(&row_mt_sync->cond_[i]); + } + aom_free(row_mt_sync->cond_); + } +#endif // CONFIG_MULTITHREAD + aom_free(row_mt_sync->num_finished_cols); + + // clear the structure as the source of this call may be dynamic change + // in tiles in which case this call will be followed by an _alloc() + // which may fail. 
+ av1_zero(*row_mt_sync); + } +} + +static AOM_INLINE int get_sb_rows_in_frame(AV1_COMMON *cm) { + return CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, + cm->seq_params->mib_size_log2); +} + +static void row_mt_mem_alloc(AV1_COMP *cpi, int max_rows, int max_cols, + int alloc_row_ctx) { + struct AV1Common *cm = &cpi->common; + AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + int tile_col, tile_row; + + av1_row_mt_mem_dealloc(cpi); + + // Allocate memory for row based multi-threading + for (tile_row = 0; tile_row < tile_rows; tile_row++) { + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + int tile_index = tile_row * tile_cols + tile_col; + TileDataEnc *const this_tile = &cpi->tile_data[tile_index]; + + row_mt_sync_mem_alloc(&this_tile->row_mt_sync, cm, max_rows); + + if (alloc_row_ctx) { + assert(max_cols > 0); + const int num_row_ctx = AOMMAX(1, (max_cols - 1)); + CHECK_MEM_ERROR(cm, this_tile->row_ctx, + (FRAME_CONTEXT *)aom_memalign( + 16, num_row_ctx * sizeof(*this_tile->row_ctx))); + } + } + } + const int sb_rows = get_sb_rows_in_frame(cm); + CHECK_MEM_ERROR( + cm, enc_row_mt->num_tile_cols_done, + aom_malloc(sizeof(*enc_row_mt->num_tile_cols_done) * sb_rows)); + + enc_row_mt->allocated_rows = max_rows; + enc_row_mt->allocated_cols = max_cols - 1; + enc_row_mt->allocated_sb_rows = sb_rows; +} + +void av1_row_mt_mem_dealloc(AV1_COMP *cpi) { + AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt; + const int tile_cols = enc_row_mt->allocated_tile_cols; + const int tile_rows = enc_row_mt->allocated_tile_rows; + int tile_col, tile_row; + + // Free row based multi-threading sync memory + for (tile_row = 0; tile_row < tile_rows; tile_row++) { + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + int tile_index = tile_row * tile_cols + tile_col; + TileDataEnc *const this_tile = &cpi->tile_data[tile_index]; + + av1_row_mt_sync_mem_dealloc(&this_tile->row_mt_sync); + + if (cpi->oxcf.algo_cfg.cdf_update_mode) { + aom_free(this_tile->row_ctx); + this_tile->row_ctx = NULL; + } + } + } + aom_free(enc_row_mt->num_tile_cols_done); + enc_row_mt->num_tile_cols_done = NULL; + enc_row_mt->allocated_rows = 0; + enc_row_mt->allocated_cols = 0; + enc_row_mt->allocated_sb_rows = 0; +} + +static AOM_INLINE void assign_tile_to_thread(int *thread_id_to_tile_id, + int num_tiles, int num_workers) { + int tile_id = 0; + int i; + + for (i = 0; i < num_workers; i++) { + thread_id_to_tile_id[i] = tile_id++; + if (tile_id == num_tiles) tile_id = 0; + } +} + +static AOM_INLINE int get_next_job(TileDataEnc *const tile_data, + int *current_mi_row, int mib_size) { + AV1EncRowMultiThreadSync *const row_mt_sync = &tile_data->row_mt_sync; + const int mi_row_end = tile_data->tile_info.mi_row_end; + + if (row_mt_sync->next_mi_row < mi_row_end) { + *current_mi_row = row_mt_sync->next_mi_row; + row_mt_sync->num_threads_working++; + row_mt_sync->next_mi_row += mib_size; + return 1; + } + return 0; +} + +static AOM_INLINE void switch_tile_and_get_next_job( + AV1_COMMON *const cm, TileDataEnc *const tile_data, int *cur_tile_id, + int *current_mi_row, int *end_of_frame, int is_firstpass, + const BLOCK_SIZE fp_block_size) { + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + + int tile_id = -1; // Stores the tile ID with minimum proc done + int max_mis_to_encode = 0; + int min_num_threads_working = INT_MAX; + + for (int tile_row = 0; tile_row < tile_rows; tile_row++) { + for 
(int tile_col = 0; tile_col < tile_cols; tile_col++) { + int tile_index = tile_row * tile_cols + tile_col; + TileDataEnc *const this_tile = &tile_data[tile_index]; + AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync; + +#if CONFIG_REALTIME_ONLY + int num_b_rows_in_tile = + av1_get_sb_rows_in_tile(cm, &this_tile->tile_info); + int num_b_cols_in_tile = + av1_get_sb_cols_in_tile(cm, &this_tile->tile_info); +#else + int num_b_rows_in_tile = + is_firstpass + ? av1_get_unit_rows_in_tile(&this_tile->tile_info, fp_block_size) + : av1_get_sb_rows_in_tile(cm, &this_tile->tile_info); + int num_b_cols_in_tile = + is_firstpass + ? av1_get_unit_cols_in_tile(&this_tile->tile_info, fp_block_size) + : av1_get_sb_cols_in_tile(cm, &this_tile->tile_info); +#endif + int theoretical_limit_on_threads = + AOMMIN((num_b_cols_in_tile + 1) >> 1, num_b_rows_in_tile); + int num_threads_working = row_mt_sync->num_threads_working; + + if (num_threads_working < theoretical_limit_on_threads) { + int num_mis_to_encode = + this_tile->tile_info.mi_row_end - row_mt_sync->next_mi_row; + + // Tile to be processed by this thread is selected on the basis of + // availability of jobs: + // 1) If jobs are available, tile to be processed is chosen on the + // basis of minimum number of threads working for that tile. If two or + // more tiles have same number of threads working for them, then the + // tile with maximum number of jobs available will be chosen. + // 2) If no jobs are available, then end_of_frame is reached. + if (num_mis_to_encode > 0) { + if (num_threads_working < min_num_threads_working) { + min_num_threads_working = num_threads_working; + max_mis_to_encode = 0; + } + if (num_threads_working == min_num_threads_working && + num_mis_to_encode > max_mis_to_encode) { + tile_id = tile_index; + max_mis_to_encode = num_mis_to_encode; + } + } + } + } + } + if (tile_id == -1) { + *end_of_frame = 1; + } else { + // Update the current tile id to the tile id that will be processed next, + // which will be the least processed tile. + *cur_tile_id = tile_id; + const int unit_height = mi_size_high[fp_block_size]; + get_next_job(&tile_data[tile_id], current_mi_row, + is_firstpass ? unit_height : cm->seq_params->mib_size); + } +} + +#if !CONFIG_REALTIME_ONLY +static void set_firstpass_encode_done(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + const BLOCK_SIZE fp_block_size = cpi->fp_block_size; + const int unit_height = mi_size_high[fp_block_size]; + + // In case of multithreading of firstpass encode, due to top-right + // dependency, the worker on a firstpass row waits for the completion of the + // firstpass processing of the top and top-right fp_blocks. Hence, in case a + // thread (main/worker) encounters an error, update the firstpass processing + // of every row in the frame to indicate that it is complete in order to avoid + // dependent workers waiting indefinitely. 
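// Illustrative aside (editorial sketch, not from the libaom sources) of the
// unblock-on-error idea the function body just below implements: after a
// failure, every row's progress is forced to its maximum value so that any
// worker blocked on a top or top-right dependency wakes up and exits instead
// of waiting forever. Names are hypothetical; libaom signals per completed
// unit rather than broadcasting once.
#include <pthread.h>

typedef struct {
  pthread_mutex_t mutex;
  pthread_cond_t cond;
  int finished_cols;
} ExampleRowSync;

// Force a row to look fully encoded and wake every waiter on it.
static void example_flush_row(ExampleRowSync *row, int max_cols) {
  pthread_mutex_lock(&row->mutex);
  if (row->finished_cols < max_cols) row->finished_cols = max_cols;
  pthread_cond_broadcast(&row->cond);
  pthread_mutex_unlock(&row->mutex);
}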
+  for (int tile_row = 0; tile_row < tile_rows; ++tile_row) {
+    for (int tile_col = 0; tile_col < tile_cols; ++tile_col) {
+      TileDataEnc *const tile_data =
+          &cpi->tile_data[tile_row * tile_cols + tile_col];
+      TileInfo *tile = &tile_data->tile_info;
+      AV1EncRowMultiThreadSync *const row_mt_sync = &tile_data->row_mt_sync;
+      const int unit_cols_in_tile =
+          av1_get_unit_cols_in_tile(tile, fp_block_size);
+      for (int mi_row = tile->mi_row_start, unit_row_in_tile = 0;
+           mi_row < tile->mi_row_end;
+           mi_row += unit_height, unit_row_in_tile++) {
+        enc_row_mt->sync_write_ptr(row_mt_sync, unit_row_in_tile,
+                                   unit_cols_in_tile - 1, unit_cols_in_tile);
+      }
+    }
+  }
+}
+
+static int fp_enc_row_mt_worker_hook(void *arg1, void *unused) {
+  EncWorkerData *const thread_data = (EncWorkerData *)arg1;
+  AV1_COMP *const cpi = thread_data->cpi;
+  int thread_id = thread_data->thread_id;
+  AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_;
+#endif
+  (void)unused;
+  struct aom_internal_error_info *const error_info = &thread_data->error_info;
+  MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
+  xd->error_info = error_info;
+
+  // The jmp_buf is valid only for the duration of the function that calls
+  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+  // before it returns.
+  if (setjmp(error_info->jmp)) {
+    error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+    pthread_mutex_lock(enc_row_mt_mutex_);
+    enc_row_mt->firstpass_mt_exit = true;
+    pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+    set_firstpass_encode_done(cpi);
+    return 0;
+  }
+  error_info->setjmp = 1;
+
+  AV1_COMMON *const cm = &cpi->common;
+  int cur_tile_id = enc_row_mt->thread_id_to_tile_id[thread_id];
+  assert(cur_tile_id != -1);
+
+  const BLOCK_SIZE fp_block_size = cpi->fp_block_size;
+  const int unit_height = mi_size_high[fp_block_size];
+  int end_of_frame = 0;
+  while (1) {
+    int current_mi_row = -1;
+#if CONFIG_MULTITHREAD
+    pthread_mutex_lock(enc_row_mt_mutex_);
+#endif
+    bool firstpass_mt_exit = enc_row_mt->firstpass_mt_exit;
+    if (!firstpass_mt_exit && !get_next_job(&cpi->tile_data[cur_tile_id],
+                                            &current_mi_row, unit_height)) {
+      // No jobs are available for the current tile. Query for the status of
+      // other tiles and get the next job if one is available.
+      switch_tile_and_get_next_job(cm, cpi->tile_data, &cur_tile_id,
+                                   &current_mi_row, &end_of_frame, 1,
+                                   fp_block_size);
+    }
+#if CONFIG_MULTITHREAD
+    pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+    // When firstpass_mt_exit is set to true, the other workers need not
+    // pursue any further jobs.
+    if (firstpass_mt_exit || end_of_frame) break;
+
+    TileDataEnc *const this_tile = &cpi->tile_data[cur_tile_id];
+    AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
+    ThreadData *td = thread_data->td;
+
+    assert(current_mi_row != -1 &&
+           current_mi_row < this_tile->tile_info.mi_row_end);
+
+    const int unit_height_log2 = mi_size_high_log2[fp_block_size];
+    av1_first_pass_row(cpi, td, this_tile, current_mi_row >> unit_height_log2,
+                       fp_block_size);
+#if CONFIG_MULTITHREAD
+    pthread_mutex_lock(enc_row_mt_mutex_);
+#endif
+    row_mt_sync->num_threads_working--;
+#if CONFIG_MULTITHREAD
+    pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+  }
+  error_info->setjmp = 0;
+  return 1;
+}
+#endif
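// Illustrative sketch (editorial, not from the libaom sources): the essence
// of the av1_row_mt_sync_read()/av1_row_mt_sync_write() handshake that the
// first-pass and encode row workers rely on. A row publishes its progress
// under a mutex and signals a condition variable; the row below blocks until
// the row above is at least sync_range units ahead (the extra top-right delay
// for intra block copy is omitted here). All names are hypothetical.
#include <pthread.h>
#include <stdio.h>

typedef struct {
  pthread_mutex_t mutex;
  pthread_cond_t cond;
  int finished_cols;  // progress of the top row
} RowSync;

static RowSync sync0 = { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER,
                         0 };
static const int kCols = 8, kSyncRange = 1;

static void *top_row(void *arg) {
  (void)arg;
  for (int c = 0; c < kCols; c++) {
    // ... encode block (0, c) here ...
    pthread_mutex_lock(&sync0.mutex);
    sync0.finished_cols = c + 1;  // publish progress
    pthread_cond_signal(&sync0.cond);
    pthread_mutex_unlock(&sync0.mutex);
  }
  return NULL;
}

int main(void) {
  pthread_t t;
  pthread_create(&t, NULL, top_row, NULL);
  for (int c = 0; c < kCols; c++) {
    pthread_mutex_lock(&sync0.mutex);
    // Top-right dependency: block (1, c) needs block (0, c + sync_range).
    while (sync0.finished_cols < c + 1 + kSyncRange &&
           sync0.finished_cols < kCols) {
      pthread_cond_wait(&sync0.cond, &sync0.mutex);
    }
    pthread_mutex_unlock(&sync0.mutex);
    printf("row 1, col %d may start\n", c);
  }
  pthread_join(t, NULL);
  return 0;
}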
+
+static void launch_loop_filter_rows(AV1_COMMON *cm, EncWorkerData *thread_data,
+                                    AV1EncRowMultiThreadInfo *enc_row_mt,
+                                    int mib_size_log2) {
+  AV1LfSync *const lf_sync = (AV1LfSync *)thread_data->lf_sync;
+  const int sb_rows = get_sb_rows_in_frame(cm);
+  AV1LfMTInfo *cur_job_info;
+  bool row_mt_exit = false;
+  (void)enc_row_mt;
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_;
+#endif
+
+  while ((cur_job_info = get_lf_job_info(lf_sync)) != NULL) {
+    LFWorkerData *const lf_data = (LFWorkerData *)thread_data->lf_data;
+    const int lpf_opt_level = cur_job_info->lpf_opt_level;
+    (void)sb_rows;
+#if CONFIG_MULTITHREAD
+    const int cur_sb_row = cur_job_info->mi_row >> mib_size_log2;
+    const int next_sb_row = AOMMIN(sb_rows - 1, cur_sb_row + 1);
+    // Wait for the current and next superblock row to finish encoding.
+    pthread_mutex_lock(enc_row_mt_mutex_);
+    while (!enc_row_mt->row_mt_exit &&
+           (enc_row_mt->num_tile_cols_done[cur_sb_row] < cm->tiles.cols ||
+            enc_row_mt->num_tile_cols_done[next_sb_row] < cm->tiles.cols)) {
+      pthread_cond_wait(enc_row_mt->cond_, enc_row_mt_mutex_);
+    }
+    row_mt_exit = enc_row_mt->row_mt_exit;
+    pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+    if (row_mt_exit) return;
+
+    av1_thread_loop_filter_rows(
+        lf_data->frame_buffer, lf_data->cm, lf_data->planes, lf_data->xd,
+        cur_job_info->mi_row, cur_job_info->plane, cur_job_info->dir,
+        lpf_opt_level, lf_sync, &thread_data->error_info, lf_data->params_buf,
+        lf_data->tx_buf, mib_size_log2);
+  }
+}
+
+static void set_encoding_done(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
+  AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+  const int mib_size = cm->seq_params->mib_size;
+
+  // In case of row multi-threading, due to the top-right dependency, the
+  // worker on an SB row waits for the completion of the encode of the top and
+  // top-right SBs. Hence, in case a thread (main/worker) encounters an error,
+  // mark the encoding of every SB row in the frame as complete so that the
+  // dependent workers of every tile do not wait indefinitely.
+  for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
+    for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
+      TileDataEnc *const this_tile =
+          &cpi->tile_data[tile_row * tile_cols + tile_col];
+      const TileInfo *const tile_info = &this_tile->tile_info;
+      AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
+      const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_info);
+      for (int mi_row = tile_info->mi_row_start, sb_row_in_tile = 0;
+           mi_row < tile_info->mi_row_end;
+           mi_row += mib_size, sb_row_in_tile++) {
+        enc_row_mt->sync_write_ptr(row_mt_sync, sb_row_in_tile,
+                                   sb_cols_in_tile - 1, sb_cols_in_tile);
+      }
+    }
+  }
+}
+
+static bool lpf_mt_with_enc_enabled(int pipeline_lpf_mt_with_enc,
+                                    const int filter_level[2]) {
+  return pipeline_lpf_mt_with_enc && (filter_level[0] || filter_level[1]);
+}
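// Illustrative sketch (editorial, not from the libaom sources): the
// setjmp()-based error path used by the worker hooks above and below. The
// worker arms a jmp_buf on entry; a deeply nested failure longjmp()s back to
// the hook, which disarms the buffer and returns 0 so the sync step can
// report the failure. Simplified; all names are hypothetical.
#include <setjmp.h>
#include <stdio.h>

typedef struct {
  jmp_buf jmp;
  int setjmp_armed;
} ErrorInfo;

static void deep_work(ErrorInfo *e, int fail) {
  if (fail && e->setjmp_armed) longjmp(e->jmp, 1);  // unwind to the hook
}

static int worker_hook(ErrorInfo *e, int fail) {
  if (setjmp(e->jmp)) {
    e->setjmp_armed = 0;  // jmp_buf dies with this frame; disarm on exit
    return 0;             // failure: the caller's sync step copies the error
  }
  e->setjmp_armed = 1;
  deep_work(e, fail);
  e->setjmp_armed = 0;
  return 1;  // success
}

int main(void) {
  ErrorInfo e = { .setjmp_armed = 0 };
  printf("ok run: %d\n", worker_hook(&e, 0));       // 1
  printf("failing run: %d\n", worker_hook(&e, 1));  // 0
  return 0;
}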
+
+static int enc_row_mt_worker_hook(void *arg1, void *unused) {
+  EncWorkerData *const thread_data = (EncWorkerData *)arg1;
+  AV1_COMP *const cpi = thread_data->cpi;
+  int thread_id = thread_data->thread_id;
+  AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_;
+#endif
+  (void)unused;
+
+  struct aom_internal_error_info *const error_info = &thread_data->error_info;
+  AV1LfSync *const lf_sync = thread_data->lf_sync;
+  MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
+  xd->error_info = error_info;
+  AV1_COMMON *volatile const cm = &cpi->common;
+  volatile const bool do_pipelined_lpf_mt_with_enc = lpf_mt_with_enc_enabled(
+      cpi->mt_info.pipeline_lpf_mt_with_enc, cm->lf.filter_level);
+
+  // The jmp_buf is valid only for the duration of the function that calls
+  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+  // before it returns.
+  if (setjmp(error_info->jmp)) {
+    error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+    pthread_mutex_lock(enc_row_mt_mutex_);
+    enc_row_mt->row_mt_exit = true;
+    // Wake up all the workers waiting in launch_loop_filter_rows() to exit in
+    // case of an error.
+    pthread_cond_broadcast(enc_row_mt->cond_);
+    pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+    set_encoding_done(cpi);
+
+    if (do_pipelined_lpf_mt_with_enc) {
+#if CONFIG_MULTITHREAD
+      pthread_mutex_lock(lf_sync->job_mutex);
+      lf_sync->lf_mt_exit = true;
+      pthread_mutex_unlock(lf_sync->job_mutex);
+#endif
+      av1_set_vert_loop_filter_done(&cpi->common, lf_sync,
+                                    cpi->common.seq_params->mib_size_log2);
+    }
+    return 0;
+  }
+  error_info->setjmp = 1;
+
+  const int mib_size_log2 = cm->seq_params->mib_size_log2;
+  int cur_tile_id = enc_row_mt->thread_id_to_tile_id[thread_id];
+
+  // Preallocate the pc_tree for realtime coding to reduce the cost of memory
+  // allocation.
+  if (cpi->sf.rt_sf.use_nonrd_pick_mode) {
+    thread_data->td->pc_root = av1_alloc_pc_tree_node(cm->seq_params->sb_size);
+    if (!thread_data->td->pc_root)
+      aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate PC_TREE");
+  } else {
+    thread_data->td->pc_root = NULL;
+  }
+
+  assert(cur_tile_id != -1);
+
+  const BLOCK_SIZE fp_block_size = cpi->fp_block_size;
+  int end_of_frame = 0;
+  bool row_mt_exit = false;
+
+  // When the master thread does not have a valid job to process, xd->tile_ctx
+  // is not set and contains a NULL pointer. This can result in a NULL pointer
+  // access violation if it is accessed beyond the encode stage. Hence,
+  // thread_data->td->mb.e_mbd.tile_ctx is initialized with the common frame
+  // context to avoid NULL pointer access in subsequent stages.
+  thread_data->td->mb.e_mbd.tile_ctx = cm->fc;
+  while (1) {
+    int current_mi_row = -1;
+#if CONFIG_MULTITHREAD
+    pthread_mutex_lock(enc_row_mt_mutex_);
+#endif
+    row_mt_exit = enc_row_mt->row_mt_exit;
+    // The row_mt_exit check here can be avoided as it is checked after
+    // sync_read_ptr() in encode_sb_row(). However, checking it here allows
+    // this function to return early, before calling get_next_job().
+    if (!row_mt_exit &&
+        !get_next_job(&cpi->tile_data[cur_tile_id], &current_mi_row,
+                      cm->seq_params->mib_size)) {
+      // No jobs are available for the current tile. Query for the status of
+      // other tiles and get the next job if one is available.
+      switch_tile_and_get_next_job(cm, cpi->tile_data, &cur_tile_id,
+                                   &current_mi_row, &end_of_frame, 0,
+                                   fp_block_size);
+    }
+#if CONFIG_MULTITHREAD
+    pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+    // When row_mt_exit is set to true, other workers need not pursue any
+    // further jobs.
+    if (row_mt_exit) {
+      error_info->setjmp = 0;
+      return 1;
+    }
+
+    if (end_of_frame) break;
+
+    TileDataEnc *const this_tile = &cpi->tile_data[cur_tile_id];
+    AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
+    const TileInfo *const tile_info = &this_tile->tile_info;
+    const int tile_row = tile_info->tile_row;
+    const int tile_col = tile_info->tile_col;
+    ThreadData *td = thread_data->td;
+    const int sb_row = current_mi_row >> mib_size_log2;
+
+    assert(current_mi_row != -1 && current_mi_row <= tile_info->mi_row_end);
+
+    td->mb.e_mbd.tile_ctx = td->tctx;
+    td->mb.tile_pb_ctx = &this_tile->tctx;
+    td->abs_sum_level = 0;
+
+    if (this_tile->allow_update_cdf) {
+      td->mb.row_ctx = this_tile->row_ctx;
+      if (current_mi_row == tile_info->mi_row_start)
+        memcpy(td->mb.e_mbd.tile_ctx, &this_tile->tctx, sizeof(FRAME_CONTEXT));
+    } else {
+      memcpy(td->mb.e_mbd.tile_ctx, &this_tile->tctx, sizeof(FRAME_CONTEXT));
+    }
+
+    av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row,
+                           &td->mb.e_mbd);
+
+    cfl_init(&td->mb.e_mbd.cfl, cm->seq_params);
+    if (td->mb.txfm_search_info.mb_rd_record != NULL) {
+      av1_crc32c_calculator_init(
+          &td->mb.txfm_search_info.mb_rd_record->crc_calculator);
+    }
+
+    av1_encode_sb_row(cpi, td, tile_row, tile_col, current_mi_row);
+#if CONFIG_MULTITHREAD
+    pthread_mutex_lock(enc_row_mt_mutex_);
+#endif
+    this_tile->abs_sum_level += td->abs_sum_level;
+    row_mt_sync->num_threads_working--;
+    enc_row_mt->num_tile_cols_done[sb_row]++;
+#if CONFIG_MULTITHREAD
+    pthread_cond_broadcast(enc_row_mt->cond_);
+    pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+  }
+  if (do_pipelined_lpf_mt_with_enc) {
+    // Loop-filter a superblock row if encoding of the current and next
+    // superblock row is complete.
+    // TODO(deepa.kg @ittiam.com) Evaluate encoder speed by interleaving
+    // encoding and loop filter stage.
+ launch_loop_filter_rows(cm, thread_data, enc_row_mt, mib_size_log2); + } + av1_free_pc_tree_recursive(thread_data->td->pc_root, av1_num_planes(cm), 0, 0, + cpi->sf.part_sf.partition_search_type); + thread_data->td->pc_root = NULL; + error_info->setjmp = 0; + return 1; +} + +static int enc_worker_hook(void *arg1, void *unused) { + EncWorkerData *const thread_data = (EncWorkerData *)arg1; + AV1_COMP *const cpi = thread_data->cpi; + MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd; + struct aom_internal_error_info *const error_info = &thread_data->error_info; + const AV1_COMMON *const cm = &cpi->common; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + int t; + + (void)unused; + + xd->error_info = error_info; + + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. + if (setjmp(error_info->jmp)) { + error_info->setjmp = 0; + return 0; + } + error_info->setjmp = 1; + + // Preallocate the pc_tree for realtime coding to reduce the cost of memory + // allocation. + if (cpi->sf.rt_sf.use_nonrd_pick_mode) { + thread_data->td->pc_root = av1_alloc_pc_tree_node(cm->seq_params->sb_size); + if (!thread_data->td->pc_root) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PC_TREE"); + } else { + thread_data->td->pc_root = NULL; + } + + for (t = thread_data->start; t < tile_rows * tile_cols; + t += cpi->mt_info.num_workers) { + int tile_row = t / tile_cols; + int tile_col = t % tile_cols; + + TileDataEnc *const this_tile = + &cpi->tile_data[tile_row * cm->tiles.cols + tile_col]; + thread_data->td->mb.e_mbd.tile_ctx = &this_tile->tctx; + thread_data->td->mb.tile_pb_ctx = &this_tile->tctx; + av1_encode_tile(cpi, thread_data->td, tile_row, tile_col); + } + + av1_free_pc_tree_recursive(thread_data->td->pc_root, av1_num_planes(cm), 0, 0, + cpi->sf.part_sf.partition_search_type); + thread_data->td->pc_root = NULL; + error_info->setjmp = 0; + return 1; +} + +void av1_init_frame_mt(AV1_PRIMARY *ppi, AV1_COMP *cpi) { + cpi->mt_info.workers = ppi->p_mt_info.workers; + cpi->mt_info.num_workers = ppi->p_mt_info.num_workers; + cpi->mt_info.tile_thr_data = ppi->p_mt_info.tile_thr_data; + int i; + for (i = MOD_FP; i < NUM_MT_MODULES; i++) { + cpi->mt_info.num_mod_workers[i] = + AOMMIN(cpi->mt_info.num_workers, ppi->p_mt_info.num_mod_workers[i]); + } +} + +void av1_init_cdef_worker(AV1_COMP *cpi) { + // The allocation is done only for level 0 parallel frames. No change + // in config is supported in the middle of a parallel encode set, since the + // rest of the MT modules also do not support dynamic change of config. 
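// Illustrative aside (editorial sketch, not from the libaom sources): the
// tile scheduling used by enc_worker_hook() above. Worker k encodes tiles
// k, k + num_workers, k + 2 * num_workers, ..., and a linear tile index maps
// to (row, col) as (t / tile_cols, t % tile_cols). Values are hypothetical.
#include <stdio.h>

int main(void) {
  const int tile_rows = 2, tile_cols = 3, num_workers = 2;
  for (int worker = 0; worker < num_workers; worker++) {
    for (int t = worker; t < tile_rows * tile_cols; t += num_workers) {
      printf("worker %d -> tile (%d, %d)\n", worker, t / tile_cols,
             t % tile_cols);
    }
  }
  return 0;
}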
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) return; + PrimaryMultiThreadInfo *const p_mt_info = &cpi->ppi->p_mt_info; + int num_cdef_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_CDEF); + + av1_alloc_cdef_buffers(&cpi->common, &p_mt_info->cdef_worker, + &cpi->mt_info.cdef_sync, num_cdef_workers, 1); + cpi->mt_info.cdef_worker = p_mt_info->cdef_worker; +} + +#if !CONFIG_REALTIME_ONLY +void av1_init_lr_mt_buffers(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + AV1LrSync *lr_sync = &cpi->mt_info.lr_row_sync; + if (lr_sync->sync_range) { + if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) + return; + int num_lr_workers = + av1_get_num_mod_workers_for_alloc(&cpi->ppi->p_mt_info, MOD_LR); + assert(num_lr_workers <= lr_sync->num_workers); + lr_sync->lrworkerdata[num_lr_workers - 1].rst_tmpbuf = cm->rst_tmpbuf; + lr_sync->lrworkerdata[num_lr_workers - 1].rlbs = cm->rlbs; + } +} +#endif + +#if CONFIG_MULTITHREAD +void av1_init_mt_sync(AV1_COMP *cpi, int is_first_pass) { + AV1_COMMON *const cm = &cpi->common; + MultiThreadInfo *const mt_info = &cpi->mt_info; + + if (setjmp(cm->error->jmp)) { + cm->error->setjmp = 0; + aom_internal_error_copy(&cpi->ppi->error, cm->error); + } + cm->error->setjmp = 1; + // Initialize enc row MT object. + if (is_first_pass || cpi->oxcf.row_mt == 1) { + AV1EncRowMultiThreadInfo *enc_row_mt = &mt_info->enc_row_mt; + if (enc_row_mt->mutex_ == NULL) { + CHECK_MEM_ERROR(cm, enc_row_mt->mutex_, + aom_malloc(sizeof(*(enc_row_mt->mutex_)))); + if (enc_row_mt->mutex_) pthread_mutex_init(enc_row_mt->mutex_, NULL); + } + if (enc_row_mt->cond_ == NULL) { + CHECK_MEM_ERROR(cm, enc_row_mt->cond_, + aom_malloc(sizeof(*(enc_row_mt->cond_)))); + if (enc_row_mt->cond_) pthread_cond_init(enc_row_mt->cond_, NULL); + } + } + + if (!is_first_pass) { + // Initialize global motion MT object. + AV1GlobalMotionSync *gm_sync = &mt_info->gm_sync; + if (gm_sync->mutex_ == NULL) { + CHECK_MEM_ERROR(cm, gm_sync->mutex_, + aom_malloc(sizeof(*(gm_sync->mutex_)))); + if (gm_sync->mutex_) pthread_mutex_init(gm_sync->mutex_, NULL); + } +#if !CONFIG_REALTIME_ONLY + // Initialize temporal filtering MT object. + AV1TemporalFilterSync *tf_sync = &mt_info->tf_sync; + if (tf_sync->mutex_ == NULL) { + CHECK_MEM_ERROR(cm, tf_sync->mutex_, + aom_malloc(sizeof(*tf_sync->mutex_))); + if (tf_sync->mutex_) pthread_mutex_init(tf_sync->mutex_, NULL); + } +#endif // !CONFIG_REALTIME_ONLY + // Initialize CDEF MT object. + AV1CdefSync *cdef_sync = &mt_info->cdef_sync; + if (cdef_sync->mutex_ == NULL) { + CHECK_MEM_ERROR(cm, cdef_sync->mutex_, + aom_malloc(sizeof(*(cdef_sync->mutex_)))); + if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL); + } + + // Initialize loop filter MT object. + AV1LfSync *lf_sync = &mt_info->lf_row_sync; + // Number of superblock rows + const int sb_rows = + CEIL_POWER_OF_TWO(cm->height >> MI_SIZE_LOG2, MAX_MIB_SIZE_LOG2); + PrimaryMultiThreadInfo *const p_mt_info = &cpi->ppi->p_mt_info; + int num_lf_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_LPF); + + if (!lf_sync->sync_range || sb_rows != lf_sync->rows || + num_lf_workers > lf_sync->num_workers) { + av1_loop_filter_dealloc(lf_sync); + av1_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_lf_workers); + } + + // Initialize tpl MT object. 
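// Illustrative aside (editorial sketch, not from the libaom sources): the
// allocate-once pattern that av1_init_mt_sync() applies to each sync object
// above and to the tpl and pack-bitstream objects below. The mutex is heap
// allocated and initialized only on first use, then reused across frames.
// Names are hypothetical; the real code jumps to the encoder's error handler
// on allocation failure instead of returning a code.
#include <pthread.h>
#include <stdlib.h>

typedef struct {
  pthread_mutex_t *mutex_;
} ExampleSync;

// Returns 0 on success, -1 on allocation failure.
static int example_sync_init(ExampleSync *s) {
  if (s->mutex_ == NULL) {
    s->mutex_ = malloc(sizeof(*s->mutex_));
    if (s->mutex_ == NULL) return -1;
    pthread_mutex_init(s->mutex_, NULL);
  }
  return 0;  // already initialized: nothing to do
}

int main(void) {
  ExampleSync s = { NULL };
  example_sync_init(&s);  // allocates and initializes
  example_sync_init(&s);  // no-op: already initialized
  pthread_mutex_destroy(s.mutex_);
  free(s.mutex_);
  return 0;
}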
+ AV1TplRowMultiThreadInfo *tpl_row_mt = &mt_info->tpl_row_mt; + if (tpl_row_mt->mutex_ == NULL) { + CHECK_MEM_ERROR(cm, tpl_row_mt->mutex_, + aom_malloc(sizeof(*(tpl_row_mt->mutex_)))); + if (tpl_row_mt->mutex_) pthread_mutex_init(tpl_row_mt->mutex_, NULL); + } + +#if !CONFIG_REALTIME_ONLY + if (is_restoration_used(cm)) { + // Initialize loop restoration MT object. + AV1LrSync *lr_sync = &mt_info->lr_row_sync; + int rst_unit_size = cpi->sf.lpf_sf.min_lr_unit_size; + int num_rows_lr = av1_lr_count_units(rst_unit_size, cm->height); + int num_lr_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_LR); + if (!lr_sync->sync_range || num_rows_lr > lr_sync->rows || + num_lr_workers > lr_sync->num_workers || + MAX_MB_PLANE > lr_sync->num_planes) { + av1_loop_restoration_dealloc(lr_sync); + av1_loop_restoration_alloc(lr_sync, cm, num_lr_workers, num_rows_lr, + MAX_MB_PLANE, cm->width); + } + } +#endif + + // Initialization of pack bitstream MT object. + AV1EncPackBSSync *pack_bs_sync = &mt_info->pack_bs_sync; + if (pack_bs_sync->mutex_ == NULL) { + CHECK_MEM_ERROR(cm, pack_bs_sync->mutex_, + aom_malloc(sizeof(*pack_bs_sync->mutex_))); + if (pack_bs_sync->mutex_) pthread_mutex_init(pack_bs_sync->mutex_, NULL); + } + } + cm->error->setjmp = 0; +} +#endif // CONFIG_MULTITHREAD + +// Computes the number of workers to be considered while allocating memory for a +// multi-threaded module under FPMT. +int av1_get_num_mod_workers_for_alloc(const PrimaryMultiThreadInfo *p_mt_info, + MULTI_THREADED_MODULES mod_name) { + int num_mod_workers = p_mt_info->num_mod_workers[mod_name]; + if (p_mt_info->num_mod_workers[MOD_FRAME_ENC] > 1) { + // TODO(anyone): Change num_mod_workers to num_mod_workers[MOD_FRAME_ENC]. + // As frame parallel jobs will only perform multi-threading for the encode + // stage, we can limit the allocations according to num_enc_workers per + // frame parallel encode(a.k.a num_mod_workers[MOD_FRAME_ENC]). + num_mod_workers = p_mt_info->num_workers; + } + return num_mod_workers; +} + +void av1_init_tile_thread_data(AV1_PRIMARY *ppi, int is_first_pass) { + PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info; + + assert(p_mt_info->workers != NULL); + assert(p_mt_info->tile_thr_data != NULL); + + int num_workers = p_mt_info->num_workers; + int num_enc_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_ENC); + assert(num_enc_workers <= num_workers); + for (int i = num_workers - 1; i >= 0; i--) { + EncWorkerData *const thread_data = &p_mt_info->tile_thr_data[i]; + + if (i > 0) { + // Allocate thread data. + ThreadData *td; + AOM_CHECK_MEM_ERROR(&ppi->error, td, aom_memalign(32, sizeof(*td))); + av1_zero(*td); + thread_data->original_td = thread_data->td = td; + + // Set up shared coeff buffers. + av1_setup_shared_coeff_buffer(&ppi->seq_params, &td->shared_coeff_buf, + &ppi->error); + AOM_CHECK_MEM_ERROR(&ppi->error, td->tmp_conv_dst, + aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * + sizeof(*td->tmp_conv_dst))); + + if (i < p_mt_info->num_mod_workers[MOD_FP]) { + // Set up firstpass PICK_MODE_CONTEXT. + td->firstpass_ctx = + av1_alloc_pmc(ppi->cpi, BLOCK_16X16, &td->shared_coeff_buf); + if (!td->firstpass_ctx) + aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate PICK_MODE_CONTEXT"); + } + + if (!is_first_pass && i < num_enc_workers) { + // Set up sms_tree. 
+ if (av1_setup_sms_tree(ppi->cpi, td)) { + aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate SMS tree"); + } + + for (int x = 0; x < 2; x++) + for (int y = 0; y < 2; y++) + AOM_CHECK_MEM_ERROR( + &ppi->error, td->hash_value_buffer[x][y], + (uint32_t *)aom_malloc(AOM_BUFFER_SIZE_FOR_BLOCK_HASH * + sizeof(*td->hash_value_buffer[0][0]))); + + // Allocate frame counters in thread data. + AOM_CHECK_MEM_ERROR(&ppi->error, td->counts, + aom_calloc(1, sizeof(*td->counts))); + + // Allocate buffers used by palette coding mode. + AOM_CHECK_MEM_ERROR(&ppi->error, td->palette_buffer, + aom_memalign(16, sizeof(*td->palette_buffer))); + + // The buffers 'tmp_pred_bufs[]', 'comp_rd_buffer' and 'obmc_buffer' are + // used in inter frames to store intermediate inter mode prediction + // results and are not required for allintra encoding mode. Hence, the + // memory allocations for these buffers are avoided for allintra + // encoding mode. + if (ppi->cpi->oxcf.kf_cfg.key_freq_max != 0) { + alloc_obmc_buffers(&td->obmc_buffer, &ppi->error); + + alloc_compound_type_rd_buffers(&ppi->error, &td->comp_rd_buffer); + + for (int j = 0; j < 2; ++j) { + AOM_CHECK_MEM_ERROR( + &ppi->error, td->tmp_pred_bufs[j], + aom_memalign(32, 2 * MAX_MB_PLANE * MAX_SB_SQUARE * + sizeof(*td->tmp_pred_bufs[j]))); + } + } + + if (is_gradient_caching_for_hog_enabled(ppi->cpi)) { + const int plane_types = PLANE_TYPES >> ppi->seq_params.monochrome; + AOM_CHECK_MEM_ERROR(&ppi->error, td->pixel_gradient_info, + aom_malloc(sizeof(*td->pixel_gradient_info) * + plane_types * MAX_SB_SQUARE)); + } + + if (is_src_var_for_4x4_sub_blocks_caching_enabled(ppi->cpi)) { + const BLOCK_SIZE sb_size = ppi->cpi->common.seq_params->sb_size; + const int mi_count_in_sb = + mi_size_wide[sb_size] * mi_size_high[sb_size]; + + AOM_CHECK_MEM_ERROR( + &ppi->error, td->src_var_info_of_4x4_sub_blocks, + aom_malloc(sizeof(*td->src_var_info_of_4x4_sub_blocks) * + mi_count_in_sb)); + } + + if (ppi->cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION) { + const int num_64x64_blocks = + (ppi->seq_params.sb_size == BLOCK_64X64) ? 1 : 4; + AOM_CHECK_MEM_ERROR( + &ppi->error, td->vt64x64, + aom_malloc(sizeof(*td->vt64x64) * num_64x64_blocks)); + } + } + } + + if (!is_first_pass && ppi->cpi->oxcf.row_mt == 1 && i < num_enc_workers) { + if (i == 0) { + for (int j = 0; j < ppi->num_fp_contexts; j++) { + AOM_CHECK_MEM_ERROR(&ppi->error, ppi->parallel_cpi[j]->td.tctx, + (FRAME_CONTEXT *)aom_memalign( + 16, sizeof(*ppi->parallel_cpi[j]->td.tctx))); + } + } else { + AOM_CHECK_MEM_ERROR( + &ppi->error, thread_data->td->tctx, + (FRAME_CONTEXT *)aom_memalign(16, sizeof(*thread_data->td->tctx))); + } + } + } + + // Record the number of workers in encode stage multi-threading for which + // allocation is done. 
+ p_mt_info->prev_num_enc_workers = num_enc_workers; +} + +void av1_create_workers(AV1_PRIMARY *ppi, int num_workers) { + PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info; + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + assert(p_mt_info->num_workers == 0); + + AOM_CHECK_MEM_ERROR(&ppi->error, p_mt_info->workers, + aom_malloc(num_workers * sizeof(*p_mt_info->workers))); + + AOM_CHECK_MEM_ERROR( + &ppi->error, p_mt_info->tile_thr_data, + aom_calloc(num_workers, sizeof(*p_mt_info->tile_thr_data))); + + for (int i = 0; i < num_workers; ++i) { + AVxWorker *const worker = &p_mt_info->workers[i]; + EncWorkerData *const thread_data = &p_mt_info->tile_thr_data[i]; + + winterface->init(worker); + worker->thread_name = "aom enc worker"; + + thread_data->thread_id = i; + // Set the starting tile for each thread. + thread_data->start = i; + + if (i > 0) { + // Create threads + if (!winterface->reset(worker)) + aom_internal_error(&ppi->error, AOM_CODEC_ERROR, + "Tile encoder thread creation failed"); + } + winterface->sync(worker); + + ++p_mt_info->num_workers; + } +} + +// This function will change the state and free the mutex of corresponding +// workers and terminate the object. The object can not be re-used unless a call +// to reset() is made. +void av1_terminate_workers(AV1_PRIMARY *ppi) { + PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info; + for (int t = 0; t < p_mt_info->num_workers; ++t) { + AVxWorker *const worker = &p_mt_info->workers[t]; + aom_get_worker_interface()->end(worker); + } +} + +// This function returns 1 if frame parallel encode is supported for +// the current configuration. Returns 0 otherwise. +static AOM_INLINE int is_fpmt_config(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf) { + // FPMT is enabled for AOM_Q and AOM_VBR. + // TODO(Tarun): Test and enable resize config. 
+ if (oxcf->rc_cfg.mode == AOM_CBR || oxcf->rc_cfg.mode == AOM_CQ) { + return 0; + } + if (ppi->use_svc) { + return 0; + } + if (oxcf->tile_cfg.enable_large_scale_tile) { + return 0; + } + if (oxcf->dec_model_cfg.timing_info_present) { + return 0; + } + if (oxcf->mode != GOOD) { + return 0; + } + if (oxcf->tool_cfg.error_resilient_mode) { + return 0; + } + if (oxcf->resize_cfg.resize_mode) { + return 0; + } + if (oxcf->pass != AOM_RC_SECOND_PASS) { + return 0; + } + if (oxcf->max_threads < 2) { + return 0; + } + if (!oxcf->fp_mt) { + return 0; + } + + return 1; +} + +int av1_check_fpmt_config(AV1_PRIMARY *const ppi, + AV1EncoderConfig *const oxcf) { + if (is_fpmt_config(ppi, oxcf)) return 1; + // Reset frame parallel configuration for unsupported config + if (ppi->num_fp_contexts > 1) { + for (int i = 1; i < ppi->num_fp_contexts; i++) { + // Release the previously-used frame-buffer + if (ppi->parallel_cpi[i]->common.cur_frame != NULL) { + --ppi->parallel_cpi[i]->common.cur_frame->ref_count; + ppi->parallel_cpi[i]->common.cur_frame = NULL; + } + } + + int cur_gf_index = ppi->cpi->gf_frame_index; + int reset_size = AOMMAX(0, ppi->gf_group.size - cur_gf_index); + av1_zero_array(&ppi->gf_group.frame_parallel_level[cur_gf_index], + reset_size); + av1_zero_array(&ppi->gf_group.is_frame_non_ref[cur_gf_index], reset_size); + av1_zero_array(&ppi->gf_group.src_offset[cur_gf_index], reset_size); + memset(&ppi->gf_group.skip_frame_refresh[cur_gf_index][0], INVALID_IDX, + sizeof(ppi->gf_group.skip_frame_refresh[cur_gf_index][0]) * + reset_size * REF_FRAMES); + memset(&ppi->gf_group.skip_frame_as_ref[cur_gf_index], INVALID_IDX, + sizeof(ppi->gf_group.skip_frame_as_ref[cur_gf_index]) * reset_size); + ppi->num_fp_contexts = 1; + } + return 0; +} + +// A large value for threads used to compute the max num_enc_workers +// possible for each resolution. +#define MAX_THREADS 100 + +// Computes the max number of enc workers possible for each resolution. +static AOM_INLINE int compute_max_num_enc_workers( + CommonModeInfoParams *const mi_params, int mib_size_log2) { + int num_sb_rows = CEIL_POWER_OF_TWO(mi_params->mi_rows, mib_size_log2); + int num_sb_cols = CEIL_POWER_OF_TWO(mi_params->mi_cols, mib_size_log2); + + return AOMMIN((num_sb_cols + 1) >> 1, num_sb_rows); +} + +// Computes the number of frame parallel(fp) contexts to be created +// based on the number of max_enc_workers. +int av1_compute_num_fp_contexts(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf) { + ppi->p_mt_info.num_mod_workers[MOD_FRAME_ENC] = 0; + if (!av1_check_fpmt_config(ppi, oxcf)) { + return 1; + } + int max_num_enc_workers = compute_max_num_enc_workers( + &ppi->cpi->common.mi_params, ppi->cpi->common.seq_params->mib_size_log2); + // Scaling factors and rounding factors used to tune worker_per_frame + // computation. + int rounding_factor[2] = { 2, 4 }; + int scaling_factor[2] = { 4, 8 }; + int is_480p_or_lesser = + AOMMIN(oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height) <= 480; + int is_sb_64 = 0; + if (ppi->cpi != NULL) + is_sb_64 = ppi->cpi->common.seq_params->sb_size == BLOCK_64X64; + // A parallel frame encode has at least 1/4th the + // theoretical limit of max enc workers in default case. For resolutions + // larger than 480p, if SB size is 64x64, optimal performance is obtained with + // limit of 1/8. + int index = (!is_480p_or_lesser && is_sb_64) ? 
1 : 0; + int workers_per_frame = + AOMMAX(1, (max_num_enc_workers + rounding_factor[index]) / + scaling_factor[index]); + int max_threads = oxcf->max_threads; + int num_fp_contexts = max_threads / workers_per_frame; + // Based on empirical results, FPMT gains with multi-tile are significant when + // more parallel frames are available. Use FPMT with multi-tile encode only + // when sufficient threads are available for parallel encode of + // MAX_PARALLEL_FRAMES frames. + if (oxcf->tile_cfg.tile_columns > 0 || oxcf->tile_cfg.tile_rows > 0) { + if (num_fp_contexts < MAX_PARALLEL_FRAMES) num_fp_contexts = 1; + } + + num_fp_contexts = AOMMAX(1, AOMMIN(num_fp_contexts, MAX_PARALLEL_FRAMES)); + // Limit recalculated num_fp_contexts to ppi->num_fp_contexts. + num_fp_contexts = (ppi->num_fp_contexts == 1) + ? num_fp_contexts + : AOMMIN(num_fp_contexts, ppi->num_fp_contexts); + if (num_fp_contexts > 1) { + ppi->p_mt_info.num_mod_workers[MOD_FRAME_ENC] = + AOMMIN(max_num_enc_workers * num_fp_contexts, oxcf->max_threads); + } + return num_fp_contexts; +} + +// Computes the number of workers to process each of the parallel frames. +static AOM_INLINE int compute_num_workers_per_frame( + const int num_workers, const int parallel_frame_count) { + // Number of level 2 workers per frame context (floor division). + int workers_per_frame = (num_workers / parallel_frame_count); + return workers_per_frame; +} + +static AOM_INLINE void restore_workers_after_fpmt( + AV1_PRIMARY *ppi, int parallel_frame_count, int num_fpmt_workers_prepared); + +// Prepare level 1 workers. This function is only called for +// parallel_frame_count > 1. This function populates the mt_info structure of +// frame level contexts appropriately by dividing the total number of available +// workers amongst the frames as level 2 workers. It also populates the hook and +// data members of level 1 workers. +static AOM_INLINE void prepare_fpmt_workers(AV1_PRIMARY *ppi, + AV1_COMP_DATA *first_cpi_data, + AVxWorkerHook hook, + int parallel_frame_count) { + assert(parallel_frame_count <= ppi->num_fp_contexts && + parallel_frame_count > 1); + + PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info; + int num_workers = p_mt_info->num_workers; + + volatile int frame_idx = 0; + volatile int i = 0; + while (i < num_workers) { + // Assign level 1 worker + AVxWorker *frame_worker = p_mt_info->p_workers[frame_idx] = + &p_mt_info->workers[i]; + AV1_COMP *cur_cpi = ppi->parallel_cpi[frame_idx]; + MultiThreadInfo *mt_info = &cur_cpi->mt_info; + // This 'aom_internal_error_info' pointer is not derived from the local + // pointer ('AV1_COMMON *const cm') to silence the compiler warning + // "variable 'cm' might be clobbered by 'longjmp' or 'vfork' [-Wclobbered]". + struct aom_internal_error_info *const error = cur_cpi->common.error; + + // The jmp_buf is valid only within the scope of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. + if (setjmp(error->jmp)) { + error->setjmp = 0; + restore_workers_after_fpmt(ppi, parallel_frame_count, i); + aom_internal_error_copy(&ppi->error, error); + } + error->setjmp = 1; + + AV1_COMMON *const cm = &cur_cpi->common; + // Assign start of level 2 worker pool + mt_info->workers = &p_mt_info->workers[i]; + mt_info->tile_thr_data = &p_mt_info->tile_thr_data[i]; + // Assign number of workers for each frame in the parallel encode set. 
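// Illustrative aside (editorial sketch, not from the libaom sources): how the
// remaining-workers / remaining-frames division below spreads the level 2
// workers over the parallel frames. With 10 workers and 3 frames, each frame
// takes floor(remaining_workers / remaining_frames), giving 3, 3 and 4.
#include <stdio.h>

int main(void) {
  const int num_workers = 10, frames = 3;
  int i = 0;  // index of the first worker assigned to the current frame
  for (int frame_idx = 0; frame_idx < frames; frame_idx++) {
    const int w = (num_workers - i) / (frames - frame_idx);
    printf("frame %d: %d workers (starting at worker %d)\n", frame_idx, w, i);
    i += w;
  }
  return 0;  // prints 3, 3, 4
}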
+ mt_info->num_workers = compute_num_workers_per_frame( + num_workers - i, parallel_frame_count - frame_idx); + for (int j = MOD_FP; j < NUM_MT_MODULES; j++) { + mt_info->num_mod_workers[j] = + AOMMIN(mt_info->num_workers, p_mt_info->num_mod_workers[j]); + } + if (p_mt_info->cdef_worker != NULL) { + mt_info->cdef_worker = &p_mt_info->cdef_worker[i]; + + // Back up the original cdef_worker pointers. + mt_info->restore_state_buf.cdef_srcbuf = mt_info->cdef_worker->srcbuf; + const int num_planes = av1_num_planes(cm); + for (int plane = 0; plane < num_planes; plane++) + mt_info->restore_state_buf.cdef_colbuf[plane] = + mt_info->cdef_worker->colbuf[plane]; + } +#if !CONFIG_REALTIME_ONLY + if (is_restoration_used(cm)) { + // Back up the original LR buffers before update. + int idx = i + mt_info->num_workers - 1; + assert(idx < mt_info->lr_row_sync.num_workers); + mt_info->restore_state_buf.rst_tmpbuf = + mt_info->lr_row_sync.lrworkerdata[idx].rst_tmpbuf; + mt_info->restore_state_buf.rlbs = + mt_info->lr_row_sync.lrworkerdata[idx].rlbs; + + // Update LR buffers. + mt_info->lr_row_sync.lrworkerdata[idx].rst_tmpbuf = cm->rst_tmpbuf; + mt_info->lr_row_sync.lrworkerdata[idx].rlbs = cm->rlbs; + } +#endif + + i += mt_info->num_workers; + + // At this stage, the thread specific CDEF buffers for the current frame's + // 'common' and 'cdef_sync' only need to be allocated. 'cdef_worker' has + // already been allocated across parallel frames. + av1_alloc_cdef_buffers(cm, &p_mt_info->cdef_worker, &mt_info->cdef_sync, + p_mt_info->num_workers, 0); + + frame_worker->hook = hook; + frame_worker->data1 = cur_cpi; + frame_worker->data2 = (frame_idx == 0) + ? first_cpi_data + : &ppi->parallel_frames_data[frame_idx - 1]; + frame_idx++; + error->setjmp = 0; + } + p_mt_info->p_num_workers = parallel_frame_count; +} + +// Launch level 1 workers to perform frame parallel encode. +static AOM_INLINE void launch_fpmt_workers(AV1_PRIMARY *ppi) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + int num_workers = ppi->p_mt_info.p_num_workers; + + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *const worker = ppi->p_mt_info.p_workers[i]; + if (i == 0) + winterface->execute(worker); + else + winterface->launch(worker); + } +} + +// Restore worker states after parallel encode. +static AOM_INLINE void restore_workers_after_fpmt( + AV1_PRIMARY *ppi, int parallel_frame_count, int num_fpmt_workers_prepared) { + assert(parallel_frame_count <= ppi->num_fp_contexts && + parallel_frame_count > 1); + (void)parallel_frame_count; + + PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info; + + int frame_idx = 0; + int i = 0; + while (i < num_fpmt_workers_prepared) { + AV1_COMP *cur_cpi = ppi->parallel_cpi[frame_idx]; + MultiThreadInfo *mt_info = &cur_cpi->mt_info; + const AV1_COMMON *const cm = &cur_cpi->common; + const int num_planes = av1_num_planes(cm); + + // Restore the original cdef_worker pointers. + if (p_mt_info->cdef_worker != NULL) { + mt_info->cdef_worker->srcbuf = mt_info->restore_state_buf.cdef_srcbuf; + for (int plane = 0; plane < num_planes; plane++) + mt_info->cdef_worker->colbuf[plane] = + mt_info->restore_state_buf.cdef_colbuf[plane]; + } +#if !CONFIG_REALTIME_ONLY + if (is_restoration_used(cm)) { + // Restore the original LR buffers. 
+ int idx = i + mt_info->num_workers - 1; + assert(idx < mt_info->lr_row_sync.num_workers); + mt_info->lr_row_sync.lrworkerdata[idx].rst_tmpbuf = + mt_info->restore_state_buf.rst_tmpbuf; + mt_info->lr_row_sync.lrworkerdata[idx].rlbs = + mt_info->restore_state_buf.rlbs; + } +#endif + + frame_idx++; + i += mt_info->num_workers; + } +} + +// Synchronize level 1 workers. +static AOM_INLINE void sync_fpmt_workers(AV1_PRIMARY *ppi, + int frames_in_parallel_set) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + int num_workers = ppi->p_mt_info.p_num_workers; + int had_error = 0; + // Points to error in the earliest display order frame in the parallel set. + const struct aom_internal_error_info *error; + + // Encoding ends. + for (int i = num_workers - 1; i >= 0; --i) { + AVxWorker *const worker = ppi->p_mt_info.p_workers[i]; + if (!winterface->sync(worker)) { + had_error = 1; + error = ppi->parallel_cpi[i]->common.error; + } + } + + restore_workers_after_fpmt(ppi, frames_in_parallel_set, + ppi->p_mt_info.num_workers); + + if (had_error) aom_internal_error_copy(&ppi->error, error); +} + +static int get_compressed_data_hook(void *arg1, void *arg2) { + AV1_COMP *cpi = (AV1_COMP *)arg1; + AV1_COMP_DATA *cpi_data = (AV1_COMP_DATA *)arg2; + int status = av1_get_compressed_data(cpi, cpi_data); + + // AOM_CODEC_OK(0) means no error. + return !status; +} + +// This function encodes the raw frame data for each frame in parallel encode +// set, and outputs the frame bit stream to the designated buffers. +void av1_compress_parallel_frames(AV1_PRIMARY *const ppi, + AV1_COMP_DATA *const first_cpi_data) { + // Bitmask for the frame buffers referenced by cpi->scaled_ref_buf + // corresponding to frames in the current parallel encode set. + int ref_buffers_used_map = 0; + int frames_in_parallel_set = av1_init_parallel_frame_context( + first_cpi_data, ppi, &ref_buffers_used_map); + prepare_fpmt_workers(ppi, first_cpi_data, get_compressed_data_hook, + frames_in_parallel_set); + launch_fpmt_workers(ppi); + sync_fpmt_workers(ppi, frames_in_parallel_set); + + // Release cpi->scaled_ref_buf corresponding to frames in the current parallel + // encode set. + for (int i = 0; i < frames_in_parallel_set; ++i) { + av1_release_scaled_references_fpmt(ppi->parallel_cpi[i]); + } + av1_decrement_ref_counts_fpmt(ppi->cpi->common.buffer_pool, + ref_buffers_used_map); +} + +static AOM_INLINE void launch_workers(MultiThreadInfo *const mt_info, + int num_workers) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *const worker = &mt_info->workers[i]; + worker->had_error = 0; + if (i == 0) + winterface->execute(worker); + else + winterface->launch(worker); + } +} + +static AOM_INLINE void sync_enc_workers(MultiThreadInfo *const mt_info, + AV1_COMMON *const cm, int num_workers) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + const AVxWorker *const worker_main = &mt_info->workers[0]; + int had_error = worker_main->had_error; + struct aom_internal_error_info error_info; + + // Read the error_info of main thread. + if (had_error) { + error_info = ((EncWorkerData *)worker_main->data1)->error_info; + } + + // Encoding ends. 
+ for (int i = num_workers - 1; i > 0; i--) { + AVxWorker *const worker = &mt_info->workers[i]; + if (!winterface->sync(worker)) { + had_error = 1; + error_info = ((EncWorkerData *)worker->data1)->error_info; + } + } + + if (had_error) aom_internal_error_copy(cm->error, &error_info); + + // Restore xd->error_info of the main thread back to cm->error so that the + // multithreaded code, when executed using a single thread, has a valid + // xd->error_info. + MACROBLOCKD *const xd = &((EncWorkerData *)worker_main->data1)->td->mb.e_mbd; + xd->error_info = cm->error; +} + +static AOM_INLINE void accumulate_counters_enc_workers(AV1_COMP *cpi, + int num_workers) { + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *const worker = &cpi->mt_info.workers[i]; + EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; + cpi->intrabc_used |= thread_data->td->intrabc_used; + cpi->deltaq_used |= thread_data->td->deltaq_used; + // Accumulate rtc counters. + if (!frame_is_intra_only(&cpi->common)) + av1_accumulate_rtc_counters(cpi, &thread_data->td->mb); + cpi->palette_pixel_num += thread_data->td->mb.palette_pixels; + if (thread_data->td != &cpi->td) { + // Keep these conditional expressions in sync with the corresponding ones + // in prepare_enc_workers(). + if (cpi->sf.inter_sf.mv_cost_upd_level != INTERNAL_COST_UPD_OFF) { + aom_free(thread_data->td->mv_costs_alloc); + thread_data->td->mv_costs_alloc = NULL; + } + if (cpi->sf.intra_sf.dv_cost_upd_level != INTERNAL_COST_UPD_OFF) { + aom_free(thread_data->td->dv_costs_alloc); + thread_data->td->dv_costs_alloc = NULL; + } + } + av1_dealloc_mb_data(&thread_data->td->mb, av1_num_planes(&cpi->common)); + + // Accumulate counters. + if (i > 0) { + av1_accumulate_frame_counts(&cpi->counts, thread_data->td->counts); + accumulate_rd_opt(&cpi->td, thread_data->td); + cpi->td.mb.txfm_search_info.txb_split_count += + thread_data->td->mb.txfm_search_info.txb_split_count; +#if CONFIG_SPEED_STATS + cpi->td.mb.txfm_search_info.tx_search_count += + thread_data->td->mb.txfm_search_info.tx_search_count; +#endif // CONFIG_SPEED_STATS + } + } +} + +static AOM_INLINE void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook, + int num_workers) { + MultiThreadInfo *const mt_info = &cpi->mt_info; + AV1_COMMON *const cm = &cpi->common; + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *const worker = &mt_info->workers[i]; + EncWorkerData *const thread_data = &mt_info->tile_thr_data[i]; + + worker->hook = hook; + worker->data1 = thread_data; + worker->data2 = NULL; + + thread_data->thread_id = i; + // Set the starting tile for each thread. + thread_data->start = i; + + thread_data->cpi = cpi; + if (i == 0) { + thread_data->td = &cpi->td; + } else { + thread_data->td = thread_data->original_td; + } + + thread_data->td->intrabc_used = 0; + thread_data->td->deltaq_used = 0; + thread_data->td->abs_sum_level = 0; + thread_data->td->rd_counts.seg_tmp_pred_cost[0] = 0; + thread_data->td->rd_counts.seg_tmp_pred_cost[1] = 0; + + // Before encoding a frame, copy the thread data from cpi. 
+ if (thread_data->td != &cpi->td) { + thread_data->td->mb = cpi->td.mb; + thread_data->td->rd_counts = cpi->td.rd_counts; + thread_data->td->mb.obmc_buffer = thread_data->td->obmc_buffer; + + for (int x = 0; x < 2; x++) { + for (int y = 0; y < 2; y++) { + memcpy(thread_data->td->hash_value_buffer[x][y], + cpi->td.mb.intrabc_hash_info.hash_value_buffer[x][y], + AOM_BUFFER_SIZE_FOR_BLOCK_HASH * + sizeof(*thread_data->td->hash_value_buffer[0][0])); + thread_data->td->mb.intrabc_hash_info.hash_value_buffer[x][y] = + thread_data->td->hash_value_buffer[x][y]; + } + } + // Keep these conditional expressions in sync with the corresponding ones + // in accumulate_counters_enc_workers(). + if (cpi->sf.inter_sf.mv_cost_upd_level != INTERNAL_COST_UPD_OFF) { + CHECK_MEM_ERROR( + cm, thread_data->td->mv_costs_alloc, + (MvCosts *)aom_malloc(sizeof(*thread_data->td->mv_costs_alloc))); + thread_data->td->mb.mv_costs = thread_data->td->mv_costs_alloc; + memcpy(thread_data->td->mb.mv_costs, cpi->td.mb.mv_costs, + sizeof(MvCosts)); + } + if (cpi->sf.intra_sf.dv_cost_upd_level != INTERNAL_COST_UPD_OFF) { + // Reset dv_costs to NULL for worker threads when dv cost update is + // enabled so that only dv_cost_upd_level needs to be checked before the + // aom_free() call for the same. + thread_data->td->mb.dv_costs = NULL; + if (av1_need_dv_costs(cpi)) { + CHECK_MEM_ERROR(cm, thread_data->td->dv_costs_alloc, + (IntraBCMVCosts *)aom_malloc( + sizeof(*thread_data->td->dv_costs_alloc))); + thread_data->td->mb.dv_costs = thread_data->td->dv_costs_alloc; + memcpy(thread_data->td->mb.dv_costs, cpi->td.mb.dv_costs, + sizeof(IntraBCMVCosts)); + } + } + } + av1_alloc_mb_data(cpi, &thread_data->td->mb); + + // Reset rtc counters. + av1_init_rtc_counters(&thread_data->td->mb); + + thread_data->td->mb.palette_pixels = 0; + + if (thread_data->td->counts != &cpi->counts) { + memcpy(thread_data->td->counts, &cpi->counts, sizeof(cpi->counts)); + } + + if (i > 0) { + thread_data->td->mb.palette_buffer = thread_data->td->palette_buffer; + thread_data->td->mb.comp_rd_buffer = thread_data->td->comp_rd_buffer; + thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst; + for (int j = 0; j < 2; ++j) { + thread_data->td->mb.tmp_pred_bufs[j] = + thread_data->td->tmp_pred_bufs[j]; + } + thread_data->td->mb.pixel_gradient_info = + thread_data->td->pixel_gradient_info; + + thread_data->td->mb.src_var_info_of_4x4_sub_blocks = + thread_data->td->src_var_info_of_4x4_sub_blocks; + + thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst; + for (int j = 0; j < 2; ++j) { + thread_data->td->mb.e_mbd.tmp_obmc_bufs[j] = + thread_data->td->mb.tmp_pred_bufs[j]; + } + } + } +} + +#if !CONFIG_REALTIME_ONLY +static AOM_INLINE void fp_prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook, + int num_workers) { + AV1_COMMON *const cm = &cpi->common; + MultiThreadInfo *const mt_info = &cpi->mt_info; + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *const worker = &mt_info->workers[i]; + EncWorkerData *const thread_data = &mt_info->tile_thr_data[i]; + + worker->hook = hook; + worker->data1 = thread_data; + worker->data2 = NULL; + + thread_data->thread_id = i; + // Set the starting tile for each thread. + thread_data->start = i; + + thread_data->cpi = cpi; + if (i == 0) { + thread_data->td = &cpi->td; + } else { + thread_data->td = thread_data->original_td; + // Before encoding a frame, copy the thread data from cpi. 
+ thread_data->td->mb = cpi->td.mb; + } + av1_alloc_src_diff_buf(cm, &thread_data->td->mb); + } +} +#endif + +// Computes the number of workers for row multi-threading of encoding stage +static AOM_INLINE int compute_num_enc_row_mt_workers(const AV1_COMMON *cm, + int max_threads) { + TileInfo tile_info; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + int total_num_threads_row_mt = 0; + for (int row = 0; row < tile_rows; row++) { + for (int col = 0; col < tile_cols; col++) { + av1_tile_init(&tile_info, cm, row, col); + const int num_sb_rows_in_tile = av1_get_sb_rows_in_tile(cm, &tile_info); + const int num_sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, &tile_info); + total_num_threads_row_mt += + AOMMIN((num_sb_cols_in_tile + 1) >> 1, num_sb_rows_in_tile); + } + } + return AOMMIN(max_threads, total_num_threads_row_mt); +} + +// Computes the number of workers for tile multi-threading of encoding stage +static AOM_INLINE int compute_num_enc_tile_mt_workers(const AV1_COMMON *cm, + int max_threads) { + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + return AOMMIN(max_threads, tile_cols * tile_rows); +} + +// Find max worker of all MT stages +int av1_get_max_num_workers(const AV1_COMP *cpi) { + int max_num_workers = 0; + for (int i = MOD_FP; i < NUM_MT_MODULES; i++) + max_num_workers = + AOMMAX(cpi->ppi->p_mt_info.num_mod_workers[i], max_num_workers); + assert(max_num_workers >= 1); + return AOMMIN(max_num_workers, cpi->oxcf.max_threads); +} + +// Computes the number of workers for encoding stage (row/tile multi-threading) +int av1_compute_num_enc_workers(const AV1_COMP *cpi, int max_workers) { + if (max_workers <= 1) return 1; + if (cpi->oxcf.row_mt) + return compute_num_enc_row_mt_workers(&cpi->common, max_workers); + else + return compute_num_enc_tile_mt_workers(&cpi->common, max_workers); +} + +void av1_encode_tiles_mt(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + MultiThreadInfo *const mt_info = &cpi->mt_info; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + int num_workers = mt_info->num_mod_workers[MOD_ENC]; + + assert(IMPLIES(cpi->tile_data == NULL, + cpi->allocated_tiles < tile_cols * tile_rows)); + if (cpi->allocated_tiles < tile_cols * tile_rows) av1_alloc_tile_data(cpi); + + av1_init_tile_data(cpi); + num_workers = AOMMIN(num_workers, mt_info->num_workers); + + prepare_enc_workers(cpi, enc_worker_hook, num_workers); + launch_workers(&cpi->mt_info, num_workers); + sync_enc_workers(&cpi->mt_info, cm, num_workers); + accumulate_counters_enc_workers(cpi, num_workers); +} + +// Accumulate frame counts. FRAME_COUNTS consist solely of 'unsigned int' +// members, so we treat it as an array, and sum over the whole length. +void av1_accumulate_frame_counts(FRAME_COUNTS *acc_counts, + const FRAME_COUNTS *counts) { + unsigned int *const acc = (unsigned int *)acc_counts; + const unsigned int *const cnt = (const unsigned int *)counts; + + const unsigned int n_counts = sizeof(FRAME_COUNTS) / sizeof(unsigned int); + + for (unsigned int i = 0; i < n_counts; i++) acc[i] += cnt[i]; +} + +// Computes the maximum number of sb rows and sb_cols across tiles which are +// used to allocate memory for multi-threaded encoding with row-mt=1. 
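// Illustrative aside (editorial sketch, not from the libaom sources): the
// CEIL_POWER_OF_TWO() arithmetic used below to turn mode-info rows into
// superblock rows. With 128x128 superblocks (mib_size_log2 == 5, i.e. 32
// 4x4 mode-info units per superblock side), a tile spanning 135 mi rows needs
// ceil(135 / 32) == 5 superblock rows. The macro name here is hypothetical
// but mirrors the rounding the real macro performs.
#include <stdio.h>

#define EXAMPLE_CEIL_POW2(value, n) (((value) + (1 << (n)) - 1) >> (n))

int main(void) {
  printf("%d\n", EXAMPLE_CEIL_POW2(135, 5));  // 5
  printf("%d\n", EXAMPLE_CEIL_POW2(128, 5));  // 4
  return 0;
}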
+static AOM_INLINE void compute_max_sb_rows_cols(const AV1_COMMON *cm,
+ int *max_sb_rows_in_tile,
+ int *max_sb_cols_in_tile) {
+ const int tile_rows = cm->tiles.rows;
+ const int mib_size_log2 = cm->seq_params->mib_size_log2;
+ const int num_mi_rows = cm->mi_params.mi_rows;
+ const int *const row_start_sb = cm->tiles.row_start_sb;
+ for (int row = 0; row < tile_rows; row++) {
+ const int mi_row_start = row_start_sb[row] << mib_size_log2;
+ const int mi_row_end =
+ AOMMIN(row_start_sb[row + 1] << mib_size_log2, num_mi_rows);
+ const int num_sb_rows_in_tile =
+ CEIL_POWER_OF_TWO(mi_row_end - mi_row_start, mib_size_log2);
+ *max_sb_rows_in_tile = AOMMAX(*max_sb_rows_in_tile, num_sb_rows_in_tile);
+ }
+
+ const int tile_cols = cm->tiles.cols;
+ const int num_mi_cols = cm->mi_params.mi_cols;
+ const int *const col_start_sb = cm->tiles.col_start_sb;
+ for (int col = 0; col < tile_cols; col++) {
+ const int mi_col_start = col_start_sb[col] << mib_size_log2;
+ const int mi_col_end =
+ AOMMIN(col_start_sb[col + 1] << mib_size_log2, num_mi_cols);
+ const int num_sb_cols_in_tile =
+ CEIL_POWER_OF_TWO(mi_col_end - mi_col_start, mib_size_log2);
+ *max_sb_cols_in_tile = AOMMAX(*max_sb_cols_in_tile, num_sb_cols_in_tile);
+ }
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Computes the number of workers for firstpass stage (row/tile multi-threading)
+int av1_fp_compute_num_enc_workers(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ int total_num_threads_row_mt = 0;
+ TileInfo tile_info;
+
+ if (cpi->oxcf.max_threads <= 1) return 1;
+
+ for (int row = 0; row < tile_rows; row++) {
+ for (int col = 0; col < tile_cols; col++) {
+ av1_tile_init(&tile_info, cm, row, col);
+ const int num_mb_rows_in_tile =
+ av1_get_unit_rows_in_tile(&tile_info, cpi->fp_block_size);
+ const int num_mb_cols_in_tile =
+ av1_get_unit_cols_in_tile(&tile_info, cpi->fp_block_size);
+ total_num_threads_row_mt +=
+ AOMMIN((num_mb_cols_in_tile + 1) >> 1, num_mb_rows_in_tile);
+ }
+ }
+ return AOMMIN(cpi->oxcf.max_threads, total_num_threads_row_mt);
+}
+
+// Computes the maximum number of mb_rows for row multi-threading of firstpass
+// stage
+static AOM_INLINE int fp_compute_max_mb_rows(const AV1_COMMON *cm,
+ BLOCK_SIZE fp_block_size) {
+ const int tile_rows = cm->tiles.rows;
+ const int unit_height_log2 = mi_size_high_log2[fp_block_size];
+ const int mib_size_log2 = cm->seq_params->mib_size_log2;
+ const int num_mi_rows = cm->mi_params.mi_rows;
+ const int *const row_start_sb = cm->tiles.row_start_sb;
+ int max_mb_rows = 0;
+
+ for (int row = 0; row < tile_rows; row++) {
+ const int mi_row_start = row_start_sb[row] << mib_size_log2;
+ const int mi_row_end =
+ AOMMIN(row_start_sb[row + 1] << mib_size_log2, num_mi_rows);
+ const int num_mb_rows_in_tile =
+ CEIL_POWER_OF_TWO(mi_row_end - mi_row_start, unit_height_log2);
+ max_mb_rows = AOMMAX(max_mb_rows, num_mb_rows_in_tile);
+ }
+ return max_mb_rows;
+}
+#endif
+
+static void lpf_pipeline_mt_init(AV1_COMP *cpi, int num_workers) {
+ // Pipelining of loop-filtering after encoding is enabled when loop-filter
+ // level is chosen based on quantizer and frame type. It is disabled in case
+ // of 'LOOPFILTER_SELECTIVELY' as the stats collected during the encoding
+ // stage decide the filter level. Loop-filtering is disabled for
+ // non-reference frames and for frames with the intra block copy tool
+ // enabled.
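+ // When the pipeline is enabled, the encode workers also pick up
+ // loop-filter row jobs as superblock rows complete (see the per-worker
+ // lf_data setup below), so filtering overlaps with encoding instead of
+ // running as a separate pass over the frame.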
+ AV1_COMMON *cm = &cpi->common; + const int use_loopfilter = is_loopfilter_used(cm); + const int use_superres = av1_superres_scaled(cm); + const int use_cdef = is_cdef_used(cm); + const int use_restoration = is_restoration_used(cm); + MultiThreadInfo *const mt_info = &cpi->mt_info; + MACROBLOCKD *xd = &cpi->td.mb.e_mbd; + + const unsigned int skip_apply_postproc_filters = + derive_skip_apply_postproc_filters(cpi, use_loopfilter, use_cdef, + use_superres, use_restoration); + mt_info->pipeline_lpf_mt_with_enc = + (cpi->oxcf.mode == REALTIME) && (cpi->oxcf.speed >= 5) && + (cpi->sf.lpf_sf.lpf_pick == LPF_PICK_FROM_Q) && + (cpi->oxcf.algo_cfg.loopfilter_control != LOOPFILTER_SELECTIVELY) && + !cpi->ppi->rtc_ref.non_reference_frame && !cm->features.allow_intrabc && + ((skip_apply_postproc_filters & SKIP_APPLY_LOOPFILTER) == 0); + + if (!mt_info->pipeline_lpf_mt_with_enc) return; + + set_postproc_filter_default_params(cm); + + if (!use_loopfilter) return; + + const LPF_PICK_METHOD method = cpi->sf.lpf_sf.lpf_pick; + assert(method == LPF_PICK_FROM_Q); + assert(cpi->oxcf.algo_cfg.loopfilter_control != LOOPFILTER_SELECTIVELY); + + av1_pick_filter_level(cpi->source, cpi, method); + + struct loopfilter *lf = &cm->lf; + const int plane_start = 0; + const int plane_end = av1_num_planes(cm); + int planes_to_lf[MAX_MB_PLANE]; + if (lpf_mt_with_enc_enabled(cpi->mt_info.pipeline_lpf_mt_with_enc, + lf->filter_level)) { + set_planes_to_loop_filter(lf, planes_to_lf, plane_start, plane_end); + int lpf_opt_level = get_lpf_opt_level(&cpi->sf); + assert(lpf_opt_level == 2); + + const int start_mi_row = 0; + const int end_mi_row = start_mi_row + cm->mi_params.mi_rows; + + av1_loop_filter_frame_init(cm, plane_start, plane_end); + + assert(mt_info->num_mod_workers[MOD_ENC] == + mt_info->num_mod_workers[MOD_LPF]); + loop_filter_frame_mt_init(cm, start_mi_row, end_mi_row, planes_to_lf, + mt_info->num_mod_workers[MOD_LPF], + &mt_info->lf_row_sync, lpf_opt_level, + cm->seq_params->mib_size_log2); + + for (int i = num_workers - 1; i >= 0; i--) { + EncWorkerData *const thread_data = &mt_info->tile_thr_data[i]; + // Initialize loopfilter data + thread_data->lf_sync = &mt_info->lf_row_sync; + thread_data->lf_data = &thread_data->lf_sync->lfdata[i]; + loop_filter_data_reset(thread_data->lf_data, &cm->cur_frame->buf, cm, xd); + } + } +} + +void av1_encode_tiles_row_mt(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + MultiThreadInfo *const mt_info = &cpi->mt_info; + AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + const int sb_rows_in_frame = get_sb_rows_in_frame(cm); + int *thread_id_to_tile_id = enc_row_mt->thread_id_to_tile_id; + int max_sb_rows_in_tile = 0, max_sb_cols_in_tile = 0; + int num_workers = mt_info->num_mod_workers[MOD_ENC]; + + compute_max_sb_rows_cols(cm, &max_sb_rows_in_tile, &max_sb_cols_in_tile); + const bool alloc_row_mt_mem = + (enc_row_mt->allocated_tile_cols != tile_cols || + enc_row_mt->allocated_tile_rows != tile_rows || + enc_row_mt->allocated_rows != max_sb_rows_in_tile || + enc_row_mt->allocated_cols != (max_sb_cols_in_tile - 1) || + enc_row_mt->allocated_sb_rows != sb_rows_in_frame); + const bool alloc_tile_data = cpi->allocated_tiles < tile_cols * tile_rows; + + assert(IMPLIES(cpi->tile_data == NULL, alloc_tile_data)); + if (alloc_tile_data) { + av1_alloc_tile_data(cpi); + } + + assert(IMPLIES(alloc_tile_data, alloc_row_mt_mem)); + if (alloc_row_mt_mem) { + row_mt_mem_alloc(cpi, 
max_sb_rows_in_tile, max_sb_cols_in_tile,
+ cpi->oxcf.algo_cfg.cdf_update_mode);
+ }
+
+ num_workers = AOMMIN(num_workers, mt_info->num_workers);
+ lpf_pipeline_mt_init(cpi, num_workers);
+
+ av1_init_tile_data(cpi);
+
+ memset(thread_id_to_tile_id, -1,
+ sizeof(*thread_id_to_tile_id) * MAX_NUM_THREADS);
+ memset(enc_row_mt->num_tile_cols_done, 0,
+ sizeof(*enc_row_mt->num_tile_cols_done) * sb_rows_in_frame);
+ enc_row_mt->row_mt_exit = false;
+
+ for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
+ for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
+ int tile_index = tile_row * tile_cols + tile_col;
+ TileDataEnc *const this_tile = &cpi->tile_data[tile_index];
+ AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
+
+ // Initialize num_finished_cols to -1 for all rows.
+ memset(row_mt_sync->num_finished_cols, -1,
+ sizeof(*row_mt_sync->num_finished_cols) * max_sb_rows_in_tile);
+ row_mt_sync->next_mi_row = this_tile->tile_info.mi_row_start;
+ row_mt_sync->num_threads_working = 0;
+ row_mt_sync->intrabc_extra_top_right_sb_delay =
+ av1_get_intrabc_extra_top_right_sb_delay(cm);
+
+ av1_inter_mode_data_init(this_tile);
+ av1_zero_above_context(cm, &cpi->td.mb.e_mbd,
+ this_tile->tile_info.mi_col_start,
+ this_tile->tile_info.mi_col_end, tile_row);
+ }
+ }
+
+ assign_tile_to_thread(thread_id_to_tile_id, tile_cols * tile_rows,
+ num_workers);
+ prepare_enc_workers(cpi, enc_row_mt_worker_hook, num_workers);
+ launch_workers(&cpi->mt_info, num_workers);
+ sync_enc_workers(&cpi->mt_info, cm, num_workers);
+ if (cm->delta_q_info.delta_lf_present_flag) update_delta_lf_for_row_mt(cpi);
+ accumulate_counters_enc_workers(cpi, num_workers);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static void dealloc_thread_data_src_diff_buf(AV1_COMP *cpi, int num_workers) {
+ for (int i = num_workers - 1; i >= 0; --i) {
+ EncWorkerData *const thread_data = &cpi->mt_info.tile_thr_data[i];
+ if (thread_data->td != &cpi->td)
+ av1_dealloc_src_diff_buf(&thread_data->td->mb,
+ av1_num_planes(&cpi->common));
+ }
+}
+
+void av1_fp_encode_tiles_row_mt(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ int *thread_id_to_tile_id = enc_row_mt->thread_id_to_tile_id;
+ int num_workers = 0;
+ int max_mb_rows = 0;
+
+ max_mb_rows = fp_compute_max_mb_rows(cm, cpi->fp_block_size);
+ const bool alloc_row_mt_mem = enc_row_mt->allocated_tile_cols != tile_cols ||
+ enc_row_mt->allocated_tile_rows != tile_rows ||
+ enc_row_mt->allocated_rows != max_mb_rows;
+ const bool alloc_tile_data = cpi->allocated_tiles < tile_cols * tile_rows;
+
+ assert(IMPLIES(cpi->tile_data == NULL, alloc_tile_data));
+ if (alloc_tile_data) {
+ av1_alloc_tile_data(cpi);
+ }
+
+ assert(IMPLIES(alloc_tile_data, alloc_row_mt_mem));
+ if (alloc_row_mt_mem) {
+ row_mt_mem_alloc(cpi, max_mb_rows, -1, 0);
+ }
+
+ av1_init_tile_data(cpi);
+
+ // For pass = 1, compute the number of workers needed. For single-pass
+ // encode (pass = 0), the number of workers is already computed.
+ if (mt_info->num_mod_workers[MOD_FP] == 0) + num_workers = av1_fp_compute_num_enc_workers(cpi); + else + num_workers = mt_info->num_mod_workers[MOD_FP]; + + memset(thread_id_to_tile_id, -1, + sizeof(*thread_id_to_tile_id) * MAX_NUM_THREADS); + enc_row_mt->firstpass_mt_exit = false; + + for (int tile_row = 0; tile_row < tile_rows; tile_row++) { + for (int tile_col = 0; tile_col < tile_cols; tile_col++) { + int tile_index = tile_row * tile_cols + tile_col; + TileDataEnc *const this_tile = &cpi->tile_data[tile_index]; + AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync; + + // Initialize num_finished_cols to -1 for all rows. + memset(row_mt_sync->num_finished_cols, -1, + sizeof(*row_mt_sync->num_finished_cols) * max_mb_rows); + row_mt_sync->next_mi_row = this_tile->tile_info.mi_row_start; + row_mt_sync->num_threads_working = 0; + + // intraBC mode is not evaluated during first-pass encoding. Hence, no + // additional top-right delay is required. + row_mt_sync->intrabc_extra_top_right_sb_delay = 0; + } + } + + num_workers = AOMMIN(num_workers, mt_info->num_workers); + assign_tile_to_thread(thread_id_to_tile_id, tile_cols * tile_rows, + num_workers); + fp_prepare_enc_workers(cpi, fp_enc_row_mt_worker_hook, num_workers); + launch_workers(&cpi->mt_info, num_workers); + sync_enc_workers(&cpi->mt_info, cm, num_workers); + dealloc_thread_data_src_diff_buf(cpi, num_workers); +} + +void av1_tpl_row_mt_sync_read_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync, + int r, int c) { + (void)tpl_mt_sync; + (void)r; + (void)c; +} + +void av1_tpl_row_mt_sync_write_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync, + int r, int c, int cols) { + (void)tpl_mt_sync; + (void)r; + (void)c; + (void)cols; +} + +void av1_tpl_row_mt_sync_read(AV1TplRowMultiThreadSync *tpl_row_mt_sync, int r, + int c) { +#if CONFIG_MULTITHREAD + int nsync = tpl_row_mt_sync->sync_range; + + if (r) { + pthread_mutex_t *const mutex = &tpl_row_mt_sync->mutex_[r - 1]; + pthread_mutex_lock(mutex); + + while (c > tpl_row_mt_sync->num_finished_cols[r - 1] - nsync) + pthread_cond_wait(&tpl_row_mt_sync->cond_[r - 1], mutex); + pthread_mutex_unlock(mutex); + } +#else + (void)tpl_row_mt_sync; + (void)r; + (void)c; +#endif // CONFIG_MULTITHREAD +} + +void av1_tpl_row_mt_sync_write(AV1TplRowMultiThreadSync *tpl_row_mt_sync, int r, + int c, int cols) { +#if CONFIG_MULTITHREAD + int nsync = tpl_row_mt_sync->sync_range; + int cur; + // Only signal when there are enough encoded blocks for next row to run. + int sig = 1; + + if (c < cols - 1) { + cur = c; + if (c % nsync) sig = 0; + } else { + cur = cols + nsync; + } + + if (sig) { + pthread_mutex_lock(&tpl_row_mt_sync->mutex_[r]); + + // When a thread encounters an error, num_finished_cols[r] is set to maximum + // column number. In this case, the AOMMAX operation here ensures that + // num_finished_cols[r] is not overwritten with a smaller value thus + // preventing the infinite waiting of threads in the relevant sync_read() + // function. 
+ tpl_row_mt_sync->num_finished_cols[r] =
+ AOMMAX(tpl_row_mt_sync->num_finished_cols[r], cur);
+
+ pthread_cond_signal(&tpl_row_mt_sync->cond_[r]);
+ pthread_mutex_unlock(&tpl_row_mt_sync->mutex_[r]);
+ }
+#else
+ (void)tpl_row_mt_sync;
+ (void)r;
+ (void)c;
+ (void)cols;
+#endif // CONFIG_MULTITHREAD
+}
+
+static AOM_INLINE void set_mode_estimation_done(AV1_COMP *cpi) {
+ const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ const BLOCK_SIZE bsize =
+ convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d);
+ const int mi_height = mi_size_high[bsize];
+ AV1TplRowMultiThreadInfo *const tpl_row_mt = &cpi->mt_info.tpl_row_mt;
+ const int tplb_cols_in_tile =
+ ROUND_POWER_OF_TWO(mi_params->mi_cols, mi_size_wide_log2[bsize]);
+ // In case of tpl row-multithreading, due to top-right dependency, the worker
+ // on an mb_row waits for the completion of the tpl processing of the top and
+ // top-right blocks. Hence, in case a thread (main/worker) encounters an
+ // error, mark the tpl processing of every mb_row in the frame as complete
+ // in order to avoid dependent workers waiting indefinitely.
+ for (int mi_row = 0, tplb_row = 0; mi_row < mi_params->mi_rows;
+ mi_row += mi_height, tplb_row++) {
+ (*tpl_row_mt->sync_write_ptr)(&tpl_data->tpl_mt_sync, tplb_row,
+ tplb_cols_in_tile - 1, tplb_cols_in_tile);
+ }
+}
+
+// Each worker calls tpl_worker_hook() and computes the tpl data.
+static int tpl_worker_hook(void *arg1, void *unused) {
+ (void)unused;
+ EncWorkerData *thread_data = (EncWorkerData *)arg1;
+ AV1_COMP *cpi = thread_data->cpi;
+ AV1_COMMON *cm = &cpi->common;
+ MACROBLOCK *x = &thread_data->td->mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+ TplTxfmStats *tpl_txfm_stats = &thread_data->td->tpl_txfm_stats;
+ TplBuffers *tpl_tmp_buffers = &thread_data->td->tpl_tmp_buffers;
+ CommonModeInfoParams *mi_params = &cm->mi_params;
+ int num_active_workers = cpi->ppi->tpl_data.tpl_mt_sync.num_threads_working;
+
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ xd->error_info = error_info;
+ AV1TplRowMultiThreadInfo *const tpl_row_mt = &cpi->mt_info.tpl_row_mt;
+ (void)tpl_row_mt;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *tpl_error_mutex_ = tpl_row_mt->mutex_;
+#endif
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) { + error_info->setjmp = 0; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(tpl_error_mutex_); + tpl_row_mt->tpl_mt_exit = true; + pthread_mutex_unlock(tpl_error_mutex_); +#endif + set_mode_estimation_done(cpi); + return 0; + } + error_info->setjmp = 1; + + BLOCK_SIZE bsize = convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d); + TX_SIZE tx_size = max_txsize_lookup[bsize]; + int mi_height = mi_size_high[bsize]; + + av1_init_tpl_txfm_stats(tpl_txfm_stats); + + for (int mi_row = thread_data->start * mi_height; mi_row < mi_params->mi_rows; + mi_row += num_active_workers * mi_height) { + // Motion estimation row boundary + av1_set_mv_row_limits(mi_params, &x->mv_limits, mi_row, mi_height, + cpi->oxcf.border_in_pixels); + xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE); + xd->mb_to_bottom_edge = + GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE); + av1_mc_flow_dispenser_row(cpi, tpl_txfm_stats, tpl_tmp_buffers, x, mi_row, + bsize, tx_size); + } + error_info->setjmp = 0; + return 1; +} + +// Deallocate tpl synchronization related mutex and data. +void av1_tpl_dealloc(AV1TplRowMultiThreadSync *tpl_sync) { + assert(tpl_sync != NULL); + +#if CONFIG_MULTITHREAD + if (tpl_sync->mutex_ != NULL) { + for (int i = 0; i < tpl_sync->rows; ++i) + pthread_mutex_destroy(&tpl_sync->mutex_[i]); + aom_free(tpl_sync->mutex_); + } + if (tpl_sync->cond_ != NULL) { + for (int i = 0; i < tpl_sync->rows; ++i) + pthread_cond_destroy(&tpl_sync->cond_[i]); + aom_free(tpl_sync->cond_); + } +#endif // CONFIG_MULTITHREAD + + aom_free(tpl_sync->num_finished_cols); + // clear the structure as the source of this call may be a resize in which + // case this call will be followed by an _alloc() which may fail. + av1_zero(*tpl_sync); +} + +// Allocate memory for tpl row synchronization. +void av1_tpl_alloc(AV1TplRowMultiThreadSync *tpl_sync, AV1_COMMON *cm, + int mb_rows) { + tpl_sync->rows = mb_rows; +#if CONFIG_MULTITHREAD + { + CHECK_MEM_ERROR(cm, tpl_sync->mutex_, + aom_malloc(sizeof(*tpl_sync->mutex_) * mb_rows)); + if (tpl_sync->mutex_) { + for (int i = 0; i < mb_rows; ++i) + pthread_mutex_init(&tpl_sync->mutex_[i], NULL); + } + + CHECK_MEM_ERROR(cm, tpl_sync->cond_, + aom_malloc(sizeof(*tpl_sync->cond_) * mb_rows)); + if (tpl_sync->cond_) { + for (int i = 0; i < mb_rows; ++i) + pthread_cond_init(&tpl_sync->cond_[i], NULL); + } + } +#endif // CONFIG_MULTITHREAD + CHECK_MEM_ERROR(cm, tpl_sync->num_finished_cols, + aom_malloc(sizeof(*tpl_sync->num_finished_cols) * mb_rows)); + + // Set up nsync. + tpl_sync->sync_range = 1; +} + +// Each worker is prepared by assigning the hook function and individual thread +// data. +static AOM_INLINE void prepare_tpl_workers(AV1_COMP *cpi, AVxWorkerHook hook, + int num_workers) { + MultiThreadInfo *mt_info = &cpi->mt_info; + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *worker = &mt_info->workers[i]; + EncWorkerData *thread_data = &mt_info->tile_thr_data[i]; + + worker->hook = hook; + worker->data1 = thread_data; + worker->data2 = NULL; + + thread_data->thread_id = i; + // Set the starting tile for each thread. + thread_data->start = i; + + thread_data->cpi = cpi; + if (i == 0) { + thread_data->td = &cpi->td; + } else { + thread_data->td = thread_data->original_td; + } + + // Before encoding a frame, copy the thread data from cpi. 
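+ // Note: worker 0 reuses the main thread's context (cpi->td), so only the
+ // auxiliary workers below need a private copy of the thread data.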
+ if (thread_data->td != &cpi->td) { + thread_data->td->mb = cpi->td.mb; + // OBMC buffers are used only to init MS params and remain unused when + // called from tpl, hence set the buffers to defaults. + av1_init_obmc_buffer(&thread_data->td->mb.obmc_buffer); + if (!tpl_alloc_temp_buffers(&thread_data->td->tpl_tmp_buffers, + cpi->ppi->tpl_data.tpl_bsize_1d)) { + aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR, + "Error allocating tpl data"); + } + thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst; + thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst; + } + } +} + +#if CONFIG_BITRATE_ACCURACY +// Accumulate transform stats after tpl. +static void tpl_accumulate_txfm_stats(ThreadData *main_td, + const MultiThreadInfo *mt_info, + int num_workers) { + TplTxfmStats *accumulated_stats = &main_td->tpl_txfm_stats; + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *const worker = &mt_info->workers[i]; + EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; + ThreadData *td = thread_data->td; + if (td != main_td) { + const TplTxfmStats *tpl_txfm_stats = &td->tpl_txfm_stats; + av1_accumulate_tpl_txfm_stats(tpl_txfm_stats, accumulated_stats); + } + } +} +#endif // CONFIG_BITRATE_ACCURACY + +// Implements multi-threading for tpl. +void av1_mc_flow_dispenser_mt(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + CommonModeInfoParams *mi_params = &cm->mi_params; + MultiThreadInfo *mt_info = &cpi->mt_info; + TplParams *tpl_data = &cpi->ppi->tpl_data; + AV1TplRowMultiThreadSync *tpl_sync = &tpl_data->tpl_mt_sync; + int mb_rows = mi_params->mb_rows; + int num_workers = + AOMMIN(mt_info->num_mod_workers[MOD_TPL], mt_info->num_workers); + + if (mb_rows != tpl_sync->rows) { + av1_tpl_dealloc(tpl_sync); + av1_tpl_alloc(tpl_sync, cm, mb_rows); + } + tpl_sync->num_threads_working = num_workers; + mt_info->tpl_row_mt.tpl_mt_exit = false; + + // Initialize cur_mb_col to -1 for all MB rows. + memset(tpl_sync->num_finished_cols, -1, + sizeof(*tpl_sync->num_finished_cols) * mb_rows); + + prepare_tpl_workers(cpi, tpl_worker_hook, num_workers); + launch_workers(&cpi->mt_info, num_workers); + sync_enc_workers(&cpi->mt_info, cm, num_workers); +#if CONFIG_BITRATE_ACCURACY + tpl_accumulate_txfm_stats(&cpi->td, &cpi->mt_info, num_workers); +#endif // CONFIG_BITRATE_ACCURACY + for (int i = num_workers - 1; i >= 0; i--) { + EncWorkerData *thread_data = &mt_info->tile_thr_data[i]; + ThreadData *td = thread_data->td; + if (td != &cpi->td) tpl_dealloc_temp_buffers(&td->tpl_tmp_buffers); + } +} + +// Deallocate memory for temporal filter multi-thread synchronization. +void av1_tf_mt_dealloc(AV1TemporalFilterSync *tf_sync) { + assert(tf_sync != NULL); +#if CONFIG_MULTITHREAD + if (tf_sync->mutex_ != NULL) { + pthread_mutex_destroy(tf_sync->mutex_); + aom_free(tf_sync->mutex_); + } +#endif // CONFIG_MULTITHREAD + tf_sync->next_tf_row = 0; +} + +// Checks if a job is available. If job is available, +// populates next_tf_row and returns 1, else returns 0. 
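+// Rows are handed out dynamically: each worker claims the next unfiltered
+// mb_row under the mutex, which can balance load better than a static
+// stride when row costs vary. Illustrative claim pattern (sketch only):
+//   lock(m); if (next < rows) row = next++; unlock(m);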
+static AOM_INLINE int tf_get_next_job(AV1TemporalFilterSync *tf_mt_sync,
+ int *current_mb_row, int mb_rows) {
+ int do_next_row = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *tf_mutex_ = tf_mt_sync->mutex_;
+ pthread_mutex_lock(tf_mutex_);
+#endif
+ if (!tf_mt_sync->tf_mt_exit && tf_mt_sync->next_tf_row < mb_rows) {
+ *current_mb_row = tf_mt_sync->next_tf_row;
+ tf_mt_sync->next_tf_row++;
+ do_next_row = 1;
+ }
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(tf_mutex_);
+#endif
+ return do_next_row;
+}
+
+// Hook function for each thread in temporal filter multi-threading.
+static int tf_worker_hook(void *arg1, void *unused) {
+ (void)unused;
+ EncWorkerData *thread_data = (EncWorkerData *)arg1;
+ AV1_COMP *cpi = thread_data->cpi;
+ ThreadData *td = thread_data->td;
+ TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
+ AV1TemporalFilterSync *tf_sync = &cpi->mt_info.tf_sync;
+ const struct scale_factors *scale = &cpi->tf_ctx.sf;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *tf_mutex_ = tf_sync->mutex_;
+#endif
+ MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ xd->error_info = error_info;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(tf_mutex_);
+ tf_sync->tf_mt_exit = true;
+ pthread_mutex_unlock(tf_mutex_);
+#endif
+ return 0;
+ }
+ error_info->setjmp = 1;
+
+ const int num_planes = av1_num_planes(&cpi->common);
+ assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+
+ MACROBLOCKD *mbd = &td->mb.e_mbd;
+ uint8_t *input_buffer[MAX_MB_PLANE];
+ MB_MODE_INFO **input_mb_mode_info;
+ tf_save_state(mbd, &input_mb_mode_info, input_buffer, num_planes);
+ tf_setup_macroblockd(mbd, &td->tf_data, scale);
+
+ int current_mb_row = -1;
+
+ while (tf_get_next_job(tf_sync, &current_mb_row, tf_ctx->mb_rows))
+ av1_tf_do_filtering_row(cpi, td, current_mb_row);
+
+ tf_restore_state(mbd, input_mb_mode_info, input_buffer, num_planes);
+
+ error_info->setjmp = 0;
+ return 1;
+}
+
+// Assigns temporal filter hook function and thread data to each worker.
+static void prepare_tf_workers(AV1_COMP *cpi, AVxWorkerHook hook,
+ int num_workers, int is_highbitdepth) {
+ MultiThreadInfo *mt_info = &cpi->mt_info;
+ mt_info->tf_sync.next_tf_row = 0;
+ mt_info->tf_sync.tf_mt_exit = false;
+ for (int i = num_workers - 1; i >= 0; i--) {
+ AVxWorker *worker = &mt_info->workers[i];
+ EncWorkerData *thread_data = &mt_info->tile_thr_data[i];
+
+ worker->hook = hook;
+ worker->data1 = thread_data;
+ worker->data2 = NULL;
+
+ thread_data->thread_id = i;
+ // Set the starting tile for each thread.
+ thread_data->start = i;
+
+ thread_data->cpi = cpi;
+ if (i == 0) {
+ thread_data->td = &cpi->td;
+ } else {
+ thread_data->td = thread_data->original_td;
+ }
+
+ // Before encoding a frame, copy the thread data from cpi.
+ if (thread_data->td != &cpi->td) {
+ thread_data->td->mb = cpi->td.mb;
+ // OBMC buffers are used only to init MS params and remain unused when
+ // called from tf, hence set the buffers to defaults.
+ av1_init_obmc_buffer(&thread_data->td->mb.obmc_buffer);
+ if (!tf_alloc_and_reset_data(&thread_data->td->tf_data,
+ cpi->tf_ctx.num_pels, is_highbitdepth)) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR,
+ "Error allocating temporal filter data");
+ }
+ }
+ }
+}
+
+// Deallocate thread specific data for temporal filter.
+static void tf_dealloc_thread_data(AV1_COMP *cpi, int num_workers,
+ int is_highbitdepth) {
+ MultiThreadInfo *mt_info = &cpi->mt_info;
+ for (int i = num_workers - 1; i >= 0; i--) {
+ EncWorkerData *thread_data = &mt_info->tile_thr_data[i];
+ ThreadData *td = thread_data->td;
+ if (td != &cpi->td) tf_dealloc_data(&td->tf_data, is_highbitdepth);
+ }
+}
+
+// Accumulate sse and sum after temporal filtering.
+static void tf_accumulate_frame_diff(AV1_COMP *cpi, int num_workers) {
+ FRAME_DIFF *total_diff = &cpi->td.tf_data.diff;
+ for (int i = num_workers - 1; i >= 0; i--) {
+ AVxWorker *const worker = &cpi->mt_info.workers[i];
+ EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
+ ThreadData *td = thread_data->td;
+ FRAME_DIFF *diff = &td->tf_data.diff;
+ if (td != &cpi->td) {
+ total_diff->sse += diff->sse;
+ total_diff->sum += diff->sum;
+ }
+ }
+}
+
+// Implements multi-threading for temporal filter.
+void av1_tf_do_filtering_mt(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ MultiThreadInfo *mt_info = &cpi->mt_info;
+ const int is_highbitdepth = cpi->tf_ctx.is_highbitdepth;
+
+ int num_workers =
+ AOMMIN(mt_info->num_mod_workers[MOD_TF], mt_info->num_workers);
+
+ prepare_tf_workers(cpi, tf_worker_hook, num_workers, is_highbitdepth);
+ launch_workers(mt_info, num_workers);
+ sync_enc_workers(mt_info, cm, num_workers);
+ tf_accumulate_frame_diff(cpi, num_workers);
+ tf_dealloc_thread_data(cpi, num_workers, is_highbitdepth);
+}
+
+// Checks if a job is available in the current direction. If one is
+// available, populates frame_idx and returns 1; otherwise returns 0.
+static AOM_INLINE int get_next_gm_job(AV1_COMP *cpi, int *frame_idx,
+ int cur_dir) {
+ GlobalMotionInfo *gm_info = &cpi->gm_info;
+ JobInfo *job_info = &cpi->mt_info.gm_sync.job_info;
+
+ int total_refs = gm_info->num_ref_frames[cur_dir];
+ int8_t cur_frame_to_process = job_info->next_frame_to_process[cur_dir];
+
+ if (cur_frame_to_process < total_refs && !job_info->early_exit[cur_dir]) {
+ *frame_idx = gm_info->reference_frames[cur_dir][cur_frame_to_process].frame;
+ job_info->next_frame_to_process[cur_dir] += 1;
+ return 1;
+ }
+ return 0;
+}
+
+// Switches the current direction and calls the function get_next_gm_job() if
+// the speed feature 'prune_ref_frame_for_gm_search' is not set.
+static AOM_INLINE void switch_direction(AV1_COMP *cpi, int *frame_idx,
+ int *cur_dir) {
+ if (cpi->sf.gm_sf.prune_ref_frame_for_gm_search) return;
+ // Switch the direction and get next job
+ *cur_dir = !(*cur_dir);
+ get_next_gm_job(cpi, frame_idx, *(cur_dir));
+}
+
+// Hook function for each thread in global motion multi-threading.
+static int gm_mt_worker_hook(void *arg1, void *unused) { + (void)unused; + + EncWorkerData *thread_data = (EncWorkerData *)arg1; + AV1_COMP *cpi = thread_data->cpi; + GlobalMotionInfo *gm_info = &cpi->gm_info; + AV1GlobalMotionSync *gm_sync = &cpi->mt_info.gm_sync; + JobInfo *job_info = &gm_sync->job_info; + int thread_id = thread_data->thread_id; + GlobalMotionData *gm_thread_data = &thread_data->td->gm_data; +#if CONFIG_MULTITHREAD + pthread_mutex_t *gm_mt_mutex_ = gm_sync->mutex_; +#endif + + MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd; + struct aom_internal_error_info *const error_info = &thread_data->error_info; + xd->error_info = error_info; + + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. + if (setjmp(error_info->jmp)) { + error_info->setjmp = 0; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(gm_mt_mutex_); + gm_sync->gm_mt_exit = true; + pthread_mutex_unlock(gm_mt_mutex_); +#endif + return 0; + } + error_info->setjmp = 1; + + int cur_dir = job_info->thread_id_to_dir[thread_id]; + bool gm_mt_exit = false; + while (1) { + int ref_buf_idx = -1; + +#if CONFIG_MULTITHREAD + pthread_mutex_lock(gm_mt_mutex_); +#endif + + gm_mt_exit = gm_sync->gm_mt_exit; + // Populates ref_buf_idx(the reference frame type) for which global motion + // estimation will be done. + if (!gm_mt_exit && !get_next_gm_job(cpi, &ref_buf_idx, cur_dir)) { + // No jobs are available for the current direction. Switch + // to other direction and get the next job, if available. + switch_direction(cpi, &ref_buf_idx, &cur_dir); + } + +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(gm_mt_mutex_); +#endif + + // When gm_mt_exit is set to true, other workers need not pursue any + // further jobs. + if (gm_mt_exit || ref_buf_idx == -1) break; + + // Compute global motion for the given ref_buf_idx. + av1_compute_gm_for_valid_ref_frames( + cpi, error_info, gm_info->ref_buf, ref_buf_idx, + gm_thread_data->motion_models, gm_thread_data->segment_map, + gm_info->segment_map_w, gm_info->segment_map_h); + +#if CONFIG_MULTITHREAD + pthread_mutex_lock(gm_mt_mutex_); +#endif + // If global motion w.r.t. current ref frame is + // INVALID/TRANSLATION/IDENTITY, skip the evaluation of global motion w.r.t + // the remaining ref frames in that direction. + if (cpi->sf.gm_sf.prune_ref_frame_for_gm_search && + cpi->common.global_motion[ref_buf_idx].wmtype <= TRANSLATION) + job_info->early_exit[cur_dir] = 1; + +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(gm_mt_mutex_); +#endif + } + error_info->setjmp = 0; + return 1; +} + +// Assigns global motion hook function and thread data to each worker. +static AOM_INLINE void prepare_gm_workers(AV1_COMP *cpi, AVxWorkerHook hook, + int num_workers) { + MultiThreadInfo *mt_info = &cpi->mt_info; + mt_info->gm_sync.gm_mt_exit = false; + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *worker = &mt_info->workers[i]; + EncWorkerData *thread_data = &mt_info->tile_thr_data[i]; + + worker->hook = hook; + worker->data1 = thread_data; + worker->data2 = NULL; + + thread_data->thread_id = i; + // Set the starting tile for each thread. + thread_data->start = i; + + thread_data->cpi = cpi; + if (i == 0) { + thread_data->td = &cpi->td; + } else { + thread_data->td = thread_data->original_td; + } + + if (thread_data->td != &cpi->td) + gm_alloc_data(cpi, &thread_data->td->gm_data); + } +} + +// Assigns available threads to past/future direction. 
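+// For example, with 3 workers and two directions, threads 0 and 2 are
+// assigned direction 0 and thread 1 is assigned direction 1.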
+static AOM_INLINE void assign_thread_to_dir(int8_t *thread_id_to_dir,
+ int num_workers) {
+ int8_t frame_dir_idx = 0;
+
+ for (int i = 0; i < num_workers; i++) {
+ thread_id_to_dir[i] = frame_dir_idx++;
+ if (frame_dir_idx == MAX_DIRECTIONS) frame_dir_idx = 0;
+ }
+}
+
+// Computes number of workers for global motion multi-threading.
+static AOM_INLINE int compute_gm_workers(const AV1_COMP *cpi) {
+ int total_refs =
+ cpi->gm_info.num_ref_frames[0] + cpi->gm_info.num_ref_frames[1];
+ int num_gm_workers = cpi->sf.gm_sf.prune_ref_frame_for_gm_search
+ ? AOMMIN(MAX_DIRECTIONS, total_refs)
+ : total_refs;
+ num_gm_workers = AOMMIN(num_gm_workers, cpi->mt_info.num_workers);
+ return (num_gm_workers);
+}
+
+// Frees the memory allocated for each worker in global motion multi-threading.
+static AOM_INLINE void gm_dealloc_thread_data(AV1_COMP *cpi, int num_workers) {
+ MultiThreadInfo *mt_info = &cpi->mt_info;
+ for (int j = 0; j < num_workers; j++) {
+ EncWorkerData *thread_data = &mt_info->tile_thr_data[j];
+ ThreadData *td = thread_data->td;
+ if (td != &cpi->td) gm_dealloc_data(&td->gm_data);
+ }
+}
+
+// Implements multi-threading for global motion.
+void av1_global_motion_estimation_mt(AV1_COMP *cpi) {
+ JobInfo *job_info = &cpi->mt_info.gm_sync.job_info;
+
+ av1_zero(*job_info);
+
+ int num_workers = compute_gm_workers(cpi);
+
+ assign_thread_to_dir(job_info->thread_id_to_dir, num_workers);
+ prepare_gm_workers(cpi, gm_mt_worker_hook, num_workers);
+ launch_workers(&cpi->mt_info, num_workers);
+ sync_enc_workers(&cpi->mt_info, &cpi->common, num_workers);
+ gm_dealloc_thread_data(cpi, num_workers);
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+static AOM_INLINE int get_next_job_allintra(
+ AV1EncRowMultiThreadSync *const row_mt_sync, const int mi_row_end,
+ int *current_mi_row, int mib_size) {
+ if (row_mt_sync->next_mi_row < mi_row_end) {
+ *current_mi_row = row_mt_sync->next_mi_row;
+ row_mt_sync->num_threads_working++;
+ row_mt_sync->next_mi_row += mib_size;
+ return 1;
+ }
+ return 0;
+}
+
+static AOM_INLINE void prepare_wiener_var_workers(AV1_COMP *const cpi,
+ AVxWorkerHook hook,
+ const int num_workers) {
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ for (int i = num_workers - 1; i >= 0; i--) {
+ AVxWorker *const worker = &mt_info->workers[i];
+ EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
+
+ worker->hook = hook;
+ worker->data1 = thread_data;
+ worker->data2 = NULL;
+
+ thread_data->thread_id = i;
+ // Set the starting tile for each thread. The preprocessing stage does
+ // not use tiles, so it is set to 0.
+ thread_data->start = 0;
+
+ thread_data->cpi = cpi;
+ if (i == 0) {
+ thread_data->td = &cpi->td;
+ } else {
+ thread_data->td = thread_data->original_td;
+ }
+
+ if (thread_data->td != &cpi->td) {
+ thread_data->td->mb = cpi->td.mb;
+ av1_alloc_mb_wiener_var_pred_buf(&cpi->common, thread_data->td);
+ }
+ }
+}
+
+static void set_mb_wiener_var_calc_done(AV1_COMP *const cpi) {
+ const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+ const BLOCK_SIZE bsize = cpi->weber_bsize;
+ const int mb_step = mi_size_wide[bsize];
+ assert(MB_WIENER_MT_UNIT_SIZE < BLOCK_SIZES_ALL);
+ const int mt_unit_step = mi_size_wide[MB_WIENER_MT_UNIT_SIZE];
+ const int mt_unit_cols =
+ (mi_params->mi_cols + (mt_unit_step >> 1)) / mt_unit_step;
+ const AV1EncAllIntraMultiThreadInfo *const intra_mt = &cpi->mt_info.intra_mt;
+ AV1EncRowMultiThreadSync *const intra_row_mt_sync =
+ &cpi->ppi->intra_row_mt_sync;
+
+ // Update the wiener variance computation of every row in the frame to
+ // indicate that it is complete in order to avoid dependent workers waiting
+ // indefinitely.
+ for (int mi_row = 0, mt_thread_id = 0; mi_row < mi_params->mi_rows;
+ mi_row += mb_step, ++mt_thread_id) {
+ intra_mt->intra_sync_write_ptr(intra_row_mt_sync, mt_thread_id,
+ mt_unit_cols - 1, mt_unit_cols);
+ }
+}
+
+static int cal_mb_wiener_var_hook(void *arg1, void *unused) {
+ (void)unused;
+ EncWorkerData *const thread_data = (EncWorkerData *)arg1;
+ AV1_COMP *const cpi = thread_data->cpi;
+ MACROBLOCK *x = &thread_data->td->mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const BLOCK_SIZE bsize = cpi->weber_bsize;
+ const int mb_step = mi_size_wide[bsize];
+ AV1EncRowMultiThreadSync *const intra_row_mt_sync =
+ &cpi->ppi->intra_row_mt_sync;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+ (void)enc_row_mt;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *enc_row_mt_mutex = enc_row_mt->mutex_;
+#endif
+
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ xd->error_info = error_info;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(enc_row_mt_mutex);
+ enc_row_mt->mb_wiener_mt_exit = true;
+ pthread_mutex_unlock(enc_row_mt_mutex);
+#endif
+ set_mb_wiener_var_calc_done(cpi);
+ return 0;
+ }
+ error_info->setjmp = 1;
+ DECLARE_ALIGNED(32, int16_t, src_diff[32 * 32]);
+ DECLARE_ALIGNED(32, tran_low_t, coeff[32 * 32]);
+ DECLARE_ALIGNED(32, tran_low_t, qcoeff[32 * 32]);
+ DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]);
+ double sum_rec_distortion = 0;
+ double sum_est_rate = 0;
+ while (1) {
+ int current_mi_row = -1;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(enc_row_mt_mutex);
+#endif
+ int has_jobs = enc_row_mt->mb_wiener_mt_exit
+ ? 0
+ : get_next_job_allintra(intra_row_mt_sync,
+ cpi->common.mi_params.mi_rows,
+ &current_mi_row, mb_step);
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(enc_row_mt_mutex);
+#endif
+ if (!has_jobs) break;
+ // TODO(chengchen): properly accumulate the distortion and rate.
+ av1_calc_mb_wiener_var_row(cpi, x, xd, current_mi_row, src_diff, coeff,
+ qcoeff, dqcoeff, &sum_rec_distortion,
+ &sum_est_rate,
+ thread_data->td->wiener_tmp_pred_buf);
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(enc_row_mt_mutex);
+#endif
+ intra_row_mt_sync->num_threads_working--;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(enc_row_mt_mutex);
+#endif
+ }
+ error_info->setjmp = 0;
+ return 1;
+}
+
+static void dealloc_mb_wiener_var_mt_data(AV1_COMP *cpi, int num_workers) {
+ av1_row_mt_sync_mem_dealloc(&cpi->ppi->intra_row_mt_sync);
+
+ MultiThreadInfo *mt_info = &cpi->mt_info;
+ for (int j = 0; j < num_workers; ++j) {
+ EncWorkerData *thread_data = &mt_info->tile_thr_data[j];
+ ThreadData *td = thread_data->td;
+ if (td != &cpi->td) av1_dealloc_mb_wiener_var_pred_buf(td);
+ }
+}
+
+// This function is the multi-threading version of computing the wiener
+// variance.
+// Note that the wiener variance is used for allintra mode (1 pass) and its
+// computation is done before the frame encoding, so we don't need to consider
+// the number of tiles; instead, we allocate all available threads to
+// the computation.
+void av1_calc_mb_wiener_var_mt(AV1_COMP *cpi, int num_workers,
+ double *sum_rec_distortion,
+ double *sum_est_rate) {
+ (void)sum_rec_distortion;
+ (void)sum_est_rate;
+ AV1_COMMON *const cm = &cpi->common;
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ AV1EncRowMultiThreadSync *const intra_row_mt_sync =
+ &cpi->ppi->intra_row_mt_sync;
+
+ // TODO(chengchen): the memory usage could be improved.
+ const int mi_rows = cm->mi_params.mi_rows;
+ row_mt_sync_mem_alloc(intra_row_mt_sync, cm, mi_rows);
+
+ intra_row_mt_sync->intrabc_extra_top_right_sb_delay = 0;
+ intra_row_mt_sync->num_threads_working = num_workers;
+ intra_row_mt_sync->next_mi_row = 0;
+ memset(intra_row_mt_sync->num_finished_cols, -1,
+ sizeof(*intra_row_mt_sync->num_finished_cols) * mi_rows);
+ mt_info->enc_row_mt.mb_wiener_mt_exit = false;
+
+ prepare_wiener_var_workers(cpi, cal_mb_wiener_var_hook, num_workers);
+ launch_workers(mt_info, num_workers);
+ sync_enc_workers(mt_info, cm, num_workers);
+ dealloc_mb_wiener_var_mt_data(cpi, num_workers);
+}
+
+// Compare and order tiles based on absolute sum of tx coeffs.
+static int compare_tile_order(const void *a, const void *b) {
+ const PackBSTileOrder *const tile_a = (const PackBSTileOrder *)a;
+ const PackBSTileOrder *const tile_b = (const PackBSTileOrder *)b;
+
+ if (tile_a->abs_sum_level > tile_b->abs_sum_level)
+ return -1;
+ else if (tile_a->abs_sum_level == tile_b->abs_sum_level)
+ return (tile_a->tile_idx > tile_b->tile_idx ? 1 : -1);
+ else
+ return 1;
+}
+
+// Get next tile index to be processed for pack bitstream
+static AOM_INLINE int get_next_pack_bs_tile_idx(
+ AV1EncPackBSSync *const pack_bs_sync, const int num_tiles) {
+ assert(pack_bs_sync->next_job_idx <= num_tiles);
+ if (pack_bs_sync->next_job_idx == num_tiles) return -1;
+
+ return pack_bs_sync->pack_bs_tile_order[pack_bs_sync->next_job_idx++]
+ .tile_idx;
+}
+
+// Calculates bitstream chunk size based on total buffer size and tile or tile
+// group size.
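+// Each non-final chunk is prorated as max_buf_size * tg_or_tile_size /
+// frame_or_tg_size, while the last chunk takes whatever buffer remains so
+// the whole budget is always handed out. E.g. a tile covering 30 of 100 mi
+// units of a 1000-byte buffer gets (1000 * 30) / 100 = 300 bytes.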
+static AOM_INLINE size_t get_bs_chunk_size(int tg_or_tile_size,
+ const int frame_or_tg_size,
+ size_t *remain_buf_size,
+ size_t max_buf_size,
+ int is_last_chunk) {
+ size_t this_chunk_size;
+ assert(*remain_buf_size > 0);
+ if (is_last_chunk) {
+ this_chunk_size = *remain_buf_size;
+ *remain_buf_size = 0;
+ } else {
+ const uint64_t size_scale = (uint64_t)max_buf_size * tg_or_tile_size;
+ this_chunk_size = (size_t)(size_scale / frame_or_tg_size);
+ *remain_buf_size -= this_chunk_size;
+ assert(*remain_buf_size > 0);
+ }
+ assert(this_chunk_size > 0);
+ return this_chunk_size;
+}
+
+// Initializes params required for pack bitstream tile.
+static void init_tile_pack_bs_params(AV1_COMP *const cpi, uint8_t *const dst,
+ struct aom_write_bit_buffer *saved_wb,
+ PackBSParams *const pack_bs_params_arr,
+ uint8_t obu_extn_header) {
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonTileParams *const tiles = &cm->tiles;
+ const int num_tiles = tiles->cols * tiles->rows;
+ // Fixed size tile groups for the moment
+ const int num_tg_hdrs = cpi->num_tg;
+ // Tile group size in terms of number of tiles.
+ const int tg_size_in_tiles = (num_tiles + num_tg_hdrs - 1) / num_tg_hdrs;
+ uint8_t *tile_dst = dst;
+ uint8_t *tile_data_curr = dst;
+ // Max tile group count cannot be more than MAX_TILES.
+ int tg_size_mi[MAX_TILES] = { 0 }; // Size of tile group in mi units
+ int tile_idx;
+ int tg_idx = 0;
+ int tile_count_in_tg = 0;
+ int new_tg = 1;
+
+ // Populate pack bitstream params of all tiles.
+ for (tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
+ const TileInfo *const tile_info = &cpi->tile_data[tile_idx].tile_info;
+ PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx];
+ // Calculate tile size in mi units.
+ const int tile_size_mi = (tile_info->mi_col_end - tile_info->mi_col_start) *
+ (tile_info->mi_row_end - tile_info->mi_row_start);
+ int is_last_tile_in_tg = 0;
+ tile_count_in_tg++;
+ if (tile_count_in_tg == tg_size_in_tiles || tile_idx == (num_tiles - 1))
+ is_last_tile_in_tg = 1;
+
+ // Populate pack bitstream params of this tile.
+ pack_bs_params->curr_tg_hdr_size = 0;
+ pack_bs_params->obu_extn_header = obu_extn_header;
+ pack_bs_params->saved_wb = saved_wb;
+ pack_bs_params->obu_header_size = 0;
+ pack_bs_params->is_last_tile_in_tg = is_last_tile_in_tg;
+ pack_bs_params->new_tg = new_tg;
+ pack_bs_params->tile_col = tile_info->tile_col;
+ pack_bs_params->tile_row = tile_info->tile_row;
+ pack_bs_params->tile_size_mi = tile_size_mi;
+ tg_size_mi[tg_idx] += tile_size_mi;
+
+ if (new_tg) new_tg = 0;
+ if (is_last_tile_in_tg) {
+ tile_count_in_tg = 0;
+ new_tg = 1;
+ tg_idx++;
+ }
+ }
+
+ assert(cpi->available_bs_size > 0);
+ size_t tg_buf_size[MAX_TILES] = { 0 };
+ size_t max_buf_size = cpi->available_bs_size;
+ size_t remain_buf_size = max_buf_size;
+ const int frame_size_mi = cm->mi_params.mi_rows * cm->mi_params.mi_cols;
+
+ tile_idx = 0;
+ // Prepare obu, tile group and frame header of each tile group.
+ for (tg_idx = 0; tg_idx < cpi->num_tg; tg_idx++) {
+ PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx];
+ int is_last_tg = tg_idx == cpi->num_tg - 1;
+ // Prorate bitstream buffer size based on tile group size and available
+ // buffer size. This buffer will be used to store headers and tile data.
+ tg_buf_size[tg_idx] =
+ get_bs_chunk_size(tg_size_mi[tg_idx], frame_size_mi, &remain_buf_size,
+ max_buf_size, is_last_tg);
+
+ pack_bs_params->dst = tile_dst;
+ pack_bs_params->tile_data_curr = tile_dst;
+
+ // Write obu, tile group and frame header at first tile in the tile
+ // group.
+ av1_write_obu_tg_tile_headers(cpi, xd, pack_bs_params, tile_idx);
+ tile_dst += tg_buf_size[tg_idx];
+
+ // Exclude headers from tile group buffer size.
+ tg_buf_size[tg_idx] -= pack_bs_params->curr_tg_hdr_size;
+ tile_idx += tg_size_in_tiles;
+ }
+
+ tg_idx = 0;
+ // Calculate bitstream buffer size of each tile in the tile group.
+ for (tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
+ PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx];
+
+ if (pack_bs_params->new_tg) {
+ max_buf_size = tg_buf_size[tg_idx];
+ remain_buf_size = max_buf_size;
+ }
+
+ // Prorate bitstream buffer size of this tile based on tile size and
+ // available buffer size. For this proration, header size is not accounted.
+ const size_t tile_buf_size = get_bs_chunk_size(
+ pack_bs_params->tile_size_mi, tg_size_mi[tg_idx], &remain_buf_size,
+ max_buf_size, pack_bs_params->is_last_tile_in_tg);
+ pack_bs_params->tile_buf_size = tile_buf_size;
+
+ // Update base address of bitstream buffer for tile and tile group.
+ if (pack_bs_params->new_tg) {
+ tile_dst = pack_bs_params->dst;
+ tile_data_curr = pack_bs_params->tile_data_curr;
+ // Account header size in first tile of a tile group.
+ pack_bs_params->tile_buf_size += pack_bs_params->curr_tg_hdr_size;
+ } else {
+ pack_bs_params->dst = tile_dst;
+ pack_bs_params->tile_data_curr = tile_data_curr;
+ }
+
+ if (pack_bs_params->is_last_tile_in_tg) tg_idx++;
+ tile_dst += pack_bs_params->tile_buf_size;
+ }
+}
+
+// Worker hook function of pack bitstream multi-threading.
+static int pack_bs_worker_hook(void *arg1, void *arg2) {
+ EncWorkerData *const thread_data = (EncWorkerData *)arg1;
+ PackBSParams *const pack_bs_params = (PackBSParams *)arg2;
+ AV1_COMP *const cpi = thread_data->cpi;
+ AV1_COMMON *const cm = &cpi->common;
+ AV1EncPackBSSync *const pack_bs_sync = &cpi->mt_info.pack_bs_sync;
+ const CommonTileParams *const tiles = &cm->tiles;
+ const int num_tiles = tiles->cols * tiles->rows;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *const pack_bs_mutex = pack_bs_sync->mutex_;
+#endif
+ MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ xd->error_info = error_info;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(pack_bs_mutex);
+ pack_bs_sync->pack_bs_mt_exit = true;
+ pthread_mutex_unlock(pack_bs_mutex);
+#endif
+ return 0;
+ }
+ error_info->setjmp = 1;
+
+ while (1) {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(pack_bs_mutex);
+#endif
+ const int tile_idx =
+ pack_bs_sync->pack_bs_mt_exit
+ ? -1
+ : get_next_pack_bs_tile_idx(pack_bs_sync, num_tiles);
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(pack_bs_mutex);
+#endif
+ // When pack_bs_mt_exit is set to true, other workers need not pursue any
+ // further jobs.
+ if (tile_idx == -1) break;
+ TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
+ thread_data->td->mb.e_mbd.tile_ctx = &this_tile->tctx;
+
+ av1_pack_tile_info(cpi, thread_data->td, &pack_bs_params[tile_idx]);
+ }
+
+ error_info->setjmp = 0;
+ return 1;
+}
+
+// Prepares thread data and workers of pack bitstream multi-threading.
+static void prepare_pack_bs_workers(AV1_COMP *const cpi,
+ PackBSParams *const pack_bs_params,
+ AVxWorkerHook hook, const int num_workers) {
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ for (int i = num_workers - 1; i >= 0; i--) {
+ AVxWorker *worker = &mt_info->workers[i];
+ EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
+ if (i == 0) {
+ thread_data->td = &cpi->td;
+ } else {
+ thread_data->td = thread_data->original_td;
+ }
+
+ if (thread_data->td != &cpi->td) thread_data->td->mb = cpi->td.mb;
+
+ thread_data->cpi = cpi;
+ thread_data->start = i;
+ thread_data->thread_id = i;
+ av1_reset_pack_bs_thread_data(thread_data->td);
+
+ worker->hook = hook;
+ worker->data1 = thread_data;
+ worker->data2 = pack_bs_params;
+ }
+
+ AV1_COMMON *const cm = &cpi->common;
+ AV1EncPackBSSync *const pack_bs_sync = &mt_info->pack_bs_sync;
+ const uint16_t num_tiles = cm->tiles.rows * cm->tiles.cols;
+ pack_bs_sync->next_job_idx = 0;
+ pack_bs_sync->pack_bs_mt_exit = false;
+
+ PackBSTileOrder *const pack_bs_tile_order = pack_bs_sync->pack_bs_tile_order;
+ // Reset tile order data of pack bitstream
+ av1_zero_array(pack_bs_tile_order, num_tiles);
+
+ // Populate pack bitstream tile order structure
+ for (uint16_t tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
+ pack_bs_tile_order[tile_idx].abs_sum_level =
+ cpi->tile_data[tile_idx].abs_sum_level;
+ pack_bs_tile_order[tile_idx].tile_idx = tile_idx;
+ }
+
+ // Sort tiles in descending order of their absolute sum of tx coefficient
+ // levels.
+ qsort(pack_bs_tile_order, num_tiles, sizeof(*pack_bs_tile_order),
+ compare_tile_order);
+}
+
+// Accumulates data after pack bitstream processing.
+static void accumulate_pack_bs_data(
+ AV1_COMP *const cpi, const PackBSParams *const pack_bs_params_arr,
+ uint8_t *const dst, uint32_t *total_size, const FrameHeaderInfo *fh_info,
+ int *const largest_tile_id, unsigned int *max_tile_size,
+ uint32_t *const obu_header_size, uint8_t **tile_data_start,
+ const int num_workers) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const CommonTileParams *const tiles = &cm->tiles;
+ const int tile_count = tiles->cols * tiles->rows;
+ // Fixed size tile groups for the moment
+ size_t curr_tg_data_size = 0;
+ int is_first_tg = 1;
+ uint8_t *curr_tg_start = dst;
+ size_t src_offset = 0;
+ size_t dst_offset = 0;
+
+ for (int tile_idx = 0; tile_idx < tile_count; tile_idx++) {
+ // PackBSParams stores all parameters required to pack tile and header
+ // info.
+ const PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx];
+ uint32_t tile_size = 0;
+
+ if (pack_bs_params->new_tg) {
+ curr_tg_start = dst + *total_size;
+ curr_tg_data_size = pack_bs_params->curr_tg_hdr_size;
+ *tile_data_start += pack_bs_params->curr_tg_hdr_size;
+ *obu_header_size = pack_bs_params->obu_header_size;
+ }
+ curr_tg_data_size +=
+ pack_bs_params->buf.size + (pack_bs_params->is_last_tile_in_tg ?
0 : 4); + + if (pack_bs_params->buf.size > *max_tile_size) { + *largest_tile_id = tile_idx; + *max_tile_size = (unsigned int)pack_bs_params->buf.size; + } + tile_size += + (uint32_t)pack_bs_params->buf.size + *pack_bs_params->total_size; + + // Pack all the chunks of tile bitstreams together + if (tile_idx != 0) memmove(dst + dst_offset, dst + src_offset, tile_size); + + if (pack_bs_params->is_last_tile_in_tg) + av1_write_last_tile_info( + cpi, fh_info, pack_bs_params->saved_wb, &curr_tg_data_size, + curr_tg_start, &tile_size, tile_data_start, largest_tile_id, + &is_first_tg, *obu_header_size, pack_bs_params->obu_extn_header); + src_offset += pack_bs_params->tile_buf_size; + dst_offset += tile_size; + *total_size += tile_size; + } + + // Accumulate thread data + MultiThreadInfo *const mt_info = &cpi->mt_info; + for (int idx = num_workers - 1; idx >= 0; idx--) { + ThreadData const *td = mt_info->tile_thr_data[idx].td; + av1_accumulate_pack_bs_thread_data(cpi, td); + } +} + +void av1_write_tile_obu_mt( + AV1_COMP *const cpi, uint8_t *const dst, uint32_t *total_size, + struct aom_write_bit_buffer *saved_wb, uint8_t obu_extn_header, + const FrameHeaderInfo *fh_info, int *const largest_tile_id, + unsigned int *max_tile_size, uint32_t *const obu_header_size, + uint8_t **tile_data_start, const int num_workers) { + MultiThreadInfo *const mt_info = &cpi->mt_info; + + PackBSParams pack_bs_params[MAX_TILES]; + uint32_t tile_size[MAX_TILES] = { 0 }; + + for (int tile_idx = 0; tile_idx < MAX_TILES; tile_idx++) + pack_bs_params[tile_idx].total_size = &tile_size[tile_idx]; + + init_tile_pack_bs_params(cpi, dst, saved_wb, pack_bs_params, obu_extn_header); + prepare_pack_bs_workers(cpi, pack_bs_params, pack_bs_worker_hook, + num_workers); + launch_workers(mt_info, num_workers); + sync_enc_workers(mt_info, &cpi->common, num_workers); + accumulate_pack_bs_data(cpi, pack_bs_params, dst, total_size, fh_info, + largest_tile_id, max_tile_size, obu_header_size, + tile_data_start, num_workers); +} + +// Deallocate memory for CDEF search multi-thread synchronization. +void av1_cdef_mt_dealloc(AV1CdefSync *cdef_sync) { + (void)cdef_sync; + assert(cdef_sync != NULL); +#if CONFIG_MULTITHREAD + if (cdef_sync->mutex_ != NULL) { + pthread_mutex_destroy(cdef_sync->mutex_); + aom_free(cdef_sync->mutex_); + } +#endif // CONFIG_MULTITHREAD +} + +// Updates the row and column indices of the next job to be processed. +// Also updates end_of_frame flag when the processing of all blocks is complete. +static void update_next_job_info(AV1CdefSync *cdef_sync, int nvfb, int nhfb) { + cdef_sync->fbc++; + if (cdef_sync->fbc == nhfb) { + cdef_sync->fbr++; + if (cdef_sync->fbr == nvfb) { + cdef_sync->end_of_frame = 1; + } else { + cdef_sync->fbc = 0; + } + } +} + +// Initializes cdef_sync parameters. +static AOM_INLINE void cdef_reset_job_info(AV1CdefSync *cdef_sync) { +#if CONFIG_MULTITHREAD + if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL); +#endif // CONFIG_MULTITHREAD + cdef_sync->end_of_frame = 0; + cdef_sync->fbr = 0; + cdef_sync->fbc = 0; + cdef_sync->cdef_mt_exit = false; +} + +// Checks if a job is available. If job is available, +// populates next job information and returns 1, else returns 0. 
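+// CDEF search jobs are filter-block units visited in raster order through
+// the shared (fbr, fbc) cursor; blocks that cdef_sb_skip() marks as
+// skippable are bypassed while still advancing the cursor.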
+static AOM_INLINE int cdef_get_next_job(AV1CdefSync *cdef_sync,
+ CdefSearchCtx *cdef_search_ctx,
+ volatile int *cur_fbr,
+ volatile int *cur_fbc,
+ volatile int *sb_count) {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(cdef_sync->mutex_);
+#endif // CONFIG_MULTITHREAD
+ int do_next_block = 0;
+ const int nvfb = cdef_search_ctx->nvfb;
+ const int nhfb = cdef_search_ctx->nhfb;
+
+ // If a block is to be skipped, do not process it and
+ // check the skip condition for the next block.
+ while (!cdef_sync->cdef_mt_exit && !cdef_sync->end_of_frame &&
+ cdef_sb_skip(cdef_search_ctx->mi_params, cdef_sync->fbr,
+ cdef_sync->fbc)) {
+ update_next_job_info(cdef_sync, nvfb, nhfb);
+ }
+
+ // Populate the information needed for the current job and update the row
+ // and column indices of the next block to be processed.
+ if (!cdef_sync->cdef_mt_exit && cdef_sync->end_of_frame == 0) {
+ do_next_block = 1;
+ *cur_fbr = cdef_sync->fbr;
+ *cur_fbc = cdef_sync->fbc;
+ *sb_count = cdef_search_ctx->sb_count;
+ cdef_search_ctx->sb_count++;
+ update_next_job_info(cdef_sync, nvfb, nhfb);
+ }
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(cdef_sync->mutex_);
+#endif // CONFIG_MULTITHREAD
+ return do_next_block;
+}
+
+// Hook function for each thread in CDEF search multi-threading.
+static int cdef_filter_block_worker_hook(void *arg1, void *arg2) {
+ EncWorkerData *thread_data = (EncWorkerData *)arg1;
+ AV1CdefSync *const cdef_sync = (AV1CdefSync *)arg2;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *cdef_mutex_ = cdef_sync->mutex_;
+#endif
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ CdefSearchCtx *cdef_search_ctx = thread_data->cpi->cdef_search_ctx;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(cdef_mutex_);
+ cdef_sync->cdef_mt_exit = true;
+ pthread_mutex_unlock(cdef_mutex_);
+#endif
+ return 0;
+ }
+ error_info->setjmp = 1;
+
+ volatile int cur_fbr, cur_fbc, sb_count;
+ while (cdef_get_next_job(cdef_sync, cdef_search_ctx, &cur_fbr, &cur_fbc,
+ &sb_count)) {
+ av1_cdef_mse_calc_block(cdef_search_ctx, error_info, cur_fbr, cur_fbc,
+ sb_count);
+ }
+ error_info->setjmp = 0;
+ return 1;
+}
+
+// Assigns CDEF search hook function and thread data to each worker.
+static void prepare_cdef_workers(AV1_COMP *cpi, AVxWorkerHook hook,
+ int num_workers) {
+ MultiThreadInfo *mt_info = &cpi->mt_info;
+ for (int i = num_workers - 1; i >= 0; i--) {
+ AVxWorker *worker = &mt_info->workers[i];
+ EncWorkerData *thread_data = &mt_info->tile_thr_data[i];
+
+ thread_data->cpi = cpi;
+ worker->hook = hook;
+ worker->data1 = thread_data;
+ worker->data2 = &mt_info->cdef_sync;
+ }
+}
+
+// Implements multi-threading for CDEF search.
+void av1_cdef_mse_calc_frame_mt(AV1_COMP *cpi) {
+ MultiThreadInfo *mt_info = &cpi->mt_info;
+ AV1CdefSync *cdef_sync = &mt_info->cdef_sync;
+ const int num_workers = mt_info->num_mod_workers[MOD_CDEF_SEARCH];
+
+ cdef_reset_job_info(cdef_sync);
+ prepare_cdef_workers(cpi, cdef_filter_block_worker_hook, num_workers);
+ launch_workers(mt_info, num_workers);
+ sync_enc_workers(mt_info, &cpi->common, num_workers);
+}
+
+// Computes num_workers for temporal filter multi-threading.
+static AOM_INLINE int compute_num_tf_workers(const AV1_COMP *cpi) {
+ // For single-pass encode, choosing the number
of workers as per tf block size was not + // found to improve speed. Hence the thread assignment for single-pass encode + // is kept based on compute_num_enc_workers(). + if (cpi->oxcf.pass < AOM_RC_SECOND_PASS) + return (av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads)); + + if (cpi->oxcf.max_threads <= 1) return 1; + + const int frame_height = cpi->common.height; + const BLOCK_SIZE block_size = TF_BLOCK_SIZE; + const int mb_height = block_size_high[block_size]; + const int mb_rows = get_num_blocks(frame_height, mb_height); + return AOMMIN(cpi->oxcf.max_threads, mb_rows); +} + +// Computes num_workers for tpl multi-threading. +static AOM_INLINE int compute_num_tpl_workers(AV1_COMP *cpi) { + return av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads); +} + +// Computes num_workers for loop filter multi-threading. +static AOM_INLINE int compute_num_lf_workers(AV1_COMP *cpi) { + return av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads); +} + +// Computes num_workers for cdef multi-threading. +static AOM_INLINE int compute_num_cdef_workers(AV1_COMP *cpi) { + return av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads); +} + +// Computes num_workers for loop-restoration multi-threading. +static AOM_INLINE int compute_num_lr_workers(AV1_COMP *cpi) { + return av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads); +} + +// Computes num_workers for pack bitstream multi-threading. +static AOM_INLINE int compute_num_pack_bs_workers(AV1_COMP *cpi) { + if (cpi->oxcf.max_threads <= 1) return 1; + return compute_num_enc_tile_mt_workers(&cpi->common, cpi->oxcf.max_threads); +} + +// Computes num_workers for all intra multi-threading. +static AOM_INLINE int compute_num_ai_workers(AV1_COMP *cpi) { + if (cpi->oxcf.max_threads <= 1) return 1; + // The multi-threading implementation of deltaq-mode = 3 in allintra + // mode is based on row multi threading. 
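+  // For example (illustrative numbers), a 1080p frame has
+  // mi_params.mi_rows = 270; with weber_bsize = BLOCK_8X8 the step below is
+  // mi_size_wide[BLOCK_8X8] = 2, giving 135 rows of work units, so at most
+  // AOMMIN(135, max_threads) workers are useful.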
+ if (!cpi->oxcf.row_mt) return 1; + cpi->weber_bsize = BLOCK_8X8; + const BLOCK_SIZE bsize = cpi->weber_bsize; + const int mb_step = mi_size_wide[bsize]; + const int num_mb_rows = cpi->common.mi_params.mi_rows / mb_step; + return AOMMIN(num_mb_rows, cpi->oxcf.max_threads); +} + +static int compute_num_mod_workers(AV1_COMP *cpi, + MULTI_THREADED_MODULES mod_name) { + int num_mod_workers = 0; + switch (mod_name) { + case MOD_FP: + if (cpi->oxcf.pass >= AOM_RC_SECOND_PASS) + num_mod_workers = 0; + else + num_mod_workers = + av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads); + break; + case MOD_TF: num_mod_workers = compute_num_tf_workers(cpi); break; + case MOD_TPL: num_mod_workers = compute_num_tpl_workers(cpi); break; + case MOD_GME: num_mod_workers = 1; break; + case MOD_ENC: + num_mod_workers = av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads); + break; + case MOD_LPF: num_mod_workers = compute_num_lf_workers(cpi); break; + case MOD_CDEF_SEARCH: + num_mod_workers = compute_num_cdef_workers(cpi); + break; + case MOD_CDEF: num_mod_workers = compute_num_cdef_workers(cpi); break; + case MOD_LR: num_mod_workers = compute_num_lr_workers(cpi); break; + case MOD_PACK_BS: num_mod_workers = compute_num_pack_bs_workers(cpi); break; + case MOD_FRAME_ENC: + num_mod_workers = cpi->ppi->p_mt_info.num_mod_workers[MOD_FRAME_ENC]; + break; + case MOD_AI: + if (cpi->oxcf.pass == AOM_RC_ONE_PASS) { + num_mod_workers = compute_num_ai_workers(cpi); + } else { + num_mod_workers = 0; + } + break; + default: assert(0); break; + } + return (num_mod_workers); +} +// Computes the number of workers for each MT modules in the encoder +void av1_compute_num_workers_for_mt(AV1_COMP *cpi) { + for (int i = MOD_FP; i < NUM_MT_MODULES; i++) { + cpi->ppi->p_mt_info.num_mod_workers[i] = + compute_num_mod_workers(cpi, (MULTI_THREADED_MODULES)i); + } +} diff --git a/third_party/aom/av1/encoder/ethread.h b/third_party/aom/av1/encoder/ethread.h new file mode 100644 index 0000000000..468e120776 --- /dev/null +++ b/third_party/aom/av1/encoder/ethread.h @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_ETHREAD_H_ +#define AOM_AV1_ENCODER_ETHREAD_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +struct AV1_COMP; +struct ThreadData; + +typedef struct EncWorkerData { + struct AV1_COMP *cpi; + struct ThreadData *td; + struct ThreadData *original_td; + struct aom_internal_error_info error_info; + AV1LfSync *lf_sync; + LFWorkerData *lf_data; + int start; + int thread_id; +} EncWorkerData; + +void av1_row_mt_sync_read(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c); +void av1_row_mt_sync_write(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c, + int cols); + +void av1_row_mt_sync_read_dummy(AV1EncRowMultiThreadSync *row_mt_sync, int r, + int c); +void av1_row_mt_sync_write_dummy(AV1EncRowMultiThreadSync *row_mt_sync, int r, + int c, int cols); + +void av1_encode_tiles_mt(struct AV1_COMP *cpi); +void av1_encode_tiles_row_mt(struct AV1_COMP *cpi); + +#if !CONFIG_REALTIME_ONLY +void av1_fp_encode_tiles_row_mt(AV1_COMP *cpi); + +int av1_fp_compute_num_enc_workers(AV1_COMP *cpi); +#endif + +void av1_accumulate_frame_counts(struct FRAME_COUNTS *acc_counts, + const struct FRAME_COUNTS *counts); + +void av1_row_mt_mem_dealloc(AV1_COMP *cpi); + +void av1_row_mt_sync_mem_dealloc(AV1EncRowMultiThreadSync *row_mt_sync); + +void av1_global_motion_estimation_mt(AV1_COMP *cpi); + +#if !CONFIG_REALTIME_ONLY +void av1_tpl_row_mt_sync_read_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync, + int r, int c); +void av1_tpl_row_mt_sync_write_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync, + int r, int c, int cols); + +void av1_tpl_row_mt_sync_read(AV1TplRowMultiThreadSync *tpl_mt_sync, int r, + int c); +void av1_tpl_row_mt_sync_write(AV1TplRowMultiThreadSync *tpl_mt_sync, int r, + int c, int cols); + +void av1_mc_flow_dispenser_mt(AV1_COMP *cpi); + +void av1_tpl_dealloc(AV1TplRowMultiThreadSync *tpl_sync); + +#endif // !CONFIG_REALTIME_ONLY + +void av1_calc_mb_wiener_var_mt(AV1_COMP *cpi, int num_workers, + double *sum_rec_distortion, + double *sum_est_rate); + +void av1_tf_do_filtering_mt(AV1_COMP *cpi); + +void av1_tf_mt_dealloc(AV1TemporalFilterSync *tf_sync); + +void av1_compute_num_workers_for_mt(AV1_COMP *cpi); + +int av1_get_max_num_workers(const AV1_COMP *cpi); + +void av1_create_workers(AV1_PRIMARY *ppi, int num_workers); + +void av1_terminate_workers(AV1_PRIMARY *ppi); + +void av1_init_frame_mt(AV1_PRIMARY *ppi, AV1_COMP *cpi); + +void av1_init_cdef_worker(AV1_COMP *cpi); + +#if !CONFIG_REALTIME_ONLY +void av1_init_lr_mt_buffers(AV1_COMP *cpi); +#endif + +#if CONFIG_MULTITHREAD +void av1_init_mt_sync(AV1_COMP *cpi, int is_first_pass); +#endif // CONFIG_MULTITHREAD + +int av1_get_num_mod_workers_for_alloc(const PrimaryMultiThreadInfo *p_mt_info, + MULTI_THREADED_MODULES mod_name); + +void av1_init_tile_thread_data(AV1_PRIMARY *ppi, int is_first_pass); + +void av1_cdef_mse_calc_frame_mt(AV1_COMP *cpi); + +void av1_cdef_mt_dealloc(AV1CdefSync *cdef_sync); + +void av1_write_tile_obu_mt( + AV1_COMP *const cpi, uint8_t *const dst, uint32_t *total_size, + struct aom_write_bit_buffer *saved_wb, uint8_t obu_extn_header, + const FrameHeaderInfo *fh_info, int *const largest_tile_id, + unsigned int *max_tile_size, uint32_t *const obu_header_size, + uint8_t **tile_data_start, const int num_workers); + +int av1_compute_num_enc_workers(const AV1_COMP *cpi, int max_workers); + +int av1_compute_num_fp_contexts(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf); + +int av1_check_fpmt_config(AV1_PRIMARY *const ppi, AV1EncoderConfig *const oxcf); + +void av1_compress_parallel_frames(AV1_PRIMARY *const 
ppi,
+                                  AV1_COMP_DATA *const first_cpi_data);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_ETHREAD_H_
diff --git a/third_party/aom/av1/encoder/extend.c b/third_party/aom/av1/encoder/extend.c
new file mode 100644
index 0000000000..e1b1e69ca7
--- /dev/null
+++ b/third_party/aom/av1/encoder/extend.c
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/common.h"
+#include "av1/encoder/extend.h"
+
+static void copy_and_extend_plane(const uint8_t *src, int src_pitch,
+                                  uint8_t *dst, int dst_pitch, int w, int h,
+                                  int extend_top, int extend_left,
+                                  int extend_bottom, int extend_right,
+                                  int chroma_step) {
+  int i, linesize;
+  // copy the left and right most columns out
+  const uint8_t *src_ptr1 = src;
+  const uint8_t *src_ptr2 = src + (w - 1) * chroma_step;
+  uint8_t *dst_ptr1 = dst - extend_left;
+  uint8_t *dst_ptr2 = dst + w;
+
+  for (i = 0; i < h; i++) {
+    memset(dst_ptr1, src_ptr1[0], extend_left);
+    if (chroma_step == 1) {
+      memcpy(dst_ptr1 + extend_left, src_ptr1, w);
+    } else {
+      for (int j = 0; j < w; j++) {
+        dst_ptr1[extend_left + j] = src_ptr1[chroma_step * j];
+      }
+    }
+    memset(dst_ptr2, src_ptr2[0], extend_right);
+    src_ptr1 += src_pitch;
+    src_ptr2 += src_pitch;
+    dst_ptr1 += dst_pitch;
+    dst_ptr2 += dst_pitch;
+  }
+
+  // Now copy the top and bottom lines into each line of the respective
+  // borders
+  src_ptr1 = dst - extend_left;
+  src_ptr2 = dst + dst_pitch * (h - 1) - extend_left;
+  dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left;
+  dst_ptr2 = dst + dst_pitch * (h)-extend_left;
+  linesize = extend_left + extend_right + w;
+  assert(linesize <= dst_pitch);
+
+  for (i = 0; i < extend_top; i++) {
+    memcpy(dst_ptr1, src_ptr1, linesize);
+    dst_ptr1 += dst_pitch;
+  }
+
+  for (i = 0; i < extend_bottom; i++) {
+    memcpy(dst_ptr2, src_ptr2, linesize);
+    dst_ptr2 += dst_pitch;
+  }
+}
+
+static void highbd_copy_and_extend_plane(const uint8_t *src8, int src_pitch,
+                                         uint8_t *dst8, int dst_pitch, int w,
+                                         int h, int extend_top, int extend_left,
+                                         int extend_bottom, int extend_right) {
+  int i, linesize;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+
+  // copy the left and right most columns out
+  const uint16_t *src_ptr1 = src;
+  const uint16_t *src_ptr2 = src + w - 1;
+  uint16_t *dst_ptr1 = dst - extend_left;
+  uint16_t *dst_ptr2 = dst + w;
+
+  for (i = 0; i < h; i++) {
+    aom_memset16(dst_ptr1, src_ptr1[0], extend_left);
+    memcpy(dst_ptr1 + extend_left, src_ptr1, w * sizeof(src_ptr1[0]));
+    aom_memset16(dst_ptr2, src_ptr2[0], extend_right);
+    src_ptr1 += src_pitch;
+    src_ptr2 += src_pitch;
+    dst_ptr1 += dst_pitch;
+    dst_ptr2 += dst_pitch;
+  }
+
+  // Now copy the top and bottom lines into each line of the respective
+  // borders
+  src_ptr1 = dst - extend_left;
+  src_ptr2 = dst + dst_pitch * (h - 1) - extend_left;
+  dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left;
+  dst_ptr2 = dst
+ dst_pitch * (h)-extend_left; + linesize = extend_left + extend_right + w; + assert(linesize <= dst_pitch); + + for (i = 0; i < extend_top; i++) { + memcpy(dst_ptr1, src_ptr1, linesize * sizeof(src_ptr1[0])); + dst_ptr1 += dst_pitch; + } + + for (i = 0; i < extend_bottom; i++) { + memcpy(dst_ptr2, src_ptr2, linesize * sizeof(src_ptr2[0])); + dst_ptr2 += dst_pitch; + } +} + +void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst) { + // Extend src frame in buffer + const int et_y = dst->border; + const int el_y = dst->border; + const int er_y = + AOMMAX(src->y_width + dst->border, ALIGN_POWER_OF_TWO(src->y_width, 6)) - + src->y_crop_width; + const int eb_y = AOMMAX(src->y_height + dst->border, + ALIGN_POWER_OF_TWO(src->y_height, 6)) - + src->y_crop_height; + const int uv_width_subsampling = src->subsampling_x; + const int uv_height_subsampling = src->subsampling_y; + const int et_uv = et_y >> uv_height_subsampling; + const int el_uv = el_y >> uv_width_subsampling; + const int eb_uv = eb_y >> uv_height_subsampling; + const int er_uv = er_y >> uv_width_subsampling; + + if (src->flags & YV12_FLAG_HIGHBITDEPTH) { + highbd_copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, src->y_crop_width, + src->y_crop_height, et_y, el_y, eb_y, er_y); + if (!src->monochrome) { + highbd_copy_and_extend_plane( + src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv); + highbd_copy_and_extend_plane( + src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv); + } + return; + } + + copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, src->y_crop_width, src->y_crop_height, + et_y, el_y, eb_y, er_y, 1); + if (!src->monochrome) { + // detect nv12 format + const int chroma_step = src->v_buffer ? 1 : 2; + const uint8_t *src_v_buffer = + src->v_buffer ? src->v_buffer : src->u_buffer + 1; + copy_and_extend_plane(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, src->uv_crop_width, + src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv, + chroma_step); + copy_and_extend_plane(src_v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, src->uv_crop_width, + src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv, + chroma_step); + } +} diff --git a/third_party/aom/av1/encoder/extend.h b/third_party/aom/av1/encoder/extend.h new file mode 100644 index 0000000000..b8cc5b9d28 --- /dev/null +++ b/third_party/aom/av1/encoder/extend.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_EXTEND_H_ +#define AOM_AV1_ENCODER_EXTEND_H_ + +#include "aom_scale/yv12config.h" +#include "aom/aom_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_EXTEND_H_ diff --git a/third_party/aom/av1/encoder/external_partition.c b/third_party/aom/av1/encoder/external_partition.c new file mode 100644 index 0000000000..79f8b4c8a4 --- /dev/null +++ b/third_party/aom/av1/encoder/external_partition.c @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/common.h" +#include "av1/encoder/external_partition.h" + +aom_codec_err_t av1_ext_part_create(aom_ext_part_funcs_t funcs, + aom_ext_part_config_t config, + ExtPartController *ext_part_controller) { + if (ext_part_controller == NULL) { + return AOM_CODEC_INVALID_PARAM; + } + ext_part_controller->funcs = funcs; + ext_part_controller->config = config; + const aom_ext_part_status_t status = ext_part_controller->funcs.create_model( + ext_part_controller->funcs.priv, &ext_part_controller->config, + &ext_part_controller->model); + if (status == AOM_EXT_PART_ERROR) { + return AOM_CODEC_ERROR; + } else if (status == AOM_EXT_PART_TEST) { + ext_part_controller->test_mode = 1; + ext_part_controller->ready = 0; + return AOM_CODEC_OK; + } + assert(status == AOM_EXT_PART_OK); + ext_part_controller->ready = 1; + return AOM_CODEC_OK; +} + +aom_codec_err_t av1_ext_part_init(ExtPartController *ext_part_controller) { + if (ext_part_controller == NULL) { + return AOM_CODEC_INVALID_PARAM; + } + av1_zero(ext_part_controller); + return AOM_CODEC_OK; +} + +aom_codec_err_t av1_ext_part_delete(ExtPartController *ext_part_controller) { + if (ext_part_controller == NULL) { + return AOM_CODEC_INVALID_PARAM; + } + if (ext_part_controller->ready) { + const aom_ext_part_status_t status = + ext_part_controller->funcs.delete_model(ext_part_controller->model); + if (status != AOM_EXT_PART_OK) { + return AOM_CODEC_ERROR; + } + } + return av1_ext_part_init(ext_part_controller); +} + +bool av1_ext_part_get_partition_decision(ExtPartController *ext_part_controller, + aom_partition_decision_t *decision) { + assert(ext_part_controller != NULL); + assert(ext_part_controller->ready); + assert(decision != NULL); + const aom_ext_part_status_t status = + ext_part_controller->funcs.get_partition_decision( + ext_part_controller->model, decision); + if (status != AOM_EXT_PART_OK) return false; + return true; +} + +bool av1_ext_part_send_partition_stats(ExtPartController *ext_part_controller, + const aom_partition_stats_t *stats) { + assert(ext_part_controller != NULL); + assert(ext_part_controller->ready); + assert(stats != NULL); + const aom_ext_part_status_t status = + ext_part_controller->funcs.send_partition_stats( + ext_part_controller->model, stats); + if (status != AOM_EXT_PART_OK) return false; + return true; +} + +bool av1_ext_part_send_features(ExtPartController 
*ext_part_controller,
+                                const aom_partition_features_t *features) {
+  assert(ext_part_controller != NULL);
+  assert(ext_part_controller->ready);
+  assert(features != NULL);
+  const aom_ext_part_status_t status = ext_part_controller->funcs.send_features(
+      ext_part_controller->model, features);
+  if (status != AOM_EXT_PART_OK) return false;
+  return true;
+}
+
+aom_ext_part_decision_mode_t av1_get_ext_part_decision_mode(
+    const ExtPartController *ext_part_controller) {
+  return ext_part_controller->funcs.decision_mode;
+}
diff --git a/third_party/aom/av1/encoder/external_partition.h b/third_party/aom/av1/encoder/external_partition.h
new file mode 100644
index 0000000000..f74973e9eb
--- /dev/null
+++ b/third_party/aom/av1/encoder/external_partition.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_EXTERNAL_PARTITION_H_
+#define AOM_AV1_ENCODER_EXTERNAL_PARTITION_H_
+
+#include <stdbool.h>
+
+#include "aom/aom_codec.h"
+#include "aom/aom_external_partition.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*!\cond */
+
+typedef struct ExtPartController {
+  int ready;
+  int test_mode;
+  aom_ext_part_config_t config;
+  aom_ext_part_model_t model;
+  aom_ext_part_funcs_t funcs;
+} ExtPartController;
+
+aom_codec_err_t av1_ext_part_create(aom_ext_part_funcs_t funcs,
+                                    aom_ext_part_config_t config,
+                                    ExtPartController *ext_part_controller);
+
+aom_codec_err_t av1_ext_part_init(ExtPartController *ext_part_controller);
+
+aom_codec_err_t av1_ext_part_delete(ExtPartController *ext_part_controller);
+
+bool av1_ext_part_get_partition_decision(ExtPartController *ext_part_controller,
+                                         aom_partition_decision_t *decision);
+
+bool av1_ext_part_send_partition_stats(ExtPartController *ext_part_controller,
+                                       const aom_partition_stats_t *stats);
+
+bool av1_ext_part_send_features(ExtPartController *ext_part_controller,
+                                const aom_partition_features_t *features);
+
+aom_ext_part_decision_mode_t av1_get_ext_part_decision_mode(
+    const ExtPartController *ext_part_controller);
+
+/*!\endcond */
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_EXTERNAL_PARTITION_H_
diff --git a/third_party/aom/av1/encoder/firstpass.c b/third_party/aom/av1/encoder/firstpass.c
new file mode 100644
index 0000000000..e20b6c177e
--- /dev/null
+++ b/third_party/aom/av1/encoder/firstpass.c
@@ -0,0 +1,1600 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/variance.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_scale/aom_scale.h"
+#include "aom_scale/yv12config.h"
+
+#include "av1/common/entropymv.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"  // av1_setup_dst_planes()
+#include "av1/common/reconintra.h"
+#include "av1/common/txb_common.h"
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/dwt.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodeframe_utils.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encoder_utils.h"
+#include "av1/encoder/encode_strategy.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/reconinter_enc.h"
+
+#define OUTPUT_FPF 0
+
+#define FIRST_PASS_Q 10.0
+#define INTRA_MODE_PENALTY 1024
+#define NEW_MV_MODE_PENALTY 32
+#define DARK_THRESH 64
+
+#define NCOUNT_INTRA_THRESH 8192
+#define NCOUNT_INTRA_FACTOR 3
+
+#define INVALID_FP_STATS_TO_PREDICT_FLAT_GOP -1
+
+static AOM_INLINE void output_stats(FIRSTPASS_STATS *stats,
+                                    struct aom_codec_pkt_list *pktlist) {
+  struct aom_codec_cx_pkt pkt;
+  pkt.kind = AOM_CODEC_STATS_PKT;
+  pkt.data.twopass_stats.buf = stats;
+  pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS);
+  if (pktlist != NULL) aom_codec_pkt_list_add(pktlist, &pkt);
+
+// TEMP debug code
+#if OUTPUT_FPF
+  {
+    FILE *fpfile;
+    fpfile = fopen("firstpass.stt", "a");
+
+    fprintf(fpfile,
+            "%12.0lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf"
+            "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf"
+            "%12.4lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf\n",
+            stats->frame, stats->weight, stats->intra_error, stats->coded_error,
+            stats->sr_coded_error, stats->pcnt_inter, stats->pcnt_motion,
+            stats->pcnt_second_ref, stats->pcnt_neutral, stats->intra_skip_pct,
+            stats->inactive_zone_rows, stats->inactive_zone_cols, stats->MVr,
+            stats->mvr_abs, stats->MVc, stats->mvc_abs, stats->MVrv,
+            stats->MVcv, stats->mv_in_out_count, stats->new_mv_count,
+            stats->count, stats->duration);
+    fclose(fpfile);
+  }
+#endif
+}
+
+void av1_twopass_zero_stats(FIRSTPASS_STATS *section) {
+  section->frame = 0.0;
+  section->weight = 0.0;
+  section->intra_error = 0.0;
+  section->frame_avg_wavelet_energy = 0.0;
+  section->coded_error = 0.0;
+  section->log_intra_error = 0.0;
+  section->log_coded_error = 0.0;
+  section->sr_coded_error = 0.0;
+  section->pcnt_inter = 0.0;
+  section->pcnt_motion = 0.0;
+  section->pcnt_second_ref = 0.0;
+  section->pcnt_neutral = 0.0;
+  section->intra_skip_pct = 0.0;
+  section->inactive_zone_rows = 0.0;
+  section->inactive_zone_cols = 0.0;
+  section->MVr = 0.0;
+  section->mvr_abs = 0.0;
+  section->MVc = 0.0;
+  section->mvc_abs = 0.0;
+  section->MVrv = 0.0;
+  section->MVcv = 0.0;
+  section->mv_in_out_count = 0.0;
+  section->new_mv_count = 0.0;
+  section->count = 0.0;
+  section->duration = 1.0;
+  section->is_flash = 0;
+  section->noise_var = 0;
+  section->cor_coeff = 1.0;
+}
+
+void av1_accumulate_stats(FIRSTPASS_STATS *section,
+                          const FIRSTPASS_STATS *frame) {
+  section->frame += frame->frame;
+  section->weight += frame->weight;
+  section->intra_error +=
frame->intra_error; + section->log_intra_error += log1p(frame->intra_error); + section->log_coded_error += log1p(frame->coded_error); + section->frame_avg_wavelet_energy += frame->frame_avg_wavelet_energy; + section->coded_error += frame->coded_error; + section->sr_coded_error += frame->sr_coded_error; + section->pcnt_inter += frame->pcnt_inter; + section->pcnt_motion += frame->pcnt_motion; + section->pcnt_second_ref += frame->pcnt_second_ref; + section->pcnt_neutral += frame->pcnt_neutral; + section->intra_skip_pct += frame->intra_skip_pct; + section->inactive_zone_rows += frame->inactive_zone_rows; + section->inactive_zone_cols += frame->inactive_zone_cols; + section->MVr += frame->MVr; + section->mvr_abs += frame->mvr_abs; + section->MVc += frame->MVc; + section->mvc_abs += frame->mvc_abs; + section->MVrv += frame->MVrv; + section->MVcv += frame->MVcv; + section->mv_in_out_count += frame->mv_in_out_count; + section->new_mv_count += frame->new_mv_count; + section->count += frame->count; + section->duration += frame->duration; +} + +static int get_unit_rows(const BLOCK_SIZE fp_block_size, const int mb_rows) { + const int height_mi_log2 = mi_size_high_log2[fp_block_size]; + const int mb_height_mi_log2 = mi_size_high_log2[BLOCK_16X16]; + if (height_mi_log2 > mb_height_mi_log2) { + return mb_rows >> (height_mi_log2 - mb_height_mi_log2); + } + + return mb_rows << (mb_height_mi_log2 - height_mi_log2); +} + +static int get_unit_cols(const BLOCK_SIZE fp_block_size, const int mb_cols) { + const int width_mi_log2 = mi_size_wide_log2[fp_block_size]; + const int mb_width_mi_log2 = mi_size_wide_log2[BLOCK_16X16]; + if (width_mi_log2 > mb_width_mi_log2) { + return mb_cols >> (width_mi_log2 - mb_width_mi_log2); + } + + return mb_cols << (mb_width_mi_log2 - width_mi_log2); +} + +// TODO(chengchen): can we simplify it even if resize has to be considered? +static int get_num_mbs(const BLOCK_SIZE fp_block_size, + const int num_mbs_16X16) { + const int width_mi_log2 = mi_size_wide_log2[fp_block_size]; + const int height_mi_log2 = mi_size_high_log2[fp_block_size]; + const int mb_width_mi_log2 = mi_size_wide_log2[BLOCK_16X16]; + const int mb_height_mi_log2 = mi_size_high_log2[BLOCK_16X16]; + // TODO(chengchen): Now this function assumes a square block is used. + // It does not support rectangular block sizes. 
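+  // For example, with fp_block_size = BLOCK_32X32 each first pass unit
+  // covers four 16X16 macroblocks, so the 16X16 count is shifted right by 2;
+  // conversely BLOCK_8X8 units shift it left by 2.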
+ assert(width_mi_log2 == height_mi_log2); + if (width_mi_log2 > mb_width_mi_log2) { + return num_mbs_16X16 >> ((width_mi_log2 - mb_width_mi_log2) + + (height_mi_log2 - mb_height_mi_log2)); + } + + return num_mbs_16X16 << ((mb_width_mi_log2 - width_mi_log2) + + (mb_height_mi_log2 - height_mi_log2)); +} + +void av1_end_first_pass(AV1_COMP *cpi) { + if (cpi->ppi->twopass.stats_buf_ctx->total_stats && !cpi->ppi->lap_enabled) + output_stats(cpi->ppi->twopass.stats_buf_ctx->total_stats, + cpi->ppi->output_pkt_list); +} + +static aom_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) { + switch (bsize) { + case BLOCK_8X8: return aom_mse8x8; + case BLOCK_16X8: return aom_mse16x8; + case BLOCK_8X16: return aom_mse8x16; + default: return aom_mse16x16; + } +} + +static unsigned int get_prediction_error(BLOCK_SIZE bsize, + const struct buf_2d *src, + const struct buf_2d *ref) { + unsigned int sse; + const aom_variance_fn_t fn = get_block_variance_fn(bsize); + fn(src->buf, src->stride, ref->buf, ref->stride, &sse); + return sse; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static aom_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize, + int bd) { + switch (bd) { + default: + switch (bsize) { + case BLOCK_8X8: return aom_highbd_8_mse8x8; + case BLOCK_16X8: return aom_highbd_8_mse16x8; + case BLOCK_8X16: return aom_highbd_8_mse8x16; + default: return aom_highbd_8_mse16x16; + } + case 10: + switch (bsize) { + case BLOCK_8X8: return aom_highbd_10_mse8x8; + case BLOCK_16X8: return aom_highbd_10_mse16x8; + case BLOCK_8X16: return aom_highbd_10_mse8x16; + default: return aom_highbd_10_mse16x16; + } + case 12: + switch (bsize) { + case BLOCK_8X8: return aom_highbd_12_mse8x8; + case BLOCK_16X8: return aom_highbd_12_mse16x8; + case BLOCK_8X16: return aom_highbd_12_mse8x16; + default: return aom_highbd_12_mse16x16; + } + } +} + +static unsigned int highbd_get_prediction_error(BLOCK_SIZE bsize, + const struct buf_2d *src, + const struct buf_2d *ref, + int bd) { + unsigned int sse; + const aom_variance_fn_t fn = highbd_get_block_variance_fn(bsize, bd); + fn(src->buf, src->stride, ref->buf, ref->stride, &sse); + return sse; +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +// Refine the motion search range according to the frame dimension +// for first pass test. +static int get_search_range(int width, int height) { + int sr = 0; + const int dim = AOMMIN(width, height); + + while ((dim << sr) < MAX_FULL_PEL_VAL) ++sr; + return sr; +} + +static AOM_INLINE const search_site_config * +av1_get_first_pass_search_site_config(const AV1_COMP *cpi, MACROBLOCK *x, + SEARCH_METHODS search_method) { + const int ref_stride = x->e_mbd.plane[0].pre[0].stride; + + // For AVIF applications, even the source frames can have changing resolution, + // so we need to manually check for the strides :( + // AV1_COMP::mv_search_params.search_site_config is a compressor level cache + // that's shared by multiple threads. In most cases where all frames have the + // same resolution, the cache contains the search site config that we need. + const MotionVectorSearchParams *mv_search_params = &cpi->mv_search_params; + if (ref_stride == mv_search_params->search_site_cfg[SS_CFG_FPF]->stride) { + return mv_search_params->search_site_cfg[SS_CFG_FPF]; + } + + // If the cache does not contain the correct stride, then we will need to rely + // on the thread level config MACROBLOCK::search_site_cfg_buf. If even the + // thread level config doesn't match, then we need to update it. 
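+  // For example (hypothetical strides), if the shared config was built for a
+  // stride of 1280 but this frame's reference has a stride of 640, the
+  // shared cache is bypassed and the thread-local config below is refreshed
+  // for stride 640 before use.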
+ search_method = search_method_lookup[search_method]; + assert(search_method_lookup[search_method] == search_method && + "The search_method_lookup table should be idempotent."); + if (ref_stride != x->search_site_cfg_buf[search_method].stride) { + av1_refresh_search_site_config(x->search_site_cfg_buf, search_method, + ref_stride); + } + + return x->search_site_cfg_buf; +} + +static AOM_INLINE void first_pass_motion_search(AV1_COMP *cpi, MACROBLOCK *x, + const MV *ref_mv, + FULLPEL_MV *best_mv, + int *best_motion_err) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + FULLPEL_MV start_mv = get_fullmv_from_mv(ref_mv); + int tmp_err; + const BLOCK_SIZE bsize = xd->mi[0]->bsize; + const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY; + const int sr = get_search_range(cm->width, cm->height); + const int step_param = cpi->sf.fp_sf.reduce_mv_step_param + sr; + + const search_site_config *first_pass_search_sites = + av1_get_first_pass_search_site_config(cpi, x, NSTEP); + const int fine_search_interval = + cpi->is_screen_content_type && cm->features.allow_intrabc; + FULLPEL_MOTION_SEARCH_PARAMS ms_params; + av1_make_default_fullpel_ms_params(&ms_params, cpi, x, bsize, ref_mv, + start_mv, first_pass_search_sites, NSTEP, + fine_search_interval); + + FULLPEL_MV this_best_mv; + FULLPEL_MV_STATS best_mv_stats; + tmp_err = av1_full_pixel_search(start_mv, &ms_params, step_param, NULL, + &this_best_mv, &best_mv_stats, NULL); + + if (tmp_err < INT_MAX) { + aom_variance_fn_ptr_t v_fn_ptr = cpi->ppi->fn_ptr[bsize]; + const MSBuffers *ms_buffers = &ms_params.ms_buffers; + tmp_err = av1_get_mvpred_sse(&ms_params.mv_cost_params, this_best_mv, + &v_fn_ptr, ms_buffers->src, ms_buffers->ref) + + new_mv_mode_penalty; + } + + if (tmp_err < *best_motion_err) { + *best_motion_err = tmp_err; + *best_mv = this_best_mv; + } +} + +static BLOCK_SIZE get_bsize(const CommonModeInfoParams *const mi_params, + const BLOCK_SIZE fp_block_size, const int unit_row, + const int unit_col) { + const int unit_width = mi_size_wide[fp_block_size]; + const int unit_height = mi_size_high[fp_block_size]; + const int is_half_width = + unit_width * unit_col + unit_width / 2 >= mi_params->mi_cols; + const int is_half_height = + unit_height * unit_row + unit_height / 2 >= mi_params->mi_rows; + const int max_dimension = + AOMMAX(block_size_wide[fp_block_size], block_size_high[fp_block_size]); + int square_block_size = 0; + // 4X4, 8X8, 16X16, 32X32, 64X64, 128X128 + switch (max_dimension) { + case 4: square_block_size = 0; break; + case 8: square_block_size = 1; break; + case 16: square_block_size = 2; break; + case 32: square_block_size = 3; break; + case 64: square_block_size = 4; break; + case 128: square_block_size = 5; break; + default: assert(0 && "First pass block size is not supported!"); break; + } + if (is_half_width && is_half_height) { + return subsize_lookup[PARTITION_SPLIT][square_block_size]; + } else if (is_half_width) { + return subsize_lookup[PARTITION_VERT][square_block_size]; + } else if (is_half_height) { + return subsize_lookup[PARTITION_HORZ][square_block_size]; + } else { + return fp_block_size; + } +} + +static int find_fp_qindex(aom_bit_depth_t bit_depth) { + return av1_find_qindex(FIRST_PASS_Q, bit_depth, 0, QINDEX_RANGE - 1); +} + +static double raw_motion_error_stdev(int *raw_motion_err_list, + int raw_motion_err_counts) { + int64_t sum_raw_err = 0; + double raw_err_avg = 0; + double raw_err_stdev = 0; + if (raw_motion_err_counts == 0) return 0; + + int i; + for (i = 0; i < 
raw_motion_err_counts; i++) { + sum_raw_err += raw_motion_err_list[i]; + } + raw_err_avg = (double)sum_raw_err / raw_motion_err_counts; + for (i = 0; i < raw_motion_err_counts; i++) { + raw_err_stdev += (raw_motion_err_list[i] - raw_err_avg) * + (raw_motion_err_list[i] - raw_err_avg); + } + // Calculate the standard deviation for the motion error of all the inter + // blocks of the 0,0 motion using the last source + // frame as the reference. + raw_err_stdev = sqrt(raw_err_stdev / raw_motion_err_counts); + return raw_err_stdev; +} + +static AOM_INLINE int calc_wavelet_energy(const AV1EncoderConfig *oxcf) { + return oxcf->q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL; +} +typedef struct intra_pred_block_pass1_args { + const SequenceHeader *seq_params; + MACROBLOCK *x; +} intra_pred_block_pass1_args; + +static INLINE void copy_rect(uint8_t *dst, int dstride, const uint8_t *src, + int sstride, int width, int height, int use_hbd) { +#if CONFIG_AV1_HIGHBITDEPTH + if (use_hbd) { + aom_highbd_convolve_copy(CONVERT_TO_SHORTPTR(src), sstride, + CONVERT_TO_SHORTPTR(dst), dstride, width, height); + } else { + aom_convolve_copy(src, sstride, dst, dstride, width, height); + } +#else + (void)use_hbd; + aom_convolve_copy(src, sstride, dst, dstride, width, height); +#endif +} + +static void first_pass_intra_pred_and_calc_diff(int plane, int block, + int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { + (void)block; + struct intra_pred_block_pass1_args *const args = arg; + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + MACROBLOCKD_PLANE *const pd = &xd->plane[plane]; + MACROBLOCK_PLANE *const p = &x->plane[plane]; + const int dst_stride = pd->dst.stride; + uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2]; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const SequenceHeader *seq_params = args->seq_params; + const int src_stride = p->src.stride; + uint8_t *src = &p->src.buf[(blk_row * src_stride + blk_col) << MI_SIZE_LOG2]; + + av1_predict_intra_block( + xd, seq_params->sb_size, seq_params->enable_intra_edge_filter, pd->width, + pd->height, tx_size, mbmi->mode, 0, 0, FILTER_INTRA_MODES, src, + src_stride, dst, dst_stride, blk_col, blk_row, plane); + + av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size); +} + +static void first_pass_predict_intra_block_for_luma_plane( + const SequenceHeader *seq_params, MACROBLOCK *x, BLOCK_SIZE bsize) { + assert(bsize < BLOCK_SIZES_ALL); + const MACROBLOCKD *const xd = &x->e_mbd; + const int plane = AOM_PLANE_Y; + const MACROBLOCKD_PLANE *const pd = &xd->plane[plane]; + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y); + const int dst_stride = pd->dst.stride; + uint8_t *dst = pd->dst.buf; + const MACROBLOCK_PLANE *const p = &x->plane[plane]; + const int src_stride = p->src.stride; + const uint8_t *src = p->src.buf; + + intra_pred_block_pass1_args args = { seq_params, x }; + av1_foreach_transformed_block_in_plane( + xd, plane_bsize, plane, first_pass_intra_pred_and_calc_diff, &args); + + // copy source data to recon buffer, as the recon buffer will be used as a + // reference frame subsequently. + copy_rect(dst, dst_stride, src, src_stride, block_size_wide[bsize], + block_size_high[bsize], seq_params->use_highbitdepth); +} + +#define UL_INTRA_THRESH 50 +#define INVALID_ROW -1 +// Computes and returns the intra pred error of a block. 
+// intra pred error: sum of squared error of the intra predicted residual. +// Inputs: +// cpi: the encoder setting. Only a few params in it will be used. +// this_frame: the current frame buffer. +// tile: tile information (not used in first pass, already init to zero) +// unit_row: row index in the unit of first pass block size. +// unit_col: column index in the unit of first pass block size. +// y_offset: the offset of y frame buffer, indicating the starting point of +// the current block. +// uv_offset: the offset of u and v frame buffer, indicating the starting +// point of the current block. +// fp_block_size: first pass block size. +// qindex: quantization step size to encode the frame. +// stats: frame encoding stats. +// Modifies: +// stats->intra_skip_count +// stats->image_data_start_row +// stats->intra_factor +// stats->brightness_factor +// stats->intra_error +// stats->frame_avg_wavelet_energy +// Returns: +// this_intra_error. +static int firstpass_intra_prediction( + AV1_COMP *cpi, ThreadData *td, YV12_BUFFER_CONFIG *const this_frame, + const TileInfo *const tile, const int unit_row, const int unit_col, + const int y_offset, const int uv_offset, const BLOCK_SIZE fp_block_size, + const int qindex, FRAME_STATS *const stats) { + const AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const SequenceHeader *const seq_params = cm->seq_params; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const int unit_scale = mi_size_wide[fp_block_size]; + const int num_planes = av1_num_planes(cm); + const BLOCK_SIZE bsize = + get_bsize(mi_params, fp_block_size, unit_row, unit_col); + + set_mi_offsets(mi_params, xd, unit_row * unit_scale, unit_col * unit_scale); + xd->plane[0].dst.buf = this_frame->y_buffer + y_offset; + if (num_planes > 1) { + xd->plane[1].dst.buf = this_frame->u_buffer + uv_offset; + xd->plane[2].dst.buf = this_frame->v_buffer + uv_offset; + } + xd->left_available = (unit_col != 0); + xd->mi[0]->bsize = bsize; + xd->mi[0]->ref_frame[0] = INTRA_FRAME; + set_mi_row_col(xd, tile, unit_row * unit_scale, mi_size_high[bsize], + unit_col * unit_scale, mi_size_wide[bsize], mi_params->mi_rows, + mi_params->mi_cols); + set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize], num_planes); + xd->mi[0]->segment_id = 0; + xd->lossless[xd->mi[0]->segment_id] = (qindex == 0); + xd->mi[0]->mode = DC_PRED; + xd->mi[0]->tx_size = TX_4X4; + + if (cpi->sf.fp_sf.disable_recon) + first_pass_predict_intra_block_for_luma_plane(seq_params, x, bsize); + else + av1_encode_intra_block_plane(cpi, x, bsize, 0, DRY_RUN_NORMAL, 0); + int this_intra_error = aom_get_mb_ss(x->plane[0].src_diff); + if (seq_params->use_highbitdepth) { + switch (seq_params->bit_depth) { + case AOM_BITS_8: break; + case AOM_BITS_10: this_intra_error >>= 4; break; + case AOM_BITS_12: this_intra_error >>= 8; break; + default: + assert(0 && + "seq_params->bit_depth should be AOM_BITS_8, " + "AOM_BITS_10 or AOM_BITS_12"); + return -1; + } + } + + if (this_intra_error < UL_INTRA_THRESH) { + ++stats->intra_skip_count; + } else if ((unit_col > 0) && (stats->image_data_start_row == INVALID_ROW)) { + stats->image_data_start_row = unit_row; + } + + double log_intra = log1p(this_intra_error); + if (log_intra < 10.0) { + stats->intra_factor += 1.0 + ((10.0 - log_intra) * 0.05); + } else { + stats->intra_factor += 1.0; + } + + int level_sample; + if (seq_params->use_highbitdepth) { + level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0]; + } else { + 
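+    // 8-bit path: sample the first luma source pixel directly. The
+    // bit-depth switch below then maps 10/12-bit samples onto the 8-bit
+    // scale (e.g. a 10-bit sample of 512 becomes 512 >> 2 = 128) so that
+    // the DARK_THRESH comparison works at any bit depth.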
level_sample = x->plane[0].src.buf[0]; + } + + if (seq_params->use_highbitdepth) { + switch (seq_params->bit_depth) { + case AOM_BITS_8: break; + case AOM_BITS_10: level_sample >>= 2; break; + case AOM_BITS_12: level_sample >>= 4; break; + default: + assert(0 && + "seq_params->bit_depth should be AOM_BITS_8, " + "AOM_BITS_10 or AOM_BITS_12"); + return -1; + } + } + if ((level_sample < DARK_THRESH) && (log_intra < 9.0)) { + stats->brightness_factor += 1.0 + (0.01 * (DARK_THRESH - level_sample)); + } else { + stats->brightness_factor += 1.0; + } + + // Intrapenalty below deals with situations where the intra and inter + // error scores are very low (e.g. a plain black frame). + // We do not have special cases in first pass for 0,0 and nearest etc so + // all inter modes carry an overhead cost estimate for the mv. + // When the error score is very low this causes us to pick all or lots of + // INTRA modes and throw lots of key frames. + // This penalty adds a cost matching that of a 0,0 mv to the intra case. + this_intra_error += INTRA_MODE_PENALTY; + + // Accumulate the intra error. + stats->intra_error += (int64_t)this_intra_error; + + // Stats based on wavelet energy is used in the following cases : + // 1. ML model which predicts if a flat structure (golden-frame only structure + // without ALT-REF and Internal-ARFs) is better. This ML model is enabled in + // constant quality mode under certain conditions. + // 2. Delta qindex mode is set as DELTA_Q_PERCEPTUAL. + // Thus, wavelet energy calculation is enabled for the above cases. + if (calc_wavelet_energy(&cpi->oxcf)) { + const int hbd = is_cur_buf_hbd(xd); + const int stride = x->plane[0].src.stride; + const int num_8x8_rows = block_size_high[fp_block_size] / 8; + const int num_8x8_cols = block_size_wide[fp_block_size] / 8; + const uint8_t *buf = x->plane[0].src.buf; + stats->frame_avg_wavelet_energy += av1_haar_ac_sad_mxn_uint8_input( + buf, stride, hbd, num_8x8_rows, num_8x8_cols); + } else { + stats->frame_avg_wavelet_energy = INVALID_FP_STATS_TO_PREDICT_FLAT_GOP; + } + + return this_intra_error; +} + +// Returns the sum of square error between source and reference blocks. +static int get_prediction_error_bitdepth(const int is_high_bitdepth, + const int bitdepth, + const BLOCK_SIZE block_size, + const struct buf_2d *src, + const struct buf_2d *ref) { + (void)is_high_bitdepth; + (void)bitdepth; +#if CONFIG_AV1_HIGHBITDEPTH + if (is_high_bitdepth) { + return highbd_get_prediction_error(block_size, src, ref, bitdepth); + } +#endif // CONFIG_AV1_HIGHBITDEPTH + return get_prediction_error(block_size, src, ref); +} + +// Accumulates motion vector stats. +// Modifies member variables of "stats". +static void accumulate_mv_stats(const MV best_mv, const FULLPEL_MV mv, + const int mb_row, const int mb_col, + const int mb_rows, const int mb_cols, + MV *last_non_zero_mv, FRAME_STATS *stats) { + if (is_zero_mv(&best_mv)) return; + + ++stats->mv_count; + // Non-zero vector, was it different from the last non zero vector? + if (!is_equal_mv(&best_mv, last_non_zero_mv)) ++stats->new_mv_count; + *last_non_zero_mv = best_mv; + + // Does the row vector point inwards or outwards? + if (mb_row < mb_rows / 2) { + if (mv.row > 0) { + --stats->sum_in_vectors; + } else if (mv.row < 0) { + ++stats->sum_in_vectors; + } + } else if (mb_row > mb_rows / 2) { + if (mv.row > 0) { + ++stats->sum_in_vectors; + } else if (mv.row < 0) { + --stats->sum_in_vectors; + } + } + + // Does the col vector point inwards or outwards? 
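+  // Same accounting as the row case above: for a block left of centre
+  // (mb_col < mb_cols / 2) a positive mv.col decrements sum_in_vectors and a
+  // negative one increments it; the signs flip for blocks right of centre.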
+  if (mb_col < mb_cols / 2) {
+    if (mv.col > 0) {
+      --stats->sum_in_vectors;
+    } else if (mv.col < 0) {
+      ++stats->sum_in_vectors;
+    }
+  } else if (mb_col > mb_cols / 2) {
+    if (mv.col > 0) {
+      ++stats->sum_in_vectors;
+    } else if (mv.col < 0) {
+      --stats->sum_in_vectors;
+    }
+  }
+}
+
+// Computes and returns the inter prediction error from the last frame.
+// Computes inter prediction errors from the golden and alt ref frames and
+// updates stats accordingly.
+// Inputs:
+//   cpi: the encoder setting. Only a few params in it will be used.
+//   last_frame: the frame buffer of the last frame.
+//   golden_frame: the frame buffer of the golden frame.
+//   unit_row: row index in the unit of first pass block size.
+//   unit_col: column index in the unit of first pass block size.
+//   recon_yoffset: the y offset of the reconstructed frame buffer,
+//                  indicating the starting point of the current block.
+//   recon_uvoffset: the u/v offset of the reconstructed frame buffer,
+//                   indicating the starting point of the current block.
+//   src_yoffset: the y offset of the source frame buffer.
+//   fp_block_size: first pass block size.
+//   this_intra_error: the intra prediction error of this block.
+//   raw_motion_err_counts: the count of raw motion vectors.
+//   raw_motion_err_list: the array that records the raw motion error.
+//   ref_mv: the reference used to start the motion search
+//   best_mv: the best mv found
+//   last_non_zero_mv: the last non zero mv found in this tile row.
+//   stats: frame encoding stats.
+// Modifies:
+//   raw_motion_err_list
+//   best_ref_mv
+//   last_mv
+//   stats: many member params in it.
+// Returns:
+//   this_inter_error
+static int firstpass_inter_prediction(
+    AV1_COMP *cpi, ThreadData *td, const YV12_BUFFER_CONFIG *const last_frame,
+    const YV12_BUFFER_CONFIG *const golden_frame, const int unit_row,
+    const int unit_col, const int recon_yoffset, const int recon_uvoffset,
+    const int src_yoffset, const BLOCK_SIZE fp_block_size,
+    const int this_intra_error, const int raw_motion_err_counts,
+    int *raw_motion_err_list, const MV ref_mv, MV *best_mv,
+    MV *last_non_zero_mv, FRAME_STATS *stats) {
+  int this_inter_error = this_intra_error;
+  AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  CurrentFrame *const current_frame = &cm->current_frame;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int is_high_bitdepth = is_cur_buf_hbd(xd);
+  const int bitdepth = xd->bd;
+  const int unit_scale = mi_size_wide[fp_block_size];
+  const BLOCK_SIZE bsize =
+      get_bsize(mi_params, fp_block_size, unit_row, unit_col);
+  const int fp_block_size_height = block_size_wide[fp_block_size];
+  const int unit_width = mi_size_wide[fp_block_size];
+  const int unit_rows = get_unit_rows(fp_block_size, mi_params->mb_rows);
+  const int unit_cols = get_unit_cols(fp_block_size, mi_params->mb_cols);
+  // Assume 0,0 motion with no mv overhead.
+  FULLPEL_MV mv = kZeroFullMv;
+  xd->plane[0].pre[0].buf = last_frame->y_buffer + recon_yoffset;
+  // Set up limit values for motion vectors to prevent them extending
+  // outside the UMV borders.
+  av1_set_mv_col_limits(mi_params, &x->mv_limits, unit_col * unit_width,
+                        fp_block_size_height >> MI_SIZE_LOG2,
+                        cpi->oxcf.border_in_pixels);
+
+  int motion_error =
+      get_prediction_error_bitdepth(is_high_bitdepth, bitdepth, bsize,
+                                    &x->plane[0].src, &xd->plane[0].pre[0]);
+
+  // Compute the motion error of the 0,0 motion using the last source
Skip the further motion search on + // reconstructed frame if this error is small. + // TODO(chiyotsai): The unscaled last source might be different dimension + // as the current source. See BUG=aomedia:3413 + struct buf_2d unscaled_last_source_buf_2d; + unscaled_last_source_buf_2d.buf = + cpi->unscaled_last_source->y_buffer + src_yoffset; + unscaled_last_source_buf_2d.stride = cpi->unscaled_last_source->y_stride; + const int raw_motion_error = get_prediction_error_bitdepth( + is_high_bitdepth, bitdepth, bsize, &x->plane[0].src, + &unscaled_last_source_buf_2d); + raw_motion_err_list[raw_motion_err_counts] = raw_motion_error; + const FIRST_PASS_SPEED_FEATURES *const fp_sf = &cpi->sf.fp_sf; + + if (raw_motion_error > fp_sf->skip_motion_search_threshold) { + // Test last reference frame using the previous best mv as the + // starting point (best reference) for the search. + first_pass_motion_search(cpi, x, &ref_mv, &mv, &motion_error); + + // If the current best reference mv is not centered on 0,0 then do a + // 0,0 based search as well. + if ((fp_sf->skip_zeromv_motion_search == 0) && !is_zero_mv(&ref_mv)) { + FULLPEL_MV tmp_mv = kZeroFullMv; + int tmp_err = INT_MAX; + first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &tmp_err); + + if (tmp_err < motion_error) { + motion_error = tmp_err; + mv = tmp_mv; + } + } + } + + // Motion search in 2nd reference frame. + int gf_motion_error = motion_error; + if ((current_frame->frame_number > 1) && golden_frame != NULL) { + FULLPEL_MV tmp_mv = kZeroFullMv; + // Assume 0,0 motion with no mv overhead. + av1_setup_pre_planes(xd, 0, golden_frame, 0, 0, NULL, 1); + xd->plane[0].pre[0].buf += recon_yoffset; + gf_motion_error = + get_prediction_error_bitdepth(is_high_bitdepth, bitdepth, bsize, + &x->plane[0].src, &xd->plane[0].pre[0]); + first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &gf_motion_error); + } + if (gf_motion_error < motion_error && gf_motion_error < this_intra_error) { + ++stats->second_ref_count; + } + // In accumulating a score for the 2nd reference frame take the + // best of the motion predicted score and the intra coded error + // (just as will be done for) accumulation of "coded_error" for + // the last frame. + if ((current_frame->frame_number > 1) && golden_frame != NULL) { + stats->sr_coded_error += AOMMIN(gf_motion_error, this_intra_error); + } else { + // TODO(chengchen): I believe logically this should also be changed to + // stats->sr_coded_error += AOMMIN(gf_motion_error, this_intra_error). + stats->sr_coded_error += motion_error; + } + + // Reset to last frame as reference buffer. + xd->plane[0].pre[0].buf = last_frame->y_buffer + recon_yoffset; + if (av1_num_planes(&cpi->common) > 1) { + xd->plane[1].pre[0].buf = last_frame->u_buffer + recon_uvoffset; + xd->plane[2].pre[0].buf = last_frame->v_buffer + recon_uvoffset; + } + + // Start by assuming that intra mode is best. + *best_mv = kZeroMv; + + if (motion_error <= this_intra_error) { + // Keep a count of cases where the inter and intra were very close + // and very low. This helps with scene cut detection for example in + // cropped clips with black bars at the sides or top and bottom. + if (((this_intra_error - INTRA_MODE_PENALTY) * 9 <= motion_error * 10) && + (this_intra_error < (2 * INTRA_MODE_PENALTY))) { + stats->neutral_count += 1.0; + // Also track cases where the intra is not much worse than the inter + // and use this in limiting the GF/arf group length. 
+    } else if ((this_intra_error > NCOUNT_INTRA_THRESH) &&
+               (this_intra_error < (NCOUNT_INTRA_FACTOR * motion_error))) {
+      stats->neutral_count +=
+          (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_intra_error);
+    }
+
+    *best_mv = get_mv_from_fullmv(&mv);
+    this_inter_error = motion_error;
+    xd->mi[0]->mode = NEWMV;
+    xd->mi[0]->mv[0].as_mv = *best_mv;
+    xd->mi[0]->tx_size = TX_4X4;
+    xd->mi[0]->ref_frame[0] = LAST_FRAME;
+    xd->mi[0]->ref_frame[1] = NONE_FRAME;
+
+    if (fp_sf->disable_recon == 0) {
+      av1_enc_build_inter_predictor(cm, xd, unit_row * unit_scale,
+                                    unit_col * unit_scale, NULL, bsize,
+                                    AOM_PLANE_Y, AOM_PLANE_Y);
+      av1_encode_sby_pass1(cpi, x, bsize);
+    }
+    stats->sum_mvr += best_mv->row;
+    stats->sum_mvr_abs += abs(best_mv->row);
+    stats->sum_mvc += best_mv->col;
+    stats->sum_mvc_abs += abs(best_mv->col);
+    stats->sum_mvrs += best_mv->row * best_mv->row;
+    stats->sum_mvcs += best_mv->col * best_mv->col;
+    ++stats->inter_count;
+
+    accumulate_mv_stats(*best_mv, mv, unit_row, unit_col, unit_rows, unit_cols,
+                        last_non_zero_mv, stats);
+  }
+
+  return this_inter_error;
+}
+
+// Normalize the first pass stats.
+// Error / counters are normalized to each MB.
+// MVs are normalized to the width/height of the frame.
+static void normalize_firstpass_stats(FIRSTPASS_STATS *fps,
+                                      double num_mbs_16x16, double f_w,
+                                      double f_h) {
+  fps->coded_error /= num_mbs_16x16;
+  fps->sr_coded_error /= num_mbs_16x16;
+  fps->intra_error /= num_mbs_16x16;
+  fps->frame_avg_wavelet_energy /= num_mbs_16x16;
+  fps->log_coded_error = log1p(fps->coded_error);
+  fps->log_intra_error = log1p(fps->intra_error);
+  fps->MVr /= f_h;
+  fps->mvr_abs /= f_h;
+  fps->MVc /= f_w;
+  fps->mvc_abs /= f_w;
+  fps->MVrv /= (f_h * f_h);
+  fps->MVcv /= (f_w * f_w);
+  fps->new_mv_count /= num_mbs_16x16;
+}
+
+// Updates the first pass stats of this frame.
+// Input:
+//   cpi: the encoder setting. Only a few params in it will be used.
+//   stats: stats accumulated for this frame.
+//   raw_err_stdev: the standard deviation for the motion error of all the
+//                  inter blocks of the (0,0) motion using the last source
+//                  frame as the reference.
+//   frame_number: current frame number.
+//   ts_duration: Duration of the frame / collection of frames.
+// Updates:
+//   twopass->total_stats: the accumulated stats.
+//   twopass->stats_buf_ctx->stats_in_end: the pointer to the current stats,
+//                                         update its value and its position
+//                                         in the buffer.
+static void update_firstpass_stats(AV1_COMP *cpi,
+                                   const FRAME_STATS *const stats,
+                                   const double raw_err_stdev,
+                                   const int frame_number,
+                                   const int64_t ts_duration,
+                                   const BLOCK_SIZE fp_block_size) {
+  TWO_PASS *twopass = &cpi->ppi->twopass;
+  AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  FIRSTPASS_STATS *this_frame_stats = twopass->stats_buf_ctx->stats_in_end;
+  FIRSTPASS_STATS fps;
+  // The minimum error here ensures some bit allocation to frames even
+  // in static regions. The allocation per MB declines for larger formats
+  // where the typical "real" energy per MB also falls.
+  // Initial estimate here uses sqrt(mbs) to define the min_err, where the
+  // number of mbs is proportional to the image area.
+  const int num_mbs_16X16 = (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE)
+                                ? cpi->initial_mbs
+                                : mi_params->MBs;
+  // Number of actual units used in the first pass; it can be a square block
+  // size other than 16X16.
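+  // For example (illustrative 1080p numbers), a 1920x1080 frame has
+  // 120 * 68 = 8160 16X16 macroblocks, so with 16X16 first pass units the
+  // min_err computed below is 200 * sqrt(8160), roughly 18000.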
+ const int num_mbs = get_num_mbs(fp_block_size, num_mbs_16X16); + const double min_err = 200 * sqrt(num_mbs); + + fps.weight = stats->intra_factor * stats->brightness_factor; + fps.frame = frame_number; + fps.coded_error = (double)(stats->coded_error >> 8) + min_err; + fps.sr_coded_error = (double)(stats->sr_coded_error >> 8) + min_err; + fps.intra_error = (double)(stats->intra_error >> 8) + min_err; + fps.frame_avg_wavelet_energy = (double)stats->frame_avg_wavelet_energy; + fps.count = 1.0; + fps.pcnt_inter = (double)stats->inter_count / num_mbs; + fps.pcnt_second_ref = (double)stats->second_ref_count / num_mbs; + fps.pcnt_neutral = (double)stats->neutral_count / num_mbs; + fps.intra_skip_pct = (double)stats->intra_skip_count / num_mbs; + fps.inactive_zone_rows = (double)stats->image_data_start_row; + fps.inactive_zone_cols = 0.0; // Placeholder: not currently supported. + fps.raw_error_stdev = raw_err_stdev; + fps.is_flash = 0; + fps.noise_var = 0.0; + fps.cor_coeff = 1.0; + fps.log_coded_error = 0.0; + fps.log_intra_error = 0.0; + + if (stats->mv_count > 0) { + fps.MVr = (double)stats->sum_mvr / stats->mv_count; + fps.mvr_abs = (double)stats->sum_mvr_abs / stats->mv_count; + fps.MVc = (double)stats->sum_mvc / stats->mv_count; + fps.mvc_abs = (double)stats->sum_mvc_abs / stats->mv_count; + fps.MVrv = ((double)stats->sum_mvrs - + ((double)stats->sum_mvr * stats->sum_mvr / stats->mv_count)) / + stats->mv_count; + fps.MVcv = ((double)stats->sum_mvcs - + ((double)stats->sum_mvc * stats->sum_mvc / stats->mv_count)) / + stats->mv_count; + fps.mv_in_out_count = (double)stats->sum_in_vectors / (stats->mv_count * 2); + fps.new_mv_count = stats->new_mv_count; + fps.pcnt_motion = (double)stats->mv_count / num_mbs; + } else { + fps.MVr = 0.0; + fps.mvr_abs = 0.0; + fps.MVc = 0.0; + fps.mvc_abs = 0.0; + fps.MVrv = 0.0; + fps.MVcv = 0.0; + fps.mv_in_out_count = 0.0; + fps.new_mv_count = 0.0; + fps.pcnt_motion = 0.0; + } + + // TODO(paulwilkins): Handle the case when duration is set to 0, or + // something less than the full time between subsequent values of + // cpi->source_time_stamp. + fps.duration = (double)ts_duration; + + normalize_firstpass_stats(&fps, num_mbs_16X16, cm->width, cm->height); + + // We will store the stats inside the persistent twopass struct (and NOT the + // local variable 'fps'), and then cpi->output_pkt_list will point to it. + *this_frame_stats = fps; + if (!cpi->ppi->lap_enabled) { + output_stats(this_frame_stats, cpi->ppi->output_pkt_list); + } else { + av1_firstpass_info_push(&twopass->firstpass_info, this_frame_stats); + } + if (cpi->ppi->twopass.stats_buf_ctx->total_stats != NULL) { + av1_accumulate_stats(cpi->ppi->twopass.stats_buf_ctx->total_stats, &fps); + } + twopass->stats_buf_ctx->stats_in_end++; + // When ducky encode is on, we always use linear buffer for stats_buf_ctx. + if (cpi->use_ducky_encode == 0) { + // TODO(angiebird): Figure out why first pass uses circular buffer. 
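+    // A plausible reason (not confirmed in the sources): in a pure first
+    // pass every stats packet has already been emitted through
+    // output_stats(), so stats_in_end can safely wrap back to
+    // stats_in_start when it reaches stats_in_buf_end, whereas LAP must
+    // keep all entries resident and therefore uses the buffer linearly.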
+    /* In two-pass mode the first pass uses it as a circular buffer;
+     * when LAP is enabled it is used as a linear buffer. */
+    if ((cpi->oxcf.pass == AOM_RC_FIRST_PASS) &&
+        (twopass->stats_buf_ctx->stats_in_end >=
+         twopass->stats_buf_ctx->stats_in_buf_end)) {
+      twopass->stats_buf_ctx->stats_in_end =
+          twopass->stats_buf_ctx->stats_in_start;
+    }
+  }
+}
+
+static void print_reconstruction_frame(
+    const YV12_BUFFER_CONFIG *const last_frame, int frame_number,
+    int do_print) {
+  if (!do_print) return;
+
+  char filename[512];
+  FILE *recon_file;
+  snprintf(filename, sizeof(filename), "enc%04d.yuv", frame_number);
+
+  if (frame_number == 0) {
+    recon_file = fopen(filename, "wb");
+  } else {
+    recon_file = fopen(filename, "ab");
+  }
+  // Bail out if the file could not be opened; fwrite()/fclose() on a NULL
+  // stream would crash.
+  if (recon_file == NULL) return;
+
+  fwrite(last_frame->buffer_alloc, last_frame->frame_size, 1, recon_file);
+  fclose(recon_file);
+}
+
+static FRAME_STATS accumulate_frame_stats(FRAME_STATS *mb_stats, int mb_rows,
+                                          int mb_cols) {
+  FRAME_STATS stats = { 0 };
+  int i, j;
+
+  stats.image_data_start_row = INVALID_ROW;
+  for (j = 0; j < mb_rows; j++) {
+    for (i = 0; i < mb_cols; i++) {
+      FRAME_STATS mb_stat = mb_stats[j * mb_cols + i];
+      stats.brightness_factor += mb_stat.brightness_factor;
+      stats.coded_error += mb_stat.coded_error;
+      stats.frame_avg_wavelet_energy += mb_stat.frame_avg_wavelet_energy;
+      if (stats.image_data_start_row == INVALID_ROW &&
+          mb_stat.image_data_start_row != INVALID_ROW) {
+        stats.image_data_start_row = mb_stat.image_data_start_row;
+      }
+      stats.inter_count += mb_stat.inter_count;
+      stats.intra_error += mb_stat.intra_error;
+      stats.intra_factor += mb_stat.intra_factor;
+      stats.intra_skip_count += mb_stat.intra_skip_count;
+      stats.mv_count += mb_stat.mv_count;
+      stats.neutral_count += mb_stat.neutral_count;
+      stats.new_mv_count += mb_stat.new_mv_count;
+      stats.second_ref_count += mb_stat.second_ref_count;
+      stats.sr_coded_error += mb_stat.sr_coded_error;
+      stats.sum_in_vectors += mb_stat.sum_in_vectors;
+      stats.sum_mvc += mb_stat.sum_mvc;
+      stats.sum_mvc_abs += mb_stat.sum_mvc_abs;
+      stats.sum_mvcs += mb_stat.sum_mvcs;
+      stats.sum_mvr += mb_stat.sum_mvr;
+      stats.sum_mvr_abs += mb_stat.sum_mvr_abs;
+      stats.sum_mvrs += mb_stat.sum_mvrs;
+    }
+  }
+  return stats;
+}
+
+static void setup_firstpass_data(AV1_COMMON *const cm,
+                                 FirstPassData *firstpass_data,
+                                 const int unit_rows, const int unit_cols) {
+  CHECK_MEM_ERROR(cm, firstpass_data->raw_motion_err_list,
+                  aom_calloc(unit_rows * unit_cols,
+                             sizeof(*firstpass_data->raw_motion_err_list)));
+  CHECK_MEM_ERROR(
+      cm, firstpass_data->mb_stats,
+      aom_calloc(unit_rows * unit_cols, sizeof(*firstpass_data->mb_stats)));
+  for (int j = 0; j < unit_rows; j++) {
+    for (int i = 0; i < unit_cols; i++) {
+      firstpass_data->mb_stats[j * unit_cols + i].image_data_start_row =
+          INVALID_ROW;
+    }
+  }
+}
+
+void av1_free_firstpass_data(FirstPassData *firstpass_data) {
+  aom_free(firstpass_data->raw_motion_err_list);
+  firstpass_data->raw_motion_err_list = NULL;
+  aom_free(firstpass_data->mb_stats);
+  firstpass_data->mb_stats = NULL;
+}
+
+int av1_get_unit_rows_in_tile(const TileInfo *tile,
+                              const BLOCK_SIZE fp_block_size) {
+  const int unit_height_log2 = mi_size_high_log2[fp_block_size];
+  const int mi_rows = tile->mi_row_end - tile->mi_row_start;
+  const int unit_rows = CEIL_POWER_OF_TWO(mi_rows, unit_height_log2);
+
+  return unit_rows;
+}
+
+int av1_get_unit_cols_in_tile(const TileInfo *tile,
+                              const BLOCK_SIZE fp_block_size) {
+  const int unit_width_log2 = mi_size_wide_log2[fp_block_size];
+  const int mi_cols = tile->mi_col_end -
tile->mi_col_start;
+  const int unit_cols = CEIL_POWER_OF_TWO(mi_cols, unit_width_log2);
+
+  return unit_cols;
+}
+
+#define FIRST_PASS_ALT_REF_DISTANCE 16
+static void first_pass_tile(AV1_COMP *cpi, ThreadData *td,
+                            TileDataEnc *tile_data,
+                            const BLOCK_SIZE fp_block_size) {
+  TileInfo *tile = &tile_data->tile_info;
+  const int unit_height = mi_size_high[fp_block_size];
+  const int unit_height_log2 = mi_size_high_log2[fp_block_size];
+  for (int mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
+       mi_row += unit_height) {
+    av1_first_pass_row(cpi, td, tile_data, mi_row >> unit_height_log2,
+                       fp_block_size);
+  }
+}
+
+static void first_pass_tiles(AV1_COMP *cpi, const BLOCK_SIZE fp_block_size) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
+
+  av1_alloc_src_diff_buf(cm, &cpi->td.mb);
+  for (int tile_row = 0; tile_row < tile_rows; ++tile_row) {
+    for (int tile_col = 0; tile_col < tile_cols; ++tile_col) {
+      TileDataEnc *const tile_data =
+          &cpi->tile_data[tile_row * tile_cols + tile_col];
+      first_pass_tile(cpi, &cpi->td, tile_data, fp_block_size);
+    }
+  }
+}
+
+void av1_first_pass_row(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data,
+                        const int unit_row, const BLOCK_SIZE fp_block_size) {
+  MACROBLOCK *const x = &td->mb;
+  AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  TileInfo *tile = &tile_data->tile_info;
+  const int qindex = find_fp_qindex(seq_params->bit_depth);
+  // First pass units are square, so the width/height lookups are
+  // interchangeable; index the tables by the matching dimension for clarity.
+  const int fp_block_size_width = block_size_wide[fp_block_size];
+  const int fp_block_size_height = block_size_high[fp_block_size];
+  const int unit_width = mi_size_wide[fp_block_size];
+  const int unit_width_log2 = mi_size_wide_log2[fp_block_size];
+  const int unit_height_log2 = mi_size_high_log2[fp_block_size];
+  const int unit_cols = mi_params->mb_cols * 4 / unit_width;
+  int raw_motion_err_counts = 0;
+  int unit_row_in_tile = unit_row - (tile->mi_row_start >> unit_height_log2);
+  int unit_col_start = tile->mi_col_start >> unit_width_log2;
+  int unit_cols_in_tile = av1_get_unit_cols_in_tile(tile, fp_block_size);
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+  AV1EncRowMultiThreadSync *const row_mt_sync = &tile_data->row_mt_sync;
+
+  const YV12_BUFFER_CONFIG *last_frame =
+      av1_get_scaled_ref_frame(cpi, LAST_FRAME);
+  if (!last_frame) {
+    last_frame = get_ref_frame_yv12_buf(cm, LAST_FRAME);
+  }
+  const YV12_BUFFER_CONFIG *golden_frame =
+      av1_get_scaled_ref_frame(cpi, GOLDEN_FRAME);
+  if (!golden_frame) {
+    golden_frame = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
+  }
+  YV12_BUFFER_CONFIG *const this_frame = &cm->cur_frame->buf;
+
+  PICK_MODE_CONTEXT *ctx = td->firstpass_ctx;
+  FRAME_STATS *mb_stats =
+      cpi->firstpass_data.mb_stats + unit_row * unit_cols + unit_col_start;
+  int *raw_motion_err_list = cpi->firstpass_data.raw_motion_err_list +
+                             unit_row * unit_cols + unit_col_start;
+  MV *first_top_mv = &tile_data->firstpass_top_mv;
+
+  for (int i = 0; i < num_planes; ++i) {
+    x->plane[i].coeff = ctx->coeff[i];
+    x->plane[i].qcoeff = ctx->qcoeff[i];
+    x->plane[i].eobs = ctx->eobs[i];
+    x->plane[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+    x->plane[i].dqcoeff = ctx->dqcoeff[i];
+  }
+
+  const int src_y_stride = cpi->source->y_stride;
+  const int recon_y_stride =
this_frame->y_stride; + const int recon_uv_stride = this_frame->uv_stride; + const int uv_mb_height = + fp_block_size_height >> (this_frame->y_height > this_frame->uv_height); + + MV best_ref_mv = kZeroMv; + MV last_mv; + + // Reset above block coeffs. + xd->up_available = (unit_row_in_tile != 0); + int recon_yoffset = (unit_row * recon_y_stride * fp_block_size_height) + + (unit_col_start * fp_block_size_width); + int src_yoffset = (unit_row * src_y_stride * fp_block_size_height) + + (unit_col_start * fp_block_size_width); + int recon_uvoffset = (unit_row * recon_uv_stride * uv_mb_height) + + (unit_col_start * uv_mb_height); + + // Set up limit values for motion vectors to prevent them extending + // outside the UMV borders. + av1_set_mv_row_limits( + mi_params, &x->mv_limits, (unit_row << unit_height_log2), + (fp_block_size_height >> MI_SIZE_LOG2), cpi->oxcf.border_in_pixels); + + av1_setup_src_planes(x, cpi->source, unit_row << unit_height_log2, + tile->mi_col_start, num_planes, fp_block_size); + + // Fix - zero the 16x16 block first. This ensures correct this_intra_error for + // block sizes smaller than 16x16. + av1_zero_array(x->plane[0].src_diff, 256); + + for (int unit_col_in_tile = 0; unit_col_in_tile < unit_cols_in_tile; + unit_col_in_tile++) { + const int unit_col = unit_col_start + unit_col_in_tile; + + enc_row_mt->sync_read_ptr(row_mt_sync, unit_row_in_tile, unit_col_in_tile); + +#if CONFIG_MULTITHREAD + if (cpi->ppi->p_mt_info.num_workers > 1) { + pthread_mutex_lock(enc_row_mt->mutex_); + bool firstpass_mt_exit = enc_row_mt->firstpass_mt_exit; + pthread_mutex_unlock(enc_row_mt->mutex_); + // Exit in case any worker has encountered an error. + if (firstpass_mt_exit) return; + } +#endif + + if (unit_col_in_tile == 0) { + last_mv = *first_top_mv; + } + int this_intra_error = firstpass_intra_prediction( + cpi, td, this_frame, tile, unit_row, unit_col, recon_yoffset, + recon_uvoffset, fp_block_size, qindex, mb_stats); + + if (!frame_is_intra_only(cm)) { + const int this_inter_error = firstpass_inter_prediction( + cpi, td, last_frame, golden_frame, unit_row, unit_col, recon_yoffset, + recon_uvoffset, src_yoffset, fp_block_size, this_intra_error, + raw_motion_err_counts, raw_motion_err_list, best_ref_mv, &best_ref_mv, + &last_mv, mb_stats); + if (unit_col_in_tile == 0) { + *first_top_mv = last_mv; + } + mb_stats->coded_error += this_inter_error; + ++raw_motion_err_counts; + } else { + mb_stats->sr_coded_error += this_intra_error; + mb_stats->coded_error += this_intra_error; + } + + // Adjust to the next column of MBs. 
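+    // (e.g. for BLOCK_16X16 with 4:2:0 input the luma pointers advance by
+    //  fp_block_size_width = 16 pixels and the chroma pointers by
+    //  uv_mb_height = 16 >> 1 = 8 pixels.)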
+ x->plane[0].src.buf += fp_block_size_width; + if (num_planes > 1) { + x->plane[1].src.buf += uv_mb_height; + x->plane[2].src.buf += uv_mb_height; + } + + recon_yoffset += fp_block_size_width; + src_yoffset += fp_block_size_width; + recon_uvoffset += uv_mb_height; + mb_stats++; + + enc_row_mt->sync_write_ptr(row_mt_sync, unit_row_in_tile, unit_col_in_tile, + unit_cols_in_tile); + } +} + +void av1_noop_first_pass_frame(AV1_COMP *cpi, const int64_t ts_duration) { + AV1_COMMON *const cm = &cpi->common; + CurrentFrame *const current_frame = &cm->current_frame; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + int max_mb_rows = mi_params->mb_rows; + int max_mb_cols = mi_params->mb_cols; + if (cpi->oxcf.frm_dim_cfg.forced_max_frame_width) { + int max_mi_cols = size_in_mi(cpi->oxcf.frm_dim_cfg.forced_max_frame_width); + max_mb_cols = ROUND_POWER_OF_TWO(max_mi_cols, 2); + } + if (cpi->oxcf.frm_dim_cfg.forced_max_frame_height) { + int max_mi_rows = size_in_mi(cpi->oxcf.frm_dim_cfg.forced_max_frame_height); + max_mb_rows = ROUND_POWER_OF_TWO(max_mi_rows, 2); + } + const int unit_rows = get_unit_rows(BLOCK_16X16, max_mb_rows); + const int unit_cols = get_unit_cols(BLOCK_16X16, max_mb_cols); + setup_firstpass_data(cm, &cpi->firstpass_data, unit_rows, unit_cols); + FRAME_STATS *mb_stats = cpi->firstpass_data.mb_stats; + FRAME_STATS stats = accumulate_frame_stats(mb_stats, unit_rows, unit_cols); + av1_free_firstpass_data(&cpi->firstpass_data); + update_firstpass_stats(cpi, &stats, 1.0, current_frame->frame_number, + ts_duration, BLOCK_16X16); +} + +void av1_first_pass(AV1_COMP *cpi, const int64_t ts_duration) { + MACROBLOCK *const x = &cpi->td.mb; + AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + CurrentFrame *const current_frame = &cm->current_frame; + const SequenceHeader *const seq_params = cm->seq_params; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + const int qindex = find_fp_qindex(seq_params->bit_depth); + const int ref_frame_flags_backup = cpi->ref_frame_flags; + cpi->ref_frame_flags = av1_ref_frame_flag_list[LAST_FRAME] | + av1_ref_frame_flag_list[GOLDEN_FRAME]; + + // Detect if the key frame is screen content type. + if (frame_is_intra_only(cm)) { + FeatureFlags *const features = &cm->features; + assert(cpi->source != NULL); + xd->cur_buf = cpi->source; + av1_set_screen_content_options(cpi, features); + } + + // Prepare the speed features + av1_set_speed_features_framesize_independent(cpi, cpi->oxcf.speed); + + // Unit size for the first pass encoding. + const BLOCK_SIZE fp_block_size = + get_fp_block_size(cpi->is_screen_content_type); + + int max_mb_rows = mi_params->mb_rows; + int max_mb_cols = mi_params->mb_cols; + if (cpi->oxcf.frm_dim_cfg.forced_max_frame_width) { + int max_mi_cols = size_in_mi(cpi->oxcf.frm_dim_cfg.forced_max_frame_width); + max_mb_cols = ROUND_POWER_OF_TWO(max_mi_cols, 2); + } + if (cpi->oxcf.frm_dim_cfg.forced_max_frame_height) { + int max_mi_rows = size_in_mi(cpi->oxcf.frm_dim_cfg.forced_max_frame_height); + max_mb_rows = ROUND_POWER_OF_TWO(max_mi_rows, 2); + } + + // Number of rows in the unit size. + // Note max_mb_rows and max_mb_cols are in the unit of 16x16. + const int unit_rows = get_unit_rows(fp_block_size, max_mb_rows); + const int unit_cols = get_unit_cols(fp_block_size, max_mb_cols); + + // Set fp_block_size, for the convenience of multi-thread usage. 
+  cpi->fp_block_size = fp_block_size;
+
+  setup_firstpass_data(cm, &cpi->firstpass_data, unit_rows, unit_cols);
+  int *raw_motion_err_list = cpi->firstpass_data.raw_motion_err_list;
+  FRAME_STATS *mb_stats = cpi->firstpass_data.mb_stats;
+
+  // Multi-threading info.
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
+  if (cpi->allocated_tiles < tile_cols * tile_rows) {
+    av1_alloc_tile_data(cpi);
+  }
+
+  av1_init_tile_data(cpi);
+
+  const YV12_BUFFER_CONFIG *last_frame = NULL;
+  const YV12_BUFFER_CONFIG *golden_frame = NULL;
+  if (!frame_is_intra_only(cm)) {
+    av1_scale_references(cpi, EIGHTTAP_REGULAR, 0, 0);
+    last_frame = av1_is_scaled(get_ref_scale_factors_const(cm, LAST_FRAME))
+                     ? av1_get_scaled_ref_frame(cpi, LAST_FRAME)
+                     : get_ref_frame_yv12_buf(cm, LAST_FRAME);
+    golden_frame = av1_is_scaled(get_ref_scale_factors_const(cm, GOLDEN_FRAME))
+                       ? av1_get_scaled_ref_frame(cpi, GOLDEN_FRAME)
+                       : get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
+  }
+
+  YV12_BUFFER_CONFIG *const this_frame = &cm->cur_frame->buf;
+  // First pass code requires valid last and new frame buffers.
+  assert(this_frame != NULL);
+  assert(frame_is_intra_only(cm) || (last_frame != NULL));
+
+  av1_setup_frame_size(cpi);
+  av1_set_mv_search_params(cpi);
+
+  set_mi_offsets(mi_params, xd, 0, 0);
+  xd->mi[0]->bsize = fp_block_size;
+
+  // Do not use periodic key frames.
+  cpi->rc.frames_to_key = INT_MAX;
+
+  av1_set_quantizer(
+      cm, cpi->oxcf.q_cfg.qm_minlevel, cpi->oxcf.q_cfg.qm_maxlevel, qindex,
+      cpi->oxcf.q_cfg.enable_chroma_deltaq, cpi->oxcf.q_cfg.enable_hdr_deltaq);
+
+  av1_setup_block_planes(xd, seq_params->subsampling_x,
+                         seq_params->subsampling_y, num_planes);
+
+  av1_setup_src_planes(x, cpi->source, 0, 0, num_planes, fp_block_size);
+  av1_setup_dst_planes(xd->plane, seq_params->sb_size, this_frame, 0, 0, 0,
+                       num_planes);
+
+  if (!frame_is_intra_only(cm)) {
+    av1_setup_pre_planes(xd, 0, last_frame, 0, 0, NULL, num_planes);
+  }
+
+  set_mi_offsets(mi_params, xd, 0, 0);
+
+  // Don't store luma on the first pass since chroma is not computed.
+  xd->cfl.store_y = 0;
+  av1_frame_init_quantizer(cpi);
+
+  av1_default_coef_probs(cm);
+  av1_init_mode_probs(cm->fc);
+  av1_init_mv_probs(cm);
+  av1_initialize_rd_consts(cpi);
+
+  enc_row_mt->sync_read_ptr = av1_row_mt_sync_read_dummy;
+  enc_row_mt->sync_write_ptr = av1_row_mt_sync_write_dummy;
+
+  if (mt_info->num_workers > 1) {
+    enc_row_mt->sync_read_ptr = av1_row_mt_sync_read;
+    enc_row_mt->sync_write_ptr = av1_row_mt_sync_write;
+    av1_fp_encode_tiles_row_mt(cpi);
+  } else {
+    first_pass_tiles(cpi, fp_block_size);
+  }
+
+  FRAME_STATS stats = accumulate_frame_stats(mb_stats, unit_rows, unit_cols);
+  int total_raw_motion_err_count =
+      frame_is_intra_only(cm) ? 0 : unit_rows * unit_cols;
+  const double raw_err_stdev =
+      raw_motion_error_stdev(raw_motion_err_list, total_raw_motion_err_count);
+  av1_free_firstpass_data(&cpi->firstpass_data);
+  av1_dealloc_src_diff_buf(&cpi->td.mb, av1_num_planes(cm));
+
+  // Clamp the image start to rows/2. This number of rows is discarded top
+  // and bottom as dead data, so rows / 2 means the frame is blank.
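+  // (Worked example: with unit_rows = 40 the start row is capped at 20;
+  //  since the dead zone is discarded from both top and bottom, a start row
+  //  of rows / 2 means no active rows remain, i.e. a blank frame.)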
+ if ((stats.image_data_start_row > unit_rows / 2) || + (stats.image_data_start_row == INVALID_ROW)) { + stats.image_data_start_row = unit_rows / 2; + } + // Exclude any image dead zone + if (stats.image_data_start_row > 0) { + stats.intra_skip_count = + AOMMAX(0, stats.intra_skip_count - + (stats.image_data_start_row * unit_cols * 2)); + } + + TWO_PASS *twopass = &cpi->ppi->twopass; + const int num_mbs_16X16 = (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE) + ? cpi->initial_mbs + : mi_params->MBs; + // Number of actual units used in the first pass, it can be other square + // block sizes than 16X16. + const int num_mbs = get_num_mbs(fp_block_size, num_mbs_16X16); + stats.intra_factor = stats.intra_factor / (double)num_mbs; + stats.brightness_factor = stats.brightness_factor / (double)num_mbs; + FIRSTPASS_STATS *this_frame_stats = twopass->stats_buf_ctx->stats_in_end; + update_firstpass_stats(cpi, &stats, raw_err_stdev, + current_frame->frame_number, ts_duration, + fp_block_size); + + // Copy the previous Last Frame back into gf buffer if the prediction is good + // enough... but also don't allow it to lag too far. + if ((twopass->sr_update_lag > 3) || + ((current_frame->frame_number > 0) && + (this_frame_stats->pcnt_inter > 0.20) && + ((this_frame_stats->intra_error / + DOUBLE_DIVIDE_CHECK(this_frame_stats->coded_error)) > 2.0))) { + if (golden_frame != NULL) { + assign_frame_buffer_p( + &cm->ref_frame_map[get_ref_frame_map_idx(cm, GOLDEN_FRAME)], + cm->ref_frame_map[get_ref_frame_map_idx(cm, LAST_FRAME)]); + } + twopass->sr_update_lag = 1; + } else { + ++twopass->sr_update_lag; + } + + aom_extend_frame_borders(this_frame, num_planes); + + // The frame we just compressed now becomes the last frame. + assign_frame_buffer_p( + &cm->ref_frame_map[get_ref_frame_map_idx(cm, LAST_FRAME)], cm->cur_frame); + + // Special case for the first frame. Copy into the GF buffer as a second + // reference. 
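+  // (Effect: for frame 0, GOLDEN_FRAME ends up sharing the just-coded
+  //  frame's buffer with LAST_FRAME, so the next frame sees two identical
+  //  references.)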
+ if (current_frame->frame_number == 0 && + get_ref_frame_map_idx(cm, GOLDEN_FRAME) != INVALID_IDX) { + assign_frame_buffer_p( + &cm->ref_frame_map[get_ref_frame_map_idx(cm, GOLDEN_FRAME)], + cm->ref_frame_map[get_ref_frame_map_idx(cm, LAST_FRAME)]); + } + + print_reconstruction_frame(last_frame, current_frame->frame_number, + /*do_print=*/0); + + ++current_frame->frame_number; + cpi->ref_frame_flags = ref_frame_flags_backup; + if (!frame_is_intra_only(cm)) { + release_scaled_references(cpi); + } +} + +aom_codec_err_t av1_firstpass_info_init(FIRSTPASS_INFO *firstpass_info, + FIRSTPASS_STATS *ext_stats_buf, + int ext_stats_buf_size) { + assert(IMPLIES(ext_stats_buf == NULL, ext_stats_buf_size == 0)); + if (ext_stats_buf == NULL) { + firstpass_info->stats_buf = firstpass_info->static_stats_buf; + firstpass_info->stats_buf_size = + sizeof(firstpass_info->static_stats_buf) / + sizeof(firstpass_info->static_stats_buf[0]); + firstpass_info->start_index = 0; + firstpass_info->cur_index = 0; + firstpass_info->stats_count = 0; + firstpass_info->future_stats_count = 0; + firstpass_info->past_stats_count = 0; + av1_zero(firstpass_info->total_stats); + if (ext_stats_buf_size == 0) { + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } else { + firstpass_info->stats_buf = ext_stats_buf; + firstpass_info->stats_buf_size = ext_stats_buf_size; + firstpass_info->start_index = 0; + firstpass_info->cur_index = 0; + firstpass_info->stats_count = firstpass_info->stats_buf_size; + firstpass_info->future_stats_count = firstpass_info->stats_count; + firstpass_info->past_stats_count = 0; + av1_zero(firstpass_info->total_stats); + for (int i = 0; i < firstpass_info->stats_count; ++i) { + av1_accumulate_stats(&firstpass_info->total_stats, + &firstpass_info->stats_buf[i]); + } + } + return AOM_CODEC_OK; +} + +aom_codec_err_t av1_firstpass_info_move_cur_index( + FIRSTPASS_INFO *firstpass_info) { + assert(firstpass_info->future_stats_count + + firstpass_info->past_stats_count == + firstpass_info->stats_count); + if (firstpass_info->future_stats_count > 1) { + firstpass_info->cur_index = + (firstpass_info->cur_index + 1) % firstpass_info->stats_buf_size; + --firstpass_info->future_stats_count; + ++firstpass_info->past_stats_count; + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } +} + +aom_codec_err_t av1_firstpass_info_pop(FIRSTPASS_INFO *firstpass_info) { + if (firstpass_info->stats_count > 0 && firstpass_info->past_stats_count > 0) { + const int next_start = + (firstpass_info->start_index + 1) % firstpass_info->stats_buf_size; + firstpass_info->start_index = next_start; + --firstpass_info->stats_count; + --firstpass_info->past_stats_count; + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } +} + +aom_codec_err_t av1_firstpass_info_move_cur_index_and_pop( + FIRSTPASS_INFO *firstpass_info) { + aom_codec_err_t ret = av1_firstpass_info_move_cur_index(firstpass_info); + if (ret != AOM_CODEC_OK) return ret; + ret = av1_firstpass_info_pop(firstpass_info); + return ret; +} + +aom_codec_err_t av1_firstpass_info_push(FIRSTPASS_INFO *firstpass_info, + const FIRSTPASS_STATS *input_stats) { + if (firstpass_info->stats_count < firstpass_info->stats_buf_size) { + const int next_index = + (firstpass_info->start_index + firstpass_info->stats_count) % + firstpass_info->stats_buf_size; + firstpass_info->stats_buf[next_index] = *input_stats; + ++firstpass_info->stats_count; + ++firstpass_info->future_stats_count; + av1_accumulate_stats(&firstpass_info->total_stats, input_stats); + return 
AOM_CODEC_OK;
+  } else {
+    return AOM_CODEC_ERROR;
+  }
+}
+
+const FIRSTPASS_STATS *av1_firstpass_info_peek(
+    const FIRSTPASS_INFO *firstpass_info, int offset_from_cur) {
+  if (offset_from_cur >= -firstpass_info->past_stats_count &&
+      offset_from_cur < firstpass_info->future_stats_count) {
+    const int index = (firstpass_info->cur_index + offset_from_cur) %
+                      firstpass_info->stats_buf_size;
+    return &firstpass_info->stats_buf[index];
+  } else {
+    return NULL;
+  }
+}
+
+int av1_firstpass_info_future_count(const FIRSTPASS_INFO *firstpass_info,
+                                    int offset_from_cur) {
+  if (offset_from_cur < firstpass_info->future_stats_count) {
+    return firstpass_info->future_stats_count - offset_from_cur;
+  }
+  return 0;
+}
+
+int av1_firstpass_info_past_count(const FIRSTPASS_INFO *firstpass_info,
+                                  int offset_from_cur) {
+  if (offset_from_cur >= -firstpass_info->past_stats_count) {
+    return offset_from_cur + firstpass_info->past_stats_count;
+  }
+  return 0;
+}
diff --git a/third_party/aom/av1/encoder/firstpass.h b/third_party/aom/av1/encoder/firstpass.h
new file mode 100644
index 0000000000..d01363a80e
--- /dev/null
+++ b/third_party/aom/av1/encoder/firstpass.h
@@ -0,0 +1,603 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_FIRSTPASS_H_
+#define AOM_AV1_ENCODER_FIRSTPASS_H_
+
+#include <stdbool.h>
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/enums.h"
+#include "av1/encoder/lookahead.h"
+#include "av1/encoder/ratectrl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x)-0.000001 : (x) + 0.000001)
+
+#define MIN_ZERO_MOTION 0.95
+#define MAX_SR_CODED_ERROR 40
+#define MAX_RAW_ERR_VAR 2000
+#define MIN_MV_IN_OUT 0.4
+
+#define VLOW_MOTION_THRESHOLD 950
+struct ThreadData;
+
+/*!
+ * \brief The structure of accumulated frame stats in the first pass.
+ *
+ * Errors (coded_error, intra_error, etc.) and counters (new_mv_count) are
+ * normalized to each MB. MV related stats (MVc, MVr, etc.) are normalized to
+ * the frame width and height. See function normalize_firstpass_stats.
+ */
+typedef struct FIRSTPASS_STATS {
+  /*!
+   * Frame number in display order, if stats are for a single frame.
+   * No real meaning for a collection of frames.
+   */
+  double frame;
+  /*!
+   * Weight assigned to this frame (or total weight for the collection of
+   * frames) currently based on intra factor and brightness factor. This is
+   * used to distribute bits between easier and harder frames.
+   */
+  double weight;
+  /*!
+   * Intra prediction error.
+   */
+  double intra_error;
+  /*!
+   * Average wavelet energy computed using Discrete Wavelet Transform (DWT).
+   */
+  double frame_avg_wavelet_energy;
+  /*!
+   * Best of intra pred error and inter pred error using last frame as ref.
+   */
+  double coded_error;
+  /*!
+   * Best of intra pred error and inter pred error using golden frame as ref.
+   */
+  double sr_coded_error;
+  /*!
+   * Percentage of blocks with inter pred error < intra pred error.
+   */
+  double pcnt_inter;
+  /*!
+   * Percentage of blocks using (inter prediction and) non-zero motion vectors.
+   */
+  double pcnt_motion;
+  /*!
+   * Percentage of blocks where golden frame was better than last or intra:
+   * inter pred error using golden frame < inter pred error using last frame,
+   * and inter pred error using golden frame < intra pred error.
+   */
+  double pcnt_second_ref;
+  /*!
+   * Percentage of blocks where intra and inter prediction errors were very
+   * close. Note that this is a 'weighted count'; that is, blocks may be
+   * weighted by how close the two errors were.
+   */
+  double pcnt_neutral;
+  /*!
+   * Percentage of blocks that have almost no intra error residual
+   * (i.e. are in effect completely flat and untextured in the intra
+   * domain). In natural videos this is uncommon, but it is much more
+   * common in animations, graphics and screen content, so may be used
+   * as a signal to detect these types of content.
+   */
+  double intra_skip_pct;
+  /*!
+   * Image mask rows top and bottom.
+   */
+  double inactive_zone_rows;
+  /*!
+   * Image mask columns at left and right edges.
+   */
+  double inactive_zone_cols;
+  /*!
+   * Average of row motion vectors.
+   */
+  double MVr;
+  /*!
+   * Mean of absolute value of row motion vectors.
+   */
+  double mvr_abs;
+  /*!
+   * Mean of column motion vectors.
+   */
+  double MVc;
+  /*!
+   * Mean of absolute value of column motion vectors.
+   */
+  double mvc_abs;
+  /*!
+   * Variance of row motion vectors.
+   */
+  double MVrv;
+  /*!
+   * Variance of column motion vectors.
+   */
+  double MVcv;
+  /*!
+   * Value in range [-1,1] indicating fraction of row and column motion vectors
+   * that point inwards (negative MV value) or outwards (positive MV value).
+   * For example, a value of 1 indicates that all row/column MVs point inwards.
+   */
+  double mv_in_out_count;
+  /*!
+   * Count of unique non-zero motion vectors.
+   */
+  double new_mv_count;
+  /*!
+   * Duration of the frame / collection of frames.
+   */
+  double duration;
+  /*!
+   * 1.0 if stats are for a single frame, OR
+   * Number of frames in this collection for which the stats are accumulated.
+   */
+  double count;
+  /*!
+   * Standard deviation of the (0, 0) motion prediction error.
+   */
+  double raw_error_stdev;
+  /*!
+   * Whether the frame contains a flash.
+   */
+  int64_t is_flash;
+  /*!
+   * Estimated noise variance.
+   */
+  double noise_var;
+  /*!
+   * Correlation coefficient with the previous frame.
+   */
+  double cor_coeff;
+  /*!
+   * Log of intra_error.
+   */
+  double log_intra_error;
+  /*!
+   * Log of coded_error.
+   */
+  double log_coded_error;
+} FIRSTPASS_STATS;
+
+// We want to keep one past stats for key frame detection
+// in test_candidate_kf()
+#define FIRSTPASS_INFO_STATS_PAST_MIN 1
+
+// The size of the static buffer used in FIRSTPASS_INFO.
+#define FIRSTPASS_INFO_STATIC_BUF_SIZE \
+  (MAX_LAP_BUFFERS + FIRSTPASS_INFO_STATS_PAST_MIN)
+
+/*!
+ * \brief Data structure used for managing first pass stats
+ */
+typedef struct {
+  /*!
+   * A static buffer that will be used when no ext_stats_buf is assigned. The
+   * ext_stats_buf is assigned through av1_firstpass_info_init() when the user
+   * already has pre-existing firstpass stats stored in an external
+   * buffer. The ext_stats_buf is usually used in two pass mode. When using one
+   * pass mode, we generate "firstpass" stats and encode the video in the same
+   * pass. In this scenario, the stats will be pushed and popped from
+   * static_stats_buf.
+   */
+  FIRSTPASS_STATS static_stats_buf[FIRSTPASS_INFO_STATIC_BUF_SIZE];
+  /*!
+   * A pointer to first pass stats.
+   * Note that this buffer will be used as a ring buffer.
+   */
+  FIRSTPASS_STATS *stats_buf;
+  /*!
+   * Size of stats_buf.
+   */
+  int stats_buf_size;
+  /*!
+   * Start index of the available frame stats.
+   * Note that start_index doesn't always point to
+   * the current frame's stats because we need to
+   * keep past stats as well. To access the current
+   * frame's stats, please use cur_index.
+   */
+  int start_index;
+
+  /*!
+   * Count of the available stats stored in stats_buf.
+   * The following condition should stay true:
+   * stats_count = future_stats_count + past_stats_count
+   */
+  int stats_count;
+
+  /*!
+   * Index of the current frame's stats.
+   */
+  int cur_index;
+
+  /*!
+   * Count of the available future stats, including the current stats.
+   */
+  int future_stats_count;
+
+  /*!
+   * Count of the available past stats, EXCLUDING the current stats.
+   */
+  int past_stats_count;
+
+  /*!
+   * Accumulation of the stats being pushed into firstpass_info.
+   */
+  FIRSTPASS_STATS total_stats;
+} FIRSTPASS_INFO;
+
+/*!\brief Init firstpass_info
+ *
+ * If using ext_stats_buf, the buffer needs to stay available during the
+ * encoding process.
+ *
+ * \ingroup rate_control
+ * \param[out]   firstpass_info      struct of firstpass_info.
+ * \param[in]    ext_stats_buf       external stats buffer. Pass in NULL if
+ *                                   choosing to use the internal
+ *                                   static_stats_buf.
+ * \param[in]    ext_stats_buf_size  external stats buffer size. Pass in 0 if
+ *                                   choosing to use the internal
+ *                                   static_stats_buf.
+ * \return status
+ */
+aom_codec_err_t av1_firstpass_info_init(FIRSTPASS_INFO *firstpass_info,
+                                        FIRSTPASS_STATS *ext_stats_buf,
+                                        int ext_stats_buf_size);
+
+/*!\brief Move cur_index by 1
+ *
+ * \ingroup rate_control
+ * \param[out]   firstpass_info      struct of firstpass_info.
+ * \return status
+ */
+aom_codec_err_t av1_firstpass_info_move_cur_index(
+    FIRSTPASS_INFO *firstpass_info);
+
+/*!\brief Pop a stats from firstpass_info
+ *
+ * \ingroup rate_control
+ * \param[out]   firstpass_info      struct of firstpass_info.
+ * \return status
+ */
+aom_codec_err_t av1_firstpass_info_pop(FIRSTPASS_INFO *firstpass_info);
+
+/*!\brief Move cur_index by 1 and pop a stats from firstpass_info
+ *
+ * \ingroup rate_control
+ * \param[out]   firstpass_info      struct of firstpass_info.
+ * \return status
+ */
+aom_codec_err_t av1_firstpass_info_move_cur_index_and_pop(
+    FIRSTPASS_INFO *firstpass_info);
+
+/*!\brief Push a stats into firstpass_info
+ *
+ * Note that the input stats will be copied into firstpass_info.
+ * \ingroup rate_control
+ * \param[out]   firstpass_info      struct of firstpass_info.
+ * \param[in]    input_stats         input stats
+ * \return status
+ */
+aom_codec_err_t av1_firstpass_info_push(FIRSTPASS_INFO *firstpass_info,
+                                        const FIRSTPASS_STATS *input_stats);
+
+/*!\brief Peek at a stats from firstpass_info
+ *
+ * The target index is as follows.
+ * (cur_index + offset_from_cur) % firstpass_info->stats_buf_size
+ *
+ * \ingroup rate_control
+ * \param[in]    firstpass_info      struct of firstpass_info.
+ * \param[in]    offset_from_cur     index offset from cur_index.
+ * \return pointer to the stats. The pointer will be NULL if
+ *         stats_index_offset is invalid.
+ */
+const FIRSTPASS_STATS *av1_firstpass_info_peek(
+    const FIRSTPASS_INFO *firstpass_info, int offset_from_cur);
+
+/*!\brief Count the future stats from the target in firstpass_info
+ * Note that the target stats will be counted as well.
+ * The target index is as follows.
+ * (cur_index + offset_from_cur) % firstpass_info->stats_buf_size
+ *
+ * \ingroup rate_control
+ * \param[in]    firstpass_info      struct of firstpass_info.
+ * \param[in]    offset_from_cur     target stats's index offset
+ *                                   from cur_index.
+ * \return Number of stats in the future after the target stats,
+ *         including itself.
+ */
+int av1_firstpass_info_future_count(const FIRSTPASS_INFO *firstpass_info,
+                                    int offset_from_cur);
+
+/*!\brief Count the past stats before the target in firstpass_info
+ * Note that the target stats will NOT be counted.
+ * The target index is as follows.
+ * (cur_index + offset_from_cur) % firstpass_info->stats_buf_size
+ *
+ * \ingroup rate_control
+ * \param[in]    firstpass_info      struct of firstpass_info.
+ * \param[in]    offset_from_cur     target stats's index offset
+ *                                   from cur_index.
+ * \return Number of stats in the past before the target stats,
+ *         excluding itself.
+ */
+int av1_firstpass_info_past_count(const FIRSTPASS_INFO *firstpass_info,
+                                  int offset_from_cur);
+
+/*!\cond */
+#define FC_ANIMATION_THRESH 0.15
+enum {
+  FC_NORMAL = 0,
+  FC_GRAPHICS_ANIMATION = 1,
+  FRAME_CONTENT_TYPES = 2
+} UENUM1BYTE(FRAME_CONTENT_TYPE);
+/*!\endcond */
+
+/*!
+ * \brief Data related to the current GF/ARF group and the
+ * individual frames within the group
+ */
+typedef struct GF_GROUP {
+  /*!\cond */
+  // Frame update type, e.g. ARF/GF/LF/Overlay
+  FRAME_UPDATE_TYPE update_type[MAX_STATIC_GF_GROUP_LENGTH];
+  unsigned char arf_src_offset[MAX_STATIC_GF_GROUP_LENGTH];
+  // The number of frames displayed so far within the GOP at a given coding
+  // frame.
+  unsigned char cur_frame_idx[MAX_STATIC_GF_GROUP_LENGTH];
+  int layer_depth[MAX_STATIC_GF_GROUP_LENGTH];
+  int arf_boost[MAX_STATIC_GF_GROUP_LENGTH];
+  int max_layer_depth;
+  int max_layer_depth_allowed;
+  // This is currently only populated for AOM_Q mode
+  int q_val[MAX_STATIC_GF_GROUP_LENGTH];
+  int rdmult_val[MAX_STATIC_GF_GROUP_LENGTH];
+  int bit_allocation[MAX_STATIC_GF_GROUP_LENGTH];
+  // The frame coding type - inter/intra frame
+  FRAME_TYPE frame_type[MAX_STATIC_GF_GROUP_LENGTH];
+  // The reference frame buffer control - update or reset
+  REFBUF_STATE refbuf_state[MAX_STATIC_GF_GROUP_LENGTH];
+  int arf_index;  // the index in the gf group of ARF, if no arf, then -1
+  int size;       // The total length of a GOP
+
+  // The offset into lookahead_ctx for choosing
+  // source of frame parallel encodes.
+  int src_offset[MAX_STATIC_GF_GROUP_LENGTH];
+  // Stores the display order hint of each frame in the current GF_GROUP.
+  int display_idx[MAX_STATIC_GF_GROUP_LENGTH];
+
+  // The reference frame list maps the reference frame indexes to its
+  // buffer index in the decoded buffer. A value of -1 means the
+  // corresponding reference frame index doesn't point towards any
+  // previously decoded frame.
+  int8_t ref_frame_list[MAX_STATIC_GF_GROUP_LENGTH][REF_FRAMES];
+  // Update frame index
+  int update_ref_idx[MAX_STATIC_GF_GROUP_LENGTH];
+  // The map_idx of the primary reference
+  int primary_ref_idx[MAX_STATIC_GF_GROUP_LENGTH];
+
+  // Indicates the level of parallelism in frame parallel encodes.
+  // 0 : frame is independently encoded (not part of parallel encodes).
+  // 1 : frame is the first in encode order in a given parallel encode set.
+  // 2 : frame occurs later in encode order in a given parallel encode set.
+  int frame_parallel_level[MAX_STATIC_GF_GROUP_LENGTH];
+  // Indicates whether a frame should act as a non-reference frame.
+  bool is_frame_non_ref[MAX_STATIC_GF_GROUP_LENGTH];
+  // Indicates whether a frame is dropped.
+  bool is_frame_dropped[MAX_STATIC_GF_GROUP_LENGTH];
+
+  // Stores the display order hint of the frames not to be
+  // refreshed by the current frame.
+ int skip_frame_refresh[MAX_STATIC_GF_GROUP_LENGTH][REF_FRAMES]; + // Stores the display order hint of the frame to be excluded during reference + // assignment. + int skip_frame_as_ref[MAX_STATIC_GF_GROUP_LENGTH]; + /*!\endcond */ +} GF_GROUP; +/*!\cond */ + +typedef struct { + // Track if the last frame in a GOP has higher quality. + int arf_gf_boost_lst; +} GF_STATE; + +typedef struct { + FIRSTPASS_STATS *stats_in_start; + FIRSTPASS_STATS *stats_in_end; + FIRSTPASS_STATS *stats_in_buf_end; + FIRSTPASS_STATS *total_stats; + FIRSTPASS_STATS *total_left_stats; +} STATS_BUFFER_CTX; + +/*!\endcond */ + +/*! + * \brief Two pass status and control data. + */ +typedef struct { + /*!\cond */ + unsigned int section_intra_rating; + // Circular queue of first pass stats stored for most recent frames. + // cpi->output_pkt_list[i].data.twopass_stats.buf points to actual data stored + // here. + FIRSTPASS_STATS *frame_stats_arr[MAX_LAP_BUFFERS + 1]; + int frame_stats_next_idx; // Index to next unused element in frame_stats_arr. + STATS_BUFFER_CTX *stats_buf_ctx; + FIRSTPASS_INFO firstpass_info; // This is the first pass data structure + // intended to replace stats_in + int first_pass_done; + int64_t bits_left; + double modified_error_min; + double modified_error_max; + double modified_error_left; + + // Projected total bits available for a key frame group of frames + int64_t kf_group_bits; + + // Error score of frames still to be coded in kf group + double kf_group_error_left; + + // Over time correction for bits per macro block estimation + double bpm_factor; + + // Record of target and actual bits spent in current ARF group + int rolling_arf_group_target_bits; + int rolling_arf_group_actual_bits; + + int sr_update_lag; + + int kf_zeromotion_pct; + int last_kfgroup_zeromotion_pct; + int extend_minq; + int extend_maxq; + /*!\endcond */ +} TWO_PASS; + +/*! + * \brief Frame level Two pass status and control data. + */ +typedef struct { + /*!\cond */ + const FIRSTPASS_STATS *stats_in; + // Pointer to the stats of the current frame. + const FIRSTPASS_STATS *this_frame; + double mb_av_energy; + // An indication of the content type of the current frame + FRAME_CONTENT_TYPE fr_content_type; + double frame_avg_haar_energy; + /*!\endcond */ +} TWO_PASS_FRAME; + +/*!\cond */ + +// This structure contains several key parameters to be accumulated for this +// frame. +typedef struct { + // Intra prediction error. + int64_t intra_error; + // Average wavelet energy computed using Discrete Wavelet Transform (DWT). + int64_t frame_avg_wavelet_energy; + // Best of intra pred error and inter pred error using last frame as ref. + int64_t coded_error; + // Best of intra pred error and inter pred error using golden frame as ref. + int64_t sr_coded_error; + // Count of motion vector. + int mv_count; + // Count of blocks that pick inter prediction (inter pred error is smaller + // than intra pred error). + int inter_count; + // Count of blocks that pick second ref (golden frame). + int second_ref_count; + // Count of blocks where the inter and intra are very close and very low. + double neutral_count; + // Count of blocks where intra error is very small. + int intra_skip_count; + // Start row. + int image_data_start_row; + // Count of unique non-zero motion vectors. + int new_mv_count; + // Sum of inward motion vectors. + int sum_in_vectors; + // Sum of motion vector row. + int sum_mvr; + // Sum of motion vector column. + int sum_mvc; + // Sum of absolute value of motion vector row. 
+  int sum_mvr_abs;
+  // Sum of absolute value of motion vector column.
+  int sum_mvc_abs;
+  // Sum of the square of motion vector row.
+  int64_t sum_mvrs;
+  // Sum of the square of motion vector column.
+  int64_t sum_mvcs;
+  // A factor calculated using intra pred error.
+  double intra_factor;
+  // A factor that measures brightness.
+  double brightness_factor;
+} FRAME_STATS;
+
+// This structure contains first pass data.
+typedef struct {
+  // Buffer holding frame stats for all MACROBLOCKs.
+  // mb_stats[i] stores the FRAME_STATS of the ith
+  // MB in raster scan order.
+  FRAME_STATS *mb_stats;
+  // Buffer to store the prediction error of the (0,0) motion
+  // vector using the last source frame as the reference.
+  // raw_motion_err_list[i] stores the raw_motion_err of
+  // the ith MB in raster scan order.
+  int *raw_motion_err_list;
+} FirstPassData;
+
+struct AV1_COMP;
+struct EncodeFrameParams;
+struct AV1EncoderConfig;
+struct TileDataEnc;
+
+static INLINE int is_fp_wavelet_energy_invalid(
+    const FIRSTPASS_STATS *fp_stats) {
+  assert(fp_stats != NULL);
+  return (fp_stats->frame_avg_wavelet_energy < 0);
+}
+
+static INLINE BLOCK_SIZE get_fp_block_size(int is_screen_content_type) {
+  return (is_screen_content_type ? BLOCK_8X8 : BLOCK_16X16);
+}
+
+int av1_get_unit_rows_in_tile(const TileInfo *tile,
+                              const BLOCK_SIZE fp_block_size);
+int av1_get_unit_cols_in_tile(const TileInfo *tile,
+                              const BLOCK_SIZE fp_block_size);
+
+void av1_first_pass_row(struct AV1_COMP *cpi, struct ThreadData *td,
+                        struct TileDataEnc *tile_data, const int mb_row,
+                        const BLOCK_SIZE fp_block_size);
+void av1_end_first_pass(struct AV1_COMP *cpi);
+
+void av1_free_firstpass_data(FirstPassData *firstpass_data);
+
+void av1_twopass_zero_stats(FIRSTPASS_STATS *section);
+void av1_accumulate_stats(FIRSTPASS_STATS *section,
+                          const FIRSTPASS_STATS *frame);
+/*!\endcond */
+
+/*!\brief AV1 first pass encoding.
+ *
+ * \ingroup rate_control
+ * This function is the first encoding pass for the two pass encoding mode.
+ * It encodes the whole video and collects essential information.
+ * Two pass encoding is an encoding mode in the reference software (libaom)
+ * of AV1 for high performance encoding. The first pass is a fast encoding
+ * process used to collect essential information to help the second pass make
+ * encoding decisions and improve coding quality. The collected stats are used
+ * in rate control, for example, to determine frame cuts and the position of
+ * the alternative reference frame (ARF).
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    ts_duration    Duration of the frame / collection of frames
+ *
+ * \remark Nothing is returned. Instead, the "TWO_PASS" structure inside "cpi"
+ * is modified to store information computed in this function.
+ */
+void av1_first_pass(struct AV1_COMP *cpi, const int64_t ts_duration);
+
+void av1_noop_first_pass_frame(struct AV1_COMP *cpi,
+                               const int64_t ts_duration);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_FIRSTPASS_H_
diff --git a/third_party/aom/av1/encoder/global_motion.c b/third_party/aom/av1/encoder/global_motion.c
new file mode 100644
index 0000000000..73910de121
--- /dev/null
+++ b/third_party/aom/av1/encoder/global_motion.c
@@ -0,0 +1,575 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0.
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include +#include +#include +#include + +#include "config/aom_dsp_rtcd.h" + +#include "av1/encoder/global_motion.h" + +#include "av1/common/convolve.h" +#include "av1/common/warped_motion.h" + +#include "av1/encoder/segmentation.h" + +#define MIN_TRANS_THRESH (1 * GM_TRANS_DECODE_FACTOR) + +// Border over which to compute the global motion +#define ERRORADV_BORDER 0 + +/* clang-format off */ +// Error metric used for global motion evaluation. +// For 8-bit input, the pixel error used to index this table will always +// be between -255 and +255. But for 10- and 12-bit input, we use interpolation +// which means that we need to support indices of -256 and +256 as well. +// Therefore, the table is offset so that logical index 0 corresponds to +// error_measure_lut[256]. +const int error_measure_lut[513] = { + // pow 0.7 + 16384, 16384, 16339, 16294, 16249, 16204, 16158, 16113, + 16068, 16022, 15977, 15932, 15886, 15840, 15795, 15749, + 15703, 15657, 15612, 15566, 15520, 15474, 15427, 15381, + 15335, 15289, 15242, 15196, 15149, 15103, 15056, 15010, + 14963, 14916, 14869, 14822, 14775, 14728, 14681, 14634, + 14587, 14539, 14492, 14445, 14397, 14350, 14302, 14254, + 14206, 14159, 14111, 14063, 14015, 13967, 13918, 13870, + 13822, 13773, 13725, 13676, 13628, 13579, 13530, 13481, + 13432, 13383, 13334, 13285, 13236, 13187, 13137, 13088, + 13038, 12988, 12939, 12889, 12839, 12789, 12739, 12689, + 12639, 12588, 12538, 12487, 12437, 12386, 12335, 12285, + 12234, 12183, 12132, 12080, 12029, 11978, 11926, 11875, + 11823, 11771, 11719, 11667, 11615, 11563, 11511, 11458, + 11406, 11353, 11301, 11248, 11195, 11142, 11089, 11036, + 10982, 10929, 10875, 10822, 10768, 10714, 10660, 10606, + 10552, 10497, 10443, 10388, 10333, 10279, 10224, 10168, + 10113, 10058, 10002, 9947, 9891, 9835, 9779, 9723, + 9666, 9610, 9553, 9497, 9440, 9383, 9326, 9268, + 9211, 9153, 9095, 9037, 8979, 8921, 8862, 8804, + 8745, 8686, 8627, 8568, 8508, 8449, 8389, 8329, + 8269, 8208, 8148, 8087, 8026, 7965, 7903, 7842, + 7780, 7718, 7656, 7593, 7531, 7468, 7405, 7341, + 7278, 7214, 7150, 7086, 7021, 6956, 6891, 6826, + 6760, 6695, 6628, 6562, 6495, 6428, 6361, 6293, + 6225, 6157, 6089, 6020, 5950, 5881, 5811, 5741, + 5670, 5599, 5527, 5456, 5383, 5311, 5237, 5164, + 5090, 5015, 4941, 4865, 4789, 4713, 4636, 4558, + 4480, 4401, 4322, 4242, 4162, 4080, 3998, 3916, + 3832, 3748, 3663, 3577, 3490, 3402, 3314, 3224, + 3133, 3041, 2948, 2854, 2758, 2661, 2562, 2461, + 2359, 2255, 2148, 2040, 1929, 1815, 1698, 1577, + 1452, 1323, 1187, 1045, 894, 731, 550, 339, + 0, 339, 550, 731, 894, 1045, 1187, 1323, + 1452, 1577, 1698, 1815, 1929, 2040, 2148, 2255, + 2359, 2461, 2562, 2661, 2758, 2854, 2948, 3041, + 3133, 3224, 3314, 3402, 3490, 3577, 3663, 3748, + 3832, 3916, 3998, 4080, 4162, 4242, 4322, 4401, + 4480, 4558, 4636, 4713, 4789, 4865, 4941, 5015, + 5090, 5164, 5237, 5311, 5383, 5456, 5527, 5599, + 5670, 5741, 5811, 5881, 5950, 6020, 6089, 6157, + 6225, 6293, 6361, 6428, 6495, 6562, 6628, 6695, + 6760, 6826, 6891, 6956, 7021, 7086, 7150, 7214, + 7278, 7341, 7405, 7468, 7531, 7593, 7656, 7718, + 7780, 7842, 7903, 7965, 8026, 8087, 8148, 8208, + 8269, 8329, 8389, 8449, 8508, 8568, 8627, 
8686, + 8745, 8804, 8862, 8921, 8979, 9037, 9095, 9153, + 9211, 9268, 9326, 9383, 9440, 9497, 9553, 9610, + 9666, 9723, 9779, 9835, 9891, 9947, 10002, 10058, + 10113, 10168, 10224, 10279, 10333, 10388, 10443, 10497, + 10552, 10606, 10660, 10714, 10768, 10822, 10875, 10929, + 10982, 11036, 11089, 11142, 11195, 11248, 11301, 11353, + 11406, 11458, 11511, 11563, 11615, 11667, 11719, 11771, + 11823, 11875, 11926, 11978, 12029, 12080, 12132, 12183, + 12234, 12285, 12335, 12386, 12437, 12487, 12538, 12588, + 12639, 12689, 12739, 12789, 12839, 12889, 12939, 12988, + 13038, 13088, 13137, 13187, 13236, 13285, 13334, 13383, + 13432, 13481, 13530, 13579, 13628, 13676, 13725, 13773, + 13822, 13870, 13918, 13967, 14015, 14063, 14111, 14159, + 14206, 14254, 14302, 14350, 14397, 14445, 14492, 14539, + 14587, 14634, 14681, 14728, 14775, 14822, 14869, 14916, + 14963, 15010, 15056, 15103, 15149, 15196, 15242, 15289, + 15335, 15381, 15427, 15474, 15520, 15566, 15612, 15657, + 15703, 15749, 15795, 15840, 15886, 15932, 15977, 16022, + 16068, 16113, 16158, 16204, 16249, 16294, 16339, 16384, + 16384, +}; +/* clang-format on */ + +int av1_is_enough_erroradvantage(double best_erroradvantage, int params_cost) { + return best_erroradvantage < erroradv_tr && + best_erroradvantage * params_cost < erroradv_prod_tr; +} + +static void convert_to_params(const double *params, int32_t *model) { + int i; + model[0] = (int32_t)floor(params[0] * (1 << GM_TRANS_PREC_BITS) + 0.5); + model[1] = (int32_t)floor(params[1] * (1 << GM_TRANS_PREC_BITS) + 0.5); + model[0] = (int32_t)clamp(model[0], GM_TRANS_MIN, GM_TRANS_MAX) * + GM_TRANS_DECODE_FACTOR; + model[1] = (int32_t)clamp(model[1], GM_TRANS_MIN, GM_TRANS_MAX) * + GM_TRANS_DECODE_FACTOR; + + for (i = 2; i < 6; ++i) { + const int diag_value = ((i == 2 || i == 5) ? (1 << GM_ALPHA_PREC_BITS) : 0); + model[i] = (int32_t)floor(params[i] * (1 << GM_ALPHA_PREC_BITS) + 0.5); + model[i] = + (int32_t)clamp(model[i] - diag_value, GM_ALPHA_MIN, GM_ALPHA_MAX); + model[i] = (model[i] + diag_value) * GM_ALPHA_DECODE_FACTOR; + } +} + +void av1_convert_model_to_params(const double *params, + WarpedMotionParams *model) { + convert_to_params(params, model->wmmat); + model->wmtype = get_wmtype(model); + model->invalid = 0; +} + +// Adds some offset to a global motion parameter and handles +// all of the necessary precision shifts, clamping, and +// zero-centering. +static int32_t add_param_offset(int param_index, int32_t param_value, + int32_t offset) { + const int scale_vals[2] = { GM_TRANS_PREC_DIFF, GM_ALPHA_PREC_DIFF }; + const int clamp_vals[2] = { GM_TRANS_MAX, GM_ALPHA_MAX }; + // type of param: 0 - translation, 1 - affine + const int param_type = (param_index < 2 ? 
0 : 1); + const int is_one_centered = (param_index == 2 || param_index == 5); + + // Make parameter zero-centered and offset the shift that was done to make + // it compatible with the warped model + param_value = (param_value - (is_one_centered << WARPEDMODEL_PREC_BITS)) >> + scale_vals[param_type]; + // Add desired offset to the rescaled/zero-centered parameter + param_value += offset; + // Clamp the parameter so it does not overflow the number of bits allotted + // to it in the bitstream + param_value = (int32_t)clamp(param_value, -clamp_vals[param_type], + clamp_vals[param_type]); + // Rescale the parameter to WARPEDMODEL_PRECISION_BITS so it is compatible + // with the warped motion library + param_value *= (1 << scale_vals[param_type]); + + // Undo the zero-centering step if necessary + return param_value + (is_one_centered << WARPEDMODEL_PREC_BITS); +} + +static void force_wmtype(WarpedMotionParams *wm, TransformationType wmtype) { + switch (wmtype) { + case IDENTITY: + wm->wmmat[0] = 0; + wm->wmmat[1] = 0; + AOM_FALLTHROUGH_INTENDED; + case TRANSLATION: + wm->wmmat[2] = 1 << WARPEDMODEL_PREC_BITS; + wm->wmmat[3] = 0; + AOM_FALLTHROUGH_INTENDED; + case ROTZOOM: + wm->wmmat[4] = -wm->wmmat[3]; + wm->wmmat[5] = wm->wmmat[2]; + AOM_FALLTHROUGH_INTENDED; + case AFFINE: break; + default: assert(0); + } + wm->wmtype = wmtype; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE int generic_sad_highbd(const uint16_t *const ref, int ref_stride, + const uint16_t *const dst, int dst_stride, + int p_width, int p_height) { + // This function should only be called for patches smaller than + // WARP_ERROR_BLOCK x WARP_ERROR_BLOCK. This keeps the number of pixels + // small enough that we don't need a 64-bit accumulator + assert(p_width <= WARP_ERROR_BLOCK && p_height <= WARP_ERROR_BLOCK); + + int sad = 0; + for (int i = 0; i < p_height; ++i) { + for (int j = 0; j < p_width; ++j) { + sad += abs(dst[j + i * dst_stride] - ref[j + i * ref_stride]); + } + } + return sad; +} + +#if WARP_ERROR_BLOCK != 32 +#error "Need to change SAD call size in highbd_segmented_frame_error" +#endif // WARP_ERROR_BLOCK != 32 +static int64_t highbd_segmented_frame_error( + const uint16_t *const ref, int ref_stride, const uint16_t *const dst, + int dst_stride, int p_width, int p_height, int bd, uint8_t *segment_map, + int segment_map_stride) { + (void)bd; + int patch_w, patch_h; + const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK); + const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK); + int64_t sum_error = 0; + for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) { + for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) { + int seg_x = j >> WARP_ERROR_BLOCK_LOG; + int seg_y = i >> WARP_ERROR_BLOCK_LOG; + // Only compute the error if this block contains inliers from the motion + // model + if (!segment_map[seg_y * segment_map_stride + seg_x]) continue; + + // avoid computing error into the frame padding + patch_w = AOMMIN(error_bsize_w, p_width - j); + patch_h = AOMMIN(error_bsize_h, p_height - i); + + if (patch_w == WARP_ERROR_BLOCK && patch_h == WARP_ERROR_BLOCK) { + sum_error += aom_highbd_sad32x32( + CONVERT_TO_BYTEPTR(ref + j + i * ref_stride), ref_stride, + CONVERT_TO_BYTEPTR(dst + j + i * dst_stride), dst_stride); + } else { + sum_error += generic_sad_highbd(ref + j + i * ref_stride, ref_stride, + dst + j + i * dst_stride, dst_stride, + patch_w, patch_h); + } + } + } + return sum_error; +} + +#if WARP_ERROR_BLOCK != 32 +#error "Need to change SAD call size in highbd_warp_error" +#endif // 
WARP_ERROR_BLOCK != 32 +static int64_t highbd_warp_error(WarpedMotionParams *wm, + const uint16_t *const ref, int ref_width, + int ref_height, int ref_stride, + const uint16_t *const dst, int dst_stride, + int p_col, int p_row, int p_width, + int p_height, int subsampling_x, + int subsampling_y, int bd, int64_t best_error, + uint8_t *segment_map, int segment_map_stride) { + int64_t gm_sumerr = 0; + const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK); + const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK); + DECLARE_ALIGNED(32, uint16_t, tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK]); + + ConvolveParams conv_params = get_conv_params(0, 0, bd); + conv_params.use_dist_wtd_comp_avg = 0; + for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) { + for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) { + int seg_x = j >> WARP_ERROR_BLOCK_LOG; + int seg_y = i >> WARP_ERROR_BLOCK_LOG; + // Only compute the error if this block contains inliers from the motion + // model + if (!segment_map[seg_y * segment_map_stride + seg_x]) continue; + // avoid warping extra 8x8 blocks in the padded region of the frame + // when p_width and p_height are not multiples of WARP_ERROR_BLOCK + const int warp_w = AOMMIN(error_bsize_w, p_col + ref_width - j); + const int warp_h = AOMMIN(error_bsize_h, p_row + ref_height - i); + highbd_warp_plane(wm, ref, ref_width, ref_height, ref_stride, tmp, j, i, + warp_w, warp_h, WARP_ERROR_BLOCK, subsampling_x, + subsampling_y, bd, &conv_params); + + if (warp_w == WARP_ERROR_BLOCK && warp_h == WARP_ERROR_BLOCK) { + gm_sumerr += aom_highbd_sad32x32( + CONVERT_TO_BYTEPTR(tmp), WARP_ERROR_BLOCK, + CONVERT_TO_BYTEPTR(dst + j + i * dst_stride), dst_stride); + } else { + gm_sumerr += + generic_sad_highbd(tmp, WARP_ERROR_BLOCK, dst + j + i * dst_stride, + dst_stride, warp_w, warp_h); + } + + if (gm_sumerr > best_error) return INT64_MAX; + } + } + return gm_sumerr; +} +#endif + +static INLINE int generic_sad(const uint8_t *const ref, int ref_stride, + const uint8_t *const dst, int dst_stride, + int p_width, int p_height) { + // This function should only be called for patches smaller than + // WARP_ERROR_BLOCK x WARP_ERROR_BLOCK. 
This keeps the number of pixels + // small enough that we don't need a 64-bit accumulator + assert(p_width <= WARP_ERROR_BLOCK && p_height <= WARP_ERROR_BLOCK); + + int sad = 0; + for (int i = 0; i < p_height; ++i) { + for (int j = 0; j < p_width; ++j) { + sad += abs(dst[j + i * dst_stride] - ref[j + i * ref_stride]); + } + } + return sad; +} + +#if WARP_ERROR_BLOCK != 32 +#error "Need to change SAD call size in segmented_warp_error" +#endif // WARP_ERROR_BLOCK != 32 +static int64_t segmented_frame_error(const uint8_t *const ref, int ref_stride, + const uint8_t *const dst, int dst_stride, + int p_width, int p_height, + uint8_t *segment_map, + int segment_map_stride) { + int patch_w, patch_h; + const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK); + const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK); + int64_t sum_error = 0; + for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) { + for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) { + int seg_x = j >> WARP_ERROR_BLOCK_LOG; + int seg_y = i >> WARP_ERROR_BLOCK_LOG; + // Only compute the error if this block contains inliers from the motion + // model + if (!segment_map[seg_y * segment_map_stride + seg_x]) continue; + + // avoid computing error into the frame padding + patch_w = AOMMIN(error_bsize_w, p_width - j); + patch_h = AOMMIN(error_bsize_h, p_height - i); + + if (patch_w == WARP_ERROR_BLOCK && patch_h == WARP_ERROR_BLOCK) { + sum_error += aom_sad32x32(ref + j + i * ref_stride, ref_stride, + dst + j + i * dst_stride, dst_stride); + } else { + sum_error += + generic_sad(ref + j + i * ref_stride, ref_stride, + dst + j + i * dst_stride, dst_stride, patch_w, patch_h); + } + } + } + return sum_error; +} + +#if WARP_ERROR_BLOCK != 32 +#error "Need to change SAD call size in warp_error" +#endif // WARP_ERROR_BLOCK != 32 +static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref, + int ref_width, int ref_height, int ref_stride, + const uint8_t *const dst, int dst_stride, int p_col, + int p_row, int p_width, int p_height, + int subsampling_x, int subsampling_y, + int64_t best_error, uint8_t *segment_map, + int segment_map_stride) { + int64_t gm_sumerr = 0; + int warp_w, warp_h; + const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK); + const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK); + DECLARE_ALIGNED(16, uint8_t, tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK]); + ConvolveParams conv_params = get_conv_params(0, 0, 8); + conv_params.use_dist_wtd_comp_avg = 0; + + for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) { + for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) { + int seg_x = j >> WARP_ERROR_BLOCK_LOG; + int seg_y = i >> WARP_ERROR_BLOCK_LOG; + // Only compute the error if this block contains inliers from the motion + // model + if (!segment_map[seg_y * segment_map_stride + seg_x]) continue; + // avoid warping extra 8x8 blocks in the padded region of the frame + // when p_width and p_height are not multiples of WARP_ERROR_BLOCK + warp_w = AOMMIN(error_bsize_w, p_col + ref_width - j); + warp_h = AOMMIN(error_bsize_h, p_row + ref_height - i); + warp_plane(wm, ref, ref_width, ref_height, ref_stride, tmp, j, i, warp_w, + warp_h, WARP_ERROR_BLOCK, subsampling_x, subsampling_y, + &conv_params); + + if (warp_w == WARP_ERROR_BLOCK && warp_h == WARP_ERROR_BLOCK) { + gm_sumerr += aom_sad32x32(tmp, WARP_ERROR_BLOCK, + dst + j + i * dst_stride, dst_stride); + } else { + gm_sumerr += + generic_sad(tmp, WARP_ERROR_BLOCK, dst + j + i * dst_stride, + dst_stride, warp_w, warp_h); + 
} + + if (gm_sumerr > best_error) return INT64_MAX; + } + } + return gm_sumerr; +} + +int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref, + int ref_stride, uint8_t *dst, int dst_stride, + int p_width, int p_height, + uint8_t *segment_map, + int segment_map_stride) { +#if CONFIG_AV1_HIGHBITDEPTH + if (use_hbd) { + return highbd_segmented_frame_error( + CONVERT_TO_SHORTPTR(ref), ref_stride, CONVERT_TO_SHORTPTR(dst), + dst_stride, p_width, p_height, bd, segment_map, segment_map_stride); + } +#endif + (void)use_hbd; + (void)bd; + return segmented_frame_error(ref, ref_stride, dst, dst_stride, p_width, + p_height, segment_map, segment_map_stride); +} + +int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd, + const uint8_t *ref, int ref_width, int ref_height, + int ref_stride, uint8_t *dst, int dst_stride, int p_col, + int p_row, int p_width, int p_height, int subsampling_x, + int subsampling_y, int64_t best_error, + uint8_t *segment_map, int segment_map_stride) { + if (!av1_get_shear_params(wm)) return INT64_MAX; +#if CONFIG_AV1_HIGHBITDEPTH + if (use_hbd) + return highbd_warp_error(wm, CONVERT_TO_SHORTPTR(ref), ref_width, + ref_height, ref_stride, CONVERT_TO_SHORTPTR(dst), + dst_stride, p_col, p_row, p_width, p_height, + subsampling_x, subsampling_y, bd, best_error, + segment_map, segment_map_stride); +#endif + (void)use_hbd; + (void)bd; + return warp_error(wm, ref, ref_width, ref_height, ref_stride, dst, dst_stride, + p_col, p_row, p_width, p_height, subsampling_x, + subsampling_y, best_error, segment_map, segment_map_stride); +} + +int64_t av1_refine_integerized_param( + WarpedMotionParams *wm, TransformationType wmtype, int use_hbd, int bd, + uint8_t *ref, int r_width, int r_height, int r_stride, uint8_t *dst, + int d_width, int d_height, int d_stride, int n_refinements, + int64_t ref_frame_error, uint8_t *segment_map, int segment_map_stride) { + static const int max_trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6 }; + const int border = ERRORADV_BORDER; + int i = 0, p; + int n_params = max_trans_model_params[wmtype]; + int32_t *param_mat = wm->wmmat; + int64_t step_error, best_error; + int32_t step; + int32_t *param; + int32_t curr_param; + int32_t best_param; + + force_wmtype(wm, wmtype); + wm->wmtype = get_wmtype(wm); + + if (n_refinements == 0) { + // Compute the maximum error value that will be accepted, so that + // av1_warp_error can terminate early if it proves the model will not + // be accepted. + int64_t selection_threshold = (int64_t)lrint(ref_frame_error * erroradv_tr); + return av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride, + dst + border * d_stride + border, d_stride, border, + border, d_width - 2 * border, d_height - 2 * border, + 0, 0, selection_threshold, segment_map, + segment_map_stride); + } + + // When refining, use a slightly higher threshold for the initial error + // calculation - see comment above erroradv_early_tr for why. 
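// Illustrative sketch, not from the aom sources: the two thresholds at work,
// using erroradv_tr = 0.65 and erroradv_early_tr = 0.70 from global_motion.h
// and a made-up reference error of 100000. Refinement proceeds only if the
// initial warp error is at most 70000, while final acceptance in
// compute_global_motion_for_ref_frame() requires the refined ratio to stay
// below 0.65 (i.e. a warp error under 65000 here).
#if 0 /* editor's sketch only */
static long long early_threshold(long long ref_frame_error) {
  // Mirrors the lrint(ref_frame_error * erroradv_early_tr) computed below;
  // lrint() comes from <math.h>.
  return (long long)lrint((double)ref_frame_error * 0.70);
}
// early_threshold(100000) == 70000
#endif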
+ int64_t selection_threshold = + (int64_t)lrint(ref_frame_error * erroradv_early_tr); + best_error = + av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride, + dst + border * d_stride + border, d_stride, border, border, + d_width - 2 * border, d_height - 2 * border, 0, 0, + selection_threshold, segment_map, segment_map_stride); + + if (best_error > selection_threshold) { + return INT64_MAX; + } + + step = 1 << (n_refinements - 1); + for (i = 0; i < n_refinements; i++, step >>= 1) { + for (p = 0; p < n_params; ++p) { + int step_dir = 0; + param = param_mat + p; + curr_param = *param; + best_param = curr_param; + // look to the left + // Note: We have to use force_wmtype() to keep the proper symmetry for + // ROTZOOM type models + *param = add_param_offset(p, curr_param, -step); + force_wmtype(wm, wmtype); + step_error = + av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride, + dst + border * d_stride + border, d_stride, border, + border, d_width - 2 * border, d_height - 2 * border, 0, + 0, best_error, segment_map, segment_map_stride); + if (step_error < best_error) { + best_error = step_error; + best_param = *param; + step_dir = -1; + } + + // look to the right + *param = add_param_offset(p, curr_param, step); + force_wmtype(wm, wmtype); + step_error = + av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride, + dst + border * d_stride + border, d_stride, border, + border, d_width - 2 * border, d_height - 2 * border, 0, + 0, best_error, segment_map, segment_map_stride); + if (step_error < best_error) { + best_error = step_error; + best_param = *param; + step_dir = 1; + } + + // look to the direction chosen above repeatedly until error increases + // for the biggest step size + while (step_dir) { + *param = add_param_offset(p, best_param, step * step_dir); + force_wmtype(wm, wmtype); + step_error = + av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride, + dst + border * d_stride + border, d_stride, border, + border, d_width - 2 * border, d_height - 2 * border, + 0, 0, best_error, segment_map, segment_map_stride); + if (step_error < best_error) { + best_error = step_error; + best_param = *param; + } else { + step_dir = 0; + } + } + + // Restore best parameter value so far + *param = best_param; + force_wmtype(wm, wmtype); + } + } + + wm->wmtype = get_wmtype(wm); + return best_error; +} + +#define FEAT_COUNT_TR 3 +#define SEG_COUNT_TR 48 +void av1_compute_feature_segmentation_map(uint8_t *segment_map, int width, + int height, int *inliers, + int num_inliers) { + int seg_count = 0; + memset(segment_map, 0, sizeof(*segment_map) * width * height); + + for (int i = 0; i < num_inliers; i++) { + int x = inliers[i * 2]; + int y = inliers[i * 2 + 1]; + int seg_x = x >> WARP_ERROR_BLOCK_LOG; + int seg_y = y >> WARP_ERROR_BLOCK_LOG; + segment_map[seg_y * width + seg_x] += 1; + } + + for (int i = 0; i < height; i++) { + for (int j = 0; j < width; j++) { + uint8_t feat_count = segment_map[i * width + j]; + segment_map[i * width + j] = (feat_count >= FEAT_COUNT_TR); + seg_count += (segment_map[i * width + j]); + } + } + + // If this motion does not make up a large enough portion of the frame, + // use the unsegmented version of the error metric + if (seg_count < SEG_COUNT_TR) + memset(segment_map, 1, width * height * sizeof(*segment_map)); +} diff --git a/third_party/aom/av1/encoder/global_motion.h b/third_party/aom/av1/encoder/global_motion.h new file mode 100644 index 0000000000..8c9c60f0f5 --- /dev/null +++ b/third_party/aom/av1/encoder/global_motion.h @@ 
-0,0 +1,157 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_GLOBAL_MOTION_H_ +#define AOM_AV1_ENCODER_GLOBAL_MOTION_H_ + +#include "aom/aom_integer.h" +#include "aom_dsp/flow_estimation/flow_estimation.h" +#include "aom_scale/yv12config.h" +#include "aom_util/aom_thread.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define RANSAC_NUM_MOTIONS 1 +#define GM_MAX_REFINEMENT_STEPS 5 +#define MAX_DIRECTIONS 2 + +// The structure holds a valid reference frame type and its temporal distance +// from the source frame. +typedef struct { + int distance; + MV_REFERENCE_FRAME frame; +} FrameDistPair; + +typedef struct { + // Array of structure which holds the global motion parameters for a given + // motion model. motion_models[i] holds the parameters for a given motion + // model for the ith ransac motion. + MotionModel motion_models[RANSAC_NUM_MOTIONS]; + + // Pointer to hold inliers from motion model. + uint8_t *segment_map; +} GlobalMotionData; + +typedef struct { + // Holds the mapping of each thread to past/future direction. + // thread_id_to_dir[i] indicates the direction id (past - 0/future - 1) + // assigned to the ith thread. + int8_t thread_id_to_dir[MAX_NUM_THREADS]; + + // A flag which holds the early exit status based on the speed feature + // 'prune_ref_frame_for_gm_search'. early_exit[i] will be set if the speed + // feature based early exit happens in the direction 'i'. + int8_t early_exit[MAX_DIRECTIONS]; + + // Counter for the next reference frame to be processed. + // next_frame_to_process[i] will hold the count of next reference frame to be + // processed in the direction 'i'. + int8_t next_frame_to_process[MAX_DIRECTIONS]; +} JobInfo; + +typedef struct { + // Data related to assigning jobs for global motion multi-threading. + JobInfo job_info; + +#if CONFIG_MULTITHREAD + // Mutex lock used while dispatching jobs. + pthread_mutex_t *mutex_; +#endif + + // Initialized to false, set to true by the worker thread that encounters an + // error in order to abort the processing of other worker threads. 
+ bool gm_mt_exit; +} AV1GlobalMotionSync; + +void av1_convert_model_to_params(const double *params, + WarpedMotionParams *model); + +// Criteria for accepting a global motion model +static const double erroradv_tr = 0.65; +static const double erroradv_prod_tr = 20000; + +// Early exit threshold for global motion refinement +// This is set slightly higher than erroradv_tr, as a compromise between +// two factors: +// +// 1) By rejecting un-promising models early, we can reduce the encode time +// spent trying to refine them +// +// 2) When we refine a model, its error may decrease to below the acceptance +// threshold even if the model is initially above the threshold +static const double erroradv_early_tr = 0.70; + +int av1_is_enough_erroradvantage(double best_erroradvantage, int params_cost); + +void av1_compute_feature_segmentation_map(uint8_t *segment_map, int width, + int height, int *inliers, + int num_inliers); + +extern const int error_measure_lut[513]; + +static INLINE int error_measure(int err) { + return error_measure_lut[256 + err]; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE int highbd_error_measure(int err, int bd) { + const int b = bd - 8; + const int bmask = (1 << b) - 1; + const int v = (1 << b); + + // Split error into two parts and do an interpolated table lookup + // To compute the table index and interpolation value, we want to calculate + // the quotient and remainder of err / 2^b. But it is very important that + // the division must round down, and the remainder must be positive, + // ie. in the range [0, 2^b). + // + // In C, the >> and & operators do what we want, but the / and % operators + // give the wrong results for negative inputs. So we must use >> and & here. + // + // For example, if bd == 10 and err == -5, compare the results: + // (-5) >> 2 = -2, (-5) & 3 = 3 + // vs. (-5) / 4 = -1, (-5) % 4 = -1 + const int e1 = err >> b; + const int e2 = err & bmask; + return error_measure_lut[256 + e1] * (v - e2) + + error_measure_lut[257 + e1] * e2; +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref, + int ref_stride, uint8_t *dst, int dst_stride, + int p_width, int p_height, + uint8_t *segment_map, int segment_map_stride); + +// Returns the error between the result of applying motion 'wm' to the frame +// described by 'ref' and the frame described by 'dst'. +int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd, + const uint8_t *ref, int ref_width, int ref_height, + int ref_stride, uint8_t *dst, int dst_stride, int p_col, + int p_row, int p_width, int p_height, int subsampling_x, + int subsampling_y, int64_t best_error, + uint8_t *segment_map, int segment_map_stride); + +// Returns the av1_warp_error between "dst" and the result of applying the +// motion params that result from fine-tuning "wm" to "ref". Note that "wm" is +// modified in place. 
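// Illustrative sketch, not from the aom sources: the shift/mask split in
// highbd_error_measure() above, specialised to bd = 10 (so b = 2, v = 4).
// For err = -5: e1 = -5 >> 2 = -2 and e2 = -5 & 3 = 3, i.e. a floored
// quotient and a non-negative remainder, which keeps the interpolated
// table lookup valid for negative errors.
#if 0 /* editor's sketch only */
static int highbd_error_measure_bd10(int err) {
  const int e1 = err >> 2;  // floored division by 4, unlike err / 4
  const int e2 = err & 3;   // remainder in [0, 4), unlike err % 4
  return error_measure_lut[256 + e1] * (4 - e2) +
         error_measure_lut[257 + e1] * e2;
}
#endif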
+int64_t av1_refine_integerized_param( + WarpedMotionParams *wm, TransformationType wmtype, int use_hbd, int bd, + uint8_t *ref, int r_width, int r_height, int r_stride, uint8_t *dst, + int d_width, int d_height, int d_stride, int n_refinements, + int64_t ref_frame_error, uint8_t *segment_map, int segment_map_stride); + +#ifdef __cplusplus +} // extern "C" +#endif +#endif // AOM_AV1_ENCODER_GLOBAL_MOTION_H_ diff --git a/third_party/aom/av1/encoder/global_motion_facade.c b/third_party/aom/av1/encoder/global_motion_facade.c new file mode 100644 index 0000000000..02a4e70ed3 --- /dev/null +++ b/third_party/aom/av1/encoder/global_motion_facade.c @@ -0,0 +1,450 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/binary_codes_writer.h" + +#include "aom_dsp/flow_estimation/corner_detect.h" +#include "aom_dsp/flow_estimation/flow_estimation.h" +#include "aom_dsp/pyramid.h" +#include "av1/common/warped_motion.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/ethread.h" +#include "av1/encoder/rdopt.h" +#include "av1/encoder/global_motion_facade.h" + +// Range of model types to search +#define FIRST_GLOBAL_TRANS_TYPE ROTZOOM +#define LAST_GLOBAL_TRANS_TYPE ROTZOOM + +// Computes the cost for the warp parameters. +static int gm_get_params_cost(const WarpedMotionParams *gm, + const WarpedMotionParams *ref_gm, int allow_hp) { + int params_cost = 0; + int trans_bits, trans_prec_diff; + switch (gm->wmtype) { + case AFFINE: + case ROTZOOM: + params_cost += aom_count_signed_primitive_refsubexpfin( + GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS), + (gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS)); + params_cost += aom_count_signed_primitive_refsubexpfin( + GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_gm->wmmat[3] >> GM_ALPHA_PREC_DIFF), + (gm->wmmat[3] >> GM_ALPHA_PREC_DIFF)); + if (gm->wmtype >= AFFINE) { + params_cost += aom_count_signed_primitive_refsubexpfin( + GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_gm->wmmat[4] >> GM_ALPHA_PREC_DIFF), + (gm->wmmat[4] >> GM_ALPHA_PREC_DIFF)); + params_cost += aom_count_signed_primitive_refsubexpfin( + GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) - + (1 << GM_ALPHA_PREC_BITS), + (gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS)); + } + AOM_FALLTHROUGH_INTENDED; + case TRANSLATION: + trans_bits = (gm->wmtype == TRANSLATION) + ? GM_ABS_TRANS_ONLY_BITS - !allow_hp + : GM_ABS_TRANS_BITS; + trans_prec_diff = (gm->wmtype == TRANSLATION) + ? 
GM_TRANS_ONLY_PREC_DIFF + !allow_hp + : GM_TRANS_PREC_DIFF; + params_cost += aom_count_signed_primitive_refsubexpfin( + (1 << trans_bits) + 1, SUBEXPFIN_K, + (ref_gm->wmmat[0] >> trans_prec_diff), + (gm->wmmat[0] >> trans_prec_diff)); + params_cost += aom_count_signed_primitive_refsubexpfin( + (1 << trans_bits) + 1, SUBEXPFIN_K, + (ref_gm->wmmat[1] >> trans_prec_diff), + (gm->wmmat[1] >> trans_prec_diff)); + AOM_FALLTHROUGH_INTENDED; + case IDENTITY: break; + default: assert(0); + } + return (params_cost << AV1_PROB_COST_SHIFT); +} + +// For the given reference frame, computes the global motion parameters for +// different motion models and finds the best. +static AOM_INLINE void compute_global_motion_for_ref_frame( + AV1_COMP *cpi, struct aom_internal_error_info *error_info, + YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame, + MotionModel *motion_models, uint8_t *segment_map, const int segment_map_w, + const int segment_map_h, const WarpedMotionParams *ref_params) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + int src_width = cpi->source->y_crop_width; + int src_height = cpi->source->y_crop_height; + int src_stride = cpi->source->y_stride; + assert(ref_buf[frame] != NULL); + int bit_depth = cpi->common.seq_params->bit_depth; + GlobalMotionMethod global_motion_method = default_global_motion_method; + int num_refinements = cpi->sf.gm_sf.num_refinement_steps; + bool mem_alloc_failed = false; + + // Select the best model based on fractional error reduction. + // By initializing this to erroradv_tr, the same logic which is used to + // select the best model will automatically filter out any model which + // doesn't meet the required quality threshold + double best_erroradv = erroradv_tr; + for (TransformationType model = FIRST_GLOBAL_TRANS_TYPE; + model <= LAST_GLOBAL_TRANS_TYPE; ++model) { + if (!aom_compute_global_motion( + model, cpi->source, ref_buf[frame], bit_depth, global_motion_method, + motion_models, RANSAC_NUM_MOTIONS, &mem_alloc_failed)) { + if (mem_alloc_failed) { + aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate global motion buffers"); + } + continue; + } + + for (int i = 0; i < RANSAC_NUM_MOTIONS; ++i) { + if (motion_models[i].num_inliers == 0) continue; + + WarpedMotionParams tmp_wm_params; + av1_convert_model_to_params(motion_models[i].params, &tmp_wm_params); + + // Skip models that we won't use (IDENTITY or TRANSLATION) + // + // For IDENTITY type models, we don't need to evaluate anything because + // all the following logic is effectively comparing the estimated model + // to an identity model. + // + // For TRANSLATION type global motion models, gm_get_motion_vector() gives + // the wrong motion vector (see comments in that function for details). 
+ // As translation-type models do not give much gain, we can avoid this bug + // by never choosing a TRANSLATION type model + if (tmp_wm_params.wmtype <= TRANSLATION) continue; + + av1_compute_feature_segmentation_map( + segment_map, segment_map_w, segment_map_h, motion_models[i].inliers, + motion_models[i].num_inliers); + + int64_t ref_frame_error = av1_segmented_frame_error( + is_cur_buf_hbd(xd), xd->bd, ref_buf[frame]->y_buffer, + ref_buf[frame]->y_stride, cpi->source->y_buffer, src_stride, + src_width, src_height, segment_map, segment_map_w); + + if (ref_frame_error == 0) continue; + + const int64_t warp_error = av1_refine_integerized_param( + &tmp_wm_params, tmp_wm_params.wmtype, is_cur_buf_hbd(xd), xd->bd, + ref_buf[frame]->y_buffer, ref_buf[frame]->y_crop_width, + ref_buf[frame]->y_crop_height, ref_buf[frame]->y_stride, + cpi->source->y_buffer, src_width, src_height, src_stride, + num_refinements, ref_frame_error, segment_map, segment_map_w); + + // av1_refine_integerized_param() can return a simpler model type than + // its input, so re-check model type here + if (tmp_wm_params.wmtype <= TRANSLATION) continue; + + double erroradvantage = (double)warp_error / ref_frame_error; + + if (erroradvantage < best_erroradv) { + best_erroradv = erroradvantage; + // Save the wm_params modified by + // av1_refine_integerized_param() rather than motion index to + // avoid rerunning refine() below. + memcpy(&(cm->global_motion[frame]), &tmp_wm_params, + sizeof(WarpedMotionParams)); + } + } + } + + if (!av1_get_shear_params(&cm->global_motion[frame])) + cm->global_motion[frame] = default_warp_params; + +#if 0 + // We never choose translational models, so this code is disabled + if (cm->global_motion[frame].wmtype == TRANSLATION) { + cm->global_motion[frame].wmmat[0] = + convert_to_trans_prec(cm->features.allow_high_precision_mv, + cm->global_motion[frame].wmmat[0]) * + GM_TRANS_ONLY_DECODE_FACTOR; + cm->global_motion[frame].wmmat[1] = + convert_to_trans_prec(cm->features.allow_high_precision_mv, + cm->global_motion[frame].wmmat[1]) * + GM_TRANS_ONLY_DECODE_FACTOR; + } +#endif + + if (cm->global_motion[frame].wmtype == IDENTITY) return; + + // If the best error advantage found doesn't meet the threshold for + // this motion type, revert to IDENTITY. + if (!av1_is_enough_erroradvantage( + best_erroradv, + gm_get_params_cost(&cm->global_motion[frame], ref_params, + cm->features.allow_high_precision_mv))) { + cm->global_motion[frame] = default_warp_params; + } +} + +// Computes global motion for the given reference frame. +void av1_compute_gm_for_valid_ref_frames( + AV1_COMP *cpi, struct aom_internal_error_info *error_info, + YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame, + MotionModel *motion_models, uint8_t *segment_map, int segment_map_w, + int segment_map_h) { + AV1_COMMON *const cm = &cpi->common; + const WarpedMotionParams *ref_params = + cm->prev_frame ? &cm->prev_frame->global_motion[frame] + : &default_warp_params; + + compute_global_motion_for_ref_frame(cpi, error_info, ref_buf, frame, + motion_models, segment_map, segment_map_w, + segment_map_h, ref_params); +} + +// Loops over valid reference frames and computes global motion estimation. 
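// Illustrative sketch, not from the aom sources: the selection rule in
// compute_global_motion_for_ref_frame() above reduces to a ratio test. A
// model is kept only if warp_error / ref_frame_error beats every ratio seen
// so far, starting from erroradv_tr (0.65), so any model that fails the
// quality threshold is filtered out automatically.
#if 0 /* editor's sketch only */
static int model_improves(long long warp_error, long long ref_frame_error,
                          double best_erroradv /* starts at erroradv_tr */) {
  // ref_frame_error == 0 is skipped by the caller, so the division is safe.
  return (double)warp_error / (double)ref_frame_error < best_erroradv;
}
#endif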
+static AOM_INLINE void compute_global_motion_for_references( + AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], + FrameDistPair reference_frame[REF_FRAMES - 1], int num_ref_frames, + MotionModel *motion_models, uint8_t *segment_map, const int segment_map_w, + const int segment_map_h) { + AV1_COMMON *const cm = &cpi->common; + struct aom_internal_error_info *const error_info = + cpi->td.mb.e_mbd.error_info; + // Compute global motion w.r.t. reference frames starting from the nearest ref + // frame in a given direction. + for (int frame = 0; frame < num_ref_frames; frame++) { + int ref_frame = reference_frame[frame].frame; + av1_compute_gm_for_valid_ref_frames(cpi, error_info, ref_buf, ref_frame, + motion_models, segment_map, + segment_map_w, segment_map_h); + // If global motion w.r.t. current ref frame is + // INVALID/TRANSLATION/IDENTITY, skip the evaluation of global motion w.r.t + // the remaining ref frames in that direction. + if (cpi->sf.gm_sf.prune_ref_frame_for_gm_search && + cm->global_motion[ref_frame].wmtype <= TRANSLATION) + break; + } +} + +// Compares the distance in 'a' and 'b'. Returns 1 if the frame corresponding to +// 'a' is farther, -1 if the frame corresponding to 'b' is farther, 0 otherwise. +static int compare_distance(const void *a, const void *b) { + const int diff = + ((FrameDistPair *)a)->distance - ((FrameDistPair *)b)->distance; + if (diff > 0) + return 1; + else if (diff < 0) + return -1; + return 0; +} + +static int disable_gm_search_based_on_stats(const AV1_COMP *const cpi) { + int is_gm_present = 1; + + // Check number of GM models only in GF groups with ARF frames. GM param + // estimation is always done in the case of GF groups with no ARF frames (flat + // gops). + if (cpi->ppi->gf_group.arf_index > -1) { + // valid_gm_model_found is initialized to INT32_MAX in the beginning of + // every GF group. + // Therefore, GM param estimation is always done for all frames until + // at least 1 frame each of ARF_UPDATE, INTNL_ARF_UPDATE and LF_UPDATE are + // encoded in a GF group. For subsequent frames, GM param estimation is + // disabled if no valid models have been found in all three update + // types. + is_gm_present = (cpi->ppi->valid_gm_model_found[ARF_UPDATE] != 0) || + (cpi->ppi->valid_gm_model_found[INTNL_ARF_UPDATE] != 0) || + (cpi->ppi->valid_gm_model_found[LF_UPDATE] != 0); + } + return !is_gm_present; +} + +// Prunes reference frames for global motion estimation based on the speed +// feature 'gm_search_type'. +static int do_gm_search_logic(SPEED_FEATURES *const sf, int frame) { + (void)frame; + switch (sf->gm_sf.gm_search_type) { + case GM_FULL_SEARCH: return 1; + case GM_REDUCED_REF_SEARCH_SKIP_L2_L3: + return !(frame == LAST2_FRAME || frame == LAST3_FRAME); + case GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2: + return !(frame == LAST2_FRAME || frame == LAST3_FRAME || + (frame == ALTREF2_FRAME)); + case GM_SEARCH_CLOSEST_REFS_ONLY: return 1; + case GM_DISABLE_SEARCH: return 0; + default: assert(0); + } + return 1; +} + +// Populates valid reference frames in past/future directions in +// 'reference_frames' and their count in 'num_ref_frames'.
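// Illustrative sketch, not from the aom sources: compare_distance() above is
// written for qsort(), so sorting an array of FrameDistPair entries ascending
// by temporal distance looks like the hypothetical helper below; the real
// call sites are in setup_global_motion_info_params() further down.
#if 0 /* editor's sketch only */
#include <stdlib.h>
static void sort_refs_by_distance(FrameDistPair *refs, int n) {
  qsort(refs, n, sizeof(*refs), compare_distance);
  // refs[0] is now the temporally nearest reference in its direction.
}
#endif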
+static AOM_INLINE void update_valid_ref_frames_for_gm( + AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], + FrameDistPair reference_frames[MAX_DIRECTIONS][REF_FRAMES - 1], + int *num_ref_frames) { + AV1_COMMON *const cm = &cpi->common; + int *num_past_ref_frames = &num_ref_frames[0]; + int *num_future_ref_frames = &num_ref_frames[1]; + const GF_GROUP *gf_group = &cpi->ppi->gf_group; + int ref_pruning_enabled = is_frame_eligible_for_ref_pruning( + gf_group, cpi->sf.inter_sf.selective_ref_frame, 1, cpi->gf_frame_index); + int cur_frame_gm_disabled = 0; + int pyr_lvl = cm->cur_frame->pyramid_level; + + if (cpi->sf.gm_sf.disable_gm_search_based_on_stats) { + cur_frame_gm_disabled = disable_gm_search_based_on_stats(cpi); + } + + for (int frame = ALTREF_FRAME; frame >= LAST_FRAME; --frame) { + const MV_REFERENCE_FRAME ref_frame[2] = { frame, NONE_FRAME }; + RefCntBuffer *buf = get_ref_frame_buf(cm, frame); + const int ref_disabled = + !(cpi->ref_frame_flags & av1_ref_frame_flag_list[frame]); + ref_buf[frame] = NULL; + cm->global_motion[frame] = default_warp_params; + // Skip global motion estimation for invalid ref frames + if (buf == NULL || + (ref_disabled && cpi->sf.hl_sf.recode_loop != DISALLOW_RECODE)) { + continue; + } else { + ref_buf[frame] = &buf->buf; + } + + int prune_ref_frames = + ref_pruning_enabled && + prune_ref_by_selective_ref_frame(cpi, NULL, ref_frame, + cm->cur_frame->ref_display_order_hint); + int ref_pyr_lvl = buf->pyramid_level; + + if (ref_buf[frame]->y_crop_width == cpi->source->y_crop_width && + ref_buf[frame]->y_crop_height == cpi->source->y_crop_height && + do_gm_search_logic(&cpi->sf, frame) && !prune_ref_frames && + ref_pyr_lvl <= pyr_lvl && !cur_frame_gm_disabled) { + assert(ref_buf[frame] != NULL); + const int relative_frame_dist = av1_encoder_get_relative_dist( + buf->display_order_hint, cm->cur_frame->display_order_hint); + // Populate past and future ref frames. + // reference_frames[0][] indicates past direction and + // reference_frames[1][] indicates future direction. + if (relative_frame_dist == 0) { + // Skip global motion estimation for frames at the same nominal instant. + // This will generally be either a "real" frame coded against a + // temporal filtered version, or a higher spatial layer coded against + // a lower spatial layer. In either case, the optimal motion model will + // be IDENTITY, so we don't need to search explicitly. + } else if (relative_frame_dist < 0) { + reference_frames[0][*num_past_ref_frames].distance = + abs(relative_frame_dist); + reference_frames[0][*num_past_ref_frames].frame = frame; + (*num_past_ref_frames)++; + } else { + reference_frames[1][*num_future_ref_frames].distance = + abs(relative_frame_dist); + reference_frames[1][*num_future_ref_frames].frame = frame; + (*num_future_ref_frames)++; + } + } + } +} + +// Initializes parameters used for computing global motion. 
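// Illustrative sketch, not from the aom sources: in
// update_valid_ref_frames_for_gm() above, the sign of the relative distance
// picks the bucket. Negative means a past reference (reference_frames[0]),
// positive a future one (reference_frames[1]), and zero is skipped because
// the optimal model at the same nominal instant is IDENTITY.
#if 0 /* editor's sketch only */
static int direction_index(int relative_frame_dist) {
  // Returns -1 for "skip", 0 for the past direction, 1 for the future one.
  if (relative_frame_dist == 0) return -1;
  return relative_frame_dist < 0 ? 0 : 1;
}
#endif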
+static AOM_INLINE void setup_global_motion_info_params(AV1_COMP *cpi) { + GlobalMotionInfo *const gm_info = &cpi->gm_info; + YV12_BUFFER_CONFIG *source = cpi->source; + + gm_info->segment_map_w = + (source->y_crop_width + WARP_ERROR_BLOCK - 1) >> WARP_ERROR_BLOCK_LOG; + gm_info->segment_map_h = + (source->y_crop_height + WARP_ERROR_BLOCK - 1) >> WARP_ERROR_BLOCK_LOG; + + memset(gm_info->reference_frames, -1, + sizeof(gm_info->reference_frames[0][0]) * MAX_DIRECTIONS * + (REF_FRAMES - 1)); + av1_zero(gm_info->num_ref_frames); + + // Populate ref_buf for valid ref frames in global motion + update_valid_ref_frames_for_gm(cpi, gm_info->ref_buf, + gm_info->reference_frames, + gm_info->num_ref_frames); + + // Sort the past and future ref frames in the ascending order of their + // distance from the current frame. reference_frames[0] => past direction + // and reference_frames[1] => future direction. + qsort(gm_info->reference_frames[0], gm_info->num_ref_frames[0], + sizeof(gm_info->reference_frames[0][0]), compare_distance); + qsort(gm_info->reference_frames[1], gm_info->num_ref_frames[1], + sizeof(gm_info->reference_frames[1][0]), compare_distance); + + if (cpi->sf.gm_sf.gm_search_type == GM_SEARCH_CLOSEST_REFS_ONLY) { + // Filter down to the nearest two ref frames. + // Prefer one past and one future ref over two past refs, even if + // the second past ref is closer + if (gm_info->num_ref_frames[1] > 0) { + gm_info->num_ref_frames[0] = AOMMIN(gm_info->num_ref_frames[0], 1); + gm_info->num_ref_frames[1] = AOMMIN(gm_info->num_ref_frames[1], 1); + } else { + gm_info->num_ref_frames[0] = AOMMIN(gm_info->num_ref_frames[0], 2); + } + } +} + +// Computes global motion w.r.t. valid reference frames. +static AOM_INLINE void global_motion_estimation(AV1_COMP *cpi) { + GlobalMotionInfo *const gm_info = &cpi->gm_info; + GlobalMotionData *gm_data = &cpi->td.gm_data; + + // Compute global motion w.r.t. past reference frames and future reference + // frames + for (int dir = 0; dir < MAX_DIRECTIONS; dir++) { + if (gm_info->num_ref_frames[dir] > 0) + compute_global_motion_for_references( + cpi, gm_info->ref_buf, gm_info->reference_frames[dir], + gm_info->num_ref_frames[dir], gm_data->motion_models, + gm_data->segment_map, gm_info->segment_map_w, gm_info->segment_map_h); + } +} + +// Global motion estimation for the current frame is computed. This computation +// happens once per frame and the winning motion model parameters are stored in +// cm->cur_frame->global_motion. +void av1_compute_global_motion_facade(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + GlobalMotionInfo *const gm_info = &cpi->gm_info; + + if (cpi->oxcf.tool_cfg.enable_global_motion) { + if (cpi->gf_frame_index == 0) { + for (int i = 0; i < FRAME_UPDATE_TYPES; i++) { + cpi->ppi->valid_gm_model_found[i] = INT32_MAX; +#if CONFIG_FPMT_TEST + if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) + cpi->ppi->temp_valid_gm_model_found[i] = INT32_MAX; +#endif + } + } + } + + if (cpi->common.current_frame.frame_type == INTER_FRAME && cpi->source && + cpi->oxcf.tool_cfg.enable_global_motion && !gm_info->search_done && + cpi->sf.gm_sf.gm_search_type != GM_DISABLE_SEARCH) { + setup_global_motion_info_params(cpi); + // Terminate early if the total number of reference frames is zero.
+ if (cpi->gm_info.num_ref_frames[0] || cpi->gm_info.num_ref_frames[1]) { + gm_alloc_data(cpi, &cpi->td.gm_data); + if (cpi->mt_info.num_workers > 1) + av1_global_motion_estimation_mt(cpi); + else + global_motion_estimation(cpi); + gm_dealloc_data(&cpi->td.gm_data); + gm_info->search_done = 1; + } + } + memcpy(cm->cur_frame->global_motion, cm->global_motion, + sizeof(cm->cur_frame->global_motion)); +} diff --git a/third_party/aom/av1/encoder/global_motion_facade.h b/third_party/aom/av1/encoder/global_motion_facade.h new file mode 100644 index 0000000000..f13989aa25 --- /dev/null +++ b/third_party/aom/av1/encoder/global_motion_facade.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_GLOBAL_MOTION_FACADE_H_ +#define AOM_AV1_ENCODER_GLOBAL_MOTION_FACADE_H_ + +#ifdef __cplusplus +extern "C" { +#endif +struct yv12_buffer_config; +struct AV1_COMP; + +// Allocates memory for members of GlobalMotionData. +static AOM_INLINE void gm_alloc_data(AV1_COMP *cpi, GlobalMotionData *gm_data) { + AV1_COMMON *cm = &cpi->common; + GlobalMotionInfo *gm_info = &cpi->gm_info; + + CHECK_MEM_ERROR(cm, gm_data->segment_map, + aom_malloc(sizeof(*gm_data->segment_map) * + gm_info->segment_map_w * gm_info->segment_map_h)); + + av1_zero_array(gm_data->motion_models, RANSAC_NUM_MOTIONS); + for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) { + CHECK_MEM_ERROR(cm, gm_data->motion_models[m].inliers, + aom_malloc(sizeof(*gm_data->motion_models[m].inliers) * 2 * + MAX_CORNERS)); + } +} + +// Deallocates the memory allocated for members of GlobalMotionData. +static AOM_INLINE void gm_dealloc_data(GlobalMotionData *gm_data) { + aom_free(gm_data->segment_map); + gm_data->segment_map = NULL; + for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) { + aom_free(gm_data->motion_models[m].inliers); + gm_data->motion_models[m].inliers = NULL; + } +} + +void av1_compute_gm_for_valid_ref_frames( + AV1_COMP *cpi, struct aom_internal_error_info *error_info, + YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame, + MotionModel *motion_models, uint8_t *segment_map, int segment_map_w, + int segment_map_h); +void av1_compute_global_motion_facade(struct AV1_COMP *cpi); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_GLOBAL_MOTION_FACADE_H_ diff --git a/third_party/aom/av1/encoder/gop_structure.c b/third_party/aom/av1/encoder/gop_structure.c new file mode 100644 index 0000000000..5078098450 --- /dev/null +++ b/third_party/aom/av1/encoder/gop_structure.c @@ -0,0 +1,867 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <stdint.h> + +#include "av1/common/blockd.h" +#include "config/aom_config.h" +#include "config/aom_scale_rtcd.h" + +#include "aom/aom_codec.h" +#include "aom/aom_encoder.h" + +#include "av1/common/av1_common_int.h" + +#include "av1/encoder/encoder.h" +#include "av1/encoder/firstpass.h" +#include "av1/encoder/gop_structure.h" +#include "av1/encoder/pass2_strategy.h" + +// This function sets gf_group->frame_parallel_level for LF_UPDATE frames based +// on the value of parallel_frame_count. +static void set_frame_parallel_level(int *frame_parallel_level, + int *parallel_frame_count, + int max_parallel_frames) { + assert(*parallel_frame_count > 0); + // parallel_frame_count > 1 indicates subsequent frame(s) in the current + // parallel encode set. + *frame_parallel_level = 1 + (*parallel_frame_count > 1); + // Update the count of no. of parallel frames. + (*parallel_frame_count)++; + if (*parallel_frame_count > max_parallel_frames) *parallel_frame_count = 1; +} + +// This function sets gf_group->src_offset based on frame_parallel_level. +// Outputs are gf_group->src_offset and first_frame_index +static void set_src_offset(GF_GROUP *const gf_group, int *first_frame_index, + int cur_frame_idx, int frame_ind) { + if (gf_group->frame_parallel_level[frame_ind] > 0) { + if (gf_group->frame_parallel_level[frame_ind] == 1) { + *first_frame_index = cur_frame_idx; + } + + // Obtain the offset of the frame at frame_ind in the lookahead queue by + // subtracting the display order hint of the first frame in the parallel + // encoding set (at first_frame_index) from that of the current frame. + gf_group->src_offset[frame_ind] = + (cur_frame_idx + gf_group->arf_src_offset[frame_ind]) - + *first_frame_index; + } +} + +// Sets the GF_GROUP params for LF_UPDATE frames. +static AOM_INLINE void set_params_for_leaf_frames( + const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame, + const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info, + GF_GROUP *const gf_group, int *cur_frame_idx, int *frame_ind, + int *parallel_frame_count, int max_parallel_frames, + int do_frame_parallel_encode, int *first_frame_index, int *cur_disp_index, + int layer_depth, int start, int end) { + gf_group->update_type[*frame_ind] = LF_UPDATE; + gf_group->arf_src_offset[*frame_ind] = 0; + gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx; + gf_group->layer_depth[*frame_ind] = MAX_ARF_LAYERS; + gf_group->frame_type[*frame_ind] = INTER_FRAME; + gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE; + gf_group->max_layer_depth = AOMMAX(gf_group->max_layer_depth, layer_depth); + gf_group->display_idx[*frame_ind] = (*cur_disp_index); + gf_group->arf_boost[*frame_ind] = + av1_calc_arf_boost(twopass, twopass_frame, p_rc, frame_info, start, + end - start, 0, NULL, NULL, 0); + ++(*cur_disp_index); + + // Set the level of parallelism for the LF_UPDATE frame. + if (do_frame_parallel_encode) { + set_frame_parallel_level(&gf_group->frame_parallel_level[*frame_ind], + parallel_frame_count, max_parallel_frames); + // Set LF_UPDATE frames as non-reference frames. + gf_group->is_frame_non_ref[*frame_ind] = true; + } + set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind); + + ++(*frame_ind); + ++(*cur_frame_idx); +} + +// Sets the GF_GROUP params for INTNL_OVERLAY_UPDATE frames.
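// Illustrative sketch, not from the aom sources: a walk-through of
// set_frame_parallel_level() above with max_parallel_frames = 2. The first
// LF_UPDATE frame of a set gets level 1 (count goes 1 -> 2), the next gets
// level 2 (count goes 2 -> 3 and wraps back to 1), so successive leaf frames
// alternate between levels 1, 2, 1, 2, ...
#if 0 /* editor's sketch only */
static void demo_parallel_levels(void) {
  int count = 1, level;
  for (int f = 0; f < 4; ++f) {
    set_frame_parallel_level(&level, &count, /*max_parallel_frames=*/2);
    // f = 0 -> level 1, f = 1 -> level 2, f = 2 -> level 1, f = 3 -> level 2
  }
}
#endif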
+static AOM_INLINE void set_params_for_intnl_overlay_frames( + GF_GROUP *const gf_group, int *cur_frame_idx, int *frame_ind, + int *first_frame_index, int *cur_disp_index, int layer_depth) { + gf_group->update_type[*frame_ind] = INTNL_OVERLAY_UPDATE; + gf_group->arf_src_offset[*frame_ind] = 0; + gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx; + gf_group->layer_depth[*frame_ind] = layer_depth; + gf_group->frame_type[*frame_ind] = INTER_FRAME; + gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE; + gf_group->display_idx[*frame_ind] = (*cur_disp_index); + ++(*cur_disp_index); + + set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind); + ++(*frame_ind); + ++(*cur_frame_idx); +} + +// Sets the GF_GROUP params for INTNL_ARF_UPDATE frames. +static AOM_INLINE void set_params_for_internal_arfs( + const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame, + const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info, + GF_GROUP *const gf_group, int *cur_frame_idx, int *frame_ind, + int *parallel_frame_count, int max_parallel_frames, + int do_frame_parallel_encode, int *first_frame_index, int depth_thr, + int *cur_disp_idx, int layer_depth, int arf_src_offset, int offset, + int f_frames, int b_frames) { + gf_group->update_type[*frame_ind] = INTNL_ARF_UPDATE; + gf_group->arf_src_offset[*frame_ind] = arf_src_offset; + gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx; + gf_group->layer_depth[*frame_ind] = layer_depth; + gf_group->frame_type[*frame_ind] = INTER_FRAME; + gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE; + gf_group->display_idx[*frame_ind] = + (*cur_disp_idx) + gf_group->arf_src_offset[*frame_ind]; + gf_group->arf_boost[*frame_ind] = + av1_calc_arf_boost(twopass, twopass_frame, p_rc, frame_info, offset, + f_frames, b_frames, NULL, NULL, 0); + + if (do_frame_parallel_encode) { + if (depth_thr != INT_MAX) { + assert(depth_thr == 3 || depth_thr == 4); + assert(IMPLIES(depth_thr == 3, layer_depth == 4)); + assert(IMPLIES(depth_thr == 4, layer_depth == 5)); + // Set frame_parallel_level of the first frame in the given layer to 1. + if (gf_group->layer_depth[(*frame_ind) - 1] != layer_depth) { + gf_group->frame_parallel_level[*frame_ind] = 1; + } else { + // Set frame_parallel_level of the consecutive frame in the same given + // layer to 2. + assert(gf_group->frame_parallel_level[(*frame_ind) - 1] == 1); + gf_group->frame_parallel_level[*frame_ind] = 2; + // Store the display order hints of the past 2 INTNL_ARF_UPDATE + // frames which would not have been displayed at the time of the encode + // of current frame. + gf_group->skip_frame_refresh[*frame_ind][0] = + gf_group->display_idx[(*frame_ind) - 1]; + gf_group->skip_frame_refresh[*frame_ind][1] = + gf_group->display_idx[(*frame_ind) - 2]; + // Set the display_idx of frame_parallel_level 1 frame in + // gf_group->skip_frame_as_ref. + gf_group->skip_frame_as_ref[*frame_ind] = + gf_group->display_idx[(*frame_ind) - 1]; + } + } + // If max_parallel_frames is not exceeded and if the frame will not be + // temporally filtered, encode the next internal ARF frame in parallel. + if (*parallel_frame_count > 1 && + *parallel_frame_count <= max_parallel_frames) { + if (gf_group->arf_src_offset[*frame_ind] < TF_LOOKAHEAD_IDX_THR) + gf_group->frame_parallel_level[*frame_ind] = 2; + *parallel_frame_count = 1; + } + } + set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind); + ++(*frame_ind); +} + +// Set parameters for frames between 'start' and 'end' (excluding both). 
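// Illustrative sketch, not from the aom sources: both multi-layer builders
// below recurse on the midpoint m = (start + end - 1) / 2, coding the frame
// at m as an internal ARF, then the frames displayed before m, then those
// after it. The sketch shows this basic order (used by set_multi_layer_params
// later in this file, overlay frames omitted); set_multi_layer_params_for_fp
// additionally reorders same-layer ARFs once layer_depth >= depth_thr, which
// produces the 0-> 16-> 8-> 4-> 2-> 6-> ... order quoted in its comment.
#if 0 /* editor's sketch only */
#include <stdio.h>
static void visit_in_encode_order(int start, int end) {
  if (end - start < 3) {  // not enough frames for another pyramid level
    for (int f = start; f < end; ++f) printf("%d ", f);  // leaf frames
    return;
  }
  const int m = (start + end - 1) / 2;
  printf("%d ", m);                   // internal ARF at the midpoint
  visit_in_encode_order(start, m);    // frames displayed before the ARF
  visit_in_encode_order(m + 1, end);  // frames displayed after the ARF
}
// With the key frame 0 and the ARF 16 coded first, visit_in_encode_order(1, 16)
// prints the remaining frames as: 8 4 2 1 3 6 5 7 12 10 9 11 14 13 15.
#endif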
+static void set_multi_layer_params_for_fp( + const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame, + GF_GROUP *const gf_group, const PRIMARY_RATE_CONTROL *p_rc, + RATE_CONTROL *rc, FRAME_INFO *frame_info, int start, int end, + int *cur_frame_idx, int *frame_ind, int *parallel_frame_count, + int max_parallel_frames, int do_frame_parallel_encode, + int *first_frame_index, int depth_thr, int *cur_disp_idx, int layer_depth) { + const int num_frames_to_process = end - start; + + // Either we are at the last level of the pyramid, or we don't have enough + // frames between 'start' and 'end' to create one more level. + if (layer_depth > gf_group->max_layer_depth_allowed || + num_frames_to_process < 3) { + // Leaf nodes. + while (start < end) { + set_params_for_leaf_frames(twopass, twopass_frame, p_rc, frame_info, + gf_group, cur_frame_idx, frame_ind, + parallel_frame_count, max_parallel_frames, + do_frame_parallel_encode, first_frame_index, + cur_disp_idx, layer_depth, start, end); + ++start; + } + } else { + const int m = (start + end - 1) / 2; + + // Internal ARF. + int arf_src_offset = m - start; + set_params_for_internal_arfs( + twopass, twopass_frame, p_rc, frame_info, gf_group, cur_frame_idx, + frame_ind, parallel_frame_count, max_parallel_frames, + do_frame_parallel_encode, first_frame_index, INT_MAX, cur_disp_idx, + layer_depth, arf_src_offset, m, end - m, m - start); + + // If encode reordering is enabled, configure the multi-layers accordingly + // and return. For example, the encode order for gf-interval 16 after + // reordering would be 0-> 16-> 8-> 4-> 2-> 6-> 1-> 3-> 5-> 7-> 12-> 10-> + // 14-> 9-> 11-> 13-> 15. + if (layer_depth >= depth_thr) { + int m1 = (m + start - 1) / 2; + int m2 = (m + 1 + end) / 2; + int arf_src_offsets[2] = { m1 - start, m2 - start }; + // Parameters to compute arf_boost. + int offset[2] = { m1, m2 }; + int f_frames[2] = { m - m1, end - m2 }; + int b_frames[2] = { m1 - start, m2 - (m + 1) }; + + // Set GF_GROUP params for INTNL_ARF_UPDATE frames which are reordered. + for (int i = 0; i < 2; i++) { + set_params_for_internal_arfs( + twopass, twopass_frame, p_rc, frame_info, gf_group, cur_frame_idx, + frame_ind, parallel_frame_count, max_parallel_frames, + do_frame_parallel_encode, first_frame_index, depth_thr, + cur_disp_idx, layer_depth + 1, arf_src_offsets[i], offset[i], + f_frames[i], b_frames[i]); + } + + // Initialize the start and end indices to configure LF_UPDATE frames. + int start_idx[4] = { start, m1 + 1, m + 1, end - 1 }; + int end_idx[4] = { m1, m, m2, end }; + int layer_depth_for_intnl_overlay[4] = { layer_depth + 1, layer_depth, + layer_depth + 1, INVALID_IDX }; + + // Set GF_GROUP params for the rest of LF_UPDATE and INTNL_OVERLAY_UPDATE + // frames after reordering. + for (int i = 0; i < 4; i++) { + set_multi_layer_params_for_fp( + twopass, twopass_frame, gf_group, p_rc, rc, frame_info, + start_idx[i], end_idx[i], cur_frame_idx, frame_ind, + parallel_frame_count, max_parallel_frames, do_frame_parallel_encode, + first_frame_index, depth_thr, cur_disp_idx, layer_depth + 2); + if (layer_depth_for_intnl_overlay[i] != INVALID_IDX) + set_params_for_intnl_overlay_frames( + gf_group, cur_frame_idx, frame_ind, first_frame_index, + cur_disp_idx, layer_depth_for_intnl_overlay[i]); + } + return; + } + + // Frames displayed before this internal ARF.
+ set_multi_layer_params_for_fp( + twopass, twopass_frame, gf_group, p_rc, rc, frame_info, start, m, + cur_frame_idx, frame_ind, parallel_frame_count, max_parallel_frames, + do_frame_parallel_encode, first_frame_index, depth_thr, cur_disp_idx, + layer_depth + 1); + + // Overlay for internal ARF. + set_params_for_intnl_overlay_frames(gf_group, cur_frame_idx, frame_ind, + first_frame_index, cur_disp_idx, + layer_depth); + + // Frames displayed after this internal ARF. + set_multi_layer_params_for_fp( + twopass, twopass_frame, gf_group, p_rc, rc, frame_info, m + 1, end, + cur_frame_idx, frame_ind, parallel_frame_count, max_parallel_frames, + do_frame_parallel_encode, first_frame_index, depth_thr, cur_disp_idx, + layer_depth + 1); + } +} + +// Structure for bookkeeping start, end and display indices to configure +// INTNL_ARF_UPDATE frames. +typedef struct { + int start; + int end; + int display_index; +} FRAME_REORDER_INFO; + +// Updates the stats required to configure the GF_GROUP. +static AOM_INLINE void fill_arf_frame_stats(FRAME_REORDER_INFO *arf_frame_stats, + int arf_frame_index, + int display_idx, int start, + int end) { + arf_frame_stats[arf_frame_index].start = start; + arf_frame_stats[arf_frame_index].end = end; + arf_frame_stats[arf_frame_index].display_index = display_idx; +} + +// Sets GF_GROUP params for INTNL_ARF_UPDATE frames. Also populates +// doh_gf_index_map and arf_frame_stats. +static AOM_INLINE void set_params_for_internal_arfs_in_gf14( + GF_GROUP *const gf_group, FRAME_REORDER_INFO *arf_frame_stats, + int *cur_frame_idx, int *cur_disp_idx, int *frame_ind, + int *count_arf_frames, int *doh_gf_index_map, int start, int end, + int layer_depth, int layer_with_parallel_encodes) { + int index = (start + end - 1) / 2; + gf_group->update_type[*frame_ind] = INTNL_ARF_UPDATE; + gf_group->arf_src_offset[*frame_ind] = index - 1; + gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx; + gf_group->layer_depth[*frame_ind] = layer_depth; + gf_group->frame_type[*frame_ind] = INTER_FRAME; + gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE; + gf_group->display_idx[*frame_ind] = + (*cur_disp_idx) + gf_group->arf_src_offset[*frame_ind]; + + // Update the display index of the current frame with its gf index. + doh_gf_index_map[index] = *frame_ind; + if (layer_with_parallel_encodes) { + assert(layer_depth == 4); + // Set frame_parallel_level of the first frame in the given layer depth + // to 1. + if (gf_group->layer_depth[(*frame_ind) - 1] != layer_depth) { + gf_group->frame_parallel_level[*frame_ind] = 1; + } else { + // Set frame_parallel_level of the consecutive frame in the same given + // layer depth to 2. + assert(gf_group->frame_parallel_level[(*frame_ind) - 1] == 1); + gf_group->frame_parallel_level[*frame_ind] = 2; + // Set the display_idx of frame_parallel_level 1 frame in + // gf_group->skip_frame_as_ref. + gf_group->skip_frame_as_ref[*frame_ind] = + gf_group->display_idx[(*frame_ind) - 1]; + } + } + ++(*frame_ind); + + // Update arf_frame_stats. + fill_arf_frame_stats(arf_frame_stats, *count_arf_frames, index, start, end); + ++(*count_arf_frames); +} + +// Sets GF_GROUP params for all INTNL_ARF_UPDATE frames in the given layer +// depth.
+static AOM_INLINE void set_params_for_cur_layer_frames( + GF_GROUP *const gf_group, FRAME_REORDER_INFO *arf_frame_stats, + int *cur_frame_idx, int *cur_disp_idx, int *frame_ind, + int *count_arf_frames, int *doh_gf_index_map, int num_dir, int node_start, + int node_end, int layer_depth) { + assert(num_dir < 3); + int start, end; + // Iterate through the nodes in the previous layer depth. + for (int i = node_start; i < node_end; i++) { + // For each node, check if a frame can be coded as INTNL_ARF_UPDATE frame in + // either direction. + for (int dir = 0; dir < num_dir; dir++) { + // Checks for a frame to the left of current node. + if (dir == 0) { + start = arf_frame_stats[i].start; + end = arf_frame_stats[i].display_index; + } else { + // Checks for a frame to the right of current node. + start = arf_frame_stats[i].display_index + 1; + end = arf_frame_stats[i].end; + } + const int num_frames_to_process = end - start; + // Checks if a frame can be coded as INTNL_ARF_UPDATE frame. If + // num_frames_to_process is less than 3, then there are not enough frames + // between 'start' and 'end' to create another level. + if (num_frames_to_process >= 3) { + // Flag to indicate the lower layer depths for which parallel encoding + // is enabled. Currently enabled for layer 4 frames. + int layer_with_parallel_encodes = layer_depth == 4; + set_params_for_internal_arfs_in_gf14( + gf_group, arf_frame_stats, cur_frame_idx, cur_disp_idx, frame_ind, + count_arf_frames, doh_gf_index_map, start, end, layer_depth, + layer_with_parallel_encodes); + } + } + } +} + +// Configures multi-layers of the GF_GROUP when consecutive encode of frames in +// the same layer depth is enabled. +static AOM_INLINE void set_multi_layer_params_for_gf14( + const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame, + const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info, + GF_GROUP *const gf_group, FRAME_REORDER_INFO *arf_frame_stats, + int *cur_frame_idx, int *frame_ind, int *count_arf_frames, + int *doh_gf_index_map, int *parallel_frame_count, int *first_frame_index, + int *cur_disp_index, int gf_interval, int layer_depth, + int max_parallel_frames) { + assert(layer_depth == 2); + assert(gf_group->max_layer_depth_allowed >= 4); + int layer, node_start, node_end = 0; + // Maximum layer depth excluding LF_UPDATE frames is 4 since applicable only + // for gf-interval 14. + const int max_layer_depth = 4; + // Iterate through each layer depth starting from 2 till 'max_layer_depth'. + for (layer = layer_depth; layer <= max_layer_depth; layer++) { + // 'node_start' and 'node_end' indicate the number of nodes from the + // previous layer depth to be considered. They also correspond to the indices + // of arf_frame_stats. + node_start = node_end; + node_end = (*count_arf_frames); + // 'num_dir' indicates the number of directions to traverse w.r.t. a given + // node in order to choose an INTNL_ARF_UPDATE frame. Layer depth 2 would + // have only one frame and hence needs to traverse only in the left + // direction w.r.t the node in the previous layer. + int num_dir = layer == 2 ? 1 : 2; + set_params_for_cur_layer_frames(gf_group, arf_frame_stats, cur_frame_idx, + cur_disp_index, frame_ind, count_arf_frames, + doh_gf_index_map, num_dir, node_start, + node_end, layer); + } + + for (int i = 1; i < gf_interval; i++) { + // Since doh_gf_index_map is already populated for all INTNL_ARF_UPDATE + // frames in the GF_GROUP, any frame with INVALID_IDX would correspond to an + // LF_UPDATE frame.
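// Illustrative sketch, not from the aom sources: doh_gf_index_map is a
// display-order to gf-index lookup that is only filled in for
// INTNL_ARF_UPDATE frames, so the loop below can recover each overlay's
// layer depth from its ARF and treat every unfilled slot as a leaf frame.
#if 0 /* editor's sketch only */
static int is_leaf_display_slot(const int *doh_gf_index_map, int disp_idx) {
  // INVALID_IDX means no internal ARF was mapped to this display position,
  // so the frame at disp_idx must be an LF_UPDATE (leaf) frame.
  return doh_gf_index_map[disp_idx] == INVALID_IDX;
}
#endif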
+ if (doh_gf_index_map[i] == INVALID_IDX) { + // LF_UPDATE frames. + // TODO(Remya): Correct start and end parameters passed to + // set_params_for_leaf_frames() once encode reordering for gf-interval 14 + // is enabled for parallel encode of lower layer frames. + set_params_for_leaf_frames( + twopass, twopass_frame, p_rc, frame_info, gf_group, cur_frame_idx, + frame_ind, parallel_frame_count, max_parallel_frames, 1, + first_frame_index, cur_disp_index, layer, 0, 0); + } else { + // In order to obtain the layer depths of INTNL_OVERLAY_UPDATE frames, get + // the gf index of corresponding INTNL_ARF_UPDATE frames. + int intnl_arf_index = doh_gf_index_map[i]; + int ld = gf_group->layer_depth[intnl_arf_index]; + set_params_for_intnl_overlay_frames(gf_group, cur_frame_idx, frame_ind, + first_frame_index, cur_disp_index, + ld); + } + } +} + +// Set parameters for frames between 'start' and 'end' (excluding both). +static void set_multi_layer_params( + const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame, + GF_GROUP *const gf_group, const PRIMARY_RATE_CONTROL *p_rc, + RATE_CONTROL *rc, FRAME_INFO *frame_info, int start, int end, + int *cur_frame_idx, int *frame_ind, int *parallel_frame_count, + int max_parallel_frames, int do_frame_parallel_encode, + int *first_frame_index, int *cur_disp_idx, int layer_depth) { + const int num_frames_to_process = end - start; + + // Either we are at the last level of the pyramid, or we don't have enough + // frames between 'start' and 'end' to create one more level. + if (layer_depth > gf_group->max_layer_depth_allowed || + num_frames_to_process < 3) { + // Leaf nodes. + while (start < end) { + gf_group->update_type[*frame_ind] = LF_UPDATE; + gf_group->arf_src_offset[*frame_ind] = 0; + gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx; + gf_group->display_idx[*frame_ind] = *cur_disp_idx; + gf_group->layer_depth[*frame_ind] = MAX_ARF_LAYERS; + gf_group->arf_boost[*frame_ind] = + av1_calc_arf_boost(twopass, twopass_frame, p_rc, frame_info, start, + end - start, 0, NULL, NULL, 0); + gf_group->frame_type[*frame_ind] = INTER_FRAME; + gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE; + gf_group->max_layer_depth = + AOMMAX(gf_group->max_layer_depth, layer_depth); + // Set the level of parallelism for the LF_UPDATE frame. + if (do_frame_parallel_encode) { + set_frame_parallel_level(&gf_group->frame_parallel_level[*frame_ind], + parallel_frame_count, max_parallel_frames); + // Set LF_UPDATE frames as non-reference frames. + gf_group->is_frame_non_ref[*frame_ind] = true; + } + set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind); + ++(*frame_ind); + ++(*cur_frame_idx); + ++(*cur_disp_idx); + ++start; + } + } else { + const int m = (start + end - 1) / 2; + + // Internal ARF. + gf_group->update_type[*frame_ind] = INTNL_ARF_UPDATE; + gf_group->arf_src_offset[*frame_ind] = m - start; + gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx; + gf_group->display_idx[*frame_ind] = + *cur_disp_idx + gf_group->arf_src_offset[*frame_ind]; + gf_group->layer_depth[*frame_ind] = layer_depth; + gf_group->frame_type[*frame_ind] = INTER_FRAME; + gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE; + + if (do_frame_parallel_encode) { + // If max_parallel_frames is not exceeded and if the frame will not be + // temporally filtered, encode the next internal ARF frame in parallel.
+ if (*parallel_frame_count > 1 && + *parallel_frame_count <= max_parallel_frames) { + if (gf_group->arf_src_offset[*frame_ind] < TF_LOOKAHEAD_IDX_THR) + gf_group->frame_parallel_level[*frame_ind] = 2; + *parallel_frame_count = 1; + } + } + set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind); + + // Get the boost factor for intermediate ARF frames. + gf_group->arf_boost[*frame_ind] = + av1_calc_arf_boost(twopass, twopass_frame, p_rc, frame_info, m, end - m, + m - start, NULL, NULL, 0); + ++(*frame_ind); + + // Frames displayed before this internal ARF. + set_multi_layer_params(twopass, twopass_frame, gf_group, p_rc, rc, + frame_info, start, m, cur_frame_idx, frame_ind, + parallel_frame_count, max_parallel_frames, + do_frame_parallel_encode, first_frame_index, + cur_disp_idx, layer_depth + 1); + + // Overlay for internal ARF. + gf_group->update_type[*frame_ind] = INTNL_OVERLAY_UPDATE; + gf_group->arf_src_offset[*frame_ind] = 0; + gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx; + gf_group->display_idx[*frame_ind] = *cur_disp_idx; + gf_group->arf_boost[*frame_ind] = 0; + gf_group->layer_depth[*frame_ind] = layer_depth; + gf_group->frame_type[*frame_ind] = INTER_FRAME; + gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE; + + set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind); + ++(*frame_ind); + ++(*cur_frame_idx); + ++(*cur_disp_idx); + + // Frames displayed after this internal ARF. + set_multi_layer_params(twopass, twopass_frame, gf_group, p_rc, rc, + frame_info, m + 1, end, cur_frame_idx, frame_ind, + parallel_frame_count, max_parallel_frames, + do_frame_parallel_encode, first_frame_index, + cur_disp_idx, layer_depth + 1); + } +} + +static int construct_multi_layer_gf_structure( + AV1_COMP *cpi, TWO_PASS *twopass, GF_GROUP *const gf_group, + RATE_CONTROL *rc, FRAME_INFO *const frame_info, int baseline_gf_interval, + FRAME_UPDATE_TYPE first_frame_update_type) { + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + // TODO(angiebird): Why do we need "-1" here? + const int gf_interval = baseline_gf_interval - 1; + int frame_index = 0; + int cur_frame_index = 0; + + // Set the display order hint for the first frame in the GF_GROUP. + int cur_disp_index = (first_frame_update_type == KF_UPDATE) + ? 0 + : cpi->common.current_frame.frame_number; + + // Initialize gf_group->frame_parallel_level, gf_group->is_frame_non_ref, + // gf_group->src_offset and gf_group->is_frame_dropped with 0. + memset(gf_group->frame_parallel_level, 0, + sizeof(gf_group->frame_parallel_level)); + memset(gf_group->is_frame_non_ref, 0, sizeof(gf_group->is_frame_non_ref)); + memset(gf_group->src_offset, 0, sizeof(gf_group->src_offset)); + memset(gf_group->is_frame_dropped, 0, sizeof(gf_group->is_frame_dropped)); + // Initialize gf_group->skip_frame_refresh and gf_group->skip_frame_as_ref + // with INVALID_IDX. + memset(gf_group->skip_frame_refresh, INVALID_IDX, + sizeof(gf_group->skip_frame_refresh)); + memset(gf_group->skip_frame_as_ref, INVALID_IDX, + sizeof(gf_group->skip_frame_as_ref)); + + int kf_decomp = cpi->oxcf.kf_cfg.enable_keyframe_filtering > 1; + // This is a patch that fixes https://crbug.com/aomedia/3163 + // enable_keyframe_filtering > 1 will introduce an extra overlay frame at + // key frame location. However when + // baseline_gf_interval == MAX_STATIC_GF_GROUP_LENGTH, we can't + // afford to have an extra overlay frame. Otherwise, the gf_group->size will + // become MAX_STATIC_GF_GROUP_LENGTH + 1, which causes memory error. 
+  // A cheap solution is to turn off kf_decomp here.
+  // TODO(angiebird): Find a systematic way to solve this issue.
+  if (baseline_gf_interval == MAX_STATIC_GF_GROUP_LENGTH) {
+    kf_decomp = 0;
+  }
+  if (first_frame_update_type == KF_UPDATE) {
+    gf_group->update_type[frame_index] = kf_decomp ? ARF_UPDATE : KF_UPDATE;
+    gf_group->arf_src_offset[frame_index] = 0;
+    gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+    gf_group->layer_depth[frame_index] = 0;
+    gf_group->frame_type[frame_index] = KEY_FRAME;
+    gf_group->refbuf_state[frame_index] = REFBUF_RESET;
+    gf_group->max_layer_depth = 0;
+    gf_group->display_idx[frame_index] = cur_disp_index;
+    if (!kf_decomp) cur_disp_index++;
+    ++frame_index;
+
+    if (kf_decomp) {
+      gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+      gf_group->arf_src_offset[frame_index] = 0;
+      gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+      gf_group->layer_depth[frame_index] = 0;
+      gf_group->frame_type[frame_index] = INTER_FRAME;
+      gf_group->refbuf_state[frame_index] = REFBUF_UPDATE;
+      gf_group->max_layer_depth = 0;
+      gf_group->display_idx[frame_index] = cur_disp_index;
+      cur_disp_index++;
+      ++frame_index;
+    }
+    cur_frame_index++;
+  }
+
+  if (first_frame_update_type == GF_UPDATE) {
+    gf_group->update_type[frame_index] = GF_UPDATE;
+    gf_group->arf_src_offset[frame_index] = 0;
+    gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+    gf_group->layer_depth[frame_index] = 0;
+    gf_group->frame_type[frame_index] = INTER_FRAME;
+    gf_group->refbuf_state[frame_index] = REFBUF_UPDATE;
+    gf_group->max_layer_depth = 0;
+    gf_group->display_idx[frame_index] = cur_disp_index;
+    cur_disp_index++;
+    ++frame_index;
+    ++cur_frame_index;
+  }
+
+  // ALTREF.
+  const int use_altref = gf_group->max_layer_depth_allowed > 0;
+  int is_fwd_kf = rc->frames_to_fwd_kf == gf_interval;
+
+  if (use_altref) {
+    gf_group->update_type[frame_index] = ARF_UPDATE;
+    gf_group->arf_src_offset[frame_index] = gf_interval - cur_frame_index;
+    gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+    gf_group->layer_depth[frame_index] = 1;
+    gf_group->arf_boost[frame_index] = cpi->ppi->p_rc.gfu_boost;
+    gf_group->frame_type[frame_index] = is_fwd_kf ? KEY_FRAME : INTER_FRAME;
+    gf_group->refbuf_state[frame_index] = REFBUF_UPDATE;
+    gf_group->max_layer_depth = 1;
+    gf_group->arf_index = frame_index;
+    gf_group->display_idx[frame_index] =
+        cur_disp_index + gf_group->arf_src_offset[frame_index];
+    ++frame_index;
+  } else {
+    gf_group->arf_index = -1;
+  }
+
+  // Flag to indicate if multi-layer configuration is complete.
+  int is_multi_layer_configured = 0;
+
+  // Running count of the number of frames that are part of a given parallel
+  // encode set in a gf_group. A value of 1 indicates no parallel encode.
+  int parallel_frame_count = 1;
+  // Enable parallel encode of frames if gf_group has a multi-layer pyramid
+  // structure with a minimum of 4 layers.
+  int do_frame_parallel_encode = (cpi->ppi->num_fp_contexts > 1 && use_altref &&
+                                  gf_group->max_layer_depth_allowed >= 4);
+
+  int first_frame_index = cur_frame_index;
+  if (do_frame_parallel_encode) {
+    // construct_multi_layer_gf_structure() takes the input parameter
+    // 'gf_interval' as p_rc->baseline_gf_interval - 1. The code below computes
+    // the actual GF_GROUP length by compensating for this offset.
+    int actual_gf_length = ((first_frame_update_type == KF_UPDATE) ||
+                            (first_frame_update_type == GF_UPDATE))
+                               ? gf_interval
+                               : gf_interval + 1;
+
+    // In order to facilitate parallel encoding of frames in lower layer depths,
+    // encode reordering is done.
Currently encode reordering is enabled only + // for gf-intervals 16 and 32. NOTE: Since the buffer holding the + // reference frames is of size 8 (ref_frame_map[REF_FRAMES]), there is a + // limitation on the number of hidden frames possible at any given point and + // hence the reordering is enabled only for gf-intervals 16 and 32. + // Disabling encode reordering for gf-interval 14 since some cross-frame + // dependencies related to temporal filtering for FPMT is currently not + // handled. + int disable_gf14_reorder = 1; + if (actual_gf_length == 14 && !disable_gf14_reorder) { + // This array holds the gf index of INTNL_ARF_UPDATE frames in the slot + // corresponding to their display order hint. This is used while + // configuring the LF_UPDATE frames and INTNL_OVERLAY_UPDATE frames. + int doh_gf_index_map[FIXED_GF_INTERVAL]; + // Initialize doh_gf_index_map with INVALID_IDX. + memset(&doh_gf_index_map[0], INVALID_IDX, + (sizeof(doh_gf_index_map[0]) * FIXED_GF_INTERVAL)); + + FRAME_REORDER_INFO arf_frame_stats[REF_FRAMES - 1]; + // Store the stats corresponding to layer 1 frame. + fill_arf_frame_stats(arf_frame_stats, 0, actual_gf_length, 1, + actual_gf_length); + int count_arf_frames = 1; + + // Sets multi-layer params for gf-interval 14 to consecutively encode + // frames in the same layer depth, i.e., encode order would be 0-> 14-> + // 7-> 3-> 10-> 5-> 12-> 1-> 2-> 4-> 6-> 8-> 9-> 11-> 13. + // TODO(Remya): Set GF_GROUP param 'arf_boost' for all frames. + set_multi_layer_params_for_gf14( + twopass, &cpi->twopass_frame, p_rc, frame_info, gf_group, + arf_frame_stats, &cur_frame_index, &frame_index, &count_arf_frames, + doh_gf_index_map, ¶llel_frame_count, &first_frame_index, + &cur_disp_index, actual_gf_length, use_altref + 1, + cpi->ppi->num_fp_contexts); + + // Set gf_group->skip_frame_refresh. + for (int i = 0; i < actual_gf_length; i++) { + int count = 0; + if (gf_group->update_type[i] == INTNL_ARF_UPDATE) { + for (int j = 0; j < i; j++) { + // Store the display order hint of the frames which would not + // have been displayed at the encode call of frame 'i'. + if ((gf_group->display_idx[j] < gf_group->display_idx[i]) && + gf_group->update_type[j] == INTNL_ARF_UPDATE) { + gf_group->skip_frame_refresh[i][count++] = + gf_group->display_idx[j]; + } + } + } + } + } else { + // Set layer depth threshold for reordering as per the gf length. + int depth_thr = (actual_gf_length == 16) ? 3 + : (actual_gf_length == 32) ? 4 + : INT_MAX; + + set_multi_layer_params_for_fp( + twopass, &cpi->twopass_frame, gf_group, p_rc, rc, frame_info, + cur_frame_index, gf_interval, &cur_frame_index, &frame_index, + ¶llel_frame_count, cpi->ppi->num_fp_contexts, + do_frame_parallel_encode, &first_frame_index, depth_thr, + &cur_disp_index, use_altref + 1); + } + is_multi_layer_configured = 1; + } + + // Rest of the frames. 
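+  // (Taken only when the frame-parallel configuration above did not run:
+  // lay out the remaining frames as a recursive pyramid via
+  // set_multi_layer_params().)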
+ if (!is_multi_layer_configured) + set_multi_layer_params(twopass, &cpi->twopass_frame, gf_group, p_rc, rc, + frame_info, cur_frame_index, gf_interval, + &cur_frame_index, &frame_index, + ¶llel_frame_count, cpi->ppi->num_fp_contexts, + do_frame_parallel_encode, &first_frame_index, + &cur_disp_index, use_altref + 1); + + if (use_altref) { + gf_group->update_type[frame_index] = OVERLAY_UPDATE; + gf_group->arf_src_offset[frame_index] = 0; + gf_group->cur_frame_idx[frame_index] = cur_frame_index; + gf_group->layer_depth[frame_index] = MAX_ARF_LAYERS; + gf_group->arf_boost[frame_index] = NORMAL_BOOST; + gf_group->frame_type[frame_index] = INTER_FRAME; + gf_group->refbuf_state[frame_index] = + is_fwd_kf ? REFBUF_RESET : REFBUF_UPDATE; + gf_group->display_idx[frame_index] = cur_disp_index; + ++frame_index; + } else { + for (; cur_frame_index <= gf_interval; ++cur_frame_index) { + gf_group->update_type[frame_index] = LF_UPDATE; + gf_group->arf_src_offset[frame_index] = 0; + gf_group->cur_frame_idx[frame_index] = cur_frame_index; + gf_group->layer_depth[frame_index] = MAX_ARF_LAYERS; + gf_group->arf_boost[frame_index] = NORMAL_BOOST; + gf_group->frame_type[frame_index] = INTER_FRAME; + gf_group->refbuf_state[frame_index] = REFBUF_UPDATE; + gf_group->max_layer_depth = AOMMAX(gf_group->max_layer_depth, 2); + set_src_offset(gf_group, &first_frame_index, cur_frame_index, + frame_index); + gf_group->display_idx[frame_index] = cur_disp_index; + cur_disp_index++; + ++frame_index; + } + } + if (do_frame_parallel_encode) { + // Iterate through the gf_group and reset frame_parallel_level to 0 in case + // a frame is marked as frame_parallel_level 1 with no subsequent + // frame_parallel_level 2 frame(s). + int level1_frame_idx = INT_MAX; + int level2_frame_count = 0; + for (int frame_idx = 0; frame_idx < frame_index; frame_idx++) { + if (gf_group->frame_parallel_level[frame_idx] == 1) { + // Set frame_parallel_level to 0 if only one frame is present in a + // parallel encode set. + if (level1_frame_idx != INT_MAX && !level2_frame_count) + gf_group->frame_parallel_level[level1_frame_idx] = 0; + // Book-keep frame_idx of frame_parallel_level 1 frame and reset the + // count of frame_parallel_level 2 frames in the corresponding parallel + // encode set. + level1_frame_idx = frame_idx; + level2_frame_count = 0; + } + if (gf_group->frame_parallel_level[frame_idx] == 2) level2_frame_count++; + } + // If frame_parallel_level is set to 1 for the last LF_UPDATE + // frame in the gf_group, reset it to zero since there are no subsequent + // frames in the gf_group. 
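+  // For example, when the group ends with <..., LF_UPDATE (level 1),
+  // OVERLAY_UPDATE>, that LF_UPDATE has no level-2 follower and is demoted
+  // to level 0 below.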
+    if (gf_group->frame_parallel_level[frame_index - 2] == 1) {
+      assert(gf_group->update_type[frame_index - 2] == LF_UPDATE);
+      gf_group->frame_parallel_level[frame_index - 2] = 0;
+    }
+  }
+
+  for (int gf_idx = frame_index; gf_idx < MAX_STATIC_GF_GROUP_LENGTH;
+       ++gf_idx) {
+    gf_group->update_type[gf_idx] = LF_UPDATE;
+    gf_group->arf_src_offset[gf_idx] = 0;
+    gf_group->cur_frame_idx[gf_idx] = gf_idx;
+    gf_group->layer_depth[gf_idx] = MAX_ARF_LAYERS;
+    gf_group->arf_boost[gf_idx] = NORMAL_BOOST;
+    gf_group->frame_type[gf_idx] = INTER_FRAME;
+    gf_group->refbuf_state[gf_idx] = REFBUF_UPDATE;
+    gf_group->max_layer_depth = AOMMAX(gf_group->max_layer_depth, 2);
+  }
+
+  return frame_index;
+}
+
+static void set_ld_layer_depth(GF_GROUP *gf_group, int gop_length) {
+  int log_gop_length = 0;
+  while ((1 << log_gop_length) < gop_length) {
+    ++log_gop_length;
+  }
+
+  for (int gf_index = 0; gf_index < gf_group->size; ++gf_index) {
+    int count = 0;
+    // Find the number of trailing zeros.
+    for (; count < MAX_ARF_LAYERS; ++count) {
+      if ((gf_index >> count) & 0x01) break;
+    }
+    gf_group->layer_depth[gf_index] = AOMMAX(log_gop_length - count, 0);
+  }
+  gf_group->max_layer_depth = AOMMIN(log_gop_length, MAX_ARF_LAYERS);
+}
+
+void av1_gop_setup_structure(AV1_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
+  FRAME_INFO *const frame_info = &cpi->frame_info;
+  const int key_frame = rc->frames_since_key == 0;
+  FRAME_UPDATE_TYPE first_frame_update_type = ARF_UPDATE;
+
+  if (key_frame) {
+    first_frame_update_type = KF_UPDATE;
+    if (cpi->oxcf.kf_max_pyr_height != -1) {
+      gf_group->max_layer_depth_allowed = AOMMIN(
+          cpi->oxcf.kf_max_pyr_height, gf_group->max_layer_depth_allowed);
+    }
+  } else if (!cpi->ppi->gf_state.arf_gf_boost_lst) {
+    first_frame_update_type = GF_UPDATE;
+  }
+
+  gf_group->size = construct_multi_layer_gf_structure(
+      cpi, twopass, gf_group, rc, frame_info, p_rc->baseline_gf_interval,
+      first_frame_update_type);
+
+  if (gf_group->max_layer_depth_allowed == 0)
+    set_ld_layer_depth(gf_group, p_rc->baseline_gf_interval);
+}
+
+int av1_gop_check_forward_keyframe(const GF_GROUP *gf_group,
+                                   int gf_frame_index) {
+  return gf_group->frame_type[gf_frame_index] == KEY_FRAME &&
+         gf_group->refbuf_state[gf_frame_index] == REFBUF_UPDATE;
+}
+
+int av1_gop_is_second_arf(const GF_GROUP *gf_group, int gf_frame_index) {
+  const int arf_src_offset = gf_group->arf_src_offset[gf_frame_index];
+  // TODO(angiebird): when gf_group->size == 32, it's possible to
+  // have "two" second arfs. Check if this is acceptable.
+  if (gf_group->update_type[gf_frame_index] == INTNL_ARF_UPDATE &&
+      arf_src_offset >= TF_LOOKAHEAD_IDX_THR) {
+    return 1;
+  }
+  return 0;
+}
diff --git a/third_party/aom/av1/encoder/gop_structure.h b/third_party/aom/av1/encoder/gop_structure.h
new file mode 100644
index 0000000000..ff22f54136
--- /dev/null
+++ b/third_party/aom/av1/encoder/gop_structure.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_GOP_STRUCTURE_H_
+#define AOM_AV1_ENCODER_GOP_STRUCTURE_H_
+
+#include "av1/common/av1_common_int.h"
+#include "av1/encoder/ratectrl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*!\cond */
+struct AV1_COMP;
+struct EncodeFrameParams;
+
+#define MIN_ARF_GF_BOOST 240
+#define NORMAL_BOOST 100
+
+/*!\endcond */
+
+/*!\brief Set up the Group-Of-Pictures structure for this GF_GROUP.
+ *
+ *\ingroup rate_control
+ *
+ * This function defines the Group-Of-Pictures structure for this GF_GROUP.
+ * This involves deciding where to place the various FRAME_UPDATE_TYPEs in
+ * the group. It does this primarily by updating entries in
+ * cpi->twopass.gf_group.update_type[].
+ *
+ * \param[in]    cpi    Top-level encoder instance structure
+ *
+ * \remark No return value but this function updates group data structures.
+ */
+void av1_gop_setup_structure(struct AV1_COMP *cpi);
+
+/*!\brief Distributes bits to frames in a group
+ *
+ *\ingroup rate_control
+ *
+ * This function decides on the allocation of bits between the different
+ * frames and types of frame in a GF/ARF group.
+ *
+ * \param[in]   cpi           Top-level encoder instance structure
+ * \param[in]   rc            Rate control data
+ * \param[in]   gf_group      GF/ARF group data structure
+ * \param[in]   is_key_frame  Indicates if the first frame in the group is
+ *                            also a key frame.
+ * \param[in]   use_arf       Are ARF frames enabled, or is this a GF-only
+ *                            uni-directional group.
+ * \param[in]   gf_group_bits Bits available to be allocated.
+ *
+ * \remark No return but updates the rate control and group data structures
+ *         to reflect the allocation of bits.
+ */
+void av1_gop_bit_allocation(const AV1_COMP *cpi, RATE_CONTROL *const rc,
+                            GF_GROUP *gf_group, int is_key_frame, int use_arf,
+                            int64_t gf_group_bits);
+
+/*!\brief Check whether a frame in the GOP is a forward key frame
+ *
+ *\ingroup rate_control
+ *
+ * \param[in]   gf_group        GF/ARF group data structure
+ * \param[in]   gf_frame_index  GOP index
+ *
+ * \return Return 1 if it is a forward key frame, otherwise return 0
+ */
+int av1_gop_check_forward_keyframe(const GF_GROUP *gf_group,
+                                   int gf_frame_index);
+
+/*!\brief Check whether a frame in the GOP is the second ARF
+ *
+ *\ingroup rate_control
+ *
+ * \param[in]   gf_group        GF/ARF group data structure
+ * \param[in]   gf_frame_index  GOP index
+ *
+ * \return Return 1 if it is the second ARF, otherwise return 0
+ */
+int av1_gop_is_second_arf(const GF_GROUP *gf_group, int gf_frame_index);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_GOP_STRUCTURE_H_
diff --git a/third_party/aom/av1/encoder/grain_test_vectors.h b/third_party/aom/av1/encoder/grain_test_vectors.h
new file mode 100644
index 0000000000..945dc37331
--- /dev/null
+++ b/third_party/aom/av1/encoder/grain_test_vectors.h
@@ -0,0 +1,781 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ +#ifndef AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_ +#define AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_ + +/* Test vectors for emulation of different film grain types. + * Note that bit depth would be derived from the bitstream and + * not signaled in film grain metadata. The parameters are valid + * for any bit depth. + */ +static aom_film_grain_t film_grain_test_vectors[16] = { + /* Test 1 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { { 16, 0 }, + { 25, 136 }, + { 33, 144 }, + { 41, 160 }, + { 48, 168 }, + { 56, 136 }, + { 67, 128 }, + { 82, 144 }, + { 97, 152 }, + { 113, 144 }, + { 128, 176 }, + { 143, 168 }, + { 158, 176 }, + { 178, 184 } }, + 14 /* num_points_y */, + { { 16, 0 }, + { 20, 64 }, + { 28, 88 }, + { 60, 104 }, + { 90, 136 }, + { 105, 160 }, + { 134, 168 }, + { 168, 208 } }, + 8 /* num_cb_points */, + { { 16, 0 }, + { 28, 96 }, + { 56, 80 }, + { 66, 96 }, + { 80, 104 }, + { 108, 96 }, + { 122, 112 }, + { 137, 112 }, + { 169, 176 } }, + 9 /* num_cr_points */, + 11 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 0, 0, -58, 0, 0, 0, -76, 100, -43, 0, -51, 82 }, + { 0, 0, -49, 0, 0, 0, -36, 22, -30, 0, -38, 7, 39 }, + { 0, 0, -47, 0, 0, 0, -31, 31, -25, 0, -32, 13, -100 }, + 8 /* ar_coeff_shift */, + 247 /* cb_mult */, + 192 /* cb_luma_mult */, + 18 /* cb_offset */, + 229 /* cr_mult */, + 192 /* cr_luma_mult */, + 54 /* cr_offset */, + 0 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /* chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 2 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { { 0, 96 }, { 255, 96 } }, + 2 /* num_points_y */, + { { 0, 64 }, { 255, 64 } }, + 2 /* num_cb_points */, + { { 0, 64 }, { 255, 64 } }, + 2 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 3 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { { 0, 192 }, { 255, 192 } }, + 2 /* num_points_y */, + { { 0, 128 }, { 255, 128 } }, + 2 /* num_cb_points */, + { { 0, 128 }, { 255, 128 } }, + 2 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + 4, -7, 2, 4, 12, -12, 5, -8, 6, 8, -19, -16, 19, + -10, -2, 17, -42, 58, -2, -13, 9, 14, -36, 67, 0, + }, + { + 4, -7, 2, 4, 12, -12, 5, -8, 6, 8, -19, -16, 19, + -10, -2, 17, -42, 58, -2, -13, 9, 14, -36, 67, 0, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 1 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 4 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 16, 0 }, + { 24, 137 }, + { 53, 146 }, + { 63, 155 }, + 
{ 78, 155 }, + { 107, 150 }, + { 122, 147 }, + { 136, 147 }, + { 166, 153 }, + }, + 9 /* num_points_y */, + { + { 16, 0 }, + { 20, 72 }, + { 27, 82 }, + { 33, 91 }, + { 69, 121 }, + { 95, 143 }, + { 108, 154 }, + { 134, 169 }, + { 147, 177 }, + }, + 9 /* num_cb_points */, + { + { 16, 0 }, + { 24, 95 }, + { 54, 93 }, + { 65, 94 }, + { 79, 98 }, + { 109, 107 }, + { 124, 119 }, + { 139, 136 }, + { 169, 170 }, + }, + 9 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 7, -9, 2, 4, 7, -12, 7, -18, 18, -30, -27, -42, + 13, -20, 7, -18, 6, 107, 55, -2, -4, -9, -22, 113, + }, + { + -3, -1, -4, 3, -6, -2, 3, 1, -4, -10, -10, -5, -5, + -3, -1, -13, -28, -25, -31, -6, -4, 14, -64, 66, 0, + }, + { + 0, 4, -3, 13, 0, 1, -3, 0, -3, -10, -68, -4, -2, + -5, 2, -3, -20, 62, -31, 0, -4, -1, -8, -29, 0, + }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 5 */ + { + 1 /* apply_grain */, + 0 /* update_parameters */, + { { 0, 64 }, { 255, 64 } }, + 2 /* num_points_y */, + { + { 0, 96 }, + { 32, 90 }, + { 64, 83 }, + { 96, 76 }, + { 128, 68 }, + { 159, 59 }, + { 191, 48 }, + { 223, 34 }, + { 255, 0 }, + }, + 9 /* num_cb_points */, + { + { 0, 0 }, + { 32, 34 }, + { 64, 48 }, + { 96, 59 }, + { 128, 68 }, + { 159, 76 }, + { 191, 83 }, + { 223, 90 }, + { 255, 96 }, + }, + 9 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + -2, 2, -5, 7, -6, 4, -2, -1, 1, -2, 0, -2, 2, + -3, -5, 13, -13, 6, -14, 8, -1, 18, -36, 58, 0, + }, + { + -2, -1, -3, 14, -4, -1, -3, 0, -1, 7, -31, 7, 2, + 0, 1, 0, -7, 50, -8, -2, 2, 2, 2, -4, 0, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 1063 /* random_seed */ + }, + /* Test 6 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 0, 96 }, + { 20, 92 }, + { 39, 88 }, + { 59, 84 }, + { 78, 80 }, + { 98, 75 }, + { 118, 70 }, + { 137, 65 }, + { 157, 60 }, + { 177, 53 }, + { 196, 46 }, + { 216, 38 }, + { 235, 27 }, + { 255, 0 }, + }, + 14 /* num_points_y */, + { { 0, 0 } }, + 0 /* num_cb_points */, + { { 0, 0 } }, + 0 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 2754 /* random_seed */ + }, + /* Test 7 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 0, 0 }, + { 20, 27 }, + { 39, 38 }, + { 59, 46 }, + { 78, 53 }, + { 98, 60 }, + { 118, 65 }, + { 
137, 70 }, + { 157, 75 }, + { 177, 80 }, + { 196, 84 }, + { 216, 88 }, + { 235, 92 }, + { 255, 96 }, + }, + 14 /* num_points_y */, + { { 0, 0 }, { 255, 0 } }, + 2 /* num_cb_points */, + { { 0, 0 }, { 255, 0 } }, + 2 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 8 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { { 0, 96 }, { 255, 96 } }, + 2 /* num_points_y */, + { { 0, 62 }, { 255, 62 } }, + 2 /* num_cb_points */, + { { 0, 62 }, { 255, 62 } }, + 2 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + 0, -2, -2, 8, 5, -1, 1, -1, 5, 16, -33, -9, 6, + -1, -3, 10, -47, 63, 0, -15, 3, 11, -42, 75, -69, + }, + { + 1, -1, -1, 9, 5, 0, 1, -1, 5, 15, -32, -10, 8, + -2, -4, 11, -46, 62, 1, -16, 3, 13, -43, 75, -55, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 9 */ + { + 1 /* apply_grain */, + 0 /* update_parameters */, + { { 0, 48 }, { 255, 48 } }, + 2 /* num_points_y */, + { { 0, 32 }, { 255, 32 } }, + 2 /* num_cb_points */, + { { 0, 32 }, { 255, 32 } }, + 2 /* num_cr_points */, + 10 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127 }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 10 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { { 0, 48 }, { 255, 48 } }, + 2 /* num_points_y */, + { { 0, 32 }, { 255, 32 } }, + 2 /* num_cb_points */, + { { 0, 32 }, { 255, 32 } }, + 2 /* num_cr_points */, + 10 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 }, + { -7, -6, -48, -22, 2, -3, -45, 73, -11, -26, -52, 76, 0 }, + { -7, -6, -48, -22, 2, -3, -45, 73, -11, -26, -52, 76, 0 }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 11 */ + { + 1 /* apply_grain */, + 0 /* update_parameters */, + { { 0, 32 }, { 255, 32 } }, + 2 /* 
num_points_y */, + { + { 0, 48 }, + { 32, 45 }, + { 64, 42 }, + { 96, 38 }, + { 128, 34 }, + { 159, 29 }, + { 191, 24 }, + { 223, 17 }, + { 255, 0 }, + }, + 9 /* num_cb_points */, + { + { 0, 0 }, + { 32, 17 }, + { 64, 24 }, + { 96, 29 }, + { 128, 34 }, + { 159, 38 }, + { 191, 42 }, + { 223, 45 }, + { 255, 48 }, + }, + 9 /* num_cr_points */, + 10 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 7, -9, 2, 4, 7, -12, 7, -18, 18, -30, -27, -42, + 13, -20, 7, -18, 6, 107, 55, -2, -4, -9, -22, 113, + }, + { + -3, -1, -4, 3, -6, -2, 3, 1, -4, -10, -10, -5, -5, + -3, -1, -13, -28, -25, -31, -6, -4, 14, -64, 66, 0, + }, + { + 0, 4, -3, 13, 0, 1, -3, 0, -3, -10, -68, -4, -2, + -5, 2, -3, -20, 62, -31, 0, -4, -1, -8, -29, 0, + }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 1357 /* random_seed */ + }, + /* Test 12 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 16, 0 }, + { 24, 49 }, + { 39, 69 }, + { 46, 84 }, + { 53, 91 }, + { 63, 100 }, + { 78, 114 }, + { 92, 134 }, + { 164, 139 }, + }, + 9 /* num_points_y */, + { + { 16, 0 }, + { 20, 31 }, + { 26, 42 }, + { 33, 54 }, + { 40, 65 }, + { 47, 72 }, + { 56, 85 }, + { 84, 123 }, + { 152, 157 }, + }, + 9 /* num_cb_points */, + { + { 16, 0 }, + { 25, 14 }, + { 39, 33 }, + { 47, 40 }, + { 54, 47 }, + { 64, 62 }, + { 79, 76 }, + { 94, 83 }, + { 167, 101 }, + }, + 9 /* num_cr_points */, + 10 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 0, 0, -58, 0, 0, 0, -76, 100, -43, 0, -51, 82 }, + { 0, 0, -49, 0, 0, 0, -36, 22, -30, 0, -38, 7, 39 }, + { 0, 0, -47, 0, 0, 0, -31, 31, -25, 0, -32, 13, -100 }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 0 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 13 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 0, 48 }, + { 20, 46 }, + { 39, 44 }, + { 59, 42 }, + { 78, 40 }, + { 98, 38 }, + { 118, 35 }, + { 137, 33 }, + { 157, 30 }, + { 177, 27 }, + { 196, 23 }, + { 216, 19 }, + { 235, 13 }, + { 255, 0 }, + }, + 14 /* num_points_y */, + { { 0, 0 }, { 255, 0 } }, + 0 /* num_cb_points */, + { { 0, 0 }, { 255, 0 } }, + 0 /* num_cr_points */, + 10 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 14 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 0, 0 }, + { 20, 13 }, + { 39, 19 }, + { 59, 23 }, + { 78, 27 }, + { 98, 30 }, + { 118, 33 }, + { 137, 35 }, + { 157, 38 }, + { 177, 40 }, + { 196, 42 }, + { 216, 44 }, + { 235, 46 }, + { 255, 48 }, + }, + 14 /* num_points_y */, + { { 0, 0 }, { 255, 0 } }, + 0 /* num_cb_points */, + { { 0, 0 }, { 255, 0 } }, + 0 /* num_cr_points */, + 
10 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 15 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { { 0, 96 }, { 255, 96 } }, + 1 /* num_points_y */, + { { 0, 96 }, { 255, 96 } }, + 0 /* num_cb_points */, + { { 0, 96 }, { 255, 96 } }, + 0 /* num_cr_points */, + 11 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 5, -15, -10, -19, 0, -12, 6, 51, 30, -5, -12, 56 }, + { 2, 2, -24, -5, 1, 1, -18, 37, -2, 0, -15, 39, -70 }, + { 2, 3, -24, -5, -1, 0, -18, 38, -2, 0, -15, 39, -55 }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 1 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 16 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 16, 0 }, + { 58, 126 }, + { 87, 120 }, + { 97, 122 }, + { 112, 125 }, + { 126, 131 }, + { 141, 139 }, + { 199, 153 }, + }, + 8 /* num_points_y */, + { + { 16, 0 }, + { 59, 68 }, + { 66, 76 }, + { 73, 82 }, + { 79, 85 }, + { 86, 86 }, + { 151, 95 }, + { 192, 101 }, + }, + 8 /* num_cb_points */, + { + { 16, 0 }, + { 59, 64 }, + { 89, 80 }, + { 99, 86 }, + { 114, 90 }, + { 129, 93 }, + { 144, 97 }, + { 203, 85 }, + }, + 8 /* num_cr_points */, + 10 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + 0, -2, -2, 8, 5, -1, 1, -1, 5, 16, -33, -9, 6, + -1, -3, 10, -47, 63, 0, -15, 3, 11, -42, 75, -69, + }, + { + 1, -1, -1, 9, 5, 0, 1, -1, 5, 15, -32, -10, 8, + -2, -4, 11, -46, 62, 1, -16, 3, 13, -43, 75, -55, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 2 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, +}; +#endif // AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_ diff --git a/third_party/aom/av1/encoder/hash.c b/third_party/aom/av1/encoder/hash.c new file mode 100644 index 0000000000..8037b59bef --- /dev/null +++ b/third_party/aom/av1/encoder/hash.c @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "av1/encoder/hash.h" +#include "config/av1_rtcd.h" + +static void crc_calculator_process_data(CRC_CALCULATOR *p_crc_calculator, + uint8_t *pData, uint32_t dataLength) { + for (uint32_t i = 0; i < dataLength; i++) { + const uint8_t index = (uint8_t)( + (p_crc_calculator->remainder >> (p_crc_calculator->bits - 8)) ^ + pData[i]); + p_crc_calculator->remainder <<= 8; + p_crc_calculator->remainder ^= p_crc_calculator->table[index]; + } +} + +static void crc_calculator_reset(CRC_CALCULATOR *p_crc_calculator) { + p_crc_calculator->remainder = 0; +} + +static uint32_t crc_calculator_get_crc(CRC_CALCULATOR *p_crc_calculator) { + return p_crc_calculator->remainder & p_crc_calculator->final_result_mask; +} + +static void crc_calculator_init_table(CRC_CALCULATOR *p_crc_calculator) { + const uint32_t high_bit = 1 << (p_crc_calculator->bits - 1); + const uint32_t byte_high_bit = 1 << (8 - 1); + + for (uint32_t value = 0; value < 256; value++) { + uint32_t remainder = 0; + for (uint8_t mask = byte_high_bit; mask != 0; mask >>= 1) { + if (value & mask) { + remainder ^= high_bit; + } + + if (remainder & high_bit) { + remainder <<= 1; + remainder ^= p_crc_calculator->trunc_poly; + } else { + remainder <<= 1; + } + } + p_crc_calculator->table[value] = remainder; + } +} + +void av1_crc_calculator_init(CRC_CALCULATOR *p_crc_calculator, uint32_t bits, + uint32_t truncPoly) { + p_crc_calculator->remainder = 0; + p_crc_calculator->bits = bits; + p_crc_calculator->trunc_poly = truncPoly; + p_crc_calculator->final_result_mask = (1 << bits) - 1; + crc_calculator_init_table(p_crc_calculator); +} + +uint32_t av1_get_crc_value(CRC_CALCULATOR *p_crc_calculator, uint8_t *p, + int length) { + crc_calculator_reset(p_crc_calculator); + crc_calculator_process_data(p_crc_calculator, p, length); + return crc_calculator_get_crc(p_crc_calculator); +} + +/* CRC-32C (iSCSI) polynomial in reversed bit order. */ +#define POLY 0x82f63b78 + +/* Construct table for software CRC-32C calculation. */ +void av1_crc32c_calculator_init(CRC32C *p_crc32c) { + uint32_t crc; + + for (int n = 0; n < 256; n++) { + crc = n; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + p_crc32c->table[0][n] = crc; + } + for (int n = 0; n < 256; n++) { + crc = p_crc32c->table[0][n]; + for (int k = 1; k < 8; k++) { + crc = p_crc32c->table[0][crc & 0xff] ^ (crc >> 8); + p_crc32c->table[k][n] = crc; + } + } +} + +/* Table-driven software version as a fall-back. This is about 15 times slower + than using the hardware instructions. This assumes little-endian integers, + as is the case on Intel processors that the assembler code here is for. 
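+
+   A minimal usage sketch (hypothetical caller; the zeroed buffer is only
+   illustrative):
+
+     CRC32C calc;
+     av1_crc32c_calculator_init(&calc);
+     uint8_t buf[64] = { 0 };
+     uint32_t crc = av1_get_crc32c_value_c(&calc, buf, sizeof(buf));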
*/ +uint32_t av1_get_crc32c_value_c(void *c, uint8_t *buf, size_t len) { + const uint8_t *next = (const uint8_t *)(buf); + uint64_t crc; + CRC32C *p = (CRC32C *)c; + crc = 0 ^ 0xffffffff; + while (len && ((uintptr_t)next & 7) != 0) { + crc = p->table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8); + len--; + } + while (len >= 8) { + crc ^= *(uint64_t *)next; + crc = p->table[7][crc & 0xff] ^ p->table[6][(crc >> 8) & 0xff] ^ + p->table[5][(crc >> 16) & 0xff] ^ p->table[4][(crc >> 24) & 0xff] ^ + p->table[3][(crc >> 32) & 0xff] ^ p->table[2][(crc >> 40) & 0xff] ^ + p->table[1][(crc >> 48) & 0xff] ^ p->table[0][crc >> 56]; + next += 8; + len -= 8; + } + while (len) { + crc = p->table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8); + len--; + } + return (uint32_t)crc ^ 0xffffffff; +} diff --git a/third_party/aom/av1/encoder/hash.h b/third_party/aom/av1/encoder/hash.h new file mode 100644 index 0000000000..d8e8cc3a0b --- /dev/null +++ b/third_party/aom/av1/encoder/hash.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_HASH_H_ +#define AOM_AV1_ENCODER_HASH_H_ + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _crc_calculator { + uint32_t remainder; + uint32_t trunc_poly; + uint32_t bits; + uint32_t table[256]; + uint32_t final_result_mask; +} CRC_CALCULATOR; + +// Initialize the crc calculator. It must be executed at least once before +// calling av1_get_crc_value(). +void av1_crc_calculator_init(CRC_CALCULATOR *p_crc_calculator, uint32_t bits, + uint32_t truncPoly); +uint32_t av1_get_crc_value(CRC_CALCULATOR *p_crc_calculator, uint8_t *p, + int length); + +// CRC32C: POLY = 0x82f63b78; +typedef struct _CRC32C { + /* Table for a quadword-at-a-time software crc. */ + uint32_t table[8][256]; +} CRC32C; + +// init table for software version crc32c +void av1_crc32c_calculator_init(CRC32C *p_crc32c); + +#define AOM_BUFFER_SIZE_FOR_BLOCK_HASH (4096) + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_HASH_H_ diff --git a/third_party/aom/av1/encoder/hash_motion.c b/third_party/aom/av1/encoder/hash_motion.c new file mode 100644 index 0000000000..8b04e22d6c --- /dev/null +++ b/third_party/aom/av1/encoder/hash_motion.c @@ -0,0 +1,503 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/hash.h"
+#include "av1/encoder/hash_motion.h"
+
+#define kSrcBits 16
+#define kBlockSizeBits 3
+#define kMaxAddr (1 << (kSrcBits + kBlockSizeBits))
+
+// TODO(youzhou@microsoft.com): is screen content with more than 8 bits
+// supported? If yes, fix this function.
+static void get_pixels_in_1D_char_array_by_block_2x2(const uint8_t *y_src,
+                                                     int stride,
+                                                     uint8_t *p_pixels_in1D) {
+  const uint8_t *p_pel = y_src;
+  int index = 0;
+  for (int i = 0; i < 2; i++) {
+    for (int j = 0; j < 2; j++) {
+      p_pixels_in1D[index++] = p_pel[j];
+    }
+    p_pel += stride;
+  }
+}
+
+static void get_pixels_in_1D_short_array_by_block_2x2(const uint16_t *y_src,
+                                                      int stride,
+                                                      uint16_t *p_pixels_in1D) {
+  const uint16_t *p_pel = y_src;
+  int index = 0;
+  for (int i = 0; i < 2; i++) {
+    for (int j = 0; j < 2; j++) {
+      p_pixels_in1D[index++] = p_pel[j];
+    }
+    p_pel += stride;
+  }
+}
+
+static int is_block_2x2_row_same_value(const uint8_t *p) {
+  if (p[0] != p[1] || p[2] != p[3]) {
+    return 0;
+  }
+  return 1;
+}
+
+static int is_block16_2x2_row_same_value(const uint16_t *p) {
+  if (p[0] != p[1] || p[2] != p[3]) {
+    return 0;
+  }
+  return 1;
+}
+
+static int is_block_2x2_col_same_value(const uint8_t *p) {
+  if ((p[0] != p[2]) || (p[1] != p[3])) {
+    return 0;
+  }
+  return 1;
+}
+
+static int is_block16_2x2_col_same_value(const uint16_t *p) {
+  if ((p[0] != p[2]) || (p[1] != p[3])) {
+    return 0;
+  }
+  return 1;
+}
+
+// The hash value hash_value1 consists of two parts: the first 3 bits relate
+// to the block size and the remaining 16 bits are the CRC value. This
+// function is used to get the first 3 bits.
+static int hash_block_size_to_index(int block_size) {
+  switch (block_size) {
+    case 4: return 0;
+    case 8: return 1;
+    case 16: return 2;
+    case 32: return 3;
+    case 64: return 4;
+    case 128: return 5;
+    default: return -1;
+  }
+}
+
+void av1_hash_table_init(IntraBCHashInfo *intrabc_hash_info) {
+  if (!intrabc_hash_info->g_crc_initialized) {
+    av1_crc_calculator_init(&intrabc_hash_info->crc_calculator1, 24, 0x5D6DCB);
+    av1_crc_calculator_init(&intrabc_hash_info->crc_calculator2, 24, 0x864CFB);
+    intrabc_hash_info->g_crc_initialized = 1;
+  }
+  intrabc_hash_info->intrabc_hash_table.p_lookup_table = NULL;
+}
+
+void av1_hash_table_clear_all(hash_table *p_hash_table) {
+  if (p_hash_table->p_lookup_table == NULL) {
+    return;
+  }
+  for (int i = 0; i < kMaxAddr; i++) {
+    if (p_hash_table->p_lookup_table[i] != NULL) {
+      aom_vector_destroy(p_hash_table->p_lookup_table[i]);
+      aom_free(p_hash_table->p_lookup_table[i]);
+      p_hash_table->p_lookup_table[i] = NULL;
+    }
+  }
+}
+
+void av1_hash_table_destroy(hash_table *p_hash_table) {
+  av1_hash_table_clear_all(p_hash_table);
+  aom_free(p_hash_table->p_lookup_table);
+  p_hash_table->p_lookup_table = NULL;
+}
+
+bool av1_hash_table_create(hash_table *p_hash_table) {
+  if (p_hash_table->p_lookup_table != NULL) {
+    av1_hash_table_clear_all(p_hash_table);
+    return true;
+  }
+  p_hash_table->p_lookup_table =
+      (Vector **)aom_calloc(kMaxAddr, sizeof(p_hash_table->p_lookup_table[0]));
+  if (!p_hash_table->p_lookup_table) return false;
+  return true;
+}
+
+static bool hash_table_add_to_table(hash_table *p_hash_table,
+                                    uint32_t hash_value,
+                                    block_hash *curr_block_hash) {
+  if (p_hash_table->p_lookup_table[hash_value] ==
NULL) { + return false; + } + if (aom_vector_setup(p_hash_table->p_lookup_table[hash_value], 10, + sizeof(curr_block_hash[0])) == VECTOR_ERROR) + return false; + if (aom_vector_push_back(p_hash_table->p_lookup_table[hash_value], + curr_block_hash) == VECTOR_ERROR) + return false; + } else { + if (aom_vector_push_back(p_hash_table->p_lookup_table[hash_value], + curr_block_hash) == VECTOR_ERROR) + return false; + } + return true; +} + +int32_t av1_hash_table_count(const hash_table *p_hash_table, + uint32_t hash_value) { + if (p_hash_table->p_lookup_table[hash_value] == NULL) { + return 0; + } else { + return (int32_t)(p_hash_table->p_lookup_table[hash_value]->size); + } +} + +Iterator av1_hash_get_first_iterator(hash_table *p_hash_table, + uint32_t hash_value) { + assert(av1_hash_table_count(p_hash_table, hash_value) > 0); + return aom_vector_begin(p_hash_table->p_lookup_table[hash_value]); +} + +int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1, + uint32_t hash_value2) { + if (p_hash_table->p_lookup_table[hash_value1] == NULL) { + return 0; + } + Iterator iterator = + aom_vector_begin(p_hash_table->p_lookup_table[hash_value1]); + Iterator last = aom_vector_end(p_hash_table->p_lookup_table[hash_value1]); + for (; !aom_iterator_equals(&iterator, &last); + aom_iterator_increment(&iterator)) { + if ((*(block_hash *)aom_iterator_get(&iterator)).hash_value2 == + hash_value2) { + return 1; + } + } + return 0; +} + +void av1_generate_block_2x2_hash_value(IntraBCHashInfo *intrabc_hash_info, + const YV12_BUFFER_CONFIG *picture, + uint32_t *pic_block_hash[2], + int8_t *pic_block_same_info[3]) { + const int width = 2; + const int height = 2; + const int x_end = picture->y_crop_width - width + 1; + const int y_end = picture->y_crop_height - height + 1; + CRC_CALCULATOR *calc_1 = &intrabc_hash_info->crc_calculator1; + CRC_CALCULATOR *calc_2 = &intrabc_hash_info->crc_calculator2; + + const int length = width * 2; + if (picture->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t p[4]; + int pos = 0; + for (int y_pos = 0; y_pos < y_end; y_pos++) { + for (int x_pos = 0; x_pos < x_end; x_pos++) { + get_pixels_in_1D_short_array_by_block_2x2( + CONVERT_TO_SHORTPTR(picture->y_buffer) + y_pos * picture->y_stride + + x_pos, + picture->y_stride, p); + pic_block_same_info[0][pos] = is_block16_2x2_row_same_value(p); + pic_block_same_info[1][pos] = is_block16_2x2_col_same_value(p); + + pic_block_hash[0][pos] = + av1_get_crc_value(calc_1, (uint8_t *)p, length * sizeof(p[0])); + pic_block_hash[1][pos] = + av1_get_crc_value(calc_2, (uint8_t *)p, length * sizeof(p[0])); + pos++; + } + pos += width - 1; + } + } else { + uint8_t p[4]; + int pos = 0; + for (int y_pos = 0; y_pos < y_end; y_pos++) { + for (int x_pos = 0; x_pos < x_end; x_pos++) { + get_pixels_in_1D_char_array_by_block_2x2( + picture->y_buffer + y_pos * picture->y_stride + x_pos, + picture->y_stride, p); + pic_block_same_info[0][pos] = is_block_2x2_row_same_value(p); + pic_block_same_info[1][pos] = is_block_2x2_col_same_value(p); + + pic_block_hash[0][pos] = + av1_get_crc_value(calc_1, p, length * sizeof(p[0])); + pic_block_hash[1][pos] = + av1_get_crc_value(calc_2, p, length * sizeof(p[0])); + pos++; + } + pos += width - 1; + } + } +} + +void av1_generate_block_hash_value(IntraBCHashInfo *intrabc_hash_info, + const YV12_BUFFER_CONFIG *picture, + int block_size, + uint32_t *src_pic_block_hash[2], + uint32_t *dst_pic_block_hash[2], + int8_t *src_pic_block_same_info[3], + int8_t *dst_pic_block_same_info[3]) { + CRC_CALCULATOR *calc_1 = 
&intrabc_hash_info->crc_calculator1; + CRC_CALCULATOR *calc_2 = &intrabc_hash_info->crc_calculator2; + + const int pic_width = picture->y_crop_width; + const int x_end = picture->y_crop_width - block_size + 1; + const int y_end = picture->y_crop_height - block_size + 1; + + const int src_size = block_size >> 1; + const int quad_size = block_size >> 2; + + uint32_t p[4]; + const int length = sizeof(p); + + int pos = 0; + for (int y_pos = 0; y_pos < y_end; y_pos++) { + for (int x_pos = 0; x_pos < x_end; x_pos++) { + p[0] = src_pic_block_hash[0][pos]; + p[1] = src_pic_block_hash[0][pos + src_size]; + p[2] = src_pic_block_hash[0][pos + src_size * pic_width]; + p[3] = src_pic_block_hash[0][pos + src_size * pic_width + src_size]; + dst_pic_block_hash[0][pos] = + av1_get_crc_value(calc_1, (uint8_t *)p, length); + + p[0] = src_pic_block_hash[1][pos]; + p[1] = src_pic_block_hash[1][pos + src_size]; + p[2] = src_pic_block_hash[1][pos + src_size * pic_width]; + p[3] = src_pic_block_hash[1][pos + src_size * pic_width + src_size]; + dst_pic_block_hash[1][pos] = + av1_get_crc_value(calc_2, (uint8_t *)p, length); + + dst_pic_block_same_info[0][pos] = + src_pic_block_same_info[0][pos] && + src_pic_block_same_info[0][pos + quad_size] && + src_pic_block_same_info[0][pos + src_size] && + src_pic_block_same_info[0][pos + src_size * pic_width] && + src_pic_block_same_info[0][pos + src_size * pic_width + quad_size] && + src_pic_block_same_info[0][pos + src_size * pic_width + src_size]; + + dst_pic_block_same_info[1][pos] = + src_pic_block_same_info[1][pos] && + src_pic_block_same_info[1][pos + src_size] && + src_pic_block_same_info[1][pos + quad_size * pic_width] && + src_pic_block_same_info[1][pos + quad_size * pic_width + src_size] && + src_pic_block_same_info[1][pos + src_size * pic_width] && + src_pic_block_same_info[1][pos + src_size * pic_width + src_size]; + pos++; + } + pos += block_size - 1; + } + + if (block_size >= 4) { + const int size_minus_1 = block_size - 1; + pos = 0; + for (int y_pos = 0; y_pos < y_end; y_pos++) { + for (int x_pos = 0; x_pos < x_end; x_pos++) { + dst_pic_block_same_info[2][pos] = + (!dst_pic_block_same_info[0][pos] && + !dst_pic_block_same_info[1][pos]) || + (((x_pos & size_minus_1) == 0) && ((y_pos & size_minus_1) == 0)); + pos++; + } + pos += block_size - 1; + } + } +} + +bool av1_add_to_hash_map_by_row_with_precal_data(hash_table *p_hash_table, + uint32_t *pic_hash[2], + int8_t *pic_is_same, + int pic_width, int pic_height, + int block_size) { + const int x_end = pic_width - block_size + 1; + const int y_end = pic_height - block_size + 1; + + const int8_t *src_is_added = pic_is_same; + const uint32_t *src_hash[2] = { pic_hash[0], pic_hash[1] }; + + int add_value = hash_block_size_to_index(block_size); + assert(add_value >= 0); + add_value <<= kSrcBits; + const int crc_mask = (1 << kSrcBits) - 1; + + for (int x_pos = 0; x_pos < x_end; x_pos++) { + for (int y_pos = 0; y_pos < y_end; y_pos++) { + const int pos = y_pos * pic_width + x_pos; + // valid data + if (src_is_added[pos]) { + block_hash curr_block_hash; + curr_block_hash.x = x_pos; + curr_block_hash.y = y_pos; + + const uint32_t hash_value1 = (src_hash[0][pos] & crc_mask) + add_value; + curr_block_hash.hash_value2 = src_hash[1][pos]; + + if (!hash_table_add_to_table(p_hash_table, hash_value1, + &curr_block_hash)) { + return false; + } + } + } + } + return true; +} + +int av1_hash_is_horizontal_perfect(const YV12_BUFFER_CONFIG *picture, + int block_size, int x_start, int y_start) { + const int stride = picture->y_stride; 
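+  // A block is "horizontally perfect" when every row of it holds a single
+  // repeated value (rows may differ from each other).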
+ const uint8_t *p = picture->y_buffer + y_start * stride + x_start; + + if (picture->flags & YV12_FLAG_HIGHBITDEPTH) { + const uint16_t *p16 = CONVERT_TO_SHORTPTR(p); + for (int i = 0; i < block_size; i++) { + for (int j = 1; j < block_size; j++) { + if (p16[j] != p16[0]) { + return 0; + } + } + p16 += stride; + } + } else { + for (int i = 0; i < block_size; i++) { + for (int j = 1; j < block_size; j++) { + if (p[j] != p[0]) { + return 0; + } + } + p += stride; + } + } + + return 1; +} + +int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture, + int block_size, int x_start, int y_start) { + const int stride = picture->y_stride; + const uint8_t *p = picture->y_buffer + y_start * stride + x_start; + + if (picture->flags & YV12_FLAG_HIGHBITDEPTH) { + const uint16_t *p16 = CONVERT_TO_SHORTPTR(p); + for (int i = 0; i < block_size; i++) { + for (int j = 1; j < block_size; j++) { + if (p16[j * stride + i] != p16[i]) { + return 0; + } + } + } + } else { + for (int i = 0; i < block_size; i++) { + for (int j = 1; j < block_size; j++) { + if (p[j * stride + i] != p[i]) { + return 0; + } + } + } + } + return 1; +} + +void av1_get_block_hash_value(IntraBCHashInfo *intrabc_hash_info, + const uint8_t *y_src, int stride, int block_size, + uint32_t *hash_value1, uint32_t *hash_value2, + int use_highbitdepth) { + int add_value = hash_block_size_to_index(block_size); + assert(add_value >= 0); + add_value <<= kSrcBits; + const int crc_mask = (1 << kSrcBits) - 1; + + CRC_CALCULATOR *calc_1 = &intrabc_hash_info->crc_calculator1; + CRC_CALCULATOR *calc_2 = &intrabc_hash_info->crc_calculator2; + uint32_t **buf_1 = intrabc_hash_info->hash_value_buffer[0]; + uint32_t **buf_2 = intrabc_hash_info->hash_value_buffer[1]; + + // 2x2 subblock hash values in current CU + int sub_block_in_width = (block_size >> 1); + if (use_highbitdepth) { + uint16_t pixel_to_hash[4]; + uint16_t *y16_src = CONVERT_TO_SHORTPTR(y_src); + for (int y_pos = 0; y_pos < block_size; y_pos += 2) { + for (int x_pos = 0; x_pos < block_size; x_pos += 2) { + int pos = (y_pos >> 1) * sub_block_in_width + (x_pos >> 1); + get_pixels_in_1D_short_array_by_block_2x2( + y16_src + y_pos * stride + x_pos, stride, pixel_to_hash); + assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); + buf_1[0][pos] = av1_get_crc_value(calc_1, (uint8_t *)pixel_to_hash, + sizeof(pixel_to_hash)); + buf_2[0][pos] = av1_get_crc_value(calc_2, (uint8_t *)pixel_to_hash, + sizeof(pixel_to_hash)); + } + } + } else { + uint8_t pixel_to_hash[4]; + for (int y_pos = 0; y_pos < block_size; y_pos += 2) { + for (int x_pos = 0; x_pos < block_size; x_pos += 2) { + int pos = (y_pos >> 1) * sub_block_in_width + (x_pos >> 1); + get_pixels_in_1D_char_array_by_block_2x2(y_src + y_pos * stride + x_pos, + stride, pixel_to_hash); + assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); + buf_1[0][pos] = + av1_get_crc_value(calc_1, pixel_to_hash, sizeof(pixel_to_hash)); + buf_2[0][pos] = + av1_get_crc_value(calc_2, pixel_to_hash, sizeof(pixel_to_hash)); + } + } + } + + int src_sub_block_in_width = sub_block_in_width; + sub_block_in_width >>= 1; + + int src_idx = 1; + int dst_idx = 0; + + // 4x4 subblock hash values to current block hash values + uint32_t to_hash[4]; + for (int sub_width = 4; sub_width <= block_size; sub_width *= 2) { + src_idx = 1 - src_idx; + dst_idx = 1 - dst_idx; + + int dst_pos = 0; + for (int y_pos = 0; y_pos < sub_block_in_width; y_pos++) { + for (int x_pos = 0; x_pos < sub_block_in_width; x_pos++) { + int srcPos = (y_pos << 1) * src_sub_block_in_width + (x_pos << 1); + + 
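+        // Combine the four child sub-block hashes (top-left, top-right,
+        // bottom-left, bottom-right) into the parent block hash by running
+        // CRC over them, ping-ponging between the two hash buffers.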
assert(srcPos + 1 < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
+        assert(srcPos + src_sub_block_in_width + 1 <
+               AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
+        assert(dst_pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
+        to_hash[0] = buf_1[src_idx][srcPos];
+        to_hash[1] = buf_1[src_idx][srcPos + 1];
+        to_hash[2] = buf_1[src_idx][srcPos + src_sub_block_in_width];
+        to_hash[3] = buf_1[src_idx][srcPos + src_sub_block_in_width + 1];
+
+        buf_1[dst_idx][dst_pos] =
+            av1_get_crc_value(calc_1, (uint8_t *)to_hash, sizeof(to_hash));
+
+        to_hash[0] = buf_2[src_idx][srcPos];
+        to_hash[1] = buf_2[src_idx][srcPos + 1];
+        to_hash[2] = buf_2[src_idx][srcPos + src_sub_block_in_width];
+        to_hash[3] = buf_2[src_idx][srcPos + src_sub_block_in_width + 1];
+        buf_2[dst_idx][dst_pos] =
+            av1_get_crc_value(calc_2, (uint8_t *)to_hash, sizeof(to_hash));
+        dst_pos++;
+      }
+    }
+
+    src_sub_block_in_width = sub_block_in_width;
+    sub_block_in_width >>= 1;
+  }
+
+  *hash_value1 = (buf_1[dst_idx][0] & crc_mask) + add_value;
+  *hash_value2 = buf_2[dst_idx][0];
+}
diff --git a/third_party/aom/av1/encoder/hash_motion.h b/third_party/aom/av1/encoder/hash_motion.h
new file mode 100644
index 0000000000..8974ba27cb
--- /dev/null
+++ b/third_party/aom/av1/encoder/hash_motion.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_HASH_MOTION_H_
+#define AOM_AV1_ENCODER_HASH_MOTION_H_
+
+#include <stdbool.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_scale/yv12config.h"
+#include "av1/encoder/hash.h"
+#include "third_party/vector/vector.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Block size used for force_integer_mv decisions
+#define FORCE_INT_MV_DECISION_BLOCK_SIZE 8
+
+// Store a block's hash info.
+// x and y are the position from the top left of the picture.
+// hash_value2 is used to store the second hash value.
+typedef struct _block_hash {
+  int16_t x;
+  int16_t y;
+  uint32_t hash_value2;
+} block_hash;
+
+typedef struct _hash_table {
+  Vector **p_lookup_table;
+} hash_table;
+
+struct intrabc_hash_info;
+
+typedef struct intrabc_hash_info {
+  // Buffer for hash value calculation of a block.
+  // Used only in av1_get_block_hash_value().
+  // [first hash/second hash]
+  // [two buffers used ping-pong]
+  uint32_t *hash_value_buffer[2][2];
+  hash_table intrabc_hash_table;
+
+  CRC_CALCULATOR crc_calculator1;
+  CRC_CALCULATOR crc_calculator2;
+  int g_crc_initialized;
+} IntraBCHashInfo;
+
+void av1_hash_table_init(IntraBCHashInfo *intra_bc_hash_info);
+void av1_hash_table_clear_all(hash_table *p_hash_table);
+void av1_hash_table_destroy(hash_table *p_hash_table);
+bool av1_hash_table_create(hash_table *p_hash_table);
+int32_t av1_hash_table_count(const hash_table *p_hash_table,
+                             uint32_t hash_value);
+Iterator av1_hash_get_first_iterator(hash_table *p_hash_table,
+                                     uint32_t hash_value);
+int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1,
+                            uint32_t hash_value2);
+void av1_generate_block_2x2_hash_value(IntraBCHashInfo *intra_bc_hash_info,
+                                       const YV12_BUFFER_CONFIG *picture,
+                                       uint32_t *pic_block_hash[2],
+                                       int8_t *pic_block_same_info[3]);
+void av1_generate_block_hash_value(IntraBCHashInfo *intra_bc_hash_info,
+                                   const YV12_BUFFER_CONFIG *picture,
+                                   int block_size,
+                                   uint32_t *src_pic_block_hash[2],
+                                   uint32_t *dst_pic_block_hash[2],
+                                   int8_t *src_pic_block_same_info[3],
+                                   int8_t *dst_pic_block_same_info[3]);
+bool av1_add_to_hash_map_by_row_with_precal_data(hash_table *p_hash_table,
+                                                 uint32_t *pic_hash[2],
+                                                 int8_t *pic_is_same,
+                                                 int pic_width, int pic_height,
+                                                 int block_size);
+
+// Check whether the block that starts at (x_start, y_start), with size
+// block_size x block_size, has the same color in all of its rows.
+int av1_hash_is_horizontal_perfect(const YV12_BUFFER_CONFIG *picture,
+                                   int block_size, int x_start, int y_start);
+// Check whether the block that starts at (x_start, y_start), with size
+// block_size x block_size, has the same color in all of its columns.
+int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture,
+                                 int block_size, int x_start, int y_start);
+
+void av1_get_block_hash_value(IntraBCHashInfo *intrabc_hash_info,
+                              const uint8_t *y_src, int stride, int block_size,
+                              uint32_t *hash_value1, uint32_t *hash_value2,
+                              int use_highbitdepth);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_HASH_MOTION_H_
diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.c b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c
new file mode 100644
index 0000000000..a108e8148c
--- /dev/null
+++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c
@@ -0,0 +1,370 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" +#include "config/aom_dsp_rtcd.h" + +#include "av1/common/idct.h" +#include "av1/common/blockd.h" +#include "av1/encoder/hybrid_fwd_txfm.h" + +/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per + pixel. + Shared for both high and low bit depth. + */ +void av1_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) { + int i; + tran_high_t a1, b1, c1, d1, e1; + const int16_t *ip_pass0 = input; + const tran_low_t *ip = NULL; + tran_low_t *op = output; + + for (i = 0; i < 4; i++) { + a1 = ip_pass0[0 * stride]; + b1 = ip_pass0[1 * stride]; + c1 = ip_pass0[2 * stride]; + d1 = ip_pass0[3 * stride]; + + a1 += b1; + d1 = d1 - c1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= c1; + d1 += b1; + op[0] = (tran_low_t)a1; + op[1] = (tran_low_t)c1; + op[2] = (tran_low_t)d1; + op[3] = (tran_low_t)b1; + + ip_pass0++; + op += 4; + } + ip = output; + op = output; + + for (i = 0; i < 4; i++) { + a1 = ip[4 * 0]; + b1 = ip[4 * 1]; + c1 = ip[4 * 2]; + d1 = ip[4 * 3]; + + a1 += b1; + d1 -= c1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= c1; + d1 += b1; + op[4 * 0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR); + op[4 * 1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR); + op[4 * 2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR); + op[4 * 3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR); + + ip++; + op++; + } +} + +static void highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + const TX_TYPE tx_type = txfm_param->tx_type; + const int bd = txfm_param->bd; + if (txfm_param->lossless) { + assert(tx_type == DCT_DCT); + av1_fwht4x4(src_diff, coeff, diff_stride); + return; + } + av1_fwd_txfm2d_4x4(src_diff, dst_coeff, diff_stride, tx_type, bd); +} + +static void highbd_fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_4x8(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} + +static void highbd_fwd_txfm_8x4(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_8x4(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} + +static void highbd_fwd_txfm_8x16(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + const TX_TYPE tx_type = txfm_param->tx_type; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_8x16(src_diff, dst_coeff, diff_stride, tx_type, bd); +} + +static void highbd_fwd_txfm_16x8(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + const TX_TYPE tx_type = txfm_param->tx_type; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_16x8(src_diff, dst_coeff, diff_stride, tx_type, bd); +} + +static void highbd_fwd_txfm_16x32(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_16x32(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} + +static void highbd_fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_32x16(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} + +#if 
!CONFIG_REALTIME_ONLY +static void highbd_fwd_txfm_16x4(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_16x4(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} + +static void highbd_fwd_txfm_4x16(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_4x16(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} + +static void highbd_fwd_txfm_32x8(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_32x8(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} + +static void highbd_fwd_txfm_8x32(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_8x32(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} +#endif + +static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + const TX_TYPE tx_type = txfm_param->tx_type; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_8x8(src_diff, dst_coeff, diff_stride, tx_type, bd); +} + +static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + const TX_TYPE tx_type = txfm_param->tx_type; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_16x16(src_diff, dst_coeff, diff_stride, tx_type, bd); +} + +static void highbd_fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + const TX_TYPE tx_type = txfm_param->tx_type; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_32x32(src_diff, dst_coeff, diff_stride, tx_type, bd); +} + +static void highbd_fwd_txfm_32x64(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + assert(txfm_param->tx_type == DCT_DCT); + int32_t *dst_coeff = (int32_t *)coeff; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_32x64(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + bd); +} + +static void highbd_fwd_txfm_64x32(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + assert(txfm_param->tx_type == DCT_DCT); + int32_t *dst_coeff = (int32_t *)coeff; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_64x32(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + bd); +} + +#if !CONFIG_REALTIME_ONLY +static void highbd_fwd_txfm_16x64(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + assert(txfm_param->tx_type == DCT_DCT); + int32_t *dst_coeff = (int32_t *)coeff; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_16x64(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); +} + +static void highbd_fwd_txfm_64x16(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + assert(txfm_param->tx_type == DCT_DCT); + int32_t *dst_coeff = (int32_t *)coeff; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_64x16(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); +} +#endif + +static void highbd_fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + assert(txfm_param->tx_type == DCT_DCT); + int32_t *dst_coeff = 
(int32_t *)coeff; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_64x64(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); +} + +void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, + TxfmParam *txfm_param) { + if (txfm_param->bd == 8) + av1_lowbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param); + else + av1_highbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param); +} + +void av1_lowbd_fwd_txfm_c(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + av1_highbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param); +} + +void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); + const TX_SIZE tx_size = txfm_param->tx_size; + switch (tx_size) { + case TX_64X64: + highbd_fwd_txfm_64x64(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_32X64: + highbd_fwd_txfm_32x64(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_64X32: + highbd_fwd_txfm_64x32(src_diff, coeff, diff_stride, txfm_param); + break; + + case TX_32X32: + highbd_fwd_txfm_32x32(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_16X16: + highbd_fwd_txfm_16x16(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_8X8: + highbd_fwd_txfm_8x8(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_4X8: + highbd_fwd_txfm_4x8(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_8X4: + highbd_fwd_txfm_8x4(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_8X16: + highbd_fwd_txfm_8x16(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_16X8: + highbd_fwd_txfm_16x8(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_16X32: + highbd_fwd_txfm_16x32(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_32X16: + highbd_fwd_txfm_32x16(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_4X4: + highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, txfm_param); + break; +#if !CONFIG_REALTIME_ONLY + case TX_4X16: + highbd_fwd_txfm_4x16(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_16X4: + highbd_fwd_txfm_16x4(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_8X32: + highbd_fwd_txfm_8x32(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_32X8: + highbd_fwd_txfm_32x8(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_16X64: + highbd_fwd_txfm_16x64(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_64X16: + highbd_fwd_txfm_64x16(src_diff, coeff, diff_stride, txfm_param); + break; +#endif + default: assert(0); break; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void highbd_wht_fwd_txfm(TX_SIZE tx_size, const int16_t *src_diff, + ptrdiff_t src_stride, + tran_low_t *coeff) { + switch (tx_size) { + // As the output transform co-efficients of 4x4 Hadamard transform can be + // represented using 15 bits (for 12-bit clip) use lowbd variant of + // hadamard_4x4. 
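+ // (Worked bound, as a sanity check on the claim above: a 12-bit residual + // lies in [-4095, 4095], and summing 16 such values gives at most 65520; if + // each of the two butterfly passes halves its sums, the result is at most + // 16380, which fits in 15 bits. The 8x8 and larger kernels can exceed the + // 16-bit range, hence the highbd variants below.)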
+ case TX_4X4: aom_hadamard_4x4(src_diff, src_stride, coeff); break; + case TX_8X8: aom_highbd_hadamard_8x8(src_diff, src_stride, coeff); break; + case TX_16X16: + aom_highbd_hadamard_16x16(src_diff, src_stride, coeff); + break; + case TX_32X32: + aom_highbd_hadamard_32x32(src_diff, src_stride, coeff); + break; + default: assert(0); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static INLINE void wht_fwd_txfm(TX_SIZE tx_size, const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff) { + switch (tx_size) { + case TX_4X4: aom_hadamard_4x4(src_diff, src_stride, coeff); break; + case TX_8X8: aom_hadamard_8x8(src_diff, src_stride, coeff); break; + case TX_16X16: aom_hadamard_16x16(src_diff, src_stride, coeff); break; + case TX_32X32: aom_hadamard_32x32(src_diff, src_stride, coeff); break; + default: assert(0); + } +} + +void av1_quick_txfm(int use_hadamard, TX_SIZE tx_size, BitDepthInfo bd_info, + const int16_t *src_diff, int src_stride, + tran_low_t *coeff) { + if (use_hadamard) { +#if CONFIG_AV1_HIGHBITDEPTH + if (bd_info.use_highbitdepth_buf) { + highbd_wht_fwd_txfm(tx_size, src_diff, src_stride, coeff); + } else { + wht_fwd_txfm(tx_size, src_diff, src_stride, coeff); + } +#else + wht_fwd_txfm(tx_size, src_diff, src_stride, coeff); +#endif // CONFIG_AV1_HIGHBITDEPTH + } else { + TxfmParam txfm_param; + txfm_param.tx_type = DCT_DCT; + txfm_param.tx_size = tx_size; + txfm_param.lossless = 0; + txfm_param.bd = bd_info.bit_depth; + txfm_param.is_hbd = bd_info.use_highbitdepth_buf; + txfm_param.tx_set_type = EXT_TX_SET_ALL16; + av1_fwd_txfm(src_diff, coeff, src_stride, &txfm_param); + } +} diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.h b/third_party/aom/av1/encoder/hybrid_fwd_txfm.h new file mode 100644 index 0000000000..30f8a2258b --- /dev/null +++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_ +#define AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_ + +#include "config/aom_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, + TxfmParam *txfm_param); + +void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param); + +/*!\brief Apply Hadamard or DCT transform + * + * \callergraph + * DCT and Hadamard transforms are commonly used for quick RD score estimation. + * The coeff buffer's size should be equal to the number of pixels + * corresponding to tx_size. 
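+ * + * A minimal usage sketch (illustrative only; the stride and buffer sizes are + * hypothetical, and only the two BitDepthInfo fields read by this function + * are set): + * BitDepthInfo bd_info; + * bd_info.bit_depth = 8; // 8-bit source + * bd_info.use_highbitdepth_buf = 0; + * DECLARE_ALIGNED(32, tran_low_t, coeff[8 * 8]); + * av1_quick_txfm(1, TX_8X8, bd_info, src_diff, 8, coeff); // use_hadamard=1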
+ */ +void av1_quick_txfm(int use_hadamard, TX_SIZE tx_size, BitDepthInfo bd_info, + const int16_t *src_diff, int src_stride, tran_low_t *coeff); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_ diff --git a/third_party/aom/av1/encoder/interp_search.c b/third_party/aom/av1/encoder/interp_search.c new file mode 100644 index 0000000000..27235303c0 --- /dev/null +++ b/third_party/aom/av1/encoder/interp_search.c @@ -0,0 +1,801 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/pred_common.h" +#include "av1/encoder/interp_search.h" +#include "av1/encoder/model_rd.h" +#include "av1/encoder/rdopt_utils.h" +#include "av1/encoder/reconinter_enc.h" + +// return mv_diff +static INLINE int is_interp_filter_good_match( + const INTERPOLATION_FILTER_STATS *st, MB_MODE_INFO *const mi, + int skip_level) { + const int is_comp = has_second_ref(mi); + int i; + + for (i = 0; i < 1 + is_comp; ++i) { + if (st->ref_frames[i] != mi->ref_frame[i]) return INT_MAX; + } + + if (skip_level == 1 && is_comp) { + if (st->comp_type != mi->interinter_comp.type) return INT_MAX; + if (st->compound_idx != mi->compound_idx) return INT_MAX; + } + + int mv_diff = 0; + for (i = 0; i < 1 + is_comp; ++i) { + mv_diff += abs(st->mv[i].as_mv.row - mi->mv[i].as_mv.row) + + abs(st->mv[i].as_mv.col - mi->mv[i].as_mv.col); + } + return mv_diff; +} + +static INLINE int save_interp_filter_search_stat( + MB_MODE_INFO *const mbmi, int64_t rd, unsigned int pred_sse, + INTERPOLATION_FILTER_STATS *interp_filter_stats, + int interp_filter_stats_idx) { + if (interp_filter_stats_idx < MAX_INTERP_FILTER_STATS) { + INTERPOLATION_FILTER_STATS stat = { mbmi->interp_filters, + { mbmi->mv[0], mbmi->mv[1] }, + { mbmi->ref_frame[0], + mbmi->ref_frame[1] }, + mbmi->interinter_comp.type, + mbmi->compound_idx, + rd, + pred_sse }; + interp_filter_stats[interp_filter_stats_idx] = stat; + interp_filter_stats_idx++; + } + return interp_filter_stats_idx; +} + +static INLINE int find_interp_filter_in_stats( + MB_MODE_INFO *const mbmi, INTERPOLATION_FILTER_STATS *interp_filter_stats, + int interp_filter_stats_idx, int skip_level) { + // [skip_levels][single or comp] + const int thr[2][2] = { { 0, 0 }, { 3, 7 } }; + const int is_comp = has_second_ref(mbmi); + + // Find good enough match. + // TODO(yunqing): Separate single-ref mode and comp mode stats for fast + // search. + int best = INT_MAX; + int match = -1; + for (int j = 0; j < interp_filter_stats_idx; ++j) { + const INTERPOLATION_FILTER_STATS *st = &interp_filter_stats[j]; + const int mv_diff = is_interp_filter_good_match(st, mbmi, skip_level); + // Exact match is found. 
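+ // (mv_diff sums the absolute row/col differences over all references, so a + // non-zero diff is accepted only when it stays within the + // thr[skip_level - 1][is_comp] tolerance checked below.)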
+ if (mv_diff == 0) { + match = j; + break; + } else if (mv_diff < best && mv_diff <= thr[skip_level - 1][is_comp]) { + best = mv_diff; + match = j; + } + } + + if (match != -1) { + mbmi->interp_filters = interp_filter_stats[match].filters; + return match; + } + return -1; // no match result found +} + +int av1_find_interp_filter_match( + MB_MODE_INFO *const mbmi, const AV1_COMP *const cpi, + const InterpFilter assign_filter, const int need_search, + INTERPOLATION_FILTER_STATS *interp_filter_stats, + int interp_filter_stats_idx) { + int match_found_idx = -1; + if (cpi->sf.interp_sf.use_interp_filter && need_search) + match_found_idx = find_interp_filter_in_stats( + mbmi, interp_filter_stats, interp_filter_stats_idx, + cpi->sf.interp_sf.use_interp_filter); + + if (!need_search || match_found_idx == -1) + set_default_interp_filters(mbmi, assign_filter); + return match_found_idx; +} + +static INLINE int get_switchable_rate(MACROBLOCK *const x, + const int_interpfilters filters, + const int ctx[2], int dual_filter) { + const InterpFilter filter0 = filters.as_filters.y_filter; + int inter_filter_cost = + x->mode_costs.switchable_interp_costs[ctx[0]][filter0]; + if (dual_filter) { + const InterpFilter filter1 = filters.as_filters.x_filter; + inter_filter_cost += x->mode_costs.switchable_interp_costs[ctx[1]][filter1]; + } + return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost; +} + +// Build inter predictor and calculate model rd +// for a given plane. +static INLINE void interp_model_rd_eval( + MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize, + const BUFFER_SET *const orig_dst, int plane_from, int plane_to, + RD_STATS *rd_stats, int is_skip_build_pred) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + RD_STATS tmp_rd_stats; + av1_init_rd_stats(&tmp_rd_stats); + + // Skip inter predictor if the predictor is already available. + if (!is_skip_build_pred) { + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + plane_from, plane_to); + } + + model_rd_sb_fn[cpi->sf.rt_sf.use_simple_rd_model + ? MODELRD_LEGACY + : MODELRD_TYPE_INTERP_FILTER]( + cpi, bsize, x, xd, plane_from, plane_to, &tmp_rd_stats.rate, + &tmp_rd_stats.dist, &tmp_rd_stats.skip_txfm, &tmp_rd_stats.sse, NULL, + NULL, NULL); + + av1_merge_rd_stats(rd_stats, &tmp_rd_stats); +} + +// calculate the rdcost of given interpolation_filter +static INLINE int64_t interpolation_filter_rd( + MACROBLOCK *const x, const AV1_COMP *const cpi, + const TileDataEnc *tile_data, BLOCK_SIZE bsize, + const BUFFER_SET *const orig_dst, int64_t *const rd, + RD_STATS *rd_stats_luma, RD_STATS *rd_stats, int *const switchable_rate, + const BUFFER_SET *dst_bufs[2], int filter_idx, const int switchable_ctx[2], + const int skip_pred) { + const AV1_COMMON *cm = &cpi->common; + const InterpSearchFlags *interp_search_flags = &cpi->interp_search_flags; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + RD_STATS this_rd_stats_luma, this_rd_stats; + + // Initialize rd_stats structures to default values. 
+ av1_init_rd_stats(&this_rd_stats_luma); + this_rd_stats = *rd_stats_luma; + const int_interpfilters last_best = mbmi->interp_filters; + mbmi->interp_filters = filter_sets[filter_idx]; + const int tmp_rs = + get_switchable_rate(x, mbmi->interp_filters, switchable_ctx, + cm->seq_params->enable_dual_filter); + + int64_t min_rd = RDCOST(x->rdmult, tmp_rs, 0); + if (min_rd > *rd) { + mbmi->interp_filters = last_best; + return 0; + } + + (void)tile_data; + + assert(skip_pred != 2); + assert((rd_stats_luma->rate >= 0) && (rd_stats->rate >= 0)); + assert((rd_stats_luma->dist >= 0) && (rd_stats->dist >= 0)); + assert((rd_stats_luma->sse >= 0) && (rd_stats->sse >= 0)); + assert((rd_stats_luma->skip_txfm == 0) || (rd_stats_luma->skip_txfm == 1)); + assert((rd_stats->skip_txfm == 0) || (rd_stats->skip_txfm == 1)); + assert((skip_pred >= 0) && + (skip_pred <= interp_search_flags->default_interp_skip_flags)); + + // When skip_txfm pred is equal to default_interp_skip_flags, + // skip both luma and chroma MC. + // For mono-chrome images: + // num_planes = 1 and cpi->default_interp_skip_flags = 1, + // skip_pred = 1: skip both luma and chroma + // skip_pred = 0: Evaluate luma and as num_planes=1, + // skip chroma evaluation + int tmp_skip_pred = + (skip_pred == interp_search_flags->default_interp_skip_flags) + ? INTERP_SKIP_LUMA_SKIP_CHROMA + : skip_pred; + + switch (tmp_skip_pred) { + case INTERP_EVAL_LUMA_EVAL_CHROMA: + // skip_pred = 0: Evaluate both luma and chroma. + // Luma MC + interp_model_rd_eval(x, cpi, bsize, orig_dst, AOM_PLANE_Y, AOM_PLANE_Y, + &this_rd_stats_luma, 0); + this_rd_stats = this_rd_stats_luma; +#if CONFIG_COLLECT_RD_STATS == 3 + RD_STATS rd_stats_y; + av1_pick_recursive_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, + INT64_MAX); + PrintPredictionUnitStats(cpi, tile_data, x, &rd_stats_y, bsize); +#endif // CONFIG_COLLECT_RD_STATS == 3 + AOM_FALLTHROUGH_INTENDED; + case INTERP_SKIP_LUMA_EVAL_CHROMA: + // skip_pred = 1: skip luma evaluation (retain previous best luma stats) + // and do chroma evaluation. 
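+ // (Planes 1..num_planes-1 are the chroma planes; each plane is evaluated + // separately so that the running RD total can terminate the loop early.)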
+ for (int plane = 1; plane < num_planes; ++plane) { + int64_t tmp_rd = + RDCOST(x->rdmult, tmp_rs + this_rd_stats.rate, this_rd_stats.dist); + if (tmp_rd >= *rd) { + mbmi->interp_filters = last_best; + return 0; + } + interp_model_rd_eval(x, cpi, bsize, orig_dst, plane, plane, + &this_rd_stats, 0); + } + break; + case INTERP_SKIP_LUMA_SKIP_CHROMA: + // both luma and chroma evaluation is skipped + this_rd_stats = *rd_stats; + break; + case INTERP_EVAL_INVALID: + default: assert(0); return 0; + } + int64_t tmp_rd = + RDCOST(x->rdmult, tmp_rs + this_rd_stats.rate, this_rd_stats.dist); + + if (tmp_rd < *rd) { + *rd = tmp_rd; + *switchable_rate = tmp_rs; + if (skip_pred != interp_search_flags->default_interp_skip_flags) { + if (skip_pred == INTERP_EVAL_LUMA_EVAL_CHROMA) { + // Overwrite the data as current filter is the best one + *rd_stats_luma = this_rd_stats_luma; + *rd_stats = this_rd_stats; + // As luma MC data is computed, no need to recompute after the search + x->recalc_luma_mc_data = 0; + } else if (skip_pred == INTERP_SKIP_LUMA_EVAL_CHROMA) { + // As luma MC data is not computed, update of luma data can be skipped + *rd_stats = this_rd_stats; + // As luma MC data is not recomputed and current filter is the best, + // indicate the possibility of recomputing MC data + // If current buffer contains valid MC data, toggle to indicate that + // luma MC data needs to be recomputed + x->recalc_luma_mc_data ^= 1; + } + swap_dst_buf(xd, dst_bufs, num_planes); + } + return 1; + } + mbmi->interp_filters = last_best; + return 0; +} + +static INLINE INTERP_PRED_TYPE is_pred_filter_search_allowed( + const AV1_COMP *const cpi, MACROBLOCKD *xd, BLOCK_SIZE bsize, + int_interpfilters *af, int_interpfilters *lf) { + const AV1_COMMON *cm = &cpi->common; + const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; + const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; + const int bsl = mi_size_wide_log2[bsize]; + int is_horiz_eq = 0, is_vert_eq = 0; + + if (above_mbmi && is_inter_block(above_mbmi)) + *af = above_mbmi->interp_filters; + + if (left_mbmi && is_inter_block(left_mbmi)) *lf = left_mbmi->interp_filters; + + if (af->as_filters.x_filter != INTERP_INVALID) + is_horiz_eq = af->as_filters.x_filter == lf->as_filters.x_filter; + if (af->as_filters.y_filter != INTERP_INVALID) + is_vert_eq = af->as_filters.y_filter == lf->as_filters.y_filter; + + INTERP_PRED_TYPE pred_filter_type = (is_vert_eq << 1) + is_horiz_eq; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + int pred_filter_enable = + cpi->sf.interp_sf.cb_pred_filter_search + ? 
(((mi_row + mi_col) >> bsl) + + get_chessboard_index(cm->current_frame.frame_number)) & + 0x1 + : 0; + pred_filter_enable &= is_horiz_eq || is_vert_eq; + // pred_filter_search = 0: pred_filter is disabled + // pred_filter_search = 1: pred_filter is enabled and only horz pred matching + // pred_filter_search = 2: pred_filter is enabled and only vert pred matching + // pred_filter_search = 3: pred_filter is enabled and + // both vert, horz pred matching + return pred_filter_enable * pred_filter_type; +} + +static DUAL_FILTER_TYPE find_best_interp_rd_facade( + MACROBLOCK *const x, const AV1_COMP *const cpi, + const TileDataEnc *tile_data, BLOCK_SIZE bsize, + const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y, + RD_STATS *rd_stats, int *const switchable_rate, + const BUFFER_SET *dst_bufs[2], const int switchable_ctx[2], + const int skip_pred, uint16_t allow_interp_mask, int is_w4_or_h4) { + int tmp_skip_pred = skip_pred; + DUAL_FILTER_TYPE best_filt_type = REG_REG; + + // If no filters are set to be evaluated, return from the function + if (allow_interp_mask == 0x0) return best_filt_type; + // If block width or height is 4, skip the pred evaluation of SHARP_SHARP + tmp_skip_pred = is_w4_or_h4 + ? cpi->interp_search_flags.default_interp_skip_flags + : skip_pred; + + // Loop over all filter types and evaluate only the allowed filter types + for (int filt_type = SHARP_SHARP; filt_type >= REG_REG; --filt_type) { + const int is_filter_allowed = + get_interp_filter_allowed_mask(allow_interp_mask, filt_type); + if (is_filter_allowed) + if (interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd, + rd_stats_y, rd_stats, switchable_rate, + dst_bufs, filt_type, switchable_ctx, + tmp_skip_pred)) + best_filt_type = filt_type; + tmp_skip_pred = skip_pred; + } + return best_filt_type; +} + +static INLINE void pred_dual_interp_filter_rd( + MACROBLOCK *const x, const AV1_COMP *const cpi, + const TileDataEnc *tile_data, BLOCK_SIZE bsize, + const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y, + RD_STATS *rd_stats, int *const switchable_rate, + const BUFFER_SET *dst_bufs[2], const int switchable_ctx[2], + const int skip_pred, INTERP_PRED_TYPE pred_filt_type, int_interpfilters *af, + int_interpfilters *lf) { + (void)lf; + assert(pred_filt_type > INTERP_HORZ_NEQ_VERT_NEQ); + assert(pred_filt_type < INTERP_PRED_TYPE_ALL); + uint16_t allowed_interp_mask = 0; + + if (pred_filt_type == INTERP_HORZ_EQ_VERT_NEQ) { + // pred_filter_search = 1: Only horizontal filter is matching + allowed_interp_mask = + av1_interp_dual_filt_mask[pred_filt_type - 1][af->as_filters.x_filter]; + } else if (pred_filt_type == INTERP_HORZ_NEQ_VERT_EQ) { + // pred_filter_search = 2: Only vertical filter is matching + allowed_interp_mask = + av1_interp_dual_filt_mask[pred_filt_type - 1][af->as_filters.y_filter]; + } else { + // pred_filter_search = 3: Both horizontal and vertical filters are matching + int filt_type = + af->as_filters.x_filter + af->as_filters.y_filter * SWITCHABLE_FILTERS; + set_interp_filter_allowed_mask(&allowed_interp_mask, filt_type); + } + // REG_REG has already been evaluated at the beginning + reset_interp_filter_allowed_mask(&allowed_interp_mask, REG_REG); + find_best_interp_rd_facade(x, cpi, tile_data, bsize, orig_dst, rd, rd_stats_y, + rd_stats, switchable_rate, dst_bufs, + switchable_ctx, skip_pred, allowed_interp_mask, 0); +} +// Evaluate dual filter type +// a) Using the above and left block interp filters +// b) Find the best horizontal filter and +// then evaluate
corresponding vertical filters. +static INLINE void fast_dual_interp_filter_rd( + MACROBLOCK *const x, const AV1_COMP *const cpi, + const TileDataEnc *tile_data, BLOCK_SIZE bsize, + const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y, + RD_STATS *rd_stats, int *const switchable_rate, + const BUFFER_SET *dst_bufs[2], const int switchable_ctx[2], + const int skip_hor, const int skip_ver) { + const InterpSearchFlags *interp_search_flags = &cpi->interp_search_flags; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + INTERP_PRED_TYPE pred_filter_type = INTERP_HORZ_NEQ_VERT_NEQ; + int_interpfilters af = av1_broadcast_interp_filter(INTERP_INVALID); + int_interpfilters lf = af; + + if (!have_newmv_in_inter_mode(mbmi->mode)) { + pred_filter_type = is_pred_filter_search_allowed(cpi, xd, bsize, &af, &lf); + } + + if (pred_filter_type) { + pred_dual_interp_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd, + rd_stats_y, rd_stats, switchable_rate, dst_bufs, + switchable_ctx, (skip_hor & skip_ver), + pred_filter_type, &af, &lf); + } else { + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + int best_dual_mode = 0; + int skip_pred = + bw <= 4 ? interp_search_flags->default_interp_skip_flags : skip_hor; + // TODO(any): Make use of find_best_interp_rd_facade() + // if speed impact is negligible + for (int i = (SWITCHABLE_FILTERS - 1); i >= 1; --i) { + if (interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd, + rd_stats_y, rd_stats, switchable_rate, + dst_bufs, i, switchable_ctx, skip_pred)) { + best_dual_mode = i; + } + skip_pred = skip_hor; + } + // From best of horizontal EIGHTTAP_REGULAR modes, check vertical modes + skip_pred = + bh <= 4 ? interp_search_flags->default_interp_skip_flags : skip_ver; + for (int i = (best_dual_mode + (SWITCHABLE_FILTERS * 2)); + i >= (best_dual_mode + SWITCHABLE_FILTERS); i -= SWITCHABLE_FILTERS) { + interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd, + rd_stats_y, rd_stats, switchable_rate, dst_bufs, + i, switchable_ctx, skip_pred); + skip_pred = skip_ver; + } + } +} + +// Find the best interp filter if dual_interp_filter = 0 +static INLINE void find_best_non_dual_interp_filter( + MACROBLOCK *const x, const AV1_COMP *const cpi, + const TileDataEnc *tile_data, BLOCK_SIZE bsize, + const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y, + RD_STATS *rd_stats, int *const switchable_rate, + const BUFFER_SET *dst_bufs[2], const int switchable_ctx[2], + const int skip_ver, const int skip_hor) { + const InterpSearchFlags *interp_search_flags = &cpi->interp_search_flags; + int8_t i; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + + uint16_t interp_filter_search_mask = + interp_search_flags->interp_filter_search_mask; + + if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) { + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + const int ctx0 = av1_get_pred_context_switchable_interp(xd, 0); + const int ctx1 = av1_get_pred_context_switchable_interp(xd, 1); + int use_actual_frame_probs = 1; + const int *switchable_interp_p0; + const int *switchable_interp_p1; +#if CONFIG_FPMT_TEST + use_actual_frame_probs = + (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 
0 : 1; + if (!use_actual_frame_probs) { + switchable_interp_p0 = (int *)cpi->ppi->temp_frame_probs + .switchable_interp_probs[update_type][ctx0]; + switchable_interp_p1 = (int *)cpi->ppi->temp_frame_probs + .switchable_interp_probs[update_type][ctx1]; + } +#endif + if (use_actual_frame_probs) { + switchable_interp_p0 = + cpi->ppi->frame_probs.switchable_interp_probs[update_type][ctx0]; + switchable_interp_p1 = + cpi->ppi->frame_probs.switchable_interp_probs[update_type][ctx1]; + } + static const int thr[7] = { 0, 8, 8, 8, 8, 0, 8 }; + const int thresh = thr[update_type]; + for (i = 0; i < SWITCHABLE_FILTERS; i++) { + // For non-dual case, the 2 dir's prob should be identical. + assert(switchable_interp_p0[i] == switchable_interp_p1[i]); + if (switchable_interp_p0[i] < thresh && + switchable_interp_p1[i] < thresh) { + DUAL_FILTER_TYPE filt_type = i + SWITCHABLE_FILTERS * i; + reset_interp_filter_allowed_mask(&interp_filter_search_mask, filt_type); + } + } + } + + // Regular filter evaluation should have been done and hence the same should + // be the winner + assert(x->e_mbd.mi[0]->interp_filters.as_int == filter_sets[0].as_int); + if ((skip_hor & skip_ver) != interp_search_flags->default_interp_skip_flags) { + INTERP_PRED_TYPE pred_filter_type = INTERP_HORZ_NEQ_VERT_NEQ; + int_interpfilters af = av1_broadcast_interp_filter(INTERP_INVALID); + int_interpfilters lf = af; + + pred_filter_type = is_pred_filter_search_allowed(cpi, xd, bsize, &af, &lf); + if (pred_filter_type) { + assert(af.as_filters.x_filter != INTERP_INVALID); + int filter_idx = SWITCHABLE * af.as_filters.x_filter; + // This assert tells that (filter_x == filter_y) for non-dual filter case + assert(filter_sets[filter_idx].as_filters.x_filter == + filter_sets[filter_idx].as_filters.y_filter); + if (cpi->sf.interp_sf.adaptive_interp_filter_search && + !(get_interp_filter_allowed_mask(interp_filter_search_mask, + filter_idx))) { + return; + } + if (filter_idx) { + interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd, + rd_stats_y, rd_stats, switchable_rate, dst_bufs, + filter_idx, switchable_ctx, + (skip_hor & skip_ver)); + } + return; + } + } + // Reuse regular filter's modeled rd data for sharp filter for following + // cases + // 1) When bsize is 4x4 + // 2) When block width is 4 (i.e. 4x8/4x16 blocks) and MV in vertical + // direction is full-pel + // 3) When block height is 4 (i.e. 
8x4/16x4 blocks) and MV in horizontal + // direction is full-pel + // TODO(any): Optimize cases 2 and 3 further if luma MV in relevant direction + // alone is full-pel + + if ((bsize == BLOCK_4X4) || + (block_size_wide[bsize] == 4 && + skip_ver == interp_search_flags->default_interp_skip_flags) || + (block_size_high[bsize] == 4 && + skip_hor == interp_search_flags->default_interp_skip_flags)) { + int skip_pred = skip_hor & skip_ver; + uint16_t allowed_interp_mask = 0; + + // REG_REG filter type is evaluated beforehand, hence skip it + set_interp_filter_allowed_mask(&allowed_interp_mask, SHARP_SHARP); + set_interp_filter_allowed_mask(&allowed_interp_mask, SMOOTH_SMOOTH); + if (cpi->sf.interp_sf.adaptive_interp_filter_search) + allowed_interp_mask &= interp_filter_search_mask; + + find_best_interp_rd_facade(x, cpi, tile_data, bsize, orig_dst, rd, + rd_stats_y, rd_stats, switchable_rate, dst_bufs, + switchable_ctx, skip_pred, allowed_interp_mask, + 1); + } else { + int skip_pred = (skip_hor & skip_ver); + for (i = (SWITCHABLE_FILTERS + 1); i < DUAL_FILTER_SET_SIZE; + i += (SWITCHABLE_FILTERS + 1)) { + // This assert tells that (filter_x == filter_y) for non-dual filter case + assert(filter_sets[i].as_filters.x_filter == + filter_sets[i].as_filters.y_filter); + if (cpi->sf.interp_sf.adaptive_interp_filter_search && + !(get_interp_filter_allowed_mask(interp_filter_search_mask, i))) { + continue; + } + interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd, + rd_stats_y, rd_stats, switchable_rate, dst_bufs, + i, switchable_ctx, skip_pred); + // In the first iteration, the smooth filter is evaluated. If the smooth + // filter (which is less sharp) is the winner among the regular and + // smooth filters, sharp filter evaluation is skipped + // TODO(any): Refine this gating based on modelled rd only (i.e., by not + // accounting switchable filter rate) + if (cpi->sf.interp_sf.skip_sharp_interp_filter_search && + skip_pred != interp_search_flags->default_interp_skip_flags) { + if (mbmi->interp_filters.as_int == filter_sets[SMOOTH_SMOOTH].as_int) + break; + } + } + } +} + +static INLINE void calc_interp_skip_pred_flag(MACROBLOCK *const x, + const AV1_COMP *const cpi, + int *skip_hor, int *skip_ver) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int num_planes = av1_num_planes(cm); + const int is_compound = has_second_ref(mbmi); + assert(is_intrabc_block(mbmi) == 0); + for (int ref = 0; ref < 1 + is_compound; ++ref) { + const struct scale_factors *const sf = + get_ref_scale_factors_const(cm, mbmi->ref_frame[ref]); + // TODO(any): Refine skip flag calculation considering scaling + if (av1_is_scaled(sf)) { + *skip_hor = 0; + *skip_ver = 0; + break; + } + const MV mv = mbmi->mv[ref].as_mv; + int skip_hor_plane = 0; + int skip_ver_plane = 0; + for (int plane_idx = 0; plane_idx < AOMMAX(1, (num_planes - 1)); + ++plane_idx) { + struct macroblockd_plane *const pd = &xd->plane[plane_idx]; + const int bw = pd->width; + const int bh = pd->height; + const MV mv_q4 = clamp_mv_to_umv_border_sb( + xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y); + const int sub_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS; + const int sub_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS; + skip_hor_plane |= ((sub_x == 0) << plane_idx); + skip_ver_plane |= ((sub_y == 0) << plane_idx); + } + *skip_hor &= skip_hor_plane; + *skip_ver &= skip_ver_plane; + // It is not valid that "luma MV is sub-pel, whereas chroma MV is not" + assert(*skip_hor != 2); +
assert(*skip_ver != 2); + } + // When the compound prediction type is compound segment wedge, luma MC and + // chroma MC need to go hand in hand as the mask generated during luma MC is + // required for chroma MC. If skip_hor = 0 and skip_ver = 1, the mask used for + // chroma MC during the vertical filter decision may be incorrect as temporary + // MC evaluation overwrites the mask. Set skip_ver to 0 for this case so that + // the mask is populated during luma MC + if (is_compound && mbmi->compound_idx == 1 && + mbmi->interinter_comp.type == COMPOUND_DIFFWTD) { + assert(mbmi->comp_group_idx == 1); + if (*skip_hor == 0 && *skip_ver == 1) *skip_ver = 0; + } +} + +/*!\brief AV1 interpolation filter search + * + * \ingroup inter_mode_search + * + * \param[in] cpi Top-level encoder structure. + * \param[in] tile_data Pointer to struct holding adaptive + * data/contexts/models for the tile during + * encoding. + * \param[in] x Pointer to struct holding all the data for + * the current macroblock. + * \param[in] bsize Current block size. + * \param[in] tmp_dst A temporary prediction buffer to hold a + * computed prediction. + * \param[in,out] orig_dst A prediction buffer to hold a computed + * prediction. This will eventually hold the + * final prediction, and the tmp_dst info will + * be copied here. + * \param[in,out] rd The RD cost associated with the selected + * interpolation filter parameters. + * \param[in,out] switchable_rate The rate associated with using a SWITCHABLE + * filter mode. + * \param[in,out] skip_build_pred Indicates whether or not to build the inter + * predictor. If this is 0, the inter predictor + * has already been built and thus we can avoid + * repeating computation. + * \param[in] args HandleInterModeArgs struct holding + * miscellaneous arguments for inter mode + * search. See the documentation for this + * struct for a description of each member. + * \param[in] ref_best_rd Best RD found so far for this block. + * It is used for early termination of this + * search if the RD exceeds this value. + * + * \return Returns INT64_MAX if the filter parameters are invalid and the + * current motion mode being tested should be skipped. It returns 0 if the + * parameter search is a success.
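+ * + * A rough call-context sketch (hypothetical caller state; the actual call + * site is in the inter mode search in rdopt.c): + * int switchable_rate; + * int64_t rd = INT64_MAX; + * int skip_build_pred = 0; + * if (av1_interpolation_filter_search(x, cpi, tile_data, bsize, &tmp_dst, + * &orig_dst, &rd, &switchable_rate, + * &skip_build_pred, args, + * ref_best_rd) == INT64_MAX) { + * // Skip this motion mode. + * }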
+ */ +int64_t av1_interpolation_filter_search( + MACROBLOCK *const x, const AV1_COMP *const cpi, + const TileDataEnc *tile_data, BLOCK_SIZE bsize, + const BUFFER_SET *const tmp_dst, const BUFFER_SET *const orig_dst, + int64_t *const rd, int *const switchable_rate, int *skip_build_pred, + HandleInterModeArgs *args, int64_t ref_best_rd) { + const AV1_COMMON *cm = &cpi->common; + const InterpSearchFlags *interp_search_flags = &cpi->interp_search_flags; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int need_search = av1_is_interp_needed(xd); + const int ref_frame = xd->mi[0]->ref_frame[0]; + RD_STATS rd_stats_luma, rd_stats; + + // Initialization of rd_stats structures with default values + av1_init_rd_stats(&rd_stats_luma); + av1_init_rd_stats(&rd_stats); + + int match_found_idx = -1; + const InterpFilter assign_filter = cm->features.interp_filter; + + match_found_idx = av1_find_interp_filter_match( + mbmi, cpi, assign_filter, need_search, args->interp_filter_stats, + args->interp_filter_stats_idx); + + if (match_found_idx != -1) { + *rd = args->interp_filter_stats[match_found_idx].rd; + x->pred_sse[ref_frame] = + args->interp_filter_stats[match_found_idx].pred_sse; + *skip_build_pred = 0; + return 0; + } + + int switchable_ctx[2]; + switchable_ctx[0] = av1_get_pred_context_switchable_interp(xd, 0); + switchable_ctx[1] = av1_get_pred_context_switchable_interp(xd, 1); + *switchable_rate = + get_switchable_rate(x, mbmi->interp_filters, switchable_ctx, + cm->seq_params->enable_dual_filter); + + // Do MC evaluation for default filter_type. + // Luma MC + interp_model_rd_eval(x, cpi, bsize, orig_dst, AOM_PLANE_Y, AOM_PLANE_Y, + &rd_stats_luma, *skip_build_pred); + +#if CONFIG_COLLECT_RD_STATS == 3 + RD_STATS rd_stats_y; + av1_pick_recursive_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); + PrintPredictionUnitStats(cpi, tile_data, x, &rd_stats_y, bsize); +#endif // CONFIG_COLLECT_RD_STATS == 3 + // Chroma MC + if (num_planes > 1) { + interp_model_rd_eval(x, cpi, bsize, orig_dst, AOM_PLANE_U, AOM_PLANE_V, + &rd_stats, *skip_build_pred); + } + *skip_build_pred = 1; + + av1_merge_rd_stats(&rd_stats, &rd_stats_luma); + + assert(rd_stats.rate >= 0); + + *rd = RDCOST(x->rdmult, *switchable_rate + rd_stats.rate, rd_stats.dist); + x->pred_sse[ref_frame] = (unsigned int)(rd_stats_luma.sse >> 4); + + if (assign_filter != SWITCHABLE || match_found_idx != -1) { + return 0; + } + if (!need_search) { + int_interpfilters filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + assert(mbmi->interp_filters.as_int == filters.as_int); + (void)filters; + return 0; + } + if (args->modelled_rd != NULL) { + if (has_second_ref(mbmi)) { + const int ref_mv_idx = mbmi->ref_mv_idx; + MV_REFERENCE_FRAME *refs = mbmi->ref_frame; + const int mode0 = compound_ref0_mode(mbmi->mode); + const int mode1 = compound_ref1_mode(mbmi->mode); + const int64_t mrd = AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]], + args->modelled_rd[mode1][ref_mv_idx][refs[1]]); + if ((*rd >> 1) > mrd && ref_best_rd < INT64_MAX) { + return INT64_MAX; + } + } + } + + x->recalc_luma_mc_data = 0; + // skip_flag=xx (in binary form) + // Setting the 0th bit corresponds to skipping luma MC, and setting the 1st + // bit corresponds to skipping chroma MC. + // skip_flag=0 corresponds to "Don't skip luma and chroma MC". + // skip_flag=1 corresponds to "Skip luma MC only". + // skip_flag=2 is not a valid case. + // skip_flag=3 corresponds to "Skip both luma and chroma MC". + int skip_hor
= interp_search_flags->default_interp_skip_flags; + int skip_ver = interp_search_flags->default_interp_skip_flags; + calc_interp_skip_pred_flag(x, cpi, &skip_hor, &skip_ver); + + // do interp_filter search + restore_dst_buf(xd, *tmp_dst, num_planes); + const BUFFER_SET *dst_bufs[2] = { tmp_dst, orig_dst }; + // Evaluate dual interp filters + if (cm->seq_params->enable_dual_filter) { + if (cpi->sf.interp_sf.use_fast_interpolation_filter_search) { + fast_dual_interp_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd, + &rd_stats_luma, &rd_stats, switchable_rate, + dst_bufs, switchable_ctx, skip_hor, skip_ver); + } else { + // Use full interpolation filter search + uint16_t allowed_interp_mask = ALLOW_ALL_INTERP_FILT_MASK; + // REG_REG filter type is evaluated beforehand, so the loop is repeated + // over REG_SMOOTH to SHARP_SHARP for full interpolation filter search + reset_interp_filter_allowed_mask(&allowed_interp_mask, REG_REG); + find_best_interp_rd_facade(x, cpi, tile_data, bsize, orig_dst, rd, + &rd_stats_luma, &rd_stats, switchable_rate, + dst_bufs, switchable_ctx, + (skip_hor & skip_ver), allowed_interp_mask, 0); + } + } else { + // Evaluate non-dual interp filters + find_best_non_dual_interp_filter( + x, cpi, tile_data, bsize, orig_dst, rd, &rd_stats_luma, &rd_stats, + switchable_rate, dst_bufs, switchable_ctx, skip_ver, skip_hor); + } + swap_dst_buf(xd, dst_bufs, num_planes); + // Recompute final MC data if required + if (x->recalc_luma_mc_data == 1) { + // Recomputing final luma MC data is required only if the same was skipped + // in either of the directions. The condition below is necessary, but not + // sufficient + assert((skip_hor == 1) || (skip_ver == 1)); + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + } + x->pred_sse[ref_frame] = (unsigned int)(rd_stats_luma.sse >> 4); + + // save search results + if (cpi->sf.interp_sf.use_interp_filter) { + assert(match_found_idx == -1); + args->interp_filter_stats_idx = save_interp_filter_search_stat( + mbmi, *rd, x->pred_sse[ref_frame], args->interp_filter_stats, + args->interp_filter_stats_idx); + } + return 0; +} diff --git a/third_party/aom/av1/encoder/interp_search.h b/third_party/aom/av1/encoder/interp_search.h new file mode 100644 index 0000000000..9815e0bcfb --- /dev/null +++ b/third_party/aom/av1/encoder/interp_search.h @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#ifndef AOM_AV1_ENCODER_INTERP_FILTER_SEARCH_H_ +#define AOM_AV1_ENCODER_INTERP_FILTER_SEARCH_H_ + +#include "av1/encoder/block.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/rdopt_utils.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*!\cond */ +#define MAX_INTERP_FILTER_STATS 128 +#define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS) + +typedef struct { + int_interpfilters filters; + int_mv mv[2]; + int8_t ref_frames[2]; + COMPOUND_TYPE comp_type; + int compound_idx; + int64_t rd; + unsigned int pred_sse; +} INTERPOLATION_FILTER_STATS; +/*!\endcond */ + +/*!\brief Miscellaneous arguments for inter mode search. + */ +typedef struct HandleInterModeArgs { + /*! + * Buffer for the above predictor in OBMC + */ + uint8_t *above_pred_buf[MAX_MB_PLANE]; + /*! + * Stride for the above predictor in OBMC + */ + int above_pred_stride[MAX_MB_PLANE]; + /*! + * Buffer for the left predictor in OBMC + */ + uint8_t *left_pred_buf[MAX_MB_PLANE]; + /*! + * Stride for the left predictor in OBMC + */ + int left_pred_stride[MAX_MB_PLANE]; + /*! + * Pointer to the first member in a 2D array which holds + * single reference mode motion vectors to be used as a starting + * point in the mv search for compound modes. Each array is length REF_FRAMES, + * meaning there is a slot for a single reference motion vector for + * each possible reference frame. The 2D array consists of N of these arrays, + * where N is the length of the reference mv stack computed for the single + * reference case for that particular reference frame. + */ + int_mv (*single_newmv)[REF_FRAMES]; + /*! + * Pointer to the first array of a 2D array with the same setup as + * single_newmv array above. This is a 2D array to hold the rate + * corresponding to each of the single reference mode motion vectors + * held in single_newmv. + */ + int (*single_newmv_rate)[REF_FRAMES]; + /*! + * Pointer to the first array of a 2D array with the same setup as + * single_newmv array above. This is a 2D array to hold a 0 or 1 + * validity value corresponding to each of the single reference mode motion + * vectors held in single_newmv. + */ + int (*single_newmv_valid)[REF_FRAMES]; + /*! + * Pointer to the first array in a 3D array of predicted rate-distortion. + * The dimensions of this structure are: + * (number of possible inter modes) X + * (number of reference MVs) X + * (number of reference frames). + */ + int64_t (*modelled_rd)[MAX_REF_MV_SEARCH][REF_FRAMES]; + /*! + * Holds an estimated entropy cost for picking the current reference frame. + * This is used to compute an rd estimate. + */ + int ref_frame_cost; + /*! + * Holds an estimated entropy cost for picking single or compound + * reference. This is used to compute an rd estimate. + */ + int single_comp_cost; + /*! + * Pointer to the first element in a 3D array holding rd's of + * SIMPLE_TRANSLATION used to prune out the motion mode search in single ref + * modes used to determine compound ref modes. The full structure is: + * (number of inter modes) X (length of refmv list) X (number of ref frames) + */ + int64_t (*simple_rd)[MAX_REF_MV_SEARCH][REF_FRAMES]; + /*! + * An integer value 0 or 1 which indicates whether or not to skip the motion + * mode search and default to SIMPLE_TRANSLATION as a speed feature. + */ + int skip_motion_mode; + /*! + * Initialized to false. If true, skips interpolation filter search and uses + * the default EIGHTTAP_REGULAR. + */ + bool skip_ifs; + /*! + * A pointer to the first element in an array of INTERINTRA_MODE types. 
This + * contains the best inter_intra mode for each reference frame. + */ + INTERINTRA_MODE *inter_intra_mode; + /*! + * Array of saved interpolation filter stats collected to avoid repeating + * an interpolation filter search when the mv and ref_frame are the same + * as a previous search. + */ + INTERPOLATION_FILTER_STATS interp_filter_stats[MAX_INTERP_FILTER_STATS]; + + /*! + * Stack to store full pixel search start mv of NEWMV mode. + */ + FULLPEL_MV start_mv_stack[(MAX_REF_MV_SEARCH - 1) * 2]; + + /*! + * Stack to store ref_mv_idx of NEWMV mode. + */ + uint8_t ref_mv_idx_stack[(MAX_REF_MV_SEARCH - 1) * 2]; + + /*! + * Count of mvs in start mv stack. + */ + int start_mv_cnt; + + /*! + * Index of the last set of saved stats in the interp_filter_stats array. + */ + int interp_filter_stats_idx; + /*! + * Estimated wedge index. + */ + int wedge_index; + /*! + * Estimated wedge sign. + */ + int wedge_sign; + /*! + * Estimated diff wtd index. + */ + int diffwtd_index; + /*! + * Estimated cmp mode. + */ + int cmp_mode[MODE_CTX_REF_FRAMES]; + /*! + * The best sse during single new_mv search. Note that the sse here comes from + * single_motion_search, and not from interpolation_filter_search. This has + * two implications: + * 1. The mv used to calculate the sse here does not have to be the best sse + * found in handle_inter_mode. + * 2. Even if the mvs agree, the sse here can differ from the sse in \ref + * MACROBLOCK::pred_sse due to different interpolation filter used. + */ + unsigned int best_single_sse_in_refs[REF_FRAMES]; + /*! + * Holds the sse of best mode so far in the mode evaluation process. This is + * used in intermediate termination of NEWMV mode evaluation. + */ + unsigned int best_pred_sse; +} HandleInterModeArgs; + +/*!\cond */ +static const int_interpfilters filter_sets[DUAL_FILTER_SET_SIZE] = { + { 0x00000000 }, { 0x00010000 }, { 0x00020000 }, // y = 0 + { 0x00000001 }, { 0x00010001 }, { 0x00020001 }, // y = 1 + { 0x00000002 }, { 0x00010002 }, { 0x00020002 }, // y = 2 +}; + +int av1_find_interp_filter_match( + MB_MODE_INFO *const mbmi, const AV1_COMP *const cpi, + const InterpFilter assign_filter, const int need_search, + INTERPOLATION_FILTER_STATS *interp_filter_stats, + int interp_filter_stats_idx); + +int64_t av1_interpolation_filter_search( + MACROBLOCK *const x, const AV1_COMP *const cpi, + const TileDataEnc *tile_data, BLOCK_SIZE bsize, + const BUFFER_SET *const tmp_dst, const BUFFER_SET *const orig_dst, + int64_t *const rd, int *const switchable_rate, int *skip_build_pred, + HandleInterModeArgs *args, int64_t ref_best_rd); + +/*!\endcond */ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_INTERP_FILTER_SEARCH_H_ diff --git a/third_party/aom/av1/encoder/intra_mode_search.c b/third_party/aom/av1/encoder/intra_mode_search.c new file mode 100644 index 0000000000..99b0af2f8e --- /dev/null +++ b/third_party/aom/av1/encoder/intra_mode_search.c @@ -0,0 +1,1739 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "av1/common/av1_common_int.h" +#include "av1/common/cfl.h" +#include "av1/common/reconintra.h" + +#include "av1/encoder/intra_mode_search.h" +#include "av1/encoder/intra_mode_search_utils.h" +#include "av1/encoder/palette.h" +#include "av1/encoder/speed_features.h" +#include "av1/encoder/tx_search.h" + +// Even though there are 7 delta angles, this macro is set to 9 to facilitate +// the rd threshold check to prune -3 and 3 delta angles. +#define SIZE_OF_ANGLE_DELTA_RD_COST_ARRAY (2 * MAX_ANGLE_DELTA + 3) + +// The order for evaluating delta angles while processing the luma directional +// intra modes. Currently, this order of evaluation is applicable only when +// speed feature prune_luma_odd_delta_angles_in_intra is enabled. In this case, +// even angles are evaluated first in order to facilitate the pruning of odd +// delta angles based on the rd costs of the neighboring delta angles. +static const int8_t luma_delta_angles_order[2 * MAX_ANGLE_DELTA] = { + -2, 2, -3, -1, 1, 3, +}; + +/*!\cond */ +static const PREDICTION_MODE intra_rd_search_mode_order[INTRA_MODES] = { + DC_PRED, H_PRED, V_PRED, SMOOTH_PRED, PAETH_PRED, + SMOOTH_V_PRED, SMOOTH_H_PRED, D135_PRED, D203_PRED, D157_PRED, + D67_PRED, D113_PRED, D45_PRED, +}; + +static const UV_PREDICTION_MODE uv_rd_search_mode_order[UV_INTRA_MODES] = { + UV_DC_PRED, UV_CFL_PRED, UV_H_PRED, UV_V_PRED, + UV_SMOOTH_PRED, UV_PAETH_PRED, UV_SMOOTH_V_PRED, UV_SMOOTH_H_PRED, + UV_D135_PRED, UV_D203_PRED, UV_D157_PRED, UV_D67_PRED, + UV_D113_PRED, UV_D45_PRED, +}; + +// The bitmask corresponds to the filter intra modes as defined in enums.h +// FILTER_INTRA_MODE enumeration type. Setting a bit to 0 in the mask means to +// disable the evaluation of corresponding filter intra mode. The table +// av1_derived_filter_intra_mode_used_flag is used when speed feature +// prune_filter_intra_level is 1. The evaluated filter intra modes are union +// of the following: +// 1) FILTER_DC_PRED +// 2) mode that corresponds to best mode so far of DC_PRED, V_PRED, H_PRED, +// D157_PRED and PAETH_PRED. (Eg: FILTER_V_PRED if best mode so far is V_PRED). +static const uint8_t av1_derived_filter_intra_mode_used_flag[INTRA_MODES] = { + 0x01, // DC_PRED: 0000 0001 + 0x03, // V_PRED: 0000 0011 + 0x05, // H_PRED: 0000 0101 + 0x01, // D45_PRED: 0000 0001 + 0x01, // D135_PRED: 0000 0001 + 0x01, // D113_PRED: 0000 0001 + 0x09, // D157_PRED: 0000 1001 + 0x01, // D203_PRED: 0000 0001 + 0x01, // D67_PRED: 0000 0001 + 0x01, // SMOOTH_PRED: 0000 0001 + 0x01, // SMOOTH_V_PRED: 0000 0001 + 0x01, // SMOOTH_H_PRED: 0000 0001 + 0x11 // PAETH_PRED: 0001 0001 +}; + +// The bitmask corresponds to the chroma intra modes as defined in enums.h +// UV_PREDICTION_MODE enumeration type. Setting a bit to 0 in the mask means to +// disable the evaluation of corresponding chroma intra mode. The table +// av1_derived_chroma_intra_mode_used_flag is used when speed feature +// prune_chroma_modes_using_luma_winner is enabled. The evaluated chroma +// intra modes are union of the following: +// 1) UV_DC_PRED +// 2) UV_SMOOTH_PRED +// 3) UV_CFL_PRED +// 4) mode that corresponds to luma intra mode winner (Eg : UV_V_PRED if luma +// intra mode winner is V_PRED). 
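+// For example, the V_PRED entry below is 0x2203 (binary 0010 0010 0000 0011): +// assuming the UV_PREDICTION_MODE ordering in enums.h, this sets bit 0 +// (UV_DC_PRED), bit 1 (UV_V_PRED), bit 9 (UV_SMOOTH_PRED) and bit 13 +// (UV_CFL_PRED), i.e. exactly the union described above.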
+static const uint16_t av1_derived_chroma_intra_mode_used_flag[INTRA_MODES] = { + 0x2201, // DC_PRED: 0010 0010 0000 0001 + 0x2203, // V_PRED: 0010 0010 0000 0011 + 0x2205, // H_PRED: 0010 0010 0000 0101 + 0x2209, // D45_PRED: 0010 0010 0000 1001 + 0x2211, // D135_PRED: 0010 0010 0001 0001 + 0x2221, // D113_PRED: 0010 0010 0010 0001 + 0x2241, // D157_PRED: 0010 0010 0100 0001 + 0x2281, // D203_PRED: 0010 0010 1000 0001 + 0x2301, // D67_PRED: 0010 0011 0000 0001 + 0x2201, // SMOOTH_PRED: 0010 0010 0000 0001 + 0x2601, // SMOOTH_V_PRED: 0010 0110 0000 0001 + 0x2a01, // SMOOTH_H_PRED: 0010 1010 0000 0001 + 0x3201 // PAETH_PRED: 0011 0010 0000 0001 +}; + +DECLARE_ALIGNED(16, static const uint8_t, all_zeros[MAX_SB_SIZE]) = { 0 }; +DECLARE_ALIGNED(16, static const uint16_t, + highbd_all_zeros[MAX_SB_SIZE]) = { 0 }; + +int av1_calc_normalized_variance(aom_variance_fn_t vf, const uint8_t *const buf, + const int stride, const int is_hbd) { + unsigned int sse; + + if (is_hbd) + return vf(buf, stride, CONVERT_TO_BYTEPTR(highbd_all_zeros), 0, &sse); + else + return vf(buf, stride, all_zeros, 0, &sse); +} + +// Computes average of log(1 + variance) across 4x4 sub-blocks for source and +// reconstructed blocks. +static void compute_avg_log_variance(const AV1_COMP *const cpi, MACROBLOCK *x, + const BLOCK_SIZE bs, + double *avg_log_src_variance, + double *avg_log_recon_variance) { + const MACROBLOCKD *const xd = &x->e_mbd; + const BLOCK_SIZE sb_size = cpi->common.seq_params->sb_size; + const int mi_row_in_sb = x->e_mbd.mi_row & (mi_size_high[sb_size] - 1); + const int mi_col_in_sb = x->e_mbd.mi_col & (mi_size_wide[sb_size] - 1); + const int right_overflow = + (xd->mb_to_right_edge < 0) ? ((-xd->mb_to_right_edge) >> 3) : 0; + const int bottom_overflow = + (xd->mb_to_bottom_edge < 0) ? ((-xd->mb_to_bottom_edge) >> 3) : 0; + const int bw = (MI_SIZE * mi_size_wide[bs] - right_overflow); + const int bh = (MI_SIZE * mi_size_high[bs] - bottom_overflow); + const int is_hbd = is_cur_buf_hbd(xd); + + for (int i = 0; i < bh; i += MI_SIZE) { + const int r = mi_row_in_sb + (i >> MI_SIZE_LOG2); + for (int j = 0; j < bw; j += MI_SIZE) { + const int c = mi_col_in_sb + (j >> MI_SIZE_LOG2); + const int mi_offset = r * mi_size_wide[sb_size] + c; + Block4x4VarInfo *block_4x4_var_info = + &x->src_var_info_of_4x4_sub_blocks[mi_offset]; + int src_var = block_4x4_var_info->var; + double log_src_var = block_4x4_var_info->log_var; + // Compute average of log(1 + variance) for the source block from 4x4 + // sub-block variance values. Calculate and store 4x4 sub-block variance + // and log(1 + variance), if the values present in + // src_var_of_4x4_sub_blocks are invalid. Reuse the same if it is readily + // available with valid values. + if (src_var < 0) { + src_var = av1_calc_normalized_variance( + cpi->ppi->fn_ptr[BLOCK_4X4].vf, + x->plane[0].src.buf + i * x->plane[0].src.stride + j, + x->plane[0].src.stride, is_hbd); + block_4x4_var_info->var = src_var; + log_src_var = log1p(src_var / 16.0); + block_4x4_var_info->log_var = log_src_var; + } else { + // When source variance is already calculated and available for + // retrieval, check if log(1 + variance) is also available. If it is + // available, then retrieve from buffer. Else, calculate the same and + // store to the buffer. 
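+ // (The variance functions return what is effectively the sum of squared + // deviations over the 4x4 block, so dividing by 16, the pixel count, gives + // a per-pixel value before the log1p mapping.)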
+        if (log_src_var < 0) {
+          log_src_var = log1p(src_var / 16.0);
+          block_4x4_var_info->log_var = log_src_var;
+        }
+      }
+      *avg_log_src_variance += log_src_var;
+
+      const int recon_var = av1_calc_normalized_variance(
+          cpi->ppi->fn_ptr[BLOCK_4X4].vf,
+          xd->plane[0].dst.buf + i * xd->plane[0].dst.stride + j,
+          xd->plane[0].dst.stride, is_hbd);
+      *avg_log_recon_variance += log1p(recon_var / 16.0);
+    }
+  }
+
+  const int blocks = (bw * bh) / 16;
+  *avg_log_src_variance /= (double)blocks;
+  *avg_log_recon_variance /= (double)blocks;
+}
+
+// Returns a factor to be applied to the RD value based on how well the
+// reconstructed block variance matches the source variance.
+static double intra_rd_variance_factor(const AV1_COMP *cpi, MACROBLOCK *x,
+                                       BLOCK_SIZE bs) {
+  double threshold = INTRA_RD_VAR_THRESH(cpi->oxcf.speed);
+  // For non-positive threshold values, the comparison of the source and
+  // reconstructed variances with the threshold evaluates to false
+  // (src_var < threshold / rec_var < threshold) as these metrics are greater
+  // than 0. Hence further calculations are skipped.
+  if (threshold <= 0) return 1.0;
+
+  double variance_rd_factor = 1.0;
+  double avg_log_src_variance = 0.0;
+  double avg_log_recon_variance = 0.0;
+  double var_diff = 0.0;
+
+  compute_avg_log_variance(cpi, x, bs, &avg_log_src_variance,
+                           &avg_log_recon_variance);
+
+  // Don't allow 0, to prevent a division by zero below.
+  avg_log_src_variance += 0.000001;
+  avg_log_recon_variance += 0.000001;
+
+  if (avg_log_src_variance >= avg_log_recon_variance) {
+    var_diff = (avg_log_src_variance - avg_log_recon_variance);
+    if ((var_diff > 0.5) && (avg_log_recon_variance < threshold)) {
+      variance_rd_factor = 1.0 + ((var_diff * 2) / avg_log_src_variance);
+    }
+  } else {
+    var_diff = (avg_log_recon_variance - avg_log_src_variance);
+    if ((var_diff > 0.5) && (avg_log_src_variance < threshold)) {
+      variance_rd_factor = 1.0 + (var_diff / (2 * avg_log_src_variance));
+    }
+  }
+
+  // Limit the adjustment.
+  variance_rd_factor = AOMMIN(3.0, variance_rd_factor);
+
+  return variance_rd_factor;
+}
+/*!\endcond */
+
+/*!\brief Search for the best filter_intra mode when coding an intra frame.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * This function loops through all filter_intra modes to find the best one.
+ *
+ * \return Returns 1 if a new filter_intra mode is selected; 0 otherwise.
+ */
+static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                    int *rate, int *rate_tokenonly,
+                                    int64_t *distortion, uint8_t *skippable,
+                                    BLOCK_SIZE bsize, int mode_cost,
+                                    PREDICTION_MODE best_mode_so_far,
+                                    int64_t *best_rd, int64_t *best_model_rd,
+                                    PICK_MODE_CONTEXT *ctx) {
+  // Skip the evaluation of filter intra modes.
+  if (cpi->sf.intra_sf.prune_filter_intra_level == 2) return 0;
+
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  int filter_intra_selected_flag = 0;
+  FILTER_INTRA_MODE mode;
+  TX_SIZE best_tx_size = TX_8X8;
+  FILTER_INTRA_MODE_INFO filter_intra_mode_info;
+  uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  av1_zero(filter_intra_mode_info);
+  mbmi->filter_intra_mode_info.use_filter_intra = 1;
+  mbmi->mode = DC_PRED;
+  mbmi->palette_mode_info.palette_size[0] = 0;
+
+  // Skip the evaluation of filter-intra if the cached MB_MODE_INFO does not
+  // have filter-intra as the winner.
+ if (x->use_mb_mode_cache && + !x->mb_mode_cache->filter_intra_mode_info.use_filter_intra) + return 0; + + for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) { + int64_t this_rd; + RD_STATS tokenonly_rd_stats; + mbmi->filter_intra_mode_info.filter_intra_mode = mode; + + if ((cpi->sf.intra_sf.prune_filter_intra_level == 1) && + !(av1_derived_filter_intra_mode_used_flag[best_mode_so_far] & + (1 << mode))) + continue; + + // Skip the evaluation of modes that do not match with the winner mode in + // x->mb_mode_cache. + if (x->use_mb_mode_cache && + mode != x->mb_mode_cache->filter_intra_mode_info.filter_intra_mode) + continue; + + if (model_intra_yrd_and_prune(cpi, x, bsize, best_model_rd)) { + continue; + } + av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize, + *best_rd); + if (tokenonly_rd_stats.rate == INT_MAX) continue; + const int this_rate = + tokenonly_rd_stats.rate + + intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost, 0); + this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); + + // Visual quality adjustment based on recon vs source variance. + if ((cpi->oxcf.mode == ALLINTRA) && (this_rd != INT64_MAX)) { + this_rd = (int64_t)(this_rd * intra_rd_variance_factor(cpi, x, bsize)); + } + + // Collect mode stats for multiwinner mode processing + const int txfm_search_done = 1; + store_winner_mode_stats( + &cpi->common, x, mbmi, NULL, NULL, NULL, 0, NULL, bsize, this_rd, + cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done); + if (this_rd < *best_rd) { + *best_rd = this_rd; + best_tx_size = mbmi->tx_size; + filter_intra_mode_info = mbmi->filter_intra_mode_info; + av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); + memcpy(ctx->blk_skip, x->txfm_search_info.blk_skip, + sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk); + *rate = this_rate; + *rate_tokenonly = tokenonly_rd_stats.rate; + *distortion = tokenonly_rd_stats.dist; + *skippable = tokenonly_rd_stats.skip_txfm; + filter_intra_selected_flag = 1; + } + } + + if (filter_intra_selected_flag) { + mbmi->mode = DC_PRED; + mbmi->tx_size = best_tx_size; + mbmi->filter_intra_mode_info = filter_intra_mode_info; + av1_copy_array(ctx->tx_type_map, best_tx_type_map, ctx->num_4x4_blk); + return 1; + } else { + return 0; + } +} + +void av1_count_colors(const uint8_t *src, int stride, int rows, int cols, + int *val_count, int *num_colors) { + const int max_pix_val = 1 << 8; + memset(val_count, 0, max_pix_val * sizeof(val_count[0])); + for (int r = 0; r < rows; ++r) { + for (int c = 0; c < cols; ++c) { + const int this_val = src[r * stride + c]; + assert(this_val < max_pix_val); + ++val_count[this_val]; + } + } + int n = 0; + for (int i = 0; i < max_pix_val; ++i) { + if (val_count[i]) ++n; + } + *num_colors = n; +} + +void av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, + int cols, int bit_depth, int *val_count, + int *bin_val_count, int *num_color_bins, + int *num_colors) { + assert(bit_depth <= 12); + const int max_bin_val = 1 << 8; + const int max_pix_val = 1 << bit_depth; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + memset(bin_val_count, 0, max_bin_val * sizeof(val_count[0])); + if (val_count != NULL) + memset(val_count, 0, max_pix_val * sizeof(val_count[0])); + for (int r = 0; r < rows; ++r) { + for (int c = 0; c < cols; ++c) { + /* + * Down-convert the pixels to 8-bit domain before counting. + * This provides consistency of behavior for palette search + * between lbd and hbd encodes. 
These down-converted pixels
+       * are only used for calculating the threshold (n).
+       */
+      const int this_val = ((src[r * stride + c]) >> (bit_depth - 8));
+      assert(this_val < max_bin_val);
+      if (this_val >= max_bin_val) continue;
+      ++bin_val_count[this_val];
+      if (val_count != NULL) ++val_count[(src[r * stride + c])];
+    }
+  }
+  int n = 0;
+  // Count the colors based on the 8-bit domain used to gate the palette path.
+  for (int i = 0; i < max_bin_val; ++i) {
+    if (bin_val_count[i]) ++n;
+  }
+  *num_color_bins = n;
+
+  // Count the actual hbd colors used to create top_colors.
+  n = 0;
+  if (val_count != NULL) {
+    for (int i = 0; i < max_pix_val; ++i) {
+      if (val_count[i]) ++n;
+    }
+    *num_colors = n;
+  }
+}
+
+void set_y_mode_and_delta_angle(const int mode_idx, MB_MODE_INFO *const mbmi,
+                                int reorder_delta_angle_eval) {
+  if (mode_idx < INTRA_MODE_END) {
+    mbmi->mode = intra_rd_search_mode_order[mode_idx];
+    mbmi->angle_delta[PLANE_TYPE_Y] = 0;
+  } else {
+    mbmi->mode = (mode_idx - INTRA_MODE_END) / (MAX_ANGLE_DELTA * 2) + V_PRED;
+    int delta_angle_eval_idx =
+        (mode_idx - INTRA_MODE_END) % (MAX_ANGLE_DELTA * 2);
+    if (reorder_delta_angle_eval) {
+      mbmi->angle_delta[PLANE_TYPE_Y] =
+          luma_delta_angles_order[delta_angle_eval_idx];
+    } else {
+      mbmi->angle_delta[PLANE_TYPE_Y] =
+          (delta_angle_eval_idx < 3 ? (delta_angle_eval_idx - 3)
+                                    : (delta_angle_eval_idx - 2));
+    }
+  }
+}
+
+static AOM_INLINE int get_model_rd_index_for_pruning(
+    const MACROBLOCK *const x,
+    const INTRA_MODE_SPEED_FEATURES *const intra_sf) {
+  const int top_intra_model_count_allowed =
+      intra_sf->top_intra_model_count_allowed;
+  if (!intra_sf->adapt_top_model_rd_count_using_neighbors)
+    return top_intra_model_count_allowed - 1;
+
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const PREDICTION_MODE mode = xd->mi[0]->mode;
+  int model_rd_index_for_pruning = top_intra_model_count_allowed - 1;
+  int is_left_mode_neq_cur_mode = 0, is_above_mode_neq_cur_mode = 0;
+  if (xd->left_available)
+    is_left_mode_neq_cur_mode = xd->left_mbmi->mode != mode;
+  if (xd->up_available)
+    is_above_mode_neq_cur_mode = xd->above_mbmi->mode != mode;
+  // The pruning of luma intra modes is made more aggressive at lower
+  // quantizers and vice versa. The value of model_rd_index_for_pruning is
+  // derived as follows.
+  // qidx 0 to 127: Reduce the index of a candidate used for comparison if the
+  // current mode differs from at least one of the available neighboring modes.
+  // qidx 128 to 255: Reduce the index of a candidate used for comparison only
+  // if the current mode differs from both of the available neighboring modes.
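+  // For example, with top_intra_model_count_allowed == 3 the comparison index
+  // starts at 2 and is lowered to 1 when the neighbor condition for the
+  // current qindex range holds, which tightens the pruning performed in
+  // prune_intra_y_mode().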
+  if (x->qindex <= 127) {
+    if (is_left_mode_neq_cur_mode || is_above_mode_neq_cur_mode)
+      model_rd_index_for_pruning = AOMMAX(model_rd_index_for_pruning - 1, 0);
+  } else {
+    if (is_left_mode_neq_cur_mode && is_above_mode_neq_cur_mode)
+      model_rd_index_for_pruning = AOMMAX(model_rd_index_for_pruning - 1, 0);
+  }
+  return model_rd_index_for_pruning;
+}
+
+int prune_intra_y_mode(int64_t this_model_rd, int64_t *best_model_rd,
+                       int64_t top_intra_model_rd[], int max_model_cnt_allowed,
+                       int model_rd_index_for_pruning) {
+  const double thresh_best = 1.50;
+  const double thresh_top = 1.00;
+  for (int i = 0; i < max_model_cnt_allowed; i++) {
+    if (this_model_rd < top_intra_model_rd[i]) {
+      for (int j = max_model_cnt_allowed - 1; j > i; j--) {
+        top_intra_model_rd[j] = top_intra_model_rd[j - 1];
+      }
+      top_intra_model_rd[i] = this_model_rd;
+      break;
+    }
+  }
+  if (top_intra_model_rd[model_rd_index_for_pruning] != INT64_MAX &&
+      this_model_rd >
+          thresh_top * top_intra_model_rd[model_rd_index_for_pruning])
+    return 1;
+
+  if (this_model_rd != INT64_MAX &&
+      this_model_rd > thresh_best * (*best_model_rd))
+    return 1;
+  if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd;
+  return 0;
+}
+
+// Run the RD calculation with the given chroma intra prediction angle and
+// return the RD cost. Update the best mode info if the RD cost is the best so
+// far.
+static int64_t pick_intra_angle_routine_sbuv(
+    const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+    int rate_overhead, int64_t best_rd_in, int *rate, RD_STATS *rd_stats,
+    int *best_angle_delta, int64_t *best_rd) {
+  MB_MODE_INFO *mbmi = x->e_mbd.mi[0];
+  assert(!is_inter_block(mbmi));
+  int this_rate;
+  int64_t this_rd;
+  RD_STATS tokenonly_rd_stats;
+
+  if (!av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd_in))
+    return INT64_MAX;
+  this_rate = tokenonly_rd_stats.rate +
+              intra_mode_info_cost_uv(cpi, x, mbmi, bsize, rate_overhead);
+  this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+  if (this_rd < *best_rd) {
+    *best_rd = this_rd;
+    *best_angle_delta = mbmi->angle_delta[PLANE_TYPE_UV];
+    *rate = this_rate;
+    rd_stats->rate = tokenonly_rd_stats.rate;
+    rd_stats->dist = tokenonly_rd_stats.dist;
+    rd_stats->skip_txfm = tokenonly_rd_stats.skip_txfm;
+  }
+  return this_rd;
+}
+
+/*!\brief Search for the best angle delta for chroma prediction
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * Given a chroma directional intra prediction mode, this function will try to
+ * estimate the best delta_angle.
+ *
+ * \returns Whether a new mode with a smaller rdcost than best_rd was found.
+ */
+static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                    BLOCK_SIZE bsize, int rate_overhead,
+                                    int64_t best_rd, int *rate,
+                                    RD_STATS *rd_stats) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  assert(!is_inter_block(mbmi));
+  int i, angle_delta, best_angle_delta = 0;
+  int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
+
+  rd_stats->rate = INT_MAX;
+  rd_stats->skip_txfm = 0;
+  rd_stats->dist = INT64_MAX;
+  for (i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX;
+
+  for (angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
+    for (i = 0; i < 2; ++i) {
+      best_rd_in = (best_rd == INT64_MAX)
+                       ? INT64_MAX
+                       : (best_rd + (best_rd >> ((angle_delta == 0) ? 3 : 5)));
+      mbmi->angle_delta[PLANE_TYPE_UV] = (1 - 2 * i) * angle_delta;
+      this_rd = pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead,
+                                              best_rd_in, rate, rd_stats,
+                                              &best_angle_delta, &best_rd);
+      rd_cost[2 * angle_delta + i] = this_rd;
+      if (angle_delta == 0) {
+        if (this_rd == INT64_MAX) return 0;
+        rd_cost[1] = this_rd;
+        break;
+      }
+    }
+  }
+
+  assert(best_rd != INT64_MAX);
+  for (angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
+    int64_t rd_thresh;
+    for (i = 0; i < 2; ++i) {
+      int skip_search = 0;
+      rd_thresh = best_rd + (best_rd >> 5);
+      if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh &&
+          rd_cost[2 * (angle_delta - 1) + i] > rd_thresh)
+        skip_search = 1;
+      if (!skip_search) {
+        mbmi->angle_delta[PLANE_TYPE_UV] = (1 - 2 * i) * angle_delta;
+        pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead, best_rd,
+                                      rate, rd_stats, &best_angle_delta,
+                                      &best_rd);
+      }
+    }
+  }
+
+  mbmi->angle_delta[PLANE_TYPE_UV] = best_angle_delta;
+  return rd_stats->rate != INT_MAX;
+}
+
+#define PLANE_SIGN_TO_JOINT_SIGN(plane, a, b) \
+  (plane == CFL_PRED_U ? a * CFL_SIGNS + b - 1 : b * CFL_SIGNS + a - 1)
+
+static void cfl_idx_to_sign_and_alpha(int cfl_idx, CFL_SIGN_TYPE *cfl_sign,
+                                      int *cfl_alpha) {
+  int cfl_linear_idx = cfl_idx - CFL_INDEX_ZERO;
+  if (cfl_linear_idx == 0) {
+    *cfl_sign = CFL_SIGN_ZERO;
+    *cfl_alpha = 0;
+  } else {
+    *cfl_sign = cfl_linear_idx > 0 ? CFL_SIGN_POS : CFL_SIGN_NEG;
+    *cfl_alpha = abs(cfl_linear_idx) - 1;
+  }
+}
+
+static int64_t cfl_compute_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
+                              int plane, TX_SIZE tx_size,
+                              BLOCK_SIZE plane_bsize, int cfl_idx,
+                              int fast_mode, RD_STATS *rd_stats) {
+  assert(IMPLIES(fast_mode, rd_stats == NULL));
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  int cfl_plane = get_cfl_pred_type(plane);
+  CFL_SIGN_TYPE cfl_sign;
+  int cfl_alpha;
+  cfl_idx_to_sign_and_alpha(cfl_idx, &cfl_sign, &cfl_alpha);
+  // We only build CFL for a given plane; the other plane's sign is a dummy.
+  int dummy_sign = CFL_SIGN_NEG;
+  const int8_t orig_cfl_alpha_signs = mbmi->cfl_alpha_signs;
+  const uint8_t orig_cfl_alpha_idx = mbmi->cfl_alpha_idx;
+  mbmi->cfl_alpha_signs =
+      PLANE_SIGN_TO_JOINT_SIGN(cfl_plane, cfl_sign, dummy_sign);
+  mbmi->cfl_alpha_idx = (cfl_alpha << CFL_ALPHABET_SIZE_LOG2) + cfl_alpha;
+  int64_t cfl_cost;
+  if (fast_mode) {
+    cfl_cost =
+        intra_model_rd(cm, x, plane, plane_bsize, tx_size, /*use_hadamard=*/0);
+  } else {
+    av1_init_rd_stats(rd_stats);
+    av1_txfm_rd_in_plane(x, cpi, rd_stats, INT64_MAX, 0, plane, plane_bsize,
+                         tx_size, FTXS_NONE, 0);
+    av1_rd_cost_update(x->rdmult, rd_stats);
+    cfl_cost = rd_stats->rdcost;
+  }
+  mbmi->cfl_alpha_signs = orig_cfl_alpha_signs;
+  mbmi->cfl_alpha_idx = orig_cfl_alpha_idx;
+  return cfl_cost;
+}
+
+static const int cfl_dir_ls[2] = { 1, -1 };
+
+// If cfl_search_range is CFL_MAGS_SIZE, return CFL_INDEX_ZERO. Otherwise
+// return the index of the best alpha found using intra_model_rd().
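+// The estimation below starts at CFL_INDEX_ZERO and walks outward in each
+// sign direction (see cfl_dir_ls), stopping a direction as soon as the
+// SATD-based cost stops improving. For instance, cfl_idx == CFL_INDEX_ZERO - 2
+// maps to (CFL_SIGN_NEG, alpha 1) via cfl_idx_to_sign_and_alpha().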
+static int cfl_pick_plane_parameter(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                    int plane, TX_SIZE tx_size,
+                                    int cfl_search_range) {
+  assert(cfl_search_range >= 1 && cfl_search_range <= CFL_MAGS_SIZE);
+
+  if (cfl_search_range == CFL_MAGS_SIZE) return CFL_INDEX_ZERO;
+
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  assert(mbmi->uv_mode == UV_CFL_PRED);
+  const MACROBLOCKD_PLANE *pd = &xd->plane[plane];
+  const BLOCK_SIZE plane_bsize =
+      get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y);
+
+  int est_best_cfl_idx = CFL_INDEX_ZERO;
+  int fast_mode = 1;
+  int start_cfl_idx = CFL_INDEX_ZERO;
+  int64_t best_cfl_cost = cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize,
+                                         start_cfl_idx, fast_mode, NULL);
+  for (int si = 0; si < 2; ++si) {
+    const int dir = cfl_dir_ls[si];
+    for (int i = 1; i < CFL_MAGS_SIZE; ++i) {
+      int cfl_idx = start_cfl_idx + dir * i;
+      if (cfl_idx < 0 || cfl_idx >= CFL_MAGS_SIZE) break;
+      int64_t cfl_cost = cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize,
+                                        cfl_idx, fast_mode, NULL);
+      if (cfl_cost < best_cfl_cost) {
+        best_cfl_cost = cfl_cost;
+        est_best_cfl_idx = cfl_idx;
+      } else {
+        break;
+      }
+    }
+  }
+  return est_best_cfl_idx;
+}
+
+static AOM_INLINE void set_invalid_cfl_parameters(
+    uint8_t *best_cfl_alpha_idx, int8_t *best_cfl_alpha_signs) {
+  *best_cfl_alpha_idx = 0;
+  *best_cfl_alpha_signs = 0;
+}
+
+static void cfl_pick_plane_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
+                              int plane, TX_SIZE tx_size, int cfl_search_range,
+                              RD_STATS cfl_rd_arr[CFL_MAGS_SIZE],
+                              int est_best_cfl_idx) {
+  assert(cfl_search_range >= 1 && cfl_search_range <= CFL_MAGS_SIZE);
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  assert(mbmi->uv_mode == UV_CFL_PRED);
+  const MACROBLOCKD_PLANE *pd = &xd->plane[plane];
+  const BLOCK_SIZE plane_bsize =
+      get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y);
+
+  for (int cfl_idx = 0; cfl_idx < CFL_MAGS_SIZE; ++cfl_idx) {
+    av1_invalid_rd_stats(&cfl_rd_arr[cfl_idx]);
+  }
+
+  int fast_mode = 0;
+  int start_cfl_idx = est_best_cfl_idx;
+  cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize, start_cfl_idx, fast_mode,
+                 &cfl_rd_arr[start_cfl_idx]);
+
+  if (cfl_search_range == 1) return;
+
+  for (int si = 0; si < 2; ++si) {
+    const int dir = cfl_dir_ls[si];
+    for (int i = 1; i < cfl_search_range; ++i) {
+      int cfl_idx = start_cfl_idx + dir * i;
+      if (cfl_idx < 0 || cfl_idx >= CFL_MAGS_SIZE) break;
+      cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize, cfl_idx, fast_mode,
+                     &cfl_rd_arr[cfl_idx]);
+    }
+  }
+}
+
+/*!\brief Pick the optimal parameters for the Chroma from Luma (CfL) component
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ *
+ * This function will use DCT_DCT followed by computing SATD (sum of absolute
+ * transformed differences) to estimate the RD score and find the best possible
+ * CFL parameter.
+ *
+ * Then the function will apply a full RD search near the best possible CFL
+ * parameter to find the best actual CFL parameter.
+ *
+ * Side effect:
+ * We use the buffers in x->plane[] and xd->plane[] as throw-away buffers for
+ * the RD search.
+ *
+ * \param[in]    x                Encoder prediction block structure.
+ * \param[in]    cpi              Top-level encoder instance structure.
+ * \param[in]    tx_size          Transform size.
+ * \param[in]    ref_best_rd      Reference best RD.
+ * \param[in]    cfl_search_range The search range of full RD search near the
+ *                                estimated best CFL parameter.
+ *
+ * \param[out]   best_rd_stats        RD stats of the best CFL parameter
+ * \param[out]   best_cfl_alpha_idx   Best CFL alpha index
+ * \param[out]   best_cfl_alpha_signs Best CFL joint signs
+ *
+ */
+static int cfl_rd_pick_alpha(MACROBLOCK *const x, const AV1_COMP *const cpi,
+                             TX_SIZE tx_size, int64_t ref_best_rd,
+                             int cfl_search_range, RD_STATS *best_rd_stats,
+                             uint8_t *best_cfl_alpha_idx,
+                             int8_t *best_cfl_alpha_signs) {
+  assert(cfl_search_range >= 1 && cfl_search_range <= CFL_MAGS_SIZE);
+  const ModeCosts *mode_costs = &x->mode_costs;
+  RD_STATS cfl_rd_arr_u[CFL_MAGS_SIZE];
+  RD_STATS cfl_rd_arr_v[CFL_MAGS_SIZE];
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int est_best_cfl_idx_u, est_best_cfl_idx_v;
+
+  av1_invalid_rd_stats(best_rd_stats);
+
+  // As the dc pred data is the same for different values of alpha, enable the
+  // caching of dc pred data. Call clear_cfl_dc_pred_cache_flags() before
+  // returning to avoid the unintentional usage of cached dc pred data.
+  xd->cfl.use_dc_pred_cache = true;
+  // Evaluate the alpha parameter of each chroma plane.
+  est_best_cfl_idx_u =
+      cfl_pick_plane_parameter(cpi, x, 1, tx_size, cfl_search_range);
+  est_best_cfl_idx_v =
+      cfl_pick_plane_parameter(cpi, x, 2, tx_size, cfl_search_range);
+
+  if (cfl_search_range == 1) {
+    // For cfl_search_range=1, further refinement of alpha is not enabled.
+    // Hence an index of CFL_INDEX_ZERO for both chroma planes implies an
+    // invalid CfL mode.
+    if (est_best_cfl_idx_u == CFL_INDEX_ZERO &&
+        est_best_cfl_idx_v == CFL_INDEX_ZERO) {
+      set_invalid_cfl_parameters(best_cfl_alpha_idx, best_cfl_alpha_signs);
+      clear_cfl_dc_pred_cache_flags(&xd->cfl);
+      return 0;
+    }
+
+    int cfl_alpha_u, cfl_alpha_v;
+    CFL_SIGN_TYPE cfl_sign_u, cfl_sign_v;
+    const MB_MODE_INFO *mbmi = xd->mi[0];
+    cfl_idx_to_sign_and_alpha(est_best_cfl_idx_u, &cfl_sign_u, &cfl_alpha_u);
+    cfl_idx_to_sign_and_alpha(est_best_cfl_idx_v, &cfl_sign_v, &cfl_alpha_v);
+    const int joint_sign = cfl_sign_u * CFL_SIGNS + cfl_sign_v - 1;
+    // Compute the alpha and mode signaling rate.
+    const int rate_overhead =
+        mode_costs->cfl_cost[joint_sign][CFL_PRED_U][cfl_alpha_u] +
+        mode_costs->cfl_cost[joint_sign][CFL_PRED_V][cfl_alpha_v] +
+        mode_costs
+            ->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][UV_CFL_PRED];
+    // Skip the CfL mode evaluation if the RD cost derived using the rate
+    // needed to signal the CfL mode and alpha parameter exceeds ref_best_rd.
+    if (RDCOST(x->rdmult, rate_overhead, 0) > ref_best_rd) {
+      set_invalid_cfl_parameters(best_cfl_alpha_idx, best_cfl_alpha_signs);
+      clear_cfl_dc_pred_cache_flags(&xd->cfl);
+      return 0;
+    }
+  }
+
+  // Compute the rd cost of each chroma plane using the alpha parameters which
+  // were already evaluated.
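+  // Unlike the estimation stage, which relied on the SATD-based
+  // intra_model_rd() (fast_mode == 1), cfl_pick_plane_rd() computes actual
+  // transform RD costs (fast_mode == 0) within cfl_search_range of the
+  // estimated best index of each plane.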
+  cfl_pick_plane_rd(cpi, x, 1, tx_size, cfl_search_range, cfl_rd_arr_u,
+                    est_best_cfl_idx_u);
+  cfl_pick_plane_rd(cpi, x, 2, tx_size, cfl_search_range, cfl_rd_arr_v,
+                    est_best_cfl_idx_v);
+
+  clear_cfl_dc_pred_cache_flags(&xd->cfl);
+
+  for (int ui = 0; ui < CFL_MAGS_SIZE; ++ui) {
+    if (cfl_rd_arr_u[ui].rate == INT_MAX) continue;
+    int cfl_alpha_u;
+    CFL_SIGN_TYPE cfl_sign_u;
+    cfl_idx_to_sign_and_alpha(ui, &cfl_sign_u, &cfl_alpha_u);
+    for (int vi = 0; vi < CFL_MAGS_SIZE; ++vi) {
+      if (cfl_rd_arr_v[vi].rate == INT_MAX) continue;
+      int cfl_alpha_v;
+      CFL_SIGN_TYPE cfl_sign_v;
+      cfl_idx_to_sign_and_alpha(vi, &cfl_sign_v, &cfl_alpha_v);
+      // cfl_sign_u == CFL_SIGN_ZERO && cfl_sign_v == CFL_SIGN_ZERO is not a
+      // valid parameter for CFL
+      if (cfl_sign_u == CFL_SIGN_ZERO && cfl_sign_v == CFL_SIGN_ZERO) continue;
+      int joint_sign = cfl_sign_u * CFL_SIGNS + cfl_sign_v - 1;
+      RD_STATS rd_stats = cfl_rd_arr_u[ui];
+      av1_merge_rd_stats(&rd_stats, &cfl_rd_arr_v[vi]);
+      if (rd_stats.rate != INT_MAX) {
+        rd_stats.rate +=
+            mode_costs->cfl_cost[joint_sign][CFL_PRED_U][cfl_alpha_u];
+        rd_stats.rate +=
+            mode_costs->cfl_cost[joint_sign][CFL_PRED_V][cfl_alpha_v];
+      }
+      av1_rd_cost_update(x->rdmult, &rd_stats);
+      if (rd_stats.rdcost < best_rd_stats->rdcost) {
+        *best_rd_stats = rd_stats;
+        *best_cfl_alpha_idx =
+            (cfl_alpha_u << CFL_ALPHABET_SIZE_LOG2) + cfl_alpha_v;
+        *best_cfl_alpha_signs = joint_sign;
+      }
+    }
+  }
+  if (best_rd_stats->rdcost >= ref_best_rd) {
+    av1_invalid_rd_stats(best_rd_stats);
+    // Set invalid CFL parameters here since the rdcost is not better than
+    // ref_best_rd.
+    set_invalid_cfl_parameters(best_cfl_alpha_idx, best_cfl_alpha_signs);
+    return 0;
+  }
+  return 1;
+}
+
+static bool should_prune_chroma_smooth_pred_based_on_source_variance(
+    const AV1_COMP *cpi, const MACROBLOCK *x, BLOCK_SIZE bsize) {
+  if (!cpi->sf.intra_sf.prune_smooth_intra_mode_for_chroma) return false;
+
+  // If the source variance of both chroma planes is less than 20 (empirically
+  // derived), prune UV_SMOOTH_PRED.
+  for (int i = AOM_PLANE_U; i < av1_num_planes(&cpi->common); i++) {
+    const unsigned int variance = av1_get_perpixel_variance_facade(
+        cpi, &x->e_mbd, &x->plane[i].src, bsize, i);
+    if (variance >= 20) return false;
+  }
+  return true;
+}
+
+int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                    int *rate, int *rate_tokenonly,
+                                    int64_t *distortion, uint8_t *skippable,
+                                    BLOCK_SIZE bsize, TX_SIZE max_tx_size) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  assert(!is_inter_block(mbmi));
+  MB_MODE_INFO best_mbmi = *mbmi;
+  int64_t best_rd = INT64_MAX, this_rd;
+  const ModeCosts *mode_costs = &x->mode_costs;
+  const IntraModeCfg *const intra_mode_cfg = &cpi->oxcf.intra_mode_cfg;
+
+  init_sbuv_mode(mbmi);
+
+  // Return if the current block does not correspond to a chroma block.
+  if (!xd->is_chroma_ref) {
+    *rate = 0;
+    *rate_tokenonly = 0;
+    *distortion = 0;
+    *skippable = 1;
+    return INT64_MAX;
+  }
+
+  // Only store reconstructed luma when there's chroma RDO. When there's no
+  // chroma RDO, the reconstructed luma will be stored in encode_superblock().
+  xd->cfl.store_y = store_cfl_required_rdo(cm, x);
+  if (xd->cfl.store_y) {
+    // Restore reconstructed luma values.
+    // TODO(chiyotsai@google.com): right now we are re-computing the txfm in
+    // this function every time we search through uv modes. There is some
+    // potential speed up here if we cache the result to avoid redundant
+    // computation.
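+    // The dry run below reconstructs the luma plane so that CfL can subsample
+    // it when predicting chroma from luma.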
+ av1_encode_intra_block_plane(cpi, x, mbmi->bsize, AOM_PLANE_Y, + DRY_RUN_NORMAL, + cpi->optimize_seg_arr[mbmi->segment_id]); + xd->cfl.store_y = 0; + } + IntraModeSearchState intra_search_state; + init_intra_mode_search_state(&intra_search_state); + const CFL_ALLOWED_TYPE cfl_allowed = is_cfl_allowed(xd); + + // Search through all non-palette modes. + for (int mode_idx = 0; mode_idx < UV_INTRA_MODES; ++mode_idx) { + int this_rate; + RD_STATS tokenonly_rd_stats; + UV_PREDICTION_MODE uv_mode = uv_rd_search_mode_order[mode_idx]; + + // Skip the current mode evaluation if the RD cost derived using the mode + // signaling rate exceeds the best_rd so far. + const int mode_rate = + mode_costs->intra_uv_mode_cost[cfl_allowed][mbmi->mode][uv_mode]; + if (RDCOST(x->rdmult, mode_rate, 0) > best_rd) continue; + + PREDICTION_MODE intra_mode = get_uv_mode(uv_mode); + const int is_diagonal_mode = av1_is_diagonal_mode(intra_mode); + const int is_directional_mode = av1_is_directional_mode(intra_mode); + + if (is_diagonal_mode && !cpi->oxcf.intra_mode_cfg.enable_diagonal_intra) + continue; + if (is_directional_mode && + !cpi->oxcf.intra_mode_cfg.enable_directional_intra) + continue; + + if (!(cpi->sf.intra_sf.intra_uv_mode_mask[txsize_sqr_up_map[max_tx_size]] & + (1 << uv_mode))) + continue; + if (!intra_mode_cfg->enable_smooth_intra && uv_mode >= UV_SMOOTH_PRED && + uv_mode <= UV_SMOOTH_H_PRED) + continue; + + if (!intra_mode_cfg->enable_paeth_intra && uv_mode == UV_PAETH_PRED) + continue; + + assert(mbmi->mode < INTRA_MODES); + if (cpi->sf.intra_sf.prune_chroma_modes_using_luma_winner && + !(av1_derived_chroma_intra_mode_used_flag[mbmi->mode] & (1 << uv_mode))) + continue; + + mbmi->uv_mode = uv_mode; + + // Init variables for cfl and angle delta + const SPEED_FEATURES *sf = &cpi->sf; + mbmi->angle_delta[PLANE_TYPE_UV] = 0; + if (uv_mode == UV_CFL_PRED) { + if (!cfl_allowed || !intra_mode_cfg->enable_cfl_intra) continue; + assert(!is_directional_mode); + const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd); + if (!cfl_rd_pick_alpha(x, cpi, uv_tx_size, best_rd, + sf->intra_sf.cfl_search_range, &tokenonly_rd_stats, + &mbmi->cfl_alpha_idx, &mbmi->cfl_alpha_signs)) { + continue; + } + } else if (is_directional_mode && av1_use_angle_delta(mbmi->bsize) && + intra_mode_cfg->enable_angle_delta) { + if (sf->intra_sf.chroma_intra_pruning_with_hog && + !intra_search_state.dir_mode_skip_mask_ready) { + static const float thresh[2][4] = { + { -1.2f, 0.0f, 0.0f, 1.2f }, // Interframe + { -1.2f, -1.2f, -0.6f, 0.4f }, // Intraframe + }; + const int is_chroma = 1; + const int is_intra_frame = frame_is_intra_only(cm); + prune_intra_mode_with_hog( + x, bsize, cm->seq_params->sb_size, + thresh[is_intra_frame] + [sf->intra_sf.chroma_intra_pruning_with_hog - 1], + intra_search_state.directional_mode_skip_mask, is_chroma); + intra_search_state.dir_mode_skip_mask_ready = 1; + } + if (intra_search_state.directional_mode_skip_mask[uv_mode]) { + continue; + } + + // Search through angle delta + const int rate_overhead = + mode_costs->intra_uv_mode_cost[cfl_allowed][mbmi->mode][uv_mode]; + if (!rd_pick_intra_angle_sbuv(cpi, x, bsize, rate_overhead, best_rd, + &this_rate, &tokenonly_rd_stats)) + continue; + } else { + if (uv_mode == UV_SMOOTH_PRED && + should_prune_chroma_smooth_pred_based_on_source_variance(cpi, x, + bsize)) + continue; + + // Predict directly if we don't need to search for angle delta. 
+ if (!av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd)) { + continue; + } + } + const int mode_cost = + mode_costs->intra_uv_mode_cost[cfl_allowed][mbmi->mode][uv_mode]; + this_rate = tokenonly_rd_stats.rate + + intra_mode_info_cost_uv(cpi, x, mbmi, bsize, mode_cost); + this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); + + if (this_rd < best_rd) { + best_mbmi = *mbmi; + best_rd = this_rd; + *rate = this_rate; + *rate_tokenonly = tokenonly_rd_stats.rate; + *distortion = tokenonly_rd_stats.dist; + *skippable = tokenonly_rd_stats.skip_txfm; + } + } + + // Search palette mode + const int try_palette = + cpi->oxcf.tool_cfg.enable_palette && + av1_allow_palette(cpi->common.features.allow_screen_content_tools, + mbmi->bsize); + if (try_palette) { + uint8_t *best_palette_color_map = x->palette_buffer->best_palette_color_map; + av1_rd_pick_palette_intra_sbuv( + cpi, x, + mode_costs->intra_uv_mode_cost[cfl_allowed][mbmi->mode][UV_DC_PRED], + best_palette_color_map, &best_mbmi, &best_rd, rate, rate_tokenonly, + distortion, skippable); + } + + *mbmi = best_mbmi; + // Make sure we actually chose a mode + assert(best_rd < INT64_MAX); + return best_rd; +} + +// Searches palette mode for luma channel in inter frame. +int av1_search_palette_mode(IntraModeSearchState *intra_search_state, + const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, unsigned int ref_frame_cost, + PICK_MODE_CONTEXT *ctx, RD_STATS *this_rd_cost, + int64_t best_rd) { + const AV1_COMMON *const cm = &cpi->common; + MB_MODE_INFO *const mbmi = x->e_mbd.mi[0]; + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + int rate2 = 0; + int64_t distortion2 = 0, best_rd_palette = best_rd, this_rd; + int skippable = 0; + uint8_t *const best_palette_color_map = + x->palette_buffer->best_palette_color_map; + uint8_t *const color_map = xd->plane[0].color_index_map; + MB_MODE_INFO best_mbmi_palette = *mbmi; + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; + const ModeCosts *mode_costs = &x->mode_costs; + const int *const intra_mode_cost = + mode_costs->mbmode_cost[size_group_lookup[bsize]]; + const int rows = block_size_high[bsize]; + const int cols = block_size_wide[bsize]; + + mbmi->mode = DC_PRED; + mbmi->uv_mode = UV_DC_PRED; + mbmi->ref_frame[0] = INTRA_FRAME; + mbmi->ref_frame[1] = NONE_FRAME; + av1_zero(pmi->palette_size); + + RD_STATS rd_stats_y; + av1_invalid_rd_stats(&rd_stats_y); + av1_rd_pick_palette_intra_sby(cpi, x, bsize, intra_mode_cost[DC_PRED], + &best_mbmi_palette, best_palette_color_map, + &best_rd_palette, &rd_stats_y.rate, NULL, + &rd_stats_y.dist, &rd_stats_y.skip_txfm, NULL, + ctx, best_blk_skip, best_tx_type_map); + if (rd_stats_y.rate == INT_MAX || pmi->palette_size[0] == 0) { + this_rd_cost->rdcost = INT64_MAX; + return skippable; + } + + memcpy(x->txfm_search_info.blk_skip, best_blk_skip, + sizeof(best_blk_skip[0]) * bsize_to_num_blk(bsize)); + av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk); + memcpy(color_map, best_palette_color_map, + rows * cols * sizeof(best_palette_color_map[0])); + + skippable = rd_stats_y.skip_txfm; + distortion2 = rd_stats_y.dist; + rate2 = rd_stats_y.rate + ref_frame_cost; + if (num_planes > 1) { + if (intra_search_state->rate_uv_intra == INT_MAX) { + // We have not found any good uv mode yet, so we need to search for it. 
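+      // The result is cached in intra_search_state so that subsequent calls
+      // can reuse it instead of repeating the chroma mode search.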
+ TX_SIZE uv_tx = av1_get_tx_size(AOM_PLANE_U, xd); + av1_rd_pick_intra_sbuv_mode(cpi, x, &intra_search_state->rate_uv_intra, + &intra_search_state->rate_uv_tokenonly, + &intra_search_state->dist_uvs, + &intra_search_state->skip_uvs, bsize, uv_tx); + intra_search_state->mode_uv = mbmi->uv_mode; + intra_search_state->pmi_uv = *pmi; + intra_search_state->uv_angle_delta = mbmi->angle_delta[PLANE_TYPE_UV]; + } + + // We have found at least one good uv mode before, so copy and paste it + // over. + mbmi->uv_mode = intra_search_state->mode_uv; + pmi->palette_size[1] = intra_search_state->pmi_uv.palette_size[1]; + if (pmi->palette_size[1] > 0) { + memcpy(pmi->palette_colors + PALETTE_MAX_SIZE, + intra_search_state->pmi_uv.palette_colors + PALETTE_MAX_SIZE, + 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0])); + } + mbmi->angle_delta[PLANE_TYPE_UV] = intra_search_state->uv_angle_delta; + skippable = skippable && intra_search_state->skip_uvs; + distortion2 += intra_search_state->dist_uvs; + rate2 += intra_search_state->rate_uv_intra; + } + + if (skippable) { + rate2 -= rd_stats_y.rate; + if (num_planes > 1) rate2 -= intra_search_state->rate_uv_tokenonly; + rate2 += mode_costs->skip_txfm_cost[av1_get_skip_txfm_context(xd)][1]; + } else { + rate2 += mode_costs->skip_txfm_cost[av1_get_skip_txfm_context(xd)][0]; + } + this_rd = RDCOST(x->rdmult, rate2, distortion2); + this_rd_cost->rate = rate2; + this_rd_cost->dist = distortion2; + this_rd_cost->rdcost = this_rd; + return skippable; +} + +void av1_search_palette_mode_luma(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, unsigned int ref_frame_cost, + PICK_MODE_CONTEXT *ctx, + RD_STATS *this_rd_cost, int64_t best_rd) { + MB_MODE_INFO *const mbmi = x->e_mbd.mi[0]; + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + MACROBLOCKD *const xd = &x->e_mbd; + int64_t best_rd_palette = best_rd, this_rd; + uint8_t *const best_palette_color_map = + x->palette_buffer->best_palette_color_map; + uint8_t *const color_map = xd->plane[0].color_index_map; + MB_MODE_INFO best_mbmi_palette = *mbmi; + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; + const ModeCosts *mode_costs = &x->mode_costs; + const int *const intra_mode_cost = + mode_costs->mbmode_cost[size_group_lookup[bsize]]; + const int rows = block_size_high[bsize]; + const int cols = block_size_wide[bsize]; + + mbmi->mode = DC_PRED; + mbmi->uv_mode = UV_DC_PRED; + mbmi->ref_frame[0] = INTRA_FRAME; + mbmi->ref_frame[1] = NONE_FRAME; + av1_zero(pmi->palette_size); + + RD_STATS rd_stats_y; + av1_invalid_rd_stats(&rd_stats_y); + av1_rd_pick_palette_intra_sby(cpi, x, bsize, intra_mode_cost[DC_PRED], + &best_mbmi_palette, best_palette_color_map, + &best_rd_palette, &rd_stats_y.rate, NULL, + &rd_stats_y.dist, &rd_stats_y.skip_txfm, NULL, + ctx, best_blk_skip, best_tx_type_map); + if (rd_stats_y.rate == INT_MAX || pmi->palette_size[0] == 0) { + this_rd_cost->rdcost = INT64_MAX; + return; + } + + memcpy(x->txfm_search_info.blk_skip, best_blk_skip, + sizeof(best_blk_skip[0]) * bsize_to_num_blk(bsize)); + av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk); + memcpy(color_map, best_palette_color_map, + rows * cols * sizeof(best_palette_color_map[0])); + + rd_stats_y.rate += ref_frame_cost; + + if (rd_stats_y.skip_txfm) { + rd_stats_y.rate = + ref_frame_cost + + mode_costs->skip_txfm_cost[av1_get_skip_txfm_context(xd)][1]; + } else { + rd_stats_y.rate += + mode_costs->skip_txfm_cost[av1_get_skip_txfm_context(xd)][0]; + } + 
this_rd = RDCOST(x->rdmult, rd_stats_y.rate, rd_stats_y.dist);
+  this_rd_cost->rate = rd_stats_y.rate;
+  this_rd_cost->dist = rd_stats_y.dist;
+  this_rd_cost->rdcost = this_rd;
+  this_rd_cost->skip_txfm = rd_stats_y.skip_txfm;
+}
+
+/*!\brief Get the intra prediction by searching through tx_type and tx_size.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * Currently this function is only used in the intra frame code path for
+ * winner-mode processing.
+ *
+ * \return Returns whether the current mode is an improvement over best_rd.
+ */
+static AOM_INLINE int intra_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                      BLOCK_SIZE bsize, const int *bmode_costs,
+                                      int64_t *best_rd, int *rate,
+                                      int *rate_tokenonly, int64_t *distortion,
+                                      uint8_t *skippable,
+                                      MB_MODE_INFO *best_mbmi,
+                                      PICK_MODE_CONTEXT *ctx) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  RD_STATS rd_stats;
+  // In order to improve txfm search, avoid rd based breakouts during winner
+  // mode evaluation. Hence ref_best_rd is passed as INT64_MAX by default when
+  // the speed feature use_rd_based_breakout_for_intra_tx_search is disabled.
+  int64_t ref_best_rd = cpi->sf.tx_sf.use_rd_based_breakout_for_intra_tx_search
+                            ? *best_rd
+                            : INT64_MAX;
+  av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats, bsize, ref_best_rd);
+  if (rd_stats.rate == INT_MAX) return 0;
+  int this_rate_tokenonly = rd_stats.rate;
+  if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->bsize)) {
+    // av1_pick_uniform_tx_size_type_yrd above includes the cost of the tx_size
+    // in the tokenonly rate, but for intra blocks, tx_size is always coded
+    // (prediction granularity), so we account for it in the full rate,
+    // not the tokenonly rate.
+    this_rate_tokenonly -= tx_size_cost(x, bsize, mbmi->tx_size);
+  }
+  const int this_rate =
+      rd_stats.rate +
+      intra_mode_info_cost_y(cpi, x, mbmi, bsize, bmode_costs[mbmi->mode], 0);
+  const int64_t this_rd = RDCOST(x->rdmult, this_rate, rd_stats.dist);
+  if (this_rd < *best_rd) {
+    *best_mbmi = *mbmi;
+    *best_rd = this_rd;
+    *rate = this_rate;
+    *rate_tokenonly = this_rate_tokenonly;
+    *distortion = rd_stats.dist;
+    *skippable = rd_stats.skip_txfm;
+    av1_copy_array(ctx->blk_skip, x->txfm_search_info.blk_skip,
+                   ctx->num_4x4_blk);
+    av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+    return 1;
+  }
+  return 0;
+}
+
+/*!\brief Search for the best filter_intra mode when coding an inter frame.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * This function loops through all filter_intra modes to find the best one.
+ *
+ * \remark Returns nothing, but updates the mbmi and rd_stats.
+ */ +static INLINE void handle_filter_intra_mode(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, + const PICK_MODE_CONTEXT *ctx, + RD_STATS *rd_stats_y, int mode_cost, + int64_t best_rd, + int64_t best_rd_so_far) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + assert(mbmi->mode == DC_PRED && + av1_filter_intra_allowed_bsize(&cpi->common, bsize)); + + RD_STATS rd_stats_y_fi; + int filter_intra_selected_flag = 0; + TX_SIZE best_tx_size = mbmi->tx_size; + FILTER_INTRA_MODE best_fi_mode = FILTER_DC_PRED; + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + memcpy(best_blk_skip, x->txfm_search_info.blk_skip, + sizeof(best_blk_skip[0]) * ctx->num_4x4_blk); + uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; + av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); + mbmi->filter_intra_mode_info.use_filter_intra = 1; + for (FILTER_INTRA_MODE fi_mode = FILTER_DC_PRED; fi_mode < FILTER_INTRA_MODES; + ++fi_mode) { + mbmi->filter_intra_mode_info.filter_intra_mode = fi_mode; + av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats_y_fi, bsize, best_rd); + if (rd_stats_y_fi.rate == INT_MAX) continue; + const int this_rate_tmp = + rd_stats_y_fi.rate + + intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost, 0); + const int64_t this_rd_tmp = + RDCOST(x->rdmult, this_rate_tmp, rd_stats_y_fi.dist); + + if (this_rd_tmp != INT64_MAX && this_rd_tmp / 2 > best_rd) { + break; + } + if (this_rd_tmp < best_rd_so_far) { + best_tx_size = mbmi->tx_size; + av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); + memcpy(best_blk_skip, x->txfm_search_info.blk_skip, + sizeof(best_blk_skip[0]) * ctx->num_4x4_blk); + best_fi_mode = fi_mode; + *rd_stats_y = rd_stats_y_fi; + filter_intra_selected_flag = 1; + best_rd_so_far = this_rd_tmp; + } + } + + mbmi->tx_size = best_tx_size; + av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk); + memcpy(x->txfm_search_info.blk_skip, best_blk_skip, + sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk); + + if (filter_intra_selected_flag) { + mbmi->filter_intra_mode_info.use_filter_intra = 1; + mbmi->filter_intra_mode_info.filter_intra_mode = best_fi_mode; + } else { + mbmi->filter_intra_mode_info.use_filter_intra = 0; + } +} + +// Evaluate a given luma intra-mode in inter frames. 
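+// A lower bound on the mode's rate (mode signaling cost, intra cost penalty
+// and the cheaper of the two skip_txfm costs) is computed first; if the RD
+// cost of this rate alone already exceeds best_rd, the remaining intra modes
+// are skipped as well via intra_search_state->skip_intra_modes.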
+int av1_handle_intra_y_mode(IntraModeSearchState *intra_search_state, + const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, unsigned int ref_frame_cost, + const PICK_MODE_CONTEXT *ctx, RD_STATS *rd_stats_y, + int64_t best_rd, int *mode_cost_y, int64_t *rd_y, + int64_t *best_model_rd, + int64_t top_intra_model_rd[]) { + const AV1_COMMON *cm = &cpi->common; + const INTRA_MODE_SPEED_FEATURES *const intra_sf = &cpi->sf.intra_sf; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + assert(mbmi->ref_frame[0] == INTRA_FRAME); + const PREDICTION_MODE mode = mbmi->mode; + const ModeCosts *mode_costs = &x->mode_costs; + const int mode_cost = + mode_costs->mbmode_cost[size_group_lookup[bsize]][mode] + ref_frame_cost; + const int skip_ctx = av1_get_skip_txfm_context(xd); + + int known_rate = mode_cost; + const int intra_cost_penalty = av1_get_intra_cost_penalty( + cm->quant_params.base_qindex, cm->quant_params.y_dc_delta_q, + cm->seq_params->bit_depth); + + if (mode != DC_PRED && mode != PAETH_PRED) known_rate += intra_cost_penalty; + known_rate += AOMMIN(mode_costs->skip_txfm_cost[skip_ctx][0], + mode_costs->skip_txfm_cost[skip_ctx][1]); + const int64_t known_rd = RDCOST(x->rdmult, known_rate, 0); + if (known_rd > best_rd) { + intra_search_state->skip_intra_modes = 1; + return 0; + } + + const int is_directional_mode = av1_is_directional_mode(mode); + if (is_directional_mode && av1_use_angle_delta(bsize) && + cpi->oxcf.intra_mode_cfg.enable_angle_delta) { + if (intra_sf->intra_pruning_with_hog && + !intra_search_state->dir_mode_skip_mask_ready) { + const float thresh[4] = { -1.2f, 0.0f, 0.0f, 1.2f }; + const int is_chroma = 0; + prune_intra_mode_with_hog(x, bsize, cm->seq_params->sb_size, + thresh[intra_sf->intra_pruning_with_hog - 1], + intra_search_state->directional_mode_skip_mask, + is_chroma); + intra_search_state->dir_mode_skip_mask_ready = 1; + } + if (intra_search_state->directional_mode_skip_mask[mode]) return 0; + } + const TX_SIZE tx_size = AOMMIN(TX_32X32, max_txsize_lookup[bsize]); + const int64_t this_model_rd = + intra_model_rd(&cpi->common, x, 0, bsize, tx_size, /*use_hadamard=*/1); + + const int model_rd_index_for_pruning = + get_model_rd_index_for_pruning(x, intra_sf); + + if (prune_intra_y_mode(this_model_rd, best_model_rd, top_intra_model_rd, + intra_sf->top_intra_model_count_allowed, + model_rd_index_for_pruning)) + return 0; + av1_init_rd_stats(rd_stats_y); + av1_pick_uniform_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, best_rd); + + // Pick filter intra modes. + if (mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) { + int try_filter_intra = 1; + int64_t best_rd_so_far = INT64_MAX; + if (rd_stats_y->rate != INT_MAX) { + // best_rd_so_far is the rdcost of DC_PRED without using filter_intra. + // Later, in filter intra search, best_rd_so_far is used for comparison. + mbmi->filter_intra_mode_info.use_filter_intra = 0; + const int tmp_rate = + rd_stats_y->rate + + intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost, 0); + best_rd_so_far = RDCOST(x->rdmult, tmp_rate, rd_stats_y->dist); + try_filter_intra = (best_rd_so_far / 2) <= best_rd; + } else if (intra_sf->skip_filter_intra_in_inter_frames >= 1) { + // As rd cost of luma intra dc mode is more than best_rd (i.e., + // rd_stats_y->rate = INT_MAX), skip the evaluation of filter intra modes. 
+ try_filter_intra = 0; + } + + if (try_filter_intra) { + handle_filter_intra_mode(cpi, x, bsize, ctx, rd_stats_y, mode_cost, + best_rd, best_rd_so_far); + } + } + + if (rd_stats_y->rate == INT_MAX) return 0; + + *mode_cost_y = intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost, 0); + const int rate_y = rd_stats_y->skip_txfm + ? mode_costs->skip_txfm_cost[skip_ctx][1] + : rd_stats_y->rate; + *rd_y = RDCOST(x->rdmult, rate_y + *mode_cost_y, rd_stats_y->dist); + if (best_rd < (INT64_MAX / 2) && *rd_y > (best_rd + (best_rd >> 2))) { + intra_search_state->skip_intra_modes = 1; + return 0; + } + + return 1; +} + +int av1_search_intra_uv_modes_in_interframe( + IntraModeSearchState *intra_search_state, const AV1_COMP *cpi, + MACROBLOCK *x, BLOCK_SIZE bsize, RD_STATS *rd_stats, + const RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, int64_t best_rd) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + assert(mbmi->ref_frame[0] == INTRA_FRAME); + + // TODO(chiyotsai@google.com): Consolidate the chroma search code here with + // the one in av1_search_palette_mode. + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const int try_palette = + cpi->oxcf.tool_cfg.enable_palette && + av1_allow_palette(cm->features.allow_screen_content_tools, mbmi->bsize); + + assert(intra_search_state->rate_uv_intra == INT_MAX); + if (intra_search_state->rate_uv_intra == INT_MAX) { + // If no good uv-predictor had been found, search for it. + const TX_SIZE uv_tx = av1_get_tx_size(AOM_PLANE_U, xd); + av1_rd_pick_intra_sbuv_mode(cpi, x, &intra_search_state->rate_uv_intra, + &intra_search_state->rate_uv_tokenonly, + &intra_search_state->dist_uvs, + &intra_search_state->skip_uvs, bsize, uv_tx); + intra_search_state->mode_uv = mbmi->uv_mode; + if (try_palette) intra_search_state->pmi_uv = *pmi; + intra_search_state->uv_angle_delta = mbmi->angle_delta[PLANE_TYPE_UV]; + + const int uv_rate = intra_search_state->rate_uv_tokenonly; + const int64_t uv_dist = intra_search_state->dist_uvs; + const int64_t uv_rd = RDCOST(x->rdmult, uv_rate, uv_dist); + if (uv_rd > best_rd) { + // If there is no good intra uv-mode available, we can skip all intra + // modes. + intra_search_state->skip_intra_modes = 1; + return 0; + } + } + + // If we are here, then the encoder has found at least one good intra uv + // predictor, so we can directly copy its statistics over. + // TODO(any): the stats here is not right if the best uv mode is CFL but the + // best y mode is palette. + rd_stats_uv->rate = intra_search_state->rate_uv_tokenonly; + rd_stats_uv->dist = intra_search_state->dist_uvs; + rd_stats_uv->skip_txfm = intra_search_state->skip_uvs; + rd_stats->skip_txfm = rd_stats_y->skip_txfm && rd_stats_uv->skip_txfm; + mbmi->uv_mode = intra_search_state->mode_uv; + if (try_palette) { + pmi->palette_size[1] = intra_search_state->pmi_uv.palette_size[1]; + memcpy(pmi->palette_colors + PALETTE_MAX_SIZE, + intra_search_state->pmi_uv.palette_colors + PALETTE_MAX_SIZE, + 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0])); + } + mbmi->angle_delta[PLANE_TYPE_UV] = intra_search_state->uv_angle_delta; + + return 1; +} + +// Checks if odd delta angles can be pruned based on rdcosts of even delta +// angles of the corresponding directional mode. 
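+// For example, delta angle -1 is pruned only when the stored rd costs of the
+// even delta angles -2 and 0 both exceed best_rd + (best_rd >> 3).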
+static AOM_INLINE int prune_luma_odd_delta_angles_using_rd_cost( + const MB_MODE_INFO *const mbmi, const int64_t *const intra_modes_rd_cost, + int64_t best_rd, int prune_luma_odd_delta_angles_in_intra) { + const int luma_delta_angle = mbmi->angle_delta[PLANE_TYPE_Y]; + if (!prune_luma_odd_delta_angles_in_intra || + !av1_is_directional_mode(mbmi->mode) || !(abs(luma_delta_angle) & 1) || + best_rd == INT64_MAX) + return 0; + + const int64_t rd_thresh = best_rd + (best_rd >> 3); + + // Neighbour rdcosts are considered for pruning of odd delta angles as + // mentioned below: + // Delta angle Delta angle rdcost + // to be pruned to be considered + // -3 -2 + // -1 -2, 0 + // 1 0, 2 + // 3 2 + return intra_modes_rd_cost[luma_delta_angle + MAX_ANGLE_DELTA] > rd_thresh && + intra_modes_rd_cost[luma_delta_angle + MAX_ANGLE_DELTA + 2] > + rd_thresh; +} + +// Finds the best non-intrabc mode on an intra frame. +int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, + int *rate, int *rate_tokenonly, + int64_t *distortion, uint8_t *skippable, + BLOCK_SIZE bsize, int64_t best_rd, + PICK_MODE_CONTEXT *ctx) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + assert(!is_inter_block(mbmi)); + int64_t best_model_rd = INT64_MAX; + int is_directional_mode; + uint8_t directional_mode_skip_mask[INTRA_MODES] = { 0 }; + // Flag to check rd of any intra mode is better than best_rd passed to this + // function + int beat_best_rd = 0; + const int *bmode_costs; + const IntraModeCfg *const intra_mode_cfg = &cpi->oxcf.intra_mode_cfg; + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const int try_palette = + cpi->oxcf.tool_cfg.enable_palette && + av1_allow_palette(cpi->common.features.allow_screen_content_tools, + mbmi->bsize); + uint8_t *best_palette_color_map = + try_palette ? x->palette_buffer->best_palette_color_map : NULL; + const MB_MODE_INFO *above_mi = xd->above_mbmi; + const MB_MODE_INFO *left_mi = xd->left_mbmi; + const PREDICTION_MODE A = av1_above_block_mode(above_mi); + const PREDICTION_MODE L = av1_left_block_mode(left_mi); + const int above_ctx = intra_mode_context[A]; + const int left_ctx = intra_mode_context[L]; + bmode_costs = x->mode_costs.y_mode_costs[above_ctx][left_ctx]; + + mbmi->angle_delta[PLANE_TYPE_Y] = 0; + const INTRA_MODE_SPEED_FEATURES *const intra_sf = &cpi->sf.intra_sf; + if (intra_sf->intra_pruning_with_hog) { + // Less aggressive thresholds are used here than those used in inter frame + // encoding in av1_handle_intra_y_mode() because we want key frames/intra + // frames to have higher quality. + const float thresh[4] = { -1.2f, -1.2f, -0.6f, 0.4f }; + const int is_chroma = 0; + prune_intra_mode_with_hog(x, bsize, cpi->common.seq_params->sb_size, + thresh[intra_sf->intra_pruning_with_hog - 1], + directional_mode_skip_mask, is_chroma); + } + mbmi->filter_intra_mode_info.use_filter_intra = 0; + pmi->palette_size[0] = 0; + + // Set params for mode evaluation + set_mode_eval_params(cpi, x, MODE_EVAL); + + MB_MODE_INFO best_mbmi = *mbmi; + const int max_winner_mode_count = + winner_mode_count_allowed[cpi->sf.winner_mode_sf.multi_winner_mode_type]; + zero_winner_mode_stats(bsize, max_winner_mode_count, x->winner_mode_stats); + x->winner_mode_count = 0; + + // Searches the intra-modes except for intrabc, palette, and filter_intra. 
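+  // Mode indices below INTRA_MODE_END walk intra_rd_search_mode_order with a
+  // zero delta angle; larger indices are decomposed by
+  // set_y_mode_and_delta_angle() into a (directional mode, delta angle) pair,
+  // e.g. mode_idx == INTRA_MODE_END selects V_PRED with the first entry of
+  // the delta-angle evaluation order.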
+ int64_t top_intra_model_rd[TOP_INTRA_MODEL_COUNT]; + for (int i = 0; i < TOP_INTRA_MODEL_COUNT; i++) { + top_intra_model_rd[i] = INT64_MAX; + } + + // Initialize the rdcost corresponding to all the directional and + // non-directional intra modes. + // 1. For directional modes, it stores the rdcost values for delta angles -4, + // -3, ..., 3, 4. + // 2. The rdcost value for luma_delta_angle is stored at index + // luma_delta_angle + MAX_ANGLE_DELTA + 1. + // 3. The rdcost values for fictitious/nonexistent luma_delta_angle -4 and 4 + // (array indices 0 and 8) are always set to INT64_MAX (the initial value). + int64_t intra_modes_rd_cost[INTRA_MODE_END] + [SIZE_OF_ANGLE_DELTA_RD_COST_ARRAY]; + for (int i = 0; i < INTRA_MODE_END; i++) { + for (int j = 0; j < SIZE_OF_ANGLE_DELTA_RD_COST_ARRAY; j++) { + intra_modes_rd_cost[i][j] = INT64_MAX; + } + } + + for (int mode_idx = INTRA_MODE_START; mode_idx < LUMA_MODE_COUNT; + ++mode_idx) { + set_y_mode_and_delta_angle(mode_idx, mbmi, + intra_sf->prune_luma_odd_delta_angles_in_intra); + RD_STATS this_rd_stats; + int this_rate, this_rate_tokenonly, s; + int is_diagonal_mode; + int64_t this_distortion, this_rd; + const int luma_delta_angle = mbmi->angle_delta[PLANE_TYPE_Y]; + + is_diagonal_mode = av1_is_diagonal_mode(mbmi->mode); + if (is_diagonal_mode && !intra_mode_cfg->enable_diagonal_intra) continue; + if (av1_is_directional_mode(mbmi->mode) && + !intra_mode_cfg->enable_directional_intra) + continue; + + // The smooth prediction mode appears to be more frequently picked + // than horizontal / vertical smooth prediction modes. Hence treat + // them differently in speed features. + if ((!intra_mode_cfg->enable_smooth_intra || + intra_sf->disable_smooth_intra) && + (mbmi->mode == SMOOTH_H_PRED || mbmi->mode == SMOOTH_V_PRED)) + continue; + if (!intra_mode_cfg->enable_smooth_intra && mbmi->mode == SMOOTH_PRED) + continue; + + // The functionality of filter intra modes and smooth prediction + // overlap. Hence smooth prediction is pruned only if all the + // filter intra modes are enabled. + if (intra_sf->disable_smooth_intra && + intra_sf->prune_filter_intra_level == 0 && mbmi->mode == SMOOTH_PRED) + continue; + if (!intra_mode_cfg->enable_paeth_intra && mbmi->mode == PAETH_PRED) + continue; + + // Skip the evaluation of modes that do not match with the winner mode in + // x->mb_mode_cache. + if (x->use_mb_mode_cache && mbmi->mode != x->mb_mode_cache->mode) continue; + + is_directional_mode = av1_is_directional_mode(mbmi->mode); + if (is_directional_mode && directional_mode_skip_mask[mbmi->mode]) continue; + if (is_directional_mode && + !(av1_use_angle_delta(bsize) && intra_mode_cfg->enable_angle_delta) && + luma_delta_angle != 0) + continue; + + // Use intra_y_mode_mask speed feature to skip intra mode evaluation. + if (!(intra_sf->intra_y_mode_mask[max_txsize_lookup[bsize]] & + (1 << mbmi->mode))) + continue; + + if (prune_luma_odd_delta_angles_using_rd_cost( + mbmi, intra_modes_rd_cost[mbmi->mode], best_rd, + intra_sf->prune_luma_odd_delta_angles_in_intra)) + continue; + + const TX_SIZE tx_size = AOMMIN(TX_32X32, max_txsize_lookup[bsize]); + const int64_t this_model_rd = + intra_model_rd(&cpi->common, x, 0, bsize, tx_size, /*use_hadamard=*/1); + + const int model_rd_index_for_pruning = + get_model_rd_index_for_pruning(x, intra_sf); + + if (prune_intra_y_mode(this_model_rd, &best_model_rd, top_intra_model_rd, + intra_sf->top_intra_model_count_allowed, + model_rd_index_for_pruning)) + continue; + + // Builds the actual prediction. 
The model RD
+    // estimate used for pruning above was only an approximation that did not
+    // take into account the effect of the txfm pipeline, so we need to redo
+    // the prediction for real here.
+    av1_pick_uniform_tx_size_type_yrd(cpi, x, &this_rd_stats, bsize, best_rd);
+    this_rate_tokenonly = this_rd_stats.rate;
+    this_distortion = this_rd_stats.dist;
+    s = this_rd_stats.skip_txfm;
+
+    if (this_rate_tokenonly == INT_MAX) continue;
+
+    if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->bsize)) {
+      // av1_pick_uniform_tx_size_type_yrd above includes the cost of the
+      // tx_size in the tokenonly rate, but for intra blocks, tx_size is always
+      // coded (prediction granularity), so we account for it in the full rate,
+      // not the tokenonly rate.
+      this_rate_tokenonly -= tx_size_cost(x, bsize, mbmi->tx_size);
+    }
+    this_rate =
+        this_rd_stats.rate +
+        intra_mode_info_cost_y(cpi, x, mbmi, bsize, bmode_costs[mbmi->mode], 0);
+    this_rd = RDCOST(x->rdmult, this_rate, this_distortion);
+
+    // Visual quality adjustment based on recon vs source variance.
+    if ((cpi->oxcf.mode == ALLINTRA) && (this_rd != INT64_MAX)) {
+      this_rd = (int64_t)(this_rd * intra_rd_variance_factor(cpi, x, bsize));
+    }
+
+    intra_modes_rd_cost[mbmi->mode][luma_delta_angle + MAX_ANGLE_DELTA + 1] =
+        this_rd;
+
+    // Collect mode stats for multiwinner mode processing.
+    const int txfm_search_done = 1;
+    store_winner_mode_stats(
+        &cpi->common, x, mbmi, NULL, NULL, NULL, 0, NULL, bsize, this_rd,
+        cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done);
+    if (this_rd < best_rd) {
+      best_mbmi = *mbmi;
+      best_rd = this_rd;
+      // Set the beat_best_rd flag because the current mode rd is better than
+      // the best_rd passed to this function.
+      beat_best_rd = 1;
+      *rate = this_rate;
+      *rate_tokenonly = this_rate_tokenonly;
+      *distortion = this_distortion;
+      *skippable = s;
+      memcpy(ctx->blk_skip, x->txfm_search_info.blk_skip,
+             sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
+      av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+    }
+  }
+
+  // Searches palette
+  if (try_palette) {
+    av1_rd_pick_palette_intra_sby(
+        cpi, x, bsize, bmode_costs[DC_PRED], &best_mbmi, best_palette_color_map,
+        &best_rd, rate, rate_tokenonly, distortion, skippable, &beat_best_rd,
+        ctx, ctx->blk_skip, ctx->tx_type_map);
+  }
+
+  // Searches filter_intra
+  if (beat_best_rd && av1_filter_intra_allowed_bsize(&cpi->common, bsize)) {
+    if (rd_pick_filter_intra_sby(cpi, x, rate, rate_tokenonly, distortion,
+                                 skippable, bsize, bmode_costs[DC_PRED],
+                                 best_mbmi.mode, &best_rd, &best_model_rd,
+                                 ctx)) {
+      best_mbmi = *mbmi;
+    }
+  }
+
+  // If no mode with a lower rd value than the best_rd passed to this function
+  // is identified, winner mode processing is not necessary; return INT64_MAX
+  // to indicate that no best mode was identified.
+  if (!beat_best_rd) return INT64_MAX;
+
+  // In multi-winner mode processing, perform tx search for the few best modes
+  // identified during mode evaluation. Winner mode processing uses the best tx
+  // configuration for tx search.
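+  // At most winner_mode_count_allowed[multi_winner_mode_type] candidates
+  // collected earlier via store_winner_mode_stats() are revisited with
+  // WINNER_MODE_EVAL parameters.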
+  if (cpi->sf.winner_mode_sf.multi_winner_mode_type) {
+    int best_mode_idx = 0;
+    int block_width, block_height;
+    uint8_t *color_map_dst = xd->plane[PLANE_TYPE_Y].color_index_map;
+    av1_get_block_dimensions(bsize, AOM_PLANE_Y, xd, &block_width,
+                             &block_height, NULL, NULL);
+
+    for (int mode_idx = 0; mode_idx < x->winner_mode_count; mode_idx++) {
+      *mbmi = x->winner_mode_stats[mode_idx].mbmi;
+      if (is_winner_mode_processing_enabled(cpi, x, mbmi, 0)) {
+        // Restore color_map of palette mode before winner mode processing
+        if (mbmi->palette_mode_info.palette_size[0] > 0) {
+          uint8_t *color_map_src =
+              x->winner_mode_stats[mode_idx].color_index_map;
+          memcpy(color_map_dst, color_map_src,
+                 block_width * block_height * sizeof(*color_map_src));
+        }
+        // Set params for winner mode evaluation
+        set_mode_eval_params(cpi, x, WINNER_MODE_EVAL);
+
+        // Winner mode processing
+        // If previous searches use only the default tx type/no R-D
+        // optimization of quantized coeffs, do an extra search for the best
+        // tx type/better R-D optimization of quantized coeffs
+        if (intra_block_yrd(cpi, x, bsize, bmode_costs, &best_rd, rate,
+                            rate_tokenonly, distortion, skippable, &best_mbmi,
+                            ctx))
+          best_mode_idx = mode_idx;
+      }
+    }
+    // Copy color_map of palette mode for final winner mode
+    if (best_mbmi.palette_mode_info.palette_size[0] > 0) {
+      uint8_t *color_map_src =
+          x->winner_mode_stats[best_mode_idx].color_index_map;
+      memcpy(color_map_dst, color_map_src,
+             block_width * block_height * sizeof(*color_map_src));
+    }
+  } else {
+    // If previous searches use only the default tx type/no R-D optimization
+    // of quantized coeffs, do an extra search for the best tx type/better
+    // R-D optimization of quantized coeffs
+    if (is_winner_mode_processing_enabled(cpi, x, mbmi, 0)) {
+      // Set params for winner mode evaluation
+      set_mode_eval_params(cpi, x, WINNER_MODE_EVAL);
+      *mbmi = best_mbmi;
+      intra_block_yrd(cpi, x, bsize, bmode_costs, &best_rd, rate,
+                      rate_tokenonly, distortion, skippable, &best_mbmi, ctx);
+    }
+  }
+  *mbmi = best_mbmi;
+  av1_copy_array(xd->tx_type_map, ctx->tx_type_map, ctx->num_4x4_blk);
+  return best_rd;
+}
diff --git a/third_party/aom/av1/encoder/intra_mode_search.h b/third_party/aom/av1/encoder/intra_mode_search.h
new file mode 100644
index 0000000000..75289c4e3c
--- /dev/null
+++ b/third_party/aom/av1/encoder/intra_mode_search.h
@@ -0,0 +1,329 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Declares high level functions to search through intra modes.
+ */
+#ifndef AOM_AV1_ENCODER_INTRA_MODE_SEARCH_H_
+#define AOM_AV1_ENCODER_INTRA_MODE_SEARCH_H_
+
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*! \brief Variables related to intra-mode search during inter frame coding.
+ *
+ * \ingroup intra_mode_search
+ * This is a set of variables used during intra-mode search for inter frames.
+ * This includes a histogram of gradient speed features and a cache of uv
+ * prediction to avoid repeated searches of the chroma prediction.
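+ *
+ * [Editor's illustration, not part of the upstream comment] A minimal usage
+ * sketch pieced together from the declarations in this header (cpi, x, and
+ * the rd/cost variables are assumed to exist in the caller):
+ *
+ *   IntraModeSearchState intra_search_state;
+ *   init_intra_mode_search_state(&intra_search_state);
+ *   // For each luma intra mode tried while coding an inter frame:
+ *   av1_handle_intra_y_mode(&intra_search_state, cpi, x, bsize,
+ *                           ref_frame_cost, ctx, &rd_stats_y, best_rd,
+ *                           &mode_cost_y, &rd_y, &best_model_rd,
+ *                           top_intra_model_rd);
+ *   // Chroma search; results are cached inside intra_search_state:
+ *   av1_search_intra_uv_modes_in_interframe(&intra_search_state, cpi, x,
+ *                                           bsize, &rd_stats, &rd_stats_y,
+ *                                           &rd_stats_uv, best_rd);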
+ */
+typedef struct IntraModeSearchState {
+  /*!
+   * \brief The best luma intra-mode found so far
+   */
+  PREDICTION_MODE best_intra_mode;
+
+  /** \name Speed feature variables
+   * Variables to help with pruning some luma intra-modes during inter frame
+   * coding process.
+   */
+  /**@{*/
+  /*!
+   * \brief Whether to terminate all intra mode search.
+   */
+  int skip_intra_modes;
+  /*!
+   * \brief Whether a directional mode is pruned.
+   */
+  uint8_t directional_mode_skip_mask[INTRA_MODES];
+  /*!
+   * \brief Whether \ref directional_mode_skip_mask is valid for pruning.
+   */
+  int dir_mode_skip_mask_ready;
+  /**@}*/
+
+  /** \name Chroma mode search cache
+   * A cache of the best chroma prediction mode to avoid having to search for
+   * chroma predictions repeatedly in \ref
+   * av1_search_intra_uv_modes_in_interframe()
+   */
+  /**@{*/
+  int rate_uv_intra;          /*!< \brief Total rate to transmit uv_mode */
+  int rate_uv_tokenonly;      /*!< \brief Rate to transmit txfm tokens */
+  int64_t dist_uvs;           /*!< \brief Distortion of the uv_mode's recon */
+  uint8_t skip_uvs;           /*!< \brief Whether the uv txfm is skippable */
+  UV_PREDICTION_MODE mode_uv; /*!< \brief The best uv mode */
+  PALETTE_MODE_INFO pmi_uv;   /*!< \brief Color map if mode_uv is palette */
+  int8_t uv_angle_delta;      /*!< \brief Angle delta if mode_uv directional */
+  /**@}*/
+} IntraModeSearchState;
+
+/*!\brief Evaluate a given luma intra-mode for inter frames.
+ *
+ * \ingroup intra_mode_search
+ * \callgraph
+ * \callergraph
+ * This function handles an intra-mode luma prediction when the current frame
+ * is an inter frame. This is the intra-mode counterpart of handle_inter_mode.
+ * This function performs an intra luma prediction using the mode specified by
+ * x->e_mbd.mi[0]->mode. This function does *not* support palette mode
+ * prediction in the luma channel.
+ *
+ * \param[in,out] intra_search_state Structure holding the intra search state.
+ * \param[in]     cpi                Top-level encoder structure.
+ * \param[in,out] x                  Pointer to structure holding all the
+ *                                   data for the current macroblock.
+ * \param[in]     bsize              Current partition block size.
+ * \param[in]     ref_frame_cost     The entropy cost for signaling that the
+ *                                   current ref frame is an intra frame.
+ * \param[in]     ctx                Structure to hold the number of 4x4 blks
+ *                                   to copy tx_type and txfm_skip arrays.
+ * \param[out]    rd_stats_y         Struct to keep track of the current
+ *                                   intra-mode's rd_stats (luma only).
+ * \param[in]     best_rd            Best RD seen for this block so far.
+ * \param[out]    mode_cost_y        The cost needed to signal the current
+ *                                   intra mode.
+ * \param[out]    rd_y               The rdcost of the chosen mode.
+ * \param[in]     best_model_rd      Best model RD seen for this block so far.
+ * \param[in]     top_intra_model_rd Top intra model RD seen for this
+ *                                   block so far.
+ *
+ * \return Returns 1 if a valid intra mode is found, 0 otherwise.
+ * The corresponding values in x->e_mbd.mi[0], rd_stats_y, mode_cost_y, and
+ * rd_y are also updated. Moreover, in the first evaluation with a directional
+ * mode, a prune mask computed from the histogram of gradients is also stored
+ * in intra_search_state.
+ */
+int av1_handle_intra_y_mode(IntraModeSearchState *intra_search_state,
+                            const AV1_COMP *cpi, MACROBLOCK *x,
+                            BLOCK_SIZE bsize, unsigned int ref_frame_cost,
+                            const PICK_MODE_CONTEXT *ctx, RD_STATS *rd_stats_y,
+                            int64_t best_rd, int *mode_cost_y, int64_t *rd_y,
+                            int64_t *best_model_rd,
+                            int64_t top_intra_model_rd[]);
+
+/*!\brief Search through all chroma intra-modes for inter frames.
+ *
+ * \ingroup intra_mode_search
+ * \callgraph
+ * \callergraph
+ * This function handles intra-mode chroma prediction when the current frame
+ * is an inter frame. This is done by calling \ref av1_rd_pick_intra_sbuv_mode
+ * with some additional book-keeping.
+ *
+ * \param[in,out] intra_search_state Structure holding the intra search state.
+ * \param[in]     cpi                Top-level encoder structure.
+ * \param[in,out] x                  Pointer to structure holding all the
+ *                                   data for the current macroblock.
+ * \param[in]     bsize              Current partition block size.
+ * \param[out]    rd_stats           Struct to keep track of the current
+ *                                   intra-mode's rd_stats (all planes).
+ * \param[out]    rd_stats_y         Struct to keep track of the current
+ *                                   intra-mode's rd_stats (luma only).
+ * \param[out]    rd_stats_uv        Struct to keep track of the current
+ *                                   intra-mode's rd_stats (chroma only).
+ * \param[in]     best_rd            Best RD seen for this block so far.
+ *
+ * \return Returns 1 if a valid intra mode is found, 0 otherwise.
+ * The corresponding values in x->e_mbd.mi[0], rd_stats(_y|_uv) are also
+ * updated. Moreover, in the first invocation of the function, the chroma
+ * intra mode result is cached in intra_search_state to be used in subsequent
+ * calls.
+ */
+int av1_search_intra_uv_modes_in_interframe(
+    IntraModeSearchState *intra_search_state, const AV1_COMP *cpi,
+    MACROBLOCK *x, BLOCK_SIZE bsize, RD_STATS *rd_stats,
+    const RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, int64_t best_rd);
+
+/*!\brief Evaluate luma palette mode for inter frames.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * \callgraph
+ * This function handles luma palette mode when the current frame is an
+ * inter frame.
+ *
+ * \param[in] intra_search_state Structure to hold the best luma intra mode
+ *                               and cache the chroma prediction for speed-up.
+ * \param[in] cpi                Top-level encoder structure.
+ * \param[in] x                  Pointer to structure holding all the data
+ *                               for the current macroblock.
+ * \param[in] bsize              Current partition block size.
+ * \param[in] ref_frame_cost     The entropy cost for signaling that the
+ *                               current ref frame is an intra frame.
+ * \param[in] ctx                Structure to hold the number of 4x4 blks to
+ *                               copy the tx_type and txfm_skip arrays.
+ * \param[in] this_rd_cost       Struct to keep track of palette mode's
+ *                               rd_stats.
+ * \param[in] best_rd            Best RD seen for this block so far.
+ *
+ * \return Returns whether luma palette mode can skip the txfm. The
+ * corresponding mbmi, this_rd_costs, intra_search_state, and tx_type arrays in
+ * ctx are also updated.
+ */
+int av1_search_palette_mode(IntraModeSearchState *intra_search_state,
+                            const AV1_COMP *cpi, MACROBLOCK *x,
+                            BLOCK_SIZE bsize, unsigned int ref_frame_cost,
+                            PICK_MODE_CONTEXT *ctx, RD_STATS *this_rd_cost,
+                            int64_t best_rd);
+
+/*!\brief Evaluate luma palette mode for inter frames.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * \callgraph
+ * This function handles luma palette mode when the current frame is an
+ * inter frame.
+ *
+ * \param[in] cpi                Top-level encoder structure.
+ * \param[in] x                  Pointer to structure holding all the data
+ *                               for the current macroblock.
+ * \param[in] bsize              Current partition block size.
+ * \param[in] ref_frame_cost     The entropy cost for signaling that the
+ *                               current ref frame is an intra frame.
+ * \param[in] ctx                Structure to hold the number of 4x4 blks to
+ *                               copy the tx_type and txfm_skip arrays.
+ * \param[in] this_rd_cost       Struct to keep track of palette mode's
+ *                               rd_stats.
+ * \param[in] best_rd            Best RD seen for this block so far.
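+ *
+ * [Editor's note, not part of the upstream comment] Unlike \ref
+ * av1_search_palette_mode above, this variant searches the luma palette only
+ * and returns its result solely through this_rd_cost (the function itself
+ * returns void).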
+ */
+void av1_search_palette_mode_luma(const AV1_COMP *cpi, MACROBLOCK *x,
+                                  BLOCK_SIZE bsize, unsigned int ref_frame_cost,
+                                  PICK_MODE_CONTEXT *ctx,
+                                  RD_STATS *this_rd_cost, int64_t best_rd);
+
+/*!\brief Perform intra-mode search on luma channels for intra frames.
+ *
+ * \ingroup intra_mode_search
+ * \callgraph
+ * \callergraph
+ * This function performs intra-mode search on the luma channel when the
+ * current frame is intra-only. This function does not search intrabc mode,
+ * but it does search palette and filter_intra.
+ *
+ * \param[in]    cpi            Top-level encoder structure.
+ * \param[in]    x              Pointer to structure holding all the data
+ *                              for the current macroblock.
+ * \param[in]    rate           The total rate needed to predict the current
+ *                              luma block.
+ * \param[in]    rate_tokenonly The rate without the cost of sending the
+ *                              prediction modes.
+ * \param[in]    distortion     The luma distortion of the best prediction
+ *                              after the reconstruction.
+ * \param[in]    skippable      Whether we can skip txfm process.
+ * \param[in]    bsize          Current partition block size.
+ * \param[in]    best_rd        Best RD seen for this block so far.
+ * \param[in]    ctx            Structure to hold the number of 4x4 blks to
+ *                              copy the tx_type and txfm_skip arrays.
+ *
+ * \return Returns the rd_cost if this function finds a mode better than
+ * best_rd, otherwise returns INT64_MAX. This also updates the mbmi, the rate
+ * and distortion, and the tx_type arrays in ctx.
+ */
+int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                   int *rate, int *rate_tokenonly,
+                                   int64_t *distortion, uint8_t *skippable,
+                                   BLOCK_SIZE bsize, int64_t best_rd,
+                                   PICK_MODE_CONTEXT *ctx);
+
+/*!\brief Perform intra-mode search on chroma channels.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * \callgraph
+ * This function performs intra-mode search on the chroma channels. Just like
+ * \ref av1_rd_pick_intra_sby_mode(), this function searches over palette mode
+ * (filter_intra is not available on chroma planes). Unlike \ref
+ * av1_rd_pick_intra_sby_mode() this function is used by both inter and intra
+ * frames.
+ *
+ * \param[in]    cpi            Top-level encoder structure.
+ * \param[in]    x              Pointer to structure holding all the data
+ *                              for the current macroblock.
+ * \param[in]    rate           The total rate needed to predict the current
+ *                              chroma block.
+ * \param[in]    rate_tokenonly The rate without the cost of sending the
+ *                              prediction modes.
+ * \param[in]    distortion     The chroma distortion of the best prediction
+ *                              after the reconstruction.
+ * \param[in]    skippable      Whether we can skip txfm process.
+ * \param[in]    bsize          Current partition block size.
+ * \param[in]    max_tx_size    The maximum tx_size available.
+ *
+ * \return Returns the rd_cost of the best uv mode found. This also updates
+ * the mbmi, and the rate and distortion.
+ */
+int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                    int *rate, int *rate_tokenonly,
+                                    int64_t *distortion, uint8_t *skippable,
+                                    BLOCK_SIZE bsize, TX_SIZE max_tx_size);
+
+/*! \brief Return the number of colors in src. Used by palette mode.
+ */
+void av1_count_colors(const uint8_t *src, int stride, int rows, int cols,
+                      int *val_count, int *num_colors);
+
+/*! \brief See \ref av1_count_colors(), but for highbd.
+ */
+void av1_count_colors_highbd(const uint8_t *src8, int stride, int rows,
+                             int cols, int bit_depth, int *val_count,
+                             int *val_count_8bit, int *num_color_bins,
+                             int *num_colors);
+
+/*! \brief Initializes the \ref IntraModeSearchState struct.
+ */
+static AOM_INLINE void init_intra_mode_search_state(
+    IntraModeSearchState *intra_search_state) {
+  memset(intra_search_state, 0, sizeof(*intra_search_state));
+  intra_search_state->rate_uv_intra = INT_MAX;
+}
+
+/*! \brief Set the luma intra mode and delta angles for a given mode index.
+ * The total number of luma intra modes is LUMA_MODE_COUNT = 61. The first 13
+ * modes are from DC_PRED to PAETH_PRED, followed by directional modes. Each
+ * of the main 8 directional modes has 6 = MAX_ANGLE_DELTA * 2 delta angles.
+ * \param[in] mode_idx                  Mode index in the intra mode decision
+ *                                      process.
+ * \param[in] mbmi                      Pointer to structure holding the mode
+ *                                      info for the current macroblock.
+ * \param[in] reorder_delta_angle_eval  Indicates whether to reorder the
+ *                                      evaluation of delta angle modes.
+ */
+void set_y_mode_and_delta_angle(const int mode_idx, MB_MODE_INFO *const mbmi,
                                int reorder_delta_angle_eval);
+
+/*! \brief Prune the luma intra mode based on the model rd.
+ * \param[in] this_model_rd              Model rd for the current mode.
+ * \param[in] best_model_rd              Best model RD seen for this block so
+ *                                       far.
+ * \param[in] top_intra_model_rd         Top intra model RD seen for this
+ *                                       block so far.
+ * \param[in] max_model_cnt_allowed      The maximum number of top intra
+ *                                       model RD allowed.
+ * \param[in] model_rd_index_for_pruning Index of the candidate used for
+ *                                       pruning based on model rd.
+ *
+ * \return Returns 1 if the luma intra mode should be pruned; 0 otherwise.
+ */
+int prune_intra_y_mode(int64_t this_model_rd, int64_t *best_model_rd,
+                       int64_t top_intra_model_rd[], int max_model_cnt_allowed,
+                       int model_rd_index_for_pruning);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_INTRA_MODE_SEARCH_H_
diff --git a/third_party/aom/av1/encoder/intra_mode_search_utils.h b/third_party/aom/av1/encoder/intra_mode_search_utils.h
new file mode 100644
index 0000000000..107c2236f8
--- /dev/null
+++ b/third_party/aom/av1/encoder/intra_mode_search_utils.h
@@ -0,0 +1,690 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Defines utility functions used in intra mode search.
+ *
+ * This includes rdcost estimations, histogram based pruning, etc.
+ */
+#ifndef AOM_AV1_ENCODER_INTRA_MODE_SEARCH_UTILS_H_
+#define AOM_AV1_ENCODER_INTRA_MODE_SEARCH_UTILS_H_
+
+#include "av1/common/enums.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/model_rd.h"
+#include "av1/encoder/palette.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\cond */
+// Macro for computing the speed-preset dependent threshold which is used for
+// deciding whether to enable/disable variance calculations in
+// intra_rd_variance_factor().
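+// [Editor's illustration, not upstream code] Worked values of the macro
+// below: speed 0 -> 1.0, speed 2 -> 0.5, speed 4 -> 0.0. Values <= 0 are
+// treated as "disabled" by is_src_var_for_4x4_sub_blocks_caching_enabled()
+// later in this header.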
+#define INTRA_RD_VAR_THRESH(X) (1.0 - (0.25 * (X))) + +#define BINS 32 +static const float av1_intra_hog_model_bias[DIRECTIONAL_MODES] = { + 0.450578f, 0.695518f, -0.717944f, -0.639894f, + -0.602019f, -0.453454f, 0.055857f, -0.465480f, +}; + +static const float av1_intra_hog_model_weights[BINS * DIRECTIONAL_MODES] = { + -3.076402f, -3.757063f, -3.275266f, -3.180665f, -3.452105f, -3.216593f, + -2.871212f, -3.134296f, -1.822324f, -2.401411f, -1.541016f, -1.195322f, + -0.434156f, 0.322868f, 2.260546f, 3.368715f, 3.989290f, 3.308487f, + 2.277893f, 0.923793f, 0.026412f, -0.385174f, -0.718622f, -1.408867f, + -1.050558f, -2.323941f, -2.225827f, -2.585453f, -3.054283f, -2.875087f, + -2.985709f, -3.447155f, 3.758139f, 3.204353f, 2.170998f, 0.826587f, + -0.269665f, -0.702068f, -1.085776f, -2.175249f, -1.623180f, -2.975142f, + -2.779629f, -3.190799f, -3.521900f, -3.375480f, -3.319355f, -3.897389f, + -3.172334f, -3.594528f, -2.879132f, -2.547777f, -2.921023f, -2.281844f, + -1.818988f, -2.041771f, -0.618268f, -1.396458f, -0.567153f, -0.285868f, + -0.088058f, 0.753494f, 2.092413f, 3.215266f, -3.300277f, -2.748658f, + -2.315784f, -2.423671f, -2.257283f, -2.269583f, -2.196660f, -2.301076f, + -2.646516f, -2.271319f, -2.254366f, -2.300102f, -2.217960f, -2.473300f, + -2.116866f, -2.528246f, -3.314712f, -1.701010f, -0.589040f, -0.088077f, + 0.813112f, 1.702213f, 2.653045f, 3.351749f, 3.243554f, 3.199409f, + 2.437856f, 1.468854f, 0.533039f, -0.099065f, -0.622643f, -2.200732f, + -4.228861f, -2.875263f, -1.273956f, -0.433280f, 0.803771f, 1.975043f, + 3.179528f, 3.939064f, 3.454379f, 3.689386f, 3.116411f, 1.970991f, + 0.798406f, -0.628514f, -1.252546f, -2.825176f, -4.090178f, -3.777448f, + -3.227314f, -3.479403f, -3.320569f, -3.159372f, -2.729202f, -2.722341f, + -3.054913f, -2.742923f, -2.612703f, -2.662632f, -2.907314f, -3.117794f, + -3.102660f, -3.970972f, -4.891357f, -3.935582f, -3.347758f, -2.721924f, + -2.219011f, -1.702391f, -0.866529f, -0.153743f, 0.107733f, 1.416882f, + 2.572884f, 3.607755f, 3.974820f, 3.997783f, 2.970459f, 0.791687f, + -1.478921f, -1.228154f, -1.216955f, -1.765932f, -1.951003f, -1.985301f, + -1.975881f, -1.985593f, -2.422371f, -2.419978f, -2.531288f, -2.951853f, + -3.071380f, -3.277027f, -3.373539f, -4.462010f, -0.967888f, 0.805524f, + 2.794130f, 3.685984f, 3.745195f, 3.252444f, 2.316108f, 1.399146f, + -0.136519f, -0.162811f, -1.004357f, -1.667911f, -1.964662f, -2.937579f, + -3.019533f, -3.942766f, -5.102767f, -3.882073f, -3.532027f, -3.451956f, + -2.944015f, -2.643064f, -2.529872f, -2.077290f, -2.809965f, -1.803734f, + -1.783593f, -1.662585f, -1.415484f, -1.392673f, -0.788794f, -1.204819f, + -1.998864f, -1.182102f, -0.892110f, -1.317415f, -1.359112f, -1.522867f, + -1.468552f, -1.779072f, -2.332959f, -2.160346f, -2.329387f, -2.631259f, + -2.744936f, -3.052494f, -2.787363f, -3.442548f, -4.245075f, -3.032172f, + -2.061609f, -1.768116f, -1.286072f, -0.706587f, -0.192413f, 0.386938f, + 0.716997f, 1.481393f, 2.216702f, 2.737986f, 3.109809f, 3.226084f, + 2.490098f, -0.095827f, -3.864816f, -3.507248f, -3.128925f, -2.908251f, + -2.883836f, -2.881411f, -2.524377f, -2.624478f, -2.399573f, -2.367718f, + -1.918255f, -1.926277f, -1.694584f, -1.723790f, -0.966491f, -1.183115f, + -1.430687f, 0.872896f, 2.766550f, 3.610080f, 3.578041f, 3.334928f, + 2.586680f, 1.895721f, 1.122195f, 0.488519f, -0.140689f, -0.799076f, + -1.222860f, -1.502437f, -1.900969f, -3.206816f, +}; + +static const NN_CONFIG av1_intra_hog_model_nnconfig = { + BINS, // num_inputs + DIRECTIONAL_MODES, // num_outputs + 0, // 
num_hidden_layers + { 0 }, + { + av1_intra_hog_model_weights, + }, + { + av1_intra_hog_model_bias, + }, +}; + +#define FIX_PREC_BITS (16) +static AOM_INLINE int get_hist_bin_idx(int dx, int dy) { + const int32_t ratio = (dy * (1 << FIX_PREC_BITS)) / dx; + + // Find index by bisection + static const int thresholds[BINS] = { + -1334015, -441798, -261605, -183158, -138560, -109331, -88359, -72303, + -59392, -48579, -39272, -30982, -23445, -16400, -9715, -3194, + 3227, 9748, 16433, 23478, 31015, 39305, 48611, 59425, + 72336, 88392, 109364, 138593, 183191, 261638, 441831, INT32_MAX + }; + + int lo_idx = 0, hi_idx = BINS - 1; + // Divide into segments of size 8 gives better performance than binary search + // here. + if (ratio <= thresholds[7]) { + lo_idx = 0; + hi_idx = 7; + } else if (ratio <= thresholds[15]) { + lo_idx = 8; + hi_idx = 15; + } else if (ratio <= thresholds[23]) { + lo_idx = 16; + hi_idx = 23; + } else { + lo_idx = 24; + hi_idx = 31; + } + + for (int idx = lo_idx; idx <= hi_idx; idx++) { + if (ratio <= thresholds[idx]) { + return idx; + } + } + assert(0 && "No valid histogram bin found!"); + return BINS - 1; +} +#undef FIX_PREC_BITS + +// Normalizes the hog data. +static AOM_INLINE void normalize_hog(float total, float *hist) { + for (int i = 0; i < BINS; ++i) hist[i] /= total; +} + +static AOM_INLINE void lowbd_generate_hog(const uint8_t *src, int stride, + int rows, int cols, float *hist) { + float total = 0.1f; + src += stride; + for (int r = 1; r < rows - 1; ++r) { + for (int c = 1; c < cols - 1; ++c) { + const uint8_t *above = &src[c - stride]; + const uint8_t *below = &src[c + stride]; + const uint8_t *left = &src[c - 1]; + const uint8_t *right = &src[c + 1]; + // Calculate gradient using Sobel filters. + const int dx = (right[-stride] + 2 * right[0] + right[stride]) - + (left[-stride] + 2 * left[0] + left[stride]); + const int dy = (below[-1] + 2 * below[0] + below[1]) - + (above[-1] + 2 * above[0] + above[1]); + if (dx == 0 && dy == 0) continue; + const int temp = abs(dx) + abs(dy); + if (!temp) continue; + total += temp; + if (dx == 0) { + hist[0] += temp / 2; + hist[BINS - 1] += temp / 2; + } else { + const int idx = get_hist_bin_idx(dx, dy); + assert(idx >= 0 && idx < BINS); + hist[idx] += temp; + } + } + src += stride; + } + + normalize_hog(total, hist); +} + +// Computes and stores pixel level gradient information of a given superblock +// for LBD encode. +static AOM_INLINE void lowbd_compute_gradient_info_sb(MACROBLOCK *const x, + BLOCK_SIZE sb_size, + PLANE_TYPE plane) { + PixelLevelGradientInfo *const grad_info_sb = + x->pixel_gradient_info + plane * MAX_SB_SQUARE; + const uint8_t *src = x->plane[plane].src.buf; + const int stride = x->plane[plane].src.stride; + const int ss_x = x->e_mbd.plane[plane].subsampling_x; + const int ss_y = x->e_mbd.plane[plane].subsampling_y; + const int sb_height = block_size_high[sb_size] >> ss_y; + const int sb_width = block_size_wide[sb_size] >> ss_x; + src += stride; + for (int r = 1; r < sb_height - 1; ++r) { + for (int c = 1; c < sb_width - 1; ++c) { + const uint8_t *above = &src[c - stride]; + const uint8_t *below = &src[c + stride]; + const uint8_t *left = &src[c - 1]; + const uint8_t *right = &src[c + 1]; + // Calculate gradient using Sobel filters. 
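+      // [Editor's illustration, not upstream code] The dx/dy taps below are
+      // the standard 3x3 Sobel kernels, centered on src[c]:
+      //
+      //   dx:  -1  0 +1      dy:  -1 -2 -1
+      //        -2  0 +2            0  0  0
+      //        -1  0 +1           +1 +2 +1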
+ const int dx = (right[-stride] + 2 * right[0] + right[stride]) - + (left[-stride] + 2 * left[0] + left[stride]); + const int dy = (below[-1] + 2 * below[0] + below[1]) - + (above[-1] + 2 * above[0] + above[1]); + grad_info_sb[r * sb_width + c].is_dx_zero = (dx == 0); + grad_info_sb[r * sb_width + c].abs_dx_abs_dy_sum = + (uint16_t)(abs(dx) + abs(dy)); + grad_info_sb[r * sb_width + c].hist_bin_idx = + (dx != 0) ? get_hist_bin_idx(dx, dy) : -1; + } + src += stride; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static AOM_INLINE void highbd_generate_hog(const uint8_t *src8, int stride, + int rows, int cols, float *hist) { + float total = 0.1f; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + src += stride; + for (int r = 1; r < rows - 1; ++r) { + for (int c = 1; c < cols - 1; ++c) { + const uint16_t *above = &src[c - stride]; + const uint16_t *below = &src[c + stride]; + const uint16_t *left = &src[c - 1]; + const uint16_t *right = &src[c + 1]; + // Calculate gradient using Sobel filters. + const int dx = (right[-stride] + 2 * right[0] + right[stride]) - + (left[-stride] + 2 * left[0] + left[stride]); + const int dy = (below[-1] + 2 * below[0] + below[1]) - + (above[-1] + 2 * above[0] + above[1]); + if (dx == 0 && dy == 0) continue; + const int temp = abs(dx) + abs(dy); + if (!temp) continue; + total += temp; + if (dx == 0) { + hist[0] += temp / 2; + hist[BINS - 1] += temp / 2; + } else { + const int idx = get_hist_bin_idx(dx, dy); + assert(idx >= 0 && idx < BINS); + hist[idx] += temp; + } + } + src += stride; + } + + normalize_hog(total, hist); +} + +// Computes and stores pixel level gradient information of a given superblock +// for HBD encode. +static AOM_INLINE void highbd_compute_gradient_info_sb(MACROBLOCK *const x, + BLOCK_SIZE sb_size, + PLANE_TYPE plane) { + PixelLevelGradientInfo *const grad_info_sb = + x->pixel_gradient_info + plane * MAX_SB_SQUARE; + const uint16_t *src = CONVERT_TO_SHORTPTR(x->plane[plane].src.buf); + const int stride = x->plane[plane].src.stride; + const int ss_x = x->e_mbd.plane[plane].subsampling_x; + const int ss_y = x->e_mbd.plane[plane].subsampling_y; + const int sb_height = block_size_high[sb_size] >> ss_y; + const int sb_width = block_size_wide[sb_size] >> ss_x; + src += stride; + for (int r = 1; r < sb_height - 1; ++r) { + for (int c = 1; c < sb_width - 1; ++c) { + const uint16_t *above = &src[c - stride]; + const uint16_t *below = &src[c + stride]; + const uint16_t *left = &src[c - 1]; + const uint16_t *right = &src[c + 1]; + // Calculate gradient using Sobel filters. + const int dx = (right[-stride] + 2 * right[0] + right[stride]) - + (left[-stride] + 2 * left[0] + left[stride]); + const int dy = (below[-1] + 2 * below[0] + below[1]) - + (above[-1] + 2 * above[0] + above[1]); + grad_info_sb[r * sb_width + c].is_dx_zero = (dx == 0); + grad_info_sb[r * sb_width + c].abs_dx_abs_dy_sum = + (uint16_t)(abs(dx) + abs(dy)); + grad_info_sb[r * sb_width + c].hist_bin_idx = + (dx != 0) ? 
get_hist_bin_idx(dx, dy) : -1; + } + src += stride; + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static AOM_INLINE void generate_hog(const uint8_t *src8, int stride, int rows, + int cols, float *hist, int highbd) { +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd) { + highbd_generate_hog(src8, stride, rows, cols, hist); + return; + } +#else + (void)highbd; +#endif // CONFIG_AV1_HIGHBITDEPTH + lowbd_generate_hog(src8, stride, rows, cols, hist); +} + +static AOM_INLINE void compute_gradient_info_sb(MACROBLOCK *const x, + BLOCK_SIZE sb_size, + PLANE_TYPE plane) { +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(&x->e_mbd)) { + highbd_compute_gradient_info_sb(x, sb_size, plane); + return; + } +#endif // CONFIG_AV1_HIGHBITDEPTH + lowbd_compute_gradient_info_sb(x, sb_size, plane); +} + +// Gradient caching at superblock level is allowed only if all of the following +// conditions are satisfied: +// (1) The current frame is an intra only frame +// (2) Non-RD mode decisions are not enabled +// (3) The sf partition_search_type is set to SEARCH_PARTITION +// (4) Either intra_pruning_with_hog or chroma_intra_pruning_with_hog is enabled +// +// SB level caching of gradient data may not help in speedup for the following +// cases: +// (1) Inter frames (due to early intra gating) +// (2) When partition_search_type is not SEARCH_PARTITION +// Hence, gradient data is computed at block level in such cases. +static AOM_INLINE bool is_gradient_caching_for_hog_enabled( + const AV1_COMP *const cpi) { + const SPEED_FEATURES *const sf = &cpi->sf; + return frame_is_intra_only(&cpi->common) && !sf->rt_sf.use_nonrd_pick_mode && + (sf->part_sf.partition_search_type == SEARCH_PARTITION) && + (sf->intra_sf.intra_pruning_with_hog || + sf->intra_sf.chroma_intra_pruning_with_hog); +} + +// Function to generate pixel level gradient information for a given superblock. +// Sets the flags 'is_sb_gradient_cached' for the specific plane-type if +// gradient info is generated for the same. +static AOM_INLINE void produce_gradients_for_sb(AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE sb_size, int mi_row, + int mi_col) { + // Initialise flags related to hog data caching. + x->is_sb_gradient_cached[PLANE_TYPE_Y] = false; + x->is_sb_gradient_cached[PLANE_TYPE_UV] = false; + if (!is_gradient_caching_for_hog_enabled(cpi)) return; + + const SPEED_FEATURES *sf = &cpi->sf; + const int num_planes = av1_num_planes(&cpi->common); + + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, sb_size); + + if (sf->intra_sf.intra_pruning_with_hog) { + compute_gradient_info_sb(x, sb_size, PLANE_TYPE_Y); + x->is_sb_gradient_cached[PLANE_TYPE_Y] = true; + } + if (sf->intra_sf.chroma_intra_pruning_with_hog && num_planes > 1) { + compute_gradient_info_sb(x, sb_size, PLANE_TYPE_UV); + x->is_sb_gradient_cached[PLANE_TYPE_UV] = true; + } +} + +// Reuses the pixel level gradient data generated at superblock level for block +// level histogram computation. +static AOM_INLINE void generate_hog_using_gradient_cache(const MACROBLOCK *x, + int rows, int cols, + BLOCK_SIZE sb_size, + PLANE_TYPE plane, + float *hist) { + float total = 0.1f; + const int ss_x = x->e_mbd.plane[plane].subsampling_x; + const int ss_y = x->e_mbd.plane[plane].subsampling_y; + const int sb_width = block_size_wide[sb_size] >> ss_x; + + // Derive the offset from the starting of the superblock in order to locate + // the block level gradient data in the cache. 
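+  // [Editor's illustration, not upstream code] MI units are 4x4 luma pixels
+  // (MI_SIZE_LOG2 == 2), so the shifts below convert the block's mi offset
+  // inside the superblock into subsampled pixel coordinates:
+  //   row = mi_row_in_sb * 4 >> ss_y,  col = mi_col_in_sb * 4 >> ss_x
+  // which are then flattened into the sb_width-wide cache.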
+ const int mi_row_in_sb = x->e_mbd.mi_row & (mi_size_high[sb_size] - 1); + const int mi_col_in_sb = x->e_mbd.mi_col & (mi_size_wide[sb_size] - 1); + const int block_offset_in_grad_cache = + sb_width * (mi_row_in_sb << (MI_SIZE_LOG2 - ss_y)) + + (mi_col_in_sb << (MI_SIZE_LOG2 - ss_x)); + const PixelLevelGradientInfo *grad_info_blk = x->pixel_gradient_info + + plane * MAX_SB_SQUARE + + block_offset_in_grad_cache; + + // Retrieve the cached gradient information and generate the histogram. + for (int r = 1; r < rows - 1; ++r) { + for (int c = 1; c < cols - 1; ++c) { + const uint16_t abs_dx_abs_dy_sum = + grad_info_blk[r * sb_width + c].abs_dx_abs_dy_sum; + if (!abs_dx_abs_dy_sum) continue; + total += abs_dx_abs_dy_sum; + const bool is_dx_zero = grad_info_blk[r * sb_width + c].is_dx_zero; + if (is_dx_zero) { + hist[0] += abs_dx_abs_dy_sum >> 1; + hist[BINS - 1] += abs_dx_abs_dy_sum >> 1; + } else { + const int8_t idx = grad_info_blk[r * sb_width + c].hist_bin_idx; + assert(idx >= 0 && idx < BINS); + hist[idx] += abs_dx_abs_dy_sum; + } + } + } + normalize_hog(total, hist); +} + +static INLINE void collect_hog_data(const MACROBLOCK *x, BLOCK_SIZE bsize, + BLOCK_SIZE sb_size, int plane, float *hog) { + const MACROBLOCKD *xd = &x->e_mbd; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + const int bh = block_size_high[bsize]; + const int bw = block_size_wide[bsize]; + const int rows = + ((xd->mb_to_bottom_edge >= 0) ? bh : (xd->mb_to_bottom_edge >> 3) + bh) >> + ss_y; + const int cols = + ((xd->mb_to_right_edge >= 0) ? bw : (xd->mb_to_right_edge >> 3) + bw) >> + ss_x; + + // If gradient data is already generated at SB level, reuse the cached data. + // Otherwise, compute the data. + if (x->is_sb_gradient_cached[plane]) { + generate_hog_using_gradient_cache(x, rows, cols, sb_size, plane, hog); + } else { + const uint8_t *src = x->plane[plane].src.buf; + const int src_stride = x->plane[plane].src.stride; + generate_hog(src, src_stride, rows, cols, hog, is_cur_buf_hbd(xd)); + } + + // Scale the hog so the luma and chroma are on the same scale + for (int b = 0; b < BINS; ++b) { + hog[b] *= (1 + ss_x) * (1 + ss_y); + } +} + +static AOM_INLINE void prune_intra_mode_with_hog( + const MACROBLOCK *x, BLOCK_SIZE bsize, BLOCK_SIZE sb_size, float th, + uint8_t *directional_mode_skip_mask, int is_chroma) { + const int plane = is_chroma ? AOM_PLANE_U : AOM_PLANE_Y; + float hist[BINS] = { 0.0f }; + collect_hog_data(x, bsize, sb_size, plane, hist); + + // Make prediction for each of the mode + float scores[DIRECTIONAL_MODES] = { 0.0f }; + av1_nn_predict(hist, &av1_intra_hog_model_nnconfig, 1, scores); + for (UV_PREDICTION_MODE uv_mode = UV_V_PRED; uv_mode <= UV_D67_PRED; + uv_mode++) { + if (scores[uv_mode - UV_V_PRED] <= th) { + directional_mode_skip_mask[uv_mode] = 1; + } + } +} +#undef BINS + +int av1_calc_normalized_variance(aom_variance_fn_t vf, const uint8_t *const buf, + const int stride, const int is_hbd); + +// Returns whether caching of source variance for 4x4 sub-blocks is allowed. 
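+// [Editor's note, not upstream code] Summarizing the checks below: caching
+// is allowed only for ALLINTRA encoding, and then either (a) the full
+// partition search is used, or (b) the variance threshold is still active
+// (INTRA_RD_VAR_THRESH(speed) > 0) and any nonrd pickmode in use is the
+// hybrid intra one.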
+static AOM_INLINE bool is_src_var_for_4x4_sub_blocks_caching_enabled( + const AV1_COMP *const cpi) { + const SPEED_FEATURES *const sf = &cpi->sf; + if (cpi->oxcf.mode != ALLINTRA) return false; + + if (sf->part_sf.partition_search_type == SEARCH_PARTITION) return true; + + if (INTRA_RD_VAR_THRESH(cpi->oxcf.speed) <= 0 || + (sf->rt_sf.use_nonrd_pick_mode && !sf->rt_sf.hybrid_intra_pickmode)) + return false; + + return true; +} + +// Initialize the members of Block4x4VarInfo structure to -1 at the start +// of every superblock. +static AOM_INLINE void init_src_var_info_of_4x4_sub_blocks( + const AV1_COMP *const cpi, Block4x4VarInfo *src_var_info_of_4x4_sub_blocks, + const BLOCK_SIZE sb_size) { + if (!is_src_var_for_4x4_sub_blocks_caching_enabled(cpi)) return; + + const int mi_count_in_sb = mi_size_wide[sb_size] * mi_size_high[sb_size]; + for (int i = 0; i < mi_count_in_sb; i++) { + src_var_info_of_4x4_sub_blocks[i].var = -1; + src_var_info_of_4x4_sub_blocks[i].log_var = -1.0; + } +} + +// Returns the cost needed to send a uniformly distributed r.v. +static AOM_INLINE int write_uniform_cost(int n, int v) { + const int l = get_unsigned_bits(n); + const int m = (1 << l) - n; + if (l == 0) return 0; + if (v < m) + return av1_cost_literal(l - 1); + else + return av1_cost_literal(l); +} +/*!\endcond */ + +/*!\brief Returns the rate cost for luma prediction mode info of intra blocks. + * + * \callergraph + */ +static AOM_INLINE int intra_mode_info_cost_y(const AV1_COMP *cpi, + const MACROBLOCK *x, + const MB_MODE_INFO *mbmi, + BLOCK_SIZE bsize, int mode_cost, + int discount_color_cost) { + int total_rate = mode_cost; + const ModeCosts *mode_costs = &x->mode_costs; + const int use_palette = mbmi->palette_mode_info.palette_size[0] > 0; + const int use_filter_intra = mbmi->filter_intra_mode_info.use_filter_intra; + const int use_intrabc = mbmi->use_intrabc; + // Can only activate one mode. 
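+  // [Editor's note, not upstream code] i.e. palette, intrabc and
+  // filter-intra are mutually exclusive, and using any of them implies
+  // mbmi->mode == DC_PRED.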
+ assert(((mbmi->mode != DC_PRED) + use_palette + use_intrabc + + use_filter_intra) <= 1); + const int try_palette = av1_allow_palette( + cpi->common.features.allow_screen_content_tools, mbmi->bsize); + if (try_palette && mbmi->mode == DC_PRED) { + const MACROBLOCKD *xd = &x->e_mbd; + const int bsize_ctx = av1_get_palette_bsize_ctx(bsize); + const int mode_ctx = av1_get_palette_mode_ctx(xd); + total_rate += + mode_costs->palette_y_mode_cost[bsize_ctx][mode_ctx][use_palette]; + if (use_palette) { + const uint8_t *const color_map = xd->plane[0].color_index_map; + int block_width, block_height, rows, cols; + av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows, + &cols); + const int plt_size = mbmi->palette_mode_info.palette_size[0]; + int palette_mode_cost = + mode_costs + ->palette_y_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] + + write_uniform_cost(plt_size, color_map[0]); + uint16_t color_cache[2 * PALETTE_MAX_SIZE]; + const int n_cache = av1_get_palette_cache(xd, 0, color_cache); + palette_mode_cost += + av1_palette_color_cost_y(&mbmi->palette_mode_info, color_cache, + n_cache, cpi->common.seq_params->bit_depth); + if (!discount_color_cost) + palette_mode_cost += + av1_cost_color_map(x, 0, bsize, mbmi->tx_size, PALETTE_MAP); + + total_rate += palette_mode_cost; + } + } + if (av1_filter_intra_allowed(&cpi->common, mbmi)) { + total_rate += mode_costs->filter_intra_cost[mbmi->bsize][use_filter_intra]; + if (use_filter_intra) { + total_rate += + mode_costs->filter_intra_mode_cost[mbmi->filter_intra_mode_info + .filter_intra_mode]; + } + } + if (av1_is_directional_mode(mbmi->mode)) { + if (av1_use_angle_delta(bsize)) { + total_rate += + mode_costs->angle_delta_cost[mbmi->mode - V_PRED] + [MAX_ANGLE_DELTA + + mbmi->angle_delta[PLANE_TYPE_Y]]; + } + } + if (av1_allow_intrabc(&cpi->common)) + total_rate += mode_costs->intrabc_cost[use_intrabc]; + return total_rate; +} + +/*!\brief Return the rate cost for chroma prediction mode info of intra blocks. + * + * \callergraph + */ +static AOM_INLINE int intra_mode_info_cost_uv(const AV1_COMP *cpi, + const MACROBLOCK *x, + const MB_MODE_INFO *mbmi, + BLOCK_SIZE bsize, int mode_cost) { + int total_rate = mode_cost; + const ModeCosts *mode_costs = &x->mode_costs; + const int use_palette = mbmi->palette_mode_info.palette_size[1] > 0; + const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode; + // Can only activate one mode. 
+ assert(((uv_mode != UV_DC_PRED) + use_palette + mbmi->use_intrabc) <= 1); + + const int try_palette = av1_allow_palette( + cpi->common.features.allow_screen_content_tools, mbmi->bsize); + if (try_palette && uv_mode == UV_DC_PRED) { + const PALETTE_MODE_INFO *pmi = &mbmi->palette_mode_info; + total_rate += + mode_costs->palette_uv_mode_cost[pmi->palette_size[0] > 0][use_palette]; + if (use_palette) { + const int bsize_ctx = av1_get_palette_bsize_ctx(bsize); + const int plt_size = pmi->palette_size[1]; + const MACROBLOCKD *xd = &x->e_mbd; + const uint8_t *const color_map = xd->plane[1].color_index_map; + int palette_mode_cost = + mode_costs + ->palette_uv_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] + + write_uniform_cost(plt_size, color_map[0]); + uint16_t color_cache[2 * PALETTE_MAX_SIZE]; + const int n_cache = av1_get_palette_cache(xd, 1, color_cache); + palette_mode_cost += av1_palette_color_cost_uv( + pmi, color_cache, n_cache, cpi->common.seq_params->bit_depth); + palette_mode_cost += + av1_cost_color_map(x, 1, bsize, mbmi->tx_size, PALETTE_MAP); + total_rate += palette_mode_cost; + } + } + const PREDICTION_MODE intra_mode = get_uv_mode(uv_mode); + if (av1_is_directional_mode(intra_mode)) { + if (av1_use_angle_delta(bsize)) { + total_rate += + mode_costs->angle_delta_cost[intra_mode - V_PRED] + [mbmi->angle_delta[PLANE_TYPE_UV] + + MAX_ANGLE_DELTA]; + } + } + return total_rate; +} + +/*!\cond */ +// Makes a quick intra prediction and estimate the rdcost with a model without +// going through the whole txfm/quantize/itxfm process. +static int64_t intra_model_rd(const AV1_COMMON *cm, MACROBLOCK *const x, + int plane, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, int use_hadamard) { + MACROBLOCKD *const xd = &x->e_mbd; + const BitDepthInfo bd_info = get_bit_depth_info(xd); + int row, col; + assert(!is_inter_block(xd->mi[0])); + const int stepr = tx_size_high_unit[tx_size]; + const int stepc = tx_size_wide_unit[tx_size]; + const int txbw = tx_size_wide[tx_size]; + const int txbh = tx_size_high[tx_size]; + const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); + const int max_blocks_high = max_block_high(xd, plane_bsize, plane); + int64_t satd_cost = 0; + struct macroblock_plane *p = &x->plane[plane]; + struct macroblockd_plane *pd = &xd->plane[plane]; + // Prediction. + for (row = 0; row < max_blocks_high; row += stepr) { + for (col = 0; col < max_blocks_wide; col += stepc) { + av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size); + // Here we use p->src_diff and p->coeff as temporary buffers for + // prediction residue and transform coefficients. The buffers are only + // used in this for loop, therefore we don't need to properly add offset + // to the buffers. + av1_subtract_block( + bd_info, txbh, txbw, p->src_diff, block_size_wide[plane_bsize], + p->src.buf + (((row * p->src.stride) + col) << 2), p->src.stride, + pd->dst.buf + (((row * pd->dst.stride) + col) << 2), pd->dst.stride); + av1_quick_txfm(use_hadamard, tx_size, bd_info, p->src_diff, + block_size_wide[plane_bsize], p->coeff); + satd_cost += aom_satd(p->coeff, tx_size_2d[tx_size]); + } + } + return satd_cost; +} +/*!\endcond */ + +/*!\brief Estimate the luma rdcost of a given intra mode and try to prune it. + * + * \ingroup intra_mode_search + * \callergraph + * This function first makes a quick luma prediction and estimates the rdcost + * with a model without going through the txfm, then try to prune the current + * mode if the new estimate y_rd > 1.25 * best_model_rd. 
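+ * [Editor's note, not part of the upstream comment] The 1.25 factor is the
+ * integer test used below: this_model_rd > *best_model_rd +
+ * (*best_model_rd >> 2), i.e. more than 5/4 of best_model_rd.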
+ *
+ * \return Returns 1 if the given mode is pruned; 0 otherwise.
+ */
+static AOM_INLINE int model_intra_yrd_and_prune(const AV1_COMP *const cpi,
+                                                MACROBLOCK *x, BLOCK_SIZE bsize,
+                                                int64_t *best_model_rd) {
+  const TX_SIZE tx_size = AOMMIN(TX_32X32, max_txsize_lookup[bsize]);
+  const int plane = 0;
+  const AV1_COMMON *cm = &cpi->common;
+  const int64_t this_model_rd =
+      intra_model_rd(cm, x, plane, bsize, tx_size, /*use_hadamard=*/1);
+  if (*best_model_rd != INT64_MAX &&
+      this_model_rd > *best_model_rd + (*best_model_rd >> 2)) {
+    return 1;
+  } else if (this_model_rd < *best_model_rd) {
+    *best_model_rd = this_model_rd;
+  }
+  return 0;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_INTRA_MODE_SEARCH_UTILS_H_
diff --git a/third_party/aom/av1/encoder/k_means_template.h b/third_party/aom/av1/encoder/k_means_template.h
new file mode 100644
index 0000000000..4be2038a6f
--- /dev/null
+++ b/third_party/aom/av1/encoder/k_means_template.h
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "av1/common/blockd.h"
+#include "av1/encoder/palette.h"
+#include "av1/encoder/random.h"
+
+#ifndef AV1_K_MEANS_DIM
+#error "This template requires AV1_K_MEANS_DIM to be defined"
+#endif
+
+#define RENAME_(x, y) AV1_K_MEANS_RENAME(x, y)
+#define RENAME(x) RENAME_(x, AV1_K_MEANS_DIM)
+
+// Though we want to compute the smallest L2 norm, in 1 dimension,
+// it is equivalent to finding the smallest L1 norm and then squaring it.
+// This is preferable for speed, especially on the SIMD side.
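+// [Editor's illustration, not upstream code] In 1-D, argmin_j |p - c_j| and
+// argmin_j (p - c_j)^2 select the same centroid, so calc_dist() below
+// returns the plain absolute difference, and av1_calc_indices() squares only
+// the winning distance when accumulating the total
+// (*dist += min_dist * min_dist).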
+static int RENAME(calc_dist)(const int16_t *p1, const int16_t *p2) { +#if AV1_K_MEANS_DIM == 1 + return abs(p1[0] - p2[0]); +#else + int dist = 0; + for (int i = 0; i < AV1_K_MEANS_DIM; ++i) { + const int diff = p1[i] - p2[i]; + dist += diff * diff; + } + return dist; +#endif +} + +void RENAME(av1_calc_indices)(const int16_t *data, const int16_t *centroids, + uint8_t *indices, int64_t *dist, int n, int k) { + if (dist) { + *dist = 0; + } + for (int i = 0; i < n; ++i) { + int min_dist = RENAME(calc_dist)(data + i * AV1_K_MEANS_DIM, centroids); + indices[i] = 0; + for (int j = 1; j < k; ++j) { + const int this_dist = RENAME(calc_dist)(data + i * AV1_K_MEANS_DIM, + centroids + j * AV1_K_MEANS_DIM); + if (this_dist < min_dist) { + min_dist = this_dist; + indices[i] = j; + } + } + if (dist) { +#if AV1_K_MEANS_DIM == 1 + *dist += min_dist * min_dist; +#else + *dist += min_dist; +#endif + } + } +} + +static void RENAME(calc_centroids)(const int16_t *data, int16_t *centroids, + const uint8_t *indices, int n, int k) { + int i, j; + int count[PALETTE_MAX_SIZE] = { 0 }; + int centroids_sum[AV1_K_MEANS_DIM * PALETTE_MAX_SIZE]; + unsigned int rand_state = (unsigned int)data[0]; + assert(n <= 32768); + memset(centroids_sum, 0, sizeof(centroids_sum[0]) * k * AV1_K_MEANS_DIM); + + for (i = 0; i < n; ++i) { + const int index = indices[i]; + assert(index < k); + ++count[index]; + for (j = 0; j < AV1_K_MEANS_DIM; ++j) { + centroids_sum[index * AV1_K_MEANS_DIM + j] += + data[i * AV1_K_MEANS_DIM + j]; + } + } + + for (i = 0; i < k; ++i) { + if (count[i] == 0) { + memcpy(centroids + i * AV1_K_MEANS_DIM, + data + (lcg_rand16(&rand_state) % n) * AV1_K_MEANS_DIM, + sizeof(centroids[0]) * AV1_K_MEANS_DIM); + } else { + for (j = 0; j < AV1_K_MEANS_DIM; ++j) { + centroids[i * AV1_K_MEANS_DIM + j] = + DIVIDE_AND_ROUND(centroids_sum[i * AV1_K_MEANS_DIM + j], count[i]); + } + } + } +} + +void RENAME(av1_k_means)(const int16_t *data, int16_t *centroids, + uint8_t *indices, int n, int k, int max_itr) { + int16_t centroids_tmp[AV1_K_MEANS_DIM * PALETTE_MAX_SIZE]; + uint8_t indices_tmp[MAX_PALETTE_BLOCK_WIDTH * MAX_PALETTE_BLOCK_HEIGHT]; + int16_t *meta_centroids[2] = { centroids, centroids_tmp }; + uint8_t *meta_indices[2] = { indices, indices_tmp }; + int i, l = 0, prev_l, best_l = 0; + int64_t this_dist; + + assert(n <= MAX_PALETTE_BLOCK_WIDTH * MAX_PALETTE_BLOCK_HEIGHT); + +#if AV1_K_MEANS_DIM == 1 + av1_calc_indices_dim1(data, centroids, indices, &this_dist, n, k); +#else + av1_calc_indices_dim2(data, centroids, indices, &this_dist, n, k); +#endif + + for (i = 0; i < max_itr; ++i) { + const int64_t prev_dist = this_dist; + prev_l = l; + l = (l == 1) ? 
0 : 1; + + RENAME(calc_centroids)(data, meta_centroids[l], meta_indices[prev_l], n, k); + if (!memcmp(meta_centroids[l], meta_centroids[prev_l], + sizeof(centroids[0]) * k * AV1_K_MEANS_DIM)) { + break; + } +#if AV1_K_MEANS_DIM == 1 + av1_calc_indices_dim1(data, meta_centroids[l], meta_indices[l], &this_dist, + n, k); +#else + av1_calc_indices_dim2(data, meta_centroids[l], meta_indices[l], &this_dist, + n, k); +#endif + + if (this_dist > prev_dist) { + best_l = prev_l; + break; + } + } + if (i == max_itr) best_l = l; + if (best_l != 0) { + memcpy(centroids, meta_centroids[1], + sizeof(centroids[0]) * k * AV1_K_MEANS_DIM); + memcpy(indices, meta_indices[1], sizeof(indices[0]) * n); + } +} +#undef RENAME_ +#undef RENAME diff --git a/third_party/aom/av1/encoder/level.c b/third_party/aom/av1/encoder/level.c new file mode 100644 index 0000000000..5d5fe9ce96 --- /dev/null +++ b/third_party/aom/av1/encoder/level.c @@ -0,0 +1,1397 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/encoder/encoder.h" +#include "av1/encoder/level.h" + +#define UNDEFINED_LEVEL \ + { \ + .level = SEQ_LEVEL_MAX, .max_picture_size = 0, .max_h_size = 0, \ + .max_v_size = 0, .max_display_rate = 0, .max_decode_rate = 0, \ + .max_header_rate = 0, .main_mbps = 0, .high_mbps = 0, .main_cr = 0, \ + .high_cr = 0, .max_tiles = 0, .max_tile_cols = 0 \ + } + +static const AV1LevelSpec av1_level_defs[SEQ_LEVELS] = { + { .level = SEQ_LEVEL_2_0, + .max_picture_size = 147456, + .max_h_size = 2048, + .max_v_size = 1152, + .max_display_rate = 4423680L, + .max_decode_rate = 5529600L, + .max_header_rate = 150, + .main_mbps = 1.5, + .high_mbps = 0, + .main_cr = 2.0, + .high_cr = 0, + .max_tiles = 8, + .max_tile_cols = 4 }, + { .level = SEQ_LEVEL_2_1, + .max_picture_size = 278784, + .max_h_size = 2816, + .max_v_size = 1584, + .max_display_rate = 8363520L, + .max_decode_rate = 10454400L, + .max_header_rate = 150, + .main_mbps = 3.0, + .high_mbps = 0, + .main_cr = 2.0, + .high_cr = 0, + .max_tiles = 8, + .max_tile_cols = 4 }, + UNDEFINED_LEVEL, + UNDEFINED_LEVEL, + { .level = SEQ_LEVEL_3_0, + .max_picture_size = 665856, + .max_h_size = 4352, + .max_v_size = 2448, + .max_display_rate = 19975680L, + .max_decode_rate = 24969600L, + .max_header_rate = 150, + .main_mbps = 6.0, + .high_mbps = 0, + .main_cr = 2.0, + .high_cr = 0, + .max_tiles = 16, + .max_tile_cols = 6 }, + { .level = SEQ_LEVEL_3_1, + .max_picture_size = 1065024, + .max_h_size = 5504, + .max_v_size = 3096, + .max_display_rate = 31950720L, + .max_decode_rate = 39938400L, + .max_header_rate = 150, + .main_mbps = 10.0, + .high_mbps = 0, + .main_cr = 2.0, + .high_cr = 0, + .max_tiles = 16, + .max_tile_cols = 6 }, + UNDEFINED_LEVEL, + UNDEFINED_LEVEL, + { .level = SEQ_LEVEL_4_0, + .max_picture_size = 2359296, + .max_h_size = 6144, + .max_v_size = 3456, + .max_display_rate = 70778880L, + .max_decode_rate = 77856768L, + .max_header_rate = 300, + .main_mbps = 12.0, + .high_mbps = 30.0, + .main_cr = 4.0, + .high_cr = 4.0, + .max_tiles = 32, + .max_tile_cols = 8 }, + { .level = SEQ_LEVEL_4_1, + 
.max_picture_size = 2359296, + .max_h_size = 6144, + .max_v_size = 3456, + .max_display_rate = 141557760L, + .max_decode_rate = 155713536L, + .max_header_rate = 300, + .main_mbps = 20.0, + .high_mbps = 50.0, + .main_cr = 4.0, + .high_cr = 4.0, + .max_tiles = 32, + .max_tile_cols = 8 }, + UNDEFINED_LEVEL, + UNDEFINED_LEVEL, + { .level = SEQ_LEVEL_5_0, + .max_picture_size = 8912896, + .max_h_size = 8192, + .max_v_size = 4352, + .max_display_rate = 267386880L, + .max_decode_rate = 273715200L, + .max_header_rate = 300, + .main_mbps = 30.0, + .high_mbps = 100.0, + .main_cr = 6.0, + .high_cr = 4.0, + .max_tiles = 64, + .max_tile_cols = 8 }, + { .level = SEQ_LEVEL_5_1, + .max_picture_size = 8912896, + .max_h_size = 8192, + .max_v_size = 4352, + .max_display_rate = 534773760L, + .max_decode_rate = 547430400L, + .max_header_rate = 300, + .main_mbps = 40.0, + .high_mbps = 160.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 64, + .max_tile_cols = 8 }, + { .level = SEQ_LEVEL_5_2, + .max_picture_size = 8912896, + .max_h_size = 8192, + .max_v_size = 4352, + .max_display_rate = 1069547520L, + .max_decode_rate = 1094860800L, + .max_header_rate = 300, + .main_mbps = 60.0, + .high_mbps = 240.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 64, + .max_tile_cols = 8 }, + { .level = SEQ_LEVEL_5_3, + .max_picture_size = 8912896, + .max_h_size = 8192, + .max_v_size = 4352, + .max_display_rate = 1069547520L, + .max_decode_rate = 1176502272L, + .max_header_rate = 300, + .main_mbps = 60.0, + .high_mbps = 240.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 64, + .max_tile_cols = 8 }, + { .level = SEQ_LEVEL_6_0, + .max_picture_size = 35651584, + .max_h_size = 16384, + .max_v_size = 8704, + .max_display_rate = 1069547520L, + .max_decode_rate = 1176502272L, + .max_header_rate = 300, + .main_mbps = 60.0, + .high_mbps = 240.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 128, + .max_tile_cols = 16 }, + { .level = SEQ_LEVEL_6_1, + .max_picture_size = 35651584, + .max_h_size = 16384, + .max_v_size = 8704, + .max_display_rate = 2139095040L, + .max_decode_rate = 2189721600L, + .max_header_rate = 300, + .main_mbps = 100.0, + .high_mbps = 480.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 128, + .max_tile_cols = 16 }, + { .level = SEQ_LEVEL_6_2, + .max_picture_size = 35651584, + .max_h_size = 16384, + .max_v_size = 8704, + .max_display_rate = 4278190080L, + .max_decode_rate = 4379443200L, + .max_header_rate = 300, + .main_mbps = 160.0, + .high_mbps = 800.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 128, + .max_tile_cols = 16 }, + { .level = SEQ_LEVEL_6_3, + .max_picture_size = 35651584, + .max_h_size = 16384, + .max_v_size = 8704, + .max_display_rate = 4278190080L, + .max_decode_rate = 4706009088L, + .max_header_rate = 300, + .main_mbps = 160.0, + .high_mbps = 800.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 128, + .max_tile_cols = 16 }, +#if CONFIG_CWG_C013 + { .level = SEQ_LEVEL_7_0, + .max_picture_size = 142606336, + .max_h_size = 32768, + .max_v_size = 17408, + .max_display_rate = 4278190080L, + .max_decode_rate = 4706009088L, + .max_header_rate = 300, + .main_mbps = 160.0, + .high_mbps = 800.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 256, + .max_tile_cols = 32 }, + { .level = SEQ_LEVEL_7_1, + .max_picture_size = 142606336, + .max_h_size = 32768, + .max_v_size = 17408, + .max_display_rate = 8556380160L, + .max_decode_rate = 8758886400L, + .max_header_rate = 300, + .main_mbps = 200.0, + .high_mbps = 960.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 256, + 
.max_tile_cols = 32 }, + { .level = SEQ_LEVEL_7_2, + .max_picture_size = 142606336, + .max_h_size = 32768, + .max_v_size = 17408, + .max_display_rate = 17112760320L, + .max_decode_rate = 17517772800L, + .max_header_rate = 300, + .main_mbps = 320.0, + .high_mbps = 1600.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 256, + .max_tile_cols = 32 }, + { .level = SEQ_LEVEL_7_3, + .max_picture_size = 142606336, + .max_h_size = 32768, + .max_v_size = 17408, + .max_display_rate = 17112760320L, + .max_decode_rate = 18824036352L, + .max_header_rate = 300, + .main_mbps = 320.0, + .high_mbps = 1600.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 256, + .max_tile_cols = 32 }, + { .level = SEQ_LEVEL_8_0, + .max_picture_size = 530841600, + .max_h_size = 65536, + .max_v_size = 34816, + .max_display_rate = 17112760320L, + .max_decode_rate = 18824036352L, + .max_header_rate = 300, + .main_mbps = 320.0, + .high_mbps = 1600.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 512, + .max_tile_cols = 64 }, + { .level = SEQ_LEVEL_8_1, + .max_picture_size = 530841600, + .max_h_size = 65536, + .max_v_size = 34816, + .max_display_rate = 34225520640L, + .max_decode_rate = 34910031052L, + .max_header_rate = 300, + .main_mbps = 400.0, + .high_mbps = 1920.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 512, + .max_tile_cols = 64 }, + { .level = SEQ_LEVEL_8_2, + .max_picture_size = 530841600, + .max_h_size = 65536, + .max_v_size = 34816, + .max_display_rate = 68451041280L, + .max_decode_rate = 69820062105L, + .max_header_rate = 300, + .main_mbps = 640.0, + .high_mbps = 3200.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 512, + .max_tile_cols = 64 }, + { .level = SEQ_LEVEL_8_3, + .max_picture_size = 530841600, + .max_h_size = 65536, + .max_v_size = 34816, + .max_display_rate = 68451041280L, + .max_decode_rate = 75296145408L, + .max_header_rate = 300, + .main_mbps = 640.0, + .high_mbps = 3200.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 512, + .max_tile_cols = 64 }, +#else // !CONFIG_CWG_C013 + UNDEFINED_LEVEL, + UNDEFINED_LEVEL, + UNDEFINED_LEVEL, + UNDEFINED_LEVEL, + UNDEFINED_LEVEL, + UNDEFINED_LEVEL, + UNDEFINED_LEVEL, + UNDEFINED_LEVEL, +#endif // CONFIG_CWG_C013 +}; + +typedef enum { + LUMA_PIC_SIZE_TOO_LARGE, + LUMA_PIC_H_SIZE_TOO_LARGE, + LUMA_PIC_V_SIZE_TOO_LARGE, + LUMA_PIC_H_SIZE_TOO_SMALL, + LUMA_PIC_V_SIZE_TOO_SMALL, + TOO_MANY_TILE_COLUMNS, + TOO_MANY_TILES, + TILE_RATE_TOO_HIGH, + TILE_TOO_LARGE, + SUPERRES_TILE_WIDTH_TOO_LARGE, + CROPPED_TILE_WIDTH_TOO_SMALL, + CROPPED_TILE_HEIGHT_TOO_SMALL, + TILE_WIDTH_INVALID, + FRAME_HEADER_RATE_TOO_HIGH, + DISPLAY_RATE_TOO_HIGH, + DECODE_RATE_TOO_HIGH, + CR_TOO_SMALL, + TILE_SIZE_HEADER_RATE_TOO_HIGH, + BITRATE_TOO_HIGH, + DECODER_MODEL_FAIL, + + TARGET_LEVEL_FAIL_IDS, + TARGET_LEVEL_OK, +} TARGET_LEVEL_FAIL_ID; + +static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] = { + "The picture size is too large.", + "The picture width is too large.", + "The picture height is too large.", + "The picture width is too small.", + "The picture height is too small.", + "Too many tile columns are used.", + "Too many tiles are used.", + "The tile rate is too high.", + "The tile size is too large.", + "The superres tile width is too large.", + "The cropped tile width is less than 8.", + "The cropped tile height is less than 8.", + "The tile width is invalid.", + "The frame header rate is too high.", + "The display luma sample rate is too high.", + "The decoded luma sample rate is too high.", + "The compression ratio is too small.", + "The 
product of max tile size and header rate is too high.", + "The bitrate is too high.", + "The decoder model fails.", +}; + +static double get_max_bitrate(const AV1LevelSpec *const level_spec, int tier, + BITSTREAM_PROFILE profile) { + if (level_spec->level < SEQ_LEVEL_4_0) tier = 0; + const double bitrate_basis = + (tier ? level_spec->high_mbps : level_spec->main_mbps) * 1e6; + const double bitrate_profile_factor = + profile == PROFILE_0 ? 1.0 : (profile == PROFILE_1 ? 2.0 : 3.0); + return bitrate_basis * bitrate_profile_factor; +} + +double av1_get_max_bitrate_for_level(AV1_LEVEL level_index, int tier, + BITSTREAM_PROFILE profile) { + assert(is_valid_seq_level_idx(level_index)); + return get_max_bitrate(&av1_level_defs[level_index], tier, profile); +} + +void av1_get_max_tiles_for_level(AV1_LEVEL level_index, int *const max_tiles, + int *const max_tile_cols) { + assert(is_valid_seq_level_idx(level_index)); + const AV1LevelSpec *const level_spec = &av1_level_defs[level_index]; + *max_tiles = level_spec->max_tiles; + *max_tile_cols = level_spec->max_tile_cols; +} + +// We assume time t to be valid if and only if t >= 0.0. +// So INVALID_TIME can be defined as anything less than 0. +#define INVALID_TIME (-1.0) + +// This corresponds to "free_buffer" in the spec. +static void release_buffer(DECODER_MODEL *const decoder_model, int idx) { + assert(idx >= 0 && idx < BUFFER_POOL_MAX_SIZE); + FRAME_BUFFER *const this_buffer = &decoder_model->frame_buffer_pool[idx]; + this_buffer->decoder_ref_count = 0; + this_buffer->player_ref_count = 0; + this_buffer->display_index = -1; + this_buffer->presentation_time = INVALID_TIME; +} + +static void initialize_buffer_pool(DECODER_MODEL *const decoder_model) { + for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) { + release_buffer(decoder_model, i); + } + for (int i = 0; i < REF_FRAMES; ++i) { + decoder_model->vbi[i] = -1; + } +} + +static int get_free_buffer(DECODER_MODEL *const decoder_model) { + for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) { + const FRAME_BUFFER *const this_buffer = + &decoder_model->frame_buffer_pool[i]; + if (this_buffer->decoder_ref_count == 0 && + this_buffer->player_ref_count == 0) + return i; + } + return -1; +} + +static void update_ref_buffers(DECODER_MODEL *const decoder_model, int idx, + int refresh_frame_flags) { + FRAME_BUFFER *const this_buffer = &decoder_model->frame_buffer_pool[idx]; + for (int i = 0; i < REF_FRAMES; ++i) { + if (refresh_frame_flags & (1 << i)) { + const int pre_idx = decoder_model->vbi[i]; + if (pre_idx != -1) { + --decoder_model->frame_buffer_pool[pre_idx].decoder_ref_count; + } + decoder_model->vbi[i] = idx; + ++this_buffer->decoder_ref_count; + } + } +} + +// The time (in seconds) required to decode a frame. 
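+// Note: in the resource availability mode modeled here, decode time reduces +// to luma_samples / max_decode_rate. Key and intra-only frames use the actual +// (superres-upscaled) picture size; other frames conservatively assume the +// sequence maximum frame dimensions.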
+static double time_to_decode_frame(const AV1_COMMON *const cm, + int64_t max_decode_rate) { + if (cm->show_existing_frame) return 0.0; + + const FRAME_TYPE frame_type = cm->current_frame.frame_type; + int luma_samples = 0; + if (frame_type == KEY_FRAME || frame_type == INTRA_ONLY_FRAME) { + luma_samples = cm->superres_upscaled_width * cm->height; + } else { + const int spatial_layer_dimensions_present_flag = 0; + if (spatial_layer_dimensions_present_flag) { + assert(0 && "Spatial layer dimensions not supported yet."); + } else { + const SequenceHeader *const seq_params = cm->seq_params; + const int max_frame_width = seq_params->max_frame_width; + const int max_frame_height = seq_params->max_frame_height; + luma_samples = max_frame_width * max_frame_height; + } + } + + return luma_samples / (double)max_decode_rate; +} + +// Release frame buffers that are no longer needed for decode or display. +// It corresponds to "start_decode_at_removal_time" in the spec. +static void release_processed_frames(DECODER_MODEL *const decoder_model, + double removal_time) { + for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) { + FRAME_BUFFER *const this_buffer = &decoder_model->frame_buffer_pool[i]; + if (this_buffer->player_ref_count > 0) { + if (this_buffer->presentation_time >= 0.0 && + this_buffer->presentation_time <= removal_time) { + this_buffer->player_ref_count = 0; + if (this_buffer->decoder_ref_count == 0) { + release_buffer(decoder_model, i); + } + } + } + } +} + +static int frames_in_buffer_pool(const DECODER_MODEL *const decoder_model) { + int frames_in_pool = 0; + for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) { + const FRAME_BUFFER *const this_buffer = + &decoder_model->frame_buffer_pool[i]; + if (this_buffer->decoder_ref_count > 0 || + this_buffer->player_ref_count > 0) { + ++frames_in_pool; + } + } + return frames_in_pool; +} + +static double get_presentation_time(const DECODER_MODEL *const decoder_model, + int display_index) { + if (decoder_model->mode == SCHEDULE_MODE) { + assert(0 && "SCHEDULE_MODE NOT SUPPORTED"); + return INVALID_TIME; + } else { + const double initial_presentation_delay = + decoder_model->initial_presentation_delay; + // Can't decide presentation time until the initial presentation delay is + // known. + if (initial_presentation_delay < 0.0) return INVALID_TIME; + + return initial_presentation_delay + + display_index * decoder_model->num_ticks_per_picture * + decoder_model->display_clock_tick; + } +} + +#define MAX_TIME 1e16 +static double time_next_buffer_is_free(int num_decoded_frame, + int decoder_buffer_delay, + const FRAME_BUFFER *frame_buffer_pool, + double current_time) { + if (num_decoded_frame == 0) { + return (double)decoder_buffer_delay / 90000.0; + } + + double buf_free_time = MAX_TIME; + for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) { + const FRAME_BUFFER *const this_buffer = &frame_buffer_pool[i]; + if (this_buffer->decoder_ref_count == 0) { + if (this_buffer->player_ref_count == 0) { + return current_time; + } + const double presentation_time = this_buffer->presentation_time; + if (presentation_time >= 0.0 && presentation_time < buf_free_time) { + buf_free_time = presentation_time; + } + } + } + return buf_free_time < MAX_TIME ? 
buf_free_time : INVALID_TIME; +} +#undef MAX_TIME + +static double get_removal_time(int mode, int num_decoded_frame, + int decoder_buffer_delay, + const FRAME_BUFFER *frame_buffer_pool, + double current_time) { + if (mode == SCHEDULE_MODE) { + assert(0 && "SCHEDULE_MODE IS NOT SUPPORTED YET"); + return INVALID_TIME; + } else { + return time_next_buffer_is_free(num_decoded_frame, decoder_buffer_delay, + frame_buffer_pool, current_time); + } +} + +void av1_decoder_model_print_status(const DECODER_MODEL *const decoder_model) { + printf( + "\n status %d, num_frame %3d, num_decoded_frame %3d, " + "num_shown_frame %3d, current time %6.2f, frames in buffer %2d, " + "presentation delay %6.2f, total interval %6.2f\n", + decoder_model->status, decoder_model->num_frame, + decoder_model->num_decoded_frame, decoder_model->num_shown_frame, + decoder_model->current_time, frames_in_buffer_pool(decoder_model), + decoder_model->initial_presentation_delay, + decoder_model->dfg_interval_queue.total_interval); + for (int i = 0; i < 10; ++i) { + const FRAME_BUFFER *const this_buffer = + &decoder_model->frame_buffer_pool[i]; + printf("buffer %d, decode count %d, display count %d, present time %6.4f\n", + i, this_buffer->decoder_ref_count, this_buffer->player_ref_count, + this_buffer->presentation_time); + } +} + +// op_index is the operating point index. +void av1_decoder_model_init(const AV1_COMP *const cpi, AV1_LEVEL level, + int op_index, DECODER_MODEL *const decoder_model) { + decoder_model->status = DECODER_MODEL_OK; + decoder_model->level = level; + + const AV1_COMMON *const cm = &cpi->common; + const SequenceHeader *const seq_params = cm->seq_params; + decoder_model->bit_rate = get_max_bitrate( + av1_level_defs + level, seq_params->tier[op_index], seq_params->profile); + + // TODO(huisu or anyone): implement SCHEDULE_MODE. 
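+ // Note: the buffer delays just below are expressed on the decoder model's + // 90 kHz clock, so these defaults work out to 20000/90000 (~0.22 s) of + // encoder delay plus 70000/90000 (~0.78 s) of decoder delay, i.e. exactly + // 1 second of end-to-end buffering.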
+ decoder_model->mode = RESOURCE_MODE; + decoder_model->encoder_buffer_delay = 20000; + decoder_model->decoder_buffer_delay = 70000; + decoder_model->is_low_delay_mode = false; + + decoder_model->first_bit_arrival_time = 0.0; + decoder_model->last_bit_arrival_time = 0.0; + decoder_model->coded_bits = 0; + + decoder_model->removal_time = INVALID_TIME; + decoder_model->presentation_time = INVALID_TIME; + decoder_model->decode_samples = 0; + decoder_model->display_samples = 0; + decoder_model->max_decode_rate = 0.0; + decoder_model->max_display_rate = 0.0; + + decoder_model->num_frame = -1; + decoder_model->num_decoded_frame = -1; + decoder_model->num_shown_frame = -1; + decoder_model->current_time = 0.0; + + initialize_buffer_pool(decoder_model); + + DFG_INTERVAL_QUEUE *const dfg_interval_queue = + &decoder_model->dfg_interval_queue; + dfg_interval_queue->total_interval = 0.0; + dfg_interval_queue->head = 0; + dfg_interval_queue->size = 0; + + if (seq_params->timing_info_present) { + decoder_model->num_ticks_per_picture = + seq_params->timing_info.num_ticks_per_picture; + decoder_model->display_clock_tick = + seq_params->timing_info.num_units_in_display_tick / + seq_params->timing_info.time_scale; + } else { + decoder_model->num_ticks_per_picture = 1; + decoder_model->display_clock_tick = 1.0 / cpi->framerate; + } + + decoder_model->initial_display_delay = + seq_params->op_params[op_index].initial_display_delay; + decoder_model->initial_presentation_delay = INVALID_TIME; + decoder_model->decode_rate = av1_level_defs[level].max_decode_rate; +} + +DECODER_MODEL_STATUS av1_decoder_model_try_smooth_buf( + const AV1_COMP *const cpi, size_t coded_bits, + const DECODER_MODEL *const decoder_model) { + DECODER_MODEL_STATUS status = DECODER_MODEL_OK; + + if (!decoder_model || decoder_model->status != DECODER_MODEL_OK) { + return status; + } + + const AV1_COMMON *const cm = &cpi->common; + const int show_existing_frame = cm->show_existing_frame; + + size_t cur_coded_bits = decoder_model->coded_bits + coded_bits; + int num_decoded_frame = decoder_model->num_decoded_frame; + if (!show_existing_frame) ++num_decoded_frame; + + if (show_existing_frame) { + return status; + } else { + const double removal_time = get_removal_time( + decoder_model->mode, num_decoded_frame, + decoder_model->decoder_buffer_delay, decoder_model->frame_buffer_pool, + decoder_model->current_time); + if (removal_time < 0.0) { + status = DECODE_FRAME_BUF_UNAVAILABLE; + return status; + } + + // A frame with show_existing_frame being false indicates the end of a DFG. + // Update the bits arrival time of this DFG. + const double buffer_delay = (decoder_model->encoder_buffer_delay + + decoder_model->decoder_buffer_delay) / + 90000.0; + const double latest_arrival_time = removal_time - buffer_delay; + const double first_bit_arrival_time = + AOMMAX(decoder_model->last_bit_arrival_time, latest_arrival_time); + const double last_bit_arrival_time = + first_bit_arrival_time + + (double)cur_coded_bits / decoder_model->bit_rate; + // Smoothing buffer underflows if the last bit arrives after the removal + // time. + if (last_bit_arrival_time > removal_time && + !decoder_model->is_low_delay_mode) { + status = SMOOTHING_BUFFER_UNDERFLOW; + return status; + } + + // Check if the smoothing buffer overflows. 
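+ // Note: the queue below records the bit-arrival interval of every + // in-flight DFG. Since the smoothing buffer drains at bit_rate bits per + // second, the summed intervals must stay at or below 1.0, i.e. one + // second's worth of bits.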
+ const DFG_INTERVAL_QUEUE *const queue = &decoder_model->dfg_interval_queue; + if (queue->size >= DFG_INTERVAL_QUEUE_SIZE) { + assert(0); + } + + double total_interval = queue->total_interval; + int qhead = queue->head; + int qsize = queue->size; + // Remove the DFGs with removal time earlier than last_bit_arrival_time. + while (queue->buf[qhead].removal_time <= last_bit_arrival_time && + qsize > 0) { + if (queue->buf[qhead].removal_time - first_bit_arrival_time + + total_interval > + 1.0) { + status = SMOOTHING_BUFFER_OVERFLOW; + return status; + } + total_interval -= queue->buf[qhead].last_bit_arrival_time - + queue->buf[qhead].first_bit_arrival_time; + qhead = (qhead + 1) % DFG_INTERVAL_QUEUE_SIZE; + --qsize; + } + total_interval += last_bit_arrival_time - first_bit_arrival_time; + // The smoothing buffer can hold at most "bit_rate" bits, which is + // equivalent to 1 second of total interval. + if (total_interval > 1.0) { + status = SMOOTHING_BUFFER_OVERFLOW; + return status; + } + + return status; + } +} + +void av1_decoder_model_process_frame(const AV1_COMP *const cpi, + size_t coded_bits, + DECODER_MODEL *const decoder_model) { + if (!decoder_model || decoder_model->status != DECODER_MODEL_OK) return; + + const AV1_COMMON *const cm = &cpi->common; + const int luma_pic_size = cm->superres_upscaled_width * cm->height; + const int show_existing_frame = cm->show_existing_frame; + const int show_frame = cm->show_frame || show_existing_frame; + ++decoder_model->num_frame; + if (!show_existing_frame) ++decoder_model->num_decoded_frame; + if (show_frame) ++decoder_model->num_shown_frame; + decoder_model->coded_bits += coded_bits; + + int display_idx = -1; + if (show_existing_frame) { + display_idx = decoder_model->vbi[cpi->existing_fb_idx_to_show]; + if (display_idx < 0) { + decoder_model->status = DECODE_EXISTING_FRAME_BUF_EMPTY; + return; + } + if (decoder_model->frame_buffer_pool[display_idx].frame_type == KEY_FRAME) { + update_ref_buffers(decoder_model, display_idx, 0xFF); + } + } else { + const double removal_time = get_removal_time( + decoder_model->mode, decoder_model->num_decoded_frame, + decoder_model->decoder_buffer_delay, decoder_model->frame_buffer_pool, + decoder_model->current_time); + if (removal_time < 0.0) { + decoder_model->status = DECODE_FRAME_BUF_UNAVAILABLE; + return; + } + + const int previous_decode_samples = decoder_model->decode_samples; + const double previous_removal_time = decoder_model->removal_time; + assert(previous_removal_time < removal_time); + decoder_model->removal_time = removal_time; + decoder_model->decode_samples = luma_pic_size; + const double this_decode_rate = + previous_decode_samples / (removal_time - previous_removal_time); + decoder_model->max_decode_rate = + AOMMAX(decoder_model->max_decode_rate, this_decode_rate); + + // A frame with show_existing_frame being false indicates the end of a DFG. + // Update the bits arrival time of this DFG. + const double buffer_delay = (decoder_model->encoder_buffer_delay + + decoder_model->decoder_buffer_delay) / + 90000.0; + const double latest_arrival_time = removal_time - buffer_delay; + decoder_model->first_bit_arrival_time = + AOMMAX(decoder_model->last_bit_arrival_time, latest_arrival_time); + decoder_model->last_bit_arrival_time = + decoder_model->first_bit_arrival_time + + (double)decoder_model->coded_bits / decoder_model->bit_rate; + // Smoothing buffer underflows if the last bit arrives after the removal + // time. 
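+ // Note: an underflow here means decoding is scheduled to start before all + // of this DFG's bits have arrived; the check is skipped when + // is_low_delay_mode is set, where late bit arrival is tolerated.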
+ if (decoder_model->last_bit_arrival_time > removal_time && + !decoder_model->is_low_delay_mode) { + decoder_model->status = SMOOTHING_BUFFER_UNDERFLOW; + return; + } + // Reset the coded bits for the next DFG. + decoder_model->coded_bits = 0; + + // Check if the smoothing buffer overflows. + DFG_INTERVAL_QUEUE *const queue = &decoder_model->dfg_interval_queue; + if (queue->size >= DFG_INTERVAL_QUEUE_SIZE) { + assert(0); + } + const double first_bit_arrival_time = decoder_model->first_bit_arrival_time; + const double last_bit_arrival_time = decoder_model->last_bit_arrival_time; + // Remove the DFGs with removal time earlier than last_bit_arrival_time. + while (queue->buf[queue->head].removal_time <= last_bit_arrival_time && + queue->size > 0) { + if (queue->buf[queue->head].removal_time - first_bit_arrival_time + + queue->total_interval > + 1.0) { + decoder_model->status = SMOOTHING_BUFFER_OVERFLOW; + return; + } + queue->total_interval -= queue->buf[queue->head].last_bit_arrival_time - + queue->buf[queue->head].first_bit_arrival_time; + queue->head = (queue->head + 1) % DFG_INTERVAL_QUEUE_SIZE; + --queue->size; + } + // Push current DFG into the queue. + const int queue_index = + (queue->head + queue->size++) % DFG_INTERVAL_QUEUE_SIZE; + queue->buf[queue_index].first_bit_arrival_time = first_bit_arrival_time; + queue->buf[queue_index].last_bit_arrival_time = last_bit_arrival_time; + queue->buf[queue_index].removal_time = removal_time; + queue->total_interval += last_bit_arrival_time - first_bit_arrival_time; + // The smoothing buffer can hold at most "bit_rate" bits, which is + // equivalent to 1 second of total interval. + if (queue->total_interval > 1.0) { + decoder_model->status = SMOOTHING_BUFFER_OVERFLOW; + return; + } + + release_processed_frames(decoder_model, removal_time); + decoder_model->current_time = + removal_time + time_to_decode_frame(cm, decoder_model->decode_rate); + + const int cfbi = get_free_buffer(decoder_model); + if (cfbi < 0) { + decoder_model->status = DECODE_FRAME_BUF_UNAVAILABLE; + return; + } + const CurrentFrame *const current_frame = &cm->current_frame; + decoder_model->frame_buffer_pool[cfbi].frame_type = + cm->current_frame.frame_type; + display_idx = cfbi; + update_ref_buffers(decoder_model, cfbi, current_frame->refresh_frame_flags); + + if (decoder_model->initial_presentation_delay < 0.0) { + // Display can begin after required number of frames have been buffered. + if (frames_in_buffer_pool(decoder_model) >= + decoder_model->initial_display_delay - 1) { + decoder_model->initial_presentation_delay = decoder_model->current_time; + // Update presentation time for each shown frame in the frame buffer. + for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) { + FRAME_BUFFER *const this_buffer = + &decoder_model->frame_buffer_pool[i]; + if (this_buffer->player_ref_count == 0) continue; + assert(this_buffer->display_index >= 0); + this_buffer->presentation_time = + get_presentation_time(decoder_model, this_buffer->display_index); + } + } + } + } + + // Display. 
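+ // Note: a shown frame gets the next display index and a presentation time; + // if decoding completes after that time, the frame is late + // (DISPLAY_FRAME_LATE). The running max display rate is samples divided by + // the gap between consecutive presentation times.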
+ if (show_frame) { + assert(display_idx >= 0 && display_idx < BUFFER_POOL_MAX_SIZE); + FRAME_BUFFER *const this_buffer = + &decoder_model->frame_buffer_pool[display_idx]; + ++this_buffer->player_ref_count; + this_buffer->display_index = decoder_model->num_shown_frame; + const double presentation_time = + get_presentation_time(decoder_model, this_buffer->display_index); + this_buffer->presentation_time = presentation_time; + if (presentation_time >= 0.0 && + decoder_model->current_time > presentation_time) { + decoder_model->status = DISPLAY_FRAME_LATE; + return; + } + + const int previous_display_samples = decoder_model->display_samples; + const double previous_presentation_time = decoder_model->presentation_time; + decoder_model->display_samples = luma_pic_size; + decoder_model->presentation_time = presentation_time; + if (presentation_time >= 0.0 && previous_presentation_time >= 0.0) { + assert(previous_presentation_time < presentation_time); + const double this_display_rate = + previous_display_samples / + (presentation_time - previous_presentation_time); + decoder_model->max_display_rate = + AOMMAX(decoder_model->max_display_rate, this_display_rate); + } + } +} + +void av1_init_level_info(AV1_COMP *cpi) { + for (int op_index = 0; op_index < MAX_NUM_OPERATING_POINTS; ++op_index) { + AV1LevelInfo *const this_level_info = + cpi->ppi->level_params.level_info[op_index]; + if (!this_level_info) continue; + memset(this_level_info, 0, sizeof(*this_level_info)); + AV1LevelSpec *const level_spec = &this_level_info->level_spec; + level_spec->level = SEQ_LEVEL_MAX; + AV1LevelStats *const level_stats = &this_level_info->level_stats; + level_stats->min_cropped_tile_width = INT_MAX; + level_stats->min_cropped_tile_height = INT_MAX; + level_stats->min_frame_width = INT_MAX; + level_stats->min_frame_height = INT_MAX; + level_stats->tile_width_is_valid = 1; + level_stats->min_cr = 1e8; + + FrameWindowBuffer *const frame_window_buffer = + &this_level_info->frame_window_buffer; + frame_window_buffer->num = 0; + frame_window_buffer->start = 0; + + const AV1_COMMON *const cm = &cpi->common; + const int upscaled_width = cm->superres_upscaled_width; + const int height = cm->height; + const int pic_size = upscaled_width * height; + for (AV1_LEVEL level = SEQ_LEVEL_2_0; level < SEQ_LEVELS; ++level) { + DECODER_MODEL *const this_model = &this_level_info->decoder_models[level]; + const AV1LevelSpec *const spec = &av1_level_defs[level]; + if (upscaled_width > spec->max_h_size || height > spec->max_v_size || + pic_size > spec->max_picture_size) { + // Turn off decoder model for this level as the frame size already + // exceeds level constraints. + this_model->status = DECODER_MODEL_DISABLED; + } else { + av1_decoder_model_init(cpi, level, op_index, this_model); + } + } + } +} + +static double get_min_cr(const AV1LevelSpec *const level_spec, int tier, + int is_still_picture, int64_t decoded_sample_rate) { + if (is_still_picture) return 0.8; + if (level_spec->level < SEQ_LEVEL_4_0) tier = 0; + const double min_cr_basis = tier ? 
level_spec->high_cr : level_spec->main_cr; + const double speed_adj = + (double)decoded_sample_rate / level_spec->max_display_rate; + return AOMMAX(min_cr_basis * speed_adj, 0.8); +} + +double av1_get_min_cr_for_level(AV1_LEVEL level_index, int tier, + int is_still_picture) { + assert(is_valid_seq_level_idx(level_index)); + const AV1LevelSpec *const level_spec = &av1_level_defs[level_index]; + return get_min_cr(level_spec, tier, is_still_picture, + level_spec->max_decode_rate); +} + +static void get_temporal_parallel_params(int scalability_mode_idc, + int *temporal_parallel_num, + int *temporal_parallel_denom) { + if (scalability_mode_idc < 0) { + *temporal_parallel_num = 1; + *temporal_parallel_denom = 1; + return; + } + + // TODO(huisu@): handle scalability cases. + if (scalability_mode_idc == SCALABILITY_SS) { + (void)scalability_mode_idc; + } else { + (void)scalability_mode_idc; + } +} + +#define MIN_CROPPED_TILE_WIDTH 8 +#define MIN_CROPPED_TILE_HEIGHT 8 +#define MIN_FRAME_WIDTH 16 +#define MIN_FRAME_HEIGHT 16 +#define MAX_TILE_SIZE_HEADER_RATE_PRODUCT 588251136 + +static TARGET_LEVEL_FAIL_ID check_level_constraints( + const AV1LevelInfo *const level_info, AV1_LEVEL level, int tier, + int is_still_picture, BITSTREAM_PROFILE profile, int check_bitrate) { + const DECODER_MODEL *const decoder_model = &level_info->decoder_models[level]; + const DECODER_MODEL_STATUS decoder_model_status = decoder_model->status; + if (decoder_model_status != DECODER_MODEL_OK && + decoder_model_status != DECODER_MODEL_DISABLED) { + return DECODER_MODEL_FAIL; + } + + const AV1LevelSpec *const level_spec = &level_info->level_spec; + const AV1LevelSpec *const target_level_spec = &av1_level_defs[level]; + const AV1LevelStats *const level_stats = &level_info->level_stats; + TARGET_LEVEL_FAIL_ID fail_id = TARGET_LEVEL_OK; + do { + if (level_spec->max_picture_size > target_level_spec->max_picture_size) { + fail_id = LUMA_PIC_SIZE_TOO_LARGE; + break; + } + + if (level_spec->max_h_size > target_level_spec->max_h_size) { + fail_id = LUMA_PIC_H_SIZE_TOO_LARGE; + break; + } + + if (level_spec->max_v_size > target_level_spec->max_v_size) { + fail_id = LUMA_PIC_V_SIZE_TOO_LARGE; + break; + } + + if (level_spec->max_tile_cols > target_level_spec->max_tile_cols) { + fail_id = TOO_MANY_TILE_COLUMNS; + break; + } + + if (level_spec->max_tiles > target_level_spec->max_tiles) { + fail_id = TOO_MANY_TILES; + break; + } + + if (level_spec->max_header_rate > target_level_spec->max_header_rate) { + fail_id = FRAME_HEADER_RATE_TOO_HIGH; + break; + } + + if (decoder_model->max_display_rate > + (double)target_level_spec->max_display_rate) { + fail_id = DISPLAY_RATE_TOO_HIGH; + break; + } + + // TODO(huisu): we are not using max decode rate calculated by the decoder + // model because the model in resource availability mode always returns + // MaxDecodeRate(as in the level definitions) as the max decode rate. + if (level_spec->max_decode_rate > target_level_spec->max_decode_rate) { + fail_id = DECODE_RATE_TOO_HIGH; + break; + } + + if (level_spec->max_tile_rate > target_level_spec->max_tiles * 120) { + fail_id = TILE_RATE_TOO_HIGH; + break; + } + +#if CONFIG_CWG_C013 + const int max_tile_size = (level >= SEQ_LEVEL_7_0 && level <= SEQ_LEVEL_8_3) + ? 
MAX_TILE_AREA_LEVEL_7_AND_ABOVE + : MAX_TILE_AREA; +#else + const int max_tile_size = MAX_TILE_AREA; +#endif + if (level_stats->max_tile_size > max_tile_size) { + fail_id = TILE_TOO_LARGE; + break; + } + + if (level_stats->max_superres_tile_width > MAX_TILE_WIDTH) { + fail_id = SUPERRES_TILE_WIDTH_TOO_LARGE; + break; + } + + if (level_stats->min_cropped_tile_width < MIN_CROPPED_TILE_WIDTH) { + fail_id = CROPPED_TILE_WIDTH_TOO_SMALL; + break; + } + + if (level_stats->min_cropped_tile_height < MIN_CROPPED_TILE_HEIGHT) { + fail_id = CROPPED_TILE_HEIGHT_TOO_SMALL; + break; + } + + if (level_stats->min_frame_width < MIN_FRAME_WIDTH) { + fail_id = LUMA_PIC_H_SIZE_TOO_SMALL; + break; + } + + if (level_stats->min_frame_height < MIN_FRAME_HEIGHT) { + fail_id = LUMA_PIC_V_SIZE_TOO_SMALL; + break; + } + + if (!level_stats->tile_width_is_valid) { + fail_id = TILE_WIDTH_INVALID; + break; + } + + const double min_cr = get_min_cr(target_level_spec, tier, is_still_picture, + level_spec->max_decode_rate); + if (level_stats->min_cr < min_cr) { + fail_id = CR_TOO_SMALL; + break; + } + + if (check_bitrate) { + // Check average bitrate instead of max_bitrate. + const double bitrate_limit = + get_max_bitrate(target_level_spec, tier, profile); + const double avg_bitrate = level_stats->total_compressed_size * 8.0 / + level_stats->total_time_encoded; + if (avg_bitrate > bitrate_limit) { + fail_id = BITRATE_TOO_HIGH; + break; + } + } + + if (target_level_spec->level > SEQ_LEVEL_5_1) { + int temporal_parallel_num; + int temporal_parallel_denom; + const int scalability_mode_idc = -1; + get_temporal_parallel_params(scalability_mode_idc, &temporal_parallel_num, + &temporal_parallel_denom); + const int val = level_stats->max_tile_size * level_spec->max_header_rate * + temporal_parallel_denom / temporal_parallel_num; + if (val > MAX_TILE_SIZE_HEADER_RATE_PRODUCT) { + fail_id = TILE_SIZE_HEADER_RATE_TOO_HIGH; + break; + } + } + } while (0); + + return fail_id; +} + +static void get_tile_stats(const AV1_COMMON *const cm, + const TileDataEnc *const tile_data, + int *max_tile_size, int *max_superres_tile_width, + int *min_cropped_tile_width, + int *min_cropped_tile_height, + int *tile_width_valid) { + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + const int superres_scale_denominator = cm->superres_scale_denominator; + + *max_tile_size = 0; + *max_superres_tile_width = 0; + *min_cropped_tile_width = INT_MAX; + *min_cropped_tile_height = INT_MAX; + *tile_width_valid = 1; + + for (int tile_row = 0; tile_row < tile_rows; ++tile_row) { + for (int tile_col = 0; tile_col < tile_cols; ++tile_col) { + const TileInfo *const tile_info = + &tile_data[tile_row * cm->tiles.cols + tile_col].tile_info; + const int tile_width = + (tile_info->mi_col_end - tile_info->mi_col_start) * MI_SIZE; + const int tile_height = + (tile_info->mi_row_end - tile_info->mi_row_start) * MI_SIZE; + const int tile_size = tile_width * tile_height; + *max_tile_size = AOMMAX(*max_tile_size, tile_size); + + const int supperres_tile_width = + tile_width * superres_scale_denominator / SCALE_NUMERATOR; + *max_superres_tile_width = + AOMMAX(*max_superres_tile_width, supperres_tile_width); + + const int cropped_tile_width = + cm->width - tile_info->mi_col_start * MI_SIZE; + const int cropped_tile_height = + cm->height - tile_info->mi_row_start * MI_SIZE; + *min_cropped_tile_width = + AOMMIN(*min_cropped_tile_width, cropped_tile_width); + *min_cropped_tile_height = + AOMMIN(*min_cropped_tile_height, cropped_tile_height); + + const int 
is_right_most_tile = + tile_info->mi_col_end == cm->mi_params.mi_cols; + if (!is_right_most_tile) { + if (av1_superres_scaled(cm)) + *tile_width_valid &= tile_width >= 128; + else + *tile_width_valid &= tile_width >= 64; + } + } + } +} + +static int store_frame_record(int64_t ts_start, int64_t ts_end, + size_t encoded_size, int pic_size, + int frame_header_count, int tiles, int show_frame, + int show_existing_frame, + FrameWindowBuffer *const buffer) { + if (buffer->num < FRAME_WINDOW_SIZE) { + ++buffer->num; + } else { + buffer->start = (buffer->start + 1) % FRAME_WINDOW_SIZE; + } + const int new_idx = (buffer->start + buffer->num - 1) % FRAME_WINDOW_SIZE; + FrameRecord *const record = &buffer->buf[new_idx]; + record->ts_start = ts_start; + record->ts_end = ts_end; + record->encoded_size_in_bytes = encoded_size; + record->pic_size = pic_size; + record->frame_header_count = frame_header_count; + record->tiles = tiles; + record->show_frame = show_frame; + record->show_existing_frame = show_existing_frame; + + return new_idx; +} + +// Count the number of frames encoded in the last "duration" ticks, in display +// time. +static int count_frames(const FrameWindowBuffer *const buffer, + int64_t duration) { + const int current_idx = (buffer->start + buffer->num - 1) % FRAME_WINDOW_SIZE; + // Assume current frame is shown frame. + assert(buffer->buf[current_idx].show_frame); + + const int64_t current_time = buffer->buf[current_idx].ts_end; + const int64_t time_limit = AOMMAX(current_time - duration, 0); + int num_frames = 1; + int index = current_idx - 1; + for (int i = buffer->num - 2; i >= 0; --i, --index, ++num_frames) { + if (index < 0) index = FRAME_WINDOW_SIZE - 1; + const FrameRecord *const record = &buffer->buf[index]; + if (!record->show_frame) continue; + const int64_t ts_start = record->ts_start; + if (ts_start < time_limit) break; + } + + return num_frames; +} + +// Scan previously encoded frames and update level metrics accordingly. +static void scan_past_frames(const FrameWindowBuffer *const buffer, + int num_frames_to_scan, + AV1LevelSpec *const level_spec, + AV1LevelStats *const level_stats) { + const int num_frames_in_buffer = buffer->num; + int index = (buffer->start + num_frames_in_buffer - 1) % FRAME_WINDOW_SIZE; + int frame_headers = 0; + int tiles = 0; + int64_t display_samples = 0; + int64_t decoded_samples = 0; + size_t encoded_size_in_bytes = 0; + for (int i = 0; i < AOMMIN(num_frames_in_buffer, num_frames_to_scan); ++i) { + const FrameRecord *const record = &buffer->buf[index]; + if (!record->show_existing_frame) { + frame_headers += record->frame_header_count; + decoded_samples += record->pic_size; + } + if (record->show_frame) { + display_samples += record->pic_size; + } + tiles += record->tiles; + encoded_size_in_bytes += record->encoded_size_in_bytes; + --index; + if (index < 0) index = FRAME_WINDOW_SIZE - 1; + } + level_spec->max_header_rate = + AOMMAX(level_spec->max_header_rate, frame_headers); + // TODO(huisu): we can now compute max display rate with the decoder model, so + // these couple of lines can be removed. Keep them here for a while for + // debugging purpose. 
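+ // Note: the maxima folded in below are effectively per-1-second-window + // rates, since the caller passes the number of shown frames counted over + // the last second by count_frames().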
+ level_spec->max_display_rate = + AOMMAX(level_spec->max_display_rate, display_samples); + level_spec->max_decode_rate = + AOMMAX(level_spec->max_decode_rate, decoded_samples); + level_spec->max_tile_rate = AOMMAX(level_spec->max_tile_rate, tiles); + level_stats->max_bitrate = + AOMMAX(level_stats->max_bitrate, + (int)AOMMIN(encoded_size_in_bytes * 8, (size_t)INT_MAX)); +} + +void av1_update_level_info(AV1_COMP *cpi, size_t size, int64_t ts_start, + int64_t ts_end) { + AV1_COMMON *const cm = &cpi->common; + const AV1LevelParams *const level_params = &cpi->ppi->level_params; + + const int upscaled_width = cm->superres_upscaled_width; + const int width = cm->width; + const int height = cm->height; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + const int tiles = tile_cols * tile_rows; + const int luma_pic_size = upscaled_width * height; + const int frame_header_count = cpi->frame_header_count; + const int show_frame = cm->show_frame; + const int show_existing_frame = cm->show_existing_frame; + + int max_tile_size; + int min_cropped_tile_width; + int min_cropped_tile_height; + int max_superres_tile_width; + int tile_width_is_valid; + get_tile_stats(cm, cpi->tile_data, &max_tile_size, &max_superres_tile_width, + &min_cropped_tile_width, &min_cropped_tile_height, + &tile_width_is_valid); + + const double compression_ratio = av1_get_compression_ratio(cm, size); + + const int temporal_layer_id = cm->temporal_layer_id; + const int spatial_layer_id = cm->spatial_layer_id; + const SequenceHeader *const seq_params = cm->seq_params; + const BITSTREAM_PROFILE profile = seq_params->profile; + const int is_still_picture = seq_params->still_picture; + // update level_stats + // TODO(kyslov@) fix the implementation according to buffer model + for (int i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; ++i) { + if (!is_in_operating_point(seq_params->operating_point_idc[i], + temporal_layer_id, spatial_layer_id) || + !((level_params->keep_level_stats >> i) & 1)) { + continue; + } + + AV1LevelInfo *const level_info = level_params->level_info[i]; + assert(level_info != NULL); + AV1LevelStats *const level_stats = &level_info->level_stats; + + level_stats->max_tile_size = + AOMMAX(level_stats->max_tile_size, max_tile_size); + level_stats->max_superres_tile_width = + AOMMAX(level_stats->max_superres_tile_width, max_superres_tile_width); + level_stats->min_cropped_tile_width = + AOMMIN(level_stats->min_cropped_tile_width, min_cropped_tile_width); + level_stats->min_cropped_tile_height = + AOMMIN(level_stats->min_cropped_tile_height, min_cropped_tile_height); + level_stats->tile_width_is_valid &= tile_width_is_valid; + level_stats->min_frame_width = AOMMIN(level_stats->min_frame_width, width); + level_stats->min_frame_height = + AOMMIN(level_stats->min_frame_height, height); + level_stats->min_cr = AOMMIN(level_stats->min_cr, compression_ratio); + level_stats->total_compressed_size += (double)size; + + // update level_spec + // TODO(kyslov@) update all spec fields + AV1LevelSpec *const level_spec = &level_info->level_spec; + level_spec->max_picture_size = + AOMMAX(level_spec->max_picture_size, luma_pic_size); + level_spec->max_h_size = + AOMMAX(level_spec->max_h_size, cm->superres_upscaled_width); + level_spec->max_v_size = AOMMAX(level_spec->max_v_size, height); + level_spec->max_tile_cols = AOMMAX(level_spec->max_tile_cols, tile_cols); + level_spec->max_tiles = AOMMAX(level_spec->max_tiles, tiles); + + // Store info. of current frame into FrameWindowBuffer. 
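+ // Note: the window buffer is a fixed 256-entry ring (FRAME_WINDOW_SIZE); + // once full, store_frame_record() advances 'start' and overwrites the + // oldest record.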
+ FrameWindowBuffer *const buffer = &level_info->frame_window_buffer; + store_frame_record(ts_start, ts_end, size, luma_pic_size, + frame_header_count, tiles, show_frame, + show_existing_frame, buffer); + if (show_frame) { + // Count the number of frames encoded in the past 1 second. + const int encoded_frames_in_last_second = + show_frame ? count_frames(buffer, TICKS_PER_SEC) : 0; + scan_past_frames(buffer, encoded_frames_in_last_second, level_spec, + level_stats); + level_stats->total_time_encoded += + (cpi->time_stamps.prev_ts_end - cpi->time_stamps.prev_ts_start) / + (double)TICKS_PER_SEC; + } + + DECODER_MODEL *const decoder_models = level_info->decoder_models; + for (AV1_LEVEL level = SEQ_LEVEL_2_0; level < SEQ_LEVELS; ++level) { + av1_decoder_model_process_frame(cpi, size << 3, &decoder_models[level]); + } + + // Check whether target level is met. + const AV1_LEVEL target_level = level_params->target_seq_level_idx[i]; + if (target_level < SEQ_LEVELS && cpi->oxcf.strict_level_conformance) { + assert(is_valid_seq_level_idx(target_level)); + const int tier = seq_params->tier[i]; + const TARGET_LEVEL_FAIL_ID fail_id = check_level_constraints( + level_info, target_level, tier, is_still_picture, profile, 0); + if (fail_id != TARGET_LEVEL_OK) { + const int target_level_major = 2 + (target_level >> 2); + const int target_level_minor = target_level & 3; + aom_internal_error(cm->error, AOM_CODEC_ERROR, + "Failed to encode to the target level %d_%d. %s", + target_level_major, target_level_minor, + level_fail_messages[fail_id]); + } + } + } +} + +aom_codec_err_t av1_get_seq_level_idx(const SequenceHeader *seq_params, + const AV1LevelParams *level_params, + int *seq_level_idx) { + const int is_still_picture = seq_params->still_picture; + const BITSTREAM_PROFILE profile = seq_params->profile; + for (int op = 0; op < seq_params->operating_points_cnt_minus_1 + 1; ++op) { + seq_level_idx[op] = (int)SEQ_LEVEL_MAX; + if (!((level_params->keep_level_stats >> op) & 1)) continue; + const int tier = seq_params->tier[op]; + const AV1LevelInfo *const level_info = level_params->level_info[op]; + assert(level_info != NULL); + for (int level = 0; level < SEQ_LEVELS; ++level) { + if (!is_valid_seq_level_idx(level)) continue; + const TARGET_LEVEL_FAIL_ID fail_id = check_level_constraints( + level_info, level, tier, is_still_picture, profile, 1); + if (fail_id == TARGET_LEVEL_OK) { + seq_level_idx[op] = level; + break; + } + } + } + + return AOM_CODEC_OK; +} + +aom_codec_err_t av1_get_target_seq_level_idx(const SequenceHeader *seq_params, + const AV1LevelParams *level_params, + int *target_seq_level_idx) { + for (int op = 0; op < seq_params->operating_points_cnt_minus_1 + 1; ++op) { + target_seq_level_idx[op] = (int)SEQ_LEVEL_MAX; + if (!((level_params->keep_level_stats >> op) & 1)) continue; + target_seq_level_idx[op] = level_params->target_seq_level_idx[op]; + } + + return AOM_CODEC_OK; +} diff --git a/third_party/aom/av1/encoder/level.h b/third_party/aom/av1/encoder/level.h new file mode 100644 index 0000000000..ebf2a1c19d --- /dev/null +++ b/third_party/aom/av1/encoder/level.h @@ -0,0 +1,221 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_LEVEL_H_ +#define AOM_AV1_ENCODER_LEVEL_H_ + +#include "av1/common/enums.h" + +struct AV1_COMP; + +// AV1 Level Specifications +typedef struct { + AV1_LEVEL level; + int max_picture_size; + int max_h_size; + int max_v_size; + int max_header_rate; + int max_tile_rate; + int max_tiles; + int max_tile_cols; + int64_t max_display_rate; + int64_t max_decode_rate; + double main_mbps; + double high_mbps; + double main_cr; + double high_cr; +} AV1LevelSpec; + +typedef struct { + int64_t ts_start; + int64_t ts_end; + size_t encoded_size_in_bytes; + int pic_size; + int frame_header_count; + int tiles; + int show_frame; + int show_existing_frame; +} FrameRecord; + +// Record frame info. in a rolling window. +#define FRAME_WINDOW_SIZE 256 +typedef struct { + FrameRecord buf[FRAME_WINDOW_SIZE]; + int num; // Number of FrameRecord stored in the buffer. + int start; // Buffer index of the first FrameRecord. +} FrameWindowBuffer; + +typedef struct { + int max_bitrate; // Max bitrate in any 1-second window, in bps. + int max_tile_size; + int max_superres_tile_width; + int min_cropped_tile_width; + int min_cropped_tile_height; + int tile_width_is_valid; + int min_frame_width; + int min_frame_height; + double total_compressed_size; // In bytes. + double total_time_encoded; // In seconds. + double min_cr; +} AV1LevelStats; + +// The following data structures are for the decoder model. +typedef struct { + int decoder_ref_count; + int player_ref_count; + int display_index; + FRAME_TYPE frame_type; + double presentation_time; +} FRAME_BUFFER; + +// Interval of bits transmission for a DFG(Decodable Frame Group). +typedef struct { + double first_bit_arrival_time; // Time when the first bit arrives. + double last_bit_arrival_time; // Time when the last bit arrives. + // Removal time means the time when the bits to be decoded are removed from + // the smoothing buffer. Removal time is essentially the time when the + // decoding of the frame starts. + double removal_time; +} DFG_INTERVAL; + +#define DFG_INTERVAL_QUEUE_SIZE 64 +typedef struct { + int head; + int size; + double total_interval; + DFG_INTERVAL buf[DFG_INTERVAL_QUEUE_SIZE]; +} DFG_INTERVAL_QUEUE; + +enum { + RESOURCE_MODE = 0, // Resource availability mode. + SCHEDULE_MODE // Decoding schedule mode. +} UENUM1BYTE(DECODER_MODEL_MODE); + +enum { + DECODER_MODEL_OK = 0, + DECODE_BUFFER_AVAILABLE_LATE, + DECODE_FRAME_BUF_UNAVAILABLE, + DECODE_EXISTING_FRAME_BUF_EMPTY, + DISPLAY_FRAME_LATE, + SMOOTHING_BUFFER_UNDERFLOW, + SMOOTHING_BUFFER_OVERFLOW, + DECODER_MODEL_DISABLED +} UENUM1BYTE(DECODER_MODEL_STATUS); + +#define BUFFER_POOL_MAX_SIZE 10 +typedef struct { + DECODER_MODEL_STATUS status; + DECODER_MODEL_MODE mode; + bool is_low_delay_mode; + AV1_LEVEL level; + int encoder_buffer_delay; // In units of 1/90000 seconds. + int decoder_buffer_delay; // In units of 1/90000 seconds. + int num_ticks_per_picture; + int initial_display_delay; // In units of frames. + int64_t decode_rate; + double display_clock_tick; // In units of seconds. + double current_time; // In units of seconds. + double initial_presentation_delay; // In units of seconds. + double bit_rate; // Bits per second. + + int num_frame; + int num_decoded_frame; + int num_shown_frame; + int vbi[REF_FRAMES]; // Virtual buffer index. 
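+ // Note: vbi[i] maps reference slot i to an entry of frame_buffer_pool (or + // -1 when empty); decoding a frame whose refresh_frame_flags has bit i set + // re-points slot i to the new buffer.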
+ FRAME_BUFFER frame_buffer_pool[BUFFER_POOL_MAX_SIZE]; + DFG_INTERVAL_QUEUE dfg_interval_queue; + + // Information for the DFG(Decodable Frame Group) being processed. + double first_bit_arrival_time; + double last_bit_arrival_time; + size_t coded_bits; + + // Information for the frame being processed. + double removal_time; + double presentation_time; + int decode_samples; + int display_samples; + + double max_display_rate; + double max_decode_rate; +} DECODER_MODEL; + +typedef struct { + AV1LevelStats level_stats; + AV1LevelSpec level_spec; + FrameWindowBuffer frame_window_buffer; + DECODER_MODEL decoder_models[SEQ_LEVELS]; +} AV1LevelInfo; + +typedef struct AV1LevelParams { + // Specifies the level that the coded video sequence conforms to for each + // operating point. + AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS]; + // Bit mask to indicate whether to keep level stats for corresponding + // operating points. + uint32_t keep_level_stats; + // Level information for each operating point. + AV1LevelInfo *level_info[MAX_NUM_OPERATING_POINTS]; +} AV1LevelParams; + +static INLINE int is_in_operating_point(int operating_point, + int temporal_layer_id, + int spatial_layer_id) { + if (!operating_point) return 1; + + return ((operating_point >> temporal_layer_id) & 1) && + ((operating_point >> (spatial_layer_id + 8)) & 1); +} + +void av1_init_level_info(struct AV1_COMP *cpi); + +void av1_update_level_info(struct AV1_COMP *cpi, size_t size, int64_t ts_start, + int64_t ts_end); + +// Return sequence level indices in seq_level_idx[MAX_NUM_OPERATING_POINTS]. +aom_codec_err_t av1_get_seq_level_idx(const SequenceHeader *seq_params, + const AV1LevelParams *level_params, + int *seq_level_idx); + +aom_codec_err_t av1_get_target_seq_level_idx(const SequenceHeader *seq_params, + const AV1LevelParams *level_params, + int *target_seq_level_idx); + +// Print the status of the decoder model(for debugging). +void av1_decoder_model_print_status(const DECODER_MODEL *const decoder_model); + +void av1_decoder_model_init(const struct AV1_COMP *const cpi, AV1_LEVEL level, + int op_index, DECODER_MODEL *const decoder_model); + +void av1_decoder_model_process_frame(const struct AV1_COMP *const cpi, + size_t coded_bits, + DECODER_MODEL *const decoder_model); + +// This function uses the decoder model to check whether there could be +// SMOOTHING_BUFFER_UNDERFLOW or SMOOTHING_BUFFER_OVERFLOW. It does not +// update the content of decoder_model, and can be used to target certain +// encoding level in the recode loop. +DECODER_MODEL_STATUS av1_decoder_model_try_smooth_buf( + const struct AV1_COMP *const cpi, size_t coded_bits, + const DECODER_MODEL *const decoder_model); + +// Return max bitrate(bps) for given level. +double av1_get_max_bitrate_for_level(AV1_LEVEL level_index, int tier, + BITSTREAM_PROFILE profile); + +// Get max number of tiles and tile columns for given level. +void av1_get_max_tiles_for_level(AV1_LEVEL level_index, int *const max_tiles, + int *const max_tile_cols); + +// Return minimum compression ratio for given level. +double av1_get_min_cr_for_level(AV1_LEVEL level_index, int tier, + int is_still_picture); +#endif // AOM_AV1_ENCODER_LEVEL_H_ diff --git a/third_party/aom/av1/encoder/lookahead.c b/third_party/aom/av1/encoder/lookahead.c new file mode 100644 index 0000000000..9ef9b88675 --- /dev/null +++ b/third_party/aom/av1/encoder/lookahead.c @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include <assert.h> +#include <stdlib.h> + +#include "config/aom_config.h" + +#include "aom_scale/yv12config.h" +#include "av1/common/common.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/extend.h" +#include "av1/encoder/lookahead.h" + +/* Return the buffer at the given absolute index and increment the index */ +static struct lookahead_entry *pop(struct lookahead_ctx *ctx, int *idx) { + int index = *idx; + struct lookahead_entry *buf = ctx->buf + index; + + assert(index < ctx->max_sz); + if (++index >= ctx->max_sz) index -= ctx->max_sz; + *idx = index; + return buf; +} + +void av1_lookahead_destroy(struct lookahead_ctx *ctx) { + if (ctx) { + if (ctx->buf) { + int i; + + for (i = 0; i < ctx->max_sz; i++) aom_free_frame_buffer(&ctx->buf[i].img); + free(ctx->buf); + } + free(ctx); + } +} + +struct lookahead_ctx *av1_lookahead_init( + unsigned int width, unsigned int height, unsigned int subsampling_x, + unsigned int subsampling_y, int use_highbitdepth, unsigned int depth, + const int border_in_pixels, int byte_alignment, int num_lap_buffers, + bool is_all_intra, int num_pyramid_levels) { + int lag_in_frames = AOMMAX(1, depth); + + // For all-intra frame encoding, previous source frames are not required. + // Hence max_pre_frames is set to 0 in this case. As previous source frames + // are accessed using a negative index to av1_lookahead_peek(), setting + // max_pre_frames to 0 will cause av1_lookahead_peek() to return NULL for a + // negative index. + const uint8_t max_pre_frames = is_all_intra ? 0 : MAX_PRE_FRAMES; + + // Add the lags to depth and clamp + depth += num_lap_buffers; + depth = clamp(depth, 1, MAX_TOTAL_BUFFERS); + + // Allocate memory to keep previous source frames available. + depth += max_pre_frames; + + // Allocate the lookahead structures + struct lookahead_ctx *ctx = calloc(1, sizeof(*ctx)); + if (ctx) { + unsigned int i; + ctx->max_sz = depth; + ctx->push_frame_count = 0; + ctx->max_pre_frames = max_pre_frames; + ctx->read_ctxs[ENCODE_STAGE].pop_sz = ctx->max_sz - ctx->max_pre_frames; + ctx->read_ctxs[ENCODE_STAGE].valid = 1; + if (num_lap_buffers) { + ctx->read_ctxs[LAP_STAGE].pop_sz = lag_in_frames; + ctx->read_ctxs[LAP_STAGE].valid = 1; + } + ctx->buf = calloc(depth, sizeof(*ctx->buf)); + if (!ctx->buf) goto fail; + for (i = 0; i < depth; i++) { + if (aom_realloc_frame_buffer( + &ctx->buf[i].img, width, height, subsampling_x, subsampling_y, + use_highbitdepth, border_in_pixels, byte_alignment, NULL, NULL, + NULL, num_pyramid_levels, 0)) { + goto fail; + } + } + } + return ctx; +fail: + av1_lookahead_destroy(ctx); + return NULL; +} + +int av1_lookahead_full(const struct lookahead_ctx *ctx) { + // TODO(angiebird): Test this function. 
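+ // Note: "full" means the encode-stage queue has reached its pop threshold, + // i.e. max_sz minus the slots reserved for past frames (max_pre_frames).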
+ return ctx->read_ctxs[ENCODE_STAGE].sz >= ctx->read_ctxs[ENCODE_STAGE].pop_sz; +} + +int av1_lookahead_push(struct lookahead_ctx *ctx, const YV12_BUFFER_CONFIG *src, + int64_t ts_start, int64_t ts_end, int use_highbitdepth, + int num_pyramid_levels, aom_enc_frame_flags_t flags) { + int width = src->y_crop_width; + int height = src->y_crop_height; + int uv_width = src->uv_crop_width; + int uv_height = src->uv_crop_height; + int subsampling_x = src->subsampling_x; + int subsampling_y = src->subsampling_y; + int larger_dimensions, new_dimensions; + + assert(ctx->read_ctxs[ENCODE_STAGE].valid == 1); + if (ctx->read_ctxs[ENCODE_STAGE].sz + ctx->max_pre_frames > ctx->max_sz) + return 1; + + ctx->read_ctxs[ENCODE_STAGE].sz++; + if (ctx->read_ctxs[LAP_STAGE].valid) { + ctx->read_ctxs[LAP_STAGE].sz++; + } + + struct lookahead_entry *buf = pop(ctx, &ctx->write_idx); + + new_dimensions = width != buf->img.y_crop_width || + height != buf->img.y_crop_height || + uv_width != buf->img.uv_crop_width || + uv_height != buf->img.uv_crop_height; + larger_dimensions = width > buf->img.y_width || height > buf->img.y_height || + uv_width > buf->img.uv_width || + uv_height > buf->img.uv_height; + assert(!larger_dimensions || new_dimensions); + + if (larger_dimensions) { + YV12_BUFFER_CONFIG new_img; + memset(&new_img, 0, sizeof(new_img)); + if (aom_alloc_frame_buffer(&new_img, width, height, subsampling_x, + subsampling_y, use_highbitdepth, + AOM_BORDER_IN_PIXELS, 0, num_pyramid_levels, 0)) + return 1; + aom_free_frame_buffer(&buf->img); + buf->img = new_img; + } else if (new_dimensions) { + buf->img.y_crop_width = src->y_crop_width; + buf->img.y_crop_height = src->y_crop_height; + buf->img.uv_crop_width = src->uv_crop_width; + buf->img.uv_crop_height = src->uv_crop_height; + buf->img.subsampling_x = src->subsampling_x; + buf->img.subsampling_y = src->subsampling_y; + } + // Partial copy not implemented yet + av1_copy_and_extend_frame(src, &buf->img); + + buf->ts_start = ts_start; + buf->ts_end = ts_end; + buf->display_idx = ctx->push_frame_count; + buf->flags = flags; + ++ctx->push_frame_count; + aom_remove_metadata_from_frame_buffer(&buf->img); + if (src->metadata && + aom_copy_metadata_to_frame_buffer(&buf->img, src->metadata)) { + return 1; + } + return 0; +} + +struct lookahead_entry *av1_lookahead_pop(struct lookahead_ctx *ctx, int drain, + COMPRESSOR_STAGE stage) { + struct lookahead_entry *buf = NULL; + if (ctx) { + struct read_ctx *read_ctx = &ctx->read_ctxs[stage]; + assert(read_ctx->valid == 1); + if (read_ctx->sz && (drain || read_ctx->sz == read_ctx->pop_sz)) { + buf = pop(ctx, &read_ctx->read_idx); + read_ctx->sz--; + } + } + return buf; +} + +struct lookahead_entry *av1_lookahead_peek(struct lookahead_ctx *ctx, int index, + COMPRESSOR_STAGE stage) { + struct lookahead_entry *buf = NULL; + if (ctx == NULL) { + return buf; + } + + struct read_ctx *read_ctx = &ctx->read_ctxs[stage]; + assert(read_ctx->valid == 1); + if (index >= 0) { + // Forward peek + if (index < read_ctx->sz) { + index += read_ctx->read_idx; + if (index >= ctx->max_sz) index -= ctx->max_sz; + buf = ctx->buf + index; + } + } else if (index < 0) { + // Backward peek + if (-index <= ctx->max_pre_frames) { + index += (int)(read_ctx->read_idx); + if (index < 0) index += (int)(ctx->max_sz); + buf = ctx->buf + index; + } + } + + return buf; +} + +unsigned int av1_lookahead_depth(struct lookahead_ctx *ctx, + COMPRESSOR_STAGE stage) { + assert(ctx != NULL); + + struct read_ctx *read_ctx = &ctx->read_ctxs[stage]; + assert(read_ctx->valid 
== 1) + return read_ctx->sz; +} + +int av1_lookahead_pop_sz(struct lookahead_ctx *ctx, COMPRESSOR_STAGE stage) { + assert(ctx != NULL); + + struct read_ctx *read_ctx = &ctx->read_ctxs[stage]; + assert(read_ctx->valid == 1); + return read_ctx->pop_sz; +} diff --git a/third_party/aom/av1/encoder/lookahead.h b/third_party/aom/av1/encoder/lookahead.h new file mode 100644 index 0000000000..c0e6d222f5 --- /dev/null +++ b/third_party/aom/av1/encoder/lookahead.h @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*!\file + * \brief Describes look ahead buffer operations. + */ +#ifndef AOM_AV1_ENCODER_LOOKAHEAD_H_ +#define AOM_AV1_ENCODER_LOOKAHEAD_H_ + +#include <stdbool.h> + +#include "aom_scale/yv12config.h" +#include "aom/aom_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*!\cond */ +#define MAX_LAG_BUFFERS 48 +#define MAX_LAP_BUFFERS 48 +#define MAX_TOTAL_BUFFERS (MAX_LAG_BUFFERS + MAX_LAP_BUFFERS) +#define LAP_LAG_IN_FRAMES 17 + +struct lookahead_entry { + YV12_BUFFER_CONFIG img; + int64_t ts_start; + int64_t ts_end; + int display_idx; + aom_enc_frame_flags_t flags; +}; + +// The max of past frames we want to keep in the queue. +#define MAX_PRE_FRAMES 1 + +enum { ENCODE_STAGE, LAP_STAGE, MAX_STAGES } UENUM1BYTE(COMPRESSOR_STAGE); + +struct read_ctx { + int sz; /* Number of buffers currently in the queue */ + int read_idx; /* Read index */ + int pop_sz; /* Size to check for pop condition */ + int valid; /* Is this ctx valid? */ +}; + +struct lookahead_ctx { + int max_sz; /* Absolute size of the queue */ + int write_idx; /* Write index */ + struct read_ctx read_ctxs[MAX_STAGES]; /* Read context */ + struct lookahead_entry *buf; /* Buffer list */ + int push_frame_count; /* Number of frames that have been pushed in the queue*/ + uint8_t + max_pre_frames; /* Maximum number of past frames allowed in the queue */ +}; +/*!\endcond */ + +/**\brief Initializes the lookahead stage + * + * The lookahead stage is a queue of frame buffers on which some analysis + * may be done when buffers are enqueued. + */ +struct lookahead_ctx *av1_lookahead_init( + unsigned int width, unsigned int height, unsigned int subsampling_x, + unsigned int subsampling_y, int use_highbitdepth, unsigned int depth, + const int border_in_pixels, int byte_alignment, int num_lap_buffers, + bool is_all_intra, int num_pyramid_levels); + +/**\brief Destroys the lookahead stage + */ +void av1_lookahead_destroy(struct lookahead_ctx *ctx); + +/**\brief Check if lookahead buffer is full + */ +int av1_lookahead_full(const struct lookahead_ctx *ctx); + +/**\brief Enqueue a source buffer + * + * This function will copy the source image into a new framebuffer with + * the expected stride/border. 
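+ * A push can fail (returning 1) when the queue is already at capacity, or + * when a larger internal buffer cannot be allocated for bigger incoming + * frame dimensions.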
+ * + * \param[in] ctx Pointer to the lookahead context + * \param[in] src Pointer to the image to enqueue + * \param[in] ts_start Timestamp for the start of this frame + * \param[in] ts_end Timestamp for the end of this frame + * \param[in] use_highbitdepth Tell if HBD is used + * \param[in] num_pyramid_levels Number of pyramid levels to allocate + for each frame buffer + * \param[in] flags Flags set on this frame + */ +int av1_lookahead_push(struct lookahead_ctx *ctx, const YV12_BUFFER_CONFIG *src, + int64_t ts_start, int64_t ts_end, int use_highbitdepth, + int num_pyramid_levels, aom_enc_frame_flags_t flags); + +/**\brief Get the next source buffer to encode + * + * \param[in] ctx Pointer to the lookahead context + * \param[in] drain Flag indicating the buffer should be drained + * (return a buffer regardless of the current queue depth) + * \param[in] stage Encoder stage + * + * \retval Return NULL, if drain set and queue is empty, or if drain not set and + * queue not of the configured depth. + */ +struct lookahead_entry *av1_lookahead_pop(struct lookahead_ctx *ctx, int drain, + COMPRESSOR_STAGE stage); + +/**\brief Get a future source buffer to encode + * + * \param[in] ctx Pointer to the lookahead context + * \param[in] index Index of the frame to be returned, 0 == next frame + * \param[in] stage Encoder stage + * + * \retval Return NULL, if no buffer exists at the specified index + */ +struct lookahead_entry *av1_lookahead_peek(struct lookahead_ctx *ctx, int index, + COMPRESSOR_STAGE stage); + +/**\brief Get the number of frames currently in the lookahead queue + */ +unsigned int av1_lookahead_depth(struct lookahead_ctx *ctx, + COMPRESSOR_STAGE stage); + +/**\brief Get pop_sz value + */ +int av1_lookahead_pop_sz(struct lookahead_ctx *ctx, COMPRESSOR_STAGE stage); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_LOOKAHEAD_H_ diff --git a/third_party/aom/av1/encoder/mcomp.c b/third_party/aom/av1/encoder/mcomp.c new file mode 100644 index 0000000000..4e53447379 --- /dev/null +++ b/third_party/aom/av1/encoder/mcomp.c @@ -0,0 +1,3998 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/common.h"
+#include "av1/common/filter.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/reconinter.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
+
+static INLINE void init_mv_cost_params(MV_COST_PARAMS *mv_cost_params,
+                                       const MvCosts *mv_costs,
+                                       const MV *ref_mv, int errorperbit,
+                                       int sadperbit) {
+  mv_cost_params->ref_mv = ref_mv;
+  mv_cost_params->full_ref_mv = get_fullmv_from_mv(ref_mv);
+  mv_cost_params->mv_cost_type = MV_COST_ENTROPY;
+  mv_cost_params->error_per_bit = errorperbit;
+  mv_cost_params->sad_per_bit = sadperbit;
+  // For allintra encoding mode, 'mv_costs' is not allocated. Hence, the
+  // population of mvjcost and mvcost is avoided. In case of IntraBC, these
+  // values are populated from 'dv_costs' in av1_set_ms_to_intra_mode().
+  if (mv_costs != NULL) {
+    mv_cost_params->mvjcost = mv_costs->nmv_joint_cost;
+    mv_cost_params->mvcost[0] = mv_costs->mv_cost_stack[0];
+    mv_cost_params->mvcost[1] = mv_costs->mv_cost_stack[1];
+  }
+}
+
+static INLINE void init_ms_buffers(MSBuffers *ms_buffers, const MACROBLOCK *x) {
+  ms_buffers->ref = &x->e_mbd.plane[0].pre[0];
+  ms_buffers->src = &x->plane[0].src;
+
+  av1_set_ms_compound_refs(ms_buffers, NULL, NULL, 0, 0);
+
+  ms_buffers->wsrc = x->obmc_buffer.wsrc;
+  ms_buffers->obmc_mask = x->obmc_buffer.mask;
+}
+
+void av1_init_obmc_buffer(OBMCBuffer *obmc_buffer) {
+  obmc_buffer->wsrc = NULL;
+  obmc_buffer->mask = NULL;
+  obmc_buffer->above_pred = NULL;
+  obmc_buffer->left_pred = NULL;
+}
+
+void av1_make_default_fullpel_ms_params(
+    FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const struct AV1_COMP *cpi,
+    MACROBLOCK *x, BLOCK_SIZE bsize, const MV *ref_mv, FULLPEL_MV start_mv,
+    const search_site_config search_sites[NUM_DISTINCT_SEARCH_METHODS],
+    SEARCH_METHODS search_method, int fine_search_interval) {
+  const MV_SPEED_FEATURES *mv_sf = &cpi->sf.mv_sf;
+  const int is_key_frame =
+      cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == KF_UPDATE;
+
+  // High level params
+  ms_params->bsize = bsize;
+  ms_params->vfp = &cpi->ppi->fn_ptr[bsize];
+
+  init_ms_buffers(&ms_params->ms_buffers, x);
+
+  av1_set_mv_search_method(ms_params, search_sites, search_method);
+
+  ms_params->mesh_patterns[0] = mv_sf->mesh_patterns;
+  ms_params->mesh_patterns[1] = mv_sf->intrabc_mesh_patterns;
+  ms_params->force_mesh_thresh = mv_sf->exhaustive_searches_thresh;
+  ms_params->prune_mesh_search =
+      (cpi->sf.mv_sf.prune_mesh_search == PRUNE_MESH_SEARCH_LVL_2) ?
1 : 0; + ms_params->mesh_search_mv_diff_threshold = 4; + ms_params->run_mesh_search = 0; + ms_params->fine_search_interval = fine_search_interval; + + ms_params->is_intra_mode = 0; + + ms_params->fast_obmc_search = mv_sf->obmc_full_pixel_search_level; + + ms_params->mv_limits = x->mv_limits; + av1_set_mv_search_range(&ms_params->mv_limits, ref_mv); + + // Mvcost params + init_mv_cost_params(&ms_params->mv_cost_params, x->mv_costs, ref_mv, + x->errorperbit, x->sadperbit); + + ms_params->sdf = ms_params->vfp->sdf; + ms_params->sdx4df = ms_params->vfp->sdx4df; + ms_params->sdx3df = ms_params->vfp->sdx3df; + + if (mv_sf->use_downsampled_sad == 2 && block_size_high[bsize] >= 16) { + ms_params->sdf = ms_params->vfp->sdsf; + ms_params->sdx4df = ms_params->vfp->sdsx4df; + // Skip version of sadx3 is not available yet + ms_params->sdx3df = ms_params->vfp->sdsx4df; + } else if (mv_sf->use_downsampled_sad == 1 && block_size_high[bsize] >= 16 && + !is_key_frame) { + FULLPEL_MV start_mv_clamped = start_mv; + // adjust start_mv to make sure it is within MV range + clamp_fullmv(&start_mv_clamped, &ms_params->mv_limits); + + const struct buf_2d *const ref = ms_params->ms_buffers.ref; + const int ref_stride = ref->stride; + const uint8_t *best_address = get_buf_from_fullmv(ref, &start_mv_clamped); + const struct buf_2d *const src = ms_params->ms_buffers.src; + const uint8_t *src_buf = src->buf; + const int src_stride = src->stride; + + unsigned int start_mv_sad_even_rows, start_mv_sad_odd_rows; + start_mv_sad_even_rows = + ms_params->vfp->sdsf(src_buf, src_stride, best_address, ref_stride); + start_mv_sad_odd_rows = + ms_params->vfp->sdsf(src_buf + src_stride, src_stride, + best_address + ref_stride, ref_stride); + + // If the absolute SAD difference computed between the pred-to-src of even + // and odd rows is small, skip every other row in sad computation. 
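+    // Worked example (illustrative numbers only): if the even-row SAD is
+    // 1000 and the odd-row SAD is 980, the absolute difference is 20; with
+    // mult_thresh = 4 below, 20 * 4 = 80 < 1000, so the two row sets are
+    // deemed similar enough and the downsampled (skip-row) SAD functions
+    // are selected.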
+ const int odd_to_even_diff_sad = + abs((int)start_mv_sad_even_rows - (int)start_mv_sad_odd_rows); + const int mult_thresh = 4; + if (odd_to_even_diff_sad * mult_thresh < (int)start_mv_sad_even_rows) { + ms_params->sdf = ms_params->vfp->sdsf; + ms_params->sdx4df = ms_params->vfp->sdsx4df; + ms_params->sdx3df = ms_params->vfp->sdsx4df; + } + } +} + +void av1_set_ms_to_intra_mode(FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const IntraBCMVCosts *dv_costs) { + ms_params->is_intra_mode = 1; + + MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + + mv_cost_params->mvjcost = dv_costs->joint_mv; + mv_cost_params->mvcost[0] = dv_costs->dv_costs[0]; + mv_cost_params->mvcost[1] = dv_costs->dv_costs[1]; +} + +void av1_make_default_subpel_ms_params(SUBPEL_MOTION_SEARCH_PARAMS *ms_params, + const struct AV1_COMP *cpi, + const MACROBLOCK *x, BLOCK_SIZE bsize, + const MV *ref_mv, const int *cost_list) { + const AV1_COMMON *cm = &cpi->common; + // High level params + ms_params->allow_hp = cm->features.allow_high_precision_mv; + ms_params->forced_stop = cpi->sf.mv_sf.subpel_force_stop; + ms_params->iters_per_step = cpi->sf.mv_sf.subpel_iters_per_step; + ms_params->cost_list = cond_cost_list_const(cpi, cost_list); + + av1_set_subpel_mv_search_range(&ms_params->mv_limits, &x->mv_limits, ref_mv); + + // Mvcost params + init_mv_cost_params(&ms_params->mv_cost_params, x->mv_costs, ref_mv, + x->errorperbit, x->sadperbit); + + // Subpel variance params + ms_params->var_params.vfp = &cpi->ppi->fn_ptr[bsize]; + ms_params->var_params.subpel_search_type = + cpi->sf.mv_sf.use_accurate_subpel_search; + ms_params->var_params.w = block_size_wide[bsize]; + ms_params->var_params.h = block_size_high[bsize]; + + // Ref and src buffers + MSBuffers *ms_buffers = &ms_params->var_params.ms_buffers; + init_ms_buffers(ms_buffers, x); +} + +void av1_set_mv_search_range(FullMvLimits *mv_limits, const MV *mv) { + // Calculate the outermost full-pixel MVs which are inside the limits set by + // av1_set_subpel_mv_search_range(). + // + // The subpel limits are simply mv->col +/- 8*MAX_FULL_PEL_VAL, and similar + // for mv->row. We can then divide by 8 to find the fullpel MV limits. But + // we have to be careful about the rounding. We want these bounds to be + // at least as tight as the subpel limits, which means that we must round + // the minimum values up and the maximum values down when dividing. + int col_min = ((mv->col + 7) >> 3) - MAX_FULL_PEL_VAL; + int row_min = ((mv->row + 7) >> 3) - MAX_FULL_PEL_VAL; + int col_max = (mv->col >> 3) + MAX_FULL_PEL_VAL; + int row_max = (mv->row >> 3) + MAX_FULL_PEL_VAL; + + col_min = AOMMAX(col_min, (MV_LOW >> 3) + 1); + row_min = AOMMAX(row_min, (MV_LOW >> 3) + 1); + col_max = AOMMIN(col_max, (MV_UPP >> 3) - 1); + row_max = AOMMIN(row_max, (MV_UPP >> 3) - 1); + + // Get intersection of UMV window and valid MV window to reduce # of checks + // in diamond search. + if (mv_limits->col_min < col_min) mv_limits->col_min = col_min; + if (mv_limits->col_max > col_max) mv_limits->col_max = col_max; + if (mv_limits->row_min < row_min) mv_limits->row_min = row_min; + if (mv_limits->row_max > row_max) mv_limits->row_max = row_max; + + mv_limits->col_max = AOMMAX(mv_limits->col_min, mv_limits->col_max); + mv_limits->row_max = AOMMAX(mv_limits->row_min, mv_limits->row_max); +} + +int av1_init_search_range(int size) { + int sr = 0; + // Minimum search size no matter what the passed in value. 
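+  // Worked example (assuming MAX_FULL_PEL_VAL == 1023): for size = 64 the
+  // loop below stops once (64 << sr) >= 1023, giving sr = 4, which is then
+  // capped at MAX_MVSEARCH_STEPS - 2.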
+  size = AOMMAX(16, size);
+
+  while ((size << sr) < MAX_FULL_PEL_VAL) sr++;
+
+  sr = AOMMIN(sr, MAX_MVSEARCH_STEPS - 2);
+  return sr;
+}
+
+// ============================================================================
+//  Cost of motion vectors
+// ============================================================================
+// TODO(any): Adaptively adjust the regularization strength based on image size
+// and motion activity instead of using hard-coded values. It seems like we
+// roughly halve the lambda for each increase in resolution.
+// These are the multipliers used to perform regularization in motion
+// compensation when x->mv_cost_type is set to MV_COST_L1.
+// LOWRES
+#define SSE_LAMBDA_LOWRES 2   // Used by mv_cost_err_fn
+#define SAD_LAMBDA_LOWRES 32  // Used by mvsad_err_cost during full pixel search
+// MIDRES
+#define SSE_LAMBDA_MIDRES 0   // Used by mv_cost_err_fn
+#define SAD_LAMBDA_MIDRES 15  // Used by mvsad_err_cost during full pixel search
+// HDRES
+#define SSE_LAMBDA_HDRES 1  // Used by mv_cost_err_fn
+#define SAD_LAMBDA_HDRES 8  // Used by mvsad_err_cost during full pixel search
+
+// Returns the rate of encoding the current motion vector based on the
+// joint_cost and comp_cost. joint_cost covers the cost of transmitting
+// JOINT_MV, and comp_cost covers the cost of transmitting the actual motion
+// vector.
+static INLINE int mv_cost(const MV *mv, const int *joint_cost,
+                          const int *const comp_cost[2]) {
+  return joint_cost[av1_get_mv_joint(mv)] + comp_cost[0][mv->row] +
+         comp_cost[1][mv->col];
+}
+
+#define CONVERT_TO_CONST_MVCOST(ptr) ((const int *const *)(ptr))
+// Returns the cost of encoding the motion vector diff := *mv - *ref. The cost
+// is defined as the rate required to encode diff * weight, rounded to the
+// nearest 2 ** 7.
+// This is NOT used during motion compensation.
+int av1_mv_bit_cost(const MV *mv, const MV *ref_mv, const int *mvjcost,
+                    int *const mvcost[2], int weight) {
+  const MV diff = { mv->row - ref_mv->row, mv->col - ref_mv->col };
+  return ROUND_POWER_OF_TWO(
+      mv_cost(&diff, mvjcost, CONVERT_TO_CONST_MVCOST(mvcost)) * weight, 7);
+}
+
+// Returns the cost of using the current mv during the motion search. This is
+// used when var is used as the error metric.
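+// For MV_COST_ENTROPY the rate from mv_cost() is scaled by error_per_bit and
+// shifted down so that it is commensurate with a variance-based distortion;
+// for the MV_COST_L1_* types a resolution-dependent lambda is applied to the
+// L1 norm of the mv difference instead (see the switch statement below).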
+#define PIXEL_TRANSFORM_ERROR_SCALE 4 +static INLINE int mv_err_cost(const MV *mv, const MV *ref_mv, + const int *mvjcost, const int *const mvcost[2], + int error_per_bit, MV_COST_TYPE mv_cost_type) { + const MV diff = { mv->row - ref_mv->row, mv->col - ref_mv->col }; + const MV abs_diff = { abs(diff.row), abs(diff.col) }; + + switch (mv_cost_type) { + case MV_COST_ENTROPY: + if (mvcost) { + return (int)ROUND_POWER_OF_TWO_64( + (int64_t)mv_cost(&diff, mvjcost, mvcost) * error_per_bit, + RDDIV_BITS + AV1_PROB_COST_SHIFT - RD_EPB_SHIFT + + PIXEL_TRANSFORM_ERROR_SCALE); + } + return 0; + case MV_COST_L1_LOWRES: + return (SSE_LAMBDA_LOWRES * (abs_diff.row + abs_diff.col)) >> 3; + case MV_COST_L1_MIDRES: + return (SSE_LAMBDA_MIDRES * (abs_diff.row + abs_diff.col)) >> 3; + case MV_COST_L1_HDRES: + return (SSE_LAMBDA_HDRES * (abs_diff.row + abs_diff.col)) >> 3; + case MV_COST_NONE: return 0; + default: assert(0 && "Invalid rd_cost_type"); return 0; + } +} + +static INLINE int mv_err_cost_(const MV *mv, + const MV_COST_PARAMS *mv_cost_params) { + if (mv_cost_params->mv_cost_type == MV_COST_NONE) { + return 0; + } + return mv_err_cost(mv, mv_cost_params->ref_mv, mv_cost_params->mvjcost, + mv_cost_params->mvcost, mv_cost_params->error_per_bit, + mv_cost_params->mv_cost_type); +} + +// Returns the cost of using the current mv during the motion search. This is +// only used during full pixel motion search when sad is used as the error +// metric +static INLINE int mvsad_err_cost(const FULLPEL_MV *mv, const FULLPEL_MV *ref_mv, + const int *mvjcost, const int *const mvcost[2], + int sad_per_bit, MV_COST_TYPE mv_cost_type) { + const MV diff = { GET_MV_SUBPEL(mv->row - ref_mv->row), + GET_MV_SUBPEL(mv->col - ref_mv->col) }; + + switch (mv_cost_type) { + case MV_COST_ENTROPY: + return ROUND_POWER_OF_TWO( + (unsigned)mv_cost(&diff, mvjcost, CONVERT_TO_CONST_MVCOST(mvcost)) * + sad_per_bit, + AV1_PROB_COST_SHIFT); + case MV_COST_L1_LOWRES: + return (SAD_LAMBDA_LOWRES * (abs(diff.row) + abs(diff.col))) >> 3; + case MV_COST_L1_MIDRES: + return (SAD_LAMBDA_MIDRES * (abs(diff.row) + abs(diff.col))) >> 3; + case MV_COST_L1_HDRES: + return (SAD_LAMBDA_HDRES * (abs(diff.row) + abs(diff.col))) >> 3; + case MV_COST_NONE: return 0; + default: assert(0 && "Invalid rd_cost_type"); return 0; + } +} + +static INLINE int mvsad_err_cost_(const FULLPEL_MV *mv, + const MV_COST_PARAMS *mv_cost_params) { + return mvsad_err_cost(mv, &mv_cost_params->full_ref_mv, + mv_cost_params->mvjcost, mv_cost_params->mvcost, + mv_cost_params->sad_per_bit, + mv_cost_params->mv_cost_type); +} + +// ============================================================================= +// Fullpixel Motion Search: Translational +// ============================================================================= +#define MAX_PATTERN_SCALES 11 +#define MAX_PATTERN_CANDIDATES 8 // max number of candidates per scale +#define PATTERN_CANDIDATES_REF 3 // number of refinement candidates + +// Search site initialization for DIAMOND / CLAMPED_DIAMOND search methods. +// level = 0: DIAMOND, level = 1: CLAMPED_DIAMOND. +void av1_init_dsmotion_compensation(search_site_config *cfg, int stride, + int level) { + int num_search_steps = 0; + int stage_index = MAX_MVSEARCH_STEPS - 1; + + cfg->site[stage_index][0].mv.col = cfg->site[stage_index][0].mv.row = 0; + cfg->site[stage_index][0].offset = 0; + cfg->stride = stride; + + // Choose the initial step size depending on level. + const int first_step = (level > 0) ? 
(MAX_FIRST_STEP / 4) : MAX_FIRST_STEP; + + for (int radius = first_step; radius > 0;) { + int num_search_pts = 8; + + const FULLPEL_MV search_site_mvs[13] = { + { 0, 0 }, { -radius, 0 }, { radius, 0 }, + { 0, -radius }, { 0, radius }, { -radius, -radius }, + { radius, radius }, { -radius, radius }, { radius, -radius }, + }; + + int i; + for (i = 0; i <= num_search_pts; ++i) { + search_site *const site = &cfg->site[stage_index][i]; + site->mv = search_site_mvs[i]; + site->offset = get_offset_from_fullmv(&site->mv, stride); + } + cfg->searches_per_step[stage_index] = num_search_pts; + cfg->radius[stage_index] = radius; + // Update the search radius based on level. + if (!level || ((stage_index < 9) && level)) radius /= 2; + --stage_index; + ++num_search_steps; + } + cfg->num_search_steps = num_search_steps; +} + +void av1_init_motion_fpf(search_site_config *cfg, int stride) { + int num_search_steps = 0; + int stage_index = MAX_MVSEARCH_STEPS - 1; + + cfg->site[stage_index][0].mv.col = cfg->site[stage_index][0].mv.row = 0; + cfg->site[stage_index][0].offset = 0; + cfg->stride = stride; + + for (int radius = MAX_FIRST_STEP; radius > 0; radius /= 2) { + // Generate offsets for 8 search sites per step. + int tan_radius = AOMMAX((int)(0.41 * radius), 1); + int num_search_pts = 12; + if (radius == 1) num_search_pts = 8; + + const FULLPEL_MV search_site_mvs[13] = { + { 0, 0 }, + { -radius, 0 }, + { radius, 0 }, + { 0, -radius }, + { 0, radius }, + { -radius, -tan_radius }, + { radius, tan_radius }, + { -tan_radius, radius }, + { tan_radius, -radius }, + { -radius, tan_radius }, + { radius, -tan_radius }, + { tan_radius, radius }, + { -tan_radius, -radius }, + }; + + int i; + for (i = 0; i <= num_search_pts; ++i) { + search_site *const site = &cfg->site[stage_index][i]; + site->mv = search_site_mvs[i]; + site->offset = get_offset_from_fullmv(&site->mv, stride); + } + cfg->searches_per_step[stage_index] = num_search_pts; + cfg->radius[stage_index] = radius; + --stage_index; + ++num_search_steps; + } + cfg->num_search_steps = num_search_steps; +} + +// Search site initialization for NSTEP / NSTEP_8PT search methods. +// level = 0: NSTEP, level = 1: NSTEP_8PT. +void av1_init_motion_compensation_nstep(search_site_config *cfg, int stride, + int level) { + int num_search_steps = 0; + int stage_index = 0; + cfg->stride = stride; + int radius = 1; + const int num_stages = (level > 0) ? 16 : 15; + for (stage_index = 0; stage_index < num_stages; ++stage_index) { + int tan_radius = AOMMAX((int)(0.41 * radius), 1); + int num_search_pts = 12; + if ((radius <= 5) || (level > 0)) { + tan_radius = radius; + num_search_pts = 8; + } + const FULLPEL_MV search_site_mvs[13] = { + { 0, 0 }, + { -radius, 0 }, + { radius, 0 }, + { 0, -radius }, + { 0, radius }, + { -radius, -tan_radius }, + { radius, tan_radius }, + { -tan_radius, radius }, + { tan_radius, -radius }, + { -radius, tan_radius }, + { radius, -tan_radius }, + { tan_radius, radius }, + { -tan_radius, -radius }, + }; + + for (int i = 0; i <= num_search_pts; ++i) { + search_site *const site = &cfg->site[stage_index][i]; + site->mv = search_site_mvs[i]; + site->offset = get_offset_from_fullmv(&site->mv, stride); + } + cfg->searches_per_step[stage_index] = num_search_pts; + cfg->radius[stage_index] = radius; + ++num_search_steps; + if (stage_index < 12) + radius = (int)AOMMAX((radius * 1.5 + 0.5), radius + 1); + } + cfg->num_search_steps = num_search_steps; +} + +// Search site initialization for BIGDIA / FAST_BIGDIA / FAST_DIAMOND +// search methods. 
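+// The first scale checks only the 4 nearest neighbours; every later scale
+// places 8 candidates on a diamond whose largest step doubles each time
+// (e.g. scale 1 checks { -1, -1 }, { 0, -2 }, { 1, -1 }, { 2, 0 }, ...).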
+void av1_init_motion_compensation_bigdia(search_site_config *cfg, int stride, + int level) { + (void)level; + cfg->stride = stride; + // First scale has 4-closest points, the rest have 8 points in diamond + // shape at increasing scales + static const int bigdia_num_candidates[MAX_PATTERN_SCALES] = { + 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + }; + + // BIGDIA search method candidates. + // Note that the largest candidate step at each scale is 2^scale + /* clang-format off */ + static const FULLPEL_MV + site_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = { + { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 }, { 0, 0 }, { 0, 0 }, + { 0, 0 }, { 0, 0 } }, + { { -1, -1 }, { 0, -2 }, { 1, -1 }, { 2, 0 }, { 1, 1 }, { 0, 2 }, + { -1, 1 }, { -2, 0 } }, + { { -2, -2 }, { 0, -4 }, { 2, -2 }, { 4, 0 }, { 2, 2 }, { 0, 4 }, + { -2, 2 }, { -4, 0 } }, + { { -4, -4 }, { 0, -8 }, { 4, -4 }, { 8, 0 }, { 4, 4 }, { 0, 8 }, + { -4, 4 }, { -8, 0 } }, + { { -8, -8 }, { 0, -16 }, { 8, -8 }, { 16, 0 }, { 8, 8 }, { 0, 16 }, + { -8, 8 }, { -16, 0 } }, + { { -16, -16 }, { 0, -32 }, { 16, -16 }, { 32, 0 }, { 16, 16 }, + { 0, 32 }, { -16, 16 }, { -32, 0 } }, + { { -32, -32 }, { 0, -64 }, { 32, -32 }, { 64, 0 }, { 32, 32 }, + { 0, 64 }, { -32, 32 }, { -64, 0 } }, + { { -64, -64 }, { 0, -128 }, { 64, -64 }, { 128, 0 }, { 64, 64 }, + { 0, 128 }, { -64, 64 }, { -128, 0 } }, + { { -128, -128 }, { 0, -256 }, { 128, -128 }, { 256, 0 }, + { 128, 128 }, { 0, 256 }, { -128, 128 }, { -256, 0 } }, + { { -256, -256 }, { 0, -512 }, { 256, -256 }, { 512, 0 }, + { 256, 256 }, { 0, 512 }, { -256, 256 }, { -512, 0 } }, + { { -512, -512 }, { 0, -1024 }, { 512, -512 }, { 1024, 0 }, + { 512, 512 }, { 0, 1024 }, { -512, 512 }, { -1024, 0 } }, + }; + + /* clang-format on */ + int radius = 1; + for (int i = 0; i < MAX_PATTERN_SCALES; ++i) { + cfg->searches_per_step[i] = bigdia_num_candidates[i]; + cfg->radius[i] = radius; + for (int j = 0; j < MAX_PATTERN_CANDIDATES; ++j) { + search_site *const site = &cfg->site[i][j]; + site->mv = site_candidates[i][j]; + site->offset = get_offset_from_fullmv(&site->mv, stride); + } + radius *= 2; + } + cfg->num_search_steps = MAX_PATTERN_SCALES; +} + +// Search site initialization for SQUARE search method. +void av1_init_motion_compensation_square(search_site_config *cfg, int stride, + int level) { + (void)level; + cfg->stride = stride; + // All scales have 8 closest points in square shape. + static const int square_num_candidates[MAX_PATTERN_SCALES] = { + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + }; + + // Square search method candidates. + // Note that the largest candidate step at each scale is 2^scale. 
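+  // For instance, scale 0 checks the 8 points at Chebyshev distance 1 from
+  // the centre and scale 1 the 8 points at distance 2; each site's byte
+  // offset into the reference buffer is precomputed below so candidates can
+  // be addressed without per-search row/column arithmetic.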
+ /* clang-format off */ + static const FULLPEL_MV + square_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = { + { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 }, + { -1, 1 }, { -1, 0 } }, + { { -2, -2 }, { 0, -2 }, { 2, -2 }, { 2, 0 }, { 2, 2 }, { 0, 2 }, + { -2, 2 }, { -2, 0 } }, + { { -4, -4 }, { 0, -4 }, { 4, -4 }, { 4, 0 }, { 4, 4 }, { 0, 4 }, + { -4, 4 }, { -4, 0 } }, + { { -8, -8 }, { 0, -8 }, { 8, -8 }, { 8, 0 }, { 8, 8 }, { 0, 8 }, + { -8, 8 }, { -8, 0 } }, + { { -16, -16 }, { 0, -16 }, { 16, -16 }, { 16, 0 }, { 16, 16 }, + { 0, 16 }, { -16, 16 }, { -16, 0 } }, + { { -32, -32 }, { 0, -32 }, { 32, -32 }, { 32, 0 }, { 32, 32 }, + { 0, 32 }, { -32, 32 }, { -32, 0 } }, + { { -64, -64 }, { 0, -64 }, { 64, -64 }, { 64, 0 }, { 64, 64 }, + { 0, 64 }, { -64, 64 }, { -64, 0 } }, + { { -128, -128 }, { 0, -128 }, { 128, -128 }, { 128, 0 }, + { 128, 128 }, { 0, 128 }, { -128, 128 }, { -128, 0 } }, + { { -256, -256 }, { 0, -256 }, { 256, -256 }, { 256, 0 }, + { 256, 256 }, { 0, 256 }, { -256, 256 }, { -256, 0 } }, + { { -512, -512 }, { 0, -512 }, { 512, -512 }, { 512, 0 }, + { 512, 512 }, { 0, 512 }, { -512, 512 }, { -512, 0 } }, + { { -1024, -1024 }, { 0, -1024 }, { 1024, -1024 }, { 1024, 0 }, + { 1024, 1024 }, { 0, 1024 }, { -1024, 1024 }, { -1024, 0 } }, + }; + + /* clang-format on */ + int radius = 1; + for (int i = 0; i < MAX_PATTERN_SCALES; ++i) { + cfg->searches_per_step[i] = square_num_candidates[i]; + cfg->radius[i] = radius; + for (int j = 0; j < MAX_PATTERN_CANDIDATES; ++j) { + search_site *const site = &cfg->site[i][j]; + site->mv = square_candidates[i][j]; + site->offset = get_offset_from_fullmv(&site->mv, stride); + } + radius *= 2; + } + cfg->num_search_steps = MAX_PATTERN_SCALES; +} + +// Search site initialization for HEX / FAST_HEX search methods. +void av1_init_motion_compensation_hex(search_site_config *cfg, int stride, + int level) { + (void)level; + cfg->stride = stride; + // First scale has 8-closest points, the rest have 6 points in hex shape + // at increasing scales. + static const int hex_num_candidates[MAX_PATTERN_SCALES] = { 8, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6 }; + // Note that the largest candidate step at each scale is 2^scale. 
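+  // Only the first (unit) scale is a full 8-point square ring; later scales
+  // use a 6-point hexagon, so fewer SAD evaluations are needed per step than
+  // with the 8-point diamond or square patterns.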
+ /* clang-format off */ + static const FULLPEL_MV + hex_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = { + { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 }, + { -1, 1 }, { -1, 0 } }, + { { -1, -2 }, { 1, -2 }, { 2, 0 }, { 1, 2 }, { -1, 2 }, { -2, 0 } }, + { { -2, -4 }, { 2, -4 }, { 4, 0 }, { 2, 4 }, { -2, 4 }, { -4, 0 } }, + { { -4, -8 }, { 4, -8 }, { 8, 0 }, { 4, 8 }, { -4, 8 }, { -8, 0 } }, + { { -8, -16 }, { 8, -16 }, { 16, 0 }, { 8, 16 }, + { -8, 16 }, { -16, 0 } }, + { { -16, -32 }, { 16, -32 }, { 32, 0 }, { 16, 32 }, { -16, 32 }, + { -32, 0 } }, + { { -32, -64 }, { 32, -64 }, { 64, 0 }, { 32, 64 }, { -32, 64 }, + { -64, 0 } }, + { { -64, -128 }, { 64, -128 }, { 128, 0 }, { 64, 128 }, + { -64, 128 }, { -128, 0 } }, + { { -128, -256 }, { 128, -256 }, { 256, 0 }, { 128, 256 }, + { -128, 256 }, { -256, 0 } }, + { { -256, -512 }, { 256, -512 }, { 512, 0 }, { 256, 512 }, + { -256, 512 }, { -512, 0 } }, + { { -512, -1024 }, { 512, -1024 }, { 1024, 0 }, { 512, 1024 }, + { -512, 1024 }, { -1024, 0 } }, + }; + + /* clang-format on */ + int radius = 1; + for (int i = 0; i < MAX_PATTERN_SCALES; ++i) { + cfg->searches_per_step[i] = hex_num_candidates[i]; + cfg->radius[i] = radius; + for (int j = 0; j < hex_num_candidates[i]; ++j) { + search_site *const site = &cfg->site[i][j]; + site->mv = hex_candidates[i][j]; + site->offset = get_offset_from_fullmv(&site->mv, stride); + } + radius *= 2; + } + cfg->num_search_steps = MAX_PATTERN_SCALES; +} + +const av1_init_search_site_config + av1_init_motion_compensation[NUM_DISTINCT_SEARCH_METHODS] = { + av1_init_dsmotion_compensation, av1_init_motion_compensation_nstep, + av1_init_motion_compensation_nstep, av1_init_dsmotion_compensation, + av1_init_motion_compensation_hex, av1_init_motion_compensation_bigdia, + av1_init_motion_compensation_square + }; + +// Checks whether the mv is within range of the mv_limits +static INLINE int check_bounds(const FullMvLimits *mv_limits, int row, int col, + int range) { + return ((row - range) >= mv_limits->row_min) & + ((row + range) <= mv_limits->row_max) & + ((col - range) >= mv_limits->col_min) & + ((col + range) <= mv_limits->col_max); +} + +static INLINE int get_mvpred_var_cost( + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv, + FULLPEL_MV_STATS *mv_stats) { + const aom_variance_fn_ptr_t *vfp = ms_params->vfp; + const MV sub_this_mv = get_mv_from_fullmv(this_mv); + const struct buf_2d *const src = ms_params->ms_buffers.src; + const struct buf_2d *const ref = ms_params->ms_buffers.ref; + const uint8_t *src_buf = src->buf; + const int src_stride = src->stride; + const int ref_stride = ref->stride; + + int bestsme; + + bestsme = vfp->vf(src_buf, src_stride, get_buf_from_fullmv(ref, this_mv), + ref_stride, &mv_stats->sse); + mv_stats->distortion = bestsme; + + mv_stats->err_cost = mv_err_cost_(&sub_this_mv, &ms_params->mv_cost_params); + bestsme += mv_stats->err_cost; + + return bestsme; +} + +static INLINE int get_mvpred_sad(const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const struct buf_2d *const src, + const uint8_t *const ref_address, + const int ref_stride) { + const uint8_t *src_buf = src->buf; + const int src_stride = src->stride; + + return ms_params->sdf(src_buf, src_stride, ref_address, ref_stride); +} + +static INLINE int get_mvpred_compound_var_cost( + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv, + FULLPEL_MV_STATS *mv_stats) { + const aom_variance_fn_ptr_t *vfp = ms_params->vfp; + const struct buf_2d *const src = 
ms_params->ms_buffers.src; + const struct buf_2d *const ref = ms_params->ms_buffers.ref; + const uint8_t *src_buf = src->buf; + const int src_stride = src->stride; + const int ref_stride = ref->stride; + + const uint8_t *mask = ms_params->ms_buffers.mask; + const uint8_t *second_pred = ms_params->ms_buffers.second_pred; + const int mask_stride = ms_params->ms_buffers.mask_stride; + const int invert_mask = ms_params->ms_buffers.inv_mask; + int bestsme; + + if (mask) { + bestsme = vfp->msvf(get_buf_from_fullmv(ref, this_mv), ref_stride, 0, 0, + src_buf, src_stride, second_pred, mask, mask_stride, + invert_mask, &mv_stats->sse); + } else if (second_pred) { + bestsme = vfp->svaf(get_buf_from_fullmv(ref, this_mv), ref_stride, 0, 0, + src_buf, src_stride, &mv_stats->sse, second_pred); + } else { + bestsme = vfp->vf(src_buf, src_stride, get_buf_from_fullmv(ref, this_mv), + ref_stride, &mv_stats->sse); + } + mv_stats->distortion = bestsme; + + const MV sub_this_mv = get_mv_from_fullmv(this_mv); + mv_stats->err_cost = mv_err_cost_(&sub_this_mv, &ms_params->mv_cost_params); + bestsme += mv_stats->err_cost; + + return bestsme; +} + +static INLINE int get_mvpred_compound_sad( + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const struct buf_2d *const src, const uint8_t *const ref_address, + const int ref_stride) { + const aom_variance_fn_ptr_t *vfp = ms_params->vfp; + const uint8_t *src_buf = src->buf; + const int src_stride = src->stride; + + const uint8_t *mask = ms_params->ms_buffers.mask; + const uint8_t *second_pred = ms_params->ms_buffers.second_pred; + const int mask_stride = ms_params->ms_buffers.mask_stride; + const int invert_mask = ms_params->ms_buffers.inv_mask; + + if (mask) { + return vfp->msdf(src_buf, src_stride, ref_address, ref_stride, second_pred, + mask, mask_stride, invert_mask); + } else if (second_pred) { + return vfp->sdaf(src_buf, src_stride, ref_address, ref_stride, second_pred); + } else { + return ms_params->sdf(src_buf, src_stride, ref_address, ref_stride); + } +} + +// Calculates and returns a sad+mvcost list around an integer best pel during +// fullpixel motion search. The resulting list can be used to speed up subpel +// motion search later. +#define USE_SAD_COSTLIST 1 + +// calc_int_cost_list uses var to populate the costlist, which is more accurate +// than sad but slightly slower. +static AOM_FORCE_INLINE void calc_int_cost_list( + const FULLPEL_MV best_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + int *cost_list) { + static const FULLPEL_MV neighbors[4] = { + { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } + }; + const int br = best_mv.row; + const int bc = best_mv.col; + + FULLPEL_MV_STATS mv_stats; + cost_list[0] = get_mvpred_var_cost(ms_params, &best_mv, &mv_stats); + + if (check_bounds(&ms_params->mv_limits, br, bc, 1)) { + for (int i = 0; i < 4; i++) { + const FULLPEL_MV neighbor_mv = { br + neighbors[i].row, + bc + neighbors[i].col }; + cost_list[i + 1] = + get_mvpred_var_cost(ms_params, &neighbor_mv, &mv_stats); + } + } else { + for (int i = 0; i < 4; i++) { + const FULLPEL_MV neighbor_mv = { br + neighbors[i].row, + bc + neighbors[i].col }; + if (!av1_is_fullmv_in_range(&ms_params->mv_limits, neighbor_mv)) { + cost_list[i + 1] = INT_MAX; + } else { + cost_list[i + 1] = + get_mvpred_var_cost(ms_params, &neighbor_mv, &mv_stats); + } + } + } +} + +// calc_int_sad_list uses sad to populate the costlist, which is less accurate +// than var but faster. 
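+// The resulting layout matches the one documented in pattern_search():
+// cost_list[0] holds the cost at the best mv, and cost_list[1..4] hold the
+// costs at its left/below/right/above one-pel neighbours (INT_MAX when a
+// neighbour falls outside the mv limits).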
+static AOM_FORCE_INLINE void calc_int_sad_list(
+    const FULLPEL_MV best_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+    int *cost_list, int costlist_has_sad) {
+  static const FULLPEL_MV neighbors[4] = {
+    { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 }
+  };
+  const struct buf_2d *const src = ms_params->ms_buffers.src;
+  const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+  const int ref_stride = ref->stride;
+  const int br = best_mv.row;
+  const int bc = best_mv.col;
+
+  assert(av1_is_fullmv_in_range(&ms_params->mv_limits, best_mv));
+
+  // Refresh the costlist if it does not contain valid sad
+  if (!costlist_has_sad) {
+    cost_list[0] = get_mvpred_sad(
+        ms_params, src, get_buf_from_fullmv(ref, &best_mv), ref_stride);
+
+    if (check_bounds(&ms_params->mv_limits, br, bc, 1)) {
+      for (int i = 0; i < 4; i++) {
+        const FULLPEL_MV this_mv = { br + neighbors[i].row,
+                                     bc + neighbors[i].col };
+        cost_list[i + 1] = get_mvpred_sad(
+            ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
+      }
+    } else {
+      for (int i = 0; i < 4; i++) {
+        const FULLPEL_MV this_mv = { br + neighbors[i].row,
+                                     bc + neighbors[i].col };
+        if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) {
+          cost_list[i + 1] = INT_MAX;
+        } else {
+          cost_list[i + 1] = get_mvpred_sad(
+              ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
+        }
+      }
+    }
+  }
+
+  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+  cost_list[0] += mvsad_err_cost_(&best_mv, mv_cost_params);
+
+  for (int idx = 0; idx < 4; idx++) {
+    if (cost_list[idx + 1] != INT_MAX) {
+      const FULLPEL_MV this_mv = { br + neighbors[idx].row,
+                                   bc + neighbors[idx].col };
+      cost_list[idx + 1] += mvsad_err_cost_(&this_mv, mv_cost_params);
+    }
+  }
+}
+
+// Computes motion vector cost and adds to the sad cost.
+// Then updates the best sad and motion vectors.
+// Inputs:
+//   this_sad: the sad to be evaluated.
+//   mv: the current motion vector.
+//   mv_cost_params: a structure containing information to compute mv cost.
+//   best_sad: the current best sad.
+//   raw_best_sad (optional): the current best sad without calculating mv cost.
+//   best_mv: the current best motion vector.
+//   second_best_mv (optional): the second best motion vector up to now.
+// Modifies:
+//   best_sad, raw_best_sad, best_mv, second_best_mv,
+//   if the current sad is lower than the current best sad.
+// Returns:
+//   Whether the input sad (mv) is better than the current best.
+static AOM_INLINE int update_mvs_and_sad(const unsigned int this_sad,
+                                         const FULLPEL_MV *mv,
+                                         const MV_COST_PARAMS *mv_cost_params,
+                                         unsigned int *best_sad,
+                                         unsigned int *raw_best_sad,
+                                         FULLPEL_MV *best_mv,
+                                         FULLPEL_MV *second_best_mv) {
+  if (this_sad >= *best_sad) return 0;
+
+  // Add the motion vector cost.
+  const unsigned int sad = this_sad + mvsad_err_cost_(mv, mv_cost_params);
+  if (sad < *best_sad) {
+    if (raw_best_sad) *raw_best_sad = this_sad;
+    *best_sad = sad;
+    if (second_best_mv) *second_best_mv = *best_mv;
+    *best_mv = *mv;
+    return 1;
+  }
+  return 0;
+}
+
+// Calculate sad4 and update the bestmv information
+// in FAST_DIAMOND search method.
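+// The four candidate offsets are evaluated with a single sdx4df() call, which
+// computes four SADs at once (typically a SIMD kernel), rather than four
+// separate sdf() calls; the mv rate term is then added per candidate in
+// update_mvs_and_sad().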
+static AOM_INLINE void calc_sad4_update_bestmv( + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv, + const FULLPEL_MV center_mv, const uint8_t *center_address, + unsigned int *bestsad, unsigned int *raw_bestsad, int search_step, + int *best_site, int cand_start, int *cost_list) { + const struct buf_2d *const src = ms_params->ms_buffers.src; + const struct buf_2d *const ref = ms_params->ms_buffers.ref; + const search_site *site = ms_params->search_sites->site[search_step]; + + unsigned char const *block_offset[4]; + unsigned int sads_buf[4]; + unsigned int *sads; + const uint8_t *src_buf = src->buf; + const int src_stride = src->stride; + if (cost_list) { + sads = (unsigned int *)(cost_list + 1); + } else { + sads = sads_buf; + } + // Loop over number of candidates. + for (int j = 0; j < 4; j++) + block_offset[j] = site[cand_start + j].offset + center_address; + + // 4-point sad calculation. + ms_params->sdx4df(src_buf, src_stride, block_offset, ref->stride, sads); + + for (int j = 0; j < 4; j++) { + const FULLPEL_MV this_mv = { center_mv.row + site[cand_start + j].mv.row, + center_mv.col + site[cand_start + j].mv.col }; + const int found_better_mv = update_mvs_and_sad( + sads[j], &this_mv, mv_cost_params, bestsad, raw_bestsad, best_mv, + /*second_best_mv=*/NULL); + if (found_better_mv) *best_site = cand_start + j; + } +} + +static AOM_INLINE void calc_sad3_update_bestmv( + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv, + FULLPEL_MV center_mv, const uint8_t *center_address, unsigned int *bestsad, + unsigned int *raw_bestsad, int search_step, int *best_site, + const int *chkpts_indices, int *cost_list) { + const struct buf_2d *const src = ms_params->ms_buffers.src; + const struct buf_2d *const ref = ms_params->ms_buffers.ref; + const search_site *site = ms_params->search_sites->site[search_step]; + unsigned char const *block_offset[4] = { + center_address + site[chkpts_indices[0]].offset, + center_address + site[chkpts_indices[1]].offset, + center_address + site[chkpts_indices[2]].offset, + center_address, + }; + unsigned int sads[4]; + ms_params->sdx3df(src->buf, src->stride, block_offset, ref->stride, sads); + for (int j = 0; j < 3; j++) { + const int index = chkpts_indices[j]; + const FULLPEL_MV this_mv = { center_mv.row + site[index].mv.row, + center_mv.col + site[index].mv.col }; + const int found_better_mv = update_mvs_and_sad( + sads[j], &this_mv, mv_cost_params, bestsad, raw_bestsad, best_mv, + /*second_best_mv=*/NULL); + if (found_better_mv) *best_site = j; + } + if (cost_list) { + for (int j = 0; j < 3; j++) { + int index = chkpts_indices[j]; + cost_list[index + 1] = sads[j]; + } + } +} + +// Calculate sad and update the bestmv information +// in FAST_DIAMOND search method. +static AOM_INLINE void calc_sad_update_bestmv( + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv, + const FULLPEL_MV center_mv, const uint8_t *center_address, + unsigned int *bestsad, unsigned int *raw_bestsad, int search_step, + int *best_site, const int num_candidates, int cand_start, int *cost_list) { + const struct buf_2d *const src = ms_params->ms_buffers.src; + const struct buf_2d *const ref = ms_params->ms_buffers.ref; + const search_site *site = ms_params->search_sites->site[search_step]; + // Loop over number of candidates. 
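+  // (cand_start lets the caller skip candidates already covered by the
+  // 4-at-a-time path in calc_sad4_update_bestmv().)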
+ for (int i = cand_start; i < num_candidates; i++) { + const FULLPEL_MV this_mv = { center_mv.row + site[i].mv.row, + center_mv.col + site[i].mv.col }; + if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) continue; + int thissad = get_mvpred_sad(ms_params, src, + center_address + site[i].offset, ref->stride); + if (cost_list) { + cost_list[i + 1] = thissad; + } + const int found_better_mv = update_mvs_and_sad( + thissad, &this_mv, mv_cost_params, bestsad, raw_bestsad, best_mv, + /*second_best_mv=*/NULL); + if (found_better_mv) *best_site = i; + } +} + +static AOM_INLINE void calc_sad_update_bestmv_with_indices( + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv, + const FULLPEL_MV center_mv, const uint8_t *center_address, + unsigned int *bestsad, unsigned int *raw_bestsad, int search_step, + int *best_site, const int num_candidates, const int *chkpts_indices, + int *cost_list) { + const struct buf_2d *const src = ms_params->ms_buffers.src; + const struct buf_2d *const ref = ms_params->ms_buffers.ref; + const search_site *site = ms_params->search_sites->site[search_step]; + // Loop over number of candidates. + for (int i = 0; i < num_candidates; i++) { + int index = chkpts_indices[i]; + const FULLPEL_MV this_mv = { center_mv.row + site[index].mv.row, + center_mv.col + site[index].mv.col }; + if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) { + if (cost_list) { + cost_list[index + 1] = INT_MAX; + } + continue; + } + const int thissad = get_mvpred_sad( + ms_params, src, center_address + site[index].offset, ref->stride); + if (cost_list) { + cost_list[index + 1] = thissad; + } + const int found_better_mv = update_mvs_and_sad( + thissad, &this_mv, mv_cost_params, bestsad, raw_bestsad, best_mv, + /*second_best_mv=*/NULL); + if (found_better_mv) *best_site = i; + } +} + +// Generic pattern search function that searches over multiple scales. 
+// Each scale can have a different number of candidates and shape of +// candidates as indicated in the num_candidates and candidates arrays +// passed into this function +static int pattern_search(FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + int search_step, const int do_init_search, + int *cost_list, FULLPEL_MV *best_mv, + FULLPEL_MV_STATS *best_mv_stats) { + static const int search_steps[MAX_MVSEARCH_STEPS] = { + 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + }; + int i, s, t; + + const struct buf_2d *const src = ms_params->ms_buffers.src; + const struct buf_2d *const ref = ms_params->ms_buffers.ref; + const search_site_config *search_sites = ms_params->search_sites; + const int *num_candidates = search_sites->searches_per_step; + const int ref_stride = ref->stride; + const int last_is_4 = num_candidates[0] == 4; + int br, bc; + unsigned int bestsad = UINT_MAX, raw_bestsad = UINT_MAX; + int k = -1; + const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + search_step = AOMMIN(search_step, MAX_MVSEARCH_STEPS - 1); + assert(search_step >= 0); + int best_init_s = search_steps[search_step]; + // adjust ref_mv to make sure it is within MV range + clamp_fullmv(&start_mv, &ms_params->mv_limits); + br = start_mv.row; + bc = start_mv.col; + if (cost_list != NULL) { + cost_list[0] = cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] = + INT_MAX; + } + int costlist_has_sad = 0; + + // Work out the start point for the search + raw_bestsad = get_mvpred_sad(ms_params, src, + get_buf_from_fullmv(ref, &start_mv), ref_stride); + bestsad = raw_bestsad + mvsad_err_cost_(&start_mv, mv_cost_params); + + // Search all possible scales up to the search param around the center point + // pick the scale of the point that is best as the starting scale of + // further steps around it. + const uint8_t *center_address = get_buf_from_fullmv(ref, &start_mv); + if (do_init_search) { + s = best_init_s; + best_init_s = -1; + for (t = 0; t <= s; ++t) { + int best_site = -1; + FULLPEL_MV center_mv = { br, bc }; + if (check_bounds(&ms_params->mv_limits, br, bc, 1 << t)) { + // Call 4-point sad for multiples of 4 candidates. + const int no_of_4_cand_loops = num_candidates[t] >> 2; + for (i = 0; i < no_of_4_cand_loops; i++) { + calc_sad4_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv, + center_address, &bestsad, &raw_bestsad, t, + &best_site, i * 4, /*cost_list=*/NULL); + } + // Rest of the candidates + const int remaining_cand = num_candidates[t] % 4; + calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv, + center_address, &bestsad, &raw_bestsad, t, + &best_site, remaining_cand, + no_of_4_cand_loops * 4, NULL); + } else { + calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv, + center_address, &bestsad, &raw_bestsad, t, + &best_site, num_candidates[t], 0, NULL); + } + if (best_site == -1) { + continue; + } else { + best_init_s = t; + k = best_site; + } + } + if (best_init_s != -1) { + br += search_sites->site[best_init_s][k].mv.row; + bc += search_sites->site[best_init_s][k].mv.col; + center_address += search_sites->site[best_init_s][k].offset; + } + } + + // If the center point is still the best, just skip this and move to + // the refinement step. 
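+  // (best_init_s == -1 here means no candidate beat the start point during
+  // the initial scale sweep, so the multi-scale refinement below is skipped.)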
+ if (best_init_s != -1) { + const int last_s = (last_is_4 && cost_list != NULL); + int best_site = -1; + s = best_init_s; + + for (; s >= last_s; s--) { + // No need to search all points the 1st time if initial search was used + if (!do_init_search || s != best_init_s) { + FULLPEL_MV center_mv = { br, bc }; + if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) { + // Call 4-point sad for multiples of 4 candidates. + const int no_of_4_cand_loops = num_candidates[s] >> 2; + for (i = 0; i < no_of_4_cand_loops; i++) { + calc_sad4_update_bestmv(ms_params, mv_cost_params, best_mv, + center_mv, center_address, &bestsad, + &raw_bestsad, s, &best_site, i * 4, + /*cost_list=*/NULL); + } + // Rest of the candidates + const int remaining_cand = num_candidates[s] % 4; + calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv, + center_address, &bestsad, &raw_bestsad, s, + &best_site, remaining_cand, + no_of_4_cand_loops * 4, NULL); + } else { + calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv, + center_address, &bestsad, &raw_bestsad, s, + &best_site, num_candidates[s], 0, NULL); + } + + if (best_site == -1) { + continue; + } else { + br += search_sites->site[s][best_site].mv.row; + bc += search_sites->site[s][best_site].mv.col; + center_address += search_sites->site[s][best_site].offset; + k = best_site; + } + } + + do { + int next_chkpts_indices[PATTERN_CANDIDATES_REF]; + best_site = -1; + next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1; + next_chkpts_indices[1] = k; + next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1; + + FULLPEL_MV center_mv = { br, bc }; + if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) { + calc_sad3_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv, + center_address, &bestsad, &raw_bestsad, s, + &best_site, next_chkpts_indices, NULL); + } else { + calc_sad_update_bestmv_with_indices( + ms_params, mv_cost_params, best_mv, center_mv, center_address, + &bestsad, &raw_bestsad, s, &best_site, PATTERN_CANDIDATES_REF, + next_chkpts_indices, NULL); + } + + if (best_site != -1) { + k = next_chkpts_indices[best_site]; + br += search_sites->site[s][k].mv.row; + bc += search_sites->site[s][k].mv.col; + center_address += search_sites->site[s][k].offset; + } + } while (best_site != -1); + } + // Note: If we enter the if below, then cost_list must be non-NULL. + if (s == 0) { + cost_list[0] = raw_bestsad; + costlist_has_sad = 1; + assert(num_candidates[s] == 4); + if (!do_init_search || s != best_init_s) { + FULLPEL_MV center_mv = { br, bc }; + if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) { + calc_sad4_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv, + center_address, &bestsad, &raw_bestsad, s, + &best_site, 0, cost_list); + } else { + calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv, + center_address, &bestsad, &raw_bestsad, s, + &best_site, /*num_candidates=*/4, + /*cand_start=*/0, cost_list); + } + + if (best_site != -1) { + br += search_sites->site[s][best_site].mv.row; + bc += search_sites->site[s][best_site].mv.col; + center_address += search_sites->site[s][best_site].offset; + k = best_site; + } + } + while (best_site != -1) { + int next_chkpts_indices[PATTERN_CANDIDATES_REF]; + best_site = -1; + next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1; + next_chkpts_indices[1] = k; + next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 
0 : k + 1; + cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] = INT_MAX; + cost_list[((k + 2) % 4) + 1] = cost_list[0]; + cost_list[0] = raw_bestsad; + + FULLPEL_MV center_mv = { br, bc }; + if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) { + assert(PATTERN_CANDIDATES_REF == 3); + calc_sad3_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv, + center_address, &bestsad, &raw_bestsad, s, + &best_site, next_chkpts_indices, cost_list); + } else { + calc_sad_update_bestmv_with_indices( + ms_params, mv_cost_params, best_mv, center_mv, center_address, + &bestsad, &raw_bestsad, s, &best_site, PATTERN_CANDIDATES_REF, + next_chkpts_indices, cost_list); + } + + if (best_site != -1) { + k = next_chkpts_indices[best_site]; + br += search_sites->site[s][k].mv.row; + bc += search_sites->site[s][k].mv.col; + center_address += search_sites->site[s][k].offset; + } + } + } + } + best_mv->row = br; + best_mv->col = bc; + + assert(center_address == get_buf_from_fullmv(ref, best_mv) && + "center address is out of sync with best_mv!\n"); + + // Returns the one-away integer pel cost/sad around the best as follows: + // cost_list[0]: cost/sad at the best integer pel + // cost_list[1]: cost/sad at delta {0, -1} (left) from the best integer pel + // cost_list[2]: cost/sad at delta { 1, 0} (bottom) from the best integer pel + // cost_list[3]: cost/sad at delta { 0, 1} (right) from the best integer pel + // cost_list[4]: cost/sad at delta {-1, 0} (top) from the best integer pel + if (cost_list) { + if (USE_SAD_COSTLIST) { + calc_int_sad_list(*best_mv, ms_params, cost_list, costlist_has_sad); + } else { + calc_int_cost_list(*best_mv, ms_params, cost_list); + } + } + + const int var_cost = get_mvpred_var_cost(ms_params, best_mv, best_mv_stats); + return var_cost; +} + +// For the following foo_search, the input arguments are: +// start_mv: where we are starting our motion search +// ms_params: a collection of motion search parameters +// search_step: how many steps to skip in our motion search. For example, +// a value 3 suggests that 3 search steps have already taken place prior to +// this function call, so we jump directly to step 4 of the search process +// do_init_search: if on, do an initial search of all possible scales around the +// start_mv, and then pick the best scale. +// cond_list: used to hold the cost around the best full mv so we can use it to +// speed up subpel search later. 
+// best_mv: the best mv found in the motion search +static int hex_search(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int search_step, const int do_init_search, + int *cost_list, FULLPEL_MV *best_mv, + FULLPEL_MV_STATS *best_mv_stats) { + return pattern_search(start_mv, ms_params, search_step, do_init_search, + cost_list, best_mv, best_mv_stats); +} + +static int bigdia_search(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int search_step, const int do_init_search, + int *cost_list, FULLPEL_MV *best_mv, + FULLPEL_MV_STATS *best_mv_stats) { + return pattern_search(start_mv, ms_params, search_step, do_init_search, + cost_list, best_mv, best_mv_stats); +} + +static int square_search(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int search_step, const int do_init_search, + int *cost_list, FULLPEL_MV *best_mv, + FULLPEL_MV_STATS *best_mv_stats) { + return pattern_search(start_mv, ms_params, search_step, do_init_search, + cost_list, best_mv, best_mv_stats); +} + +static int fast_hex_search(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int search_step, const int do_init_search, + int *cost_list, FULLPEL_MV *best_mv, + FULLPEL_MV_STATS *best_mv_stats) { + return hex_search(start_mv, ms_params, + AOMMAX(MAX_MVSEARCH_STEPS - 2, search_step), do_init_search, + cost_list, best_mv, best_mv_stats); +} + +static int vfast_dia_search(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int search_step, const int do_init_search, + int *cost_list, FULLPEL_MV *best_mv, + FULLPEL_MV_STATS *best_mv_stats) { + return bigdia_search(start_mv, ms_params, + AOMMAX(MAX_MVSEARCH_STEPS - 1, search_step), + do_init_search, cost_list, best_mv, best_mv_stats); +} + +static int fast_dia_search(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int search_step, const int do_init_search, + int *cost_list, FULLPEL_MV *best_mv, + FULLPEL_MV_STATS *best_mv_stats) { + return bigdia_search(start_mv, ms_params, + AOMMAX(MAX_MVSEARCH_STEPS - 2, search_step), + do_init_search, cost_list, best_mv, best_mv_stats); +} + +static int fast_bigdia_search(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int search_step, const int do_init_search, + int *cost_list, FULLPEL_MV *best_mv, + FULLPEL_MV_STATS *best_mv_stats) { + return bigdia_search(start_mv, ms_params, + AOMMAX(MAX_MVSEARCH_STEPS - 3, search_step), + do_init_search, cost_list, best_mv, best_mv_stats); +} + +static int diamond_search_sad(FULLPEL_MV start_mv, unsigned int start_mv_sad, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int search_step, int *num00, + FULLPEL_MV *best_mv, FULLPEL_MV *second_best_mv) { +#define UPDATE_SEARCH_STEP \ + do { \ + if (best_site != 0) { \ + tmp_second_best_mv = *best_mv; \ + best_mv->row += site[best_site].mv.row; \ + best_mv->col += site[best_site].mv.col; \ + best_address += site[best_site].offset; \ + is_off_center = 1; \ + } \ + \ + if (is_off_center == 0) num_center_steps++; \ + \ + if (best_site == 0 && step > 2) { \ + int next_step_size = cfg->radius[step - 1]; \ + while (next_step_size == cfg->radius[step] && step > 2) { \ + num_center_steps++; \ + --step; \ + next_step_size = cfg->radius[step - 1]; \ + } \ + } \ + } while (0) + + const struct buf_2d *const src = ms_params->ms_buffers.src; + const struct buf_2d *const ref = ms_params->ms_buffers.ref; + + const uint8_t *src_buf 
= src->buf; + const int src_stride = src->stride; + const int ref_stride = ref->stride; + + const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + + const search_site_config *cfg = ms_params->search_sites; + + int is_off_center = 0; + // Number of times that we have stayed in the middle. This is used to skip + // search steps in the future if diamond_search_sad is called again. + int num_center_steps = 0; + + // search_step determines the length of the initial step and hence the number + // of iterations. + const int tot_steps = cfg->num_search_steps - search_step; + FULLPEL_MV tmp_second_best_mv; + if (second_best_mv) { + tmp_second_best_mv = *second_best_mv; + } + + *best_mv = start_mv; + + // Check the starting position + const uint8_t *best_address = get_buf_from_fullmv(ref, &start_mv); + unsigned int bestsad = start_mv_sad; + + // TODO(chiyotsai@google.com): Implement 4 points search for msdf&sdaf + if (ms_params->ms_buffers.second_pred) { + for (int step = tot_steps - 1; step >= 0; --step) { + const search_site *site = cfg->site[step]; + const int num_searches = cfg->searches_per_step[step]; + int best_site = 0; + + for (int idx = 1; idx <= num_searches; idx++) { + const FULLPEL_MV this_mv = { best_mv->row + site[idx].mv.row, + best_mv->col + site[idx].mv.col }; + + if (av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) { + const uint8_t *const check_here = site[idx].offset + best_address; + unsigned int thissad = + get_mvpred_compound_sad(ms_params, src, check_here, ref_stride); + + if (thissad < bestsad) { + thissad += mvsad_err_cost_(&this_mv, mv_cost_params); + if (thissad < bestsad) { + bestsad = thissad; + best_site = idx; + } + } + } + } + UPDATE_SEARCH_STEP; + } + } else { + for (int step = tot_steps - 1; step >= 0; --step) { + const search_site *site = cfg->site[step]; + const int num_searches = cfg->searches_per_step[step]; + int best_site = 0; + + int all_in = 1; + // Trap illegal vectors + all_in &= best_mv->row + site[1].mv.row >= ms_params->mv_limits.row_min; + all_in &= best_mv->row + site[2].mv.row <= ms_params->mv_limits.row_max; + all_in &= best_mv->col + site[3].mv.col >= ms_params->mv_limits.col_min; + all_in &= best_mv->col + site[4].mv.col <= ms_params->mv_limits.col_max; + + if (all_in) { + for (int idx = 1; idx <= num_searches; idx += 4) { + unsigned char const *block_offset[4]; + unsigned int sads[4]; + + for (int j = 0; j < 4; j++) + block_offset[j] = site[idx + j].offset + best_address; + + ms_params->sdx4df(src_buf, src_stride, block_offset, ref_stride, + sads); + for (int j = 0; j < 4; j++) { + if (sads[j] < bestsad) { + const FULLPEL_MV this_mv = { best_mv->row + site[idx + j].mv.row, + best_mv->col + + site[idx + j].mv.col }; + unsigned int thissad = + sads[j] + mvsad_err_cost_(&this_mv, mv_cost_params); + if (thissad < bestsad) { + bestsad = thissad; + best_site = idx + j; + } + } + } + } + } else { + for (int idx = 1; idx <= num_searches; idx++) { + const FULLPEL_MV this_mv = { best_mv->row + site[idx].mv.row, + best_mv->col + site[idx].mv.col }; + + if (av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) { + const uint8_t *const check_here = site[idx].offset + best_address; + unsigned int thissad = + get_mvpred_sad(ms_params, src, check_here, ref_stride); + + if (thissad < bestsad) { + thissad += mvsad_err_cost_(&this_mv, mv_cost_params); + if (thissad < bestsad) { + bestsad = thissad; + best_site = idx; + } + } + } + } + } + UPDATE_SEARCH_STEP; + } + } + + *num00 = num_center_steps; + if (second_best_mv) { + *second_best_mv = 
tmp_second_best_mv; + } + + return bestsad; + +#undef UPDATE_SEARCH_STEP +} + +static INLINE unsigned int get_start_mvpred_sad_cost( + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, FULLPEL_MV start_mv) { + const struct buf_2d *const src = ms_params->ms_buffers.src; + const struct buf_2d *const ref = ms_params->ms_buffers.ref; + const uint8_t *best_address = get_buf_from_fullmv(ref, &start_mv); + + unsigned int start_mv_sad = + mvsad_err_cost_(&start_mv, &ms_params->mv_cost_params); + + if (ms_params->ms_buffers.second_pred) + start_mv_sad += + get_mvpred_compound_sad(ms_params, src, best_address, ref->stride); + else + start_mv_sad += get_mvpred_sad(ms_params, src, best_address, ref->stride); + + return start_mv_sad; +} + +static int full_pixel_diamond(FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int step_param, int *cost_list, + FULLPEL_MV *best_mv, + FULLPEL_MV_STATS *best_mv_stats, + FULLPEL_MV *second_best_mv) { + const search_site_config *cfg = ms_params->search_sites; + int thissme, n, num00 = 0; + + // Clamp start mv and calculate the cost + clamp_fullmv(&start_mv, &ms_params->mv_limits); + unsigned int start_mv_sad = get_start_mvpred_sad_cost(ms_params, start_mv); + + diamond_search_sad(start_mv, start_mv_sad, ms_params, step_param, &n, best_mv, + second_best_mv); + + int bestsme = get_mvpred_compound_var_cost(ms_params, best_mv, best_mv_stats); + + // If there won't be more n-step search, check to see if refining search is + // needed. + const int further_steps = cfg->num_search_steps - 1 - step_param; + while (n < further_steps) { + ++n; + + // TODO(chiyotsai@google.com): There is another bug here where the second + // best mv gets incorrectly overwritten. Fix it later. + FULLPEL_MV tmp_best_mv; + FULLPEL_MV_STATS tmp_best_mv_stats; + diamond_search_sad(start_mv, start_mv_sad, ms_params, step_param + n, + &num00, &tmp_best_mv, second_best_mv); + + thissme = get_mvpred_compound_var_cost(ms_params, &tmp_best_mv, + &tmp_best_mv_stats); + + if (thissme < bestsme) { + bestsme = thissme; + *best_mv = tmp_best_mv; + *best_mv_stats = tmp_best_mv_stats; + } + + if (num00) { + // Advance the loop by num00 steps + n += num00; + num00 = 0; + } + } + + // Return cost list. + if (cost_list) { + if (USE_SAD_COSTLIST) { + const int costlist_has_sad = 0; + calc_int_sad_list(*best_mv, ms_params, cost_list, costlist_has_sad); + } else { + calc_int_cost_list(*best_mv, ms_params, cost_list); + } + } + return bestsme; +} + +// Exhaustive motion search around a given centre position with a given +// step size. +static int exhaustive_mesh_search(FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int range, const int step, + FULLPEL_MV *best_mv, + FULLPEL_MV *second_best_mv) { + const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + const struct buf_2d *const src = ms_params->ms_buffers.src; + const struct buf_2d *const ref = ms_params->ms_buffers.ref; + const int ref_stride = ref->stride; + unsigned int best_sad = INT_MAX; + int r, c, i; + int start_col, end_col, start_row, end_row; + const int col_step = (step > 1) ? 
step : 4;
+
+  assert(step >= 1);
+
+  clamp_fullmv(&start_mv, &ms_params->mv_limits);
+  *best_mv = start_mv;
+  best_sad = get_mvpred_sad(ms_params, src, get_buf_from_fullmv(ref, &start_mv),
+                            ref_stride);
+  best_sad += mvsad_err_cost_(&start_mv, mv_cost_params);
+  start_row = AOMMAX(-range, ms_params->mv_limits.row_min - start_mv.row);
+  start_col = AOMMAX(-range, ms_params->mv_limits.col_min - start_mv.col);
+  end_row = AOMMIN(range, ms_params->mv_limits.row_max - start_mv.row);
+  end_col = AOMMIN(range, ms_params->mv_limits.col_max - start_mv.col);
+
+  for (r = start_row; r <= end_row; r += step) {
+    for (c = start_col; c <= end_col; c += col_step) {
+      // Step > 1 means we are not checking every location in this pass.
+      if (step > 1) {
+        const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c };
+        unsigned int sad = get_mvpred_sad(
+            ms_params, src, get_buf_from_fullmv(ref, &mv), ref_stride);
+        update_mvs_and_sad(sad, &mv, mv_cost_params, &best_sad,
+                           /*raw_best_sad=*/NULL, best_mv, second_best_mv);
+      } else {
+        // 4 sads in a single call if we are checking every location
+        if (c + 3 <= end_col) {
+          unsigned int sads[4];
+          const uint8_t *addrs[4];
+          for (i = 0; i < 4; ++i) {
+            const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c + i };
+            addrs[i] = get_buf_from_fullmv(ref, &mv);
+          }
+
+          ms_params->sdx4df(src->buf, src->stride, addrs, ref_stride, sads);
+
+          for (i = 0; i < 4; ++i) {
+            if (sads[i] < best_sad) {
+              const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c + i };
+              update_mvs_and_sad(sads[i], &mv, mv_cost_params, &best_sad,
+                                 /*raw_best_sad=*/NULL, best_mv,
+                                 second_best_mv);
+            }
+          }
+        } else {
+          for (i = 0; i < end_col - c; ++i) {
+            const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c + i };
+            unsigned int sad = get_mvpred_sad(
+                ms_params, src, get_buf_from_fullmv(ref, &mv), ref_stride);
+            update_mvs_and_sad(sad, &mv, mv_cost_params, &best_sad,
+                               /*raw_best_sad=*/NULL, best_mv, second_best_mv);
+          }
+        }
+      }
+    }
+  }
+
+  return best_sad;
+}
+
+// Runs a limited-range exhaustive mesh search using a pattern set
+// according to the encode speed profile.
+static int full_pixel_exhaustive(const FULLPEL_MV start_mv,
+                                 const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                                 const struct MESH_PATTERN *const mesh_patterns,
+                                 int *cost_list, FULLPEL_MV *best_mv,
+                                 FULLPEL_MV_STATS *mv_stats,
+                                 FULLPEL_MV *second_best_mv) {
+  const int kMinRange = 7;
+  const int kMaxRange = 256;
+  const int kMinInterval = 1;
+
+  int bestsme;
+  int i;
+  int interval = mesh_patterns[0].interval;
+  int range = mesh_patterns[0].range;
+  int baseline_interval_divisor;
+
+  // TODO(chiyotsai@google.com): Currently exhaustive search calls the single
+  // ref version of the sad and variance functions. We still need to check the
+  // performance when compound ref exhaustive search is enabled.
+  assert(!ms_params->ms_buffers.second_pred &&
+         "Mesh search does not support compound mode!");
+
+  *best_mv = start_mv;
+
+  // Trap illegal values for interval and range for this function.
+  if ((range < kMinRange) || (range > kMaxRange) || (interval < kMinInterval) ||
+      (interval > range))
+    return INT_MAX;
+
+  baseline_interval_divisor = range / interval;
+
+  // Check the size of the proposed first range against the magnitude of the
+  // centre value used as a starting point.
+  range = AOMMAX(range, (5 * AOMMAX(abs(best_mv->row), abs(best_mv->col))) / 4);
+  range = AOMMIN(range, kMaxRange);
+  interval = AOMMAX(interval, range / baseline_interval_divisor);
+  // Use a small search step/interval for certain kinds of clips.
+  // For example, screen content clips with a lot of text.
+  // A large interval could lead to a false matching position, and the search
+  // then can't find the best global candidate in the following iterations due
+  // to the reduced search range. The solution here is to use a small search
+  // interval in the beginning and thus reduce the chance of missing the best
+  // candidate.
+  if (ms_params->fine_search_interval) {
+    interval = AOMMIN(interval, 4);
+  }
+
+  // initial search
+  bestsme = exhaustive_mesh_search(*best_mv, ms_params, range, interval,
+                                   best_mv, second_best_mv);
+
+  if ((interval > kMinInterval) && (range > kMinRange)) {
+    // Progressive searches with range and step size decreasing each time
+    // till we reach a step size of 1. Then break out.
+    for (i = 1; i < MAX_MESH_STEP; ++i) {
+      // First pass with coarser step and longer range
+      bestsme = exhaustive_mesh_search(
+          *best_mv, ms_params, mesh_patterns[i].range,
+          mesh_patterns[i].interval, best_mv, second_best_mv);
+
+      if (mesh_patterns[i].interval == 1) break;
+    }
+  }
+
+  if (bestsme < INT_MAX) {
+    bestsme = get_mvpred_var_cost(ms_params, best_mv, mv_stats);
+  }
+
+  // Return cost list.
+  if (cost_list) {
+    if (USE_SAD_COSTLIST) {
+      const int costlist_has_sad = 0;
+      calc_int_sad_list(*best_mv, ms_params, cost_list, costlist_has_sad);
+    } else {
+      calc_int_cost_list(*best_mv, ms_params, cost_list);
+    }
+  }
+  return bestsme;
+}
+
+// This function is called when we do joint motion search in comp_inter_inter
+// mode, or when searching for one component of an ext-inter compound mode.
+int av1_refining_search_8p_c(const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                             const FULLPEL_MV start_mv, FULLPEL_MV *best_mv) {
+  static const search_neighbors neighbors[8] = {
+    { { -1, 0 }, -1 * SEARCH_GRID_STRIDE_8P + 0 },
+    { { 0, -1 }, 0 * SEARCH_GRID_STRIDE_8P - 1 },
+    { { 0, 1 }, 0 * SEARCH_GRID_STRIDE_8P + 1 },
+    { { 1, 0 }, 1 * SEARCH_GRID_STRIDE_8P + 0 },
+    { { -1, -1 }, -1 * SEARCH_GRID_STRIDE_8P - 1 },
+    { { 1, -1 }, 1 * SEARCH_GRID_STRIDE_8P - 1 },
+    { { -1, 1 }, -1 * SEARCH_GRID_STRIDE_8P + 1 },
+    { { 1, 1 }, 1 * SEARCH_GRID_STRIDE_8P + 1 }
+  };
+
+  uint8_t do_refine_search_grid[SEARCH_GRID_STRIDE_8P *
+                                SEARCH_GRID_STRIDE_8P] = { 0 };
+  int grid_center = SEARCH_GRID_CENTER_8P;
+  int grid_coord = grid_center;
+
+  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+  const FullMvLimits *mv_limits = &ms_params->mv_limits;
+  const MSBuffers *ms_buffers = &ms_params->ms_buffers;
+  const struct buf_2d *src = ms_buffers->src;
+  const struct buf_2d *ref = ms_buffers->ref;
+  const int ref_stride = ref->stride;
+
+  *best_mv = start_mv;
+  clamp_fullmv(best_mv, mv_limits);
+
+  unsigned int best_sad = get_mvpred_compound_sad(
+      ms_params, src, get_buf_from_fullmv(ref, best_mv), ref_stride);
+  best_sad += mvsad_err_cost_(best_mv, mv_cost_params);
+
+  do_refine_search_grid[grid_coord] = 1;
+
+  for (int i = 0; i < SEARCH_RANGE_8P; ++i) {
+    int best_site = -1;
+
+    for (int j = 0; j < 8; ++j) {
+      grid_coord = grid_center + neighbors[j].coord_offset;
+      if (do_refine_search_grid[grid_coord] == 1) {
+        continue;
+      }
+      const FULLPEL_MV mv = { best_mv->row + neighbors[j].coord.row,
+                              best_mv->col + neighbors[j].coord.col };
+
+      do_refine_search_grid[grid_coord] = 1;
+      if (av1_is_fullmv_in_range(mv_limits, mv)) {
+        unsigned int sad;
+        sad = get_mvpred_compound_sad(
+            ms_params, src, get_buf_from_fullmv(ref, &mv), ref_stride);
+        if (sad < best_sad) {
+          sad += mvsad_err_cost_(&mv, mv_cost_params);
+
+          if (sad < best_sad) {
+            best_sad = sad;
+            best_site = j;
+          }
+ } + } + } + + if (best_site == -1) { + break; + } else { + best_mv->row += neighbors[best_site].coord.row; + best_mv->col += neighbors[best_site].coord.col; + grid_center += neighbors[best_site].coord_offset; + } + } + return best_sad; +} + +int av1_full_pixel_search(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int step_param, int *cost_list, + FULLPEL_MV *best_mv, FULLPEL_MV_STATS *best_mv_stats, + FULLPEL_MV *second_best_mv) { + const BLOCK_SIZE bsize = ms_params->bsize; + const SEARCH_METHODS search_method = ms_params->search_method; + + const int is_intra_mode = ms_params->is_intra_mode; + int run_mesh_search = ms_params->run_mesh_search; + + int var = 0; + MARK_MV_INVALID(best_mv); + if (second_best_mv) { + MARK_MV_INVALID(second_best_mv); + } + + if (cost_list) { + cost_list[0] = INT_MAX; + cost_list[1] = INT_MAX; + cost_list[2] = INT_MAX; + cost_list[3] = INT_MAX; + cost_list[4] = INT_MAX; + } + + assert(ms_params->ms_buffers.ref->stride == ms_params->search_sites->stride); + assert(ms_params->ms_buffers.ref->width == ms_params->ms_buffers.src->width); + + switch (search_method) { + case FAST_BIGDIA: + var = fast_bigdia_search(start_mv, ms_params, step_param, 0, cost_list, + best_mv, best_mv_stats); + break; + case VFAST_DIAMOND: + var = vfast_dia_search(start_mv, ms_params, step_param, 0, cost_list, + best_mv, best_mv_stats); + break; + case FAST_DIAMOND: + var = fast_dia_search(start_mv, ms_params, step_param, 0, cost_list, + best_mv, best_mv_stats); + break; + case FAST_HEX: + var = fast_hex_search(start_mv, ms_params, step_param, 0, cost_list, + best_mv, best_mv_stats); + break; + case HEX: + var = hex_search(start_mv, ms_params, step_param, 1, cost_list, best_mv, + best_mv_stats); + break; + case SQUARE: + var = square_search(start_mv, ms_params, step_param, 1, cost_list, + best_mv, best_mv_stats); + break; + case BIGDIA: + var = bigdia_search(start_mv, ms_params, step_param, 1, cost_list, + best_mv, best_mv_stats); + break; + case NSTEP: + case NSTEP_8PT: + case DIAMOND: + case CLAMPED_DIAMOND: + var = full_pixel_diamond(start_mv, ms_params, step_param, cost_list, + best_mv, best_mv_stats, second_best_mv); + break; + default: assert(0 && "Invalid search method."); + } + + // Should we allow a follow on exhaustive search? + if (!run_mesh_search && + ((search_method == NSTEP) || (search_method == NSTEP_8PT)) && + !ms_params->ms_buffers.second_pred) { + int exhaustive_thr = ms_params->force_mesh_thresh; + exhaustive_thr >>= + 10 - (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]); + // Threshold variance for an exhaustive full search. + if (var > exhaustive_thr) run_mesh_search = 1; + } + + // TODO(yunqing): the following is used to reduce mesh search in temporal + // filtering. Can extend it to intrabc. + if (!is_intra_mode && ms_params->prune_mesh_search) { + const int full_pel_mv_diff = AOMMAX(abs(start_mv.row - best_mv->row), + abs(start_mv.col - best_mv->col)); + if (full_pel_mv_diff <= ms_params->mesh_search_mv_diff_threshold) { + run_mesh_search = 0; + } + } + + if (ms_params->sdf != ms_params->vfp->sdf) { + // If we are skipping rows when we perform the motion search, we need to + // check the quality of skipping. If it's bad, then we run mesh search with + // skip row features off. + // TODO(chiyotsai@google.com): Handle the case where we have a vertical + // offset of 1 before we hit this statement to avoid having to redo + // motion search. 
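+    // As a worked example of the check below: with sad = 1000 and
+    // skip_sad = 50, abs(skip_sad - sad) * 10 == 9500 exceeds
+    // AOMMAX(sad, 1) * 9 == 9000, so the row-skipping estimate deviates from
+    // the true SAD by at least 90% and the search is redone without skipping.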
+ const struct buf_2d *src = ms_params->ms_buffers.src; + const struct buf_2d *ref = ms_params->ms_buffers.ref; + const int src_stride = src->stride; + const int ref_stride = ref->stride; + + const uint8_t *src_address = src->buf; + const uint8_t *best_address = get_buf_from_fullmv(ref, best_mv); + const int sad = + ms_params->vfp->sdf(src_address, src_stride, best_address, ref_stride); + const int skip_sad = + ms_params->vfp->sdsf(src_address, src_stride, best_address, ref_stride); + // We will keep the result of skipping rows if it's good enough. Here, good + // enough means the error is less than 1 per pixel. + const int kSADThresh = + 1 << (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]); + if (sad > kSADThresh && abs(skip_sad - sad) * 10 >= AOMMAX(sad, 1) * 9) { + // There is a large discrepancy between skipping and not skipping, so we + // need to redo the motion search. + FULLPEL_MOTION_SEARCH_PARAMS new_ms_params = *ms_params; + new_ms_params.sdf = new_ms_params.vfp->sdf; + new_ms_params.sdx4df = new_ms_params.vfp->sdx4df; + new_ms_params.sdx3df = new_ms_params.vfp->sdx3df; + + return av1_full_pixel_search(start_mv, &new_ms_params, step_param, + cost_list, best_mv, best_mv_stats, + second_best_mv); + } + } + + if (run_mesh_search) { + int var_ex; + FULLPEL_MV tmp_mv_ex; + FULLPEL_MV_STATS tmp_mv_stats; + // Pick the mesh pattern for exhaustive search based on the toolset (intraBC + // or non-intraBC) + // TODO(chiyotsai@google.com): There is a bug here where the second best mv + // gets overwritten without actually comparing the rdcost. + const MESH_PATTERN *const mesh_patterns = + ms_params->mesh_patterns[is_intra_mode]; + // TODO(chiyotsai@google.com): the second best mv is not set correctly by + // full_pixel_exhaustive, which can incorrectly override it. 
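+    // full_pixel_exhaustive walks the mesh_patterns (range, interval) pairs
+    // in order, shrinking both until the interval reaches 1. For instance, a
+    // pattern set such as { { 64, 4 }, { 32, 2 }, { 16, 1 } } (illustrative
+    // values only, not the actual speed-feature tables) first scans +/-64 at
+    // a stride of 4, then rescans smaller ranges around the refined centre.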
+ var_ex = + full_pixel_exhaustive(*best_mv, ms_params, mesh_patterns, cost_list, + &tmp_mv_ex, &tmp_mv_stats, second_best_mv); + if (var_ex < var) { + var = var_ex; + *best_mv_stats = tmp_mv_stats; + *best_mv = tmp_mv_ex; + } + } + + return var; +} + +int av1_intrabc_hash_search(const AV1_COMP *cpi, const MACROBLOCKD *xd, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + IntraBCHashInfo *intrabc_hash_info, + FULLPEL_MV *best_mv) { + if (!av1_use_hash_me(cpi)) return INT_MAX; + + const BLOCK_SIZE bsize = ms_params->bsize; + const int block_width = block_size_wide[bsize]; + const int block_height = block_size_high[bsize]; + + if (block_width != block_height) return INT_MAX; + + const FullMvLimits *mv_limits = &ms_params->mv_limits; + const MSBuffers *ms_buffer = &ms_params->ms_buffers; + + const uint8_t *src = ms_buffer->src->buf; + const int src_stride = ms_buffer->src->stride; + + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + const int x_pos = mi_col * MI_SIZE; + const int y_pos = mi_row * MI_SIZE; + + uint32_t hash_value1, hash_value2; + int best_hash_cost = INT_MAX; + + // for the hashMap + hash_table *ref_frame_hash = &intrabc_hash_info->intrabc_hash_table; + + av1_get_block_hash_value(intrabc_hash_info, src, src_stride, block_width, + &hash_value1, &hash_value2, is_cur_buf_hbd(xd)); + + const int count = av1_hash_table_count(ref_frame_hash, hash_value1); + if (count <= 1) { + return INT_MAX; + } + + Iterator iterator = av1_hash_get_first_iterator(ref_frame_hash, hash_value1); + for (int i = 0; i < count; i++, aom_iterator_increment(&iterator)) { + block_hash ref_block_hash = *(block_hash *)(aom_iterator_get(&iterator)); + if (hash_value2 == ref_block_hash.hash_value2) { + // Make sure the prediction is from valid area. + const MV dv = { GET_MV_SUBPEL(ref_block_hash.y - y_pos), + GET_MV_SUBPEL(ref_block_hash.x - x_pos) }; + if (!av1_is_dv_valid(dv, &cpi->common, xd, mi_row, mi_col, bsize, + cpi->common.seq_params->mib_size_log2)) + continue; + + FULLPEL_MV hash_mv; + hash_mv.col = ref_block_hash.x - x_pos; + hash_mv.row = ref_block_hash.y - y_pos; + if (!av1_is_fullmv_in_range(mv_limits, hash_mv)) continue; + FULLPEL_MV_STATS mv_stats; + const int refCost = get_mvpred_var_cost(ms_params, &hash_mv, &mv_stats); + if (refCost < best_hash_cost) { + best_hash_cost = refCost; + *best_mv = hash_mv; + } + } + } + + return best_hash_cost; +} + +static int vector_match(int16_t *ref, int16_t *src, int bwl, int search_size, + int full_search, int *sad) { + int best_sad = INT_MAX; + int this_sad; + int d; + int center, offset = 0; + int bw = search_size << 1; + + if (full_search) { + for (d = 0; d <= bw; d++) { + this_sad = aom_vector_var(&ref[d], src, bwl); + if (this_sad < best_sad) { + best_sad = this_sad; + offset = d; + } + } + center = offset; + *sad = best_sad; + return (center - (bw >> 1)); + } + + for (d = 0; d <= bw; d += 16) { + this_sad = aom_vector_var(&ref[d], src, bwl); + if (this_sad < best_sad) { + best_sad = this_sad; + offset = d; + } + } + center = offset; + + for (d = -8; d <= 8; d += 16) { + int this_pos = offset + d; + // check limit + if (this_pos < 0 || this_pos > bw) continue; + this_sad = aom_vector_var(&ref[this_pos], src, bwl); + if (this_sad < best_sad) { + best_sad = this_sad; + center = this_pos; + } + } + offset = center; + + for (d = -4; d <= 4; d += 8) { + int this_pos = offset + d; + // check limit + if (this_pos < 0 || this_pos > bw) continue; + this_sad = aom_vector_var(&ref[this_pos], src, bwl); + if (this_sad < best_sad) { + best_sad = 
this_sad; + center = this_pos; + } + } + offset = center; + + for (d = -2; d <= 2; d += 4) { + int this_pos = offset + d; + // check limit + if (this_pos < 0 || this_pos > bw) continue; + this_sad = aom_vector_var(&ref[this_pos], src, bwl); + if (this_sad < best_sad) { + best_sad = this_sad; + center = this_pos; + } + } + offset = center; + + for (d = -1; d <= 1; d += 2) { + int this_pos = offset + d; + // check limit + if (this_pos < 0 || this_pos > bw) continue; + this_sad = aom_vector_var(&ref[this_pos], src, bwl); + if (this_sad < best_sad) { + best_sad = this_sad; + center = this_pos; + } + } + *sad = best_sad; + return (center - (bw >> 1)); +} + +// A special fast version of motion search used in rt mode. +// The search window along columns and row is given by: +// +/- me_search_size_col/row. +unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_row, + int mi_col, const MV *ref_mv, + unsigned int *y_sad_zero, + int me_search_size_col, + int me_search_size_row) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mi = xd->mi[0]; + struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } }; + int idx; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + const int is_screen = cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN; + const int full_search = is_screen; + const bool screen_scroll_superblock = + is_screen && bsize == cm->seq_params->sb_size; + // Keep border a multiple of 16. + const int border = (cpi->oxcf.border_in_pixels >> 4) << 4; + int search_size_width = me_search_size_col; + int search_size_height = me_search_size_row; + // Adjust based on boundary. + if (((mi_col << 2) - search_size_width < -border) || + ((mi_col << 2) + search_size_width > cm->width + border)) + search_size_width = border; + if (((mi_row << 2) - search_size_height < -border) || + ((mi_row << 2) + search_size_height > cm->height + border)) + search_size_height = border; + const int src_stride = x->plane[0].src.stride; + const int ref_stride = xd->plane[0].pre[0].stride; + uint8_t const *ref_buf, *src_buf; + int_mv *best_int_mv = &xd->mi[0]->mv[0]; + unsigned int best_sad, tmp_sad, this_sad[4]; + int best_sad_col, best_sad_row; + const int row_norm_factor = mi_size_high_log2[bsize] + 1; + const int col_norm_factor = 3 + (bw >> 5); + const YV12_BUFFER_CONFIG *scaled_ref_frame = + av1_get_scaled_ref_frame(cpi, mi->ref_frame[0]); + static const MV search_pos[4] = { + { -1, 0 }, + { 0, -1 }, + { 0, 1 }, + { 1, 0 }, + }; + + if (scaled_ref_frame) { + int i; + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // motion search code to be used without additional modifications. 
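+    // The original, unscaled planes are saved in backup_yv12 here and
+    // restored on every return path below.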
+ for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0]; + av1_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL, + MAX_MB_PLANE); + } + + if (xd->bd != 8) { + best_int_mv->as_fullmv = kZeroFullMv; + best_sad = cpi->ppi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride, + xd->plane[0].pre[0].buf, ref_stride); + + if (scaled_ref_frame) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i]; + } + return best_sad; + } + const int width_ref_buf = (search_size_width << 1) + bw; + const int height_ref_buf = (search_size_height << 1) + bh; + int16_t *hbuf = (int16_t *)aom_malloc(width_ref_buf * sizeof(*hbuf)); + int16_t *vbuf = (int16_t *)aom_malloc(height_ref_buf * sizeof(*vbuf)); + int16_t *src_hbuf = (int16_t *)aom_malloc(bw * sizeof(*src_hbuf)); + int16_t *src_vbuf = (int16_t *)aom_malloc(bh * sizeof(*src_vbuf)); + if (!hbuf || !vbuf || !src_hbuf || !src_vbuf) { + aom_free(hbuf); + aom_free(vbuf); + aom_free(src_hbuf); + aom_free(src_vbuf); + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate hbuf, vbuf, src_hbuf, or src_vbuf"); + } + + // Set up prediction 1-D reference set for rows. + ref_buf = xd->plane[0].pre[0].buf - search_size_width; + aom_int_pro_row(hbuf, ref_buf, ref_stride, width_ref_buf, bh, + row_norm_factor); + + // Set up prediction 1-D reference set for cols + ref_buf = xd->plane[0].pre[0].buf - search_size_height * ref_stride; + aom_int_pro_col(vbuf, ref_buf, ref_stride, bw, height_ref_buf, + col_norm_factor); + + // Set up src 1-D reference set + src_buf = x->plane[0].src.buf; + aom_int_pro_row(src_hbuf, src_buf, src_stride, bw, bh, row_norm_factor); + aom_int_pro_col(src_vbuf, src_buf, src_stride, bw, bh, col_norm_factor); + + // Find the best match per 1-D search + best_int_mv->as_fullmv.col = + vector_match(hbuf, src_hbuf, mi_size_wide_log2[bsize], search_size_width, + full_search, &best_sad_col); + best_int_mv->as_fullmv.row = + vector_match(vbuf, src_vbuf, mi_size_high_log2[bsize], search_size_height, + full_search, &best_sad_row); + + // For screen: select between horiz or vert motion. + if (is_screen) { + if (best_sad_col < best_sad_row) + best_int_mv->as_fullmv.row = 0; + else + best_int_mv->as_fullmv.col = 0; + } + + FULLPEL_MV this_mv = best_int_mv->as_fullmv; + src_buf = x->plane[0].src.buf; + ref_buf = get_buf_from_fullmv(&xd->plane[0].pre[0], &this_mv); + best_sad = + cpi->ppi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride); + + // Evaluate zero MV if found MV is non-zero. 
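+  // The zero MV is a common winner on static content; its SAD is also
+  // reported back to the caller through y_sad_zero in either branch.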
+ if (best_int_mv->as_int != 0) { + tmp_sad = cpi->ppi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride, + xd->plane[0].pre[0].buf, ref_stride); + *y_sad_zero = tmp_sad; + if (tmp_sad < best_sad) { + best_int_mv->as_fullmv = kZeroFullMv; + this_mv = best_int_mv->as_fullmv; + ref_buf = xd->plane[0].pre[0].buf; + best_sad = tmp_sad; + } + } else { + *y_sad_zero = best_sad; + } + + if (!screen_scroll_superblock) { + const uint8_t *const pos[4] = { + ref_buf - ref_stride, + ref_buf - 1, + ref_buf + 1, + ref_buf + ref_stride, + }; + + cpi->ppi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, + this_sad); + + for (idx = 0; idx < 4; ++idx) { + if (this_sad[idx] < best_sad) { + best_sad = this_sad[idx]; + best_int_mv->as_fullmv.row = search_pos[idx].row + this_mv.row; + best_int_mv->as_fullmv.col = search_pos[idx].col + this_mv.col; + } + } + + if (this_sad[0] < this_sad[3]) + this_mv.row -= 1; + else + this_mv.row += 1; + + if (this_sad[1] < this_sad[2]) + this_mv.col -= 1; + else + this_mv.col += 1; + + ref_buf = get_buf_from_fullmv(&xd->plane[0].pre[0], &this_mv); + + tmp_sad = + cpi->ppi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride); + if (best_sad > tmp_sad) { + best_int_mv->as_fullmv = this_mv; + best_sad = tmp_sad; + } + } + + FullMvLimits mv_limits = x->mv_limits; + av1_set_mv_search_range(&mv_limits, ref_mv); + clamp_fullmv(&best_int_mv->as_fullmv, &mv_limits); + + convert_fullmv_to_mv(best_int_mv); + + if (scaled_ref_frame) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i]; + } + + aom_free(hbuf); + aom_free(vbuf); + aom_free(src_hbuf); + aom_free(src_vbuf); + return best_sad; +} + +// ============================================================================= +// Fullpixel Motion Search: OBMC +// ============================================================================= +static INLINE int get_obmc_mvpred_var( + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv) { + const aom_variance_fn_ptr_t *vfp = ms_params->vfp; + const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + const MSBuffers *ms_buffers = &ms_params->ms_buffers; + const int32_t *wsrc = ms_buffers->wsrc; + const int32_t *mask = ms_buffers->obmc_mask; + const struct buf_2d *ref_buf = ms_buffers->ref; + + const MV mv = get_mv_from_fullmv(this_mv); + unsigned int unused; + + return vfp->ovf(get_buf_from_fullmv(ref_buf, this_mv), ref_buf->stride, wsrc, + mask, &unused) + + mv_err_cost_(&mv, mv_cost_params); +} + +static int obmc_refining_search_sad( + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, FULLPEL_MV *best_mv) { + const aom_variance_fn_ptr_t *fn_ptr = ms_params->vfp; + const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + const MSBuffers *ms_buffers = &ms_params->ms_buffers; + const int32_t *wsrc = ms_buffers->wsrc; + const int32_t *mask = ms_buffers->obmc_mask; + const struct buf_2d *ref_buf = ms_buffers->ref; + const FULLPEL_MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } }; + const int kSearchRange = 8; + + unsigned int best_sad = fn_ptr->osdf(get_buf_from_fullmv(ref_buf, best_mv), + ref_buf->stride, wsrc, mask) + + mvsad_err_cost_(best_mv, mv_cost_params); + + for (int i = 0; i < kSearchRange; i++) { + int best_site = -1; + + for (int j = 0; j < 4; j++) { + const FULLPEL_MV mv = { best_mv->row + neighbors[j].row, + best_mv->col + neighbors[j].col }; + if (av1_is_fullmv_in_range(&ms_params->mv_limits, mv)) { + unsigned int sad = fn_ptr->osdf(get_buf_from_fullmv(ref_buf, &mv), + 
ref_buf->stride, wsrc, mask); + if (sad < best_sad) { + sad += mvsad_err_cost_(&mv, mv_cost_params); + + if (sad < best_sad) { + best_sad = sad; + best_site = j; + } + } + } + } + + if (best_site == -1) { + break; + } else { + best_mv->row += neighbors[best_site].row; + best_mv->col += neighbors[best_site].col; + } + } + return best_sad; +} + +static int obmc_diamond_search_sad( + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, FULLPEL_MV start_mv, + FULLPEL_MV *best_mv, int search_step, int *num00) { + const aom_variance_fn_ptr_t *fn_ptr = ms_params->vfp; + const search_site_config *cfg = ms_params->search_sites; + const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + const MSBuffers *ms_buffers = &ms_params->ms_buffers; + const int32_t *wsrc = ms_buffers->wsrc; + const int32_t *mask = ms_buffers->obmc_mask; + const struct buf_2d *const ref_buf = ms_buffers->ref; + + // search_step determines the length of the initial step and hence the number + // of iterations. + const int tot_steps = cfg->num_search_steps - search_step; + const uint8_t *best_address, *init_ref; + int best_sad = INT_MAX; + int best_site = 0; + + clamp_fullmv(&start_mv, &ms_params->mv_limits); + best_address = init_ref = get_buf_from_fullmv(ref_buf, &start_mv); + *num00 = 0; + *best_mv = start_mv; + + // Check the starting position + best_sad = fn_ptr->osdf(best_address, ref_buf->stride, wsrc, mask) + + mvsad_err_cost_(best_mv, mv_cost_params); + + for (int step = tot_steps - 1; step >= 0; --step) { + const search_site *const site = cfg->site[step]; + best_site = 0; + for (int idx = 1; idx <= cfg->searches_per_step[step]; ++idx) { + const FULLPEL_MV mv = { best_mv->row + site[idx].mv.row, + best_mv->col + site[idx].mv.col }; + if (av1_is_fullmv_in_range(&ms_params->mv_limits, mv)) { + int sad = fn_ptr->osdf(best_address + site[idx].offset, ref_buf->stride, + wsrc, mask); + if (sad < best_sad) { + sad += mvsad_err_cost_(&mv, mv_cost_params); + + if (sad < best_sad) { + best_sad = sad; + best_site = idx; + } + } + } + } + + if (best_site != 0) { + best_mv->row += site[best_site].mv.row; + best_mv->col += site[best_site].mv.col; + best_address += site[best_site].offset; + } else if (best_address == init_ref) { + (*num00)++; + } + } + return best_sad; +} + +static int obmc_full_pixel_diamond( + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV start_mv, + int step_param, FULLPEL_MV *best_mv) { + const search_site_config *cfg = ms_params->search_sites; + FULLPEL_MV tmp_mv; + int thissme, n, num00 = 0; + int bestsme = + obmc_diamond_search_sad(ms_params, start_mv, &tmp_mv, step_param, &n); + if (bestsme < INT_MAX) bestsme = get_obmc_mvpred_var(ms_params, &tmp_mv); + *best_mv = tmp_mv; + + // If there won't be more n-step search, check to see if refining search is + // needed. 
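+  // num00 counts the step sizes at which obmc_diamond_search_sad stayed at
+  // the centre; those steps are skipped below rather than searched again.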
+  const int further_steps = cfg->num_search_steps - 1 - step_param;
+
+  while (n < further_steps) {
+    ++n;
+
+    if (num00) {
+      num00--;
+    } else {
+      thissme = obmc_diamond_search_sad(ms_params, start_mv, &tmp_mv,
+                                        step_param + n, &num00);
+      if (thissme < INT_MAX) thissme = get_obmc_mvpred_var(ms_params, &tmp_mv);
+
+      if (thissme < bestsme) {
+        bestsme = thissme;
+        *best_mv = tmp_mv;
+      }
+    }
+  }
+
+  return bestsme;
+}
+
+int av1_obmc_full_pixel_search(const FULLPEL_MV start_mv,
+                               const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                               const int step_param, FULLPEL_MV *best_mv) {
+  if (!ms_params->fast_obmc_search) {
+    const int bestsme =
+        obmc_full_pixel_diamond(ms_params, start_mv, step_param, best_mv);
+    return bestsme;
+  } else {
+    *best_mv = start_mv;
+    clamp_fullmv(best_mv, &ms_params->mv_limits);
+    int thissme = obmc_refining_search_sad(ms_params, best_mv);
+    if (thissme < INT_MAX) thissme = get_obmc_mvpred_var(ms_params, best_mv);
+    return thissme;
+  }
+}
+
+// =============================================================================
+// Subpixel Motion Search: Translational
+// =============================================================================
+#define INIT_SUBPEL_STEP_SIZE (4)
+/*
+ * To avoid the penalty of a cache-line-crossing read, preload the reference
+ * area into a small buffer, which is aligned to make sure there won't be a
+ * cache-line-crossing read while reading from this buffer. This reduces the
+ * CPU cycles spent on reading ref data in sub-pixel filter functions.
+ * TODO: Currently, since the sub-pixel search range here is -3 ~ 3, copy a
+ * 22 rows x 32 cols area that is enough for a 16x16 macroblock. Later, for
+ * SPLITMV, we could reduce the area.
+ */
+
+// Returns the subpel offset used by various subpel variance functions [m]sv[a]f
+static INLINE int get_subpel_part(int x) { return x & 7; }
+
+// Gets the address of the ref buffer at subpel location (r, c), rounded to the
+// nearest fullpel precision toward -\infty
+static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf,
+                                             const MV mv) {
+  const int offset = (mv.row >> 3) * buf->stride + (mv.col >> 3);
+  return &buf->buf[offset];
+}
+
+// Estimates the variance of the prediction residue using a bilinear filter
+// for fast search.
+static INLINE int estimated_pref_error(
+    const MV *this_mv, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    unsigned int *sse) {
+  const aom_variance_fn_ptr_t *vfp = var_params->vfp;
+
+  const MSBuffers *ms_buffers = &var_params->ms_buffers;
+  const uint8_t *src = ms_buffers->src->buf;
+  const uint8_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv);
+  const int src_stride = ms_buffers->src->stride;
+  const int ref_stride = ms_buffers->ref->stride;
+  const uint8_t *second_pred = ms_buffers->second_pred;
+  const uint8_t *mask = ms_buffers->mask;
+  const int mask_stride = ms_buffers->mask_stride;
+  const int invert_mask = ms_buffers->inv_mask;
+
+  const int subpel_x_q3 = get_subpel_part(this_mv->col);
+  const int subpel_y_q3 = get_subpel_part(this_mv->row);
+
+  if (second_pred == NULL) {
+    return vfp->svf(ref, ref_stride, subpel_x_q3, subpel_y_q3, src, src_stride,
+                    sse);
+  } else if (mask) {
+    return vfp->msvf(ref, ref_stride, subpel_x_q3, subpel_y_q3, src, src_stride,
+                     second_pred, mask, mask_stride, invert_mask, sse);
+  } else {
+    return vfp->svaf(ref, ref_stride, subpel_x_q3, subpel_y_q3, src, src_stride,
+                     sse, second_pred);
+  }
+}
+
+// Calculates the variance of the prediction residue.
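+// Unlike estimated_pref_error above, this builds the exact prediction with
+// aom_upsampled_pred (or its compound/masked variants) before measuring the
+// variance, so it is slower but matches the interpolation used for the final
+// prediction.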
+static int upsampled_pref_error(MACROBLOCKD *xd, const AV1_COMMON *cm,
+                                const MV *this_mv,
+                                const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+                                unsigned int *sse) {
+  const aom_variance_fn_ptr_t *vfp = var_params->vfp;
+  const SUBPEL_SEARCH_TYPE subpel_search_type = var_params->subpel_search_type;
+
+  const MSBuffers *ms_buffers = &var_params->ms_buffers;
+  const uint8_t *src = ms_buffers->src->buf;
+  const uint8_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv);
+  const int src_stride = ms_buffers->src->stride;
+  const int ref_stride = ms_buffers->ref->stride;
+  const uint8_t *second_pred = ms_buffers->second_pred;
+  const uint8_t *mask = ms_buffers->mask;
+  const int mask_stride = ms_buffers->mask_stride;
+  const int invert_mask = ms_buffers->inv_mask;
+  const int w = var_params->w;
+  const int h = var_params->h;
+
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  const int subpel_x_q3 = get_subpel_part(this_mv->col);
+  const int subpel_y_q3 = get_subpel_part(this_mv->row);
+
+  unsigned int besterr;
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (is_cur_buf_hbd(xd)) {
+    DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
+    uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred16);
+    if (second_pred != NULL) {
+      if (mask) {
+        aom_highbd_comp_mask_upsampled_pred(
+            xd, cm, mi_row, mi_col, this_mv, pred8, second_pred, w, h,
+            subpel_x_q3, subpel_y_q3, ref, ref_stride, mask, mask_stride,
+            invert_mask, xd->bd, subpel_search_type);
+      } else {
+        aom_highbd_comp_avg_upsampled_pred(
+            xd, cm, mi_row, mi_col, this_mv, pred8, second_pred, w, h,
+            subpel_x_q3, subpel_y_q3, ref, ref_stride, xd->bd,
+            subpel_search_type);
+      }
+    } else {
+      aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred8, w, h,
+                                subpel_x_q3, subpel_y_q3, ref, ref_stride,
+                                xd->bd, subpel_search_type);
+    }
+    besterr = vfp->vf(pred8, w, src, src_stride, sse);
+  } else {
+    DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+    if (second_pred != NULL) {
+      if (mask) {
+        aom_comp_mask_upsampled_pred(
+            xd, cm, mi_row, mi_col, this_mv, pred, second_pred, w, h,
+            subpel_x_q3, subpel_y_q3, ref, ref_stride, mask, mask_stride,
+            invert_mask, subpel_search_type);
+      } else {
+        aom_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred,
+                                    second_pred, w, h, subpel_x_q3, subpel_y_q3,
+                                    ref, ref_stride, subpel_search_type);
+      }
+    } else {
+      aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h,
+                         subpel_x_q3, subpel_y_q3, ref, ref_stride,
+                         subpel_search_type);
+    }
+
+    besterr = vfp->vf(pred, w, src, src_stride, sse);
+  }
+#else
+  DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+  if (second_pred != NULL) {
+    if (mask) {
+      aom_comp_mask_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred,
+                                   second_pred, w, h, subpel_x_q3, subpel_y_q3,
+                                   ref, ref_stride, mask, mask_stride,
+                                   invert_mask, subpel_search_type);
+    } else {
+      aom_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred,
+                                  second_pred, w, h, subpel_x_q3, subpel_y_q3,
+                                  ref, ref_stride, subpel_search_type);
+    }
+  } else {
+    aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h, subpel_x_q3,
+                       subpel_y_q3, ref, ref_stride, subpel_search_type);
+  }
+
+  besterr = vfp->vf(pred, w, src, src_stride, sse);
+#endif
+  return besterr;
+}
+
+// Estimates whether this_mv is better than best_mv. This function takes both
+// the prediction error and the mv cost into account. It is suffixed "fast"
+// because it uses a bilinear filter to estimate the prediction.
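+// The returned cost is INT_MAX when this_mv falls outside mv_limits, which
+// lets callers such as first_level_check_fast compare all four directions
+// without checking their validity separately.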
+static INLINE unsigned int check_better_fast(
+    MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *this_mv, MV *best_mv,
+    const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+    unsigned int *sse1, int *distortion, int *has_better_mv, int is_scaled) {
+  unsigned int cost;
+  if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) {
+    unsigned int sse;
+    int thismse;
+    if (is_scaled) {
+      thismse = upsampled_pref_error(xd, cm, this_mv, var_params, &sse);
+    } else {
+      thismse = estimated_pref_error(this_mv, var_params, &sse);
+    }
+    cost = mv_err_cost_(this_mv, mv_cost_params);
+    cost += thismse;
+
+    if (cost < *besterr) {
+      *besterr = cost;
+      *best_mv = *this_mv;
+      *distortion = thismse;
+      *sse1 = sse;
+      *has_better_mv |= 1;
+    }
+  } else {
+    cost = INT_MAX;
+  }
+  return cost;
+}
+
+// Checks whether this_mv is better than best_mv. This function takes both the
+// prediction error and the mv cost into account.
+static AOM_FORCE_INLINE unsigned int check_better(
+    MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *this_mv, MV *best_mv,
+    const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+    unsigned int *sse1, int *distortion, int *is_better) {
+  unsigned int cost;
+  if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) {
+    unsigned int sse;
+    int thismse;
+    thismse = upsampled_pref_error(xd, cm, this_mv, var_params, &sse);
+    cost = mv_err_cost_(this_mv, mv_cost_params);
+    cost += thismse;
+    if (cost < *besterr) {
+      *besterr = cost;
+      *best_mv = *this_mv;
+      *distortion = thismse;
+      *sse1 = sse;
+      *is_better |= 1;
+    }
+  } else {
+    cost = INT_MAX;
+  }
+  return cost;
+}
+
+static INLINE MV get_best_diag_step(int step_size, unsigned int left_cost,
+                                    unsigned int right_cost,
+                                    unsigned int up_cost,
+                                    unsigned int down_cost) {
+  const MV diag_step = { up_cost <= down_cost ? -step_size : step_size,
+                         left_cost <= right_cost ? -step_size : step_size };
+
+  return diag_step;
+}
+
+// Searches the four cardinal directions for a better mv, then follows up with
+// a search in the best quadrant. This uses a bilinear filter to speed up the
+// calculation.
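+// With this_mv at the centre and hstep the current step size (4, i.e. 1/2
+// pel, on the first iteration), the probed points are:
+//              up
+//    left   this_mv   right
+//             down
+// followed by the single diagonal lying between the cheaper of left/right
+// and the cheaper of up/down (see get_best_diag_step).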
+static AOM_FORCE_INLINE MV first_level_check_fast(
+    MACROBLOCKD *xd, const AV1_COMMON *cm, const MV this_mv, MV *best_mv,
+    int hstep, const SubpelMvLimits *mv_limits,
+    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+    unsigned int *sse1, int *distortion, int is_scaled) {
+  // Check the four cardinal directions
+  const MV left_mv = { this_mv.row, this_mv.col - hstep };
+  int dummy = 0;
+  const unsigned int left = check_better_fast(
+      xd, cm, &left_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr,
+      sse1, distortion, &dummy, is_scaled);
+
+  const MV right_mv = { this_mv.row, this_mv.col + hstep };
+  const unsigned int right = check_better_fast(
+      xd, cm, &right_mv, best_mv, mv_limits, var_params, mv_cost_params,
+      besterr, sse1, distortion, &dummy, is_scaled);
+
+  const MV top_mv = { this_mv.row - hstep, this_mv.col };
+  const unsigned int up = check_better_fast(
+      xd, cm, &top_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr,
+      sse1, distortion, &dummy, is_scaled);
+
+  const MV bottom_mv = { this_mv.row + hstep, this_mv.col };
+  const unsigned int down = check_better_fast(
+      xd, cm, &bottom_mv, best_mv, mv_limits, var_params, mv_cost_params,
+      besterr, sse1, distortion, &dummy, is_scaled);
+
+  const MV diag_step = get_best_diag_step(hstep, left, right, up, down);
+  const MV diag_mv = { this_mv.row + diag_step.row,
+                       this_mv.col + diag_step.col };
+
+  // Check the diagonal direction with the best mv
+  check_better_fast(xd, cm, &diag_mv, best_mv, mv_limits, var_params,
+                    mv_cost_params, besterr, sse1, distortion, &dummy,
+                    is_scaled);
+
+  return diag_step;
+}
+
+// Performs a follow-up search after first_level_check_fast is called. This
+// performs two extra chess-pattern searches in the best quadrant.
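+// Three cases arise, depending on how first_level_check_fast moved best_mv:
+// a diagonal move probes the two remaining chess-pattern points of that
+// quadrant, while a move along only one axis probes the two diagonal points
+// beyond the new best mv plus one probe mirrored to the opposite side of the
+// chosen diagonal.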
+static AOM_FORCE_INLINE void second_level_check_fast(
+    MACROBLOCKD *xd, const AV1_COMMON *cm, const MV this_mv, const MV diag_step,
+    MV *best_mv, int hstep, const SubpelMvLimits *mv_limits,
+    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+    unsigned int *sse1, int *distortion, int is_scaled) {
+  assert(diag_step.row == hstep || diag_step.row == -hstep);
+  assert(diag_step.col == hstep || diag_step.col == -hstep);
+  const int tr = this_mv.row;
+  const int tc = this_mv.col;
+  const int br = best_mv->row;
+  const int bc = best_mv->col;
+  int dummy = 0;
+  if (tr != br && tc != bc) {
+    assert(diag_step.col == bc - tc);
+    assert(diag_step.row == br - tr);
+    const MV chess_mv_1 = { br, bc + diag_step.col };
+    const MV chess_mv_2 = { br + diag_step.row, bc };
+    check_better_fast(xd, cm, &chess_mv_1, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &dummy,
+                      is_scaled);
+
+    check_better_fast(xd, cm, &chess_mv_2, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &dummy,
+                      is_scaled);
+  } else if (tr == br && tc != bc) {
+    assert(diag_step.col == bc - tc);
+    // Continue searching in the best direction
+    const MV bottom_long_mv = { br + hstep, bc + diag_step.col };
+    const MV top_long_mv = { br - hstep, bc + diag_step.col };
+    check_better_fast(xd, cm, &bottom_long_mv, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &dummy,
+                      is_scaled);
+    check_better_fast(xd, cm, &top_long_mv, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &dummy,
+                      is_scaled);
+
+    // Search in the direction opposite of the best quadrant
+    const MV rev_mv = { br - diag_step.row, bc };
+    check_better_fast(xd, cm, &rev_mv, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &dummy,
+                      is_scaled);
+  } else if (tr != br && tc == bc) {
+    assert(diag_step.row == br - tr);
+    // Continue searching in the best direction
+    const MV right_long_mv = { br + diag_step.row, bc + hstep };
+    const MV left_long_mv = { br + diag_step.row, bc - hstep };
+    check_better_fast(xd, cm, &right_long_mv, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &dummy,
+                      is_scaled);
+    check_better_fast(xd, cm, &left_long_mv, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &dummy,
+                      is_scaled);
+
+    // Search in the direction opposite of the best quadrant
+    const MV rev_mv = { br, bc - diag_step.col };
+    check_better_fast(xd, cm, &rev_mv, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &dummy,
+                      is_scaled);
+  }
+}
+
+// Combines first level check and second level check when applicable. This
+// first searches the four cardinal directions, and performs several
+// diagonal/chess-pattern searches in the best quadrant.
+static AOM_FORCE_INLINE void two_level_checks_fast( + MACROBLOCKD *xd, const AV1_COMMON *cm, const MV this_mv, MV *best_mv, + int hstep, const SubpelMvLimits *mv_limits, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, + unsigned int *sse1, int *distortion, int iters, int is_scaled) { + const MV diag_step = first_level_check_fast( + xd, cm, this_mv, best_mv, hstep, mv_limits, var_params, mv_cost_params, + besterr, sse1, distortion, is_scaled); + if (iters > 1) { + second_level_check_fast(xd, cm, this_mv, diag_step, best_mv, hstep, + mv_limits, var_params, mv_cost_params, besterr, + sse1, distortion, is_scaled); + } +} + +static AOM_FORCE_INLINE MV +first_level_check(MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv, + MV *best_mv, const int hstep, const SubpelMvLimits *mv_limits, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, + unsigned int *sse1, int *distortion) { + int dummy = 0; + const MV left_mv = { this_mv.row, this_mv.col - hstep }; + const MV right_mv = { this_mv.row, this_mv.col + hstep }; + const MV top_mv = { this_mv.row - hstep, this_mv.col }; + const MV bottom_mv = { this_mv.row + hstep, this_mv.col }; + + const unsigned int left = + check_better(xd, cm, &left_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + const unsigned int right = + check_better(xd, cm, &right_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + const unsigned int up = + check_better(xd, cm, &top_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + const unsigned int down = + check_better(xd, cm, &bottom_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + + const MV diag_step = get_best_diag_step(hstep, left, right, up, down); + const MV diag_mv = { this_mv.row + diag_step.row, + this_mv.col + diag_step.col }; + + // Check the diagonal direction with the best mv + check_better(xd, cm, &diag_mv, best_mv, mv_limits, var_params, mv_cost_params, + besterr, sse1, distortion, &dummy); + + return diag_step; +} + +// A newer version of second level check that gives better quality. 
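+// Unlike second_level_check_fast, it searches around the updated best mv,
+// mirroring diag_step along the axis where the diagonal search did not move,
+// and spends a third, diagonal probe only when one of the first two probes
+// finds a better mv.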
+// TODO(chiyotsai@google.com): evaluate this on subpel_search_types different +// from av1_find_best_sub_pixel_tree +static AOM_FORCE_INLINE void second_level_check_v2( + MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv, MV diag_step, + MV *best_mv, const SubpelMvLimits *mv_limits, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, + unsigned int *sse1, int *distortion, int is_scaled) { + assert(best_mv->row == this_mv.row + diag_step.row || + best_mv->col == this_mv.col + diag_step.col); + if (CHECK_MV_EQUAL(this_mv, *best_mv)) { + return; + } else if (this_mv.row == best_mv->row) { + // Search away from diagonal step since diagonal search did not provide any + // improvement + diag_step.row *= -1; + } else if (this_mv.col == best_mv->col) { + diag_step.col *= -1; + } + + const MV row_bias_mv = { best_mv->row + diag_step.row, best_mv->col }; + const MV col_bias_mv = { best_mv->row, best_mv->col + diag_step.col }; + const MV diag_bias_mv = { best_mv->row + diag_step.row, + best_mv->col + diag_step.col }; + int has_better_mv = 0; + + if (var_params->subpel_search_type != USE_2_TAPS_ORIG) { + check_better(xd, cm, &row_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &has_better_mv); + check_better(xd, cm, &col_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &has_better_mv); + + // Do an additional search if the second iteration gives a better mv + if (has_better_mv) { + check_better(xd, cm, &diag_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &has_better_mv); + } + } else { + check_better_fast(xd, cm, &row_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &has_better_mv, + is_scaled); + check_better_fast(xd, cm, &col_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &has_better_mv, + is_scaled); + + // Do an additional search if the second iteration gives a better mv + if (has_better_mv) { + check_better_fast(xd, cm, &diag_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, + &has_better_mv, is_scaled); + } + } +} + +// Gets the error at the beginning when the mv has fullpel precision +static unsigned int setup_center_error( + const MACROBLOCKD *xd, const MV *bestmv, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) { + const aom_variance_fn_ptr_t *vfp = var_params->vfp; + const int w = var_params->w; + const int h = var_params->h; + + const MSBuffers *ms_buffers = &var_params->ms_buffers; + const uint8_t *src = ms_buffers->src->buf; + const uint8_t *y = get_buf_from_mv(ms_buffers->ref, *bestmv); + const int src_stride = ms_buffers->src->stride; + const int y_stride = ms_buffers->ref->stride; + const uint8_t *second_pred = ms_buffers->second_pred; + const uint8_t *mask = ms_buffers->mask; + const int mask_stride = ms_buffers->mask_stride; + const int invert_mask = ms_buffers->inv_mask; + + unsigned int besterr; + + if (second_pred != NULL) { +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]); + uint8_t *comp_pred = CONVERT_TO_BYTEPTR(comp_pred16); + if (mask) { + aom_highbd_comp_mask_pred(comp_pred, second_pred, w, h, y, y_stride, + mask, mask_stride, invert_mask); + } else { + aom_highbd_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride); + } + besterr = 
vfp->vf(comp_pred, w, src, src_stride, sse1); + } else { + DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]); + if (mask) { + aom_comp_mask_pred(comp_pred, second_pred, w, h, y, y_stride, mask, + mask_stride, invert_mask); + } else { + aom_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride); + } + besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); + } +#else + (void)xd; + DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]); + if (mask) { + aom_comp_mask_pred(comp_pred, second_pred, w, h, y, y_stride, mask, + mask_stride, invert_mask); + } else { + aom_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride); + } + besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); +#endif + } else { + besterr = vfp->vf(y, y_stride, src, src_stride, sse1); + } + *distortion = besterr; + besterr += mv_err_cost_(bestmv, mv_cost_params); + return besterr; +} + +// Gets the error at the beginning when the mv has fullpel precision +static unsigned int upsampled_setup_center_error( + MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV *bestmv, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) { + unsigned int besterr = upsampled_pref_error(xd, cm, bestmv, var_params, sse1); + *distortion = besterr; + besterr += mv_err_cost_(bestmv, mv_cost_params); + return besterr; +} + +static INLINE int divide_and_round(int n, int d) { + return ((n < 0) ^ (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d); +} + +static INLINE int is_cost_list_wellbehaved(const int *cost_list) { + return cost_list[0] < cost_list[1] && cost_list[0] < cost_list[2] && + cost_list[0] < cost_list[3] && cost_list[0] < cost_list[4]; +} + +// Returns surface minima estimate at given precision in 1/2^n bits. +// Assume a model for the cost surface: S = A(x - x0)^2 + B(y - y0)^2 + C +// For a given set of costs S0, S1, S2, S3, S4 at points +// (y, x) = (0, 0), (0, -1), (1, 0), (0, 1) and (-1, 0) respectively, +// the solution for the location of the minima (x0, y0) is given by: +// x0 = 1/2 (S1 - S3)/(S1 + S3 - 2*S0), +// y0 = 1/2 (S4 - S2)/(S4 + S2 - 2*S0). +// The code below is an integerized version of that. +static AOM_INLINE void get_cost_surf_min(const int *cost_list, int *ir, int *ic, + int bits) { + *ic = divide_and_round((cost_list[1] - cost_list[3]) * (1 << (bits - 1)), + (cost_list[1] - 2 * cost_list[0] + cost_list[3])); + *ir = divide_and_round((cost_list[4] - cost_list[2]) * (1 << (bits - 1)), + (cost_list[4] - 2 * cost_list[0] + cost_list[2])); +} + +// Checks the list of mvs searched in the last iteration and see if we are +// repeating it. If so, return 1. Otherwise we update the last_mv_search_list +// with current_mv and return 0. 
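+// This guards against oscillating between the same pair of mvs across
+// iterations: callers treat a repeated mv as a failed search and return
+// INT_MAX.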
+static INLINE int check_repeated_mv_and_update(int_mv *last_mv_search_list,
+                                               const MV current_mv, int iter) {
+  if (last_mv_search_list) {
+    if (CHECK_MV_EQUAL(last_mv_search_list[iter].as_mv, current_mv)) {
+      return 1;
+    }
+
+    last_mv_search_list[iter].as_mv = current_mv;
+  }
+  return 0;
+}
+
+static AOM_INLINE int setup_center_error_facade(
+    MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *bestmv,
+    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion,
+    int is_scaled) {
+  if (is_scaled) {
+    return upsampled_setup_center_error(xd, cm, bestmv, var_params,
+                                        mv_cost_params, sse1, distortion);
+  } else {
+    return setup_center_error(xd, bestmv, var_params, mv_cost_params, sse1,
+                              distortion);
+  }
+}
+
+int av1_find_best_sub_pixel_tree_pruned_more(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm,
+    const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv,
+    const FULLPEL_MV_STATS *start_mv_stats, MV *bestmv, int *distortion,
+    unsigned int *sse1, int_mv *last_mv_search_list) {
+  (void)cm;
+  const int allow_hp = ms_params->allow_hp;
+  const int forced_stop = ms_params->forced_stop;
+  const int iters_per_step = ms_params->iters_per_step;
+  const int *cost_list = ms_params->cost_list;
+  const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
+  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+  const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
+
+  // The iteration we are currently searching for. Iter 0 corresponds to
+  // fullpel mv, iter 1 to half pel, and so on
+  int iter = 0;
+  int hstep = INIT_SUBPEL_STEP_SIZE;  // Step size, initialized to 4/8=1/2 pel
+  unsigned int besterr = INT_MAX;
+  *bestmv = start_mv;
+
+  const struct scale_factors *const sf = is_intrabc_block(xd->mi[0])
+                                             ? &cm->sf_identity
+                                             : xd->block_ref_scale_factors[0];
+  const int is_scaled = av1_is_scaled(sf);
+
+  if (start_mv_stats != NULL && !is_scaled) {
+    besterr = start_mv_stats->distortion + start_mv_stats->err_cost;
+    *distortion = start_mv_stats->distortion;
+    *sse1 = start_mv_stats->sse;
+  } else {
+    besterr =
+        setup_center_error_facade(xd, cm, bestmv, var_params, mv_cost_params,
+                                  sse1, distortion, is_scaled);
+  }
+
+  // If forced_stop is FULL_PEL, return.
+  if (forced_stop == FULL_PEL) return besterr;
+
+  if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+    return INT_MAX;
+  }
+  iter++;
+
+  if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+      cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) {
+    int ir, ic;
+    get_cost_surf_min(cost_list, &ir, &ic, 1);
+    if (ir != 0 || ic != 0) {
+      const MV this_mv = { start_mv.row + ir * hstep,
+                           start_mv.col + ic * hstep };
+      int dummy = 0;
+      check_better_fast(xd, cm, &this_mv, bestmv, mv_limits, var_params,
+                        mv_cost_params, &besterr, sse1, distortion, &dummy,
+                        is_scaled);
+    }
+  } else {
+    two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
+                          var_params, mv_cost_params, &besterr, sse1,
+                          distortion, iters_per_step, is_scaled);
+  }
+
+  // Each subsequent iteration checks at least one point in common with the
+  // last iteration (two if the diagonal step was selected). Refine to 1/4 pel:
+  if (forced_stop < HALF_PEL) {
+    if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+      return INT_MAX;
+    }
+    iter++;
+
+    hstep >>= 1;
+    start_mv = *bestmv;
+    two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
+                          var_params, mv_cost_params, &besterr, sse1,
+                          distortion, iters_per_step, is_scaled);
+  }
+
+  if (allow_hp && forced_stop == EIGHTH_PEL) {
+    if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+      return INT_MAX;
+    }
+    iter++;
+
+    hstep >>= 1;
+    start_mv = *bestmv;
+    two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
+                          var_params, mv_cost_params, &besterr, sse1,
+                          distortion, iters_per_step, is_scaled);
+  }
+
+  return besterr;
+}
+
+int av1_find_best_sub_pixel_tree_pruned(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm,
+    const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv,
+    const FULLPEL_MV_STATS *start_mv_stats, MV *bestmv, int *distortion,
+    unsigned int *sse1, int_mv *last_mv_search_list) {
+  (void)cm;
+  (void)start_mv_stats;
+  const int allow_hp = ms_params->allow_hp;
+  const int forced_stop = ms_params->forced_stop;
+  const int iters_per_step = ms_params->iters_per_step;
+  const int *cost_list = ms_params->cost_list;
+  const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
+  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+  const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
+
+  // The iteration we are currently searching for. Iter 0 corresponds to
+  // fullpel mv, iter 1 to half pel, and so on
+  int iter = 0;
+  int hstep = INIT_SUBPEL_STEP_SIZE;  // Step size, initialized to 4/8=1/2 pel
+  unsigned int besterr = INT_MAX;
+  *bestmv = start_mv;
+
+  const struct scale_factors *const sf = is_intrabc_block(xd->mi[0])
+                                             ? &cm->sf_identity
+                                             : xd->block_ref_scale_factors[0];
+  const int is_scaled = av1_is_scaled(sf);
+
+  if (start_mv_stats != NULL && !is_scaled) {
+    besterr = start_mv_stats->distortion + start_mv_stats->err_cost;
+    *distortion = start_mv_stats->distortion;
+    *sse1 = start_mv_stats->sse;
+  } else {
+    besterr =
+        setup_center_error_facade(xd, cm, bestmv, var_params, mv_cost_params,
+                                  sse1, distortion, is_scaled);
+  }
+
+  // If forced_stop is FULL_PEL, return.
+  if (forced_stop == FULL_PEL) return besterr;
+
+  if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+    return INT_MAX;
+  }
+  iter++;
+
+  if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+      cost_list[4] != INT_MAX) {
+    const unsigned int whichdir = (cost_list[1] < cost_list[3] ? 0 : 1) +
+                                  (cost_list[2] < cost_list[4] ? 0 : 2);
+
+    const MV left_mv = { start_mv.row, start_mv.col - hstep };
+    const MV right_mv = { start_mv.row, start_mv.col + hstep };
+    const MV bottom_mv = { start_mv.row + hstep, start_mv.col };
+    const MV top_mv = { start_mv.row - hstep, start_mv.col };
+
+    const MV bottom_left_mv = { start_mv.row + hstep, start_mv.col - hstep };
+    const MV bottom_right_mv = { start_mv.row + hstep, start_mv.col + hstep };
+    const MV top_left_mv = { start_mv.row - hstep, start_mv.col - hstep };
+    const MV top_right_mv = { start_mv.row - hstep, start_mv.col + hstep };
+
+    int dummy = 0;
+
+    switch (whichdir) {
+      case 0:  // bottom left quadrant
+        check_better_fast(xd, cm, &left_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy,
+                          is_scaled);
+        check_better_fast(xd, cm, &bottom_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy,
+                          is_scaled);
+        check_better_fast(xd, cm, &bottom_left_mv, bestmv, mv_limits,
+                          var_params, mv_cost_params, &besterr, sse1,
+                          distortion, &dummy, is_scaled);
+        break;
+      case 1:  // bottom right quadrant
+        check_better_fast(xd, cm, &right_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy,
+                          is_scaled);
+        check_better_fast(xd, cm, &bottom_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy,
+                          is_scaled);
+        check_better_fast(xd, cm, &bottom_right_mv, bestmv, mv_limits,
+                          var_params, mv_cost_params, &besterr, sse1,
+                          distortion, &dummy, is_scaled);
+        break;
+      case 2:  // top left quadrant
+        check_better_fast(xd, cm, &left_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy,
+                          is_scaled);
+        check_better_fast(xd, cm, &top_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy,
+                          is_scaled);
+        check_better_fast(xd, cm, &top_left_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy,
+                          is_scaled);
+        break;
+      case 3:  // top right quadrant
+        check_better_fast(xd, cm, &right_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy,
+                          is_scaled);
+        check_better_fast(xd, cm, &top_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy,
+                          is_scaled);
+        check_better_fast(xd, cm, &top_right_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy,
+                          is_scaled);
+        break;
+    }
+  } else {
+    two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
+                          var_params, mv_cost_params, &besterr, sse1,
+                          distortion, iters_per_step, is_scaled);
+  }
+
+  // Each subsequent iteration checks at least one point in common with the
+  // last iteration (two if the diagonal step was selected). Refine to 1/4 pel:
+  if (forced_stop < HALF_PEL) {
+    if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+      return INT_MAX;
+    }
+    iter++;
+
+    hstep >>= 1;
+    start_mv = *bestmv;
+    two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
+                          var_params, mv_cost_params, &besterr, sse1,
+                          distortion, iters_per_step, is_scaled);
+  }
+
+  if (allow_hp && forced_stop ==
+
+int av1_find_best_sub_pixel_tree(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+                                 const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
+                                 MV start_mv,
+                                 const FULLPEL_MV_STATS *start_mv_stats,
+                                 MV *bestmv, int *distortion,
+                                 unsigned int *sse1,
+                                 int_mv *last_mv_search_list) {
+  (void)start_mv_stats;
+  const int allow_hp = ms_params->allow_hp;
+  const int forced_stop = ms_params->forced_stop;
+  const int iters_per_step = ms_params->iters_per_step;
+  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+  const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
+  const SUBPEL_SEARCH_TYPE subpel_search_type =
+      ms_params->var_params.subpel_search_type;
+  const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
+
+  // How many steps to take. A round of 0 means fullpel search only, 1 means
+  // half-pel, and so on.
+  const int round = AOMMIN(FULL_PEL - forced_stop, 3 - !allow_hp);
+  int hstep = INIT_SUBPEL_STEP_SIZE;  // Step size, initialized to 4/8=1/2 pel
+
+  unsigned int besterr = INT_MAX;
+
+  *bestmv = start_mv;
+
+  const struct scale_factors *const sf = is_intrabc_block(xd->mi[0])
+                                             ? &cm->sf_identity
+                                             : xd->block_ref_scale_factors[0];
+  const int is_scaled = av1_is_scaled(sf);
+
+  if (start_mv_stats != NULL && !is_scaled) {
+    besterr = start_mv_stats->distortion + start_mv_stats->err_cost;
+    *distortion = start_mv_stats->distortion;
+    *sse1 = start_mv_stats->sse;
+  } else {
+    if (subpel_search_type != USE_2_TAPS_ORIG) {
+      besterr = upsampled_setup_center_error(xd, cm, bestmv, var_params,
+                                             mv_cost_params, sse1, distortion);
+    } else {
+      besterr = setup_center_error(xd, bestmv, var_params, mv_cost_params,
+                                   sse1, distortion);
+    }
+  }
+
+  // If forced_stop is FULL_PEL, return.
+  if (!round) return besterr;
+
+  for (int iter = 0; iter < round; ++iter) {
+    MV iter_center_mv = *bestmv;
+    if (check_repeated_mv_and_update(last_mv_search_list, iter_center_mv,
+                                     iter)) {
+      return INT_MAX;
+    }
+
+    MV diag_step;
+    if (subpel_search_type != USE_2_TAPS_ORIG) {
+      diag_step = first_level_check(xd, cm, iter_center_mv, bestmv, hstep,
+                                    mv_limits, var_params, mv_cost_params,
+                                    &besterr, sse1, distortion);
+    } else {
+      diag_step = first_level_check_fast(xd, cm, iter_center_mv, bestmv, hstep,
+                                         mv_limits, var_params, mv_cost_params,
+                                         &besterr, sse1, distortion,
+                                         is_scaled);
+    }
+
+    // Check diagonal sub-pixel position
+    if (!CHECK_MV_EQUAL(iter_center_mv, *bestmv) && iters_per_step > 1) {
+      second_level_check_v2(xd, cm, iter_center_mv, diag_step, bestmv,
+                            mv_limits, var_params, mv_cost_params, &besterr,
+                            sse1, distortion, is_scaled);
+    }
+
+    hstep >>= 1;
+  }
+
+  return besterr;
+}
+
+// Note(yunqingwang): The following 2 functions are only used in the motion
+// vector unit test; they return the extreme motion vectors allowed by the MV
+// limits.
+// Returns the maximum MV.
+int av1_return_max_sub_pixel_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm, + const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, + MV start_mv, + const FULLPEL_MV_STATS *start_mv_stats, + MV *bestmv, int *distortion, unsigned int *sse1, + int_mv *last_mv_search_list) { + (void)xd; + (void)cm; + (void)start_mv; + (void)start_mv_stats; + (void)sse1; + (void)distortion; + (void)last_mv_search_list; + + const int allow_hp = ms_params->allow_hp; + const SubpelMvLimits *mv_limits = &ms_params->mv_limits; + + bestmv->row = mv_limits->row_max; + bestmv->col = mv_limits->col_max; + + unsigned int besterr = 0; + + // In the sub-pel motion search, if hp is not used, then the last bit of mv + // has to be 0. + lower_mv_precision(bestmv, allow_hp, 0); + return besterr; +} + +// Returns the minimum MV. +int av1_return_min_sub_pixel_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm, + const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, + MV start_mv, + const FULLPEL_MV_STATS *start_mv_stats, + MV *bestmv, int *distortion, unsigned int *sse1, + int_mv *last_mv_search_list) { + (void)xd; + (void)cm; + (void)start_mv; + (void)start_mv_stats; + (void)sse1; + (void)distortion; + (void)last_mv_search_list; + + const int allow_hp = ms_params->allow_hp; + const SubpelMvLimits *mv_limits = &ms_params->mv_limits; + + bestmv->row = mv_limits->row_min; + bestmv->col = mv_limits->col_min; + + unsigned int besterr = 0; + // In the sub-pel motion search, if hp is not used, then the last bit of mv + // has to be 0. + lower_mv_precision(bestmv, allow_hp, 0); + return besterr; +} + +#if !CONFIG_REALTIME_ONLY +// Computes the cost of the current predictor by going through the whole +// av1_enc_build_inter_predictor pipeline. This is mainly used by warped mv +// during motion_mode_rd. We are going through the whole +// av1_enc_build_inter_predictor because we might have changed the interpolation +// filter, etc before motion_mode_rd is called. +static INLINE unsigned int compute_motion_cost( + MACROBLOCKD *xd, const AV1_COMMON *const cm, + const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, BLOCK_SIZE bsize, + const MV *this_mv) { + unsigned int mse; + unsigned int sse; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + + const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params; + const MSBuffers *ms_buffers = &var_params->ms_buffers; + + const uint8_t *const src = ms_buffers->src->buf; + const int src_stride = ms_buffers->src->stride; + const uint8_t *const dst = xd->plane[0].dst.buf; + const int dst_stride = xd->plane[0].dst.stride; + const aom_variance_fn_ptr_t *vfp = ms_params->var_params.vfp; + + mse = vfp->vf(dst, dst_stride, src, src_stride, &sse); + mse += mv_err_cost_(this_mv, &ms_params->mv_cost_params); + return mse; +} + +// Refines MV in a small range + +// Macros to build bitmasks which help us avoid redundant computations +// +// To explain the idea here, imagine that on the first iteration of the +// loop below, we step rightwards. Then, on the second iteration, the neighbors +// to consider are: +// . . . +// 0 1 . +// . . . +// Where 0 is the initial search point, 1 is the best candidate found in the +// first iteration, and the dots are the other neighbors of point 1. +// +// Naively, we would now need to scan all 8 neighbors of point 1 (point 0 and +// the seven points marked with dots), and compare them to see where to move +// next. 
However, we already evaluated 5 of those 8 neighbors in the last +// iteration, and decided that they are worse than point 1. So we don't need +// to re-consider these points. We only really need to consider the three +// points which are adjacent to point 1 but *not* to point 0. +// +// As the algorithm goes on, there are other ways that redundant evaluations +// can happen, if the search path curls back around on itself. +// +// To avoid all possible redundancies, we'd have to build a set containing +// every point we have already checked, and this would be quite expensive. +// +// So instead, we apply a 95%-effective solution with a much lower overhead: +// we prune out the points which were considered during the previous +// iteration, but we don't worry about any prior iteration. This can be done +// as follows: +// +// We build a static table, called neighbor_mask, which answers the question +// "if we moved in direction X last time, which neighbors are new, and which +// were scanned last iteration?" +// Then we can query this table to quickly determine which points we need to +// evaluate, and which we can skip. +// +// To query the table, the logic is simply: +// neighbor_mask[i] & (1 << j) == "if we moved in direction i last iteration, +// do we need to scan neighbor j this iteration?" +#define NEIGHBOR_MASK_DIA(left, down, right, up) \ + (left | (down << 1) | (right << 2) | (up << 3)) + +#define NEIGHBOR_MASK_SQR(left, down, right, up, down_left, down_right, \ + up_left, up_right) \ + (left | (down << 1) | (right << 2) | (up << 3) | (down_left << 4) | \ + (down_right << 5) | (up_left << 6) | (up_right << 7)) + +static const warp_search_config warp_search_info[WARP_SEARCH_METHODS] = { + // WARP_SEARCH_DIAMOND + { + .num_neighbors = 4, + .neighbors = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } }, + .neighbor_mask = { + // If we stepped left last time, consider all points except right + NEIGHBOR_MASK_DIA(1, 1, 0, 1), + // If we stepped down last time, consider all points except up + NEIGHBOR_MASK_DIA(1, 1, 1, 0), + // Stepped right last time + NEIGHBOR_MASK_DIA(0, 1, 1, 1), + // Stepped up last time + NEIGHBOR_MASK_DIA(1, 0, 1, 1), + }, + }, + // WARP_SEARCH_SQUARE + { + .num_neighbors = 8, + .neighbors = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 }, + { 1, -1 }, { 1, 1 }, { -1, -1 }, { -1, 1 } }, + .neighbor_mask = { + // If we stepped left last time, then we only need to consider 3 points: + // left, down+left, up+left + NEIGHBOR_MASK_SQR(1, 0, 0, 0, 1, 0, 1, 0), + // If we stepped down last time, then we only need to consider 3 points: + // down, down+left, down+right + NEIGHBOR_MASK_SQR(0, 1, 0, 0, 1, 1, 0, 0), + // Stepped right last time + NEIGHBOR_MASK_SQR(0, 0, 1, 0, 0, 1, 0, 1), + // Stepped up last time + NEIGHBOR_MASK_SQR(0, 0, 0, 1, 0, 0, 1, 1), + + // If we stepped down+left last time, then we need to consider 5 points: + // left, down, down+left, down+right, up+left + NEIGHBOR_MASK_SQR(1, 1, 0, 0, 1, 1, 1, 0), + // Stepped down+right last time + NEIGHBOR_MASK_SQR(0, 1, 1, 0, 1, 1, 0, 1), + // Stepped up+left last time + NEIGHBOR_MASK_SQR(1, 0, 0, 1, 1, 0, 1, 1), + // Stepped up+right last time + NEIGHBOR_MASK_SQR(0, 0, 1, 1, 0, 1, 1, 1), + }, + }, +}; + +unsigned int av1_refine_warped_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm, + const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, + BLOCK_SIZE bsize, const int *pts0, + const int *pts_inref0, int total_samples, + WARP_SEARCH_METHOD search_method, + int num_iterations) { + MB_MODE_INFO *mbmi = xd->mi[0]; + + const MV 
*neighbors = warp_search_info[search_method].neighbors; + const int num_neighbors = warp_search_info[search_method].num_neighbors; + const uint8_t *neighbor_mask = warp_search_info[search_method].neighbor_mask; + + MV *best_mv = &mbmi->mv[0].as_mv; + + WarpedMotionParams best_wm_params = mbmi->wm_params; + int best_num_proj_ref = mbmi->num_proj_ref; + unsigned int bestmse; + const SubpelMvLimits *mv_limits = &ms_params->mv_limits; + + const int mv_shift = ms_params->allow_hp ? 0 : 1; + + // Calculate the center position's error + assert(av1_is_subpelmv_in_range(mv_limits, *best_mv)); + bestmse = compute_motion_cost(xd, cm, ms_params, bsize, best_mv); + + // MV search + int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + // First step always scans all neighbors + uint8_t valid_neighbors = UINT8_MAX; + + for (int ite = 0; ite < num_iterations; ++ite) { + int best_idx = -1; + + for (int idx = 0; idx < num_neighbors; ++idx) { + if ((valid_neighbors & (1 << idx)) == 0) { + continue; + } + + unsigned int thismse; + + MV this_mv = { best_mv->row + neighbors[idx].row * (1 << mv_shift), + best_mv->col + neighbors[idx].col * (1 << mv_shift) }; + if (av1_is_subpelmv_in_range(mv_limits, this_mv)) { + memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0)); + memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0)); + if (total_samples > 1) { + mbmi->num_proj_ref = + av1_selectSamples(&this_mv, pts, pts_inref, total_samples, bsize); + } + + if (!av1_find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize, + this_mv.row, this_mv.col, &mbmi->wm_params, + mi_row, mi_col)) { + thismse = compute_motion_cost(xd, cm, ms_params, bsize, &this_mv); + + if (thismse < bestmse) { + best_idx = idx; + best_wm_params = mbmi->wm_params; + best_num_proj_ref = mbmi->num_proj_ref; + bestmse = thismse; + } + } + } + } + + if (best_idx == -1) break; + + if (best_idx >= 0) { + best_mv->row += neighbors[best_idx].row * (1 << mv_shift); + best_mv->col += neighbors[best_idx].col * (1 << mv_shift); + valid_neighbors = neighbor_mask[best_idx]; + } + } + + mbmi->wm_params = best_wm_params; + mbmi->num_proj_ref = best_num_proj_ref; + return bestmse; +} + +#endif // !CONFIG_REALTIME_ONLY +// ============================================================================= +// Subpixel Motion Search: OBMC +// ============================================================================= +// Estimates the variance of prediction residue +static INLINE int estimate_obmc_pref_error( + const MV *this_mv, const SUBPEL_SEARCH_VAR_PARAMS *var_params, + unsigned int *sse) { + const aom_variance_fn_ptr_t *vfp = var_params->vfp; + + const MSBuffers *ms_buffers = &var_params->ms_buffers; + const int32_t *src = ms_buffers->wsrc; + const int32_t *mask = ms_buffers->obmc_mask; + const uint8_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv); + const int ref_stride = ms_buffers->ref->stride; + + const int subpel_x_q3 = get_subpel_part(this_mv->col); + const int subpel_y_q3 = get_subpel_part(this_mv->row); + + return vfp->osvf(ref, ref_stride, subpel_x_q3, subpel_y_q3, src, mask, sse); +} + +// Calculates the variance of prediction residue +static int upsampled_obmc_pref_error(MACROBLOCKD *xd, const AV1_COMMON *cm, + const MV *this_mv, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + unsigned int *sse) { + const aom_variance_fn_ptr_t *vfp = var_params->vfp; + const SUBPEL_SEARCH_TYPE subpel_search_type = var_params->subpel_search_type; + const int w = var_params->w; 
+  const int h = var_params->h;
+
+  const MSBuffers *ms_buffers = &var_params->ms_buffers;
+  const int32_t *wsrc = ms_buffers->wsrc;
+  const int32_t *mask = ms_buffers->obmc_mask;
+  const uint8_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv);
+  const int ref_stride = ms_buffers->ref->stride;
+
+  const int subpel_x_q3 = get_subpel_part(this_mv->col);
+  const int subpel_y_q3 = get_subpel_part(this_mv->row);
+
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+
+  unsigned int besterr;
+  DECLARE_ALIGNED(16, uint8_t, pred[2 * MAX_SB_SQUARE]);
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (is_cur_buf_hbd(xd)) {
+    uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred);
+    aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred8, w, h,
+                              subpel_x_q3, subpel_y_q3, ref, ref_stride,
+                              xd->bd, subpel_search_type);
+    besterr = vfp->ovf(pred8, w, wsrc, mask, sse);
+  } else {
+    aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h,
+                       subpel_x_q3, subpel_y_q3, ref, ref_stride,
+                       subpel_search_type);
+
+    besterr = vfp->ovf(pred, w, wsrc, mask, sse);
+  }
+#else
+  aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h, subpel_x_q3,
+                     subpel_y_q3, ref, ref_stride, subpel_search_type);
+
+  besterr = vfp->ovf(pred, w, wsrc, mask, sse);
+#endif
+  return besterr;
+}
+
+static unsigned int setup_obmc_center_error(
+    const MV *this_mv, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1,
+    int *distortion) {
+  // TODO(chiyotsai@google.com): There might be a bug here where we didn't use
+  // get_buf_from_mv(ref, *this_mv).
+  const MSBuffers *ms_buffers = &var_params->ms_buffers;
+  const int32_t *wsrc = ms_buffers->wsrc;
+  const int32_t *mask = ms_buffers->obmc_mask;
+  const uint8_t *ref = ms_buffers->ref->buf;
+  const int ref_stride = ms_buffers->ref->stride;
+  unsigned int besterr =
+      var_params->vfp->ovf(ref, ref_stride, wsrc, mask, sse1);
+  *distortion = besterr;
+  besterr += mv_err_cost_(this_mv, mv_cost_params);
+  return besterr;
+}
+
+static unsigned int upsampled_setup_obmc_center_error(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV *this_mv,
+    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1,
+    int *distortion) {
+  unsigned int besterr =
+      upsampled_obmc_pref_error(xd, cm, this_mv, var_params, sse1);
+  *distortion = besterr;
+  besterr += mv_err_cost_(this_mv, mv_cost_params);
+  return besterr;
+}
+
+// Estimates the rate cost of this_mv relative to the reference mv.
+// TODO(chiyotsai@google.com): the cost does not match the cost in
+// mv_cost_. Investigate this later.
+static INLINE int estimate_obmc_mvcost(const MV *this_mv,
+                                       const MV_COST_PARAMS *mv_cost_params) {
+  const MV *ref_mv = mv_cost_params->ref_mv;
+  const int *mvjcost = mv_cost_params->mvjcost;
+  const int *const *mvcost = mv_cost_params->mvcost;
+  const int error_per_bit = mv_cost_params->error_per_bit;
+  const MV_COST_TYPE mv_cost_type = mv_cost_params->mv_cost_type;
+  const MV diff_mv = { GET_MV_SUBPEL(this_mv->row - ref_mv->row),
+                       GET_MV_SUBPEL(this_mv->col - ref_mv->col) };
+
+  switch (mv_cost_type) {
+    case MV_COST_ENTROPY:
+      return (unsigned)((mv_cost(&diff_mv, mvjcost,
+                                 CONVERT_TO_CONST_MVCOST(mvcost)) *
+                             error_per_bit +
+                         4096) >>
+                        13);
+    case MV_COST_NONE: return 0;
+    default:
+      assert(0 && "L1 norm is not tuned for estimated obmc mvcost");
+      return 0;
+  }
+}
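A worked example of the MV_COST_ENTROPY branch above: the rate returned by mv_cost() is scaled by error_per_bit and brought back to the distortion domain with a rounding 13-bit shift. The input values here are invented purely for illustration:

#include <stdio.h>

int main(void) {
  const int rate = 150;          /* hypothetical rate from mv_cost() */
  const int error_per_bit = 500; /* hypothetical rd multiplier */
  /* (150 * 500 + 4096) >> 13 == 79096 / 8192, truncated == 9 */
  const unsigned cost = (unsigned)((rate * error_per_bit + 4096) >> 13);
  printf("estimated mv cost = %u\n", cost); /* prints 9 */
  return 0;
}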
+
+// Estimates whether this_mv is better than best_mv. This function takes both
+// the prediction error and the mv cost into account.
+static INLINE unsigned int obmc_check_better_fast(
+    const MV *this_mv, MV *best_mv, const SubpelMvLimits *mv_limits,
+    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+    unsigned int *sse1, int *distortion, int *has_better_mv) {
+  unsigned int cost;
+  if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) {
+    unsigned int sse;
+    const int thismse = estimate_obmc_pref_error(this_mv, var_params, &sse);
+
+    cost = estimate_obmc_mvcost(this_mv, mv_cost_params);
+    cost += thismse;
+
+    if (cost < *besterr) {
+      *besterr = cost;
+      *best_mv = *this_mv;
+      *distortion = thismse;
+      *sse1 = sse;
+      *has_better_mv |= 1;
+    }
+  } else {
+    cost = INT_MAX;
+  }
+  return cost;
+}
+
+// Checks whether this_mv is better than best_mv. This function takes both
+// the prediction error and the mv cost into account.
+static INLINE unsigned int obmc_check_better(
+    MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *this_mv, MV *best_mv,
+    const SubpelMvLimits *mv_limits,
+    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+    unsigned int *sse1, int *distortion, int *has_better_mv) {
+  unsigned int cost;
+  if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) {
+    unsigned int sse;
+    const int thismse =
+        upsampled_obmc_pref_error(xd, cm, this_mv, var_params, &sse);
+    cost = mv_err_cost_(this_mv, mv_cost_params);
+
+    cost += thismse;
+
+    if (cost < *besterr) {
+      *besterr = cost;
+      *best_mv = *this_mv;
+      *distortion = thismse;
+      *sse1 = sse;
+      *has_better_mv |= 1;
+    }
+  } else {
+    cost = INT_MAX;
+  }
+  return cost;
+}
+
+static AOM_FORCE_INLINE MV obmc_first_level_check(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv, MV *best_mv,
+    const int hstep, const SubpelMvLimits *mv_limits,
+    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+    unsigned int *sse1, int *distortion) {
+  int dummy = 0;
+  const MV left_mv = { this_mv.row, this_mv.col - hstep };
+  const MV right_mv = { this_mv.row, this_mv.col + hstep };
+  const MV top_mv = { this_mv.row - hstep, this_mv.col };
+  const MV bottom_mv = { this_mv.row + hstep, this_mv.col };
+
+  if (var_params->subpel_search_type != USE_2_TAPS_ORIG) {
+    const unsigned int left =
+        obmc_check_better(xd, cm, &left_mv, best_mv, mv_limits, var_params,
+                          mv_cost_params, besterr, sse1, distortion, &dummy);
+    const unsigned int right =
+        obmc_check_better(xd, cm, &right_mv, best_mv, mv_limits, var_params,
+                          mv_cost_params, besterr, sse1, distortion, &dummy);
+    const unsigned int up =
+        obmc_check_better(xd, cm, &top_mv, best_mv, mv_limits, var_params,
+                          mv_cost_params, besterr, sse1, distortion, &dummy);
+    const unsigned int down =
+        obmc_check_better(xd, cm, &bottom_mv, best_mv, mv_limits, var_params,
+                          mv_cost_params, besterr, sse1, distortion, &dummy);
+
+    const MV diag_step = get_best_diag_step(hstep, left, right, up, down);
+    const MV diag_mv = { this_mv.row + diag_step.row,
+                         this_mv.col + diag_step.col };
+
+    // Check the diagonal direction with the best mv
+    obmc_check_better(xd, cm, &diag_mv, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &dummy);
+
+    return diag_step;
+  } else {
+    const unsigned int left = obmc_check_better_fast(
+        &left_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr,
+        sse1, distortion, &dummy);
+    const unsigned int right = obmc_check_better_fast(
+        &right_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr,
+        sse1, distortion, &dummy);
+
+    
const unsigned int up = obmc_check_better_fast( + &top_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, + distortion, &dummy); + + const unsigned int down = obmc_check_better_fast( + &bottom_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, + sse1, distortion, &dummy); + + const MV diag_step = get_best_diag_step(hstep, left, right, up, down); + const MV diag_mv = { this_mv.row + diag_step.row, + this_mv.col + diag_step.col }; + + // Check the diagonal direction with the best mv + obmc_check_better_fast(&diag_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + + return diag_step; + } +} + +// A newer version of second level check for obmc that gives better quality. +static AOM_FORCE_INLINE void obmc_second_level_check_v2( + MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv, MV diag_step, + MV *best_mv, const SubpelMvLimits *mv_limits, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, + unsigned int *sse1, int *distortion) { + assert(best_mv->row == this_mv.row + diag_step.row || + best_mv->col == this_mv.col + diag_step.col); + if (CHECK_MV_EQUAL(this_mv, *best_mv)) { + return; + } else if (this_mv.row == best_mv->row) { + // Search away from diagonal step since diagonal search did not provide any + // improvement + diag_step.row *= -1; + } else if (this_mv.col == best_mv->col) { + diag_step.col *= -1; + } + + const MV row_bias_mv = { best_mv->row + diag_step.row, best_mv->col }; + const MV col_bias_mv = { best_mv->row, best_mv->col + diag_step.col }; + const MV diag_bias_mv = { best_mv->row + diag_step.row, + best_mv->col + diag_step.col }; + int has_better_mv = 0; + + if (var_params->subpel_search_type != USE_2_TAPS_ORIG) { + obmc_check_better(xd, cm, &row_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, + &has_better_mv); + obmc_check_better(xd, cm, &col_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, + &has_better_mv); + + // Do an additional search if the second iteration gives a better mv + if (has_better_mv) { + obmc_check_better(xd, cm, &diag_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, + &has_better_mv); + } + } else { + obmc_check_better_fast(&row_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, + &has_better_mv); + obmc_check_better_fast(&col_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, + &has_better_mv); + + // Do an additional search if the second iteration gives a better mv + if (has_better_mv) { + obmc_check_better_fast(&diag_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, + &has_better_mv); + } + } +} + +int av1_find_best_obmc_sub_pixel_tree_up( + MACROBLOCKD *xd, const AV1_COMMON *const cm, + const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv, + const FULLPEL_MV_STATS *start_mv_stats, MV *bestmv, int *distortion, + unsigned int *sse1, int_mv *last_mv_search_list) { + (void)last_mv_search_list; + (void)start_mv_stats; + const int allow_hp = ms_params->allow_hp; + const int forced_stop = ms_params->forced_stop; + const int iters_per_step = ms_params->iters_per_step; + const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params; + const SUBPEL_SEARCH_TYPE subpel_search_type = + ms_params->var_params.subpel_search_type; + const 
SubpelMvLimits *mv_limits = &ms_params->mv_limits; + + int hstep = INIT_SUBPEL_STEP_SIZE; + const int round = AOMMIN(FULL_PEL - forced_stop, 3 - !allow_hp); + + unsigned int besterr = INT_MAX; + *bestmv = start_mv; + + if (subpel_search_type != USE_2_TAPS_ORIG) + besterr = upsampled_setup_obmc_center_error( + xd, cm, bestmv, var_params, mv_cost_params, sse1, distortion); + else + besterr = setup_obmc_center_error(bestmv, var_params, mv_cost_params, sse1, + distortion); + + for (int iter = 0; iter < round; ++iter) { + MV iter_center_mv = *bestmv; + MV diag_step = obmc_first_level_check(xd, cm, iter_center_mv, bestmv, hstep, + mv_limits, var_params, mv_cost_params, + &besterr, sse1, distortion); + + if (!CHECK_MV_EQUAL(iter_center_mv, *bestmv) && iters_per_step > 1) { + obmc_second_level_check_v2(xd, cm, iter_center_mv, diag_step, bestmv, + mv_limits, var_params, mv_cost_params, + &besterr, sse1, distortion); + } + hstep >>= 1; + } + + return besterr; +} + +// ============================================================================= +// Public cost function: mv_cost + pred error +// ============================================================================= +int av1_get_mvpred_sse(const MV_COST_PARAMS *mv_cost_params, + const FULLPEL_MV best_mv, + const aom_variance_fn_ptr_t *vfp, + const struct buf_2d *src, const struct buf_2d *pre) { + const MV mv = get_mv_from_fullmv(&best_mv); + unsigned int sse, var; + + var = vfp->vf(src->buf, src->stride, get_buf_from_fullmv(pre, &best_mv), + pre->stride, &sse); + (void)var; + + return sse + mv_err_cost_(&mv, mv_cost_params); +} + +static INLINE int get_mvpred_av_var(const MV_COST_PARAMS *mv_cost_params, + const FULLPEL_MV best_mv, + const uint8_t *second_pred, + const aom_variance_fn_ptr_t *vfp, + const struct buf_2d *src, + const struct buf_2d *pre) { + const MV mv = get_mv_from_fullmv(&best_mv); + unsigned int unused; + + return vfp->svaf(get_buf_from_fullmv(pre, &best_mv), pre->stride, 0, 0, + src->buf, src->stride, &unused, second_pred) + + mv_err_cost_(&mv, mv_cost_params); +} + +static INLINE int get_mvpred_mask_var( + const MV_COST_PARAMS *mv_cost_params, const FULLPEL_MV best_mv, + const uint8_t *second_pred, const uint8_t *mask, int mask_stride, + int invert_mask, const aom_variance_fn_ptr_t *vfp, const struct buf_2d *src, + const struct buf_2d *pre) { + const MV mv = get_mv_from_fullmv(&best_mv); + unsigned int unused; + + return vfp->msvf(get_buf_from_fullmv(pre, &best_mv), pre->stride, 0, 0, + src->buf, src->stride, second_pred, mask, mask_stride, + invert_mask, &unused) + + mv_err_cost_(&mv, mv_cost_params); +} + +int av1_get_mvpred_compound_var(const MV_COST_PARAMS *mv_cost_params, + const FULLPEL_MV best_mv, + const uint8_t *second_pred, const uint8_t *mask, + int mask_stride, int invert_mask, + const aom_variance_fn_ptr_t *vfp, + const struct buf_2d *src, + const struct buf_2d *pre) { + if (mask) { + return get_mvpred_mask_var(mv_cost_params, best_mv, second_pred, mask, + mask_stride, invert_mask, vfp, src, pre); + } else { + return get_mvpred_av_var(mv_cost_params, best_mv, second_pred, vfp, src, + pre); + } +} diff --git a/third_party/aom/av1/encoder/mcomp.h b/third_party/aom/av1/encoder/mcomp.h new file mode 100644 index 0000000000..87b9309b61 --- /dev/null +++ b/third_party/aom/av1/encoder/mcomp.h @@ -0,0 +1,398 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_MCOMP_H_
+#define AOM_AV1_ENCODER_MCOMP_H_
+
+#include "av1/common/mv.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/rd.h"
+
+#include "aom_dsp/variance.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1_COMP;
+struct SPEED_FEATURES;
+
+// =============================================================================
+// Cost functions
+// =============================================================================
+
+enum {
+  MV_COST_ENTROPY,    // Use the entropy rate of the mv as the cost
+  MV_COST_L1_LOWRES,  // Use the l1 norm of the mv as the cost (<480p)
+  MV_COST_L1_MIDRES,  // Use the l1 norm of the mv as the cost (>=480p)
+  MV_COST_L1_HDRES,   // Use the l1 norm of the mv as the cost (>=720p)
+  MV_COST_NONE        // Use 0 as the cost irrespective of the current mv
+} UENUM1BYTE(MV_COST_TYPE);
+
+typedef struct {
+  // The reference mv used to compute the mv cost
+  const MV *ref_mv;
+  FULLPEL_MV full_ref_mv;
+  MV_COST_TYPE mv_cost_type;
+  const int *mvjcost;
+  const int *mvcost[2];
+  int error_per_bit;
+  // A multiplier used to convert rate to sad cost
+  int sad_per_bit;
+} MV_COST_PARAMS;
+
+int av1_mv_bit_cost(const MV *mv, const MV *ref_mv, const int *mvjcost,
+                    int *const mvcost[2], int weight);
+
+int av1_get_mvpred_sse(const MV_COST_PARAMS *mv_cost_params,
+                       const FULLPEL_MV best_mv,
+                       const aom_variance_fn_ptr_t *vfp,
+                       const struct buf_2d *src, const struct buf_2d *pre);
+int av1_get_mvpred_compound_var(const MV_COST_PARAMS *ms_params,
+                                const FULLPEL_MV best_mv,
+                                const uint8_t *second_pred,
+                                const uint8_t *mask, int mask_stride,
+                                int invert_mask,
+                                const aom_variance_fn_ptr_t *vfp,
+                                const struct buf_2d *src,
+                                const struct buf_2d *pre);
+
+// =============================================================================
+// Motion Search
+// =============================================================================
+typedef struct {
+  // The reference buffer
+  const struct buf_2d *ref;
+
+  // The source and predictors/mask used by translational search
+  const struct buf_2d *src;
+  const uint8_t *second_pred;
+  const uint8_t *mask;
+  int mask_stride;
+  int inv_mask;
+
+  // The weighted source and mask used by OBMC
+  const int32_t *wsrc;
+  const int32_t *obmc_mask;
+} MSBuffers;
+
+static INLINE void av1_set_ms_compound_refs(MSBuffers *ms_buffers,
+                                            const uint8_t *second_pred,
+                                            const uint8_t *mask,
+                                            int mask_stride, int invert_mask) {
+  ms_buffers->second_pred = second_pred;
+  ms_buffers->mask = mask;
+  ms_buffers->mask_stride = mask_stride;
+  ms_buffers->inv_mask = invert_mask;
+}
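A usage sketch for av1_set_ms_compound_refs above. The buffer names are hypothetical stand-ins for what the compound prediction path would supply before running a masked or averaged motion search:

#include "av1/encoder/mcomp.h"

/* Hypothetical caller: attach the second reference's prediction and a wedge
   mask to the search buffers before a compound motion search. */
static void example_setup_compound_search(MSBuffers *ms_buffers,
                                          const uint8_t *second_pred,
                                          const uint8_t *wedge_mask, int bw) {
  /* The mask stride equals the block width here; invert_mask = 0 keeps the
     mask polarity as-is. */
  av1_set_ms_compound_refs(ms_buffers, second_pred, wedge_mask, bw, 0);
}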
+
+// =============================================================================
+// Fullpixel Motion Search
+// =============================================================================
+// This struct holds fullpixel motion search parameters that should be constant
+// during the search.
+typedef struct {
+  BLOCK_SIZE bsize;
+  // A function pointer to the simd function for fast computation
+  const aom_variance_fn_ptr_t *vfp;
+
+  MSBuffers ms_buffers;
+
+  // WARNING: search_method should be regarded as a private variable and should
+  // not be modified directly, so that it stays in sync with search_sites. To
+  // modify it, use av1_set_mv_search_method.
+  SEARCH_METHODS search_method;
+  const search_site_config *search_sites;
+  FullMvLimits mv_limits;
+
+  int run_mesh_search;    // Sets mesh search unless it got pruned by
+                          // prune_mesh_search.
+  int prune_mesh_search;  // Disables mesh search if the best_mv after a normal
+                          // search is close to the start_mv.
+  int mesh_search_mv_diff_threshold;  // mv diff threshold to enable
+                                      // prune_mesh_search
+  int force_mesh_thresh;  // Forces mesh search if the residue variance is
+                          // higher than the threshold.
+  const struct MESH_PATTERN *mesh_patterns[2];
+
+  // Use a maximum search interval of 4 if true. This helps motion search find
+  // the best motion vector for screen content types.
+  int fine_search_interval;
+
+  int is_intra_mode;
+
+  int fast_obmc_search;
+
+  // For calculating mv cost
+  MV_COST_PARAMS mv_cost_params;
+
+  // Stores the function used to compute the sad. This can be different from
+  // the sdf in vfp (e.g. a downsampled sad instead of sad) to allow a speedup.
+  aom_sad_fn_t sdf;
+  aom_sad_multi_d_fn_t sdx4df;
+  aom_sad_multi_d_fn_t sdx3df;
+} FULLPEL_MOTION_SEARCH_PARAMS;
+
+typedef struct {
+  int err_cost;
+  unsigned int distortion;
+  unsigned int sse;
+} FULLPEL_MV_STATS;
+
+void av1_init_obmc_buffer(OBMCBuffer *obmc_buffer);
+
+void av1_make_default_fullpel_ms_params(
+    FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const struct AV1_COMP *cpi,
+    MACROBLOCK *x, BLOCK_SIZE bsize, const MV *ref_mv, FULLPEL_MV start_mv,
+    const search_site_config search_sites[NUM_DISTINCT_SEARCH_METHODS],
+    SEARCH_METHODS search_method, int fine_search_interval);
+
+/*! Sets the \ref FULLPEL_MOTION_SEARCH_PARAMS to intra mode. */
+void av1_set_ms_to_intra_mode(FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                              const IntraBCMVCosts *dv_costs);
+
+// Sets up configs for fullpixel DIAMOND / CLAMPED_DIAMOND search method.
+void av1_init_dsmotion_compensation(search_site_config *cfg, int stride,
+                                    int level);
+// Sets up configs for firstpass motion search.
+void av1_init_motion_fpf(search_site_config *cfg, int stride);
+// Sets up configs for NSTEP / NSTEP_8PT motion search method.
+void av1_init_motion_compensation_nstep(search_site_config *cfg, int stride,
+                                        int level);
+// Sets up configs for BIGDIA / FAST_DIAMOND / FAST_BIGDIA
+// motion search method.
+void av1_init_motion_compensation_bigdia(search_site_config *cfg, int stride,
+                                         int level);
+// Sets up configs for HEX or FAST_HEX motion search method.
+void av1_init_motion_compensation_hex(search_site_config *cfg, int stride,
+                                      int level);
+// Sets up configs for SQUARE motion search method.
+void av1_init_motion_compensation_square(search_site_config *cfg, int stride,
+                                         int level);
+
+/*! Function pointer type for the search site config initialization functions
+ * of the different search methods. */
+typedef void (*av1_init_search_site_config)(search_site_config *cfg,
+                                            int stride, int level);
+
+/*! Array of function pointers used to set the motion search config. */
+extern const av1_init_search_site_config
+    av1_init_motion_compensation[NUM_DISTINCT_SEARCH_METHODS];
+
+// Table mapping each search method to the method whose search candidates it
+// shares; the methods differ only in the number of search steps.
+static const SEARCH_METHODS search_method_lookup[NUM_SEARCH_METHODS] = {
+  DIAMOND,          // DIAMOND
+  NSTEP,            // NSTEP
+  NSTEP_8PT,        // NSTEP_8PT
+  CLAMPED_DIAMOND,  // CLAMPED_DIAMOND
+  HEX,              // HEX
+  BIGDIA,           // BIGDIA
+  SQUARE,           // SQUARE
+  HEX,              // FAST_HEX
+  BIGDIA,           // FAST_DIAMOND
+  BIGDIA,           // FAST_BIGDIA
+  BIGDIA            // VFAST_DIAMOND
+};
+
+// Reinitialize the search site config.
+static AOM_INLINE void av1_refresh_search_site_config(
+    search_site_config *ss_cfg_buf, SEARCH_METHODS search_method,
+    const int ref_stride) {
+  const int level =
+      search_method == NSTEP_8PT || search_method == CLAMPED_DIAMOND;
+  search_method = search_method_lookup[search_method];
+  av1_init_motion_compensation[search_method](&ss_cfg_buf[search_method],
+                                              ref_stride, level);
+}
+
+static INLINE void av1_set_mv_search_method(
+    FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+    const search_site_config search_sites[NUM_DISTINCT_SEARCH_METHODS],
+    SEARCH_METHODS search_method) {
+  ms_params->search_method = search_method;
+  ms_params->search_sites =
+      &search_sites[search_method_lookup[ms_params->search_method]];
+}
+
+// Set up limit values for MV components.
+// Mvs beyond the range do not produce new/different prediction blocks.
+static INLINE void av1_set_mv_row_limits(
+    const CommonModeInfoParams *const mi_params, FullMvLimits *mv_limits,
+    int mi_row, int mi_height, int border) {
+  const int min1 = -(mi_row * MI_SIZE + border - 2 * AOM_INTERP_EXTEND);
+  const int min2 = -(((mi_row + mi_height) * MI_SIZE) + 2 * AOM_INTERP_EXTEND);
+  mv_limits->row_min = AOMMAX(min1, min2);
+  const int max1 = (mi_params->mi_rows - mi_row - mi_height) * MI_SIZE +
+                   border - 2 * AOM_INTERP_EXTEND;
+  const int max2 =
+      (mi_params->mi_rows - mi_row) * MI_SIZE + 2 * AOM_INTERP_EXTEND;
+  mv_limits->row_max = AOMMIN(max1, max2);
+}
+
+static INLINE void av1_set_mv_col_limits(
+    const CommonModeInfoParams *const mi_params, FullMvLimits *mv_limits,
+    int mi_col, int mi_width, int border) {
+  const int min1 = -(mi_col * MI_SIZE + border - 2 * AOM_INTERP_EXTEND);
+  const int min2 = -(((mi_col + mi_width) * MI_SIZE) + 2 * AOM_INTERP_EXTEND);
+  mv_limits->col_min = AOMMAX(min1, min2);
+  const int max1 = (mi_params->mi_cols - mi_col - mi_width) * MI_SIZE +
+                   border - 2 * AOM_INTERP_EXTEND;
+  const int max2 =
+      (mi_params->mi_cols - mi_col) * MI_SIZE + 2 * AOM_INTERP_EXTEND;
+  mv_limits->col_max = AOMMIN(max1, max2);
+}
+
+static INLINE void av1_set_mv_limits(
+    const CommonModeInfoParams *const mi_params, FullMvLimits *mv_limits,
+    int mi_row, int mi_col, int mi_height, int mi_width, int border) {
+  av1_set_mv_row_limits(mi_params, mv_limits, mi_row, mi_height, border);
+  av1_set_mv_col_limits(mi_params, mv_limits, mi_col, mi_width, border);
+}
+
+void av1_set_mv_search_range(FullMvLimits *mv_limits, const MV *mv);
+
+int av1_init_search_range(int size);
+
+unsigned int av1_int_pro_motion_estimation(
+    const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
+    int mi_col, const MV *ref_mv, unsigned int *y_sad_zero,
+    int me_search_size_col, int me_search_size_row);
+
+int av1_refining_search_8p_c(const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                             const FULLPEL_MV start_mv, FULLPEL_MV *best_mv);
+
+int av1_full_pixel_search(const FULLPEL_MV start_mv,
+                          const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                          const int step_param, int *cost_list,
+                          FULLPEL_MV *best_mv, FULLPEL_MV_STATS *best_mv_stats,
+                          FULLPEL_MV *second_best_mv);
+
+int av1_intrabc_hash_search(const struct 
AV1_COMP *cpi, const MACROBLOCKD *xd, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + IntraBCHashInfo *intrabc_hash_info, + FULLPEL_MV *best_mv); + +int av1_obmc_full_pixel_search(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int step_param, FULLPEL_MV *best_mv); + +static INLINE int av1_is_fullmv_in_range(const FullMvLimits *mv_limits, + FULLPEL_MV mv) { + return (mv.col >= mv_limits->col_min) && (mv.col <= mv_limits->col_max) && + (mv.row >= mv_limits->row_min) && (mv.row <= mv_limits->row_max); +} +// ============================================================================= +// Subpixel Motion Search +// ============================================================================= +enum { + EIGHTH_PEL, + QUARTER_PEL, + HALF_PEL, + FULL_PEL +} UENUM1BYTE(SUBPEL_FORCE_STOP); + +typedef struct { + const aom_variance_fn_ptr_t *vfp; + SUBPEL_SEARCH_TYPE subpel_search_type; + // Source and reference buffers + MSBuffers ms_buffers; + int w, h; +} SUBPEL_SEARCH_VAR_PARAMS; + +// This struct holds subpixel motion search parameters that should be constant +// during the search +typedef struct { + // High level motion search settings + int allow_hp; + const int *cost_list; + SUBPEL_FORCE_STOP forced_stop; + int iters_per_step; + SubpelMvLimits mv_limits; + + // For calculating mv cost + MV_COST_PARAMS mv_cost_params; + + // Distortion calculation params + SUBPEL_SEARCH_VAR_PARAMS var_params; +} SUBPEL_MOTION_SEARCH_PARAMS; + +void av1_make_default_subpel_ms_params(SUBPEL_MOTION_SEARCH_PARAMS *ms_params, + const struct AV1_COMP *cpi, + const MACROBLOCK *x, BLOCK_SIZE bsize, + const MV *ref_mv, const int *cost_list); + +typedef int(fractional_mv_step_fp)(MACROBLOCKD *xd, const AV1_COMMON *const cm, + const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, + MV start_mv, + const FULLPEL_MV_STATS *start_mv_stats, + MV *bestmv, int *distortion, + unsigned int *sse1, + int_mv *last_mv_search_list); + +extern fractional_mv_step_fp av1_find_best_sub_pixel_tree; +extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned; +extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned_more; +extern fractional_mv_step_fp av1_return_max_sub_pixel_mv; +extern fractional_mv_step_fp av1_return_min_sub_pixel_mv; +extern fractional_mv_step_fp av1_find_best_obmc_sub_pixel_tree_up; + +unsigned int av1_refine_warped_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm, + const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, + BLOCK_SIZE bsize, const int *pts0, + const int *pts_inref0, int total_samples, + WARP_SEARCH_METHOD search_method, + int num_iterations); + +static INLINE void av1_set_fractional_mv(int_mv *fractional_best_mv) { + for (int z = 0; z < 3; z++) { + fractional_best_mv[z].as_int = INVALID_MV; + } +} + +static INLINE void av1_set_subpel_mv_search_range(SubpelMvLimits *subpel_limits, + const FullMvLimits *mv_limits, + const MV *ref_mv) { + const int max_mv = GET_MV_SUBPEL(MAX_FULL_PEL_VAL); + int minc = AOMMAX(GET_MV_SUBPEL(mv_limits->col_min), ref_mv->col - max_mv); + int maxc = AOMMIN(GET_MV_SUBPEL(mv_limits->col_max), ref_mv->col + max_mv); + int minr = AOMMAX(GET_MV_SUBPEL(mv_limits->row_min), ref_mv->row - max_mv); + int maxr = AOMMIN(GET_MV_SUBPEL(mv_limits->row_max), ref_mv->row + max_mv); + + maxc = AOMMAX(minc, maxc); + maxr = AOMMAX(minr, maxr); + + subpel_limits->col_min = AOMMAX(MV_LOW + 1, minc); + subpel_limits->col_max = AOMMIN(MV_UPP - 1, maxc); + subpel_limits->row_min = AOMMAX(MV_LOW + 1, minr); + subpel_limits->row_max = AOMMIN(MV_UPP - 1, maxr); +} + 
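A numeric sketch of av1_set_subpel_mv_search_range above, assuming GET_MV_SUBPEL multiplies fullpel units by 8 (mvs are stored in 1/8-pel units) and MAX_FULL_PEL_VAL == 1023 as defined in mcomp_structs.h; the limits and the ref mv are made up for illustration:

#include <stdio.h>

#define AOMMAX(a, b) ((a) > (b) ? (a) : (b))
#define AOMMIN(a, b) ((a) < (b) ? (a) : (b))
#define GET_MV_SUBPEL(x) ((x) * 8) /* assumed fullpel -> 1/8-pel scaling */
#define MAX_FULL_PEL_VAL 1023      /* from mcomp_structs.h */

int main(void) {
  const int col_min_full = -16, col_max_full = 16; /* hypothetical limits */
  const int ref_col = 40;                          /* hypothetical ref mv */
  const int max_mv = GET_MV_SUBPEL(MAX_FULL_PEL_VAL);
  const int minc = AOMMAX(GET_MV_SUBPEL(col_min_full), ref_col - max_mv);
  const int maxc = AOMMIN(GET_MV_SUBPEL(col_max_full), ref_col + max_mv);
  /* The fullpel limits dominate here: the column component may only move
     within [-128, 128] in 1/8-pel units. */
  printf("subpel col range = [%d, %d]\n", minc, maxc);
  return 0;
}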
+static INLINE int av1_is_subpelmv_in_range(const SubpelMvLimits *mv_limits, + MV mv) { + return (mv.col >= mv_limits->col_min) && (mv.col <= mv_limits->col_max) && + (mv.row >= mv_limits->row_min) && (mv.row <= mv_limits->row_max); +} + +static INLINE int get_offset_from_fullmv(const FULLPEL_MV *mv, int stride) { + return mv->row * stride + mv->col; +} + +static INLINE const uint8_t *get_buf_from_fullmv(const struct buf_2d *buf, + const FULLPEL_MV *mv) { + return &buf->buf[get_offset_from_fullmv(mv, buf->stride)]; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_MCOMP_H_ diff --git a/third_party/aom/av1/encoder/mcomp_structs.h b/third_party/aom/av1/encoder/mcomp_structs.h new file mode 100644 index 0000000000..06660cf4a6 --- /dev/null +++ b/third_party/aom/av1/encoder/mcomp_structs.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2022, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_MCOMP_STRUCTS_H_ +#define AOM_AV1_ENCODER_MCOMP_STRUCTS_H_ + +#include "av1/common/mv.h" + +// The maximum number of steps in a step search given the largest +// allowed initial step +#define MAX_MVSEARCH_STEPS 11 +// Max full pel mv specified in the unit of full pixel +// Enable the use of motion vector in range [-1023, 1023]. +#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS - 1)) - 1) +// Maximum size of the first step in full pel units +#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS - 1)) +// Maximum number of neighbors to scan per iteration during +// WARPED_CAUSAL refinement +// Note: The elements of warp_search_config.neighbor_mask must be at least +// MAX_WARP_SEARCH_NEIGHBORS many bits wide. So the type may need to be +// widened if this value is increased. +#define MAX_WARP_SEARCH_NEIGHBORS 8 + +#define SEARCH_RANGE_8P 3 +#define SEARCH_GRID_STRIDE_8P (2 * SEARCH_RANGE_8P + 1) +#define SEARCH_GRID_CENTER_8P \ + (SEARCH_RANGE_8P * SEARCH_GRID_STRIDE_8P + SEARCH_RANGE_8P) + +typedef struct { + FULLPEL_MV coord; + int coord_offset; +} search_neighbors; +// motion search site +typedef struct search_site { + FULLPEL_MV mv; + int offset; +} search_site; + +typedef struct search_site_config { + search_site site[MAX_MVSEARCH_STEPS * 2][16 + 1]; + // Number of search steps. + int num_search_steps; + int searches_per_step[MAX_MVSEARCH_STEPS * 2]; + int radius[MAX_MVSEARCH_STEPS * 2]; + int stride; +} search_site_config; + +enum { + // Search 8-points in the radius grid around center, up to 11 search stages. + DIAMOND = 0, + // Search 12-points in the radius/tan_radius grid around center, + // up to 15 search stages. + NSTEP = 1, + // Search 8-points in the radius grid around center, up to 16 search stages. + NSTEP_8PT = 2, + // Search 8-points in the radius grid around center, upto 11 search stages + // with clamping of search radius. + CLAMPED_DIAMOND = 3, + // Search maximum 8-points in the radius grid around center, + // up to 11 search stages. First stage consists of 8 search points + // and the rest with 6 search points each in hex shape. 
+ HEX = 4, + // Search maximum 8-points in the radius grid around center, + // up to 11 search stages. First stage consists of 4 search + // points and the rest with 8 search points each. + BIGDIA = 5, + // Search 8-points in the square grid around center, up to 11 search stages. + SQUARE = 6, + // HEX search with up to 2 stages. + FAST_HEX = 7, + // BIGDIA search with up to 2 stages. + FAST_DIAMOND = 8, + // BIGDIA search with up to 3 stages. + FAST_BIGDIA = 9, + // BIGDIA search with up to 1 stage. + VFAST_DIAMOND = 10, + // Total number of search methods. + NUM_SEARCH_METHODS, + // Number of distinct search methods. + NUM_DISTINCT_SEARCH_METHODS = SQUARE + 1, +} UENUM1BYTE(SEARCH_METHODS); + +typedef struct warp_search_config { + int num_neighbors; + MV neighbors[MAX_WARP_SEARCH_NEIGHBORS]; + // Bitmask which is used to prune the search neighbors at one iteration + // based on which direction we chose in the previous iteration. + // See comments in av1_refine_warped_mv for details. + uint8_t neighbor_mask[MAX_WARP_SEARCH_NEIGHBORS]; +} warp_search_config; + +// Methods for refining WARPED_CAUSAL motion vectors +enum { + // Search 4 adjacent points in a diamond shape at each iteration + WARP_SEARCH_DIAMOND, + // Search 8 adjacent points in a square at each iteration + WARP_SEARCH_SQUARE, + WARP_SEARCH_METHODS +} UENUM1BYTE(WARP_SEARCH_METHOD); + +#endif // AOM_AV1_ENCODER_MCOMP_STRUCTS_H_ diff --git a/third_party/aom/av1/encoder/misc_model_weights.h b/third_party/aom/av1/encoder/misc_model_weights.h new file mode 100644 index 0000000000..f00aeabcf6 --- /dev/null +++ b/third_party/aom/av1/encoder/misc_model_weights.h @@ -0,0 +1,696 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_MISC_MODEL_WEIGHTS_H_ +#define AOM_AV1_ENCODER_MISC_MODEL_WEIGHTS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/ml.h" + +#define MV_PREC_FEATURE_SIZE 18 + +#define NUM_DNN_LAYERS 1 +#define NUM_DNN_FEATURES MV_PREC_FEATURE_SIZE +#define MV_PREC_LAYER_SIZE_0 32 +#define NUM_LOGITS 1 + +const float av1_mv_prec_mean[MV_PREC_FEATURE_SIZE] = { 143.67358891063745f, + 141.6251917346238f, + 0.36313633945679064f, + 0.0028162791958822085f, + 0.000484820537626698f, + 0.002769969388939025f, + 0.0f, + 0.00031274626720947577f, + 0.00020578555375160075f, + 0.0007075246732697733f, + 0.000539641029909925f, + 0.0013939401375906984f, + 4.985394760423499f, + 4.985394760423499f, + 4.9992148717283085f, + 5.143739822380163f, + 5.518483124004564f, + 87.63597847427077f }; + +const float av1_mv_prec_std[MV_PREC_FEATURE_SIZE] = { 66.86256140247244f, + 68.04472572607503f, + 13.23247674430399f, + 0.0029123438396921955f, + 0.0015331406169374737f, + 0.0029149813096313775f, + 1.0f, + 0.00047501102871357813f, + 0.00030025962993117947f, + 0.0009861163580391207f, + 0.0012157593528004055f, + 0.002004954948490521f, + 6.539447500484038f, + 6.539447500484038f, + 6.396589058279465f, + 3.4870155874262516f, + 3.8911353973740535f, + 112.07985259573601f }; + +const float av1_mv_prec_nn_weights_layer_0[] = { -0.13008492159557145f, + -0.1483527373474774f, + 0.08112076098858864f, + -0.9582568679627453f, + -0.34794757171071206f, + 0.6465225723304947f, + 0.0f, + 0.06754171885839604f, + 0.27156803620541214f, + 0.10635231245664407f, + -0.031183926995968583f, + 0.048122572260291f, + -0.19498534230045128f, + -0.2614116319273316f, + -0.3223762845136331f, + -1.2063368350609205f, + -0.523333556911706f, + 1.075632260890728f, + 0.48989726814387946f, + -0.34816466111070477f, + 0.41668357610256473f, + -1.0973562848791671f, + 0.04183921854389494f, + -0.9123815389260476f, + 0.0f, + 0.859965047744027f, + 0.1962095804679813f, + 0.2606564339077058f, + 0.26695868715184895f, + 0.5319308568326692f, + -0.23717505799723165f, + -0.43127224481782567f, + -0.3214545776203726f, + 0.5850852241402176f, + -0.26705531612587813f, + -0.5786016766610093f, + 0.9360519909983003f, + 0.20771329289016555f, + -0.027614159544811823f, + -1.175022807046164f, + -0.07578967497693835f, + 0.6890172485324256f, + 0.0f, + -0.008008338164988263f, + -0.08064800010158935f, + -0.22606910981666667f, + 0.4541586669210879f, + 0.07731527661370792f, + -0.6744475941247964f, + -0.2625842448396184f, + 1.7018613444303785f, + -0.08622229073162656f, + 0.041858142814941275f, + -0.24575964090386415f, + -0.046626044730994964f, + 0.7608713064175202f, + -0.23330119070907146f, + -0.10115510984500826f, + 0.9722537349192069f, + 0.11718554254290829f, + 0.0f, + 0.2075123446014759f, + 0.09465167310768637f, + 0.7609896851963016f, + 0.4441038581385328f, + 0.26064144727430955f, + -0.14678625366485035f, + -0.03597014452200524f, + 0.3128680867196166f, + 1.102496797385966f, + 0.06642253233084111f, + -1.2665494483407629f, + 0.09049412632000911f, + -1.1160621999565095f, + 0.043420275255913035f, + -0.8811412259978966f, + 0.21076234632287777f, + 0.16571534463543866f, + 0.0f, + -0.7324075176473275f, + -0.3677622514459495f, + 0.3273532243056415f, + 0.22922161936797775f, + 0.8204766691058087f, + 0.02982161033720488f, + 0.5266419954188112f, + -1.0032154963302191f, + 0.7007602969763729f, + 0.37196355167990885f, + -0.7608579453228548f, + 0.08568111584781847f, + 0.07011061059123677f, + 0.3233263598082507f, + -0.08249928295410253f, + 0.08220165761319252f, + 
0.22148722752246794f, + 0.0f, + 0.6122392701743506f, + -0.26429838296378333f, + 0.31958081620005463f, + -0.006027177397853826f, + -0.3088310785887994f, + -0.5436192046707807f, + -0.011080356757423306f, + 0.12632650770008413f, + -0.45097913215234525f, + 1.8008072867127298f, + -0.7630029654575501f, + -0.4054774329826579f, + 0.40386074452544535f, + -0.18541426257453025f, + 0.2444879765079863f, + -0.6216724756115081f, + 0.27030299321302f, + 0.0f, + -0.6835848952967989f, + -0.7914184320964815f, + -0.6761595019582928f, + -1.009565565604081f, + -0.1904242439353305f, + 0.4463417126318631f, + 0.6025503823452971f, + 0.5149990860115566f, + 1.0242970663937634f, + 0.037947306826401385f, + 0.07039339786212848f, + 0.14273796789711987f, + 0.168103961425691f, + 1.6596066376811978f, + 0.19321092229384657f, + -0.3710750388148514f, + -0.01717015559410288f, + 0.0f, + 0.3005688477942597f, + 0.23877080653829577f, + 0.2718594552971173f, + 0.3885402571589898f, + 0.32999531945669247f, + -0.6134460954213243f, + -0.13972265462799183f, + -0.07180089575716991f, + -1.014572598188105f, + 0.0717207322809836f, + 0.34896157745155615f, + -0.27127687591403f, + -0.5058651212773623f, + -1.5442435628306925f, + -0.6399784724734707f, + 0.6274301429074947f, + -0.4645750072767051f, + 0.0f, + -0.2406726815244178f, + -0.06321214115916597f, + 0.312856714253404f, + 0.16459514124116134f, + 0.3993579604809623f, + -0.15232044351561913f, + -0.5613743948568469f, + 0.7219801372223262f, + 0.2936857469624009f, + 0.7823466656034087f, + -0.12416947814098349f, + -0.36413756654028345f, + -0.07992098796866462f, + -0.7395722879842416f, + 0.8639913543220514f, + -0.311931773757945f, + -1.7308240470400613f, + 0.0f, + 0.394499716712104f, + 0.6511462819539963f, + -0.0722425275974144f, + 0.13490818194661386f, + 0.055319135836378035f, + 0.15389577508097013f, + 0.28958598328870605f, + -0.14608429470539772f, + 0.09488817462478298f, + -0.17231294096622088f, + 0.6721115415911466f, + -0.05664621150536103f, + 0.03291799673669331f, + 0.02845382711057482f, + -0.9953563446999164f, + -0.17994298220605923f, + 0.6560824519337476f, + 0.0f, + -0.30990646375917935f, + 0.17215517202874f, + 0.2026816225170481f, + 0.22011958747715601f, + 0.3562520768889686f, + -0.18436559057189175f, + 0.1733377147302066f, + 0.02818276995640877f, + -0.29703005574859076f, + -0.3310652639215064f, + -1.6091173258529277f, + 0.45461585790028003f, + -0.5078643334592593f, + -0.338997374732338f, + 0.4688619590359733f, + 0.627099126828289f, + -0.5249801376494249f, + 0.0f, + 0.34465498218272883f, + 0.009891680630908135f, + -0.27244020967349f, + 0.05404589867626979f, + -0.06220329325739666f, + -0.13365376464759104f, + -0.13098573553512366f, + 0.11434198976289106f, + 0.6740951247574676f, + 1.3381727185724581f, + -1.4865773213251936f, + 0.05809898701966341f, + 0.25380780261023456f, + 1.2716367496512722f, + 0.1768290070780598f, + -0.07554828135356352f, + 0.8180570085344856f, + 0.0f, + 1.0788448980077463f, + 0.0651938742459459f, + 0.3807672030015587f, + 0.6144792680268445f, + 0.011660612214908059f, + -0.018306023765580288f, + 0.44140813809926516f, + -0.13411994195502386f, + 0.15920368955127778f, + -0.19382358417849888f, + -0.08802147969690055f, + -0.019731052733814477f, + 0.1104744229169665f, + -0.195834419735958f, + -0.5005295046454347f, + -0.17041241868229032f, + -0.471942117351489f, + 0.0f, + -0.3599073304761372f, + -0.2745532782968519f, + -0.8323064841106417f, + -0.88355885384943f, + -0.02826466859020679f, + 0.06977870308805256f, + 0.11926112095374196f, + 1.367382707959643f, + -0.06119843162964051f, + 
-0.5331395268889569f, + -1.2155531584240624f, + -0.01896651779524327f, + 0.10591845408571081f, + -0.010632842156504733f, + 0.6150787968629282f, + -0.4191690185896091f, + -0.9961718918346271f, + 0.0f, + 0.23370364516013867f, + 0.4156033072362998f, + 0.1261005546633433f, + 0.0812413884532226f, + -0.008894337353937203f, + 0.07984447025056046f, + -0.1258098052766725f, + -0.40245475467767916f, + 1.78188906675019f, + -1.1544387954232302f, + -0.41768781481273387f, + 0.6791211165341995f, + -0.4175127856183446f, + -0.07353219159767788f, + -0.2888813577574072f, + -0.7107767892597061f, + -1.0450031091195449f, + 0.0f, + -0.9221599545079143f, + -0.6747876356740621f, + 0.30241454354872105f, + 0.4924965303373908f, + -0.14042722740054084f, + 0.27744210409350445f, + -0.14788270997426836f, + -0.9081467469237995f, + -0.04513115674995093f, + -0.5254168669125793f, + -0.6999012037974789f, + 0.434661246306547f, + -0.7193303957246092f, + -0.9117952623409744f, + -1.5097267865916142f, + -0.20779888103770922f, + 0.4935562480901218f, + 0.0f, + 0.18303393908923593f, + 0.34753722677570037f, + 0.29291001533177663f, + 0.3832351878354224f, + 0.3295194956120599f, + -0.32398033003617527f, + -0.31570906736433746f, + 0.23657779050372962f, + 0.9510794465234161f, + -0.5122243902568278f, + 0.08652112725315658f, + 0.2246634353717998f, + -0.9032595595582497f, + -0.8936484034533545f, + 0.6012969720865752f, + -0.6454216646117924f, + -1.1753786049658332f, + 0.0f, + -0.4360545677728656f, + -0.6586237455328507f, + -0.34347301697886656f, + -0.8909724651992144f, + -0.24378721818350263f, + 0.6179733359297576f, + 0.0661661181742234f, + -0.14120142044993794f, + -0.07732699885498932f, + 1.0221355882357506f, + 0.44514798994115284f, + -0.7371569579959046f, + -0.7212499572378936f, + 0.7453626921081045f, + 0.5478757761345768f, + -0.39411232789985384f, + 0.7200542656743857f, + 0.0f, + -0.11790869453118827f, + -0.12317030713581928f, + -0.4207902738133338f, + 0.15895105878327986f, + 0.304261777102111f, + 0.11450744587017621f, + -0.11470709991317944f, + 0.5949222371739038f, + 0.6549518619412444f, + -0.24390606570422838f, + -0.4212796009440803f, + -0.6269666206320964f, + -0.5421193969807078f, + -0.12297772128652287f, + 0.021517257619930424f, + 0.25462855095544523f, + -0.22107798187348246f, + 0.0f, + 0.5204516300095662f, + 0.2837402841862462f, + 0.11310823283285916f, + 0.8944351685018025f, + 0.17487203235834015f, + -0.5271221928634433f, + -0.19516594503423199f, + 0.452456617580365f, + 1.2456272242706414f, + 0.24166615894862817f, + 0.09411429305204502f, + -0.2730072283327243f, + -0.8129383770918172f, + -0.24093254193486136f, + 0.5696499174142177f, + -0.11110805836073044f, + -0.3968204166235694f, + 0.0f, + -0.04388165369378549f, + -0.005631266017272595f, + -0.02574211858479705f, + 0.06230399626660669f, + 0.17677671232932785f, + 0.5172871274400965f, + 0.4919150085620063f, + -1.597656637582941f, + 0.02415185715719143f, + -0.17945446376668306f, + -0.39340600199798886f, + 0.25013205256886845f, + 0.05972330340308685f, + 0.1359911505596489f, + -0.02341033271820833f, + 0.15726074644063684f, + 0.47512625913020357f, + 0.0f, + 0.7327341664835779f, + -0.3689092312320013f, + 0.4571824787436036f, + 0.6215465537945456f, + 0.0944111296842023f, + -0.12571956176607574f, + -0.2507235674395462f, + -0.09579602654351593f, + 1.4463357293728496f, + 0.749153535856049f, + -0.5553955120807588f, + -0.09622771929369946f, + -0.2598697420394813f, + -0.964691815299676f, + -0.8289963178173902f, + 0.7112949291983329f, + -0.8667009730492162f, + 0.0f, + -0.48698304169042794f, + 
-0.18786095669893707f, + -0.11425249263203247f, + -0.3693391011684809f, + 0.09933145842585253f, + 0.2568559685298844f, + 0.7048512233651738f, + 0.6056238412407038f, + -0.4355558119826642f, + 0.17318931883915484f, + 0.6481333496429564f, + -0.45728823054344486f, + -0.006325004538589701f, + 0.45609864075494927f, + -0.6199385981116988f, + 0.035105808783046165f, + 0.1203147963894839f, + 0.0f, + 0.383402190836527f, + 0.048429009055370106f, + 0.5887186439275204f, + -0.20538767641607814f, + -0.031237879611002117f, + 0.3140759860883231f, + 0.24447070584999556f, + 0.7271263905705878f, + 0.8432799162434237f, + -0.11530577554199217f, + -0.7781023892314718f, + 0.05359488822710336f, + 0.5624870388700809f, + 0.5134656523208906f, + 0.18304041423438375f, + -0.04237421156328257f, + -0.20759809886942207f, + 0.0f, + -0.06249337454975615f, + 0.10081284533873777f, + 0.3894374350259183f, + 1.518217777528342f, + -0.9100037950171563f, + 0.17796906121831477f, + -0.2892167255357892f, + 0.6117902467884032f, + 0.13332120964959573f, + -0.3487155932849374f, + -0.32920583745734694f, + 0.08242631209809854f, + -0.24920225708110588f, + 0.8401757259392635f, + 0.11729108681358365f, + 0.11222925752499184f, + -0.027078490721459958f, + 0.0f, + 0.726132375517389f, + 0.72220359881096f, + 0.5721582611845177f, + 0.15139162075524315f, + 0.6676549461551197f, + -0.321449586554697f, + -0.10141104515219895f, + -0.09711123988777906f, + 0.9623356184776928f, + -0.7941822373167173f, + -0.9373923554119346f, + 0.4573241832354059f, + -0.42029139056126147f, + 0.2675223459380999f, + -0.5487300191551386f, + 0.2236621891916084f, + 0.11692039230044018f, + 0.0f, + 0.1758399202780961f, + 0.676447587678781f, + 0.5945412815881029f, + 0.5669863357359594f, + 0.8433565415303922f, + -0.30300550790708036f, + -0.43332881999693673f, + -0.4996522695731392f, + -0.2084930815451962f, + 0.27765278702463786f, + 1.0886848763946915f, + -0.0739433655813831f, + -0.4762801579229192f, + -0.2490825339320731f, + -1.8820479350439439f, + -0.4251592225775914f, + -0.3992922365484464f, + 0.0f, + 0.19598917760218867f, + 0.4860238022746914f, + 0.3364528828641281f, + 0.3350950865226741f, + 0.2773654548632006f, + -0.30547262140782566f, + 0.028649620490728344f, + -0.11763407628280315f, + 0.6237318502627169f, + -0.3958952632477945f, + 0.14797171297835243f, + 0.45821729624747465f, + -0.8687137170773626f, + 0.06989667196937126f, + -0.5752606929478727f, + 0.16986945686358412f, + 0.6925071596817824f, + 0.0f, + 0.4991250796183003f, + 0.03424654896322111f, + 0.6153698611882319f, + 0.5070872444849457f, + 0.43615747516328135f, + -0.7870352838659244f, + -0.6424101231965247f, + -0.7005774876651399f, + 0.79983115431488f, + 0.15720357955596242f, + -1.408372612176309f, + -0.039294695217213765f, + 0.6979415372962309f, + 0.27403316751965656f, + 1.2844596102619275f, + -0.2781534150257364f, + 0.3248437714908865f, + 0.0f, + 0.4364362371752831f, + -0.2548580911485434f, + -0.19578001373349452f, + -0.04597194387828005f, + -0.010035156855533233f, + 0.0415941475251266f, + 0.07929549739797387f, + -0.060629652912508866f, + 0.5977303008711333f, + -1.4404008068066554f, + 0.8555694790197376f, + -0.03693438534401856f, + 0.17761411164512408f, + -0.11858304304109235f, + -1.4241324353471327f, + 0.1533849765389186f, + 0.7650643783126995f, + 0.0f, + -0.0639949379280401f, + 0.4288617817939563f, + 0.4235508646885404f, + 0.3419843254383798f, + -0.015992360660098768f, + -0.773247697505441f, + -0.4908452922015917f, + 0.9868134897291486f, + -0.5078689994742608f, + 1.05632043744864f, + -0.38867419409275117f, + 
-0.0065547696858664194f, + -0.3056003173415037f, + -0.333762331930102f, + 0.4459671174011671f, + 0.08219092584580244f, + -0.08099158579518179f, + 0.0f, + -0.1568180656346373f, + -0.061962372393910135f, + 0.14065868174859464f, + -0.055925712798972765f, + 0.05136117465820622f, + 0.0907831030477633f, + 0.19518110495319604f, + -0.7470794578145956f, + 1.5945999734733545f, + -0.4351697502345834f, + -0.33253649399571805f }; + +const float av1_mv_prec_nn_bias_layer_0[] = { + -0.651213833993862f, -1.1243309933417809f, -0.2123880023097051f, + 0.23095477452877616f, -0.6668057665893545f, 0.3082268148379634f, + -0.3344916753975844f, -0.20920185606857844f, 0.6057933917964854f, + 0.5031857662559803f, -1.5380096313468152f, -0.4457245344804041f, + 1.82368055812373f, 0.7973912064077963f, 0.25706500555622913f, + 0.1394695119825382f, 0.4508811973450553f, -0.5408959545111782f, + 1.064829233697863f, 0.3733268644246235f, 1.1173169029905483f, + -0.2012817466400134f, -0.16628447748302294f, 1.3086000088940826f, + 0.7267092979664235f, -0.9097857006590555f, -0.7564259343863077f, + -0.49844128036716173f, -0.4675729246975423f, -0.03626154526362181f, + -0.41957330902404616f, -0.9658160514319954f +}; + +const float av1_mv_prec_nn_weights_layer_1[] = { + 1.5017296484510276f, 1.044216918060133f, -1.066541411740906f, + -0.7762965171172661f, -0.9814396609661653f, 0.9334065847340715f, + 0.7117244268817873f, -0.7695942296628597f, 0.7892157680137047f, + -0.5786309358654476f, -2.4444494892027264f, 1.1666759262637185f, + -0.9699580532370483f, 0.5849682956422552f, -1.0372272986941953f, + -0.5005014627824439f, 1.1816204711740521f, -1.2204867615892114f, + 0.4510263977504913f, 0.35567865078585165f, -0.7811389330738839f, + -0.6643977800301099f, -0.6283287371705794f, 0.790873821018048f, + 0.8861643352684585f, 0.6438840651522237f, 0.6677191546466089f, + 0.9703715021995785f, 1.250893534236489f, 0.7733742028067933f, + -1.249673977776904f, -1.2890127265725608f +}; + +const float av1_mv_prec_nn_bias_layer_1[] = { -0.341771735378258f }; + +static const NN_CONFIG av1_mv_prec_dnn_config = { + NUM_DNN_FEATURES, + NUM_LOGITS, + NUM_DNN_LAYERS, + { MV_PREC_LAYER_SIZE_0 }, + { + av1_mv_prec_nn_weights_layer_0, + av1_mv_prec_nn_weights_layer_1, + }, + { + av1_mv_prec_nn_bias_layer_0, + av1_mv_prec_nn_bias_layer_1, + }, +}; +#undef NUM_DNN_LAYERS +#undef NUM_DNN_FEATURES +#undef NUM_LAYER_0_UNITS +#undef NUM_LOGITS + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_MISC_MODEL_WEIGHTS_H_ diff --git a/third_party/aom/av1/encoder/ml.c b/third_party/aom/av1/encoder/ml.c new file mode 100644 index 0000000000..94cd56c5d1 --- /dev/null +++ b/third_party/aom/av1/encoder/ml.c @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include <math.h> + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/mathutils.h" +#include "av1/encoder/ml.h" + +void av1_nn_output_prec_reduce(float *const output, int num_output) { + const int prec_bits = 9; + const int prec = 1 << prec_bits; + const float inv_prec = (float)(1.0 / prec); + for (int i = 0; i < num_output; i++) { + output[i] = ((int)(output[i] * prec + 0.5)) * inv_prec; + } +} + +// Calculate prediction based on the given input features and neural net config. +// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden +// layer. +void av1_nn_predict_c(const float *input_nodes, + const NN_CONFIG *const nn_config, int reduce_prec, + float *const output) { + int num_input_nodes = nn_config->num_inputs; + int buf_index = 0; + float buf[2][NN_MAX_NODES_PER_LAYER]; + + // Propagate hidden layers. + const int num_layers = nn_config->num_hidden_layers; + assert(num_layers <= NN_MAX_HIDDEN_LAYERS); + for (int layer = 0; layer < num_layers; ++layer) { + const float *layer_weights = nn_config->weights[layer]; + const float *layer_bias = nn_config->bias[layer]; + float *output_nodes = buf[buf_index]; + const int num_output_nodes = nn_config->num_hidden_nodes[layer]; + assert(num_output_nodes < NN_MAX_NODES_PER_LAYER); + for (int node = 0; node < num_output_nodes; ++node) { + float val = layer_bias[node]; + for (int i = 0; i < num_input_nodes; ++i) + val += layer_weights[node * num_input_nodes + i] * input_nodes[i]; + // ReLU as activation function. + val = val > 0.0f ? val : 0.0f; // Could use AOMMAX(). + output_nodes[node] = val; + } + num_input_nodes = num_output_nodes; + input_nodes = output_nodes; + buf_index = 1 - buf_index; + } + + // Final output layer. + const float *layer_weights = nn_config->weights[num_layers]; + const float *layer_bias = nn_config->bias[num_layers]; + for (int node = 0; node < nn_config->num_outputs; ++node) { + float val = layer_bias[node]; + for (int i = 0; i < num_input_nodes; ++i) + val += layer_weights[node * num_input_nodes + i] * input_nodes[i]; + output[node] = val; + } + if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs); +} + +#if CONFIG_NN_V2 +// Applies the ReLU activation to one fc layer +// output[i] = Max(input[i],0.0f) +static float *nn_relu(const float *input, FC_LAYER *layer) { + for (int i = 0; i < layer->num_outputs; ++i) { + layer->output[i] = AOMMAX(input[i], 0.0f); + } + + return layer->output; +} + +// Applies the Sigmoid activation to one fc layer +// output[i] = 1/(1+exp(-input[i])) +static float *nn_sigmoid(const float *input, FC_LAYER *layer) { + for (int i = 0; i < layer->num_outputs; ++i) { + const float tmp = AOMMIN(AOMMAX(input[i], -10.0f), 10.0f); + layer->output[i] = 1.0f / (1.0f + expf(-tmp)); + } + + return layer->output; +} + +// Forward prediction in one fc layer, used in function av1_nn_predict_V2 +static float *nn_fc_forward(const float *input, FC_LAYER *layer) { + const float *weights = layer->weights; + const float *bias = layer->bias; + assert(layer->num_outputs < NN_MAX_NODES_PER_LAYER); + // fc + for (int node = 0; node < layer->num_outputs; ++node) { + float val = bias[node]; + for (int i = 0; i < layer->num_inputs; ++i) val += weights[i] * input[i]; + layer->output[node] = val; + weights += layer->num_inputs; + } + + // activation + switch (layer->activation) { + case NONE: return layer->output; + case RELU: return nn_relu(layer->output, layer); + case SIGMOID: return nn_sigmoid(layer->output, layer); + case SOFTSIGN: + assert(0 && "Softsign has not 
been supported in NN."); // TO DO + return NULL; + default: + assert(0 && "Unknown activation"); // Unknown activation + return NULL; + } +} + +void av1_nn_predict_v2(const float *feature, NN_CONFIG_V2 *nn_config, + int reduce_prec, float *output) { + const float *input_nodes = feature; + + // Propagate the layers. + const int num_layers = nn_config->num_hidden_layers; + assert(num_layers <= NN_MAX_HIDDEN_LAYERS); + for (int i = 0; i < num_layers; ++i) { + input_nodes = nn_fc_forward(input_nodes, nn_config->layer + i); + assert(nn_config->layer[i + 1].num_inputs == + nn_config->layer[i].num_outputs); + } + + // Final layer + input_nodes = nn_fc_forward(input_nodes, nn_config->layer + num_layers); + assert(nn_config->layer[num_layers].num_outputs == nn_config->num_logits); + // Copy the final layer output + memcpy(output, input_nodes, sizeof(*input_nodes) * nn_config->num_logits); + if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_logits); +} +#endif // CONFIG_NN_V2 + +void av1_nn_softmax(const float *input, float *output, int n) { + // Softmax function is invariant to adding the same constant + // to all input values, so we subtract the maximum input to avoid + // possible overflow. + float max_input = input[0]; + for (int i = 1; i < n; i++) max_input = AOMMAX(max_input, input[i]); + float sum_out = 0.0f; + for (int i = 0; i < n; i++) { + // Clamp to range [-10.0, 0.0] to prevent FE_UNDERFLOW errors. + const float normalized_input = AOMMAX(input[i] - max_input, -10.0f); + output[i] = expf(normalized_input); + sum_out += output[i]; + } + for (int i = 0; i < n; i++) output[i] /= sum_out; +} + +void av1_nn_fast_softmax_16_c(const float *input, float *output) { + const int kNumClasses = 16; + float max_input = input[0]; + for (int i = 1; i < kNumClasses; i++) max_input = AOMMAX(max_input, input[i]); + float sum_out = 0.0f; + for (int i = 0; i < kNumClasses; i++) { + // Clamp to range [-10.0, 0.0] to prevent FE_UNDERFLOW errors. + const float normalized_input = AOMMAX(input[i] - max_input, -10.0f); + output[i] = approx_exp(normalized_input); + sum_out += output[i]; + } + for (int i = 0; i < kNumClasses; i++) output[i] /= sum_out; +} diff --git a/third_party/aom/av1/encoder/ml.h b/third_party/aom/av1/encoder/ml.h new file mode 100644 index 0000000000..566f9271dd --- /dev/null +++ b/third_party/aom/av1/encoder/ml.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_ML_H_ +#define AOM_AV1_ENCODER_ML_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "config/av1_rtcd.h" + +#define NN_MAX_HIDDEN_LAYERS 10 +#define NN_MAX_NODES_PER_LAYER 128 + +struct NN_CONFIG { + int num_inputs; // Number of input nodes, i.e. features. + int num_outputs; // Number of output nodes. + int num_hidden_layers; // Number of hidden layers, maximum 10. + // Number of nodes for each hidden layer. + int num_hidden_nodes[NN_MAX_HIDDEN_LAYERS]; + // Weight parameters, indexed by layer. + const float *weights[NN_MAX_HIDDEN_LAYERS + 1]; + // Bias parameters, indexed by layer. 
+ const float *bias[NN_MAX_HIDDEN_LAYERS + 1]; +}; +// Typedef from struct NN_CONFIG to NN_CONFIG is in rtcd_defs + +#if CONFIG_NN_V2 +// Fully-connected layer configuration +struct FC_LAYER { + const int num_inputs; // Number of input nodes, i.e. features. + const int num_outputs; // Number of output nodes. + + float *weights; // Weight parameters. + float *bias; // Bias parameters. + const ACTIVATION activation; // Activation function. + + float *output; // The output array. + float *dY; // Gradient of outputs. + float *dW; // Gradient of weights. + float *db; // Gradient of bias. +}; + +// NN configuration structure V2 +struct NN_CONFIG_V2 { + const int num_hidden_layers; // Number of hidden layers, max = 10. + FC_LAYER layer[NN_MAX_HIDDEN_LAYERS + 1]; // The layer array + const int num_logits; // Number of output nodes. + float *logits; // Raw prediction (same as output of final layer) + const LOSS loss; // Loss function +}; + +// Calculate prediction based on the given input features and neural net config. +// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden +// layer. +void av1_nn_predict_v2(const float *features, NN_CONFIG_V2 *nn_config, + int reduce_prec, float *output); +#endif // CONFIG_NN_V2 + +// Applies the softmax normalization function to the input +// to get a valid probability distribution in the output: +// output[i] = exp(input[i]) / sum_{k \in [0,n)}(exp(input[k])) +void av1_nn_softmax(const float *input, float *output, int n); + +// A faster but less accurate version of av1_nn_softmax(input, output, 16) +void av1_nn_fast_softmax_16_c(const float *input, float *output); + +// Applies a precision reduction to output of av1_nn_predict to prevent +// mismatches between C and SIMD implementations. +void av1_nn_output_prec_reduce(float *const output, int num_output); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_ML_H_ diff --git a/third_party/aom/av1/encoder/mode_prune_model_weights.h b/third_party/aom/av1/encoder/mode_prune_model_weights.h new file mode 100644 index 0000000000..98ec36808a --- /dev/null +++ b/third_party/aom/av1/encoder/mode_prune_model_weights.h @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_MODE_PRUNE_MODEL_WEIGHTS_H_ +#define AOM_AV1_ENCODER_MODE_PRUNE_MODEL_WEIGHTS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#define NUM_HIDDEN_LAYERS_12 1 +#define NUM_FEATURES_12 6 +#define NUM_LAYER_0_UNITS_12 24 +#define NUM_LOGITS_12 2 + +static const float av1_intrap_hiddenlayer_0_kernel_12[] = { + 7.28372f, -1.3333898f, -1.3180022f, -0.007156151f, -0.40799126f, + -0.57538104f, -31.81647f, 6.7057495f, 6.351472f, -0.029544508f, + 0.026801195f, 1.12863f, -0.70769817f, -0.24183524f, 0.0649113f, + -0.7189517f, 0.21791299f, 0.12840256f, -0.56424767f, 0.16924907f, + 0.4605501f, -0.170895f, -0.60358995f, -0.15383226f, -4.0523643f, + 0.6961917f, 1.3100256f, -0.4189354f, 0.37264112f, -0.14555685f, + 10.628014f, 8.184437f, 8.941916f, -0.011731001f, -0.45127156f, + 0.42704004f, 36.84277f, 8.988796f, 8.844238f, 0.00030091056f, + -0.022038324f, 1.3566176f, -8.863219f, -0.84811693f, -1.0908632f, + 0.00023130262f, -1.0698471f, -6.755927f, 7.1711984f, 4.7216063f, + 3.5099216f, -0.6650184f, 0.5935173f, -0.6696286f, 11.8595295f, + 0.3001874f, 0.29822728f, 0.04319222f, -1.203178f, 1.1210147f, + 0.035045594f, -0.20559944f, -0.015388541f, -0.7857941f, -0.94100875f, + -0.1278549f, -19.22603f, 7.9466896f, 6.5048656f, -0.22195444f, + 0.19061874f, 1.3927288f, -8.896529f, -0.48146892f, -1.6098932f, + -0.0030235797f, -0.6533787f, -2.1333003f, -22.256454f, -4.934058f, + -4.4707212f, -0.015831878f, -0.4243649f, -2.776269f, -0.23762038f, + 0.1820098f, -0.51865315f, -1.1893421f, 0.34969202f, 0.10636194f, + 14.545696f, 1.3849198f, 2.6815193f, -0.5145498f, 0.45948258f, + -0.8842355f, -0.9111363f, -0.39652422f, 0.077266276f, -0.68084997f, + 0.4593515f, -0.28872707f, -6.936231f, 1.12253f, 1.7616503f, + -0.014069137f, -0.0052156276f, -4.5095444f, 6.2076726f, -0.058755957f, + -0.4675936f, -0.13039507f, 0.12094394f, -0.07285393f, 68.26125f, + 7.4893136f, 8.770954f, 0.020274093f, -0.027877754f, 1.6579602f, + -0.1825479f, 0.34832543f, 0.07472531f, -0.44812247f, -1.0941806f, + -0.16749863f, 1.1394324f, 0.47983396f, -0.99983627f, -0.00064249727f, + -1.3345739f, -0.057157427f, -18.14875f, 16.506035f, 15.539248f, + 0.013191509f, -0.021674965f, -25.006235f, 0.51220596f, 0.7334426f, + 0.81836903f, -1.0443225f, 0.4459505f, -1.2045046f +}; + +static const float av1_intrap_hiddenlayer_0_bias_12[] = { + -4.154915f, 14.33833f, 0.0f, 0.0f, 2.0440118f, 12.40922f, + -16.77514f, 0.5879813f, 3.2305415f, 0.8303539f, 0.0f, 14.488708f, + 2.94393f, 1.874383f, 0.0f, -0.53140444f, 0.0f, 1.8456234f, + -0.55427986f, -19.856262f, 0.0f, 0.17281002f, 48.31631f, 0.0f +}; + +static const float av1_intrap_logits_kernel_12[] = { + 0.26843873f, -0.09576241f, 0.34427166f, 0.09914787f, -0.10275399f, + 0.02999484f, -0.1467772f, 0.11594324f, 0.29200763f, 0.0067976206f, + 0.050393578f, -0.018694371f, 0.3333476f, 0.2127221f, 0.35128218f, + 0.19968672f, 0.08099991f, 0.084850654f, -0.16045967f, 0.30286232f, + 0.6164765f, -0.27140254f, 0.08210814f, 0.34852806f, 0.25028184f, + -0.12188078f, 0.16310331f, 0.31253803f, -0.10792341f, 0.065858394f, + -0.1349708f, 0.08948815f, 0.31905392f, 0.03680656f, -0.05040944f, + -0.051539157f, 0.3211852f, 0.2137136f, 0.45037416f, 0.22748767f, + -0.10978614f, 0.06475646f, -0.16954158f, 0.32831904f, 0.16479677f, + -0.30020145f, 0.066221856f, 0.37213042f +}; + +static const float av1_intrap_logits_bias_12[] = { 0.95783f, -0.95823103f }; + +static const NN_CONFIG av1_intrap_nn_config = { + NUM_FEATURES_12, + NUM_LOGITS_12, + NUM_HIDDEN_LAYERS_12, + { + NUM_LAYER_0_UNITS_12, + }, + { + 
av1_intrap_hiddenlayer_0_kernel_12, + av1_intrap_logits_kernel_12, + }, + { + av1_intrap_hiddenlayer_0_bias_12, + av1_intrap_logits_bias_12, + }, +}; + +#undef NUM_HIDDEN_LAYERS_12 +#undef NUM_FEATURES_12 +#undef NUM_LAYER_0_UNITS_12 +#undef NUM_LOGITS_12 + +#define NUM_HIDDEN_LAYERS_15 1 +#define NUM_FEATURES_15 6 +#define NUM_LAYER_0_UNITS_15 24 +#define NUM_LOGITS_15 2 + +static const float av1_intraph_hiddenlayer_0_kernel_15[] = { + -0.77480125f, 0.3219551f, -0.015702145f, -0.5310235f, 0.5254026f, + -1.1522819f, 2.682016f, 0.08001052f, -0.2539285f, 0.04711023f, + -0.81296307f, 0.2675382f, 0.1952474f, -0.0664705f, 1.2989824f, + -0.3150117f, -0.8022715f, 0.045423955f, -27.584324f, -2.5608704f, + -3.2280366f, 0.05272543f, -0.47141576f, -0.07644298f, -53.77942f, + -22.393923f, -23.027853f, -0.00015186476f, -0.010696465f, 2.7064638f, + -22.776028f, 11.514891f, 11.138167f, -0.001243723f, -0.4802433f, + -8.758646f, 0.26398206f, -0.23485385f, 0.27586034f, -0.004954741f, + -0.4935232f, -0.017607696f, 69.56049f, -1.1756641f, -0.052366666f, + -0.38052833f, 0.32474658f, 0.04634263f, 0.8583235f, -0.528438f, + -0.7868907f, -0.4757781f, 0.4620985f, -0.70621157f, 231.40195f, + 6.805205f, 9.420295f, 0.02585775f, -0.03480937f, 1.3577378f, + 0.1758226f, 15.056758f, 14.437874f, -0.1305005f, 0.115103304f, + 0.21297209f, 55.821743f, -6.611156f, -6.8552365f, -0.011928095f, + -0.2042175f, 1.2557873f, -1.0722278f, -0.2683614f, 0.48318478f, + -0.73739994f, 0.54055226f, -0.03224738f, -0.06767959f, -0.21015017f, + 0.29171246f, -0.6937296f, -1.2342545f, -0.41278538f, -37.9365f, + 17.68424f, 16.263042f, -0.074828684f, 0.06607806f, -0.16763286f, + 13.594707f, 0.6152676f, -0.4371223f, -0.8365592f, 0.8273623f, + -1.2126317f, 0.1216157f, -1.3002136f, -0.18856938f, -0.2589358f, + -0.76897144f, 0.21777137f, -122.25033f, -0.23490006f, -3.1238277f, + -0.13916978f, 0.08576391f, -1.7391548f, -116.24812f, 14.906071f, + 13.468357f, 0.02332889f, -0.034617376f, -18.506111f, 0.7500542f, + -1.1882535f, 0.40848416f, -0.28434393f, -0.71471655f, -0.29188696f, + -0.46588746f, -0.17324813f, -0.62460244f, -1.1801276f, 0.28993344f, + -0.22072886f, 129.2688f, -0.33782578f, -0.34836572f, -0.034112718f, + -0.023666814f, -0.5865087f, -33.484146f, 1.1431375f, 0.56056374f, + -0.0049730353f, -0.24347587f, -1.3003352f, 0.88973033f, 0.8499571f, + -0.5678484f, -0.39009875f, -0.062105156f, -0.13965102f +}; + +static const float av1_intraph_hiddenlayer_0_bias_15[] = { + 0.0f, -0.2926711f, 0.0f, -1.0303509f, -27.459345f, 12.412848f, + 0.0f, -2.5971522f, -0.02733541f, -19.881912f, 14.391992f, -8.249469f, + 0.0f, 0.0f, 13.676118f, -0.6472994f, -0.07189449f, 1.1986839f, + 52.479107f, 0.0f, 0.0f, -3.0187025f, 1.4435643f, 0.0f +}; + +static const float av1_intraph_logits_kernel_15[] = { + 0.05390722f, -0.06859513f, 0.036842898f, 0.190772f, 0.13623567f, + 0.09321194f, 0.2314745f, -0.13958375f, -0.3058229f, -0.0104543045f, + 0.11336068f, -0.276115f, 0.00470723f, -0.49123898f, -0.15988174f, + 0.087681435f, 0.022517204f, 0.073877744f, 0.2968856f, -0.1401399f, + -0.38788354f, -0.26005393f, -0.39564916f, -0.16195515f, 0.2680102f, + -0.032179773f, -0.35758728f, 0.25819537f, 0.11468631f, 0.13573235f, + -0.2672175f, 0.016490124f, 0.048118807f, 0.020319486f, 0.07892215f, + -0.21821865f, 0.08434734f, 0.3129456f, -0.18215221f, 0.08884877f, + -0.35621428f, 0.11405768f, 0.27370325f, 0.14956686f, 0.01604587f, + -0.18334487f, -0.42385718f, -0.08033409f +}; + +static const float av1_intraph_logits_bias_15[] = { 0.83619016f, -0.8340626f }; + +static const NN_CONFIG 
av1_intrap_hd_nn_config = { + NUM_FEATURES_15, + NUM_LOGITS_15, + NUM_HIDDEN_LAYERS_15, + { + NUM_LAYER_0_UNITS_15, + }, + { + av1_intraph_hiddenlayer_0_kernel_15, + av1_intraph_logits_kernel_15, + }, + { + av1_intraph_hiddenlayer_0_bias_15, + av1_intraph_logits_bias_15, + }, +}; + +#undef NUM_HIDDEN_LAYERS_15 +#undef NUM_FEATURES_15 +#undef NUM_LAYER_0_UNITS_15 +#undef NUM_LOGITS_15 + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_MODE_PRUNE_MODEL_WEIGHTS_H_ diff --git a/third_party/aom/av1/encoder/model_rd.h b/third_party/aom/av1/encoder/model_rd.h new file mode 100644 index 0000000000..f7e8b96b5b --- /dev/null +++ b/third_party/aom/av1/encoder/model_rd.h @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_MODEL_RD_H_ +#define AOM_AV1_ENCODER_MODEL_RD_H_ + +#include "aom/aom_integer.h" +#include "av1/encoder/block.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/pustats.h" +#include "av1/encoder/rdopt_utils.h" +#include "config/aom_dsp_rtcd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// 0: Legacy model +// 1: Curve fit model +// 2: Surface fit model +// 3: DNN regression model +// 4: Full rd model +#define MODELRD_TYPE_INTERP_FILTER 1 +#define MODELRD_TYPE_TX_SEARCH_PRUNE 1 +#define MODELRD_TYPE_MASKED_COMPOUND 1 +#define MODELRD_TYPE_INTERINTRA 1 +#define MODELRD_TYPE_INTRA 1 +#define MODELRD_TYPE_MOTION_MODE_RD 1 + +typedef void (*model_rd_for_sb_type)( + const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, + int plane_from, int plane_to, int *out_rate_sum, int64_t *out_dist_sum, + uint8_t *skip_txfm_sb, int64_t *skip_sse_sb, int *plane_rate, + int64_t *plane_sse, int64_t *plane_dist); +typedef void (*model_rd_from_sse_type)(const AV1_COMP *const cpi, + const MACROBLOCK *const x, + BLOCK_SIZE plane_bsize, int plane, + int64_t sse, int num_samples, int *rate, + int64_t *dist); + +static int64_t calculate_sse(MACROBLOCKD *const xd, + const struct macroblock_plane *p, + struct macroblockd_plane *pd, const int bw, + const int bh) { + int64_t sse = 0; + const int shift = xd->bd - 8; +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, + bw, bh); + } else { + sse = + aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, bh); + } +#else + sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, bh); +#endif + sse = ROUND_POWER_OF_TWO(sse, shift * 2); + return sse; +} + +static AOM_INLINE int64_t compute_sse_plane(MACROBLOCK *x, MACROBLOCKD *xd, + int plane, const BLOCK_SIZE bsize) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + int bw, bh; + const struct macroblock_plane *const p = &x->plane[plane]; + get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw, + &bh); + + int64_t sse = calculate_sse(xd, p, pd, bw, bh); + + return sse; +} + +static AOM_INLINE 
void model_rd_from_sse(const AV1_COMP *const cpi, + const MACROBLOCK *const x, + BLOCK_SIZE plane_bsize, int plane, + int64_t sse, int num_samples, + int *rate, int64_t *dist) { + (void)num_samples; + const MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblock_plane *const p = &x->plane[plane]; + const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3; + + // Fast approximation of the modelling function. + if (cpi->sf.rd_sf.simple_model_rd_from_var) { + const int64_t square_error = sse; + int quantizer = p->dequant_QTX[1] >> dequant_shift; + if (quantizer < 120) + *rate = (int)AOMMIN( + (square_error * (280 - quantizer)) >> (16 - AV1_PROB_COST_SHIFT), + INT_MAX); + else + *rate = 0; + assert(*rate >= 0); + *dist = (square_error * quantizer) >> 8; + } else { + av1_model_rd_from_var_lapndz(sse, num_pels_log2_lookup[plane_bsize], + p->dequant_QTX[1] >> dequant_shift, rate, + dist); + } + *dist <<= 4; +} + +// Fits a curve for rate and distortion using log2(sse_norm/qstep^2) as the +// feature. +static AOM_INLINE void model_rd_with_curvfit(const AV1_COMP *const cpi, + const MACROBLOCK *const x, + BLOCK_SIZE plane_bsize, int plane, + int64_t sse, int num_samples, + int *rate, int64_t *dist) { + (void)cpi; + (void)plane_bsize; + const MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblock_plane *const p = &x->plane[plane]; + const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3; + const int qstep = AOMMAX(p->dequant_QTX[1] >> dequant_shift, 1); + + if (sse == 0) { + if (rate) *rate = 0; + if (dist) *dist = 0; + return; + } + const double sse_norm = (double)sse / num_samples; + const double qstepsqr = (double)qstep * qstep; + const double xqr = log2(sse_norm / qstepsqr); + double rate_f, dist_by_sse_norm_f; + av1_model_rd_curvfit(plane_bsize, sse_norm, xqr, &rate_f, + &dist_by_sse_norm_f); + + const double dist_f = dist_by_sse_norm_f * sse_norm; + int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5); + int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5); + + // Check if skip is better. + if (rate_i == 0) { + dist_i = sse << 4; + } else if (RDCOST(x->rdmult, rate_i, dist_i) >= + RDCOST(x->rdmult, 0, sse << 4)) { + rate_i = 0; + dist_i = sse << 4; + } + + if (rate) *rate = rate_i; + if (dist) *dist = dist_i; +} + +static AOM_INLINE void model_rd_for_sb( + const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, + int plane_from, int plane_to, int *out_rate_sum, int64_t *out_dist_sum, + uint8_t *skip_txfm_sb, int64_t *skip_sse_sb, int *plane_rate, + int64_t *plane_sse, int64_t *plane_dist) { + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. 
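+ // As a worked example of the note above (illustrative numbers, not taken + // from the encoder): for 8-bit content dequant_shift in model_rd_from_sse() + // is 3, so an internal dequant step of 160 maps to an effective quantizer + // of 160 >> 3 = 20 before it reaches the model.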
+ int plane; + const int ref = xd->mi[0]->ref_frame[0]; + + int64_t rate_sum = 0; + int64_t dist_sum = 0; + int64_t total_sse = 0; + + assert(bsize < BLOCK_SIZES_ALL); + + for (plane = plane_from; plane <= plane_to; ++plane) { + if (plane && !xd->is_chroma_ref) break; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + assert(plane_bsize < BLOCK_SIZES_ALL); + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + int64_t sse; + int rate; + int64_t dist; + + sse = calculate_sse(xd, p, pd, bw, bh); + + model_rd_from_sse(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, &dist); + + if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX); + + total_sse += sse; + rate_sum += rate; + dist_sum += dist; + if (plane_rate) plane_rate[plane] = rate; + if (plane_sse) plane_sse[plane] = sse; + if (plane_dist) plane_dist[plane] = dist; + assert(rate_sum >= 0); + } + + if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0; + if (skip_sse_sb) *skip_sse_sb = total_sse << 4; + rate_sum = AOMMIN(rate_sum, INT_MAX); + *out_rate_sum = (int)rate_sum; + *out_dist_sum = dist_sum; +} + +static AOM_INLINE void model_rd_for_sb_with_curvfit( + const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, + int plane_from, int plane_to, int *out_rate_sum, int64_t *out_dist_sum, + uint8_t *skip_txfm_sb, int64_t *skip_sse_sb, int *plane_rate, + int64_t *plane_sse, int64_t *plane_dist) { + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. 
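+ // One behavioral difference from model_rd_for_sb() above: there the skip + // flag is derived from total_sse == 0, while here it is derived from + // rate_sum == 0 (compare the skip_txfm_sb assignments at the end of each + // function).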
+ const int ref = xd->mi[0]->ref_frame[0]; + + int64_t rate_sum = 0; + int64_t dist_sum = 0; + int64_t total_sse = 0; + + for (int plane = plane_from; plane <= plane_to; ++plane) { + if (plane && !xd->is_chroma_ref) break; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + int64_t dist, sse; + int rate; + int bw, bh; + const struct macroblock_plane *const p = &x->plane[plane]; + get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, + &bw, &bh); + + sse = calculate_sse(xd, p, pd, bw, bh); + model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, + &dist); + + if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX); + + total_sse += sse; + rate_sum += rate; + dist_sum += dist; + + if (plane_rate) plane_rate[plane] = rate; + if (plane_sse) plane_sse[plane] = sse; + if (plane_dist) plane_dist[plane] = dist; + } + + if (skip_txfm_sb) *skip_txfm_sb = rate_sum == 0; + if (skip_sse_sb) *skip_sse_sb = total_sse << 4; + *out_rate_sum = (int)rate_sum; + *out_dist_sum = dist_sum; +} + +enum { MODELRD_LEGACY, MODELRD_CURVFIT, MODELRD_TYPES } UENUM1BYTE(ModelRdType); + +static const model_rd_for_sb_type model_rd_sb_fn[MODELRD_TYPES] = { + model_rd_for_sb, model_rd_for_sb_with_curvfit +}; + +static const model_rd_from_sse_type model_rd_sse_fn[MODELRD_TYPES] = { + model_rd_from_sse, model_rd_with_curvfit +}; + +#ifdef __cplusplus +} // extern "C" +#endif +#endif // AOM_AV1_ENCODER_MODEL_RD_H_ diff --git a/third_party/aom/av1/encoder/motion_search_facade.c b/third_party/aom/av1/encoder/motion_search_facade.c new file mode 100644 index 0000000000..e7eec29dc3 --- /dev/null +++ b/third_party/aom/av1/encoder/motion_search_facade.c @@ -0,0 +1,1071 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/reconinter.h" + +#include "av1/encoder/encodemv.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/interp_search.h" +#include "av1/encoder/mcomp.h" +#include "av1/encoder/motion_search_facade.h" +#include "av1/encoder/partition_strategy.h" +#include "av1/encoder/reconinter_enc.h" +#include "av1/encoder/tpl_model.h" +#include "av1/encoder/tx_search.h" + +#define RIGHT_SHIFT_MV(x) (((x) + 3 + ((x) >= 0)) >> 3) + +typedef struct { + int_mv fmv; + int weight; +} cand_mv_t; + +static int compare_weight(const void *a, const void *b) { + const int diff = ((cand_mv_t *)a)->weight - ((cand_mv_t *)b)->weight; + if (diff < 0) + return 1; + else if (diff > 0) + return -1; + return 0; +} + +// Allow more mesh searches for screen content type on the ARF. 
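+// (Screen content tends to contain repeated patterns and large exact-match +// displacements, which an exhaustive mesh search handles well; the checks +// below therefore limit it to screen content on ARF updates at speed <= 2, +// where the extra search cost is affordable.)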
+static int use_fine_search_interval(const AV1_COMP *const cpi) { + return cpi->is_screen_content_type && + cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == ARF_UPDATE && + cpi->oxcf.speed <= 2; +} + +// Iterate through the tpl and collect the mvs to be used as candidates +static INLINE void get_mv_candidate_from_tpl(const AV1_COMP *const cpi, + const MACROBLOCK *x, + BLOCK_SIZE bsize, int ref, + cand_mv_t *cand, int *cand_count, + int *total_cand_weight) { + const SuperBlockEnc *sb_enc = &x->sb_enc; + if (!sb_enc->tpl_data_count) { + return; + } + + const AV1_COMMON *cm = &cpi->common; + const MACROBLOCKD *xd = &x->e_mbd; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + const BLOCK_SIZE tpl_bsize = + convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d); + const int tplw = mi_size_wide[tpl_bsize]; + const int tplh = mi_size_high[tpl_bsize]; + const int nw = mi_size_wide[bsize] / tplw; + const int nh = mi_size_high[bsize] / tplh; + + if (nw >= 1 && nh >= 1) { + const int of_h = mi_row % mi_size_high[cm->seq_params->sb_size]; + const int of_w = mi_col % mi_size_wide[cm->seq_params->sb_size]; + const int start = of_h / tplh * sb_enc->tpl_stride + of_w / tplw; + int valid = 1; + + // Assign large weight to start_mv, so it is always tested. + cand[0].weight = nw * nh; + + for (int k = 0; k < nh; k++) { + for (int l = 0; l < nw; l++) { + const int_mv mv = + sb_enc + ->tpl_mv[start + k * sb_enc->tpl_stride + l][ref - LAST_FRAME]; + if (mv.as_int == INVALID_MV) { + valid = 0; + break; + } + + const FULLPEL_MV fmv = { GET_MV_RAWPEL(mv.as_mv.row), + GET_MV_RAWPEL(mv.as_mv.col) }; + int unique = 1; + for (int m = 0; m < *cand_count; m++) { + if (RIGHT_SHIFT_MV(fmv.row) == + RIGHT_SHIFT_MV(cand[m].fmv.as_fullmv.row) && + RIGHT_SHIFT_MV(fmv.col) == + RIGHT_SHIFT_MV(cand[m].fmv.as_fullmv.col)) { + unique = 0; + cand[m].weight++; + break; + } + } + + if (unique) { + cand[*cand_count].fmv.as_fullmv = fmv; + cand[*cand_count].weight = 1; + (*cand_count)++; + } + } + if (!valid) break; + } + + if (valid) { + *total_cand_weight = 2 * nh * nw; + if (*cand_count > 2) + qsort(cand, *cand_count, sizeof(cand[0]), &compare_weight); + } + } +} + +void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int ref_idx, int *rate_mv, + int search_range, inter_mode_info *mode_info, + int_mv *best_mv, + struct HandleInterModeArgs *const args) { + MACROBLOCKD *xd = &x->e_mbd; + const AV1_COMMON *cm = &cpi->common; + const MotionVectorSearchParams *mv_search_params = &cpi->mv_search_params; + const int num_planes = av1_num_planes(cm); + MB_MODE_INFO *mbmi = xd->mi[0]; + struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } }; + int bestsme = INT_MAX; + const int ref = mbmi->ref_frame[ref_idx]; + const YV12_BUFFER_CONFIG *scaled_ref_frame = + av1_get_scaled_ref_frame(cpi, ref); + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + const MvCosts *mv_costs = x->mv_costs; + + if (scaled_ref_frame) { + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // full-pixel motion search code to be used without additional + // modifications. + for (int i = 0; i < num_planes; i++) { + backup_yv12[i] = xd->plane[i].pre[ref_idx]; + } + av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL, + num_planes); + } + + // Work out the size of the first step in the mv step search. + // 0 here is maximum length first step. 1 is AOMMAX >> 1 etc. 
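+ // (Illustration: step_param 0 starts from the widest first step and each + // increment roughly halves it, so the weighted average below simply hedges + // between the frame-level and the per-block estimate of how far the search + // needs to reach.)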
+ int step_param; + if (cpi->sf.mv_sf.auto_mv_step_size && cm->show_frame) { + // Take the weighted average of the step_params based on the last frame's + // max mv magnitude and that based on the best ref mvs of the current + // block for the given reference. + step_param = (av1_init_search_range(x->max_mv_context[ref]) + + mv_search_params->mv_step_param) / + 2; + } else { + step_param = mv_search_params->mv_step_param; + } + + const MV ref_mv = av1_get_ref_mv(x, ref_idx).as_mv; + FULLPEL_MV start_mv; + if (mbmi->motion_mode != SIMPLE_TRANSLATION) + start_mv = get_fullmv_from_mv(&mbmi->mv[0].as_mv); + else + start_mv = get_fullmv_from_mv(&ref_mv); + + // cand stores start_mv and all possible MVs in a SB. + cand_mv_t cand[MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB + 1]; + av1_zero(cand); + cand[0].fmv.as_fullmv = start_mv; + int cnt = 1; + int total_weight = 0; + + if (!cpi->sf.mv_sf.full_pixel_search_level && + mbmi->motion_mode == SIMPLE_TRANSLATION) { + get_mv_candidate_from_tpl(cpi, x, bsize, ref, cand, &cnt, &total_weight); + } + + const int cand_cnt = AOMMIN(2, cnt); + // TODO(any): Test the speed feature for OBMC_CAUSAL mode. + if (cpi->sf.mv_sf.skip_fullpel_search_using_startmv && + mbmi->motion_mode == SIMPLE_TRANSLATION) { + const int stack_size = args->start_mv_cnt; + for (int cand_idx = 0; cand_idx < cand_cnt; cand_idx++) { + int_mv *fmv_cand = &cand[cand_idx].fmv; + int skip_cand_mv = 0; + + // Check difference between mvs in the stack and candidate mv. + for (int stack_idx = 0; stack_idx < stack_size; stack_idx++) { + const uint8_t this_ref_mv_idx = args->ref_mv_idx_stack[stack_idx]; + const FULLPEL_MV *fmv_stack = &args->start_mv_stack[stack_idx]; + const int this_newmv_valid = + args->single_newmv_valid[this_ref_mv_idx][ref]; + const int row_diff = abs(fmv_stack->row - fmv_cand->as_fullmv.row); + const int col_diff = abs(fmv_stack->col - fmv_cand->as_fullmv.col); + + if (!this_newmv_valid) continue; + + if (cpi->sf.mv_sf.skip_fullpel_search_using_startmv >= 2) { + // Prunes the current start_mv candidate if the absolute mv + // differences of both row and column are <= 1. + if (row_diff <= 1 && col_diff <= 1) { + skip_cand_mv = 1; + break; + } + } else if (cpi->sf.mv_sf.skip_fullpel_search_using_startmv >= 1) { + // Prunes the current start_mv candidate if the sum of the absolute + // mv differences of row and column is <= 1. + if (row_diff + col_diff <= 1) { + skip_cand_mv = 1; + break; + } + } + } + if (skip_cand_mv) { + // Ensure at least one full-pel motion search is not pruned. + assert(mbmi->ref_mv_idx != 0); + // Mark the candidate mv as invalid so that motion search gets skipped. + cand[cand_idx].fmv.as_int = INVALID_MV; + } else { + // Store start_mv candidate and corresponding ref_mv_idx of full-pel + // search in the mv stack (except last ref_mv_idx). + if (mbmi->ref_mv_idx != MAX_REF_MV_SEARCH - 1) { + assert(args->start_mv_cnt < (MAX_REF_MV_SEARCH - 1) * 2); + args->start_mv_stack[args->start_mv_cnt] = fmv_cand->as_fullmv; + args->ref_mv_idx_stack[args->start_mv_cnt] = mbmi->ref_mv_idx; + args->start_mv_cnt++; + } + } + } + } + + // Hot fix for asan complaints when resize mode is on. When resize mode is on, + // the stride of the reference frame can be different from that indicated by + // MotionVectorSearchParams::search_site_cfg. When this happens, we need to + // readjust the stride. 
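+ // (In that case av1_get_search_site_config() below is expected to fall + // back to a search-site configuration rebuilt for the actual stride rather + // than the precomputed one.)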
+ const MV_SPEED_FEATURES *mv_sf = &cpi->sf.mv_sf; + const SEARCH_METHODS search_method = + av1_get_default_mv_search_method(x, mv_sf, bsize); + const search_site_config *src_search_site_cfg = + av1_get_search_site_config(cpi, x, search_method); + + // Further reduce the search range. + if (search_range < INT_MAX) { + const search_site_config *search_site_cfg = + &src_search_site_cfg[search_method_lookup[search_method]]; + // Max step_param is search_site_cfg->num_search_steps. + if (search_range < 1) { + step_param = search_site_cfg->num_search_steps; + } else { + while (search_site_cfg->radius[search_site_cfg->num_search_steps - + step_param - 1] > (search_range << 1) && + search_site_cfg->num_search_steps - step_param - 1 > 0) + step_param++; + } + } + + int cost_list[5]; + FULLPEL_MV_STATS best_mv_stats; + int_mv second_best_mv; + best_mv->as_int = second_best_mv.as_int = INVALID_MV; + + // Allow more mesh searches for screen content type on the ARF. + const int fine_search_interval = use_fine_search_interval(cpi); + FULLPEL_MOTION_SEARCH_PARAMS full_ms_params; + + switch (mbmi->motion_mode) { + case SIMPLE_TRANSLATION: { + // Perform a search with the top 2 candidates. + int sum_weight = 0; + for (int m = 0; m < cand_cnt; m++) { + int_mv smv = cand[m].fmv; + FULLPEL_MV this_best_mv, this_second_best_mv; + FULLPEL_MV_STATS this_mv_stats; + + if (smv.as_int == INVALID_MV) continue; + + av1_make_default_fullpel_ms_params( + &full_ms_params, cpi, x, bsize, &ref_mv, smv.as_fullmv, + src_search_site_cfg, search_method, fine_search_interval); + + const int thissme = + av1_full_pixel_search(smv.as_fullmv, &full_ms_params, step_param, + cond_cost_list(cpi, cost_list), &this_best_mv, + &this_mv_stats, &this_second_best_mv); + + if (thissme < bestsme) { + bestsme = thissme; + best_mv->as_fullmv = this_best_mv; + best_mv_stats = this_mv_stats; + second_best_mv.as_fullmv = this_second_best_mv; + } + + sum_weight += cand[m].weight; + if (4 * sum_weight > 3 * total_weight) break; + } + } break; + case OBMC_CAUSAL: + av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, + &ref_mv, start_mv, src_search_site_cfg, + search_method, fine_search_interval); + + bestsme = av1_obmc_full_pixel_search(start_mv, &full_ms_params, + step_param, &best_mv->as_fullmv); + break; + default: assert(0 && "Invalid motion mode!\n"); + } + if (best_mv->as_int == INVALID_MV) return; + + if (scaled_ref_frame) { + // Swap back the original buffers for subpel motion search. + for (int i = 0; i < num_planes; i++) { + xd->plane[i].pre[ref_idx] = backup_yv12[i]; + } + } + + // Terminate search with the current ref_idx based on fullpel mv, rate cost, + // and other known costs. 
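+ // (For example, if an earlier ref_mv_idx already landed on this exact + // full-pel MV with a cheaper mv-rate + DRL-index cost, continuing with the + // current index cannot end up cheaper, so the mode is abandoned early.)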
+ if (cpi->sf.inter_sf.skip_newmv_in_drl >= 2 && + mbmi->motion_mode == SIMPLE_TRANSLATION && + best_mv->as_int != INVALID_MV) { + int_mv this_mv; + this_mv.as_mv = get_mv_from_fullmv(&best_mv->as_fullmv); + const int ref_mv_idx = mbmi->ref_mv_idx; + const int this_mv_rate = + av1_mv_bit_cost(&this_mv.as_mv, &ref_mv, mv_costs->nmv_joint_cost, + mv_costs->mv_cost_stack, MV_COST_WEIGHT); + mode_info[ref_mv_idx].full_search_mv.as_int = this_mv.as_int; + mode_info[ref_mv_idx].full_mv_rate = this_mv_rate; + mode_info[ref_mv_idx].full_mv_bestsme = bestsme; + + for (int prev_ref_idx = 0; prev_ref_idx < ref_mv_idx; ++prev_ref_idx) { + // Check if the motion search result is the same as previous results. + if (this_mv.as_int == mode_info[prev_ref_idx].full_search_mv.as_int) { + // Compare the rate cost. + const int prev_rate_cost = mode_info[prev_ref_idx].full_mv_rate + + mode_info[prev_ref_idx].drl_cost; + const int this_rate_cost = + this_mv_rate + mode_info[ref_mv_idx].drl_cost; + + if (prev_rate_cost <= this_rate_cost) { + // If the current rate_cost is worse than the previous rate_cost, then + // we terminate the search. Since av1_single_motion_search is only + // called by handle_new_mv in SIMPLE_TRANSLATION mode, we set the + // best_mv to INVALID mv to signal that we wish to terminate search + // for the current mode. + best_mv->as_int = INVALID_MV; + return; + } + } + + // Terminate the evaluation of the current ref_mv_idx based on bestsme + // and drl_cost. + const int psme = mode_info[prev_ref_idx].full_mv_bestsme; + if (psme == INT_MAX) continue; + const int thr = + cpi->sf.inter_sf.skip_newmv_in_drl == 3 ? (psme + (psme >> 2)) : psme; + if (cpi->sf.inter_sf.skip_newmv_in_drl >= 3 && + mode_info[ref_mv_idx].full_mv_bestsme > thr && + mode_info[prev_ref_idx].drl_cost < mode_info[ref_mv_idx].drl_cost) { + best_mv->as_int = INVALID_MV; + return; + } + } + } + + if (cpi->common.features.cur_frame_force_integer_mv) { + convert_fullmv_to_mv(best_mv); + } + + const int use_fractional_mv = + bestsme < INT_MAX && cpi->common.features.cur_frame_force_integer_mv == 0; + int best_mv_rate = 0; + int mv_rate_calculated = 0; + if (use_fractional_mv) { + int_mv fractional_ms_list[3]; + av1_set_fractional_mv(fractional_ms_list); + int dis; /* TODO: use dis in distortion calculation later. */ + + SUBPEL_MOTION_SEARCH_PARAMS ms_params; + av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv, + cost_list); + MV subpel_start_mv = get_mv_from_fullmv(&best_mv->as_fullmv); + assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv)); + + switch (mbmi->motion_mode) { + case SIMPLE_TRANSLATION: + if (mv_sf->use_accurate_subpel_search) { + const int try_second = second_best_mv.as_int != INVALID_MV && + second_best_mv.as_int != best_mv->as_int && + (mv_sf->disable_second_mv <= 1); + const int best_mv_var = mv_search_params->find_fractional_mv_step( + xd, cm, &ms_params, subpel_start_mv, &best_mv_stats, + &best_mv->as_mv, &dis, &x->pred_sse[ref], fractional_ms_list); + + if (try_second) { + struct macroblockd_plane *p = xd->plane; + const BUFFER_SET orig_dst = { + { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf }, + { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride }, + }; + int64_t rd = INT64_MAX; + if (!mv_sf->disable_second_mv) { + // Calculate actual rd cost. 
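+ // (RDCOST() forms the usual Lagrangian J = D + lambda * R with lambda + // folded into x->rdmult; building the predictor and estimating the + // transform rate below turns this into a true RD comparison rather + // than a variance proxy.)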
+ mbmi->mv[0].as_mv = best_mv->as_mv; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst, + bsize, 0, 0); + av1_subtract_plane(x, bsize, 0); + RD_STATS this_rd_stats; + av1_init_rd_stats(&this_rd_stats); + av1_estimate_txfm_yrd(cpi, x, &this_rd_stats, INT64_MAX, bsize, + max_txsize_rect_lookup[bsize]); + int this_mv_rate = av1_mv_bit_cost( + &best_mv->as_mv, &ref_mv, mv_costs->nmv_joint_cost, + mv_costs->mv_cost_stack, MV_COST_WEIGHT); + rd = RDCOST(x->rdmult, this_mv_rate + this_rd_stats.rate, + this_rd_stats.dist); + } + + MV this_best_mv; + subpel_start_mv = get_mv_from_fullmv(&second_best_mv.as_fullmv); + if (av1_is_subpelmv_in_range(&ms_params.mv_limits, + subpel_start_mv)) { + unsigned int sse; + const int this_var = mv_search_params->find_fractional_mv_step( + xd, cm, &ms_params, subpel_start_mv, NULL, &this_best_mv, + &dis, &sse, fractional_ms_list); + + if (!mv_sf->disable_second_mv) { + // If cpi->sf.mv_sf.disable_second_mv is 0, use actual rd cost + // to choose the better MV. + mbmi->mv[0].as_mv = this_best_mv; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst, + bsize, 0, 0); + av1_subtract_plane(x, bsize, 0); + RD_STATS tmp_rd_stats; + av1_init_rd_stats(&tmp_rd_stats); + av1_estimate_txfm_yrd(cpi, x, &tmp_rd_stats, INT64_MAX, bsize, + max_txsize_rect_lookup[bsize]); + int tmp_mv_rate = av1_mv_bit_cost( + &this_best_mv, &ref_mv, mv_costs->nmv_joint_cost, + mv_costs->mv_cost_stack, MV_COST_WEIGHT); + int64_t tmp_rd = + RDCOST(x->rdmult, tmp_rd_stats.rate + tmp_mv_rate, + tmp_rd_stats.dist); + if (tmp_rd < rd) { + best_mv->as_mv = this_best_mv; + x->pred_sse[ref] = sse; + } + } else { + // If cpi->sf.mv_sf.disable_second_mv is 1, use var to decide the + // best MV. + if (this_var < best_mv_var) { + best_mv->as_mv = this_best_mv; + x->pred_sse[ref] = sse; + } + } + } + } + } else { + mv_search_params->find_fractional_mv_step( + xd, cm, &ms_params, subpel_start_mv, &best_mv_stats, + &best_mv->as_mv, &dis, &x->pred_sse[ref], NULL); + } + break; + case OBMC_CAUSAL: + av1_find_best_obmc_sub_pixel_tree_up( + xd, cm, &ms_params, subpel_start_mv, NULL, &best_mv->as_mv, &dis, + &x->pred_sse[ref], NULL); + break; + default: assert(0 && "Invalid motion mode!\n"); + } + + // Terminate search with the current ref_idx based on subpel mv and rate + // cost. + if (cpi->sf.inter_sf.skip_newmv_in_drl >= 1 && args != NULL && + mbmi->motion_mode == SIMPLE_TRANSLATION && + best_mv->as_int != INVALID_MV) { + const int ref_mv_idx = mbmi->ref_mv_idx; + best_mv_rate = + av1_mv_bit_cost(&best_mv->as_mv, &ref_mv, mv_costs->nmv_joint_cost, + mv_costs->mv_cost_stack, MV_COST_WEIGHT); + mv_rate_calculated = 1; + + for (int prev_ref_idx = 0; prev_ref_idx < ref_mv_idx; ++prev_ref_idx) { + if (!args->single_newmv_valid[prev_ref_idx][ref]) continue; + // Check if the motion vectors are the same. + if (best_mv->as_int == args->single_newmv[prev_ref_idx][ref].as_int) { + // Skip this evaluation if the previous one is skipped. + if (mode_info[prev_ref_idx].skip) { + mode_info[ref_mv_idx].skip = 1; + break; + } + // Compare the rate cost that we currently know. + const int prev_rate_cost = + args->single_newmv_rate[prev_ref_idx][ref] + + mode_info[prev_ref_idx].drl_cost; + const int this_rate_cost = + best_mv_rate + mode_info[ref_mv_idx].drl_cost; + + if (prev_rate_cost <= this_rate_cost) { + // If the current rate_cost is worse than the previous rate_cost, + // then we terminate the search for this ref_mv_idx. 
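+ // (Contrast with the full-pel early exit above: that path invalidates + // best_mv and returns, whereas this subpel path only sets the skip + // flag and keeps best_mv and its already-computed rate available to + // the caller.)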
+ mode_info[ref_mv_idx].skip = 1; + break; + } + } + } + } + } + + if (mv_rate_calculated) { + *rate_mv = best_mv_rate; + } else { + *rate_mv = + av1_mv_bit_cost(&best_mv->as_mv, &ref_mv, mv_costs->nmv_joint_cost, + mv_costs->mv_cost_stack, MV_COST_WEIGHT); + } +} + +int av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int_mv *cur_mv, + const uint8_t *mask, int mask_stride, int *rate_mv, + int allow_second_mv, int joint_me_num_refine_iter) { + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + const int pw = block_size_wide[bsize]; + const int ph = block_size_high[bsize]; + const int plane = 0; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + // This function should only ever be called for compound modes + assert(has_second_ref(mbmi)); + const int_mv init_mv[2] = { cur_mv[0], cur_mv[1] }; + const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] }; + const MvCosts *mv_costs = x->mv_costs; + int_mv ref_mv[2]; + int ite, ref; + + // Get the prediction block from the 'other' reference frame. + const int_interpfilters interp_filters = + av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + + InterPredParams inter_pred_params; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + // Do joint motion search in compound mode to get more accurate mv. + struct buf_2d backup_yv12[2][MAX_MB_PLANE]; + int last_besterr[2] = { INT_MAX, INT_MAX }; + const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = { + av1_get_scaled_ref_frame(cpi, refs[0]), + av1_get_scaled_ref_frame(cpi, refs[1]) + }; + + // Prediction buffer from second frame. + DECLARE_ALIGNED(16, uint8_t, second_pred16[MAX_SB_SQUARE * sizeof(uint16_t)]); + uint8_t *second_pred = get_buf_by_bd(xd, second_pred16); + + int_mv best_mv, second_best_mv; + + // Allow joint search multiple times iteratively for each reference frame + // and break out of the search loop if it couldn't find a better mv. + for (ite = 0; ite < (2 * joint_me_num_refine_iter); ite++) { + struct buf_2d ref_yv12[2]; + int bestsme = INT_MAX; + int id = ite % 2; // Even iterations search in the first reference frame, + // odd iterations search in the second. The predictor + // found for the 'other' reference frame is factored in. + if (ite >= 2 && cur_mv[!id].as_int == init_mv[!id].as_int) { + if (cur_mv[id].as_int == init_mv[id].as_int) { + break; + } else { + int_mv cur_int_mv, init_int_mv; + cur_int_mv.as_mv.col = cur_mv[id].as_mv.col >> 3; + cur_int_mv.as_mv.row = cur_mv[id].as_mv.row >> 3; + init_int_mv.as_mv.row = init_mv[id].as_mv.row >> 3; + init_int_mv.as_mv.col = init_mv[id].as_mv.col >> 3; + if (cur_int_mv.as_int == init_int_mv.as_int) { + break; + } + } + } + for (ref = 0; ref < 2; ++ref) { + ref_mv[ref] = av1_get_ref_mv(x, ref); + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // motion search code to be used without additional modifications. 
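+ // (As in av1_single_motion_search() above, the originals are saved in + // backup_yv12 and swapped back once the full-pel stage is done.)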
+ if (scaled_ref_frame[ref]) { + int i; + for (i = 0; i < num_planes; i++) + backup_yv12[ref][i] = xd->plane[i].pre[ref]; + av1_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col, + NULL, num_planes); + } + } + + assert(IMPLIES(scaled_ref_frame[0] != NULL, + cm->width == scaled_ref_frame[0]->y_crop_width && + cm->height == scaled_ref_frame[0]->y_crop_height)); + assert(IMPLIES(scaled_ref_frame[1] != NULL, + cm->width == scaled_ref_frame[1]->y_crop_width && + cm->height == scaled_ref_frame[1]->y_crop_height)); + + // Initialize based on (possibly scaled) prediction buffers. + ref_yv12[0] = xd->plane[plane].pre[0]; + ref_yv12[1] = xd->plane[plane].pre[1]; + + av1_init_inter_params(&inter_pred_params, pw, ph, mi_row * MI_SIZE, + mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd), 0, + &cm->sf_identity, &ref_yv12[!id], interp_filters); + inter_pred_params.conv_params = get_conv_params(0, 0, xd->bd); + + // Since we have scaled the reference frames to match the size of the + // current frame we must use a unit scaling factor during mode selection. + av1_enc_build_one_inter_predictor(second_pred, pw, &cur_mv[!id].as_mv, + &inter_pred_params); + + // Do full-pixel compound motion search on the current reference frame. + if (id) xd->plane[plane].pre[0] = ref_yv12[id]; + + // Make motion search params + FULLPEL_MOTION_SEARCH_PARAMS full_ms_params; + FULLPEL_MV_STATS best_mv_stats; + const MV_SPEED_FEATURES *mv_sf = &cpi->sf.mv_sf; + const SEARCH_METHODS search_method = + av1_get_default_mv_search_method(x, mv_sf, bsize); + const search_site_config *src_search_sites = + av1_get_search_site_config(cpi, x, search_method); + // Use the mv result from the single mode as mv predictor. + const FULLPEL_MV start_fullmv = get_fullmv_from_mv(&cur_mv[id].as_mv); + av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, + &ref_mv[id].as_mv, start_fullmv, + src_search_sites, search_method, + /*fine_search_interval=*/0); + + av1_set_ms_compound_refs(&full_ms_params.ms_buffers, second_pred, mask, + mask_stride, id); + + // Small-range full-pixel motion search. + if (!mv_sf->disable_extensive_joint_motion_search && + mbmi->interinter_comp.type != COMPOUND_WEDGE) { + bestsme = av1_full_pixel_search(start_fullmv, &full_ms_params, 5, NULL, + &best_mv.as_fullmv, &best_mv_stats, + &second_best_mv.as_fullmv); + } else { + bestsme = av1_refining_search_8p_c(&full_ms_params, start_fullmv, + &best_mv.as_fullmv); + second_best_mv = best_mv; + } + + const int try_second = second_best_mv.as_int != INVALID_MV && + second_best_mv.as_int != best_mv.as_int && + allow_second_mv; + + // Restore the pointer to the first (possibly scaled) prediction buffer. + if (id) xd->plane[plane].pre[0] = ref_yv12[0]; + + for (ref = 0; ref < 2; ++ref) { + if (scaled_ref_frame[ref]) { + // Swap back the original buffers for subpel motion search. + for (int i = 0; i < num_planes; i++) { + xd->plane[i].pre[ref] = backup_yv12[ref][i]; + } + // Re-initialize based on unscaled prediction buffers. + ref_yv12[ref] = xd->plane[plane].pre[ref]; + } + } + + // Do sub-pixel compound motion search on the current reference frame. + if (id) xd->plane[plane].pre[0] = ref_yv12[id]; + + if (cpi->common.features.cur_frame_force_integer_mv) { + convert_fullmv_to_mv(&best_mv); + } + if (bestsme < INT_MAX && + cpi->common.features.cur_frame_force_integer_mv == 0) { + int dis; /* TODO: use dis in distortion calculation later. 
*/ + unsigned int sse; + SUBPEL_MOTION_SEARCH_PARAMS ms_params; + av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, + &ref_mv[id].as_mv, NULL); + av1_set_ms_compound_refs(&ms_params.var_params.ms_buffers, second_pred, + mask, mask_stride, id); + ms_params.forced_stop = EIGHTH_PEL; + MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv); + assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, start_mv)); + bestsme = cpi->mv_search_params.find_fractional_mv_step( + xd, cm, &ms_params, start_mv, NULL, &best_mv.as_mv, &dis, &sse, NULL); + + if (try_second) { + MV this_best_mv; + MV subpel_start_mv = get_mv_from_fullmv(&second_best_mv.as_fullmv); + if (av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv)) { + const int thissme = cpi->mv_search_params.find_fractional_mv_step( + xd, cm, &ms_params, subpel_start_mv, NULL, &this_best_mv, &dis, + &sse, NULL); + if (thissme < bestsme) { + best_mv.as_mv = this_best_mv; + bestsme = thissme; + } + } + } + } + + // Restore the pointer to the first prediction buffer. + if (id) xd->plane[plane].pre[0] = ref_yv12[0]; + if (bestsme < last_besterr[id]) { + cur_mv[id] = best_mv; + last_besterr[id] = bestsme; + } else { + break; + } + } + + *rate_mv = 0; + + for (ref = 0; ref < 2; ++ref) { + const int_mv curr_ref_mv = av1_get_ref_mv(x, ref); + *rate_mv += av1_mv_bit_cost(&cur_mv[ref].as_mv, &curr_ref_mv.as_mv, + mv_costs->nmv_joint_cost, + mv_costs->mv_cost_stack, MV_COST_WEIGHT); + } + + return AOMMIN(last_besterr[0], last_besterr[1]); +} + +// Search for the best mv for one component of a compound, +// given that the other component is fixed. +int av1_compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, MV *this_mv, + const uint8_t *second_pred, + const uint8_t *mask, int mask_stride, + int *rate_mv, int ref_idx) { + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + const int ref = mbmi->ref_frame[ref_idx]; + const int_mv ref_mv = av1_get_ref_mv(x, ref_idx); + struct macroblockd_plane *const pd = &xd->plane[0]; + const MvCosts *mv_costs = x->mv_costs; + + struct buf_2d backup_yv12[MAX_MB_PLANE]; + const YV12_BUFFER_CONFIG *const scaled_ref_frame = + av1_get_scaled_ref_frame(cpi, ref); + + // Check that this is either an interinter or an interintra block + assert(has_second_ref(mbmi) || (ref_idx == 0 && is_interintra_mode(mbmi))); + + // Store the first prediction buffer. + struct buf_2d orig_yv12; + if (ref_idx) { + orig_yv12 = pd->pre[0]; + pd->pre[0] = pd->pre[ref_idx]; + } + + if (scaled_ref_frame) { + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // full-pixel motion search code to be used without additional + // modifications. + for (int i = 0; i < num_planes; i++) { + backup_yv12[i] = xd->plane[i].pre[ref_idx]; + } + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + // The index below needs to be 0 instead of ref_idx since we assume the + // 0th slot to be used for subsequent searches. Note that the ref_idx + // reference buffer has been copied to the 0th slot in the code above. + // Now we need to swap the reference frame for the 0th slot. 
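+    // Editor's note (simplified sketch of the swaps described above, for
+    // ref_idx == 1):
+    //   orig_yv12 = pd->pre[0];   // save slot 0
+    //   pd->pre[0] = pd->pre[1];  // route ref 1 through slot 0
+    //   ...full-pel and subpel searches all run on pre[0]...
+    //   pd->pre[0] = orig_yv12;   // restored at the end of this function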
+ av1_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL, + num_planes); + } + + int bestsme = INT_MAX; + int_mv best_mv; + + // Make motion search params + FULLPEL_MOTION_SEARCH_PARAMS full_ms_params; + FULLPEL_MV_STATS best_mv_stats; + const SEARCH_METHODS search_method = + av1_get_default_mv_search_method(x, &cpi->sf.mv_sf, bsize); + const search_site_config *src_search_sites = + av1_get_search_site_config(cpi, x, search_method); + // Use the mv result from the single mode as mv predictor. + const FULLPEL_MV start_fullmv = get_fullmv_from_mv(this_mv); + av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, + &ref_mv.as_mv, start_fullmv, + src_search_sites, search_method, + /*fine_search_interval=*/0); + + av1_set_ms_compound_refs(&full_ms_params.ms_buffers, second_pred, mask, + mask_stride, ref_idx); + + // Small-range full-pixel motion search. + bestsme = av1_full_pixel_search(start_fullmv, &full_ms_params, 5, NULL, + &best_mv.as_fullmv, &best_mv_stats, NULL); + + if (scaled_ref_frame) { + // Swap back the original buffers for subpel motion search for the 0th slot. + for (int i = 0; i < num_planes; i++) { + xd->plane[i].pre[0] = backup_yv12[i]; + } + } + + if (cpi->common.features.cur_frame_force_integer_mv) { + convert_fullmv_to_mv(&best_mv); + } + const int use_fractional_mv = + bestsme < INT_MAX && cpi->common.features.cur_frame_force_integer_mv == 0; + if (use_fractional_mv) { + int dis; /* TODO: use dis in distortion calculation later. */ + unsigned int sse; + SUBPEL_MOTION_SEARCH_PARAMS ms_params; + av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv.as_mv, + NULL); + av1_set_ms_compound_refs(&ms_params.var_params.ms_buffers, second_pred, + mask, mask_stride, ref_idx); + ms_params.forced_stop = EIGHTH_PEL; + MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv); + assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, start_mv)); + bestsme = cpi->mv_search_params.find_fractional_mv_step( + xd, cm, &ms_params, start_mv, &best_mv_stats, &best_mv.as_mv, &dis, + &sse, NULL); + } + + // Restore the pointer to the first unscaled prediction buffer. 
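+  // Editor's note: orig_yv12 was saved near the top of this function, where
+  // pd->pre[0] was overwritten with pd->pre[ref_idx] for the search.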
+ if (ref_idx) pd->pre[0] = orig_yv12; + + if (bestsme < INT_MAX) *this_mv = best_mv.as_mv; + + *rate_mv = 0; + + *rate_mv += av1_mv_bit_cost(this_mv, &ref_mv.as_mv, mv_costs->nmv_joint_cost, + mv_costs->mv_cost_stack, MV_COST_WEIGHT); + return bestsme; +} + +static AOM_INLINE void build_second_inter_pred(const AV1_COMP *cpi, + MACROBLOCK *x, BLOCK_SIZE bsize, + const MV *other_mv, int ref_idx, + uint8_t *second_pred) { + const AV1_COMMON *const cm = &cpi->common; + const int pw = block_size_wide[bsize]; + const int ph = block_size_high[bsize]; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + struct macroblockd_plane *const pd = &xd->plane[0]; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x); + const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y); + + // This function should only ever be called for compound modes + assert(has_second_ref(mbmi)); + + const int plane = 0; + struct buf_2d ref_yv12 = xd->plane[plane].pre[!ref_idx]; + + struct scale_factors sf; + av1_setup_scale_factors_for_frame(&sf, ref_yv12.width, ref_yv12.height, + cm->width, cm->height); + + InterPredParams inter_pred_params; + + av1_init_inter_params(&inter_pred_params, pw, ph, p_row, p_col, + pd->subsampling_x, pd->subsampling_y, xd->bd, + is_cur_buf_hbd(xd), 0, &sf, &ref_yv12, + mbmi->interp_filters); + inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); + + // Get the prediction block from the 'other' reference frame. + av1_enc_build_one_inter_predictor(second_pred, pw, other_mv, + &inter_pred_params); +} + +// Wrapper for av1_compound_single_motion_search, for the common case +// where the second prediction is also an inter mode. +int av1_compound_single_motion_search_interinter( + const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *cur_mv, + const uint8_t *mask, int mask_stride, int *rate_mv, int ref_idx) { + MACROBLOCKD *xd = &x->e_mbd; + // This function should only ever be called for compound modes + assert(has_second_ref(xd->mi[0])); + + // Prediction buffer from second frame. + DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]); + uint8_t *second_pred; + if (is_cur_buf_hbd(xd)) + second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16); + else + second_pred = (uint8_t *)second_pred_alloc_16; + + MV *this_mv = &cur_mv[ref_idx].as_mv; + const MV *other_mv = &cur_mv[!ref_idx].as_mv; + build_second_inter_pred(cpi, x, bsize, other_mv, ref_idx, second_pred); + return av1_compound_single_motion_search(cpi, x, bsize, this_mv, second_pred, + mask, mask_stride, rate_mv, ref_idx); +} + +static AOM_INLINE void do_masked_motion_search_indexed( + const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv, + const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE bsize, + int_mv *tmp_mv, int *rate_mv, int which) { + // NOTE: which values: 0 - 0 only, 1 - 1 only, 2 - both + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + BLOCK_SIZE sb_type = mbmi->bsize; + const uint8_t *mask; + const int mask_stride = block_size_wide[bsize]; + + mask = av1_get_compound_type_mask(comp_data, sb_type); + + tmp_mv[0].as_int = cur_mv[0].as_int; + tmp_mv[1].as_int = cur_mv[1].as_int; + if (which == 0 || which == 1) { + av1_compound_single_motion_search_interinter(cpi, x, bsize, tmp_mv, mask, + mask_stride, rate_mv, which); + } else if (which == 2) { + const int joint_me_num_refine_iter = + cpi->sf.inter_sf.enable_fast_compound_mode_search == 2 + ? 
REDUCED_JOINT_ME_REFINE_ITER
+            : NUM_JOINT_ME_REFINE_ITER;
+    av1_joint_motion_search(cpi, x, bsize, tmp_mv, mask, mask_stride, rate_mv,
+                            !cpi->sf.mv_sf.disable_second_mv,
+                            joint_me_num_refine_iter);
+  }
+}
+
+int av1_interinter_compound_motion_search(const AV1_COMP *const cpi,
+                                          MACROBLOCK *x,
+                                          const int_mv *const cur_mv,
+                                          const BLOCK_SIZE bsize,
+                                          const PREDICTION_MODE this_mode) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  int_mv tmp_mv[2];
+  int tmp_rate_mv = 0;
+  // TODO(jingning): The average compound mode has proper SAD and variance
+  // functions implemented, and is triggered by setting the mask pointer to
+  // NULL. Need to further implement those for frame distance weighted mode.
+  mbmi->interinter_comp.seg_mask =
+      mbmi->interinter_comp.type == COMPOUND_AVERAGE ? NULL : xd->seg_mask;
+  const INTERINTER_COMPOUND_DATA *compound_data = &mbmi->interinter_comp;
+
+  if (this_mode == NEW_NEWMV) {
+    do_masked_motion_search_indexed(cpi, x, cur_mv, compound_data, bsize,
+                                    tmp_mv, &tmp_rate_mv, 2);
+    mbmi->mv[0].as_int = tmp_mv[0].as_int;
+    mbmi->mv[1].as_int = tmp_mv[1].as_int;
+  } else if (this_mode >= NEAREST_NEWMV && this_mode <= NEW_NEARMV) {
+    // which = 1 if this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV
+    // which = 0 if this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV
+    int which = (NEWMV == compound_ref1_mode(this_mode));
+    do_masked_motion_search_indexed(cpi, x, cur_mv, compound_data, bsize,
+                                    tmp_mv, &tmp_rate_mv, which);
+    mbmi->mv[which].as_int = tmp_mv[which].as_int;
+  }
+  return tmp_rate_mv;
+}
+
+int_mv av1_simple_motion_search_sse_var(AV1_COMP *const cpi, MACROBLOCK *x,
+                                        int mi_row, int mi_col,
+                                        BLOCK_SIZE bsize, int ref,
+                                        FULLPEL_MV start_mv, int num_planes,
+                                        int use_subpixel, unsigned int *sse,
+                                        unsigned int *var) {
+  assert(num_planes == 1 &&
+         "Currently simple_motion_search only supports luma plane");
+  assert(!frame_is_intra_only(&cpi->common) &&
+         "Simple motion search only enabled for non-key frames");
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+
+  set_offsets_for_motion_search(cpi, x, mi_row, mi_col, bsize);
+
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  mbmi->bsize = bsize;
+  mbmi->ref_frame[0] = ref;
+  mbmi->ref_frame[1] = NONE_FRAME;
+  mbmi->motion_mode = SIMPLE_TRANSLATION;
+  mbmi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+  const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, ref);
+  const YV12_BUFFER_CONFIG *scaled_ref_frame =
+      av1_get_scaled_ref_frame(cpi, ref);
+  struct buf_2d backup_yv12;
+  // ref_mv is used to calculate the cost of the motion vector
+  const MV ref_mv = kZeroMv;
+  const int step_param =
+      AOMMIN(cpi->mv_search_params.mv_step_param +
+                 cpi->sf.part_sf.simple_motion_search_reduce_search_steps,
+             MAX_MVSEARCH_STEPS - 2);
+  int cost_list[5];
+  const int ref_idx = 0;
+  int bestsme;
+  int_mv best_mv;
+  FULLPEL_MV_STATS best_mv_stats;
+
+  av1_setup_pre_planes(xd, ref_idx, yv12, mi_row, mi_col,
+                       get_ref_scale_factors(cm, ref), num_planes);
+  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+  if (scaled_ref_frame) {
+    backup_yv12 = xd->plane[AOM_PLANE_Y].pre[ref_idx];
+    av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL,
+                         num_planes);
+  }
+
+  // Allow more mesh searches for screen content type on the ARF.
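+  // Editor's note: fine_search_interval is forwarded to
+  // av1_make_default_fullpel_ms_params() below; a nonzero interval is assumed
+  // here to enable the periodic exhaustive (mesh) search stages that the
+  // comment above refers to.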
+ const int fine_search_interval = use_fine_search_interval(cpi); + FULLPEL_MOTION_SEARCH_PARAMS full_ms_params; + const MV_SPEED_FEATURES *mv_sf = &cpi->sf.mv_sf; + const SEARCH_METHODS search_method = + av1_get_default_mv_search_method(x, mv_sf, bsize); + const search_site_config *src_search_sites = + av1_get_search_site_config(cpi, x, search_method); + av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &ref_mv, + start_mv, src_search_sites, search_method, + fine_search_interval); + + bestsme = av1_full_pixel_search(start_mv, &full_ms_params, step_param, + cond_cost_list(cpi, cost_list), + &best_mv.as_fullmv, &best_mv_stats, NULL); + + const int use_subpel_search = + bestsme < INT_MAX && !cpi->common.features.cur_frame_force_integer_mv && + use_subpixel && + (cpi->sf.mv_sf.simple_motion_subpel_force_stop != FULL_PEL); + if (scaled_ref_frame) { + xd->plane[AOM_PLANE_Y].pre[ref_idx] = backup_yv12; + } + if (use_subpel_search) { + int not_used = 0; + + SUBPEL_MOTION_SEARCH_PARAMS ms_params; + av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv, + cost_list); + // TODO(yunqing): integrate this into av1_make_default_subpel_ms_params(). + ms_params.forced_stop = mv_sf->simple_motion_subpel_force_stop; + + MV subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv); + assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv)); + + cpi->mv_search_params.find_fractional_mv_step( + xd, cm, &ms_params, subpel_start_mv, &best_mv_stats, &best_mv.as_mv, + ¬_used, &x->pred_sse[ref], NULL); + + mbmi->mv[0] = best_mv; + + // Get a copy of the prediction output + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + *var = cpi->ppi->fn_ptr[bsize].vf( + x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].dst.buf, + xd->plane[0].dst.stride, sse); + } else { + // Manually convert from units of pixel to 1/8-pixels if we are not doing + // subpel search + convert_fullmv_to_mv(&best_mv); + *var = best_mv_stats.distortion; + *sse = best_mv_stats.sse; + } + + return best_mv; +} diff --git a/third_party/aom/av1/encoder/motion_search_facade.h b/third_party/aom/av1/encoder/motion_search_facade.h new file mode 100644 index 0000000000..d1fa915bca --- /dev/null +++ b/third_party/aom/av1/encoder/motion_search_facade.h @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_MOTION_SEARCH_H_ +#define AOM_AV1_ENCODER_MOTION_SEARCH_H_ + +#include "av1/encoder/encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define NUM_JOINT_ME_REFINE_ITER 2 +#define REDUCED_JOINT_ME_REFINE_ITER 1 +// TODO(any): rename this struct to something else. There is already another +// struct called inter_modes_info, which makes this terribly confusing. 
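+// Editor's note: a minimal usage sketch of this per-ref_mv_idx cache
+// (hypothetical values; callers typically hold an array sized by
+// MAX_REF_MV_SEARCH):
+//   inter_mode_info mode_info[MAX_REF_MV_SEARCH];
+//   mode_info[ref_mv_idx].full_search_mv = searched_mv;  // full-pel result
+//   mode_info[ref_mv_idx].full_mv_rate = rate;   // cost of signaling the mv
+//   mode_info[ref_mv_idx].full_mv_bestsme = err; // full-pel search error
+//   mode_info[ref_mv_idx].skip = 1;              // prune this candidate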
+typedef struct { + int drl_cost; + int_mv full_search_mv; + int full_mv_rate; + int full_mv_bestsme; + int skip; +} inter_mode_info; + +struct HandleInterModeArgs; +void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int ref_idx, int *rate_mv, + int search_range, inter_mode_info *mode_info, + int_mv *best_mv, + struct HandleInterModeArgs *const args); + +int av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int_mv *cur_mv, + const uint8_t *mask, int mask_stride, int *rate_mv, + int allow_second_mv, int joint_me_num_refine_iter); + +int av1_interinter_compound_motion_search(const AV1_COMP *const cpi, + MACROBLOCK *x, + const int_mv *const cur_mv, + const BLOCK_SIZE bsize, + const PREDICTION_MODE this_mode); + +int av1_compound_single_motion_search_interinter( + const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *cur_mv, + const uint8_t *mask, int mask_stride, int *rate_mv, int ref_idx); + +int av1_compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, MV *this_mv, + const uint8_t *second_pred, + const uint8_t *mask, int mask_stride, + int *rate_mv, int ref_idx); + +// Performs a motion search in SIMPLE_TRANSLATION mode using reference frame +// ref and calculates the sse and var of the residue. Note that this sets the +// offset of mbmi, so we will need to reset it after calling this function. +int_mv av1_simple_motion_search_sse_var(struct AV1_COMP *cpi, MACROBLOCK *x, + int mi_row, int mi_col, + BLOCK_SIZE bsize, int ref, + const FULLPEL_MV start_mv, + int num_planes, int use_subpixel, + unsigned int *sse, unsigned int *var); + +static AOM_INLINE const search_site_config *av1_get_search_site_config( + const AV1_COMP *cpi, MACROBLOCK *x, SEARCH_METHODS search_method) { + const int ref_stride = x->e_mbd.plane[0].pre[0].stride; + + // AV1_COMP::mv_search_params.search_site_config is a compressor level cache + // that's shared by multiple threads. In most cases where all frames have the + // same resolution, the cache contains the search site config that we need. + const MotionVectorSearchParams *mv_search_params = &cpi->mv_search_params; + if (ref_stride == mv_search_params->search_site_cfg[SS_CFG_SRC]->stride) { + return mv_search_params->search_site_cfg[SS_CFG_SRC]; + } else if (ref_stride == + mv_search_params->search_site_cfg[SS_CFG_LOOKAHEAD]->stride) { + return mv_search_params->search_site_cfg[SS_CFG_LOOKAHEAD]; + } + + // If the cache does not contain the correct stride, then we will need to rely + // on the thread level config MACROBLOCK::search_site_cfg_buf. If even the + // thread level config doesn't match, then we need to update it. + search_method = search_method_lookup[search_method]; + assert(search_method_lookup[search_method] == search_method && + "The search_method_lookup table should be idempotent."); + if (ref_stride != x->search_site_cfg_buf[search_method].stride) { + av1_refresh_search_site_config(x->search_site_cfg_buf, search_method, + ref_stride); + } + + return x->search_site_cfg_buf; +} + +static AOM_INLINE SEARCH_METHODS +av1_get_faster_search_method(SEARCH_METHODS search_method) { + // Note on search method's accuracy: + // 1. NSTEP + // 2. DIAMOND + // 3. BIGDIA \approx SQUARE + // 4. HEX. + // 5. 
FAST_HEX \approx FAST_DIAMOND + switch (search_method) { + case NSTEP: return DIAMOND; + case NSTEP_8PT: return DIAMOND; + case DIAMOND: return BIGDIA; + case CLAMPED_DIAMOND: return BIGDIA; + case BIGDIA: return HEX; + case SQUARE: return HEX; + case HEX: return FAST_HEX; + case FAST_HEX: return FAST_HEX; + case FAST_DIAMOND: return VFAST_DIAMOND; + case FAST_BIGDIA: return FAST_BIGDIA; + case VFAST_DIAMOND: return VFAST_DIAMOND; + default: assert(0 && "Invalid search method!"); return DIAMOND; + } +} + +static AOM_INLINE SEARCH_METHODS av1_get_default_mv_search_method( + const MACROBLOCK *x, const MV_SPEED_FEATURES *mv_sf, BLOCK_SIZE bsize) { + SEARCH_METHODS search_method = mv_sf->search_method; + const int sf_blk_search_method = mv_sf->use_bsize_dependent_search_method; + const int min_dim = AOMMIN(block_size_wide[bsize], block_size_high[bsize]); + const int qband = x->qindex >> (QINDEX_BITS - 2); + const bool use_faster_search_method = + (sf_blk_search_method == 1 && min_dim >= 32) || + (sf_blk_search_method >= 2 && min_dim >= 16 && + x->content_state_sb.source_sad_nonrd <= kMedSad && qband < 3); + + if (use_faster_search_method) { + search_method = av1_get_faster_search_method(search_method); + } + return search_method; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_MOTION_SEARCH_H_ diff --git a/third_party/aom/av1/encoder/mv_prec.c b/third_party/aom/av1/encoder/mv_prec.c new file mode 100644 index 0000000000..b64f4dcd0e --- /dev/null +++ b/third_party/aom/av1/encoder/mv_prec.c @@ -0,0 +1,429 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/aom_config.h" + +#include "av1/encoder/encodemv.h" +#if !CONFIG_REALTIME_ONLY +#include "av1/encoder/misc_model_weights.h" +#endif // !CONFIG_REALTIME_ONLY +#include "av1/encoder/mv_prec.h" + +#if !CONFIG_REALTIME_ONLY +static AOM_INLINE int_mv get_ref_mv_for_mv_stats( + const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame, + int ref_idx) { + int ref_mv_idx = mbmi->ref_mv_idx; + if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) { + assert(has_second_ref(mbmi)); + ref_mv_idx += 1; + } + + const MV_REFERENCE_FRAME *ref_frames = mbmi->ref_frame; + const int8_t ref_frame_type = av1_ref_frame_type(ref_frames); + const CANDIDATE_MV *curr_ref_mv_stack = mbmi_ext_frame->ref_mv_stack; + + if (ref_frames[1] > INTRA_FRAME) { + assert(ref_idx == 0 || ref_idx == 1); + return ref_idx ? curr_ref_mv_stack[ref_mv_idx].comp_mv + : curr_ref_mv_stack[ref_mv_idx].this_mv; + } + + assert(ref_idx == 0); + return ref_mv_idx < mbmi_ext_frame->ref_mv_count + ? curr_ref_mv_stack[ref_mv_idx].this_mv + : mbmi_ext_frame->global_mvs[ref_frame_type]; +} + +static AOM_INLINE int get_symbol_cost(const aom_cdf_prob *cdf, int symbol) { + const aom_cdf_prob cur_cdf = AOM_ICDF(cdf[symbol]); + const aom_cdf_prob prev_cdf = symbol ? 
AOM_ICDF(cdf[symbol - 1]) : 0; + const aom_cdf_prob p15 = AOMMAX(cur_cdf - prev_cdf, EC_MIN_PROB); + + return av1_cost_symbol(p15); +} + +static AOM_INLINE int keep_one_comp_stat(MV_STATS *mv_stats, int comp_val, + int comp_idx, const AV1_COMP *cpi, + int *rates) { + assert(comp_val != 0 && "mv component should not have zero value!"); + const int sign = comp_val < 0; + const int mag = sign ? -comp_val : comp_val; + const int mag_minus_1 = mag - 1; + int offset; + const int mv_class = av1_get_mv_class(mag_minus_1, &offset); + const int int_part = offset >> 3; // int mv data + const int frac_part = (offset >> 1) & 3; // fractional mv data + const int high_part = offset & 1; // high precision mv data + const int use_hp = cpi->common.features.allow_high_precision_mv; + int r_idx = 0; + + const MACROBLOCK *const x = &cpi->td.mb; + const MACROBLOCKD *const xd = &x->e_mbd; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + nmv_context *nmvc = &ec_ctx->nmvc; + nmv_component *mvcomp_ctx = nmvc->comps; + nmv_component *cur_mvcomp_ctx = &mvcomp_ctx[comp_idx]; + aom_cdf_prob *sign_cdf = cur_mvcomp_ctx->sign_cdf; + aom_cdf_prob *class_cdf = cur_mvcomp_ctx->classes_cdf; + aom_cdf_prob *class0_cdf = cur_mvcomp_ctx->class0_cdf; + aom_cdf_prob(*bits_cdf)[3] = cur_mvcomp_ctx->bits_cdf; + aom_cdf_prob *frac_part_cdf = mv_class + ? (cur_mvcomp_ctx->fp_cdf) + : (cur_mvcomp_ctx->class0_fp_cdf[int_part]); + aom_cdf_prob *high_part_cdf = + mv_class ? (cur_mvcomp_ctx->hp_cdf) : (cur_mvcomp_ctx->class0_hp_cdf); + + const int sign_rate = get_symbol_cost(sign_cdf, sign); + rates[r_idx++] = sign_rate; + update_cdf(sign_cdf, sign, 2); + + const int class_rate = get_symbol_cost(class_cdf, mv_class); + rates[r_idx++] = class_rate; + update_cdf(class_cdf, mv_class, MV_CLASSES); + + int int_bit_rate = 0; + if (mv_class == MV_CLASS_0) { + int_bit_rate = get_symbol_cost(class0_cdf, int_part); + update_cdf(class0_cdf, int_part, CLASS0_SIZE); + } else { + const int n = mv_class + CLASS0_BITS - 1; // number of bits + for (int i = 0; i < n; ++i) { + int_bit_rate += get_symbol_cost(bits_cdf[i], (int_part >> i) & 1); + update_cdf(bits_cdf[i], (int_part >> i) & 1, 2); + } + } + rates[r_idx++] = int_bit_rate; + const int frac_part_rate = get_symbol_cost(frac_part_cdf, frac_part); + rates[r_idx++] = frac_part_rate; + update_cdf(frac_part_cdf, frac_part, MV_FP_SIZE); + const int high_part_rate = + use_hp ? get_symbol_cost(high_part_cdf, high_part) : 0; + if (use_hp) { + update_cdf(high_part_cdf, high_part, 2); + } + rates[r_idx++] = high_part_rate; + + mv_stats->last_bit_zero += !high_part; + mv_stats->last_bit_nonzero += high_part; + const int total_rate = + (sign_rate + class_rate + int_bit_rate + frac_part_rate + high_part_rate); + return total_rate; +} + +static AOM_INLINE void keep_one_mv_stat(MV_STATS *mv_stats, const MV *ref_mv, + const MV *cur_mv, const AV1_COMP *cpi) { + const MACROBLOCK *const x = &cpi->td.mb; + const MACROBLOCKD *const xd = &x->e_mbd; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + nmv_context *nmvc = &ec_ctx->nmvc; + aom_cdf_prob *joint_cdf = nmvc->joints_cdf; + const int use_hp = cpi->common.features.allow_high_precision_mv; + + const MV diff = { cur_mv->row - ref_mv->row, cur_mv->col - ref_mv->col }; + const int mv_joint = av1_get_mv_joint(&diff); + // TODO(chiyotsai@google.com): Estimate hp_diff when we are using lp + const MV hp_diff = diff; + const int hp_mv_joint = av1_get_mv_joint(&hp_diff); + const MV truncated_diff = { (diff.row / 2) * 2, (diff.col / 2) * 2 }; + const MV lp_diff = use_hp ? 
truncated_diff : diff; + const int lp_mv_joint = av1_get_mv_joint(&lp_diff); + + const int mv_joint_rate = get_symbol_cost(joint_cdf, mv_joint); + const int hp_mv_joint_rate = get_symbol_cost(joint_cdf, hp_mv_joint); + const int lp_mv_joint_rate = get_symbol_cost(joint_cdf, lp_mv_joint); + + update_cdf(joint_cdf, mv_joint, MV_JOINTS); + + mv_stats->total_mv_rate += mv_joint_rate; + mv_stats->hp_total_mv_rate += hp_mv_joint_rate; + mv_stats->lp_total_mv_rate += lp_mv_joint_rate; + mv_stats->mv_joint_count[mv_joint]++; + + for (int comp_idx = 0; comp_idx < 2; comp_idx++) { + const int comp_val = comp_idx ? diff.col : diff.row; + const int hp_comp_val = comp_idx ? hp_diff.col : hp_diff.row; + const int lp_comp_val = comp_idx ? lp_diff.col : lp_diff.row; + int rates[5]; + av1_zero_array(rates, 5); + + const int comp_rate = + comp_val ? keep_one_comp_stat(mv_stats, comp_val, comp_idx, cpi, rates) + : 0; + // TODO(chiyotsai@google.com): Properly get hp rate when use_hp is false + const int hp_rate = + hp_comp_val ? rates[0] + rates[1] + rates[2] + rates[3] + rates[4] : 0; + const int lp_rate = + lp_comp_val ? rates[0] + rates[1] + rates[2] + rates[3] : 0; + + mv_stats->total_mv_rate += comp_rate; + mv_stats->hp_total_mv_rate += hp_rate; + mv_stats->lp_total_mv_rate += lp_rate; + } +} + +static AOM_INLINE void collect_mv_stats_b(MV_STATS *mv_stats, + const AV1_COMP *cpi, int mi_row, + int mi_col) { + const AV1_COMMON *cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + + if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) { + return; + } + + const MB_MODE_INFO *mbmi = + mi_params->mi_grid_base[mi_row * mi_params->mi_stride + mi_col]; + const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame = + cpi->mbmi_ext_info.frame_base + + get_mi_ext_idx(mi_row, mi_col, cm->mi_params.mi_alloc_bsize, + cpi->mbmi_ext_info.stride); + + if (!is_inter_block(mbmi)) { + mv_stats->intra_count++; + return; + } + mv_stats->inter_count++; + + const PREDICTION_MODE mode = mbmi->mode; + const int is_compound = has_second_ref(mbmi); + + if (mode == NEWMV || mode == NEW_NEWMV) { + // All mvs are new + for (int ref_idx = 0; ref_idx < 1 + is_compound; ++ref_idx) { + const MV ref_mv = + get_ref_mv_for_mv_stats(mbmi, mbmi_ext_frame, ref_idx).as_mv; + const MV cur_mv = mbmi->mv[ref_idx].as_mv; + keep_one_mv_stat(mv_stats, &ref_mv, &cur_mv, cpi); + } + } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV || + mode == NEW_NEARESTMV || mode == NEW_NEARMV) { + // has exactly one new_mv + mv_stats->default_mvs += 1; + + const int ref_idx = (mode == NEAREST_NEWMV || mode == NEAR_NEWMV); + const MV ref_mv = + get_ref_mv_for_mv_stats(mbmi, mbmi_ext_frame, ref_idx).as_mv; + const MV cur_mv = mbmi->mv[ref_idx].as_mv; + + keep_one_mv_stat(mv_stats, &ref_mv, &cur_mv, cpi); + } else { + // No new_mv + mv_stats->default_mvs += 1 + is_compound; + } + + // Add texture information + const BLOCK_SIZE bsize = mbmi->bsize; + const int num_rows = block_size_high[bsize]; + const int num_cols = block_size_wide[bsize]; + const int y_stride = cpi->source->y_stride; + const int px_row = 4 * mi_row, px_col = 4 * mi_col; + const int buf_is_hbd = cpi->source->flags & YV12_FLAG_HIGHBITDEPTH; + const int bd = cm->seq_params->bit_depth; + if (buf_is_hbd) { + uint16_t *source_buf = + CONVERT_TO_SHORTPTR(cpi->source->y_buffer) + px_row * y_stride + px_col; + for (int row = 0; row < num_rows - 1; row++) { + for (int col = 0; col < num_cols - 1; col++) { + const int offset = row * y_stride + col; + const int horz_diff = + 
abs(source_buf[offset + 1] - source_buf[offset]) >> (bd - 8); + const int vert_diff = + abs(source_buf[offset + y_stride] - source_buf[offset]) >> (bd - 8); + mv_stats->horz_text += horz_diff; + mv_stats->vert_text += vert_diff; + mv_stats->diag_text += horz_diff * vert_diff; + } + } + } else { + uint8_t *source_buf = cpi->source->y_buffer + px_row * y_stride + px_col; + for (int row = 0; row < num_rows - 1; row++) { + for (int col = 0; col < num_cols - 1; col++) { + const int offset = row * y_stride + col; + const int horz_diff = abs(source_buf[offset + 1] - source_buf[offset]); + const int vert_diff = + abs(source_buf[offset + y_stride] - source_buf[offset]); + mv_stats->horz_text += horz_diff; + mv_stats->vert_text += vert_diff; + mv_stats->diag_text += horz_diff * vert_diff; + } + } + } +} + +// Split block +static AOM_INLINE void collect_mv_stats_sb(MV_STATS *mv_stats, + const AV1_COMP *cpi, int mi_row, + int mi_col, BLOCK_SIZE bsize) { + assert(bsize < BLOCK_SIZES_ALL); + const AV1_COMMON *cm = &cpi->common; + + if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols) + return; + + const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize); + const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition); + + const int hbs = mi_size_wide[bsize] / 2; + const int qbs = mi_size_wide[bsize] / 4; + switch (partition) { + case PARTITION_NONE: + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col); + break; + case PARTITION_HORZ: + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col); + collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col); + break; + case PARTITION_VERT: + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col); + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs); + break; + case PARTITION_SPLIT: + collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col, subsize); + collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col + hbs, subsize); + collect_mv_stats_sb(mv_stats, cpi, mi_row + hbs, mi_col, subsize); + collect_mv_stats_sb(mv_stats, cpi, mi_row + hbs, mi_col + hbs, subsize); + break; + case PARTITION_HORZ_A: + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col); + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs); + collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col); + break; + case PARTITION_HORZ_B: + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col); + collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col); + collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col + hbs); + break; + case PARTITION_VERT_A: + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col); + collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col); + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs); + break; + case PARTITION_VERT_B: + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col); + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs); + collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col + hbs); + break; + case PARTITION_HORZ_4: + for (int i = 0; i < 4; ++i) { + const int this_mi_row = mi_row + i * qbs; + collect_mv_stats_b(mv_stats, cpi, this_mi_row, mi_col); + } + break; + case PARTITION_VERT_4: + for (int i = 0; i < 4; ++i) { + const int this_mi_col = mi_col + i * qbs; + collect_mv_stats_b(mv_stats, cpi, mi_row, this_mi_col); + } + break; + default: assert(0); + } +} + +static AOM_INLINE void collect_mv_stats_tile(MV_STATS *mv_stats, + const AV1_COMP *cpi, + const TileInfo *tile_info) { + const AV1_COMMON *cm = &cpi->common; + const int mi_row_start = tile_info->mi_row_start; + const int mi_row_end = tile_info->mi_row_end; + const int mi_col_start = 
tile_info->mi_col_start; + const int mi_col_end = tile_info->mi_col_end; + const int sb_size_mi = cm->seq_params->mib_size; + BLOCK_SIZE sb_size = cm->seq_params->sb_size; + for (int mi_row = mi_row_start; mi_row < mi_row_end; mi_row += sb_size_mi) { + for (int mi_col = mi_col_start; mi_col < mi_col_end; mi_col += sb_size_mi) { + collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col, sb_size); + } + } +} + +void av1_collect_mv_stats(AV1_COMP *cpi, int current_q) { + MV_STATS *mv_stats = &cpi->mv_stats; + const AV1_COMMON *cm = &cpi->common; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + + for (int tile_row = 0; tile_row < tile_rows; tile_row++) { + TileInfo tile_info; + av1_tile_set_row(&tile_info, cm, tile_row); + for (int tile_col = 0; tile_col < tile_cols; tile_col++) { + const int tile_idx = tile_row * tile_cols + tile_col; + av1_tile_set_col(&tile_info, cm, tile_col); + cpi->tile_data[tile_idx].tctx = *cm->fc; + cpi->td.mb.e_mbd.tile_ctx = &cpi->tile_data[tile_idx].tctx; + collect_mv_stats_tile(mv_stats, cpi, &tile_info); + } + } + + mv_stats->q = current_q; + mv_stats->order = cpi->common.current_frame.order_hint; + mv_stats->valid = 1; +} + +static AOM_INLINE int get_smart_mv_prec(AV1_COMP *cpi, const MV_STATS *mv_stats, + int current_q) { + const AV1_COMMON *cm = &cpi->common; + const int order_hint = cpi->common.current_frame.order_hint; + const int order_diff = order_hint - mv_stats->order; + const float area = (float)(cm->width * cm->height); + float features[MV_PREC_FEATURE_SIZE] = { + (float)current_q, + (float)mv_stats->q, + (float)order_diff, + mv_stats->inter_count / area, + mv_stats->intra_count / area, + mv_stats->default_mvs / area, + mv_stats->mv_joint_count[0] / area, + mv_stats->mv_joint_count[1] / area, + mv_stats->mv_joint_count[2] / area, + mv_stats->mv_joint_count[3] / area, + mv_stats->last_bit_zero / area, + mv_stats->last_bit_nonzero / area, + mv_stats->total_mv_rate / area, + mv_stats->hp_total_mv_rate / area, + mv_stats->lp_total_mv_rate / area, + mv_stats->horz_text / area, + mv_stats->vert_text / area, + mv_stats->diag_text / area, + }; + + for (int f_idx = 0; f_idx < MV_PREC_FEATURE_SIZE; f_idx++) { + features[f_idx] = + (features[f_idx] - av1_mv_prec_mean[f_idx]) / av1_mv_prec_std[f_idx]; + } + float score = 0.0f; + + av1_nn_predict(features, &av1_mv_prec_dnn_config, 1, &score); + + const int use_high_hp = score >= 0.0f; + return use_high_hp; +} +#endif // !CONFIG_REALTIME_ONLY + +void av1_pick_and_set_high_precision_mv(AV1_COMP *cpi, int qindex) { + int use_hp = qindex < HIGH_PRECISION_MV_QTHRESH; +#if !CONFIG_REALTIME_ONLY + MV_STATS *mv_stats = &cpi->mv_stats; +#endif // !CONFIG_REALTIME_ONLY + + if (cpi->sf.hl_sf.high_precision_mv_usage == QTR_ONLY) { + use_hp = 0; + } +#if !CONFIG_REALTIME_ONLY + else if (cpi->sf.hl_sf.high_precision_mv_usage == LAST_MV_DATA && + av1_frame_allows_smart_mv(cpi) && mv_stats->valid) { + use_hp = get_smart_mv_prec(cpi, mv_stats, qindex); + } +#endif // !CONFIG_REALTIME_ONLY + + av1_set_high_precision_mv(cpi, use_hp, + cpi->common.features.cur_frame_force_integer_mv); +} diff --git a/third_party/aom/av1/encoder/mv_prec.h b/third_party/aom/av1/encoder/mv_prec.h new file mode 100644 index 0000000000..55108b6cdb --- /dev/null +++ b/third_party/aom/av1/encoder/mv_prec.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_MV_PREC_H_ +#define AOM_AV1_ENCODER_MV_PREC_H_ + +#include "av1/encoder/encoder.h" +#include "av1/encoder/speed_features.h" + +// Q threshold for high precision mv. +#define HIGH_PRECISION_MV_QTHRESH 128 +#if !CONFIG_REALTIME_ONLY +void av1_collect_mv_stats(AV1_COMP *cpi, int current_q); + +static AOM_INLINE int av1_frame_allows_smart_mv(const AV1_COMP *cpi) { + const int gf_group_index = cpi->gf_frame_index; + const int gf_update_type = cpi->ppi->gf_group.update_type[gf_group_index]; + return !frame_is_intra_only(&cpi->common) && + !(gf_update_type == INTNL_OVERLAY_UPDATE || + gf_update_type == OVERLAY_UPDATE); +} +#endif // !CONFIG_REALTIME_ONLY + +static AOM_INLINE void av1_set_high_precision_mv( + AV1_COMP *cpi, int allow_high_precision_mv, + int cur_frame_force_integer_mv) { + MvCosts *const mv_costs = cpi->td.mb.mv_costs; + // Avoid accessing 'mv_costs' when it is not allocated. + if (mv_costs == NULL) return; + + const int copy_hp = cpi->common.features.allow_high_precision_mv = + allow_high_precision_mv && !cur_frame_force_integer_mv; + + mv_costs->nmv_cost[0] = &mv_costs->nmv_cost_alloc[0][MV_MAX]; + mv_costs->nmv_cost[1] = &mv_costs->nmv_cost_alloc[1][MV_MAX]; + mv_costs->nmv_cost_hp[0] = &mv_costs->nmv_cost_hp_alloc[0][MV_MAX]; + mv_costs->nmv_cost_hp[1] = &mv_costs->nmv_cost_hp_alloc[1][MV_MAX]; + mv_costs->mv_cost_stack = + copy_hp ? mv_costs->nmv_cost_hp : mv_costs->nmv_cost; +} + +void av1_pick_and_set_high_precision_mv(AV1_COMP *cpi, int qindex); + +#endif // AOM_AV1_ENCODER_MV_PREC_H_ diff --git a/third_party/aom/av1/encoder/nonrd_opt.c b/third_party/aom/av1/encoder/nonrd_opt.c new file mode 100644 index 0000000000..651ca43a2e --- /dev/null +++ b/third_party/aom/av1/encoder/nonrd_opt.c @@ -0,0 +1,933 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/aom_dsp_rtcd.h" + +#include "av1/common/reconinter.h" + +#include "av1/encoder/encodemv.h" +#include "av1/encoder/nonrd_opt.h" +#include "av1/encoder/rdopt.h" + +static const SCAN_ORDER av1_fast_idtx_scan_order_16x16 = { + av1_fast_idtx_scan_16x16, av1_fast_idtx_iscan_16x16 +}; + +#define DECLARE_BLOCK_YRD_BUFFERS() \ + DECLARE_ALIGNED(64, tran_low_t, dqcoeff_buf[16 * 16]); \ + DECLARE_ALIGNED(64, tran_low_t, qcoeff_buf[16 * 16]); \ + DECLARE_ALIGNED(64, tran_low_t, coeff_buf[16 * 16]); \ + uint16_t eob[1]; + +#define DECLARE_BLOCK_YRD_VARS() \ + /* When is_tx_8x8_dual_applicable is true, we compute the txfm for the \ + * entire bsize and write macroblock_plane::coeff. So low_coeff is kept \ + * as a non-const so we can reassign it to macroblock_plane::coeff. 
*/ \
+  int16_t *low_coeff = (int16_t *)coeff_buf;                              \
+  int16_t *const low_qcoeff = (int16_t *)qcoeff_buf;                      \
+  int16_t *const low_dqcoeff = (int16_t *)dqcoeff_buf;                    \
+  const int diff_stride = bw;
+
+#define DECLARE_LOOP_VARS_BLOCK_YRD() \
+  const int16_t *src_diff = &p->src_diff[(r * diff_stride + c) << 2];
+
+static AOM_FORCE_INLINE void update_yrd_loop_vars(
+    MACROBLOCK *x, int *skippable, int step, int ncoeffs,
+    int16_t *const low_coeff, int16_t *const low_qcoeff,
+    int16_t *const low_dqcoeff, RD_STATS *this_rdc, int *eob_cost,
+    int tx_blk_id) {
+  const int is_txfm_skip = (ncoeffs == 0);
+  *skippable &= is_txfm_skip;
+  x->txfm_search_info.blk_skip[tx_blk_id] = is_txfm_skip;
+  *eob_cost += get_msb(ncoeffs + 1);
+  if (ncoeffs == 1)
+    this_rdc->rate += (int)abs(low_qcoeff[0]);
+  else if (ncoeffs > 1)
+    this_rdc->rate += aom_satd_lp(low_qcoeff, step << 4);
+
+  this_rdc->dist += av1_block_error_lp(low_coeff, low_dqcoeff, step << 4) >> 2;
+}
+
+static INLINE void aom_process_hadamard_lp_8x16(MACROBLOCK *x,
+                                                int max_blocks_high,
+                                                int max_blocks_wide,
+                                                int num_4x4_w, int step,
+                                                int block_step) {
+  struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
+  const int bw = 4 * num_4x4_w;
+  const int num_4x4 = AOMMIN(num_4x4_w, max_blocks_wide);
+  int block = 0;
+
+  for (int r = 0; r < max_blocks_high; r += block_step) {
+    for (int c = 0; c < num_4x4; c += 2 * block_step) {
+      const int16_t *src_diff = &p->src_diff[(r * bw + c) << 2];
+      int16_t *low_coeff = (int16_t *)p->coeff + BLOCK_OFFSET(block);
+      aom_hadamard_lp_8x8_dual(src_diff, (ptrdiff_t)bw, low_coeff);
+      block += 2 * step;
+    }
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+#define DECLARE_BLOCK_YRD_HBD_VARS()     \
+  tran_low_t *const coeff = coeff_buf;   \
+  tran_low_t *const qcoeff = qcoeff_buf; \
+  tran_low_t *const dqcoeff = dqcoeff_buf;
+
+static AOM_FORCE_INLINE void update_yrd_loop_vars_hbd(
+    MACROBLOCK *x, int *skippable, int step, int ncoeffs,
+    tran_low_t *const coeff, tran_low_t *const qcoeff,
+    tran_low_t *const dqcoeff, RD_STATS *this_rdc, int *eob_cost,
+    int tx_blk_id) {
+  const MACROBLOCKD *xd = &x->e_mbd;
+  const int is_txfm_skip = (ncoeffs == 0);
+  *skippable &= is_txfm_skip;
+  x->txfm_search_info.blk_skip[tx_blk_id] = is_txfm_skip;
+  *eob_cost += get_msb(ncoeffs + 1);
+
+  int64_t dummy;
+  if (ncoeffs == 1)
+    this_rdc->rate += (int)abs(qcoeff[0]);
+  else if (ncoeffs > 1)
+    this_rdc->rate += aom_satd(qcoeff, step << 4);
+  this_rdc->dist +=
+      av1_highbd_block_error(coeff, dqcoeff, step << 4, &dummy, xd->bd) >> 2;
+}
+#endif
+
+/*!\brief Calculates RD Cost using Hadamard transform.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Calculates RD Cost using Hadamard transform. For low bit depth this
+ * function uses a low-precision (16-bit) set of functions; for high bit depth
+ * the 32-bit variants are used.
+ * \param[in]    x              Pointer to structure holding all the data for
+                                the current macroblock
+ * \param[in]    this_rdc       Pointer to calculated RD Cost
+ * \param[in]    skippable      Pointer to a flag indicating possible tx skip
+ * \param[in]    bsize          Current block size
+ * \param[in]    tx_size        Transform size
+ *
+ * \remark Nothing is returned. Instead, calculated RD cost is placed to
+ * \c this_rdc. 
\c skippable flag is set if there is no non-zero quantized + * coefficients for Hadamard transform + */ +void av1_block_yrd(MACROBLOCK *x, RD_STATS *this_rdc, int *skippable, + BLOCK_SIZE bsize, TX_SIZE tx_size) { + MACROBLOCKD *xd = &x->e_mbd; + const struct macroblockd_plane *pd = &xd->plane[AOM_PLANE_Y]; + struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y]; + assert(bsize < BLOCK_SIZES_ALL); + const int num_4x4_w = mi_size_wide[bsize]; + const int num_4x4_h = mi_size_high[bsize]; + const int step = 1 << (tx_size << 1); + const int block_step = (1 << tx_size); + const int row_step = step * num_4x4_w >> tx_size; + int block = 0; + const int max_blocks_wide = + num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >> 5); + const int max_blocks_high = + num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >> 5); + int eob_cost = 0; + const int bw = 4 * num_4x4_w; + const int bh = 4 * num_4x4_h; + const int use_hbd = is_cur_buf_hbd(xd); + int num_blk_skip_w = num_4x4_w; + +#if CONFIG_AV1_HIGHBITDEPTH + if (use_hbd) { + aom_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, + p->src.stride, pd->dst.buf, pd->dst.stride); + } else { + aom_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride, + pd->dst.buf, pd->dst.stride); + } +#else + aom_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride, + pd->dst.buf, pd->dst.stride); +#endif + + // Keep the intermediate value on the stack here. Writing directly to + // skippable causes speed regression due to load-and-store issues in + // update_yrd_loop_vars. + int temp_skippable = 1; + this_rdc->dist = 0; + this_rdc->rate = 0; + // For block sizes 8x16 or above, Hadamard txfm of two adjacent 8x8 blocks + // can be done per function call. Hence the call of Hadamard txfm is + // abstracted here for the specified cases. + int is_tx_8x8_dual_applicable = + (tx_size == TX_8X8 && block_size_wide[bsize] >= 16 && + block_size_high[bsize] >= 8); + +#if CONFIG_AV1_HIGHBITDEPTH + // As of now, dual implementation of hadamard txfm is available for low + // bitdepth. + if (use_hbd) is_tx_8x8_dual_applicable = 0; +#endif + + if (is_tx_8x8_dual_applicable) { + aom_process_hadamard_lp_8x16(x, max_blocks_high, max_blocks_wide, num_4x4_w, + step, block_step); + } + + const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT]; + DECLARE_BLOCK_YRD_BUFFERS() + DECLARE_BLOCK_YRD_VARS() +#if CONFIG_AV1_HIGHBITDEPTH + DECLARE_BLOCK_YRD_HBD_VARS() +#else + (void)use_hbd; +#endif + + // Keep track of the row and column of the blocks we use so that we know + // if we are in the unrestricted motion border. + for (int r = 0; r < max_blocks_high; r += block_step) { + for (int c = 0, s = 0; c < max_blocks_wide; c += block_step, s += step) { + DECLARE_LOOP_VARS_BLOCK_YRD() + + switch (tx_size) { +#if CONFIG_AV1_HIGHBITDEPTH + case TX_16X16: + if (use_hbd) { + aom_hadamard_16x16(src_diff, diff_stride, coeff); + av1_quantize_fp(coeff, 16 * 16, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff, + dqcoeff, p->dequant_QTX, eob, + // default_scan_fp_16x16_transpose and + // av1_default_iscan_fp_16x16_transpose have to be + // used together. 
+ default_scan_fp_16x16_transpose, + av1_default_iscan_fp_16x16_transpose); + } else { + aom_hadamard_lp_16x16(src_diff, diff_stride, low_coeff); + av1_quantize_lp(low_coeff, 16 * 16, p->round_fp_QTX, + p->quant_fp_QTX, low_qcoeff, low_dqcoeff, + p->dequant_QTX, eob, + // default_scan_lp_16x16_transpose and + // av1_default_iscan_lp_16x16_transpose have to be + // used together. + default_scan_lp_16x16_transpose, + av1_default_iscan_lp_16x16_transpose); + } + break; + case TX_8X8: + if (use_hbd) { + aom_hadamard_8x8(src_diff, diff_stride, coeff); + av1_quantize_fp( + coeff, 8 * 8, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX, + p->quant_shift_QTX, qcoeff, dqcoeff, p->dequant_QTX, eob, + default_scan_8x8_transpose, av1_default_iscan_8x8_transpose); + } else { + if (is_tx_8x8_dual_applicable) { + // The coeffs are pre-computed for the whole block, so re-assign + // low_coeff to the appropriate location. + const int block_offset = BLOCK_OFFSET(block + s); + low_coeff = (int16_t *)p->coeff + block_offset; + } else { + aom_hadamard_lp_8x8(src_diff, diff_stride, low_coeff); + } + av1_quantize_lp( + low_coeff, 8 * 8, p->round_fp_QTX, p->quant_fp_QTX, low_qcoeff, + low_dqcoeff, p->dequant_QTX, eob, + // default_scan_8x8_transpose and + // av1_default_iscan_8x8_transpose have to be used together. + default_scan_8x8_transpose, av1_default_iscan_8x8_transpose); + } + break; + default: + assert(tx_size == TX_4X4); + // In tx_size=4x4 case, aom_fdct4x4 and aom_fdct4x4_lp generate + // normal coefficients order, so we don't need to change the scan + // order here. + if (use_hbd) { + aom_fdct4x4(src_diff, coeff, diff_stride); + av1_quantize_fp(coeff, 4 * 4, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff, + dqcoeff, p->dequant_QTX, eob, scan_order->scan, + scan_order->iscan); + } else { + aom_fdct4x4_lp(src_diff, low_coeff, diff_stride); + av1_quantize_lp(low_coeff, 4 * 4, p->round_fp_QTX, p->quant_fp_QTX, + low_qcoeff, low_dqcoeff, p->dequant_QTX, eob, + scan_order->scan, scan_order->iscan); + } + break; +#else + case TX_16X16: + aom_hadamard_lp_16x16(src_diff, diff_stride, low_coeff); + av1_quantize_lp(low_coeff, 16 * 16, p->round_fp_QTX, p->quant_fp_QTX, + low_qcoeff, low_dqcoeff, p->dequant_QTX, eob, + default_scan_lp_16x16_transpose, + av1_default_iscan_lp_16x16_transpose); + break; + case TX_8X8: + if (is_tx_8x8_dual_applicable) { + // The coeffs are pre-computed for the whole block, so re-assign + // low_coeff to the appropriate location. 
+ const int block_offset = BLOCK_OFFSET(block + s); + low_coeff = (int16_t *)p->coeff + block_offset; + } else { + aom_hadamard_lp_8x8(src_diff, diff_stride, low_coeff); + } + av1_quantize_lp(low_coeff, 8 * 8, p->round_fp_QTX, p->quant_fp_QTX, + low_qcoeff, low_dqcoeff, p->dequant_QTX, eob, + default_scan_8x8_transpose, + av1_default_iscan_8x8_transpose); + break; + default: + aom_fdct4x4_lp(src_diff, low_coeff, diff_stride); + av1_quantize_lp(low_coeff, 4 * 4, p->round_fp_QTX, p->quant_fp_QTX, + low_qcoeff, low_dqcoeff, p->dequant_QTX, eob, + scan_order->scan, scan_order->iscan); + break; +#endif + } + assert(*eob <= 1024); +#if CONFIG_AV1_HIGHBITDEPTH + if (use_hbd) + update_yrd_loop_vars_hbd(x, &temp_skippable, step, *eob, coeff, qcoeff, + dqcoeff, this_rdc, &eob_cost, + r * num_blk_skip_w + c); + else +#endif + update_yrd_loop_vars(x, &temp_skippable, step, *eob, low_coeff, + low_qcoeff, low_dqcoeff, this_rdc, &eob_cost, + r * num_blk_skip_w + c); + } + block += row_step; + } + + this_rdc->skip_txfm = *skippable = temp_skippable; + if (this_rdc->sse < INT64_MAX) { + this_rdc->sse = (this_rdc->sse << 6) >> 2; + if (temp_skippable) { + this_rdc->dist = 0; + this_rdc->dist = this_rdc->sse; + return; + } + } + + // If skippable is set, rate gets clobbered later. + this_rdc->rate <<= (2 + AV1_PROB_COST_SHIFT); + this_rdc->rate += (eob_cost << AV1_PROB_COST_SHIFT); +} + +// Explicitly enumerate the cases so the compiler can generate SIMD for the +// function. According to the disassembler, gcc generates SSE codes for each of +// the possible block sizes. The hottest case is tx_width 16, which takes up +// about 8% of the self cycle of av1_nonrd_pick_inter_mode_sb. Since +// av1_nonrd_pick_inter_mode_sb takes up about 3% of total encoding time, the +// potential room of improvement for writing AVX2 optimization is only 3% * 8% = +// 0.24% of total encoding time. +static AOM_INLINE void scale_square_buf_vals(int16_t *dst, int tx_width, + const int16_t *src, + int src_stride) { +#define DO_SCALING \ + do { \ + for (int idy = 0; idy < tx_width; ++idy) { \ + for (int idx = 0; idx < tx_width; ++idx) { \ + dst[idy * tx_width + idx] = src[idy * src_stride + idx] * 8; \ + } \ + } \ + } while (0) + + if (tx_width == 4) { + DO_SCALING; + } else if (tx_width == 8) { + DO_SCALING; + } else if (tx_width == 16) { + DO_SCALING; + } else { + assert(0); + } + +#undef DO_SCALING +} + +/*!\brief Calculates RD Cost when the block uses Identity transform. + * Note that this function is only for low bit depth encoding, since it + * is called in real-time mode for now, which sets high bit depth to 0: + * -DCONFIG_AV1_HIGHBITDEPTH=0 + * + * \ingroup nonrd_mode_search + * \callgraph + * \callergraph + * Calculates RD Cost. For low bit depth this function + * uses low-precision set of functions (16-bit) and 32 bit for high bit depth + * \param[in] x Pointer to structure holding all the data for + the current macroblock + * \param[in] pred_buf Pointer to the prediction buffer + * \param[in] pred_stride Stride for the prediction buffer + * \param[in] this_rdc Pointer to calculated RD Cost + * \param[in] skippable Pointer to a flag indicating possible tx skip + * \param[in] bsize Current block size + * \param[in] tx_size Transform size + * + * \remark Nothing is returned. Instead, calculated RD cost is placed to + * \c this_rdc. \c skippable flag is set if all coefficients are zero. 
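+ *
+ * Editor's note: a minimal call sketch (hypothetical buffers and block size):
+ * \code
+ *   RD_STATS rdc;
+ *   int skippable;
+ *   av1_block_yrd_idtx(x, pred_buf, pred_stride, &rdc, &skippable,
+ *                      BLOCK_16X16, TX_16X16);
+ * \endcode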
+ */ +void av1_block_yrd_idtx(MACROBLOCK *x, const uint8_t *const pred_buf, + int pred_stride, RD_STATS *this_rdc, int *skippable, + BLOCK_SIZE bsize, TX_SIZE tx_size) { + MACROBLOCKD *xd = &x->e_mbd; + struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y]; + assert(bsize < BLOCK_SIZES_ALL); + const int num_4x4_w = mi_size_wide[bsize]; + const int num_4x4_h = mi_size_high[bsize]; + const int step = 1 << (tx_size << 1); + const int block_step = (1 << tx_size); + const int max_blocks_wide = + num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >> 5); + const int max_blocks_high = + num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >> 5); + int eob_cost = 0; + const int bw = 4 * num_4x4_w; + const int bh = 4 * num_4x4_h; + const int num_blk_skip_w = num_4x4_w; + // Keep the intermediate value on the stack here. Writing directly to + // skippable causes speed regression due to load-and-store issues in + // update_yrd_loop_vars. + int temp_skippable = 1; + int tx_wd = 0; + const SCAN_ORDER *scan_order = NULL; + switch (tx_size) { + case TX_64X64: + assert(0); // Not implemented + break; + case TX_32X32: + assert(0); // Not used + break; + case TX_16X16: + scan_order = &av1_fast_idtx_scan_order_16x16; + tx_wd = 16; + break; + case TX_8X8: + scan_order = &av1_fast_idtx_scan_order_8x8; + tx_wd = 8; + break; + default: + assert(tx_size == TX_4X4); + scan_order = &av1_fast_idtx_scan_order_4x4; + tx_wd = 4; + break; + } + assert(scan_order != NULL); + + this_rdc->dist = 0; + this_rdc->rate = 0; + aom_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride, + pred_buf, pred_stride); + // Keep track of the row and column of the blocks we use so that we know + // if we are in the unrestricted motion border. + DECLARE_BLOCK_YRD_BUFFERS() + DECLARE_BLOCK_YRD_VARS() + for (int r = 0; r < max_blocks_high; r += block_step) { + for (int c = 0, s = 0; c < max_blocks_wide; c += block_step, s += step) { + DECLARE_LOOP_VARS_BLOCK_YRD() + scale_square_buf_vals(low_coeff, tx_wd, src_diff, diff_stride); + av1_quantize_lp(low_coeff, tx_wd * tx_wd, p->round_fp_QTX, + p->quant_fp_QTX, low_qcoeff, low_dqcoeff, p->dequant_QTX, + eob, scan_order->scan, scan_order->iscan); + assert(*eob <= 1024); + update_yrd_loop_vars(x, &temp_skippable, step, *eob, low_coeff, + low_qcoeff, low_dqcoeff, this_rdc, &eob_cost, + r * num_blk_skip_w + c); + } + } + this_rdc->skip_txfm = *skippable = temp_skippable; + if (this_rdc->sse < INT64_MAX) { + this_rdc->sse = (this_rdc->sse << 6) >> 2; + if (temp_skippable) { + this_rdc->dist = 0; + this_rdc->dist = this_rdc->sse; + return; + } + } + // If skippable is set, rate gets clobbered later. + this_rdc->rate <<= (2 + AV1_PROB_COST_SHIFT); + this_rdc->rate += (eob_cost << AV1_PROB_COST_SHIFT); +} + +int64_t av1_model_rd_for_sb_uv(AV1_COMP *cpi, BLOCK_SIZE plane_bsize, + MACROBLOCK *x, MACROBLOCKD *xd, + RD_STATS *this_rdc, int start_plane, + int stop_plane) { + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. 
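+  // Editor's note: a worked instance of the scaling described above: with
+  // p->dequant_QTX[0] == 32, the effective DC quantizer handed to
+  // av1_model_rd_from_var_lapndz() below is 32 >> 3 == 4.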
+  unsigned int sse;
+  int rate;
+  int64_t dist;
+  int plane;
+  int64_t tot_sse = 0;
+
+  this_rdc->rate = 0;
+  this_rdc->dist = 0;
+  this_rdc->skip_txfm = 0;
+
+  for (plane = start_plane; plane <= stop_plane; ++plane) {
+    struct macroblock_plane *const p = &x->plane[plane];
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    const uint32_t dc_quant = p->dequant_QTX[0];
+    const uint32_t ac_quant = p->dequant_QTX[1];
+    const BLOCK_SIZE bs = plane_bsize;
+    unsigned int var;
+    if (!x->color_sensitivity[COLOR_SENS_IDX(plane)]) continue;
+
+    var = cpi->ppi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf,
+                                  pd->dst.stride, &sse);
+    assert(sse >= var);
+    tot_sse += sse;
+
+    av1_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bs],
+                                 dc_quant >> 3, &rate, &dist);
+
+    this_rdc->rate += rate >> 1;
+    this_rdc->dist += dist << 3;
+
+    av1_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bs], ac_quant >> 3,
+                                 &rate, &dist);
+
+    this_rdc->rate += rate;
+    this_rdc->dist += dist << 4;
+  }
+
+  if (this_rdc->rate == 0) {
+    this_rdc->skip_txfm = 1;
+  }
+
+  if (RDCOST(x->rdmult, this_rdc->rate, this_rdc->dist) >=
+      RDCOST(x->rdmult, 0, tot_sse << 4)) {
+    this_rdc->rate = 0;
+    this_rdc->dist = tot_sse << 4;
+    this_rdc->skip_txfm = 1;
+  }
+
+  return tot_sse;
+}
+
+static void compute_intra_yprediction(const AV1_COMMON *cm,
+                                      PREDICTION_MODE mode, BLOCK_SIZE bsize,
+                                      MACROBLOCK *x, MACROBLOCKD *xd) {
+  const SequenceHeader *seq_params = cm->seq_params;
+  struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+  struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
+  uint8_t *const src_buf_base = p->src.buf;
+  uint8_t *const dst_buf_base = pd->dst.buf;
+  const int src_stride = p->src.stride;
+  const int dst_stride = pd->dst.stride;
+  int plane = 0;
+  int row, col;
+  // block and transform sizes, in number of 4x4 blocks log 2 ("*_b")
+  // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
+  // transform size varies per plane, look it up in a common way.
+  const TX_SIZE tx_size = max_txsize_lookup[bsize];
+  const BLOCK_SIZE plane_bsize =
+      get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+  // If mb_to_right_edge is < 0 we are in a situation in which
+  // the current block size extends into the UMV and we won't
+  // visit the sub blocks that are wholly within the UMV.
+  const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+  const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+  // Keep track of the row and column of the blocks we use so that we know
+  // if we are in the unrestricted motion border.
+  for (row = 0; row < max_blocks_high; row += (1 << tx_size)) {
+    // Skip visiting the sub blocks that are wholly within the UMV.
+    for (col = 0; col < max_blocks_wide; col += (1 << tx_size)) {
+      p->src.buf = &src_buf_base[4 * (row * (int64_t)src_stride + col)];
+      pd->dst.buf = &dst_buf_base[4 * (row * (int64_t)dst_stride + col)];
+      av1_predict_intra_block(
+          xd, seq_params->sb_size, seq_params->enable_intra_edge_filter,
+          block_size_wide[bsize], block_size_high[bsize], tx_size, mode, 0, 0,
+          FILTER_INTRA_MODES, pd->dst.buf, dst_stride, pd->dst.buf, dst_stride,
+          0, 0, plane);
+    }
+  }
+  p->src.buf = src_buf_base;
+  pd->dst.buf = dst_buf_base;
+}
+
+// Checks whether Intra mode needs to be pruned based on the
+// 'intra_y_mode_bsize_mask_nrd' and 'prune_hv_pred_modes_using_src_sad'
+// speed features.
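+// Editor's note (illustrative): the bsize mask check below is a plain bit
+// test. If, say, intra_y_mode_bsize_mask_nrd[bsize] were
+// (1 << DC_PRED) | (1 << V_PRED), every mode other than DC_PRED and V_PRED
+// would be pruned by the first condition.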
+static INLINE bool is_prune_intra_mode(
+    AV1_COMP *cpi, int mode_index, int force_intra_check, BLOCK_SIZE bsize,
+    uint8_t segment_id, SOURCE_SAD source_sad_nonrd,
+    uint8_t color_sensitivity[MAX_MB_PLANE - 1]) {
+  const PREDICTION_MODE this_mode = intra_mode_list[mode_index];
+  if (mode_index > 2 || force_intra_check == 0) {
+    if (!((1 << this_mode) & cpi->sf.rt_sf.intra_y_mode_bsize_mask_nrd[bsize]))
+      return true;
+
+    if (this_mode == DC_PRED) return false;
+
+    if (!cpi->sf.rt_sf.prune_hv_pred_modes_using_src_sad) return false;
+
+    const bool has_color_sensitivity =
+        color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] &&
+        color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)];
+    if (has_color_sensitivity &&
+        (cpi->rc.frame_source_sad > 1.1 * cpi->rc.avg_source_sad ||
+         cyclic_refresh_segment_id_boosted(segment_id) ||
+         source_sad_nonrd > kMedSad))
+      return false;
+
+    return true;
+  }
+  return false;
+}
+
+/*!\brief Estimation of RD cost of an intra mode for Non-RD optimized case.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Calculates RD Cost for an intra mode for a single TX block using Hadamard
+ * transform.
+ * \param[in]    plane          Color plane
+ * \param[in]    block          Index of a TX block in a prediction block
+ * \param[in]    row            Row of a current TX block
+ * \param[in]    col            Column of a current TX block
+ * \param[in]    plane_bsize    Block size of a current prediction block
+ * \param[in]    tx_size        Transform size
+ * \param[in]    arg            Pointer to a structure that holds parameters
+ *                              for intra mode search
+ *
+ * \remark Nothing is returned. Instead, the rate and distortion of the
+ * current mode are accumulated in \c args->rdc
+ */
+void av1_estimate_block_intra(int plane, int block, int row, int col,
+                              BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                              void *arg) {
+  struct estimate_block_intra_args *const args = arg;
+  AV1_COMP *const cpi = args->cpi;
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = args->x;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *const p = &x->plane[plane];
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const BLOCK_SIZE bsize_tx = txsize_to_bsize[tx_size];
+  uint8_t *const src_buf_base = p->src.buf;
+  uint8_t *const dst_buf_base = pd->dst.buf;
+  const int64_t src_stride = p->src.stride;
+  const int64_t dst_stride = pd->dst.stride;
+
+  (void)block;
+
+  av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size);
+
+  if (args->prune_mode_based_on_sad) {
+    unsigned int this_sad = cpi->ppi->fn_ptr[plane_bsize].sdf(
+        p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride);
+    const unsigned int sad_threshold =
+        args->best_sad != UINT_MAX ? args->best_sad + (args->best_sad >> 4)
+                                   : UINT_MAX;
+    // Skip the evaluation of current mode if its SAD is more than a threshold.
+    if (this_sad > sad_threshold) {
+      // For the current mode, set rate and distortion to maximum possible
+      // values and return.
+      // Note: args->rdc->rate is checked in av1_nonrd_pick_intra_mode() to
+      // skip the evaluation of the current mode.
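+      // (The threshold allows 1/16 slack over the best SAD seen so far: with
+      // best_sad == 1024, for instance, sad_threshold == 1024 + 64 == 1088.)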
+      args->rdc->rate = INT_MAX;
+      args->rdc->dist = INT64_MAX;
+      return;
+    }
+    if (this_sad < args->best_sad) {
+      args->best_sad = this_sad;
+    }
+  }
+
+  RD_STATS this_rdc;
+  av1_invalid_rd_stats(&this_rdc);
+
+  p->src.buf = &src_buf_base[4 * (row * src_stride + col)];
+  pd->dst.buf = &dst_buf_base[4 * (row * dst_stride + col)];
+
+  if (plane == 0) {
+    av1_block_yrd(x, &this_rdc, &args->skippable, bsize_tx,
+                  AOMMIN(tx_size, TX_16X16));
+  } else {
+    av1_model_rd_for_sb_uv(cpi, bsize_tx, x, xd, &this_rdc, plane, plane);
+  }
+
+  p->src.buf = src_buf_base;
+  pd->dst.buf = dst_buf_base;
+  assert(args->rdc->rate != INT_MAX && args->rdc->dist != INT64_MAX);
+  args->rdc->rate += this_rdc.rate;
+  args->rdc->dist += this_rdc.dist;
+}
+
+/*!\brief Estimates best intra mode for inter mode search
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ *
+ * Using heuristics based on the best inter mode, block size, and other
+ * factors, decides whether to check intra modes. If so, estimates and selects
+ * the best intra mode from a reduced set of intra modes (at most 4 intra
+ * modes are checked).
+ *
+ * \param[in]    cpi                      Top-level encoder structure
+ * \param[in]    x                        Pointer to structure holding all the
+ *                                        data for the current macroblock
+ * \param[in]    bsize                    Current block size
+ * \param[in]    best_early_term          Flag, indicating that TX for the
+ *                                        best inter mode was skipped
+ * \param[in]    ref_cost_intra           Cost of signalling intra mode
+ * \param[in]    reuse_prediction         Flag, indicating prediction re-use
+ * \param[in]    orig_dst                 Original destination buffer
+ * \param[in]    tmp_buffers              Pointer to temporary buffers for
+ *                                        prediction re-use
+ * \param[out]   this_mode_pred           Pointer to store prediction buffer
+ *                                        for prediction re-use
+ * \param[in]    best_rdc                 Pointer to RD cost for the best
+ *                                        selected intra mode
+ * \param[in]    best_pickmode            Pointer to a structure containing
+ *                                        best mode picked so far
+ * \param[in]    ctx                      Pointer to structure holding coding
+ *                                        contexts and modes for the block
+ *
+ * \remark Nothing is returned.
Instead, the calculated RD cost is placed in
+ * \c best_rdc and the best selected mode in \c best_pickmode
+ *
+ */
+void av1_estimate_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+                             int best_early_term, unsigned int ref_cost_intra,
+                             int reuse_prediction, struct buf_2d *orig_dst,
+                             PRED_BUFFER *tmp_buffers,
+                             PRED_BUFFER **this_mode_pred, RD_STATS *best_rdc,
+                             BEST_PICKMODE *best_pickmode,
+                             PICK_MODE_CONTEXT *ctx) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mi = xd->mi[0];
+  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+  const unsigned char segment_id = mi->segment_id;
+  const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize];
+  const int *const rd_thresh_freq_fact = x->thresh_freq_fact[bsize];
+  const bool is_screen_content =
+      cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN;
+  struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+  const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf;
+
+  const CommonQuantParams *quant_params = &cm->quant_params;
+
+  RD_STATS this_rdc;
+
+  int intra_cost_penalty = av1_get_intra_cost_penalty(
+      quant_params->base_qindex, quant_params->y_dc_delta_q,
+      cm->seq_params->bit_depth);
+  int64_t inter_mode_thresh =
+      RDCOST(x->rdmult, ref_cost_intra + intra_cost_penalty, 0);
+  int perform_intra_pred = rt_sf->check_intra_pred_nonrd;
+  int force_intra_check = 0;
+  // For spatial enhancement layers: turn off intra prediction if the previous
+  // spatial layer as golden ref is not chosen as best reference. Only do this
+  // for temporal enhancement layers and on non-key frames.
+  if (cpi->svc.spatial_layer_id > 0 &&
+      best_pickmode->best_ref_frame != GOLDEN_FRAME &&
+      cpi->svc.temporal_layer_id > 0 &&
+      !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)
+    perform_intra_pred = 0;
+
+  int do_early_exit_rdthresh = 1;
+
+  uint32_t spatial_var_thresh = 50;
+  int motion_thresh = 32;
+  // Adjust thresholds to make intra mode likely tested if the other
+  // references (golden, alt) are skipped/not checked. For now always
+  // adjust for svc mode.
+  if (cpi->ppi->use_svc || (rt_sf->use_nonrd_altref_frame == 0 &&
+                            rt_sf->nonrd_prune_ref_frame_search > 0)) {
+    spatial_var_thresh = 150;
+    motion_thresh = 0;
+  }
+
+  // Some adjustments to checking intra mode based on source variance.
+  if (x->source_variance < spatial_var_thresh) {
+    // If the best inter mode is large motion or non-LAST ref reduce intra cost
+    // penalty, so intra mode is more likely tested.
+    if (best_rdc->rdcost != INT64_MAX &&
+        (best_pickmode->best_ref_frame != LAST_FRAME ||
+         abs(mi->mv[0].as_mv.row) >= motion_thresh ||
+         abs(mi->mv[0].as_mv.col) >= motion_thresh)) {
+      intra_cost_penalty = intra_cost_penalty >> 2;
+      inter_mode_thresh =
+          RDCOST(x->rdmult, ref_cost_intra + intra_cost_penalty, 0);
+      do_early_exit_rdthresh = 0;
+    }
+    if ((x->source_variance < AOMMAX(50, (spatial_var_thresh >> 1)) &&
+         x->content_state_sb.source_sad_nonrd >= kHighSad) ||
+        (is_screen_content && x->source_variance < 50 &&
+         ((bsize >= BLOCK_32X32 &&
+           x->content_state_sb.source_sad_nonrd != kZeroSad) ||
+          x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 ||
+          x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 1)))
+      force_intra_check = 1;
+    // For big blocks it is worth checking intra (since only DC will be
+    // checked), even if best_early_term is set.
+ if (bsize >= BLOCK_32X32) best_early_term = 0; + } else if (rt_sf->source_metrics_sb_nonrd && + x->content_state_sb.source_sad_nonrd <= kLowSad) { + perform_intra_pred = 0; + } + + if (best_rdc->skip_txfm && best_pickmode->best_mode_initial_skip_flag) { + if (rt_sf->skip_intra_pred == 1 && best_pickmode->best_mode != NEWMV) + perform_intra_pred = 0; + else if (rt_sf->skip_intra_pred == 2) + perform_intra_pred = 0; + } + + if (!(best_rdc->rdcost == INT64_MAX || force_intra_check || + (perform_intra_pred && !best_early_term && + bsize <= cpi->sf.part_sf.max_intra_bsize))) { + return; + } + + // Early exit based on RD cost calculated using known rate. When + // is_screen_content is true, more bias is given to intra modes. Hence, + // considered conservative threshold in early exit for the same. + const int64_t known_rd = is_screen_content + ? CALC_BIASED_RDCOST(inter_mode_thresh) + : inter_mode_thresh; + if (known_rd > best_rdc->rdcost) return; + + struct estimate_block_intra_args args; + init_estimate_block_intra_args(&args, cpi, x); + TX_SIZE intra_tx_size = AOMMIN( + AOMMIN(max_txsize_lookup[bsize], + tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]), + TX_16X16); + if (is_screen_content && cpi->rc.high_source_sad && + x->source_variance > spatial_var_thresh && bsize <= BLOCK_16X16) + intra_tx_size = TX_4X4; + + PRED_BUFFER *const best_pred = best_pickmode->best_pred; + if (reuse_prediction && best_pred != NULL) { + const int bh = block_size_high[bsize]; + const int bw = block_size_wide[bsize]; + if (best_pred->data == orig_dst->buf) { + *this_mode_pred = &tmp_buffers[get_pred_buffer(tmp_buffers, 3)]; + aom_convolve_copy(best_pred->data, best_pred->stride, + (*this_mode_pred)->data, (*this_mode_pred)->stride, bw, + bh); + best_pickmode->best_pred = *this_mode_pred; + } + } + pd->dst = *orig_dst; + + for (int midx = 0; midx < RTC_INTRA_MODES; ++midx) { + const PREDICTION_MODE this_mode = intra_mode_list[midx]; + const THR_MODES mode_index = mode_idx[INTRA_FRAME][mode_offset(this_mode)]; + const int64_t mode_rd_thresh = rd_threshes[mode_index]; + + if (is_prune_intra_mode(cpi, midx, force_intra_check, bsize, segment_id, + x->content_state_sb.source_sad_nonrd, + x->color_sensitivity)) + continue; + + if (is_screen_content && rt_sf->source_metrics_sb_nonrd) { + // For spatially flat blocks with zero motion only check + // DC mode. + if (x->content_state_sb.source_sad_nonrd == kZeroSad && + x->source_variance == 0 && this_mode != DC_PRED) + continue; + // Only test Intra for big blocks if spatial_variance is small. + else if (bsize > BLOCK_32X32 && x->source_variance > 50) + continue; + } + + if (rd_less_than_thresh(best_rdc->rdcost, mode_rd_thresh, + rd_thresh_freq_fact[mode_index]) && + (do_early_exit_rdthresh || this_mode == SMOOTH_PRED)) { + continue; + } + const BLOCK_SIZE uv_bsize = + get_plane_block_size(bsize, xd->plane[AOM_PLANE_U].subsampling_x, + xd->plane[AOM_PLANE_U].subsampling_y); + + mi->mode = this_mode; + mi->ref_frame[0] = INTRA_FRAME; + mi->ref_frame[1] = NONE_FRAME; + + av1_invalid_rd_stats(&this_rdc); + args.mode = this_mode; + args.skippable = 1; + args.rdc = &this_rdc; + mi->tx_size = intra_tx_size; + compute_intra_yprediction(cm, this_mode, bsize, x, xd); + // Look into selecting tx_size here, based on prediction residual. 
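+    // (A residual dominated by DC energy would argue for a larger tx_size,
+    // while strong AC energy argues for TX_8X8 -- mirroring what
+    // calculate_tx_size() does for inter blocks in nonrd_pickmode.c.)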
+ av1_block_yrd(x, &this_rdc, &args.skippable, bsize, mi->tx_size); + // TODO(kyslov@) Need to account for skippable + if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)]) { + av1_foreach_transformed_block_in_plane(xd, uv_bsize, AOM_PLANE_U, + av1_estimate_block_intra, &args); + } + if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]) { + av1_foreach_transformed_block_in_plane(xd, uv_bsize, AOM_PLANE_V, + av1_estimate_block_intra, &args); + } + + int mode_cost = 0; + if (av1_is_directional_mode(this_mode) && av1_use_angle_delta(bsize)) { + mode_cost += + x->mode_costs.angle_delta_cost[this_mode - V_PRED] + [MAX_ANGLE_DELTA + + mi->angle_delta[PLANE_TYPE_Y]]; + } + if (this_mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) { + mode_cost += x->mode_costs.filter_intra_cost[bsize][0]; + } + this_rdc.rate += ref_cost_intra; + this_rdc.rate += intra_cost_penalty; + this_rdc.rate += mode_cost; + this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist); + + if (is_screen_content && rt_sf->source_metrics_sb_nonrd) { + // For blocks with low spatial variance and color sad, + // favor the intra-modes, only on scene/slide change. + if (cpi->rc.high_source_sad && x->source_variance < 800 && + (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] || + x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)])) + this_rdc.rdcost = CALC_BIASED_RDCOST(this_rdc.rdcost); + // Otherwise bias against intra for blocks with zero + // motion and no color, on non-scene/slide changes. + else if (!cpi->rc.high_source_sad && x->source_variance > 0 && + x->content_state_sb.source_sad_nonrd == kZeroSad && + x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 && + x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 0) + this_rdc.rdcost = (3 * this_rdc.rdcost) >> 1; + } + + if (this_rdc.rdcost < best_rdc->rdcost) { + *best_rdc = this_rdc; + best_pickmode->best_mode = this_mode; + best_pickmode->best_tx_size = mi->tx_size; + best_pickmode->best_ref_frame = INTRA_FRAME; + best_pickmode->best_second_ref_frame = NONE; + best_pickmode->best_mode_skip_txfm = this_rdc.skip_txfm; + mi->uv_mode = this_mode; + mi->mv[0].as_int = INVALID_MV; + mi->mv[1].as_int = INVALID_MV; + if (!this_rdc.skip_txfm) + memset(ctx->blk_skip, 0, + sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk); + } + } + if (best_pickmode->best_ref_frame == INTRA_FRAME) + memset(ctx->blk_skip, 0, + sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk); + mi->tx_size = best_pickmode->best_tx_size; +} diff --git a/third_party/aom/av1/encoder/nonrd_opt.h b/third_party/aom/av1/encoder/nonrd_opt.h new file mode 100644 index 0000000000..a53578ebad --- /dev/null +++ b/third_party/aom/av1/encoder/nonrd_opt.h @@ -0,0 +1,575 @@ +/* + * Copyright (c) 2022, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AV1_ENCODER_NONRD_OPT_H_
+#define AOM_AV1_ENCODER_NONRD_OPT_H_
+
+#include "av1/encoder/rdopt_utils.h"
+#include "av1/encoder/rdopt.h"
+
+#define RTC_INTER_MODES (4)
+#define RTC_INTRA_MODES (4)
+#define RTC_MODES (AOMMAX(RTC_INTER_MODES, RTC_INTRA_MODES))
+#define CALC_BIASED_RDCOST(rdcost) (7 * (rdcost) >> 3)
+#define NUM_COMP_INTER_MODES_RT (6)
+#define NUM_INTER_MODES 12
+#define CAP_TX_SIZE_FOR_BSIZE_GT32(tx_mode_search_type, bsize) \
+  (((tx_mode_search_type) != ONLY_4X4 && (bsize) > BLOCK_32X32) ? true : false)
+#define TX_SIZE_FOR_BSIZE_GT32 (TX_16X16)
+#define FILTER_SEARCH_SIZE 2
+#if !CONFIG_REALTIME_ONLY
+#define MOTION_MODE_SEARCH_SIZE 2
+#endif
+
+extern int g_pick_inter_mode_cnt;
+/*!\cond */
+typedef struct {
+  uint8_t *data;
+  int stride;
+  int in_use;
+} PRED_BUFFER;
+
+typedef struct {
+  PRED_BUFFER *best_pred;
+  PREDICTION_MODE best_mode;
+  TX_SIZE best_tx_size;
+  TX_TYPE tx_type;
+  MV_REFERENCE_FRAME best_ref_frame;
+  MV_REFERENCE_FRAME best_second_ref_frame;
+  uint8_t best_mode_skip_txfm;
+  uint8_t best_mode_initial_skip_flag;
+  int_interpfilters best_pred_filter;
+  MOTION_MODE best_motion_mode;
+  WarpedMotionParams wm_params;
+  int num_proj_ref;
+  PALETTE_MODE_INFO pmi;
+  int64_t best_sse;
+} BEST_PICKMODE;
+
+typedef struct {
+  MV_REFERENCE_FRAME ref_frame;
+  PREDICTION_MODE pred_mode;
+} REF_MODE;
+
+typedef struct {
+  MV_REFERENCE_FRAME ref_frame[2];
+  PREDICTION_MODE pred_mode;
+} COMP_REF_MODE;
+
+struct estimate_block_intra_args {
+  AV1_COMP *cpi;
+  MACROBLOCK *x;
+  PREDICTION_MODE mode;
+  int skippable;
+  RD_STATS *rdc;
+  unsigned int best_sad;
+  bool prune_mode_based_on_sad;
+};
+/*!\endcond */
+
+/*!\brief Structure to store parameters and statistics used in non-rd inter
+ * mode evaluation.
+ */
+typedef struct {
+  //! Structure to hold best inter mode data
+  BEST_PICKMODE best_pickmode;
+  //! Structure to hold the RD cost of the current mode
+  RD_STATS this_rdc;
+  //! RD cost for the best mode found so far
+  RD_STATS best_rdc;
+  //! Distortion of chroma planes for all modes and reference frames
+  int64_t uv_dist[RTC_INTER_MODES][REF_FRAMES];
+  //! Buffer to hold predicted block for all reference frames and planes
+  struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE];
+  //! Array to hold variance of all modes and reference frames
+  unsigned int vars[RTC_INTER_MODES][REF_FRAMES];
+  //! Array to hold ref cost of single reference mode for all ref frames
+  unsigned int ref_costs_single[REF_FRAMES];
+  //! Array to hold motion vector for all modes and reference frames
+  int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES];
+  //! Array to hold best mv for all modes and reference frames
+  int_mv frame_mv_best[MB_MODE_COUNT][REF_FRAMES];
+  //! Array to hold inter mode cost of single ref mode for all ref frames
+  int single_inter_mode_costs[RTC_INTER_MODES][REF_FRAMES];
+  //! Array to hold use reference frame mask for each reference frame
+  int use_ref_frame_mask[REF_FRAMES];
+  //! Array to hold flags of evaluated modes for each reference frame
+  uint8_t mode_checked[MB_MODE_COUNT][REF_FRAMES];
+  //! Array to hold flag indicating if scaled reference frame is used.
+ bool use_scaled_ref_frame[REF_FRAMES]; +} InterModeSearchStateNonrd; + +static const uint8_t b_width_log2_lookup[BLOCK_SIZES] = { 0, 0, 1, 1, 1, 2, + 2, 2, 3, 3, 3, 4, + 4, 4, 5, 5 }; +static const uint8_t b_height_log2_lookup[BLOCK_SIZES] = { 0, 1, 0, 1, 2, 1, + 2, 3, 2, 3, 4, 3, + 4, 5, 4, 5 }; + +static const PREDICTION_MODE intra_mode_list[] = { DC_PRED, V_PRED, H_PRED, + SMOOTH_PRED }; + +static const PREDICTION_MODE inter_mode_list[] = { NEARESTMV, NEARMV, GLOBALMV, + NEWMV }; + +static const THR_MODES mode_idx[REF_FRAMES][RTC_MODES] = { + { THR_DC, THR_V_PRED, THR_H_PRED, THR_SMOOTH }, + { THR_NEARESTMV, THR_NEARMV, THR_GLOBALMV, THR_NEWMV }, + { THR_NEARESTL2, THR_NEARL2, THR_GLOBALL2, THR_NEWL2 }, + { THR_NEARESTL3, THR_NEARL3, THR_GLOBALL3, THR_NEWL3 }, + { THR_NEARESTG, THR_NEARG, THR_GLOBALG, THR_NEWG }, + { THR_NEARESTB, THR_NEARB, THR_GLOBALB, THR_NEWB }, + { THR_NEARESTA2, THR_NEARA2, THR_GLOBALA2, THR_NEWA2 }, + { THR_NEARESTA, THR_NEARA, THR_GLOBALA, THR_NEWA }, +}; + +// GLOBALMV in the set below is in fact ZEROMV as we don't do global ME in RT +// mode +static const REF_MODE ref_mode_set[NUM_INTER_MODES] = { + { LAST_FRAME, NEARESTMV }, { LAST_FRAME, NEARMV }, + { LAST_FRAME, GLOBALMV }, { LAST_FRAME, NEWMV }, + { GOLDEN_FRAME, NEARESTMV }, { GOLDEN_FRAME, NEARMV }, + { GOLDEN_FRAME, GLOBALMV }, { GOLDEN_FRAME, NEWMV }, + { ALTREF_FRAME, NEARESTMV }, { ALTREF_FRAME, NEARMV }, + { ALTREF_FRAME, GLOBALMV }, { ALTREF_FRAME, NEWMV }, +}; + +static const COMP_REF_MODE comp_ref_mode_set[NUM_COMP_INTER_MODES_RT] = { + { { LAST_FRAME, GOLDEN_FRAME }, GLOBAL_GLOBALMV }, + { { LAST_FRAME, GOLDEN_FRAME }, NEAREST_NEARESTMV }, + { { LAST_FRAME, LAST2_FRAME }, GLOBAL_GLOBALMV }, + { { LAST_FRAME, LAST2_FRAME }, NEAREST_NEARESTMV }, + { { LAST_FRAME, ALTREF_FRAME }, GLOBAL_GLOBALMV }, + { { LAST_FRAME, ALTREF_FRAME }, NEAREST_NEARESTMV }, +}; + +static const int_interpfilters filters_ref_set[9] = { + [0].as_filters = { EIGHTTAP_REGULAR, EIGHTTAP_REGULAR }, + [1].as_filters = { EIGHTTAP_SMOOTH, EIGHTTAP_SMOOTH }, + [2].as_filters = { EIGHTTAP_REGULAR, EIGHTTAP_SMOOTH }, + [3].as_filters = { EIGHTTAP_SMOOTH, EIGHTTAP_REGULAR }, + [4].as_filters = { MULTITAP_SHARP, MULTITAP_SHARP }, + [5].as_filters = { EIGHTTAP_REGULAR, MULTITAP_SHARP }, + [6].as_filters = { MULTITAP_SHARP, EIGHTTAP_REGULAR }, + [7].as_filters = { EIGHTTAP_SMOOTH, MULTITAP_SHARP }, + [8].as_filters = { MULTITAP_SHARP, EIGHTTAP_SMOOTH } +}; + +enum { + // INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV), + INTER_NEAREST = (1 << NEARESTMV), + INTER_NEAREST_NEW = (1 << NEARESTMV) | (1 << NEWMV), + INTER_NEAREST_NEAR = (1 << NEARESTMV) | (1 << NEARMV), + INTER_NEAR_NEW = (1 << NEARMV) | (1 << NEWMV), +}; + +// The original scan order (default_scan_8x8) is modified according to the extra +// transpose in hadamard c implementation, i.e., aom_hadamard_lp_8x8_c and +// aom_hadamard_8x8_c. +DECLARE_ALIGNED(16, static const int16_t, default_scan_8x8_transpose[64]) = { + 0, 8, 1, 2, 9, 16, 24, 17, 10, 3, 4, 11, 18, 25, 32, 40, + 33, 26, 19, 12, 5, 6, 13, 20, 27, 34, 41, 48, 56, 49, 42, 35, + 28, 21, 14, 7, 15, 22, 29, 36, 43, 50, 57, 58, 51, 44, 37, 30, + 23, 31, 38, 45, 52, 59, 60, 53, 46, 39, 47, 54, 61, 62, 55, 63 +}; + +// The original scan order (av1_default_iscan_8x8) is modified to match +// hadamard AVX2 implementation, i.e., aom_hadamard_lp_8x8_avx2 and +// aom_hadamard_8x8_avx2. 
Since hadamard AVX2 implementation will modify the +// order of coefficients, such that the normal scan order is no longer +// guaranteed to scan low coefficients first, therefore we modify the scan order +// accordingly. +// Note that this one has to be used together with default_scan_8x8_transpose. +DECLARE_ALIGNED(16, static const int16_t, + av1_default_iscan_8x8_transpose[64]) = { + 0, 2, 3, 9, 10, 20, 21, 35, 1, 4, 8, 11, 19, 22, 34, 36, + 5, 7, 12, 18, 23, 33, 37, 48, 6, 13, 17, 24, 32, 38, 47, 49, + 14, 16, 25, 31, 39, 46, 50, 57, 15, 26, 30, 40, 45, 51, 56, 58, + 27, 29, 41, 44, 52, 55, 59, 62, 28, 42, 43, 53, 54, 60, 61, 63 +}; + +// The original scan order (default_scan_16x16) is modified according to the +// extra transpose in hadamard c implementation in lp case, i.e., +// aom_hadamard_lp_16x16_c. +DECLARE_ALIGNED(16, static const int16_t, + default_scan_lp_16x16_transpose[256]) = { + 0, 8, 2, 4, 10, 16, 24, 18, 12, 6, 64, 14, 20, 26, 32, + 40, 34, 28, 22, 72, 66, 68, 74, 80, 30, 36, 42, 48, 56, 50, + 44, 38, 88, 82, 76, 70, 128, 78, 84, 90, 96, 46, 52, 58, 1, + 9, 3, 60, 54, 104, 98, 92, 86, 136, 130, 132, 138, 144, 94, 100, + 106, 112, 62, 5, 11, 17, 25, 19, 13, 7, 120, 114, 108, 102, 152, + 146, 140, 134, 192, 142, 148, 154, 160, 110, 116, 122, 65, 15, 21, 27, + 33, 41, 35, 29, 23, 73, 67, 124, 118, 168, 162, 156, 150, 200, 194, + 196, 202, 208, 158, 164, 170, 176, 126, 69, 75, 81, 31, 37, 43, 49, + 57, 51, 45, 39, 89, 83, 77, 71, 184, 178, 172, 166, 216, 210, 204, + 198, 206, 212, 218, 224, 174, 180, 186, 129, 79, 85, 91, 97, 47, 53, + 59, 61, 55, 105, 99, 93, 87, 137, 131, 188, 182, 232, 226, 220, 214, + 222, 228, 234, 240, 190, 133, 139, 145, 95, 101, 107, 113, 63, 121, 115, + 109, 103, 153, 147, 141, 135, 248, 242, 236, 230, 238, 244, 250, 193, 143, + 149, 155, 161, 111, 117, 123, 125, 119, 169, 163, 157, 151, 201, 195, 252, + 246, 254, 197, 203, 209, 159, 165, 171, 177, 127, 185, 179, 173, 167, 217, + 211, 205, 199, 207, 213, 219, 225, 175, 181, 187, 189, 183, 233, 227, 221, + 215, 223, 229, 235, 241, 191, 249, 243, 237, 231, 239, 245, 251, 253, 247, + 255 +}; + +#if CONFIG_AV1_HIGHBITDEPTH +// The original scan order (default_scan_16x16) is modified according to the +// extra shift in hadamard c implementation in fp case, i.e., +// aom_hadamard_16x16_c. Note that 16x16 lp and fp hadamard generate different +// outputs, so we handle them separately. 
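+// (For the 8x8 tables the relation is a plain transpose, pos' = (pos % 8) * 8
+// + pos / 8: default_scan_8x8 visits position 1 (row 0, col 1) second, and its
+// transposed counterpart 8 (row 1, col 0) is the second entry of
+// default_scan_8x8_transpose above. The 16x16 variants additionally fold in
+// the lp/fp output arrangement, hence the separate tables here.)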
+DECLARE_ALIGNED(16, static const int16_t, + default_scan_fp_16x16_transpose[256]) = { + 0, 4, 2, 8, 6, 16, 20, 18, 12, 10, 64, 14, 24, 22, 32, + 36, 34, 28, 26, 68, 66, 72, 70, 80, 30, 40, 38, 48, 52, 50, + 44, 42, 84, 82, 76, 74, 128, 78, 88, 86, 96, 46, 56, 54, 1, + 5, 3, 60, 58, 100, 98, 92, 90, 132, 130, 136, 134, 144, 94, 104, + 102, 112, 62, 9, 7, 17, 21, 19, 13, 11, 116, 114, 108, 106, 148, + 146, 140, 138, 192, 142, 152, 150, 160, 110, 120, 118, 65, 15, 25, 23, + 33, 37, 35, 29, 27, 69, 67, 124, 122, 164, 162, 156, 154, 196, 194, + 200, 198, 208, 158, 168, 166, 176, 126, 73, 71, 81, 31, 41, 39, 49, + 53, 51, 45, 43, 85, 83, 77, 75, 180, 178, 172, 170, 212, 210, 204, + 202, 206, 216, 214, 224, 174, 184, 182, 129, 79, 89, 87, 97, 47, 57, + 55, 61, 59, 101, 99, 93, 91, 133, 131, 188, 186, 228, 226, 220, 218, + 222, 232, 230, 240, 190, 137, 135, 145, 95, 105, 103, 113, 63, 117, 115, + 109, 107, 149, 147, 141, 139, 244, 242, 236, 234, 238, 248, 246, 193, 143, + 153, 151, 161, 111, 121, 119, 125, 123, 165, 163, 157, 155, 197, 195, 252, + 250, 254, 201, 199, 209, 159, 169, 167, 177, 127, 181, 179, 173, 171, 213, + 211, 205, 203, 207, 217, 215, 225, 175, 185, 183, 189, 187, 229, 227, 221, + 219, 223, 233, 231, 241, 191, 245, 243, 237, 235, 239, 249, 247, 253, 251, + 255 +}; +#endif + +// The original scan order (av1_default_iscan_16x16) is modified to match +// hadamard AVX2 implementation, i.e., aom_hadamard_lp_16x16_avx2. +// Since hadamard AVX2 implementation will modify the order of coefficients, +// such that the normal scan order is no longer guaranteed to scan low +// coefficients first, therefore we modify the scan order accordingly. Note that +// this one has to be used together with default_scan_lp_16x16_transpose. +DECLARE_ALIGNED(16, static const int16_t, + av1_default_iscan_lp_16x16_transpose[256]) = { + 0, 44, 2, 46, 3, 63, 9, 69, 1, 45, 4, 64, 8, 68, 11, + 87, 5, 65, 7, 67, 12, 88, 18, 94, 6, 66, 13, 89, 17, 93, + 24, 116, 14, 90, 16, 92, 25, 117, 31, 123, 15, 91, 26, 118, 30, + 122, 41, 148, 27, 119, 29, 121, 42, 149, 48, 152, 28, 120, 43, 150, + 47, 151, 62, 177, 10, 86, 20, 96, 21, 113, 35, 127, 19, 95, 22, + 114, 34, 126, 37, 144, 23, 115, 33, 125, 38, 145, 52, 156, 32, 124, + 39, 146, 51, 155, 58, 173, 40, 147, 50, 154, 59, 174, 73, 181, 49, + 153, 60, 175, 72, 180, 83, 198, 61, 176, 71, 179, 84, 199, 98, 202, + 70, 178, 85, 200, 97, 201, 112, 219, 36, 143, 54, 158, 55, 170, 77, + 185, 53, 157, 56, 171, 76, 184, 79, 194, 57, 172, 75, 183, 80, 195, + 102, 206, 74, 182, 81, 196, 101, 205, 108, 215, 82, 197, 100, 204, 109, + 216, 131, 223, 99, 203, 110, 217, 130, 222, 140, 232, 111, 218, 129, 221, + 141, 233, 160, 236, 128, 220, 142, 234, 159, 235, 169, 245, 78, 193, 104, + 208, 105, 212, 135, 227, 103, 207, 106, 213, 134, 226, 136, 228, 107, 214, + 133, 225, 137, 229, 164, 240, 132, 224, 138, 230, 163, 239, 165, 241, 139, + 231, 162, 238, 166, 242, 189, 249, 161, 237, 167, 243, 188, 248, 190, 250, + 168, 244, 187, 247, 191, 251, 210, 254, 186, 246, 192, 252, 209, 253, 211, + 255 +}; + +#if CONFIG_AV1_HIGHBITDEPTH +// The original scan order (av1_default_iscan_16x16) is modified to match +// hadamard AVX2 implementation, i.e., aom_hadamard_16x16_avx2. +// Since hadamard AVX2 implementation will modify the order of coefficients, +// such that the normal scan order is no longer guaranteed to scan low +// coefficients first, therefore we modify the scan order accordingly. Note that +// this one has to be used together with default_scan_fp_16x16_transpose. 
+DECLARE_ALIGNED(16, static const int16_t, + av1_default_iscan_fp_16x16_transpose[256]) = { + 0, 44, 2, 46, 1, 45, 4, 64, 3, 63, 9, 69, 8, 68, 11, + 87, 5, 65, 7, 67, 6, 66, 13, 89, 12, 88, 18, 94, 17, 93, + 24, 116, 14, 90, 16, 92, 15, 91, 26, 118, 25, 117, 31, 123, 30, + 122, 41, 148, 27, 119, 29, 121, 28, 120, 43, 150, 42, 149, 48, 152, + 47, 151, 62, 177, 10, 86, 20, 96, 19, 95, 22, 114, 21, 113, 35, + 127, 34, 126, 37, 144, 23, 115, 33, 125, 32, 124, 39, 146, 38, 145, + 52, 156, 51, 155, 58, 173, 40, 147, 50, 154, 49, 153, 60, 175, 59, + 174, 73, 181, 72, 180, 83, 198, 61, 176, 71, 179, 70, 178, 85, 200, + 84, 199, 98, 202, 97, 201, 112, 219, 36, 143, 54, 158, 53, 157, 56, + 171, 55, 170, 77, 185, 76, 184, 79, 194, 57, 172, 75, 183, 74, 182, + 81, 196, 80, 195, 102, 206, 101, 205, 108, 215, 82, 197, 100, 204, 99, + 203, 110, 217, 109, 216, 131, 223, 130, 222, 140, 232, 111, 218, 129, 221, + 128, 220, 142, 234, 141, 233, 160, 236, 159, 235, 169, 245, 78, 193, 104, + 208, 103, 207, 106, 213, 105, 212, 135, 227, 134, 226, 136, 228, 107, 214, + 133, 225, 132, 224, 138, 230, 137, 229, 164, 240, 163, 239, 165, 241, 139, + 231, 162, 238, 161, 237, 167, 243, 166, 242, 189, 249, 188, 248, 190, 250, + 168, 244, 187, 247, 186, 246, 192, 252, 191, 251, 210, 254, 209, 253, 211, + 255 +}; +#endif + +// For entropy coding, IDTX shares the scan orders of the other 2D-transforms, +// but the fastest way to calculate the IDTX transform (i.e. no transposes) +// results in coefficients that are a transposition of the entropy coding +// versions. These tables are used as substitute for the scan order for the +// faster version of IDTX. + +// Must be used together with av1_fast_idtx_iscan_4x4 +DECLARE_ALIGNED(16, static const int16_t, + av1_fast_idtx_scan_4x4[16]) = { 0, 1, 4, 8, 5, 2, 3, 6, + 9, 12, 13, 10, 7, 11, 14, 15 }; + +// Must be used together with av1_fast_idtx_scan_4x4 +DECLARE_ALIGNED(16, static const int16_t, + av1_fast_idtx_iscan_4x4[16]) = { 0, 1, 5, 6, 2, 4, 7, 12, + 3, 8, 11, 13, 9, 10, 14, 15 }; + +static const SCAN_ORDER av1_fast_idtx_scan_order_4x4 = { + av1_fast_idtx_scan_4x4, av1_fast_idtx_iscan_4x4 +}; + +// Must be used together with av1_fast_idtx_iscan_8x8 +DECLARE_ALIGNED(16, static const int16_t, av1_fast_idtx_scan_8x8[64]) = { + 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, + 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, + 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51, + 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63 +}; + +// Must be used together with av1_fast_idtx_scan_8x8 +DECLARE_ALIGNED(16, static const int16_t, av1_fast_idtx_iscan_8x8[64]) = { + 0, 1, 5, 6, 14, 15, 27, 28, 2, 4, 7, 13, 16, 26, 29, 42, + 3, 8, 12, 17, 25, 30, 41, 43, 9, 11, 18, 24, 31, 40, 44, 53, + 10, 19, 23, 32, 39, 45, 52, 54, 20, 22, 33, 38, 46, 51, 55, 60, + 21, 34, 37, 47, 50, 56, 59, 61, 35, 36, 48, 49, 57, 58, 62, 63 +}; + +static const SCAN_ORDER av1_fast_idtx_scan_order_8x8 = { + av1_fast_idtx_scan_8x8, av1_fast_idtx_iscan_8x8 +}; + +// Must be used together with av1_fast_idtx_iscan_16x16 +DECLARE_ALIGNED(16, static const int16_t, av1_fast_idtx_scan_16x16[256]) = { + 0, 1, 16, 32, 17, 2, 3, 18, 33, 48, 64, 49, 34, 19, 4, + 5, 20, 35, 50, 65, 80, 96, 81, 66, 51, 36, 21, 6, 7, 22, + 37, 52, 67, 82, 97, 112, 128, 113, 98, 83, 68, 53, 38, 23, 8, + 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 160, 145, 130, 115, 100, + 85, 70, 55, 40, 25, 10, 11, 26, 41, 56, 71, 86, 101, 116, 131, + 146, 161, 176, 192, 177, 162, 147, 132, 117, 102, 87, 72, 57, 
42, 27, + 12, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 163, 178, 193, 208, + 224, 209, 194, 179, 164, 149, 134, 119, 104, 89, 74, 59, 44, 29, 14, + 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225, + 240, 241, 226, 211, 196, 181, 166, 151, 136, 121, 106, 91, 76, 61, 46, + 31, 47, 62, 77, 92, 107, 122, 137, 152, 167, 182, 197, 212, 227, 242, + 243, 228, 213, 198, 183, 168, 153, 138, 123, 108, 93, 78, 63, 79, 94, + 109, 124, 139, 154, 169, 184, 199, 214, 229, 244, 245, 230, 215, 200, 185, + 170, 155, 140, 125, 110, 95, 111, 126, 141, 156, 171, 186, 201, 216, 231, + 246, 247, 232, 217, 202, 187, 172, 157, 142, 127, 143, 158, 173, 188, 203, + 218, 233, 248, 249, 234, 219, 204, 189, 174, 159, 175, 190, 205, 220, 235, + 250, 251, 236, 221, 206, 191, 207, 222, 237, 252, 253, 238, 223, 239, 254, + 255 +}; + +// Must be used together with av1_fast_idtx_scan_16x16 +DECLARE_ALIGNED(16, static const int16_t, av1_fast_idtx_iscan_16x16[256]) = { + 0, 1, 5, 6, 14, 15, 27, 28, 44, 45, 65, 66, 90, 91, 119, + 120, 2, 4, 7, 13, 16, 26, 29, 43, 46, 64, 67, 89, 92, 118, + 121, 150, 3, 8, 12, 17, 25, 30, 42, 47, 63, 68, 88, 93, 117, + 122, 149, 151, 9, 11, 18, 24, 31, 41, 48, 62, 69, 87, 94, 116, + 123, 148, 152, 177, 10, 19, 23, 32, 40, 49, 61, 70, 86, 95, 115, + 124, 147, 153, 176, 178, 20, 22, 33, 39, 50, 60, 71, 85, 96, 114, + 125, 146, 154, 175, 179, 200, 21, 34, 38, 51, 59, 72, 84, 97, 113, + 126, 145, 155, 174, 180, 199, 201, 35, 37, 52, 58, 73, 83, 98, 112, + 127, 144, 156, 173, 181, 198, 202, 219, 36, 53, 57, 74, 82, 99, 111, + 128, 143, 157, 172, 182, 197, 203, 218, 220, 54, 56, 75, 81, 100, 110, + 129, 142, 158, 171, 183, 196, 204, 217, 221, 234, 55, 76, 80, 101, 109, + 130, 141, 159, 170, 184, 195, 205, 216, 222, 233, 235, 77, 79, 102, 108, + 131, 140, 160, 169, 185, 194, 206, 215, 223, 232, 236, 245, 78, 103, 107, + 132, 139, 161, 168, 186, 193, 207, 214, 224, 231, 237, 244, 246, 104, 106, + 133, 138, 162, 167, 187, 192, 208, 213, 225, 230, 238, 243, 247, 252, 105, + 134, 137, 163, 166, 188, 191, 209, 212, 226, 229, 239, 242, 248, 251, 253, + 135, 136, 164, 165, 189, 190, 210, 211, 227, 228, 240, 241, 249, 250, 254, + 255 +}; + +// Indicates the blocks for which RD model should be based on special logic +static INLINE int get_model_rd_flag(const AV1_COMP *cpi, const MACROBLOCKD *xd, + BLOCK_SIZE bsize) { + const AV1_COMMON *const cm = &cpi->common; + const int large_block = bsize >= BLOCK_32X32; + // Only enable for low bitdepth to mitigate issue: b/303023614. + return cpi->oxcf.rc_cfg.mode == AOM_CBR && large_block && + !cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) && + cm->quant_params.base_qindex && !cpi->oxcf.use_highbitdepth; +} +/*!\brief Finds predicted motion vectors for a block. + * + * \ingroup nonrd_mode_search + * \callgraph + * \callergraph + * Finds predicted motion vectors for a block from a certain reference frame. + * First, it fills reference MV stack, then picks the test from the stack and + * predicts the final MV for a block for each mode. 
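+ * For NEWMV the stack cannot supply the vector, so frame_mv[NEWMV] is reset
+ * to INVALID_MV here and filled in later by the motion search.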
+ * \param[in] cpi Top-level encoder structure + * \param[in] x Pointer to structure holding all the + * data for the current macroblock + * \param[in] ref_frame Reference frame for which to find + * ref MVs + * \param[out] frame_mv Predicted MVs for a block + * \param[in] yv12_mb Buffer to hold predicted block + * \param[in] bsize Current block size + * \param[in] force_skip_low_temp_var Flag indicating possible mode search + * prune for low temporal variance block + * \param[in] skip_pred_mv Flag indicating to skip av1_mv_pred + * \param[out] use_scaled_ref_frame Flag to indicate if scaled reference + * frame is used. + * + * \remark Nothing is returned. Instead, predicted MVs are placed into + * \c frame_mv array, and use_scaled_ref_frame is set. + */ +static INLINE void find_predictors( + AV1_COMP *cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, + int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES], + struct buf_2d yv12_mb[8][MAX_MB_PLANE], BLOCK_SIZE bsize, + int force_skip_low_temp_var, int skip_pred_mv, bool *use_scaled_ref_frame) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; + const YV12_BUFFER_CONFIG *ref = get_ref_frame_yv12_buf(cm, ref_frame); + const bool ref_is_scaled = + ref->y_crop_height != cm->height || ref->y_crop_width != cm->width; + const YV12_BUFFER_CONFIG *scaled_ref = + av1_get_scaled_ref_frame(cpi, ref_frame); + const YV12_BUFFER_CONFIG *yv12 = + ref_is_scaled && scaled_ref ? scaled_ref : ref; + const int num_planes = av1_num_planes(cm); + x->pred_mv_sad[ref_frame] = INT_MAX; + x->pred_mv0_sad[ref_frame] = INT_MAX; + x->pred_mv1_sad[ref_frame] = INT_MAX; + frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; + // TODO(kyslov) this needs various further optimizations. to be continued.. + assert(yv12 != NULL); + if (yv12 != NULL) { + struct scale_factors *const sf = + scaled_ref ? NULL : get_ref_scale_factors(cm, ref_frame); + av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, sf, sf, num_planes); + av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count, + xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs, + mbmi_ext->mode_context); + // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and + // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs. + av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame); + av1_find_best_ref_mvs_from_stack( + cm->features.allow_high_precision_mv, mbmi_ext, ref_frame, + &frame_mv[NEARESTMV][ref_frame], &frame_mv[NEARMV][ref_frame], 0); + frame_mv[GLOBALMV][ref_frame] = mbmi_ext->global_mvs[ref_frame]; + // Early exit for non-LAST frame if force_skip_low_temp_var is set. 
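+    // (av1_mv_pred is likewise skipped for scaled references, for blocks
+    // smaller than 8x8, and whenever skip_pred_mv is set.)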
+    if (!ref_is_scaled && bsize >= BLOCK_8X8 && !skip_pred_mv &&
+        !(force_skip_low_temp_var && ref_frame != LAST_FRAME)) {
+      av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame,
+                  bsize);
+    }
+  }
+  if (cm->features.switchable_motion_mode) {
+    av1_count_overlappable_neighbors(cm, xd);
+  }
+  mbmi->num_proj_ref = 1;
+  *use_scaled_ref_frame = ref_is_scaled && scaled_ref;
+}
+
+static INLINE void init_mbmi_nonrd(MB_MODE_INFO *mbmi,
+                                   PREDICTION_MODE pred_mode,
+                                   MV_REFERENCE_FRAME ref_frame0,
+                                   MV_REFERENCE_FRAME ref_frame1,
+                                   const AV1_COMMON *cm) {
+  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  mbmi->ref_mv_idx = 0;
+  mbmi->mode = pred_mode;
+  mbmi->uv_mode = UV_DC_PRED;
+  mbmi->ref_frame[0] = ref_frame0;
+  mbmi->ref_frame[1] = ref_frame1;
+  pmi->palette_size[PLANE_TYPE_Y] = 0;
+  pmi->palette_size[PLANE_TYPE_UV] = 0;
+  mbmi->filter_intra_mode_info.use_filter_intra = 0;
+  mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0;
+  mbmi->motion_mode = SIMPLE_TRANSLATION;
+  mbmi->num_proj_ref = 1;
+  mbmi->interintra_mode = 0;
+  set_default_interp_filters(mbmi, cm->features.interp_filter);
+}
+
+static INLINE void init_estimate_block_intra_args(
+    struct estimate_block_intra_args *args, AV1_COMP *cpi, MACROBLOCK *x) {
+  args->cpi = cpi;
+  args->x = x;
+  args->mode = DC_PRED;
+  args->skippable = 1;
+  args->rdc = NULL;
+  args->best_sad = UINT_MAX;
+  args->prune_mode_based_on_sad = false;
+}
+
+static INLINE int get_pred_buffer(PRED_BUFFER *p, int len) {
+  for (int buf_idx = 0; buf_idx < len; buf_idx++) {
+    if (!p[buf_idx].in_use) {
+      p[buf_idx].in_use = 1;
+      return buf_idx;
+    }
+  }
+  return -1;
+}
+
+static INLINE void free_pred_buffer(PRED_BUFFER *p) {
+  if (p != NULL) p->in_use = 0;
+}
+
+#if CONFIG_INTERNAL_STATS
+static INLINE void store_coding_context_nonrd(MACROBLOCK *x,
+                                              PICK_MODE_CONTEXT *ctx,
+                                              int mode_index) {
+#else
+static INLINE void store_coding_context_nonrd(MACROBLOCK *x,
+                                              PICK_MODE_CONTEXT *ctx) {
+#endif  // CONFIG_INTERNAL_STATS
+  MACROBLOCKD *const xd = &x->e_mbd;
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+
+  // Take a snapshot of the coding context so it can be
+  // restored if we decide to encode this way
+  ctx->rd_stats.skip_txfm = txfm_info->skip_txfm;
+  ctx->skippable = txfm_info->skip_txfm;
+#if CONFIG_INTERNAL_STATS
+  ctx->best_mode_index = mode_index;
+#endif  // CONFIG_INTERNAL_STATS
+  ctx->mic = *xd->mi[0];
+  av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, &x->mbmi_ext,
+                                      av1_ref_frame_type(xd->mi[0]->ref_frame));
+}
+
+void av1_block_yrd(MACROBLOCK *x, RD_STATS *this_rdc, int *skippable,
+                   BLOCK_SIZE bsize, TX_SIZE tx_size);
+
+void av1_block_yrd_idtx(MACROBLOCK *x, const uint8_t *const pred_buf,
+                        int pred_stride, RD_STATS *this_rdc, int *skippable,
+                        BLOCK_SIZE bsize, TX_SIZE tx_size);
+
+int64_t av1_model_rd_for_sb_uv(AV1_COMP *cpi, BLOCK_SIZE plane_bsize,
+                               MACROBLOCK *x, MACROBLOCKD *xd,
+                               RD_STATS *this_rdc, int start_plane,
+                               int stop_plane);
+
+void av1_estimate_block_intra(int plane, int block, int row, int col,
+                              BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                              void *arg);
+
+void av1_estimate_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+                             int best_early_term, unsigned int ref_cost_intra,
+                             int reuse_prediction, struct buf_2d *orig_dst,
+                             PRED_BUFFER *tmp_buffers,
+                             PRED_BUFFER **this_mode_pred, RD_STATS *best_rdc,
+                             BEST_PICKMODE *best_pickmode,
+                             PICK_MODE_CONTEXT *ctx);
+
+#endif  // AOM_AV1_ENCODER_NONRD_OPT_H_
diff --git a/third_party/aom/av1/encoder/nonrd_pickmode.c
b/third_party/aom/av1/encoder/nonrd_pickmode.c
new file mode 100644
index 0000000000..f939b6d1fa
--- /dev/null
+++ b/third_party/aom/av1/encoder/nonrd_pickmode.c
@@ -0,0 +1,3537 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/intra_mode_search.h"
+#include "av1/encoder/model_rd.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/nonrd_opt.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/var_based_part.h"
+
+static INLINE int early_term_inter_search_with_sse(int early_term_idx,
+                                                   BLOCK_SIZE bsize,
+                                                   int64_t this_sse,
+                                                   int64_t best_sse,
+                                                   PREDICTION_MODE this_mode) {
+  // Aggressiveness to terminate inter mode search early is adjusted based on
+  // speed and block size.
+  static const double early_term_thresh[4][4] = { { 0.65, 0.65, 0.65, 0.7 },
+                                                  { 0.6, 0.65, 0.85, 0.9 },
+                                                  { 0.5, 0.5, 0.55, 0.6 },
+                                                  { 0.6, 0.75, 0.85, 0.85 } };
+  static const double early_term_thresh_newmv_nearestmv[4] = { 0.3, 0.3, 0.3,
+                                                               0.3 };
+
+  const int size_group = size_group_lookup[bsize];
+  assert(size_group < 4);
+  assert((early_term_idx > 0) && (early_term_idx < EARLY_TERM_INDICES));
+  const double threshold =
+      ((early_term_idx == EARLY_TERM_IDX_4) &&
+       (this_mode == NEWMV || this_mode == NEARESTMV))
+          ? early_term_thresh_newmv_nearestmv[size_group]
+          : early_term_thresh[early_term_idx - 1][size_group];
+
+  // Terminate inter mode search early based on best sse so far.
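+  // (With a threshold of 0.7, for example, the search stops once this_sse
+  // exceeds best_sse / 0.7, i.e. is roughly 1.43x the best SSE so far.)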
+ if ((early_term_idx > 0) && (threshold * this_sse > best_sse)) { + return 1; + } + return 0; +} + +static INLINE void init_best_pickmode(BEST_PICKMODE *bp) { + bp->best_sse = INT64_MAX; + bp->best_mode = NEARESTMV; + bp->best_ref_frame = LAST_FRAME; + bp->best_second_ref_frame = NONE_FRAME; + bp->best_tx_size = TX_8X8; + bp->tx_type = DCT_DCT; + bp->best_pred_filter = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + bp->best_mode_skip_txfm = 0; + bp->best_mode_initial_skip_flag = 0; + bp->best_pred = NULL; + bp->best_motion_mode = SIMPLE_TRANSLATION; + bp->num_proj_ref = 0; + av1_zero(bp->wm_params); + av1_zero(bp->pmi); +} + +// Copy best inter mode parameters to best_pickmode +static INLINE void update_search_state_nonrd( + InterModeSearchStateNonrd *search_state, MB_MODE_INFO *const mi, + TxfmSearchInfo *txfm_info, RD_STATS *nonskip_rdc, PICK_MODE_CONTEXT *ctx, + PREDICTION_MODE this_best_mode, const int64_t sse_y) { + BEST_PICKMODE *const best_pickmode = &search_state->best_pickmode; + + best_pickmode->best_sse = sse_y; + best_pickmode->best_mode = this_best_mode; + best_pickmode->best_motion_mode = mi->motion_mode; + best_pickmode->wm_params = mi->wm_params; + best_pickmode->num_proj_ref = mi->num_proj_ref; + best_pickmode->best_pred_filter = mi->interp_filters; + best_pickmode->best_tx_size = mi->tx_size; + best_pickmode->best_ref_frame = mi->ref_frame[0]; + best_pickmode->best_second_ref_frame = mi->ref_frame[1]; + best_pickmode->best_mode_skip_txfm = search_state->this_rdc.skip_txfm; + best_pickmode->best_mode_initial_skip_flag = + (nonskip_rdc->rate == INT_MAX && search_state->this_rdc.skip_txfm); + if (!best_pickmode->best_mode_skip_txfm) { + memcpy(ctx->blk_skip, txfm_info->blk_skip, + sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk); + } +} + +static INLINE int subpel_select(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, + int_mv *mv, MV ref_mv, FULLPEL_MV start_mv, + bool fullpel_performed_well) { + const int frame_lowmotion = cpi->rc.avg_frame_low_motion; + const int reduce_mv_pel_precision_highmotion = + cpi->sf.rt_sf.reduce_mv_pel_precision_highmotion; + + // Reduce MV precision for higher int MV value & frame-level motion + if (reduce_mv_pel_precision_highmotion >= 3) { + int mv_thresh = 4; + const int is_low_resoln = + (cpi->common.width * cpi->common.height <= 320 * 240); + mv_thresh = (bsize > BLOCK_32X32) ? 2 : (bsize > BLOCK_16X16) ? 4 : 6; + if (frame_lowmotion > 0 && frame_lowmotion < 40) mv_thresh = 12; + mv_thresh = (is_low_resoln) ? mv_thresh >> 1 : mv_thresh; + if (abs(mv->as_fullmv.row) >= mv_thresh || + abs(mv->as_fullmv.col) >= mv_thresh) + return HALF_PEL; + } else if (reduce_mv_pel_precision_highmotion >= 1) { + int mv_thresh; + const int th_vals[2][3] = { { 4, 8, 10 }, { 4, 6, 8 } }; + const int th_idx = reduce_mv_pel_precision_highmotion - 1; + assert(th_idx >= 0 && th_idx < 2); + if (frame_lowmotion > 0 && frame_lowmotion < 40) + mv_thresh = 12; + else + mv_thresh = (bsize >= BLOCK_32X32) ? th_vals[th_idx][0] + : (bsize >= BLOCK_16X16) ? th_vals[th_idx][1] + : th_vals[th_idx][2]; + if (abs(mv->as_fullmv.row) >= (mv_thresh << 1) || + abs(mv->as_fullmv.col) >= (mv_thresh << 1)) + return FULL_PEL; + else if (abs(mv->as_fullmv.row) >= mv_thresh || + abs(mv->as_fullmv.col) >= mv_thresh) + return HALF_PEL; + } + // Reduce MV precision for relatively static (e.g. 
background), low-complex
+  // large areas.
+  if (cpi->sf.rt_sf.reduce_mv_pel_precision_lowcomplex >= 2) {
+    const int qband = x->qindex >> (QINDEX_BITS - 2);
+    assert(qband < 4);
+    if (x->content_state_sb.source_sad_nonrd <= kVeryLowSad &&
+        bsize > BLOCK_16X16 && qband != 0) {
+      if (x->source_variance < 500)
+        return FULL_PEL;
+      else if (x->source_variance < 5000)
+        return HALF_PEL;
+    }
+  } else if (cpi->sf.rt_sf.reduce_mv_pel_precision_lowcomplex >= 1) {
+    if (fullpel_performed_well && ref_mv.row == 0 && ref_mv.col == 0 &&
+        start_mv.row == 0 && start_mv.col == 0)
+      return HALF_PEL;
+  }
+  return cpi->sf.mv_sf.subpel_force_stop;
+}
+
+static bool use_aggressive_subpel_search_method(MACROBLOCK *x,
+                                                bool use_adaptive_subpel_search,
+                                                bool fullpel_performed_well) {
+  if (!use_adaptive_subpel_search) return false;
+  const int qband = x->qindex >> (QINDEX_BITS - 2);
+  assert(qband < 4);
+  if ((qband > 0) && (fullpel_performed_well ||
+                      (x->content_state_sb.source_sad_nonrd <= kLowSad) ||
+                      (x->source_variance < 100)))
+    return true;
+  return false;
+}
+
+/*!\brief Runs Motion Estimation for a specific block and specific ref frame.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Finds the best Motion Vector by running Motion Estimation for a specific
+ * block and a specific reference frame. Exits early if the RD cost of the
+ * Full Pel part exceeds the best RD cost found so far
+ * \param[in]    cpi                      Top-level encoder structure
+ * \param[in]    x                        Pointer to structure holding all the
+ *                                        data for the current macroblock
+ * \param[in]    bsize                    Current block size
+ * \param[in]    tmp_mv                   Pointer to best found New MV
+ * \param[in]    rate_mv                  Pointer to Rate of the best new MV
+ * \param[in]    best_rd_sofar            RD Cost of the best mode found so far
+ * \param[in]    use_base_mv              Flag, indicating that tmp_mv holds
+ *                                        specific MV to start the search with
+ *
+ * \return Returns 0 if ME was terminated after Full Pel Search because the RD
+ * cost was too high; otherwise returns 1. Best New MV is placed into
+ * \c tmp_mv. Rate estimation for this vector is placed to \c rate_mv
+ */
+static int combined_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
+                                  BLOCK_SIZE bsize, int_mv *tmp_mv,
+                                  int *rate_mv, int64_t best_rd_sofar,
+                                  int use_base_mv) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const AV1_COMMON *cm = &cpi->common;
+  const SPEED_FEATURES *sf = &cpi->sf;
+  MB_MODE_INFO *mi = xd->mi[0];
+  int step_param = (sf->rt_sf.fullpel_search_step_param)
+                       ?
sf->rt_sf.fullpel_search_step_param + : cpi->mv_search_params.mv_step_param; + FULLPEL_MV start_mv; + const int ref = mi->ref_frame[0]; + const MV ref_mv = av1_get_ref_mv(x, mi->ref_mv_idx).as_mv; + MV center_mv; + int dis; + int rv = 0; + int cost_list[5]; + int search_subpel = 1; + + start_mv = get_fullmv_from_mv(&ref_mv); + + if (!use_base_mv) + center_mv = ref_mv; + else + center_mv = tmp_mv->as_mv; + + const SEARCH_METHODS search_method = + av1_get_default_mv_search_method(x, &cpi->sf.mv_sf, bsize); + const search_site_config *src_search_sites = + av1_get_search_site_config(cpi, x, search_method); + FULLPEL_MOTION_SEARCH_PARAMS full_ms_params; + FULLPEL_MV_STATS best_mv_stats; + av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, ¢er_mv, + start_mv, src_search_sites, search_method, + /*fine_search_interval=*/0); + + const unsigned int full_var_rd = av1_full_pixel_search( + start_mv, &full_ms_params, step_param, cond_cost_list(cpi, cost_list), + &tmp_mv->as_fullmv, &best_mv_stats, NULL); + + // calculate the bit cost on motion vector + MV mvp_full = get_mv_from_fullmv(&tmp_mv->as_fullmv); + + *rate_mv = av1_mv_bit_cost(&mvp_full, &ref_mv, x->mv_costs->nmv_joint_cost, + x->mv_costs->mv_cost_stack, MV_COST_WEIGHT); + + // TODO(kyslov) Account for Rate Mode! + rv = !(RDCOST(x->rdmult, (*rate_mv), 0) > best_rd_sofar); + + if (rv && search_subpel) { + SUBPEL_MOTION_SEARCH_PARAMS ms_params; + av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv, + cost_list); + const bool fullpel_performed_well = + (bsize == BLOCK_64X64 && full_var_rd * 40 < 62267 * 7) || + (bsize == BLOCK_32X32 && full_var_rd * 8 < 42380) || + (bsize == BLOCK_16X16 && full_var_rd * 8 < 10127); + if (sf->rt_sf.reduce_mv_pel_precision_highmotion || + sf->rt_sf.reduce_mv_pel_precision_lowcomplex) + ms_params.forced_stop = subpel_select(cpi, x, bsize, tmp_mv, ref_mv, + start_mv, fullpel_performed_well); + + MV subpel_start_mv = get_mv_from_fullmv(&tmp_mv->as_fullmv); + assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv)); + // adaptively downgrade subpel search method based on block properties + if (use_aggressive_subpel_search_method( + x, sf->rt_sf.use_adaptive_subpel_search, fullpel_performed_well)) + av1_find_best_sub_pixel_tree_pruned_more( + xd, cm, &ms_params, subpel_start_mv, &best_mv_stats, &tmp_mv->as_mv, + &dis, &x->pred_sse[ref], NULL); + else + cpi->mv_search_params.find_fractional_mv_step( + xd, cm, &ms_params, subpel_start_mv, &best_mv_stats, &tmp_mv->as_mv, + &dis, &x->pred_sse[ref], NULL); + *rate_mv = + av1_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->mv_costs->nmv_joint_cost, + x->mv_costs->mv_cost_stack, MV_COST_WEIGHT); + } + // The final MV can not be equal to the reference MV as this will trigger an + // assert later. This can happen if both NEAREST and NEAR modes were skipped. + rv = (tmp_mv->as_mv.col != ref_mv.col || tmp_mv->as_mv.row != ref_mv.row); + return rv; +} + +/*!\brief Searches for the best New Motion Vector. + * + * \ingroup nonrd_mode_search + * \callgraph + * \callergraph + * Finds the best Motion Vector by doing Motion Estimation. 
Uses reduced + * complexity ME for non-LAST frames or calls \c combined_motion_search + * for LAST reference frame + * \param[in] cpi Top-level encoder structure + * \param[in] x Pointer to structure holding all the + * data for the current macroblock + * \param[in] frame_mv Array that holds MVs for all modes + * and ref frames + * \param[in] ref_frame Reference frame for which to find + * the best New MVs + * \param[in] gf_temporal_ref Flag, indicating temporal reference + * for GOLDEN frame + * \param[in] bsize Current block size + * \param[in] mi_row Row index in 4x4 units + * \param[in] mi_col Column index in 4x4 units + * \param[in] rate_mv Pointer to Rate of the best new MV + * \param[in] best_rdc Pointer to the RD Cost for the best + * mode found so far + * + * \return Returns -1 if the search was not done, otherwise returns 0. + * Best New MV is placed into \c frame_mv array, Rate estimation for this + * vector is placed to \c rate_mv + */ +static int search_new_mv(AV1_COMP *cpi, MACROBLOCK *x, + int_mv frame_mv[][REF_FRAMES], + MV_REFERENCE_FRAME ref_frame, int gf_temporal_ref, + BLOCK_SIZE bsize, int mi_row, int mi_col, int *rate_mv, + RD_STATS *best_rdc) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mi = xd->mi[0]; + AV1_COMMON *cm = &cpi->common; + int_mv *this_ref_frm_newmv = &frame_mv[NEWMV][ref_frame]; + unsigned int y_sad_zero; + if (ref_frame > LAST_FRAME && cpi->oxcf.rc_cfg.mode == AOM_CBR && + gf_temporal_ref) { + int tmp_sad; + int dis; + + if (bsize < BLOCK_16X16) return -1; + + int me_search_size_col = block_size_wide[bsize] >> 1; + int me_search_size_row = block_size_high[bsize] >> 1; + tmp_sad = av1_int_pro_motion_estimation( + cpi, x, bsize, mi_row, mi_col, + &x->mbmi_ext.ref_mv_stack[ref_frame][0].this_mv.as_mv, &y_sad_zero, + me_search_size_col, me_search_size_row); + + if (tmp_sad > x->pred_mv_sad[LAST_FRAME]) return -1; + + this_ref_frm_newmv->as_int = mi->mv[0].as_int; + int_mv best_mv = mi->mv[0]; + best_mv.as_mv.row >>= 3; + best_mv.as_mv.col >>= 3; + MV ref_mv = av1_get_ref_mv(x, 0).as_mv; + this_ref_frm_newmv->as_mv.row >>= 3; + this_ref_frm_newmv->as_mv.col >>= 3; + + SUBPEL_MOTION_SEARCH_PARAMS ms_params; + av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv, NULL); + if (cpi->sf.rt_sf.reduce_mv_pel_precision_highmotion || + cpi->sf.rt_sf.reduce_mv_pel_precision_lowcomplex) { + FULLPEL_MV start_mv = { .row = 0, .col = 0 }; + ms_params.forced_stop = + subpel_select(cpi, x, bsize, &best_mv, ref_mv, start_mv, false); + } + MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv); + assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, start_mv)); + cpi->mv_search_params.find_fractional_mv_step( + xd, cm, &ms_params, start_mv, NULL, &best_mv.as_mv, &dis, + &x->pred_sse[ref_frame], NULL); + this_ref_frm_newmv->as_int = best_mv.as_int; + + // When NEWMV is same as ref_mv from the drl, it is preferred to code the + // MV as NEARESTMV or NEARMV. In this case, NEWMV needs to be skipped to + // avoid an assert failure at a later stage. The scenario can occur if + // NEARESTMV was not evaluated for ALTREF. 
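+    // (Returning -1 below makes the caller drop NEWMV for this reference
+    // frame instead of coding a duplicate of the reference MV.)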
+    if (this_ref_frm_newmv->as_mv.col == ref_mv.col &&
+        this_ref_frm_newmv->as_mv.row == ref_mv.row)
+      return -1;
+
+    *rate_mv = av1_mv_bit_cost(&this_ref_frm_newmv->as_mv, &ref_mv,
+                               x->mv_costs->nmv_joint_cost,
+                               x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+  } else if (!combined_motion_search(cpi, x, bsize, &frame_mv[NEWMV][ref_frame],
+                                     rate_mv, best_rdc->rdcost, 0)) {
+    return -1;
+  }
+
+  return 0;
+}
+
+static void estimate_single_ref_frame_costs(const AV1_COMMON *cm,
+                                            const MACROBLOCKD *xd,
+                                            const ModeCosts *mode_costs,
+                                            int segment_id, BLOCK_SIZE bsize,
+                                            unsigned int *ref_costs_single) {
+  int seg_ref_active =
+      segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME);
+  if (seg_ref_active) {
+    memset(ref_costs_single, 0, REF_FRAMES * sizeof(*ref_costs_single));
+  } else {
+    int intra_inter_ctx = av1_get_intra_inter_context(xd);
+    ref_costs_single[INTRA_FRAME] =
+        mode_costs->intra_inter_cost[intra_inter_ctx][0];
+    unsigned int base_cost = mode_costs->intra_inter_cost[intra_inter_ctx][1];
+    if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT &&
+        is_comp_ref_allowed(bsize)) {
+      const int comp_ref_type_ctx = av1_get_comp_reference_type_context(xd);
+      base_cost += mode_costs->comp_ref_type_cost[comp_ref_type_ctx][1];
+    }
+    ref_costs_single[LAST_FRAME] = base_cost;
+    ref_costs_single[GOLDEN_FRAME] = base_cost;
+    ref_costs_single[ALTREF_FRAME] = base_cost;
+    // Add cost for LAST, GOLDEN and ALTREF.
+    ref_costs_single[LAST_FRAME] += mode_costs->single_ref_cost[0][0][0];
+    ref_costs_single[GOLDEN_FRAME] += mode_costs->single_ref_cost[0][0][1];
+    ref_costs_single[GOLDEN_FRAME] += mode_costs->single_ref_cost[0][1][0];
+    ref_costs_single[ALTREF_FRAME] += mode_costs->single_ref_cost[0][0][1];
+    ref_costs_single[ALTREF_FRAME] += mode_costs->single_ref_cost[0][2][0];
+  }
+}
+
+static INLINE void set_force_skip_flag(const AV1_COMP *const cpi,
+                                       MACROBLOCK *const x, unsigned int sse,
+                                       int *force_skip) {
+  if (x->txfm_search_params.tx_mode_search_type == TX_MODE_SELECT &&
+      cpi->sf.rt_sf.tx_size_level_based_on_qstep &&
+      cpi->sf.rt_sf.tx_size_level_based_on_qstep >= 2) {
+    const int qstep = x->plane[AOM_PLANE_Y].dequant_QTX[1] >> (x->e_mbd.bd - 5);
+    const unsigned int qstep_sq = qstep * qstep;
+    // If the sse is low for low source variance blocks, mark those as
+    // transform skip.
+    // Note: Though qstep_sq is based on ac qstep, the threshold is kept
+    // low so that reliable early estimate of tx skip can be obtained
+    // through its comparison with sse.
+    if (sse < qstep_sq && x->source_variance < qstep_sq &&
+        x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 &&
+        x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 0)
+      *force_skip = 1;
+  }
+}
+
+static TX_SIZE calculate_tx_size(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+                                 MACROBLOCK *const x, unsigned int var,
+                                 unsigned int sse, int *force_skip) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  TX_SIZE tx_size;
+  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+  if (txfm_params->tx_mode_search_type == TX_MODE_SELECT) {
+    int multiplier = 8;
+    unsigned int var_thresh = 0;
+    unsigned int is_high_var = 1;
+    // Use quantizer based thresholds to determine transform size.
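+    // (Illustrative numbers: a qindex in the top quarter gives qband == 3 and
+    // multiplier == 5; a dequant step of 16 gives var_thresh == 2 * 256 == 512.
+    // Both feed the size decision below.)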
+    if (cpi->sf.rt_sf.tx_size_level_based_on_qstep) {
+      const int qband = x->qindex >> (QINDEX_BITS - 2);
+      const int mult[4] = { 8, 7, 6, 5 };
+      assert(qband < 4);
+      multiplier = mult[qband];
+      const int qstep = x->plane[AOM_PLANE_Y].dequant_QTX[1] >> (xd->bd - 5);
+      const unsigned int qstep_sq = qstep * qstep;
+      var_thresh = qstep_sq * 2;
+      if (cpi->sf.rt_sf.tx_size_level_based_on_qstep >= 2) {
+        // If the sse is low for low source variance blocks, mark those as
+        // transform skip.
+        // Note: Though qstep_sq is based on ac qstep, the threshold is kept
+        // low so that reliable early estimate of tx skip can be obtained
+        // through its comparison with sse.
+        if (sse < qstep_sq && x->source_variance < qstep_sq &&
+            x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 &&
+            x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 0)
+          *force_skip = 1;
+        // Further lower transform size based on aq mode only if residual
+        // variance is high.
+        is_high_var = (var >= var_thresh);
+      }
+    }
+    // Choose larger transform size for blocks where the dc component is
+    // dominant or the ac component is low.
+    if (sse > ((var * multiplier) >> 2) || (var < var_thresh))
+      tx_size =
+          AOMMIN(max_txsize_lookup[bsize],
+                 tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]);
+    else
+      tx_size = TX_8X8;
+
+    if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
+        cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) && is_high_var)
+      tx_size = TX_8X8;
+    else if (tx_size > TX_16X16)
+      tx_size = TX_16X16;
+  } else {
+    tx_size =
+        AOMMIN(max_txsize_lookup[bsize],
+               tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]);
+  }
+
+  if (CAP_TX_SIZE_FOR_BSIZE_GT32(txfm_params->tx_mode_search_type, bsize))
+    tx_size = TX_SIZE_FOR_BSIZE_GT32;
+
+  return AOMMIN(tx_size, TX_16X16);
+}
+
+static void block_variance(const uint8_t *src, int src_stride,
+                           const uint8_t *ref, int ref_stride, int w, int h,
+                           unsigned int *sse, int *sum, int block_size,
+                           uint32_t *sse8x8, int *sum8x8, uint32_t *var8x8) {
+  int k = 0;
+  *sse = 0;
+  *sum = 0;
+
+  // This function is called for block sizes >= BLOCK_32x32. As per the design
+  // the aom_get_var_sse_sum_8x8_quad() processes four 8x8 blocks (in an 8x32)
+  // per call. Hence the width and height of the block need to be at least 32
+  // and 8 samples respectively.
+  assert(w >= 32);
+  assert(h >= 8);
+  for (int row = 0; row < h; row += block_size) {
+    for (int col = 0; col < w; col += 32) {
+      aom_get_var_sse_sum_8x8_quad(src + src_stride * row + col, src_stride,
+                                   ref + ref_stride * row + col, ref_stride,
+                                   &sse8x8[k], &sum8x8[k], sse, sum,
+                                   &var8x8[k]);
+      k += 4;
+    }
+  }
+}
+
+static void block_variance_16x16_dual(const uint8_t *src, int src_stride,
+                                      const uint8_t *ref, int ref_stride, int w,
+                                      int h, unsigned int *sse, int *sum,
+                                      int block_size, uint32_t *sse16x16,
+                                      uint32_t *var16x16) {
+  int k = 0;
+  *sse = 0;
+  *sum = 0;
+  // This function is called for block sizes >= BLOCK_32x32. As per the design
+  // the aom_get_var_sse_sum_16x16_dual() processes two 16x16 blocks (in a
+  // 16x32) per call. Hence the width and height of the block need to be at
+  // least 32 and 16 samples respectively.
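+  // Worked example (for illustration): for a 64x64 partition (w = h = 64),
+  // the loop below takes 4 row steps x 2 column steps = 8 dual calls,
+  // filling all 16 of the 16x16 sub-blocks (k advances by 2 per call).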
+ assert(w >= 32); + assert(h >= 16); + for (int row = 0; row < h; row += block_size) { + for (int col = 0; col < w; col += 32) { + aom_get_var_sse_sum_16x16_dual(src + src_stride * row + col, src_stride, + ref + ref_stride * row + col, ref_stride, + &sse16x16[k], sse, sum, &var16x16[k]); + k += 2; + } + } +} + +static void calculate_variance(int bw, int bh, TX_SIZE tx_size, + unsigned int *sse_i, int *sum_i, + unsigned int *var_o, unsigned int *sse_o, + int *sum_o) { + const BLOCK_SIZE unit_size = txsize_to_bsize[tx_size]; + const int nw = 1 << (bw - b_width_log2_lookup[unit_size]); + const int nh = 1 << (bh - b_height_log2_lookup[unit_size]); + int row, col, k = 0; + + for (row = 0; row < nh; row += 2) { + for (col = 0; col < nw; col += 2) { + sse_o[k] = sse_i[row * nw + col] + sse_i[row * nw + col + 1] + + sse_i[(row + 1) * nw + col] + sse_i[(row + 1) * nw + col + 1]; + sum_o[k] = sum_i[row * nw + col] + sum_i[row * nw + col + 1] + + sum_i[(row + 1) * nw + col] + sum_i[(row + 1) * nw + col + 1]; + var_o[k] = sse_o[k] - (uint32_t)(((int64_t)sum_o[k] * sum_o[k]) >> + (b_width_log2_lookup[unit_size] + + b_height_log2_lookup[unit_size] + 6)); + k++; + } + } +} + +// Adjust the ac_thr according to speed, width, height and normalized sum +static int ac_thr_factor(int speed, int width, int height, int norm_sum) { + if (speed >= 8 && norm_sum < 5) { + if (width <= 640 && height <= 480) + return 4; + else + return 2; + } + return 1; +} + +// Sets early_term flag based on chroma planes prediction +static INLINE void set_early_term_based_on_uv_plane( + AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, MACROBLOCKD *xd, int mi_row, + int mi_col, int *early_term, int num_blk, const unsigned int *sse_tx, + const unsigned int *var_tx, int sum, unsigned int var, unsigned int sse) { + AV1_COMMON *const cm = &cpi->common; + struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y]; + const uint32_t dc_quant = p->dequant_QTX[0]; + const uint32_t ac_quant = p->dequant_QTX[1]; + int64_t dc_thr = dc_quant * dc_quant >> 6; + int64_t ac_thr = ac_quant * ac_quant >> 6; + const int bw = b_width_log2_lookup[bsize]; + const int bh = b_height_log2_lookup[bsize]; + int ac_test = 1; + int dc_test = 1; + const int norm_sum = abs(sum) >> (bw + bh); + +#if CONFIG_AV1_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && + cpi->oxcf.speed > 5) + ac_thr = av1_scale_acskip_thresh(ac_thr, cpi->denoiser.denoising_level, + norm_sum, cpi->svc.temporal_layer_id); + else + ac_thr *= ac_thr_factor(cpi->oxcf.speed, cm->width, cm->height, norm_sum); +#else + ac_thr *= ac_thr_factor(cpi->oxcf.speed, cm->width, cm->height, norm_sum); + +#endif + + if (cpi->sf.rt_sf.increase_source_sad_thresh) { + dc_thr = dc_thr << 1; + ac_thr = ac_thr << 2; + } + + for (int k = 0; k < num_blk; k++) { + // Check if all ac coefficients can be quantized to zero. + if (!(var_tx[k] < ac_thr || var == 0)) { + ac_test = 0; + break; + } + // Check if dc coefficient can be quantized to zero. + if (!(sse_tx[k] - var_tx[k] < dc_thr || sse == var)) { + dc_test = 0; + break; + } + } + + // Check if chroma can be skipped based on ac and dc test flags. + if (ac_test && dc_test) { + int skip_uv[2] = { 0 }; + unsigned int var_uv[2]; + unsigned int sse_uv[2]; + // Transform skipping test in UV planes. 
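+    // Example with hypothetical values: for dequant_QTX[0] = 512 and the
+    // default shift_dc = 3, uv_dc_thr = (512 * 512) >> 3 = 32768; a U/V
+    // plane is then treated as skippable only if both its ac (variance) and
+    // dc (sse - var) residuals fall under these dequant-derived thresholds.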
+    for (int plane = AOM_PLANE_U; plane <= AOM_PLANE_V; plane++) {
+      int j = plane - 1;
+      skip_uv[j] = 1;
+      if (x->color_sensitivity[COLOR_SENS_IDX(plane)]) {
+        skip_uv[j] = 0;
+        struct macroblock_plane *const puv = &x->plane[plane];
+        struct macroblockd_plane *const puvd = &xd->plane[plane];
+        const BLOCK_SIZE uv_bsize = get_plane_block_size(
+            bsize, puvd->subsampling_x, puvd->subsampling_y);
+        // Adjust these thresholds for UV.
+        const int shift_ac = cpi->sf.rt_sf.increase_source_sad_thresh ? 5 : 3;
+        const int shift_dc = cpi->sf.rt_sf.increase_source_sad_thresh ? 4 : 3;
+        const int64_t uv_dc_thr =
+            (puv->dequant_QTX[0] * puv->dequant_QTX[0]) >> shift_dc;
+        const int64_t uv_ac_thr =
+            (puv->dequant_QTX[1] * puv->dequant_QTX[1]) >> shift_ac;
+        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+                                      plane, plane);
+        var_uv[j] = cpi->ppi->fn_ptr[uv_bsize].vf(puv->src.buf, puv->src.stride,
+                                                  puvd->dst.buf,
+                                                  puvd->dst.stride, &sse_uv[j]);
+        if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) &&
+            (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j]))
+          skip_uv[j] = 1;
+        else
+          break;
+      }
+    }
+    if (skip_uv[0] & skip_uv[1]) {
+      *early_term = 1;
+    }
+  }
+}
+
+static INLINE void calc_rate_dist_block_param(AV1_COMP *cpi, MACROBLOCK *x,
+                                              RD_STATS *rd_stats,
+                                              int calculate_rd, int *early_term,
+                                              BLOCK_SIZE bsize,
+                                              unsigned int sse) {
+  if (calculate_rd) {
+    if (!*early_term) {
+      const int bw = block_size_wide[bsize];
+      const int bh = block_size_high[bsize];
+
+      model_rd_with_curvfit(cpi, x, bsize, AOM_PLANE_Y, rd_stats->sse, bw * bh,
+                            &rd_stats->rate, &rd_stats->dist);
+    }
+
+    if (*early_term) {
+      rd_stats->rate = 0;
+      rd_stats->dist = sse << 4;
+    }
+  }
+}
+
+static void model_skip_for_sb_y_large_64(AV1_COMP *cpi, BLOCK_SIZE bsize,
+                                         int mi_row, int mi_col, MACROBLOCK *x,
+                                         MACROBLOCKD *xd, RD_STATS *rd_stats,
+                                         int *early_term, int calculate_rd,
+                                         int64_t best_sse,
+                                         unsigned int *var_output,
+                                         unsigned int var_prune_threshold) {
+  // Note our transform coeffs are 8 times an orthogonal transform.
+  // Hence quantizer step is also 8 times. To get effective quantizer
+  // we need to divide by 8 before sending to modeling function.
+  unsigned int sse;
+  struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
+  struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+  int test_skip = 1;
+  unsigned int var;
+  int sum;
+  const int bw = b_width_log2_lookup[bsize];
+  const int bh = b_height_log2_lookup[bsize];
+  unsigned int sse16x16[64] = { 0 };
+  unsigned int var16x16[64] = { 0 };
+  assert(xd->mi[0]->tx_size == TX_16X16);
+  assert(bsize > BLOCK_32X32);
+
+  // Calculate variance for whole partition, and also save 16x16 blocks'
+  // variance to be used in following transform skipping test.
+  block_variance_16x16_dual(p->src.buf, p->src.stride, pd->dst.buf,
+                            pd->dst.stride, 4 << bw, 4 << bh, &sse, &sum, 16,
+                            sse16x16, var16x16);
+
+  var = sse - (unsigned int)(((int64_t)sum * sum) >> (bw + bh + 4));
+  if (var_output) {
+    *var_output = var;
+    if (*var_output > var_prune_threshold) {
+      return;
+    }
+  }
+
+  rd_stats->sse = sse;
+  // Skipping test
+  *early_term = 0;
+  set_force_skip_flag(cpi, x, sse, early_term);
+  // The transform size on this path is fixed to 16x16 (asserted above), so
+  // the skip-flag logic below, which assumes a transform of at least 8x8,
+  // needs no further capping here.
+ MB_MODE_INFO *const mi = xd->mi[0]; + if (!calculate_rd && cpi->sf.rt_sf.sse_early_term_inter_search && + early_term_inter_search_with_sse( + cpi->sf.rt_sf.sse_early_term_inter_search, bsize, sse, best_sse, + mi->mode)) + test_skip = 0; + + if (*early_term) test_skip = 0; + + // Evaluate if the partition block is a skippable block in Y plane. + if (test_skip) { + const unsigned int *sse_tx = sse16x16; + const unsigned int *var_tx = var16x16; + const unsigned int num_block = (1 << (bw + bh - 2)) >> 2; + set_early_term_based_on_uv_plane(cpi, x, bsize, xd, mi_row, mi_col, + early_term, num_block, sse_tx, var_tx, sum, + var, sse); + } + calc_rate_dist_block_param(cpi, x, rd_stats, calculate_rd, early_term, bsize, + sse); +} + +static void model_skip_for_sb_y_large(AV1_COMP *cpi, BLOCK_SIZE bsize, + int mi_row, int mi_col, MACROBLOCK *x, + MACROBLOCKD *xd, RD_STATS *rd_stats, + int *early_term, int calculate_rd, + int64_t best_sse, + unsigned int *var_output, + unsigned int var_prune_threshold) { + if (x->force_zeromv_skip_for_blk) { + *early_term = 1; + rd_stats->rate = 0; + rd_stats->dist = 0; + rd_stats->sse = 0; + return; + } + + // For block sizes greater than 32x32, the transform size is always 16x16. + // This function avoids calling calculate_variance() for tx_size 16x16 cases + // by directly populating variance at tx_size level from + // block_variance_16x16_dual() function. + const TxfmSearchParams *txfm_params = &x->txfm_search_params; + if (CAP_TX_SIZE_FOR_BSIZE_GT32(txfm_params->tx_mode_search_type, bsize)) { + xd->mi[0]->tx_size = TX_SIZE_FOR_BSIZE_GT32; + model_skip_for_sb_y_large_64(cpi, bsize, mi_row, mi_col, x, xd, rd_stats, + early_term, calculate_rd, best_sse, var_output, + var_prune_threshold); + return; + } + + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. + unsigned int sse; + struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y]; + struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y]; + int test_skip = 1; + unsigned int var; + int sum; + + const int bw = b_width_log2_lookup[bsize]; + const int bh = b_height_log2_lookup[bsize]; + unsigned int sse8x8[256] = { 0 }; + int sum8x8[256] = { 0 }; + unsigned int var8x8[256] = { 0 }; + TX_SIZE tx_size; + + // Calculate variance for whole partition, and also save 8x8 blocks' variance + // to be used in following transform skipping test. + block_variance(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, + 4 << bw, 4 << bh, &sse, &sum, 8, sse8x8, sum8x8, var8x8); + var = sse - (unsigned int)(((int64_t)sum * sum) >> (bw + bh + 4)); + if (var_output) { + *var_output = var; + if (*var_output > var_prune_threshold) { + return; + } + } + + rd_stats->sse = sse; + // Skipping test + *early_term = 0; + tx_size = calculate_tx_size(cpi, bsize, x, var, sse, early_term); + assert(tx_size <= TX_16X16); + // The code below for setting skip flag assumes transform size of at least + // 8x8, so force this lower limit on transform. + if (tx_size < TX_8X8) tx_size = TX_8X8; + xd->mi[0]->tx_size = tx_size; + + MB_MODE_INFO *const mi = xd->mi[0]; + if (!calculate_rd && cpi->sf.rt_sf.sse_early_term_inter_search && + early_term_inter_search_with_sse( + cpi->sf.rt_sf.sse_early_term_inter_search, bsize, sse, best_sse, + mi->mode)) + test_skip = 0; + + if (*early_term) test_skip = 0; + + // Evaluate if the partition block is a skippable block in Y plane. 
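+  // Worked example (for illustration): for BLOCK_64X64, bw = bh = 4, so
+  // num_blks = 1 << (4 + 4 - 2) = 64 8x8 blocks; when tx_size is 16x16 the
+  // 8x8 statistics are merged 4-into-1 below, leaving 64 >> 2 = 16 blocks
+  // for the skip test.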
+ if (test_skip) { + unsigned int sse16x16[64] = { 0 }; + int sum16x16[64] = { 0 }; + unsigned int var16x16[64] = { 0 }; + const unsigned int *sse_tx = sse8x8; + const unsigned int *var_tx = var8x8; + unsigned int num_blks = 1 << (bw + bh - 2); + + if (tx_size >= TX_16X16) { + calculate_variance(bw, bh, TX_8X8, sse8x8, sum8x8, var16x16, sse16x16, + sum16x16); + sse_tx = sse16x16; + var_tx = var16x16; + num_blks = num_blks >> 2; + } + set_early_term_based_on_uv_plane(cpi, x, bsize, xd, mi_row, mi_col, + early_term, num_blks, sse_tx, var_tx, sum, + var, sse); + } + calc_rate_dist_block_param(cpi, x, rd_stats, calculate_rd, early_term, bsize, + sse); +} + +static void model_rd_for_sb_y(const AV1_COMP *const cpi, BLOCK_SIZE bsize, + MACROBLOCK *x, MACROBLOCKD *xd, + RD_STATS *rd_stats, unsigned int *var_out, + int calculate_rd, int *early_term) { + if (x->force_zeromv_skip_for_blk && early_term != NULL) { + *early_term = 1; + rd_stats->rate = 0; + rd_stats->dist = 0; + rd_stats->sse = 0; + } + + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. + const int ref = xd->mi[0]->ref_frame[0]; + + assert(bsize < BLOCK_SIZES_ALL); + + struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y]; + struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y]; + unsigned int sse; + int rate; + int64_t dist; + + unsigned int var = cpi->ppi->fn_ptr[bsize].vf( + p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse); + int force_skip = 0; + xd->mi[0]->tx_size = calculate_tx_size(cpi, bsize, x, var, sse, &force_skip); + if (var_out) { + *var_out = var; + } + + if (calculate_rd && (!force_skip || ref == INTRA_FRAME)) { + const int bwide = block_size_wide[bsize]; + const int bhigh = block_size_high[bsize]; + model_rd_with_curvfit(cpi, x, bsize, AOM_PLANE_Y, sse, bwide * bhigh, &rate, + &dist); + } else { + rate = INT_MAX; // this will be overwritten later with av1_block_yrd + dist = INT_MAX; + } + rd_stats->sse = sse; + x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX); + + if (force_skip && ref > INTRA_FRAME) { + rate = 0; + dist = (int64_t)sse << 4; + } + + assert(rate >= 0); + + rd_stats->skip_txfm = (rate == 0); + rate = AOMMIN(rate, INT_MAX); + rd_stats->rate = rate; + rd_stats->dist = dist; +} + +static INLINE int get_drl_cost(PREDICTION_MODE this_mode, int ref_mv_idx, + const MB_MODE_INFO_EXT *mbmi_ext, + const int (*const drl_mode_cost0)[2], + int8_t ref_frame_type) { + int cost = 0; + if (this_mode == NEWMV || this_mode == NEW_NEWMV) { + for (int idx = 0; idx < 2; ++idx) { + if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { + uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx); + cost += drl_mode_cost0[drl_ctx][ref_mv_idx != idx]; + if (ref_mv_idx == idx) return cost; + } + } + return cost; + } + + if (have_nearmv_in_inter_mode(this_mode)) { + for (int idx = 1; idx < 3; ++idx) { + if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { + uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx); + cost += drl_mode_cost0[drl_ctx][ref_mv_idx != (idx - 1)]; + if (ref_mv_idx == (idx - 1)) return cost; + } + } + return cost; + } + return cost; +} + +static int cost_mv_ref(const ModeCosts *const mode_costs, PREDICTION_MODE mode, + int16_t mode_context) { + if (is_inter_compound_mode(mode)) { + return mode_costs + ->inter_compound_mode_cost[mode_context][INTER_COMPOUND_OFFSET(mode)]; + } + + int mode_cost = 0; + 
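+  // The single-reference inter modes are signaled as a small decision tree:
+  // newmv_mode_cost separates NEWMV from the rest, zeromv_mode_cost then
+  // separates GLOBALMV, and refmv_mode_cost finally distinguishes NEARESTMV
+  // from NEARMV, each stage using its own context field unpacked from
+  // mode_context below.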
int16_t mode_ctx = mode_context & NEWMV_CTX_MASK; + + assert(is_inter_mode(mode)); + + if (mode == NEWMV) { + mode_cost = mode_costs->newmv_mode_cost[mode_ctx][0]; + return mode_cost; + } else { + mode_cost = mode_costs->newmv_mode_cost[mode_ctx][1]; + mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; + + if (mode == GLOBALMV) { + mode_cost += mode_costs->zeromv_mode_cost[mode_ctx][0]; + return mode_cost; + } else { + mode_cost += mode_costs->zeromv_mode_cost[mode_ctx][1]; + mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK; + mode_cost += mode_costs->refmv_mode_cost[mode_ctx][mode != NEARESTMV]; + return mode_cost; + } + } +} + +static void newmv_diff_bias(MACROBLOCKD *xd, PREDICTION_MODE this_mode, + RD_STATS *this_rdc, BLOCK_SIZE bsize, int mv_row, + int mv_col, int speed, uint32_t spatial_variance, + CONTENT_STATE_SB content_state_sb) { + // Bias against MVs associated with NEWMV mode that are very different from + // top/left neighbors. + if (this_mode == NEWMV) { + int al_mv_average_row; + int al_mv_average_col; + int row_diff, col_diff; + int above_mv_valid = 0; + int left_mv_valid = 0; + int above_row = INVALID_MV_ROW_COL, above_col = INVALID_MV_ROW_COL; + int left_row = INVALID_MV_ROW_COL, left_col = INVALID_MV_ROW_COL; + if (bsize >= BLOCK_64X64 && content_state_sb.source_sad_nonrd != kHighSad && + spatial_variance < 300 && + (mv_row > 16 || mv_row < -16 || mv_col > 16 || mv_col < -16)) { + this_rdc->rdcost = this_rdc->rdcost << 2; + return; + } + if (xd->above_mbmi) { + above_mv_valid = xd->above_mbmi->mv[0].as_int != INVALID_MV; + above_row = xd->above_mbmi->mv[0].as_mv.row; + above_col = xd->above_mbmi->mv[0].as_mv.col; + } + if (xd->left_mbmi) { + left_mv_valid = xd->left_mbmi->mv[0].as_int != INVALID_MV; + left_row = xd->left_mbmi->mv[0].as_mv.row; + left_col = xd->left_mbmi->mv[0].as_mv.col; + } + if (above_mv_valid && left_mv_valid) { + al_mv_average_row = (above_row + left_row + 1) >> 1; + al_mv_average_col = (above_col + left_col + 1) >> 1; + } else if (above_mv_valid) { + al_mv_average_row = above_row; + al_mv_average_col = above_col; + } else if (left_mv_valid) { + al_mv_average_row = left_row; + al_mv_average_col = left_col; + } else { + al_mv_average_row = al_mv_average_col = 0; + } + row_diff = al_mv_average_row - mv_row; + col_diff = al_mv_average_col - mv_col; + if (row_diff > 80 || row_diff < -80 || col_diff > 80 || col_diff < -80) { + if (bsize >= BLOCK_32X32) + this_rdc->rdcost = this_rdc->rdcost << 1; + else + this_rdc->rdcost = 5 * this_rdc->rdcost >> 2; + } + } else { + // Bias for speed >= 8 for low spatial variance. 
+ if (speed >= 8 && spatial_variance < 150 && + (mv_row > 64 || mv_row < -64 || mv_col > 64 || mv_col < -64)) + this_rdc->rdcost = 5 * this_rdc->rdcost >> 2; + } +} + +static INLINE void update_thresh_freq_fact(AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, + MV_REFERENCE_FRAME ref_frame, + THR_MODES best_mode_idx, + PREDICTION_MODE mode) { + const THR_MODES thr_mode_idx = mode_idx[ref_frame][mode_offset(mode)]; + const BLOCK_SIZE min_size = AOMMAX(bsize - 3, BLOCK_4X4); + const BLOCK_SIZE max_size = AOMMIN(bsize + 6, BLOCK_128X128); + for (BLOCK_SIZE bs = min_size; bs <= max_size; bs += 3) { + int *freq_fact = &x->thresh_freq_fact[bs][thr_mode_idx]; + if (thr_mode_idx == best_mode_idx) { + *freq_fact -= (*freq_fact >> 4); + } else { + *freq_fact = + AOMMIN(*freq_fact + RD_THRESH_INC, + cpi->sf.inter_sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT); + } + } +} + +#if CONFIG_AV1_TEMPORAL_DENOISING +static void av1_pickmode_ctx_den_update( + AV1_PICKMODE_CTX_DEN *ctx_den, int64_t zero_last_cost_orig, + unsigned int ref_frame_cost[REF_FRAMES], + int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES], int reuse_inter_pred, + BEST_PICKMODE *bp) { + ctx_den->zero_last_cost_orig = zero_last_cost_orig; + ctx_den->ref_frame_cost = ref_frame_cost; + ctx_den->frame_mv = frame_mv; + ctx_den->reuse_inter_pred = reuse_inter_pred; + ctx_den->best_tx_size = bp->best_tx_size; + ctx_den->best_mode = bp->best_mode; + ctx_den->best_ref_frame = bp->best_ref_frame; + ctx_den->best_pred_filter = bp->best_pred_filter; + ctx_den->best_mode_skip_txfm = bp->best_mode_skip_txfm; +} + +static void recheck_zeromv_after_denoising( + AV1_COMP *cpi, MB_MODE_INFO *const mi, MACROBLOCK *x, MACROBLOCKD *const xd, + AV1_DENOISER_DECISION decision, AV1_PICKMODE_CTX_DEN *ctx_den, + struct buf_2d yv12_mb[4][MAX_MB_PLANE], RD_STATS *best_rdc, + BEST_PICKMODE *best_pickmode, BLOCK_SIZE bsize, int mi_row, int mi_col) { + // If INTRA or GOLDEN reference was selected, re-evaluate ZEROMV on + // denoised result. Only do this under noise conditions, and if rdcost of + // ZEROMV on original source is not significantly higher than rdcost of best + // mode. + if (cpi->noise_estimate.enabled && cpi->noise_estimate.level > kLow && + ctx_den->zero_last_cost_orig < (best_rdc->rdcost << 3) && + ((ctx_den->best_ref_frame == INTRA_FRAME && decision >= FILTER_BLOCK) || + (ctx_den->best_ref_frame == GOLDEN_FRAME && + cpi->svc.number_spatial_layers == 1 && + decision == FILTER_ZEROMV_BLOCK))) { + // Check if we should pick ZEROMV on denoised signal. 
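+    // Sketch of the recheck: temporarily code the block as GLOBALMV/LAST with
+    // a zero MV on the denoised predictor, re-run the curvfit RD model, and
+    // keep ZEROMV only if its RD cost beats the best mode found on the
+    // original (noisy) source; otherwise every field is restored below.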
+  AV1_COMMON *const cm = &cpi->common;
+  RD_STATS this_rdc;
+  const ModeCosts *mode_costs = &x->mode_costs;
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+  MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+
+  mi->mode = GLOBALMV;
+  mi->ref_frame[0] = LAST_FRAME;
+  mi->ref_frame[1] = NONE_FRAME;
+  set_ref_ptrs(cm, xd, mi->ref_frame[0], NONE_FRAME);
+  mi->mv[0].as_int = 0;
+  mi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+  xd->plane[AOM_PLANE_Y].pre[0] = yv12_mb[LAST_FRAME][AOM_PLANE_Y];
+  av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
+  unsigned int var;
+  model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc, &var, 1, NULL);
+
+  const int16_t mode_ctx =
+      av1_mode_context_analyzer(mbmi_ext->mode_context, mi->ref_frame);
+  this_rdc.rate += cost_mv_ref(mode_costs, GLOBALMV, mode_ctx);
+
+  this_rdc.rate += ctx_den->ref_frame_cost[LAST_FRAME];
+  this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
+  txfm_info->skip_txfm = this_rdc.skip_txfm;
+  // Don't switch to ZEROMV if the rdcost for ZEROMV on denoised source
+  // is higher than best_ref mode (on original source).
+  if (this_rdc.rdcost > best_rdc->rdcost) {
+    this_rdc = *best_rdc;
+    mi->mode = best_pickmode->best_mode;
+    mi->ref_frame[0] = best_pickmode->best_ref_frame;
+    set_ref_ptrs(cm, xd, mi->ref_frame[0], NONE_FRAME);
+    mi->interp_filters = best_pickmode->best_pred_filter;
+    if (best_pickmode->best_ref_frame == INTRA_FRAME) {
+      mi->mv[0].as_int = INVALID_MV;
+    } else {
+      mi->mv[0].as_int = ctx_den
+                             ->frame_mv[best_pickmode->best_mode]
+                                       [best_pickmode->best_ref_frame]
+                             .as_int;
+      if (ctx_den->reuse_inter_pred) {
+        xd->plane[AOM_PLANE_Y].pre[0] = yv12_mb[GOLDEN_FRAME][AOM_PLANE_Y];
+        av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
+      }
+    }
+    mi->tx_size = best_pickmode->best_tx_size;
+    txfm_info->skip_txfm = best_pickmode->best_mode_skip_txfm;
+  } else {
+    ctx_den->best_ref_frame = LAST_FRAME;
+    *best_rdc = this_rdc;
+  }
+}
+#endif  // CONFIG_AV1_TEMPORAL_DENOISING
+
+/*!\brief Searches for the best interpolation filter
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Iterates through subset of possible interpolation filters (EIGHTTAP_REGULAR,
+ * EIGHTTAP_SMOOTH, MULTITAP_SHARP, depending on FILTER_SEARCH_SIZE) and
+ * selects the one that gives lowest RD cost. RD cost is calculated using
+ * curvfit model. Support for dual filters (different filters in the x & y
+ * directions) is allowed if sf.interp_sf.disable_dual_filter = 0.
+ *
+ * \param[in]    cpi                    Top-level encoder structure
+ * \param[in]    x                      Pointer to structure holding all the
+ *                                      data for the current macroblock
+ * \param[in]    this_rdc               Pointer to calculated RD Cost
+ * \param[in]    inter_pred_params_sr   Pointer to structure holding parameters
+ *                                      of inter prediction for single
+ *                                      reference
+ * \param[in]    mi_row                 Row index in 4x4 units
+ * \param[in]    mi_col                 Column index in 4x4 units
+ * \param[in]    tmp_buffer             Pointer to a temporary buffer for
+ *                                      prediction re-use
+ * \param[in]    bsize                  Current block size
+ * \param[in]    reuse_inter_pred       Flag, indicating prediction re-use
+ * \param[out]   this_mode_pred         Pointer to store prediction buffer
+ *                                      for prediction re-use
+ * \param[out]   this_early_term        Flag, indicating that transform can be
+ *                                      skipped
+ * \param[out]   var                    The residue variance of the current
+ *                                      predictor.
+ * \param[in]    use_model_yrd_large    Flag, indicating special logic to
+ *                                      handle large blocks
+ * \param[in]    best_sse               Best sse so far.
+ * \param[in]    is_single_pred         Flag, indicating single mode.
+ *
+ * \remark Nothing is returned. Instead, calculated RD cost is placed to
+ * \c this_rdc and best filter is placed to \c mi->interp_filters. In case
+ * \c reuse_inter_pred flag is set, this function also outputs
+ * \c this_mode_pred. Also \c this_early_term is set if transform can be
+ * skipped.
+ */
+static void search_filter_ref(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *this_rdc,
+                              InterPredParams *inter_pred_params_sr, int mi_row,
+                              int mi_col, PRED_BUFFER *tmp_buffer,
+                              BLOCK_SIZE bsize, int reuse_inter_pred,
+                              PRED_BUFFER **this_mode_pred,
+                              int *this_early_term, unsigned int *var,
+                              int use_model_yrd_large, int64_t best_sse,
+                              int is_single_pred) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+  MB_MODE_INFO *const mi = xd->mi[0];
+  const int bw = block_size_wide[bsize];
+  int dim_factor =
+      (cpi->sf.interp_sf.disable_dual_filter == 0) ? FILTER_SEARCH_SIZE : 1;
+  RD_STATS pf_rd_stats[FILTER_SEARCH_SIZE * FILTER_SEARCH_SIZE] = { 0 };
+  TX_SIZE pf_tx_size[FILTER_SEARCH_SIZE * FILTER_SEARCH_SIZE] = { 0 };
+  PRED_BUFFER *current_pred = *this_mode_pred;
+  int best_skip = 0;
+  int best_early_term = 0;
+  int64_t best_cost = INT64_MAX;
+  int best_filter_index = -1;
+
+  SubpelParams subpel_params;
+  // Initialize inter prediction params at mode level for single reference
+  // mode.
+  if (is_single_pred)
+    init_inter_mode_params(&mi->mv[0].as_mv, inter_pred_params_sr,
+                           &subpel_params, xd->block_ref_scale_factors[0],
+                           pd->pre->width, pd->pre->height);
+  for (int filter_idx = 0; filter_idx < FILTER_SEARCH_SIZE * FILTER_SEARCH_SIZE;
+       ++filter_idx) {
+    int64_t cost;
+    if (cpi->sf.interp_sf.disable_dual_filter &&
+        filters_ref_set[filter_idx].as_filters.x_filter !=
+            filters_ref_set[filter_idx].as_filters.y_filter)
+      continue;
+
+    mi->interp_filters.as_int = filters_ref_set[filter_idx].as_int;
+    if (is_single_pred)
+      av1_enc_build_inter_predictor_y_nonrd(xd, inter_pred_params_sr,
+                                            &subpel_params);
+    else
+      av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+                                    AOM_PLANE_Y, AOM_PLANE_Y);
+    unsigned int curr_var = UINT_MAX;
+    if (use_model_yrd_large)
+      model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd,
+                                &pf_rd_stats[filter_idx], this_early_term, 1,
+                                best_sse, &curr_var, UINT_MAX);
+    else
+      model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[filter_idx], &curr_var,
+                        1, NULL);
+    pf_rd_stats[filter_idx].rate += av1_get_switchable_rate(
+        x, xd, cm->features.interp_filter, cm->seq_params->enable_dual_filter);
+    cost = RDCOST(x->rdmult, pf_rd_stats[filter_idx].rate,
+                  pf_rd_stats[filter_idx].dist);
+    pf_tx_size[filter_idx] = mi->tx_size;
+    if (cost < best_cost) {
+      *var = curr_var;
+      best_filter_index = filter_idx;
+      best_cost = cost;
+      best_skip = pf_rd_stats[filter_idx].skip_txfm;
+      best_early_term = *this_early_term;
+      if (reuse_inter_pred) {
+        if (*this_mode_pred != current_pred) {
+          free_pred_buffer(*this_mode_pred);
+          *this_mode_pred = current_pred;
+        }
+        current_pred = &tmp_buffer[get_pred_buffer(tmp_buffer, 3)];
+        pd->dst.buf = current_pred->data;
+        pd->dst.stride = bw;
+      }
+    }
+  }
+  assert(best_filter_index >= 0 &&
+         best_filter_index < dim_factor * FILTER_SEARCH_SIZE);
+  if (reuse_inter_pred && *this_mode_pred != current_pred)
+    free_pred_buffer(current_pred);
+
+  mi->interp_filters.as_int = filters_ref_set[best_filter_index].as_int;
+  mi->tx_size = pf_tx_size[best_filter_index];
+  this_rdc->rate = pf_rd_stats[best_filter_index].rate;
+  this_rdc->dist =
pf_rd_stats[best_filter_index].dist; + this_rdc->sse = pf_rd_stats[best_filter_index].sse; + this_rdc->skip_txfm = (best_skip || best_early_term); + *this_early_term = best_early_term; + if (reuse_inter_pred) { + pd->dst.buf = (*this_mode_pred)->data; + pd->dst.stride = (*this_mode_pred)->stride; + } else if (best_filter_index < dim_factor * FILTER_SEARCH_SIZE - 1) { + if (is_single_pred) + av1_enc_build_inter_predictor_y_nonrd(xd, inter_pred_params_sr, + &subpel_params); + else + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + } +} +#if !CONFIG_REALTIME_ONLY + +static AOM_INLINE int is_warped_mode_allowed(const AV1_COMP *cpi, + MACROBLOCK *const x, + const MB_MODE_INFO *mbmi) { + const FeatureFlags *const features = &cpi->common.features; + const MACROBLOCKD *xd = &x->e_mbd; + + if (cpi->sf.inter_sf.extra_prune_warped) return 0; + if (has_second_ref(mbmi)) return 0; + MOTION_MODE last_motion_mode_allowed = SIMPLE_TRANSLATION; + + if (features->switchable_motion_mode) { + // Determine which motion modes to search if more than SIMPLE_TRANSLATION + // is allowed. + last_motion_mode_allowed = motion_mode_allowed( + xd->global_motion, xd, mbmi, features->allow_warped_motion); + } + + if (last_motion_mode_allowed == WARPED_CAUSAL) { + return 1; + } + + return 0; +} + +static void calc_num_proj_ref(AV1_COMP *cpi, MACROBLOCK *x, MB_MODE_INFO *mi) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + const FeatureFlags *const features = &cm->features; + + mi->num_proj_ref = 1; + WARP_SAMPLE_INFO *const warp_sample_info = + &x->warp_sample_info[mi->ref_frame[0]]; + int *pts0 = warp_sample_info->pts; + int *pts_inref0 = warp_sample_info->pts_inref; + MOTION_MODE last_motion_mode_allowed = SIMPLE_TRANSLATION; + + if (features->switchable_motion_mode) { + // Determine which motion modes to search if more than SIMPLE_TRANSLATION + // is allowed. + last_motion_mode_allowed = motion_mode_allowed( + xd->global_motion, xd, mi, features->allow_warped_motion); + } + + if (last_motion_mode_allowed == WARPED_CAUSAL) { + if (warp_sample_info->num < 0) { + warp_sample_info->num = av1_findSamples(cm, xd, pts0, pts_inref0); + } + mi->num_proj_ref = warp_sample_info->num; + } +} + +static void search_motion_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *this_rdc, + int mi_row, int mi_col, BLOCK_SIZE bsize, + int *this_early_term, int use_model_yrd_large, + int *rate_mv, int64_t best_sse) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + const FeatureFlags *const features = &cm->features; + MB_MODE_INFO *const mi = xd->mi[0]; + RD_STATS pf_rd_stats[MOTION_MODE_SEARCH_SIZE] = { 0 }; + int best_skip = 0; + int best_early_term = 0; + int64_t best_cost = INT64_MAX; + int best_mode_index = -1; + const int interp_filter = features->interp_filter; + + const MOTION_MODE motion_modes[MOTION_MODE_SEARCH_SIZE] = { + SIMPLE_TRANSLATION, WARPED_CAUSAL + }; + int mode_search_size = is_warped_mode_allowed(cpi, x, mi) ? 2 : 1; + + WARP_SAMPLE_INFO *const warp_sample_info = + &x->warp_sample_info[mi->ref_frame[0]]; + int *pts0 = warp_sample_info->pts; + int *pts_inref0 = warp_sample_info->pts_inref; + + const int total_samples = mi->num_proj_ref; + if (total_samples == 0) { + // Do not search WARPED_CAUSAL if there are no samples to use to determine + // warped parameters. 
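+    // (num_proj_ref was filled in by calc_num_proj_ref() from the warp sample
+    // info; with zero neighbor samples the least-squares fit in
+    // av1_find_projection() would have nothing to work with, so the search
+    // below falls back to SIMPLE_TRANSLATION only.)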
+ mode_search_size = 1; + } + + const MB_MODE_INFO base_mbmi = *mi; + MB_MODE_INFO best_mbmi; + + for (int mode_index = 0; mode_index < mode_search_size; ++mode_index) { + int64_t cost = INT64_MAX; + MOTION_MODE motion_mode = motion_modes[mode_index]; + *mi = base_mbmi; + mi->motion_mode = motion_mode; + if (motion_mode == SIMPLE_TRANSLATION) { + mi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + if (use_model_yrd_large) + model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd, + &pf_rd_stats[mode_index], this_early_term, 1, + best_sse, NULL, UINT_MAX); + else + model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[mode_index], NULL, 1, + NULL); + pf_rd_stats[mode_index].rate += + av1_get_switchable_rate(x, xd, cm->features.interp_filter, + cm->seq_params->enable_dual_filter); + cost = RDCOST(x->rdmult, pf_rd_stats[mode_index].rate, + pf_rd_stats[mode_index].dist); + } else if (motion_mode == WARPED_CAUSAL) { + int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; + const ModeCosts *mode_costs = &x->mode_costs; + mi->wm_params.wmtype = DEFAULT_WMTYPE; + mi->interp_filters = + av1_broadcast_interp_filter(av1_unswitchable_filter(interp_filter)); + + memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0)); + memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0)); + // Select the samples according to motion vector difference + if (mi->num_proj_ref > 1) { + mi->num_proj_ref = av1_selectSamples(&mi->mv[0].as_mv, pts, pts_inref, + mi->num_proj_ref, bsize); + } + + // Compute the warped motion parameters with a least squares fit + // using the collected samples + if (!av1_find_projection(mi->num_proj_ref, pts, pts_inref, bsize, + mi->mv[0].as_mv.row, mi->mv[0].as_mv.col, + &mi->wm_params, mi_row, mi_col)) { + if (mi->mode == NEWMV) { + const int_mv mv0 = mi->mv[0]; + const WarpedMotionParams wm_params0 = mi->wm_params; + const int num_proj_ref0 = mi->num_proj_ref; + + const int_mv ref_mv = av1_get_ref_mv(x, 0); + SUBPEL_MOTION_SEARCH_PARAMS ms_params; + av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, + &ref_mv.as_mv, NULL); + + // Refine MV in a small range. + av1_refine_warped_mv(xd, cm, &ms_params, bsize, pts0, pts_inref0, + total_samples, cpi->sf.mv_sf.warp_search_method, + cpi->sf.mv_sf.warp_search_iters); + if (mi->mv[0].as_int == ref_mv.as_int) { + continue; + } + + if (mv0.as_int != mi->mv[0].as_int) { + // Keep the refined MV and WM parameters. + int tmp_rate_mv = av1_mv_bit_cost( + &mi->mv[0].as_mv, &ref_mv.as_mv, x->mv_costs->nmv_joint_cost, + x->mv_costs->mv_cost_stack, MV_COST_WEIGHT); + *rate_mv = tmp_rate_mv; + } else { + // Restore the old MV and WM parameters. 
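+            // (the refined search returned to the original vector, so the
+            // warp model from the initial least-squares fit is restored
+            // rather than re-fit).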
+            mi->mv[0] = mv0;
+            mi->wm_params = wm_params0;
+            mi->num_proj_ref = num_proj_ref0;
+          }
+        }
+        // Build the warped predictor
+        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+                                      AOM_PLANE_Y, av1_num_planes(cm) - 1);
+        if (use_model_yrd_large)
+          model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd,
+                                    &pf_rd_stats[mode_index], this_early_term,
+                                    1, best_sse, NULL, UINT_MAX);
+        else
+          model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[mode_index], NULL,
+                            1, NULL);
+
+        pf_rd_stats[mode_index].rate +=
+            mode_costs->motion_mode_cost[bsize][mi->motion_mode];
+        cost = RDCOST(x->rdmult, pf_rd_stats[mode_index].rate,
+                      pf_rd_stats[mode_index].dist);
+      } else {
+        cost = INT64_MAX;
+      }
+    }
+    if (cost < best_cost) {
+      best_mode_index = mode_index;
+      best_cost = cost;
+      best_skip = pf_rd_stats[mode_index].skip_txfm;
+      best_early_term = *this_early_term;
+      best_mbmi = *mi;
+    }
+  }
+  assert(best_mode_index >= 0 && best_mode_index < MOTION_MODE_SEARCH_SIZE);
+
+  *mi = best_mbmi;
+  this_rdc->rate = pf_rd_stats[best_mode_index].rate;
+  this_rdc->dist = pf_rd_stats[best_mode_index].dist;
+  this_rdc->sse = pf_rd_stats[best_mode_index].sse;
+  this_rdc->skip_txfm = (best_skip || best_early_term);
+  *this_early_term = best_early_term;
+  if (best_mode_index < FILTER_SEARCH_SIZE - 1) {
+    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+                                  AOM_PLANE_Y, AOM_PLANE_Y);
+  }
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+#define COLLECT_NON_SQR_STAT 0
+
+#if COLLECT_NONRD_PICK_MODE_STAT
+
+static AOM_INLINE void print_stage_time(const char *stage_name,
+                                        int64_t stage_time,
+                                        int64_t total_time) {
+  printf(" %s: %ld (%f%%)\n", stage_name, stage_time,
+         100 * stage_time / (float)total_time);
+}
+
+static void print_time(const mode_search_stat_nonrd *const ms_stat,
+                       BLOCK_SIZE bsize, int mi_rows, int mi_cols, int mi_row,
+                       int mi_col) {
+  if ((mi_row + mi_size_high[bsize] >= mi_rows) &&
+      (mi_col + mi_size_wide[bsize] >= mi_cols)) {
+    int64_t total_time = 0l;
+    int32_t total_blocks = 0;
+    for (BLOCK_SIZE bs = 0; bs < BLOCK_SIZES; bs++) {
+      total_time += ms_stat->total_block_times[bs];
+      total_blocks += ms_stat->num_blocks[bs];
+    }
+
+    printf("\n");
+    for (BLOCK_SIZE bs = 0; bs < BLOCK_SIZES; bs++) {
+      if (ms_stat->num_blocks[bs] == 0) {
+        continue;
+      }
+      if (!COLLECT_NON_SQR_STAT && block_size_wide[bs] != block_size_high[bs]) {
+        continue;
+      }
+
+      printf("BLOCK_%dX%d Num %d, Time: %ld (%f%%), Avg_time %f:\n",
+             block_size_wide[bs], block_size_high[bs], ms_stat->num_blocks[bs],
+             ms_stat->total_block_times[bs],
+             100 * ms_stat->total_block_times[bs] / (float)total_time,
+             (float)ms_stat->total_block_times[bs] / ms_stat->num_blocks[bs]);
+      for (int j = 0; j < MB_MODE_COUNT; j++) {
+        if (ms_stat->nonskipped_search_times[bs][j] == 0) {
+          continue;
+        }
+
+        int64_t total_mode_time = ms_stat->nonskipped_search_times[bs][j];
+        printf(" Mode %d, %d/%d tps %f\n", j,
+               ms_stat->num_nonskipped_searches[bs][j],
+               ms_stat->num_searches[bs][j],
+               ms_stat->num_nonskipped_searches[bs][j] > 0
+                   ?
(float)ms_stat->nonskipped_search_times[bs][j] /
+                       ms_stat->num_nonskipped_searches[bs][j]
+                   : 0l);
+        if (j >= INTER_MODE_START) {
+          total_mode_time = ms_stat->ms_time[bs][j] + ms_stat->ifs_time[bs][j] +
+                            ms_stat->model_rd_time[bs][j] +
+                            ms_stat->txfm_time[bs][j];
+          print_stage_time("Motion Search Time", ms_stat->ms_time[bs][j],
+                           total_time);
+          print_stage_time("Filter Search Time", ms_stat->ifs_time[bs][j],
+                           total_time);
+          print_stage_time("Model RD Time", ms_stat->model_rd_time[bs][j],
+                           total_time);
+          print_stage_time("Transform Search Time", ms_stat->txfm_time[bs][j],
+                           total_time);
+        }
+        print_stage_time("Total Mode Time", total_mode_time, total_time);
+      }
+      printf("\n");
+    }
+    printf("Total time = %ld. Total blocks = %d\n", total_time, total_blocks);
+  }
+}
+#endif  // COLLECT_NONRD_PICK_MODE_STAT
+
+static bool should_prune_intra_modes_using_neighbors(
+    const MACROBLOCKD *xd, bool enable_intra_mode_pruning_using_neighbors,
+    PREDICTION_MODE this_mode, PREDICTION_MODE above_mode,
+    PREDICTION_MODE left_mode) {
+  if (!enable_intra_mode_pruning_using_neighbors) return false;
+
+  // Avoid pruning of DC_PRED as it is the most probable mode to win as per the
+  // statistics generated for nonrd intra mode evaluations.
+  if (this_mode == DC_PRED) return false;
+
+  // Enable the pruning for current mode only if it is not the winner mode of
+  // both the neighboring blocks (left/top).
+  return xd->up_available && this_mode != above_mode && xd->left_available &&
+         this_mode != left_mode;
+}
+
+void av1_nonrd_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost,
+                               BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mi = xd->mi[0];
+  RD_STATS this_rdc, best_rdc;
+  struct estimate_block_intra_args args;
+  init_estimate_block_intra_args(&args, cpi, x);
+  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+  mi->tx_size =
+      AOMMIN(max_txsize_lookup[bsize],
+             tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]);
+  assert(IMPLIES(xd->lossless[mi->segment_id], mi->tx_size == TX_4X4));
+  const BLOCK_SIZE tx_bsize = txsize_to_bsize[mi->tx_size];
+
+  // If the current block size is the same as the transform block size, enable
+  // mode pruning based on the best SAD so far.
+  if (cpi->sf.rt_sf.prune_intra_mode_using_best_sad_so_far && bsize == tx_bsize)
+    args.prune_mode_based_on_sad = true;
+
+  int *bmode_costs;
+  PREDICTION_MODE best_mode = DC_PRED;
+  const MB_MODE_INFO *above_mi = xd->above_mbmi;
+  const MB_MODE_INFO *left_mi = xd->left_mbmi;
+  const PREDICTION_MODE A = av1_above_block_mode(above_mi);
+  const PREDICTION_MODE L = av1_left_block_mode(left_mi);
+  const int above_ctx = intra_mode_context[A];
+  const int left_ctx = intra_mode_context[L];
+  const unsigned int source_variance = x->source_variance;
+  bmode_costs = x->mode_costs.y_mode_costs[above_ctx][left_ctx];
+
+  av1_invalid_rd_stats(&best_rdc);
+  av1_invalid_rd_stats(&this_rdc);
+
+  init_mbmi_nonrd(mi, DC_PRED, INTRA_FRAME, NONE_FRAME, cm);
+  mi->mv[0].as_int = mi->mv[1].as_int = INVALID_MV;
+
+  // Change the limit of this loop to add other intra prediction
+  // mode tests.
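+  // In the RTC path this table is expected to cover DC_PRED plus a few
+  // directional/smooth modes (the exact set lives in intra_mode_list[] and
+  // RTC_INTRA_MODES); extending coverage means growing that table rather
+  // than this loop body.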
+ for (int mode_index = 0; mode_index < RTC_INTRA_MODES; ++mode_index) { + PREDICTION_MODE this_mode = intra_mode_list[mode_index]; + + // As per the statistics generated for intra mode evaluation in the nonrd + // path, it is found that the probability of H_PRED mode being the winner is + // very low when the best mode so far is V_PRED (out of DC_PRED and V_PRED). + // If V_PRED is the winner mode out of DC_PRED and V_PRED, it could imply + // the presence of a vertically dominant pattern. Hence, H_PRED mode is not + // evaluated. + if (cpi->sf.rt_sf.prune_h_pred_using_best_mode_so_far && + this_mode == H_PRED && best_mode == V_PRED) + continue; + + if (should_prune_intra_modes_using_neighbors( + xd, cpi->sf.rt_sf.enable_intra_mode_pruning_using_neighbors, + this_mode, A, L)) { + // Prune V_PRED and H_PRED if source variance of the block is less than + // or equal to 50. The source variance threshold is obtained empirically. + if ((this_mode == V_PRED || this_mode == H_PRED) && source_variance <= 50) + continue; + + // As per the statistics, probability of SMOOTH_PRED being the winner is + // low when best mode so far is DC_PRED (out of DC_PRED, V_PRED and + // H_PRED). Hence, SMOOTH_PRED mode is not evaluated. + if (best_mode == DC_PRED && this_mode == SMOOTH_PRED) continue; + } + + this_rdc.dist = this_rdc.rate = 0; + args.mode = this_mode; + args.skippable = 1; + args.rdc = &this_rdc; + mi->mode = this_mode; + av1_foreach_transformed_block_in_plane(xd, bsize, AOM_PLANE_Y, + av1_estimate_block_intra, &args); + + if (this_rdc.rate == INT_MAX) continue; + + const int skip_ctx = av1_get_skip_txfm_context(xd); + if (args.skippable) { + this_rdc.rate = x->mode_costs.skip_txfm_cost[skip_ctx][1]; + } else { + this_rdc.rate += x->mode_costs.skip_txfm_cost[skip_ctx][0]; + } + this_rdc.rate += bmode_costs[this_mode]; + this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist); + + if (this_rdc.rdcost < best_rdc.rdcost) { + best_rdc = this_rdc; + best_mode = this_mode; + if (!this_rdc.skip_txfm) { + memset(ctx->blk_skip, 0, + sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk); + } + } + } + + mi->mode = best_mode; + // Keep DC for UV since mode test is based on Y channel only. + mi->uv_mode = UV_DC_PRED; + *rd_cost = best_rdc; + + // For lossless: always force the skip flags off. + // Even though the blk_skip is set to 0 above in the rdcost comparison, + // do it here again in case the above logic changes. 
+  if (is_lossless_requested(&cpi->oxcf.rc_cfg)) {
+    x->txfm_search_info.skip_txfm = 0;
+    memset(ctx->blk_skip, 0,
+           sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
+  }
+
+#if CONFIG_INTERNAL_STATS
+  store_coding_context_nonrd(x, ctx, mi->mode);
+#else
+  store_coding_context_nonrd(x, ctx);
+#endif  // CONFIG_INTERNAL_STATS
+}
+
+static AOM_INLINE int is_same_gf_and_last_scale(AV1_COMMON *cm) {
+  struct scale_factors *const sf_last = get_ref_scale_factors(cm, LAST_FRAME);
+  struct scale_factors *const sf_golden =
+      get_ref_scale_factors(cm, GOLDEN_FRAME);
+  return ((sf_last->x_scale_fp == sf_golden->x_scale_fp) &&
+          (sf_last->y_scale_fp == sf_golden->y_scale_fp));
+}
+
+static AOM_INLINE void get_ref_frame_use_mask(AV1_COMP *cpi, MACROBLOCK *x,
+                                              MB_MODE_INFO *mi, int mi_row,
+                                              int mi_col, BLOCK_SIZE bsize,
+                                              int gf_temporal_ref,
+                                              int use_ref_frame[],
+                                              int *force_skip_low_temp_var) {
+  AV1_COMMON *const cm = &cpi->common;
+  const struct segmentation *const seg = &cm->seg;
+  const int is_small_sb = (cm->seq_params->sb_size == BLOCK_64X64);
+
+  // When the ref_frame_config is used to set the reference frame structure
+  // then the usage of alt_ref is determined by the ref_frame_flags
+  // (and not the speed feature use_nonrd_altref_frame).
+  int use_alt_ref_frame = cpi->ppi->rtc_ref.set_ref_frame_config ||
+                          cpi->sf.rt_sf.use_nonrd_altref_frame;
+
+  int use_golden_ref_frame = 1;
+  int use_last_ref_frame = 1;
+
+  // When the ref_frame_config is used to set the reference frame structure:
+  // check if LAST is used as a reference. And only remove golden and altref
+  // references below if last is used as a reference.
+  if (cpi->ppi->rtc_ref.set_ref_frame_config)
+    use_last_ref_frame =
+        cpi->ref_frame_flags & AOM_LAST_FLAG ? use_last_ref_frame : 0;
+
+  // frames_since_golden is not used when the user sets the reference
+  // structure.
+  if (!cpi->ppi->rtc_ref.set_ref_frame_config && use_last_ref_frame &&
+      cpi->rc.frames_since_golden == 0 && gf_temporal_ref) {
+    use_golden_ref_frame = 0;
+  }
+
+  if (use_last_ref_frame && cpi->sf.rt_sf.short_circuit_low_temp_var &&
+      x->nonrd_prune_ref_frame_search) {
+    if (is_small_sb)
+      *force_skip_low_temp_var = av1_get_force_skip_low_temp_var_small_sb(
+          &x->part_search_info.variance_low[0], mi_row, mi_col, bsize);
+    else
+      *force_skip_low_temp_var = av1_get_force_skip_low_temp_var(
+          &x->part_search_info.variance_low[0], mi_row, mi_col, bsize);
+    // If force_skip_low_temp_var is set, skip golden reference.
+    if (*force_skip_low_temp_var) {
+      use_golden_ref_frame = 0;
+      use_alt_ref_frame = 0;
+    }
+  }
+
+  if (use_last_ref_frame &&
+      (x->nonrd_prune_ref_frame_search > 2 || x->force_zeromv_skip_for_blk ||
+       (x->nonrd_prune_ref_frame_search > 1 && bsize > BLOCK_64X64))) {
+    use_golden_ref_frame = 0;
+    use_alt_ref_frame = 0;
+  }
+
+  if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) &&
+      get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) == GOLDEN_FRAME) {
+    use_golden_ref_frame = 1;
+    use_alt_ref_frame = 0;
+  }
+
+  // Skip golden/altref reference if color is set, on flat blocks with motion.
+  // For screen: always skip golden/alt (if color_sensitivity_sb_g/alt is set)
+  // except when x->nonrd_prune_ref_frame_search = 0. This latter flag
+  // may be set in the variance partition when golden is a much better
+  // reference than last, in which case it may not be worth skipping
+  // golden/altref completely.
+  // Condition on use_last_ref to make sure there remains at least one
+  // reference.
+  if (use_last_ref_frame &&
+      ((cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+        x->nonrd_prune_ref_frame_search != 0) ||
+       (x->source_variance < 200 &&
+        x->content_state_sb.source_sad_nonrd >= kLowSad))) {
+    if (x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 ||
+        x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_V)] == 1)
+      use_golden_ref_frame = 0;
+    if (x->color_sensitivity_sb_alt[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 ||
+        x->color_sensitivity_sb_alt[COLOR_SENS_IDX(AOM_PLANE_V)] == 1)
+      use_alt_ref_frame = 0;
+  }
+
+  // For non-screen: if golden and altref are not being selected as references
+  // (use_golden_ref_frame/use_alt_ref_frame = 0) check to allow golden back
+  // based on the SAD of nearest/nearmv of LAST ref. If this block SAD is
+  // large, keep golden as reference. Only do this for the aggressive pruning
+  // mode and avoid it when color is set for golden reference.
+  if (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN &&
+      (cpi->ref_frame_flags & AOM_LAST_FLAG) && !use_golden_ref_frame &&
+      !use_alt_ref_frame && x->pred_mv_sad[LAST_FRAME] != INT_MAX &&
+      x->nonrd_prune_ref_frame_search > 2 &&
+      x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 &&
+      x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_V)] == 0) {
+    int thr = (cm->width * cm->height > RESOLUTION_288P) ? 100 : 150;
+    int pred = x->pred_mv_sad[LAST_FRAME] >>
+               (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+    if (pred > thr) use_golden_ref_frame = 1;
+  }
+
+  use_alt_ref_frame =
+      cpi->ref_frame_flags & AOM_ALT_FLAG ? use_alt_ref_frame : 0;
+  use_golden_ref_frame =
+      cpi->ref_frame_flags & AOM_GOLD_FLAG ? use_golden_ref_frame : 0;
+
+  // For spatial layers: enable golden ref if it is set by user and
+  // corresponds to the lower spatial layer.
+  if (cpi->svc.spatial_layer_id > 0 && (cpi->ref_frame_flags & AOM_GOLD_FLAG) &&
+      x->content_state_sb.source_sad_nonrd < kHighSad) {
+    const int buffslot_golden =
+        cpi->ppi->rtc_ref.ref_idx[GOLDEN_FRAME - LAST_FRAME];
+    if (cpi->ppi->rtc_ref.buffer_time_index[buffslot_golden] ==
+        cpi->svc.current_superframe)
+      use_golden_ref_frame = 1;
+  }
+
+  use_ref_frame[ALTREF_FRAME] = use_alt_ref_frame;
+  use_ref_frame[GOLDEN_FRAME] = use_golden_ref_frame;
+  use_ref_frame[LAST_FRAME] = use_last_ref_frame;
+  // Keep this assert on, as only 3 references are used in nonrd_pickmode
+  // (LAST, GOLDEN, ALTREF), and if all 3 are not set by user then this
+  // frame must be an intra-only frame and hence should never enter the
+  // pickmode here for inter frames.
+ assert(use_last_ref_frame || use_golden_ref_frame || use_alt_ref_frame); +} + +static AOM_INLINE int is_filter_search_enabled_blk( + AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, BLOCK_SIZE bsize, + int segment_id, int cb_pred_filter_search, InterpFilter *filt_select) { + const AV1_COMMON *const cm = &cpi->common; + // filt search disabled + if (!cpi->sf.rt_sf.use_nonrd_filter_search) return 0; + // filt search purely based on mode properties + if (!cb_pred_filter_search) return 1; + MACROBLOCKD *const xd = &x->e_mbd; + int enable_interp_search = 0; + if (!(xd->left_mbmi && xd->above_mbmi)) { + // neighbors info unavailable + enable_interp_search = 2; + } else if (!(is_inter_block(xd->left_mbmi) && + is_inter_block(xd->above_mbmi))) { + // neighbor is INTRA + enable_interp_search = 2; + } else if (xd->left_mbmi->interp_filters.as_int != + xd->above_mbmi->interp_filters.as_int) { + // filters are different + enable_interp_search = 2; + } else if ((cb_pred_filter_search == 1) && + (xd->left_mbmi->interp_filters.as_filters.x_filter != + EIGHTTAP_REGULAR)) { + // not regular + enable_interp_search = 2; + } else { + // enable prediction based on chessboard pattern + if (xd->left_mbmi->interp_filters.as_filters.x_filter == EIGHTTAP_SMOOTH) + *filt_select = EIGHTTAP_SMOOTH; + const int bsl = mi_size_wide_log2[bsize]; + enable_interp_search = + (bool)((((mi_row + mi_col) >> bsl) + + get_chessboard_index(cm->current_frame.frame_number)) & + 0x1); + if (cyclic_refresh_segment_id_boosted(segment_id)) enable_interp_search = 1; + } + return enable_interp_search; +} + +static AOM_INLINE int skip_mode_by_threshold( + PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, int_mv mv, + int frames_since_golden, const int *const rd_threshes, + const int *const rd_thresh_freq_fact, int64_t best_cost, int best_skip, + int extra_shift) { + int skip_this_mode = 0; + const THR_MODES mode_index = mode_idx[ref_frame][INTER_OFFSET(mode)]; + int64_t mode_rd_thresh = + best_skip ? ((int64_t)rd_threshes[mode_index]) << (extra_shift + 1) + : ((int64_t)rd_threshes[mode_index]) << extra_shift; + + // Increase mode_rd_thresh value for non-LAST for improved encoding + // speed + if (ref_frame != LAST_FRAME) { + mode_rd_thresh = mode_rd_thresh << 1; + if (ref_frame == GOLDEN_FRAME && frames_since_golden > 4) + mode_rd_thresh = mode_rd_thresh << (extra_shift + 1); + } + + if (rd_less_than_thresh(best_cost, mode_rd_thresh, + rd_thresh_freq_fact[mode_index])) + if (mv.as_int != 0) skip_this_mode = 1; + + return skip_this_mode; +} + +static AOM_INLINE int skip_mode_by_low_temp( + PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, BLOCK_SIZE bsize, + CONTENT_STATE_SB content_state_sb, int_mv mv, int force_skip_low_temp_var) { + // Skip non-zeromv mode search for non-LAST frame if force_skip_low_temp_var + // is set. If nearestmv for golden frame is 0, zeromv mode will be skipped + // later. 
+  if (force_skip_low_temp_var && ref_frame != LAST_FRAME && mv.as_int != 0) {
+    return 1;
+  }
+
+  if (content_state_sb.source_sad_nonrd != kHighSad && bsize >= BLOCK_64X64 &&
+      force_skip_low_temp_var && mode == NEWMV) {
+    return 1;
+  }
+  return 0;
+}
+
+static AOM_INLINE int skip_mode_by_bsize_and_ref_frame(
+    PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, BLOCK_SIZE bsize,
+    int extra_prune, unsigned int sse_zeromv_norm, int more_prune) {
+  const unsigned int thresh_skip_golden = 500;
+
+  if (ref_frame != LAST_FRAME && sse_zeromv_norm < thresh_skip_golden &&
+      mode == NEWMV)
+    return 1;
+
+  if (bsize == BLOCK_128X128 && mode == NEWMV) return 1;
+
+  // Skip testing non-LAST if this flag is set.
+  if (extra_prune) {
+    if (extra_prune > 1 && ref_frame != LAST_FRAME &&
+        (bsize > BLOCK_16X16 && mode == NEWMV))
+      return 1;
+
+    if (ref_frame != LAST_FRAME && mode == NEARMV) return 1;
+
+    if (more_prune && bsize >= BLOCK_32X32 && mode == NEARMV) return 1;
+  }
+  return 0;
+}
+
+static void set_block_source_sad(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+                                 struct buf_2d *yv12_mb) {
+  struct macroblock_plane *const p = &x->plane[0];
+  const int y_sad = cpi->ppi->fn_ptr[bsize].sdf(p->src.buf, p->src.stride,
+                                                yv12_mb->buf, yv12_mb->stride);
+  if (y_sad == 0) x->block_is_zero_sad = 1;
+}
+
+static void set_color_sensitivity(AV1_COMP *cpi, MACROBLOCK *x,
+                                  BLOCK_SIZE bsize, int y_sad,
+                                  unsigned int source_variance,
+                                  struct buf_2d yv12_mb[MAX_MB_PLANE]) {
+  const int subsampling_x = cpi->common.seq_params->subsampling_x;
+  const int subsampling_y = cpi->common.seq_params->subsampling_y;
+  const int source_sad_nonrd = x->content_state_sb.source_sad_nonrd;
+  const int high_res = cpi->common.width * cpi->common.height >= 640 * 360;
+  if (bsize == cpi->common.seq_params->sb_size) {
+    // At superblock level color_sensitivity is already set to 0, 1, or 2.
+    // 2 is middle/uncertain level. To avoid additional SAD computations
+    // when bsize = sb_size, force level 2 to 1 (certain color) for motion
+    // areas.
+    if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 2) {
+      x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] =
+          source_sad_nonrd >= kMedSad ? 1 : 0;
+    }
+    if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 2) {
+      x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] =
+          source_sad_nonrd >= kMedSad ? 1 : 0;
+    }
+    return;
+  }
+  int shift = 3;
+  unsigned int source_var_thr = 50;
+  int uv_sad_thr = 100;
+  if (source_sad_nonrd >= kMedSad && x->source_variance > 0 && high_res)
+    shift = 4;
+  if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
+    if (cpi->rc.high_source_sad) shift = 6;
+    if (source_sad_nonrd > kMedSad) {
+      source_var_thr = 1200;
+      uv_sad_thr = 10;
+    }
+  }
+  NOISE_LEVEL noise_level = kLow;
+  int norm_sad =
+      y_sad >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+  unsigned int thresh_spatial = (cpi->common.width > 1920) ? 5000 : 1000;
+  // If the spatial source variance is high and the normalized y_sad
+  // is low, then the y-channel is likely good for mode estimation, so keep
+  // color_sensitivity off. Restrict this to low-noise content for now, since
+  // there is some BD-rate regression for noisy color clips.
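+  // Example (hypothetical numbers): for a 1080p clip (width <= 1920),
+  // thresh_spatial = 1000, so a block with source_variance = 2500 and
+  // norm_sad = 20 keeps both chroma sensitivity flags at 0 and skips the
+  // per-plane SAD checks below, provided the estimated noise level is kLow.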
+  if (cpi->noise_estimate.enabled)
+    noise_level = av1_noise_estimate_extract_level(&cpi->noise_estimate);
+  if (noise_level == kLow && source_variance > thresh_spatial &&
+      cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN && norm_sad < 50) {
+    x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] = 0;
+    x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] = 0;
+    return;
+  }
+  const int num_planes = av1_num_planes(&cpi->common);
+
+  for (int plane = AOM_PLANE_U; plane < num_planes; ++plane) {
+    // Always check if level = 2. If level = 0, check again for
+    // motion areas for higher resolutions, where color artifacts
+    // are more noticeable.
+    if (x->color_sensitivity[COLOR_SENS_IDX(plane)] == 2 ||
+        (x->color_sensitivity[COLOR_SENS_IDX(plane)] == 0 &&
+         source_sad_nonrd >= kMedSad && high_res)) {
+      struct macroblock_plane *const p = &x->plane[plane];
+      const BLOCK_SIZE bs =
+          get_plane_block_size(bsize, subsampling_x, subsampling_y);
+
+      const int uv_sad = cpi->ppi->fn_ptr[bs].sdf(
+          p->src.buf, p->src.stride, yv12_mb[plane].buf, yv12_mb[plane].stride);
+
+      const int norm_uv_sad =
+          uv_sad >> (b_width_log2_lookup[bs] + b_height_log2_lookup[bs]);
+      x->color_sensitivity[COLOR_SENS_IDX(plane)] =
+          uv_sad > (y_sad >> shift) && norm_uv_sad > 40;
+      if (source_variance < source_var_thr && norm_uv_sad > uv_sad_thr)
+        x->color_sensitivity[COLOR_SENS_IDX(plane)] = 1;
+    }
+  }
+}
+
+static void setup_compound_prediction(const AV1_COMMON *cm, MACROBLOCK *x,
+                                      struct buf_2d yv12_mb[8][MAX_MB_PLANE],
+                                      const int *use_ref_frame_mask,
+                                      const MV_REFERENCE_FRAME *rf,
+                                      int *ref_mv_idx) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+  MV_REFERENCE_FRAME ref_frame_comp;
+  if (!use_ref_frame_mask[rf[1]]) {
+    // Need to setup pred_block, if it hasn't been done in find_predictors.
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, rf[1]);
+ const int num_planes = av1_num_planes(cm);
+ if (yv12 != NULL) {
+ const struct scale_factors *const sf =
+ get_ref_scale_factors_const(cm, rf[1]);
+ av1_setup_pred_block(xd, yv12_mb[rf[1]], yv12, sf, sf, num_planes);
+ }
+ }
+ ref_frame_comp = av1_ref_frame_type(rf);
+ mbmi_ext->mode_context[ref_frame_comp] = 0;
+ mbmi_ext->ref_mv_count[ref_frame_comp] = UINT8_MAX;
+ av1_find_mv_refs(cm, xd, mbmi, ref_frame_comp, mbmi_ext->ref_mv_count,
+ xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs,
+ mbmi_ext->mode_context);
+ av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame_comp);
+ *ref_mv_idx = mbmi->ref_mv_idx + 1;
+}
+
+static void set_compound_mode(MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame,
+ MV_REFERENCE_FRAME ref_frame2, int ref_mv_idx,
+ int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES],
+ PREDICTION_MODE this_mode) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mi = xd->mi[0];
+ mi->ref_frame[0] = ref_frame;
+ mi->ref_frame[1] = ref_frame2;
+ mi->compound_idx = 1;
+ mi->comp_group_idx = 0;
+ mi->interinter_comp.type = COMPOUND_AVERAGE;
+ MV_REFERENCE_FRAME ref_frame_comp = av1_ref_frame_type(mi->ref_frame);
+ if (this_mode == GLOBAL_GLOBALMV) {
+ frame_mv[this_mode][ref_frame].as_int = 0;
+ frame_mv[this_mode][ref_frame2].as_int = 0;
+ } else if (this_mode == NEAREST_NEARESTMV) {
+ frame_mv[this_mode][ref_frame].as_int =
+ xd->ref_mv_stack[ref_frame_comp][0].this_mv.as_int;
+ frame_mv[this_mode][ref_frame2].as_int =
+ xd->ref_mv_stack[ref_frame_comp][0].comp_mv.as_int;
+ } else if (this_mode == NEAR_NEARMV) {
+ frame_mv[this_mode][ref_frame].as_int =
+ xd->ref_mv_stack[ref_frame_comp][ref_mv_idx].this_mv.as_int;
+ frame_mv[this_mode][ref_frame2].as_int =
+ xd->ref_mv_stack[ref_frame_comp][ref_mv_idx].comp_mv.as_int;
+ }
+}
+
+// Prune compound mode if the single mode variance is lower than a fixed
+// percentage of the median value.
+static bool skip_comp_based_on_var(
+ const unsigned int (*single_vars)[REF_FRAMES], BLOCK_SIZE bsize) {
+ unsigned int best_var = UINT_MAX;
+ for (int cur_mode_idx = 0; cur_mode_idx < RTC_INTER_MODES; cur_mode_idx++) {
+ for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) {
+ best_var = AOMMIN(best_var, single_vars[cur_mode_idx][ref_idx]);
+ }
+ }
+ const unsigned int thresh_64 = (unsigned int)(0.57356805f * 8659);
+ const unsigned int thresh_32 = (unsigned int)(0.23964763f * 4281);
+
+ // Currently, the thresholds for 128 and 16 are not well-tuned. We are using
+ // the results from 64 and 32 as a heuristic.
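+ // With the constants above, thresh_64 works out to ~4966 and thresh_32 to
+ // ~1025, so the per-block-size cutoffs below are roughly 19864 (128X128),
+ // 4966 (64X64), 1025 (32X32) and 256 (16X16).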
+ switch (bsize) { + case BLOCK_128X128: return best_var < 4 * thresh_64; + case BLOCK_64X64: return best_var < thresh_64; + case BLOCK_32X32: return best_var < thresh_32; + case BLOCK_16X16: return best_var < thresh_32 / 4; + default: return false; + } +} + +static AOM_FORCE_INLINE void fill_single_inter_mode_costs( + int (*single_inter_mode_costs)[REF_FRAMES], int num_inter_modes, + const REF_MODE *reference_mode_set, const ModeCosts *mode_costs, + const int16_t *mode_context) { + bool ref_frame_used[REF_FRAMES] = { false }; + for (int idx = 0; idx < num_inter_modes; idx++) { + ref_frame_used[reference_mode_set[idx].ref_frame] = true; + } + + for (int this_ref_frame = LAST_FRAME; this_ref_frame < REF_FRAMES; + this_ref_frame++) { + if (!ref_frame_used[this_ref_frame]) { + continue; + } + + const MV_REFERENCE_FRAME rf[2] = { this_ref_frame, NONE_FRAME }; + const int16_t mode_ctx = av1_mode_context_analyzer(mode_context, rf); + for (PREDICTION_MODE this_mode = NEARESTMV; this_mode <= NEWMV; + this_mode++) { + single_inter_mode_costs[INTER_OFFSET(this_mode)][this_ref_frame] = + cost_mv_ref(mode_costs, this_mode, mode_ctx); + } + } +} + +static AOM_INLINE bool is_globalmv_better( + PREDICTION_MODE this_mode, MV_REFERENCE_FRAME ref_frame, int rate_mv, + const ModeCosts *mode_costs, + const int (*single_inter_mode_costs)[REF_FRAMES], + const MB_MODE_INFO_EXT *mbmi_ext) { + const int globalmv_mode_cost = + single_inter_mode_costs[INTER_OFFSET(GLOBALMV)][ref_frame]; + int this_mode_cost = + rate_mv + single_inter_mode_costs[INTER_OFFSET(this_mode)][ref_frame]; + if (this_mode == NEWMV || this_mode == NEARMV) { + const MV_REFERENCE_FRAME rf[2] = { ref_frame, NONE_FRAME }; + this_mode_cost += get_drl_cost( + NEWMV, 0, mbmi_ext, mode_costs->drl_mode_cost0, av1_ref_frame_type(rf)); + } + return this_mode_cost > globalmv_mode_cost; +} + +// Set up the mv/ref_frames etc based on the comp_index. Returns 1 if it +// succeeds, 0 if it fails. 
+static AOM_INLINE int setup_compound_params_from_comp_idx( + const AV1_COMP *cpi, MACROBLOCK *x, struct buf_2d yv12_mb[8][MAX_MB_PLANE], + PREDICTION_MODE *this_mode, MV_REFERENCE_FRAME *ref_frame, + MV_REFERENCE_FRAME *ref_frame2, int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES], + const int *use_ref_frame_mask, int comp_index, + bool comp_use_zero_zeromv_only, MV_REFERENCE_FRAME *last_comp_ref_frame, + BLOCK_SIZE bsize) { + const MV_REFERENCE_FRAME *rf = comp_ref_mode_set[comp_index].ref_frame; + int skip_gf = 0; + int skip_alt = 0; + *this_mode = comp_ref_mode_set[comp_index].pred_mode; + *ref_frame = rf[0]; + *ref_frame2 = rf[1]; + assert(*ref_frame == LAST_FRAME); + assert(*this_mode == GLOBAL_GLOBALMV || *this_mode == NEAREST_NEARESTMV); + if (x->source_variance < 50 && bsize > BLOCK_16X16) { + if (x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 || + x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_V)] == 1) + skip_gf = 1; + if (x->color_sensitivity_sb_alt[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 || + x->color_sensitivity_sb_alt[COLOR_SENS_IDX(AOM_PLANE_V)] == 1) + skip_alt = 1; + } + if (comp_use_zero_zeromv_only && *this_mode != GLOBAL_GLOBALMV) { + return 0; + } + if (*ref_frame2 == GOLDEN_FRAME && + (cpi->sf.rt_sf.ref_frame_comp_nonrd[0] == 0 || skip_gf || + !(cpi->ref_frame_flags & AOM_GOLD_FLAG))) { + return 0; + } else if (*ref_frame2 == LAST2_FRAME && + (cpi->sf.rt_sf.ref_frame_comp_nonrd[1] == 0 || + !(cpi->ref_frame_flags & AOM_LAST2_FLAG))) { + return 0; + } else if (*ref_frame2 == ALTREF_FRAME && + (cpi->sf.rt_sf.ref_frame_comp_nonrd[2] == 0 || skip_alt || + !(cpi->ref_frame_flags & AOM_ALT_FLAG))) { + return 0; + } + int ref_mv_idx = 0; + if (*last_comp_ref_frame != rf[1]) { + // Only needs to be done once per reference pair. + setup_compound_prediction(&cpi->common, x, yv12_mb, use_ref_frame_mask, rf, + &ref_mv_idx); + *last_comp_ref_frame = rf[1]; + } + set_compound_mode(x, *ref_frame, *ref_frame2, ref_mv_idx, frame_mv, + *this_mode); + if (*this_mode != GLOBAL_GLOBALMV && + frame_mv[*this_mode][*ref_frame].as_int == 0 && + frame_mv[*this_mode][*ref_frame2].as_int == 0) { + return 0; + } + + return 1; +} + +static AOM_INLINE bool previous_mode_performed_poorly( + PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, + const unsigned int (*vars)[REF_FRAMES], + const int64_t (*uv_dist)[REF_FRAMES]) { + unsigned int best_var = UINT_MAX; + int64_t best_uv_dist = INT64_MAX; + for (int midx = 0; midx < RTC_INTER_MODES; midx++) { + best_var = AOMMIN(best_var, vars[midx][ref_frame]); + best_uv_dist = AOMMIN(best_uv_dist, uv_dist[midx][ref_frame]); + } + assert(best_var != UINT_MAX && "Invalid variance data."); + const float mult = 1.125f; + bool var_bad = mult * best_var < vars[INTER_OFFSET(mode)][ref_frame]; + if (uv_dist[INTER_OFFSET(mode)][ref_frame] < INT64_MAX && + best_uv_dist != uv_dist[INTER_OFFSET(mode)][ref_frame]) { + // If we have chroma info, then take it into account + var_bad &= mult * best_uv_dist < uv_dist[INTER_OFFSET(mode)][ref_frame]; + } + return var_bad; +} + +static AOM_INLINE bool prune_compoundmode_with_singlemode_var( + PREDICTION_MODE compound_mode, MV_REFERENCE_FRAME ref_frame, + MV_REFERENCE_FRAME ref_frame2, const int_mv (*frame_mv)[REF_FRAMES], + const uint8_t (*mode_checked)[REF_FRAMES], + const unsigned int (*vars)[REF_FRAMES], + const int64_t (*uv_dist)[REF_FRAMES]) { + const PREDICTION_MODE single_mode0 = compound_ref0_mode(compound_mode); + const PREDICTION_MODE single_mode1 = compound_ref1_mode(compound_mode); + + bool first_ref_valid = 
false, second_ref_valid = false;
+ bool first_ref_bad = false, second_ref_bad = false;
+ if (mode_checked[single_mode0][ref_frame] &&
+ frame_mv[single_mode0][ref_frame].as_int ==
+ frame_mv[compound_mode][ref_frame].as_int &&
+ vars[INTER_OFFSET(single_mode0)][ref_frame] < UINT_MAX) {
+ first_ref_valid = true;
+ first_ref_bad =
+ previous_mode_performed_poorly(single_mode0, ref_frame, vars, uv_dist);
+ }
+ if (mode_checked[single_mode1][ref_frame2] &&
+ frame_mv[single_mode1][ref_frame2].as_int ==
+ frame_mv[compound_mode][ref_frame2].as_int &&
+ vars[INTER_OFFSET(single_mode1)][ref_frame2] < UINT_MAX) {
+ second_ref_valid = true;
+ second_ref_bad =
+ previous_mode_performed_poorly(single_mode1, ref_frame2, vars, uv_dist);
+ }
+ if (first_ref_valid && second_ref_valid) {
+ return first_ref_bad && second_ref_bad;
+ } else if (first_ref_valid || second_ref_valid) {
+ return first_ref_bad || second_ref_bad;
+ }
+ return false;
+}
+
+// Function to set up parameters used for inter mode evaluation in non-rd.
+static AOM_FORCE_INLINE void set_params_nonrd_pick_inter_mode(
+ AV1_COMP *cpi, MACROBLOCK *x, InterModeSearchStateNonrd *search_state,
+ RD_STATS *rd_cost, int *force_skip_low_temp_var, int mi_row, int mi_col,
+ int gf_temporal_ref, unsigned char segment_id, BLOCK_SIZE bsize
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ ,
+ PICK_MODE_CONTEXT *ctx, int denoise_svc_pickmode
+#endif
+) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ MB_MODE_INFO *const mi = xd->mi[0];
+ const ModeCosts *mode_costs = &x->mode_costs;
+ int skip_pred_mv = 0;
+
+ // Initialize variance and distortion (chroma) for all modes and reference
+ // frames
+ for (int idx = 0; idx < RTC_INTER_MODES; idx++) {
+ for (int ref = 0; ref < REF_FRAMES; ref++) {
+ search_state->vars[idx][ref] = UINT_MAX;
+ search_state->uv_dist[idx][ref] = INT64_MAX;
+ }
+ }
+
+ // Initialize values of color sensitivity with sb level color sensitivity
+ av1_copy(x->color_sensitivity, x->color_sensitivity_sb);
+
+ init_best_pickmode(&search_state->best_pickmode);
+
+ // Estimate cost for single reference frames
+ estimate_single_ref_frame_costs(cm, xd, mode_costs, segment_id, bsize,
+ search_state->ref_costs_single);
+
+ // Reset flags that indicate which modes have been evaluated
+ av1_zero(search_state->mode_checked);
+
+ txfm_info->skip_txfm = 0;
+
+ // Initialize mode decisions
+ av1_invalid_rd_stats(&search_state->best_rdc);
+ av1_invalid_rd_stats(&search_state->this_rdc);
+ av1_invalid_rd_stats(rd_cost);
+ for (int ref_idx = 0; ref_idx < REF_FRAMES; ++ref_idx) {
+ x->warp_sample_info[ref_idx].num = -1;
+ }
+
+ mi->bsize = bsize;
+ mi->ref_frame[0] = NONE_FRAME;
+ mi->ref_frame[1] = NONE_FRAME;
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0) {
+ // if (cpi->ppi->use_svc) denoise_svc_pickmode =
+ // av1_denoise_svc_non_key(cpi);
+ if (cpi->denoiser.denoising_level > kDenLowLow && denoise_svc_pickmode)
+ av1_denoiser_reset_frame_stats(ctx);
+ }
+#endif
+
+ // Populate predicted motion vectors for LAST_FRAME
+ if (cpi->ref_frame_flags & AOM_LAST_FLAG) {
+ find_predictors(cpi, x, LAST_FRAME, search_state->frame_mv,
+ search_state->yv12_mb, bsize, *force_skip_low_temp_var,
+ x->force_zeromv_skip_for_blk,
+ &search_state->use_scaled_ref_frame[LAST_FRAME]);
+ }
+ // Update mask to use all reference frames
+ get_ref_frame_use_mask(cpi, x, mi, mi_row, mi_col, bsize, gf_temporal_ref,
+ search_state->use_ref_frame_mask,
+ force_skip_low_temp_var);
+
+ skip_pred_mv =
x->force_zeromv_skip_for_blk ||
+ (x->nonrd_prune_ref_frame_search > 2 &&
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] != 2 &&
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] != 2);
+
+ // Populate predicted motion vectors for the other single reference frames.
+ // Start at LAST_FRAME + 1.
+ for (MV_REFERENCE_FRAME ref_frame_iter = LAST_FRAME + 1;
+ ref_frame_iter <= ALTREF_FRAME; ++ref_frame_iter) {
+ if (search_state->use_ref_frame_mask[ref_frame_iter]) {
+ find_predictors(cpi, x, ref_frame_iter, search_state->frame_mv,
+ search_state->yv12_mb, bsize, *force_skip_low_temp_var,
+ skip_pred_mv,
+ &search_state->use_scaled_ref_frame[ref_frame_iter]);
+ }
+ }
+}
+
+// Function to check whether the inter mode can be skipped based on mode
+// statistics and speed feature settings.
+static AOM_FORCE_INLINE bool skip_inter_mode_nonrd(
+ AV1_COMP *cpi, MACROBLOCK *x, InterModeSearchStateNonrd *search_state,
+ int64_t *thresh_sad_pred, int *force_mv_inter_layer, int *is_single_pred,
+ PREDICTION_MODE *this_mode, MV_REFERENCE_FRAME *last_comp_ref_frame,
+ MV_REFERENCE_FRAME *ref_frame, MV_REFERENCE_FRAME *ref_frame2, int idx,
+ int_mv svc_mv, int force_skip_low_temp_var, unsigned int sse_zeromv_norm,
+ int num_inter_modes, unsigned char segment_id, BLOCK_SIZE bsize,
+ bool comp_use_zero_zeromv_only, bool check_globalmv) {
+ AV1_COMMON *const cm = &cpi->common;
+ const struct segmentation *const seg = &cm->seg;
+ const SVC *const svc = &cpi->svc;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mi = xd->mi[0];
+ const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf;
+
+ // Skip compound modes based on the reference frame mask and the mode type;
+ // for allowed compound modes, set up the ref mv stack and reference frames.
+ if (idx >= num_inter_modes) {
+ const int comp_index = idx - num_inter_modes;
+ if (!setup_compound_params_from_comp_idx(
+ cpi, x, search_state->yv12_mb, this_mode, ref_frame, ref_frame2,
+ search_state->frame_mv, search_state->use_ref_frame_mask,
+ comp_index, comp_use_zero_zeromv_only, last_comp_ref_frame,
+ bsize)) {
+ return true;
+ }
+ *is_single_pred = 0;
+ } else {
+ *this_mode = ref_mode_set[idx].pred_mode;
+ *ref_frame = ref_mode_set[idx].ref_frame;
+ *ref_frame2 = NONE_FRAME;
+ }
+
+ if (x->sb_me_block && *ref_frame == LAST_FRAME) {
+ // We want to make sure to test the superblock MV:
+ // so don't skip (return false) for NEAREST_LAST or NEAR_LAST if they
+ // have this sb MV. And don't skip NEWMV_LAST: this will be set to the
+ // sb MV in handle_inter_mode_nonrd(), in case NEAREST or NEAR don't
+ // have it.
+ if (*this_mode == NEARESTMV &&
+ search_state->frame_mv[NEARESTMV][LAST_FRAME].as_int ==
+ x->sb_me_mv.as_int) {
+ return false;
+ }
+ if (*this_mode == NEARMV &&
+ search_state->frame_mv[NEARMV][LAST_FRAME].as_int ==
+ x->sb_me_mv.as_int) {
+ return false;
+ }
+ if (*this_mode == NEWMV) {
+ return false;
+ }
+ }
+
+ // Skip the single reference mode if its mode-check flag is already set.
+ if (*is_single_pred && search_state->mode_checked[*this_mode][*ref_frame]) {
+ return true;
+ }
+
+ // Skip GLOBALMV mode if the check_globalmv flag is not enabled.
+ if (!check_globalmv && *this_mode == GLOBALMV) {
+ return true;
+ }
+
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_start(&x->ms_stat_nonrd.timer1);
+ x->ms_stat_nonrd.num_searches[bsize][*this_mode]++;
+#endif
+ mi->mode = *this_mode;
+ mi->ref_frame[0] = *ref_frame;
+ mi->ref_frame[1] = *ref_frame2;
+
+ // Skip the mode if the use-reference-frame mask flag is not set.
+ if (!search_state->use_ref_frame_mask[*ref_frame]) return true;
+
+ // Skip certain modes and reference frames when the
+ // force_zeromv_skip_for_blk flag is true.
+ if (x->force_zeromv_skip_for_blk &&
+ ((!(*this_mode == NEARESTMV &&
+ search_state->frame_mv[*this_mode][*ref_frame].as_int == 0) &&
+ *this_mode != GLOBALMV) ||
+ *ref_frame != LAST_FRAME))
+ return true;
+
+ // Skip compound mode based on variance of previously evaluated single
+ // reference modes.
+ if (rt_sf->prune_compoundmode_with_singlemode_var && !*is_single_pred &&
+ prune_compoundmode_with_singlemode_var(
+ *this_mode, *ref_frame, *ref_frame2, search_state->frame_mv,
+ search_state->mode_checked, search_state->vars,
+ search_state->uv_dist)) {
+ return true;
+ }
+
+ *force_mv_inter_layer = 0;
+ if (cpi->ppi->use_svc && svc->spatial_layer_id > 0 &&
+ ((*ref_frame == LAST_FRAME && svc->skip_mvsearch_last) ||
+ (*ref_frame == GOLDEN_FRAME && svc->skip_mvsearch_gf) ||
+ (*ref_frame == ALTREF_FRAME && svc->skip_mvsearch_altref))) {
+ // Only test mode if NEARESTMV/NEARMV is (svc_mv.mv.col, svc_mv.mv.row),
+ // otherwise set NEWMV to (svc_mv.mv.col, svc_mv.mv.row).
+ // Skip newmv and filter search.
+ *force_mv_inter_layer = 1;
+ if (*this_mode == NEWMV) {
+ search_state->frame_mv[*this_mode][*ref_frame] = svc_mv;
+ } else if (search_state->frame_mv[*this_mode][*ref_frame].as_int !=
+ svc_mv.as_int) {
+ return true;
+ }
+ }
+
+ // If the segment reference frame feature is enabled then do nothing if the
+ // current ref frame is not allowed.
+ if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+ get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)(*ref_frame))
+ return true;
+
+ // For screen content: skip mode testing based on source_sad.
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
+ // If source_sad is computed: skip the non-zero motion modes for
+ // stationary (super)blocks. Otherwise, if the superblock has motion,
+ // skip the zero-motion modes on the last reference for flat blocks
+ // whose color sensitivity is not set.
+ // For the latter condition: the same check should also apply to NEWMV
+ // if it ends up as (0, 0), so it is repeated below after search_new_mv.
+ if (rt_sf->source_metrics_sb_nonrd) {
+ if ((search_state->frame_mv[*this_mode][*ref_frame].as_int != 0 &&
+ x->content_state_sb.source_sad_nonrd == kZeroSad) ||
+ (search_state->frame_mv[*this_mode][*ref_frame].as_int == 0 &&
+ x->block_is_zero_sad == 0 && *ref_frame == LAST_FRAME &&
+ ((x->color_sensitivity_sb[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 &&
+ x->color_sensitivity_sb[COLOR_SENS_IDX(AOM_PLANE_V)] == 0) ||
+ cpi->rc.high_source_sad) &&
+ x->source_variance == 0))
+ return true;
+ }
+ // Skip NEWMV search for flat blocks.
+ if (*this_mode == NEWMV && x->source_variance < 100) return true;
+ // Skip non-LAST for color on flat blocks.
+ if (*ref_frame > LAST_FRAME && x->source_variance == 0 &&
+ (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 ||
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 1))
+ return true;
+ }
+
+ // Skip mode based on block size, reference frame, mode, and other block
+ // properties.
+ if (skip_mode_by_bsize_and_ref_frame(
+ *this_mode, *ref_frame, bsize, x->nonrd_prune_ref_frame_search,
+ sse_zeromv_norm, rt_sf->nonrd_aggressive_skip))
+ return true;
+
+ // Skip mode based on low temporal variance and source SAD.
+ if (skip_mode_by_low_temp(*this_mode, *ref_frame, bsize, x->content_state_sb,
+ search_state->frame_mv[*this_mode][*ref_frame],
+ force_skip_low_temp_var))
+ return true;
+
+ // Disable this drop-out case if the ref frame segment level feature is
+ // enabled for this segment. This is to prevent the possibility that we
+ // end up unable to pick any mode.
+ if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
+ // Check for skipping GOLDEN and ALTREF based on pred_mv_sad.
+ if (rt_sf->nonrd_prune_ref_frame_search > 0 &&
+ x->pred_mv_sad[*ref_frame] != INT_MAX && *ref_frame != LAST_FRAME) {
+ if ((int64_t)(x->pred_mv_sad[*ref_frame]) > *thresh_sad_pred) return true;
+ }
+ }
+
+ // Check for skipping NEARMV based on pred_mv_sad.
+ if (*this_mode == NEARMV && x->pred_mv1_sad[*ref_frame] != INT_MAX &&
+ x->pred_mv1_sad[*ref_frame] > (x->pred_mv0_sad[*ref_frame] << 1))
+ return true;
+
+ // Skip single reference mode based on rd threshold.
+ if (*is_single_pred) {
+ if (skip_mode_by_threshold(
+ *this_mode, *ref_frame,
+ search_state->frame_mv[*this_mode][*ref_frame],
+ cpi->rc.frames_since_golden, cpi->rd.threshes[segment_id][bsize],
+ x->thresh_freq_fact[bsize], search_state->best_rdc.rdcost,
+ search_state->best_pickmode.best_mode_skip_txfm,
+ (rt_sf->nonrd_aggressive_skip ? 1 : 0)))
+ return true;
+ }
+ return false;
+}
+
+// Function to perform inter mode evaluation for non-rd.
+static AOM_FORCE_INLINE bool handle_inter_mode_nonrd(
+ AV1_COMP *cpi, MACROBLOCK *x, InterModeSearchStateNonrd *search_state,
+ PICK_MODE_CONTEXT *ctx, PRED_BUFFER **this_mode_pred,
+ PRED_BUFFER *tmp_buffer, InterPredParams inter_pred_params_sr,
+ int *best_early_term, unsigned int *sse_zeromv_norm, bool *check_globalmv,
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ int64_t *zero_last_cost_orig, int denoise_svc_pickmode,
+#endif
+ int idx, int force_mv_inter_layer, int is_single_pred, int gf_temporal_ref,
+ int use_model_yrd_large, int filter_search_enabled_blk, BLOCK_SIZE bsize,
+ PREDICTION_MODE this_mode, InterpFilter filt_select,
+ int cb_pred_filter_search, int reuse_inter_pred,
+ int *sb_me_has_been_tested) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mi = xd->mi[0];
+ const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+ const int bw = block_size_wide[bsize];
+ const InterpFilter filter_ref = cm->features.interp_filter;
+ const InterpFilter default_interp_filter = EIGHTTAP_REGULAR;
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf;
+ BEST_PICKMODE *const best_pickmode = &search_state->best_pickmode;
+
+ MV_REFERENCE_FRAME ref_frame = mi->ref_frame[0];
+ MV_REFERENCE_FRAME ref_frame2 = mi->ref_frame[1];
+ int_mv *const this_mv = &search_state->frame_mv[this_mode][ref_frame];
+ unsigned int var = UINT_MAX;
+ int this_early_term = 0;
+ int rate_mv = 0;
+ int is_skippable;
+ int skip_this_mv = 0;
+ unsigned int var_threshold = UINT_MAX;
+ PREDICTION_MODE this_best_mode;
+ RD_STATS nonskip_rdc;
+ av1_invalid_rd_stats(&nonskip_rdc);
+
+ if (x->sb_me_block && this_mode == NEWMV && ref_frame == LAST_FRAME) {
+ // Set the NEWMV_LAST to the sb MV.
+ search_state->frame_mv[NEWMV][LAST_FRAME].as_int = x->sb_me_mv.as_int;
+ } else if (this_mode == NEWMV && !force_mv_inter_layer) {
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_start(&x->ms_stat_nonrd.timer2);
+#endif
+ // Find the best motion vector for single/compound mode.
+ const bool skip_newmv = search_new_mv(
+ cpi, x, search_state->frame_mv, ref_frame, gf_temporal_ref, bsize,
+ mi_row, mi_col, &rate_mv, &search_state->best_rdc);
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_mark(&x->ms_stat_nonrd.timer2);
+ x->ms_stat_nonrd.ms_time[bsize][this_mode] +=
+ aom_usec_timer_elapsed(&x->ms_stat_nonrd.timer2);
+#endif
+ // Skip NEWMV mode:
+ // (i) for bsize smaller than 16X16,
+ // (ii) based on the SAD of the predicted mv w.r.t. LAST_FRAME,
+ // (iii) when the motion vector is the same as the reference mv.
+ if (skip_newmv) {
+ return true;
+ }
+ }
+
+ // Check whether the current motion vector is the same as a previously
+ // evaluated motion vector.
+ for (PREDICTION_MODE inter_mv_mode = NEARESTMV; inter_mv_mode <= NEWMV;
+ inter_mv_mode++) {
+ if (inter_mv_mode == this_mode) continue;
+ if (is_single_pred &&
+ search_state->mode_checked[inter_mv_mode][ref_frame] &&
+ this_mv->as_int ==
+ search_state->frame_mv[inter_mv_mode][ref_frame].as_int) {
+ skip_this_mv = 1;
+ break;
+ }
+ }
+
+ // Skip the single mode if its motion vector is the same as that of a
+ // previously evaluated mode.
+ if (skip_this_mv && is_single_pred) return true;
+
+ // For screen: for spatially flat blocks with non-zero motion, skip NEWMV
+ // if the motion vector is (0, 0) on LAST and color sensitivity is not set.
+ if (this_mode == NEWMV && cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+ cpi->svc.spatial_layer_id == 0 && rt_sf->source_metrics_sb_nonrd) {
+ if (this_mv->as_int == 0 && ref_frame == LAST_FRAME &&
+ x->block_is_zero_sad == 0 &&
+ ((x->color_sensitivity_sb[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 &&
+ x->color_sensitivity_sb[COLOR_SENS_IDX(AOM_PLANE_V)] == 0) ||
+ cpi->rc.high_source_sad) &&
+ x->source_variance == 0)
+ return true;
+ }
+
+ mi->mode = this_mode;
+ mi->mv[0].as_int = this_mv->as_int;
+ mi->mv[1].as_int = 0;
+ if (!is_single_pred)
+ mi->mv[1].as_int = search_state->frame_mv[this_mode][ref_frame2].as_int;
+
+ // Set buffers to store predicted samples for reuse
+ if (reuse_inter_pred) {
+ if (!*this_mode_pred) {
+ *this_mode_pred = &tmp_buffer[3];
+ } else {
+ *this_mode_pred = &tmp_buffer[get_pred_buffer(tmp_buffer, 3)];
+ pd->dst.buf = (*this_mode_pred)->data;
+ pd->dst.stride = bw;
+ }
+ }
+
+ mi->motion_mode = SIMPLE_TRANSLATION;
+#if !CONFIG_REALTIME_ONLY
+ if (cpi->oxcf.motion_mode_cfg.allow_warped_motion) {
+ calc_num_proj_ref(cpi, x, mi);
+ }
+#endif
+ // Set the variance threshold for compound mode pruning.
+ if (rt_sf->prune_compoundmode_with_singlecompound_var && !is_single_pred &&
+ use_model_yrd_large) {
+ const PREDICTION_MODE single_mode0 = compound_ref0_mode(this_mode);
+ const PREDICTION_MODE single_mode1 = compound_ref1_mode(this_mode);
+ var_threshold =
+ AOMMIN(var_threshold,
+ search_state->vars[INTER_OFFSET(single_mode0)][ref_frame]);
+ var_threshold =
+ AOMMIN(var_threshold,
+ search_state->vars[INTER_OFFSET(single_mode1)][ref_frame2]);
+ }
+
+ // Decide the interpolation filter, build the prediction signal, get the sse.
+ const bool is_mv_subpel =
+ (mi->mv[0].as_mv.row & 0x07) || (mi->mv[0].as_mv.col & 0x07);
+ const bool enable_filt_search_this_mode =
+ (filter_search_enabled_blk == 2)
+ ?
true + : (filter_search_enabled_blk && !force_mv_inter_layer && + is_single_pred && + (ref_frame == LAST_FRAME || !x->nonrd_prune_ref_frame_search)); + if (is_mv_subpel && enable_filt_search_this_mode) { +#if COLLECT_NONRD_PICK_MODE_STAT + aom_usec_timer_start(&x->ms_stat_nonrd.timer2); +#endif + search_filter_ref( + cpi, x, &search_state->this_rdc, &inter_pred_params_sr, mi_row, mi_col, + tmp_buffer, bsize, reuse_inter_pred, this_mode_pred, &this_early_term, + &var, use_model_yrd_large, best_pickmode->best_sse, is_single_pred); +#if COLLECT_NONRD_PICK_MODE_STAT + aom_usec_timer_mark(&x->ms_stat_nonrd.timer2); + x->ms_stat_nonrd.ifs_time[bsize][this_mode] += + aom_usec_timer_elapsed(&x->ms_stat_nonrd.timer2); +#endif +#if !CONFIG_REALTIME_ONLY + } else if (cpi->oxcf.motion_mode_cfg.allow_warped_motion && + this_mode == NEWMV) { + // Find the best motion mode when current mode is NEWMV + search_motion_mode(cpi, x, &search_state->this_rdc, mi_row, mi_col, bsize, + &this_early_term, use_model_yrd_large, &rate_mv, + best_pickmode->best_sse); + if (this_mode == NEWMV) { + this_mv[0] = mi->mv[0]; + } +#endif + } else { + mi->interp_filters = + (filter_ref == SWITCHABLE) + ? av1_broadcast_interp_filter(default_interp_filter) + : av1_broadcast_interp_filter(filter_ref); + if (force_mv_inter_layer) + mi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + + // If it is sub-pel motion and cb_pred_filter_search is enabled, select + // the pre-decided filter + if (is_mv_subpel && cb_pred_filter_search) + mi->interp_filters = av1_broadcast_interp_filter(filt_select); + +#if COLLECT_NONRD_PICK_MODE_STAT + aom_usec_timer_start(&x->ms_stat_nonrd.timer2); +#endif + if (is_single_pred) { + SubpelParams subpel_params; + // Initialize inter mode level params for single reference mode. + init_inter_mode_params(&mi->mv[0].as_mv, &inter_pred_params_sr, + &subpel_params, xd->block_ref_scale_factors[0], + pd->pre->width, pd->pre->height); + av1_enc_build_inter_predictor_y_nonrd(xd, &inter_pred_params_sr, + &subpel_params); + } else { + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + } + + if (use_model_yrd_large) { + model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd, + &search_state->this_rdc, &this_early_term, 0, + best_pickmode->best_sse, &var, var_threshold); + } else { + model_rd_for_sb_y(cpi, bsize, x, xd, &search_state->this_rdc, &var, 0, + &this_early_term); + } +#if COLLECT_NONRD_PICK_MODE_STAT + aom_usec_timer_mark(&x->ms_stat_nonrd.timer2); + x->ms_stat_nonrd.model_rd_time[bsize][this_mode] += + aom_usec_timer_elapsed(&x->ms_stat_nonrd.timer2); +#endif + } + + // update variance for single mode + if (is_single_pred) { + search_state->vars[INTER_OFFSET(this_mode)][ref_frame] = var; + if (this_mv->as_int == 0) { + search_state->vars[INTER_OFFSET(GLOBALMV)][ref_frame] = var; + } + } + // prune compound mode based on single mode var threshold + if (!is_single_pred && var > var_threshold) { + if (reuse_inter_pred) free_pred_buffer(*this_mode_pred); + return true; + } + + if (ref_frame == LAST_FRAME && this_mv->as_int == 0) { + *sse_zeromv_norm = (unsigned int)(search_state->this_rdc.sse >> + (b_width_log2_lookup[bsize] + + b_height_log2_lookup[bsize])); + } + + // Perform early termination based on sse. 
+ if (rt_sf->sse_early_term_inter_search &&
+ early_term_inter_search_with_sse(rt_sf->sse_early_term_inter_search,
+ bsize, search_state->this_rdc.sse,
+ best_pickmode->best_sse, this_mode)) {
+ if (reuse_inter_pred) free_pred_buffer(*this_mode_pred);
+ return true;
+ }
+
+#if COLLECT_NONRD_PICK_MODE_STAT
+ x->ms_stat_nonrd.num_nonskipped_searches[bsize][this_mode]++;
+#endif
+
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+ const int skip_txfm_cost = mode_costs->skip_txfm_cost[skip_ctx][1];
+ const int no_skip_txfm_cost = mode_costs->skip_txfm_cost[skip_ctx][0];
+ const int64_t sse_y = search_state->this_rdc.sse;
+
+ if (this_early_term) {
+ search_state->this_rdc.skip_txfm = 1;
+ search_state->this_rdc.rate = skip_txfm_cost;
+ search_state->this_rdc.dist = search_state->this_rdc.sse << 4;
+ } else {
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_start(&x->ms_stat_nonrd.timer2);
+#endif
+ // Calculate the RD cost using the Hadamard transform.
+ av1_block_yrd(x, &search_state->this_rdc, &is_skippable, bsize,
+ mi->tx_size);
+ if (search_state->this_rdc.skip_txfm ||
+ RDCOST(x->rdmult, search_state->this_rdc.rate,
+ search_state->this_rdc.dist) >=
+ RDCOST(x->rdmult, 0, search_state->this_rdc.sse)) {
+ if (!search_state->this_rdc.skip_txfm) {
+ // Need to store the "real" rdc for possible future use if the UV rdc
+ // disallows tx skip
+ nonskip_rdc = search_state->this_rdc;
+ nonskip_rdc.rate += no_skip_txfm_cost;
+ }
+ search_state->this_rdc.rate = skip_txfm_cost;
+ search_state->this_rdc.skip_txfm = 1;
+ search_state->this_rdc.dist = search_state->this_rdc.sse;
+ } else {
+ search_state->this_rdc.rate += no_skip_txfm_cost;
+ }
+
+ // Populate predicted samples for chroma planes based on color sensitivity.
+ if ((x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] ||
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)])) {
+ RD_STATS rdc_uv;
+ const BLOCK_SIZE uv_bsize =
+ get_plane_block_size(bsize, xd->plane[AOM_PLANE_U].subsampling_x,
+ xd->plane[AOM_PLANE_U].subsampling_y);
+ if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)]) {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_U, AOM_PLANE_U);
+ }
+ if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]) {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_V, AOM_PLANE_V);
+ }
+ // Compute sse for chroma planes.
+ const int64_t sse_uv = av1_model_rd_for_sb_uv( + cpi, uv_bsize, x, xd, &rdc_uv, AOM_PLANE_U, AOM_PLANE_V); + if (rdc_uv.dist < x->min_dist_inter_uv) + x->min_dist_inter_uv = rdc_uv.dist; + search_state->this_rdc.sse += sse_uv; + // Restore Y rdc if UV rdc disallows txfm skip + if (search_state->this_rdc.skip_txfm && !rdc_uv.skip_txfm && + nonskip_rdc.rate != INT_MAX) + search_state->this_rdc = nonskip_rdc; + if (is_single_pred) { + search_state->uv_dist[INTER_OFFSET(this_mode)][ref_frame] = rdc_uv.dist; + } + search_state->this_rdc.rate += rdc_uv.rate; + search_state->this_rdc.dist += rdc_uv.dist; + search_state->this_rdc.skip_txfm = + search_state->this_rdc.skip_txfm && rdc_uv.skip_txfm; + } +#if COLLECT_NONRD_PICK_MODE_STAT + aom_usec_timer_mark(&x->ms_stat_nonrd.timer2); + x->ms_stat_nonrd.txfm_time[bsize][this_mode] += + aom_usec_timer_elapsed(&x->ms_stat_nonrd.timer2); +#endif + } + + this_best_mode = this_mode; + // TODO(kyslov) account for UV prediction cost + search_state->this_rdc.rate += rate_mv; + if (!is_single_pred) { + const int16_t mode_ctx = + av1_mode_context_analyzer(mbmi_ext->mode_context, mi->ref_frame); + search_state->this_rdc.rate += cost_mv_ref(mode_costs, this_mode, mode_ctx); + } else { + // If the current mode has zeromv but is not GLOBALMV, compare the rate + // cost. If GLOBALMV is cheaper, use GLOBALMV instead. + if (this_mode != GLOBALMV && + this_mv->as_int == search_state->frame_mv[GLOBALMV][ref_frame].as_int) { + if (is_globalmv_better(this_mode, ref_frame, rate_mv, mode_costs, + search_state->single_inter_mode_costs, mbmi_ext)) { + this_best_mode = GLOBALMV; + } + } + + search_state->this_rdc.rate += + search_state + ->single_inter_mode_costs[INTER_OFFSET(this_best_mode)][ref_frame]; + } + + if (is_single_pred && this_mv->as_int == 0 && var < UINT_MAX) { + search_state->vars[INTER_OFFSET(GLOBALMV)][ref_frame] = var; + } + + search_state->this_rdc.rate += search_state->ref_costs_single[ref_frame]; + + search_state->this_rdc.rdcost = RDCOST(x->rdmult, search_state->this_rdc.rate, + search_state->this_rdc.dist); + if (cpi->oxcf.rc_cfg.mode == AOM_CBR && is_single_pred) { + newmv_diff_bias(xd, this_best_mode, &search_state->this_rdc, bsize, + search_state->frame_mv[this_best_mode][ref_frame].as_mv.row, + search_state->frame_mv[this_best_mode][ref_frame].as_mv.col, + cpi->speed, x->source_variance, x->content_state_sb); + } + +#if CONFIG_AV1_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc_pickmode && + cpi->denoiser.denoising_level > kDenLowLow) { + av1_denoiser_update_frame_stats(mi, sse_y, this_mode, ctx); + // Keep track of zero_last cost. + if (ref_frame == LAST_FRAME && this_mv->as_int == 0) + *zero_last_cost_orig = search_state->this_rdc.rdcost; + } +#else + (void)(sse_y); +#endif + + search_state->mode_checked[this_mode][ref_frame] = 1; + search_state->mode_checked[this_best_mode][ref_frame] = 1; + + if (*check_globalmv) { + int32_t abs_mv = + abs(search_state->frame_mv[this_best_mode][ref_frame].as_mv.row) + + abs(search_state->frame_mv[this_best_mode][ref_frame].as_mv.col); + // Early exit check: if the magnitude of this_best_mode's mv is small + // enough, we skip GLOBALMV check in the next loop iteration. 
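+ // (Motion vectors here are in 1/8-pel units, so abs_mv < 2 means the row
+ // and col magnitudes sum to less than a quarter pel.)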
+ if (abs_mv < 2) { + *check_globalmv = false; + } + } +#if COLLECT_NONRD_PICK_MODE_STAT + aom_usec_timer_mark(&x->ms_stat_nonrd.timer1); + x->ms_stat_nonrd.nonskipped_search_times[bsize][this_mode] += + aom_usec_timer_elapsed(&x->ms_stat_nonrd.timer1); +#endif + + if (x->sb_me_block && ref_frame == LAST_FRAME && + search_state->frame_mv[this_best_mode][ref_frame].as_int == + x->sb_me_mv.as_int) + *sb_me_has_been_tested = 1; + + // Copy best mode params to search state + if (search_state->this_rdc.rdcost < search_state->best_rdc.rdcost) { + search_state->best_rdc = search_state->this_rdc; + *best_early_term = this_early_term; + update_search_state_nonrd(search_state, mi, txfm_info, &nonskip_rdc, ctx, + this_best_mode, sse_y); + + // This is needed for the compound modes. + search_state->frame_mv_best[this_best_mode][ref_frame].as_int = + search_state->frame_mv[this_best_mode][ref_frame].as_int; + if (ref_frame2 > NONE_FRAME) { + search_state->frame_mv_best[this_best_mode][ref_frame2].as_int = + search_state->frame_mv[this_best_mode][ref_frame2].as_int; + } + + if (reuse_inter_pred) { + free_pred_buffer(best_pickmode->best_pred); + best_pickmode->best_pred = *this_mode_pred; + } + } else { + if (reuse_inter_pred) free_pred_buffer(*this_mode_pred); + } + + if (*best_early_term && (idx > 0 || rt_sf->nonrd_aggressive_skip)) { + txfm_info->skip_txfm = 1; + if (!x->sb_me_block || *sb_me_has_been_tested) return false; + } + return true; +} + +// Function to perform screen content mode evaluation for non-rd +static AOM_FORCE_INLINE void handle_screen_content_mode_nonrd( + AV1_COMP *cpi, MACROBLOCK *x, InterModeSearchStateNonrd *search_state, + PRED_BUFFER *this_mode_pred, PICK_MODE_CONTEXT *ctx, + PRED_BUFFER *tmp_buffer, struct buf_2d *orig_dst, int skip_idtx_palette, + int try_palette, BLOCK_SIZE bsize, int reuse_inter_pred, int mi_col, + int mi_row) { + AV1_COMMON *const cm = &cpi->common; + const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mi = xd->mi[0]; + struct macroblockd_plane *const pd = &xd->plane[0]; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + TxfmSearchInfo *txfm_info = &x->txfm_search_info; + BEST_PICKMODE *const best_pickmode = &search_state->best_pickmode; + + // TODO(marpan): Only allow for 8 bit-depth for now, re-enable for 10/12 bit + // when issue 3359 is fixed. + if (cm->seq_params->bit_depth == 8 && + cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN && !skip_idtx_palette && + !cpi->oxcf.txfm_cfg.use_inter_dct_only && !x->force_zeromv_skip_for_blk && + is_inter_mode(best_pickmode->best_mode) && + best_pickmode->best_pred != NULL && + (!rt_sf->prune_idtx_nonrd || + (rt_sf->prune_idtx_nonrd && bsize <= BLOCK_32X32 && + best_pickmode->best_mode_skip_txfm != 1 && x->source_variance > 200))) { + RD_STATS idtx_rdc; + av1_init_rd_stats(&idtx_rdc); + int is_skippable; + this_mode_pred = &tmp_buffer[get_pred_buffer(tmp_buffer, 3)]; + pd->dst.buf = this_mode_pred->data; + pd->dst.stride = bw; + const PRED_BUFFER *const best_pred = best_pickmode->best_pred; + av1_block_yrd_idtx(x, best_pred->data, best_pred->stride, &idtx_rdc, + &is_skippable, bsize, mi->tx_size); + int64_t idx_rdcost_y = RDCOST(x->rdmult, idtx_rdc.rate, idtx_rdc.dist); + int allow_idtx = 1; + // Incorporate color into rd cost. 
+ if ((x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] || + x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)])) { + RD_STATS rdc_uv; + const BLOCK_SIZE uv_bsize = + get_plane_block_size(bsize, xd->plane[AOM_PLANE_U].subsampling_x, + xd->plane[AOM_PLANE_U].subsampling_y); + if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)]) { + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + AOM_PLANE_U, AOM_PLANE_U); + } + if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]) { + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + AOM_PLANE_V, AOM_PLANE_V); + } + av1_model_rd_for_sb_uv(cpi, uv_bsize, x, xd, &rdc_uv, AOM_PLANE_U, + AOM_PLANE_V); + if (rdc_uv.dist < x->min_dist_inter_uv) + x->min_dist_inter_uv = rdc_uv.dist; + idtx_rdc.rate += rdc_uv.rate; + idtx_rdc.dist += rdc_uv.dist; + idtx_rdc.skip_txfm = idtx_rdc.skip_txfm && rdc_uv.skip_txfm; + if (idx_rdcost_y == 0 && rdc_uv.dist > 0 && x->source_variance < 3000 && + x->content_state_sb.source_sad_nonrd > kMedSad) + allow_idtx = 0; + } + int64_t idx_rdcost = RDCOST(x->rdmult, idtx_rdc.rate, idtx_rdc.dist); + if (allow_idtx && idx_rdcost < search_state->best_rdc.rdcost) { + best_pickmode->tx_type = IDTX; + search_state->best_rdc.rdcost = idx_rdcost; + best_pickmode->best_mode_skip_txfm = idtx_rdc.skip_txfm; + if (!idtx_rdc.skip_txfm) { + memcpy(ctx->blk_skip, txfm_info->blk_skip, + sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk); + } + xd->tx_type_map[0] = best_pickmode->tx_type; + memset(ctx->tx_type_map, best_pickmode->tx_type, ctx->num_4x4_blk); + memset(xd->tx_type_map, best_pickmode->tx_type, ctx->num_4x4_blk); + } + pd->dst = *orig_dst; + } + + if (!try_palette) return; + const unsigned int intra_ref_frame_cost = + search_state->ref_costs_single[INTRA_FRAME]; + + if (!is_mode_intra(best_pickmode->best_mode)) { + PRED_BUFFER *const best_pred = best_pickmode->best_pred; + if (reuse_inter_pred && best_pred != NULL) { + if (best_pred->data == orig_dst->buf) { + this_mode_pred = &tmp_buffer[get_pred_buffer(tmp_buffer, 3)]; + aom_convolve_copy(best_pred->data, best_pred->stride, + this_mode_pred->data, this_mode_pred->stride, bw, bh); + best_pickmode->best_pred = this_mode_pred; + } + } + pd->dst = *orig_dst; + } + // Search palette mode for Luma plane in inter frame. + av1_search_palette_mode_luma(cpi, x, bsize, intra_ref_frame_cost, ctx, + &search_state->this_rdc, + search_state->best_rdc.rdcost); + // Update best mode data in search_state + if (search_state->this_rdc.rdcost < search_state->best_rdc.rdcost) { + best_pickmode->pmi = mi->palette_mode_info; + best_pickmode->best_mode = DC_PRED; + mi->mv[0].as_int = INVALID_MV; + mi->mv[1].as_int = INVALID_MV; + best_pickmode->best_ref_frame = INTRA_FRAME; + best_pickmode->best_second_ref_frame = NONE; + search_state->best_rdc.rate = search_state->this_rdc.rate; + search_state->best_rdc.dist = search_state->this_rdc.dist; + search_state->best_rdc.rdcost = search_state->this_rdc.rdcost; + best_pickmode->best_mode_skip_txfm = search_state->this_rdc.skip_txfm; + // Keep the skip_txfm off if the color_sensitivity is set. 
+ if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] ||
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)])
+ search_state->this_rdc.skip_txfm = 0;
+ if (!search_state->this_rdc.skip_txfm) {
+ memcpy(ctx->blk_skip, txfm_info->blk_skip,
+ sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
+ }
+ if (xd->tx_type_map[0] != DCT_DCT)
+ av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+ }
+}
+
+/*!\brief AV1 inter mode selection based on Non-RD optimized model.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * Top level function for Non-RD optimized inter mode selection.
+ * This function will loop over a subset of inter modes and select the best
+ * one based on the calculated modelled RD cost. While deciding which modes
+ * to check, this function applies heuristics based on previously checked
+ * modes, block residual variance, block size, and other factors to prune
+ * certain modes and reference frames. Currently only single reference frame
+ * modes are checked. Additional heuristics are applied to decide if intra
+ * modes need to be checked.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] tile_data Pointer to struct holding adaptive
+ data/contexts/models for the tile during
+ encoding
+ * \param[in] x Pointer to structure holding all the data for
+ the current macroblock
+ * \param[in] rd_cost Struct to keep track of the RD information
+ * \param[in] bsize Current block size
+ * \param[in] ctx Structure to hold snapshot of coding context
+ during the mode picking process
+ *
+ * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * is modified to store information about the best mode computed
+ * in this function. The rd_cost struct is also updated with the RD stats
+ * corresponding to the best mode found.
+ */ +void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, + MACROBLOCK *x, RD_STATS *rd_cost, + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) { + AV1_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mi = xd->mi[0]; + struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y]; + const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; + MV_REFERENCE_FRAME ref_frame, ref_frame2; + const unsigned char segment_id = mi->segment_id; + int best_early_term = 0; + int force_skip_low_temp_var = 0; + unsigned int sse_zeromv_norm = UINT_MAX; + const int num_inter_modes = NUM_INTER_MODES; + const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf; + bool check_globalmv = rt_sf->check_globalmv_on_single_ref; + PRED_BUFFER tmp_buffer[4]; + DECLARE_ALIGNED(16, uint8_t, pred_buf[MAX_MB_PLANE * MAX_SB_SQUARE]); + PRED_BUFFER *this_mode_pred = NULL; + const int reuse_inter_pred = + rt_sf->reuse_inter_pred_nonrd && cm->seq_params->bit_depth == AOM_BITS_8; + InterModeSearchStateNonrd search_state; + av1_zero(search_state.use_ref_frame_mask); + av1_zero(search_state.use_scaled_ref_frame); + BEST_PICKMODE *const best_pickmode = &search_state.best_pickmode; + (void)tile_data; + + const int bh = block_size_high[bsize]; + const int bw = block_size_wide[bsize]; + const int pixels_in_block = bh * bw; + struct buf_2d orig_dst = pd->dst; + const TxfmSearchParams *txfm_params = &x->txfm_search_params; + TxfmSearchInfo *txfm_info = &x->txfm_search_info; +#if COLLECT_NONRD_PICK_MODE_STAT + // Mode statistics can be collected only when num_workers is 1 + assert(cpi->mt_info.num_workers <= 1); + aom_usec_timer_start(&x->ms_stat_nonrd.bsize_timer); +#endif + int64_t thresh_sad_pred = INT64_MAX; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + int_mv svc_mv = { .as_int = 0 }; + int force_mv_inter_layer = 0; + bool comp_use_zero_zeromv_only = 0; + int tot_num_comp_modes = NUM_COMP_INTER_MODES_RT; +#if CONFIG_AV1_TEMPORAL_DENOISING + const int denoise_recheck_zeromv = 1; + AV1_PICKMODE_CTX_DEN ctx_den; + int64_t zero_last_cost_orig = INT64_MAX; + int denoise_svc_pickmode = 1; + const int resize_pending = is_frame_resize_pending(cpi); +#endif + const ModeCosts *mode_costs = &x->mode_costs; + struct scale_factors sf_no_scale; + av1_setup_scale_factors_for_frame(&sf_no_scale, cm->width, cm->height, + cm->width, cm->height); + if (reuse_inter_pred) { + for (int buf_idx = 0; buf_idx < 3; buf_idx++) { + tmp_buffer[buf_idx].data = &pred_buf[pixels_in_block * buf_idx]; + tmp_buffer[buf_idx].stride = bw; + tmp_buffer[buf_idx].in_use = 0; + } + tmp_buffer[3].data = pd->dst.buf; + tmp_buffer[3].stride = pd->dst.stride; + tmp_buffer[3].in_use = 0; + } + + const int gf_temporal_ref = is_same_gf_and_last_scale(cm); + + // If the lower spatial layer uses an averaging filter for downsampling + // (phase = 8), the target decimated pixel is shifted by (1/2, 1/2) relative + // to source, so use subpel motion vector to compensate. The nonzero motion + // is half pixel shifted to left and top, so (-4, -4). This has more effect + // on higher resolutions, so condition it on that for now. + // Exclude quality layers, which have the same resolution and hence no shift. 
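+ // (Motion vectors are stored in 1/8-pel units, so the (-1/2, -1/2) pel
+ // compensation described above is written as (-4, -4).)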
+ if (cpi->ppi->use_svc && svc->spatial_layer_id > 0 &&
+ !svc->has_lower_quality_layer &&
+ svc->downsample_filter_phase[svc->spatial_layer_id - 1] == 8 &&
+ cm->width * cm->height > 640 * 480) {
+ svc_mv.as_mv.row = -4;
+ svc_mv.as_mv.col = -4;
+ }
+
+ // Setup parameters used for inter mode evaluation.
+ set_params_nonrd_pick_inter_mode(cpi, x, &search_state, rd_cost,
+ &force_skip_low_temp_var, mi_row, mi_col,
+ gf_temporal_ref, segment_id, bsize
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ ,
+ ctx, denoise_svc_pickmode
+#endif
+ );
+
+ if (rt_sf->use_comp_ref_nonrd && is_comp_ref_allowed(bsize)) {
+ // Only search compound if bsize > BLOCK_16X16.
+ if (bsize > BLOCK_16X16) {
+ comp_use_zero_zeromv_only = rt_sf->check_only_zero_zeromv_on_large_blocks;
+ } else {
+ tot_num_comp_modes = 0;
+ }
+ } else {
+ tot_num_comp_modes = 0;
+ }
+
+ if (x->pred_mv_sad[LAST_FRAME] != INT_MAX) {
+ thresh_sad_pred = ((int64_t)x->pred_mv_sad[LAST_FRAME]) << 1;
+ // Increase the threshold for less aggressive pruning.
+ if (rt_sf->nonrd_prune_ref_frame_search == 1)
+ thresh_sad_pred += (x->pred_mv_sad[LAST_FRAME] >> 2);
+ }
+
+ const int use_model_yrd_large = get_model_rd_flag(cpi, xd, bsize);
+
+ // Decide block-level interp filter search flags:
+ // filter_search_enabled_blk:
+ // 0: disabled
+ // 1: filter search depends on mode properties
+ // 2: filter search forced since prediction is unreliable
+ // cb_pred_filter_search 0: disabled cb prediction
+ InterpFilter filt_select = EIGHTTAP_REGULAR;
+ const int cb_pred_filter_search =
+ x->content_state_sb.source_sad_nonrd > kVeryLowSad
+ ? cpi->sf.interp_sf.cb_pred_filter_search
+ : 0;
+ const int filter_search_enabled_blk =
+ is_filter_search_enabled_blk(cpi, x, mi_row, mi_col, bsize, segment_id,
+ cb_pred_filter_search, &filt_select);
+
+#if COLLECT_NONRD_PICK_MODE_STAT
+ x->ms_stat_nonrd.num_blocks[bsize]++;
+#endif
+ init_mbmi_nonrd(mi, DC_PRED, NONE_FRAME, NONE_FRAME, cm);
+ mi->tx_size = AOMMIN(
+ AOMMIN(max_txsize_lookup[bsize],
+ tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]),
+ TX_16X16);
+
+ fill_single_inter_mode_costs(search_state.single_inter_mode_costs,
+ num_inter_modes, ref_mode_set, mode_costs,
+ mbmi_ext->mode_context);
+
+ MV_REFERENCE_FRAME last_comp_ref_frame = NONE_FRAME;
+
+ // Initialize inter prediction params at block level for single reference
+ // mode.
+ InterPredParams inter_pred_params_sr;
+ init_inter_block_params(&inter_pred_params_sr, pd->width, pd->height,
+ mi_row * MI_SIZE, mi_col * MI_SIZE, pd->subsampling_x,
+ pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd),
+ /*is_intrabc=*/0);
+ inter_pred_params_sr.conv_params =
+ get_conv_params(/*do_average=*/0, AOM_PLANE_Y, xd->bd);
+
+ x->block_is_zero_sad = x->content_state_sb.source_sad_nonrd == kZeroSad;
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+ !x->force_zeromv_skip_for_blk &&
+ x->content_state_sb.source_sad_nonrd != kZeroSad &&
+ x->source_variance == 0 && bsize < cm->seq_params->sb_size &&
+ search_state.yv12_mb[LAST_FRAME][0].width == cm->width &&
+ search_state.yv12_mb[LAST_FRAME][0].height == cm->height) {
+ set_block_source_sad(cpi, x, bsize, &search_state.yv12_mb[LAST_FRAME][0]);
+ }
+
+ int sb_me_has_been_tested = 0;
+ x->sb_me_block = x->sb_me_partition;
+ // Only use this feature (force testing of superblock motion) if the coding
+ // block size is large.
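+ // (With the checks below, the block must span at least a quarter of the
+ // superblock: >= 64X64 inside a 128X128 SB, >= 32X32 inside a 64X64 SB.)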
+ if (x->sb_me_block) {
+ if (cm->seq_params->sb_size == BLOCK_128X128 && bsize < BLOCK_64X64)
+ x->sb_me_block = 0;
+ else if (cm->seq_params->sb_size == BLOCK_64X64 && bsize < BLOCK_32X32)
+ x->sb_me_block = 0;
+ }
+
+ x->min_dist_inter_uv = INT64_MAX;
+ for (int idx = 0; idx < num_inter_modes + tot_num_comp_modes; ++idx) {
+ // If we are at the first compound mode, and the single modes already
+ // perform well, then end the search.
+ if (rt_sf->skip_compound_based_on_var && idx == num_inter_modes &&
+ skip_comp_based_on_var(search_state.vars, bsize)) {
+ break;
+ }
+
+ int is_single_pred = 1;
+ PREDICTION_MODE this_mode;
+
+ if (idx == 0 && !x->force_zeromv_skip_for_blk) {
+ // Set color sensitivity on first tested mode only.
+ // Use y-sad already computed in find_predictors: take the sad with motion
+ // vector closest to 0; the uv-sad computed below in set_color_sensitivity
+ // is for zeromv.
+ // For screen: first check if golden reference is being used, if so,
+ // force color_sensitivity on (=1) if the color sensitivity for sb_g is 1.
+ // The check in set_color_sensitivity() will then follow and check for
+ // setting the flag if the level is still 2 or 0.
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+ search_state.use_ref_frame_mask[GOLDEN_FRAME]) {
+ if (x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_U)] == 1)
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] = 1;
+ if (x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_V)] == 1)
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] = 1;
+ }
+ if (search_state.use_ref_frame_mask[LAST_FRAME] &&
+ x->pred_mv0_sad[LAST_FRAME] != INT_MAX) {
+ int y_sad = x->pred_mv0_sad[LAST_FRAME];
+ if (x->pred_mv1_sad[LAST_FRAME] != INT_MAX &&
+ (abs(search_state.frame_mv[NEARMV][LAST_FRAME].as_mv.col) +
+ abs(search_state.frame_mv[NEARMV][LAST_FRAME].as_mv.row)) <
+ (abs(search_state.frame_mv[NEARESTMV][LAST_FRAME].as_mv.col) +
+ abs(search_state.frame_mv[NEARESTMV][LAST_FRAME].as_mv.row)))
+ y_sad = x->pred_mv1_sad[LAST_FRAME];
+ set_color_sensitivity(cpi, x, bsize, y_sad, x->source_variance,
+ search_state.yv12_mb[LAST_FRAME]);
+ }
+ }
+
+ // Check whether the inter mode can be skipped based on mode statistics
+ // and speed feature settings.
+ if (skip_inter_mode_nonrd(cpi, x, &search_state, &thresh_sad_pred,
+ &force_mv_inter_layer, &is_single_pred,
+ &this_mode, &last_comp_ref_frame, &ref_frame,
+ &ref_frame2, idx, svc_mv, force_skip_low_temp_var,
+ sse_zeromv_norm, num_inter_modes, segment_id,
+ bsize, comp_use_zero_zeromv_only, check_globalmv))
+ continue;
+
+ // Select prediction reference frames.
+ for (int plane = 0; plane < MAX_MB_PLANE; plane++) {
+ xd->plane[plane].pre[0] = search_state.yv12_mb[ref_frame][plane];
+ if (!is_single_pred)
+ xd->plane[plane].pre[1] = search_state.yv12_mb[ref_frame2][plane];
+ }
+
+ mi->ref_frame[0] = ref_frame;
+ mi->ref_frame[1] = ref_frame2;
+ set_ref_ptrs(cm, xd, ref_frame, ref_frame2);
+
+ // Check if the scaled reference frame should be used. This is set in
+ // find_predictors() for each usable reference. If so, set the
+ // block_ref_scale_factors[] to no reference scaling.
+ if (search_state.use_scaled_ref_frame[ref_frame]) { + xd->block_ref_scale_factors[0] = &sf_no_scale; + } + if (!is_single_pred && search_state.use_scaled_ref_frame[ref_frame2]) { + xd->block_ref_scale_factors[1] = &sf_no_scale; + } + + // Perform inter mode evaluation for non-rd + if (!handle_inter_mode_nonrd( + cpi, x, &search_state, ctx, &this_mode_pred, tmp_buffer, + inter_pred_params_sr, &best_early_term, &sse_zeromv_norm, + &check_globalmv, +#if CONFIG_AV1_TEMPORAL_DENOISING + &zero_last_cost_orig, denoise_svc_pickmode, +#endif + idx, force_mv_inter_layer, is_single_pred, gf_temporal_ref, + use_model_yrd_large, filter_search_enabled_blk, bsize, this_mode, + filt_select, cb_pred_filter_search, reuse_inter_pred, + &sb_me_has_been_tested)) { + break; + } + } + + // Restore mode data of best inter mode + mi->mode = best_pickmode->best_mode; + mi->motion_mode = best_pickmode->best_motion_mode; + mi->wm_params = best_pickmode->wm_params; + mi->num_proj_ref = best_pickmode->num_proj_ref; + mi->interp_filters = best_pickmode->best_pred_filter; + mi->tx_size = best_pickmode->best_tx_size; + memset(mi->inter_tx_size, mi->tx_size, sizeof(mi->inter_tx_size)); + mi->ref_frame[0] = best_pickmode->best_ref_frame; + mi->mv[0].as_int = search_state + .frame_mv_best[best_pickmode->best_mode] + [best_pickmode->best_ref_frame] + .as_int; + mi->mv[1].as_int = 0; + if (best_pickmode->best_second_ref_frame > INTRA_FRAME) { + mi->ref_frame[1] = best_pickmode->best_second_ref_frame; + mi->mv[1].as_int = search_state + .frame_mv_best[best_pickmode->best_mode] + [best_pickmode->best_second_ref_frame] + .as_int; + } + // Perform intra prediction search, if the best SAD is above a certain + // threshold. + mi->angle_delta[PLANE_TYPE_Y] = 0; + mi->angle_delta[PLANE_TYPE_UV] = 0; + mi->filter_intra_mode_info.use_filter_intra = 0; + +#if COLLECT_NONRD_PICK_MODE_STAT + aom_usec_timer_start(&x->ms_stat_nonrd.timer1); + x->ms_stat_nonrd.num_searches[bsize][DC_PRED]++; + x->ms_stat_nonrd.num_nonskipped_searches[bsize][DC_PRED]++; +#endif + + int force_palette_test = 0; + if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN && + x->content_state_sb.source_sad_nonrd != kZeroSad && + bsize <= BLOCK_16X16) { + unsigned int thresh_sse = cpi->rc.high_source_sad ? 15000 : 200000; + unsigned int thresh_source_var = cpi->rc.high_source_sad ? 
50 : 200; + unsigned int best_sse_inter_motion = + (unsigned int)(search_state.best_rdc.sse >> + (b_width_log2_lookup[bsize] + + b_height_log2_lookup[bsize])); + if (best_sse_inter_motion > thresh_sse && + x->source_variance > thresh_source_var) + force_palette_test = 1; + } + + // Evaluate Intra modes in inter frame + if (!x->force_zeromv_skip_for_blk) + av1_estimate_intra_mode(cpi, x, bsize, best_early_term, + search_state.ref_costs_single[INTRA_FRAME], + reuse_inter_pred, &orig_dst, tmp_buffer, + &this_mode_pred, &search_state.best_rdc, + best_pickmode, ctx); + + int skip_idtx_palette = (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] || + x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]) && + x->content_state_sb.source_sad_nonrd != kZeroSad && + !cpi->rc.high_source_sad; + + int try_palette = + !skip_idtx_palette && cpi->oxcf.tool_cfg.enable_palette && + av1_allow_palette(cpi->common.features.allow_screen_content_tools, + mi->bsize); + try_palette = + try_palette && + (is_mode_intra(best_pickmode->best_mode) || force_palette_test) && + x->source_variance > 0 && !x->force_zeromv_skip_for_blk && + (cpi->rc.high_source_sad || x->source_variance > 300); + + if (rt_sf->prune_palette_nonrd && bsize > BLOCK_16X16) try_palette = 0; + + // Perform screen content mode evaluation for non-rd + handle_screen_content_mode_nonrd( + cpi, x, &search_state, this_mode_pred, ctx, tmp_buffer, &orig_dst, + skip_idtx_palette, try_palette, bsize, reuse_inter_pred, mi_col, mi_row); + +#if COLLECT_NONRD_PICK_MODE_STAT + aom_usec_timer_mark(&x->ms_stat_nonrd.timer1); + x->ms_stat_nonrd.nonskipped_search_times[bsize][DC_PRED] += + aom_usec_timer_elapsed(&x->ms_stat_nonrd.timer1); +#endif + + pd->dst = orig_dst; + // Best mode is finalized. Restore the mode data to mbmi + if (try_palette) mi->palette_mode_info = best_pickmode->pmi; + mi->mode = best_pickmode->best_mode; + mi->ref_frame[0] = best_pickmode->best_ref_frame; + mi->ref_frame[1] = best_pickmode->best_second_ref_frame; + // For lossless: always force the skip flags off. + if (is_lossless_requested(&cpi->oxcf.rc_cfg)) { + txfm_info->skip_txfm = 0; + memset(ctx->blk_skip, 0, sizeof(ctx->blk_skip[0]) * ctx->num_4x4_blk); + } else { + txfm_info->skip_txfm = best_pickmode->best_mode_skip_txfm; + } + if (has_second_ref(mi)) { + mi->comp_group_idx = 0; + mi->compound_idx = 1; + mi->interinter_comp.type = COMPOUND_AVERAGE; + } + + if (!is_inter_block(mi)) { + mi->interp_filters = av1_broadcast_interp_filter(SWITCHABLE_FILTERS); + } else { + // If inter mode is selected and ref_frame was one that uses the + // scaled reference frame, then we can't use reuse_inter_pred. 
+ if (search_state.use_scaled_ref_frame[best_pickmode->best_ref_frame] ||
+ (has_second_ref(mi) &&
+ search_state
+ .use_scaled_ref_frame[best_pickmode->best_second_ref_frame]))
+ x->reuse_inter_pred = 0;
+ }
+
+ // Restore the predicted samples of best mode to final buffer
+ if (reuse_inter_pred && best_pickmode->best_pred != NULL) {
+ PRED_BUFFER *const best_pred = best_pickmode->best_pred;
+ if (best_pred->data != orig_dst.buf && is_inter_mode(mi->mode)) {
+ aom_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf,
+ pd->dst.stride, bw, bh);
+ }
+ }
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && resize_pending == 0 &&
+ denoise_svc_pickmode && cpi->denoiser.denoising_level > kDenLowLow &&
+ cpi->denoiser.reset == 0) {
+ AV1_DENOISER_DECISION decision = COPY_BLOCK;
+ ctx->sb_skip_denoising = 0;
+ av1_pickmode_ctx_den_update(
+ &ctx_den, zero_last_cost_orig, search_state.ref_costs_single,
+ search_state.frame_mv, reuse_inter_pred, best_pickmode);
+ av1_denoiser_denoise(cpi, x, mi_row, mi_col, bsize, ctx, &decision,
+ gf_temporal_ref);
+ if (denoise_recheck_zeromv)
+ recheck_zeromv_after_denoising(
+ cpi, mi, x, xd, decision, &ctx_den, search_state.yv12_mb,
+ &search_state.best_rdc, best_pickmode, bsize, mi_row, mi_col);
+ best_pickmode->best_ref_frame = ctx_den.best_ref_frame;
+ }
+#endif
+
+ // Update the factors used for RD thresholding for all modes.
+ if (cpi->sf.inter_sf.adaptive_rd_thresh && !has_second_ref(mi)) {
+ THR_MODES best_mode_idx =
+ mode_idx[best_pickmode->best_ref_frame][mode_offset(mi->mode)];
+ if (best_pickmode->best_ref_frame == INTRA_FRAME) {
+ // Only consider the modes that are included in the intra_mode_list.
+ int intra_modes = sizeof(intra_mode_list) / sizeof(PREDICTION_MODE);
+ for (int mode_index = 0; mode_index < intra_modes; mode_index++) {
+ update_thresh_freq_fact(cpi, x, bsize, INTRA_FRAME, best_mode_idx,
+ intra_mode_list[mode_index]);
+ }
+ } else {
+ PREDICTION_MODE this_mode;
+ for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
+ update_thresh_freq_fact(cpi, x, bsize, best_pickmode->best_ref_frame,
+ best_mode_idx, this_mode);
+ }
+ }
+ }
+
+#if CONFIG_INTERNAL_STATS
+ store_coding_context_nonrd(x, ctx, mi->mode);
+#else
+ store_coding_context_nonrd(x, ctx);
+#endif // CONFIG_INTERNAL_STATS
+
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_mark(&x->ms_stat_nonrd.bsize_timer);
+ x->ms_stat_nonrd.total_block_times[bsize] +=
+ aom_usec_timer_elapsed(&x->ms_stat_nonrd.bsize_timer);
+ print_time(&x->ms_stat_nonrd, bsize, cm->mi_params.mi_rows,
+ cm->mi_params.mi_cols, mi_row, mi_col);
+#endif // COLLECT_NONRD_PICK_MODE_STAT
+
+ *rd_cost = search_state.best_rdc;
+
+ // Reset the xd->block_ref_scale_factors[i], as they may have
+ // been set to the pointer &sf_no_scale, which becomes invalid after
+ // this function.
+ set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
+}
diff --git a/third_party/aom/av1/encoder/optical_flow.c b/third_party/aom/av1/encoder/optical_flow.c
new file mode 100644
index 0000000000..dc168e7aee
--- /dev/null
+++ b/third_party/aom/av1/encoder/optical_flow.c
@@ -0,0 +1,1113 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software.
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <math.h>
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/mathutils.h"
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/optical_flow.h"
+#include "av1/encoder/sparse_linear_solver.h"
+#include "av1/encoder/reconinter_enc.h"
+
+#if CONFIG_OPTICAL_FLOW_API
+
+void av1_init_opfl_params(OPFL_PARAMS *opfl_params) {
+ opfl_params->pyramid_levels = OPFL_PYRAMID_LEVELS;
+ opfl_params->warping_steps = OPFL_WARPING_STEPS;
+ opfl_params->lk_params = NULL;
+}
+
+void av1_init_lk_params(LK_PARAMS *lk_params) {
+ lk_params->window_size = OPFL_WINDOW_SIZE;
+}
+
+// Helper function to determine whether a frame is encoded with high bit-depth.
+static INLINE int is_frame_high_bitdepth(const YV12_BUFFER_CONFIG *frame) {
+ return (frame->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+}
+
+// Helper function to determine whether optical flow method is sparse.
+static INLINE int is_sparse(const OPFL_PARAMS *opfl_params) {
+ return (opfl_params->flags & OPFL_FLAG_SPARSE) ? 1 : 0;
+}
+
+static void gradients_over_window(const YV12_BUFFER_CONFIG *frame,
+ const YV12_BUFFER_CONFIG *ref_frame,
+ const double x_coord, const double y_coord,
+ const int window_size, const int bit_depth,
+ double *ix, double *iy, double *it,
+ LOCALMV *mv);
+
+// coefficients for bilinear interpolation on unit square
+static int pixel_interp(const double x, const double y, const double b00,
+ const double b01, const double b10, const double b11) {
+ const int xint = (int)x;
+ const int yint = (int)y;
+ const double xdec = x - xint;
+ const double ydec = y - yint;
+ const double a = (1 - xdec) * (1 - ydec);
+ const double b = xdec * (1 - ydec);
+ const double c = (1 - xdec) * ydec;
+ const double d = xdec * ydec;
+ // if x, y are already integers, this reduces to b00
+ int interp = (int)round(a * b00 + b * b01 + c * b10 + d * b11);
+ return interp;
+}
+
+// Scharr filter to compute spatial gradient
+static void spatial_gradient(const YV12_BUFFER_CONFIG *frame, const int x_coord,
+ const int y_coord, const int direction,
+ double *derivative) {
+ double *filter;
+ // Scharr filters
+ double gx[9] = { -3, 0, 3, -10, 0, 10, -3, 0, 3 };
+ double gy[9] = { -3, -10, -3, 0, 0, 0, 3, 10, 3 };
+ if (direction == 0) { // x direction
+ filter = gx;
+ } else { // y direction
+ filter = gy;
+ }
+ int idx = 0;
+ double d = 0;
+ for (int yy = -1; yy <= 1; yy++) {
+ for (int xx = -1; xx <= 1; xx++) {
+ d += filter[idx] *
+ frame->y_buffer[(y_coord + yy) * frame->y_stride + (x_coord + xx)];
+ idx++;
+ }
+ }
+ // normalization scaling factor for Scharr
+ *derivative = d / 32.0;
+}
+
+// Determine the spatial gradient at subpixel locations
+// For example, when reducing images for pyramidal LK,
+// corners found in the original image may be at subpixel locations.
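+// Illustrative sketch of the bilinear weighting reused here (see
+// pixel_interp() above): with fractional parts xdec and ydec,
+//   interp = (1 - xdec) * (1 - ydec) * b00 + xdec * (1 - ydec) * b01 +
+//            (1 - xdec) * ydec * b10 + xdec * ydec * b11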
+static void gradient_interp(double *fullpel_deriv, const double x_coord, + const double y_coord, const int w, const int h, + double *derivative) { + const int xint = (int)x_coord; + const int yint = (int)y_coord; + double interp; + if (xint + 1 > w - 1 || yint + 1 > h - 1) { + interp = fullpel_deriv[yint * w + xint]; + } else { + interp = pixel_interp(x_coord, y_coord, fullpel_deriv[yint * w + xint], + fullpel_deriv[yint * w + (xint + 1)], + fullpel_deriv[(yint + 1) * w + xint], + fullpel_deriv[(yint + 1) * w + (xint + 1)]); + } + + *derivative = interp; +} + +static void temporal_gradient(const YV12_BUFFER_CONFIG *frame, + const YV12_BUFFER_CONFIG *frame2, + const double x_coord, const double y_coord, + const int bit_depth, double *derivative, + LOCALMV *mv) { + const int w = 2; + const int h = 2; + uint8_t pred1[4]; + uint8_t pred2[4]; + + const int y = (int)y_coord; + const int x = (int)x_coord; + const double ydec = y_coord - y; + const double xdec = x_coord - x; + const int is_intrabc = 0; // Is intra-copied? + const int is_high_bitdepth = is_frame_high_bitdepth(frame2); + const int subsampling_x = 0, subsampling_y = 0; // for y-buffer + const int_interpfilters interp_filters = + av1_broadcast_interp_filter(MULTITAP_SHARP); + const int plane = 0; // y-plane + const struct buf_2d ref_buf2 = { NULL, frame2->y_buffer, frame2->y_crop_width, + frame2->y_crop_height, frame2->y_stride }; + struct scale_factors scale; + av1_setup_scale_factors_for_frame(&scale, frame->y_crop_width, + frame->y_crop_height, frame->y_crop_width, + frame->y_crop_height); + InterPredParams inter_pred_params; + av1_init_inter_params(&inter_pred_params, w, h, y, x, subsampling_x, + subsampling_y, bit_depth, is_high_bitdepth, is_intrabc, + &scale, &ref_buf2, interp_filters); + inter_pred_params.interp_filter_params[0] = + &av1_interp_filter_params_list[interp_filters.as_filters.x_filter]; + inter_pred_params.interp_filter_params[1] = + &av1_interp_filter_params_list[interp_filters.as_filters.y_filter]; + inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth); + MV newmv = { .row = (int16_t)round((mv->row + xdec) * 8), + .col = (int16_t)round((mv->col + ydec) * 8) }; + av1_enc_build_one_inter_predictor(pred2, w, &newmv, &inter_pred_params); + const struct buf_2d ref_buf1 = { NULL, frame->y_buffer, frame->y_crop_width, + frame->y_crop_height, frame->y_stride }; + av1_init_inter_params(&inter_pred_params, w, h, y, x, subsampling_x, + subsampling_y, bit_depth, is_high_bitdepth, is_intrabc, + &scale, &ref_buf1, interp_filters); + inter_pred_params.interp_filter_params[0] = + &av1_interp_filter_params_list[interp_filters.as_filters.x_filter]; + inter_pred_params.interp_filter_params[1] = + &av1_interp_filter_params_list[interp_filters.as_filters.y_filter]; + inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth); + MV zeroMV = { .row = (int16_t)round(xdec * 8), + .col = (int16_t)round(ydec * 8) }; + av1_enc_build_one_inter_predictor(pred1, w, &zeroMV, &inter_pred_params); + + *derivative = pred2[0] - pred1[0]; +} + +// Numerical differentiate over window_size x window_size surrounding (x,y) +// location. 
Alters ix, iy, it to contain numerical partial derivatives +static void gradients_over_window(const YV12_BUFFER_CONFIG *frame, + const YV12_BUFFER_CONFIG *ref_frame, + const double x_coord, const double y_coord, + const int window_size, const int bit_depth, + double *ix, double *iy, double *it, + LOCALMV *mv) { + const double left = x_coord - window_size / 2.0; + const double top = y_coord - window_size / 2.0; + // gradient operators need pixel before and after (start at 1) + const double x_start = AOMMAX(1, left); + const double y_start = AOMMAX(1, top); + const int frame_height = frame->y_crop_height; + const int frame_width = frame->y_crop_width; + double deriv_x; + double deriv_y; + double deriv_t; + + const double x_end = AOMMIN(x_coord + window_size / 2.0, frame_width - 2); + const double y_end = AOMMIN(y_coord + window_size / 2.0, frame_height - 2); + const int xs = (int)AOMMAX(1, x_start - 1); + const int ys = (int)AOMMAX(1, y_start - 1); + const int xe = (int)AOMMIN(x_end + 2, frame_width - 2); + const int ye = (int)AOMMIN(y_end + 2, frame_height - 2); + // with normalization, gradients may be double values + double *fullpel_dx = aom_malloc((ye - ys) * (xe - xs) * sizeof(deriv_x)); + double *fullpel_dy = aom_malloc((ye - ys) * (xe - xs) * sizeof(deriv_y)); + if (!fullpel_dx || !fullpel_dy) { + aom_free(fullpel_dx); + aom_free(fullpel_dy); + return; + } + + // TODO(any): This could be more efficient in the case that x_coord + // and y_coord are integers.. but it may look more messy. + + // calculate spatial gradients at full pixel locations + for (int j = ys; j < ye; j++) { + for (int i = xs; i < xe; i++) { + spatial_gradient(frame, i, j, 0, &deriv_x); + spatial_gradient(frame, i, j, 1, &deriv_y); + int idx = (j - ys) * (xe - xs) + (i - xs); + fullpel_dx[idx] = deriv_x; + fullpel_dy[idx] = deriv_y; + } + } + // compute numerical differentiation for every pixel in window + // (this potentially includes subpixels) + for (double j = y_start; j < y_end; j++) { + for (double i = x_start; i < x_end; i++) { + temporal_gradient(frame, ref_frame, i, j, bit_depth, &deriv_t, mv); + gradient_interp(fullpel_dx, i - xs, j - ys, xe - xs, ye - ys, &deriv_x); + gradient_interp(fullpel_dy, i - xs, j - ys, xe - xs, ye - ys, &deriv_y); + int idx = (int)(j - top) * window_size + (int)(i - left); + ix[idx] = deriv_x; + iy[idx] = deriv_y; + it[idx] = deriv_t; + } + } + // TODO(any): to avoid setting deriv arrays to zero for every iteration, + // could instead pass these two values back through function call + // int first_idx = (int)(y_start - top) * window_size + (int)(x_start - left); + // int width = window_size - ((int)(x_start - left) + (int)(left + window_size + // - x_end)); + + aom_free(fullpel_dx); + aom_free(fullpel_dy); +} + +// To compute eigenvalues of 2x2 matrix: Solve for lambda where +// Determinant(matrix - lambda*identity) == 0 +static void eigenvalues_2x2(const double *matrix, double *eig) { + const double a = 1; + const double b = -1 * matrix[0] - matrix[3]; + const double c = -1 * matrix[1] * matrix[2] + matrix[0] * matrix[3]; + // quadratic formula + const double discriminant = b * b - 4 * a * c; + eig[0] = (-b - sqrt(discriminant)) / (2.0 * a); + eig[1] = (-b + sqrt(discriminant)) / (2.0 * a); + // double check that eigenvalues are ordered by magnitude + if (fabs(eig[0]) > fabs(eig[1])) { + double tmp = eig[0]; + eig[0] = eig[1]; + eig[1] = tmp; + } +} + +// Shi-Tomasi corner detection criteria +static double corner_score(const YV12_BUFFER_CONFIG *frame_to_filter, + const 
YV12_BUFFER_CONFIG *ref_frame, const int x,
+ const int y, double *i_x, double *i_y, double *i_t,
+ const int n, const int bit_depth) {
+ double eig[2];
+ LOCALMV mv = { .row = 0, .col = 0 };
+ // TODO(any): technically, ref_frame and i_t are not used by corner score
+ // so these could be replaced by dummy variables,
+ // or change this to spatial gradient function over window only
+ gradients_over_window(frame_to_filter, ref_frame, x, y, n, bit_depth, i_x,
+ i_y, i_t, &mv);
+ double Mres1[1] = { 0 }, Mres2[1] = { 0 }, Mres3[1] = { 0 };
+ multiply_mat(i_x, i_x, Mres1, 1, n * n, 1);
+ multiply_mat(i_x, i_y, Mres2, 1, n * n, 1);
+ multiply_mat(i_y, i_y, Mres3, 1, n * n, 1);
+ double M[4] = { Mres1[0], Mres2[0], Mres2[0], Mres3[0] };
+ eigenvalues_2x2(M, eig);
+ return fabs(eig[0]);
+}
+
+// Finds corners in frame_to_filter
+// For less strict requirements (i.e. more corners), decrease threshold
+static int detect_corners(const YV12_BUFFER_CONFIG *frame_to_filter,
+ const YV12_BUFFER_CONFIG *ref_frame,
+ const int maxcorners, int *ref_corners,
+ const int bit_depth) {
+ const int frame_height = frame_to_filter->y_crop_height;
+ const int frame_width = frame_to_filter->y_crop_width;
+ // TODO(any): currently if maxcorners is decreased, then it only means
+ // corners will be omitted from bottom-right of image. If maxcorners
+ // is actually used, then this algorithm would need to re-iterate
+ // and choose threshold based on that
+ assert(maxcorners == frame_height * frame_width);
+ int countcorners = 0;
+ const double threshold = 0.1;
+ double score;
+ const int n = 3;
+ double i_x[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+ double i_y[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+ double i_t[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+ const int fromedge = n;
+ double max_score = corner_score(frame_to_filter, ref_frame, fromedge,
+ fromedge, i_x, i_y, i_t, n, bit_depth);
+ // rough estimate of max corner score in image
+ for (int x = fromedge; x < frame_width - fromedge; x += 1) {
+ for (int y = fromedge; y < frame_height - fromedge; y += frame_height / 5) {
+ for (int i = 0; i < n * n; i++) {
+ i_x[i] = 0;
+ i_y[i] = 0;
+ i_t[i] = 0;
+ }
+ score = corner_score(frame_to_filter, ref_frame, x, y, i_x, i_y, i_t, n,
+ bit_depth);
+ if (score > max_score) {
+ max_score = score;
+ }
+ }
+ }
+ // score all the points and choose corners over threshold
+ for (int x = fromedge; x < frame_width - fromedge; x += 1) {
+ for (int y = fromedge;
+ (y < frame_height - fromedge) && countcorners < maxcorners; y += 1) {
+ for (int i = 0; i < n * n; i++) {
+ i_x[i] = 0;
+ i_y[i] = 0;
+ i_t[i] = 0;
+ }
+ score = corner_score(frame_to_filter, ref_frame, x, y, i_x, i_y, i_t, n,
+ bit_depth);
+ if (score > threshold * max_score) {
+ ref_corners[countcorners * 2] = x;
+ ref_corners[countcorners * 2 + 1] = y;
+ countcorners++;
+ }
+ }
+ }
+ return countcorners;
+}
+
+// weights is an nxn matrix. weights is filled with a Gaussian function,
+// with independent variable: distance from the center point.
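+// Illustrative sketch of the weights computed below: for window position
+// (i, j) with distance d = sqrt((n/2 - i)^2 + (n/2 - j)^2) from the center,
+//   weight(i, j) = exp(-0.5 * (d / sigma)^2)
+// and, when normalize == 1, each weight is divided by the sum of all weights.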
+static void gaussian(const double sigma, const int n, const int normalize,
+ double *weights) {
+ double total_weight = 0;
+ for (int j = 0; j < n; j++) {
+ for (int i = 0; i < n; i++) {
+ double distance = sqrt(pow(n / 2 - i, 2) + pow(n / 2 - j, 2));
+ double weight = exp(-0.5 * pow(distance / sigma, 2));
+ weights[j * n + i] = weight;
+ total_weight += weight;
+ }
+ }
+ if (normalize == 1) {
+ // normalize all n * n entries of the window, not just the first row
+ for (int j = 0; j < n * n; j++) {
+ weights[j] = weights[j] / total_weight;
+ }
+ }
+}
+
+static double convolve(const double *filter, const int *img, const int size) {
+ double result = 0;
+ for (int i = 0; i < size; i++) {
+ result += filter[i] * img[i];
+ }
+ return result;
+}
+
+// Applies a Gaussian low-pass smoothing filter to produce
+// a corresponding lower resolution image with halved dimensions
+static void reduce(uint8_t *img, int height, int width, int stride,
+ uint8_t *reduced_img) {
+ const int new_width = width / 2;
+ const int window_size = 5;
+ const double gaussian_filter[25] = {
+ 1. / 256, 1.0 / 64, 3. / 128, 1. / 64, 1. / 256, 1. / 64, 1. / 16,
+ 3. / 32, 1. / 16, 1. / 64, 3. / 128, 3. / 32, 9. / 64, 3. / 32,
+ 3. / 128, 1. / 64, 1. / 16, 3. / 32, 1. / 16, 1. / 64, 1. / 256,
+ 1. / 64, 3. / 128, 1. / 64, 1. / 256
+ };
+ // filter is 5x5 so need prev and forward 2 pixels
+ int img_section[25];
+ for (int y = 0; y < height - 1; y += 2) {
+ for (int x = 0; x < width - 1; x += 2) {
+ int i = 0;
+ for (int yy = y - window_size / 2; yy <= y + window_size / 2; yy++) {
+ for (int xx = x - window_size / 2; xx <= x + window_size / 2; xx++) {
+ int yvalue = yy;
+ int xvalue = xx;
+ // copied pixels outside the boundary
+ if (yvalue < 0) yvalue = 0;
+ if (xvalue < 0) xvalue = 0;
+ if (yvalue >= height) yvalue = height - 1;
+ if (xvalue >= width) xvalue = width - 1;
+ img_section[i++] = img[yvalue * stride + xvalue];
+ }
+ }
+ reduced_img[(y / 2) * new_width + (x / 2)] = (uint8_t)convolve(
+ gaussian_filter, img_section, window_size * window_size);
+ }
+ }
+}
+
+static int cmpfunc(const void *a, const void *b) {
+ return (*(int *)a - *(int *)b);
+}
+static void filter_mvs(const MV_FILTER_TYPE mv_filter, const int frame_height,
+ const int frame_width, LOCALMV *localmvs, MV *mvs) {
+ const int n = 5; // window size
+ // for smoothing filter
+ const double gaussian_filter[25] = {
+ 1. / 256, 1. / 64, 3. / 128, 1. / 64, 1. / 256, 1. / 64, 1. / 16,
+ 3. / 32, 1. / 16, 1. / 64, 3. / 128, 3. / 32, 9. / 64, 3. / 32,
+ 3. / 128, 1. / 64, 1. / 16, 3. / 32, 1. / 16, 1. / 64, 1. / 256,
+ 1. / 64, 3. / 128, 1. / 64, 1. / 256
+ };
+ // for median filter
+ int mvrows[25];
+ int mvcols[25];
+ if (mv_filter != MV_FILTER_NONE) {
+ for (int y = 0; y < frame_height; y++) {
+ for (int x = 0; x < frame_width; x++) {
+ int center_idx = y * frame_width + x;
+ int i = 0;
+ double filtered_row = 0;
+ double filtered_col = 0;
+ for (int yy = y - n / 2; yy <= y + n / 2; yy++) {
+ for (int xx = x - n / 2; xx <= x + n / 2; xx++) {
+ int yvalue = yy;
+ int xvalue = xx;
+ // copied pixels outside the boundary
+ if (yvalue < 0) yvalue = 0;
+ if (xvalue < 0) xvalue = 0;
+ if (yvalue >= frame_height) yvalue = frame_height - 1;
+ if (xvalue >= frame_width) xvalue = frame_width - 1;
+ int index = yvalue * frame_width + xvalue;
+ if (mv_filter == MV_FILTER_SMOOTH) {
+ filtered_row += mvs[index].row * gaussian_filter[i];
+ filtered_col += mvs[index].col * gaussian_filter[i];
+ } else if (mv_filter == MV_FILTER_MEDIAN) {
+ mvrows[i] = mvs[index].row;
+ mvcols[i] = mvs[index].col;
+ }
+ i++;
+ }
+ }
+
+ MV mv = mvs[center_idx];
+ if (mv_filter == MV_FILTER_SMOOTH) {
+ mv.row = (int16_t)filtered_row;
+ mv.col = (int16_t)filtered_col;
+ } else if (mv_filter == MV_FILTER_MEDIAN) {
+ // sort as arrays of int, matching the element type and cmpfunc
+ qsort(mvrows, 25, sizeof(mvrows[0]), cmpfunc);
+ qsort(mvcols, 25, sizeof(mvcols[0]), cmpfunc);
+ mv.row = mvrows[25 / 2];
+ mv.col = mvcols[25 / 2];
+ }
+ LOCALMV localmv = { .row = ((double)mv.row) / 8,
+ .col = ((double)mv.col) / 8 };
+ localmvs[y * frame_width + x] = localmv;
+ // if mvs array is immediately updated here, then the result may
+ // propagate to other pixels.
+ }
+ }
+ for (int i = 0; i < frame_height * frame_width; i++) {
+ MV mv = { .row = (int16_t)round(8 * localmvs[i].row),
+ .col = (int16_t)round(8 * localmvs[i].col) };
+ mvs[i] = mv;
+ }
+ }
+}
+
+// Computes optical flow at a single pyramid level,
+// using Lucas-Kanade algorithm.
+// Modifies mvs array.
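+// Illustrative sketch of the per-corner system (the classical Lucas-Kanade
+// normal equations, matching the accumulation loop in the function below):
+//   [ sum w*Ix*Ix   sum w*Ix*Iy ] [u]   [ -sum w*Ix*It ]
+//   [ sum w*Ix*Iy   sum w*Iy*Iy ] [v] = [ -sum w*Iy*It ]
+// The 2x2 system is solved only when the smaller eigenvalue of the matrix
+// exceeds a threshold, since a near-singular system gives unreliable flow.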
+static void lucas_kanade(const YV12_BUFFER_CONFIG *from_frame,
+ const YV12_BUFFER_CONFIG *to_frame, const int level,
+ const LK_PARAMS *lk_params, const int num_ref_corners,
+ int *ref_corners, const int mv_stride,
+ const int bit_depth, LOCALMV *mvs) {
+ assert(lk_params->window_size > 0 && lk_params->window_size % 2 == 0);
+ const int n = lk_params->window_size;
+ // algorithm is sensitive to window size
+ double *i_x = (double *)aom_malloc(n * n * sizeof(*i_x));
+ double *i_y = (double *)aom_malloc(n * n * sizeof(*i_y));
+ double *i_t = (double *)aom_malloc(n * n * sizeof(*i_t));
+ double *weights = (double *)aom_malloc(n * n * sizeof(*weights));
+ if (!i_x || !i_y || !i_t || !weights) goto free_lk_buf;
+
+ const int expand_multiplier = (int)pow(2, level);
+ double sigma = 0.2 * n;
+ // normalizing doesn't really affect anything since it's applied
+ // to every component of M and b
+ gaussian(sigma, n, 0, weights);
+ for (int i = 0; i < num_ref_corners; i++) {
+ const double x_coord = 1.0 * ref_corners[i * 2] / expand_multiplier;
+ const double y_coord = 1.0 * ref_corners[i * 2 + 1] / expand_multiplier;
+ int highres_x = ref_corners[i * 2];
+ int highres_y = ref_corners[i * 2 + 1];
+ int mv_idx = highres_y * (mv_stride) + highres_x;
+ LOCALMV mv_old = mvs[mv_idx];
+ mv_old.row = mv_old.row / expand_multiplier;
+ mv_old.col = mv_old.col / expand_multiplier;
+ // using this instead of memset, since it's not completely
+ // clear if zero memset works on double arrays
+ for (int j = 0; j < n * n; j++) {
+ i_x[j] = 0;
+ i_y[j] = 0;
+ i_t[j] = 0;
+ }
+ gradients_over_window(from_frame, to_frame, x_coord, y_coord, n, bit_depth,
+ i_x, i_y, i_t, &mv_old);
+ double Mres1[1] = { 0 }, Mres2[1] = { 0 }, Mres3[1] = { 0 };
+ double bres1[1] = { 0 }, bres2[1] = { 0 };
+ for (int j = 0; j < n * n; j++) {
+ Mres1[0] += weights[j] * i_x[j] * i_x[j];
+ Mres2[0] += weights[j] * i_x[j] * i_y[j];
+ Mres3[0] += weights[j] * i_y[j] * i_y[j];
+ bres1[0] += weights[j] * i_x[j] * i_t[j];
+ bres2[0] += weights[j] * i_y[j] * i_t[j];
+ }
+ double M[4] = { Mres1[0], Mres2[0], Mres2[0], Mres3[0] };
+ double b[2] = { -1 * bres1[0], -1 * bres2[0] };
+ double eig[2] = { 1, 1 };
+ eigenvalues_2x2(M, eig);
+ double threshold = 0.1;
+ if (fabs(eig[0]) > threshold) {
+ // if M is not invertible, then displacement
+ // will default to zeros
+ double u[2] = { 0, 0 };
+ linsolve(2, M, 2, b, u);
+ int mult = 1;
+ if (level != 0)
+ mult = expand_multiplier; // mv doubles when resolution doubles
+ LOCALMV mv = { .row = (mult * (u[0] + mv_old.row)),
+ .col = (mult * (u[1] + mv_old.col)) };
+ mvs[mv_idx] = mv;
+ }
+ }
+free_lk_buf:
+ aom_free(weights);
+ aom_free(i_t);
+ aom_free(i_x);
+ aom_free(i_y);
+}
+
+// Warp the src_frame to warped_frame according to mvs.
+// mvs point to src_frame +static void warp_back_frame(YV12_BUFFER_CONFIG *warped_frame, + const YV12_BUFFER_CONFIG *src_frame, + const LOCALMV *mvs, int mv_stride) { + int w, h; + const int fw = src_frame->y_crop_width; + const int fh = src_frame->y_crop_height; + const int src_fs = src_frame->y_stride, warped_fs = warped_frame->y_stride; + const uint8_t *src_buf = src_frame->y_buffer; + uint8_t *warped_buf = warped_frame->y_buffer; + double temp; + for (h = 0; h < fh; h++) { + for (w = 0; w < fw; w++) { + double cord_x = (double)w + mvs[h * mv_stride + w].col; + double cord_y = (double)h + mvs[h * mv_stride + w].row; + cord_x = fclamp(cord_x, 0, (double)(fw - 1)); + cord_y = fclamp(cord_y, 0, (double)(fh - 1)); + const int floorx = (int)floor(cord_x); + const int floory = (int)floor(cord_y); + const double fracx = cord_x - (double)floorx; + const double fracy = cord_y - (double)floory; + + temp = 0; + for (int hh = 0; hh < 2; hh++) { + const double weighth = hh ? (fracy) : (1 - fracy); + for (int ww = 0; ww < 2; ww++) { + const double weightw = ww ? (fracx) : (1 - fracx); + int y = floory + hh; + int x = floorx + ww; + y = clamp(y, 0, fh - 1); + x = clamp(x, 0, fw - 1); + temp += (double)src_buf[y * src_fs + x] * weightw * weighth; + } + } + warped_buf[h * warped_fs + w] = (uint8_t)round(temp); + } + } +} + +// Same as warp_back_frame, but using a better interpolation filter. +static void warp_back_frame_intp(YV12_BUFFER_CONFIG *warped_frame, + const YV12_BUFFER_CONFIG *src_frame, + const LOCALMV *mvs, int mv_stride) { + int w, h; + const int fw = src_frame->y_crop_width; + const int fh = src_frame->y_crop_height; + const int warped_fs = warped_frame->y_stride; + uint8_t *warped_buf = warped_frame->y_buffer; + const int blk = 2; + uint8_t temp_blk[4]; + + const int is_intrabc = 0; // Is intra-copied? 
+ const int is_high_bitdepth = is_frame_high_bitdepth(src_frame);
+ const int subsampling_x = 0, subsampling_y = 0; // for y-buffer
+ const int_interpfilters interp_filters =
+ av1_broadcast_interp_filter(MULTITAP_SHARP2);
+ const int plane = 0; // y-plane
+ const struct buf_2d ref_buf2 = { NULL, src_frame->y_buffer,
+ src_frame->y_crop_width,
+ src_frame->y_crop_height,
+ src_frame->y_stride };
+ const int bit_depth = src_frame->bit_depth;
+ struct scale_factors scale;
+ av1_setup_scale_factors_for_frame(
+ &scale, src_frame->y_crop_width, src_frame->y_crop_height,
+ src_frame->y_crop_width, src_frame->y_crop_height);
+
+ for (h = 0; h < fh; h++) {
+ for (w = 0; w < fw; w++) {
+ InterPredParams inter_pred_params;
+ av1_init_inter_params(&inter_pred_params, blk, blk, h, w, subsampling_x,
+ subsampling_y, bit_depth, is_high_bitdepth,
+ is_intrabc, &scale, &ref_buf2, interp_filters);
+ inter_pred_params.interp_filter_params[0] =
+ &av1_interp_filter_params_list[interp_filters.as_filters.x_filter];
+ inter_pred_params.interp_filter_params[1] =
+ &av1_interp_filter_params_list[interp_filters.as_filters.y_filter];
+ inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth);
+ MV newmv = { .row = (int16_t)round((mvs[h * mv_stride + w].row) * 8),
+ .col = (int16_t)round((mvs[h * mv_stride + w].col) * 8) };
+ av1_enc_build_one_inter_predictor(temp_blk, blk, &newmv,
+ &inter_pred_params);
+ warped_buf[h * warped_fs + w] = temp_blk[0];
+ }
+ }
+}
+
+#define DERIVATIVE_FILTER_LENGTH 7
+static const double filter[DERIVATIVE_FILTER_LENGTH] = {
+ -1.0 / 60, 9.0 / 60, -45.0 / 60, 0, 45.0 / 60, -9.0 / 60, 1.0 / 60
+};
+
+// Get gradient of the whole frame
+static void get_frame_gradients(const YV12_BUFFER_CONFIG *from_frame,
+ const YV12_BUFFER_CONFIG *to_frame, double *ix,
+ double *iy, double *it, int grad_stride) {
+ int w, h, k, idx;
+ const int fw = from_frame->y_crop_width;
+ const int fh = from_frame->y_crop_height;
+ const int from_fs = from_frame->y_stride, to_fs = to_frame->y_stride;
+ const uint8_t *from_buf = from_frame->y_buffer;
+ const uint8_t *to_buf = to_frame->y_buffer;
+
+ const int lh = DERIVATIVE_FILTER_LENGTH;
+ const int hleft = (lh - 1) / 2;
+
+ for (h = 0; h < fh; h++) {
+ for (w = 0; w < fw; w++) {
+ // x
+ ix[h * grad_stride + w] = 0;
+ for (k = 0; k < lh; k++) {
+ // if we want to make this block dependent, need to extend the
+ // boundaries using other initializations.
+ idx = w + k - hleft;
+ idx = clamp(idx, 0, fw - 1);
+ ix[h * grad_stride + w] += filter[k] * 0.5 *
+ ((double)from_buf[h * from_fs + idx] +
+ (double)to_buf[h * to_fs + idx]);
+ }
+ // y
+ iy[h * grad_stride + w] = 0;
+ for (k = 0; k < lh; k++) {
+ // if we want to make this block dependent, need to extend the
+ // boundaries using other initializations.
+ idx = h + k - hleft;
+ idx = clamp(idx, 0, fh - 1);
+ iy[h * grad_stride + w] += filter[k] * 0.5 *
+ ((double)from_buf[idx * from_fs + w] +
+ (double)to_buf[idx * to_fs + w]);
+ }
+ // t
+ it[h * grad_stride + w] =
+ (double)to_buf[h * to_fs + w] - (double)from_buf[h * from_fs + w];
+ }
+ }
+}
+
+// Solve for linear equations given by the H-S method
+static void solve_horn_schunck(const double *ix, const double *iy,
+ const double *it, int grad_stride, int width,
+ int height, const LOCALMV *init_mvs,
+ int init_mv_stride, LOCALMV *mvs,
+ int mv_stride) {
+ // TODO(bohanli): May just need to allocate the buffers once per optical flow
+ // calculation
+ int *row_pos = aom_calloc(width * height * 28, sizeof(*row_pos));
+ int *col_pos = aom_calloc(width * height * 28, sizeof(*col_pos));
+ double *values = aom_calloc(width * height * 28, sizeof(*values));
+ double *mv_vec = aom_calloc(width * height * 2, sizeof(*mv_vec));
+ double *mv_init_vec = aom_calloc(width * height * 2, sizeof(*mv_init_vec));
+ double *temp_b = aom_calloc(width * height * 2, sizeof(*temp_b));
+ double *b = aom_calloc(width * height * 2, sizeof(*b));
+ // Zero-initialize A before the early-exit check below, so the cleanup path
+ // can safely free a matrix that was never built.
+ SPARSE_MTX A = { 0 };
+ if (!row_pos || !col_pos || !values || !mv_vec || !mv_init_vec || !temp_b ||
+ !b) {
+ goto free_hs_solver_buf;
+ }
+
+ // the location idx for neighboring pixels, k < 4 are the 4 direct neighbors
+ const int check_locs_y[12] = { 0, 0, -1, 1, -1, -1, 1, 1, 0, 0, -2, 2 };
+ const int check_locs_x[12] = { -1, 1, 0, 0, -1, 1, -1, 1, -2, 2, 0, 0 };
+
+ int h, w, checkh, checkw, k, ret;
+ const int offset = height * width;
+ int c = 0;
+ const double lambda = 100;
+
+ for (w = 0; w < width; w++) {
+ for (h = 0; h < height; h++) {
+ mv_init_vec[w * height + h] = init_mvs[h * init_mv_stride + w].col;
+ mv_init_vec[w * height + h + offset] =
+ init_mvs[h * init_mv_stride + w].row;
+ }
+ }
+
+ // get matrix A
+ for (w = 0; w < width; w++) {
+ for (h = 0; h < height; h++) {
+ int center_num_direct = 4;
+ const int center_idx = w * height + h;
+ if (w == 0 || w == width - 1) center_num_direct--;
+ if (h == 0 || h == height - 1) center_num_direct--;
+ // diagonal entry for this row from the center pixel
+ double cor_w = center_num_direct * center_num_direct + center_num_direct;
+ row_pos[c] = center_idx;
+ col_pos[c] = center_idx;
+ values[c] = lambda * cor_w;
+ c++;
+ row_pos[c] = center_idx + offset;
+ col_pos[c] = center_idx + offset;
+ values[c] = lambda * cor_w;
+ c++;
+ // other entries from direct neighbors
+ for (k = 0; k < 4; k++) {
+ checkh = h + check_locs_y[k];
+ checkw = w + check_locs_x[k];
+ if (checkh < 0 || checkh >= height || checkw < 0 || checkw >= width) {
+ continue;
+ }
+ int this_idx = checkw * height + checkh;
+ int this_num_direct = 4;
+ if (checkw == 0 || checkw == width - 1) this_num_direct--;
+ if (checkh == 0 || checkh == height - 1) this_num_direct--;
+ cor_w = -center_num_direct - this_num_direct;
+ row_pos[c] = center_idx;
+ col_pos[c] = this_idx;
+ values[c] = lambda * cor_w;
+ c++;
+ row_pos[c] = center_idx + offset;
+ col_pos[c] = this_idx + offset;
+ values[c] = lambda * cor_w;
+ c++;
+ }
+ // entries from neighbors on the diagonal corners
+ for (k = 4; k < 8; k++) {
+ checkh = h + check_locs_y[k];
+ checkw = w + check_locs_x[k];
+ if (checkh < 0 || checkh >= height || checkw < 0 || checkw >= width) {
+ continue;
+ }
+ int this_idx = checkw * height + checkh;
+ cor_w = 2;
+ row_pos[c] = center_idx;
+ col_pos[c] = this_idx;
+ values[c] = lambda * cor_w;
+ c++;
+ row_pos[c] = center_idx + offset;
+ col_pos[c] = this_idx + offset;
+
values[c] = lambda * cor_w; + c++; + } + // entries from neighbors with dist of 2 + for (k = 8; k < 12; k++) { + checkh = h + check_locs_y[k]; + checkw = w + check_locs_x[k]; + if (checkh < 0 || checkh >= height || checkw < 0 || checkw >= width) { + continue; + } + int this_idx = checkw * height + checkh; + cor_w = 1; + row_pos[c] = center_idx; + col_pos[c] = this_idx; + values[c] = lambda * cor_w; + c++; + row_pos[c] = center_idx + offset; + col_pos[c] = this_idx + offset; + values[c] = lambda * cor_w; + c++; + } + } + } + ret = av1_init_sparse_mtx(row_pos, col_pos, values, c, 2 * width * height, + 2 * width * height, &A); + if (ret < 0) goto free_hs_solver_buf; + // subtract init mv part from b + av1_mtx_vect_multi_left(&A, mv_init_vec, temp_b, 2 * width * height); + for (int i = 0; i < 2 * width * height; i++) { + b[i] = -temp_b[i]; + } + av1_free_sparse_mtx_elems(&A); + + // add cross terms to A and modify b with ExEt / EyEt + for (w = 0; w < width; w++) { + for (h = 0; h < height; h++) { + int curidx = w * height + h; + // modify b + b[curidx] += -ix[h * grad_stride + w] * it[h * grad_stride + w]; + b[curidx + offset] += -iy[h * grad_stride + w] * it[h * grad_stride + w]; + // add cross terms to A + row_pos[c] = curidx; + col_pos[c] = curidx + offset; + values[c] = ix[h * grad_stride + w] * iy[h * grad_stride + w]; + c++; + row_pos[c] = curidx + offset; + col_pos[c] = curidx; + values[c] = ix[h * grad_stride + w] * iy[h * grad_stride + w]; + c++; + } + } + // Add diagonal terms to A + for (int i = 0; i < c; i++) { + if (row_pos[i] == col_pos[i]) { + if (row_pos[i] < offset) { + w = row_pos[i] / height; + h = row_pos[i] % height; + values[i] += pow(ix[h * grad_stride + w], 2); + } else { + w = (row_pos[i] - offset) / height; + h = (row_pos[i] - offset) % height; + values[i] += pow(iy[h * grad_stride + w], 2); + } + } + } + + ret = av1_init_sparse_mtx(row_pos, col_pos, values, c, 2 * width * height, + 2 * width * height, &A); + if (ret < 0) goto free_hs_solver_buf; + + // solve for the mvs + ret = av1_conjugate_gradient_sparse(&A, b, 2 * width * height, mv_vec); + if (ret < 0) goto free_hs_solver_buf; + + // copy mvs + for (w = 0; w < width; w++) { + for (h = 0; h < height; h++) { + mvs[h * mv_stride + w].col = mv_vec[w * height + h]; + mvs[h * mv_stride + w].row = mv_vec[w * height + h + offset]; + } + } +free_hs_solver_buf: + aom_free(row_pos); + aom_free(col_pos); + aom_free(values); + aom_free(mv_vec); + aom_free(mv_init_vec); + aom_free(b); + aom_free(temp_b); + av1_free_sparse_mtx_elems(&A); +} + +// Calculate optical flow from from_frame to to_frame using the H-S method. +static void horn_schunck(const YV12_BUFFER_CONFIG *from_frame, + const YV12_BUFFER_CONFIG *to_frame, const int level, + const int mv_stride, const int mv_height, + const int mv_width, const OPFL_PARAMS *opfl_params, + LOCALMV *mvs) { + // mvs are always on level 0, here we define two new mv arrays that is of size + // of this level. 
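+ // Illustrative sketch of the energy each warping step below minimizes (the
+ // classical Horn-Schunck objective; lambda is the smoothness weight used in
+ // solve_horn_schunck()):
+ //   E(u, v) = sum_(x,y) (Ix*u + Iy*v + It)^2
+ //             + lambda * (|grad u|^2 + |grad v|^2)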
+ const int fw = from_frame->y_crop_width;
+ const int fh = from_frame->y_crop_height;
+ const int factor = (int)pow(2, level);
+ int w, h, k, init_mv_stride;
+ LOCALMV *init_mvs = NULL, *refine_mvs = NULL;
+ double *ix = NULL, *iy = NULL, *it = NULL;
+ YV12_BUFFER_CONFIG temp_frame;
+ temp_frame.y_buffer = NULL;
+ if (level == 0) {
+ init_mvs = mvs;
+ init_mv_stride = mv_stride;
+ } else {
+ init_mvs = aom_calloc(fw * fh, sizeof(*mvs));
+ if (!init_mvs) goto free_hs_buf;
+ init_mv_stride = fw;
+ for (h = 0; h < fh; h++) {
+ for (w = 0; w < fw; w++) {
+ init_mvs[h * init_mv_stride + w].row =
+ mvs[h * factor * mv_stride + w * factor].row / (double)factor;
+ init_mvs[h * init_mv_stride + w].col =
+ mvs[h * factor * mv_stride + w * factor].col / (double)factor;
+ }
+ }
+ }
+ refine_mvs = aom_calloc(fw * fh, sizeof(*mvs));
+ if (!refine_mvs) goto free_hs_buf;
+ // temp frame for warping
+ temp_frame.y_buffer =
+ (uint8_t *)aom_calloc(fh * fw, sizeof(*temp_frame.y_buffer));
+ if (!temp_frame.y_buffer) goto free_hs_buf;
+ temp_frame.y_crop_height = fh;
+ temp_frame.y_crop_width = fw;
+ temp_frame.y_stride = fw;
+ // gradient buffers
+ ix = aom_calloc(fw * fh, sizeof(*ix));
+ iy = aom_calloc(fw * fh, sizeof(*iy));
+ it = aom_calloc(fw * fh, sizeof(*it));
+ if (!ix || !iy || !it) goto free_hs_buf;
+ // For each warping step
+ for (k = 0; k < opfl_params->warping_steps; k++) {
+ // warp from_frame with init_mv
+ if (level == 0) {
+ warp_back_frame_intp(&temp_frame, to_frame, init_mvs, init_mv_stride);
+ } else {
+ warp_back_frame(&temp_frame, to_frame, init_mvs, init_mv_stride);
+ }
+ // calculate frame gradients
+ get_frame_gradients(from_frame, &temp_frame, ix, iy, it, fw);
+ // form linear equations and solve mvs
+ solve_horn_schunck(ix, iy, it, fw, fw, fh, init_mvs, init_mv_stride,
+ refine_mvs, fw);
+ // update init_mvs
+ for (h = 0; h < fh; h++) {
+ for (w = 0; w < fw; w++) {
+ init_mvs[h * init_mv_stride + w].col += refine_mvs[h * fw + w].col;
+ init_mvs[h * init_mv_stride + w].row += refine_mvs[h * fw + w].row;
+ }
+ }
+ }
+ // copy back the mvs if needed
+ if (level != 0) {
+ for (h = 0; h < mv_height; h++) {
+ for (w = 0; w < mv_width; w++) {
+ mvs[h * mv_stride + w].row =
+ init_mvs[h / factor * init_mv_stride + w / factor].row *
+ (double)factor;
+ mvs[h * mv_stride + w].col =
+ init_mvs[h / factor * init_mv_stride + w / factor].col *
+ (double)factor;
+ }
+ }
+ }
+free_hs_buf:
+ if (level != 0) aom_free(init_mvs);
+ aom_free(refine_mvs);
+ aom_free(temp_frame.y_buffer);
+ aom_free(ix);
+ aom_free(iy);
+ aom_free(it);
+}
+
+// Apply optical flow iteratively at each pyramid level
+static void pyramid_optical_flow(const YV12_BUFFER_CONFIG *from_frame,
+ const YV12_BUFFER_CONFIG *to_frame,
+ const int bit_depth,
+ const OPFL_PARAMS *opfl_params,
+ const OPTFLOW_METHOD method, LOCALMV *mvs) {
+ assert(opfl_params->pyramid_levels > 0 &&
+ opfl_params->pyramid_levels <= MAX_PYRAMID_LEVELS);
+ int levels = opfl_params->pyramid_levels;
+ const int frame_height = from_frame->y_crop_height;
+ const int frame_width = from_frame->y_crop_width;
+ // drop a pyramid level if either dimension would become too small
+ if ((frame_height / pow(2.0, levels - 1) < 50 ||
+ frame_width / pow(2.0, levels - 1) < 50) &&
+ levels > 1)
+ levels = levels - 1;
+ uint8_t *images1[MAX_PYRAMID_LEVELS] = { NULL };
+ uint8_t *images2[MAX_PYRAMID_LEVELS] = { NULL };
+ int *ref_corners = NULL;
+
+ images1[0] = from_frame->y_buffer;
+ images2[0] = to_frame->y_buffer;
+ YV12_BUFFER_CONFIG *buffers1 = aom_malloc(levels * sizeof(*buffers1));
+ YV12_BUFFER_CONFIG *buffers2 = aom_malloc(levels
* sizeof(*buffers2)); + if (!buffers1 || !buffers2) goto free_pyramid_buf; + buffers1[0] = *from_frame; + buffers2[0] = *to_frame; + int fw = frame_width; + int fh = frame_height; + for (int i = 1; i < levels; i++) { + // TODO(bohanli): may need to extend buffers for better interpolation SIMD + images1[i] = (uint8_t *)aom_calloc(fh / 2 * fw / 2, sizeof(*images1[i])); + images2[i] = (uint8_t *)aom_calloc(fh / 2 * fw / 2, sizeof(*images2[i])); + if (!images1[i] || !images2[i]) goto free_pyramid_buf; + int stride; + if (i == 1) + stride = from_frame->y_stride; + else + stride = fw; + reduce(images1[i - 1], fh, fw, stride, images1[i]); + reduce(images2[i - 1], fh, fw, stride, images2[i]); + fh /= 2; + fw /= 2; + YV12_BUFFER_CONFIG a = { .y_buffer = images1[i], + .y_crop_width = fw, + .y_crop_height = fh, + .y_stride = fw }; + YV12_BUFFER_CONFIG b = { .y_buffer = images2[i], + .y_crop_width = fw, + .y_crop_height = fh, + .y_stride = fw }; + buffers1[i] = a; + buffers2[i] = b; + } + // Compute corners for specific frame + int num_ref_corners = 0; + if (is_sparse(opfl_params)) { + int maxcorners = from_frame->y_crop_width * from_frame->y_crop_height; + ref_corners = aom_malloc(maxcorners * 2 * sizeof(*ref_corners)); + if (!ref_corners) goto free_pyramid_buf; + num_ref_corners = detect_corners(from_frame, to_frame, maxcorners, + ref_corners, bit_depth); + } + const int stop_level = 0; + for (int i = levels - 1; i >= stop_level; i--) { + if (method == LUCAS_KANADE) { + assert(is_sparse(opfl_params)); + lucas_kanade(&buffers1[i], &buffers2[i], i, opfl_params->lk_params, + num_ref_corners, ref_corners, buffers1[0].y_crop_width, + bit_depth, mvs); + } else if (method == HORN_SCHUNCK) { + assert(!is_sparse(opfl_params)); + horn_schunck(&buffers1[i], &buffers2[i], i, buffers1[0].y_crop_width, + buffers1[0].y_crop_height, buffers1[0].y_crop_width, + opfl_params, mvs); + } + } +free_pyramid_buf: + for (int i = 1; i < levels; i++) { + aom_free(images1[i]); + aom_free(images2[i]); + } + aom_free(ref_corners); + aom_free(buffers1); + aom_free(buffers2); +} +// Computes optical flow by applying algorithm at +// multiple pyramid levels of images (lower-resolution, smoothed images) +// This accounts for larger motions. +// Inputs: +// from_frame Frame buffer. +// to_frame: Frame buffer. MVs point from_frame -> to_frame. +// from_frame_idx: Index of from_frame. +// to_frame_idx: Index of to_frame. Return all zero MVs when idx are equal. +// bit_depth: +// opfl_params: contains algorithm-specific parameters. +// mv_filter: MV_FILTER_NONE, MV_FILTER_SMOOTH, or MV_FILTER_MEDIAN. +// method: LUCAS_KANADE, HORN_SCHUNCK +// mvs: pointer to MVs. Contains initialization, and modified +// based on optical flow. 
Must have +// dimensions = from_frame->y_crop_width * from_frame->y_crop_height +void av1_optical_flow(const YV12_BUFFER_CONFIG *from_frame, + const YV12_BUFFER_CONFIG *to_frame, + const int from_frame_idx, const int to_frame_idx, + const int bit_depth, const OPFL_PARAMS *opfl_params, + const MV_FILTER_TYPE mv_filter, + const OPTFLOW_METHOD method, MV *mvs) { + const int frame_height = from_frame->y_crop_height; + const int frame_width = from_frame->y_crop_width; + // TODO(any): deal with the case where frames are not of the same dimensions + assert(frame_height == to_frame->y_crop_height && + frame_width == to_frame->y_crop_width); + if (from_frame_idx == to_frame_idx) { + // immediately return all zero mvs when frame indices are equal + for (int yy = 0; yy < frame_height; yy++) { + for (int xx = 0; xx < frame_width; xx++) { + MV mv = { .row = 0, .col = 0 }; + mvs[yy * frame_width + xx] = mv; + } + } + return; + } + + // Initialize double mvs based on input parameter mvs array + LOCALMV *localmvs = + aom_malloc(frame_height * frame_width * sizeof(*localmvs)); + if (!localmvs) return; + + filter_mvs(MV_FILTER_SMOOTH, frame_height, frame_width, localmvs, mvs); + + for (int i = 0; i < frame_width * frame_height; i++) { + MV mv = mvs[i]; + LOCALMV localmv = { .row = ((double)mv.row) / 8, + .col = ((double)mv.col) / 8 }; + localmvs[i] = localmv; + } + // Apply optical flow algorithm + pyramid_optical_flow(from_frame, to_frame, bit_depth, opfl_params, method, + localmvs); + + // Update original mvs array + for (int j = 0; j < frame_height; j++) { + for (int i = 0; i < frame_width; i++) { + int idx = j * frame_width + i; + if (j + localmvs[idx].row < 0 || j + localmvs[idx].row >= frame_height || + i + localmvs[idx].col < 0 || i + localmvs[idx].col >= frame_width) { + continue; + } + MV mv = { .row = (int16_t)round(8 * localmvs[idx].row), + .col = (int16_t)round(8 * localmvs[idx].col) }; + mvs[idx] = mv; + } + } + + filter_mvs(mv_filter, frame_height, frame_width, localmvs, mvs); + + aom_free(localmvs); +} +#endif diff --git a/third_party/aom/av1/encoder/optical_flow.h b/third_party/aom/av1/encoder/optical_flow.h new file mode 100644 index 0000000000..2fbe474d77 --- /dev/null +++ b/third_party/aom/av1/encoder/optical_flow.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AV1_ENCODER_OPTICAL_FLOW_H_
+#define AOM_AV1_ENCODER_OPTICAL_FLOW_H_
+
+#include "aom_scale/yv12config.h"
+#include "av1/common/mv.h"
+#include "config/aom_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if CONFIG_OPTICAL_FLOW_API
+
+typedef enum { LUCAS_KANADE, HORN_SCHUNCK } OPTFLOW_METHOD;
+
+typedef enum {
+ MV_FILTER_NONE,
+ MV_FILTER_SMOOTH,
+ MV_FILTER_MEDIAN
+} MV_FILTER_TYPE;
+
+typedef struct LOCALMV {
+ double row;
+ double col;
+} LOCALMV;
+
+#define MAX_PYRAMID_LEVELS 5
+// default options for optical flow
+#define OPFL_WINDOW_SIZE 15
+#define OPFL_PYRAMID_LEVELS 3 // total levels
+#define OPFL_WARPING_STEPS 3
+
+// parameters specific to Lucas-Kanade
+typedef struct lk_params {
+ int window_size;
+} LK_PARAMS;
+
+// generic structure to contain parameters for all
+// optical flow algorithms
+typedef struct opfl_params {
+ int pyramid_levels;
+ int warping_steps;
+ LK_PARAMS *lk_params;
+ int flags;
+} OPFL_PARAMS;
+
+#define OPFL_FLAG_SPARSE 1
+
+void av1_init_opfl_params(OPFL_PARAMS *opfl_params);
+
+void av1_init_lk_params(LK_PARAMS *lk_params);
+
+void av1_optical_flow(const YV12_BUFFER_CONFIG *from_frame,
+ const YV12_BUFFER_CONFIG *to_frame,
+ const int from_frame_idx, const int to_frame_idx,
+ const int bit_depth, const OPFL_PARAMS *opfl_params,
+ const MV_FILTER_TYPE mv_filter,
+ const OPTFLOW_METHOD method, MV *mvs);
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_OPTICAL_FLOW_H_
diff --git a/third_party/aom/av1/encoder/palette.c b/third_party/aom/av1/encoder/palette.c
new file mode 100644
index 0000000000..7f79e9596e
--- /dev/null
+++ b/third_party/aom/av1/encoder/palette.c
@@ -0,0 +1,975 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+
+#include "av1/common/pred_common.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/intra_mode_search.h"
+#include "av1/encoder/intra_mode_search_utils.h"
+#include "av1/encoder/palette.h"
+#include "av1/encoder/random.h"
+#include "av1/encoder/rdopt_utils.h"
+#include "av1/encoder/tx_search.h"
+
+#define AV1_K_MEANS_DIM 1
+#include "av1/encoder/k_means_template.h"
+#undef AV1_K_MEANS_DIM
+#define AV1_K_MEANS_DIM 2
+#include "av1/encoder/k_means_template.h"
+#undef AV1_K_MEANS_DIM
+
+static int int16_comparer(const void *a, const void *b) {
+ return (*(int16_t *)a - *(int16_t *)b);
+}
+
+int av1_remove_duplicates(int16_t *centroids, int num_centroids) {
+ int num_unique; // number of unique centroids
+ int i;
+ qsort(centroids, num_centroids, sizeof(*centroids), int16_comparer);
+ // Remove duplicates.
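+ // For example, a sorted input { 3, 3, 5, 7, 7 } is compacted in place to
+ // { 3, 5, 7 } and this function returns 3.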
+ num_unique = 1; + for (i = 1; i < num_centroids; ++i) { + if (centroids[i] != centroids[i - 1]) { // found a new unique centroid + centroids[num_unique++] = centroids[i]; + } + } + return num_unique; +} + +static int delta_encode_cost(const int *colors, int num, int bit_depth, + int min_val) { + if (num <= 0) return 0; + int bits_cost = bit_depth; + if (num == 1) return bits_cost; + bits_cost += 2; + int max_delta = 0; + int deltas[PALETTE_MAX_SIZE]; + const int min_bits = bit_depth - 3; + for (int i = 1; i < num; ++i) { + const int delta = colors[i] - colors[i - 1]; + deltas[i - 1] = delta; + assert(delta >= min_val); + if (delta > max_delta) max_delta = delta; + } + int bits_per_delta = AOMMAX(av1_ceil_log2(max_delta + 1 - min_val), min_bits); + assert(bits_per_delta <= bit_depth); + int range = (1 << bit_depth) - colors[0] - min_val; + for (int i = 0; i < num - 1; ++i) { + bits_cost += bits_per_delta; + range -= deltas[i]; + bits_per_delta = AOMMIN(bits_per_delta, av1_ceil_log2(range)); + } + return bits_cost; +} + +int av1_index_color_cache(const uint16_t *color_cache, int n_cache, + const uint16_t *colors, int n_colors, + uint8_t *cache_color_found, int *out_cache_colors) { + if (n_cache <= 0) { + for (int i = 0; i < n_colors; ++i) out_cache_colors[i] = colors[i]; + return n_colors; + } + memset(cache_color_found, 0, n_cache * sizeof(*cache_color_found)); + int n_in_cache = 0; + int in_cache_flags[PALETTE_MAX_SIZE]; + memset(in_cache_flags, 0, sizeof(in_cache_flags)); + for (int i = 0; i < n_cache && n_in_cache < n_colors; ++i) { + for (int j = 0; j < n_colors; ++j) { + if (colors[j] == color_cache[i]) { + in_cache_flags[j] = 1; + cache_color_found[i] = 1; + ++n_in_cache; + break; + } + } + } + int j = 0; + for (int i = 0; i < n_colors; ++i) + if (!in_cache_flags[i]) out_cache_colors[j++] = colors[i]; + assert(j == n_colors - n_in_cache); + return j; +} + +int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi, + int bit_depth, int *zero_count, + int *min_bits) { + const int n = pmi->palette_size[1]; + const int max_val = 1 << bit_depth; + int max_d = 0; + *min_bits = bit_depth - 4; + *zero_count = 0; + for (int i = 1; i < n; ++i) { + const int delta = pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] - + pmi->palette_colors[2 * PALETTE_MAX_SIZE + i - 1]; + const int v = abs(delta); + const int d = AOMMIN(v, max_val - v); + if (d > max_d) max_d = d; + if (d == 0) ++(*zero_count); + } + return AOMMAX(av1_ceil_log2(max_d + 1), *min_bits); +} + +int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi, + const uint16_t *color_cache, int n_cache, + int bit_depth) { + const int n = pmi->palette_size[0]; + int out_cache_colors[PALETTE_MAX_SIZE]; + uint8_t cache_color_found[2 * PALETTE_MAX_SIZE]; + const int n_out_cache = + av1_index_color_cache(color_cache, n_cache, pmi->palette_colors, n, + cache_color_found, out_cache_colors); + const int total_bits = + n_cache + delta_encode_cost(out_cache_colors, n_out_cache, bit_depth, 1); + return av1_cost_literal(total_bits); +} + +int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi, + const uint16_t *color_cache, int n_cache, + int bit_depth) { + const int n = pmi->palette_size[1]; + int total_bits = 0; + // U channel palette color cost. 
+ int out_cache_colors[PALETTE_MAX_SIZE]; + uint8_t cache_color_found[2 * PALETTE_MAX_SIZE]; + const int n_out_cache = av1_index_color_cache( + color_cache, n_cache, pmi->palette_colors + PALETTE_MAX_SIZE, n, + cache_color_found, out_cache_colors); + total_bits += + n_cache + delta_encode_cost(out_cache_colors, n_out_cache, bit_depth, 0); + + // V channel palette color cost. + int zero_count = 0, min_bits_v = 0; + const int bits_v = + av1_get_palette_delta_bits_v(pmi, bit_depth, &zero_count, &min_bits_v); + const int bits_using_delta = + 2 + bit_depth + (bits_v + 1) * (n - 1) - zero_count; + const int bits_using_raw = bit_depth * n; + total_bits += 1 + AOMMIN(bits_using_delta, bits_using_raw); + return av1_cost_literal(total_bits); +} + +// Extends 'color_map' array from 'orig_width x orig_height' to 'new_width x +// new_height'. Extra rows and columns are filled in by copying last valid +// row/column. +static AOM_INLINE void extend_palette_color_map(uint8_t *const color_map, + int orig_width, int orig_height, + int new_width, int new_height) { + int j; + assert(new_width >= orig_width); + assert(new_height >= orig_height); + if (new_width == orig_width && new_height == orig_height) return; + + for (j = orig_height - 1; j >= 0; --j) { + memmove(color_map + j * new_width, color_map + j * orig_width, orig_width); + // Copy last column to extra columns. + memset(color_map + j * new_width + orig_width, + color_map[j * new_width + orig_width - 1], new_width - orig_width); + } + // Copy last row to extra rows. + for (j = orig_height; j < new_height; ++j) { + memcpy(color_map + j * new_width, color_map + (orig_height - 1) * new_width, + new_width); + } +} + +// Bias toward using colors in the cache. +// TODO(huisu): Try other schemes to improve compression. +static AOM_INLINE void optimize_palette_colors(uint16_t *color_cache, + int n_cache, int n_colors, + int stride, int16_t *centroids, + int bit_depth) { + if (n_cache <= 0) return; + for (int i = 0; i < n_colors * stride; i += stride) { + int min_diff = abs((int)centroids[i] - (int)color_cache[0]); + int idx = 0; + for (int j = 1; j < n_cache; ++j) { + const int this_diff = abs((int)centroids[i] - (int)color_cache[j]); + if (this_diff < min_diff) { + min_diff = this_diff; + idx = j; + } + } + const int min_threshold = 4 << (bit_depth - 8); + if (min_diff <= min_threshold) centroids[i] = color_cache[idx]; + } +} + +/*!\brief Calculate the luma palette cost from a given color palette + * + * \ingroup palette_mode_search + * \callergraph + * Given the base colors as specified in centroids[], calculate the RD cost + * of palette mode. 
+ */ +static AOM_INLINE void palette_rd_y( + const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi, + BLOCK_SIZE bsize, int dc_mode_cost, const int16_t *data, int16_t *centroids, + int n, uint16_t *color_cache, int n_cache, bool do_header_rd_based_gating, + MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd, + int *rate, int *rate_tokenonly, int64_t *distortion, uint8_t *skippable, + int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *blk_skip, + uint8_t *tx_type_map, int *beat_best_palette_rd, + bool *do_header_rd_based_breakout, int discount_color_cost) { + if (do_header_rd_based_breakout != NULL) *do_header_rd_based_breakout = false; + optimize_palette_colors(color_cache, n_cache, n, 1, centroids, + cpi->common.seq_params->bit_depth); + const int num_unique_colors = av1_remove_duplicates(centroids, n); + if (num_unique_colors < PALETTE_MIN_SIZE) { + // Too few unique colors to create a palette. And DC_PRED will work + // well for that case anyway. So skip. + return; + } + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + if (cpi->common.seq_params->use_highbitdepth) { + for (int i = 0; i < num_unique_colors; ++i) { + pmi->palette_colors[i] = clip_pixel_highbd( + (int)centroids[i], cpi->common.seq_params->bit_depth); + } + } else { + for (int i = 0; i < num_unique_colors; ++i) { + pmi->palette_colors[i] = clip_pixel(centroids[i]); + } + } + pmi->palette_size[0] = num_unique_colors; + MACROBLOCKD *const xd = &x->e_mbd; + uint8_t *const color_map = xd->plane[0].color_index_map; + int block_width, block_height, rows, cols; + av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows, + &cols); + av1_calc_indices(data, centroids, color_map, rows * cols, num_unique_colors, + 1); + extend_palette_color_map(color_map, cols, rows, block_width, block_height); + + RD_STATS tokenonly_rd_stats; + int this_rate; + + if (do_header_rd_based_gating) { + assert(do_header_rd_based_breakout != NULL); + const int palette_mode_rate = intra_mode_info_cost_y( + cpi, x, mbmi, bsize, dc_mode_cost, discount_color_cost); + const int64_t header_rd = RDCOST(x->rdmult, palette_mode_rate, 0); + // Less aggressive pruning when prune_luma_palette_size_search_level == 1. + const int header_rd_shift = + (cpi->sf.intra_sf.prune_luma_palette_size_search_level == 1) ? 1 : 0; + // Terminate further palette_size search, if the header cost corresponding + // to lower palette_size is more than *best_rd << header_rd_shift. This + // logic is implemented with a right shift in the LHS to prevent a possible + // overflow with the left shift in RHS. 
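+ // For instance, with header_rd_shift == 1 the check below becomes
+ // (header_rd / 2) > *best_rd, which matches header_rd > (*best_rd << 1)
+ // up to integer truncation, without overflowing the left shift.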
+ if ((header_rd >> header_rd_shift) > *best_rd) {
+ *do_header_rd_based_breakout = true;
+ return;
+ }
+ av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize,
+ *best_rd);
+ if (tokenonly_rd_stats.rate == INT_MAX) return;
+ this_rate = tokenonly_rd_stats.rate + palette_mode_rate;
+ } else {
+ av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize,
+ *best_rd);
+ if (tokenonly_rd_stats.rate == INT_MAX) return;
+ this_rate = tokenonly_rd_stats.rate +
+ intra_mode_info_cost_y(cpi, x, mbmi, bsize, dc_mode_cost,
+ discount_color_cost);
+ }
+
+ int64_t this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+ if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->bsize)) {
+ tokenonly_rd_stats.rate -= tx_size_cost(x, bsize, mbmi->tx_size);
+ }
+ // Collect mode stats for multiwinner mode processing
+ const int txfm_search_done = 1;
+ store_winner_mode_stats(
+ &cpi->common, x, mbmi, NULL, NULL, NULL, THR_DC, color_map, bsize,
+ this_rd, cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done);
+ if (this_rd < *best_rd) {
+ *best_rd = this_rd;
+ // Setting beat_best_rd flag because current mode rd is better than best_rd.
+ // This flag needs to be updated only for palette evaluation in key frames
+ if (beat_best_rd) *beat_best_rd = 1;
+ memcpy(best_palette_color_map, color_map,
+ block_width * block_height * sizeof(color_map[0]));
+ *best_mbmi = *mbmi;
+ memcpy(blk_skip, x->txfm_search_info.blk_skip,
+ sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
+ av1_copy_array(tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+ if (rate) *rate = this_rate;
+ if (rate_tokenonly) *rate_tokenonly = tokenonly_rd_stats.rate;
+ if (distortion) *distortion = tokenonly_rd_stats.dist;
+ if (skippable) *skippable = tokenonly_rd_stats.skip_txfm;
+ if (beat_best_palette_rd) *beat_best_palette_rd = 1;
+ }
+}
+
+static AOM_INLINE int is_iter_over(int curr_idx, int end_idx, int step_size) {
+ assert(step_size != 0);
+ return (step_size > 0) ? curr_idx >= end_idx : curr_idx <= end_idx;
+}
+
+// Performs count-based palette search with number of colors in interval
+// [start_n, end_n) with step size step_size. If step_size < 0, then end_n can
+// be less than start_n. Saves the last number searched in last_n_searched and
+// returns the best number of colors found.
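+// For example (an illustrative sketch), a forward pass with start_n = 2,
+// end_n = 9 and step_size = 2 evaluates palettes of 2, 4, 6 and 8 colors;
+// a caller may then refine around the winner with a smaller (possibly
+// negative) step_size.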
+static AOM_INLINE int perform_top_color_palette_search(
+    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+    BLOCK_SIZE bsize, int dc_mode_cost, const int16_t *data,
+    int16_t *top_colors, int start_n, int end_n, int step_size,
+    bool do_header_rd_based_gating, int *last_n_searched, uint16_t *color_cache,
+    int n_cache, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map,
+    int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
+    uint8_t *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
+    uint8_t *best_blk_skip, uint8_t *tx_type_map, int discount_color_cost) {
+  int16_t centroids[PALETTE_MAX_SIZE];
+  int n = start_n;
+  int top_color_winner = end_n;
+  /* clang-format off */
+  assert(IMPLIES(step_size < 0, start_n > end_n));
+  /* clang-format on */
+  assert(IMPLIES(step_size > 0, start_n < end_n));
+  while (!is_iter_over(n, end_n, step_size)) {
+    int beat_best_palette_rd = 0;
+    bool do_header_rd_based_breakout = false;
+    memcpy(centroids, top_colors, n * sizeof(top_colors[0]));
+    palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
+                 color_cache, n_cache, do_header_rd_based_gating, best_mbmi,
+                 best_palette_color_map, best_rd, rate, rate_tokenonly,
+                 distortion, skippable, beat_best_rd, ctx, best_blk_skip,
+                 tx_type_map, &beat_best_palette_rd,
+                 &do_header_rd_based_breakout, discount_color_cost);
+    *last_n_searched = n;
+    if (do_header_rd_based_breakout) {
+      // Terminate the palette_size search by setting last_n_searched to end_n.
+      *last_n_searched = end_n;
+      break;
+    }
+    if (beat_best_palette_rd) {
+      top_color_winner = n;
+    } else if (cpi->sf.intra_sf.prune_palette_search_level == 2) {
+      // At search level 2, we return immediately if we don't see an
+      // improvement.
+      return top_color_winner;
+    }
+    n += step_size;
+  }
+  return top_color_winner;
+}
+
+// Performs k-means based palette search with the number of colors in the
+// interval [start_n, end_n) with step size step_size. If step_size < 0, then
+// end_n can be less than start_n. Saves the last number searched in
+// last_n_searched and returns the best number of colors found.
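+// Note: unlike the top-color search, each candidate size here re-runs
+// av1_k_means() with centroids seeded at the midpoints of n equal sub-ranges
+// of [lower_bound, upper_bound], so every palette size starts from an evenly
+// spread initial palette.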
+static AOM_INLINE int perform_k_means_palette_search(
+    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+    BLOCK_SIZE bsize, int dc_mode_cost, const int16_t *data, int lower_bound,
+    int upper_bound, int start_n, int end_n, int step_size,
+    bool do_header_rd_based_gating, int *last_n_searched, uint16_t *color_cache,
+    int n_cache, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map,
+    int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
+    uint8_t *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
+    uint8_t *best_blk_skip, uint8_t *tx_type_map, uint8_t *color_map,
+    int data_points, int discount_color_cost) {
+  int16_t centroids[PALETTE_MAX_SIZE];
+  const int max_itr = 50;
+  int n = start_n;
+  int top_color_winner = end_n;
+  /* clang-format off */
+  assert(IMPLIES(step_size < 0, start_n > end_n));
+  /* clang-format on */
+  assert(IMPLIES(step_size > 0, start_n < end_n));
+  while (!is_iter_over(n, end_n, step_size)) {
+    int beat_best_palette_rd = 0;
+    bool do_header_rd_based_breakout = false;
+    for (int i = 0; i < n; ++i) {
+      centroids[i] =
+          lower_bound + (2 * i + 1) * (upper_bound - lower_bound) / n / 2;
+    }
+    av1_k_means(data, centroids, color_map, data_points, n, 1, max_itr);
+    palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
+                 color_cache, n_cache, do_header_rd_based_gating, best_mbmi,
+                 best_palette_color_map, best_rd, rate, rate_tokenonly,
+                 distortion, skippable, beat_best_rd, ctx, best_blk_skip,
+                 tx_type_map, &beat_best_palette_rd,
+                 &do_header_rd_based_breakout, discount_color_cost);
+    *last_n_searched = n;
+    if (do_header_rd_based_breakout) {
+      // Terminate the palette_size search by setting last_n_searched to end_n.
+      *last_n_searched = end_n;
+      break;
+    }
+    if (beat_best_palette_rd) {
+      top_color_winner = n;
+    } else if (cpi->sf.intra_sf.prune_palette_search_level == 2) {
+      // At search level 2, we return immediately if we don't see an
+      // improvement.
+      return top_color_winner;
+    }
+    n += step_size;
+  }
+  return top_color_winner;
+}
+
+// Sets the parameters to search the current number of colors +/- 1.
+static AOM_INLINE void set_stage2_params(int *min_n, int *max_n, int *step_size,
+                                         int winner, int end_n) {
+  // Set min to winner - 1, unless we are already at the border, in which case
+  // it is set to winner + 1.
+  *min_n = (winner == PALETTE_MIN_SIZE) ? (PALETTE_MIN_SIZE + 1)
+                                        : AOMMAX(winner - 1, PALETTE_MIN_SIZE);
+  // Set max to winner + 1, unless we are already at the border, in which case
+  // it is set to winner - 1.
+  *max_n =
+      (winner == end_n) ? (winner - 1) : AOMMIN(winner + 1, PALETTE_MAX_SIZE);
+
+  // Set the step size to max_n - min_n so we only search those two values.
+  // If max_n == min_n, set step_size to 1 to avoid an infinite loop later.
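+  // For example, a winner of 5 with end_n == 8 yields min_n = 4, max_n = 6
+  // and step_size = 2, so only the two neighbors 4 and 6 are evaluated.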
+ *step_size = AOMMAX(1, *max_n - *min_n); +} + +static AOM_INLINE void fill_data_and_get_bounds(const uint8_t *src, + const int src_stride, + const int rows, const int cols, + const int is_high_bitdepth, + int16_t *data, int *lower_bound, + int *upper_bound) { + if (is_high_bitdepth) { + const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); + *lower_bound = *upper_bound = src_ptr[0]; + for (int r = 0; r < rows; ++r) { + for (int c = 0; c < cols; ++c) { + const int val = src_ptr[c]; + data[c] = (int16_t)val; + *lower_bound = AOMMIN(*lower_bound, val); + *upper_bound = AOMMAX(*upper_bound, val); + } + src_ptr += src_stride; + data += cols; + } + return; + } + + // low bit depth + *lower_bound = *upper_bound = src[0]; + for (int r = 0; r < rows; ++r) { + for (int c = 0; c < cols; ++c) { + const int val = src[c]; + data[c] = (int16_t)val; + *lower_bound = AOMMIN(*lower_bound, val); + *upper_bound = AOMMAX(*upper_bound, val); + } + src += src_stride; + data += cols; + } +} + +/*! \brief Colors are sorted by their count: the higher the better. + */ +struct ColorCount { + //! Color index in the histogram. + int index; + //! Histogram count. + int count; +}; + +int color_count_comp(const void *c1, const void *c2) { + const struct ColorCount *color_count1 = (const struct ColorCount *)c1; + const struct ColorCount *color_count2 = (const struct ColorCount *)c2; + if (color_count1->count > color_count2->count) return -1; + if (color_count1->count < color_count2->count) return 1; + if (color_count1->index < color_count2->index) return -1; + return 1; +} + +static void find_top_colors(const int *const count_buf, int bit_depth, + int n_colors, int16_t *top_colors) { + // Top color array, serving as a priority queue if more than n_colors are + // found. + struct ColorCount top_color_counts[PALETTE_MAX_SIZE] = { { 0 } }; + int n_color_count = 0; + for (int i = 0; i < (1 << bit_depth); ++i) { + if (count_buf[i] > 0) { + if (n_color_count < n_colors) { + // Keep adding to the top colors. + top_color_counts[n_color_count].index = i; + top_color_counts[n_color_count].count = count_buf[i]; + ++n_color_count; + if (n_color_count == n_colors) { + qsort(top_color_counts, n_colors, sizeof(top_color_counts[0]), + color_count_comp); + } + } else { + // Check the worst in the sorted top. + if (count_buf[i] > top_color_counts[n_colors - 1].count) { + int j = n_colors - 1; + // Move up to the best one. 
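+        // For example, inserting a new count of 9 into the sorted counts
+        // {12, 10, 7, 5} stops at j = 2, shifts the 7 down one slot (dropping
+        // the 5) and stores the new color at index 2: {12, 10, 9, 7}.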
+ while (j >= 1 && count_buf[i] > top_color_counts[j - 1].count) --j; + memmove(top_color_counts + j + 1, top_color_counts + j, + (n_colors - j - 1) * sizeof(top_color_counts[0])); + top_color_counts[j].index = i; + top_color_counts[j].count = count_buf[i]; + } + } + } + } + assert(n_color_count == n_colors); + + for (int i = 0; i < n_colors; ++i) { + top_colors[i] = top_color_counts[i].index; + } +} + +void av1_rd_pick_palette_intra_sby( + const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int dc_mode_cost, + MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd, + int *rate, int *rate_tokenonly, int64_t *distortion, uint8_t *skippable, + int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip, + uint8_t *tx_type_map) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + assert(!is_inter_block(mbmi)); + assert(av1_allow_palette(cpi->common.features.allow_screen_content_tools, + bsize)); + assert(PALETTE_MAX_SIZE == 8); + assert(PALETTE_MIN_SIZE == 2); + + const int src_stride = x->plane[0].src.stride; + const uint8_t *const src = x->plane[0].src.buf; + int block_width, block_height, rows, cols; + av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows, + &cols); + const SequenceHeader *const seq_params = cpi->common.seq_params; + const int is_hbd = seq_params->use_highbitdepth; + const int bit_depth = seq_params->bit_depth; + const int discount_color_cost = cpi->sf.rt_sf.use_nonrd_pick_mode; + int unused; + + int count_buf[1 << 12]; // Maximum (1 << 12) color levels. + int colors, colors_threshold = 0; + if (is_hbd) { + int count_buf_8bit[1 << 8]; // Maximum (1 << 8) bins for hbd path. + av1_count_colors_highbd(src, src_stride, rows, cols, bit_depth, count_buf, + count_buf_8bit, &colors_threshold, &colors); + } else { + av1_count_colors(src, src_stride, rows, cols, count_buf, &colors); + colors_threshold = colors; + } + + uint8_t *const color_map = xd->plane[0].color_index_map; + int color_thresh_palette = 64; + // Allow for larger color_threshold for palette search, based on color, + // scene_change, and block source variance. + // Since palette is Y based, only allow larger threshold if block + // color_dist is below threshold. + if (cpi->sf.rt_sf.use_nonrd_pick_mode && + cpi->sf.rt_sf.increase_color_thresh_palette && cpi->rc.high_source_sad && + x->source_variance > 50) { + int64_t norm_color_dist = 0; + if (x->color_sensitivity[0] || x->color_sensitivity[1]) { + norm_color_dist = x->min_dist_inter_uv >> + (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]); + if (x->color_sensitivity[0] && x->color_sensitivity[1]) + norm_color_dist = norm_color_dist >> 1; + } + if (norm_color_dist < 8000) color_thresh_palette += 20; + } + if (colors_threshold > 1 && colors_threshold <= color_thresh_palette) { + int16_t *const data = x->palette_buffer->kmeans_data_buf; + int16_t centroids[PALETTE_MAX_SIZE]; + int lower_bound, upper_bound; + fill_data_and_get_bounds(src, src_stride, rows, cols, is_hbd, data, + &lower_bound, &upper_bound); + + mbmi->mode = DC_PRED; + mbmi->filter_intra_mode_info.use_filter_intra = 0; + + uint16_t color_cache[2 * PALETTE_MAX_SIZE]; + const int n_cache = av1_get_palette_cache(xd, 0, color_cache); + + // Find the dominant colors, stored in top_colors[]. 
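+    // find_top_colors() returns the AOMMIN(colors, PALETTE_MAX_SIZE) most
+    // frequent pixel values in descending order of count, with ties broken
+    // in favor of the smaller pixel value.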
+    int16_t top_colors[PALETTE_MAX_SIZE] = { 0 };
+    find_top_colors(count_buf, bit_depth, AOMMIN(colors, PALETTE_MAX_SIZE),
+                    top_colors);
+
+    // The following are the approaches used for header-rdcost-based gating
+    // (early termination) for the different values of
+    // prune_palette_search_level.
+    // 0: Prune based on the header rdcost during the ascending-order
+    // palette_size search.
+    // 1: When colors > PALETTE_MIN_SIZE, gating is enabled only for the
+    // coarse palette_size search; for the finer search, the
+    // do_header_rd_based_gating parameter is explicitly passed as 'false'.
+    // 2: Gating is enabled only for the ascending-order palette_size search;
+    // for the descending-order search, the do_header_rd_based_gating
+    // parameter is explicitly passed as 'false'.
+    const bool do_header_rd_based_gating =
+        cpi->sf.intra_sf.prune_luma_palette_size_search_level != 0;
+
+    // TODO(huisu@google.com): Try to avoid duplicate computation in cases
+    // where the dominant colors and the k-means results are similar.
+    if ((cpi->sf.intra_sf.prune_palette_search_level == 1) &&
+        (colors > PALETTE_MIN_SIZE)) {
+      // The start index and step size below are chosen to evaluate unique
+      // candidates in the neighbor search, in case a winner candidate is
+      // found in the coarse search. For example:
+      // 1) 8 colors (end_n = 8): 2,3,4,5,6,7,8. start_n is chosen as 2 and the
+      // step size as 3, so the coarse search evaluates 2, 5 and 8. If the
+      // winner is 5, then 4 and 6 are evaluated; similarly for a winner of 2
+      // (3 is evaluated) or 8 (7 is evaluated).
+      // 2) 7 colors (end_n = 7): 2,3,4,5,6,7. If start_n were chosen as 2
+      // (same as for 8 colors), the step size would also have to be 2 to
+      // cover all candidates: the coarse search would evaluate 2, 4 and 6,
+      // and 3 would be evaluated whenever the winner is 2 or 4. Instead, with
+      // start_n = 3 and step_size = 3, the coarse search evaluates 3 and 6,
+      // and the winner's unique neighbors (3: 2,4 or 6: 5,7) are evaluated.
+ + // Start index for coarse palette search for dominant colors and k-means + const uint8_t start_n_lookup_table[PALETTE_MAX_SIZE + 1] = { 0, 0, 0, + 3, 3, 2, + 3, 3, 2 }; + // Step size for coarse palette search for dominant colors and k-means + const uint8_t step_size_lookup_table[PALETTE_MAX_SIZE + 1] = { 0, 0, 0, + 3, 3, 3, + 3, 3, 3 }; + + // Choose the start index and step size for coarse search based on number + // of colors + const int max_n = AOMMIN(colors, PALETTE_MAX_SIZE); + const int min_n = start_n_lookup_table[max_n]; + const int step_size = step_size_lookup_table[max_n]; + assert(min_n >= PALETTE_MIN_SIZE); + // Perform top color coarse palette search to find the winner candidate + const int top_color_winner = perform_top_color_palette_search( + cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, min_n, max_n + 1, + step_size, do_header_rd_based_gating, &unused, color_cache, n_cache, + best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly, + distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map, + discount_color_cost); + // Evaluate neighbors for the winner color (if winner is found) in the + // above coarse search for dominant colors + if (top_color_winner <= max_n) { + int stage2_min_n, stage2_max_n, stage2_step_size; + set_stage2_params(&stage2_min_n, &stage2_max_n, &stage2_step_size, + top_color_winner, max_n); + // perform finer search for the winner candidate + perform_top_color_palette_search( + cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, stage2_min_n, + stage2_max_n + 1, stage2_step_size, + /*do_header_rd_based_gating=*/false, &unused, color_cache, n_cache, + best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly, + distortion, skippable, beat_best_rd, ctx, best_blk_skip, + tx_type_map, discount_color_cost); + } + // K-means clustering. 
+ // Perform k-means coarse palette search to find the winner candidate + const int k_means_winner = perform_k_means_palette_search( + cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound, + min_n, max_n + 1, step_size, do_header_rd_based_gating, &unused, + color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd, + rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx, + best_blk_skip, tx_type_map, color_map, rows * cols, + discount_color_cost); + // Evaluate neighbors for the winner color (if winner is found) in the + // above coarse search for k-means + if (k_means_winner <= max_n) { + int start_n_stage2, end_n_stage2, step_size_stage2; + set_stage2_params(&start_n_stage2, &end_n_stage2, &step_size_stage2, + k_means_winner, max_n); + // perform finer search for the winner candidate + perform_k_means_palette_search( + cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound, + start_n_stage2, end_n_stage2 + 1, step_size_stage2, + /*do_header_rd_based_gating=*/false, &unused, color_cache, n_cache, + best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly, + distortion, skippable, beat_best_rd, ctx, best_blk_skip, + tx_type_map, color_map, rows * cols, discount_color_cost); + } + } else { + const int max_n = AOMMIN(colors, PALETTE_MAX_SIZE), + min_n = PALETTE_MIN_SIZE; + // Perform top color palette search in ascending order + int last_n_searched = min_n; + perform_top_color_palette_search( + cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, min_n, max_n + 1, + 1, do_header_rd_based_gating, &last_n_searched, color_cache, n_cache, + best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly, + distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map, + discount_color_cost); + if (last_n_searched < max_n) { + // Search in descending order until we get to the previous best + perform_top_color_palette_search( + cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, max_n, + last_n_searched, -1, /*do_header_rd_based_gating=*/false, &unused, + color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd, + rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx, + best_blk_skip, tx_type_map, discount_color_cost); + } + // K-means clustering. + if (colors == PALETTE_MIN_SIZE) { + // Special case: These colors automatically become the centroids. 
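+        // With exactly two distinct pixel values in the block, k-means is
+        // unnecessary: the block minimum and maximum already form the best
+        // two-color palette.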
+ assert(colors == 2); + centroids[0] = lower_bound; + centroids[1] = upper_bound; + palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, colors, + color_cache, n_cache, /*do_header_rd_based_gating=*/false, + best_mbmi, best_palette_color_map, best_rd, rate, + rate_tokenonly, distortion, skippable, beat_best_rd, ctx, + best_blk_skip, tx_type_map, NULL, NULL, + discount_color_cost); + } else { + // Perform k-means palette search in ascending order + last_n_searched = min_n; + perform_k_means_palette_search( + cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound, + min_n, max_n + 1, 1, do_header_rd_based_gating, &last_n_searched, + color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd, + rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx, + best_blk_skip, tx_type_map, color_map, rows * cols, + discount_color_cost); + if (last_n_searched < max_n) { + // Search in descending order until we get to the previous best + perform_k_means_palette_search( + cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound, + max_n, last_n_searched, -1, /*do_header_rd_based_gating=*/false, + &unused, color_cache, n_cache, best_mbmi, best_palette_color_map, + best_rd, rate, rate_tokenonly, distortion, skippable, + beat_best_rd, ctx, best_blk_skip, tx_type_map, color_map, + rows * cols, discount_color_cost); + } + } + } + } + + if (best_mbmi->palette_mode_info.palette_size[0] > 0) { + memcpy(color_map, best_palette_color_map, + block_width * block_height * sizeof(best_palette_color_map[0])); + // Gather the stats to determine whether to use screen content tools in + // function av1_determine_sc_tools_with_encoding(). + x->palette_pixels += (block_width * block_height); + } + *mbmi = *best_mbmi; +} + +void av1_rd_pick_palette_intra_sbuv(const AV1_COMP *cpi, MACROBLOCK *x, + int dc_mode_cost, + uint8_t *best_palette_color_map, + MB_MODE_INFO *const best_mbmi, + int64_t *best_rd, int *rate, + int *rate_tokenonly, int64_t *distortion, + uint8_t *skippable) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + assert(!is_inter_block(mbmi)); + assert(av1_allow_palette(cpi->common.features.allow_screen_content_tools, + mbmi->bsize)); + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const BLOCK_SIZE bsize = mbmi->bsize; + const SequenceHeader *const seq_params = cpi->common.seq_params; + int this_rate; + int64_t this_rd; + int colors_u, colors_v; + int colors_threshold_u = 0, colors_threshold_v = 0, colors_threshold = 0; + const int src_stride = x->plane[1].src.stride; + const uint8_t *const src_u = x->plane[1].src.buf; + const uint8_t *const src_v = x->plane[2].src.buf; + uint8_t *const color_map = xd->plane[1].color_index_map; + RD_STATS tokenonly_rd_stats; + int plane_block_width, plane_block_height, rows, cols; + av1_get_block_dimensions(bsize, 1, xd, &plane_block_width, + &plane_block_height, &rows, &cols); + + mbmi->uv_mode = UV_DC_PRED; + if (seq_params->use_highbitdepth) { + int count_buf[1 << 12]; // Maximum (1 << 12) color levels. + int count_buf_8bit[1 << 8]; // Maximum (1 << 8) bins for hbd path. 
+ av1_count_colors_highbd(src_u, src_stride, rows, cols, + seq_params->bit_depth, count_buf, count_buf_8bit, + &colors_threshold_u, &colors_u); + av1_count_colors_highbd(src_v, src_stride, rows, cols, + seq_params->bit_depth, count_buf, count_buf_8bit, + &colors_threshold_v, &colors_v); + } else { + int count_buf[1 << 8]; + av1_count_colors(src_u, src_stride, rows, cols, count_buf, &colors_u); + av1_count_colors(src_v, src_stride, rows, cols, count_buf, &colors_v); + colors_threshold_u = colors_u; + colors_threshold_v = colors_v; + } + + uint16_t color_cache[2 * PALETTE_MAX_SIZE]; + const int n_cache = av1_get_palette_cache(xd, 1, color_cache); + + colors_threshold = colors_threshold_u > colors_threshold_v + ? colors_threshold_u + : colors_threshold_v; + if (colors_threshold > 1 && colors_threshold <= 64) { + int r, c, n, i, j; + const int max_itr = 50; + int lb_u, ub_u, val_u; + int lb_v, ub_v, val_v; + int16_t *const data = x->palette_buffer->kmeans_data_buf; + int16_t centroids[2 * PALETTE_MAX_SIZE]; + + uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src_u); + uint16_t *src_v16 = CONVERT_TO_SHORTPTR(src_v); + if (seq_params->use_highbitdepth) { + lb_u = src_u16[0]; + ub_u = src_u16[0]; + lb_v = src_v16[0]; + ub_v = src_v16[0]; + } else { + lb_u = src_u[0]; + ub_u = src_u[0]; + lb_v = src_v[0]; + ub_v = src_v[0]; + } + + for (r = 0; r < rows; ++r) { + for (c = 0; c < cols; ++c) { + if (seq_params->use_highbitdepth) { + val_u = src_u16[r * src_stride + c]; + val_v = src_v16[r * src_stride + c]; + data[(r * cols + c) * 2] = val_u; + data[(r * cols + c) * 2 + 1] = val_v; + } else { + val_u = src_u[r * src_stride + c]; + val_v = src_v[r * src_stride + c]; + data[(r * cols + c) * 2] = val_u; + data[(r * cols + c) * 2 + 1] = val_v; + } + if (val_u < lb_u) + lb_u = val_u; + else if (val_u > ub_u) + ub_u = val_u; + if (val_v < lb_v) + lb_v = val_v; + else if (val_v > ub_v) + ub_v = val_v; + } + } + + const int colors = colors_u > colors_v ? colors_u : colors_v; + const int max_colors = + colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors; + for (n = PALETTE_MIN_SIZE; n <= max_colors; ++n) { + for (i = 0; i < n; ++i) { + centroids[i * 2] = lb_u + (2 * i + 1) * (ub_u - lb_u) / n / 2; + centroids[i * 2 + 1] = lb_v + (2 * i + 1) * (ub_v - lb_v) / n / 2; + } + av1_k_means(data, centroids, color_map, rows * cols, n, 2, max_itr); + optimize_palette_colors(color_cache, n_cache, n, 2, centroids, + cpi->common.seq_params->bit_depth); + // Sort the U channel colors in ascending order. 
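+      // centroids[] stores interleaved (U, V) pairs, so the selection sort
+      // below walks the even (U) indices and moves each V component together
+      // with its U component.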
+      for (i = 0; i < 2 * (n - 1); i += 2) {
+        int min_idx = i;
+        int min_val = centroids[i];
+        for (j = i + 2; j < 2 * n; j += 2)
+          if (centroids[j] < min_val) min_val = centroids[j], min_idx = j;
+        if (min_idx != i) {
+          int temp_u = centroids[i], temp_v = centroids[i + 1];
+          centroids[i] = centroids[min_idx];
+          centroids[i + 1] = centroids[min_idx + 1];
+          centroids[min_idx] = temp_u, centroids[min_idx + 1] = temp_v;
+        }
+      }
+      av1_calc_indices(data, centroids, color_map, rows * cols, n, 2);
+      extend_palette_color_map(color_map, cols, rows, plane_block_width,
+                               plane_block_height);
+      pmi->palette_size[1] = n;
+      for (i = 1; i < 3; ++i) {
+        for (j = 0; j < n; ++j) {
+          if (seq_params->use_highbitdepth)
+            pmi->palette_colors[i * PALETTE_MAX_SIZE + j] = clip_pixel_highbd(
+                (int)centroids[j * 2 + i - 1], seq_params->bit_depth);
+          else
+            pmi->palette_colors[i * PALETTE_MAX_SIZE + j] =
+                clip_pixel((int)centroids[j * 2 + i - 1]);
+        }
+      }
+
+      if (cpi->sf.intra_sf.early_term_chroma_palette_size_search) {
+        const int palette_mode_rate =
+            intra_mode_info_cost_uv(cpi, x, mbmi, bsize, dc_mode_cost);
+        const int64_t header_rd = RDCOST(x->rdmult, palette_mode_rate, 0);
+        // Terminate the palette_size search early if the header cost
+        // corresponding to the lower palette_size already exceeds the best_rd.
+        if (header_rd >= *best_rd) break;
+        av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
+        if (tokenonly_rd_stats.rate == INT_MAX) continue;
+        this_rate = tokenonly_rd_stats.rate + palette_mode_rate;
+      } else {
+        av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
+        if (tokenonly_rd_stats.rate == INT_MAX) continue;
+        this_rate = tokenonly_rd_stats.rate +
+                    intra_mode_info_cost_uv(cpi, x, mbmi, bsize, dc_mode_cost);
+      }
+
+      this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+      if (this_rd < *best_rd) {
+        *best_rd = this_rd;
+        *best_mbmi = *mbmi;
+        memcpy(best_palette_color_map, color_map,
+               plane_block_width * plane_block_height *
+                   sizeof(best_palette_color_map[0]));
+        *rate = this_rate;
+        *distortion = tokenonly_rd_stats.dist;
+        *rate_tokenonly = tokenonly_rd_stats.rate;
+        *skippable = tokenonly_rd_stats.skip_txfm;
+      }
+    }
+  }
+  if (best_mbmi->palette_mode_info.palette_size[1] > 0) {
+    memcpy(color_map, best_palette_color_map,
+           plane_block_width * plane_block_height *
+               sizeof(best_palette_color_map[0]));
+  }
+}
+
+void av1_restore_uv_color_map(const AV1_COMP *cpi, MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  const BLOCK_SIZE bsize = mbmi->bsize;
+  int src_stride = x->plane[1].src.stride;
+  const uint8_t *const src_u = x->plane[1].src.buf;
+  const uint8_t *const src_v = x->plane[2].src.buf;
+  int16_t *const data = x->palette_buffer->kmeans_data_buf;
+  int16_t centroids[2 * PALETTE_MAX_SIZE];
+  uint8_t *const color_map = xd->plane[1].color_index_map;
+  int r, c;
+  const uint16_t *const src_u16 = CONVERT_TO_SHORTPTR(src_u);
+  const uint16_t *const src_v16 = CONVERT_TO_SHORTPTR(src_v);
+  int plane_block_width, plane_block_height, rows, cols;
+  av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
+                           &plane_block_height, &rows, &cols);
+
+  for (r = 0; r < rows; ++r) {
+    for (c = 0; c < cols; ++c) {
+      if (cpi->common.seq_params->use_highbitdepth) {
+        data[(r * cols + c) * 2] = src_u16[r * src_stride + c];
+        data[(r * cols + c) * 2 + 1] = src_v16[r * src_stride + c];
+      } else {
+        data[(r * cols + c) * 2] = src_u[r * src_stride + c];
+        data[(r * cols + c) * 2 + 1] = src_v[r * src_stride + c];
+      }
+    }
+  }
+
+  for (r = 1; r < 3; ++r) {
+    for (c = 0; c < pmi->palette_size[1]; ++c) {
+      centroids[c * 2 + r - 1] = pmi->palette_colors[r * PALETTE_MAX_SIZE + c];
+    }
+  }
+
+  av1_calc_indices(data, centroids, color_map, rows * cols,
+                   pmi->palette_size[1], 2);
+  extend_palette_color_map(color_map, cols, rows, plane_block_width,
+                           plane_block_height);
+}
diff --git a/third_party/aom/av1/encoder/palette.h b/third_party/aom/av1/encoder/palette.h
new file mode 100644
index 0000000000..7da863a0cc
--- /dev/null
+++ b/third_party/aom/av1/encoder/palette.h
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Declares functions used in palette search.
+ */
+#ifndef AOM_AV1_ENCODER_PALETTE_H_
+#define AOM_AV1_ENCODER_PALETTE_H_
+
+#include "av1/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1_COMP;
+struct PICK_MODE_CONTEXT;
+struct macroblock;
+
+/*!\cond */
+#define AV1_K_MEANS_RENAME(func, dim) func##_dim##dim##_c
+
+void AV1_K_MEANS_RENAME(av1_k_means, 1)(const int16_t *data, int16_t *centroids,
+                                        uint8_t *indices, int n, int k,
+                                        int max_itr);
+void AV1_K_MEANS_RENAME(av1_k_means, 2)(const int16_t *data, int16_t *centroids,
+                                        uint8_t *indices, int n, int k,
+                                        int max_itr);
+/*!\endcond */
+
+/*!\brief Calculates the cluster to which each data point belongs.
+ *
+ * \ingroup palette_mode_search
+ * \param[in]    data               The data points whose cluster indices are
+ *                                  to be computed. The data layout is
+ *                                  NUM_DATA_POINTS X DATA_DIM.
+ * \param[in]    centroids          Pointer to the centroids. The data layout
+ *                                  is NUM_CENTROIDS X DATA_DIM.
+ * \param[in]    indices            Pointer to store the computed indices.
+ * \param[in]    n                  Number of data points.
+ * \param[in]    k                  Number of clusters.
+ * \param[in]    dim                Data dimension.
+ *
+ * \remark Returns nothing, but saves each data point's cluster index in
+ * \a indices.
+ */
+static INLINE void av1_calc_indices(const int16_t *data,
+                                    const int16_t *centroids, uint8_t *indices,
+                                    int n, int k, int dim) {
+  assert(n > 0);
+  assert(k > 0);
+  if (dim == 1) {
+    av1_calc_indices_dim1(data, centroids, indices, /*total_dist=*/NULL, n, k);
+  } else if (dim == 2) {
+    av1_calc_indices_dim2(data, centroids, indices, /*total_dist=*/NULL, n, k);
+  } else {
+    assert(0 && "Untemplated k means dimension");
+  }
+}
+
+/*!\brief Performs k-means clustering on the data.
+ *
+ * \ingroup palette_mode_search
+ * \param[in]    data               The data points to be clustered. The data
+ *                                  layout is NUM_DATA_POINTS X DATA_DIM.
+ * \param[in]    centroids          Pointer to store the computed centroids.
+ *                                  The data layout is
+ *                                  NUM_CENTROIDS X DATA_DIM.
+ * \param[in]    indices            Pointer to store the computed cluster
+ *                                  index for each data point.
+ * \param[in]    n                  Number of data points.
+ * \param[in]    k                  Number of clusters.
+ * \param[in]    dim                Data dimension.
+ * \param[in]    max_itr            Maximum number of iterations to run.
+ *
+ * \remark Returns nothing, but saves each cluster's centroid in centroids and
+ * each data point's cluster index in \a indices.
+ *
+ * \attention The output centroids are rounded off to the nearest integers.
+ */
+static INLINE void av1_k_means(const int16_t *data, int16_t *centroids,
+                               uint8_t *indices, int n, int k, int dim,
+                               int max_itr) {
+  assert(n > 0);
+  assert(k > 0);
+  if (dim == 1) {
+    AV1_K_MEANS_RENAME(av1_k_means, 1)(data, centroids, indices, n, k, max_itr);
+  } else if (dim == 2) {
+    AV1_K_MEANS_RENAME(av1_k_means, 2)(data, centroids, indices, n, k, max_itr);
+  } else {
+    assert(0 && "Untemplated k means dimension");
+  }
+}
+
+/*!\brief Removes duplicated centroids.
+ *
+ * \ingroup palette_mode_search
+ * \param[in]    centroids          A list of centroids.
+ * \param[in]    num_centroids      Number of centroids.
+ *
+ * \return Returns the number of unique centroids and saves the unique
+ * centroids at the beginning of the centroids array.
+ *
+ * \attention The centroids should be rounded to integers before calling this
+ * method.
+ */
+int av1_remove_duplicates(int16_t *centroids, int num_centroids);
+
+/*!\brief Checks what colors are in the color cache.
+ *
+ * \ingroup palette_mode_search
+ * \param[in]    color_cache        A cache of colors.
+ * \param[in]    n_cache            Number of colors in the cache.
+ * \param[in]    colors             New base colors.
+ * \param[in]    n_colors           Number of new colors.
+ * \param[in]    cache_color_found  Stores which cached colors are present in
+ *                                  colors.
+ * \param[in]    out_cache_colors   Stores the colors that are not in the
+ *                                  cache.
+ *
+ * \return Returns the number of colors that are not in the cache. In
+ * addition, records whether each cache color is present in colors in
+ * cache_color_found, and stores the out-of-cache colors in out_cache_colors.
+ */
+int av1_index_color_cache(const uint16_t *color_cache, int n_cache,
+                          const uint16_t *colors, int n_colors,
+                          uint8_t *cache_color_found, int *out_cache_colors);
+
+/*!\brief Gets the rate cost for delta-encoding the v palette colors.
+ *
+ * \ingroup palette_mode_search
+ * \param[in]    pmi                Struct that stores the palette mode info.
+ * \param[in]    bit_depth          Pixel bitdepth of the sequence.
+ * \param[in]    zero_count         Stores the number of zero deltas.
+ * \param[in]    min_bits           Minimum bits for the deltas. Set to
+ *                                  bit_depth - 4.
+ *
+ * \return Returns the number of bits used to transmit each v palette color
+ * delta, and sets zero_count to the number of deltas that are 0.
+ */
+int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi,
+                                 int bit_depth, int *zero_count, int *min_bits);
+
+/*!\brief Gets the rate cost for transmitting luma palette color values.
+ *
+ * \ingroup palette_mode_search
+ * \param[in]    pmi                Struct that stores the palette mode info.
+ * \param[in]    color_cache        Color cache present at the decoder.
+ * \param[in]    n_cache            Number of colors in the cache.
+ * \param[in]    bit_depth          Pixel bitdepth of the sequence.
+ *
+ * \return Returns the rate needed to transmit the palette. Note that this
+ * does not include the cost of transmitting the color map.
+ */
+int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi,
+                             const uint16_t *color_cache, int n_cache,
+                             int bit_depth);
+
+/*!\brief Gets the rate cost for transmitting chroma palette color values.
+ *
+ * \ingroup palette_mode_search
+ * \param[in]    pmi                Struct that stores the palette mode info.
+ * \param[in]    color_cache        Color cache present at the decoder.
+ * \param[in]    n_cache            Number of colors in the cache.
+ * \param[in]    bit_depth          Pixel bitdepth of the sequence.
+ *
+ * \return Returns the rate needed to transmit the palette.
+ * Note that this does not include the cost of transmitting the color map.
+ */
+int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi,
+                              const uint16_t *color_cache, int n_cache,
+                              int bit_depth);
+
+/*!\brief Search for the best palette in the luma plane.
+ *
+ * \ingroup palette_mode_search
+ * \callergraph
+ * This function is used in both inter and intra frame coding.
+ */
+void av1_rd_pick_palette_intra_sby(
+    const struct AV1_COMP *cpi, struct macroblock *x, BLOCK_SIZE bsize,
+    int dc_mode_cost, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map,
+    int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
+    uint8_t *skippable, int *beat_best_rd, struct PICK_MODE_CONTEXT *ctx,
+    uint8_t *best_blk_skip, uint8_t *tx_type_map);
+
+/*!\brief Search for the best palette in the chroma plane.
+ *
+ * \ingroup palette_mode_search
+ * \callergraph
+ * This function is used in both inter and intra frame coding.
+ */
+void av1_rd_pick_palette_intra_sbuv(const struct AV1_COMP *cpi,
+                                    struct macroblock *x, int dc_mode_cost,
+                                    uint8_t *best_palette_color_map,
+                                    MB_MODE_INFO *const best_mbmi,
+                                    int64_t *best_rd, int *rate,
+                                    int *rate_tokenonly, int64_t *distortion,
+                                    uint8_t *skippable);
+
+/*!\brief Resets the palette color map for the chroma channels.
+ */
+void av1_restore_uv_color_map(const struct AV1_COMP *cpi, struct macroblock *x);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_PALETTE_H_
diff --git a/third_party/aom/av1/encoder/partition_cnn_weights.h b/third_party/aom/av1/encoder/partition_cnn_weights.h
new file mode 100644
index 0000000000..504038c63a
--- /dev/null
+++ b/third_party/aom/av1/encoder/partition_cnn_weights.h
@@ -0,0 +1,2139 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#ifndef AOM_AV1_ENCODER_PARTITION_CNN_WEIGHTS_H_ +#define AOM_AV1_ENCODER_PARTITION_CNN_WEIGHTS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/cnn.h" +#include "av1/encoder/ml.h" + +#define CNN_BRANCH_0_OUT_CH 20 +#define CNN_BRANCH_1_OUT_CH 4 +#define CNN_BRANCH_2_OUT_CH 20 +#define CNN_BRANCH_3_OUT_CH 20 +#define CNN_TOT_OUT_CH \ + (((CNN_BRANCH_0_OUT_CH) + (CNN_BRANCH_1_OUT_CH) + (CNN_BRANCH_2_OUT_CH) + \ + (CNN_BRANCH_3_OUT_CH))) +#define CNN_BRANCH_0_OUT_SIZE (CNN_BRANCH_0_OUT_CH) +#define CNN_BRANCH_1_OUT_SIZE ((CNN_BRANCH_1_OUT_CH)*2 * 2) +#define CNN_BRANCH_2_OUT_SIZE ((CNN_BRANCH_2_OUT_CH)*4 * 4) +#define CNN_BRANCH_3_OUT_SIZE ((CNN_BRANCH_3_OUT_CH)*8 * 8) +#define CNN_OUT_BUF_SIZE \ + (((CNN_BRANCH_0_OUT_SIZE) + (CNN_BRANCH_1_OUT_SIZE) + \ + (CNN_BRANCH_2_OUT_SIZE) + (CNN_BRANCH_3_OUT_SIZE))) + +#define NUM_DNN_BRANCHES 4 +#define NUM_CNN_LAYERS 5 +#define BRANCH_0_NUM_DNN_LAYERS 2 +#define BRANCH_1_NUM_DNN_LAYERS 2 +#define BRANCH_2_NUM_DNN_LAYERS 2 +#define BRANCH_3_NUM_DNN_LAYERS 2 +#define CNN_LAYER_0_HEIGHT 5 +#define CNN_LAYER_0_WIDTH 5 +#define CNN_LAYER_0_IN_CH 1 +#define CNN_LAYER_0_OUT_CH 20 +#define CNN_LAYER_0_HORZ_STRIDE 4 +#define CNN_LAYER_0_VERT_STRIDE 4 +#define CNN_LAYER_1_HEIGHT 2 +#define CNN_LAYER_1_WIDTH 2 +#define CNN_LAYER_1_IN_CH 20 +#define CNN_LAYER_1_OUT_CH 20 +#define CNN_LAYER_1_HORZ_STRIDE 2 +#define CNN_LAYER_1_VERT_STRIDE 2 +#define CNN_LAYER_2_HEIGHT 2 +#define CNN_LAYER_2_WIDTH 2 +#define CNN_LAYER_2_IN_CH 20 +#define CNN_LAYER_2_OUT_CH 20 +#define CNN_LAYER_2_HORZ_STRIDE 2 +#define CNN_LAYER_2_VERT_STRIDE 2 +#define CNN_LAYER_3_HEIGHT 2 +#define CNN_LAYER_3_WIDTH 2 +#define CNN_LAYER_3_IN_CH 20 +#define CNN_LAYER_3_OUT_CH 4 +#define CNN_LAYER_3_HORZ_STRIDE 2 +#define CNN_LAYER_3_VERT_STRIDE 2 +#define CNN_LAYER_4_HEIGHT 2 +#define CNN_LAYER_4_WIDTH 2 +#define CNN_LAYER_4_IN_CH 4 +#define CNN_LAYER_4_OUT_CH 20 +#define CNN_LAYER_4_HORZ_STRIDE 2 +#define CNN_LAYER_4_VERT_STRIDE 2 +#define BRANCH_0_NUM_DNN_FEATURES 37 +#define BRANCH_0_NUM_DNN_LAYER_0_UNITS 16 +#define BRANCH_0_NUM_DNN_LAYER_1_UNITS 24 +#define BRANCH_0_NUM_LOGITS 1 +#define BRANCH_1_NUM_DNN_FEATURES 25 +#define BRANCH_1_NUM_DNN_LAYER_0_UNITS 16 +#define BRANCH_1_NUM_DNN_LAYER_1_UNITS 24 +#define BRANCH_1_NUM_LOGITS 1 +#define BRANCH_2_NUM_DNN_FEATURES 25 +#define BRANCH_2_NUM_DNN_LAYER_0_UNITS 16 +#define BRANCH_2_NUM_DNN_LAYER_1_UNITS 24 +#define BRANCH_2_NUM_LOGITS 1 +#define BRANCH_3_NUM_DNN_FEATURES 41 +#define BRANCH_3_NUM_DNN_LAYER_0_UNITS 16 +#define BRANCH_3_NUM_DNN_LAYER_1_UNITS 24 +#define BRANCH_3_NUM_LOGITS 1 + +static const float av1_intra_mode_cnn_partition_cnn_layer_0_kernel[] = { + 0.131894f, -0.593536f, -0.212935f, -0.00220011f, -0.396949f, + 0.287753f, -0.91875f, -0.0095057f, 0.804197f, -0.395239f, + 0.516604f, 1.16439f, 0.445784f, -0.163349f, 0.746488f, + -0.33891f, -0.562652f, 0.481403f, 0.755378f, -0.200753f, + 0.0784307f, 0.105657f, 0.0205673f, -0.524089f, -0.476146f, + -0.161206f, -0.65079f, 0.137474f, 0.28584f, 0.508768f, + -0.643386f, 0.227068f, -0.899507f, -0.413382f, 0.631466f, + 0.398203f, -0.544392f, 0.825155f, 0.671847f, -0.249779f, + 0.323121f, 0.125357f, -0.719564f, -0.0714854f, -0.168472f, + -0.213246f, -0.674525f, 0.330148f, -0.138414f, 0.20462f, + -0.518571f, -0.15091f, -0.605116f, -0.448732f, -0.475599f, + 0.738f, -0.328526f, 0.755035f, 0.969414f, -0.321039f, + -0.23068f, 0.408567f, -0.377813f, -0.273974f, 1.0684f, + 0.373968f, -0.450305f, 0.439258f, -0.381846f, -0.267331f, + 0.30613f, -0.39369f, 0.622438f, 
-0.52877f, -0.334991f, + 0.263193f, -0.402121f, 0.64142f, 0.793048f, -0.0231174f, + -0.68474f, -0.293338f, -0.737511f, -0.462654f, 0.474629f, + 0.141397f, -0.152529f, 0.345879f, -0.499991f, 0.00174024f, + 0.337387f, -0.131151f, 0.427385f, -0.457449f, -0.879614f, + -0.425908f, -0.263172f, 0.0344974f, 1.07861f, -0.00416662f, + 0.0208952f, 0.233905f, 0.765965f, 0.0423685f, -0.117554f, + -0.248237f, 0.49848f, -0.845131f, 0.223648f, -0.838709f, + 0.5834f, 0.309956f, -0.0625093f, -0.619619f, 0.918957f, + 0.358271f, -0.668459f, 0.518783f, -0.418963f, -0.206788f, + 0.364983f, -0.0396087f, 0.624309f, -0.138679f, -0.142453f, + 0.28309f, 0.895092f, -0.215713f, 0.439025f, 0.659333f, + -0.366025f, -0.413518f, 0.66657f, -0.265919f, 0.473471f, + -1.0729f, -0.526702f, 0.2838f, 0.367648f, -0.61242f, + 0.121656f, 0.547727f, -0.0636793f, -0.33006f, -0.306604f, + -0.00897731f, 0.688242f, 0.0944626f, 0.321508f, 0.0437392f, + -0.560035f, -0.768334f, 0.0571051f, -0.0427601f, -0.0437806f, + -0.816209f, -0.395829f, 0.293733f, 0.217645f, -0.646428f, + 0.132448f, -0.435806f, -0.0556814f, 0.0218857f, 0.348525f, + -0.17296f, 0.669057f, 0.638604f, -0.0995596f, -0.024099f, + -0.262332f, -0.548975f, 0.357894f, 0.43873f, -0.688234f, + -0.425519f, 0.190986f, -0.074778f, 0.294232f, -0.548969f, + -0.731198f, 0.03616f, -0.475969f, -0.306075f, -0.111929f, + -0.234146f, 0.612669f, 0.882254f, -0.622893f, 0.262431f, + 0.465242f, 0.245384f, -0.811016f, 0.501798f, -0.925875f, + 0.264373f, 0.307766f, -0.26872f, 0.113027f, -0.158875f, + 0.0711483f, 0.220275f, -0.0699022f, -0.0111303f, -0.435384f, + -0.720014f, 0.593484f, -0.964082f, 0.750925f, 0.252433f, + 0.964332f, -0.256904f, -0.421715f, -0.403851f, -0.188081f, + 0.694014f, -1.00183f, 0.798921f, 0.0603123f, 0.213814f, + 0.739642f, -0.0203375f, 0.72569f, -0.260224f, 0.0199516f, + -0.322451f, 0.318204f, -0.38392f, 0.740994f, -0.265215f, + -0.54541f, -0.51479f, -0.458397f, 0.519564f, 0.0509182f, + 0.0363331f, -0.293051f, 0.317714f, -0.327488f, -0.0840401f, + 0.318437f, -0.619403f, 0.641094f, -0.288435f, -0.260185f, + 0.181083f, -0.169294f, 0.292645f, 0.140405f, 0.0572885f, + -0.637428f, -0.102616f, 0.288955f, 0.817314f, 0.116855f, + 0.635532f, 0.283334f, -0.236391f, -0.305035f, -0.217365f, + -0.033021f, -0.455858f, 0.439922f, -0.104039f, 0.373376f, + 0.310659f, 0.388789f, 0.266341f, 0.0746306f, -0.428192f, + -0.202695f, -0.347625f, 0.00585741f, 0.366203f, 0.221413f, + 0.518856f, 0.57245f, -0.375071f, -0.2436f, -0.511895f, + -1.03708f, 0.681455f, -0.111544f, -0.183563f, 0.109729f, + -0.422646f, -0.529777f, 0.747473f, -0.270223f, -0.11435f, + 0.378931f, 0.420456f, 0.236331f, 0.49261f, -0.0666801f, + 0.0475846f, 0.906095f, -0.4146f, -0.020588f, -0.653285f, + 0.135335f, 0.543846f, -0.309061f, 0.11899f, -0.639168f, + -0.719994f, -0.219706f, -0.645631f, -0.829049f, -0.0114746f, + 0.834604f, 0.0378035f, 0.107957f, 0.546929f, -0.674395f, + -0.854817f, -1.1443f, 0.223413f, -0.326324f, 0.440971f, + 0.383582f, -0.495084f, 0.280091f, -0.53116f, 0.0333923f, + -0.354339f, -0.0449156f, -0.538896f, -0.753355f, 0.463995f, + 0.000969967f, -0.2832f, 0.587276f, 0.853094f, -0.481985f, + -0.138202f, 0.180989f, -0.349044f, -0.417534f, 0.455591f, + 0.287332f, 0.251496f, 0.381416f, 0.339632f, -0.0825727f, + 0.352739f, 0.161697f, -0.319764f, -0.258015f, 0.668833f, + -0.553303f, -0.578815f, -0.3758f, 0.289f, 0.247368f, + 0.00681103f, 0.421092f, -0.191033f, -0.425868f, -0.1239f, + 0.0540422f, -0.0856856f, 0.481168f, -0.0283741f, -0.196018f, + 0.230923f, -0.145288f, 0.52188f, 0.00628462f, -0.604556f, + 
-0.562879f, 0.319282f, 0.323799f, 0.453941f, 0.271129f, + -0.0520196f, 0.684571f, -0.391779f, -0.404614f, 0.134097f, + -0.825482f, 0.0913949f, 0.483543f, 0.159084f, 0.301637f, + 0.427013f, 0.196153f, 0.460091f, -0.730573f, -0.12278f, + 0.221665f, 0.674622f, -0.623363f, -0.0761517f, 0.637979f, + -0.468498f, 0.527276f, -0.596894f, -0.34675f, -0.251241f, + 0.418533f, -0.476696f, -0.901267f, -0.0088241f, -0.12421f, + -0.660316f, -0.0222117f, -0.470898f, -1.10739f, -0.441645f, + 0.39516f, -0.0117906f, 0.254122f, 0.00722599f, -1.00697f, + 0.48908f, -0.122287f, -0.378608f, -0.339145f, 0.682463f, + 0.305606f, 0.453628f, -0.49923f, -0.791388f, -0.202515f, + 0.23214f, -0.434209f, -0.778283f, -0.538015f, 0.145769f, + 0.446281f, -0.339329f, -0.198478f, -0.183717f, -0.855441f, + -0.105778f, 0.575067f, -0.18592f, -0.348094f, 0.740614f, + 0.041549f, -0.109663f, 0.0434492f, 0.245242f, -1.22192f, + 0.685896f, -0.208115f, -0.0616216f, -1.00552f, 0.31045f, + -0.184394f, 0.466705f, -0.0984364f, -0.506252f, 0.144874f, + 0.357038f, 0.675221f, -0.822171f, -0.52729f, 0.991212f, + 0.432422f, 0.383493f, -0.372395f, 0.35651f, -0.25369f, + 0.660208f, -0.117745f, -0.142433f, -0.724115f, -1.0035f, + -0.59178f, 0.563444f, -0.282531f, -0.599989f, 0.507424f, + -0.782875f, 0.755029f, -0.754962f, -0.617825f, 0.565984f, + -0.826878f, -0.456563f, 0.0212161f, 0.469867f, -0.144864f, + 0.225748f, -0.279029f, 0.21052f, -0.440183f, 0.936069f, + 0.170595f, 0.40966f, 0.452453f, -0.576006f, 1.50696f, + 0.649049f, 0.094957f, -0.167706f, -0.258342f, 0.59269f +}; + +static const float av1_intra_mode_cnn_partition_cnn_layer_0_bias[] = { + 0.00475215f, -0.00362332f, -0.00317542f, 0.190083f, 0.0488147f, + -0.0268093f, -0.00432231f, 0.0112229f, 0.0626653f, -0.0025698f, + 0.0018675f, -0.00368139f, -0.00159125f, -0.00034354f, 0.311437f, + 0.000136436f, 0.0667295f, 0.0251274f, 0.00226553f, -0.000638344f +}; + +static const float av1_intra_mode_cnn_partition_cnn_layer_1_kernel[] = { + 0.228403f, 0.241933f, 0.181079f, 0.101728f, 0.278455f, + -0.222078f, 0.387578f, 0.0847356f, -0.0737012f, 0.26518f, + -1.0817f, 0.0404161f, -0.805199f, 0.336576f, -0.541494f, + 0.246264f, 0.116597f, -0.756804f, -0.914136f, 0.410265f, + 0.413294f, 0.07873f, 0.450017f, -0.264346f, 0.549095f, + 1.03755f, -0.203542f, 1.61018f, 0.374131f, 0.402515f, + -2.36115f, 0.116427f, -0.172157f, -0.231482f, -0.905736f, + -0.0183059f, -0.575746f, 0.110348f, -0.268018f, 0.140399f, + 0.427196f, 0.0718528f, 0.247936f, -0.326661f, 0.150404f, + -0.659979f, -0.157148f, 0.00826241f, -0.679275f, -0.131564f, + -1.04822f, 1.06039f, -0.207898f, 0.510167f, 0.484233f, + 0.138972f, -0.0801639f, -0.184416f, 0.0741107f, -0.0299281f, + 0.112263f, 0.380071f, -0.0185269f, -0.0821188f, 0.918796f, + -0.576106f, 0.593007f, 0.479446f, 0.0440703f, 0.322379f, + 0.176783f, -0.147111f, 0.0953247f, -0.636377f, 0.0702104f, + 0.130979f, 0.293892f, -0.0112124f, -0.040347f, -0.16034f, + 0.3252f, -0.586802f, 0.601786f, -0.487148f, -0.458777f, + 0.463835f, 0.144942f, 0.00339965f, -0.779966f, 0.0585298f, + -1.20758f, -0.275614f, 0.292346f, -0.132781f, 0.337892f, + -0.357677f, 1.48511f, 0.172907f, -0.148668f, 0.243184f, + -0.503392f, -0.0791543f, 0.0265389f, -0.102267f, 0.213294f, + 0.0657801f, 0.156996f, 0.0891168f, 0.120805f, 0.261285f, + -0.343025f, -0.0792235f, -0.106415f, 0.133878f, -0.112981f, + -0.00151126f, -0.0643829f, 0.0458938f, -0.0452731f, -0.00147422f, + 0.1871f, -0.0208793f, 0.0752037f, 0.0794674f, 0.167666f, + 0.198028f, -0.361015f, -0.0661721f, -0.10672f, -0.0773641f, + -1.15856f, -0.516443f, 
-0.322702f, 0.15668f, 0.0075841f, + -0.157731f, 0.270926f, -0.241551f, 0.0169097f, -0.0263953f, + -0.303556f, -0.239237f, 0.117792f, -0.137871f, 0.122054f, + -0.587381f, 0.112938f, 0.0867262f, -0.27909f, -0.203622f, + -0.622195f, 0.42623f, 0.670704f, 0.190826f, -0.304979f, + -0.570075f, -0.240699f, 0.43744f, 0.632896f, -0.563846f, + -0.0160434f, -0.0709745f, 0.816662f, 0.269999f, -0.358734f, + 0.193644f, 1.19339f, -0.118223f, -0.363291f, -0.723616f, + -1.58825f, 0.0222856f, 0.769852f, 0.322713f, 0.0857619f, + -0.669756f, -1.08414f, 1.18593f, 0.486166f, -0.520646f, + 0.0861854f, -0.134197f, 0.258337f, 0.223345f, 0.697639f, + -0.57261f, 0.54031f, 0.892644f, 0.497572f, -0.287076f, + -1.95928f, -0.0568128f, -0.253335f, 0.00233392f, -0.192787f, + -0.115203f, -0.0975649f, 0.277954f, 0.000704534f, -0.315884f, + 0.309583f, 0.357458f, 0.0939298f, -0.072701f, 0.433045f, + -0.536938f, 0.534523f, 0.184585f, -0.0415175f, -0.120909f, + -1.2622f, 0.412449f, -0.114741f, 0.290453f, -0.441671f, + -0.0242497f, -0.20746f, 0.139019f, -0.422668f, -0.146732f, + -0.688828f, -0.00339426f, 0.04166f, 0.41755f, 0.405675f, + 0.562564f, 0.0216812f, 0.0271391f, 0.215227f, 0.328183f, + -1.6442f, -0.827838f, 0.115491f, 0.0951442f, -0.133779f, + -0.0482928f, 0.203177f, 0.322953f, -0.513259f, 0.0676788f, + -0.0877928f, 0.224448f, 0.451957f, 0.314243f, 0.307403f, + 0.35653f, 0.0286278f, 2.27554f, 0.569313f, -0.0488753f, + -2.48809f, 0.274555f, -0.248375f, -0.635634f, -0.187663f, + 0.1827f, -0.409634f, -0.0280568f, -0.207119f, -0.208192f, + -0.410268f, -0.017669f, 0.134856f, 0.434551f, 0.165201f, + 0.584608f, -0.389997f, -0.088713f, 0.118087f, 0.00210905f, + -1.07698f, -0.520967f, -0.198742f, 0.190255f, -0.162639f, + 0.0122759f, 0.460774f, -0.684633f, -0.149512f, 0.167556f, + -0.295034f, -0.0650964f, 0.0868653f, -0.691352f, 0.089795f, + 0.0620608f, 0.0531289f, 0.0124286f, 0.151921f, 1.51067f, + -0.10586f, -0.0311871f, 0.114706f, 0.0565205f, -0.159634f, + -0.423987f, -0.226896f, 0.0605352f, -0.36324f, -0.142205f, + -0.252249f, 0.0666312f, 0.316655f, 0.00687196f, 0.131079f, + -0.128281f, -0.293468f, 1.3327f, 0.542277f, -0.060088f, + -1.73475f, 0.0542297f, -0.227522f, -0.376004f, -0.147028f, + 0.0228252f, 0.0569538f, -0.0796497f, 0.0937596f, -0.0660153f, + -0.979219f, -0.377322f, 0.0523787f, 0.467299f, 0.0824278f, + 0.437147f, 0.263637f, 0.0325681f, 0.303581f, 0.353479f, + -0.142369f, -0.394797f, 0.597185f, 0.116482f, -0.0782593f, + 0.364539f, -0.30396f, 0.119016f, -0.0022429f, -0.044292f, + -0.0110531f, 0.233571f, 0.000975879f, 0.447332f, -0.0320396f, + 0.541609f, 0.14232f, 0.163905f, 0.848609f, 0.19954f, + -0.186591f, -0.44465f, -0.431672f, 0.159037f, -0.129977f, + -0.141778f, 0.246818f, -0.197539f, -0.70115f, 0.185449f, + 0.400274f, -0.0350744f, 0.239727f, -0.290504f, 0.0698443f, + -0.180374f, -0.759591f, -0.0569088f, -0.50246f, -0.0986616f, + -0.892114f, 0.306737f, -0.133937f, 0.285625f, 0.495471f, + -0.686222f, -0.168647f, -0.0926158f, 0.351772f, -0.0215394f, + 0.361223f, 0.0657142f, 0.268229f, -0.616299f, 0.0564718f, + -0.294013f, -0.588019f, 0.0234195f, -0.426863f, -0.511253f, + -0.72177f, 0.420903f, 0.0987506f, 0.309368f, 0.523532f, + 1.06073f, -0.33028f, 0.0818142f, 0.0130354f, 0.0180882f, + 0.0316898f, -0.416614f, -0.566344f, -0.163083f, 0.285085f, + -0.0534352f, 0.385496f, 0.151068f, -0.208295f, -0.175648f, + 0.0476705f, 0.190428f, -0.643391f, 0.484004f, -0.421836f, + -0.19829f, -0.227574f, -0.0869152f, 1.09881f, 0.345129f, + -0.236732f, -0.381935f, -1.46271f, 0.465914f, 0.610375f, + 0.689968f, -0.688546f, 1.95033f, 
0.420946f, 0.0282428f, + 0.147823f, 0.669393f, 0.429085f, -0.328385f, -0.150439f, + -0.419097f, -0.828102f, 0.248743f, 0.24644f, 0.0186131f, + -0.384319f, -0.126294f, -0.417067f, 0.271483f, -0.0128456f, + -0.881351f, 0.152581f, 0.185584f, -0.745827f, 0.0551359f, + 0.127083f, 0.936983f, -0.0225341f, 0.575861f, 0.767417f, + -0.140867f, -0.762518f, 0.422446f, -0.0611973f, 0.0515641f, + -0.144168f, -0.298882f, 0.308461f, 0.0208704f, 0.213872f, + -0.258708f, 1.13186f, 0.314083f, -0.347536f, -0.137768f, + 0.653953f, -0.217883f, -0.56112f, -0.864661f, 0.488836f, + 0.268133f, -0.548664f, -0.765226f, 0.117082f, 0.326798f, + -0.678246f, 0.477785f, -1.27584f, 0.198912f, -0.710395f, + 1.39096f, -0.411577f, -0.55119f, 0.51092f, -0.295023f, + 0.245983f, -0.0957192f, -0.312001f, 0.0175991f, 0.524423f, + -0.126379f, 0.124687f, -1.53945f, -0.342856f, 0.514072f, + 0.400884f, -0.00581101f, -0.219327f, 0.0977873f, 0.337551f, + -0.058603f, 0.20034f, 0.0429945f, 0.676803f, -0.273585f, + -0.173435f, -0.581596f, 0.226263f, -0.0946223f, -0.060088f, + -0.0100809f, -0.022242f, -0.22218f, -0.030463f, -0.141389f, + -0.190757f, -0.00526518f, -0.77519f, -0.0825695f, 0.308403f, + 0.262792f, -0.601842f, 0.0783697f, 0.197527f, 0.0714048f, + 0.0392629f, -0.388628f, 0.172541f, -0.0222009f, 0.252096f, + 0.0728652f, 0.173632f, 0.192914f, -0.00969965f, 0.0530136f, + -0.00765759f, 0.440234f, -0.0943323f, 0.112319f, 0.0878737f, + -0.739021f, 0.385305f, 0.133334f, -0.396697f, 0.177818f, + -0.0712558f, 0.516923f, 0.102174f, 0.17158f, -0.211068f, + 0.295795f, -0.36198f, 0.179087f, -0.845744f, -0.242514f, + -1.49073f, 0.272702f, 0.59011f, -0.408184f, -0.0731313f, + 0.234643f, 0.589642f, -0.100778f, 0.516921f, -0.700154f, + 0.316432f, 0.36117f, 0.0380282f, 0.480101f, -0.0975487f, + 0.941452f, 0.231705f, -0.151182f, -1.20305f, 0.28255f, + -0.0427662f, -0.00717175f, -0.842085f, -0.357376f, 0.545581f, + -0.290714f, 0.741498f, 1.00377f, 0.483864f, 0.150405f, + 0.0834512f, -0.10031f, 0.424054f, -0.0223491f, -0.0696701f, + -0.134479f, -0.747227f, 0.422208f, 0.123858f, -0.392624f, + -0.0299847f, -0.0376142f, -0.392536f, -0.0343114f, 0.298224f, + -0.375899f, 0.693119f, 0.27909f, -0.53463f, 0.105459f, + -0.0267383f, 0.5094f, -0.411557f, 0.451749f, -0.348479f, + -0.0497316f, -0.353913f, -0.14858f, 0.241838f, 0.331039f, + 0.756607f, -0.0701661f, -0.827264f, -0.367772f, 0.447201f, + 0.834616f, -0.00497265f, -0.0557285f, 0.055088f, -0.300115f, + -0.143833f, -1.07838f, -0.106896f, 0.16945f, 0.0170324f, + 0.108754f, 0.335893f, -0.0923708f, 0.450209f, -0.0713308f, + -0.0233037f, -0.0129902f, -1.40664f, -0.0996218f, 0.711236f, + 0.400716f, 0.227871f, 2.01499f, 0.572926f, 0.135673f, + -0.0340458f, -0.316736f, 0.24257f, -0.700768f, -0.194985f, + 0.312011f, -0.179599f, 0.128114f, 0.0725977f, -0.193816f, + 0.352143f, 0.070641f, -0.467808f, -0.399047f, 0.10136f, + 0.671574f, -0.553965f, 0.105729f, 0.210383f, 0.065048f, + 0.248198f, -0.731674f, 0.588725f, -0.308237f, 0.24511f, + 0.00608906f, 0.170906f, 0.246175f, 0.149521f, 0.106071f, + 0.160246f, 0.118487f, -0.104102f, 0.872823f, 0.227478f, + 0.0182631f, -0.115083f, 0.0142445f, 0.307947f, -0.884925f, + 0.0767105f, 0.0414042f, -0.448021f, -0.0400193f, -0.0765448f, + -0.411931f, -0.199624f, 0.333371f, 0.17267f, -0.0431816f, + 0.190826f, -0.0758961f, -1.02831f, -0.0414525f, 0.605374f, + -0.0188181f, -0.2207f, 1.30004f, -0.207005f, -0.0333617f, + 0.227145f, 0.105059f, -0.0473393f, -0.448752f, -0.0342152f, + -0.0244812f, 0.220329f, 0.0313591f, -0.0902074f, -0.0731945f, + 0.88488f, 0.306306f, -0.275613f, 
-0.476372f, 0.00678104f, + 0.442029f, 0.122049f, 0.118042f, 0.270527f, -0.462538f, + 0.0665021f, -0.260255f, 0.209182f, 0.162321f, 0.0629934f, + -0.244896f, -0.078863f, 0.655585f, -0.0506617f, -0.487128f, + 0.118765f, -0.34408f, 0.0930615f, -0.365632f, -0.0670776f, + 0.44428f, 0.286734f, 0.146608f, 0.686757f, -0.0738428f, + -0.10034f, -0.928438f, -0.172601f, -0.0959575f, -0.010532f, + 0.277549f, 0.28773f, -0.318883f, 0.71254f, 0.273593f, + -0.382845f, -0.0104587f, -0.647769f, 0.25541f, 0.194625f, + 0.265197f, -0.750938f, -0.0650515f, -0.567092f, 0.070613f, + 0.209531f, 0.429699f, 0.130676f, 0.514914f, 0.615778f, + 0.594535f, -0.0878778f, 0.40593f, -0.303383f, 0.0907863f, + -0.320068f, 0.0137162f, -0.303424f, 0.594207f, -0.236524f, + -0.692627f, -0.990063f, -0.0262934f, 0.222375f, 0.503412f, + 0.220224f, 0.676871f, -0.150996f, 0.379777f, 0.841339f, + -1.05981f, 0.259943f, -0.781745f, 0.0346478f, 0.115791f, + -0.25171f, -0.00872158f, 0.395561f, -0.0849893f, -1.20134f, + -0.313938f, 0.789542f, 0.159606f, -0.782095f, -0.229754f, + 0.266687f, -0.0354282f, -0.3041f, 0.0338618f, -0.390001f, + -0.28362f, -0.436144f, 0.777351f, 0.855321f, 0.653338f, + -0.0382912f, -0.204577f, 1.13828f, 0.220395f, -4.60853f, + 0.575694f, 0.0453189f, 1.76567f, 0.466151f, -0.366109f, + 0.594717f, 0.278891f, -0.750676f, -0.332739f, -0.942304f, + 0.280363f, 0.284561f, 0.209326f, 0.238347f, -0.0124311f, + -0.439463f, -0.036186f, 0.165997f, 0.374717f, -0.481148f, + -0.626417f, 0.0223598f, 0.039337f, -0.379918f, 0.211046f, + 0.0795812f, 0.863355f, -0.341448f, 0.421494f, 0.410477f, + -0.117025f, -0.511108f, 0.565193f, -0.063582f, -0.031349f, + -0.0750174f, 0.387941f, 0.541266f, 0.0919753f, 1.05041f, + 0.263004f, 0.289006f, 0.0439694f, -1.22439f, -0.247832f, + 0.260967f, 0.355794f, 0.599694f, -0.69418f, 0.372805f, + -0.161731f, 0.0720574f, 0.0394657f, 0.122772f, -0.458067f, + -0.370826f, -1.34495e-05f, -0.373404f, 0.0245539f, -2.3472f, + -2.61448f, 0.264794f, 0.0601582f, -0.968597f, -0.196022f, + -0.727067f, 0.167346f, 0.517478f, 0.0035377f, 0.777219f, + 0.553128f, 0.727211f, 0.606202f, -0.495604f, 2.41445f, + 0.465214f, -0.0443004f, 0.142972f, 0.141459f, -0.17771f, + 0.0156117f, 0.169264f, 0.0428022f, -0.164827f, -0.240632f, + 0.215289f, -0.213134f, -0.184163f, 0.0161321f, -0.20025f, + -0.0311616f, 0.00292108f, -0.0131921f, 0.0437664f, -0.104817f, + -0.131906f, 0.0822771f, 0.237307f, -0.347567f, -1.2485f, + 0.253616f, -0.442217f, 0.0514077f, 0.337561f, -0.0147658f, + -0.132888f, -0.643821f, 0.445573f, -0.0146213f, 0.235511f, + 0.53583f, -0.640644f, 0.0280044f, 0.00628834f, 0.143885f, + 0.380077f, -0.542342f, 0.363101f, 0.0647334f, -0.476556f, + -0.822676f, 0.482454f, -0.0467326f, -0.253083f, 0.116726f, + 0.317333f, 0.548131f, -0.234667f, 0.579923f, -0.420683f, + 0.595613f, -0.279864f, -0.753204f, -0.516844f, -0.436574f, + -0.120682f, -0.278939f, 0.752202f, -0.183443f, -0.14632f, + -0.0344068f, 0.127638f, -0.225245f, 0.489391f, 0.145082f, + -0.73672f, 0.980065f, -0.0367412f, 0.40632f, -0.802509f, + 0.356897f, 0.366172f, 1.23858f, -0.978381f, -0.684924f, + -0.0870693f, -0.353628f, 0.695788f, -0.244593f, -1.8897f, + -0.257803f, 0.686937f, 0.405155f, -0.125696f, 0.258075f, + 0.570584f, -0.439481f, -0.59798f, 0.0745711f, -0.235162f, + 0.133048f, -0.243033f, 0.0415527f, -0.00118735f, 0.00980514f, + -0.297429f, -0.144983f, 0.463093f, 0.0965441f, -0.338508f, + -0.651077f, 0.817577f, -0.0364773f, -0.388465f, 0.113288f, + 0.231198f, 0.316208f, -0.592201f, 0.530376f, -0.431434f, + 0.0200985f, 0.104303f, -0.130705f, 0.4374f, 0.362342f, 
+ 0.70641f, 0.20037f, 0.309128f, -0.484535f, -1.18469f, + 0.513893f, 0.201236f, -0.022396f, 0.179638f, -0.361289f, + -0.0794946f, -1.04704f, -0.0281103f, 0.0494822f, 0.00196415f, + 0.0625478f, -0.229033f, 0.12018f, 0.542629f, -0.222423f, + -0.0123321f, -0.0988525f, 0.773192f, -0.192218f, -3.19156f, + 0.300606f, 0.462751f, 2.2968f, 0.137182f, 0.132539f, + 0.165884f, 0.128818f, -0.155856f, -0.558538f, -0.231742f, + -0.244377f, -0.442397f, 0.250947f, 0.0850658f, -0.00820139f, + 0.391284f, 0.17453f, 0.306003f, -0.531499f, -0.624451f, + 0.564584f, -0.343953f, -0.0278713f, 0.212664f, -0.135969f, + -0.0179867f, -0.687887f, 0.371065f, -0.0537029f, 0.0499509f, + 0.0980684f, -0.0438569f, 0.186731f, 0.182105f, 0.172254f, + -0.149446f, -0.0247637f, 0.148098f, 1.20772f, -0.136664f, + 0.00983112f, 0.0181381f, -0.0147549f, -0.0846561f, -0.827022f, + 0.00207177f, 0.0478215f, 0.0652549f, 0.0898219f, -0.0224959f, + -0.0274246f, 0.0166498f, -0.0211715f, -0.502932f, 0.0961452f, + 0.251206f, -0.0623632f, 0.741566f, 0.0078449f, -2.99162f, + -0.187244f, 0.0743479f, 1.46425f, 0.0737923f, 0.0133544f, + 0.20922f, -0.178671f, -0.0528492f, -0.526717f, 0.0282125f, + -0.0363201f, 0.37406f, -0.303658f, -0.066803f, 0.132237f, + 0.962057f, -0.399733f, 0.191765f, -0.452606f, -0.348732f, + 0.444939f, 0.153025f, 0.0796317f, 0.265985f, -0.319638f, + 0.0278161f, -0.333734f, 0.226108f, 0.147895f, -0.124066f, + -0.37306f, 0.19541f, 0.200175f, -0.0593244f, 0.0333887f, + -0.0284278f, 0.462491f, 0.0686487f, -0.332435f, -0.437166f, + 0.302795f, 0.100542f, 0.0265019f, 0.767212f, -0.140621f, + 0.11558f, -0.70584f, -0.00017415f, 0.00793092f, -0.0490901f, + 0.0598338f, 0.484876f, -0.13025f, 0.660349f, 0.147503f, + -0.462766f, 0.0843824f, 0.218493f, 0.310921f, -0.162284f, + 0.210404f, -0.788799f, 0.0698512f, -0.484799f, 0.0311505f, + -0.308243f, 0.417298f, 0.0593723f, 0.208908f, 0.451437f, + 0.354546f, -0.0700888f, -0.281678f, -0.311177f, 0.00914652f, + -0.372084f, 0.135036f, 0.185393f, 0.461347f, -0.114241f, + -0.402347f, -0.692327f, 0.0376155f, -0.200267f, 0.565963f, + -0.0627442f, 0.429677f, 0.170514f, 0.350565f, 0.699528f, + -0.948126f, -0.364205f, 0.348878f, -0.137832f, -0.0791649f, + -0.0462295f, -0.255078f, -0.398509f, 0.136783f, -0.0164628f, + -0.555472f, 0.690396f, 0.147715f, 0.000523095f, 0.14874f, + 0.524804f, 0.162974f, 0.797599f, 0.277473f, -0.500696f, + 0.189917f, -0.333309f, 0.00613646f, -1.07817f, 0.0470502f, + 0.210766f, 0.159768f, -0.447774f, -0.252968f, -1.72739f, + 0.0658259f, -0.448747f, 2.26511f, 0.349651f, 0.157232f, + 0.956842f, 0.856676f, 0.149227f, -0.626957f, -0.566771f, + -0.0980846f, 0.351668f, -0.362741f, -0.0272282f, -0.113632f, + 0.366015f, -0.00790003f, -0.458632f, -0.31157f, -0.182257f, + -0.953975f, 0.0583582f, 0.164721f, -0.900107f, -0.115542f, + 0.0654192f, 0.99056f, -0.247976f, 0.48254f, 0.670196f, + 0.098585f, -0.212855f, 0.310072f, 0.0894616f, 0.151944f, + 0.119629f, -0.26735f, 0.162257f, -0.0305818f, 0.681526f, + -0.229847f, 1.01556f, 0.29132f, 0.740113f, 0.0703937f, + 0.537892f, -0.18653f, -0.0252359f, -0.420014f, 0.197631f, + -0.176629f, 0.00674754f, 0.301288f, -0.162816f, 0.636235f, + -0.341362f, 0.197296f, -0.589747f, -0.749363f, -0.277197f, + -1.27291f, -0.0857908f, -0.147591f, -0.0956297f, -0.109097f, + 0.0717554f, 0.359078f, 0.301457f, 0.486934f, -0.260955f, + -0.126821f, 1.55756f, 0.477469f, -1.45363f, 1.42198f, + -0.360847f, -0.0211924f, -0.0184957f, -0.110706f, -0.152136f, + 0.104703f, 0.267615f, 0.127392f, 0.172996f, 0.258326f, + 0.268578f, -0.431123f, -0.114419f, 0.0101172f, 
-0.195671f, + 0.0792025f, -0.151505f, -0.064077f, 0.0479777f, -0.141882f, + 0.121492f, -0.139132f, -0.348252f, 0.341043f, -0.565367f, + -0.0791259f, -0.781086f, 0.0140045f, 0.571094f, -0.00875077f, + 0.217132f, -0.202345f, 0.157213f, 0.228445f, 0.366612f, + -0.529989f, 0.42241f, -0.540538f, -0.0425556f, -0.207774f, + -0.0663941f, 0.37836f, -0.0650245f, -0.0828694f, -0.0835478f, + -0.795512f, 0.470268f, 0.1551f, -0.69017f, -0.116735f, + 0.157614f, 0.555973f, -0.293311f, 0.245428f, -0.0853701f, + -0.449278f, -0.0551647f, -0.00137429f, 0.709439f, -0.456796f, + 0.132062f, -0.0449484f, -0.308599f, 0.180608f, -2.24196f, + 0.421478f, -0.640946f, -0.460397f, -0.920628f, -0.184949f, + -0.0416982f, 0.6484f, -0.22806f, 0.412229f, -0.468079f, + -0.72372f, -0.347698f, -1.3899f, 0.631876f, 0.0611046f, + 0.0294258f, -0.128091f, -0.205615f, 0.355348f, -0.267725f, + -0.644835f, 0.435879f, 0.517477f, -0.338123f, -0.157764f, + 0.32762f, -0.166454f, 0.221007f, -0.0438278f, -0.0777725f, + 0.10986f, 0.941545f, -0.542284f, -0.172312f, -0.256597f, + -0.0181391f, 0.220623f, -0.432456f, 0.0164074f, 0.250226f, + -0.522576f, 0.783109f, 0.198703f, -0.784554f, -0.0929628f, + 0.326861f, 0.470293f, 0.442684f, 0.271879f, -0.108256f, + 0.0483558f, -0.403151f, 0.36183f, -0.268186f, 0.270851f, + -0.696826f, -0.166037f, -0.354658f, 0.405977f, -0.473447f, + 0.649689f, -0.0863114f, -0.147319f, 0.0869966f, 0.319792f, + 0.493026f, -1.07456f, 0.354751f, 0.114605f, -0.120647f, + -0.238315f, 0.0290955f, -0.355299f, -0.45381f, 0.0812865f, + -0.0180434f, 0.00861318f, -0.892943f, -0.0127801f, -1.66398f, + 0.290505f, 0.126832f, 2.08173f, -0.0454847f, -0.162481f, + 1.07426f, 0.228566f, 0.280528f, -0.537625f, -0.175288f, + -0.118012f, 0.649114f, -0.349926f, -0.0189864f, -0.30934f, + -0.363178f, -0.119822f, -0.22656f, 0.484513f, -0.173269f, + 0.41987f, -0.448517f, -0.0950466f, 0.482443f, 0.061558f, + 0.4219f, -0.536388f, 0.0781972f, 0.212489f, 0.104229f, + -0.0792804f, 0.402066f, -0.676313f, -0.2272f, -0.16379f, + 0.260145f, -0.0504658f, -0.0826579f, -1.37749f, 0.00790747f, + 0.0841031f, -0.0671308f, -0.00301736f, -0.386206f, 0.190311f, + 0.0702639f, 0.0643968f, 0.133741f, -0.0141555f, -0.0365324f, + 0.87028f, 0.207894f, -0.421266f, 0.689256f, 0.145037f, + -0.270796f, 0.212604f, -0.345326f, 0.0074631f, -1.72379f, + 0.0672097f, -0.273153f, 1.30503f, -1.01324f, 0.00284696f, + 0.851459f, 0.176847f, 0.30948f, -0.57144f, -0.0596695f, + -0.111189f, 0.130361f, -0.298286f, 0.0567591f, -0.0885215f, + -0.847601f, 0.238624f, -0.162391f, 0.452357f, -0.0192713f, + 0.226661f, 0.0762922f, -0.0894055f, 0.332702f, 0.424484f, + 0.0443207f, -0.162345f, -0.601036f, 0.280527f, -0.137362f, + 0.266345f, 0.729438f, -0.887182f, 0.152943f, -0.573548f, + -0.0201383f, -0.56521f, 0.033582f, 0.300284f, -0.144472f, + 0.633026f, 0.30866f, 0.0653073f, 0.316901f, 0.0721326f, + 0.192252f, -0.833162f, 0.194292f, -0.08663f, -0.189401f, + -0.178242f, 0.111488f, 0.522487f, -0.65497f, 0.457049f, + 0.390654f, 0.0522936f, -0.39712f, -0.293717f, -0.374656f, + -0.118916f, -0.853076f, -0.0829578f, -0.17335f, -0.0218694f, + 0.367968f, 0.478469f, 0.0913813f, 0.519251f, 0.803526f, + -0.272516f, -0.341329f, 0.0897285f, 0.247653f, 0.000898686f, + 0.313196f, 0.000587979f, -0.314189f, -0.449439f, -0.0291611f, + -0.356287f, -0.722904f, -0.0480958f, -0.523758f, -0.576146f, + 0.133754f, 0.616921f, -0.085494f, 0.487487f, 0.745129f, + 0.993267f, 0.256555f, 0.0822743f, 0.0411971f, 0.139388f +}; + +static const float av1_intra_mode_cnn_partition_cnn_layer_1_bias[] = { + 0.00447951f, 0.0202534f, 
0.00970833f, -0.00460874f, 0.0942288f, + -0.0534704f, 0.00829869f, -0.0255174f, -0.0809143f, 0.00169117f, + 0.0177427f, 0.0259387f, 0.0291077f, -0.0267599f, 0.100275f, + -0.00389366f, 0.0315499f, 0.0265846f, -0.000206604f, 0.0302221f +}; + +static const float av1_intra_mode_cnn_partition_cnn_layer_2_kernel[] = { + 0.153048f, 0.0725422f, 0.068901f, -0.475608f, 0.0736706f, + -0.134076f, 0.229289f, 0.0217921f, 0.0449205f, -1.00002f, + 0.149133f, 0.0497258f, 0.118988f, 0.0741764f, 0.0385486f, + 0.225181f, 0.012966f, 0.155593f, -3.07175f, -0.0641051f, + 0.09161f, 0.0259005f, -0.209998f, -0.420298f, 0.0587126f, + 0.00352744f, 0.0451313f, -0.049384f, 0.11516f, 0.083135f, + 0.103675f, -0.0185604f, 0.0623248f, -0.0993726f, 0.0448522f, + 0.0134017f, -0.294776f, -0.251924f, 0.0712635f, -0.0764298f, + -0.463766f, -0.0295011f, -0.579168f, 0.573853f, -0.00596607f, + 0.0237762f, -0.0500104f, -0.0969275f, 0.155573f, 0.0515382f, + -0.178454f, -0.154008f, -0.278299f, -0.166421f, 0.0149533f, + -0.0700236f, 0.239287f, -1.19545f, -0.0744625f, 0.143037f, + 0.141874f, 0.086302f, 0.0838633f, -0.454179f, 0.120308f, + -0.0896718f, 0.254909f, 0.0714462f, 0.00471098f, -0.869494f, + 0.209407f, 0.138285f, 0.0816641f, 0.0666266f, 0.0848555f, + 0.173313f, 0.0695633f, 0.285667f, -3.15384f, 0.00140275f, + -0.969824f, -0.0318689f, -0.00487396f, 0.412541f, 0.0263593f, + -0.249824f, 0.0897776f, 0.0208836f, -0.0982745f, -0.16049f, + -0.12719f, -0.186166f, 0.102338f, 0.273931f, -0.0886306f, + -0.19513f, -0.0135712f, -0.194127f, -0.0834291f, 0.426623f, + -0.0705446f, 0.0327476f, 0.0800862f, 0.478757f, -0.00849111f, + -0.554911f, -0.0489312f, -0.184029f, -0.227428f, 0.159989f, + -0.0677731f, -0.0901436f, 0.00308696f, -0.352243f, 0.278715f, + 0.306374f, -0.0772054f, -0.0122733f, -0.0693457f, 0.074365f, + -0.267458f, -0.123612f, -0.495954f, 0.552604f, -0.103951f, + -0.121771f, 0.179966f, -0.377947f, -1.35472f, 0.153294f, + -0.445284f, -0.089813f, -0.00529807f, 0.254047f, -0.0378426f, + 0.114597f, -0.143052f, 0.0815258f, -0.10528f, 0.00833533f, + -0.117508f, 0.129052f, 0.0706719f, -1.39506f, 0.0124731f, + 0.109831f, -0.0744156f, 0.181612f, 0.0787894f, 0.0293352f, + 0.494929f, 0.00997207f, -0.585882f, -0.0844138f, -0.00864134f, + -0.109943f, 0.0713114f, 0.14883f, 0.0610554f, 0.204145f, + -0.00390313f, 0.0184763f, -0.111387f, 0.175442f, -0.0840215f, + -0.178785f, -0.0693612f, -0.254507f, -0.191549f, 0.501561f, + -0.0858995f, -0.164921f, 0.0250706f, -0.0916282f, 0.247085f, + 0.13877f, -0.419487f, -0.295065f, -0.213812f, -0.10362f, + 0.138243f, 0.086985f, 0.113633f, -0.459273f, 0.12388f, + -0.139296f, 0.253792f, 0.0421624f, 0.0665065f, -0.977282f, + 0.199927f, 0.115194f, 0.099045f, 0.0534806f, 0.089283f, + 0.0815367f, 0.150901f, 0.253458f, -3.24825f, -0.0118163f, + -0.544565f, 0.0201825f, -0.0682201f, 0.759028f, 0.00479696f, + -0.00625607f, 0.058007f, -0.0811189f, -0.114617f, -0.0998578f, + 0.133312f, 0.0246256f, -0.0167416f, 0.196118f, 0.109823f, + 0.109489f, 0.474682f, -0.763475f, 0.0818745f, 0.0798777f, + -0.0994905f, -0.00138143f, -0.108563f, 0.697289f, -0.103702f, + -0.306085f, -0.0996705f, -0.142618f, -0.130989f, 0.0813303f, + -0.0909275f, -0.10786f, -0.0280431f, 0.206877f, -1.70798f, + 0.525568f, 0.559891f, -0.166132f, -0.227574f, -0.150955f, + 0.0849226f, 0.00497342f, -0.168667f, -0.282575f, 0.00537805f, + -0.0185572f, 0.0607167f, -0.0534948f, -0.0215776f, -0.14825f, + -0.0164577f, -0.0611978f, 0.0347562f, 0.286917f, 0.226598f, + 0.149497f, -0.478101f, -0.246006f, 0.0663239f, -0.121728f, + 0.267087f, 0.0802681f, -0.184741f, 
-0.558267f, 0.0437066f, + 0.13816f, -0.0710939f, 0.0725697f, 0.339857f, 0.161069f, + 0.304871f, 0.108138f, 0.193396f, 0.0891607f, -0.0701939f, + -0.182038f, -0.451873f, -0.233883f, 0.0444747f, 0.0436545f, + -0.245894f, -0.0721136f, 0.309013f, 0.278996f, 0.0259377f, + 0.0278116f, 0.0686773f, -0.271237f, 0.235082f, -0.0778285f, + -0.456541f, -0.109303f, -0.074565f, -0.407301f, -0.162191f, + -0.801819f, 0.372435f, -0.559083f, -0.039189f, 0.0477762f, + 0.0875363f, 0.0699926f, 0.116552f, -0.308217f, 0.0341607f, + -0.14202f, 0.135517f, 0.0316971f, 0.153297f, -0.759722f, + 0.12849f, 0.114229f, 0.0814893f, 0.275402f, 0.0403976f, + 0.0357503f, 0.212295f, 0.0673998f, -2.59822f, -0.0475021f, + -0.0594725f, 0.0659163f, 0.0469717f, -0.0370461f, -0.12863f, + -0.381743f, -0.0445055f, -0.106843f, -0.0880648f, 0.00591106f, + 0.235514f, -0.165162f, -0.0696645f, 0.115374f, 0.245558f, + 0.192049f, -0.388628f, -0.48291f, 0.154313f, -0.160207f, + 0.125928f, 0.122039f, 0.0713794f, -0.161244f, 0.128082f, + -0.234659f, 0.0680219f, 0.0597933f, 0.208421f, -0.163623f, + 0.196873f, 0.156603f, 0.184179f, -0.278331f, -0.0481286f, + 0.0828152f, 0.247004f, 0.0915582f, -0.0906229f, -0.20376f, + 0.136593f, 0.0740336f, -0.0134935f, -0.355048f, 0.0898485f, + -0.0962068f, 0.185804f, -0.0145596f, 0.0966589f, -0.515784f, + 0.121602f, 0.0320428f, 0.11093f, -0.0559421f, 0.0355484f, + 0.192128f, 0.0500888f, 0.133641f, -1.73282f, -0.0624599f, + 0.122524f, 0.0757292f, -0.0974648f, -0.193649f, 0.0561096f, + 0.0159959f, 0.0334472f, -0.0168832f, -0.12386f, -0.112419f, + 0.19552f, 0.0308502f, 0.0537643f, -0.0181012f, 0.0392183f, + 0.0461833f, -0.52623f, -0.238252f, 0.0821762f, -0.212384f, + 0.112901f, 0.096063f, 0.0540225f, 0.0773583f, 0.143045f, + -0.101551f, 0.282418f, 0.0176749f, -0.00244542f, -0.780154f, + -0.254428f, -5.82215f, 0.106638f, 0.11746f, 0.0486823f, + 0.164562f, 0.0303006f, 0.229614f, -2.41845f, -0.117122f, + 0.0451654f, 0.0237383f, -0.208731f, 0.0721137f, 0.0761163f, + -0.0569416f, -0.00830511f, -0.045256f, 0.14535f, -0.0189222f, + -0.283363f, -3.15502f, 0.0971161f, -0.035913f, 0.00813281f, + 0.0187974f, -0.361573f, -0.302067f, 0.118014f, -0.0956148f, + -0.596567f, 0.0105443f, -0.49019f, -0.0801959f, 0.0322344f, + -0.0280032f, 0.0555038f, -0.111495f, -0.0994456f, 0.0178021f, + 0.0358362f, 1.07063f, -0.0833138f, 0.0621246f, 0.0637157f, + 0.0999207f, 0.191975f, -1.2811f, 0.0341681f, 0.14818f, + 0.0957259f, 0.109909f, 0.0566115f, 0.0585633f, 0.179939f, + -0.104372f, 0.309091f, 0.0172941f, 0.0243182f, -0.935252f, + -0.296257f, -5.83634f, 0.0899249f, 0.455347f, 0.129505f, + 0.220212f, 0.0214801f, 0.284802f, -2.94585f, -0.0805413f, + -1.01819f, 0.00534034f, -0.057203f, 0.0869331f, 0.0207575f, + -0.124479f, -0.0465806f, 0.0894252f, 0.32203f, 0.0858497f, + 0.25178f, 0.0932205f, 0.0888455f, 0.233153f, -0.446398f, + -0.00791233f, 0.0909603f, -0.0904397f, 0.131835f, 0.475597f, + -0.1236f, 0.0231622f, 0.138602f, -0.097731f, -0.0282484f, + -0.549095f, -0.0457428f, -0.0895407f, -0.293965f, 0.166872f, + 0.46719f, 0.236254f, 0.0615991f, 0.499236f, 0.540366f, + 0.402035f, 0.0606324f, -0.0499928f, -0.0155198f, 0.0994403f, + -0.14773f, -0.183433f, -0.612093f, -0.334201f, -0.110877f, + -0.143441f, 0.05815f, -0.318586f, -0.344235f, 0.199593f, + 0.51109f, -0.252281f, -0.028834f, 0.0615421f, 0.0623699f, + 0.210745f, -0.236448f, 0.166279f, 0.127516f, -0.0971157f, + -0.204389f, 0.208112f, 0.0377023f, 0.271837f, -0.00859528f, + 0.0797081f, -0.00582115f, 0.140018f, -0.384865f, -0.0853243f, + -0.586727f, -0.0664489f, -0.631436f, -0.245828f, 
-0.0647894f, + -0.171912f, -0.0801706f, 0.0731614f, -0.11725f, 0.281478f, + -0.03047f, 0.0363488f, -0.0481651f, -0.326329f, -0.0155898f, + -0.428316f, -0.0989367f, -0.271902f, -0.00263837f, 0.366168f, + 0.325989f, 0.165463f, 0.0668512f, -0.142202f, 0.419992f, + 0.164971f, -0.515479f, -0.187585f, -0.151783f, -0.0682468f, + 0.0910191f, 0.117086f, 0.106579f, 0.0961825f, 0.162148f, + -0.129645f, 0.301039f, 0.000320343f, -0.0558097f, -0.844295f, + -0.218919f, -5.7571f, 0.0982612f, 0.238955f, 0.0703565f, + 0.0969388f, 0.107202f, 0.321585f, -3.00594f, -0.058755f, + -0.620004f, 0.052114f, 0.128423f, -0.177673f, -0.00341509f, + -0.146756f, -0.0414309f, -0.0893262f, -0.0584779f, -0.129552f, + 0.127629f, 0.13275f, -0.0973342f, -0.215617f, 0.0724309f, + 0.0102229f, 0.178137f, -0.943374f, -0.171465f, 0.304949f, + -0.0963836f, -0.0346437f, -0.138667f, -0.234184f, 0.0344159f, + -0.319592f, -0.0990766f, -0.16065f, 0.369432f, 0.194911f, + 0.363348f, -0.356009f, -0.00736217f, 0.241788f, -2.21311f, + 0.704816f, 0.697019f, 0.129186f, -0.132799f, -0.11861f, + 0.0383451f, 0.0247782f, -0.12687f, 0.0256552f, 0.048413f, + 0.00660549f, 0.0457962f, -0.012819f, 0.115991f, -0.1117f, + -0.291045f, -0.646138f, 0.0813613f, 0.112063f, 0.191675f, + 0.120835f, -0.444267f, -0.340385f, 0.0391936f, -0.151132f, + 0.184419f, 0.124998f, -0.14089f, 0.214087f, 0.00108535f, + 0.119611f, 0.0236965f, 0.0715074f, -0.225997f, -0.0126552f, + -0.459214f, -0.490444f, 0.173716f, 0.355811f, -0.13607f, + -0.191091f, -0.530085f, -0.400666f, 0.011221f, 0.10527f, + -0.11498f, -0.011864f, 0.364376f, 0.0319587f, -0.0528563f, + 0.0353899f, 0.0393453f, -0.289211f, -0.347785f, -0.0417157f, + 0.545848f, 0.741785f, -0.0732565f, -1.29687f, -0.0433128f, + -1.44162f, 0.318894f, -0.377784f, 0.123751f, -0.00444347f, + 0.0957118f, 0.0893616f, 0.0911595f, 0.092917f, 0.127681f, + -0.159929f, 0.190417f, -0.0297948f, -0.00132599f, -0.742756f, + -0.0364169f, -4.00108f, 0.0784767f, 0.223048f, 0.0430138f, + 0.0180493f, 0.212842f, 0.122987f, -2.83267f, -0.0641464f, + -0.173247f, 0.100946f, 0.0804885f, 0.0172631f, 0.0877408f, + -0.353222f, 0.0108262f, -0.0452121f, -0.116127f, 0.268154f, + -0.132587f, -0.27481f, -0.0316914f, 0.0610525f, 0.439691f, + 0.00966415f, -0.78962f, -0.424823f, -0.0214365f, -0.113846f, + 0.100793f, 0.126482f, 0.0415354f, 0.0427995f, 0.14273f, + -0.315674f, 0.110095f, 0.0061568f, 0.0320474f, -0.3596f, + -0.12533f, -1.28837f, 0.174673f, -0.235912f, 0.00495439f, + 0.0695473f, 0.266489f, 0.049248f, 0.0868526f, -0.0685969f, + 0.102984f, 0.0924639f, -0.027535f, 0.0709277f, 0.155776f, + -0.190944f, 0.188273f, -0.00897471f, 0.0964232f, -0.475822f, + -0.209374f, -5.00252f, 0.103495f, 0.110698f, 0.00682092f, + 0.208586f, 0.0489575f, 0.0966254f, -1.42973f, -0.0645128f, + 0.0515961f, 0.0571281f, -0.0992321f, 0.00791648f, 0.0087609f, + 0.0607367f, 0.0315705f, 0.0183317f, 0.0756087f, -0.0292847f, + -0.212932f, -0.782259f, 0.0899944f, 0.102677f, 0.0681135f, + 0.0447764f, -0.481969f, -0.221459f, 0.0794475f, -0.229157f, + 0.136781f, 0.0832359f, 0.0297807f, -0.00287225f, -5.97897f, + -0.0960581f, 0.250945f, -0.00133314f, -0.112396f, -0.856922f, + 0.115776f, 0.124536f, 0.0914194f, -0.160775f, 0.128684f, + 0.106718f, 0.100665f, 0.139579f, -0.86141f, -0.190323f, + 0.0884896f, 0.0363845f, -0.19831f, 0.121601f, 0.0264453f, + -0.00557822f, 0.0720238f, -0.0140132f, -0.166814f, -0.266214f, + 0.00500545f, 0.0146905f, 0.126035f, 0.0812372f, 0.0615973f, + 0.0766063f, -0.420156f, -0.126157f, -0.0284299f, -0.112513f, + -0.567008f, -0.0100263f, -0.607567f, 0.193053f, 
0.0067527f, + -0.0753897f, 0.00134269f, -0.0512249f, -0.161661f, 0.0667741f, + -0.113702f, -0.071606f, -0.300563f, 0.276479f, -0.155318f, + -0.0512306f, 0.0896443f, -0.987911f, 0.0440889f, 0.430958f, + 0.175427f, 0.101385f, 0.0303662f, 0.0672653f, -6.62463f, + -0.10475f, 0.228249f, -0.00482173f, -0.0608713f, -0.895836f, + 0.187976f, 0.162173f, 0.0747544f, 0.219953f, 0.0682489f, + 0.142665f, 0.100287f, 0.301887f, -1.97736f, -0.295001f, + -1.0733f, -0.0562668f, -0.0604295f, 0.0304073f, 0.194274f, + -0.243593f, 0.0727137f, 0.0610967f, -0.0692415f, -0.02967f, + 0.055633f, 0.0192402f, 0.105841f, 0.102236f, -0.0757102f, + -0.0067639f, 0.0102317f, -0.257959f, -0.0638652f, 0.45521f, + -0.114967f, 0.0921177f, 0.223796f, 0.277072f, -0.0613282f, + -0.564693f, -0.151333f, -0.158035f, 0.228491f, 0.12997f, + -0.192625f, -0.125344f, 0.0983258f, -0.931206f, 0.618715f, + 0.273759f, -0.145527f, -0.099431f, -0.119551f, 0.0663484f, + -0.161419f, -0.202377f, -0.545393f, 0.0917645f, 0.042263f, + -0.17117f, -0.178622f, -0.336977f, 0.866715f, 0.0376922f, + -0.319728f, -0.127406f, 0.0599384f, 0.268804f, -0.0331844f, + 0.355326f, -0.103902f, 0.0425935f, 0.00525512f, -0.133687f, + -0.122695f, 0.145582f, 0.139013f, -0.0053352f, 0.0313566f, + 0.327295f, -0.0117993f, 0.233524f, 0.162388f, -0.0793262f, + 0.454543f, 0.0442224f, -0.742673f, -0.144882f, 0.0874983f, + -0.0707259f, 0.0219869f, 0.201728f, 0.0204537f, 0.0788857f, + -0.0374329f, 0.0724169f, 0.0743593f, -0.0193526f, -0.313546f, + -0.418882f, -0.0815754f, -0.197144f, 0.305053f, 0.330196f, + -0.131006f, -0.00113249f, 0.0750458f, -0.541764f, 0.299935f, + 0.308516f, -0.20547f, -0.333066f, 0.0285833f, 0.191147f, + 0.160372f, 0.0724649f, 0.0426326f, 0.153046f, -6.59656f, + -0.081237f, 0.219163f, 0.0147081f, -0.0109837f, -1.01487f, + 0.170055f, 0.163386f, 0.106413f, 0.150188f, 0.0688875f, + 0.0541359f, 0.156307f, 0.178844f, -1.51054f, -0.149477f, + -0.504503f, 0.017878f, -0.181821f, -0.0999659f, 0.0484548f, + -0.32211f, 0.0406744f, 0.0017627f, 0.0220593f, 0.0900512f, + -0.561625f, 0.107279f, -0.0861521f, -0.0862376f, 0.0816765f, + 0.168072f, 0.150063f, -0.816825f, -0.13569f, 0.557555f, + -0.155265f, 0.025135f, -0.109304f, -0.0487062f, -0.00347487f, + -0.454803f, -0.0394371f, -0.214597f, -0.248898f, 0.286501f, + -0.249246f, -0.138935f, 0.00391409f, -0.122544f, -2.14993f, + 0.588942f, 0.541231f, 0.0154047f, -0.359742f, 0.0520729f, + 0.0667058f, 0.0418163f, -0.132533f, -0.184759f, 0.0546118f, + -0.131198f, 0.109664f, -0.0714679f, -0.114163f, -0.243081f, + -0.0405089f, 0.0342795f, 0.0801825f, -0.268408f, 0.192207f, + 0.0800494f, -0.586539f, -0.118155f, -0.0508569f, -0.193987f, + 0.261478f, 0.105719f, -0.125361f, -0.0956201f, 0.0233802f, + 0.271098f, 0.0113352f, 0.0910447f, 0.00628244f, -0.071722f, + 0.21439f, 0.0747191f, 0.207765f, -0.0782454f, -0.0151716f, + -0.196505f, -0.44798f, -0.228597f, 0.0549039f, -0.120715f, + -0.19388f, -0.0768461f, 0.361102f, 0.122936f, -0.0334211f, + -0.202503f, -0.0450776f, -0.272345f, 0.662321f, 0.109247f, + -0.218026f, -0.0669386f, -0.0864701f, -0.633421f, -0.158007f, + -1.10778f, 0.351211f, -0.541458f, -0.0171707f, 0.149606f, + 0.106105f, 0.0880349f, 0.0968455f, 0.113269f, -5.01949f, + -0.106404f, 0.175578f, -0.030045f, -0.0267249f, -0.563713f, + 0.173885f, 0.130772f, 0.0334519f, 0.0770157f, 0.0394389f, + -0.0290326f, 0.220003f, 0.180901f, -1.62203f, -0.151858f, + -0.202386f, -0.0067836f, 0.0287665f, -0.194183f, -0.239834f, + -0.484159f, 0.00671722f, -0.122459f, 0.0808959f, -0.263769f, + -0.015066f, -0.0429868f, -0.111255f, -0.231872f, 
0.219659f, + -0.0437412f, -0.536618f, -0.477831f, 0.0421895f, -0.0815851f, + 0.119638f, 0.0786293f, -0.000668378f, 0.0305567f, -0.0868189f, + -0.178327f, 0.0799657f, 0.0280923f, -0.211395f, -0.464577f, + 0.216912f, 0.0761976f, 0.160288f, -0.416372f, -0.10286f, + -0.0733786f, 0.261033f, 0.0493698f, 0.143137f, -0.179979f, + 0.15655f, 0.0897976f, -0.0258041f, -0.152852f, -6.15512f, + -0.118917f, 0.227283f, -0.0514043f, -0.0786432f, -0.523485f, + 0.1644f, 0.0869001f, 0.0984082f, -0.428288f, 0.0791992f, + 0.141904f, 0.0652073f, 0.104429f, -0.775125f, -0.121479f, + 0.0841637f, 0.0135705f, -0.208863f, -0.0629523f, 0.0455794f, + 0.0513898f, -0.0147657f, 0.0401145f, 0.0660079f, 0.0210609f, + -0.0151801f, 0.0562111f, 0.140308f, -0.0196394f, 0.0230753f, + -0.0336115f, -0.422411f, -0.196974f, -0.0405748f, -0.283428f, + 0.15458f, 0.0876296f, 0.0314038f, 0.16389f, -7.01385f, + -0.117146f, 0.197273f, -0.0400688f, 0.0143951f, -0.964007f, + -0.0618919f, 0.0406891f, 0.07992f, -0.144132f, 0.116416f, + 0.0326838f, 0.103641f, 0.171805f, -1.05158f, -0.182589f, + 0.116991f, 0.0530774f, -0.212454f, -0.016727f, -0.0565992f, + 0.0712873f, 0.0445466f, -0.000107032f, -0.121449f, -0.15148f, + 0.0220338f, 0.0762024f, 0.12253f, 0.0622466f, 0.0835822f, + 0.0465119f, -0.388743f, -0.34665f, -0.0720734f, -0.101581f, + -0.630565f, -0.0512685f, -0.520541f, 0.0530119f, -0.0245276f, + -0.19116f, -0.0144446f, -0.0604486f, 0.187251f, -0.021341f, + -0.217823f, 0.0510256f, -0.197946f, 0.060955f, -0.0617316f, + 0.0741673f, 0.117591f, -1.47844f, -0.0911093f, 0.359225f, + 0.145027f, 0.127513f, 0.0617905f, 0.141154f, -7.63868f, + -0.0808127f, 0.274843f, 0.00693195f, -0.0283113f, -0.853871f, + -0.15737f, 0.0858904f, 0.0746279f, 0.109912f, 0.193775f, + 0.0698094f, 0.174159f, 0.259556f, -1.49885f, -0.156706f, + -1.04113f, -0.0329546f, -0.0491449f, -0.0304125f, 0.0514892f, + -0.244284f, 0.126814f, -0.0387081f, -0.153173f, -0.0566748f, + 0.294111f, -0.0170534f, 0.102381f, 0.447606f, -0.0613267f, + -0.0636869f, -0.0347599f, -0.259572f, -0.0657846f, 0.454352f, + -0.169453f, -0.00177987f, 0.133279f, -0.0863932f, -0.134423f, + -0.475107f, -0.00448962f, -0.214607f, 0.111413f, 0.194377f, + -0.0710837f, 0.0562353f, 0.0401193f, 0.248595f, 0.538374f, + 0.449469f, -0.39111f, 0.0125057f, 0.0448811f, -0.00707751f, + -0.164894f, -0.317516f, -0.56231f, -0.270262f, 0.127016f, + -0.12092f, -0.0881587f, -0.323908f, 0.872344f, 0.103391f, + 0.267971f, -0.155088f, -0.0136683f, 0.309517f, 0.119901f, + 0.271307f, -0.188463f, 0.185121f, -0.142777f, -0.110535f, + -0.163107f, 0.175502f, 0.0801924f, 0.240499f, 0.0874759f, + 0.308907f, -0.00222504f, 0.193366f, 0.109018f, -0.0772158f, + -0.520675f, 0.0259432f, -0.736666f, -0.296579f, 0.043486f, + -0.128932f, 0.0417669f, 0.125747f, 0.157879f, 0.112857f, + -0.0595681f, 0.0611936f, -0.042125f, -0.270338f, 0.120072f, + -0.36675f, -0.0347962f, -0.119539f, 0.0873369f, 0.296432f, + -0.069501f, -0.0383859f, 0.0913597f, -0.40747f, 0.234276f, + 0.332536f, -0.732132f, -0.312291f, 0.137759f, 0.227593f, + 0.14165f, 0.129068f, 0.102734f, 0.135818f, -7.35883f, + -0.101533f, 0.256027f, -0.0142278f, -0.0561601f, -1.09899f, + -0.106538f, 0.0612256f, 0.099487f, -0.0605983f, 0.134311f, + 0.052226f, 0.143672f, 0.219944f, -1.47539f, -0.101828f, + -0.429979f, 0.010478f, -0.0132605f, 0.103363f, 0.0267373f, + -0.338865f, 0.0090188f, 0.0810085f, -0.124368f, -0.0133776f, + 0.595666f, -0.00162201f, -0.212444f, -0.26342f, 0.0913656f, + -0.106279f, 0.414515f, -0.709901f, -0.00198859f, 0.305288f, + -0.188536f, -0.0377482f, -0.131909f, -0.116099f, 
-0.236827f, + -0.36356f, 0.0179455f, -0.202143f, -0.00395508f, 0.177363f, + 0.0630679f, -0.145173f, -0.0558639f, -0.44879f, -1.55687f, + 0.473398f, 0.50531f, -0.0656231f, -0.137197f, 0.064707f, + 0.122083f, 0.0321111f, -0.167096f, 0.0406581f, -0.0793592f, + -0.0777081f, 0.0321379f, -0.0108834f, -0.0652323f, -0.102918f, + 0.0178664f, 0.0781873f, 0.0613189f, -0.04177f, 0.159566f, + 0.15134f, -0.445996f, -0.384905f, 0.0951659f, -0.175046f, + 0.255746f, 0.177047f, -0.150632f, 0.200522f, 0.00778549f, + 0.232168f, -0.0304652f, 0.083155f, -0.125395f, -0.0203289f, + -0.23874f, 0.0349836f, 0.231701f, -0.14849f, -0.204272f, + -0.198309f, -0.364955f, -0.228428f, 0.0614142f, -0.040976f, + -0.227785f, -0.0898404f, 0.271566f, -0.209196f, 0.0226431f, + -0.0911715f, 0.0840369f, -0.299411f, -0.529182f, 0.0622292f, + 0.202475f, 0.0155583f, -0.083114f, 0.124253f, -0.22721f, + -1.02565f, 0.193961f, -0.54287f, -0.00849364f, 0.11124f, + 0.0993531f, 0.120621f, 0.0959537f, 0.136274f, -5.23358f, + -0.107433f, 0.155286f, -0.0136043f, -0.0246768f, -0.631187f, + -0.0493852f, 0.0446751f, 0.0588353f, 0.160766f, -0.0354385f, + -0.0672548f, 0.243743f, 0.186004f, -1.20199f, -0.151872f, + -0.0760096f, -0.00775123f, -0.0122227f, 0.0891327f, -0.377876f, + -0.469926f, -0.134715f, -0.0969362f, 0.212542f, 0.0871489f, + 0.164638f, -0.0485785f, -0.167754f, -0.515052f, 0.13821f, + 0.0515572f, -0.430691f, -0.394719f, 0.143947f, -0.00670816f, + 0.129623f, 0.140299f, 0.0336978f, 0.153545f, -0.350927f, + -0.213485f, 0.0344809f, 0.0405889f, 0.0749967f, -0.369352f, + -0.109398f, 0.0350649f, 0.190893f, -0.284106f, -0.185376f, + 0.0105842f, 0.263692f, 0.160429f, 0.0998209f, -0.127779f, + 0.140558f, 0.108968f, -0.0122672f, 0.102875f, -5.72172f, + -0.161288f, 0.135935f, -0.0143087f, 0.106556f, -0.649813f, + -0.123049f, -0.0108861f, 0.102918f, -0.298137f, 0.0329013f, + 0.100763f, 0.12018f, 0.100782f, -0.648036f, -0.111122f, + 0.12363f, 0.0211952f, -0.225201f, 0.0506021f, 0.0167621f, + 0.0608759f, -0.0245646f, 0.0503477f, -0.0972749f, -0.0415155f, + -0.00578366f, -0.0977591f, 0.124867f, 0.0134788f, -0.0375816f, + -0.00581233f, -0.272292f, -0.250393f, 0.024511f, -0.184891f +}; + +static const float av1_intra_mode_cnn_partition_cnn_layer_2_bias[] = { + 0.182474f, 0.0223202f, 0.204111f, 0.0573683f, 0.111143f, + 0.0800926f, -0.0364215f, 0.192371f, 0.00498262f, 0.302543f, + 0.0133081f, 0.119719f, 0.237522f, -0.266705f, 0.129427f, + 0.0695857f, 0.22068f, 0.231667f, 0.405829f, -0.0972567f +}; + +static const float av1_intra_mode_cnn_partition_cnn_layer_3_kernel[] = { + -0.0393876f, -0.269924f, -0.0703231f, -0.0236484f, 0.170478f, + 0.245566f, 0.175963f, 0.104194f, -0.0490501f, -0.157605f, + -0.0275165f, -0.0169499f, -0.250725f, 0.215203f, -0.00733655f, + 0.0111298f, 0.205606f, 0.928046f, 0.15139f, 0.0955483f, + -0.015115f, -0.126643f, 0.0957605f, -0.140178f, -0.0246866f, + 0.097097f, 0.116287f, 0.177746f, 0.0570021f, -0.0518686f, + -0.0446482f, -0.0125318f, 0.0116092f, 0.102431f, 0.0898519f, + 0.0870372f, -0.843274f, 0.383311f, -0.102761f, -0.0246494f, + 0.0312555f, 0.19472f, 0.111573f, 0.0920392f, -0.0555618f, + 0.326461f, 0.219357f, -0.133727f, -0.118399f, -0.0611432f, + -0.169931f, 0.123733f, -0.204607f, 0.082592f, 0.0323181f, + 0.201618f, -0.00388867f, -0.053583f, 0.0266333f, -0.0951787f, + -0.0358283f, -0.0649549f, 0.0119263f, -0.11812f, 0.209851f, + -0.036616f, -0.014911f, -0.138096f, -0.139664f, -0.207395f, + 0.0128848f, -0.201816f, 0.0899419f, 0.343308f, -0.0096243f, + -0.212605f, -0.0905284f, -0.0597114f, -0.055261f, -0.0653405f, + 
0.0330484f, -0.27681f, -0.0994095f, -0.0468272f, 0.145713f, + 0.267216f, 0.185335f, 0.1798f, -0.0437882f, -0.200401f, + -0.0398117f, -0.0736501f, -0.166349f, 0.203316f, 0.0710647f, + 0.061825f, 0.281131f, 0.733323f, 0.215488f, 0.00145659f, + -0.138995f, -0.0833713f, 0.107809f, -0.105343f, -0.0672139f, + 0.101852f, 0.135455f, 0.132903f, 0.0312017f, -0.0643586f, + -0.0274546f, -0.0687466f, -0.020233f, 0.109444f, 0.0774587f, + 0.139497f, -0.800587f, 0.325783f, -0.0546695f, -0.092003f, + -0.0773301f, 0.189672f, 0.0604666f, 0.0939425f, 0.679495f, + 0.114789f, -0.161153f, 0.12843f, -0.0345385f, -0.134641f, + -0.153995f, 0.0823055f, -0.0349296f, 0.0299183f, -0.0606872f, + 0.137588f, 0.0449805f, -0.0555399f, -0.00553351f, -0.120719f, + -0.204701f, -0.0739813f, 0.0584115f, -0.104833f, -0.110989f, + 0.00845446f, 0.0630702f, -0.147861f, 0.0268545f, -0.216419f, + 0.00531986f, -0.206641f, 0.253082f, 0.413215f, -0.05909f, + -0.0939983f, -0.116818f, -0.0450892f, -0.0551134f, -0.00696931f, + -0.113003f, -0.289192f, -0.00884866f, -0.0365724f, 0.0401887f, + 0.238622f, 0.149151f, 0.175751f, -0.157425f, -0.138924f, + -0.0277598f, -0.0285915f, 0.10165f, 0.209532f, 0.0862249f, + 0.0256428f, 0.623204f, -0.0941196f, 0.20345f, -0.132869f, + 0.00947298f, -0.14753f, 0.103918f, -0.161799f, 0.125566f, + 0.10916f, 0.115446f, 0.135627f, -0.0181667f, -0.0734694f, + -0.0154729f, -0.085849f, -0.000427605f, 0.113614f, 0.0776308f, + 0.111899f, -0.214917f, 0.393234f, -0.132223f, 0.020783f, + -0.074902f, 0.217477f, 0.107883f, 0.109466f, 0.146609f, + 0.317061f, 0.074379f, -0.0505457f, -0.0503772f, -0.0678954f, + -0.220003f, 0.114878f, 0.176014f, -0.00657996f, -0.0875497f, + 0.065582f, 0.00238612f, -0.063395f, 0.0295323f, -0.127126f, + 0.099813f, -0.115452f, 0.0106309f, -0.179632f, -0.0436553f, + 0.0120295f, 0.0652713f, -0.131512f, -0.081714f, -0.205363f, + -0.0374944f, -0.196707f, 0.680568f, -0.00991824f, -0.0212223f, + -0.186258f, -0.432361f, -0.0291303f, -0.0475983f, -0.071383f, + -0.0116416f, -0.28257f, -0.0635272f, -0.0576546f, -0.280129f, + 0.286528f, 0.199997f, 0.192851f, 0.323829f, -0.185006f, + -0.04791f, -0.0882187f, -0.0496895f, 0.293135f, 0.125539f, + 0.0341828f, 0.993452f, 0.0369177f, 0.0453796f, 0.0329807f, + 0.157673f, -0.153195f, 0.122383f, -0.161983f, -0.317619f, + 0.105129f, 0.155673f, 0.152489f, 0.0685417f, -0.0595907f, + -0.026657f, -0.0954336f, -0.0359557f, 0.105617f, 0.0825066f, + 0.100189f, -0.22125f, 0.382508f, -0.0247677f, -0.115807f, + -0.0639787f, 0.177786f, 0.0566206f, 0.0496389f, 1.31533f, + 0.0482907f, -0.118743f, 0.190632f, 0.172867f, -0.108446f, + -0.200186f, 0.122572f, 0.0897468f, 0.0155328f, -0.0380217f, + 0.125161f, -0.141723f, -0.023157f, 0.0270805f, -0.101961f, + 0.12358f, -0.0866255f, 0.00306761f, -0.131764f, -0.461118f, + -0.00803936f, 0.0895496f, -0.153905f, 0.207623f, -0.249099f, + -0.0198487f, -0.160013f, 0.81136f, -0.109978f, -0.0880332f, + -0.0761368f, -0.0755881f, -0.0384827f, -0.0554777f, -0.0750048f +}; + +static const float av1_intra_mode_cnn_partition_cnn_layer_3_bias[] = { + 0.0106809f, 0.136699f, 0.285316f, 0.395746f +}; + +static const float av1_intra_mode_cnn_partition_cnn_layer_4_kernel[] = { + -0.0161019f, -0.088871f, 0.0463358f, -0.198037f, 0.038122f, + 0.0135483f, -0.196641f, -0.433531f, 0.527972f, -0.143716f, + 0.558627f, 0.459889f, 0.322864f, -0.491514f, -0.190915f, + -0.0765601f, 0.210329f, 0.689389f, -0.100415f, -1.8788f, + 0.2228f, 0.292781f, -0.954838f, -0.0788763f, -0.131402f, + -0.17154f, 0.049934f, -0.0541183f, -0.530529f, -0.666165f, + 0.195492f, 0.218548f, 
-0.314895f, 0.0749444f, -0.191344f, + 0.349469f, 0.00811248f, -0.760157f, 0.0707434f, -0.0719285f, + -0.264495f, -0.432009f, -0.432686f, 0.155738f, -0.020197f, + 0.19278f, -0.658335f, -0.273143f, -0.286079f, 0.243402f, + 0.497701f, 0.0121003f, -0.666308f, 0.028172f, -0.547901f, + -0.11755f, 0.322028f, 0.0878274f, -0.0328334f, 0.311816f, + 0.0951026f, -1.11429f, -0.0417486f, 0.123467f, -0.0910681f, + -0.0154255f, 0.311201f, -0.0156158f, -0.600437f, 0.0274156f, + -0.174907f, -1.29313f, -0.178656f, 0.596556f, -0.421725f, + -0.289137f, 0.529297f, 0.114833f, -0.0155887f, -0.308232f, + -0.0228361f, 0.184017f, 0.138232f, 0.146347f, -0.117867f, + 0.248351f, -0.282846f, -0.18058f, 0.348355f, -0.415754f, + 0.0657168f, 0.431728f, -0.231043f, -0.186745f, 0.137401f, + -0.282329f, -0.159678f, 0.754262f, 0.037824f, -1.68521f, + -0.290175f, 0.289588f, -0.18683f, -0.300385f, 0.285449f, + -0.00386456f, 0.0563485f, -0.376541f, 0.159899f, -0.697312f, + 0.0284389f, 0.437307f, 0.3968f, -0.372082f, -0.232535f, + 0.394629f, 0.00315248f, -0.38374f, 0.0311291f, -0.624353f, + 0.498083f, -0.342663f, -0.125978f, 0.186797f, 0.187723f, + 0.149335f, -0.82727f, -0.0740974f, -0.659039f, 0.42671f, + -0.448835f, 0.150677f, 0.830742f, -0.233148f, -0.65308f, + -0.0878935f, -0.407797f, -0.511826f, -0.0739023f, 0.506305f, + -0.187451f, 0.0284968f, -0.822238f, 0.362523f, -0.270865f, + 0.032335f, 0.560413f, -0.00388247f, -0.446333f, 0.163147f, + -0.409633f, -0.372575f, 0.306993f, 0.55953f, -0.24362f, + -0.0929369f, -0.520298f, -0.444022f, 0.186077f, -0.0942208f, + 0.624049f, -0.429625f, -0.869528f, 0.405257f, -0.120445f, + 0.537685f, -0.3911f, 0.142142f, 0.0913808f, -0.00375967f, + 0.382781f, 0.60505f, -0.271608f, -0.0630436f, -0.150625f, + -0.0124598f, 0.0132878f, 0.138475f, -0.106264f, -0.416581f, + -0.518415f, 0.185127f, -0.464622f, -0.0102925f, 0.0389567f, + 0.406439f, -0.0414264f, -0.366185f, -0.511867f, -0.650255f, + 0.278252f, 0.0270234f, 0.262788f, -0.0294793f, 0.12651f, + 0.421537f, 0.0300837f, 0.0742187f, 0.281954f, -0.122069f, + -0.450145f, -0.312206f, -0.402633f, -0.0868137f, 0.190433f, + -0.149602f, -0.175029f, 0.00900023f, -0.266596f, 0.21721f, + -0.245079f, -1.09798f, 0.319409f, -0.337938f, 0.358514f, + 0.0771549f, 0.447087f, -0.305507f, -0.285492f, 0.383896f, + 0.145933f, -0.264944f, -0.118486f, 0.068805f, -0.194231f, + -1.79133f, 0.363408f, -0.17434f, -0.229629f, 0.132188f, + 0.207548f, -0.876264f, 0.265634f, 0.139332f, 0.236206f, + -0.0145184f, 0.562865f, 0.526612f, -0.0333508f, -0.421885f, + 0.273485f, -0.110882f, 0.425557f, 0.513303f, -0.422322f, + 0.0563155f, -0.0409693f, 0.194768f, -0.419828f, -0.107195f, + -1.19224f, 0.48552f, 0.132782f, -0.00932096f, -0.225484f, + -0.428484f, -0.0392684f, 0.750697f, 0.337615f, 0.158476f, + 0.413484f, 0.326017f, -0.757107f, -0.183962f, 0.00884361f, + 0.126507f, -0.0751588f, -0.308782f, -0.104237f, -0.703877f, + -0.491806f, -0.204251f, -0.317212f, 0.0815479f, 0.296323f, + 0.219632f, -0.039859f, 0.556257f, 0.176144f, -0.0750654f, + -0.106419f, 0.00400385f, -0.172266f, 0.000178763f, 0.146532f, + 0.255202f, -0.427235f, -0.182198f, -0.256557f, 0.260255f, + -0.0143364f, 0.0868664f, -0.564373f, -0.0876947f, 0.726289f, + 0.0160001f, -0.381562f, -0.638214f, -0.803803f, 0.25945f, + -0.371542f, -0.419611f, 0.238617f, 0.371834f, -0.226777f, + -0.894602f, 0.37458f, -0.354866f, 0.0249312f, 0.142374f, + 0.433813f, -0.0218183f, -0.33248f, 0.107223f, 0.390823f, + -0.0271108f, -0.616878f, -0.604984f, 0.517269f, -0.293573f +}; + +static const float 
av1_intra_mode_cnn_partition_cnn_layer_4_bias[] = { + -0.290371f, -0.0560272f, -0.118144f, -0.270583f, 0.401388f, + -0.308677f, 0.150729f, -0.0324442f, -0.135937f, 0.0875581f, + 0.0206493f, -0.212682f, -0.0266535f, -0.326656f, 0.0185105f, + -1.01429f, -0.00315052f, -0.0273938f, -0.0263379f, -0.171702f +}; + +static const CNN_CONFIG av1_intra_mode_cnn_partition_cnn_config = { + NUM_CNN_LAYERS, // num_layers + 0, // is_residue + 0, // ext_width + 0, // ext_height + 0, // strict_bounds + { + { + CNN_LAYER_0_IN_CH, // in_channels + CNN_LAYER_0_WIDTH, // filter_width + CNN_LAYER_0_WIDTH, // filter_height + CNN_LAYER_0_OUT_CH, // out_channels + CNN_LAYER_0_HORZ_STRIDE, // skip_width + CNN_LAYER_0_VERT_STRIDE, // skip_height + 0, // maxpool + av1_intra_mode_cnn_partition_cnn_layer_0_kernel, // weights + av1_intra_mode_cnn_partition_cnn_layer_0_bias, // bias + PADDING_VALID, // pad + RELU, // activation + 0, // deconvolve + 0, // branch + BRANCH_NO_COPY, // branch_copy_type + BRANCH_NOC, // branch_combine_type + NO_BRANCH_CONFIG, // branch_config + NO_BN_PARAMS, // bn_params + -1, // output_num + }, + { + CNN_LAYER_1_IN_CH, // in_channels + CNN_LAYER_1_WIDTH, // filter_width + CNN_LAYER_1_WIDTH, // filter_height + CNN_LAYER_1_OUT_CH, // out_channels + CNN_LAYER_1_HORZ_STRIDE, // skip_width + CNN_LAYER_1_VERT_STRIDE, // skip_height + 0, // maxpool + av1_intra_mode_cnn_partition_cnn_layer_1_kernel, // weights + av1_intra_mode_cnn_partition_cnn_layer_1_bias, // bias + PADDING_VALID, // pad + RELU, // activation + 0, // deconvolve + 0, // branch + BRANCH_NO_COPY, // branch_copy_type + BRANCH_NOC, // branch_combine_type + NO_BRANCH_CONFIG, // branch_config + NO_BN_PARAMS, // bn_params + 3, // output_num + }, + { + CNN_LAYER_2_IN_CH, // in_channels + CNN_LAYER_2_WIDTH, // filter_width + CNN_LAYER_2_WIDTH, // filter_height + CNN_LAYER_2_OUT_CH, // out_channels + CNN_LAYER_2_HORZ_STRIDE, // skip_width + CNN_LAYER_2_VERT_STRIDE, // skip_height + 0, // maxpool + av1_intra_mode_cnn_partition_cnn_layer_2_kernel, // weights + av1_intra_mode_cnn_partition_cnn_layer_2_bias, // bias + PADDING_VALID, // pad + RELU, // activation + 0, // deconvolve + 0, // branch + BRANCH_NO_COPY, // branch_copy_type + BRANCH_NOC, // branch_combine_type + NO_BRANCH_CONFIG, // branch_config + NO_BN_PARAMS, // bn_params + 2, // output_num + }, + { + CNN_LAYER_3_IN_CH, // in_channels + CNN_LAYER_3_WIDTH, // filter_width + CNN_LAYER_3_WIDTH, // filter_height + CNN_LAYER_3_OUT_CH, // out_channels + CNN_LAYER_3_HORZ_STRIDE, // skip_width + CNN_LAYER_3_VERT_STRIDE, // skip_height + 0, // maxpool + av1_intra_mode_cnn_partition_cnn_layer_3_kernel, // weights + av1_intra_mode_cnn_partition_cnn_layer_3_bias, // bias + PADDING_VALID, // pad + RELU, // activation + 0, // deconvolve + 0, // branch + BRANCH_NO_COPY, // branch_copy_type + BRANCH_NOC, // branch_combine_type + NO_BRANCH_CONFIG, // branch_config + NO_BN_PARAMS, // bn_params + 1, // output_num + }, + { + CNN_LAYER_4_IN_CH, // in_channels + CNN_LAYER_4_WIDTH, // filter_width + CNN_LAYER_4_WIDTH, // filter_height + CNN_LAYER_4_OUT_CH, // out_channels + CNN_LAYER_4_HORZ_STRIDE, // skip_width + CNN_LAYER_4_VERT_STRIDE, // skip_height + 0, // maxpool + av1_intra_mode_cnn_partition_cnn_layer_4_kernel, // weights + av1_intra_mode_cnn_partition_cnn_layer_4_bias, // bias + PADDING_VALID, // pad + RELU, // activation + 0, // deconvolve + 0, // branch + BRANCH_NO_COPY, // branch_copy_type + BRANCH_NOC, // branch_combine_type + NO_BRANCH_CONFIG, // branch_config + NO_BN_PARAMS, // bn_params + 0, 
// output_num + }, + }, +}; + +static const float + av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_kernel[] = { + 0.604356f, -0.236007f, 0.342172f, 0.531397f, -0.635698f, + -0.591573f, 0.833872f, 0.492814f, -0.100308f, 0.186385f, + 0.202779f, 0.263578f, 0.330001f, -0.15531f, 0.879584f, + -0.0048796f, 0.490796f, 0.242254f, -0.292211f, -0.696912f, + 0.746664f, 0.129371f, -0.0122443f, 0.196234f, -0.251605f, + -0.385617f, 0.157707f, 0.699963f, 0.0432536f, -0.11141f, + -0.0353473f, -0.0364045f, -0.113556f, -0.520842f, 0.231248f, + 0.230638f, -0.323852f, -1.08633f, -0.0469168f, -0.481821f, + 0.366838f, 0.189627f, -0.0637262f, -0.484917f, -0.109874f, + 0.292237f, 0.368702f, -0.183896f, -0.109038f, -1.22613f, + -0.880355f, -1.63768f, 0.337426f, -0.940994f, 0.413097f, + -0.37879f, -0.480525f, -0.594819f, -0.0172653f, -0.499436f, + -0.298395f, -0.840181f, -0.0758645f, -0.772089f, -0.232727f, + -0.815968f, 0.160785f, -0.0767165f, 0.0064244f, -0.540491f, + 0.417776f, -0.384337f, -0.497377f, 0.68414f, 0.00797514f, + 0.262626f, 0.203732f, 0.702047f, 0.0617544f, 0.0878249f, + -0.315032f, -0.0169776f, 0.403986f, 0.815872f, 0.135388f, + 0.0858594f, 0.169172f, -0.638227f, -1.65268f, -0.0476042f, + -0.982685f, 0.45707f, -0.0577537f, 0.367329f, 0.176513f, + -0.356454f, 0.0979095f, -0.277476f, 0.257271f, -0.333451f, + 0.0241497f, 0.0671127f, 0.221216f, 0.106065f, 0.537151f, + 0.0257329f, 0.265559f, -0.348353f, 0.285569f, -0.0610511f, + -1.59334f, -1.63826f, -0.164898f, -0.36605f, -0.489304f, + 0.729241f, 0.0197627f, 0.200291f, -0.231506f, -0.255715f, + -0.0932264f, -0.728793f, 0.468297f, -1.09592f, -0.079791f, + -1.76531f, -0.182904f, -2.05897f, -0.371894f, 0.207124f, + 0.255029f, 0.186501f, -0.005805f, 0.00160733f, -0.178206f, + -0.352757f, -0.164741f, -0.557583f, -0.559692f, -0.00731467f, + 0.149326f, 0.409735f, 0.22083f, -0.332572f, -0.1741f, + -0.0519008f, -0.266402f, 0.294031f, -2.4453f, 0.339851f, + -0.573747f, -5.97783f, -0.084142f, 0.20286f, -0.576038f, + -0.111081f, 0.101238f, -5.83427f, -1.98537f, 0.322796f, + -0.60171f, 0.212412f, 0.247176f, 0.603694f, -0.54357f, + -0.693439f, 0.250725f, -4.31988f, 0.0935924f, 0.43669f, + -0.139706f, -0.158391f, 0.244309f, 0.619213f, -0.309154f, + -0.135341f, 0.475815f, -0.290804f, -0.109038f, -0.0937104f, + 0.0385907f, -0.29105f, -0.0597651f, -0.451187f, -1.51821f, + 0.141772f, 0.822204f, -0.729661f, -0.109908f, 0.178217f, + -0.750278f, 0.113762f, -0.0959985f, 0.066579f, -0.104209f, + -0.951378f, 1.4087f, -1.13175f, -1.09103f, -1.50416f, + -0.182273f, -1.80129f, -0.152135f, 0.356931f, 0.205591f, + 0.183148f, -0.498671f, -0.183034f, -0.176428f, 0.395706f, + -0.589908f, -0.318276f, -0.421162f, 0.658766f, -0.186752f, + 0.0656253f, 0.248002f, 0.289618f, -0.458111f, -0.130789f, + -0.542988f, 0.405804f, -0.35364f, -0.311927f, 0.218339f, + 0.309215f, -0.130347f, -0.0257543f, 0.0413234f, -0.190205f, + -0.242382f, 0.819886f, -0.255157f, -0.181219f, -0.290903f, + -0.301995f, -0.0469988f, 0.702936f, 0.209122f, 0.0234243f, + 0.598637f, 0.0305196f, 0.0423457f, -0.618799f, 0.0190867f, + 0.420584f, -0.224752f, -0.410077f, 0.127854f, 0.395261f, + -0.393685f, -0.282822f, 0.0289504f, 0.0406515f, -0.511531f, + -0.497611f, 0.0252715f, 0.0812549f, 0.80205f, 1.29084f, + 0.764972f, 0.561258f, -0.23499f, 0.217594f, -0.690935f, + -0.26607f, 0.357955f, 0.391608f, 0.448352f, 0.458586f, + -0.790071f, 0.719959f, -0.468052f, 1.24579f, 0.220705f, + 0.284044f, 0.141346f, 0.246687f, 0.147826f, -0.403557f, + -0.00648195f, 0.398034f, -0.100464f, -0.77107f, -0.188274f, + -0.219245f, -0.0330375f, 
0.367585f, -0.220391f, 0.308736f, + 0.221399f, 0.340292f, 0.037597f, 0.606083f, 0.665634f, + -0.755529f, -0.95989f, -0.243673f, 0.233709f, -0.454628f, + -0.110952f, 0.776062f, 0.731136f, -0.140422f, 0.19261f, + 0.355086f, 0.975026f, 0.190936f, 0.776205f, 0.982781f, + 0.555569f, 0.42382f, -0.409721f, 0.25053f, -0.271328f, + 0.859941f, -0.0210901f, 0.0176916f, -0.562895f, -0.0787431f, + -0.861032f, -0.34022f, -0.571995f, 0.205436f, 0.346968f, + 0.377033f, -1.08484f, 0.297007f, -1.01693f, 0.189463f, + -0.483242f, 0.147058f, 0.0159503f, 0.0908779f, -0.46962f, + 0.174024f, -0.490704f, -0.383501f, -0.0507626f, 0.00902188f, + -0.202495f, 0.205047f, 0.0562261f, -0.143371f, 0.219524f, + -0.317294f, -0.0575756f, -0.0595825f, -0.000625279f, -0.278864f, + -0.0516874f, -0.225259f, 0.429046f, -0.0952421f, 0.0799135f, + -0.122883f, -0.262308f, -0.481006f, -0.0466122f, -0.402822f, + 0.150595f, -0.0919558f, -0.356765f, -0.199222f, 0.219389f, + -0.214452f, -0.196361f, -0.095758f, -0.115891f, -0.143777f, + 0.549843f, -0.113036f, 0.764895f, -0.0114812f, -0.0684054f, + -0.98045f, -0.0170634f, 0.247719f, -0.18718f, -0.381566f, + 0.150758f, -0.526257f, 1.00851f, 0.776634f, 1.69728f, + -0.303058f, 0.228967f, -0.414134f, 0.0858226f, -0.285472f, + 0.431459f, 0.315318f, 0.587835f, 0.335737f, -0.0222039f, + 0.18945f, 0.274008f, 0.609263f, 0.320232f, -0.214137f, + -0.0297668f, 0.0439046f, -0.52821f, -0.0127375f, 0.431885f, + 0.508846f, -0.329189f, -0.166778f, -0.94338f, -0.358807f, + 0.208641f, -0.517986f, -0.128278f, 0.693464f, -0.24408f, + -0.0669412f, -0.410287f, 0.0444145f, -0.264179f, 0.143884f, + 0.276842f, 0.498934f, -0.682557f, -0.217198f, -0.8249f, + -0.40446f, -0.115376f, 0.417934f, 0.65605f, -0.00570035f, + -0.365742f, -0.367625f, 0.526824f, -0.0164913f, -0.255998f, + 0.247292f, 0.0846536f, 0.109302f, -0.302996f, 0.160564f, + 0.0228132f, 0.035211f, -0.236951f, 0.493801f, 1.37315f, + -0.182348f, 0.234437f, -0.256906f, 0.12523f, 0.667113f, + -0.437981f, -0.0721831f, 0.303976f, -0.041336f, -0.145894f, + -0.733741f, 0.436056f, 0.368542f, -0.149072f, -0.290281f, + 0.0946743f, -0.0579292f, 0.264539f, 0.170048f, 0.262411f, + 0.049679f, 0.371369f, 0.760675f, 0.482157f, -0.0196783f, + 0.260888f, 0.948856f, 0.170228f, -0.134432f, -0.942235f, + -1.23226f, -0.373963f, -0.0381773f, -0.17947f, 0.00947998f, + 0.01086f, 0.389578f, -0.380389f, -0.0865851f, -0.220328f, + -0.171901f, -0.384325f, -0.0787615f, 0.392678f, 0.123392f, + -0.0895824f, 0.00480886f, -0.162918f, 0.214336f, -0.00147339f, + 0.203899f, -0.00292344f, -0.148594f, 0.0425697f, -0.306896f, + -0.342225f, -0.45088f, -0.184454f, -0.00923638f, -0.521993f, + -0.334464f, 0.156497f, -0.0856832f, -0.277661f, -0.0721105f, + -0.488781f, -0.509543f, -0.012664f, 0.0940558f, -0.29869f, + 0.0434843f, -0.0178945f, -0.0525666f, -0.303178f, 0.713507f, + -0.137413f, -0.170289f, -0.142942f, -0.316002f, 0.229125f, + -0.277585f, 0.0125026f, 0.508316f, -1.20614f, -0.915129f, + -1.63389f, -0.454604f, -0.893951f, -0.447403f, -0.751423f, + 1.3886f, 0.617818f, 0.611458f, -0.884173f, -0.7779f, + -0.608639f, -0.164759f, -0.631846f, -0.176894f, -0.459361f, + -0.187119f, 0.173283f, -0.477191f, -0.156736f, 0.182675f, + 0.598854f, -0.489941f, -0.420493f, -0.162002f, 0.344418f, + 0.33832f, -0.187463f, -0.388721f, -0.0733151f, -0.138835f, + 0.313699f, 0.0625967f, -0.291488f, 0.114088f, -0.356843f, + 0.197506f, 0.0320749f, 1.16745f, -0.36081f, 1.63416f, + 0.198392f, 1.13928f, -0.317971f, 0.531019f, 0.526518f, + 0.185814f, 0.0923607f, 0.192858f, -0.234378f, 0.18091f, + -0.228837f, 0.397216f, 
0.581501f, 0.284376f, -0.130434f, + 0.20076f, 0.242662f, -0.0480872f, 0.131746f, 0.362712f, + 0.0146821f, 0.475679f + }; + +static const float av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_bias[] = { + 0.477356f, 0.385222f, 0.389122f, 0.539506f, -0.0272558f, 0.581605f, + -0.800961f, 0.142229f, 0.117549f, -0.0724944f, 0.102095f, -0.71319f, + -0.0162434f, -0.132858f, 0.543411f, -0.626599f +}; + +static const float + av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_kernel[] = { + 0.195436f, -0.623354f, 1.27907f, 0.270071f, -0.677612f, + 0.0266141f, 0.272991f, -0.425446f, 0.891889f, -0.299836f, + -0.611825f, -0.0322273f, 0.185276f, 0.238639f, -0.150954f, + 0.083495f, -0.472106f, 0.573506f, 1.16465f, -0.154947f, + 0.640631f, -1.59467f, -9.8166f, -0.22889f, -0.189912f, + 0.227052f, -0.540787f, 0.0840873f, -3.04293f, -0.0209975f, + -6.10979f, -5.92801f, 0.288467f, -0.169476f, 0.0527948f, + -1.21202f, -0.280915f, 0.290863f, -0.601877f, 0.0598784f, + -0.592136f, -0.535588f, -0.0434018f, -0.653223f, 0.00339129f, + -0.133273f, 0.279463f, 0.483879f, 0.463664f, -0.14174f, + -1.56354f, 0.560043f, -1.44639f, 0.673528f, -0.108418f, + -0.707313f, 0.49633f, -0.0321971f, 0.411475f, -0.382184f, + -0.965501f, -0.0507655f, 0.540415f, -0.977297f, 0.370382f, + -0.375683f, 0.0844529f, -2.0002f, -0.346289f, 0.621251f, + -0.489855f, 0.191252f, -0.576629f, -0.35773f, 0.023167f, + 0.180793f, -0.417864f, 0.0587254f, 0.167824f, 0.0612058f, + -0.712108f, 0.155614f, 0.900036f, -0.480124f, 0.146117f, + 0.467011f, 0.412525f, 0.312724f, 0.551826f, -0.179601f, + 0.706261f, 0.00674965f, -0.495221f, 0.140829f, -0.0619195f, + -0.0697912f, 0.511967f, -0.0318237f, -0.285946f, -0.28608f, + 0.0894142f, 0.234351f, -0.272328f, -0.350369f, -0.392605f, + 0.287318f, 0.310426f, 0.293524f, 0.357681f, -0.157868f, + 0.149652f, -0.259363f, 0.192941f, -0.850096f, 0.456507f, + 0.387857f, -0.491187f, -0.0541993f, -0.28118f, 0.193991f, + -0.0956664f, 0.0679829f, 0.0341118f, 0.141826f, 0.271538f, + -0.285295f, -0.68666f, 0.306414f, 0.600678f, 0.494801f, + -1.11907f, 0.524849f, 0.151169f, 0.474068f, -0.43441f, + -0.229138f, 0.0345483f, 0.682888f, -0.471534f, -0.0457066f, + -2.36721f, 0.446407f, 0.20396f, -1.17868f, 0.815363f, + -1.13897f, 0.397217f, -0.593796f, -6.95512f, 0.650695f, + 0.771657f, 0.15227f, -0.824519f, 0.617854f, -0.295353f, + -0.101207f, 0.600989f, -0.550653f, -0.722371f, 0.292006f, + -0.451891f, 0.54544f, 0.354278f, 0.0136258f, 0.192003f, + 0.258275f, -0.0443647f, 0.0928186f, 0.667775f, 0.239558f, + 0.0523887f, 0.71586f, 0.292563f, 0.362479f, 0.373453f, + 0.250638f, -0.423037f, -0.486574f, -0.619397f, 0.343888f, + 0.974971f, 0.574218f, 0.273989f, -0.209956f, -0.274333f, + 0.0553766f, 0.263918f, 0.733824f, 0.038713f, -0.0788992f, + 0.292014f, 0.111808f, -0.197507f, 0.593668f, -0.0245337f, + 0.0873662f, 0.530997f, 0.620717f, 0.310697f, -1.54861f, + 1.12915f, 0.0991346f, -0.59214f, 0.422325f, -0.0157936f, + 0.380975f, 0.626403f, 0.268064f, -0.615231f, -1.43172f, + 0.0928048f, 0.0949026f, -0.470912f, -0.0867527f, -0.0381206f, + 0.178393f, -1.13737f, 0.12798f, 0.258214f, -0.803364f, + 0.177506f, 0.542718f, 0.660656f, 0.145091f, 0.183056f, + -0.47338f, 0.469287f, 0.10832f, 0.0994899f, -0.402719f, + 0.157287f, 0.523071f, -0.324493f, 0.343599f, 0.664839f, + -0.0375519f, -0.279238f, -0.0722333f, 0.395344f, -0.289316f, + 0.0259298f, -0.843245f, -0.160021f, 0.741429f, -1.38726f, + -0.2969f, -0.240443f, 0.247731f, -1.04088f, -0.280454f, + -0.237054f, -0.759227f, 0.0456369f, -0.647453f, -1.02372f, + -0.200395f, -0.546839f, -0.104226f, 
-0.152727f, -0.56685f, + -0.0559663f, -0.425494f, -0.610679f, -0.987096f, -0.575138f, + -0.0887979f, 0.463646f, -1.041f, -0.49412f, -0.175298f, + -0.463296f, -0.955177f, 0.17852f, -1.10694f, 0.181991f, + -0.18998f, 0.227818f, 0.688237f, -1.10444f, 0.549108f, + -0.171849f, -0.245614f, 0.120624f, 1.29571f, 0.607116f, + 0.00809927f, 0.1041f, -1.22918f, -0.212948f, 0.430239f, + -1.57341f, 0.482054f, 0.275905f, 0.939785f, -1.0209f, + -0.355534f, 0.397337f, -0.0593077f, -0.239603f, 0.475483f, + -0.999101f, -0.140578f, 1.04787f, -0.591981f, -0.306989f, + -0.879012f, -0.994715f, 0.0343158f, 0.218509f, 0.34704f, + 0.0672934f, -0.178941f, 0.20509f, -0.360031f, 0.161241f, + -0.324775f, -0.359531f, -0.0657085f, -0.864422f, -0.444865f, + 0.597095f, -0.948691f, 0.240001f, -0.783159f, -0.569422f, + 0.974205f, -1.04539f, 0.345915f, -0.681558f, -0.246047f, + 0.256174f, 0.493667f, 0.681324f, 0.155613f, 0.773309f, + -0.647027f, -0.214744f, -0.474202f, -0.661092f, -1.02316f, + 0.0572593f, -0.437082f, -0.119874f, -0.464877f, -0.58067f, + -0.218029f, 0.319516f, -0.378983f, -0.0698695f, 0.554693f, + -0.537875f, 0.126429f, -0.145113f, -0.594312f, -0.218021f, + -0.703569f, 0.0720548f, 0.261054f, -0.81438f, 0.249921f, + 0.165296f, -0.079028f, -0.322647f, 0.134458f, 0.0975046f, + 0.538594f, -0.250126f, 0.142309f, 0.526486f, 0.0532615f, + -0.383332f, -0.38143f, -0.101611f, 0.519776f, -0.278364f, + -0.23287f, -0.29139f, 0.22353f, 0.472085f, 0.366264f, + 0.741187f, 0.42019f, 0.0676459f, -0.230008f + }; + +static const float av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_bias[] = { + -0.48603f, -0.578556f, 0.257639f, 0.459915f, 0.178156f, -1.16663f, + 0.828891f, 0.620291f, 0.413257f, -1.00508f, -0.574179f, -1.20623f, + -0.377837f, -0.0360333f, 0.681536f, 0.137189f, -0.458718f, 0.387131f, + 0.0233112f, 0.126045f, 0.361304f, 0.655317f, 0.413134f, 0.769947f +}; + +static const float av1_intra_mode_cnn_partition_branch_0_logits_kernel[] = { + 0.67244f, -2.59179f, 0.50425f, -1.86481f, 1.15891f, -1.26447f, + 0.761081f, 0.645117f, -1.78594f, -0.872703f, -0.192054f, -1.82359f, + -0.560935f, 0.838959f, 0.502264f, -1.28958f, -0.205551f, 0.635671f, + -1.12619f, -1.68277f, 0.83361f, 1.57235f, 1.15839f, 0.35345f +}; + +static const float av1_intra_mode_cnn_partition_branch_0_logits_bias[] = { + 1.14463f +}; + +static const float + av1_intra_mode_cnn_partition_branch_1_dnn_layer_0_kernel[] = { + 0.364612f, 0.237868f, -0.192821f, 0.12364f, 0.522205f, + -0.205785f, -0.503288f, -0.426503f, -0.083073f, 0.0164429f, + 0.184278f, -0.426055f, 0.0717997f, -0.261968f, 0.176412f, + -0.101226f, 0.0400285f, -0.332051f, 0.344385f, 0.189565f, + 0.441162f, 0.330462f, -0.719857f, -1.14209f, 0.557831f, + 0.104756f, 0.0562001f, -0.465923f, -0.344592f, -0.191554f, + -0.0656866f, -0.640162f, 0.419388f, 0.409308f, -1.68632f, + -1.10829f, 0.105485f, -0.14561f, -0.944738f, 0.104629f, + -0.146837f, 0.538823f, -0.153157f, 0.321081f, -1.77714f, + -0.0559296f, 0.324136f, -0.497023f, -1.15793f, -0.740144f, + -0.0888472f, 0.010059f, -0.18394f, -0.234405f, -0.10586f, + 0.130958f, -0.101944f, -0.186483f, -0.447049f, -0.900026f, + 0.128444f, 0.401696f, 0.128509f, 0.123778f, 0.062168f, + -0.321755f, -0.0691584f, 0.254468f, -0.115212f, -0.848885f, + 0.817005f, 0.0615853f, 0.153363f, 0.513855f, 0.789225f, + 0.356168f, 0.371613f, 0.269541f, 0.268173f, 0.220481f, + -0.109063f, -0.00620798f, -0.0334622f, 0.236267f, -0.0235294f, + -0.0800253f, 0.0294184f, 0.047131f, -0.224047f, 0.0890737f, + -0.356293f, 0.0989534f, 0.16799f, 0.498266f, 0.612581f, + -0.372897f, 
-0.75125f, 0.77698f, 1.1032f, -0.0764679f, + 0.0266299f, 0.309532f, 0.461305f, 0.0193521f, -0.0939161f, + -0.276156f, -0.102714f, -0.0828328f, 0.40003f, 0.122542f, + 0.0867203f, -0.170738f, 0.0850642f, -0.130762f, 0.082324f, + -0.115218f, -0.0244491f, 0.0434331f, 0.216453f, 0.443733f, + -0.173679f, -0.161617f, 0.316209f, -0.689656f, -1.52007f, + -0.421018f, 0.430833f, -0.00734122f, 0.284499f, -0.0207885f, + 0.0572024f, -0.878942f, 0.388264f, 0.0191589f, -0.123415f, + -0.0461196f, -0.0444461f, -0.00383171f, 0.0945655f, -0.0597219f, + -0.374918f, 0.0182124f, 0.523083f, 0.00519547f, 0.80513f, + -0.221433f, -1.30591f, -0.416917f, -0.718173f, 0.622999f, + 0.941798f, 0.0477536f, 0.0303772f, 0.268078f, 0.414778f, + 0.394325f, 0.299733f, -0.583208f, 0.309379f, 0.416581f, + 0.0299948f, -0.409145f, -0.161557f, -0.214082f, -0.0098119f, + 0.221912f, 0.107135f, 0.0692518f, 0.00490957f, 0.107613f, + -0.368404f, -0.548006f, 0.208274f, 0.550475f, 0.643678f, + -1.65859f, 0.095938f, -0.0434245f, -0.0792685f, 0.838109f, + -0.0138653f, -0.527573f, -0.123472f, -0.235618f, -0.677401f, + -0.125877f, -0.175604f, -0.203196f, 0.113478f, -0.228323f, + -0.53539f, 0.134458f, 0.0534899f, -0.213006f, -0.138679f, + -2.15023f, 0.186303f, 0.48566f, -1.22301f, -0.240982f, + -0.486836f, -0.121181f, -0.131382f, -0.0320283f, 0.278828f, + 0.342581f, -0.182257f, -0.365193f, -0.226351f, 0.108928f, + -0.100159f, 0.448355f, -0.0768947f, 0.0633719f, -0.104786f, + 0.0456653f, 0.0965752f, 0.156403f, -0.157337f, 0.212259f, + 0.317939f, 0.124193f, -0.329475f, 0.206868f, -2.15986f, + -0.108385f, -0.396769f, -0.0317231f, -0.271524f, -0.184697f, + 0.662615f, 0.412926f, -0.0217462f, -0.0285475f, -0.118826f, + 0.0252706f, -0.137091f, 0.198973f, 0.329509f, -0.0831966f, + -0.621237f, 0.0896179f, 0.805261f, -0.019675f, 0.962452f, + 0.307433f, 0.892168f, -0.537587f, -2.46145f, 0.125606f, + 0.920491f, 0.219462f, 0.292765f, -0.748238f, -0.0537239f, + -0.224326f, 0.505492f, 0.176426f, 0.0343168f, 0.16708f, + -0.581393f, 0.951726f, -1.1777f, -0.561914f, -1.53288f, + 0.864567f, -1.19648f, -1.24141f, -0.334688f, -0.622026f, + 0.666876f, -0.197005f, -0.600507f, -0.851924f, 0.492299f, + 0.31078f, -0.0736115f, 0.030999f, -6.02463e-05f, -0.0604341f, + -0.0254238f, 0.139222f, 0.333235f, 0.366534f, -0.191982f, + -0.0156092f, 0.44234f, -0.0193213f, 0.0938745f, -0.015709f, + -0.12043f, 0.00895591f, 0.0464401f, 0.0530699f, -0.623018f, + -1.23372f, -0.538647f, -1.12389f, 0.26742f, 0.548694f, + 0.00540655f, -0.219703f, 0.314894f, -0.573463f, -0.241555f, + 0.441851f, 0.422491f, 0.253785f, -0.384683f, 0.0370165f, + 0.226669f, 0.245587f, 0.215265f, -0.122272f, 0.0492235f, + 0.000658591f, -0.312877f, 0.436487f, -0.229199f, -0.174373f, + 0.904268f, -0.855845f, -0.877293f, -0.65409f, 0.313795f, + 0.461748f, -0.737766f, -0.228523f, 0.182181f, 0.334522f, + 0.0629676f, -0.151087f, 0.178798f, -0.325809f, -0.331672f, + 0.0865837f, -0.0684225f, 0.0252008f, -0.0820631f, 0.0481863f, + 0.209473f, -0.0242151f, -0.0898919f, -0.163828f, -0.164282f, + 0.581888f, 0.816896f, 0.0607674f, 0.364855f, -0.346512f, + -0.764174f, 0.595561f, 0.302872f, 0.206361f, 0.106917f, + -0.972338f, 0.176948f, 0.6415f, -0.131897f, -0.155802f, + 0.216337f, -0.342511f, 0.123743f, -0.123014f, 0.0205439f, + 0.15173f, -0.23801f, -1.00387f, 0.651328f, 0.237439f, + -0.542952f, 1.066f, -0.161107f, -0.593545f, 0.219343f, + -0.178094f, 0.0789992f, 0.428332f, 0.23827f, -0.327421f, + 0.416144f, 0.00394653f, 0.052046f, -0.238289f, 0.405942f, + 0.00141984f, 0.161017f, 0.077111f, 0.0823985f, 0.0981208f, + 0.109949f, 
-0.0428502f, 0.343629f, -0.722978f, -0.375269f, + -0.111634f, -0.271523f, 0.712093f, 0.684904f, -0.572331f + }; + +static const float av1_intra_mode_cnn_partition_branch_1_dnn_layer_0_bias[] = { + 0.583367f, -0.202004f, -0.207626f, 0.412451f, -0.258311f, 0.0304954f, + -0.102458f, 0.450087f, -0.376851f, -0.338702f, 0.335226f, 0.889072f, + 0.502411f, 0.649282f, 0.15345f, -0.0109896f +}; + +static const float + av1_intra_mode_cnn_partition_branch_1_dnn_layer_1_kernel[] = { + 0.0214882f, -0.934339f, -0.173335f, 0.8362f, -0.764234f, + 0.525163f, 0.409749f, 0.821539f, -0.784157f, -0.455593f, + 0.446099f, 0.406756f, 0.479242f, -0.814038f, -0.419332f, + 0.328869f, -0.340707f, 0.133219f, 0.0320347f, 0.25089f, + -0.324917f, -0.0684265f, 0.0377777f, -0.262556f, 0.673458f, + -0.0291454f, -0.417957f, -1.0075f, -0.481537f, 0.922105f, + -0.000516239f, -0.40034f, 0.242067f, -0.43178f, 0.32001f, + 0.143599f, -0.345172f, 0.126093f, 0.148518f, -1.12151f, + -1.03435f, 0.551691f, -0.310001f, -0.323194f, -0.595128f, + -0.395689f, 0.737268f, -0.729227f, 0.590804f, -0.590022f, + -1.01427f, -0.521159f, -0.617579f, 1.07292f, -0.613047f, + -0.619093f, 0.335268f, 0.473753f, -0.795027f, 1.24635f, + -0.556193f, 0.241046f, -0.0354181f, -0.354215f, 0.716752f, + -0.00200745f, -1.25171f, -0.440731f, -0.763918f, -0.588614f, + -0.183901f, -0.396056f, 0.226903f, 0.921471f, 1.10465f, + 0.207053f, 0.57681f, -0.555699f, 0.235469f, -0.92149f, + 0.625808f, 0.29653f, -0.81775f, -0.307889f, -1.41384f, + -0.136205f, -0.365314f, -0.516741f, 0.748052f, 0.617947f, + 0.0973239f, 0.839607f, 0.530668f, -0.227032f, -0.449044f, + -1.04725f, -0.244363f, -0.396888f, -0.146161f, 0.359789f, + 0.0436599f, 1.21645f, -0.336069f, 0.0534646f, -0.00200328f, + 0.658551f, -0.156142f, -1.0728f, 0.0951015f, 0.234837f, + -0.380525f, 0.041783f, -0.269273f, 0.0386013f, -0.455589f, + -0.174338f, 0.0345251f, 0.17116f, -0.507642f, 0.210453f, + 0.739987f, -0.0438776f, 0.570145f, -0.118811f, 0.0548662f, + 0.153458f, -0.89887f, 0.493704f, 0.283351f, 0.785441f, + -0.586002f, -0.0616167f, -0.714328f, -0.145941f, -0.449656f, + 0.850117f, 0.279997f, 0.204143f, -0.31356f, 0.947057f, + -0.135787f, 0.747071f, 0.0145968f, -0.81414f, 0.431009f, + -0.275824f, -0.342928f, -0.0528272f, -0.592183f, 0.433915f, + -0.251752f, -0.311815f, -1.47533f, -1.43677f, 0.0698436f, + 1.01341f, 0.305063f, -0.252003f, -0.428915f, -0.00104153f, + -0.368267f, -0.354523f, -0.27956f, -0.771664f, 0.232092f, + -0.428495f, 0.424952f, -0.343229f, 0.196899f, -0.761084f, + -0.0110293f, -0.335361f, 0.571637f, -0.423489f, -0.52773f, + 0.0108043f, -0.504715f, -1.1419f, -0.402904f, -0.160747f, + -0.329184f, 0.375374f, -1.02604f, -0.601371f, 0.631652f, + 0.0742486f, -0.464765f, 0.467445f, 0.240562f, -0.38211f, + -0.459004f, 0.704196f, 0.021357f, 0.860785f, -1.16731f, + -0.479029f, -0.139644f, -0.444087f, 0.322326f, -0.25455f, + 0.874399f, 0.477696f, 0.0464487f, 1.20658f, 0.0993356f, + 0.00682712f, -0.10163f, -0.371765f, -0.629513f, -0.679196f, + -0.193935f, 0.47405f, -0.18238f, 0.254918f, -0.35306f, + -0.375611f, 0.119771f, -0.257282f, -0.565124f, 0.162667f, + -0.356128f, 0.870351f, 0.241847f, -0.264712f, -0.384322f, + 0.31807f, 0.211621f, -0.180767f, 0.764944f, 0.368646f, + 0.186111f, 1.02458f, -0.494252f, -0.483375f, -0.699664f, + 0.00415657f, -0.189376f, -0.677103f, -0.030319f, 0.667087f, + 0.810951f, -0.488237f, -0.387355f, -0.726579f, -0.304763f, + 1.10392f, -0.775977f, -0.247731f, 0.532396f, 1.24089f, + 0.206621f, -0.670568f, -1.08142f, -0.342503f, 0.189854f, + -0.200846f, 0.784204f, 0.641112f, 
-0.509346f, 0.0805264f, + -1.40006f, 0.322084f, -0.823739f, -1.12965f, -0.215668f, + 0.099673f, 0.425966f, 0.771697f, 0.338834f, 0.345364f, + -0.297826f, -0.176746f, -0.297299f, -1.80029f, -0.178348f, + 0.421194f, -0.19155f, 0.417653f, 0.374441f, -0.135654f, + -0.895843f, 0.220647f, 0.368264f, 0.369233f, 0.382707f, + 0.0800511f, 0.542053f, 0.318896f, -0.385539f, 0.313305f, + -1.01166f, -0.222379f, -1.53708f, 1.32407f, -0.665444f, + -0.102348f, 0.0410504f, -0.616825f, 1.3108f, 0.405902f, + 1.27777f, 0.0630558f, -0.172696f, 0.16224f, -1.10111f, + -3.31326f, -0.242566f, 0.831422f, 0.917397f, 0.311749f, + -0.238613f, 0.438007f, -0.407089f, -0.0202555f, -1.82502f, + -0.907965f, -0.300031f, -0.616669f, -0.767921f, 0.285919f, + -0.112019f, 0.252677f, 0.350892f, 0.000214244f, 0.315915f, + 0.260344f, 0.327362f, -0.0211213f, -0.41241f, 0.0418355f, + 0.103328f, -0.0158439f, -0.230505f, -0.0215114f, 0.266739f, + -0.234376f, -0.352583f, 0.0709437f, -0.90649f, -0.535843f, + 1.21322f, -1.05144f, -0.983682f, -0.189956f, 1.14208f, + -0.0188492f, -0.254821f, -0.463214f, -0.708714f, 0.0447348f, + -0.220831f, 0.476299f, 0.102544f, 1.1173f, -0.36981f, + -0.814102f, 0.103604f, -0.247871f, 0.0610701f, -0.356616f, + -0.144093f, 1.66496f, 0.180206f, -1.04384f, -0.65883f, + 0.0290771f, -0.622728f, 0.761523f, -0.909091f, -0.0340348f, + 0.666895f, -0.0232575f, 0.962643f, -2.50103f, -1.69745f, + -0.0482305f, 0.771811f, -1.32233f, -0.778722f, -0.203309f, + 0.395875f, -0.171812f, 0.253794f, 0.432799f + }; + +static const float av1_intra_mode_cnn_partition_branch_1_dnn_layer_1_bias[] = { + -0.152159f, 0.552347f, -0.806068f, 0.227901f, 0.335896f, 0.180785f, + 0.75277f, 0.982208f, 0.409823f, -0.17755f, -0.125365f, 0.738114f, + 0.202331f, 0.751737f, -0.360511f, 0.149254f, 0.085073f, -0.214542f, + 0.529727f, -0.0348777f, -2.13162f, -0.893332f, -0.136952f, -0.71258f +}; + +static const float av1_intra_mode_cnn_partition_branch_1_logits_kernel[] = { + -0.632145f, 0.738727f, -0.750737f, -0.931571f, -1.79763f, -2.31153f, + 0.912733f, 0.879995f, -1.00602f, -1.02467f, 0.0536835f, 1.76011f, + -0.898546f, 1.06959f, 1.60471f, -1.7312f, -0.877168f, -0.681185f, + -1.57286f, -1.16038f, -4.11303f, -3.06351f, -3.02536f, -2.92186f +}; + +static const float av1_intra_mode_cnn_partition_branch_1_logits_bias[] = { + 1.33207f +}; + +static const float + av1_intra_mode_cnn_partition_branch_2_dnn_layer_0_kernel[] = { + 0.0419551f, 0.0924078f, -0.153084f, 0.191642f, 0.069586f, + -0.530661f, 0.431968f, 0.000453838f, 0.793047f, 0.0161817f, + -0.476075f, -0.156638f, -0.219066f, 0.372716f, -0.0642299f, + 0.156813f, -0.105819f, -0.0519422f, 0.149935f, 0.295544f, + 0.192037f, -0.0450383f, 0.828794f, -0.0510661f, -1.22549f, + -0.100293f, -0.178274f, 0.0304427f, -0.0664097f, -0.0438936f, + 0.948248f, 0.425486f, -0.238206f, 1.3744f, 0.336897f, + 0.0760769f, -0.583508f, 0.0735519f, -0.117024f, 0.0501598f, + 0.332212f, 0.199531f, 0.424764f, 0.206712f, 0.342868f, + 0.592673f, -0.0961148f, -0.190113f, -0.155027f, 0.00789871f, + -0.0514839f, -0.416154f, -0.290309f, 0.407541f, 0.48534f, + 0.126564f, 0.0709566f, -0.0469664f, 0.735403f, -0.365963f, + 0.150295f, -0.50147f, 0.021383f, 0.76514f, 0.0085721f, + -0.416384f, 1.22268f, 0.0832438f, 0.367813f, -0.12012f, + 0.823183f, -0.0525972f, -0.325526f, -0.0983032f, 0.370128f, + 0.368778f, 0.138971f, -0.0397997f, 0.411058f, -0.0400404f, + 0.588437f, -0.29963f, -0.107992f, -1.75238f, -0.274387f, + 0.430418f, 0.495152f, 0.283172f, -0.441166f, 0.195339f, + -0.436182f, -0.252613f, 0.176204f, -0.126541f, -0.474833f, + 
-0.0721603f, -0.496599f, -0.0608464f, 0.0333451f, -0.0621485f, + 0.0843859f, 0.0637854f, -0.145291f, 0.14876f, 0.181665f, + -0.675805f, 0.294903f, 0.301118f, -0.225957f, 0.0105897f, + -0.136427f, -0.555925f, -0.158853f, -0.216779f, 0.0612481f, + -0.107158f, 0.352451f, 0.140536f, -0.0148237f, 0.189371f, + -0.091046f, -0.0476226f, 0.366054f, -0.0723413f, 0.389883f, + -0.0213411f, 0.0279539f, 0.194827f, -0.271502f, -0.166474f, + 0.0690549f, 0.0584665f, 0.0198415f, -0.442348f, 0.1571f, + -0.113463f, -0.16822f, -0.0580659f, -0.13441f, -0.0022386f, + 0.251521f, -0.160494f, -0.0753547f, 0.0897289f, 0.137917f, + 0.129836f, 0.0816833f, -0.626288f, 0.0643293f, -1.20001f, + 0.085631f, -0.195602f, 0.251244f, 0.0321744f, 0.0493178f, + -0.220616f, 0.724075f, -0.00831514f, 2.00319f, 0.407932f, + 0.0710799f, -0.166128f, 0.0126611f, -0.229644f, -0.0984299f, + 0.632041f, -0.0946141f, 0.295315f, 0.100934f, 0.184883f, + -0.236173f, 0.158081f, 0.195775f, 0.413542f, 0.789801f, + 0.767741f, 0.166275f, -0.348271f, -0.384074f, -0.291648f, + -0.119899f, 0.0368354f, 0.0751987f, 1.04217f, -0.159002f, + -2.71592f, -0.788502f, -1.06268f, 0.536057f, 0.0575876f, + 1.06811f, 0.12033f, 0.198578f, -0.0419196f, 0.0631388f, + 0.623138f, -0.142226f, 1.33129f, 0.0868059f, -0.0287825f, + 0.139378f, -0.143037f, 0.307452f, 0.0363987f, -0.0976368f, + 0.040544f, 0.0269327f, -0.0845524f, 0.0674699f, 0.104501f, + -0.0351155f, 0.167071f, 0.00986971f, 0.10284f, 0.0300016f, + 0.192601f, 0.0397177f, 0.0251346f, -0.00912908f, -0.0452825f, + 0.0164356f, -0.0275149f, 0.194846f, 0.0943608f, 1.61674f, + 0.0124345f, 0.523787f, 0.0397258f, -0.17208f, -0.147808f, + -1.23583f, 0.676385f, 0.551994f, 0.0233041f, 0.0116391f, + -0.466706f, 0.154725f, -0.207371f, 0.606662f, 0.247286f, + 0.31216f, 0.173765f, -0.268033f, 0.224422f, 0.314649f, + 0.481922f, -0.190604f, -0.0129162f, 0.270552f, 0.135195f, + 0.0927735f, -0.226099f, 0.53897f, 0.103309f, -0.0257271f, + -0.0246776f, 0.442013f, -0.179246f, -1.02581f, 0.206176f, + -0.326365f, 0.391623f, -0.103549f, 0.115645f, 0.0269328f, + -0.584517f, -0.237502f, 0.157996f, 0.0447407f, -0.161f, + -0.126072f, -0.148967f, -0.416347f, 0.0236496f, -1.12612f, + 0.0120709f, -0.00979376f, 0.0507126f, -0.172262f, 0.0697059f, + -0.212334f, 0.335731f, -0.0301362f, -0.839583f, -0.238539f, + 0.0636752f, -0.0467217f, -0.0372118f, -0.144615f, -0.161773f, + -0.648242f, 0.158197f, -0.051471f, -0.0615805f, -0.0426936f, + -0.0745554f, 0.358975f, 0.358297f, 0.0568553f, -1.14383f, + -0.103955f, 0.728194f, -0.224945f, -0.31659f, -0.204458f, + 0.171763f, -0.465666f, 0.899234f, -0.37042f, -0.0894774f, + 0.11478f, -0.334957f, 0.0896514f, 0.413251f, 0.359471f, + 1.41597f, 0.558082f, 0.153486f, 0.0270558f, -0.0178797f, + 0.124983f, -0.12273f, -1.04516f, -0.125375f, 0.370336f, + -0.209423f, -0.36816f, -0.66077f, -0.0180773f, -0.628921f, + -0.178542f, 0.0346841f, 0.0319309f, -0.470138f, 0.172763f, + 0.0798846f, -0.259737f, -0.652461f, -0.386283f, -0.474447f, + -0.924054f, -0.0154613f, -0.613712f, -0.138068f, -0.337842f, + 0.217921f, -0.0711405f, 0.000404091f, -0.703766f, 0.0364683f, + 0.150173f, 0.0126249f, 0.170594f, 0.0371879f, -0.0862515f, + -0.23454f, -0.0144143f, 0.164947f, 0.45591f, 0.115703f, + 0.069752f, -0.011993f, 0.0402097f, 0.00697581f, 0.0811613f, + 0.384752f, 0.341977f, 0.06087f, 0.0590107f, 0.00812679f, + 0.121211f, -0.0612108f, 0.167851f, 0.195781f, -1.62162f, + 0.336292f, -0.0772523f, -0.310786f, 0.188257f, -0.0325804f, + -0.240098f, 0.158748f, -0.265264f, 3.19593f, -0.449251f, + -1.33102f, -0.482856f, -0.435731f, 0.300808f, 
0.346503f, + 2.67378f, -0.152379f, 0.219322f, -0.146119f, -0.0584806f, + -0.0276895f, -0.21955f, -0.479179f, -0.689545f, 0.152799f + }; + +static const float av1_intra_mode_cnn_partition_branch_2_dnn_layer_0_bias[] = { + -0.296575f, 0.101072f, -0.208429f, 0.111585f, 0.699552f, -0.379484f, + 0.313244f, -0.746369f, 0.867757f, 0.457318f, -0.0190943f, -0.290745f, + 0.45592f, -0.160465f, -0.634243f, 0.0829737f +}; + +static const float + av1_intra_mode_cnn_partition_branch_2_dnn_layer_1_kernel[] = { + 0.27511f, -2.14172f, 1.25755f, -0.554772f, 0.589508f, + 0.228307f, 0.0754914f, 1.07061f, 0.293323f, 0.65162f, + -0.272016f, -1.33519f, -0.606759f, -0.57827f, 0.368807f, + -1.48668f, 0.162439f, 0.0821667f, 0.225535f, -0.795996f, + 0.0328293f, 0.975476f, -0.187514f, 2.47069f, -1.5638f, + -0.461524f, 0.00310062f, 1.1556f, -0.286206f, 0.00426021f, + 0.585836f, 0.900007f, 0.384055f, 0.189435f, -0.157291f, + -0.0710573f, -0.0663986f, -0.710772f, -0.669136f, -0.379493f, + -1.2634f, -0.377524f, 0.824094f, 0.312308f, 0.125368f, + -0.382737f, 0.637109f, 0.61907f, -0.741184f, 0.00257198f, + -0.0151343f, -0.669826f, -0.439855f, 0.564852f, -0.0588036f, + -1.38123f, -1.1126f, 0.701831f, 0.198686f, 0.266866f, + 0.270172f, -0.692401f, 0.272533f, -1.70914f, 0.66064f, + 0.0886659f, -0.132233f, 0.270531f, -0.479581f, 0.704338f, + -0.307039f, -0.111792f, -2.05753f, -0.231749f, 0.300528f, + 0.383266f, -0.130857f, -0.373944f, 1.21025f, 0.704655f, + -0.589422f, 0.267185f, -0.109065f, -0.195991f, 0.20209f, + -0.0676526f, -0.183926f, 0.164894f, 0.0877923f, 0.565943f, + -0.0610466f, -0.86354f, -0.80853f, -0.176111f, -1.45016f, + -2.29078f, -0.124524f, -0.139305f, -0.187858f, -0.0250151f, + -0.572544f, 0.185336f, -0.69275f, -0.430354f, -0.30861f, + -0.754258f, -0.468221f, -0.160487f, -0.766692f, -0.636418f, + -0.71016f, 0.576125f, -0.240476f, -0.954556f, -0.104693f, + 0.155557f, -0.840224f, -0.685457f, -0.0346927f, -0.644882f, + -1.92475f, -0.314544f, 0.463569f, 0.323569f, -0.990124f, + -0.213658f, 0.407183f, 1.19797f, -4.77004f, -0.0613379f, + -2.40345f, -0.0591791f, -0.477622f, -0.303556f, 0.104077f, + -0.974128f, -0.035172f, 1.47064f, 0.233727f, -0.0754056f, + 0.158553f, 0.0614361f, -1.38865f, 0.690729f, 0.568455f, + 0.205866f, -0.0236852f, -0.0921077f, -0.538954f, 0.336613f, + -0.427115f, 0.791754f, -1.819f, -0.404432f, 0.670242f, + -0.0343869f, -0.37191f, 0.0271262f, 0.988161f, -0.547343f, + 0.925304f, 0.548079f, -0.430343f, -0.214109f, 0.242013f, + 1.39027f, 0.37648f, -1.63524f, -0.158864f, -0.572779f, + -0.766801f, -2.62032f, 0.47799f, -1.12025f, -0.115283f, + 1.22349f, -0.262132f, -0.151274f, 0.390483f, -0.496482f, + 1.06166f, -0.183052f, 0.54647f, 0.847486f, 0.0229506f, + 0.653309f, -0.020736f, -1.27453f, 0.48386f, -0.366625f, + -0.515725f, -1.31196f, 0.140701f, -0.183636f, 0.000413912f, + 0.300993f, -0.849529f, -0.59764f, -0.212992f, -0.933365f, + -1.4054f, -0.091982f, 0.41695f, 0.264004f, -0.26379f, + -0.0738219f, 0.434052f, 1.16617f, -0.639624f, -0.146465f, + 0.0409936f, -0.900182f, 0.73517f, 0.805746f, -0.208088f, + 1.74459f, -0.0592751f, 0.624865f, -0.62325f, -0.446315f, + 0.150526f, 0.0526697f, 0.374254f, -0.658043f, 1.02623f, + -0.941758f, 0.381217f, -0.359448f, 0.160051f, 0.556455f, + 0.239382f, 0.75851f, 0.437583f, -0.122221f, 0.746136f, + 0.218286f, -0.426729f, 0.0353903f, -0.830513f, -0.877586f, + 0.488077f, -0.132354f, -0.180756f, 0.736163f, -0.202934f, + -0.882534f, 0.166305f, 0.183122f, 0.0599858f, 0.442687f, + 0.0522908f, -1.17755f, -1.03733f, 0.392363f, 0.672718f, + -1.44704f, 0.360623f, 0.390298f, 
-0.213968f, 0.169783f, + -0.717536f, -0.830984f, -0.445049f, 0.196772f, -0.730634f, + -1.09497f, 0.344012f, -0.292802f, -0.67966f, 0.138515f, + -0.361803f, 0.936778f, -0.189802f, 0.197777f, -0.367507f, + -0.293653f, 0.447759f, -0.409245f, -0.687568f, -0.431301f, + -0.271234f, -0.585413f, -0.936414f, -0.396049f, -0.29388f, + -0.0930843f, 0.0179339f, 0.262463f, -0.166598f, 0.0171466f, + -0.329641f, 0.39343f, 0.657445f, -0.579052f, -0.312444f, + -0.0915881f, -0.432622f, -0.247645f, 0.485749f, -0.602508f, + -0.347936f, 0.287353f, 0.288705f, 0.168397f, 0.568228f, + -0.493586f, 1.04155f, -0.097956f, 0.658928f, -0.561007f, + 0.0457783f, 2.12744f, 0.182683f, -0.690282f, 0.183302f, + 0.0309499f, -0.722251f, 0.0660448f, -0.333277f, 0.198929f, + -0.724102f, -0.405597f, 0.614868f, -0.292862f, 0.886513f, + 0.142353f, -1.48934f, -0.97273f, 0.199683f, 0.522121f, + 0.0877478f, -0.172593f, -1.58858f, 0.113191f, -0.436178f, + 0.640895f, -0.504676f, 0.0658654f, -0.361301f, 0.604323f, + 0.315196f, -0.423021f, -0.323484f, -0.563163f, 0.118989f, + -0.404508f, -0.0550995f, -0.0359236f, -0.126574f, -0.357288f, + -0.0494502f, 1.04959f, -0.31646f, -0.0376684f, -0.300744f, + -0.135016f, 0.102696f, -0.392333f, -1.17502f, 0.505227f, + 0.337608f, -0.348831f, -0.420815f, 0.202791f, -0.154264f, + -0.563686f, 0.0942187f, 0.353862f, 0.0303509f, -0.132794f, + 0.420746f, 0.143529f, 0.455822f, -1.28348f, -1.35662f, + -0.850688f, -1.76361f, -0.717546f, 0.443111f, 0.227155f, + -0.863307f, -0.452033f, -0.278151f, 1.86233f + }; + +static const float av1_intra_mode_cnn_partition_branch_2_dnn_layer_1_bias[] = { + -0.103218f, -0.359587f, 0.619666f, -0.473497f, -0.649803f, 0.86992f, + -0.115561f, 0.335114f, -0.285044f, -0.59295f, 0.24497f, 0.611583f, + 0.38568f, 0.137913f, -0.281191f, -0.0107777f, 0.487236f, -0.262363f, + 0.696962f, 0.121565f, 0.312511f, 0.430916f, 0.694134f, 0.393632f +}; + +static const float av1_intra_mode_cnn_partition_branch_2_logits_kernel[] = { + -2.42496f, -1.239f, 0.832673f, 1.56923f, -2.6175f, -1.42492f, + -0.311387f, -1.94237f, 0.54071f, -2.50391f, 0.352205f, -0.96572f, + 1.47144f, -2.04702f, -1.12372f, -0.709186f, 0.812238f, 0.310389f, + 0.789163f, -0.65236f, 1.77018f, 0.273867f, 1.19506f, 1.07022f +}; + +static const float av1_intra_mode_cnn_partition_branch_2_logits_bias[] = { + 0.953424f +}; + +static const float + av1_intra_mode_cnn_partition_branch_3_dnn_layer_0_kernel[] = { + 0.0485154f, 0.0496279f, 0.0268229f, -0.0584843f, -0.166928f, + 0.0316731f, -0.0895094f, -0.0433243f, -0.00893639f, -0.0886265f, + -0.0345622f, -0.235395f, -0.213754f, -0.00212398f, 0.0218857f, + -0.0054983f, -0.0248236f, 0.081822f, -0.0355708f, -0.0795593f, + -0.106995f, -0.0596378f, 0.0350686f, -0.133863f, -0.00582928f, + 0.114963f, 0.193906f, -0.00419085f, 0.0430529f, -0.128318f, + 0.0614715f, -0.000952935f, -0.0345722f, -0.109459f, 0.074204f, + -0.0865131f, 0.0649158f, -0.0942417f, -0.10122f, -0.047551f, + -1.27825f, -0.0125456f, -0.019722f, -0.152058f, 0.280306f, + -0.121231f, -0.0565484f, 0.0959188f, 0.0603919f, 0.0457468f, + 0.967589f, 0.105892f, -0.118326f, 0.198933f, 0.163437f, + -0.056824f, -0.0302956f, -0.07366f, -0.681407f, -0.0781575f, + 0.255732f, -0.0712105f, 0.177882f, 0.709206f, -0.232457f, + 1.33809f, -0.0328557f, 0.0572231f, -1.01361f, 0.130676f, + -0.205159f, 0.975398f, 0.356293f, 0.0766364f, -0.297397f, + -0.0261066f, -0.0933549f, 0.0568851f, -0.0123034f, -0.0433538f, + 0.131003f, 0.890705f, 0.0084565f, 0.00547395f, 0.00157634f, + 0.0047937f, -0.0511092f, 0.0300034f, -0.00604993f, -0.0133502f, + 
-0.000274302f, 0.129728f, -0.00532916f, 0.0855351f, 0.136885f, + 0.0175562f, -0.0123633f, -0.000512229f, -0.019924f, -0.0316328f, + 0.422972f, 0.0460336f, 0.0170841f, -0.00086795f, -0.0655137f, + 0.0287308f, -0.0375644f, -0.0329215f, -0.0273072f, 0.0241426f, + -0.0429052f, 0.0221593f, -0.063881f, -0.0347391f, -6.44339e-07f, + 0.0476934f, -0.0150068f, 0.0146403f, -0.0653099f, 0.0107635f, + 0.012407f, 0.0048935f, 1.50975f, 0.322256f, 0.17881f, + 0.0943775f, -0.100583f, -0.367022f, -0.156525f, -0.0397161f, + 0.0752784f, -0.00219022f, -0.887456f, 0.0153415f, -0.0148185f, + -0.56435f, 0.163996f, -0.0221024f, -0.0115872f, -0.0529284f, + 0.156838f, -1.13813f, -0.207863f, -0.00484959f, 0.135719f, + 0.131004f, 0.0417939f, 0.31453f, 0.121719f, -0.101515f, + 0.267951f, 0.219727f, 0.0398821f, 0.0713504f, 3.65918e-06f, + -0.00659998f, 0.477343f, -0.128426f, 0.0648877f, 0.111884f, + 0.224552f, 0.0617426f, 0.117742f, 0.031377f, 0.0586865f, + -0.459293f, 0.100211f, -0.14127f, 0.624412f, 0.014659f, + -1.41807f, -0.382452f, -0.695931f, -0.103153f, 0.145808f, + 0.333526f, -0.256367f, 0.096842f, 0.102458f, -0.181224f, + 0.729272f, 0.151177f, 1.46729f, 0.111044f, -4.28813f, + 0.0178379f, 0.47641f, -6.57533f, 0.0633335f, 0.496934f, + -0.154657f, -9.07298e-05f, 0.848937f, -5.40143f, 0.375685f, + 0.23586f, -0.166591f, -0.0191648f, -0.039862f, -3.25093f, + 0.168472f, -0.260317f, -5.51548f, 0.0575334f, 0.328979f, + 0.112644f, 0.231339f, -0.122641f, 0.0567331f, 1.19541f, + -0.038735f, 0.0630576f, 0.176668f, 0.0757184f, -0.833104f, + 0.133669f, 0.982669f, 0.0311783f, 0.0908558f, -0.10065f, + -0.0386599f, -0.231587f, -0.83876f, -0.347148f, 0.225529f, + -1.29625f, 0.0806834f, 0.369648f, -1.63367f, 0.118057f, + -0.311948f, 0.95022f, -0.354807f, -0.648657f, -1.72048f, + 0.260397f, 0.915555f, 0.057737f, -0.162019f, -0.453543f, + -1.70388f, -0.311632f, -0.731593f, -0.678089f, 0.10438f, + -0.293911f, 0.144864f, 0.039212f, 0.0289241f, -0.0685266f, + 0.634592f, -0.0798614f, -0.119197f, -0.00517433f, -0.04653f, + -0.127568f, -0.0582645f, 0.0735302f, -0.0946823f, 0.00865585f, + 0.0115748f, 0.0194847f, 0.0455664f, 0.181006f, -0.0824601f, + 0.0869093f, 0.264767f, -0.0750432f, 0.135136f, 0.316511f, + 0.399015f, 0.0994808f, -0.166944f, -0.102126f, 0.457858f, + 0.300488f, 0.467582f, 0.830244f, -0.0511439f, -0.522892f, + -0.183049f, 0.2626f, 0.118382f, 0.241674f, 0.250399f, + -0.0963507f, -0.83231f, -0.227699f, -0.133314f, 0.231718f, + -0.0700274f, 0.891311f, 0.224742f, -0.572836f, 0.402798f, + -0.191576f, 0.740922f, -0.00374073f, 0.658178f, -0.209364f, + -0.416259f, 0.166297f, 0.0095577f, -0.0876076f, 0.424954f, + 0.265226f, -0.129343f, -0.203146f, -0.194637f, -0.818142f, + -0.164152f, -0.368962f, 0.273373f, 0.599927f, -0.19859f, + 0.0939651f, -0.12458f, -0.751816f, -0.302997f, -0.139176f, + -0.372737f, 0.332704f, -0.206045f, -0.00593763f, -0.452363f, + -0.2704f, -0.198846f, 0.0976308f, -0.216124f, 0.110122f, + -0.220342f, 0.00763426f, -0.0272775f, -0.190395f, -0.0359411f, + -0.0395759f, 0.000941162f, -1.49959f, 0.0914233f, 0.448346f, + -0.420435f, -0.0102102f, -0.0757978f, -0.0177687f, -0.0231492f, + -0.142125f, 1.31774f, 0.0269368f, 0.134566f, 0.152079f, + -0.139933f, 0.139226f, -0.214467f, -0.194446f, -0.555893f, + 0.271197f, -0.111047f, 0.0888069f, -0.198121f, 0.0871713f, + 0.100612f, 0.429782f, -0.3787f, 0.123147f, -0.12538f, + 0.235678f, 0.139237f, 0.223326f, 0.85806f, -0.00554756f, + 0.285095f, 0.0954683f, 0.0464989f, 0.100806f, -0.0211297f, + 0.121672f, 0.242473f, 0.0810475f, -0.834356f, 0.119629f, + 0.111338f, -0.227126f, 
0.159296f, -0.0584685f, -0.108265f, + -0.0909221f, -0.21749f, 0.0929309f, -0.176815f, 0.178067f, + -0.0025905f, 0.317883f, 0.313045f, 0.26774f, -0.589329f, + -1.19882f, -0.285513f, -0.109478f, 0.309441f, -0.0604479f, + 0.947461f, -0.142342f, -0.9086f, -0.814788f, 0.184588f, + -0.0736317f, 0.276237f, 0.13132f, -0.3931f, -0.381744f, + -0.0122719f, 0.0246101f, -0.0920412f, 0.11331f, -0.110355f, + 0.00848064f, 0.0931248f, -0.0638655f, -4.30869e-05f, -0.300367f, + 0.0489508f, 0.464441f, -0.0466243f, -0.0137732f, 0.0099241f, + -0.223972f, 0.188966f, -0.653173f, -0.354322f, 0.189237f, + -0.624276f, -1.46218f, -0.075161f, -0.516172f, 0.40993f, + 0.291178f, -1.95088f, -0.0352157f, 0.196354f, -0.335897f, + 0.0857039f, 0.605319f, -1.12923f, -0.638387f, 1.41868f, + 0.0955757f, -0.00913477f, 0.315935f, -0.671223f, -0.851436f, + -0.157464f, -0.296763f, 0.182277f, -0.139309f, 0.232789f, + 0.869562f, 0.248894f, 0.242709f, 0.195479f, 0.106153f, + 0.358881f, 0.167443f, 0.982987f, 0.104767f, -0.033925f, + -0.0263185f, 0.0045304f, 0.0722479f, -0.111307f, 0.00128896f, + 0.406128f, -0.00944947f, 0.121592f, 0.546284f, -0.00175696f, + 0.776588f, 0.238846f, 0.064469f, 0.27082f, 0.269187f, + 0.0294455f, 0.62364f, -0.27872f, -0.0488013f, 0.229024f, + 0.154457f, 0.0445898f, 0.349943f, 0.0710998f, 0.0820674f, + 0.0279449f, 0.172826f, -0.122156f, -0.164688f, 0.0292124f, + 0.0496112f, -0.741762f, 0.0673926f, 0.108159f, -0.0942327f, + -0.0562883f, 0.558231f, 0.0552399f, 0.211393f, 0.0376817f, + -0.275788f, 0.0548436f, 0.212732f, 0.163603f, 0.0663363f, + -0.0252315f, 0.164533f, 0.0826088f, 0.0301389f, 0.345705f, + -0.0378046f, -0.139581f, 1.30162f, 1.23551f, -0.446693f, + 0.682534f, -0.0831157f, -0.0121595f, 1.50505f, 0.0839017f, + -0.953413f, 0.0820985f, -0.125556f, 0.699796f, -0.140453f, + 0.168438f, -0.110966f, 0.173806f, 0.114683f, 0.132502f, + -0.0453539f, -0.133096f, 0.511947f, -0.180657f, -0.0298605f, + 0.291437f, -0.0275017f, -0.229703f, -0.0504205f, 0.559622f, + 0.384601f, 0.111024f, -0.0773559f, -0.0591752f, -0.0866182f, + -0.189437f, -0.262345f, -0.0372182f, 0.149925f, 0.154644f, + -0.188298f, 0.236949f, -0.199328f, -0.378909f, -0.680128f, + 0.277184f, -0.172784f, 0.184717f, -0.23899f, 0.0712069f, + 0.0235425f, 0.4225f, -0.441487f, 0.177434f, -0.298303f, + 0.295696f, 0.17346f, 0.220542f, -0.680116f, 0.00266223f, + -0.0408459f, -0.15486f, 0.24335f, 0.237258f, -0.0283245f, + 0.19703f, -0.100027f, 0.0554843f, -1.03081f, 0.151745f, + 0.538582f, 0.370368f, 0.196683f, 0.0222123f, -0.0831401f, + -0.0832803f, -0.286743f, -0.686003f, 0.0995004f, 0.148901f, + -0.0436037f, -0.316508f, 0.00391835f, -0.228452f, 0.940058f, + 0.520047f, -0.334211f, 0.652142f, -0.0755971f, 0.0965123f, + -0.98191f, 0.394096f, -0.420466f, 0.327284f, -0.134651f, + 0.849297f, -0.523372f, 0.010327f, 0.133636f, 0.298119f, + -0.257389f, 0.0376153f, -0.198298f, 0.0736235f, 0.608809f, + 0.0291836f, -0.290005f, -0.141316f, 0.0184599f, 0.0554437f, + 0.0621519f, 0.485276f, 0.617062f, -0.0924811f, -0.0120834f, + 0.0817611f, 0.100421f, -0.0153553f, -0.135958f, -0.0185322f, + -0.395803f, -0.204862f, 0.547916f, -0.438117f, 0.0229788f, + 0.406981f, 0.795584f, -2.02756f, -0.8355f, -0.386789f, + 0.00968368f, 1.2147f, -0.740869f, -1.18415f, -0.954918f, + -0.541142f, 0.0596003f, 0.107189f, -0.411708f, -0.964593f, + 0.511906f + }; + +static const float av1_intra_mode_cnn_partition_branch_3_dnn_layer_0_bias[] = { + -0.485545f, 0.131552f, 0.796833f, -0.157582f, -0.0948124f, 0.00818613f, + -0.485562f, 0.3826f, -0.0839326f, 0.170998f, 0.279545f, -0.287143f, + 
0.184986f, -0.0719864f, 0.19748f, 0.404145f +}; + +static const float + av1_intra_mode_cnn_partition_branch_3_dnn_layer_1_kernel[] = { + 1.30172f, 0.720189f, 0.261675f, -0.466201f, 1.21773f, + 0.495525f, 0.62398f, 0.44567f, -0.330993f, -0.269798f, + 0.835161f, -0.294874f, 0.186981f, 0.0162467f, 0.367654f, + 0.658468f, 1.08325f, 1.01558f, 0.12783f, -0.280581f, + 2.2204f, 0.0337286f, -0.403649f, -0.230908f, -0.35188f, + 0.437712f, -0.103634f, -0.645929f, 1.17407f, 0.157385f, + 0.212438f, 1.41874f, 0.284242f, -0.493105f, 1.0703f, + 0.00632116f, 1.18222f, -0.26003f, 0.276795f, -0.823156f, + 0.29577f, -0.157467f, -0.18092f, 0.0237336f, 0.205715f, + -0.295679f, 0.165443f, -0.628279f, 1.00804f, 0.361232f, + 0.646155f, -0.028651f, 1.64317f, 0.334251f, -1.50713f, + -1.51685f, -0.488522f, 0.169694f, -0.593176f, -0.372682f, + -1.50223f, 0.35076f, -0.24641f, -0.237189f, 0.190502f, + -0.948191f, -0.303346f, 0.45108f, -0.794368f, -2.3116f, + 0.404008f, -2.67269f, -0.941992f, -0.45336f, 0.0655987f, + -0.288432f, 0.106068f, 0.286978f, 0.121403f, 0.462739f, + 0.0130292f, 0.240597f, -2.30983f, -0.453309f, -0.149335f, + 0.856424f, -0.186576f, 0.769961f, -0.0657097f, -0.976188f, + 0.972971f, -0.532728f, -0.699334f, -0.168803f, 0.361945f, + 0.950769f, 1.5368f, -0.223899f, 1.17547f, -0.281483f, + 0.533619f, 0.315344f, 0.0854543f, 0.464701f, 0.346828f, + 0.271794f, -0.0185388f, 0.109517f, 0.371662f, -0.10852f, + 0.244092f, 0.491959f, -0.750281f, 1.41865f, -3.51221f, + 0.298194f, -0.0790832f, -0.134158f, -0.424084f, 0.189593f, + -0.238361f, -0.407872f, -0.366222f, -0.606813f, -0.230498f, + 0.387248f, -0.102734f, -0.190544f, -1.43649f, 0.141338f, + -0.0438917f, 0.204628f, 1.57033f, 0.0366937f, -0.14733f, + 0.048198f, -0.122631f, 0.183354f, 0.0658753f, -0.243381f, + 0.0246889f, -0.768798f, -0.0644054f, 0.775073f, 1.63419f, + 0.491624f, 0.21898f, -0.358944f, 3.31304f, 0.0195916f, + 0.236174f, 0.530704f, 0.140124f, 0.0736778f, -0.27361f, + -0.598836f, -1.01659f, 0.361765f, 0.00455986f, -0.345222f, + 1.68731f, 0.764082f, 0.193555f, 0.322782f, 1.19801f, + 0.538935f, -0.0393231f, -0.0248292f, -0.151168f, 0.479879f, + -0.208582f, 0.22798f, 0.335473f, -0.00295455f, 0.139539f, + 0.400814f, 0.478307f, -0.189376f, 0.540084f, 0.466072f, + 0.920231f, 0.398774f, -0.472403f, -0.0431972f, -0.581665f, + -0.990058f, 0.258995f, -0.0148889f, 0.27105f, 0.340334f, + 0.223576f, -0.0405193f, -1.23888f, -1.45229f, -1.44543f, + -0.376146f, 0.132601f, -0.4064f, -0.583611f, -0.374588f, + 0.0659428f, 0.325652f, -0.338456f, 0.253767f, -0.0181164f, + 0.681732f, 0.222041f, 0.837496f, 1.09735f, 0.156328f, + 0.177236f, -0.702702f, 0.473689f, 0.322118f, 0.43343f, + 0.315441f, -0.40798f, 0.0811291f, 0.631431f, 0.361929f, + 0.0723276f, 0.0164498f, 0.0293847f, 0.156406f, -1.10453f, + 0.837977f, -1.03449f, -0.348408f, 1.71953f, -0.401765f, + 0.64272f, -0.182438f, -0.233954f, 0.364597f, 0.269177f, + -0.578512f, 0.397216f, 0.0425122f, -0.258728f, 1.41621f, + -0.688768f, 0.0944726f, 0.253163f, -0.989037f, 1.72726f, + 1.15976f, -0.0460612f, 0.534186f, -0.136814f, 0.49327f, + 0.115744f, -0.633052f, -0.433855f, -1.01874f, -0.324035f, + 0.489487f, 1.08696f, 0.836376f, -0.423477f, -0.421309f, + 1.07348f, 0.323266f, 0.717604f, 0.366422f, 0.32983f, + 0.336583f, 0.749292f, -0.210666f, 0.387101f, -0.583376f, + 0.0391101f, -1.07537f, 0.914591f, -0.51303f, 1.15023f, + -0.0378782f, 0.262889f, -0.841128f, 0.41619f, -0.669704f, + -0.109995f, 1.01825f, -0.194853f, 0.120739f, 0.627889f, + -0.00269221f, 0.751152f, -0.529865f, -1.50238f, 0.184521f, + 0.795464f, 0.106099f, 
1.83117f, 0.0883305f, 0.306844f, + -0.0671504f, -0.169306f, -0.214575f, -0.121606f, -0.234965f, + 0.109752f, -0.35831f, -0.07894f, 0.497203f, -2.63013f, + 0.815608f, -0.193593f, -0.62292f, 0.338941f, 0.0970922f, + -0.531178f, 0.723346f, 0.35063f, 0.182647f, -0.257013f, + 0.784924f, -0.217915f, -0.0797363f, -0.399706f, -0.485602f, + 1.23155f, 0.345998f, 0.322949f, -0.168196f, -0.173313f, + 0.282205f, 0.45117f, 0.918706f, -0.046172f, -0.0873883f, + 0.56103f, -0.485768f, 0.546199f, 0.254997f, 0.394296f, + 0.607178f, 0.667532f, -0.343883f, 0.374402f, -0.531439f, + 2.27782f, -1.13255f, 0.505867f, -0.514742f, 0.998571f, + -1.60984f, -0.172873f, -0.0604094f, 0.719791f, -0.733982f, + 0.348905f, 1.39008f, -0.895343f, -0.677064f, -1.84221f, + 0.0434018f, -0.534794f, 0.0434753f, -0.266576f, 0.268099f, + -0.242935f, 0.00166289f, 0.0263789f, -0.224794f, -0.113493f, + -0.236397f, 0.0879936f, 0.510895f, -0.511789f, -1.48962f, + -2.78268f, -0.0495784f, -0.0343907f, 0.440459f, -0.364209f, + 0.833223f, -0.0589337f, 0.00181418f, 0.455499f, 0.101762f, + -1.16424f, 0.270405f, 0.219033f, -4.91105f + }; + +static const float av1_intra_mode_cnn_partition_branch_3_dnn_layer_1_bias[] = { + -0.40114f, -0.372342f, -0.216186f, -0.240014f, -0.341773f, -0.344489f, + -0.113037f, 0.198479f, 0.482958f, -0.630072f, -0.728704f, -0.171963f, + 0.519883f, 0.253003f, -0.121618f, -0.0569875f, -0.485568f, -0.147577f, + 0.533305f, -0.587251f, -0.120837f, -0.483953f, 0.445641f, -0.125136f +}; + +static const float av1_intra_mode_cnn_partition_branch_3_logits_kernel[] = { + -1.57431f, -1.09069f, 1.67996f, -0.669702f, 0.499807f, -3.03145f, + -0.878135f, 0.637818f, -1.58419f, -3.79756f, 0.62755f, -0.446646f, + 0.653269f, -0.667854f, -2.19774f, -3.53349f, 2.6107f, -0.685892f, + -1.2603f, -0.89707f, -0.715551f, 0.382202f, 2.09574f, 0.469386f +}; + +static const float av1_intra_mode_cnn_partition_branch_3_logits_bias[] = { + -0.022787f +}; + +static const NN_CONFIG av1_intra_mode_cnn_partition_branch_0_dnn_config = { + BRANCH_0_NUM_DNN_FEATURES, + BRANCH_0_NUM_LOGITS, + BRANCH_0_NUM_DNN_LAYERS, + { + BRANCH_0_NUM_DNN_LAYER_0_UNITS, + BRANCH_0_NUM_DNN_LAYER_1_UNITS, + }, + { + av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_kernel, + av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_kernel, + av1_intra_mode_cnn_partition_branch_0_logits_kernel, + }, + { + av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_bias, + av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_bias, + av1_intra_mode_cnn_partition_branch_0_logits_bias, + }, +}; +static const NN_CONFIG av1_intra_mode_cnn_partition_branch_1_dnn_config = { + BRANCH_1_NUM_DNN_FEATURES, + BRANCH_1_NUM_LOGITS, + BRANCH_1_NUM_DNN_LAYERS, + { + BRANCH_1_NUM_DNN_LAYER_0_UNITS, + BRANCH_1_NUM_DNN_LAYER_1_UNITS, + }, + { + av1_intra_mode_cnn_partition_branch_1_dnn_layer_0_kernel, + av1_intra_mode_cnn_partition_branch_1_dnn_layer_1_kernel, + av1_intra_mode_cnn_partition_branch_1_logits_kernel, + }, + { + av1_intra_mode_cnn_partition_branch_1_dnn_layer_0_bias, + av1_intra_mode_cnn_partition_branch_1_dnn_layer_1_bias, + av1_intra_mode_cnn_partition_branch_1_logits_bias, + }, +}; +static const NN_CONFIG av1_intra_mode_cnn_partition_branch_2_dnn_config = { + BRANCH_2_NUM_DNN_FEATURES, + BRANCH_2_NUM_LOGITS, + BRANCH_2_NUM_DNN_LAYERS, + { + BRANCH_2_NUM_DNN_LAYER_0_UNITS, + BRANCH_2_NUM_DNN_LAYER_1_UNITS, + }, + { + av1_intra_mode_cnn_partition_branch_2_dnn_layer_0_kernel, + av1_intra_mode_cnn_partition_branch_2_dnn_layer_1_kernel, + av1_intra_mode_cnn_partition_branch_2_logits_kernel, + }, + { + 
av1_intra_mode_cnn_partition_branch_2_dnn_layer_0_bias, + av1_intra_mode_cnn_partition_branch_2_dnn_layer_1_bias, + av1_intra_mode_cnn_partition_branch_2_logits_bias, + }, +}; +static const NN_CONFIG av1_intra_mode_cnn_partition_branch_3_dnn_config = { + BRANCH_3_NUM_DNN_FEATURES, + BRANCH_3_NUM_LOGITS, + BRANCH_3_NUM_DNN_LAYERS, + { + BRANCH_3_NUM_DNN_LAYER_0_UNITS, + BRANCH_3_NUM_DNN_LAYER_1_UNITS, + }, + { + av1_intra_mode_cnn_partition_branch_3_dnn_layer_0_kernel, + av1_intra_mode_cnn_partition_branch_3_dnn_layer_1_kernel, + av1_intra_mode_cnn_partition_branch_3_logits_kernel, + }, + { + av1_intra_mode_cnn_partition_branch_3_dnn_layer_0_bias, + av1_intra_mode_cnn_partition_branch_3_dnn_layer_1_bias, + av1_intra_mode_cnn_partition_branch_3_logits_bias, + }, +}; + +#undef NUM_DNN_BRANCHES +#undef NUM_CNN_LAYERS +#undef BRANCH_0_NUM_DNN_LAYERS +#undef BRANCH_1_NUM_DNN_LAYERS +#undef BRANCH_2_NUM_DNN_LAYERS +#undef BRANCH_3_NUM_DNN_LAYERS +#undef CNN_LAYER_0_HEIGHT +#undef CNN_LAYER_0_WIDTH +#undef CNN_LAYER_0_IN_CH +#undef CNN_LAYER_0_OUT_CH +#undef CNN_LAYER_0_HORZ_STRIDE +#undef CNN_LAYER_0_VERT_STRIDE +#undef CNN_LAYER_1_HEIGHT +#undef CNN_LAYER_1_WIDTH +#undef CNN_LAYER_1_IN_CH +#undef CNN_LAYER_1_OUT_CH +#undef CNN_LAYER_1_HORZ_STRIDE +#undef CNN_LAYER_1_VERT_STRIDE +#undef CNN_LAYER_2_HEIGHT +#undef CNN_LAYER_2_WIDTH +#undef CNN_LAYER_2_IN_CH +#undef CNN_LAYER_2_OUT_CH +#undef CNN_LAYER_2_HORZ_STRIDE +#undef CNN_LAYER_2_VERT_STRIDE +#undef CNN_LAYER_3_HEIGHT +#undef CNN_LAYER_3_WIDTH +#undef CNN_LAYER_3_IN_CH +#undef CNN_LAYER_3_OUT_CH +#undef CNN_LAYER_3_HORZ_STRIDE +#undef CNN_LAYER_3_VERT_STRIDE +#undef CNN_LAYER_4_HEIGHT +#undef CNN_LAYER_4_WIDTH +#undef CNN_LAYER_4_IN_CH +#undef CNN_LAYER_4_OUT_CH +#undef CNN_LAYER_4_HORZ_STRIDE +#undef CNN_LAYER_4_VERT_STRIDE +#undef BRANCH_0_NUM_DNN_FEATURES +#undef BRANCH_0_NUM_DNN_LAYER_0_UNITS +#undef BRANCH_0_NUM_DNN_LAYER_1_UNITS +#undef BRANCH_0_NUM_LOGITS +#undef BRANCH_1_NUM_DNN_FEATURES +#undef BRANCH_1_NUM_DNN_LAYER_0_UNITS +#undef BRANCH_1_NUM_DNN_LAYER_1_UNITS +#undef BRANCH_1_NUM_LOGITS +#undef BRANCH_2_NUM_DNN_FEATURES +#undef BRANCH_2_NUM_DNN_LAYER_0_UNITS +#undef BRANCH_2_NUM_DNN_LAYER_1_UNITS +#undef BRANCH_2_NUM_LOGITS +#undef BRANCH_3_NUM_DNN_FEATURES +#undef BRANCH_3_NUM_DNN_LAYER_0_UNITS +#undef BRANCH_3_NUM_DNN_LAYER_1_UNITS +#undef BRANCH_3_NUM_LOGITS + +static const float av1_intra_mode_cnn_partition_split_thresh_hdres[5] = { + 100.000000f, 4.750139f, 1.655964f, 3.711212f, 0.963839f, +}; + +static const float av1_intra_mode_cnn_partition_no_split_thresh_hdres[5] = { + -100.000000f, -2.404842f, -3.858223f, -2.041206f, -1.573735f, +}; + +static const float av1_intra_mode_cnn_partition_split_thresh_midres[5] = { + 100.000000f, 3.218737f, 2.657764f, 0.868458f, 2.454447f, +}; + +static const float av1_intra_mode_cnn_partition_no_split_thresh_midres[5] = { + -100.000000f, -3.842426f, -4.005076f, -3.642994f, -2.467197f, +}; + +static const float av1_intra_mode_cnn_partition_split_thresh_lowres[5] = { + 100.000000f, 1.890757f, 2.658417f, 1.450626f, 1.833180f, +}; + +static const float av1_intra_mode_cnn_partition_no_split_thresh_lowres[5] = { + -100.000000f, -4.100921f, -4.564202f, -5.695176f, -1.483546f, +}; + +static const float av1_intra_mode_cnn_partition_mean[1] = { + 1.191922f, +}; + +static const float av1_intra_mode_cnn_partition_std[1] = { + 1.730044f, +}; + +static const int quad_to_linear_0[1] = { 0 }; +static const int quad_to_linear_1[4] = { 0, 1, 2, 3 }; +static const int quad_to_linear_2[16] = { 0, 1, 4, 5, 2, 
3, 6, 7,
+ 8, 9, 12, 13, 10, 11, 14, 15 };
+static const int quad_to_linear_3[64] = {
+ 0, 1, 8, 9, 2, 3, 10, 11, 16, 17, 24, 25, 18, 19, 26, 27,
+ 4, 5, 12, 13, 6, 7, 14, 15, 20, 21, 28, 29, 22, 23, 30, 31,
+ 32, 33, 40, 41, 34, 35, 42, 43, 48, 49, 56, 57, 50, 51, 58, 59,
+ 36, 37, 44, 45, 38, 39, 46, 47, 52, 53, 60, 61, 54, 55, 62, 63
+};
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_PARTITION_CNN_WEIGHTS_H_
diff --git a/third_party/aom/av1/encoder/partition_model_weights.h b/third_party/aom/av1/encoder/partition_model_weights.h
new file mode 100644
index 0000000000..71c1ace782
--- /dev/null
+++ b/third_party/aom/av1/encoder/partition_model_weights.h
@@ -0,0 +1,5646 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_
+#define AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/ml.h"
+
+// TODO(chiyotsai@google.com): The performance of these models is getting
+// worse due to the changes in the encoder. We should retrain the models here
+// to get better performance once we have the time.
+
+#define FEATURE_SIZE 10
+#define LABEL_SIZE 16
+// nn model for ab partition pruning, 128x128.
+static const float av1_ab_partition_nn_weights_128_layer0[FEATURE_SIZE * 64] = {
+ -0.715251f, -0.015767f, -0.667353f, -0.345255f, 0.177887f, -0.469759f,
+ 0.426152f, 0.489798f, 0.469865f, 0.773821f, 0.088517f, 0.074585f,
+ 0.838754f, 0.048449f, -0.007584f, 0.638968f, 0.233305f, -0.319236f,
+ -0.257124f, -0.170869f, 0.137180f, 0.114852f, -0.721241f, -0.947962f,
+ -0.411298f, 0.494306f, -0.060435f, -0.648421f, -0.126624f, 0.072686f,
+ -0.143904f, -0.115839f, -0.175527f, -0.117728f, 0.040686f, -0.189925f,
+ 0.134361f, -0.258070f, -0.177558f, 0.158049f, 0.168668f, -0.062919f,
+ 0.341986f, 0.038100f, -0.435577f, -0.321255f, 0.203213f, 0.213061f,
+ 0.533304f, 0.359296f, -0.079558f, 0.004637f, 0.663904f, 0.043779f,
+ 0.383018f, 1.136559f, -0.084155f, 0.333057f, -0.199011f, 0.152059f,
+ -0.078419f, -0.167752f, -0.093651f, 0.083171f, -0.190143f, 0.086195f,
+ -0.280632f, -0.160663f, -0.017298f, 0.122628f, -0.138116f, 0.062927f,
+ 0.222462f, 0.626979f, 0.426928f, 0.117170f, -0.240457f, 0.053750f,
+ 0.038017f, 0.007359f, -0.017595f, 0.101407f, 0.332891f, 0.074933f,
+ 0.306498f, 0.219380f, -0.151638f, -0.247976f, 0.343405f, 0.121256f,
+ 0.049173f, 0.171474f, -0.139608f, -1.016599f, -0.345553f, -0.901138f,
+ 0.243401f, 0.059928f, -0.089396f, -0.195565f, 0.364705f, -0.020400f,
+ -1.383672f, 0.413018f, 0.536950f, -0.020904f, -1.335306f, -0.732290f,
+ 0.102885f, 0.315290f, -0.208521f, -0.081811f, 0.182300f, 0.125712f,
+ -0.593833f, -0.220639f, -0.314155f, 0.188327f, 0.118503f, 0.524427f,
+ -1.083859f, -1.130640f, 0.390352f, -0.045591f, 0.113160f, -0.009149f,
+ -0.096183f, 0.115829f, 0.377752f, 0.318396f, -0.591983f, 0.004797f,
+ -0.497377f, -0.342248f, 0.079546f, -0.025249f, -0.295972f, 0.615501f,
+ -0.464372f, 0.418315f, -0.173556f, 0.105217f, 0.298073f, 0.082478f,
+ 0.033223f, 0.977341f, -0.372982f,
-0.052337f, 0.154124f, 0.396787f, + 0.536654f, -0.139061f, -0.223702f, 0.229666f, -0.846766f, 0.107723f, + 0.563839f, -0.483141f, 0.304813f, -0.765283f, 0.070964f, 0.151101f, + 0.275188f, 0.490303f, 1.175892f, 0.085377f, -0.191200f, 0.544532f, + -0.365075f, 0.167546f, 0.052183f, -0.220529f, -0.212227f, -0.144988f, + -0.273356f, -0.062023f, 0.103993f, -0.238493f, -0.161204f, -0.054611f, + -0.166672f, 0.128327f, 0.461751f, -0.545822f, 0.739798f, 0.594386f, + -0.163192f, -0.332501f, 0.363834f, -0.065043f, 0.474812f, -0.138811f, + 0.170924f, -0.778142f, -0.316474f, -0.508065f, -0.039986f, -0.478001f, + 0.340591f, 0.041783f, 0.055419f, 0.015155f, -0.981830f, -1.355237f, + 0.347516f, 1.155327f, 0.081319f, 0.274163f, -0.327230f, -0.113478f, + 0.556552f, -0.055986f, 0.217318f, -0.445351f, 0.325759f, 0.526547f, + -0.657434f, -0.572214f, -0.037087f, 0.081384f, 0.064518f, 0.014892f, + 0.215279f, 1.834504f, -0.242107f, 0.079810f, 0.129558f, 0.079588f, + -0.035189f, -0.221745f, -0.163414f, 0.043978f, -1.028662f, -0.623609f, + 1.130336f, 0.664661f, -0.063975f, -0.415863f, 0.018581f, 0.157758f, + 0.200570f, 0.063420f, 0.901039f, -0.746286f, 0.196230f, -0.290592f, + 0.042373f, -0.502500f, 0.183638f, 0.103394f, -0.298858f, 0.145436f, + 0.196916f, 0.108319f, -0.448572f, -0.881385f, 0.302497f, 0.121679f, + -0.021327f, 0.025150f, 0.481306f, -0.359634f, 0.350257f, -0.228647f, + -0.669860f, 0.260025f, -0.034182f, 0.619247f, -0.158826f, -0.405864f, + 0.674112f, -0.027885f, -0.325274f, -0.241492f, 0.036024f, -0.437685f, + -0.091458f, -0.109295f, -0.350676f, 0.044706f, 0.297059f, 0.016290f, + 1.121203f, 1.289062f, -1.299476f, -1.129221f, 0.103752f, 0.131302f, + -0.263265f, 0.222155f, -0.229908f, 0.013922f, -0.226001f, -0.248383f, + -0.004415f, -0.020958f, 0.055634f, 0.086200f, 0.114556f, -0.184061f, + -0.096210f, -0.146466f, -0.249618f, -0.195998f, 0.088758f, 0.023781f, + -0.264460f, 0.157026f, -0.235228f, -0.102564f, 0.043463f, -0.187823f, + -0.257500f, -0.199049f, -0.242210f, 0.030448f, 0.221604f, 0.151804f, + -0.100404f, -0.073931f, 0.144749f, -0.001572f, -1.438079f, -0.233716f, + 0.733422f, 1.727080f, -0.036397f, 0.027551f, 0.425321f, 0.085703f, + 0.031186f, 0.032333f, -0.675130f, 1.437733f, -0.202392f, -0.525003f, + 0.087048f, 0.328194f, -0.079989f, -0.391088f, -0.238732f, -0.120660f, + -0.139600f, 0.154665f, 0.026202f, -0.233501f, -0.009046f, -0.149187f, + -0.199646f, 0.115375f, 0.209762f, -0.014875f, 0.124038f, -0.119985f, + 1.079625f, -0.461513f, 0.614114f, 0.021003f, 0.439449f, -0.824834f, + -0.299701f, 0.193817f, -0.870551f, -1.262313f, -0.079517f, 0.341570f, + 0.305310f, -0.089721f, -0.317314f, -0.075631f, 0.127172f, -0.208635f, + 1.191922f, 0.163141f, 0.564285f, 0.286352f, 0.480865f, 0.173094f, + -0.094034f, -0.071339f, -0.328992f, -0.006382f, 0.314705f, 0.090258f, + -0.016099f, 0.193230f, 0.188061f, 0.398144f, 0.722781f, 0.769949f, + 0.025442f, -0.162016f, 0.070192f, -0.056946f, -0.100957f, -0.219934f, + -0.203492f, -0.015454f, -0.013272f, -0.098008f, 0.051707f, -0.017493f, + 0.527446f, 0.083605f, 0.588318f, 0.878215f, 0.028747f, -0.146479f, + -0.345170f, -0.136059f, -0.152005f, -0.203634f, 0.232702f, -0.101340f, + -0.027733f, -0.282611f, 0.265366f, 0.082362f, -0.265420f, -0.131124f, + 0.166303f, 0.040194f, -0.100710f, 0.579151f, -0.530136f, 0.163422f, + -0.998821f, -1.565311f, -1.774785f, -2.493372f, 0.116970f, -0.090302f, + 1.723272f, 0.552370f, -0.295954f, -0.439095f, -0.266730f, 0.027936f, + 0.539616f, -0.234902f, -0.167601f, -0.149877f, -0.242983f, 0.122353f, + -0.121620f, -0.205517f, -0.180144f, 
-0.264208f, 0.151500f, -0.159378f, + 0.029145f, -0.050892f, -0.223407f, -0.246239f, 0.043152f, -0.018460f, + 0.169972f, -0.187769f, -0.034670f, -0.238330f, 0.288070f, -0.093243f, + -0.437105f, -0.573376f, 0.660073f, 0.285727f, 0.408470f, 0.158475f, + 0.032699f, 0.056280f, -0.237176f, -0.083003f, 0.105598f, -0.169522f, + -0.260420f, -0.121100f, -0.173983f, -0.195693f, -0.232028f, 0.224940f, + 0.029124f, 0.009580f, -0.252034f, 0.103087f, 1.156561f, 0.603848f, + -0.562805f, -1.652742f, -0.568288f, -1.829395f, 0.046169f, 0.076095f, + 1.490819f, 0.415893f, -0.277788f, -0.115787f, 0.093750f, 0.270726f, + -0.395983f, -0.353742f, 0.034605f, 0.005342f, 0.184537f, 0.086445f, + 0.156417f, 1.476367f, 0.122587f, 0.002145f, 0.431057f, -0.381184f, + -1.646457f, -0.014009f, -0.671224f, 0.193726f, -0.019247f, -0.031267f, + -0.046208f, 0.298733f, 0.064734f, 0.616984f, 0.039381f, 0.182722f, + -0.116670f, 0.233093f, -1.214374f, -0.817970f, -0.064394f, -0.584783f, + 0.077697f, -0.266720f, 0.130875f, -0.235295f, -0.265754f, -0.159999f, + -0.250114f, -0.183017f, 0.194403f, -0.105808f, -0.169215f, -0.240866f, + -0.026662f, -0.045123f, -0.036175f, -0.167471f, -0.192908f, -0.232602f, + -0.267036f, -0.112500f, -0.257944f, -0.111909f, -0.802226f, -0.008800f, + 0.881460f, -0.678603f, 0.008666f, -0.252053f, -0.341035f, -0.175290f, + 0.183012f, 0.385991f, 0.079888f, -0.014039f, -0.148653f, 0.671778f, + -0.130219f, 1.086467f, 0.129267f, -0.040400f, -0.201221f, -0.077005f, + 0.015890f, 0.000781f, 0.137764f, 1.389546f, 0.172152f, 0.047279f, + -0.042783f, 0.127740f, 0.141467f, -0.335738f, -1.396392f, 0.031496f, + 0.357385f, 0.343602f, -0.714553f, 0.311014f, 0.132845f, 0.061149f, + 0.006796f, 0.568106f, -0.255949f, 0.104134f, -0.993447f, 0.298135f, + -0.406590f, -0.049228f, -0.578570f, -0.188561f, -0.107046f, 0.374095f, + 0.068481f, 0.036240f, -0.495801f, 0.180574f, -0.766129f, 0.886967f, + -0.568868f, -0.936062f, -0.418886f, -0.058735f, -0.511964f, -0.438596f, + 0.019016f, -0.015837f, 0.600197f, 0.429773f, 0.315026f, 0.319667f, + 0.214617f, -0.017316f, 0.270257f, -0.040524f, 0.695803f, -0.015223f, + -1.554965f, 0.356997f, -1.472428f, 0.024637f, -0.562958f, 0.870351f, + 0.193635f, 0.036063f, 0.328638f, 0.200274f, -1.634707f, 0.110534f, + 0.420104f, -0.072042f, -0.006404f, 0.171680f, +}; + +static const float av1_ab_partition_nn_bias_128_layer0[64] = { + 0.643147f, -1.348826f, 0.431627f, 0.000000f, 0.102717f, -0.772628f, + -0.034351f, -0.761977f, -0.638397f, 0.541969f, -0.391311f, 0.563076f, + 0.148553f, 0.267217f, -0.788092f, 0.544573f, -0.546280f, 0.000000f, + -0.446945f, 0.127732f, 0.270624f, -0.219435f, -1.220203f, 0.324584f, + 0.110885f, 0.276547f, 0.179726f, -0.375160f, 0.026401f, -0.032595f, + 0.000000f, -0.047932f, -0.648602f, -0.512637f, -0.031661f, -0.236761f, + 0.476453f, -0.028021f, -0.013673f, -0.015578f, -0.920077f, 0.000000f, + 0.915351f, -0.209962f, 0.000000f, -0.025731f, 0.218288f, 0.000000f, + 0.047726f, -0.813077f, -1.263281f, 0.239087f, 0.278614f, -0.030753f, + 0.000000f, 0.346744f, -0.948543f, -1.174211f, 0.216377f, 0.498913f, + 0.853918f, 0.002504f, -0.190403f, 0.452050f, +}; + +static const float av1_ab_partition_nn_weights_128_layer1[64 * LABEL_SIZE] = { + 0.179769f, 1.499417f, -0.445135f, -0.142278f, -0.337661f, 0.682064f, + -0.203213f, 0.302171f, 0.226877f, -0.422169f, 1.687586f, 0.783773f, + 0.220995f, 0.253482f, 0.370435f, -1.342775f, 0.337229f, -0.271473f, + 0.291796f, 1.362227f, -1.751397f, -0.086178f, 0.725496f, -0.118597f, + 0.227963f, -0.501577f, 0.223849f, -0.122421f, -0.123437f, -0.051045f, 
+ -0.020115f, 0.212711f, 0.246025f, 0.088120f, -0.168995f, 1.740190f, + -0.195098f, 0.680339f, -0.589572f, -0.075244f, 0.878766f, 0.064092f, + -3.548527f, 0.001660f, 0.107926f, -0.169501f, -0.455212f, 0.123045f, + -1.836998f, 0.330365f, 1.301475f, 0.454761f, -0.576552f, -0.190761f, + 0.208459f, 0.618483f, 1.383364f, 0.970718f, 0.390174f, 0.406252f, + -0.564519f, -0.312062f, 1.345712f, -0.151873f, 0.109290f, 0.408847f, + 0.391243f, 0.152024f, 0.181764f, -0.036263f, -0.160466f, 0.153595f, + 0.049163f, -0.753012f, -1.804062f, 0.347475f, -2.746580f, 0.575618f, + 0.261799f, 0.210505f, -0.302054f, -0.109872f, 0.199506f, -1.182971f, + 0.723668f, 0.177758f, -0.338202f, 0.254396f, -0.220023f, 0.043504f, + 0.669866f, -0.040816f, -0.402730f, 0.017990f, 0.215523f, -0.216816f, + 0.454826f, -0.726067f, -0.018750f, -0.928679f, 0.154315f, -0.465641f, + 0.144566f, -0.030064f, -0.054667f, -0.154055f, 0.625384f, 1.323795f, + -0.159496f, 0.097072f, -0.463197f, -0.057938f, 0.750290f, -0.233061f, + 0.412631f, -0.535223f, -0.151423f, -0.154583f, 0.024721f, -0.494448f, + 0.230594f, -0.980138f, -0.653968f, 0.126079f, 0.051814f, -0.053219f, + -0.421708f, -0.228853f, 0.237885f, 0.888157f, 0.059655f, 0.241295f, + 0.210443f, 0.228238f, 0.119127f, -0.051989f, -0.355408f, 0.182215f, + 0.244277f, -0.104577f, -0.558035f, -0.023270f, 0.054571f, 0.700646f, + -0.223006f, 0.115523f, 0.023391f, 0.437264f, 0.709477f, -0.531212f, + -0.094731f, 0.328161f, -0.105418f, -0.133511f, 0.497168f, -0.030948f, + -0.407132f, -0.043943f, 0.155505f, 0.251945f, 0.205010f, 0.167160f, + 0.083654f, -0.636810f, 0.401315f, -0.398414f, 0.290046f, 0.206846f, + 0.042218f, 0.168150f, 0.843181f, -0.671242f, -0.202392f, -0.073301f, + 0.142895f, 0.237466f, 0.212145f, -0.091828f, 0.187038f, -0.720841f, + -0.616069f, -0.238021f, 0.065365f, 0.434119f, 0.179023f, -0.040107f, + -0.430734f, -0.297368f, 0.575954f, 0.382619f, -0.709787f, -0.320810f, + 0.242342f, -0.047614f, 0.705216f, 0.098077f, 0.357179f, 0.046017f, + 0.115074f, -0.412305f, -0.272304f, 0.048096f, -0.803811f, 0.275000f, + 0.642198f, 0.180286f, -0.087178f, -0.112707f, -0.394443f, 0.201989f, + 0.241759f, -1.038870f, 0.728124f, 0.800559f, -1.296268f, 0.198612f, + -0.053478f, 0.414344f, -0.510529f, 0.124179f, -2.219115f, -0.074583f, + -0.143055f, 0.001697f, 0.810811f, -0.657140f, 0.186818f, -0.936414f, + 0.539578f, -0.308244f, -0.126624f, -0.204767f, 0.091145f, -0.049340f, + 0.252014f, 0.394582f, 0.018764f, -0.060377f, -0.019133f, 0.064083f, + 0.069211f, -0.526693f, 0.209850f, -0.481466f, -0.468302f, -0.100407f, + 0.241018f, -1.037781f, 0.038539f, -2.113840f, -0.974895f, 0.163187f, + 0.425132f, -0.772546f, -1.261254f, -0.217488f, -0.971748f, -0.805640f, + -0.745175f, -0.177077f, 0.217658f, 0.381431f, -0.052338f, 0.087176f, + -0.165972f, 0.085937f, 0.472564f, -0.796627f, -2.453307f, 0.569664f, + -0.233010f, -0.192134f, 0.064339f, -0.111411f, -0.262469f, -0.410022f, + 0.519993f, -0.684620f, 0.393460f, -0.277753f, -0.153624f, 0.528984f, + -0.415558f, -0.445863f, 0.588512f, -0.142439f, -0.132127f, 0.199776f, + -0.579284f, 0.119488f, -0.033590f, -0.503846f, -0.674979f, 0.335125f, + 0.020519f, 0.233973f, -0.297998f, -0.051511f, 0.518626f, -0.412782f, + -0.074045f, 0.130523f, 0.465751f, -0.117795f, 2.535813f, 0.352108f, + -0.499228f, 0.379784f, 0.056699f, 0.173142f, -0.076519f, -0.026666f, + 0.017834f, 0.492333f, 0.093364f, 0.037867f, -0.165420f, -0.356429f, + -0.562334f, 0.057656f, -0.307544f, 0.085857f, -0.559851f, 0.107230f, + -0.398633f, 0.152618f, -0.216835f, -0.024539f, 0.026044f, -0.249519f, + 
-0.563594f, -0.746025f, 0.025265f, -0.298888f, -0.185243f, 0.058794f, + 0.233696f, -0.115223f, 0.144617f, -0.864390f, 0.619944f, -0.023980f, + 0.019481f, 0.225252f, 0.416552f, -0.115993f, 0.935387f, 0.744386f, + 0.053353f, -0.052582f, -0.065650f, 0.228488f, -0.032042f, -0.371252f, + -0.003638f, -0.736984f, -0.203776f, 0.030922f, -0.065577f, -0.031643f, + -0.049253f, -0.054640f, 0.787134f, 0.545414f, -0.140297f, -0.124274f, + -0.110011f, -0.029552f, 0.657005f, 0.214973f, -0.374300f, 0.251642f, + 0.276591f, 0.030566f, -0.145470f, 0.350579f, -0.356436f, -0.052694f, + -0.063966f, -0.751008f, -1.042392f, 0.328892f, -0.425058f, -0.421571f, + -0.571889f, -1.141472f, -0.125216f, 0.212713f, -0.485170f, -0.088791f, + 0.124589f, 0.023237f, 0.077635f, 0.020901f, -0.271402f, -0.321424f, + -0.513946f, -0.867872f, -0.284593f, 0.106276f, 0.220192f, -0.143532f, + -0.014648f, 0.073402f, 0.327256f, -0.139803f, 0.168763f, 0.048199f, + -0.122526f, 0.111713f, -0.134257f, 0.810364f, -0.085222f, -0.259221f, + -0.239349f, 0.044448f, 0.205031f, 0.413113f, -0.107720f, -0.018816f, + -0.247741f, -0.004963f, 0.041170f, -0.158019f, 0.134839f, 0.129502f, + 0.800488f, -1.041584f, -0.129336f, 0.170834f, 0.566586f, -0.230443f, + 0.437937f, -0.149922f, -0.046665f, -0.094646f, 0.200070f, 0.072943f, + -0.076943f, -0.084971f, -0.515843f, -0.146720f, 0.472869f, -0.444731f, + -0.100877f, 0.545196f, -1.786626f, -0.482946f, 0.500509f, -0.843257f, + 0.200374f, 0.045103f, -0.575718f, -0.164335f, -0.232522f, -0.021825f, + -0.139490f, 0.356058f, -0.352075f, 0.061751f, -0.200616f, -1.180921f, + -0.181355f, -0.137459f, 0.247574f, 0.181541f, 0.184314f, -0.961482f, + 0.493615f, 0.910261f, -2.279238f, 0.648631f, -0.055526f, -0.037137f, + 0.038643f, 0.136609f, -0.819373f, -0.040840f, -0.265989f, 0.006877f, + 0.454651f, -0.595323f, -0.099500f, -0.263717f, 0.150456f, 0.245077f, + -0.268666f, 0.162232f, -0.516451f, -0.024501f, 0.188046f, -0.002262f, + 0.261319f, 0.004173f, 0.746982f, 0.174761f, 0.470447f, -0.159558f, + -0.385240f, 0.023084f, -0.133520f, -0.220607f, -0.018731f, -0.373558f, + -0.707763f, -1.850150f, -0.807404f, -0.168063f, -0.071435f, -0.160740f, + -0.478789f, -1.070674f, -0.489740f, -0.255796f, 0.100486f, -0.153361f, + 0.334394f, -0.569472f, -0.198118f, 0.255922f, 0.104717f, -0.065179f, + 0.111879f, -0.447237f, 1.373623f, -0.190191f, -0.063311f, 0.337529f, + -0.138800f, 0.057009f, -0.137006f, 0.641378f, 0.883147f, -0.679655f, + 0.267717f, -0.351602f, -0.135225f, 0.229398f, -0.513225f, -1.120345f, + 0.528786f, -0.051081f, 0.086653f, 0.140141f, -0.563969f, 0.333402f, + -0.174745f, 0.321093f, -0.438641f, -0.005131f, 0.247415f, 0.110120f, + -0.076308f, -0.083244f, 0.838944f, -0.113043f, -0.013258f, -0.175028f, + -0.179941f, 0.272676f, -0.047946f, -0.088076f, -0.450031f, 0.053929f, + -0.083549f, -0.089952f, -0.186253f, 0.257483f, 0.011019f, 0.586435f, + 0.060580f, -0.052078f, 0.090277f, -0.780869f, 0.969811f, -0.025349f, + -0.281917f, 0.014857f, 0.231863f, -0.228601f, -0.003861f, 0.226550f, + 0.141825f, -0.102171f, -0.010387f, 0.220378f, -2.561975f, -0.497071f, + -0.315117f, 0.371981f, 0.138247f, 0.625031f, -0.308133f, -0.217876f, + 0.005615f, -0.860179f, 0.747491f, 0.006356f, -0.057024f, -0.483189f, + 0.055592f, -0.316834f, 0.069858f, 0.218788f, -0.200044f, 0.227588f, + 0.215496f, -0.055324f, -0.393147f, -0.394062f, -0.253264f, -0.075619f, + -0.152512f, -0.332995f, 0.129053f, 0.178668f, -0.302694f, 0.030678f, + 0.925896f, 0.964375f, 0.169021f, -0.218657f, -0.627204f, 0.206437f, + -0.521336f, 0.176206f, 0.142733f, 0.139248f, 
0.411682f, 0.181544f, + 0.224850f, -0.935547f, -0.558208f, 0.348096f, 0.342129f, -0.389340f, + -0.236308f, -0.132099f, 0.073642f, 0.089391f, -0.306901f, -0.397842f, + 0.444282f, 0.074623f, -0.051075f, -0.106617f, -0.184037f, -0.239046f, + -0.138761f, 0.120794f, -0.647577f, -0.336471f, 0.527899f, -0.164234f, + -0.028354f, 1.083678f, -0.251534f, -0.145903f, -0.182783f, 0.070976f, + -0.199590f, -0.400306f, -0.029763f, -0.548042f, -0.266270f, -0.118084f, + -1.152632f, 0.383685f, -0.105895f, -0.096829f, 0.118382f, 0.047447f, + -0.019051f, 0.310180f, -0.162793f, -0.029574f, 0.058054f, -0.636017f, + 0.490639f, 0.158347f, -0.385701f, -0.147057f, 1.285825f, -1.276083f, + -0.021795f, -0.101600f, 0.163254f, 0.267160f, -2.317864f, -0.098598f, + -0.296337f, -0.309017f, 0.164127f, -0.270012f, -0.071187f, -0.262270f, + 0.075415f, -0.368328f, 0.186728f, -0.158031f, 0.481663f, 0.515950f, + -0.162551f, 0.497981f, 0.262196f, 0.168479f, 0.726066f, -0.243856f, + -0.058998f, 0.140168f, 0.053242f, -0.624623f, -0.249480f, 0.055197f, + -1.376804f, 0.417571f, 0.203784f, 0.174370f, -0.155531f, -0.029400f, + -0.491473f, 0.079811f, -0.080123f, 1.345900f, 0.637077f, 0.434862f, + -1.787438f, 0.005756f, -0.362706f, 0.179458f, -0.288263f, 0.516788f, + -0.921248f, 0.043794f, -0.137729f, -0.196171f, -0.046295f, -0.793781f, + -0.156532f, -0.132566f, 0.517989f, -0.154321f, -0.054174f, -0.077900f, + -0.373316f, -0.117718f, 0.188986f, -0.476188f, -0.245312f, 0.181439f, + -0.161024f, -0.229059f, -3.079907f, -0.225452f, -0.594355f, -0.558027f, + -0.135429f, 0.125766f, -0.081314f, -0.350894f, -0.163165f, -1.936507f, + -0.205966f, 0.031472f, 0.744446f, -0.006680f, -0.837551f, 0.605862f, + -0.854929f, -1.543750f, -0.307704f, -0.240517f, 0.178240f, -0.183586f, + -0.010307f, 0.099373f, -0.228278f, 0.175236f, -0.000133f, 0.104491f, + -1.540545f, -0.570971f, -0.252885f, 0.483036f, 0.052531f, 0.260214f, + -0.515016f, -0.602081f, -0.485690f, -0.730710f, 0.163719f, -1.775975f, + -0.298634f, 0.323626f, -0.373579f, -0.872977f, 0.619574f, 0.026862f, + -0.122531f, -0.084698f, -2.436297f, 0.483996f, -0.203640f, -0.302157f, + -0.150666f, -0.238320f, 0.089250f, 0.236485f, -0.668654f, -0.122863f, + 0.491152f, -0.226444f, -0.181248f, 0.120158f, 0.294027f, 0.250056f, + 0.307601f, 0.357875f, -1.746455f, -0.175670f, 0.385447f, -0.108808f, + -0.090235f, -0.642504f, -0.486004f, -0.055160f, -0.068692f, 0.009736f, + 0.607555f, -0.489426f, 0.150624f, 0.598114f, -0.128816f, -0.445793f, + -0.066524f, -0.254380f, 0.227106f, -0.406495f, -0.121632f, -0.275960f, + -0.136494f, 0.339457f, -1.318132f, -0.417572f, -2.614077f, 0.324603f, + -0.001211f, 0.375192f, -0.473448f, -0.162510f, 0.099329f, -0.277965f, + 0.101221f, -0.060263f, 0.121867f, -1.042140f, 0.440851f, 0.078898f, + -0.209007f, -0.243699f, 0.715197f, -0.093997f, 0.086022f, -0.178203f, + -2.275496f, -0.098413f, 0.199352f, -0.526791f, -0.162086f, -0.197806f, + -0.231657f, -0.269202f, -0.794294f, -0.223461f, 0.503584f, 0.416236f, + 0.064082f, 0.197655f, 0.340871f, -0.186645f, -0.291498f, 0.433938f, + -1.110063f, 0.003751f, 0.392738f, 0.069360f, 0.102088f, -0.302128f, + -1.518457f, 0.106939f, 0.404527f, -0.306868f, -0.286928f, 0.729276f, + -0.531710f, 0.745048f, -0.168837f, -1.953886f, -0.258828f, -0.190252f, + 0.241877f, -0.916744f, -0.030326f, -0.070541f, -0.271037f, 0.211303f, + -0.489957f, 0.100850f, 0.323999f, -0.802837f, -0.462408f, -0.079350f, + -0.029374f, 0.131213f, -0.825032f, 0.040202f, 0.351821f, 0.002869f, + -0.132516f, -0.471264f, -0.297002f, 0.263913f, 0.033478f, 0.146161f, + 0.533229f, 
-0.228608f, -0.200639f, -0.170955f, -0.915037f, 0.724491f, + 0.005151f, 0.018584f, -0.029771f, -0.396038f, -0.159236f, 0.038691f, + -1.197056f, 0.146302f, 0.226840f, -0.852126f, 0.031214f, 0.108880f, + 0.562000f, -0.134633f, -0.713343f, -0.342252f, -1.764521f, -0.114653f, + 0.515073f, -0.080515f, -0.121155f, -0.865139f, -0.833694f, -0.368553f, + 0.347673f, 0.623379f, 0.722067f, -0.492458f, -0.513263f, 0.585167f, + 0.721518f, -0.693499f, 0.343725f, -0.273861f, -0.040230f, -0.785664f, + -0.157500f, -0.308445f, 0.054062f, 0.600131f, -0.860887f, 0.434470f, + -0.191382f, -0.306150f, -0.243965f, 0.705444f, 0.007789f, -0.146154f, + -0.054499f, -0.073500f, -1.067364f, 0.404936f, -2.864590f, 0.182323f, + 0.326126f, 0.102405f, -0.135800f, 1.128095f, -0.012267f, -0.023996f, + -0.264834f, -0.108967f, -1.176746f, -0.926666f, 0.082999f, -0.498361f, + 0.083560f, -0.210074f, 0.019225f, -0.201614f, -0.904760f, 0.181421f, + 0.586384f, -0.177706f, 0.065471f, 0.168552f, 0.054705f, 0.045241f, + 0.048057f, -0.410957f, -2.188854f, -0.169812f, 0.015521f, 0.176856f, + -0.179331f, -0.352640f, -0.491735f, -1.743206f, 0.044227f, 0.010454f, + 0.823643f, -0.119781f, -0.098359f, 0.093119f, +}; + +static const float av1_ab_partition_nn_bias_128_layer1[LABEL_SIZE] = { + -0.433195f, -0.120488f, -0.116721f, 0.112134f, 0.118170f, -0.259769f, + -0.077530f, 0.394044f, 0.279167f, -0.317988f, 0.189538f, 0.314776f, + 0.325655f, -0.107123f, 0.591049f, 0.358744f, +}; + +static const NN_CONFIG av1_ab_partition_nnconfig_128 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 64, // num_hidden_nodes + }, + { + av1_ab_partition_nn_weights_128_layer0, + av1_ab_partition_nn_weights_128_layer1, + }, + { + av1_ab_partition_nn_bias_128_layer0, + av1_ab_partition_nn_bias_128_layer1, + }, +}; + +// nn model for ab partition pruning, 64x64. 
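+//
+// A minimal sketch of how one of these ab-partition models can be evaluated,
+// assuming the av1_nn_predict() and av1_nn_softmax() helpers declared in
+// "av1/encoder/ml.h" and a caller that has already gathered the FEATURE_SIZE
+// input features for the current block:
+//
+//   float features[FEATURE_SIZE];  // filled in by the partition search
+//   float scores[LABEL_SIZE], probs[LABEL_SIZE];
+//   av1_nn_predict(features, &av1_ab_partition_nnconfig_64, 1, scores);
+//   av1_nn_softmax(scores, probs, LABEL_SIZE);
+//
+// Each of the LABEL_SIZE softmax outputs then gates one AB partition
+// candidate against a pruning threshold.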
+static const float av1_ab_partition_nn_weights_64_layer0[FEATURE_SIZE * 64] = { + -0.495347f, -0.049498f, -0.026804f, 0.030474f, -0.289308f, -0.264193f, + -0.141121f, -0.072562f, -0.391665f, -0.051491f, -0.234761f, 0.027155f, + -0.038217f, 0.014872f, -0.289728f, -0.233577f, -0.415875f, -0.343615f, + -0.442543f, -0.482492f, 0.073510f, 0.007503f, 2.162329f, -0.362849f, + 2.145915f, -0.883135f, 0.185636f, -0.062859f, -0.465574f, -0.486205f, + -0.056710f, -0.330642f, -0.321860f, 0.042321f, -0.348965f, 0.003542f, + -0.291365f, -0.078164f, -0.345093f, -0.220272f, -0.471270f, -0.763853f, + 0.246622f, 0.199651f, -0.663420f, -0.154152f, -1.220383f, 0.047138f, + 0.816811f, 0.083247f, -0.218839f, 0.038143f, -0.063436f, 0.015517f, + -0.307320f, -0.166956f, -0.169499f, -0.399005f, -0.234638f, -0.162266f, + 0.050425f, -0.221723f, -0.256942f, -0.287285f, 0.144011f, -0.033245f, + 0.083649f, 0.119428f, -0.056706f, -0.117805f, 0.021866f, -0.257300f, + -0.201378f, -0.217484f, -0.413780f, -0.145793f, 0.082792f, -0.347247f, + 0.042539f, -0.302697f, 1.652316f, 0.000701f, -0.482843f, -0.160332f, + -0.450099f, 0.212399f, -4.715360f, -5.336774f, -5.375758f, -6.048339f, + 0.085956f, -0.037767f, 1.052409f, -0.931924f, -2.221907f, 0.268946f, + 0.015512f, 1.237094f, -1.092185f, 0.418247f, -0.082143f, -0.076914f, + -0.060749f, -0.325440f, -0.296960f, -0.066815f, -0.158477f, -0.373945f, + -0.122322f, -0.113495f, -0.097978f, -0.192816f, -0.270418f, 0.035840f, + -0.015458f, -0.121071f, -0.279582f, -0.067683f, 0.097855f, 0.019839f, + 0.451127f, 0.004376f, 1.410392f, 3.255835f, -0.344815f, 0.145202f, + 0.204132f, 0.171948f, -0.527736f, -0.110353f, 0.901448f, 0.003238f, + -3.822090f, 0.235462f, 1.024823f, -0.821244f, 0.876056f, 2.553762f, + -3.478597f, -2.076582f, -0.265515f, -0.055923f, -0.156980f, -0.164097f, + -0.246040f, 0.039430f, -0.071769f, -0.118847f, -0.304053f, -0.281541f, + -0.226021f, -0.263091f, -0.127359f, -0.249410f, -0.051023f, 0.083911f, + 0.084721f, 0.168089f, -0.272169f, -0.204998f, -0.008303f, -0.173998f, + 0.079376f, -0.197426f, -0.199052f, -0.118794f, -0.063753f, -0.094769f, + 0.066176f, -0.175832f, -0.238752f, -0.287960f, -0.134307f, -0.185953f, + -0.385845f, 0.119769f, -0.006567f, -0.382126f, -0.214221f, 0.038449f, + -0.253484f, -0.282766f, -0.020249f, -0.193929f, 0.016281f, -0.114423f, + -0.145940f, -0.281621f, -0.007588f, -0.131470f, -0.189012f, -0.185699f, + -0.279011f, -0.008132f, 0.208463f, 0.020569f, -0.206803f, -0.213408f, + -0.206131f, -0.290245f, 0.069701f, -0.000371f, -0.307572f, -0.451785f, + -0.300838f, -0.453186f, -0.301691f, 0.046327f, -0.312668f, 0.058272f, + -0.303131f, -0.376252f, 0.108384f, -0.086623f, -0.100630f, -0.027330f, + -0.003969f, 0.089502f, -0.200722f, -0.107889f, 0.061843f, -0.008478f, + -0.265057f, -0.271132f, -0.073562f, 0.129337f, -0.283698f, -0.353414f, + 0.076420f, -0.244280f, -0.119537f, -0.105366f, -0.184692f, -0.038817f, + -0.478507f, -0.118808f, -0.472979f, -0.305884f, -0.462813f, -0.189581f, + -0.011932f, -0.585700f, 0.253212f, -1.061900f, -0.205116f, -0.336407f, + -0.762199f, 0.577737f, 0.230832f, 0.434440f, -0.096713f, 0.038552f, + -0.147800f, -0.213553f, 0.041740f, -0.281907f, -0.026154f, -0.082356f, + -0.331871f, -0.408247f, -0.129022f, -0.037550f, -0.310233f, -0.320883f, + -0.391963f, -0.467392f, 0.027453f, -0.394761f, -0.045544f, 0.076052f, + 0.483985f, 0.067093f, 0.141361f, 0.576772f, 0.859718f, 2.566515f, + -0.025476f, 0.769738f, -0.680235f, -1.683309f, -2.394131f, -0.000714f, + -0.615021f, -0.195856f, -0.434035f, -0.295010f, -0.668659f, -0.245959f, + 
0.551148f, 1.777227f, -0.461630f, 0.043093f, 0.012293f, -0.255841f, + -0.097070f, -0.371156f, -0.146323f, -0.015508f, -0.103873f, -0.087476f, + -0.297266f, -0.128699f, -0.149555f, 0.016534f, -0.375498f, -0.346759f, + -0.455156f, -0.147509f, -0.427076f, -0.354431f, -0.158025f, -0.164604f, + -0.237038f, -0.010314f, -0.092884f, -0.397084f, -0.217980f, -0.127184f, + -0.048421f, -0.144133f, 0.889073f, 0.012606f, 3.007608f, -0.602584f, + -1.849480f, -0.373159f, -1.890695f, -3.609938f, 0.811923f, -1.867208f, + -0.244326f, -0.018012f, -0.211192f, -0.220196f, 0.169363f, 0.119141f, + -0.230715f, 0.083247f, 0.020367f, -0.128629f, -0.217455f, -0.159640f, + 1.815952f, -0.369238f, -1.186447f, -0.658753f, -0.511026f, -0.096934f, + 0.662971f, 0.486475f, 0.159746f, -0.018932f, 3.692397f, 1.384353f, + -0.401984f, -0.248380f, -0.140861f, 0.215248f, -0.023711f, 0.059679f, + -0.072260f, 0.004271f, 0.039545f, -0.347971f, -0.081851f, -0.474896f, + -0.181572f, 0.066736f, -0.157822f, -0.163760f, -0.171113f, -0.089935f, + -0.338281f, -0.421444f, -0.306687f, -0.085283f, -0.377953f, -0.138750f, + -0.102701f, -0.312336f, 0.149831f, 0.007229f, -0.155700f, -0.173611f, + 4.074261f, 1.342306f, -1.272712f, 1.570899f, -0.545093f, -0.317605f, + -0.189440f, -0.133910f, -0.273190f, -0.108020f, -0.166107f, 0.021413f, + -0.239130f, -0.067211f, 0.041957f, -0.039234f, -1.003587f, -0.094412f, + 0.532512f, -0.870538f, -1.118023f, -1.160983f, -0.736307f, -0.418752f, + 0.419466f, 0.492122f, -0.004368f, -0.022096f, -1.115132f, 0.150886f, + 2.396852f, 2.660000f, -0.376537f, 0.468628f, 0.149413f, -0.074898f, + -0.067154f, 0.021245f, 0.127857f, 0.294189f, 0.508056f, 0.390232f, + -3.899177f, -3.414681f, -3.929195f, -4.160545f, -0.274323f, -0.052583f, + -0.003545f, -0.433084f, -0.404891f, -0.145051f, -0.312367f, 0.004579f, + -0.398724f, -0.372068f, -0.234279f, 0.017799f, -0.424760f, -0.646717f, + -0.047568f, 2.924664f, -0.644165f, 0.359349f, -0.294800f, 0.591746f, + -0.404710f, -0.092358f, -0.250729f, 0.030829f, -0.147149f, -0.476023f, + -0.071803f, -0.482516f, -0.293117f, -0.215923f, -0.373122f, -0.085315f, + -0.377052f, -0.449899f, -0.056452f, 0.138081f, -0.085350f, -0.308391f, + 0.106661f, 0.176234f, 0.258869f, -0.230172f, -0.233029f, -0.241208f, + -0.067509f, -0.223172f, -0.118353f, -0.302478f, -0.579632f, -0.561326f, + -0.158114f, -0.223167f, -0.026689f, 0.051863f, 0.212834f, -0.304714f, + -0.169071f, -0.193695f, -0.075682f, -0.170860f, -0.241008f, -0.044648f, + 0.280815f, -0.002585f, -0.283552f, -0.037701f, -0.681169f, -0.274535f, + -0.380595f, 0.109504f, -0.111141f, -0.437685f, -0.094459f, 0.144206f, + -0.106139f, -0.211832f, -0.054742f, -0.172813f, -0.295905f, -0.071907f, + -0.418429f, -0.183240f, 0.031319f, -0.095785f, -0.315447f, 0.069404f, + -0.422910f, -0.029867f, -0.357321f, -0.199976f, -0.337707f, -0.070188f, + -0.178198f, 0.177208f, 0.134688f, -0.081933f, -0.229452f, -0.208872f, + 0.026287f, -0.364040f, -0.063696f, -0.227443f, -0.234401f, -0.205699f, + -0.267238f, -0.494125f, -0.056255f, 0.053715f, -0.487754f, 0.014818f, + 0.087383f, -0.077556f, -0.168085f, -0.436851f, -0.276286f, -0.137845f, + -0.107606f, -0.103653f, -0.233766f, -0.419083f, 0.169185f, 0.010186f, + -0.001587f, 0.086735f, -2.465718f, 1.482185f, 1.621193f, -2.081680f, + 1.386553f, -3.204335f, -0.267111f, -0.004508f, 0.164712f, 0.274147f, + 1.724306f, -2.273659f, 0.749574f, -0.891905f, 0.105965f, -0.030428f, + -0.416018f, -0.300762f, 0.122911f, -0.316908f, -0.292504f, 0.138666f, + -0.161327f, -0.042143f, -0.249128f, 0.149210f, -0.088987f, -0.654101f, + 
-1.501843f, 0.216777f, 0.955914f, 0.524158f, -1.642561f, -1.643626f, + 0.864797f, -0.425451f, -2.115764f, -0.012502f, 0.065172f, 1.297270f, + 0.018845f, 1.167276f, -0.470970f, -0.244995f, 0.374782f, -1.811056f, + -0.055430f, -0.024102f, -0.376519f, -0.339640f, -0.119177f, -0.277995f, + -0.290095f, -0.081362f, -0.144139f, -0.118037f, -0.180357f, -0.217559f, + -0.370683f, 0.172816f, -0.265069f, 0.194321f, -0.273478f, 0.037442f, + -0.235552f, -0.078625f, -0.447541f, 0.016836f, -0.271123f, -0.171481f, + -0.321477f, -0.184826f, -0.442981f, -0.227273f, -0.370666f, -0.237232f, + -0.257493f, -0.225714f, -0.153716f, -0.283487f, -0.155399f, 0.067697f, + 0.230343f, -0.034318f, -0.022687f, -0.047090f, +}; + +static const float av1_ab_partition_nn_bias_64_layer0[64] = { + -0.212182f, -0.233725f, -0.758846f, -0.158162f, 0.614743f, -0.150944f, + -0.075727f, -0.208414f, 1.054996f, 0.713758f, -0.300051f, -0.151482f, + -2.443570f, 0.430590f, -0.129001f, -0.160733f, -0.230547f, -0.143228f, + -0.140577f, -0.086812f, -0.212298f, -0.159557f, -0.055647f, -0.211423f, + 0.578161f, -0.220318f, -0.210107f, -3.111584f, 0.604419f, -0.232622f, + -0.209924f, -0.130794f, -0.084097f, -0.036005f, 0.294594f, -2.535531f, + -0.209783f, -0.211189f, -2.766337f, 0.000000f, 0.450177f, -1.754884f, + 3.262664f, -0.209691f, -0.614886f, -0.211257f, -0.109096f, -0.190492f, + -0.109007f, -0.026910f, -0.136035f, -0.212321f, -0.139320f, -0.212233f, + -0.305430f, 0.739171f, 0.991277f, -0.088150f, 0.086313f, -0.023379f, + -0.125366f, -0.063576f, -0.212169f, -0.047463f, +}; + +static const float av1_ab_partition_nn_weights_64_layer1[64 * LABEL_SIZE] = { + -0.036800f, 0.528721f, 0.490767f, 0.144409f, 1.103640f, 0.361910f, + -0.180069f, 0.068033f, -14.868382f, 0.359013f, 0.322567f, -0.199212f, + 0.906164f, -0.488254f, 0.149653f, -0.216394f, -0.099347f, 0.004936f, + -0.111391f, 0.074848f, -0.041709f, 0.147627f, -0.018905f, 0.096116f, + 0.184817f, -0.016241f, 0.115739f, 2.376754f, 0.637097f, 0.052954f, + 0.136428f, 0.225267f, -0.181873f, -0.142876f, 0.684048f, 0.658791f, + 0.105795f, 0.241705f, 1.381114f, -0.209379f, 1.145949f, 0.795293f, + -9.361877f, 0.198302f, 0.539600f, 0.092317f, -0.081695f, 0.200777f, + 0.102334f, 0.081583f, 0.060948f, -0.025110f, 0.160951f, -0.020170f, + 0.234006f, -0.029369f, 0.375036f, 0.270209f, -0.556529f, 1.402949f, + 0.101777f, -0.027331f, 0.004502f, -0.153166f, -0.116651f, 0.151573f, + -0.022187f, 0.144044f, -0.108719f, -0.129942f, -0.270321f, 0.227363f, + 1.892330f, -0.661052f, -0.219398f, -0.229417f, -0.856438f, -1.196988f, + -0.081774f, 0.078847f, -0.207057f, -0.048947f, 0.152073f, -0.243056f, + -0.233329f, -0.288689f, -0.158333f, -0.141177f, -0.715436f, 0.016947f, + -0.093752f, 0.204984f, -1.209782f, 0.155683f, 0.092239f, 0.146495f, + 0.813146f, -0.027757f, 0.330982f, 2.173948f, -0.028867f, -0.141815f, + 0.292708f, -0.204794f, 0.014496f, 1.032799f, 1.312155f, 0.107020f, + 0.824752f, -0.013945f, 0.184829f, -0.041633f, 0.215300f, -0.476088f, + -0.053213f, 0.126862f, -0.020777f, 0.082893f, -0.223727f, -0.923063f, + 0.466529f, 0.082140f, -0.845758f, -1.140791f, -0.262033f, 0.138491f, + 0.151717f, -0.182479f, -0.131128f, 0.055411f, 0.106771f, 0.125552f, + 0.297184f, -0.257403f, -0.059884f, -0.274903f, 2.694357f, -0.108244f, + 0.025377f, 0.043092f, -0.558317f, 3.517159f, -0.270833f, -0.240676f, + 0.205100f, -0.057068f, -0.140445f, -0.193449f, -0.030061f, -0.286762f, + -0.467523f, -0.012647f, 0.190564f, 0.022394f, -0.101479f, 0.339684f, + -0.902743f, -0.169578f, -0.178029f, -0.041836f, -3.952108f, -0.028298f, + 
-0.221137f, -0.733895f, -0.223895f, 0.039012f, 0.687867f, 0.021423f, + 0.113063f, 0.676087f, -0.961000f, -0.064847f, 0.712856f, -0.192765f, + -0.001132f, 0.016689f, -0.236020f, -0.766186f, -0.175729f, 0.012879f, + -0.251064f, -0.105523f, -0.039212f, -0.347584f, 0.304352f, -0.034174f, + -0.364258f, -0.685252f, -0.266115f, -0.247345f, -0.155905f, 0.152283f, + -0.156315f, 0.174082f, -0.757654f, 0.102303f, -2.192316f, -0.245815f, + 0.119882f, -0.086542f, 1.987246f, -1.353163f, -0.374813f, -0.233504f, + -1.980895f, 0.692093f, -0.168351f, 0.172700f, -0.009052f, -0.015734f, + 0.106679f, -0.060472f, -0.256813f, -0.074874f, -0.207488f, -0.329515f, + -0.418268f, -0.017940f, -0.036081f, 0.064719f, -1.488016f, 0.020591f, + -0.176325f, -0.141074f, 0.944494f, 0.150237f, -0.249805f, -0.277280f, + 0.012686f, 0.132483f, 0.116123f, 0.013737f, -0.116091f, 0.750340f, + 3.251343f, -0.188864f, 1.096992f, 0.058467f, -0.041433f, -0.037937f, + -0.133294f, -0.137908f, -0.171132f, 0.106362f, 0.069383f, -0.052662f, + -0.177883f, -0.408049f, 0.680221f, -0.117035f, -0.904240f, -1.395228f, + 0.154527f, 0.134427f, 0.022767f, -0.158886f, -0.230316f, 0.161096f, + 0.362213f, -0.235060f, -0.941620f, 0.055912f, -0.049458f, -0.166632f, + 0.481418f, 0.930146f, 0.041108f, 0.033674f, 1.372066f, -1.847709f, + 0.003324f, 0.259534f, 0.177014f, -0.202761f, -0.262017f, -0.190852f, + -0.102839f, 0.028338f, 0.187193f, -0.041684f, 0.123973f, -0.198576f, + -0.110369f, -1.431400f, 0.208369f, -0.302370f, -0.248549f, 0.062985f, + 0.673409f, 0.036662f, -0.711340f, -0.120584f, -0.189789f, 0.098812f, + 2.947819f, 0.216567f, -0.414472f, -0.181742f, 1.873779f, -0.222726f, + -0.782870f, 0.007889f, 0.015062f, -0.554328f, 0.182928f, -0.191430f, + 0.123636f, -0.215460f, -0.225245f, 0.251516f, -0.013025f, -1.359595f, + -0.750602f, 0.342667f, -0.141899f, -0.687493f, -0.072639f, 0.048018f, + -0.242107f, -0.031917f, -0.287472f, -0.046088f, 0.832197f, -0.016576f, + -1.553349f, -0.216341f, 0.023077f, -0.410867f, 4.243743f, -0.514878f, + -0.066007f, -0.160696f, -0.262678f, -0.648790f, -0.430586f, 0.199940f, + -0.202496f, -0.222241f, -0.016406f, -0.121473f, 0.000828f, -0.081584f, + -0.152641f, -0.190166f, 0.644400f, 0.040196f, -0.302104f, -1.143654f, + -0.160327f, -0.320780f, -0.187006f, 0.037311f, 0.440618f, -0.070733f, + -0.117785f, 1.527539f, -0.419310f, 0.001300f, 1.389956f, -0.036366f, + -0.269203f, 0.612265f, 2.721897f, -0.086836f, -0.446999f, 0.012525f, + -0.078317f, -0.287052f, -0.111188f, -0.085181f, -0.164667f, -0.010466f, + -0.569722f, -0.018888f, -0.101663f, -1.147130f, -0.465204f, 0.114524f, + -2.192402f, -0.221325f, 0.375748f, 0.206284f, -0.261548f, -0.246257f, + -0.143004f, -0.069981f, -0.057306f, -0.116481f, -0.435903f, -0.314970f, + 0.013210f, -0.010175f, 4.630571f, -0.473226f, -0.197199f, -0.028204f, + 0.122907f, 2.475548f, 0.025011f, -0.092603f, -0.127561f, -0.151330f, + -0.077295f, 0.245016f, -0.045005f, 0.183396f, -0.330556f, -0.384887f, + 0.356374f, -0.016618f, -0.463353f, -1.291546f, -0.071986f, -0.311599f, + 0.072385f, -0.430786f, -2.094788f, 0.202733f, -0.910109f, -1.336543f, + -0.086800f, -0.096413f, 1.544383f, 0.031860f, -0.796211f, 0.762786f, + 3.250022f, -0.441798f, -0.698537f, 0.062839f, 0.033525f, -0.362996f, + 0.027022f, -1.131264f, -0.228926f, 0.053885f, -0.338628f, 0.155037f, + -0.046844f, -0.888172f, -0.241767f, 0.084965f, -0.617743f, -0.049896f, + -0.036894f, -0.304783f, -0.002639f, 0.137957f, 0.052121f, -0.131161f, + -0.117200f, -0.253380f, -0.205561f, -0.302450f, -0.047397f, -0.330518f, + 3.613420f, -1.525951f, 
-0.026738f, 0.209150f, -2.103534f, 2.019689f, + -0.366199f, -0.095260f, 0.027417f, -0.242512f, 0.162579f, 0.052113f, + -0.293851f, -0.068138f, -0.005799f, -0.344696f, -0.114824f, -0.431107f, + -0.120058f, -1.139926f, -1.048379f, 0.036446f, -0.323020f, -0.432945f, + 0.454151f, -0.140058f, 0.050649f, -0.094900f, -0.017278f, -0.238719f, + 1.193153f, 0.120447f, -0.496061f, 0.917431f, 2.936126f, -0.115521f, + -0.347397f, -0.435325f, -0.004383f, -0.211864f, 0.162383f, -1.040726f, + 0.089537f, -0.128579f, -0.133505f, 0.107129f, -0.435657f, -0.180388f, + 0.043650f, 0.018709f, -0.773242f, -0.687192f, -0.120633f, -0.063626f, + 0.029912f, 0.113972f, -0.403502f, -0.127640f, -0.269625f, 0.129794f, + -0.188539f, 0.041641f, 0.029769f, -0.198374f, 1.401407f, 0.353887f, + -0.219925f, 0.260515f, 1.157034f, -2.992044f, -0.097618f, -0.064417f, + -0.203626f, -0.008217f, -0.112339f, -0.227407f, -0.155118f, 0.247705f, + -0.012304f, -0.248447f, -0.913463f, -0.064788f, -0.214619f, -0.251761f, + -0.386861f, -0.040574f, -0.163219f, -0.100700f, 1.488274f, -0.071684f, + -0.033626f, -0.006497f, -0.246945f, -0.145221f, -3.747390f, 0.149609f, + -0.263326f, -0.297385f, -1.039896f, -0.083174f, -0.025473f, -0.235586f, + -0.001087f, 0.254286f, 0.265106f, 0.007325f, 0.199239f, 0.134103f, + -0.578211f, -0.259801f, -0.062373f, 2.368348f, 0.560556f, -0.252260f, + 0.889997f, -0.447872f, -0.059218f, -0.095315f, -0.061667f, 0.183580f, + -0.157479f, 0.055387f, -0.831734f, 0.007606f, -1.104906f, 0.301180f, + -0.117115f, 0.212959f, 4.727223f, -0.243833f, -0.397495f, -0.025021f, + -0.367587f, -2.082058f, -0.217699f, 0.148111f, 0.252430f, 0.111088f, + -0.260692f, 0.095124f, -0.407774f, -0.322169f, 0.002927f, 0.126169f, + -1.272325f, -0.279772f, -0.373680f, -0.485177f, -0.605458f, 0.021225f, + -0.092031f, -0.226585f, 1.895162f, 0.037866f, -0.275475f, 1.614360f, + -0.014972f, -0.277679f, -3.449082f, -0.092060f, -0.747873f, 0.020716f, + 2.776178f, -0.049963f, 0.183999f, -0.295259f, -0.028868f, 0.221895f, + 0.001265f, 0.336823f, 0.219372f, 0.112824f, 0.408132f, -0.017940f, + -0.311666f, 1.489606f, -0.058093f, -0.305659f, -0.491933f, -0.143847f, + 0.166115f, 0.042867f, -0.123447f, -0.087099f, -0.305395f, -0.365079f, + -0.755801f, -0.160649f, 0.736260f, -0.008611f, 0.095836f, -0.017345f, + 5.697515f, -0.498971f, -0.125280f, 0.199907f, 0.300053f, 0.605026f, + -0.228225f, -0.259523f, 0.016384f, 0.146973f, 0.210258f, 0.226766f, + -0.075178f, -0.050924f, 0.188496f, -0.415266f, -0.484880f, -0.236384f, + 0.071931f, -0.331863f, -0.601243f, -0.232479f, -0.285272f, 0.123789f, + -1.341333f, 0.037082f, -0.315202f, -1.587215f, -0.271576f, 0.003216f, + -4.437186f, -0.256205f, -0.576589f, -0.114147f, 2.153916f, -0.369618f, + 0.271415f, 0.145036f, -0.158731f, -0.240938f, -0.187369f, 0.036325f, + 0.254771f, 0.211488f, -0.240297f, 0.098417f, -0.415011f, 2.334793f, + -0.127252f, 0.020069f, -0.168755f, -0.448922f, -0.219207f, 0.016232f, + -0.221935f, -0.269500f, -0.100636f, 0.102545f, -0.809376f, -0.054979f, + 0.360713f, -0.326541f, 0.112933f, 0.138073f, 4.229404f, -0.763801f, + -0.305429f, 0.199955f, -1.787713f, 0.272866f, 0.109895f, 0.138466f, + -0.250259f, -0.167162f, -0.212588f, -0.217589f, -0.067125f, -0.077490f, + -0.208970f, -0.006863f, -0.671146f, -0.298320f, -0.165509f, 0.044597f, + -1.408624f, -0.213957f, -0.220947f, 0.129718f, 1.316777f, -0.098928f, + -0.008121f, -0.558293f, -0.297290f, -0.218873f, -4.346638f, -0.228174f, + -0.204710f, -0.388864f, 2.697919f, 0.025260f, 0.857020f, 0.009921f, + 0.036915f, -0.320275f, -0.087937f, 0.022636f, 0.236667f, 
0.135496f, + -0.059616f, -0.192955f, 0.009470f, 2.139589f, -0.200449f, 0.129818f, + 1.017444f, -0.608299f, 0.257914f, -0.134306f, -0.033327f, 0.002855f, + -0.338598f, 0.015559f, 0.117362f, -0.166760f, 0.086903f, -0.167666f, + 0.193523f, 0.033852f, -1.147686f, 0.489468f, -0.006969f, 0.125630f, + 1.557907f, -1.604449f, -0.071114f, 0.096178f, 0.007065f, 0.200013f, + 0.213393f, 0.168466f, -0.100568f, -0.117861f, -0.161542f, -0.072561f, + -1.069871f, -0.470138f, -0.352578f, -1.503513f, -0.001394f, -0.380109f, + 0.065089f, -0.281668f, 0.988953f, -0.002778f, -0.659026f, -0.470692f, + -0.407292f, 0.011710f, -1.362085f, 0.184738f, -0.135786f, -1.374241f, + 4.487930f, -0.067274f, -0.956404f, -0.233995f, 0.224527f, -0.454556f, + 0.037900f, -0.281658f, 0.208224f, -0.254753f, 0.045740f, 0.051444f, + -0.388281f, 0.257112f, -0.485030f, -0.082659f, 0.148103f, -1.007456f, + -0.022295f, 0.036984f, -0.369401f, -0.076943f, -0.007636f, -0.293022f, + 0.470466f, 0.199012f, -2.158182f, 0.036577f, -0.014725f, -0.229516f, + 2.236929f, 0.030945f, -0.400045f, 0.109348f, 0.214691f, -0.891516f, + -0.251379f, -0.217358f, 0.013733f, 0.205573f, -0.151725f, -0.191782f, + -0.339630f, -0.163905f, -0.119191f, -0.032516f, 0.503015f, 0.025772f, + 0.029094f, -1.146153f, 0.216723f, -0.330023f, 0.064695f, -0.262521f, + 0.425612f, -0.093080f, -0.489648f, 1.051293f, -0.092332f, 0.095557f, + -0.874132f, 0.218483f, -0.127648f, -1.605802f, 2.763617f, -0.186734f, + -1.243166f, -0.193514f, -0.173748f, 0.337822f, 0.183873f, -0.251594f, + -0.211582f, 0.144081f, 0.029620f, -0.024853f, -0.385140f, 0.467341f, + -0.928316f, -0.195442f, 0.917783f, 0.357084f, 0.174445f, -0.073659f, + -0.012811f, -0.115420f, -0.181147f, -0.364449f, -0.567395f, -0.012969f, + -1.680714f, 0.065323f, 0.198063f, -0.244201f, 1.428545f, -0.432539f, + -0.208931f, -0.091205f, 0.957125f, 0.813519f, -0.262677f, 0.246852f, + 0.015536f, 0.055026f, 0.067054f, 0.262103f, -0.358115f, -0.095206f, + -0.267522f, -0.402710f, -0.680397f, -0.123627f, -0.385590f, -1.504680f, + -0.169513f, -0.215338f, 0.043633f, -0.079052f, -0.464410f, 0.122894f, + -0.278231f, -2.456445f, -0.159917f, -0.015597f, -0.735449f, -0.078854f, + -0.400290f, -1.153870f, 3.657228f, -0.287093f, -1.174355f, -0.102001f, + -0.288281f, 0.185209f, -0.145228f, -0.200449f, -0.099914f, -0.138354f, + 0.254428f, -0.161751f, -0.118206f, 0.296043f, -0.482613f, 0.080932f, + 1.097605f, -0.010190f, 0.232439f, 0.447617f, -0.133508f, 0.115763f, + -0.388589f, 0.174695f, -0.236014f, 0.006284f, -1.374129f, 0.092015f, + -0.241419f, -0.231667f, 2.763950f, -0.922932f, -0.061605f, 0.208740f, + -1.597190f, 1.353325f, -0.198528f, 0.250498f, -0.013950f, -0.203861f, + -0.254563f, 0.081931f, -0.413369f, 0.011844f, 0.080961f, -0.231161f, + -1.234909f, -0.440843f, -0.174980f, -0.315283f, -0.337474f, -0.123243f, + -0.310001f, -0.271028f, 0.364179f, 0.022845f, -0.535517f, -0.772936f, + -0.188435f, 0.039667f, -0.807463f, 0.266550f, -0.288857f, -1.630789f, + 1.280155f, 0.065712f, -0.279960f, -0.300056f, 0.258440f, -0.073781f, + 0.213878f, 0.042196f, 0.021360f, 0.211698f, -0.003751f, -0.192673f, + -0.137008f, 0.247878f, -0.470604f, 0.073164f, 1.523241f, 0.734755f, + -0.114126f, -0.193834f, -0.025759f, 0.263183f, +}; + +static const float av1_ab_partition_nn_bias_64_layer1[LABEL_SIZE] = { + -0.343508f, -0.706936f, -0.160676f, -0.877101f, -0.517567f, -0.253254f, + -0.148074f, 0.923430f, -0.364770f, 0.203550f, 0.401216f, 0.938246f, + -0.872737f, 0.718723f, 0.703398f, 2.560015f, +}; + +static const NN_CONFIG av1_ab_partition_nnconfig_64 = { + 
FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 64, // num_hidden_nodes + }, + { + av1_ab_partition_nn_weights_64_layer0, + av1_ab_partition_nn_weights_64_layer1, + }, + { + av1_ab_partition_nn_bias_64_layer0, + av1_ab_partition_nn_bias_64_layer1, + }, +}; + +// nn model for ab partition pruning, 32x32. +static const float av1_ab_partition_nn_weights_32_layer0[FEATURE_SIZE * 64] = { + -0.323723f, -0.214013f, -0.007772f, -0.458851f, -0.125542f, -0.123860f, + -0.410973f, -0.209389f, -0.087580f, -0.272881f, -0.168500f, -1.130845f, + 0.344916f, -0.475017f, -0.362262f, -0.195662f, -0.566124f, 0.782163f, + 0.411575f, -0.013378f, -0.318650f, -0.124678f, -0.612909f, -0.315788f, + -0.263990f, -0.508783f, -0.048938f, -0.416407f, -0.402648f, -0.156644f, + 0.225887f, -0.000493f, 2.682241f, 0.871204f, 0.059014f, 0.803542f, + -1.407028f, -1.154669f, 1.388148f, -0.293348f, -0.003669f, -0.009607f, + 1.330030f, -0.337841f, 2.118617f, 1.033059f, -0.084788f, 0.212904f, + 0.082405f, -0.070579f, -0.494005f, -0.173392f, 0.039546f, -0.463865f, + 0.077163f, -0.434066f, 0.030835f, -0.427139f, -0.560520f, -0.031606f, + -0.368541f, -0.027458f, 0.370574f, 0.461418f, 1.087682f, -0.572137f, + -1.509596f, -0.765697f, -0.499383f, -0.277998f, -0.106492f, -0.129564f, + -0.169133f, -0.269834f, -0.114270f, -0.275431f, 0.016339f, -0.156744f, + -0.267922f, 0.171216f, 0.110556f, 0.002954f, -0.200327f, -0.187663f, + 3.691601f, 1.234152f, 0.186315f, -0.125370f, -0.211235f, -0.554432f, + -0.131072f, -0.124982f, -0.130339f, -0.235350f, 0.018903f, 0.012896f, + -0.159372f, -0.269571f, -0.025709f, -0.221251f, 0.061919f, 0.016307f, + 0.384673f, -0.134525f, -1.599126f, -0.416459f, -0.743052f, 0.670249f, + -0.169709f, 0.421681f, -0.033360f, -0.072817f, 0.003647f, -0.110632f, + -0.158651f, -0.095136f, 0.223759f, 0.165767f, -0.269129f, -0.196075f, + -0.023183f, -0.293420f, 0.014875f, 0.018688f, -0.153407f, -0.172009f, + -0.259947f, -0.124015f, 0.173653f, -0.089103f, -0.021001f, -0.334230f, + 0.027177f, 0.103371f, -0.183860f, -0.204051f, -0.023721f, -0.192297f, + -0.143771f, -0.247106f, 0.218116f, -0.013240f, 2.831783f, 1.483928f, + -0.877025f, -0.313462f, -0.411320f, -0.447825f, 0.605977f, 0.234684f, + -0.119150f, -0.075182f, -0.330463f, 0.071503f, -0.254924f, -0.360071f, + -0.037022f, 0.063261f, -0.148759f, -0.238254f, -0.462018f, -0.027166f, + 0.065318f, -0.235743f, -0.257194f, -0.094784f, 0.022423f, 0.055925f, + 0.086672f, -0.021010f, 0.009965f, -0.001648f, -0.104917f, -0.387443f, + -0.102673f, -0.281706f, 0.145923f, -0.233391f, -0.378365f, -0.145584f, + -0.077751f, -0.121166f, 1.134565f, -0.097500f, -0.749202f, -0.544566f, + -1.361374f, -0.102494f, 1.089275f, 0.375299f, -0.105091f, 0.037641f, + -0.054248f, -0.282691f, -0.377797f, -0.066427f, -0.253815f, -0.329677f, + -0.339326f, -0.128217f, -0.282905f, 0.014937f, 1.067185f, -0.171764f, + 0.484458f, 0.396706f, -0.557055f, -0.891596f, -0.257839f, -0.720879f, + -0.218449f, -0.004755f, 1.572857f, 0.006229f, 1.962895f, -0.029746f, + -4.137691f, -2.185991f, -2.763477f, -0.520437f, -0.208708f, 0.006444f, + -1.263078f, -0.304560f, 1.072374f, 2.556429f, 0.312850f, 0.257488f, + -0.634264f, 0.156769f, -0.188943f, 0.040295f, -0.389915f, 0.085250f, + -0.248525f, 0.045667f, -0.776115f, -0.274680f, -0.448145f, -0.566161f, + -1.285316f, 0.079060f, 0.389124f, -0.510401f, -0.015299f, -0.664661f, + 0.099901f, -0.470694f, -0.051593f, -1.076381f, -0.442104f, -0.197867f, + -0.330011f, -0.448523f, -0.301018f, -0.442093f, -0.491953f, -0.582091f, + -0.064569f, 
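+// A minimal forward-pass sketch for the one-hidden-layer NN_CONFIG models in
+// this file. It assumes the usual node-major weight layout (num_inputs
+// weights per hidden node) and ReLU hidden activations; eval_partition_nn()
+// is a hypothetical name for illustration only, since libaom evaluates these
+// configs through its own helper in av1/encoder/ml.c.
+//
+//   static void eval_partition_nn(const NN_CONFIG *cfg, const float *features,
+//                                 float *scores) {
+//     float hidden[64];  // num_hidden_nodes[0] is 64 for the models above
+//     const float *w = cfg->weights[0];
+//     for (int n = 0; n < cfg->num_hidden_nodes[0]; ++n) {
+//       float v = cfg->bias[0][n];
+//       for (int i = 0; i < cfg->num_inputs; ++i)
+//         v += w[n * cfg->num_inputs + i] * features[i];
+//       hidden[n] = v > 0.0f ? v : 0.0f;  // ReLU
+//     }
+//     w = cfg->weights[1];
+//     for (int o = 0; o < cfg->num_outputs; ++o) {
+//       float v = cfg->bias[1][o];
+//       for (int i = 0; i < cfg->num_hidden_nodes[0]; ++i)
+//         v += w[o * cfg->num_hidden_nodes[0] + i] * hidden[i];
+//       scores[o] = v;  // raw scores; callers typically softmax these
+//     }
+//   }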
-0.156516f, 0.543522f, -0.005924f, 0.161432f, 0.974793f, + 0.273712f, 1.104850f, -0.290312f, 0.313417f, -0.125370f, 0.136234f, + -0.191227f, -0.165054f, 0.011872f, -0.298871f, 0.095740f, 0.142760f, + -0.215771f, -0.031437f, 0.101041f, -0.085620f, 0.435387f, 0.002786f, + 1.971375f, 0.018392f, -1.771940f, -0.401433f, 0.808263f, -3.350013f, + 2.296952f, -1.024403f, -0.041645f, -0.034799f, -0.024078f, -0.347301f, + -0.276088f, -0.455907f, 0.266021f, 0.087348f, -0.146566f, 0.040492f, + -0.539866f, -0.206851f, -0.387874f, -0.125508f, -0.496676f, -0.373845f, + -0.472356f, -0.357082f, -0.081254f, -0.456466f, 0.554713f, 0.002185f, + -4.225019f, 0.344025f, 0.728796f, -0.262936f, 1.383924f, 1.577300f, + -2.653320f, -2.516156f, -0.301604f, -0.204105f, -0.138252f, -0.587536f, + -0.097889f, -0.352414f, -0.288276f, -0.184340f, -0.122741f, -0.243376f, + 0.031970f, -0.373402f, -0.396079f, 0.045566f, 0.072595f, -0.222681f, + -0.243802f, -0.340129f, -0.258494f, -0.192041f, -0.386112f, -0.240940f, + -0.047268f, -0.555802f, -0.032514f, -0.241341f, -0.167463f, -0.478308f, + -0.205936f, -0.316275f, 0.103729f, -0.197893f, -0.128029f, -0.218796f, + -0.167362f, -0.111814f, -0.126062f, -0.394260f, -0.025357f, -0.402697f, + -0.587395f, -0.400385f, -0.259664f, -0.415588f, -0.338503f, -0.399166f, + -0.270504f, 0.234505f, 0.272144f, 0.266938f, -0.392395f, -0.011717f, + -0.384221f, -0.473446f, -0.038420f, -0.241101f, -0.234402f, -0.275567f, + -0.410454f, -0.377599f, -0.179099f, -0.138432f, -0.248083f, -0.543026f, + -0.428043f, -0.239895f, -0.333193f, -0.103346f, -0.039038f, -0.171109f, + -0.119432f, -0.222351f, 0.000450f, 0.208724f, -0.510526f, -0.144656f, + -0.316721f, -0.344846f, -0.244794f, -0.129134f, -0.045634f, -0.400183f, + 0.043714f, -0.235414f, 0.115594f, -0.195616f, -0.106693f, -0.124242f, + 0.083990f, 0.049110f, -0.196130f, -0.059860f, -0.464235f, -0.516443f, + -0.101521f, -0.422379f, -0.413955f, -0.042991f, -0.345263f, -0.129264f, + -0.106911f, -0.140156f, -0.457841f, -0.199848f, -0.218954f, -0.329850f, + -0.364097f, -0.335262f, -0.312254f, -0.299331f, -0.052710f, -0.251019f, + -0.023459f, -0.222538f, 0.028849f, -0.088038f, -0.301550f, -0.273566f, + 0.067295f, -0.174608f, -0.445784f, -0.158366f, -0.567275f, -0.557652f, + -0.353503f, -0.302092f, -0.302049f, -0.551793f, -0.034535f, -0.225190f, + -0.210733f, -0.219377f, -0.057197f, -0.430933f, -0.025185f, -0.388150f, + -0.086147f, -0.430088f, 0.058466f, -0.152129f, -0.058411f, -0.236392f, + -0.547669f, -0.613849f, -0.893774f, -0.351715f, -0.399227f, -0.454909f, + -0.324501f, 0.000490f, -0.282167f, -0.073163f, -0.281452f, 0.047932f, + -0.175500f, 0.165220f, -0.276212f, 0.062153f, -0.217054f, -0.255487f, + -0.146416f, -0.097718f, -0.173809f, -0.559328f, -0.055695f, -0.391193f, + -0.132020f, -0.561184f, -0.308666f, -0.474053f, -0.219149f, -0.246558f, + -0.158325f, 0.151907f, -0.266835f, -0.144697f, -0.193960f, -0.046587f, + -0.220028f, -0.247355f, 0.135584f, 0.016511f, 0.367705f, -1.855877f, + 0.435622f, 0.444710f, -3.372301f, -3.030489f, 1.013267f, 0.380951f, + -0.170011f, -0.111415f, -0.456146f, -0.107254f, -0.095220f, -0.053078f, + -0.135864f, -0.591949f, -0.252810f, -0.324799f, -0.094796f, -0.260969f, + -0.391981f, -0.063170f, -0.336130f, -0.470127f, -0.405168f, -0.433219f, + -0.309563f, -0.295462f, -0.552270f, -0.012300f, -0.057793f, -0.034494f, + -0.446843f, -0.640160f, -1.188681f, -0.791361f, 0.543271f, 1.189112f, + 1.458468f, -0.005876f, -0.927475f, 0.062038f, -1.170818f, 0.338227f, + -3.007096f, -4.559296f, -4.045457f, -5.953635f, -0.228386f, -0.266890f, + 
-0.092595f, -0.377440f, -0.044534f, -0.053565f, -0.349268f, -0.415030f, + -0.310094f, 0.062721f, 0.251422f, -0.014350f, -1.282910f, 1.619560f, + 1.180566f, -0.032163f, -1.322951f, -0.603601f, 1.443710f, 0.654650f, + -0.393227f, 0.003536f, 0.029725f, -0.108925f, -0.053911f, 0.133977f, + -0.036145f, -0.168438f, 0.046989f, -0.331463f, -0.176983f, -0.311922f, + -0.272389f, -0.379592f, -0.399993f, -0.297873f, -0.193425f, -0.177524f, + -0.258309f, -0.567312f, -0.260217f, -0.241869f, 0.024010f, -0.032867f, + -0.039424f, -0.063670f, 0.193808f, -0.303514f, -0.013376f, -0.057761f, + 0.187922f, 0.006938f, 0.031810f, 0.180594f, -1.198427f, 2.820662f, + 0.154986f, -0.375518f, 0.116925f, -0.795782f, -0.085139f, -0.079365f, + -0.197936f, -0.321468f, -0.205271f, -0.558203f, -0.296235f, -0.151193f, + -0.158282f, -0.245402f, -0.208504f, -0.042335f, -0.087426f, -0.557129f, + -0.381427f, -0.441551f, -0.541011f, -0.060567f, -0.469305f, -0.032326f, + -2.453587f, -0.045568f, -0.296932f, 0.613061f, -0.320284f, 0.191620f, + -0.827145f, -0.225277f, 0.275800f, 1.696635f, +}; + +static const float av1_ab_partition_nn_bias_32_layer0[64] = { + -0.176206f, 0.660189f, -0.186156f, -2.481963f, -1.564218f, -0.280424f, + 0.732684f, -0.135581f, -2.193132f, -0.172771f, 0.605001f, -0.060392f, + -0.067190f, -0.132969f, -1.410812f, -0.298701f, -0.105963f, -0.086173f, + 0.632779f, 0.005585f, 1.310169f, 1.392136f, -0.563860f, -0.051053f, + 0.660998f, -0.214726f, -1.894342f, -0.128288f, -0.330721f, -0.053988f, + -0.177726f, 1.200859f, -0.178902f, -0.172620f, -0.184476f, -0.175559f, + 0.538503f, -0.322158f, -0.219080f, -0.058208f, -0.171347f, -0.216060f, + -0.174950f, -0.295740f, -0.184820f, -0.213896f, 1.317728f, -0.020116f, + -0.208096f, 0.000000f, 1.246166f, -0.225421f, -0.181555f, 0.861761f, + 1.172429f, -0.172892f, -0.737092f, -0.189904f, -0.179385f, -0.114618f, + -1.384604f, -0.201713f, -0.271948f, 0.372351f, +}; + +static const float av1_ab_partition_nn_weights_32_layer1[64 * 16] = { + -0.037828f, 1.529029f, 0.004927f, 1.475763f, 0.627172f, 0.325872f, + -0.990757f, 0.129476f, 0.889958f, -0.082031f, 0.332133f, 0.074422f, + -0.176212f, -0.074355f, 0.774378f, 0.110987f, -0.155469f, 0.253310f, + 0.882538f, 0.253605f, 0.332436f, -5.389474f, 0.278470f, 0.168644f, + 0.914611f, 0.154165f, 0.809262f, -0.174734f, 0.923673f, 0.064716f, + -0.070228f, -0.228735f, 0.002312f, 0.112222f, -0.045502f, -0.046004f, + 0.514101f, 0.306480f, 0.021232f, -0.015955f, -0.288260f, 0.189177f, + -0.104158f, 0.103273f, 0.096910f, -0.086328f, 1.327289f, -0.154247f, + 0.056676f, -0.243327f, -0.646676f, 0.177221f, -0.086761f, 0.729729f, + -14.710893f, -0.044881f, 0.339003f, -0.134737f, 0.073621f, -0.162913f, + 1.215237f, 0.140723f, 0.138630f, 1.241719f, 0.204092f, -0.463080f, + -0.176086f, 1.125868f, 1.034814f, 0.225455f, -0.203421f, -0.078787f, + -0.527498f, 0.012491f, -0.563307f, -0.170792f, 0.002679f, 0.116153f, + 0.211348f, -0.191900f, -0.212505f, 0.263445f, -0.074679f, -0.081441f, + -0.815405f, 2.448215f, 0.781299f, 0.149542f, -1.045162f, 0.043014f, + 0.217381f, -0.094500f, -0.090427f, 0.025784f, -0.228906f, -2.741798f, + 0.230475f, -0.256112f, -0.103297f, 0.159121f, -0.229793f, -0.014883f, + -0.104131f, -0.123816f, 0.164148f, -0.052279f, -0.071845f, -0.041197f, + 0.208527f, -0.234197f, -0.542336f, 0.020053f, 0.088870f, 0.014346f, + 2.502164f, -0.010244f, -0.267792f, 0.844394f, 2.711486f, -0.015262f, + -0.868053f, -0.295704f, 0.222289f, -0.000286f, -0.352098f, -0.079000f, + 0.021267f, -0.721739f, -0.240558f, -0.384775f, 0.065974f, -2.161058f, + 0.195889f, 
0.268966f, -0.009329f, 0.014949f, 0.314943f, 0.235885f, + 0.072591f, -0.127120f, 0.150784f, 0.105697f, -1.297403f, -0.207509f, + -0.217688f, -0.076752f, 0.170952f, -0.294235f, 0.449973f, -1.712690f, + 0.860989f, 0.054757f, -0.812627f, -0.105316f, -0.736230f, -0.133192f, + -3.741608f, 0.495660f, -0.288936f, 4.654852f, -0.021305f, -0.308916f, + 0.049205f, -0.259996f, 0.114248f, -0.252647f, -0.253180f, -0.449314f, + 0.022979f, 0.063281f, -0.196154f, 0.078295f, -0.322317f, -0.145142f, + 0.300573f, 0.048385f, -0.254787f, 0.123939f, -1.263088f, -0.228565f, + -0.389061f, 0.391084f, 2.322438f, 0.075009f, 0.225743f, -0.198808f, + -0.280538f, -0.173939f, -0.120543f, -0.070792f, -0.417187f, -0.781056f, + -0.102756f, -1.760965f, 0.019149f, -0.867342f, 0.347141f, 0.031588f, + 0.302572f, -0.203573f, -0.357320f, -0.096078f, -0.527528f, 0.046699f, + -0.108561f, -0.167077f, -2.851509f, -0.307116f, 0.202720f, -0.160280f, + -0.215525f, 0.064355f, -0.427220f, 1.516230f, 0.634453f, 0.099400f, + -1.013887f, -0.029740f, -0.093426f, -0.044272f, -1.297636f, -0.237614f, + -0.160953f, 0.399036f, -0.030685f, -0.113619f, -0.184704f, 0.040519f, + -0.588252f, -0.210235f, -0.067623f, -0.031841f, -0.107261f, -0.192582f, + -0.253959f, -0.430821f, -0.103184f, -0.280185f, -0.357723f, 0.197761f, + -0.175087f, -0.055171f, 1.642014f, -0.192559f, -0.288147f, 0.610311f, + 4.688195f, -0.128728f, -0.914869f, -0.108286f, 0.013789f, 0.092125f, + 0.019770f, -0.178386f, 0.074164f, -1.152658f, -0.216738f, -0.277286f, + 0.012381f, 0.418259f, -0.680727f, -0.221917f, -0.485946f, 0.101672f, + 2.009457f, 0.054302f, 1.019838f, -0.116170f, 0.165134f, -0.112567f, + 0.852632f, -0.385796f, -0.108666f, 0.053181f, -0.311797f, -0.372875f, + -0.675717f, 2.409268f, -0.514720f, -0.214245f, -0.646596f, 0.009756f, + 0.203993f, 0.093617f, -0.301290f, 0.253551f, -0.128909f, -1.448442f, + -0.186823f, -0.278001f, -0.294993f, -0.176928f, -0.473605f, 0.062049f, + -0.212084f, -0.137326f, 0.012505f, 0.087850f, -0.200413f, -0.394119f, + -0.132224f, 0.146917f, 0.155746f, 0.198725f, -0.322541f, 0.196391f, + -0.945500f, 0.036736f, -0.155646f, -0.677341f, 1.130545f, -0.339554f, + 0.411628f, -0.355813f, -0.249843f, 0.213694f, -2.035607f, 0.055694f, + -0.111669f, 0.408696f, -0.067043f, -0.048182f, 0.398110f, -0.067542f, + 1.459801f, 0.236833f, -0.178806f, 0.168758f, 0.492387f, 0.099691f, + -0.776680f, -0.172865f, 0.204225f, 0.193982f, 0.575685f, -0.062248f, + 0.011486f, 0.058571f, -0.493391f, 0.026893f, -0.900467f, 3.793129f, + -0.634613f, -0.064660f, -0.048262f, 0.361905f, 0.033641f, 0.245171f, + -0.064671f, 0.034954f, 0.204358f, -0.904023f, -0.052714f, -0.250134f, + 0.136700f, 0.000734f, -0.371720f, 0.226483f, 0.217958f, 0.060559f, + 0.180111f, 0.000970f, 0.079556f, -0.096775f, 0.093855f, -0.026224f, + -0.243664f, 0.004290f, 0.123281f, -0.239476f, 1.230374f, -0.107826f, + -0.101982f, -0.153917f, 5.464427f, 0.304375f, -0.809957f, 0.090564f, + -0.278416f, -0.245555f, -2.078421f, 0.243093f, -0.127666f, 0.052451f, + -0.126662f, -0.783505f, 0.025149f, -1.422675f, -0.207769f, -0.362547f, + 0.115310f, 0.133390f, 1.264754f, -0.027055f, -0.485312f, -0.240717f, + -0.239722f, 0.146818f, -1.265043f, -0.235553f, 0.267104f, -0.021357f, + -0.435949f, -0.309371f, 0.049920f, 1.302721f, -0.233978f, -0.097551f, + -0.240631f, -0.287821f, -0.378380f, -0.273131f, -3.075169f, 0.226404f, + -0.029361f, 2.703590f, -0.430659f, 0.067927f, -0.387520f, -0.370630f, + -0.229236f, 0.085653f, -0.370956f, -0.065556f, -0.187859f, 0.068309f, + -0.109299f, -0.259898f, -0.103644f, -0.271199f, -0.209350f, 
0.140993f, + -0.196713f, -0.135508f, -1.423209f, -0.406385f, -0.019956f, -0.864694f, + 5.963707f, -0.201157f, 0.726377f, -0.011076f, 0.010553f, -0.102918f, + -2.230088f, -0.258098f, -0.039547f, -0.029262f, -0.082324f, -0.860222f, + -0.094735f, -1.381839f, 0.587298f, -0.173048f, 0.721360f, 0.241900f, + 0.764302f, -0.023609f, -1.173755f, 0.103912f, -0.185363f, 0.078435f, + -2.245062f, -0.127269f, 0.202234f, 0.158975f, -0.260909f, 0.098608f, + -0.348247f, 1.732502f, -0.412298f, -0.269602f, -0.425771f, -0.146243f, + -0.530730f, 0.125716f, -1.004419f, 0.145109f, -0.059289f, 1.096304f, + 0.012891f, 0.045033f, -0.306875f, 0.003514f, -0.176110f, 0.037544f, + -0.441537f, -0.518921f, -0.262149f, -0.060407f, -0.379419f, -0.141245f, + -0.128894f, -0.176537f, -1.161318f, -0.249100f, -0.118330f, 0.042816f, + 1.173404f, 0.088312f, -0.393568f, -0.175134f, 6.529819f, -0.326652f, + -0.631917f, -0.393476f, 0.057781f, -0.217748f, -1.781139f, -0.012614f, + -0.212621f, -0.720322f, -0.218498f, -0.388556f, -0.254796f, -0.248399f, + -0.608744f, -0.265146f, 0.238517f, 0.066882f, -2.916806f, 0.054642f, + 0.282590f, 0.075248f, 0.010188f, -0.133486f, 0.985945f, -0.045849f, + -0.347564f, 0.057320f, -0.417920f, 0.063664f, 0.387062f, -2.692059f, + -0.535549f, 0.263736f, 0.327889f, -0.070273f, -0.775254f, 0.147250f, + 3.309425f, -0.212191f, -0.067204f, -2.912663f, -0.061496f, 0.084233f, + 0.022907f, 0.138421f, -0.112159f, -0.288447f, -0.010799f, 0.056049f, + -0.036527f, 0.021525f, 0.106649f, -0.291883f, 0.088424f, -0.057773f, + -0.086031f, 0.015277f, -0.318505f, -0.269049f, -1.008913f, -0.224785f, + -0.025820f, -0.649037f, 0.706381f, 0.096410f, 0.643776f, -0.046743f, + -0.009654f, -0.024246f, 1.469255f, -0.183536f, -0.370046f, -0.048442f, + -0.376527f, -0.431264f, -0.245109f, -0.093951f, 0.203683f, -0.099872f, + 0.087210f, 0.160692f, -3.527694f, -0.068891f, -0.228994f, -0.231817f, + -0.241949f, 0.193613f, 0.979597f, -0.091259f, 0.414424f, -0.047341f, + -0.209582f, -0.295134f, -0.016824f, 0.460327f, -0.072671f, 0.246234f, + 0.235896f, 0.127238f, -1.068683f, 0.035648f, 2.254888f, 0.180105f, + -0.260098f, -2.322120f, -0.184249f, -0.314801f, -0.099969f, -0.272117f, + -0.237916f, 0.031103f, -0.274063f, -0.049384f, -0.044917f, 0.102477f, + -0.342148f, -0.257558f, -0.346300f, 0.115333f, -0.115456f, 0.208354f, + -0.359301f, -0.167395f, 1.146514f, -0.177861f, -0.098658f, -0.444570f, + 6.759993f, -0.369772f, -0.831118f, 0.001866f, -0.073298f, -0.072095f, + 0.811902f, -0.431997f, -0.286587f, -0.269500f, 0.111492f, -0.525364f, + -0.351785f, -2.463474f, -1.852659f, 0.135325f, 0.138267f, 0.100643f, + -2.373278f, -0.285514f, -0.395388f, -0.185016f, -0.030249f, -0.005767f, + -0.716424f, -0.031674f, 0.011147f, 0.057405f, -0.215873f, -0.094401f, + 0.573528f, -1.223820f, 0.414852f, -0.059053f, -0.076488f, -0.287168f, + -0.842640f, 0.174084f, -0.567186f, 0.336629f, -0.062514f, 2.075448f, + -0.061680f, -0.131529f, -0.098994f, -0.204111f, -0.347865f, 0.108516f, + -0.049616f, -0.069212f, -0.273935f, -0.096545f, -0.210784f, -0.284698f, + 0.141501f, -0.176924f, -0.361341f, -0.251197f, -0.286694f, 0.245569f, + -1.521661f, -0.122639f, -0.015760f, -0.718912f, 5.877828f, 0.146916f, + 0.151767f, 0.220785f, -0.032298f, 0.230902f, 0.663943f, -0.252613f, + 0.057718f, -0.436038f, -0.323994f, -1.139787f, -0.042489f, -1.326298f, + -1.031206f, -0.104136f, 0.389897f, 0.127602f, -2.667789f, -0.212366f, + -0.506262f, -0.009115f, -0.213202f, 0.076167f, -1.629405f, 0.055129f, + 0.375393f, -0.150272f, -0.241515f, -0.326497f, 0.100069f, 0.410703f, + 0.340622f, 
0.042437f, -0.349945f, 0.041176f, -1.178950f, 0.030992f, + 0.933908f, -0.035844f, -0.098660f, 1.030584f, -0.092043f, -0.355739f, + -0.305562f, 0.036161f, -0.049558f, -0.033225f, -0.403856f, -0.088276f, + 0.215493f, -0.149105f, -0.013363f, 0.025886f, -0.101306f, -0.205781f, + -1.072487f, -0.076019f, 0.077555f, 0.131003f, 1.267763f, -0.008954f, + -0.327617f, -0.246539f, 6.664081f, -0.404403f, -1.442489f, 0.191301f, + -0.336361f, 0.181156f, 0.833108f, 0.007879f, -0.194464f, -1.029408f, + -0.036268f, -0.927110f, -0.379190f, -0.293443f, -1.848579f, -0.242548f, + -0.065990f, 0.203160f, -0.291788f, 0.000680f, 0.587011f, -0.241289f, + 0.037034f, 0.000552f, 1.072308f, -0.387230f, -0.230050f, 0.292322f, + -0.720001f, 0.034109f, -0.467260f, 2.211644f, -1.839191f, -0.048797f, + -0.083469f, -0.334686f, -0.269056f, 0.051295f, 1.319904f, -0.035603f, + -0.018457f, -0.824915f, -0.212285f, -0.230516f, -0.035093f, -0.400843f, + -0.305469f, -0.099011f, 0.014225f, -0.452772f, 0.170331f, -0.389312f, + -0.115084f, -0.014770f, -0.429387f, -0.155961f, -0.568200f, -0.037853f, + -0.125137f, 0.067228f, -1.329271f, -0.117874f, -0.132499f, -0.218376f, + -0.588325f, -0.320024f, 0.085695f, -0.235047f, -0.217790f, 0.103015f, + -0.698644f, 0.017766f, -0.058299f, 0.199411f, -0.122485f, -0.563949f, + -0.349011f, -0.557045f, -0.131165f, 0.002281f, 0.118559f, -0.210302f, + -1.153815f, 0.116738f, -0.236007f, -0.003487f, -0.006885f, -0.244816f, + 0.953222f, 0.093748f, 0.266869f, 0.241869f, -0.860832f, -0.387012f, + -0.338986f, 2.097515f, -1.942512f, -0.298021f, 0.543911f, -0.043214f, + 0.082125f, -0.120242f, 0.712231f, 0.213327f, -0.301687f, -0.544011f, + -0.392131f, 0.004302f, 0.004825f, -0.317440f, -0.107518f, -0.293407f, + -0.159111f, -0.080367f, 0.132663f, -0.017726f, -0.237521f, -0.190297f, + -0.361633f, 0.200518f, -0.538296f, -0.027975f, -0.381704f, -0.016963f, + 0.630105f, -0.190997f, -0.287840f, -0.603488f, 3.605598f, -0.276614f, + -1.346383f, 0.186912f, -0.047575f, -0.189232f, -1.519072f, 0.097816f, + -0.223722f, 0.304924f, -0.213022f, -1.052433f, -0.322283f, -1.706734f, + -2.458027f, 0.237976f, 0.171050f, -0.103139f, -0.278689f, 0.329824f, + -0.262448f, -0.122916f, -0.236398f, -0.013848f, -0.969160f, -0.374907f, + 0.091018f, -0.386471f, -0.723940f, 0.064956f, -0.057652f, 1.321024f, + -1.397418f, -0.143136f, 0.272468f, -0.030749f, 0.037324f, 0.069316f, + -0.904925f, -0.333693f, -0.117709f, 2.279598f, -0.428065f, -0.131157f, + -0.014288f, -0.402862f, -0.666090f, 0.017070f, -0.028333f, 0.002481f, + 0.197156f, -0.038120f, -0.271062f, -0.188275f, -0.021370f, -0.070849f, + -0.905007f, -0.095886f, -0.093055f, -0.121821f, -1.239812f, -0.411799f, + -0.089948f, -0.936827f, 1.437569f, -0.388908f, 0.126170f, 0.186162f, + -0.018819f, -0.138364f, -1.066412f, -0.138222f, -0.022186f, 0.107331f, + -0.230436f, -1.352605f, -0.161323f, -1.081810f, -0.933825f, -0.136675f, + 0.378157f, 0.113377f, -0.850610f, 0.080245f, -0.087305f, -0.002852f, + 0.044408f, -0.188172f, -1.891998f, 0.092189f, 0.125325f, -0.105090f, + -0.848510f, -0.396308f, -0.384130f, 2.007509f, -1.480787f, -0.126946f, + 0.314767f, 0.000195f, -0.285628f, -0.110442f, -0.293948f, 0.258559f, + -0.417603f, 1.570705f, 0.092459f, -0.340974f, -0.284754f, -0.007801f, + -0.324610f, -0.004734f, -0.207716f, -0.057175f, 0.055467f, -0.210830f, + -0.113005f, -0.299177f, 0.068074f, 0.017929f, -2.897598f, -0.260074f, + -0.014422f, -0.206467f, 1.246997f, -0.372863f, -0.214160f, -0.114035f, + 5.805862f, 0.003611f, -1.340990f, -0.021085f, -0.260431f, -0.002720f, + -1.251640f, -0.353531f, 
-0.304009f, -0.153376f, +}; + +static const float av1_ab_partition_nn_bias_32_layer1[LABEL_SIZE] = { + -0.521497f, -1.061572f, -0.078756f, -0.660662f, -0.403741f, -0.960163f, + 0.001427f, 0.523607f, 0.225068f, -0.055273f, 1.019519f, 1.181880f, + -0.010198f, 0.130597f, 1.276752f, 2.028188f, +}; + +static const NN_CONFIG av1_ab_partition_nnconfig_32 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 64, // num_hidden_nodes + }, + { + av1_ab_partition_nn_weights_32_layer0, + av1_ab_partition_nn_weights_32_layer1, + }, + { + av1_ab_partition_nn_bias_32_layer0, + av1_ab_partition_nn_bias_32_layer1, + }, +}; + +// nn model for ab partition pruning, 16x16. +static const float av1_ab_partition_nn_weights_16_layer0[FEATURE_SIZE * 64] = { + 0.151902f, 0.007947f, -1.788454f, 0.431869f, -2.971387f, 0.923566f, + 1.632542f, -1.665136f, -0.338632f, -5.075884f, 0.398267f, 0.030467f, + 2.263534f, -0.045532f, -1.066128f, 0.915139f, -0.560500f, -3.293125f, + 2.072793f, -1.011414f, 0.122716f, -0.060169f, -0.388860f, 0.031019f, + -0.381861f, 0.001551f, -0.328472f, 0.038296f, -0.060398f, -0.375556f, + 0.209226f, 0.014764f, -1.443469f, -0.345486f, 2.409269f, 1.524846f, + -0.640666f, 1.322139f, -2.074771f, -0.580944f, -0.203960f, -0.072893f, + 0.329701f, 0.115339f, -1.339542f, 0.249024f, -0.421545f, -0.409151f, + -0.258293f, 0.836288f, -0.073685f, -0.009624f, 0.895712f, 0.320639f, + 0.451002f, -1.544558f, 0.193709f, -1.389012f, 1.305451f, 0.089795f, + 0.050338f, -0.017433f, -0.304667f, 0.500729f, 0.504346f, 0.073757f, + 0.582649f, -0.993623f, 1.766766f, -3.067265f, -0.415774f, -0.006036f, + -1.245281f, 0.253205f, -0.591245f, -0.626238f, 0.551852f, 0.593755f, + 0.491023f, 1.099384f, -0.348448f, 0.054564f, -0.451422f, -0.375781f, + -0.248390f, -0.052548f, -0.380069f, -0.165391f, -0.297968f, -0.052142f, + -0.316381f, -0.045246f, -0.243905f, -0.034169f, -0.247523f, -0.180773f, + 0.068066f, -0.374920f, 0.057536f, -0.189748f, 0.058375f, -0.267749f, + -0.147286f, -0.246153f, 0.006183f, -0.202029f, -0.059128f, 0.116852f, + 0.134719f, -0.126900f, -0.064646f, -0.196458f, -0.182331f, 0.108029f, + -0.264499f, 0.155816f, -0.107255f, -0.056983f, -0.209771f, -0.099070f, + 0.007313f, -0.254124f, -0.231964f, -0.275972f, 0.032098f, -0.264564f, + -0.208743f, 0.155599f, -0.121511f, -0.156145f, -0.162315f, -0.059788f, + -0.257073f, -0.076654f, -0.110616f, -0.321675f, -0.051952f, 0.006301f, + -0.154114f, 0.017032f, -0.017364f, -0.233247f, 0.009918f, -0.179289f, + -0.190722f, 0.147106f, -0.063910f, -0.396872f, -0.263123f, -0.003850f, + -0.040718f, -0.324699f, 0.118660f, -0.170727f, -0.316788f, 0.100886f, + -0.202842f, 0.045371f, 0.150561f, -0.057054f, -0.308150f, 0.028346f, + -0.381473f, -0.195365f, 0.026221f, -0.281795f, 0.087204f, 0.047689f, + -0.027643f, -0.104724f, -0.089030f, -0.117661f, -0.349160f, 0.056982f, + -0.340273f, 0.048086f, 0.046103f, -0.121527f, 0.021697f, 0.054109f, + -0.002768f, -0.008461f, -2.297240f, 0.124651f, 3.621661f, -0.057120f, + -1.151656f, 2.296894f, -3.678720f, -0.290240f, 0.087683f, -0.186389f, + 0.007656f, -0.090236f, -0.245217f, 0.110389f, -0.251719f, -0.029084f, + -0.128203f, -0.100005f, -0.032779f, 0.007281f, -0.366596f, -0.267870f, + -0.215620f, 0.047687f, 0.010303f, 0.097980f, -0.191569f, -0.341162f, + 0.119249f, 0.026279f, -2.161546f, 0.459591f, 1.290566f, 1.791797f, + -0.409835f, 0.127081f, -1.156367f, 0.198286f, 0.099561f, -0.067445f, + -0.034352f, 0.017966f, -0.277380f, -0.057220f, -0.174198f, -0.014164f, + 0.146090f, -0.357530f, 0.097644f, -0.000932f, 
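+// Usage sketch for the 32x32 ab-partition model defined above. The
+// av1_nn_predict()/av1_nn_softmax() helpers are declared in
+// av1/encoder/ml.h; the exact call shape and the interpretation of the
+// LABEL_SIZE scores below are assumptions about how a caller consumes them:
+//
+//   float features[FEATURE_SIZE], scores[LABEL_SIZE], probs[LABEL_SIZE];
+//   /* ... fill features from the 32x32 block being searched ... */
+//   av1_nn_predict(features, &av1_ab_partition_nnconfig_32, 1, scores);
+//   av1_nn_softmax(scores, probs, LABEL_SIZE);
+//   /* Low-probability labels mark HORZ_A/HORZ_B/VERT_A/VERT_B candidates
+//      that the partition search can skip. */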
0.446603f, -0.066793f, + 2.448620f, 0.937617f, -1.232922f, 0.313183f, 0.816827f, -0.275115f, + -0.245205f, -0.126895f, 0.156668f, -0.186977f, -0.273505f, 0.013315f, + 0.168629f, -0.089084f, 0.006166f, -0.116107f, -0.199316f, -0.024010f, + -0.242303f, 0.011612f, -0.218485f, -0.229661f, -0.123922f, 0.136699f, + 0.006732f, -0.148718f, -0.164225f, 0.116063f, 1.587898f, 0.690519f, + 0.360566f, 0.009739f, -0.678702f, -0.046003f, 0.126984f, 0.605212f, + 1.240663f, -0.000228f, -1.119369f, -0.415589f, -0.721003f, 0.097936f, + -1.410586f, -2.358833f, -2.773129f, -3.983361f, -0.087144f, -0.050029f, + -0.242255f, 0.137424f, -0.307490f, -0.084637f, -0.023812f, -0.196582f, + -0.078695f, 0.038257f, -0.012110f, -0.263521f, 0.009839f, -0.109125f, + -0.226036f, 0.060712f, 0.093671f, 0.153143f, 0.039116f, -0.290891f, + 0.227057f, -0.204633f, -0.207539f, -0.148242f, 0.046204f, -0.231268f, + -0.209315f, -0.307579f, -0.436556f, 0.023475f, 0.131793f, -0.038301f, + 1.650584f, 0.392570f, 1.446576f, 1.254380f, -0.516867f, -0.057116f, + 0.149320f, 0.414424f, -0.246309f, 0.003877f, -0.480238f, -1.037035f, + -0.830779f, -1.122244f, -0.408267f, -0.253956f, 0.382005f, 0.940609f, + -1.113370f, -0.018554f, 0.141064f, -0.182504f, 1.270707f, 0.414904f, + -0.216036f, 0.203831f, 0.450716f, -0.452909f, 0.139358f, -0.027143f, + 1.956892f, 1.643732f, -0.867839f, -0.620520f, -0.334607f, -0.519982f, + 0.205023f, 0.661159f, -0.000809f, 0.049033f, -0.348579f, -0.200338f, + -0.362144f, -0.346590f, -0.230096f, 0.180746f, -0.149954f, -0.253429f, + -0.378170f, -0.040724f, -0.041597f, 0.243659f, -0.472181f, 0.015401f, + -0.180376f, 0.153139f, -0.247738f, -0.010485f, -0.157158f, 0.016825f, + -0.238925f, -0.265798f, -0.318374f, 0.142352f, -0.210520f, 0.051928f, + -0.352190f, -0.179052f, -0.185498f, 0.025540f, -0.111667f, -0.235187f, + -0.215454f, 0.010931f, -0.238372f, -0.126659f, 0.075691f, -0.091167f, + -2.462379f, -0.007950f, -0.637990f, 0.285554f, -0.051275f, 0.282279f, + -0.744083f, -0.570646f, 0.592198f, 1.421332f, -0.256027f, -0.140315f, + 0.160247f, -0.063185f, -0.055895f, -0.199864f, -0.287353f, -0.074561f, + -0.071228f, 0.055864f, -1.084764f, -0.263409f, 0.779266f, 0.228187f, + 0.375013f, 0.121204f, -0.656948f, 0.533561f, 0.272671f, -0.015423f, + -0.124180f, -0.009127f, 2.934838f, -0.150998f, 1.163152f, 0.081997f, + -4.715939f, -3.676595f, -1.524886f, -0.167593f, 0.281186f, 0.024046f, + -1.451709f, 0.332558f, 0.990504f, 0.376290f, -1.466773f, -0.448439f, + -2.929108f, -4.255188f, 0.065238f, 0.019950f, 1.372393f, 0.444052f, + -2.538772f, 1.579767f, -0.464911f, -1.866114f, 1.053958f, 0.434467f, + -0.125964f, 0.034671f, 0.077116f, -0.138466f, -0.413395f, -0.223453f, + -0.172127f, -0.251265f, -0.048239f, -0.395519f, 0.023141f, 0.037459f, + -0.249593f, -0.062215f, -0.047209f, -0.435189f, -0.164155f, -0.077590f, + -0.241164f, -0.126128f, -0.038243f, -0.180888f, 0.198840f, -0.328036f, + -0.169790f, 0.036506f, 0.052572f, -0.183570f, -0.073617f, -0.244959f, + 0.266498f, 0.032846f, -1.902106f, 0.486078f, 2.414993f, 0.975182f, + -0.382875f, 1.647810f, -2.197017f, -0.890107f, 0.221287f, 0.010889f, + 3.817042f, 0.572728f, 0.092466f, 0.473337f, -1.634659f, -1.069455f, + 1.486776f, -1.023850f, 0.088184f, 0.008842f, 0.518202f, 0.270259f, + 1.757191f, -0.121839f, -2.912229f, -1.250866f, -2.381808f, 0.335309f, + -0.120079f, -0.061294f, -0.058725f, -0.315169f, -0.262443f, 0.072434f, + -0.267836f, -0.319354f, -0.274975f, 0.068970f, -0.406467f, 0.044074f, + -0.152311f, -0.333656f, -0.228355f, -0.185613f, 0.017346f, -0.177674f, + -0.090675f, 
-0.102047f, -0.011768f, -0.025280f, -0.271661f, 0.098099f, + -0.312272f, -0.222217f, -0.100548f, 0.106260f, -0.034655f, 0.135109f, + -0.021276f, 0.018177f, -0.353097f, -0.011128f, 0.061136f, -0.511662f, + -0.223236f, -0.308841f, 0.118789f, -0.154628f, -0.053178f, -0.055973f, + 0.013175f, -0.368337f, -0.090863f, -0.116920f, 0.178990f, -0.025278f, + -0.190553f, -0.238092f, 0.303943f, -0.024944f, 0.719373f, 0.384332f, + -0.378480f, -0.423316f, 0.709922f, 0.758514f, -1.559023f, -2.503173f, + 0.068652f, -0.234741f, -0.182932f, 0.037878f, 0.020684f, -0.174142f, + -0.182300f, -0.052796f, -0.219145f, 0.113028f, -1.041826f, 0.035317f, + 0.919904f, -0.676011f, 0.652297f, 1.456447f, -0.166904f, -0.861823f, + 0.895827f, 0.429821f, -0.180376f, -0.076587f, -0.273945f, -0.288990f, + -0.206692f, -0.080745f, -0.085444f, 0.186953f, -0.050135f, 0.044243f, + -0.391706f, -0.160498f, -0.292268f, 0.164060f, 0.412649f, 0.211611f, + -0.327294f, -0.919399f, 0.320297f, 0.385284f, -0.088848f, -0.072556f, + -0.384813f, -0.176267f, -0.065918f, 0.134724f, -0.231104f, -0.337707f, + -0.195442f, -0.263569f, 0.098090f, -0.341411f, -0.189211f, -0.439276f, + -0.404046f, 0.262491f, -0.311093f, -0.086454f, -0.013400f, -0.061447f, + -0.026945f, -0.112036f, -0.322985f, 0.078500f, -0.230205f, -0.344535f, + -0.021087f, 0.110220f, -0.128671f, 0.044219f, +}; + +static const float av1_ab_partition_nn_bias_16_layer0[64] = { + 2.936406f, -0.396539f, -0.110456f, -1.254954f, 0.785350f, 0.516290f, + -0.172341f, 0.254386f, -0.192465f, -0.106751f, -0.055518f, -0.094994f, + 0.000000f, -0.065018f, -0.004908f, -0.130483f, -0.119580f, -0.142072f, + 0.457446f, -0.125051f, -0.107712f, 0.714607f, -0.140809f, -1.788650f, + -0.087199f, 0.000000f, -1.290050f, 0.443930f, -0.110634f, -0.109380f, + -0.188213f, -1.414179f, 1.193579f, 0.388775f, -0.873193f, -0.110050f, + -0.072565f, -0.117050f, -0.119132f, 0.456959f, -0.132069f, 0.131974f, + 1.160474f, 1.746465f, 0.442628f, -0.188849f, -0.207794f, -0.108364f, + -0.856655f, -2.141620f, 0.335476f, -0.105508f, -0.212162f, -0.109319f, + -0.237213f, -0.109980f, -0.291044f, -0.137877f, 0.470191f, -0.023908f, + 0.123809f, -0.109797f, 0.200510f, -0.147542f, +}; + +static const float av1_ab_partition_nn_weights_16_layer1[64 * LABEL_SIZE] = { + -6.823716f, 1.406568f, -0.144009f, 2.228765f, 0.838336f, 0.738107f, + -0.319014f, -0.148756f, 0.240862f, -0.111089f, -0.004241f, 0.025758f, + -0.193820f, -0.246362f, -0.181363f, -0.201556f, 0.024268f, 0.252994f, + -0.289443f, 0.194932f, 0.057467f, 0.724735f, 0.014063f, 1.361352f, + 0.025191f, 0.024274f, 0.231462f, -7.227959f, -0.094515f, 0.039946f, + 0.412719f, 0.812318f, 3.038903f, -0.286289f, 0.647482f, -0.115114f, + 0.053590f, 0.066069f, 0.153134f, 0.996250f, -0.125700f, 0.951365f, + -6.243494f, -4.827697f, 0.566320f, 0.239515f, -0.099702f, 0.054546f, + 1.847330f, 3.680076f, -3.049829f, -0.127709f, 0.068469f, -0.017794f, + 0.223864f, -0.106778f, -0.020425f, -0.040226f, -0.251890f, -0.168673f, + -0.552073f, 0.043311f, 0.218668f, 0.033209f, -3.199210f, 0.193079f, + 0.321406f, 0.718307f, -0.181418f, -0.459612f, -1.981170f, 0.968496f, + -0.029757f, -0.130065f, 0.043782f, 0.072394f, -0.088686f, 0.025322f, + 0.129882f, 0.101324f, 0.335707f, 0.072714f, -2.079774f, 0.203997f, + 0.239321f, -0.301757f, 0.257845f, 1.288382f, -0.031275f, -0.234194f, + 0.310722f, 2.045469f, 0.034716f, 0.135638f, -0.251388f, 0.320071f, + -1.065301f, -0.322731f, -0.545028f, 0.226276f, 0.090799f, 0.019289f, + 0.048950f, -1.079300f, 0.231938f, 0.083683f, 4.762127f, 0.145037f, + -0.145549f, 0.075592f, 
0.172336f, 0.108175f, 0.333751f, 1.090501f, + 1.056114f, 0.047073f, 0.182052f, -0.081587f, 0.089900f, 0.339286f, + 2.049988f, 0.073585f, 0.537355f, -0.243322f, -0.010179f, -0.052601f, + -0.174915f, 0.117793f, 2.222990f, -2.520837f, -0.092699f, 1.199887f, + 0.138720f, 0.679918f, -0.463155f, -0.659496f, -0.109913f, -0.003398f, + 0.114633f, -0.128377f, 0.092970f, -0.107489f, -0.191078f, 0.185182f, + 0.216980f, -0.019343f, 3.443133f, 0.287953f, 0.099314f, 0.985958f, + 0.157268f, -0.606516f, 0.049418f, -0.221809f, -0.453081f, -0.344796f, + -0.003735f, -0.107269f, -0.128541f, -0.259543f, -0.934806f, -0.542456f, + -1.011192f, 0.022795f, 0.186363f, -0.076356f, -0.050932f, -0.165098f, + 0.168177f, -0.101596f, -5.270886f, 2.553943f, -0.440870f, -0.017494f, + 0.215208f, -0.017032f, 1.495915f, -4.304677f, 0.762211f, 0.182937f, + 0.254406f, -0.029433f, -0.088364f, -0.110160f, -0.108257f, -0.036538f, + 0.737697f, -0.234989f, 0.168095f, 0.245118f, -0.077262f, 0.195718f, + 0.753302f, -1.637869f, 0.126227f, 0.982129f, -0.121444f, -0.295570f, + -1.215799f, 0.147867f, -0.068496f, 0.132726f, -0.005772f, -0.181774f, + 0.126513f, 0.204723f, -0.366123f, 0.103906f, -0.148053f, -0.075272f, + 0.243884f, -0.104828f, 0.198988f, 0.501034f, -0.112671f, 0.111421f, + 0.167508f, -0.117803f, -0.738624f, 2.046292f, 0.124011f, 0.057983f, + -0.359154f, -0.648883f, -0.259462f, -0.459041f, -2.501223f, -0.065138f, + 0.122417f, 0.060291f, -0.129033f, -0.843086f, 0.268241f, -0.399927f, + 1.585888f, 1.816393f, -0.631427f, 0.127826f, 0.088105f, 0.073488f, + 0.717694f, -1.497362f, 2.608528f, 0.066896f, -0.079230f, 0.223436f, + -0.010530f, 0.175310f, 1.120365f, 0.034391f, 0.835312f, 0.071652f, + -0.080615f, 0.111395f, 0.162742f, 0.079927f, -3.859582f, -0.638431f, + -0.167880f, -0.992659f, -0.885355f, -1.276197f, 1.334344f, 0.931940f, + -0.078244f, -0.149030f, -0.070974f, -0.133566f, 0.200034f, 0.102793f, + -0.048546f, 0.063545f, 0.023864f, -0.190863f, 1.934257f, -0.136286f, + -0.107916f, -0.637468f, 0.066449f, 1.089693f, -0.214047f, -0.265780f, + 0.899660f, -0.130333f, 0.288311f, -0.049024f, 0.090202f, 0.487969f, + 0.339704f, 0.858479f, 0.841253f, -0.184100f, -0.637070f, -0.125071f, + -0.077650f, -0.087877f, 0.202268f, -0.027300f, 2.842862f, -0.100698f, + -0.259080f, 0.260556f, 0.157912f, -0.070364f, 0.467190f, 1.200037f, + 1.419317f, -0.033588f, -0.227824f, 0.292617f, 0.228574f, 0.213839f, + -1.091099f, -0.022258f, -1.294681f, 0.136118f, 0.081652f, -0.185359f, + -0.039706f, 0.191407f, -2.053219f, -0.261934f, 0.047812f, -0.029536f, + -0.823869f, -1.090534f, -0.755890f, 0.441035f, -0.167945f, 0.231441f, + -0.135013f, -0.260762f, 0.256872f, 0.130339f, -0.243751f, 0.189760f, + -0.288454f, 0.145363f, 0.338490f, 0.403898f, -0.022814f, -1.263598f, + -0.101315f, 0.860135f, 0.136511f, 0.028942f, 0.574047f, 2.656370f, + 0.037587f, -0.188690f, -0.125312f, 1.100435f, -1.080402f, 0.380905f, + 0.004635f, 0.097144f, -0.214309f, 0.085552f, -0.285066f, -0.705134f, + -0.054704f, -0.319951f, 5.486626f, 0.958158f, -1.380585f, 0.223340f, + -0.169167f, -0.170697f, -0.216748f, 0.324232f, 2.684204f, -0.008490f, + -0.211052f, -0.201190f, 0.123466f, -0.000234f, 0.579907f, 0.096938f, + -0.042745f, 0.201855f, 0.157195f, -0.261440f, 0.029699f, -0.046599f, + 1.618216f, -2.596280f, -0.377420f, -0.526725f, -0.493592f, -0.579615f, + 0.579699f, -0.100392f, 0.150694f, 0.061794f, 0.200425f, -0.062515f, + -0.179122f, 0.250112f, -0.344675f, -0.118359f, -0.095670f, 0.152311f, + 3.662276f, -0.154921f, -0.312991f, 0.972008f, -0.308596f, -0.190426f, + 0.133889f, -0.238673f, 
-0.094726f, 1.683835f, -0.215629f, -0.198890f, + -0.035278f, -0.367973f, -0.822435f, 0.240848f, -0.194656f, 0.034655f, + -0.079424f, 0.146670f, 0.026646f, -0.034507f, 0.059467f, -0.153109f, + -0.431033f, 2.552991f, -1.894091f, -0.180462f, -0.306839f, -0.025648f, + 1.026326f, -3.096230f, 1.346935f, 0.033633f, -0.181827f, 0.094376f, + 0.001696f, -0.379264f, -1.069503f, -0.140972f, -0.208769f, -0.195239f, + 0.281795f, -0.127251f, 0.180776f, 0.067763f, 0.697124f, -1.040779f, + 0.111280f, 0.188351f, -0.340234f, -0.207790f, -0.720075f, -0.137409f, + -0.070310f, -0.032918f, -0.060787f, 0.131484f, -0.077845f, -0.258652f, + 0.056911f, -0.062034f, 0.007663f, -0.185100f, 1.340361f, 0.014096f, + -0.124602f, 0.194241f, 0.128383f, 0.360465f, 0.082979f, -0.050475f, + -0.519294f, 3.323262f, 0.067014f, 0.221203f, -0.085082f, -0.228606f, + -0.916668f, -0.022643f, -1.386737f, -0.131902f, -0.349952f, -0.032874f, + -0.189190f, -0.898790f, -0.102394f, -1.017387f, 2.214050f, 1.790253f, + -1.913561f, -0.043716f, -0.214924f, -0.194598f, -0.064723f, -1.671793f, + 2.251166f, -0.146007f, 0.138527f, -0.003134f, 0.103665f, 0.006928f, + -0.240253f, -0.227464f, 0.578437f, -0.214724f, 0.503085f, 0.158093f, + 0.033091f, 0.008061f, 4.815371f, 2.132264f, 0.281850f, -2.288560f, + -0.145012f, 1.296832f, -0.362401f, -0.403252f, 0.109873f, 0.185746f, + 0.244764f, 0.172367f, -0.185588f, 0.139801f, -0.178254f, 0.068629f, + 0.358488f, -0.153969f, -6.433524f, 0.225983f, -0.138123f, -0.095971f, + -0.036089f, -1.400083f, 0.265908f, 0.257787f, 0.181144f, -1.647228f, + -0.136289f, -0.074206f, 0.122988f, -0.088895f, -1.266717f, 0.006010f, + 0.536681f, 0.263061f, -0.032207f, -0.155136f, 0.086431f, 0.441950f, + -0.060755f, -0.280683f, -0.783475f, -2.567033f, 1.093221f, 0.117667f, + -0.000408f, 0.225719f, -2.199698f, 0.141447f, -1.459051f, 0.051315f, + 0.203228f, 0.354432f, -0.005775f, -0.028073f, -0.965817f, 0.231083f, + -0.666884f, 0.026283f, -0.317486f, 0.210754f, 0.123897f, 0.223827f, + 4.214405f, 1.457334f, -0.253945f, -1.306733f, -0.391235f, 0.451154f, + -1.553888f, -0.353429f, 0.069533f, 0.159278f, -0.173836f, -0.004952f, + -0.137033f, 0.127012f, 0.143600f, 0.051587f, -0.070549f, 0.066509f, + -5.776547f, 0.180021f, -0.189183f, -1.288504f, -0.233575f, -1.473873f, + 0.140940f, 0.144451f, -0.104534f, 2.089873f, -0.168168f, 0.110726f, + 0.132134f, -0.215223f, -1.682754f, 0.157757f, -0.146163f, 0.064882f, + 0.117313f, -0.038780f, -0.124720f, -0.501697f, 0.092047f, -0.233992f, + 3.324976f, 0.516601f, 1.294202f, 0.119989f, 0.061055f, 0.043420f, + -2.750727f, -0.382812f, -0.648496f, -0.115353f, -0.334205f, 0.024354f, + -0.282998f, -0.282705f, 0.073798f, 0.169851f, 0.135651f, 0.182677f, + -0.040220f, 0.132462f, -0.303120f, -0.230113f, 6.165739f, -0.258596f, + 0.024127f, -1.388283f, -0.006042f, 0.572600f, 0.348411f, -0.387376f, + -0.075845f, 0.122319f, -0.029616f, 0.077873f, 0.154763f, 0.049073f, + 0.018597f, 0.102688f, -0.204165f, 0.020734f, -1.389133f, -0.032854f, + -0.147561f, 0.853944f, 0.132100f, -3.259659f, 0.243745f, 0.181529f, + -0.738414f, 1.509994f, 0.023470f, -0.005329f, 0.066115f, -1.345081f, + -1.455402f, -0.172023f, -0.194625f, 0.071885f, -0.201742f, -0.262402f, + 0.077601f, -0.048938f, 0.257993f, -0.504029f, -2.032415f, 1.158880f, + 0.448647f, -0.025633f, 0.117586f, -0.072275f, -0.673744f, -3.854342f, + -0.983843f, 0.047766f, -0.017193f, -0.215775f, -0.158743f, -0.232042f, + -0.509112f, 0.148812f, 0.130122f, 0.006486f, -0.099016f, 0.022514f, + -0.486850f, -0.059623f, 4.012731f, 0.025454f, 0.029059f, -0.783546f, + -0.295260f, 
0.322521f, -0.473201f, -0.172100f, -0.100087f, -0.076516f, + -0.258367f, -0.112897f, 0.269364f, -0.065912f, 0.169022f, -0.178783f, + -0.095114f, 0.122089f, -2.790099f, -0.100431f, -0.087963f, -0.009431f, + -0.087819f, -2.774399f, -0.100757f, 0.013005f, -0.964533f, 3.236665f, + -0.354903f, -0.144169f, -0.166869f, -1.396513f, -0.931271f, -0.046261f, + -1.799262f, -0.365269f, 0.108611f, 0.037994f, 0.024747f, -1.073639f, + -0.203158f, -0.935006f, 1.880891f, 1.578385f, 0.726272f, -0.024546f, + -0.011626f, -0.151363f, -1.121716f, -1.787484f, 0.232806f, 0.075451f, + 0.182899f, 0.092215f, -0.207347f, -0.030111f, 0.054316f, 0.192481f, + 0.594639f, -0.247694f, 0.547471f, -0.032094f, -0.065000f, 0.007198f, + 1.605377f, -0.155945f, -0.066200f, -2.343716f, -1.016283f, -0.079321f, + 0.919365f, 0.599980f, 0.125545f, 0.265813f, 0.246884f, 0.095385f, + -0.260374f, -0.202916f, -0.042770f, 0.234967f, -0.233139f, -0.326994f, + -1.375256f, 0.121766f, 0.077433f, -1.103569f, 0.019497f, -1.029185f, + 0.253905f, 0.206569f, 0.187334f, -0.237089f, -0.294351f, 0.164137f, + 0.149696f, -0.749787f, -0.413433f, 0.976587f, 1.027976f, -0.285264f, + 0.209273f, -0.124762f, 0.050884f, 0.250764f, -0.082031f, -0.646520f, + 4.116680f, 0.437336f, 0.671684f, 0.129509f, -0.078462f, 0.014072f, + -0.678232f, 0.094831f, 1.125624f, 0.207070f, -0.154750f, -0.025780f, + -0.103030f, 0.118019f, -0.908186f, -0.263546f, -1.555324f, -0.236887f, + -0.217854f, -0.051790f, 0.017915f, 0.171001f, 1.355562f, 0.094603f, + -0.233929f, -1.282169f, -0.773183f, -0.161682f, -0.834565f, -0.286776f, + -0.298901f, 0.038162f, 0.251899f, 0.039612f, -0.022935f, -0.232308f, + -0.043855f, -0.192892f, -0.279009f, -0.182234f, -1.272808f, -0.070344f, + -0.092432f, -1.915946f, -0.134373f, -1.405496f, -0.067071f, -0.131922f, + 0.185269f, 1.465082f, 0.040240f, 0.112665f, 0.144329f, -0.286112f, + -0.617649f, 0.916177f, 0.221044f, -0.079867f, 0.170251f, -0.093638f, + -0.212620f, -0.305945f, -0.234356f, -0.482501f, 3.928472f, 1.241179f, + 0.355922f, -0.170848f, -0.189168f, 0.080225f, -1.357793f, 0.190890f, + 0.976800f, -0.068070f, -0.016295f, -0.088623f, -0.129560f, -0.212267f, + -0.071537f, -0.219501f, -0.655198f, -0.225188f, -0.116024f, 0.224174f, + -0.049715f, -0.178005f, 3.029985f, -1.141546f, 0.080066f, -1.932316f, + -0.641137f, -0.189564f, 0.935080f, 0.136119f, 0.015558f, -0.179331f, + 0.204571f, 0.020350f, 0.009362f, 0.108478f, 0.037076f, -0.049009f, + 0.081090f, -0.180202f, 1.455561f, -0.081559f, 0.059361f, 0.484971f, + 0.160923f, -2.170744f, -0.013204f, 0.126561f, -0.407122f, 1.223661f, + 0.044262f, 0.118044f, 0.058274f, -1.747100f, -0.171318f, 0.971374f, + 0.306995f, -0.103268f, -0.319443f, -0.333176f, -0.038608f, 0.119674f, + -0.106479f, -0.907933f, 1.121231f, 1.673840f, -0.421458f, -0.021146f, + -0.254838f, 0.097632f, 0.235109f, -2.901782f, 0.289518f, -0.355459f, + -0.068264f, -0.179121f, 0.068560f, -0.047570f, -0.522523f, -0.228963f, + -1.037158f, -0.163723f, 0.280563f, -0.000868f, -0.197220f, -0.239329f, + 1.985274f, -0.256181f, -0.064341f, -0.822417f, -0.465140f, -0.010942f, + -0.792024f, -0.114290f, 0.060969f, 0.104106f, -0.252123f, -0.150400f, + -0.133277f, 0.267147f, 0.274413f, 0.223744f, -0.180223f, -0.345415f, + -0.104883f, 0.119210f, -0.095041f, -0.301635f, 0.013175f, -2.128121f, + -0.147208f, -0.151509f, -0.692013f, 3.418555f, -0.016541f, 0.171511f, + 0.107159f, -1.516672f, 0.127408f, 0.687035f, -0.906486f, -0.145463f, + -0.169382f, -0.143906f, 0.125091f, -0.960645f, -0.180869f, -0.716908f, + 2.840951f, 1.904919f, -0.416268f, -0.425181f, -0.194697f, 
-0.075932f, + -0.950604f, -1.599800f, 0.943671f, -0.022744f, -0.270492f, 0.080843f, + -0.372916f, 0.047838f, -0.100300f, -0.026600f, 0.011733f, -0.226051f, + 0.172790f, -0.172982f, 0.041258f, -0.299379f, +}; + +static const float av1_ab_partition_nn_bias_16_layer1[LABEL_SIZE] = { + -0.053805f, -1.248639f, 0.520965f, -0.904962f, -0.126425f, -0.118798f, + 0.748430f, 0.203096f, 0.059317f, 0.418219f, 0.841294f, 0.402693f, + -0.658522f, 0.723479f, 0.544264f, 1.035225f, +}; + +static const NN_CONFIG av1_ab_partition_nnconfig_16 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 64, // num_hidden_nodes + }, + { + av1_ab_partition_nn_weights_16_layer0, + av1_ab_partition_nn_weights_16_layer1, + }, + { + av1_ab_partition_nn_bias_16_layer0, + av1_ab_partition_nn_bias_16_layer1, + }, +}; + +#undef FEATURE_SIZE +#undef LABEL_SIZE + +#define FEATURE_SIZE 18 +#define LABEL_SIZE 4 + +static const float av1_4_partition_nn_weights_16_layer0[FEATURE_SIZE * 24] = { + -2.032866f, 0.056691f, 0.495960f, 0.778785f, 0.548153f, -0.806942f, + 0.481155f, 0.282298f, 0.584980f, 0.504688f, 0.209648f, 0.234616f, + 0.213484f, 0.221969f, 0.205862f, 0.235054f, 0.317863f, 0.257139f, + 0.529478f, 0.098122f, -0.657532f, 0.036296f, 0.327728f, 1.323180f, + -0.813082f, 0.160216f, -0.702030f, 0.722733f, -0.270576f, -0.347416f, + -0.264700f, -0.254248f, 0.159820f, 0.087995f, -0.184163f, 0.117357f, + 0.074194f, -0.667369f, 0.498246f, 0.420506f, 0.072409f, -0.121581f, + 0.315788f, 0.000525f, 0.414986f, 0.678166f, -0.011230f, 0.188131f, + -0.227749f, 0.009564f, 0.108672f, 0.106923f, -0.080695f, -0.279382f, + -0.061339f, -0.297835f, -0.134707f, 0.145865f, -0.009655f, -0.000842f, + -0.047436f, -0.159149f, -0.320353f, -0.089646f, -0.344765f, 0.313416f, + -0.143413f, 0.279668f, 0.000885f, -0.022380f, -0.140194f, -0.310473f, + 0.252699f, 0.066204f, 0.477568f, 0.994609f, -0.276000f, 1.213182f, + 0.277028f, -0.411570f, -0.211559f, 0.377815f, 0.121488f, -0.100559f, + -0.317082f, -0.251039f, -0.335181f, -0.154114f, -0.052726f, -0.332558f, + -0.143196f, -0.334035f, 0.162305f, 0.142279f, -0.001210f, -0.135252f, + -0.033562f, 0.204307f, -0.039757f, -0.394174f, 0.126617f, -0.128648f, + -0.410979f, 0.107641f, -0.117573f, -0.326512f, 0.235166f, 0.084959f, + 0.290063f, -0.005838f, 0.459894f, 1.023709f, -0.196145f, 1.100137f, + -0.319815f, -0.308526f, -0.443389f, -0.272769f, -0.035259f, -0.026932f, + -0.029743f, 0.125113f, -0.131024f, -0.321458f, -0.143996f, 0.008714f, + -0.101234f, 0.079706f, -1.128615f, -0.467381f, 0.220563f, -0.409900f, + -0.435353f, 0.759499f, -0.465799f, -0.394309f, 0.176282f, -0.086275f, + -0.161225f, -0.354814f, 0.562871f, 0.418253f, 0.414361f, 0.445480f, + -0.995903f, -0.086632f, -0.230645f, 0.354656f, -0.317576f, 0.079926f, + 0.424369f, 0.997232f, -0.304388f, 1.071667f, -0.023540f, 0.029677f, + 0.108564f, 0.183581f, -0.201395f, -0.054854f, -0.193039f, -0.049899f, + -0.271949f, -0.358483f, 0.304930f, 0.023823f, -0.009319f, -0.214247f, + 0.100712f, -0.050162f, 0.327103f, -0.212999f, -0.030496f, 0.316380f, + -0.439589f, -0.249959f, 0.229777f, -0.353664f, -0.384559f, 0.114236f, + 0.023119f, 0.007927f, 0.618368f, 0.957759f, -0.019780f, -1.002389f, + 0.564277f, -0.839531f, 1.040445f, 0.054340f, 0.031908f, -0.032893f, + -0.019170f, -0.042011f, 0.568928f, 0.362567f, -0.559999f, -0.605344f, + -0.586146f, -0.290778f, 0.195943f, -0.109580f, -0.088898f, -0.113054f, + 0.293282f, 0.429019f, 0.306136f, 0.863025f, 0.021234f, 0.125770f, + -0.097108f, -0.072659f, -0.137053f, -0.191631f, 0.106281f, 
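+// The av1_4_partition models that follow use the redefined macros above
+// (18 input features, 4 output labels) and a narrower 24-node hidden layer
+// for the 16x16 variant. A hedged pruning sketch, reusing the assumed
+// helpers from the sketches above; prune_thresh and allow_candidate[] are
+// stand-ins for the caller's cutoff and bookkeeping:
+//
+//   float scores[LABEL_SIZE], probs[LABEL_SIZE];
+//   av1_nn_predict(features, &av1_4_partition_nnconfig_16, 1, scores);
+//   av1_nn_softmax(scores, probs, LABEL_SIZE);
+//   for (int i = 0; i < LABEL_SIZE; ++i)
+//     allow_candidate[i] = (probs[i] >= prune_thresh);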
0.064151f, + 0.029883f, 0.076287f, 0.757543f, 0.276713f, -2.529775f, -0.351727f, + -1.832316f, 0.544780f, -0.944529f, 0.509705f, -0.010236f, -0.016181f, + 0.021520f, 0.086417f, 0.041312f, 0.296853f, -0.372378f, 0.354446f, + -1.366762f, 0.048875f, 0.464918f, -0.007450f, 0.750013f, -0.360261f, + 0.518532f, 0.753776f, 0.641448f, 0.710746f, 0.250866f, 0.257063f, + 0.283421f, 0.253585f, 0.170303f, 0.210426f, 0.208842f, 0.158000f, + -0.033144f, 0.130748f, 0.907147f, 0.409248f, -0.854301f, -0.981307f, + 0.294427f, -0.507137f, 1.079967f, 0.203203f, 0.383890f, 0.368278f, + 0.305122f, 0.449288f, -0.044507f, -0.547263f, -0.298245f, -0.497834f, + 0.007016f, -0.101982f, -0.073488f, -0.096111f, -0.479418f, -0.045497f, + 0.033502f, -0.018578f, -0.231531f, 0.177949f, 0.099564f, -0.010233f, + -0.333055f, -0.078586f, -0.417867f, 0.171271f, 0.013662f, -0.143599f, + -0.117296f, 0.135382f, 0.048321f, 0.000924f, -0.055024f, -0.405595f, + -0.068260f, -0.271011f, -0.436425f, 0.206751f, -0.899890f, 0.605510f, + 0.535649f, -0.238919f, -0.037619f, -0.213734f, -0.391360f, -0.132344f, + 0.004660f, 0.176644f, -1.008475f, -0.038895f, 0.155429f, -0.095229f, + -0.680124f, -0.258063f, -0.261901f, 0.110380f, -0.337649f, -0.505870f, + -1.428536f, 0.610629f, 0.254905f, 0.045098f, 0.044109f, 0.172329f, + 0.060001f, -0.234009f, -0.184855f, -0.153028f, -0.140897f, -0.152006f, + -0.312134f, 0.081261f, 0.160166f, 0.112690f, 0.266081f, 0.030175f, + -0.242746f, 0.000754f, -0.341811f, -0.149774f, -0.017484f, -0.301342f, + -0.121466f, 0.067300f, 0.342176f, 0.474538f, 0.085441f, -0.263935f, + 0.479235f, -0.003713f, -0.784840f, 0.119480f, 0.456632f, -0.640082f, + -0.080575f, -0.744403f, 0.259970f, 0.034667f, -0.274641f, -0.257594f, + -1.121124f, -0.003745f, -0.420693f, 0.300441f, -0.100976f, -1.049016f, + 0.201960f, 0.113054f, 0.187010f, 1.237427f, 0.054803f, -0.028673f, + 0.003596f, -0.034724f, 0.117246f, 0.190977f, 0.278915f, 0.224307f, + 0.017852f, -0.336233f, -0.372311f, -0.182284f, -0.143510f, 0.331466f, + 0.045698f, -0.301095f, 0.184447f, 0.348240f, -0.017021f, -0.145064f, + -0.000221f, -0.382256f, -0.302683f, -0.083927f, -0.008070f, 0.217907f, + 0.647597f, -0.050490f, -0.572736f, -0.985748f, -0.289943f, 0.041391f, + -0.795464f, -0.186680f, -0.354062f, -0.617400f, -0.282783f, -0.170450f, + -0.197197f, -0.146496f, -0.173692f, -0.106277f, -0.071004f, -0.124405f, + -0.971412f, 0.038542f, 0.705204f, 0.887113f, 0.150430f, -0.243676f, + 0.638410f, 0.320953f, 0.776676f, 0.527584f, 0.070389f, 0.051554f, + 0.177519f, 0.140451f, 0.128892f, 0.087771f, 0.197660f, 0.194764f, +}; + +static const float av1_4_partition_nn_bias_16_layer0[24] = { + 0.614063f, -0.384872f, 0.084884f, -0.023980f, -0.378765f, -0.082312f, + -0.458271f, 0.189578f, -0.046169f, -0.073308f, -0.372322f, 0.162793f, + 0.148803f, 0.829214f, -0.221162f, -0.111157f, -0.017484f, -0.280596f, + -0.031905f, -0.143459f, 0.078823f, -0.021940f, 0.026834f, 0.257472f, +}; + +static const float av1_4_partition_nn_weights_16_layer1[24 * LABEL_SIZE] = { + -0.985391f, 0.587616f, 0.740683f, 0.192066f, 0.447080f, -0.016585f, + 0.680449f, 0.028983f, 0.643111f, 0.234338f, 0.107148f, 0.328456f, + -0.216394f, 1.106838f, -0.179062f, -0.129108f, -0.121655f, -0.151340f, + -0.306017f, -0.350989f, 0.859284f, -0.372831f, -0.954419f, 0.250495f, + 1.046732f, 0.287923f, -0.421088f, 0.326613f, -0.314396f, -0.084757f, + -0.474228f, 0.687999f, 0.052334f, 0.441708f, -0.630698f, -0.350348f, + -0.602067f, -0.434161f, -0.489824f, -0.313193f, 0.315568f, 0.603119f, + 0.120245f, 0.182920f, -1.117797f, -0.239594f, 
-0.296296f, -0.718093f, + 0.489497f, -0.527019f, 0.102453f, 0.426731f, 0.034606f, 0.311461f, + -0.012723f, -0.229877f, -0.284290f, 0.383227f, 0.065696f, -0.222400f, + 1.279248f, -0.862190f, 0.629766f, -0.250011f, -0.325060f, -0.360115f, + -0.159540f, -0.291856f, -0.038348f, 0.224639f, 0.600934f, 0.030205f, + 1.337615f, -0.286409f, -0.473710f, -0.418995f, -1.035249f, 0.004359f, + -0.481860f, 0.563625f, -0.154709f, -0.101198f, -0.758796f, -0.507616f, + -0.095253f, -0.711135f, 0.207759f, 0.076313f, -0.056087f, -0.162719f, + -0.232918f, -0.128402f, -0.444620f, -0.447344f, 1.126012f, -1.504446f, +}; + +static const float av1_4_partition_nn_bias_16_layer1[LABEL_SIZE] = { + -0.462133f, + 0.465060f, + 0.062211f, + 0.401786f, +}; + +static const NN_CONFIG av1_4_partition_nnconfig_16 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 24, // num_hidden_nodes + }, + { + av1_4_partition_nn_weights_16_layer0, + av1_4_partition_nn_weights_16_layer1, + }, + { + av1_4_partition_nn_bias_16_layer0, + av1_4_partition_nn_bias_16_layer1, + }, +}; + +static const float av1_4_partition_nn_weights_32_layer0[FEATURE_SIZE * 32] = { + -0.219494f, -0.428273f, 0.471006f, 0.448210f, -0.152935f, 0.440435f, + 0.922857f, -0.074436f, 1.002195f, 0.414176f, -0.327202f, -0.380066f, + -0.212346f, 0.061868f, -0.056620f, 0.594134f, 0.617995f, 0.308358f, + 0.232484f, 0.129849f, 1.483593f, -0.071460f, 1.984515f, 1.116422f, + -1.141762f, -0.306220f, 0.089075f, -0.271845f, 0.187524f, 0.050396f, + -0.061025f, 0.030809f, 0.172799f, -0.458151f, -0.318357f, 0.122052f, + -0.414329f, 0.089366f, 0.118898f, -0.376213f, -0.206151f, -0.519946f, + -0.463252f, -0.206694f, -0.254383f, -0.379487f, 0.093059f, -0.245280f, + -0.205044f, -0.280060f, -0.171229f, -0.045389f, -0.179481f, -0.306245f, + -0.500856f, 0.003388f, -0.527397f, -0.449330f, -0.174272f, 0.123769f, + 0.023005f, 0.157273f, 0.073400f, 0.019099f, -0.113848f, -0.098601f, + -0.290946f, -0.046770f, -0.314592f, -0.179914f, -0.391411f, -0.235631f, + -1.282604f, 0.048505f, -0.746382f, 0.093740f, -0.706583f, -0.085729f, + 0.947382f, -0.002961f, 1.175362f, 1.007309f, 0.141638f, -0.037608f, + -0.118807f, -0.021474f, -0.146763f, 0.069363f, -0.074372f, -0.215713f, + -0.004134f, -0.114110f, -0.330438f, -0.031136f, 0.111821f, -0.534598f, + -0.357759f, -0.455950f, 0.139469f, 0.036582f, -0.384743f, -0.168828f, + -0.239250f, 0.003520f, -0.049003f, 0.075702f, -0.025809f, -0.225972f, + -0.228905f, -0.412489f, 0.060570f, -0.328819f, -0.206446f, -0.080231f, + -0.372008f, -0.218118f, -0.011954f, 0.024155f, 0.156014f, 0.020679f, + 0.194398f, -0.283491f, -0.024463f, -0.275099f, 0.028031f, 0.026340f, + -0.254668f, 0.103637f, 2.178693f, 0.552284f, 0.109366f, -0.474806f, + -0.379286f, -0.026315f, 2.487924f, -0.089466f, 0.206428f, 0.114578f, + 0.152248f, 0.184050f, -0.631948f, -0.014793f, -0.283782f, -0.830353f, + 0.009343f, -0.021029f, -0.060534f, -0.025164f, 1.841311f, 1.842748f, + -1.979708f, 0.450985f, -1.606357f, -0.785454f, -0.212679f, -0.344342f, + 0.198991f, -0.258070f, 0.055974f, 0.224069f, 0.453051f, 0.408053f, + 0.027873f, -0.180538f, 0.056609f, 0.207654f, 0.104086f, -0.194426f, + -0.359789f, -0.381143f, -0.331212f, -0.203973f, -0.324313f, -0.160825f, + -0.160439f, -0.044856f, -0.346647f, 0.044859f, 0.231398f, -0.023643f, + -0.140316f, -0.260177f, 0.206965f, -0.425386f, -0.420268f, -0.409748f, + 0.006971f, 0.066186f, -0.034950f, -0.345518f, 0.018633f, -0.122489f, + -0.038506f, -0.330942f, 0.161236f, -0.314119f, -0.050202f, -0.179597f, + 0.731897f, 
-0.184481f, 0.153598f, -0.539501f, -0.301493f, -0.184967f, + -0.883754f, -0.586959f, -0.136292f, -1.772065f, -0.196276f, -0.053272f, + -0.101083f, -0.064142f, 0.161190f, 0.430826f, 0.355647f, 0.138266f, + 0.051114f, -0.028893f, -0.477673f, -0.238663f, -0.354117f, -0.056747f, + -0.334273f, -0.497688f, -0.486004f, -0.092033f, -0.241304f, -0.373250f, + 0.120193f, 0.011360f, -0.010475f, -0.092739f, -0.159650f, -0.033129f, + -0.259893f, -0.073217f, 0.200128f, 0.103407f, -0.229233f, 0.128831f, + -0.063450f, -0.241732f, -0.408428f, -0.342239f, -0.264326f, -0.105403f, + -0.442879f, -0.310456f, -0.112881f, 0.263696f, -0.205014f, -0.497936f, + -0.261734f, -0.382312f, -0.426807f, -0.021995f, -0.152794f, -0.301494f, + 0.117232f, -0.577809f, 0.154596f, -0.409522f, -0.413113f, -0.359199f, + 0.307294f, -0.008746f, -0.310522f, 0.347620f, -0.384845f, -0.451398f, + -0.226199f, 0.054154f, -0.167608f, 0.046836f, -0.013285f, -0.408119f, + -0.177973f, -0.248293f, -0.465830f, 0.035827f, -0.222208f, -0.221717f, + 0.066392f, -0.349769f, -0.428029f, -0.516692f, 0.022398f, -0.251682f, + 0.134746f, 0.011167f, -2.078787f, 0.173592f, -1.948348f, 0.330060f, + 1.993785f, -0.052859f, -0.004795f, -3.703177f, 0.013450f, -0.011687f, + 0.073079f, 0.034803f, 0.025515f, 0.005994f, 0.101731f, 0.074303f, + -0.109962f, -0.270825f, -0.068273f, -0.163268f, -0.252826f, 0.137190f, + 0.007667f, -0.358453f, 0.027412f, 0.033492f, 0.021197f, -0.049991f, + 0.104468f, -0.012157f, -0.056252f, -0.380756f, -0.338483f, 0.233235f, + -0.048631f, -0.441209f, -0.158482f, -0.148108f, -0.263453f, 0.138847f, + -0.304073f, -0.336312f, -0.017941f, -0.135563f, 0.075137f, -0.246475f, + -0.229144f, -0.087744f, -0.346909f, 0.172611f, 0.004377f, -0.009386f, + -0.023104f, 0.008000f, -0.029390f, -0.317842f, 0.549674f, -0.195337f, + -0.863979f, 0.160889f, -0.269014f, -0.442104f, -1.799191f, 1.396533f, + -0.112837f, 0.881303f, 0.000764f, -0.035415f, -0.141877f, 0.184831f, + -0.363566f, -0.178569f, 0.254134f, -0.326893f, 0.127325f, 0.310620f, + -0.384621f, 0.146058f, -0.287682f, -0.373447f, 0.026930f, 0.251650f, + 0.053817f, 0.227509f, 0.121396f, 0.396514f, -0.278381f, -0.038969f, + -1.538756f, -0.002856f, -0.892900f, 0.363426f, -1.257922f, 0.743795f, + 0.941177f, 0.219345f, 0.684189f, 1.396858f, 0.026299f, -0.093433f, + -0.066182f, 0.057868f, -0.089278f, -0.159680f, -0.262035f, -0.236656f, + 0.005349f, -0.031314f, 0.027917f, -0.182113f, -0.212086f, -0.160774f, + 0.051468f, 0.036787f, 0.183881f, -0.288205f, -0.349691f, 0.162511f, + 0.117878f, -0.294534f, -0.365037f, -0.246313f, 0.073977f, -0.072378f, + -0.173579f, -0.584560f, 0.547194f, 0.259853f, -0.405287f, -0.421146f, + 0.165788f, -0.146964f, 0.257415f, 0.772394f, -0.475302f, -0.310906f, + 0.058723f, 0.276833f, 0.586842f, 0.248998f, -0.061135f, 0.255779f, + 0.152158f, -0.024781f, 2.821834f, 1.365141f, 0.914744f, 0.165752f, + -1.048304f, -0.333891f, 1.804087f, -0.437028f, -0.120211f, -0.020443f, + 0.040077f, 0.258600f, -0.598893f, -0.494579f, -0.281054f, -0.517041f, + 0.005258f, 0.053986f, 0.322755f, 0.429495f, -1.992364f, -0.717192f, + -1.774802f, 2.047362f, -0.016194f, 0.312606f, 0.019331f, 0.060950f, + 0.116428f, 0.168458f, -0.307001f, -0.420734f, 0.475843f, 0.425346f, + -0.107119f, 0.049892f, -1.168619f, 0.010878f, 0.354872f, 0.902717f, + -0.391407f, 0.332772f, -1.335037f, -0.447100f, 0.481719f, -0.101069f, + -1.806565f, 0.925280f, 0.346999f, 0.093809f, 0.006275f, 0.270814f, + -0.691123f, 0.230748f, 0.137033f, 0.068228f, 1.555975f, -0.271637f, + -0.370403f, 0.236131f, 0.367464f, -0.136562f, 0.428838f, 
0.181750f, + 0.338762f, 0.292449f, -0.748204f, -0.922731f, -0.959445f, -0.806418f, + -0.140501f, 0.070525f, 1.248748f, 0.637990f, -1.307246f, -0.514055f, + 0.393858f, -1.858727f, 0.713591f, -0.141044f, 0.080723f, 0.120220f, + -0.031175f, 0.224488f, 0.753818f, -0.833351f, -1.099132f, 0.651100f, + -0.135061f, -0.043820f, 0.026983f, -0.059259f, 0.001345f, -0.281775f, + 0.006958f, 0.046103f, -0.246539f, 0.057630f, -0.360778f, -0.160681f, + -0.414870f, -0.301979f, 0.000683f, 0.132957f, -0.477609f, 0.106110f, + -0.637769f, -0.078374f, -0.229494f, 0.583108f, -0.822973f, -0.107540f, + 1.063426f, -0.268346f, 1.105787f, 2.587550f, -0.020314f, -0.002161f, + -0.063836f, -0.099990f, -0.103975f, -0.114078f, -0.094199f, -0.065181f, + -0.019870f, -0.018920f, -0.219732f, 0.035608f, -1.789450f, 0.483032f, + -0.464729f, 1.563277f, -1.054195f, 0.359991f, 0.065204f, 0.135623f, + 0.158380f, -0.103815f, -1.398726f, -1.436666f, -0.356311f, 0.507752f, +}; + +static const float av1_4_partition_nn_bias_32_layer0[32] = { + 0.421645f, -0.620548f, -0.187819f, -0.189414f, -0.204975f, -0.189600f, + -0.174917f, -0.651928f, -0.799655f, -0.086105f, -0.163449f, -0.089212f, + -0.214495f, -0.108500f, -0.065777f, -0.127704f, 1.544948f, -0.032831f, + -0.165621f, 0.145844f, -0.032104f, -0.453246f, -0.113444f, 0.321589f, + -0.862375f, -0.108826f, -0.486259f, 0.685325f, 0.072569f, -0.187961f, + 0.109579f, -0.082685f, +}; + +static const float av1_4_partition_nn_weights_32_layer1[32 * LABEL_SIZE] = { + 0.255012f, 0.658860f, 0.216907f, 0.165947f, 0.241182f, 0.340854f, + 0.409445f, 0.165220f, 0.553373f, -0.242385f, -0.209571f, 0.255515f, + 0.222500f, 0.037032f, 0.238590f, 0.061624f, -2.038693f, 0.264167f, + -0.230144f, 0.129952f, -0.027979f, 0.847761f, 0.438922f, 0.462323f, + 0.555345f, 0.030689f, 0.336357f, -0.357326f, -0.113137f, 0.272631f, + 0.421022f, 0.367776f, -0.197094f, 0.157117f, -0.015008f, -0.056123f, + -0.283913f, 0.186417f, 0.178561f, -0.763041f, 0.602038f, 0.341092f, + 0.320453f, -0.312776f, -0.371240f, -0.356279f, 0.220117f, -0.131871f, + 1.517429f, 0.162223f, -0.255069f, 0.451861f, 0.045071f, -0.223257f, + 0.003257f, 0.015734f, -0.630447f, -0.672588f, 0.670164f, 0.571031f, + -0.657948f, 0.034506f, -0.249076f, 0.790293f, 0.066491f, -0.131245f, + 0.355173f, 0.564622f, 0.374048f, 0.033974f, 0.253970f, 0.495498f, + -0.556321f, -0.104651f, 0.276947f, 0.057148f, -0.039126f, -0.170050f, + -0.141542f, 0.158541f, 0.582763f, -0.100992f, 0.096705f, -0.209029f, + 0.008449f, 0.255865f, 0.103565f, 0.317719f, 0.479499f, 0.599126f, + -0.065613f, -0.268614f, 0.508736f, 0.180813f, -0.815868f, 0.051238f, + 0.001223f, -0.305423f, -0.270079f, 0.036180f, 0.304342f, 0.202634f, + 0.218348f, -0.304304f, -0.438297f, 0.241123f, 0.200230f, 0.151804f, + 0.051944f, 0.160422f, -0.262981f, -0.417412f, 1.845729f, -0.086183f, + 0.403517f, 0.059667f, 0.564543f, -0.081752f, 0.114907f, -0.284489f, + -0.673943f, 0.056965f, 0.362221f, 0.403224f, -0.000233f, -0.209552f, + -0.800926f, -0.134132f, +}; + +static const float av1_4_partition_nn_bias_32_layer1[LABEL_SIZE] = { + -0.019518f, + 0.198546f, + 0.339015f, + -0.261961f, +}; + +static const NN_CONFIG av1_4_partition_nnconfig_32 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 32, // num_hidden_nodes + }, + { + av1_4_partition_nn_weights_32_layer0, + av1_4_partition_nn_weights_32_layer1, + }, + { + av1_4_partition_nn_bias_32_layer0, + av1_4_partition_nn_bias_32_layer1, + }, +}; + +static const float av1_4_partition_nn_weights_64_layer0[FEATURE_SIZE * 24] = { + 
-0.152649f, 0.074509f, 1.000136f, 0.601661f, -1.416694f, -1.932396f, + -1.163850f, 0.640931f, -0.888625f, -0.345711f, 0.161799f, 0.103165f, + 0.147513f, 0.089956f, 0.204329f, 0.196922f, 0.014927f, 0.283714f, + -0.110422f, 0.062005f, -0.531870f, -0.075287f, -0.448349f, -0.218881f, + -0.005592f, -0.130490f, -0.015779f, 0.093521f, -0.158487f, 0.072241f, + 0.066879f, -0.418566f, -0.206281f, 0.025634f, 0.048334f, -0.534750f, + 0.302081f, 0.028707f, -1.543248f, 0.103799f, -1.214052f, 0.395870f, + 0.394754f, -0.272170f, -0.702953f, -4.057464f, -0.033497f, -0.042142f, + 0.014742f, 0.065263f, 0.000879f, -0.019768f, 0.101275f, 0.163059f, + -0.371392f, -0.283484f, 0.241915f, 0.012684f, -0.210101f, -0.166534f, + -0.024894f, 0.274696f, 0.098993f, 0.104086f, 0.055044f, -0.289378f, + 0.146571f, -0.147441f, 0.004056f, 0.112244f, -0.416162f, -0.033176f, + -0.214836f, -0.213787f, 0.023197f, -0.339043f, 0.301109f, -0.408551f, + 0.284922f, -0.344418f, -0.039255f, 0.158748f, -0.344169f, 0.078286f, + -0.043957f, -0.302162f, -0.310826f, 0.063425f, 0.198166f, -0.285324f, + -0.108252f, 0.038992f, -1.053110f, -1.663290f, -0.417185f, 1.504443f, + 0.643206f, -0.850240f, 0.889641f, -0.733214f, 0.147302f, 0.060291f, + -0.052954f, 0.167453f, 0.111870f, 0.085471f, 0.035107f, 0.064361f, + 0.176053f, 0.184373f, 0.676576f, 0.066164f, 1.455569f, 0.925111f, + -0.640845f, 0.803795f, -0.653782f, -0.201038f, 0.060033f, 0.016964f, + -0.047590f, 0.045908f, 0.354162f, 0.014812f, 0.156978f, 0.058792f, + -0.238119f, 0.002450f, -0.094388f, -0.155229f, 0.194858f, -0.355429f, + -0.187098f, -0.119264f, -0.088694f, -0.102845f, 0.184905f, -0.425339f, + -0.157808f, -0.104599f, -0.393248f, -0.379842f, 0.027741f, -0.185816f, + -0.317294f, 0.002453f, -0.498241f, -0.204302f, -0.079093f, 0.020646f, + -0.412850f, -0.426039f, -0.177050f, -0.419304f, -0.064478f, -0.191802f, + -0.146812f, 0.171111f, 0.090261f, -0.367033f, -0.299051f, -0.322132f, + 0.428192f, -0.252613f, 0.488498f, -0.559682f, 0.486720f, -0.511084f, + 0.992506f, 0.346765f, -0.118697f, -0.065127f, -0.376612f, -0.345137f, + -0.426517f, -0.516836f, 0.307083f, 0.609362f, 0.369555f, 0.093775f, + -0.375664f, -0.221595f, -0.025465f, 0.134374f, -0.387031f, 0.096236f, + 0.337465f, -0.124029f, -0.157340f, -0.368790f, -0.104490f, -0.279507f, + -0.247705f, 0.146559f, -0.236206f, -0.036073f, 0.064206f, -0.330919f, + 0.516591f, -0.013492f, 1.269568f, 1.182530f, -0.455390f, -1.328091f, + -0.200950f, -0.380513f, -0.195532f, -0.341479f, 0.016064f, 0.021176f, + 0.169119f, 0.103707f, -0.174504f, -0.462719f, -0.079445f, -0.247128f, + 0.459111f, 0.036129f, 0.769570f, -0.080405f, 1.667107f, 0.355567f, + -2.433896f, 0.627572f, -0.600090f, -0.651872f, -0.059769f, -0.041945f, + -0.009933f, 0.014864f, -0.049378f, -0.041561f, 0.075180f, 0.138307f, + 0.122366f, -0.160756f, 0.215327f, 0.013572f, 0.198194f, -0.762650f, + 0.054466f, 1.110332f, 1.692853f, 0.658654f, -0.409549f, 0.506085f, + 0.330962f, -0.223008f, 0.007448f, -0.289062f, -0.476231f, -0.228359f, + 0.013977f, -0.000609f, -0.673604f, 0.275996f, 0.405291f, 1.693561f, + -1.079768f, 1.122516f, -0.203227f, 0.099265f, -0.165207f, -0.323899f, + -0.269973f, -0.080122f, 0.127700f, 0.190201f, 0.219527f, 0.306194f, + 0.026049f, -0.003779f, 1.107357f, 1.720315f, 1.017908f, 0.078664f, + -1.599813f, -0.482636f, -0.117450f, 0.122249f, 0.030220f, 0.039794f, + 0.176350f, 0.129715f, -0.305755f, -0.274044f, -0.299640f, -0.187335f, + -0.073616f, -0.564507f, -0.127758f, 0.044855f, -0.191090f, 0.039095f, + 0.115378f, 0.969352f, -0.088360f, 0.301443f, 0.065726f, -0.019740f, 
+ -0.102350f, -0.084913f, -0.194615f, 0.118582f, 0.920789f, -0.171615f, + -1.436553f, -0.026419f, -0.730864f, 0.615697f, -0.795079f, 0.119701f, + 0.601782f, 0.792902f, 0.184920f, 1.635090f, -0.085860f, -0.033187f, + -0.166883f, 0.008487f, -0.128300f, -0.089923f, -0.108781f, -0.133719f, + -0.011988f, -0.239816f, -0.092563f, -0.238471f, -0.339722f, 0.177432f, + -0.063101f, -0.121002f, 0.058072f, -0.031166f, 0.086413f, -0.016203f, + -0.305075f, -0.005420f, -0.168796f, 0.148745f, -0.116737f, -0.050222f, + -0.287952f, -0.290982f, -0.090449f, 0.076098f, -0.345632f, -0.061309f, + 0.142218f, 0.035692f, 0.304517f, -0.228031f, 0.119608f, -0.120350f, + 0.163404f, -0.105605f, -0.305462f, -0.176657f, 0.210070f, -0.227600f, + -0.081965f, -0.464027f, -0.053782f, -0.018367f, 0.119159f, 0.017162f, + -0.069792f, 0.305768f, -0.421095f, 0.187740f, -0.032059f, 0.575115f, + -0.064283f, -0.091828f, 0.772648f, -0.393189f, -0.297098f, 0.141420f, + 0.826389f, -0.071586f, -0.893968f, -0.346793f, -1.151655f, 0.039393f, + 1.546000f, -0.094029f, -0.005786f, -0.195764f, -0.169724f, -0.133167f, + -0.129312f, -0.418860f, -0.026553f, -0.053667f, -0.091976f, -0.106275f, + -0.492625f, 0.025350f, -0.332075f, -0.475638f, -0.076667f, -0.065779f, + 0.108957f, 0.246298f, -0.289007f, -0.442552f, -0.206692f, -0.257453f, + 0.073806f, -0.458606f, -0.410390f, -0.312674f, -0.144813f, 0.170128f, + 0.018810f, -0.098241f, 1.027369f, 0.479328f, 1.129707f, 0.484813f, + -0.085207f, 0.621873f, -0.520981f, 0.236175f, 0.273487f, 0.061426f, + 0.306085f, 0.161487f, 0.220991f, 0.223783f, -0.091826f, 0.391031f, +}; + +static const float av1_4_partition_nn_bias_64_layer0[24] = { + 0.580225f, -0.191304f, 1.091767f, -0.134522f, -0.089361f, 0.398750f, + -0.882708f, -0.213102f, -0.119981f, 0.378296f, -0.075719f, 0.426598f, + -2.015505f, 0.202534f, -1.044792f, -0.841519f, 0.266421f, -0.047115f, + -0.131147f, -0.075066f, -0.009441f, 0.853007f, -0.175606f, -0.868306f, +}; + +static const float av1_4_partition_nn_weights_64_layer1[24 * LABEL_SIZE] = { + -0.851937f, -0.211148f, -2.289513f, -0.275071f, 0.251340f, -0.340847f, + 0.498032f, 0.308652f, -0.051574f, 0.323146f, -0.097547f, -0.040269f, + 1.909655f, 0.098348f, 0.588136f, 0.568112f, 0.313297f, 0.920848f, + -0.014486f, 0.386014f, 0.029199f, -0.537330f, -0.021502f, 0.349073f, + -0.524715f, -0.351848f, 1.565454f, -0.297148f, 0.020177f, 0.648369f, + 0.027321f, -0.096052f, -0.363163f, -0.132642f, 0.024292f, -0.734176f, + -0.782700f, 0.408299f, 0.476945f, -0.489512f, -0.728318f, -0.632042f, + 0.405417f, 0.184086f, -0.400730f, 0.359032f, 0.019710f, -0.217409f, + 0.519159f, -0.136316f, 0.993592f, -0.147128f, 0.097495f, 0.426189f, + -0.295233f, 0.278799f, 0.080667f, -0.025052f, -0.307757f, 0.418716f, + -0.853388f, -0.374878f, -0.322725f, 0.696335f, -0.380649f, -0.160356f, + -0.140060f, 0.502455f, 0.656728f, -0.095023f, -0.184198f, -0.347069f, + 0.456372f, -0.029754f, 0.907923f, 0.265710f, -0.065505f, 0.226763f, + -0.277798f, 0.413292f, -0.593899f, -0.060740f, -0.313358f, -0.249944f, + -0.627329f, -0.327151f, -0.853788f, -1.163807f, -0.388944f, -0.228788f, + -0.057382f, 0.334741f, -0.283083f, 0.368280f, -0.407197f, -0.441849f, +}; + +static const float av1_4_partition_nn_bias_64_layer1[LABEL_SIZE] = { + -0.478735f, + 0.292948f, + 0.293172f, + 0.040013f, +}; + +static const NN_CONFIG av1_4_partition_nnconfig_64 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 24, // num_hidden_nodes + }, + { + av1_4_partition_nn_weights_64_layer0, + 
av1_4_partition_nn_weights_64_layer1, + }, + { + av1_4_partition_nn_bias_64_layer0, + av1_4_partition_nn_bias_64_layer1, + }, +}; + +#undef FEATURE_SIZE +#undef LABEL_SIZE + +#define FEATURE_SIZE 4 +static const float + av1_partition_breakout_nn_weights_128_layer0[FEATURE_SIZE * 32] = { + -0.331785f, 0.068675f, -0.323814f, 0.033714f, -0.237835f, 0.166316f, + -0.498766f, -0.545634f, -0.266173f, -0.476957f, -0.120409f, -0.021042f, + 0.124056f, -0.278750f, -0.110120f, -0.372812f, 4.547939f, 0.097618f, + -0.002710f, -0.064169f, -1.841173f, -0.403833f, 0.005536f, 0.067188f, + -0.434935f, -0.227421f, -0.000011f, -0.139961f, -0.174056f, -0.652384f, + -0.000015f, -0.262847f, -3.319706f, -0.947693f, 0.002981f, 0.016717f, + -10.408850f, -0.014568f, -0.000018f, 0.019084f, 1.523383f, 0.074525f, + -0.002076f, -0.020734f, 4.881495f, 0.002799f, 0.000342f, -0.019623f, + 1.786154f, 0.037462f, -0.019037f, 0.052833f, 11.408153f, -0.044602f, + 0.026155f, -0.518627f, -0.474499f, -0.427430f, -0.442733f, -0.011116f, + -22.379410f, -0.000549f, -0.001418f, 0.008090f, -0.295090f, -0.230268f, + -0.337278f, -0.001127f, -0.644282f, -0.598783f, -0.539417f, -0.003303f, + 9.189824f, 0.038066f, -0.004097f, -0.460045f, -0.308858f, -0.242691f, + -0.230835f, -0.273057f, 0.152226f, 0.179239f, -0.146382f, -0.004655f, + -0.242940f, -0.718862f, -0.001685f, -0.214736f, 3.263186f, 0.079463f, + -0.003854f, -0.187461f, -0.599144f, -0.419808f, -0.000597f, -0.136980f, + 0.184813f, -0.319525f, -0.007246f, 0.079709f, -0.883229f, -0.343748f, + -0.000077f, -0.172214f, -0.548759f, -0.194674f, -0.144786f, 0.043896f, + -0.176364f, -0.248394f, -0.090215f, -0.294743f, -0.280980f, -0.181436f, + -0.115681f, -0.071915f, -13.035494f, -0.075623f, 0.017052f, -0.171152f, + 5.910803f, 0.128344f, 0.010256f, -1.073301f, 2.387826f, 0.166183f, + -0.007193f, -0.257836f, + }; + +static const float av1_partition_breakout_nn_bias_128_layer0[32] = { + 0.115591f, -0.100178f, -0.165523f, -0.122997f, 11.045759f, 1.034761f, + -0.323672f, -0.189087f, 2.850950f, 7.010029f, -21.447067f, 1.877031f, + 0.437442f, 5.929414f, -0.117274f, 4.462253f, -0.135198f, -0.145927f, + 8.727211f, 0.000000f, -3.532987f, -0.405898f, 11.364439f, -0.141728f, + -5.994947f, -0.362574f, 1.857687f, -0.100400f, -0.130312f, 0.006080f, + 0.429660f, -8.439470f, +}; + +static const float av1_partition_breakout_nn_weights_128_layer1[32] = { + -0.013738f, 0.022052f, -0.074437f, -0.211377f, -0.080433f, 0.015543f, + 0.002091f, 0.014252f, 0.134834f, 0.190263f, 0.244175f, -0.031747f, + 0.020068f, -0.068326f, 0.185471f, 0.660268f, -0.134898f, -0.010376f, + -0.276023f, -0.282921f, -0.022769f, 0.007070f, -0.186235f, 0.024407f, + -0.024837f, 0.005764f, 0.016599f, -0.040077f, 0.020990f, 0.095054f, + -0.039662f, 0.131499f, +}; + +static const float av1_partition_breakout_nn_bias_128_layer1[1] = { + 0.86678213f, +}; + +static const NN_CONFIG av1_partition_breakout_nnconfig_128 = { + FEATURE_SIZE, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 32, // num_hidden_nodes + }, + { + av1_partition_breakout_nn_weights_128_layer0, + av1_partition_breakout_nn_weights_128_layer1, + }, + { + av1_partition_breakout_nn_bias_128_layer0, + av1_partition_breakout_nn_bias_128_layer1, + }, +}; + +static const float + av1_partition_breakout_nn_weights_64_layer0[FEATURE_SIZE * 16] = { + 0.872892f, -0.235539f, -0.412159f, -0.142533f, -2.251479f, -0.057073f, + -0.001373f, 0.112147f, 5.281734f, 0.060704f, 0.000838f, -0.961554f, + 0.244995f, 0.154515f, -0.292654f, -0.167177f, -3.759112f, -0.486347f, + 0.003208f, 
-0.418226f, 2.618152f, 0.026832f, 0.003988f, -0.404406f, + -0.405434f, 0.102791f, -0.033406f, -0.029820f, -4.492342f, -0.154291f, + 0.012947f, -0.195075f, 0.009311f, -0.411410f, -0.010986f, -0.554822f, + 0.160576f, 0.020796f, -0.457230f, -0.191111f, -7.759542f, -0.065039f, + -0.001322f, 0.055691f, 0.291924f, -0.053076f, -0.148379f, -0.298383f, + 1.022023f, -0.033668f, -0.000804f, -0.825778f, -3.902254f, -0.085812f, + -0.052520f, -0.035012f, -0.465468f, -0.319231f, -0.497529f, -0.183068f, + -2.407131f, -0.062304f, 0.000874f, 0.108786f, + }; + +static const float av1_partition_breakout_nn_bias_64_layer0[16] = { + 0.081425f, -14.404084f, 11.511393f, -0.930053f, 1.841889f, 15.020920f, + -1.872288f, 5.392535f, -0.329335f, -0.005358f, 12.600776f, 0.000000f, + -0.337413f, 4.492778f, 0.000000f, 17.043072f, +}; + +static const float av1_partition_breakout_nn_weights_64_layer1[16] = { + -0.465338f, -0.103023f, -0.174808f, -0.005156f, -0.016366f, -0.172494f, + 0.014185f, 0.067030f, -0.001939f, -0.175049f, 0.245992f, -0.181660f, + -0.038572f, 0.307899f, -0.294283f, 0.118323f, +}; + +static const float av1_partition_breakout_nn_bias_64_layer1[1] = { + -1.33438122f, +}; + +static const NN_CONFIG av1_partition_breakout_nnconfig_64 = { + FEATURE_SIZE, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, // num_hidden_nodes + }, + { + av1_partition_breakout_nn_weights_64_layer0, + av1_partition_breakout_nn_weights_64_layer1, + }, + { + av1_partition_breakout_nn_bias_64_layer0, + av1_partition_breakout_nn_bias_64_layer1, + }, +}; + +static const float + av1_partition_breakout_nn_weights_32_layer0[FEATURE_SIZE * 16] = { + -4.825528f, -0.145737f, 0.001907f, 0.145415f, -1.858153f, -0.080744f, + 0.000601f, 0.211991f, 0.384265f, -0.043945f, -0.521332f, -0.170622f, + -0.046866f, -0.600506f, -0.001216f, -0.332760f, -0.447677f, -0.605844f, + -0.121008f, -0.119936f, -0.215739f, -0.269665f, -0.668587f, 0.071318f, + -1.202551f, -0.729727f, -0.370084f, 0.088215f, -1.926800f, -0.086519f, + 0.000359f, 0.215120f, 0.718749f, 0.022942f, 0.003840f, -0.176518f, + 1.213451f, 0.080786f, 0.001557f, -1.053430f, 0.202698f, -0.583919f, + -0.535512f, -0.239927f, -0.110151f, -0.128832f, -0.441087f, -0.145575f, + -0.178518f, -0.585784f, 0.000029f, -0.833014f, -0.331358f, -0.520297f, + -0.088676f, -0.178487f, -1.430755f, 0.022981f, -0.106931f, 0.015573f, + -0.520814f, -0.045386f, -0.443123f, -0.484209f, + }; + +static const float av1_partition_breakout_nn_bias_32_layer0[16] = { + 11.747026f, -9.337718f, 0.341648f, -0.155847f, -0.104005f, 4.666283f, + 6.669584f, 16.625504f, 9.885626f, 15.439183f, -0.346080f, 0.000000f, + -0.423808f, 0.000000f, 6.352258f, -0.155787f, +}; + +static const float av1_partition_breakout_nn_weights_32_layer1[16] = { + 0.168561f, -0.122519f, 0.524667f, 0.032474f, 0.059097f, 0.011900f, + 0.166445f, 0.127256f, -0.034838f, -0.212586f, -0.317973f, 0.348419f, + -0.004171f, 0.157694f, 0.117845f, 0.272115f, +}; + +static const float av1_partition_breakout_nn_bias_32_layer1[1] = { + 0.09049262f, +}; + +static const NN_CONFIG av1_partition_breakout_nnconfig_32 = { + FEATURE_SIZE, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, // num_hidden_nodes + }, + { + av1_partition_breakout_nn_weights_32_layer0, + av1_partition_breakout_nn_weights_32_layer1, + }, + { + av1_partition_breakout_nn_bias_32_layer0, + av1_partition_breakout_nn_bias_32_layer1, + }, +}; + +static const float + av1_partition_breakout_nn_weights_16_layer0[FEATURE_SIZE * 16] = { + 0.209371f, 0.028758f, 0.005764f, 
-0.384401f, -0.625777f, -0.005647f, + -0.316867f, 0.042985f, 0.127344f, 0.025461f, 0.011465f, -0.071043f, + -0.295977f, -0.076093f, -0.209681f, -0.311653f, -0.147538f, 0.009910f, + -0.130997f, -0.012326f, 0.024124f, -0.323578f, -0.005790f, -0.085664f, + -1.575066f, -0.119221f, 0.015018f, 0.187204f, 0.238117f, 0.084924f, + -0.004444f, -1.271538f, -0.709860f, -0.006226f, -0.903111f, 0.090573f, + -0.278642f, -0.011114f, 0.021162f, 0.081290f, -0.467486f, -0.040771f, + -0.224069f, -0.714390f, -0.281905f, -0.001336f, -0.761212f, -0.060385f, + -0.814479f, -0.050450f, -0.003666f, 0.085668f, -0.272589f, 0.057330f, + -0.206540f, -0.303418f, 0.075335f, -0.180468f, -0.064872f, -0.755948f, + -0.509287f, -0.048877f, -0.001512f, 0.077086f, + }; + +static const float av1_partition_breakout_nn_bias_16_layer0[16] = { + 16.421495f, 4.012273f, -1.828571f, 0.000000f, -0.263564f, -0.201972f, + 6.564987f, 14.651000f, -3.227779f, 2.241833f, -0.137116f, 0.762876f, + 5.625762f, 0.615822f, 0.040057f, 16.668884f, +}; + +static const float av1_partition_breakout_nn_weights_16_layer1[16] = { + -0.096440f, 0.184316f, -0.021148f, 0.424974f, 0.003743f, 0.006310f, + 0.046266f, -0.219224f, -0.087004f, 0.024623f, -0.275798f, 0.120164f, + 0.269773f, -0.021105f, -0.146698f, 0.188764f, +}; + +static const float av1_partition_breakout_nn_bias_16_layer1[1] = { + 1.60751927f, +}; + +static const NN_CONFIG av1_partition_breakout_nnconfig_16 = { + FEATURE_SIZE, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, // num_hidden_nodes + }, + { + av1_partition_breakout_nn_weights_16_layer0, + av1_partition_breakout_nn_weights_16_layer1, + }, + { + av1_partition_breakout_nn_bias_16_layer0, + av1_partition_breakout_nn_bias_16_layer1, + }, +}; + +static const float + av1_partition_breakout_nn_weights_8_layer0[FEATURE_SIZE * 16] = { + -0.255885f, 0.109548f, -0.111054f, -0.476119f, -1.083031f, -0.342003f, + 0.048241f, -0.356013f, -0.085054f, 0.124908f, 0.000084f, -0.149906f, + -0.729829f, 0.133535f, -0.002125f, 0.207516f, -0.210163f, -0.567365f, + -0.590103f, 0.045308f, -0.539406f, 0.130550f, -0.663879f, -0.170549f, + 0.017587f, -0.054187f, 0.000550f, 0.038297f, -0.112891f, -0.012751f, + -0.048067f, 0.095564f, 0.079892f, 0.077285f, -0.749708f, -0.286312f, + -0.054334f, 0.132242f, -0.004152f, -0.209758f, -0.073407f, 0.082306f, + -0.001034f, -0.090990f, 0.122823f, -0.109794f, -0.230066f, -0.391155f, + -0.262245f, -0.004744f, -0.232246f, 0.099290f, -0.637484f, 0.111937f, + -0.548556f, -0.598344f, 0.123265f, -0.281395f, -0.399711f, -0.525671f, + -0.596269f, 0.098494f, -0.005765f, 0.173652f, + }; + +static const float av1_partition_breakout_nn_bias_8_layer0[16] = { + 0.194141f, -0.111223f, 2.503733f, -7.155602f, -0.695068f, 0.114874f, + 2.056990f, 5.284306f, 0.639643f, -2.792049f, -2.232339f, -0.232209f, + 2.336705f, -0.278834f, 0.231905f, 7.954366f, +}; + +static const float av1_partition_breakout_nn_weights_8_layer1[16] = { + -0.014439f, 0.010171f, 0.048116f, -0.090659f, -0.081235f, -0.021840f, + -0.017360f, 0.031063f, -0.031737f, -0.023439f, -0.037725f, 0.021954f, + 0.055858f, 0.230970f, -0.056466f, 0.119780f, +}; + +static const float av1_partition_breakout_nn_bias_8_layer1[1] = { + 1.27784479f, +}; + +static const NN_CONFIG av1_partition_breakout_nnconfig_8 = { + FEATURE_SIZE, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, // num_hidden_nodes + }, + { + av1_partition_breakout_nn_weights_8_layer0, + av1_partition_breakout_nn_weights_8_layer1, + }, + { + av1_partition_breakout_nn_bias_8_layer0, + 
av1_partition_breakout_nn_bias_8_layer1, + }, +}; +#undef FEATURE_SIZE + +#define FEATURE_SIZE 9 // Input layer size +#define NUM_NODES 32 // Hidden layer size +#define LABEL_SIZE 3 // Output layer size + +static const float av1_rect_partition_nn_weights_8_layer0[FEATURE_SIZE * + NUM_NODES] = { + 0.22151f, 0.99424f, 0.23415f, -1.13841f, -0.11277f, 0.09530f, 0.14769f, + -1.18895f, -0.96640f, -0.21421f, -0.13974f, 0.03236f, 0.15777f, -0.03176f, + 0.02729f, -0.37344f, -0.01727f, -0.05469f, 0.19402f, -3.45508f, 0.90106f, + -2.91557f, 0.19379f, 0.14356f, -0.13291f, 0.05734f, -0.03032f, -0.13060f, + 0.35744f, 1.31630f, -1.54493f, -0.20749f, -0.24413f, -0.04524f, -0.12400f, + 1.08305f, -0.21596f, 0.76244f, 1.10616f, -1.71706f, 0.05768f, 0.10966f, + 0.00949f, -0.12680f, 0.00699f, -0.11522f, -0.38566f, 0.34283f, -0.35266f, + -0.40643f, -0.22462f, 0.32300f, -0.39737f, -0.20587f, -0.16096f, 1.07543f, + 0.30314f, -1.35659f, -0.38212f, 0.45857f, 0.76615f, 0.16819f, -1.24459f, + 0.39677f, 0.87436f, -2.33757f, 1.27471f, 0.27488f, 0.01019f, -0.01221f, + -0.07461f, -0.14577f, -0.01231f, -0.64426f, -1.02733f, -1.96242f, 0.95143f, + -0.06777f, -1.13868f, 0.01354f, -0.75590f, -0.78222f, -0.07453f, 0.61788f, + 0.56899f, 1.17144f, 0.70899f, 0.48568f, 0.11266f, 0.81579f, -0.03929f, + 0.01088f, 0.33599f, -0.22401f, -0.49654f, -0.02598f, 0.04509f, -0.08217f, + -0.30687f, 0.19851f, -2.96860f, -2.30698f, 0.01848f, 0.11801f, 0.06614f, + 0.01673f, -0.11002f, -0.08168f, 0.09204f, -0.06379f, 0.27972f, -0.31716f, + -0.00566f, -0.13651f, -0.37276f, 0.01511f, -0.23697f, 0.21696f, -0.19480f, + 0.60758f, -0.43506f, -0.02247f, -1.45073f, 0.84442f, -0.94018f, 0.32550f, + 0.03985f, -0.06581f, 0.21665f, 0.79472f, -2.41080f, 0.04788f, -0.09492f, + -0.10677f, 0.07250f, 0.14329f, -0.37319f, 0.53043f, -0.49108f, 0.25792f, + -0.36569f, -0.28669f, -0.18416f, -0.52385f, -1.17081f, -1.32153f, -1.13403f, + -0.26196f, 0.93379f, 0.72115f, 0.54464f, 0.27642f, 0.04757f, 2.01629f, + 1.55787f, -0.11665f, 1.00722f, -0.24352f, 0.53308f, 0.57719f, 0.39344f, + 0.19174f, 0.06339f, -0.02530f, 0.07724f, -0.32416f, -0.26992f, -0.35887f, + -0.35285f, -0.33379f, -0.37475f, -0.77335f, 1.70027f, -1.52153f, -0.26503f, + 0.97552f, -2.96705f, -0.91220f, -0.11827f, 0.00406f, -0.14514f, 0.18417f, + -0.20874f, 0.27293f, -0.34072f, -0.34838f, -0.19054f, -0.29806f, -0.27960f, + -0.19293f, -0.18275f, -0.05902f, 0.58625f, -0.05470f, -0.48814f, -0.45382f, + -0.05959f, 2.01250f, -0.30014f, 0.69546f, -1.24180f, 1.34923f, 0.20337f, + 0.16850f, 0.07187f, 0.72630f, -0.15380f, -2.40973f, -2.73561f, -1.71375f, + -1.61695f, 0.50052f, 0.09730f, 0.00579f, 0.06133f, -0.06512f, -0.61439f, + -1.16173f, -0.58716f, 1.60438f, 0.23242f, 0.91847f, 0.49041f, -0.16277f, + -0.02574f, -0.64593f, 1.17028f, 0.46852f, 0.14926f, 0.73853f, -0.78521f, + 0.05959f, -0.35590f, 0.02039f, 0.10812f, -0.28650f, 1.34038f, -0.72188f, + 0.62385f, -0.35271f, -0.39599f, 0.41543f, 0.53124f, -0.23510f, -0.15480f, + -0.05066f, -0.33529f, 0.05238f, -0.35311f, -0.26983f, -0.39764f, 0.01085f, + 0.26593f, -0.18411f, -0.29945f, 0.50090f, -0.03397f, 0.78562f, -0.33068f, + 1.21308f, -2.23273f, -0.33366f, -0.15164f, -1.13270f, 0.17394f, 0.65567f, + 0.76496f, 0.44325f, 0.01368f, -0.33619f, -0.64256f, 0.64478f, 0.84553f, + 1.74183f, 0.22563f, -0.14550f, -0.16258f, 0.03010f, 0.49922f, 0.64575f, + -0.29187f, -0.10348f, -1.43619f, -0.56540f, -0.14779f, 0.04616f, 0.87411f, + -1.08228f, +}; + +static const float av1_rect_partition_nn_bias_8_layer0[NUM_NODES] = { + 0.33919f, -0.03003f, 0.79073f, -0.18508f, 0.00668f, 
-0.12017f, 0.35362f, + -0.51642f, 0.06536f, 0.41668f, -0.06509f, 0.94606f, -0.15385f, 0.14936f, + 1.46274f, -0.06961f, 2.82537f, -1.95576f, -0.09457f, 0.02042f, -0.07480f, + -0.55083f, 0.26170f, 4.39883f, 0.33999f, -0.10502f, 0.70884f, -0.06992f, + -0.22638f, 1.40940f, -0.09309f, 0.05828f, +}; + +static const float av1_rect_partition_nn_weights_8_layer1[NUM_NODES * + LABEL_SIZE] = { + 0.09209f, 0.26236f, 0.62136f, 0.76324f, -1.14678f, 0.42289f, -0.08895f, + -0.97267f, 2.05958f, 0.00843f, 0.35335f, 1.12096f, -0.11679f, 0.07350f, + -1.23231f, -0.61990f, 1.51379f, -1.99450f, 0.22441f, 2.41974f, -0.30488f, + -0.37869f, 0.47168f, -3.70132f, 0.00061f, 0.19432f, 0.11512f, 0.26200f, + -0.35285f, 0.37985f, 0.90571f, 0.27344f, 0.74840f, -0.17965f, -2.51433f, + 0.59235f, 1.16670f, -0.53446f, 0.67897f, 0.04505f, -0.86874f, 0.45361f, + -0.35033f, 1.21283f, 0.31426f, -0.20841f, 0.56757f, 0.45909f, -1.23683f, + 0.09835f, -0.17214f, -0.96323f, 0.01138f, -0.50233f, 0.30104f, 2.01814f, + 1.15821f, -0.11947f, 0.74574f, -0.30714f, -0.39646f, -1.30086f, -0.88541f, + -0.12259f, -0.54977f, 0.30069f, 1.84299f, -0.95141f, -0.65887f, -0.25888f, + -0.63265f, 1.29531f, -0.56672f, 0.10837f, -0.21297f, -2.19131f, 0.01156f, + 0.51912f, 0.46704f, 0.42810f, -0.59271f, 0.98469f, -0.17914f, -1.91163f, + -0.32807f, 0.48199f, -0.99525f, 1.67108f, -0.87631f, -0.60258f, -0.78731f, + -0.32877f, 0.44237f, 0.01087f, 0.07489f, -0.28224f, +}; + +static const float av1_rect_partition_nn_bias_8_layer1[LABEL_SIZE] = { + 1.70665f, + -0.77954f, + -0.92709f, +}; + +static const NN_CONFIG av1_rect_partition_nnconfig_8 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + NUM_NODES, + }, // num_hidden_nodes + { av1_rect_partition_nn_weights_8_layer0, + av1_rect_partition_nn_weights_8_layer1 }, + { av1_rect_partition_nn_bias_8_layer0, av1_rect_partition_nn_bias_8_layer1 } +}; + +static const float av1_rect_partition_nn_weights_16_layer0[FEATURE_SIZE * + NUM_NODES] = { + -0.18480f, -0.05410f, -0.18957f, 0.15451f, -0.38649f, -0.26162f, -0.22727f, + -0.38555f, -0.36738f, 0.74384f, -1.85999f, 0.98491f, -0.72119f, 1.77321f, + 0.39983f, 0.96314f, 0.23695f, 0.30200f, 0.30629f, -0.47617f, -1.43320f, + -1.81730f, 0.36554f, -0.07142f, -1.27242f, -1.27697f, 0.00110f, -0.32179f, + 0.27460f, 0.45428f, 0.15308f, -0.73906f, -0.28577f, -0.01238f, -0.16958f, + -0.85390f, 1.05484f, -1.62812f, 0.77632f, -0.27327f, -0.32527f, 0.32726f, + 1.73255f, 0.53763f, 0.59121f, -0.39068f, -0.32451f, -0.31869f, 0.17777f, + 0.07519f, -0.18066f, -0.11250f, -0.14616f, -0.16882f, -0.04099f, -0.67959f, + 0.39674f, -0.08596f, 0.18587f, -2.04097f, -1.73993f, 1.57212f, 1.42410f, + -1.36762f, -0.41485f, -1.12103f, 0.56959f, 0.11500f, 0.48945f, -0.13585f, + 1.22125f, 0.67071f, -1.11812f, -0.20660f, -0.52856f, 0.70663f, 0.74382f, + 0.61114f, -0.11454f, 1.14687f, 0.80322f, -0.45965f, -0.44466f, -0.05830f, + 0.13206f, -0.53750f, -0.11324f, -0.37971f, -0.13491f, -0.21268f, 1.93407f, + 1.34433f, 2.49427f, 2.91955f, 1.71730f, 0.03295f, 0.03587f, -0.14550f, + 0.08189f, -0.38655f, -0.35432f, -0.62706f, -0.01849f, -0.57882f, -0.60438f, + -1.01334f, -0.57302f, 0.22592f, 0.05916f, -0.05305f, -0.89824f, -0.52969f, + -0.24542f, 0.27029f, -0.40924f, -0.82452f, -0.60665f, -5.03025f, 0.83302f, + 1.83695f, 2.19716f, 2.31001f, 0.03657f, 0.00063f, -0.04379f, 0.05835f, + -0.08623f, 0.20557f, -0.17791f, 0.07874f, -0.25456f, -0.19513f, -0.27753f, + -0.31982f, 0.00245f, -0.33183f, 0.26059f, -0.22165f, 0.37582f, -0.30411f, + -0.22639f, -0.14739f, -0.20201f, 
-0.37507f, -1.30653f, 0.49570f, 1.03673f, + 0.66139f, 0.44941f, -0.44461f, -0.50376f, -0.49664f, 0.18608f, -0.26175f, + 0.14844f, 0.78715f, -0.70344f, -0.87624f, -0.98535f, -0.35346f, 0.37094f, + -0.43135f, -0.22571f, 3.46263f, 3.13580f, -1.33203f, -0.15247f, -0.15866f, + -0.11214f, 0.12211f, 0.03964f, -1.87597f, -4.81597f, -4.80195f, -4.98096f, + -5.62336f, -0.05337f, -0.00943f, 0.00792f, 0.02742f, 1.05679f, 2.41455f, + 0.85382f, 1.42504f, 0.58096f, 0.21443f, 1.02694f, 1.06746f, 1.20242f, + 0.60767f, 1.98667f, -0.80879f, -0.63495f, 1.95508f, 0.23952f, -0.15019f, + -0.16097f, 0.30155f, -3.42407f, -1.34998f, 9.07689f, -2.22559f, 2.22562f, + -0.03348f, -0.05229f, 0.05931f, 0.03042f, -0.18068f, -0.05732f, -0.33010f, + -0.32279f, -0.26607f, -0.02723f, -0.04067f, 0.08700f, -0.16366f, -0.24935f, + -0.69124f, 0.58508f, 0.50654f, 0.04492f, 1.38340f, -1.51487f, 1.72889f, + -1.95618f, -3.65013f, -1.38525f, -3.05516f, -2.40448f, 2.47467f, 0.03784f, + 0.08052f, -0.01971f, -0.08918f, -0.84997f, -0.55302f, -1.07861f, -0.62626f, + 0.61751f, -0.11012f, -0.24185f, -0.39201f, -1.85390f, -0.31261f, -0.11927f, + 0.15671f, -0.23450f, -0.14916f, -0.31715f, -0.19350f, 0.01795f, -0.11533f, + -0.05799f, -0.03142f, 0.20218f, -0.39499f, -0.33859f, -0.13201f, -0.19527f, + -0.28459f, -0.20346f, 0.89457f, -2.22103f, -2.37455f, -2.00221f, 2.44553f, + 0.33915f, 0.50047f, -0.34625f, -0.19667f, -0.56333f, -0.84328f, 1.25767f, + -1.70297f, 1.00482f, -0.00103f, -1.40813f, 0.21311f, 0.39230f, -0.07302f, + -3.49100f, 1.60675f, -2.90692f, 0.11022f, 0.13507f, -0.13308f, 0.15201f, + -0.05573f, +}; + +static const float av1_rect_partition_nn_bias_16_layer0[NUM_NODES] = { + -0.16783f, -0.16023f, 0.52215f, -0.04109f, 2.00122f, -0.11633f, 0.25535f, + 1.80638f, 1.69273f, -0.25998f, -6.83550f, -0.79682f, -1.03466f, 1.42721f, + 0.00000f, -0.00000f, -0.11665f, -0.12047f, -1.01497f, 7.27181f, -0.78548f, + -1.39335f, -5.42248f, -0.10388f, 0.07634f, 2.81012f, -0.57429f, -0.15629f, + -0.12044f, 1.65478f, -0.75153f, 1.18441f, +}; + +static const float av1_rect_partition_nn_weights_16_layer1[NUM_NODES * + LABEL_SIZE] = { + -0.26407f, 0.06322f, 0.87932f, 0.17772f, 0.71686f, -0.12283f, 0.08454f, + 0.20098f, -0.31763f, -0.33178f, -4.59535f, -0.04367f, 0.17099f, 3.80486f, + 0.16750f, 0.29218f, 0.57234f, -0.96550f, -0.10599f, -4.91130f, -0.14658f, + 0.95803f, -4.13925f, 0.24567f, 0.25708f, 1.60547f, -1.03251f, -0.31053f, + -0.05659f, -0.94121f, -0.68926f, -0.24738f, -0.38019f, 0.98950f, 0.13689f, + 0.24504f, 0.49623f, 0.19980f, 0.38349f, 0.37481f, 0.54540f, -0.02198f, + 3.43385f, 1.02543f, -0.40921f, -3.07235f, 0.02996f, 0.00323f, -0.35414f, + 0.71099f, 1.39334f, 2.43741f, -1.11007f, -0.22739f, -4.21757f, 0.11905f, + 0.00353f, -1.69637f, 0.45944f, -0.19884f, 0.03624f, 0.25729f, 0.23659f, + -2.08405f, 0.08573f, -0.53393f, -1.28103f, -0.53970f, -0.65465f, 0.31821f, + -0.09884f, -0.69026f, -0.37284f, 0.04622f, 1.32973f, -0.15414f, 0.19138f, + -0.67927f, -0.17658f, 0.36008f, -0.51832f, 0.09887f, -1.94414f, 2.95227f, + 1.76937f, -0.26687f, 8.50976f, 0.26247f, 0.60262f, -0.27910f, 0.30061f, + -0.05117f, 0.16018f, 0.71195f, 0.57871f, 1.57794f, +}; + +static const float av1_rect_partition_nn_bias_16_layer1[3] = { + 2.68750f, + -1.31894f, + -1.36768f, +}; + +static const NN_CONFIG av1_rect_partition_nnconfig_16 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + NUM_NODES, + }, // num_hidden_nodes + { av1_rect_partition_nn_weights_16_layer0, + av1_rect_partition_nn_weights_16_layer1 }, + { 
av1_rect_partition_nn_bias_16_layer0, av1_rect_partition_nn_bias_16_layer1 } +}; + +static const float av1_rect_partition_nn_weights_32_layer0[FEATURE_SIZE * + NUM_NODES] = { + -0.54654f, -0.43537f, -0.10620f, -0.48051f, -0.43543f, -0.22737f, -0.15429f, + -0.09858f, -0.09438f, 0.37306f, 0.23934f, -1.86375f, -1.18307f, -0.32995f, + -0.09745f, 0.05431f, -0.13799f, 0.14734f, -0.33219f, 0.18057f, -0.23792f, + -0.28126f, 0.02977f, -0.07431f, 0.07860f, 0.00067f, -0.01927f, 1.01841f, + -0.57739f, 0.08412f, -1.33843f, -1.05563f, -0.28693f, -0.39425f, -0.69572f, + -0.16703f, 0.02808f, 0.11994f, -0.26267f, 0.19706f, -0.29707f, -0.25305f, + -0.07050f, -0.02704f, -0.31528f, -0.42301f, 0.22496f, -0.37001f, -0.23319f, + -0.11139f, -0.30513f, 0.04213f, -0.12550f, 0.02504f, 0.33245f, 0.01102f, + -0.35950f, -0.05949f, -0.19590f, -0.27457f, -0.28339f, -0.15676f, -0.21538f, + 0.65066f, 0.28443f, -1.24943f, -3.00246f, -1.01897f, 0.09304f, 0.70052f, + -0.12877f, 0.21120f, -0.37476f, 0.23261f, -0.28401f, 0.09837f, 0.00020f, + -0.12106f, -0.32354f, -0.02472f, -0.19772f, 1.01886f, 0.16596f, -0.06532f, + 1.72938f, 1.57754f, 0.55963f, 0.33246f, -0.20023f, 0.30715f, 0.08629f, + 0.18945f, -0.45988f, -1.22610f, -0.05152f, -0.48859f, -1.02104f, -0.27315f, + -0.57698f, 0.04157f, -0.92428f, -1.31268f, 1.78210f, 0.10291f, 1.55042f, + -1.26793f, 1.39042f, -1.43729f, 0.25600f, 5.21263f, 5.31955f, 5.19316f, + 5.43430f, 0.00294f, -0.00970f, -0.02333f, 0.00250f, 1.17672f, 6.27544f, + 4.95973f, 3.54009f, 4.51269f, 0.30750f, 0.78780f, -0.44741f, -0.76442f, + 0.75050f, 0.58799f, 0.03400f, -2.09859f, 1.67313f, 0.12503f, 0.28609f, + 1.15809f, 2.46530f, -0.04898f, 0.23072f, -0.12635f, -0.82097f, -0.63827f, + 2.16779f, 1.77132f, 0.15434f, -1.06427f, 0.06206f, -0.87732f, -0.61897f, + -0.44593f, -0.77131f, -0.15979f, -0.02282f, -0.74381f, 0.66052f, -0.22992f, + 1.74638f, 1.29199f, -0.55464f, 0.98316f, 0.06665f, 0.50254f, -0.66292f, + 0.17113f, -0.32633f, -1.85803f, -0.92759f, 4.44965f, 1.33057f, 0.02135f, + -0.27446f, -0.26018f, -0.12613f, -0.14470f, -0.23355f, -0.09717f, -0.24123f, + -0.05535f, -0.19146f, -0.36222f, -0.30458f, -0.40323f, 0.21779f, 0.14248f, + -0.48630f, 0.18840f, 0.11040f, 0.17287f, -0.51880f, 1.12466f, -0.38888f, + -0.16421f, -0.31784f, -0.36112f, -0.25386f, -0.01636f, 0.10029f, -0.26881f, + -0.17051f, -0.30903f, -0.08573f, -0.28774f, -0.01173f, -0.09706f, -0.23089f, + -0.12922f, -0.17463f, -0.12433f, -0.23074f, 0.15220f, 1.29826f, 0.23788f, + 0.04189f, 2.66416f, 0.48815f, -0.06803f, 0.96742f, 1.27165f, -0.70348f, + -0.09941f, -0.42948f, -0.20243f, -0.02364f, -0.26689f, -0.40629f, -0.68217f, + -0.48073f, 2.43657f, -2.60191f, -1.82837f, 0.50440f, 0.71829f, 0.76491f, + 0.28293f, 0.20568f, 0.92642f, -0.02496f, 1.43637f, -0.24474f, -1.21030f, + 0.54084f, 1.05130f, 1.29572f, 0.03750f, -0.36894f, 0.74548f, -1.33857f, + -0.84858f, 1.35230f, 0.80175f, 0.66136f, 1.06473f, 0.18701f, 1.42413f, + 0.04661f, -0.07820f, 0.64990f, -0.43595f, 1.18304f, -0.11437f, -0.06365f, + 0.03558f, 0.78260f, -1.74890f, 1.56217f, -1.23424f, 4.59193f, -3.35072f, + 0.01180f, -0.18296f, -0.20870f, 0.04510f, 1.52595f, -1.37402f, -0.33123f, + -0.85957f, 0.80598f, 0.03743f, 0.02354f, 0.37707f, 1.62095f, -0.29627f, + -0.31778f, -0.45789f, -0.14906f, 0.25315f, -0.10817f, -0.32610f, -0.40890f, + 0.33984f, +}; + +static const float av1_rect_partition_nn_bias_32_layer0[NUM_NODES] = { + -0.17482f, 0.39042f, 0.00000f, 1.69677f, 0.08792f, -0.09301f, 0.13809f, + 4.84061f, 0.00000f, 0.40515f, 0.46246f, 0.20644f, -5.77478f, -1.54510f, + 0.05660f, -0.32013f, 
0.23649f, 0.03778f, -2.53710f, -0.27869f, 0.45623f, + -0.04155f, -0.18445f, -0.73405f, -0.50243f, 2.23191f, 1.93272f, -1.07032f, + -0.27602f, -1.98063f, 0.20816f, -0.01315f, +}; + +static const float av1_rect_partition_nn_weights_32_layer1[NUM_NODES * + LABEL_SIZE] = { + 0.02827f, 1.02560f, -0.07137f, -0.31911f, 0.11365f, 0.13684f, -0.07816f, + -5.23036f, -0.34340f, 0.84526f, -1.51845f, 0.07017f, -8.12570f, 6.24061f, + 0.35739f, -0.09937f, -0.30978f, 0.22032f, 0.74968f, -0.34557f, 0.45547f, + -0.16512f, 0.07118f, 1.66415f, 0.41320f, -1.81533f, -1.96004f, 1.04666f, + 0.84049f, 4.31009f, 0.68850f, 0.26322f, -0.24634f, -1.25889f, 0.31952f, + 0.63632f, 0.05801f, -0.10664f, -0.21992f, 2.44386f, 0.19526f, -0.09838f, + 1.53049f, -0.26630f, 3.54126f, -3.40574f, 0.72730f, 0.04557f, 0.92652f, + 0.15522f, 2.35895f, -0.13347f, 0.56907f, 0.15352f, 0.01823f, -0.73939f, + 0.43104f, 1.90321f, 0.31267f, -0.51972f, 0.50094f, -3.98372f, -3.41518f, + -0.48183f, 0.26661f, 0.64146f, 0.14500f, -0.01695f, 0.16653f, -0.37846f, + 0.08412f, 2.69714f, -0.20258f, -0.75786f, 0.11201f, 0.61878f, 4.22231f, + -3.55330f, -1.14137f, -0.37722f, -0.28000f, -0.72581f, -2.62827f, -0.19448f, + -0.59398f, -0.30136f, -0.17725f, -0.69630f, -0.41132f, 0.12208f, 2.11441f, + -1.08794f, -1.41694f, 0.02620f, 2.18792f, 0.04271f, +}; + +static const float av1_rect_partition_nn_bias_32_layer1[3] = { + 2.47332f, + -1.65756f, + -0.81573f, +}; + +static const NN_CONFIG av1_rect_partition_nnconfig_32 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + NUM_NODES, + }, // num_hidden_nodes + { av1_rect_partition_nn_weights_32_layer0, + av1_rect_partition_nn_weights_32_layer1 }, + { av1_rect_partition_nn_bias_32_layer0, av1_rect_partition_nn_bias_32_layer1 } +}; + +static const float av1_rect_partition_nn_weights_64_layer0[FEATURE_SIZE * + NUM_NODES] = { + 0.08972f, 4.09095f, -0.31398f, -2.43631f, -0.74767f, 1.42471f, 1.60926f, + 1.44721f, 1.88259f, 2.35375f, 1.88299f, 2.01109f, 0.98679f, 2.24131f, + 0.06279f, -0.08315f, 0.32107f, 0.91334f, -0.36569f, 5.55049f, 5.44943f, + 5.20471f, 5.39099f, -0.01943f, -0.00284f, 0.02203f, -0.01309f, 1.41917f, + 6.68460f, -6.15986f, 6.41341f, -3.20630f, -0.00567f, -0.00038f, 0.05960f, + 0.04308f, 0.95366f, 3.48535f, 2.98266f, 4.11784f, 3.44255f, 0.61630f, + 0.71405f, 0.63945f, -0.00713f, 0.39193f, 1.91621f, 3.32755f, 0.71674f, + -0.11647f, 2.07090f, 2.64191f, 0.07949f, -0.05023f, 0.99935f, 0.83145f, + 0.75898f, -0.98764f, -0.58731f, 1.21734f, -0.08076f, -3.26780f, 1.66278f, + 0.04189f, -0.33177f, -1.58648f, 1.00883f, -0.56132f, -2.34877f, 0.67056f, + -2.32297f, -0.91641f, -1.02909f, 4.19781f, 3.87484f, 4.32778f, -1.97171f, + -0.24734f, 0.00822f, 0.05892f, 0.12697f, -3.62915f, -2.93127f, 7.94856f, + -3.29311f, 3.26001f, -0.02231f, 0.02741f, 0.05919f, 0.08190f, -1.49344f, + -0.64475f, -0.24627f, 4.03324f, -1.14799f, -0.18465f, -0.17829f, 0.10394f, + 0.08580f, -5.74721f, 4.42467f, 3.63964f, 3.00258f, -1.22744f, -0.29408f, + 0.00767f, 0.12305f, 0.05249f, -0.17166f, -0.20120f, -0.32941f, -0.31901f, + 0.04628f, -0.35249f, -0.18272f, 0.03956f, -0.19329f, -0.33564f, 0.09856f, + -0.00173f, -0.31751f, -0.05702f, -0.20558f, -0.31464f, -0.02488f, -0.00729f, + -0.35854f, -0.14762f, -0.34897f, -0.12746f, 0.04011f, -0.24918f, -0.53516f, + -0.28440f, -0.36789f, -1.34889f, -9.10044f, -9.19238f, 4.48042f, 6.54429f, + -0.00226f, 0.00430f, 0.00321f, 0.00442f, 0.87551f, -0.16224f, -0.22832f, + -0.60640f, -0.28738f, 0.18062f, 0.22008f, -0.47406f, 0.80302f, 0.12149f, + 1.49530f, 1.05069f, 
-2.02985f, -0.92833f, 0.25616f, 0.12852f, 3.51840f, + 0.25226f, -2.63283f, -4.04386f, 8.46300f, -2.93408f, 0.44069f, 0.08276f, + 0.34482f, -0.22615f, 0.28666f, 3.02962f, -1.20055f, -1.04832f, -0.97632f, + -0.99530f, 1.44196f, 1.68550f, 0.49360f, 1.08155f, -0.26059f, -0.02876f, + -0.27492f, -0.06205f, -0.09496f, -0.12314f, -0.30228f, -0.07453f, -0.38857f, + 1.17443f, 2.41497f, 1.90537f, 2.37716f, 2.91495f, -0.44455f, -0.51176f, + 0.48195f, 0.53032f, 0.23696f, -1.06211f, 1.47459f, -0.89029f, 0.29521f, + 0.66291f, -0.42653f, 1.82308f, -1.30372f, -0.36192f, -3.40388f, -1.61476f, + -2.29745f, -0.66886f, -2.08252f, -0.54552f, -4.06849f, 0.02948f, 0.27297f, + -4.81472f, 4.60404f, -0.11053f, 0.14765f, 0.02826f, -0.14688f, -0.07066f, + -0.01224f, 1.20377f, 7.02725f, -6.02627f, 6.87255f, -3.14257f, 0.01074f, + 0.02397f, -0.02359f, 0.01901f, 0.14956f, -1.67671f, 2.26714f, 2.57043f, + -0.45888f, -1.60265f, -2.11475f, -2.74029f, -2.74658f, -0.35630f, -2.63013f, + -2.14814f, -0.67266f, -1.56850f, 0.57137f, -1.14428f, -0.34265f, -0.12521f, + 0.01220f, -0.74906f, -0.19270f, 0.68110f, -0.24737f, -0.70568f, -1.64826f, + -0.35847f, -0.15984f, -1.17932f, -8.72306f, -8.72834f, 3.93701f, 6.17812f, + -0.03191f, -0.00104f, 0.01402f, -0.00046f, -0.94517f, 1.51266f, -0.56318f, + 0.72260f, -0.09253f, -0.09069f, -2.16695f, -0.23653f, 0.24418f, 2.21148f, + -1.47954f, -1.01439f, 0.31536f, 0.77238f, -0.85083f, -0.15758f, -0.50886f, + 0.09101f, +}; + +static const float av1_rect_partition_nn_bias_64_layer0[NUM_NODES] = { + 0.91706f, -1.31328f, -5.16196f, 1.13191f, -0.98044f, -1.61122f, 1.03039f, + -0.98537f, -4.45568f, -4.34802f, -0.92116f, 0.66836f, -0.10752f, -0.13065f, + -0.35567f, -0.35693f, 1.74941f, 1.17379f, -3.45555f, 5.66321f, -0.24917f, + -1.11940f, -0.73656f, -0.19299f, -0.04181f, 1.11010f, -2.97859f, -0.16774f, + 0.59835f, -0.31269f, -0.30585f, -1.66212f, +}; + +static const float av1_rect_partition_nn_weights_64_layer1[NUM_NODES * + LABEL_SIZE] = { + 0.58963f, 4.20320f, -8.62465f, -6.54014f, 5.41108f, 2.33581f, -0.10354f, + -1.17753f, -3.45909f, -2.24722f, 2.20881f, 3.21971f, -0.09087f, -0.21624f, + 0.16529f, -8.40985f, -1.60205f, -1.41538f, 4.41826f, -4.63069f, -0.27742f, + 4.08710f, 0.26439f, -1.46028f, 0.51234f, 6.25212f, -3.35650f, -1.21348f, + 1.37201f, 8.89151f, 0.28859f, -0.97328f, -0.36196f, -2.71701f, 4.54196f, + -0.62476f, -2.43814f, -1.34209f, 0.12850f, 1.73859f, 3.09809f, -4.42434f, + -1.82552f, -3.66420f, -0.31535f, 0.00968f, -0.02019f, 9.66824f, 0.58835f, + 1.50425f, 2.84487f, 2.55522f, 0.01409f, -2.27594f, -0.31800f, 0.91076f, + -0.66808f, 0.33120f, -0.12460f, 0.64457f, -0.36416f, -10.30843f, 1.51013f, + 2.06861f, -0.20989f, -0.87119f, 3.68642f, 7.33662f, -2.88037f, -0.52414f, + -0.35036f, -0.45947f, -0.07406f, 6.46346f, -0.16031f, 0.27071f, 0.38845f, + -0.21940f, 0.08583f, -1.39526f, 0.50554f, 0.45279f, -6.61856f, 1.84069f, + -0.19149f, -1.77235f, 0.75136f, 1.11797f, 0.32677f, -7.10427f, 3.82908f, + 1.04238f, -0.91435f, 1.93317f, -1.84946f, -0.48909f, +}; + +static const float av1_rect_partition_nn_bias_64_layer1[3] = { + 0.32215f, + -0.57522f, + 0.25314f, +}; + +static const NN_CONFIG av1_rect_partition_nnconfig_64 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + NUM_NODES, + }, // num_hidden_nodes + { av1_rect_partition_nn_weights_64_layer0, + av1_rect_partition_nn_weights_64_layer1 }, + { av1_rect_partition_nn_bias_64_layer0, av1_rect_partition_nn_bias_64_layer1 } +}; + +static const float av1_rect_partition_nn_weights_128_layer0[FEATURE_SIZE * 
+ NUM_NODES] = { + -0.70901f, -3.03481f, 3.30604f, -1.28803f, -0.08610f, -0.33320f, -0.30716f, + 0.25100f, 0.14323f, -0.98422f, -0.89084f, -0.24508f, -1.10785f, -0.82524f, + 0.11766f, -0.42777f, 1.08965f, 4.35125f, -1.19388f, 4.22042f, 4.96306f, + 6.32406f, 3.29899f, -0.90768f, 0.05203f, 0.38467f, 1.74257f, -0.19918f, + -0.11335f, 0.00140f, -0.42303f, -0.04419f, 0.03583f, -0.05441f, -0.19586f, + 0.01484f, -1.19964f, 0.25497f, 3.04502f, 0.05446f, -0.23253f, 0.00266f, + 0.07117f, -2.78986f, -4.62953f, 1.45331f, 0.43923f, 0.92298f, -0.47736f, + 1.49165f, 0.45942f, -1.99787f, 3.33510f, 0.17234f, 0.04024f, -1.42780f, + 0.23566f, -0.90970f, 1.18041f, -1.45865f, 2.30878f, -1.28507f, 1.87290f, + 1.91186f, 4.74826f, -3.70735f, 4.49808f, -4.72275f, -0.02696f, -0.02642f, + -0.06093f, -0.01121f, -0.70683f, 2.69737f, -1.88563f, 2.48637f, 1.10922f, + 0.74624f, 0.40308f, 2.06396f, 1.39289f, 0.00909f, -2.05271f, -1.53539f, + -1.38323f, 0.83303f, -0.32250f, 0.51172f, 3.91249f, 1.66373f, 1.13184f, + -2.22874f, -1.13448f, -0.11185f, 0.19387f, 0.36770f, -0.58933f, 0.22789f, + 1.17307f, 0.77461f, 0.20817f, 0.33417f, 0.54037f, 0.32961f, -0.18456f, + -9.78171f, -0.17216f, -3.44703f, -2.42158f, 0.51946f, 4.35949f, -0.73335f, + -1.61515f, -0.29622f, -0.37617f, -0.42316f, 0.74922f, 1.44386f, 3.92704f, + -3.76274f, 4.19775f, -3.86958f, 0.00074f, -0.02418f, -0.12944f, 0.05857f, + -0.85507f, 5.42546f, 5.40338f, 5.54347f, 5.59791f, -0.01611f, 0.01618f, + -0.01654f, -0.00270f, -0.39608f, -0.40410f, -0.24551f, 0.09124f, -0.34413f, + -0.11504f, 0.12793f, -0.31523f, 0.09148f, -0.08567f, -0.05140f, -0.13310f, + -0.81200f, 0.06882f, -0.52537f, -12.74048f, -0.45395f, -4.04775f, -1.84887f, + -1.02573f, 0.32788f, 1.06828f, -1.25503f, -0.42693f, 2.01413f, -2.29103f, + 0.62271f, 1.11764f, -1.83113f, -1.32325f, -1.65651f, -2.87826f, 1.46910f, + 0.60885f, 0.16079f, 0.00171f, -0.25658f, -0.25465f, -0.14149f, 0.19497f, + -0.07866f, -0.37080f, -0.05778f, -0.08870f, -0.20491f, 0.84521f, -0.18214f, + -1.38441f, -1.08932f, -1.76627f, 0.73172f, 0.05967f, 1.28057f, 3.42722f, + 1.69287f, 0.77169f, 0.44528f, 1.85513f, 0.07840f, 1.31252f, 2.89948f, + 1.49489f, 0.15281f, 0.54708f, -1.14185f, -2.51063f, 0.36618f, -0.55322f, + 0.96671f, 1.59470f, 1.38252f, 1.99697f, 0.03266f, -0.23200f, -0.01127f, + -0.18918f, -0.37598f, -0.03119f, -0.36039f, -0.21192f, -0.11565f, -4.22635f, + 1.41252f, 0.56608f, -0.08867f, 3.11924f, -0.54597f, -0.12504f, -0.05289f, + -0.28665f, -0.58297f, -1.18362f, -0.76201f, -1.22011f, -0.58756f, 0.14740f, + 1.43971f, 0.98381f, -0.02998f, -0.40678f, -0.23047f, -0.12979f, 0.04003f, + -0.22081f, -0.09294f, -0.15955f, -0.10379f, -0.10192f, -1.51316f, 2.39482f, + -1.69975f, 3.58976f, -0.91032f, -0.03498f, 0.48982f, -0.13418f, 0.76256f, + 1.61003f, -2.01676f, -1.24430f, -3.25763f, 1.12314f, 2.00740f, 0.04613f, + -0.14746f, -0.57374f, 3.44511f, -0.56767f, -4.08432f, -2.04894f, 2.35951f, + -0.00458f, 0.18512f, 0.09916f, -0.04084f, -1.56207f, 1.38034f, 4.17302f, + -1.47326f, -2.03530f, -0.00210f, 0.27469f, -0.17423f, 0.86860f, 2.76195f, + 2.43269f, -3.57331f, 2.08715f, -1.44171f, -0.17389f, 2.26157f, -0.07852f, + 2.02519f, +}; + +static const float av1_rect_partition_nn_bias_128_layer0[NUM_NODES] = { + 2.53427f, 1.66678f, -0.84914f, -0.15070f, -1.74769f, 0.45218f, -0.26067f, + 2.05916f, 0.08978f, 5.30984f, 2.66243f, -1.62740f, 0.70018f, 1.96403f, + -4.97152f, -0.05425f, -3.84474f, -1.28006f, 3.47490f, -0.08373f, 0.00225f, + -1.40692f, -0.27569f, -0.30253f, 0.77377f, -0.67636f, -0.26379f, 1.82348f, + 0.66120f, 0.61119f, -1.42293f, 
0.32676f,
+};
+
+static const float av1_rect_partition_nn_weights_128_layer1[NUM_NODES *
+ LABEL_SIZE] = {
+ 1.53453f, -0.23707f, 7.88368f, 0.33340f, 0.97523f, 1.38538f, -0.16746f,
+ 4.42070f, 3.18678f, -5.03545f, -2.27029f, -3.75719f, -0.26850f, -4.93432f,
+ -8.75673f, 0.27398f, -5.77882f, -0.91616f, -2.62725f, -0.23961f, 0.31249f,
+ 3.32134f, 0.25375f, -0.00394f, 2.30213f, -0.14183f, 0.14544f, -1.42830f,
+ 1.31101f, 3.99389f, -0.00017f, -2.90184f, -2.11444f, 2.16734f, -3.05133f,
+ 0.39206f, 4.61489f, -2.88181f, -0.47745f, 2.86649f, -1.20621f, 3.70550f,
+ 1.58029f, -4.58731f, -2.29350f, -0.76930f, 5.19135f, -0.22521f, -5.08782f,
+ 2.17316f, 1.30563f, 0.16777f, -2.17767f, -2.09904f, 1.37001f, 0.25091f,
+ -1.76743f, 1.57940f, 0.30544f, -2.39895f, -0.08532f, -1.77122f, 1.84010f,
+ -0.88449f, 0.79299f, -1.35368f, -4.54110f, 0.02244f, -5.11580f, 1.60883f,
+ 0.29352f, -6.47042f, -1.81426f, 1.24013f, 0.90980f, 7.93977f, 2.12555f,
+ 5.24720f, 4.19508f, 0.21499f, 11.06045f, -0.74752f, 0.89396f, 0.26422f,
+ 1.72332f, -1.25113f, -1.71136f, 0.13676f, -0.07867f, -0.96929f, 0.19911f,
+ 3.58233f, -0.76470f, -2.24162f, -2.87465f, 3.18736f,
+};
+
+static const float av1_rect_partition_nn_bias_128_layer1[3] = {
+ 1.09014f,
+ -0.53317f,
+ -0.55668f,
+};
+
+static const NN_CONFIG av1_rect_partition_nnconfig_128 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NUM_NODES,
+ }, // num_hidden_nodes
+ { av1_rect_partition_nn_weights_128_layer0,
+ av1_rect_partition_nn_weights_128_layer1 },
+ { av1_rect_partition_nn_bias_128_layer0,
+ av1_rect_partition_nn_bias_128_layer1 }
+};
+#undef FEATURE_SIZE
+#undef NUM_NODES
+#undef LABEL_SIZE
+
+// Below are the models used for simple_motion_search_based_split
+// Thresholds
+// The first index level is for aggressiveness, the second is frame
+// resolution, and the third is bsize
+static const float av1_simple_motion_search_split_thresh[4][3][5] = {
+ // Aggressiveness = 0
+ {
+ // lowres
+ {
+ 1.40402595879f, // p = 0.8028197
+ 4.72845183649f, // p = 0.99123732
+ 1.86517797783f, // p = 0.86589934
+ 1.58715223005f, // p = 0.83021506
+ 7.22695596987f, // p = 0.9992738
+ },
+ // midres
+ {
+ 5.839480f, // p = 0.997098
+ 1.877167f, // p = 0.867285
+ 3.073499f, // p = 0.955783
+ 1.405601f, // p = 0.803071
+ 2.555636f, // p = 0.927951
+ },
+ // hdres
+ {
+ 5.839480f, // p = 0.997098
+ 1.877167f, // p = 0.867285
+ 3.073499f, // p = 0.955783
+ 1.405601f, // p = 0.803071
+ 2.555636f, // p = 0.927951
+ },
+ },
+ // Aggressiveness = 1
+ {
+ // Lowres
+ {
+ 100.0000f, // p = 1.000000
+ 4.952535f, // p = 0.992984
+ 1.720880f, // p = 0.848242
+ 1.426233f, // p = 0.806314
+ 1.491905f, // p = 0.816364
+ },
+ // Midres
+ {
+ 100.0000f, // p = 1.000000
+ 3.137263f, // p = 0.958404
+ 2.703262f, // p = 0.937219
+ 1.877166f, // p = 0.867285
+ 2.221149f, // p = 0.902133
+ },
+ // Hdres
+ {
+ 4.417680f, // p = 0.988082
+ 3.086898f, // p = 0.956349
+ 3.966704f, // p = 0.981416
+ 1.532565f, // p = 0.822381
+ 3.449975f, // p = 0.969230
+ },
+ },
+ // Aggressiveness = 2
+ {
+ // lowres
+ {
+ 100.000000f, // p = 0.998048
+ 1.484020f, // p = 0.815179
+ 1.866781f, // p = 0.866085
+ 1.706711f, // p = 0.846409
+ 2.080369f, // p = 0.888980
+ },
+ // midres
+ {
+ 100.000000f, // p = 0.0
+ 3.265763f, // p = 0.963235428881
+ 2.024598f, // p = 0.883355591569
+ 1.846446f, // p = 0.863709256976
+ 2.240962f, // p = 0.903868036126
+ },
+ // hdres
+ {
+ 3.133026f, // p = 0.958234684141
+ 2.940954f, // p = 0.949834204693
+ 2.484544f, // p = 0.923051170045
+ 1.702972f, // p = 0.845922460525
+ 1.655562f, // p = 0.839641385729
+ },
+ },
+ // Aggressiveness = 3
+ {
+ // lowres
+ { 100.000000f, 1.41409519484f, 0.606066095487f, 0.0993410805635f,
+ 0.762099214988f },
+ // midres
+ { 100.000000f, 0.702207995397f, 0.503550081119f, 0.0403228785199f,
+ 0.557298794638f },
+ // hdres
+ { 1.21895384144f, 1.26798450469f, 0.872537808115f, 0.975869438148f,
+ 1.86572095242f },
+ },
+};
+
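The "p = ..." annotations in the threshold tables above and below are
consistent with reading each stored value as a logit: applying the logistic
function to a threshold reproduces its annotated probability, and the
saturating +/-100.0f entries drive p to effectively 1.0 or 0.0. A minimal
standalone sketch of that correspondence (the helper is illustrative only,
not part of the aom sources):

    #include <math.h>
    #include <stdio.h>

    /* Map a stored threshold (a logit) to its annotated probability. */
    static double logit_to_prob(double logit) {
      return 1.0 / (1.0 + exp(-logit)); /* standard sigmoid */
    }

    int main(void) {
      /* First two lowres entries of av1_simple_motion_search_split_thresh. */
      printf("%f\n", logit_to_prob(1.40402595879)); /* ~0.802822 */
      printf("%f\n", logit_to_prob(4.72845183649)); /* ~0.991237 */
      return 0;
    }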
p = 0.845922460525 + 1.655562f, // p = 0.839641385729 + }, + }, + // Aggressiveness = 3 + { + // lowres + { 100.000000f, 1.41409519484f, 0.606066095487f, 0.0993410805635f, + 0.762099214988f }, + // midres + { 100.000000f, 0.702207995397f, 0.503550081119f, 0.0403228785199f, + 0.557298794638f }, + // hdres + { 1.21895384144f, 1.26798450469f, 0.872537808115f, 0.975869438148f, + 1.86572095242f }, + }, +}; + +static const float av1_simple_motion_search_no_split_thresh[4][3][5] = { + // Aggressiveness = 0 + { + // lowres + { + -100.0f, // p = 0.0 + -100.0f, // p = 0.0 + -100.0f, // p = 0.0 + -100.0f, // p = 0.0 + -100.0f, // p = 0.0 + }, + // midres + { + -3.38168078f, // p = 0.032872917 + -4.08610739f, // p = 0.016526795 + -1.78302370f, // p = 0.15270848 + -100.000000f, // p = 0.0 + -100.000000f, // p = 0.0 + }, + // hdres + { + -100.000000f, // p = 0.0 + -100.000000f, // p = 0.0 + -2.98718897f, // p = 0.048008 + -100.000000f, // p = 0.0 + -3.33229488f, // p = 0.03447975 + }, + }, + // Aggressiveness = 1 + { + // Lowres + { + -100.0000f, // p = 0.0 + -4.893793f, // p = 0.007437 + -3.387766f, // p = 0.032680 + -2.982806f, // p = 0.048209 + -2.330372f, // p = 0.088639 + }, + // Midres + { + -100.0000f, // p = 0.000000 + -6.131853f, // p = 0.002168 + -2.346579f, // p = 0.087338 + -2.712849f, // p = 0.062219 + -3.195430f, // p = 0.039338 + }, + // Hdres + { + -3.491416f, // p = 0.029557 + -2.192853f, // p = 0.100394 + -3.620180f, // p = 0.026079 + -2.030855f, // p = 0.116001 + -2.797586f, // p = 0.057455 + }, + }, + // Aggressiveness = 2 + { + // lowres + { + -100.0000f, // p = 0.0 + -3.617350f, // p = 0.026151 + -5.902503f, // p = 0.002725 + -4.677840f, // p = 0.009213 + -2.168378f, // p = 0.102626 + }, + // midres + { + -100.0000f, // p = 0.0 + -3.204195f, // p = 0.0390081679555 + -2.354128f, // p = 0.0867382128969 + -2.523326f, // p = 0.0742390077132 + -3.112328f, // p = 0.0426016085803 + }, + // hdres + { + -5.047760f, // p = 0.00638270448225 + -3.414994f, // p = 0.0318301469487 + -5.628090f, // p = 0.00358255438917 + -2.122691f, // p = 0.10691083145 + -1.972387f, // p = 0.122132728355 + }, + }, + // Aggressiveness = 3 + { + // lowres + { -100.000000f, -2.04766486133f, -1.00442099188f, -1.15077982642f, + -1.0830321897f }, + // midres + { -100.000000f, -0.985686808303f, -0.757739584866f, -0.890120107569f, + -0.228236297886f }, + // hdres + { -1.03535679263f, -1.57431743203f, -0.564851540156f, -0.35442301663f, + -1.36741555171f }, + }, +}; + +static const float av1_simple_motion_search_split_mean_128[17] = { + 14.119120f, 14.087010f, 12.016185f, 11.966075f, 12.042454f, 11.994805f, + 12.152105f, 12.100394f, 12.178377f, 12.128937f, 4.779944f, 0.714786f, + 3.535450f, 3.566207f, 0.835913f, 3.315452f, 3.302908f, +}; + +static const float av1_simple_motion_search_split_std_128[17] = { + 1.832420f, 1.835338f, 2.019207f, 2.020793f, 2.008731f, 2.008403f, + 1.900999f, 1.907081f, 1.908915f, 1.913122f, 2.109345f, 0.451517f, + 1.407097f, 1.372501f, 0.370355f, 1.321495f, 1.319665f, +}; + +static const float av1_simple_motion_search_split_mean_64[17] = { + 12.363721f, 12.314348f, 10.404341f, 10.333541f, 10.405775f, 10.336996f, + 10.402246f, 10.330084f, 10.405584f, 10.334330f, 4.554232f, 0.896393f, + 2.819613f, 2.855845f, 0.926296f, 2.808782f, 2.798229f, +}; + +static const float av1_simple_motion_search_split_std_64[17] = { + 1.878920f, 1.882255f, 1.950167f, 1.953289f, 1.913869f, 1.914781f, + 1.920096f, 1.924454f, 1.880200f, 1.882499f, 2.050922f, 0.304750f, + 1.144391f, 1.125088f, 0.261289f, 1.145059f, 
1.131215f, +}; + +static const float av1_simple_motion_search_split_mean_32[17] = { + 10.750278f, 10.679627f, 8.745625f, 8.644149f, 8.757436f, 8.656657f, + 8.759780f, 8.656299f, 8.772563f, 8.669839f, 4.208026f, 0.958573f, + 2.308769f, 2.347375f, 0.961685f, 2.323464f, 2.296322f, +}; + +static const float av1_simple_motion_search_split_std_32[17] = { + 1.879269f, 1.883531f, 1.935828f, 1.935677f, 1.915823f, 1.914773f, + 1.909733f, 1.910315f, 1.890451f, 1.890032f, 1.913318f, 0.199276f, + 0.988825f, 0.972115f, 0.191956f, 0.977131f, 0.951418f, +}; + +static const float av1_simple_motion_search_split_mean_16[17] = { + 9.076768f, 8.974986f, 7.078364f, 6.926072f, 7.088739f, 6.936111f, + 7.096697f, 6.942841f, 7.114978f, 6.961046f, 3.865480f, 0.982632f, + 1.886023f, 1.912892f, 0.981492f, 1.926059f, 1.891233f, +}; + +static const float av1_simple_motion_search_split_std_16[17] = { + 1.922965f, 1.925609f, 1.851980f, 1.847558f, 1.848410f, 1.843990f, + 1.843931f, 1.839582f, 1.840304f, 1.836144f, 1.760042f, 0.130639f, + 0.841086f, 0.833523f, 0.134780f, 0.840790f, 0.831309f, +}; + +static const float av1_simple_motion_search_split_mean_8[17] = { + 7.120238f, 6.957731f, 5.176309f, 4.889594f, 5.178396f, 4.886607f, + 5.195322f, 4.905566f, 5.198845f, 4.904745f, 3.648933f, 0.993198f, + 1.496831f, 1.520804f, 0.991864f, 1.489763f, 1.460761f, +}; + +static const float av1_simple_motion_search_split_std_8[17] = { + 1.698498f, 1.696000f, 1.629605f, 1.614641f, 1.632476f, 1.618831f, + 1.618352f, 1.603742f, 1.623089f, 1.609674f, 1.668587f, 0.082193f, + 0.759407f, 0.759684f, 0.089830f, 0.742797f, 0.730632f, +}; + +static const float *const av1_simple_motion_search_split_mean[5] = { + av1_simple_motion_search_split_mean_128, + av1_simple_motion_search_split_mean_64, + av1_simple_motion_search_split_mean_32, + av1_simple_motion_search_split_mean_16, + av1_simple_motion_search_split_mean_8, +}; + +static const float *const av1_simple_motion_search_split_std[5] = { + av1_simple_motion_search_split_std_128, av1_simple_motion_search_split_std_64, + av1_simple_motion_search_split_std_32, av1_simple_motion_search_split_std_16, + av1_simple_motion_search_split_std_8, +}; + +#define NUM_HIDDEN_LAYERS_128 1 +#define NUM_FEATURES_128 17 +#define NUM_LAYER_0_UNITS_128 20 +#define NUM_LOGITS_128 1 + +static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_128[] = { + 0.24095f, -0.397761f, -0.388619f, -0.0629548f, -0.44577f, 0.688212f, + -0.20889f, -1.08227f, -0.0313894f, -0.615505f, -0.401839f, 0.40233f, + -0.171305f, 0.439803f, 1.58527f, -0.968535f, -1.29255f, 1.14846f, + 0.885777f, 0.116412f, -0.225704f, 0.316506f, 0.793951f, -0.63591f, + 0.097789f, -0.327027f, -0.778396f, -0.231667f, -0.9622f, 1.0044f, + 0.32594f, 0.179768f, -0.115529f, -0.499395f, -1.14727f, -1.26111f, + 0.269818f, -0.0882028f, -0.349107f, 0.100901f, 0.0249506f, 0.528929f, + 0.113961f, 0.929794f, 0.242494f, -0.122828f, -0.0477379f, 0.170659f, + 0.0500187f, 0.28859f, 0.78783f, 0.482412f, 0.795298f, 0.179517f, + 0.453911f, -0.298029f, -0.903332f, 0.510615f, 0.691994f, 0.433383f, + -0.140802f, -1.11635f, -0.547326f, 1.11318f, 0.71905f, 0.978538f, + 0.097444f, -0.0386012f, 0.713599f, 0.465164f, 0.391278f, -0.472864f, + 0.230224f, -0.279508f, 0.558192f, -0.468625f, 0.55995f, -0.57507f, + -1.39947f, -0.755819f, -1.04512f, -0.411552f, -0.830444f, -0.106571f, + -0.0972184f, 0.251842f, 0.269955f, 0.230492f, -0.290581f, -0.484799f, + 0.0151041f, 0.171047f, 0.829999f, -0.384581f, 0.220301f, -0.121687f, + 1.88848f, -0.482809f, -0.48185f, 1.34482f, -0.716438f, 
-0.284482f, + -1.78592f, -1.29333f, 0.886867f, 0.80106f, 0.456415f, 0.649095f, + 0.231093f, 0.361562f, 0.290018f, 0.128009f, -0.196343f, 0.0607802f, + 0.576761f, -0.0413836f, 0.0300984f, -0.318998f, 0.204434f, -0.712524f, + 0.833394f, -0.81168f, 0.765488f, -0.720973f, 1.12866f, -0.838694f, + 1.295f, -0.159127f, 1.05404f, 0.736519f, 0.248662f, 0.229233f, + 0.0434302f, 0.0551856f, 0.197862f, 0.354823f, -0.32429f, -0.227353f, + -0.132198f, -0.438118f, -0.210401f, -0.81046f, 0.653555f, 0.826737f, + 0.154235f, 0.228945f, 0.123089f, 0.614964f, -0.0940471f, -0.00676807f, + 0.24996f, 0.949233f, 0.746526f, -0.044474f, 0.386414f, 0.503221f, + 0.155133f, -0.698848f, -0.735356f, -0.255091f, 0.413235f, -0.335295f, + -0.145757f, 0.326299f, -0.602629f, -0.844474f, -0.346722f, -0.42598f, + -0.491016f, -0.447732f, -0.965366f, -0.0242841f, 0.836606f, -0.104877f, + 1.23236f, 0.683986f, 0.787005f, -0.0253437f, 1.2145f, 1.29554f, + -1.24302f, -0.229495f, 0.439415f, 0.885087f, -0.408704f, -0.119299f, + -0.0960972f, 0.60148f, 0.683271f, -0.057129f, -0.180295f, -0.264815f, + -0.363184f, 0.638271f, 0.631083f, -0.252899f, -0.164364f, -1.31274f, + 0.354408f, 0.0429172f, 0.371154f, -1.0978f, 0.0433642f, -0.467394f, + -0.706572f, 1.57198f, -0.0701271f, 1.93149f, -0.446267f, 1.4519f, + -1.29567f, 0.309978f, -0.878062f, 0.891494f, 0.364005f, -0.209611f, + -0.125927f, 0.184097f, 0.0629695f, -0.43375f, -0.0980562f, 1.08547f, + 0.578312f, 0.16566f, -0.198852f, -0.241854f, -0.523934f, -0.206037f, + -0.867721f, 1.00041f, 1.09848f, -2.12562f, -0.19992f, -0.186128f, + -0.03507f, 0.0484884f, 0.160856f, 0.10802f, -0.805141f, -1.06902f, + 0.290363f, 0.0222096f, -0.849266f, 0.112932f, 0.148682f, -0.0457585f, + 1.139f, 1.79141f, 0.194122f, -0.342508f, -0.403572f, 0.133678f, + 0.217553f, -0.263759f, 0.18441f, 0.254529f, 0.0471115f, 0.733178f, + -0.416205f, 0.441447f, -0.443335f, 0.725005f, -0.78946f, 0.71301f, + -0.644969f, 1.5445f, 0.365277f, -0.455775f, -0.365066f, 0.4742f, + -0.381714f, -0.545794f, -0.0464861f, -0.222768f, -0.0106466f, -0.069743f, + 0.0335566f, 0.378348f, -0.249663f, 0.922286f, 0.125711f, -0.894619f, + 0.444682f, 0.447893f, -1.98936f, -1.41978f, 0.0406667f, -0.199928f, + -0.199786f, 0.463481f, 0.334931f, -0.396222f, -0.0732259f, 0.796684f, + -0.140817f, -0.26878f, 0.194642f, 0.895784f, -0.369976f, -2.26981f, + -0.0791776f, -0.0492268f, 0.6715f, 0.281805f, 0.0156664f, -0.779785f, + 0.17743f, 0.188786f, -0.588077f, -0.359153f, 0.258319f, 0.881688f, + 0.846894f, 1.00292f, 0.838134f, 0.680632f, 0.273098f, -0.329261f, + 0.217757f, -0.506726f, -0.336523f, -0.695875f, -0.252006f, 0.751216f, + 0.334409f, -0.0151467f, 0.0885474f, 0.0973114f, -0.248754f, -0.263716f, + 0.369906f, -0.213749f, -0.0355395f, -0.137799f, 2.43233f, -0.944233f, + -0.745167f, 0.318558f, 0.316608f, 0.568678f +}; + +static const float av1_simple_motion_search_split_hiddenlayer_0_bias_128[] = { + 0.821344f, 1.11542f, -1.24172f, 1.03642f, 1.13511f, + 1.16414f, -0.278655f, -1.35558f, -1.26788f, -1.63189f, + -0.323271f, 1.21319f, -0.888415f, 0.987145f, -1.16767f, + 0.255833f, -0.1392f, 1.43265f, -1.54952f, 1.65159f +}; + +static const float av1_simple_motion_search_split_logits_kernel_128[] = { + 0.3565753f, 0.5490161f, -1.015597f, 0.565366f, 0.751604f, + 0.922747f, -1.931846f, 1.759353f, -0.7362949f, 0.5707034f, + -1.092127f, 0.936767f, 2.034499f, 2.08148f, 0.9509507f, + -1.342504f, -0.834566f, 0.618184f, 0.844113f, 1.182693f +}; + +static const float av1_simple_motion_search_split_logits_bias_128[] = { + 1.819351f +}; + +static const NN_CONFIG 
av1_simple_motion_search_split_nn_config_128 = { + NUM_FEATURES_128, + NUM_LOGITS_128, + NUM_HIDDEN_LAYERS_128, + { + NUM_LAYER_0_UNITS_128, + }, + { + av1_simple_motion_search_split_hiddenlayer_0_kernel_128, + av1_simple_motion_search_split_logits_kernel_128, + }, + { + av1_simple_motion_search_split_hiddenlayer_0_bias_128, + av1_simple_motion_search_split_logits_bias_128, + }, +}; + +#undef NUM_HIDDEN_LAYERS_128 +#undef NUM_FEATURES_128 +#undef NUM_LAYER_0_UNITS_128 +#undef NUM_LOGITS_128 + +#define NUM_HIDDEN_LAYERS_64 1 +#define NUM_FEATURES_64 17 +#define NUM_LAYER_0_UNITS_64 24 +#define NUM_LOGITS_64 1 + +static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_64[] = { + -1.40663f, -0.851503f, -0.0613111f, 0.741591f, 0.302754f, + 0.184001f, 0.0474853f, 0.371096f, 0.0541624f, 0.381508f, + 0.355427f, 0.0428822f, 0.154916f, -0.00490099f, 0.025484f, + 0.0208921f, 0.140596f, -0.292525f, -0.459067f, -0.081393f, + 0.109824f, -0.290183f, 0.720236f, 0.385835f, -0.150643f, + -0.078518f, 0.0979819f, -0.102135f, 0.137152f, -0.0786457f, + 0.0171441f, 0.991338f, -0.546583f, -1.0714f, -0.0842851f, + 0.244072f, 0.427379f, 0.146775f, -0.921613f, -0.912093f, + 0.393566f, -0.232375f, 0.19963f, 0.312355f, 0.55659f, + -0.104714f, -0.137563f, 0.0985237f, 0.0788307f, -0.225514f, + 0.0228832f, -0.288733f, -0.00737685f, -0.711657f, -0.256796f, + 0.0869605f, 0.583977f, 0.384306f, 1.46692f, -0.741126f, + -0.21105f, -0.276604f, -0.0151463f, -0.0227997f, -0.0403232f, + 0.044122f, 0.0185784f, -0.0451951f, 0.00489513f, -0.387131f, + 0.0966724f, -0.599174f, -0.00243351f, -0.21439f, 0.302043f, + 0.130334f, -0.191251f, 0.863261f, -1.50112f, 0.00901057f, + 0.000324294f, -0.0572545f, 0.0117685f, -0.0734682f, -0.0570435f, + -0.126253f, 1.2313f, -0.328267f, 0.211788f, -0.175438f, + -0.0419298f, 0.166447f, -0.178739f, -0.326221f, -0.0439188f, + 1.01182f, -0.390678f, -0.426343f, 0.0944665f, -0.225042f, + -0.183344f, 0.0500763f, -0.377393f, -0.673401f, -0.436907f, + -0.00366876f, -0.363412f, 0.195194f, 0.250248f, -0.397193f, + -0.0917222f, -0.0221579f, 1.7693f, -0.0694484f, -0.0410764f, + -0.134571f, -0.159992f, -0.170359f, -0.249333f, -0.128056f, + -0.617054f, -0.808701f, -0.540642f, 0.396391f, 0.147787f, + 0.346916f, 0.709852f, 0.116064f, 0.0509731f, 0.073713f, + -0.365082f, -1.09287f, -0.618214f, 0.20545f, 0.126161f, + -0.140012f, 0.62592f, 0.316326f, -0.392765f, -0.15934f, + 0.337617f, -0.41669f, -0.295225f, 0.0602025f, -0.0150657f, + -0.319629f, 0.783729f, -0.0661199f, -0.362657f, 0.390042f, + -0.043614f, -0.0414596f, 0.121155f, -0.309775f, -0.284761f, + -0.243932f, 0.279855f, -0.266823f, 0.734824f, -0.164028f, + 0.261776f, -0.105585f, 0.10733f, -0.180469f, 1.18875f, + -1.12836f, -0.173008f, 0.150221f, 0.111598f, 0.148306f, + -1.2833f, -1.06346f, 0.233546f, 0.16432f, 0.00142378f, + 0.340574f, -0.0140885f, 0.634761f, -0.122096f, 0.821487f, + 0.421424f, -0.0256687f, -0.035503f, -0.0453547f, -0.0215179f, + -0.0671277f, -0.0486862f, -0.962761f, -0.208383f, 0.109573f, + -0.210668f, -0.176485f, 0.421279f, 0.41605f, 0.342084f, + 0.619364f, 0.103718f, -0.00341643f, 0.00266677f, 0.249089f, + -0.22848f, -0.0368968f, 1.12092f, -0.64912f, -0.456579f, + 0.477823f, 0.418345f, 1.41515f, 0.0936279f, 0.886155f, + -0.785656f, -0.217109f, -0.561829f, -0.286435f, -0.884068f, + -0.148839f, -0.282848f, 0.0683745f, 0.0962815f, -0.111975f, + 0.0509158f, -0.211274f, 0.744909f, -0.8982f, 0.315232f, + -0.78624f, 0.598387f, -0.530952f, 0.677357f, 0.0371339f, + 0.99209f, -0.681899f, -0.291416f, -0.224822f, -0.26049f, + -0.0436525f, 
-0.380004f, -0.27187f, 0.534779f, 0.717939f, + 0.418197f, -0.152539f, -0.0684039f, -0.186308f, -0.0653121f, + 0.194145f, -0.196367f, 0.256997f, -0.726269f, -0.307672f, + -0.153362f, 0.450827f, 0.708842f, -0.0667079f, 0.555564f, + 0.0486892f, 0.0715072f, -0.7211f, -0.849797f, 0.0650271f, + 1.2747f, -0.646738f, -0.53042f, 0.182197f, 0.928203f, + 0.180621f, -0.00640791f, -0.171416f, 0.092688f, -0.391275f, + -0.0650657f, 0.0843773f, 0.170824f, 0.378085f, 0.0596657f, + 0.844398f, -1.3083f, -1.27828f, -0.199179f, 0.557855f, + 0.241479f, 0.385804f, 0.169533f, -0.0028072f, 0.0538041f, + 0.00136234f, 0.0130481f, 0.0349449f, -0.0366494f, -0.000474055f, + 0.437956f, 0.286724f, -0.298187f, 0.461967f, 0.43065f, + -0.0877194f, -0.19133f, 0.379121f, -0.687751f, -1.64077f, + -0.375191f, -0.336836f, -0.323904f, -0.101859f, 0.0126672f, + -0.346332f, 0.112303f, -0.863336f, 0.155538f, 0.366509f, + -0.0976829f, 0.635278f, -0.681967f, -0.527729f, 0.591839f, + 0.366678f, 0.189981f, 0.0208007f, -0.565809f, 0.70183f, + -0.282844f, -0.327485f, 0.347243f, -1.13014f, -0.373378f, + -0.514978f, 0.662994f, -0.144931f, 0.1402f, -0.820049f, + 0.711498f, 0.681156f, 1.06515f, -0.423409f, -0.0392664f, + 0.0675396f, -0.0508602f, 0.0431443f, 0.0212639f, -0.0279887f, + -0.62611f, -0.202064f, 0.701934f, 1.28452f, -0.00858481f, + -0.517249f, 0.0615832f, -0.260215f, 0.0949119f, -0.28423f, + -0.39573f, -0.0574246f, -0.318658f, 0.0601775f, -0.0629386f, + -0.134208f, 0.111686f, -0.23355f, 0.078667f, 0.741023f, + 0.828523f, -0.345067f, -0.315135f, -0.0957154f, 0.522825f, + -0.190057f, -0.473789f, -0.390489f, 0.200677f, -0.0271802f, + 0.110336f, 0.493302f, 0.663126f, 0.570148f, -0.380042f, + -0.437349f, -0.660884f, 0.301908f, 0.0644179f, 0.172494f, + 0.461917f, 0.330938f, -0.140041f, -0.0430205f, -1.51003f, + -0.410984f, -0.182161f, 0.0235313f, -0.364849f, 0.154183f, + -0.592465f, 0.272701f, 0.192389f, -0.0497777f, -0.924467f, + -0.179513f, -0.592217f, 0.436363f, -0.0716164f, 0.189094f, + -0.574697f, -0.304303f, 0.326441f, -0.0865553f, 0.735948f, + 0.266912f, 0.435824f, -0.123322f +}; + +static const float av1_simple_motion_search_split_hiddenlayer_0_bias_64[] = { + -1.19333f, 1.01834f, -1.10844f, 0.0454873f, -1.45506f, 0.580864f, + -0.040979f, -0.505681f, -1.15072f, 0.692697f, -0.520812f, -0.479384f, + 0.529652f, 0.507252f, -1.08619f, 0.0586375f, 0.0929614f, -0.46753f, + -0.701857f, -0.362933f, -0.291983f, -0.133933f, -0.0131351f, -0.267582f +}; + +static const float av1_simple_motion_search_split_logits_kernel_64[] = { + -3.32501f, 0.43082f, -1.060692f, 1.328908f, 0.8892894f, 0.6488833f, + -1.096516f, -0.664786f, -1.301339f, 0.508805f, -2.128406f, -0.757304f, + 0.383839f, 0.694763f, -0.591725f, 0.770385f, 1.021594f, 0.589181f, + -0.76238f, 1.488826f, 0.709135f, -0.575738f, 0.26421759f, -0.2484219f +}; + +static const float av1_simple_motion_search_split_logits_bias_64[] = { + 0.699037f +}; + +static const NN_CONFIG av1_simple_motion_search_split_nn_config_64 = { + NUM_FEATURES_64, + NUM_LOGITS_64, + NUM_HIDDEN_LAYERS_64, + { + NUM_LAYER_0_UNITS_64, + }, + { + av1_simple_motion_search_split_hiddenlayer_0_kernel_64, + av1_simple_motion_search_split_logits_kernel_64, + }, + { + av1_simple_motion_search_split_hiddenlayer_0_bias_64, + av1_simple_motion_search_split_logits_bias_64, + }, +}; + +#undef NUM_HIDDEN_LAYERS_64 +#undef NUM_FEATURES_64 +#undef NUM_LAYER_0_UNITS_64 +#undef NUM_LOGITS_64 + +#define NUM_HIDDEN_LAYERS_32 1 +#define NUM_FEATURES_32 17 +#define NUM_LAYER_0_UNITS_32 20 +#define NUM_LOGITS_32 1 + +static const float 
av1_simple_motion_search_split_hiddenlayer_0_kernel_32[] = { + -0.980626f, -0.946611f, 0.103761f, 0.408899f, 0.498149f, + 0.0490161f, 0.253279f, 0.332029f, 0.00367441f, 0.364401f, + -0.236433f, 0.0592119f, -0.0978848f, 0.159733f, -0.018052f, + -1.10726f, 1.16167f, -0.244982f, -0.147819f, -0.147095f, + 0.111404f, -0.349502f, 0.441178f, 0.0984191f, -0.135537f, + -0.0423312f, 0.0123079f, 0.358012f, -0.266796f, 0.0125811f, + 0.196563f, 0.337093f, -1.07266f, -1.25134f, 0.57337f, + -0.521717f, 0.259824f, 0.537383f, -0.463688f, -0.336128f, + 0.373385f, 0.483443f, -0.229293f, -0.33373f, -0.656021f, + 0.768647f, 0.179279f, 0.315415f, 0.187749f, 1.07839f, + 0.0626629f, -0.230299f, 0.662606f, -0.414154f, 0.459334f, + -0.6312f, 0.427704f, -0.249849f, 0.701056f, -0.707969f, + 0.057401f, 0.620434f, 0.665748f, -0.501356f, -0.230685f, + 0.0722371f, -0.0988625f, -0.114035f, -0.653799f, 0.571353f, + 0.268276f, 1.13251f, -1.0695f, -0.225607f, -0.984355f, + -0.42213f, 0.300422f, 1.21492f, -0.139931f, -0.000726004f, + 0.045964f, -0.0817352f, -0.0278813f, -0.0102341f, -0.0144087f, + -0.475882f, 1.20682f, -0.359919f, 0.277189f, -0.166401f, + 0.599211f, -0.129872f, 0.574211f, -0.247573f, 0.824405f, + -1.53329f, -0.202151f, -0.328698f, -0.516322f, -0.281416f, + -0.383651f, -0.252862f, -0.43185f, 0.456802f, -0.430055f, + -0.55245f, -0.6884f, -0.541456f, -0.281376f, 1.10425f, + -0.140706f, 1.59816f, -0.0343895f, -0.00920039f, -0.0307667f, + 0.0560132f, -0.0340302f, -0.10848f, 0.0593314f, -0.951795f, + 0.876831f, -1.00548f, -0.566244f, 0.430061f, 1.10109f, + -0.634212f, -0.0755369f, -0.108953f, 1.03191f, 0.109036f, + -0.0415309f, 0.0681162f, -0.0611775f, -0.0231938f, 0.0973158f, + -0.0558169f, -0.823484f, -0.918509f, 0.16756f, 0.27087f, + 0.286074f, 0.174069f, 0.1304f, 0.386074f, 0.433953f, + 0.0291467f, -1.74087f, 0.0296094f, -0.00793714f, -0.13041f, + 0.00990992f, -0.0137848f, -0.0742606f, -0.251029f, -0.645316f, + 0.640029f, 0.550607f, 0.470097f, 0.549451f, -0.285723f, + -0.164759f, -0.128166f, -0.391496f, -0.80287f, 0.0769472f, + 1.34391f, 0.0215005f, 0.0669497f, 0.131919f, 0.291674f, + 0.0952889f, -0.677953f, -0.364054f, 0.144823f, 0.246198f, + -0.12393f, 0.363661f, 0.215091f, -0.239658f, 0.18491f, + 0.118703f, 0.0064156f, 1.38619f, -1.3845f, 0.0567323f, + 1.20812f, -0.720374f, -1.92158f, -1.48657f, 0.335601f, + 0.409379f, 0.373618f, 0.231274f, 0.292194f, 0.368619f, + 0.2398f, 0.473579f, 0.83402f, -0.0133751f, -0.00344358f, + 2.20688e-05f, 0.00836757f, 0.00405377f, 0.0110539f, -0.260154f, + 0.192112f, -0.666986f, 0.302875f, -0.113302f, 0.17882f, + -0.221493f, 0.146161f, -0.448697f, 0.584187f, 0.122109f, + 0.989981f, -1.14706f, -0.734042f, 0.0638213f, 0.213357f, + 0.068543f, -0.808558f, 0.404741f, 0.808313f, 1.57523f, + -0.113448f, 0.254102f, -0.350065f, -0.615f, 0.0753549f, + -0.540936f, -0.0250732f, -0.225681f, -0.161384f, 0.0128342f, + -0.0933368f, -0.286904f, 0.130133f, -0.874747f, 0.392585f, + -0.493135f, 0.169708f, 0.0909804f, 1.89921f, -0.469954f, + 0.65165f, -0.953401f, -0.21595f, -0.37479f, 0.0451146f, + 0.0234621f, -0.0596903f, -0.0682308f, -0.0830426f, 0.130011f, + -0.409141f, 0.0627038f, -0.581148f, -0.513922f, 0.631676f, + 0.0637034f, 0.0539081f, 0.0638872f, 0.515863f, -0.0123463f, + 0.177238f, 0.279506f, -0.930345f, 1.23726f, 0.202851f, + 0.708792f, -0.445086f, -0.0267075f, -0.913822f, -0.0714978f, + -0.281107f, -0.0770565f, -0.23086f, -0.165893f, -0.319683f, + 0.216235f, -0.490999f, 2.04841f, -0.0524071f, -0.239043f, + -0.0526375f, 0.023002f, -0.132685f, -0.155354f, -0.186503f, + -0.904296f, 
0.166478f, 0.063268f, -0.302842f, -0.27179f, + -0.428299f, 0.50193f, 0.480717f, -0.864275f, 0.317096f, + 0.40698f, 0.0286107f, 0.189432f, -0.0374374f, 0.0671728f, + 0.203681f, -0.457959f, -0.155776f, 0.340948f, 0.542841f, + 0.342675f, -0.000952399f, 0.470957f, 0.744418f, -1.11763f, + -0.658812f, -0.044832f, 0.0688237f, -0.357766f, 0.428662f, + -0.087152f, -0.291903f, 0.373244f, -0.587853f, 0.415895f, + -0.535694f, 0.621785f, -0.143648f, 0.0451373f, 0.00068827f, + 1.84432f, -1.26239f, -0.432087f, -0.152307f, 0.0293551f, + 0.184744f, -0.0173156f, -0.00572154f, -0.0305062f, -0.0900071f +}; + +static const float av1_simple_motion_search_split_hiddenlayer_0_bias_32[] = { + 0.160011f, 0.903856f, -0.13738f, 0.358221f, -0.0906044f, + -0.606558f, -0.0215651f, -0.03377f, -1.67017f, -0.144554f, + -0.201482f, -0.87719f, 0.639815f, -0.51976f, -0.309922f, + -1.33421f, 0.721328f, -0.889354f, -1.7158f, -0.285963f +}; + +static const float av1_simple_motion_search_split_logits_kernel_32[] = { + -0.2745374f, 0.333548f, -0.2437388f, 0.288009f, 0.55635f, + 0.4560176f, 0.2970518f, 0.391192f, 1.311854f, -0.231219f, + -0.2968651f, -1.819984f, 0.2775824f, 0.28929857f, 0.419126f, + -0.32868411f, -0.916399f, -0.1921077f, -0.617489f, 0.637953f +}; + +static const float av1_simple_motion_search_split_logits_bias_32[] = { + 0.208473f +}; + +static const NN_CONFIG av1_simple_motion_search_split_nn_config_32 = { + NUM_FEATURES_32, + NUM_LOGITS_32, + NUM_HIDDEN_LAYERS_32, + { + NUM_LAYER_0_UNITS_32, + }, + { + av1_simple_motion_search_split_hiddenlayer_0_kernel_32, + av1_simple_motion_search_split_logits_kernel_32, + }, + { + av1_simple_motion_search_split_hiddenlayer_0_bias_32, + av1_simple_motion_search_split_logits_bias_32, + }, +}; + +#undef NUM_HIDDEN_LAYERS_32 +#undef NUM_FEATURES_32 +#undef NUM_LAYER_0_UNITS_32 +#undef NUM_LOGITS_32 + +#define NUM_HIDDEN_LAYERS_16 1 +#define NUM_FEATURES_16 17 +#define NUM_LAYER_0_UNITS_16 20 +#define NUM_LOGITS_16 1 + +static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_16[] = { + 0.0136957f, 0.182135f, -0.583394f, 0.0556956f, 0.211152f, + 0.168234f, -0.694203f, -0.678216f, 0.289943f, 1.00014f, + -0.0427784f, -0.0427538f, -0.0276009f, -0.00133608f, 0.0901944f, + 0.0674892f, 0.104068f, -0.308582f, -0.43596f, 0.855997f, + -0.223414f, 0.0390026f, 0.366492f, 0.216065f, -0.386863f, + -0.148823f, -0.297022f, 0.0529546f, -0.202885f, 1.26471f, + -0.861163f, -0.0949431f, 0.573627f, -0.00277083f, -0.616063f, + -0.626927f, 0.371583f, -0.411743f, 0.173387f, -0.209734f, + 0.293697f, -0.260714f, 0.442728f, -0.594486f, 1.38987f, + 0.208025f, -0.0433776f, 0.01173f, 0.921766f, -0.168379f, + 0.000697326f, 0.209967f, -0.304577f, 0.149551f, -0.196658f, + 0.389251f, -0.449106f, -0.456329f, 0.669073f, -0.163806f, + 0.083348f, -0.0783998f, 0.0678355f, 0.0510435f, 0.103964f, + 0.104537f, -0.778093f, -1.0641f, -0.626102f, -2.02131f, + 0.159591f, 0.254161f, -0.000362642f, 0.289859f, 0.192713f, + 0.139801f, -0.0251327f, 0.164002f, 1.22892f, -0.0852193f, + 0.0769487f, 0.0296408f, -0.0418688f, 0.0936023f, 0.0448523f, + 0.674015f, -0.0732944f, 0.313575f, -0.593432f, 0.642067f, + -1.06063f, 0.468223f, -0.769085f, -0.173798f, -0.175663f, + 0.692808f, 0.00753295f, -0.123327f, -0.0234937f, -0.0923153f, + 0.0216917f, -0.0690157f, -0.397488f, 0.426628f, 0.264475f, + 0.342074f, -0.139817f, 0.215915f, 0.422544f, -0.321102f, + 0.0355587f, 0.460193f, 0.0315326f, 0.080556f, -0.0256533f, + -0.0857874f, -0.488283f, -0.299653f, -0.245987f, 0.104383f, + 0.203731f, 0.328734f, 0.668104f, -0.586909f, 
-0.501335f, + -0.661292f, -0.359811f, 0.00951363f, 0.816315f, -0.0124104f, + 0.0545827f, 0.089863f, 0.0125486f, 0.043609f, -0.0259544f, + 0.0123911f, 0.12557f, -0.539875f, -0.0556721f, 0.16532f, + 0.265834f, -0.384171f, 0.646496f, 0.366147f, -0.111272f, + 0.262096f, -0.0845724f, 0.382724f, 0.165783f, 0.1025f, + 0.392988f, 0.290525f, 0.038659f, 0.540269f, -0.485586f, + -0.273065f, -0.154052f, -0.0896895f, -0.35394f, 0.193214f, + -0.423728f, 0.654576f, -0.373321f, 0.814914f, 0.026278f, + -0.0328304f, -0.220913f, -0.0442121f, 0.487545f, -0.509537f, + -0.777581f, -1.23886f, 0.223482f, 0.206009f, 0.20391f, + 0.194628f, 0.226762f, 0.171609f, -0.219037f, 0.557892f, + -0.312011f, 1.27709f, 0.064013f, 0.105384f, 0.0493933f, + 0.074059f, -0.0100078f, -0.0176888f, -0.440005f, 0.302922f, + -0.197456f, 0.296128f, -0.326647f, 0.305323f, -0.30696f, + 0.201951f, -0.15874f, -0.793042f, 0.0197254f, 0.0569867f, + -0.0295468f, -0.0215012f, 0.025855f, -0.0196102f, 0.215558f, + -0.253069f, 0.298469f, 0.261269f, 0.435305f, 0.0120354f, + -0.384789f, -0.2772f, 0.0366613f, -0.494994f, 0.149072f, + 1.32981f, -0.427717f, 0.43938f, -0.16375f, -0.444342f, + 0.548214f, 0.127955f, -1.24387f, 0.0863676f, 0.175071f, + 0.172673f, -0.0906204f, 0.444454f, -0.546669f, 0.215857f, + -0.100621f, 0.200699f, -0.0985915f, 0.134706f, -0.256396f, + 0.393427f, 0.119606f, -0.214278f, -0.0183637f, 0.194266f, + -0.238025f, 0.182203f, 0.599718f, 0.846933f, 0.0607852f, + -0.183434f, -0.723743f, -0.72414f, -0.124701f, 0.0227527f, + -0.0664636f, -0.0385867f, -0.0257377f, -0.149054f, 0.12077f, + 0.678029f, -0.624456f, 0.189644f, -0.518604f, 0.134397f, + -0.189777f, -0.309376f, -0.00377086f, 0.701132f, -0.170915f, + 0.00736111f, -0.121906f, 0.329136f, 0.165514f, 0.0328356f, + 0.171275f, 0.248619f, 0.247704f, -0.449933f, 0.0841684f, + 0.136982f, 0.122703f, -0.0169439f, -0.0726496f, 0.302648f, + -0.128556f, 0.0667425f, -0.289717f, -0.207532f, -1.20269f, + -0.68892f, 0.045259f, 0.0973945f, 0.0988314f, -0.944748f, + -0.180401f, 0.134331f, 0.033834f, 0.109023f, 0.265723f, + 0.38063f, -0.106518f, -0.0686953f, 0.3744f, -1.0957f, + 0.0302782f, 0.0515164f, 0.00188222f, 0.0014413f, -0.0404425f, + 0.0124618f, -0.0828645f, 0.506166f, -0.776352f, -0.405138f, + -0.123887f, 0.0732116f, 0.379928f, 0.604524f, -0.492317f, + 0.439191f, 0.0744193f, 0.389101f, 0.0604518f, 0.0943165f, + 0.0339942f, 0.0917975f, 0.0161988f, 0.512227f, 0.538021f, + -0.411495f, 0.307281f, 0.33746f, -0.218639f, 0.265742f, + 0.39738f, -0.12442f, 0.125236f, -0.0845223f, -0.150396f, + 0.0334878f, -0.00391915f, 0.0406864f, -0.0487059f, 0.0377073f +}; + +static const float av1_simple_motion_search_split_hiddenlayer_0_bias_16[] = { + 0.0535976f, -0.0130279f, 0.150146f, -0.511132f, -0.357698f, + 0.6719f, -1.27877f, -0.0208048f, 0.0961914f, 0.263603f, + 0.704574f, -1.48998f, 0.728063f, 0.941829f, -0.199981f, + 0.797802f, -0.29816f, -0.60894f, -0.116624f, -1.16723f +}; + +static const float av1_simple_motion_search_split_logits_kernel_16[] = { + 0.343153f, -0.2110482f, -0.487199f, 0.3274144f, -2.1975f, + -0.6051438f, 0.1901127f, 0.4741924f, -0.24029f, -0.185018f, + -0.652635f, 2.57714f, -0.31033031f, -0.307222f, 0.329035f, + -0.430181f, 0.3429f, 0.742292f, 0.3269808f, 0.4142165f +}; + +static const float av1_simple_motion_search_split_logits_bias_16[] = { + -0.783658f +}; + +static const NN_CONFIG av1_simple_motion_search_split_nn_config_16 = { + NUM_FEATURES_16, + NUM_LOGITS_16, + NUM_HIDDEN_LAYERS_16, + { + NUM_LAYER_0_UNITS_16, + }, + { + av1_simple_motion_search_split_hiddenlayer_0_kernel_16, 
+ av1_simple_motion_search_split_logits_kernel_16, + }, + { + av1_simple_motion_search_split_hiddenlayer_0_bias_16, + av1_simple_motion_search_split_logits_bias_16, + }, +}; + +#undef NUM_HIDDEN_LAYERS_16 +#undef NUM_FEATURES_16 +#undef NUM_LAYER_0_UNITS_16 +#undef NUM_LOGITS_16 + +#define NUM_HIDDEN_LAYERS_8 1 +#define NUM_FEATURES_8 17 +#define NUM_LAYER_0_UNITS_8 20 +#define NUM_LOGITS_8 1 + +static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_8[] = { + 0.079443f, -1.04068f, 0.336819f, -0.20901f, 0.796251f, + 0.181066f, 0.0118876f, -0.207145f, 0.250671f, -0.402119f, + -0.0847227f, 1.88683f, 0.303469f, 0.0718458f, 0.0338589f, + 0.158896f, 0.0540238f, -0.385426f, 0.955925f, 0.424506f, + 0.492584f, -0.795058f, -0.248667f, -0.905349f, -0.316989f, + 0.545471f, 0.63762f, -0.232613f, -0.238947f, -0.395338f, + -0.322673f, -0.0761563f, -0.125357f, 0.0694415f, -0.371599f, + 0.358387f, -0.486841f, 0.403863f, -0.0295666f, 0.283074f, + -0.424396f, 0.156318f, -0.685355f, 0.6663f, 0.337949f, + 0.273198f, 0.517448f, 0.458911f, 0.157252f, 0.692096f, + 0.64965f, -0.23987f, -1.08431f, -0.252475f, -0.332614f, + -0.712291f, -0.380973f, 0.460545f, 0.48936f, 0.337601f, + 0.489223f, 1.65336f, -0.223585f, 0.17367f, -0.235057f, + -0.456773f, 0.327877f, -0.221192f, -0.940151f, -1.06616f, + 0.687084f, -0.109973f, 0.106636f, 0.445895f, 0.163432f, + 0.378306f, 0.201902f, 0.176811f, 0.693082f, 1.62156f, + -0.178346f, 0.455175f, 1.61943f, 0.231376f, 0.0890932f, + -0.889693f, -1.03298f, 0.778196f, -0.0289539f, 0.137848f, + 0.18707f, 0.171889f, 0.119157f, 0.24893f, -0.313628f, + 0.00250735f, -0.0758209f, 0.272974f, -0.229825f, 2.47926f, + -0.0354665f, 0.175366f, 0.0411555f, -1.52149f, -0.0258663f, + 0.253027f, -0.0520839f, -0.0189782f, 0.362387f, -0.371154f, + 0.622929f, 0.0447056f, 0.242529f, -0.168391f, 0.308935f, + -0.117294f, 2.16307f, 0.0673638f, 0.080771f, -0.460779f, + -0.940176f, 0.473266f, -0.0125302f, 0.475145f, -0.218187f, + 0.43258f, -0.0380196f, 0.413607f, -0.110856f, -1.52076f, + 0.0896812f, 0.246636f, -0.0612008f, 0.189583f, 0.0106902f, + -0.158403f, -0.629377f, -0.0634279f, -0.0864584f, -0.226568f, + -0.286234f, -0.0721132f, -0.43702f, 0.113702f, 0.433372f, + 0.743396f, 0.14312f, 0.29914f, 0.801188f, 0.7609f, + 0.385046f, 0.480314f, 0.171119f, -1.59058f, -1.18853f, + 0.150676f, 0.408123f, -0.00677924f, 0.398145f, 0.0914611f, + 0.176945f, 0.0677457f, 0.316478f, 0.998219f, -0.22618f, + 0.0756793f, -0.0156674f, 0.105716f, 0.0496245f, -0.0827133f, + -0.423119f, -0.161033f, 0.212962f, -0.234453f, 0.743366f, + 1.04108f, 0.0597604f, -0.285993f, -0.114829f, -0.557364f, + -0.840051f, 0.326509f, -0.192508f, -0.141769f, 0.370626f, + -0.126353f, 0.00672923f, 0.493623f, -0.852076f, 0.466798f, + -0.226436f, 0.259268f, -0.452662f, 0.0721126f, 0.0198245f, + 0.2048f, 0.02506f, 0.316194f, 0.814651f, 1.01288f, + -0.569607f, -0.0838994f, 1.37146f, -0.613135f, 0.441761f, + -0.643901f, 0.364269f, -0.147177f, 0.338001f, -0.332376f, + 0.518875f, -0.628964f, -0.291889f, -0.050736f, 0.108047f, + 1.05673f, 0.0479492f, 0.466756f, -0.0867334f, -0.0355575f, + 0.57626f, -0.227583f, -0.146421f, 0.0990489f, 0.117351f, + -0.103858f, -0.0336936f, 0.0201903f, -0.0766383f, -0.010211f, + 0.0400779f, 0.0725462f, 0.137142f, 0.478261f, 0.287869f, + 0.0882359f, -0.739754f, -0.853521f, -0.43703f, 0.316856f, + 0.27593f, 0.312149f, 0.175575f, 0.441839f, 0.264325f, + 0.0148051f, -0.005559f, 0.373176f, 0.933701f, -0.0197615f, + 0.0219723f, -0.0559883f, -0.103456f, -0.0323009f, 0.0773202f, + -0.390838f, 0.855488f, -0.596525f, 
-0.249093f, 0.124262f,
+  0.220172f, 0.0552478f, 1.04041f, -0.960992f, -0.495255f,
+  -0.211612f, 0.350007f, -0.238998f, -0.0265068f, 0.384686f,
+  -0.0815808f, -0.0570019f, 0.123903f, -0.485114f, -0.00282573f,
+  -0.0649603f, 0.163719f, -0.469479f, -0.439713f, 0.0602562f,
+  -0.527993f, -0.111458f, 2.48686f, -0.180723f, 0.0553895f,
+  0.0560679f, -0.0978928f, -0.216063f, 0.089457f, -1.5602f,
+  -1.62332f, -0.147388f, 0.736155f, 0.440409f, 0.243519f,
+  0.0622638f, 0.522932f, 0.109686f, 0.422849f, 0.510589f,
+  1.01116f, 0.174019f, 0.0191171f, -0.0717751f, -0.0068308f,
+  0.172932f, -0.834888f, -0.635788f, 0.32012f, 0.298656f,
+  0.274309f, -0.155456f, 0.1755f, -0.175171f, 0.343498f,
+  -0.122832f, -0.107696f, 0.279924f, -0.797633f, -0.344658f,
+  0.162669f, 0.389092f, 0.644479f, -0.635216f, -0.181868f,
+  0.0579244f, -0.0568976f, 0.433003f, -0.591067f, 0.71013f,
+  -0.165515f, 0.225725f, -0.358156f, 0.0541944f, 1.95485f,
+  -0.315223f, 0.61537f, -0.0401568f, 0.22811f, 0.271147f
+};
+
+static const float av1_simple_motion_search_split_hiddenlayer_0_bias_8[] = {
+  1.63441f, -0.616459f, -0.437775f, -0.71669f, 1.56616f, 2.28109f, 1.64054f,
+  -1.51476f, 0.0274108f, 0.935156f, -0.966329f, 0.906069f, 1.19954f, -1.25867f,
+  -1.7376f, -0.594211f, 0.322242f, 0.438631f, -1.01682f, 1.30032f
+};
+
+static const float av1_simple_motion_search_split_logits_kernel_8[] = {
+  -0.463187f, 0.2936127f, 0.16762f, -0.1663271f, -0.292418f,
+  -0.421457f, -0.378265f, 1.053049f, 0.32432879f, -0.49775575f,
+  0.427357f, -0.239251f, -0.1631546f, 0.335468f, 0.255371f,
+  0.276901f, -0.665683f, -0.7021493f, 0.381513f, -0.1339761f
+};
+
+static const float av1_simple_motion_search_split_logits_bias_8[] = {
+  -1.739754f
+};
+
+static const NN_CONFIG av1_simple_motion_search_split_nn_config_8 = {
+  NUM_FEATURES_8,
+  NUM_LOGITS_8,
+  NUM_HIDDEN_LAYERS_8,
+  {
+      NUM_LAYER_0_UNITS_8,
+  },
+  {
+      av1_simple_motion_search_split_hiddenlayer_0_kernel_8,
+      av1_simple_motion_search_split_logits_kernel_8,
+  },
+  {
+      av1_simple_motion_search_split_hiddenlayer_0_bias_8,
+      av1_simple_motion_search_split_logits_bias_8,
+  },
+};
+
+#undef NUM_HIDDEN_LAYERS_8
+#undef NUM_FEATURES_8
+#undef NUM_LAYER_0_UNITS_8
+#undef NUM_LOGITS_8
+
+static const NN_CONFIG *const av1_simple_motion_search_split_nn_config[5] = {
+  &av1_simple_motion_search_split_nn_config_128,
+  &av1_simple_motion_search_split_nn_config_64,
+  &av1_simple_motion_search_split_nn_config_32,
+  &av1_simple_motion_search_split_nn_config_16,
+  &av1_simple_motion_search_split_nn_config_8,
+};
+
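+// Illustrative sketch of how the tables above fit together; this helper is
+// not part of the library. The 17 raw features for a block are z-normalized
+// with the mean/std vectors for its size, pushed through the per-size
+// NN_CONFIG, and the resulting logit is compared against the
+// [aggressiveness][resolution][bsize] threshold table; the "p = ..."
+// comments above record sigmoid(threshold). av1_nn_predict() is assumed to
+// be visible from av1/encoder/ml.h, and the helper name, the bsize index
+// mapping (0:128x128 ... 4:8x8) and the res_idx mapping are hypothetical.
+// The prune-rect tables below are consumed in the same fashion.
+static INLINE int simple_motion_search_split_example(const float *raw_features,
+                                                     int bsize_idx, int agg,
+                                                     int res_idx) {
+  float features[17];
+  for (int i = 0; i < 17; ++i) {
+    // Whiten each feature with the per-bsize mean and standard deviation.
+    features[i] = (raw_features[i] -
+                   av1_simple_motion_search_split_mean[bsize_idx][i]) /
+                  av1_simple_motion_search_split_std[bsize_idx][i];
+  }
+  float logit;
+  av1_nn_predict(features, av1_simple_motion_search_split_nn_config[bsize_idx],
+                 1, &logit);
+  // Split when the logit clears the split threshold; the no_split table is
+  // checked the same way (logit below threshold) to rule a split out early.
+  return logit >
+         av1_simple_motion_search_split_thresh[agg][res_idx][bsize_idx];
+}
+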
+// Model based on simple_motion_search for pruning rect
+// Thresholds. The first index level is aggressiveness, second is frame
+// resolution, third is bsize
+static const float av1_simple_motion_search_prune_rect_thresh[4][3][5] = {
+  // Aggressiveness = 0
+  {
+      // Lowres
+      { 0.0288721601835f, 0.0281573780991f, 0.0225501403434f,
+        0.000961189195907f, 0.0f },
+      // Midres
+      { 0.0288721601835f, 0.0281573780991f, 0.0225501403434f,
+        0.000961189195907f, 0.0f },
+      // Hdres
+      { 0.0288721601835f, 0.0281573780991f, 0.0225501403434f,
+        0.000961189195907f, 0.0f },
+  },
+  // Aggressiveness = 1
+  {
+      // Lowres
+      {
+          0.000000f,
+          0.116076f,
+          0.049759f,
+          0.057747f,
+          0.006001f,
+      },
+      // Midres
+      {
+          0.000000f,
+          0.017380f,
+          0.026077f,
+          0.078111f,
+          0.064477f,
+      },
+      // Hdres
+      {
+          0.002994f,
+          0.103093f,
+          0.076408f,
+          0.010456f,
+          0.187211f,
+      },
+  },
+  // Aggressiveness = 2
+  {
+      // Lowres
+      {
+          0.000000f,
+          0.003111f,
+          0.144294f,
+          0.144884f,
+          0.069924f,
+      },
+      // Midres
+      {
+          0.000000f,
+          0.013696f,
+          0.055203f,
+          0.152271f,
+          0.078886f,
+      },
+      // Hdres
+      {
+          0.030577f,
+          0.082486f,
+          0.040690f,
+          0.140924f,
+          0.067608f,
+      },
+  },
+  // Aggressiveness = 3
+  {
+      // Lowres
+      { 0.0f, 0.352338114654f, 0.171190796972f, 0.322629318068f,
+        0.287219697095f },
+      // Midres
+      { 0.0f, 0.30938393361f, 0.271772875141f, 0.240627957104f,
+        0.178833795641f },
+      // Hdres
+      { 0.285731215187f, 0.37521798723f, 0.142380566244f, 0.338288917819f,
+        0.21329309279f },
+  },
+};
+
+// Mean and std
+static const float av1_simple_motion_search_prune_rect_mean_128[25] = {
+  13.292176f, 13.231236f, 11.098058f, 11.049944f, 10.481336f,
+  10.431587f, 10.789337f, 10.732787f, 10.233817f, 10.173738f,
+  12.214045f, 12.157505f, 11.863353f, 11.802220f, 12.204053f,
+  12.152315f, 11.517566f, 11.465651f, 5.383040f, 0.757934f,
+  4.012611f, 4.052191f, 0.853365f, 3.954503f, 3.944135f,
+};
+
+static const float av1_simple_motion_search_prune_rect_std_128[25] = {
+  2.589217f, 2.559396f, 2.268402f, 2.282274f, 3.341234f, 3.341994f, 3.033007f,
+  3.041550f, 3.786247f, 3.784053f, 2.523459f, 2.511275f, 3.349364f, 3.340481f,
+  2.390149f, 2.384226f, 3.599467f, 3.587460f, 2.319911f, 0.428335f, 1.241087f,
+  1.208679f, 0.353742f, 1.228122f, 1.211777f,
+};
+
+static const float av1_simple_motion_search_prune_rect_mean_64[25] = {
+  11.439831f, 11.382639f, 9.647134f, 9.578121f, 9.146770f,
+  9.084122f, 8.559063f, 8.499496f, 8.095865f, 8.041795f,
+  10.547537f, 10.486240f, 9.362147f, 9.308391f, 10.548071f,
+  10.484358f, 10.002225f, 9.944480f, 4.964504f, 0.897164f,
+  3.306144f, 3.351039f, 0.928582f, 3.319739f, 3.287726f,
+};
+
+static const float av1_simple_motion_search_prune_rect_std_64[25] = {
+  2.033404f, 2.050657f, 2.064671f, 2.081519f, 2.916312f, 2.914649f, 3.628949f,
+  3.618760f, 4.011421f, 3.996068f, 2.087127f, 2.103106f, 3.885277f, 3.876166f,
+  2.035599f, 2.052976f, 3.052501f, 3.050985f, 2.232998f, 0.303745f, 1.111161f,
+  1.081292f, 0.257521f, 1.112510f, 1.089404f,
+};
+
+static const float av1_simple_motion_search_prune_rect_mean_32[25] = {
+  9.862349f, 9.793658f, 8.043962f, 7.954083f, 8.058867f, 7.966165f, 8.046844f,
+  7.956817f, 8.061414f, 7.967906f, 8.966450f, 8.890165f, 8.968315f, 8.891513f,
+  8.953573f, 8.877070f, 8.974275f, 8.895363f, 4.387239f, 0.954143f, 2.701000f,
+  2.751266f, 0.963302f, 2.716584f, 2.709725f,
+};
+
+static const float av1_simple_motion_search_prune_rect_std_32[25] = {
+  1.971555f, 1.985517f, 1.935986f, 1.944743f, 1.924122f, 1.932169f, 1.943151f,
+  1.950612f, 1.931156f, 1.938242f, 1.987803f, 1.997670f, 2.000859f, 2.009913f,
+  1.938270f, 1.949277f, 1.922999f, 1.933145f, 1.991504f, 0.209175f,
0.973824f, + 0.952221f, 0.188018f, 0.985295f, 0.946228f, +}; + +static const float av1_simple_motion_search_prune_rect_mean_16[25] = { + 8.391692f, 8.303431f, 6.590342f, 6.459725f, 6.460719f, 6.333274f, 6.592615f, + 6.461661f, 6.464787f, 6.337191f, 7.499753f, 7.395166f, 7.503220f, 7.398344f, + 7.498312f, 7.395039f, 7.353743f, 7.253139f, 3.874267f, 0.979701f, 2.087404f, + 2.131698f, 0.981005f, 2.110868f, 2.106539f, +}; + +static const float av1_simple_motion_search_prune_rect_std_16[25] = { + 1.865867f, 1.870012f, 1.773885f, 1.770447f, 1.972922f, 1.961361f, 1.777224f, + 1.772864f, 1.974519f, 1.962281f, 1.831632f, 1.831837f, 1.837595f, 1.837008f, + 1.822791f, 1.822053f, 2.074991f, 2.067200f, 1.676261f, 0.141022f, 0.840297f, + 0.829935f, 0.136507f, 0.828972f, 0.808563f, +}; + +static const float av1_simple_motion_search_prune_rect_mean_8[25] = { + 6.997798f, 6.867032f, 5.134819f, 4.883330f, 5.134804f, 4.879707f, 5.140518f, + 4.886751f, 5.142186f, 4.885262f, 6.069946f, 5.896944f, 6.080442f, 5.906130f, + 6.077539f, 5.905929f, 6.083087f, 5.909298f, 3.552709f, 0.990654f, 1.497349f, + 1.531762f, 0.989606f, 1.496581f, 1.484139f, +}; + +static const float av1_simple_motion_search_prune_rect_std_8[25] = { + 1.727562f, 1.725050f, 1.633396f, 1.618773f, 1.633586f, 1.620657f, 1.620798f, + 1.604892f, 1.621570f, 1.607439f, 1.691024f, 1.684225f, 1.676065f, 1.668442f, + 1.680016f, 1.672452f, 1.677775f, 1.671586f, 1.451902f, 0.096223f, 0.751190f, + 0.754040f, 0.101419f, 0.738239f, 0.729455f, +}; + +static const float *const av1_simple_motion_search_prune_rect_mean[5] = { + av1_simple_motion_search_prune_rect_mean_128, + av1_simple_motion_search_prune_rect_mean_64, + av1_simple_motion_search_prune_rect_mean_32, + av1_simple_motion_search_prune_rect_mean_16, + av1_simple_motion_search_prune_rect_mean_8, +}; + +static const float *const av1_simple_motion_search_prune_rect_std[5] = { + av1_simple_motion_search_prune_rect_std_128, + av1_simple_motion_search_prune_rect_std_64, + av1_simple_motion_search_prune_rect_std_32, + av1_simple_motion_search_prune_rect_std_16, + av1_simple_motion_search_prune_rect_std_8, +}; + +#define NUM_HIDDEN_LAYERS_128 1 +#define NUM_FEATURES_128 25 +#define NUM_LAYER_0_UNITS_128 8 +#define NUM_LOGITS_128 4 + +static const float av1_simple_motion_search_prune_rect_logits_kernel_128[] = { + -0.129103f, 0.457758f, -0.489986f, 0.65462f, -0.184312f, 3.81202f, + -0.444407f, -0.64198f, -0.575008f, 0.0311711f, 0.525243f, -20.892f, + 1.08811f, -65.0976f, -12.3973f, -1.38278f, -0.264233f, 0.241636f, + -10.6925f, -0.725414f, -18.8987f, -40.2284f, -16.08f, 0.995331f, + 1.47614f, -0.964864f, 0.405506f, 0.140449f, 0.459534f, -1.9093f, + 0.398452f, 0.696949f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_bias_128[] = { + 1.22789f, -1.34527f, 0.759048f, 0.315086f, + 1.0834f, -1.58019f, -0.465158f, 1.20716f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_kernel_128[] = { + -0.668677f, 0.58694f, -0.417094f, 0.754735f, -0.7859f, + 0.377479f, -0.0415929f, -0.0140585f, -0.730001f, 0.747528f, + -0.135247f, 0.406505f, -0.234184f, 0.956362f, -0.637555f, + 0.791884f, 0.0303722f, 1.04424f, -0.727859f, -0.274321f, + -0.122986f, 0.066312f, -0.00559175f, -0.239643f, -0.0188767f, + -0.102787f, -0.262967f, 0.071882f, -0.283398f, 0.111607f, + -0.425826f, 0.02699f, 0.108873f, -0.180558f, -0.0794057f, + 0.29665f, -0.0252969f, -0.0266213f, -0.277462f, -0.361973f, + 0.512552f, 0.395011f, -0.225876f, 0.301924f, 0.136954f, + 0.507259f, 1.23425f, 0.0137135f, 0.662572f, 0.591583f, 
+ 0.101564f, 0.416805f, -0.645081f, -0.179086f, -0.36747f, + -0.332213f, 0.095177f, 0.220739f, -0.153256f, 0.706155f, + 0.161701f, 0.696815f, -1.21531f, -0.115059f, 0.486764f, + -0.396093f, 0.784883f, 0.535357f, -0.278021f, 0.143496f, + -0.44931f, -0.144543f, 0.319326f, 0.0190167f, -0.206295f, + 0.373995f, -0.247897f, -0.608095f, -0.41796f, -0.137129f, + -0.709562f, 0.678273f, 0.537607f, 0.557474f, 0.453308f, + 0.21405f, -0.0466495f, 0.519139f, -0.168832f, 0.902911f, + 0.681131f, -0.139876f, -0.2052f, -0.393271f, 0.262222f, + -0.246246f, -0.213993f, 0.646619f, 0.0496181f, -0.00354157f, + 0.822927f, 0.0939522f, 0.180738f, 0.118355f, 0.120456f, + -0.0472214f, -0.144958f, 0.173405f, -0.886644f, -0.0949769f, + -0.813518f, -0.3947f, -0.128021f, 0.356196f, 0.469169f, + -0.413702f, 1.04242f, 0.428853f, -0.387293f, 0.0850877f, + 0.279409f, -0.142276f, 0.0579376f, 0.211112f, 0.0703013f, + -1.9274f, -0.729147f, 0.534193f, 0.773586f, 0.922864f, + 0.642881f, 1.15127f, 0.621032f, 0.933942f, 1.01837f, + -0.660282f, -0.40059f, -1.11279f, -0.77088f, -0.43349f, + 0.202361f, -0.0840912f, 0.0935707f, 0.056333f, -0.0779369f, + 0.0173447f, -0.0104756f, 0.0115005f, -0.0195593f, 0.03592f, + -0.343454f, -0.618048f, 0.258172f, -0.412322f, -0.0463746f, + -0.0413654f, -0.0400194f, 0.615981f, -0.452094f, 0.644555f, + 0.0822476f, -0.359791f, -0.0904274f, 0.209427f, 0.0116338f, + -0.190978f, 0.890233f, 0.737769f, -1.66663f, -0.392605f, + 0.0785728f, -0.224553f, -0.128258f, -0.227227f, -0.0777773f, + 0.685976f, 0.347042f, -0.555325f, -0.249221f, 0.0919837f, + -0.0660016f, -0.272316f, 0.0390632f, -0.619624f, -0.0565801f, + 0.585026f, 0.597375f, 0.54114f, 0.593389f, 0.604391f, + 0.0820294f, -0.85339f, -1.40741f, -0.391675f, 0.0579205f, + -0.197626f, 0.130044f, -0.234488f, -0.0373991f, -0.0717973f +}; + +static const float av1_simple_motion_search_prune_rect_logits_bias_128[] = { + 1.58571f, -4.6314f, -2.00273f, 0.543699f +}; + +static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_128 = { + NUM_FEATURES_128, + NUM_LOGITS_128, + NUM_HIDDEN_LAYERS_128, + { + NUM_LAYER_0_UNITS_128, + }, + { + av1_simple_motion_search_prune_rect_layer_0_kernel_128, + av1_simple_motion_search_prune_rect_logits_kernel_128, + }, + { + av1_simple_motion_search_prune_rect_layer_0_bias_128, + av1_simple_motion_search_prune_rect_logits_bias_128, + }, +}; + +#undef NUM_HIDDEN_LAYERS_128 +#undef NUM_FEATURES_128 +#undef NUM_LAYER_0_UNITS_128 +#undef NUM_LOGITS_128 + +#define NUM_HIDDEN_LAYERS_64 1 +#define NUM_FEATURES_64 25 +#define NUM_LAYER_0_UNITS_64 32 +#define NUM_LOGITS_64 10 + +static const float av1_simple_motion_search_prune_rect_logits_kernel_64[] = { + 0.10424f, -0.346025f, 0.534547f, -0.385925f, 2.58341f, -0.256414f, + -0.232498f, 0.329823f, -0.0777376f, -0.590939f, 0.062657f, -0.628252f, + 0.0934588f, 2.04029f, -0.224448f, 0.371168f, -0.385348f, -0.589883f, + -3.73627f, -0.943144f, 0.346409f, -0.211215f, -0.351008f, 0.418807f, + 0.943663f, 0.173267f, 1.16585f, -0.0840888f, 0.227464f, 0.374412f, + 0.0422597f, -0.338868f, 0.222576f, 0.431713f, 1.12366f, 0.00753411f, + 0.248412f, -0.0902425f, 0.542455f, -0.665629f, -0.311245f, -0.205639f, + -0.447149f, -0.0502733f, -0.290186f, -0.794384f, 0.0940881f, -0.0686117f, + -0.0199961f, -0.587965f, 0.777096f, -0.083381f, -1.21282f, 0.652959f, + -1.18238f, 0.539991f, 0.352497f, -0.540076f, -0.26222f, -0.568556f, + 0.409102f, -0.131146f, -0.407161f, -0.188287f, -0.478657f, 0.000401932f, + -0.689324f, 0.351064f, -1.43704f, -0.315185f, -0.868726f, 0.376341f, + -0.0566277f, 0.364831f, 
0.611298f, -0.495253f, -0.0193132f, 0.617978f, + 0.189586f, -0.236758f, -0.608246f, -0.149017f, -1.78303f, 0.143023f, + 0.698386f, -0.994086f, -0.673327f, 0.233868f, 0.360425f, 0.0294123f, + -0.248683f, -0.148392f, 0.0861829f, -0.190843f, -0.414906f, 0.607378f, + -0.756715f, -0.511713f, -0.321556f, 1.0078f, -1.18141f, 0.519751f, + 0.834629f, -0.359343f, 0.612262f, -0.0730553f, 0.262935f, 0.488276f, + 0.387071f, -1.44123f, 1.08269f, 0.554402f, -0.069f, 0.14113f, + 0.323817f, 0.824314f, -0.431417f, -0.349448f, 0.950728f, -0.587836f, + -0.83914f, -0.10844f, 0.26602f, 0.831933f, -0.271315f, 0.231563f, + 0.417049f, 0.190627f, -0.0940667f, 0.255363f, -0.0741022f, -0.0987662f, + -0.847522f, 0.00287554f, 0.0615741f, -0.0832218f, 0.0847148f, -0.392843f, + -0.938068f, -0.10621f, -0.260859f, -0.825175f, -0.401039f, 0.315213f, + -0.108269f, 0.288036f, -8.66166f, -0.970752f, -0.66678f, -0.593405f, + -0.518294f, -0.138722f, -0.454698f, -0.22969f, -0.553006f, -0.440111f, + 0.462661f, -0.536854f, 0.0108295f, -0.522888f, 0.00111157f, 0.229999f, + 0.0267768f, 0.176266f, -1.57043f, 0.0318106f, 0.257534f, -0.198583f, + 0.175564f, -0.251465f, -0.262441f, -1.65283f, -0.319603f, -0.875282f, + -0.301303f, 0.0170948f, -0.227075f, 0.0299545f, -4.98346f, 0.470046f, + -1.28051f, -0.213809f, -0.486585f, -0.906463f, -0.169984f, -0.333153f, + -0.376733f, 0.108016f, 0.486744f, -0.186936f, -0.429259f, 0.056501f, + -0.266545f, 0.265447f, -0.137718f, -0.490687f, -0.935668f, -0.16229f, + -0.696932f, 0.173157f, 0.434959f, -0.140595f, 0.345845f, -1.08013f, + -0.0205929f, -0.815874f, -0.179812f, 0.02767f, -0.141727f, 0.471936f, + -7.29453f, -1.04362f, -0.745482f, -0.28725f, -0.214997f, -0.0850651f, + -0.748471f, 0.161325f, -1.04387f, -0.705305f, 0.489427f, -0.765373f, + -0.301576f, 0.0742467f, -0.331282f, 0.0372328f, -0.90298f, -0.0608646f, + -2.18756f, 0.170384f, -0.258357f, 0.106287f, -0.161684f, -0.103799f, + -0.127774f, -0.156313f, 0.0705286f, -0.977908f, -0.281191f, -0.056757f, + -0.309474f, 0.050476f, -9.78198f, -2.42795f, -0.289626f, -1.07579f, + -0.439256f, -1.09948f, -0.564671f, 0.0913182f, -0.417216f, -1.19909f, + 0.287063f, 0.402315f, -0.17646f, 0.540488f, 0.00840239f, 0.397492f, + 0.702393f, -0.10566f, 0.655296f, -0.0443876f, 0.154918f, -0.760479f, + -0.0523153f, -0.366199f, -1.08212f, -0.398556f, -0.415203f, -1.10488f, + 0.208349f, 0.27079f, 0.101546f, -0.205752f, -13.7923f, -0.218637f, + -1.10077f, 0.355735f, -0.306196f, 0.627434f, -0.473101f, -0.308027f, + -1.12724f, 0.301597f, 0.660785f, 0.0576217f, -0.155925f, -0.56107f, + -0.223537f, 0.114299f, -0.53803f, -0.252674f, -2.66103f, -0.185245f, + -0.314673f, 0.403337f, 0.679821f, -0.69231f, 0.506264f, -0.999705f, + -0.549097f, 0.353745f, 0.188249f, 0.414484f, -0.615853f, 0.525681f, + -5.23065f, -3.05174f, 1.02074f, -0.965499f, -0.158947f, 0.0436088f, + -0.485824f, 0.0375094f, -1.39985f, -0.481392f, 0.485785f, -0.24874f, + -0.359633f, 0.668108f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_bias_64[] = { + 0.0735592f, -0.045064f, -0.0114103f, 1.39246f, -0.683467f, 0.155765f, + -0.667652f, -0.202425f, -0.585433f, -0.146752f, -0.0812931f, 0.580642f, + 0.578542f, -0.831916f, 0.610063f, 0.0101856f, -0.235863f, 0.538141f, + -2.91334f, -1.71887f, 0.126616f, 0.582497f, -0.438879f, 0.221833f, + 0.850773f, -0.280886f, 0.443233f, -0.0964873f, -0.216161f, 0.34413f, + 0.656818f, 0.0169274f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_kernel_64[] = { + -0.310947f, -0.232675f, 0.0171092f, 0.0834474f, 0.373977f, + 0.300429f, 
0.215072f, -0.454074f, 0.187565f, 0.282742f, + 0.562562f, -0.0419322f, 0.000978486f, -0.298267f, 0.216934f, + -0.388722f, -0.146866f, -0.275946f, 0.202361f, 0.225847f, + 1.42868f, 0.473127f, -0.145747f, -0.104986f, 0.153459f, + 0.69382f, 0.162266f, 0.0207715f, -0.45095f, -0.412071f, + -0.235109f, -0.130199f, 0.231741f, 0.460193f, 0.0378202f, + 0.429516f, 0.387691f, -0.272479f, 0.0723884f, -0.453914f, + -0.150618f, -0.10745f, -0.258615f, 0.0838312f, -0.00554958f, + 0.105377f, -0.0415479f, 0.13228f, 1.09044f, -0.73053f, + -0.422553f, -0.435842f, 0.211416f, 0.420332f, 0.0181353f, + -0.030891f, 0.522788f, 0.613526f, 0.374032f, 0.287986f, + -0.403118f, -0.287362f, -1.11523f, -0.577713f, -0.020228f, + 0.86465f, -0.0590579f, 0.341274f, -0.0115644f, -0.260236f, + 0.192123f, -0.0849825f, 0.0501709f, 0.444382f, 0.0762727f, + 0.0926596f, -0.101157f, -0.142787f, 0.40861f, 0.555805f, + -0.00614654f, -0.122846f, 0.203163f, 0.234266f, 0.409795f, + -0.0206245f, -0.224679f, 0.025081f, 0.518044f, -0.287186f, + 0.016494f, -0.0886331f, 0.236438f, -1.01032f, 0.118332f, + 0.364217f, 0.061438f, 0.0381303f, 0.128418f, 0.0257077f, + -0.975751f, -0.694894f, 0.00351914f, 0.278179f, 0.29363f, + 0.525576f, 0.0604849f, 0.531734f, 0.406643f, 0.812497f, + -0.403196f, -0.16664f, -0.620887f, -0.428194f, 0.275401f, + 0.432063f, -0.00378342f, 0.295758f, 0.105615f, -0.00683626f, + 0.00396146f, 0.00598654f, -0.0131701f, -0.0115787f, 0.00386643f, + -0.69686f, -0.139623f, -0.440817f, 0.0542873f, 0.217962f, + 0.527035f, -0.0201046f, 0.0471354f, 0.0271858f, -0.0775197f, + -0.309797f, 0.184879f, -0.232854f, -0.407081f, 0.706227f, + -0.0877534f, 0.306843f, 0.455075f, -0.333961f, 0.0759148f, + 0.0444791f, -0.0693626f, -0.0850289f, -0.513063f, -0.643971f, + -0.630279f, -0.153889f, 0.123315f, 0.00548238f, 0.170707f, + 0.734339f, -0.176988f, 0.322519f, 0.178365f, 0.183519f, + -0.698683f, -0.12043f, -0.349914f, -0.0696762f, -0.53986f, + -0.104738f, 1.05264f, 0.983568f, -0.109035f, 0.0113748f, + 0.0815189f, -0.0628812f, 0.0769389f, 0.010261f, 0.146573f, + -0.433194f, -0.211572f, -0.000397392f, 0.445325f, 0.145091f, + -0.0625902f, 0.29394f, 0.302315f, 0.0892226f, -0.209504f, + -0.0150374f, 0.242608f, 0.216223f, 0.366857f, 0.209829f, + -0.540035f, 0.117599f, -0.329315f, 0.0471133f, -0.0115449f, + -0.0638235f, 0.0527461f, 0.348149f, 0.360802f, 1.06624f, + -0.615991f, -0.341396f, 0.18972f, 0.0709888f, -0.0414466f, + -0.0193809f, 0.0938933f, 0.209058f, 0.575042f, 0.483608f, + -0.285875f, -0.115905f, -0.363637f, 0.375425f, 0.336217f, + 0.0336358f, -0.00265618f, -0.406854f, -0.792959f, -0.219354f, + 0.0331615f, 0.0298859f, -0.211446f, -0.00280773f, -0.194011f, + 0.262109f, 0.548076f, 0.120183f, -0.661603f, 0.241855f, + -0.501428f, 0.00102718f, -0.347331f, -0.58306f, 0.0977254f, + 0.117491f, 0.0840667f, 0.00693675f, 0.000600294f, 0.649569f, + -0.0553811f, -0.197198f, 0.397236f, -0.523737f, -0.564192f, + -0.374679f, -0.249344f, 0.00861428f, 0.00393439f, -0.0834608f, + 0.124389f, -0.0393049f, 0.0425391f, -0.153383f, -0.182346f, + 0.420953f, 0.464221f, 0.288984f, 0.570921f, -0.239965f, + 0.247239f, -0.083434f, 0.714418f, 0.986323f, -0.460244f, + -0.260993f, -0.947743f, -1.0789f, -0.0391231f, 0.612407f, + -0.0306767f, 0.281419f, 0.0072426f, -0.37623f, 0.188744f, + 0.221666f, -0.424914f, 0.29703f, 0.261715f, 0.277809f, + -0.0617616f, -0.000611999f, -0.0547053f, -0.0901018f, -0.347669f, + 0.856072f, 0.596675f, -0.467639f, -1.09324f, -0.184224f, + -0.56051f, -0.0144704f, 0.102894f, -0.122982f, -0.0020749f, + -0.0423487f, 0.0328702f, -0.0154263f, 
0.0349021f, -0.00315595f, + 0.0254802f, -0.729191f, 0.207296f, -0.0212349f, -0.207078f, + 0.20636f, -0.156883f, 0.429765f, -0.42672f, 0.138775f, + -0.0267343f, 0.631528f, 0.300646f, -0.4793f, -0.273833f, + -0.0135367f, -0.530819f, -0.534881f, 0.830896f, 0.0266992f, + 0.473744f, 0.210334f, 0.0234739f, 0.255394f, 0.123531f, + -0.489341f, -0.796627f, 0.372617f, 0.190136f, 0.275342f, + 0.739505f, 0.402354f, 0.782806f, 0.437374f, 1.04948f, + -0.55963f, 0.382704f, -0.698321f, 0.0817868f, -0.440108f, + -0.0635004f, -0.277851f, -0.524194f, 0.286157f, -0.01097f, + -0.0293145f, -0.0405071f, -0.035662f, -0.012871f, -0.0516409f, + -0.406671f, 0.709259f, -0.525177f, 0.521123f, -0.44813f, + 0.48412f, -0.0546513f, 0.305253f, -0.468328f, 0.316453f, + -0.36307f, 0.497515f, -0.0606276f, 0.315764f, -0.422066f, + 0.554025f, -0.679183f, 0.616914f, 0.00283324f, -0.000643824f, + 0.0639999f, 0.0488285f, -0.141031f, 0.068003f, -0.0792678f, + -0.425307f, -0.152235f, 0.269917f, -0.352327f, 0.44792f, + -0.116514f, -0.465868f, 0.154287f, 0.0161028f, -0.16848f, + -0.255487f, 0.189832f, 0.254883f, 0.0240822f, 0.432638f, + -0.136564f, 0.137036f, 0.0375734f, 0.989246f, -0.126287f, + 0.111416f, -0.0271002f, 0.718755f, -0.0412969f, 0.00645681f, + 0.253811f, -0.0186998f, 0.691971f, -0.282042f, -0.0783915f, + 0.274592f, -0.358449f, 0.34155f, -0.186374f, -0.136907f, + -0.192334f, -0.251168f, -0.100874f, -0.166578f, -0.336507f, + 0.402373f, 0.173695f, 0.108788f, 0.00885581f, -0.310063f, + 1.05545f, 0.0295867f, 0.180785f, -0.173469f, -0.469924f, + -0.224155f, 0.665862f, -0.126546f, 0.240691f, -0.0415301f, + -0.598534f, 0.0012723f, -0.122297f, -0.558947f, 0.268844f, + 0.241193f, 0.0524422f, -0.1683f, 0.575588f, -0.139012f, + 0.0636691f, -0.446709f, -0.094532f, 0.883809f, -0.112981f, + -0.224047f, 0.0811193f, -0.140571f, -0.09683f, -0.0796143f, + -0.102246f, -0.863392f, -0.0755124f, 0.23125f, -0.0301361f, + -0.153029f, -0.172238f, -0.0286382f, -0.338495f, -0.317216f, + -0.146629f, -0.242264f, -0.702306f, -0.285052f, 0.0623479f, + 0.265735f, 0.00674475f, 0.666196f, 0.883586f, 0.278416f, + -0.341692f, -0.509931f, -0.156263f, 0.635885f, -0.544143f, + -0.572632f, -0.213285f, 0.443396f, -0.268329f, 0.0638439f, + -0.185397f, 0.071126f, 0.386503f, -0.402212f, -0.140784f, + -0.411661f, 0.049398f, -0.0672907f, -0.267034f, -0.0560875f, + 0.0607937f, 0.0445484f, -0.547651f, 0.574718f, 0.417189f, + -0.0610166f, 0.0632293f, 0.391619f, -0.00671215f, -0.136883f, + -0.339346f, 0.0356183f, 0.511993f, 0.178676f, 0.286998f, + 0.136511f, -0.00796929f, 0.203985f, 0.0423532f, -0.175196f, + 0.378534f, 0.770417f, 0.593778f, 0.0256067f, -0.82394f, + -0.500691f, -0.425725f, -0.623708f, -0.0406241f, -0.00226464f, + 0.0207836f, 0.30732f, -0.00784268f, 0.0065445f, -0.0991039f, + -0.20871f, -0.206835f, 0.281219f, 0.119361f, 0.259346f, + -0.102713f, 0.186488f, -0.034455f, -0.00198392f, -0.279107f, + -0.638993f, -0.374404f, -0.48601f, -0.262345f, 0.624532f, + 0.620632f, -0.227014f, 0.433579f, -0.0455096f, 1.22123f, + -0.429156f, 0.12396f, 0.0815152f, -0.0837355f, 0.0282623f, + -0.407475f, 0.787321f, -0.434974f, 0.312904f, -0.230805f, + 0.213042f, -0.250929f, 0.302997f, -0.354709f, 0.0504905f, + -0.561706f, 0.595558f, 0.374951f, 0.802969f, -0.674902f, + 0.33136f, 0.156606f, 0.0218968f, -0.694188f, -0.0221949f, + -0.00639123f, 0.0146536f, 0.0104145f, 0.021635f, -0.0499428f, + -0.575116f, -0.239035f, -0.0588276f, 0.599722f, 0.541932f, + 0.437433f, 0.716268f, 0.193207f, 0.548351f, 0.326951f, + -0.197124f, 0.0355353f, -0.0952009f, -0.217265f, -0.389789f, + 
0.0528124f, -0.21334f, -0.190296f, -1.17367f, 0.108905f, + 0.109397f, -0.0192577f, 0.0343813f, 0.085004f, -0.0556737f, + -0.0411158f, -0.534989f, 0.0361896f, 0.124415f, 0.291603f, + -0.0311974f, -0.326726f, 0.343131f, 0.0276456f, -0.231827f, + -0.373894f, -0.208898f, -0.273011f, 0.061323f, -0.0910538f, + -0.30746f, -0.108644f, -0.190736f, 1.58048f, -0.0739711f, + -0.0623489f, -0.137967f, -0.0601359f, -0.133004f, -0.0857153f, + 0.00955987f, -0.365561f, -0.0329051f, 0.463463f, 0.14758f, + -0.512256f, -0.227463f, -0.26008f, -0.567777f, 0.0646234f, + 1.02161f, 0.66157f, -0.16733f, 0.264921f, -0.242036f, + 0.214622f, 0.0712054f, -0.260377f, 0.0849665f, 0.735094f, + 0.11001f, 0.297301f, -0.333342f, 0.066978f, -0.123625f, + 1.07596f, 0.401263f, 0.0800875f, -0.340862f, -0.115587f, + -0.32692f, -0.300842f, 0.0277397f, 0.0630788f, -0.261198f, + 0.428695f, -0.0544757f, -0.124511f, 0.036992f, 0.126322f, + 0.0317603f, 0.0820762f, 0.117277f, -1.14594f, -0.108076f, + -0.0258198f, -0.00337525f, -0.00512531f, 0.1274f, -0.0660535f, + -0.640733f, 0.197142f, 0.147278f, 0.489271f, 0.226507f, + -0.0668414f, 0.0946318f, 0.0994164f, -0.820516f, 0.512939f, + -0.305172f, -0.715187f, -0.195125f, 0.279346f, 0.462144f, + 0.913882f, -0.453879f, 0.0582033f, -0.462866f, 0.0538736f, + 0.0115737f, 0.00626993f, -0.0185185f, 0.0114601f, -0.0181164f, + 0.41588f, -0.0447331f, 0.611756f, 0.43385f, 0.834465f, + 0.122019f, -0.352983f, 0.340429f, -0.245425f, -0.365328f, + -0.521825f, 0.0371057f, 0.172188f, -0.387949f, 0.221054f, + 0.0126359f, 0.422958f, 0.584198f, -0.581498f, -0.019466f, + -0.0271737f, -0.0740885f, 0.00540879f, 0.186086f, -0.0324402f, + -0.563462f, -0.458759f, -0.425296f, -0.0118862f, -0.641508f, + 0.0132084f, 0.0581128f, 0.0231444f, 0.468587f, 0.258838f, + 0.0296665f, 0.0562801f, 0.630014f, 0.381816f, -0.269761f, + -0.135515f, 0.046186f, 1.07632f, -0.050616f, 0.104987f, + 0.29991f, 0.119316f, 0.117248f, 0.0795009f, 0.242573f, + 0.0416634f, -0.0577639f, -0.0974078f, 0.106255f, -0.13098f, + 0.0141486f, -0.00418257f, 0.144848f, -0.463934f, 0.0452591f, + 0.252617f, 0.205222f, -0.189843f, 0.0652245f, -0.135386f, + 0.0500646f, -0.200368f, -0.0142312f, -0.0286832f, -0.254355f, + -1.02752f, -0.73549f, 0.0364518f, 0.0416227f, -0.13185f, + -0.0886515f, -0.502314f, -0.102916f, 0.410911f, -0.355655f, + 0.400416f, -0.340217f, 0.208829f, 0.245972f, 0.149739f, + -0.49458f, 0.589482f, 0.550827f, 0.912709f, -0.351275f, + -0.128076f, -0.285172f, -0.672752f, 0.090583f, -0.245286f, + -0.737297f, -0.201515f, -0.025122f, -0.109854f, 0.36738f +}; + +static const float av1_simple_motion_search_prune_rect_logits_bias_64[] = { + 0.346819f, 0.442965f, -0.0216032f, 0.0229235f, -0.402797f, + -0.666074f, -0.455388f, -0.00353411f, -0.595511f, -0.845667f +}; + +static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_64 = { + NUM_FEATURES_64, + NUM_LOGITS_64, + NUM_HIDDEN_LAYERS_64, + { + NUM_LAYER_0_UNITS_64, + }, + { + av1_simple_motion_search_prune_rect_layer_0_kernel_64, + av1_simple_motion_search_prune_rect_logits_kernel_64, + }, + { + av1_simple_motion_search_prune_rect_layer_0_bias_64, + av1_simple_motion_search_prune_rect_logits_bias_64, + }, +}; + +#undef NUM_HIDDEN_LAYERS_64 +#undef NUM_FEATURES_64 +#undef NUM_LAYER_0_UNITS_64 +#undef NUM_LOGITS_64 + +#define NUM_HIDDEN_LAYERS_32 1 +#define NUM_FEATURES_32 25 +#define NUM_LAYER_0_UNITS_32 28 +#define NUM_LOGITS_32 10 + +static const float av1_simple_motion_search_prune_rect_logits_kernel_32[] = { + 0.486581f, 0.340847f, -0.109226f, 0.467224f, -0.541561f, + 0.0943619f, 
-0.429442f, -0.207442f, 0.959963f, 0.618666f, + -0.0636751f, 0.144508f, -0.0278289f, 0.332293f, -0.751493f, + 0.245438f, -0.917758f, 0.612128f, -0.32648f, 0.534618f, + -0.615239f, 2.71641f, 0.233759f, 0.820558f, -0.249758f, + -0.427783f, -0.359361f, 0.0375732f, 0.806973f, 0.352512f, + -0.0532192f, 0.0576861f, -0.464178f, -0.334877f, -0.697042f, + 0.0538218f, 0.0919659f, -0.00765812f, 0.0603847f, -0.460315f, + 0.37979f, -0.0867612f, -0.670683f, -0.188619f, -0.570586f, + 0.233418f, 0.153581f, 0.290905f, -0.624885f, -0.557842f, + -0.555567f, 0.463773f, -0.123909f, -0.277731f, 0.0374468f, + 0.409903f, 0.287638f, -0.593066f, -0.223434f, 0.154263f, + -0.250464f, -0.077696f, 0.229652f, -0.304174f, 0.308053f, + 0.33155f, -0.502825f, 0.361216f, -0.499294f, 0.00595444f, + -0.307201f, 0.5766f, -0.438384f, -0.093701f, -0.118586f, + 0.202337f, -0.486623f, 0.261552f, 0.139756f, -0.655642f, + -0.0627001f, -0.213053f, -0.243037f, 0.205918f, 0.0718368f, + 0.188041f, 0.141529f, -0.132239f, 0.425827f, -0.218353f, + 0.153114f, 0.33268f, 0.0226116f, 0.167394f, 0.269854f, + -0.457001f, 0.1973f, -0.526087f, 0.467528f, 0.290934f, + 1.16267f, 0.0823663f, -0.754389f, -0.83716f, 0.270157f, + -1.41229f, 0.148511f, -0.286832f, 0.664796f, 0.492254f, + 0.360567f, -0.533993f, 0.0435672f, -0.103001f, 0.220668f, + 0.594621f, -0.0213356f, -0.347638f, -0.694457f, 0.0759505f, + 0.161358f, -0.389384f, -0.0455192f, -0.61252f, -0.174173f, + -0.00788878f, -1.22487f, 0.332233f, -0.0457021f, -0.225918f, + -0.197657f, -0.115408f, -0.240589f, -2.05681f, 0.00914629f, + -1.92213f, 0.0268578f, -0.49076f, -0.0120123f, 0.291157f, + 0.267116f, -0.0775724f, 0.181115f, -0.392441f, -0.488114f, + -0.28842f, -0.115465f, 0.128974f, -0.0829899f, -0.14096f, + -0.140145f, -0.700281f, 0.0368945f, -0.437598f, 0.243485f, + -1.00301f, 0.332324f, 0.125014f, -0.0604481f, -0.0652028f, + -0.207295f, -1.0209f, -0.341525f, 0.191326f, -0.147578f, + 0.0878327f, 0.129827f, -0.0848319f, 0.187381f, -1.28663f, + 0.00537885f, -0.134277f, -0.0411126f, -0.3434f, -0.0456494f, + 0.37861f, 0.409095f, 0.237177f, -0.396855f, -0.205418f, + -1.31701f, -0.319032f, -0.123404f, -0.240005f, -0.305206f, + -0.0258176f, -0.26367f, -0.142396f, 0.191672f, -1.44061f, + 0.0554776f, -0.571839f, -0.284789f, -0.425677f, -0.0307376f, + 0.20275f, -0.223146f, 0.144612f, 0.0212636f, 0.0238303f, + -0.253802f, -0.188922f, -0.0637066f, -0.340836f, 0.124774f, + 0.130474f, -0.154099f, -0.0292733f, 0.158148f, -0.246989f, + -0.259059f, 0.220224f, 0.228449f, -0.41956f, -0.321848f, + -0.2396f, -0.316449f, -1.3363f, 0.0264099f, -1.46865f, + 0.113073f, 0.0722885f, -0.166986f, -0.164877f, 0.0360911f, + 0.534472f, -0.551152f, -0.328501f, 0.0781121f, -0.378112f, + -0.459502f, 0.28015f, -0.212302f, -0.521641f, 0.618993f, + -0.347709f, 0.266253f, -0.0280894f, 0.348511f, -0.0155031f, + -0.100693f, 0.0447673f, 0.277519f, -0.233998f, -0.0796738f, + -1.73644f, -0.160776f, 0.53092f, -0.180406f, 0.056447f, + 0.385356f, -0.262337f, -0.241479f, -0.271426f, -0.457354f, + -0.266788f, 0.367371f, -0.103065f, 0.47783f, -0.188327f, + -0.159636f, 0.00142907f, -0.409756f, 0.454889f, -0.24566f, + -0.0760084f, 0.286355f, 0.462102f, 0.0431695f, -0.127395f, + -0.200476f, -0.350557f, 0.217275f, -0.23975f, 0.255148f, + -0.280626f, 0.42476f, 0.157411f, 0.0358675f, -0.192591f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_bias_32[] = { + 0.940498f, 0.15602f, -0.234831f, 0.0268585f, 0.144769f, 0.243081f, + 0.611406f, 0.366093f, 0.361868f, 0.39668f, 0.401479f, 0.369467f, + 0.0909503f, 0.710595f, 0.032786f, 
0.525891f, -1.0232f, 0.732557f, + -0.064425f, 0.865222f, -0.042917f, -0.237191f, -0.527006f, -0.0172101f, + 0.59681f, -0.472405f, 0.0969218f, -0.250624f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_kernel_32[] = { + 0.355607f, 0.126701f, -0.0825159f, 0.200675f, -0.011308f, + -0.280057f, 0.559816f, 0.142689f, 0.0422419f, -0.151692f, + -0.0275637f, -0.283101f, -0.20822f, -0.200394f, 0.465427f, + 0.344491f, -0.525319f, -0.358813f, -0.39767f, 0.0974486f, + 0.00559058f, -0.00546089f, 0.0506486f, 0.114475f, -0.0436463f, + -0.574152f, -0.376294f, 0.16563f, -0.0967032f, 0.00579838f, + 0.0639909f, -0.037129f, 0.407574f, -0.231428f, 0.489326f, + -0.221566f, -0.270382f, -0.784628f, -0.155502f, 0.481698f, + -0.0296057f, 0.431855f, 0.840807f, 0.112291f, 0.773874f, + -0.0610936f, -0.012892f, 0.365154f, 0.0267687f, -0.0751114f, + 0.25043f, 0.516472f, -0.186133f, -0.12762f, -0.168804f, + -0.146309f, 0.139314f, -0.367113f, -0.601079f, 0.0559856f, + 0.176081f, 0.22397f, 0.434113f, 0.0363256f, 0.313051f, + 0.0143976f, 0.190076f, 0.474607f, -0.681134f, -0.0709097f, + -0.253289f, -0.216277f, -0.0593789f, -0.107795f, -0.194842f, + 0.513945f, 0.239171f, -0.720561f, 0.0136723f, -0.391147f, + -0.272043f, -0.164766f, 0.124248f, 0.147178f, -0.35497f, + 0.397725f, -0.117603f, 0.262937f, -0.331964f, 0.182418f, + 0.315671f, -0.0385649f, 0.488769f, -0.334568f, 0.00596018f, + 0.0661557f, -0.0446985f, -0.0928255f, -0.0221032f, -0.019045f, + -0.20881f, 0.197907f, -0.381881f, 0.0598071f, -0.0434551f, + 0.159283f, -0.110631f, 0.266996f, -0.0265494f, 0.135199f, + -0.00833162f, 0.804482f, -0.114698f, -0.15066f, -0.479553f, + 0.448407f, -0.344069f, -0.0280952f, -0.208211f, -0.102269f, + -0.679066f, -0.37476f, -0.0228875f, 0.0535049f, 0.111015f, + -0.18125f, -0.167584f, 0.0110497f, 0.262723f, -0.413839f, + -0.0611238f, 0.358499f, 0.0807514f, 0.208254f, 0.214499f, + 0.11137f, -0.14262f, -0.0513973f, 0.243718f, -0.373716f, + -0.00413366f, 0.216501f, -0.164149f, -0.064935f, -0.0840282f, + 0.0566148f, 0.0377686f, 0.289835f, 0.769388f, 0.891198f, + -0.592739f, 0.40744f, -0.153095f, 0.657311f, 0.140737f, + 0.28209f, 0.158344f, 0.353546f, 0.0868246f, 0.116887f, + 0.402004f, 0.437184f, 0.589219f, 0.760594f, -0.575419f, + -0.754308f, -0.709219f, -0.297814f, -0.418609f, -0.0262104f, + 0.0411959f, 0.0597708f, -0.143728f, -0.136642f, 0.099614f, + -0.257601f, -0.2404f, 0.305893f, 0.254009f, -0.0301398f, + -0.0653091f, -0.459002f, -0.163404f, 0.123152f, -0.0284252f, + -0.457272f, 0.00788622f, -0.828399f, -0.0534199f, 0.586877f, + 0.982728f, 0.424581f, 0.0891856f, 0.383182f, -0.122053f, + 0.0808408f, -0.00384914f, -0.0560201f, -0.0524772f, -0.263444f, + -0.239287f, -0.882777f, 0.0180592f, -0.0948711f, -0.177946f, + 0.0296473f, 0.096082f, 0.0455604f, -0.108608f, 0.00777951f, + -0.140896f, 0.117187f, -0.342467f, -0.0691604f, 0.0761611f, + -0.0892053f, 0.111386f, -0.167456f, 1.40616f, -0.00478793f, + 0.00547665f, -0.0441829f, 0.0151323f, -0.0674099f, -0.0380578f, + 0.16072f, 0.31882f, 0.245486f, -0.424318f, 0.101845f, + -0.203343f, -0.197402f, -0.163025f, -0.0771961f, -0.264435f, + 0.319429f, 0.250076f, 0.782726f, 0.386003f, 0.00700673f, + -0.375715f, 0.151453f, -0.296265f, -0.560183f, -0.00767249f, + -0.109593f, -0.119419f, -0.0161516f, 0.0380283f, -0.156417f, + 0.131708f, 0.396268f, -0.221796f, 0.232099f, 0.128852f, + 0.0567268f, 0.297297f, 0.173269f, 0.213411f, 0.0384426f, + -0.290985f, -0.0426841f, -0.488292f, -0.087101f, -0.311582f, + 0.83009f, -0.153163f, 0.903335f, -1.15644f, -0.0378635f, + -0.0552129f, -0.126362f, 
-0.176945f, 0.0653115f, 0.0989368f, + -0.333543f, -0.330586f, 0.29775f, -0.103535f, 0.210824f, + -0.00300509f, 0.317105f, 0.216852f, 0.479718f, 0.0485808f, + -0.15662f, 0.718199f, 0.327513f, 0.115169f, -0.423598f, + -0.456633f, -0.575814f, -0.494454f, 0.304411f, 0.0493055f, + -0.381171f, 0.467251f, -0.122872f, -0.167441f, 0.017253f, + -0.0583646f, -0.1586f, 0.214046f, -0.0284424f, -0.217112f, + 0.606567f, -0.107533f, 0.36615f, -0.0709227f, 0.604761f, + -0.244657f, -0.296651f, -0.595611f, -0.156629f, -0.693468f, + -0.310603f, 0.499272f, 0.282941f, 0.295043f, -0.178704f, + 0.281186f, 0.014329f, -0.120819f, 0.154234f, 0.0131325f, + -0.472231f, -0.631281f, 0.422955f, 0.711432f, -0.118025f, + 0.0864996f, 0.343971f, -0.301477f, -0.246638f, 0.165068f, + 0.218044f, 0.224236f, -0.0848522f, 0.00671216f, 0.401141f, + -0.218857f, -0.0298495f, -0.135725f, -0.377618f, 0.022473f, + 0.106955f, -0.0582005f, 0.0468484f, -0.0217442f, 0.130911f, + -0.0926905f, 0.383007f, -0.159353f, -0.222711f, -0.0286419f, + 0.372315f, -0.469095f, 0.797571f, -0.301315f, 0.239327f, + -0.997507f, -0.363409f, 0.353717f, 0.676686f, -0.0500028f, + 0.0638539f, -0.431927f, 0.243852f, 0.000884826f, -0.00166585f, + 0.0613292f, -0.029558f, -0.0248432f, -0.0125607f, -0.0309674f, + -0.743308f, 0.0409806f, 0.0921015f, 0.167816f, 0.406849f, + 0.095677f, 0.0308913f, 0.139956f, -0.400472f, 0.396617f, + 0.936517f, 0.355057f, -0.423816f, -0.232472f, -0.220188f, + -0.399746f, -0.409623f, -0.158797f, 0.361153f, 0.0327019f, + 0.0690844f, -0.032197f, 0.0248558f, 0.00438518f, 0.0222724f, + -0.326832f, -0.314295f, 0.156563f, 0.0562703f, 0.332694f, + 0.299424f, 0.228206f, 0.322038f, 0.0136098f, 0.0060297f, + -0.165851f, -0.306512f, 0.0796508f, -0.37158f, 0.239395f, + -0.349442f, 0.198515f, -0.253854f, -1.13694f, 0.0202873f, + -0.0504009f, -0.130528f, -0.017126f, -0.0370001f, -0.087458f, + -0.119952f, -0.130404f, 0.0333733f, -0.184736f, 0.182162f, + 0.227776f, -0.166563f, -0.156162f, 0.118215f, -0.220183f, + 0.00474779f, -0.107792f, 0.260493f, 0.11884f, 0.156587f, + 0.303936f, -0.131788f, -0.314774f, 0.310606f, 0.0935523f, + 0.790767f, 0.26461f, 0.0236426f, 0.0629469f, 0.0344072f, + -0.151513f, 0.211498f, 0.0245435f, 0.0629973f, 0.052019f, + -0.03308f, 0.123487f, 0.0885027f, 0.159172f, -0.0510615f, + 0.0298033f, -0.130515f, -0.121799f, -0.104915f, 0.208822f, + -0.310496f, -0.314106f, 0.303307f, -0.0196736f, 0.0420045f, + 0.461777f, -0.433699f, 0.00345407f, 0.703139f, -0.655637f, + -0.210767f, -0.201278f, 0.163694f, -0.236534f, 0.300877f, + 0.0769982f, -0.282453f, 0.149721f, -0.0303466f, -0.191473f, + -0.406056f, -0.213472f, 0.1619f, -0.245953f, 0.00544399f, + -0.121434f, 0.193012f, -0.307165f, 1.45431f, -0.161468f, + -0.12444f, -0.146129f, -0.0528212f, -0.0925165f, -0.134528f, + -0.479475f, 0.315525f, 0.133845f, 0.382158f, -0.0799693f, + -0.151041f, 0.255772f, 0.409536f, -0.240663f, -0.323741f, + -0.205876f, 0.03699f, -0.217541f, 0.108511f, 0.640628f, + 0.705993f, -0.423899f, -0.78314f, -0.100733f, -0.00859087f, + 0.0251879f, 0.0458335f, 0.00210128f, -0.047576f, -0.0560518f, + -1.23869f, -0.829914f, 0.0346551f, 0.350505f, 0.193688f, + 0.459154f, 0.137898f, 0.503818f, 0.260867f, 0.649539f, + 0.0150802f, 0.0239274f, -0.276069f, -0.0621478f, -0.193106f, + -0.0375665f, -0.654529f, 0.189493f, 0.446625f, -0.0208265f, + 0.019838f, -0.0201955f, 0.00180428f, -0.0110678f, -0.0172414f, + 0.0276489f, -0.252882f, -0.0351807f, -0.0518874f, 0.279098f, + -0.245122f, 0.101287f, -0.114202f, -0.0812187f, 0.572429f, + -0.0821731f, 0.564183f, 0.0222552f, 0.190111f, 
-0.0417497f, + -0.00385925f, -0.182995f, -0.240482f, -0.291572f, -0.0450444f, + 0.0962974f, -0.165973f, -0.0954637f, -0.163841f, -0.833405f, + -1.31541f, -0.336473f, -0.0920702f, 0.816105f, 0.393377f, + 0.0340241f, -0.0844545f, 0.61729f, -0.17596f, 0.241149f, + -0.42825f, -0.59091f, -0.290702f, 0.0796465f, 0.0982819f, + 0.466934f, 0.261666f, 0.0373333f, 0.332509f, -0.0266694f, + -0.0476951f, -0.00642167f, -0.0132542f, -0.000320841f, 0.00475532f, + 0.000502778f, 0.296534f, -0.13297f, -0.113082f, -0.327923f, + 0.35901f, -0.302246f, 0.189799f, -0.37994f, 0.16107f, + -0.20414f, 0.548575f, -0.460821f, 0.591878f, -0.213113f, + -0.169373f, -0.07332f, 0.228841f, 0.682302f, -0.0665316f, + -0.142456f, -0.0873117f, 0.00607451f, 0.0376443f, 0.0536673f, + -0.0109536f, -0.400279f, 0.550058f, 0.820871f, -0.666373f, + -0.471962f, -0.315925f, -0.313142f, 0.952742f, 0.473928f, + -0.119006f, 0.153241f, -0.0383078f, 0.631869f, -0.343423f, + -0.233473f, -0.218195f, -0.077688f, -0.728291f, 0.0382408f, + -0.00662886f, -0.0419666f, 0.0309776f, -0.0281592f, 0.0154229f, + -0.198534f, 0.0206324f, 0.0152272f, -0.235067f, 0.0330486f, + 0.139198f, -0.0612118f, 0.133154f, -0.258675f, 0.0900275f, + -0.127771f, 0.157322f, -0.00767807f, -0.329258f, 0.327458f, + 0.0528581f, -0.181125f, 0.409995f, -0.162979f, -0.0193475f, + 0.186009f, 0.0519501f, 0.651877f, -0.37821f, -1.10341f, + -0.189776f, -0.0922788f, 0.460256f, 0.168011f, 0.440295f, + 0.478135f, 0.374573f, 0.384048f, 0.116953f, 0.68886f, + -0.427727f, -0.36676f, -0.500013f, -0.228685f, -0.218859f, + 0.208396f, -0.0173765f, -0.0680241f, -0.00538013f, -0.0674409f, + -0.092764f, 0.0295707f, -0.0462887f, -0.00636006f, 0.0334169f +}; + +static const float av1_simple_motion_search_prune_rect_logits_bias_32[] = { + 0.176459f, 0.154405f, 0.281821f, 0.375264f, -0.882863f, + -0.240261f, -1.17075f, -0.280216f, -0.743836f, -0.317511f +}; + +static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_32 = { + NUM_FEATURES_32, + NUM_LOGITS_32, + NUM_HIDDEN_LAYERS_32, + { + NUM_LAYER_0_UNITS_32, + }, + { + av1_simple_motion_search_prune_rect_layer_0_kernel_32, + av1_simple_motion_search_prune_rect_logits_kernel_32, + }, + { + av1_simple_motion_search_prune_rect_layer_0_bias_32, + av1_simple_motion_search_prune_rect_logits_bias_32, + }, +}; + +#undef NUM_HIDDEN_LAYERS_32 +#undef NUM_FEATURES_32 +#undef NUM_LAYER_0_UNITS_32 +#undef NUM_LOGITS_32 + +#define NUM_HIDDEN_LAYERS_16 1 +#define NUM_FEATURES_16 25 +#define NUM_LAYER_0_UNITS_16 32 +#define NUM_LOGITS_16 10 + +static const float av1_simple_motion_search_prune_rect_logits_kernel_16[] = { + -0.520913f, 0.395611f, 0.0369091f, -0.318591f, -0.463252f, + 0.134992f, -0.43154f, -0.0739112f, -0.118817f, 0.476373f, + -0.281406f, 0.3413f, 0.456255f, 0.33307f, 0.2942f, + 0.1317f, 0.498113f, 1.95406f, -0.165726f, -0.219306f, + -0.302656f, -1.31157f, -0.433662f, 0.151716f, -0.214817f, + 0.504523f, -0.710049f, 0.359616f, -0.412695f, -0.103193f, + 0.341912f, 0.351378f, -0.181486f, 0.573862f, -0.0396254f, + -0.17855f, -0.276163f, 0.0367465f, -0.353905f, -0.204689f, + 0.309581f, -0.0439686f, -0.147855f, 0.152745f, 0.290871f, + 0.131049f, -0.27808f, -0.142997f, 0.207843f, -1.23074f, + -0.267714f, -0.336923f, 0.313781f, -0.61488f, -0.161984f, + 0.238059f, -0.0879942f, -0.085543f, -0.260156f, -0.13614f, + -0.242196f, 0.201216f, -0.248691f, 0.0936671f, -0.350522f, + -0.35002f, -0.156583f, -0.00579001f, 0.300578f, -0.341269f, + -0.290712f, 0.354802f, -0.31629f, 0.509107f, -0.236953f, + -0.0923519f, 0.544509f, -0.280991f, -0.017437f, 
-0.202721f, + -0.116388f, -0.7191f, 0.324586f, 0.254249f, 0.125505f, + 0.00658697f, -0.333322f, -0.126537f, -0.140004f, -0.0241202f, + -0.172466f, 0.210035f, -0.270833f, 0.0579044f, 0.0950352f, + -0.120382f, 0.063292f, -0.394925f, 0.482165f, 0.147753f, + 0.331465f, -0.187444f, 0.1083f, 0.414028f, 0.279238f, + -0.486889f, -0.674349f, -0.313656f, -0.131186f, -0.100662f, + 0.238191f, -1.19083f, -0.30667f, -2.4324f, 0.235311f, + 0.108605f, 1.67197f, 0.476157f, 0.30055f, 0.0839538f, + 0.408469f, -0.473517f, 0.560283f, -0.0188136f, 0.273824f, + -0.43707f, -0.0346978f, -0.438315f, -0.0196275f, -0.0567921f, + -0.220166f, 0.216175f, -0.0180461f, 0.0116429f, -0.0096949f, + -0.32613f, 0.176829f, -0.243563f, -0.240972f, -0.621819f, + -0.00619648f, -0.145525f, 0.124324f, -0.0306925f, 0.172208f, + -2.04631f, -0.200087f, -0.594135f, -0.352303f, -0.309826f, + 0.0922786f, -0.698371f, -0.0366823f, 0.0244036f, 0.338775f, + -0.115947f, 0.144971f, -0.0607037f, -0.762412f, 0.0125584f, + -0.262427f, -0.0830273f, -0.291252f, -0.176059f, -0.203983f, + 0.0871455f, -0.0894925f, 0.0426263f, -0.060001f, -0.542355f, + -0.407837f, -0.0419273f, 0.226608f, -0.114844f, 0.158733f, + -0.187237f, 0.113163f, -1.86337f, -0.367544f, -0.547048f, + -0.24192f, -0.226764f, 0.090912f, 0.819604f, 0.433766f, + -0.841657f, 0.446987f, -0.622761f, -0.0296385f, -0.130176f, + -0.0518136f, -0.640326f, -0.330107f, -0.137832f, -0.0119033f, + 0.39401f, 0.111331f, -0.141367f, -0.230289f, 0.171054f, + -0.924059f, -0.107317f, -0.347983f, 0.0261109f, 0.423002f, + -0.305817f, 0.247696f, 0.0436002f, 0.0305862f, -1.52448f, + -0.595587f, -0.155552f, -1.11949f, -0.513937f, 0.138347f, + -0.301487f, 0.352144f, -0.615801f, 0.0326701f, -0.215322f, + -0.0608176f, -0.416557f, -0.306073f, -0.441512f, -0.0569277f, + -0.709768f, -0.602527f, -0.311134f, 0.152471f, -0.255299f, + 0.354505f, 0.194464f, 0.0144251f, 0.110732f, -0.4452f, + -0.804814f, 0.205325f, -0.0957486f, 0.502684f, 0.09112f, + -0.533087f, -1.77979f, 0.556992f, -0.176157f, -0.642633f, + 0.11553f, -0.232561f, 0.161277f, -0.0631125f, -0.20759f, + 0.489253f, -0.067533f, 0.0231024f, -0.179831f, -0.272985f, + -0.390059f, 0.3089f, 0.185733f, -0.257065f, -0.508838f, + -0.550028f, 0.0665621f, -0.138288f, -0.413188f, 0.191193f, + -1.32969f, -0.431025f, 0.270242f, -0.340062f, 0.0817257f, + 0.0376051f, -0.18633f, 0.0828274f, 0.00670051f, -0.431295f, + -0.450316f, -0.173042f, -0.322248f, 0.370628f, 0.10019f, + 0.317293f, -0.266613f, 0.0752441f, -0.425656f, -0.112223f, + 0.557991f, -0.324368f, -0.195261f, -0.0526129f, -0.807472f, + -0.387466f, 0.192186f, 0.353213f, -0.120238f, 0.107686f, + 0.200678f, -0.75363f, 0.466857f, -0.282345f, -0.0849236f, + -0.0490695f, -0.00643182f, 0.123047f, -0.207805f, -0.130456f, + -1.09455f, 0.340973f, 0.334784f, 0.0706643f, -1.65681f, + -0.319952f, -0.198514f, -0.0787972f, 0.089524f, 0.0531034f, + -0.202705f, -0.0852339f, -0.62572f, -0.0734234f, -0.838088f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_bias_16[] = { + -0.0616197f, 0.939947f, 0.521161f, 0.213886f, 0.130324f, -0.127443f, + -0.0538715f, 0.708746f, 0.445031f, 0.418781f, -0.114539f, 0.521941f, + 1.13719f, 0.606545f, -0.32193f, -0.150788f, 0.158487f, -0.224005f, + 0.654715f, 0.115729f, -0.286506f, -2.06223f, 0.0117697f, 0.503905f, + -0.102339f, 0.653256f, -0.813561f, 0.905235f, -0.417269f, -0.206265f, + 0.661496f, 0.95533f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_kernel_16[] = { + -0.203489f, 0.00686229f, -0.161414f, 0.0637276f, 0.27516f, + 0.512219f, 0.164205f, 
0.00326062f, -0.41914f, -0.400334f, + 0.554419f, 0.715772f, -0.295569f, -0.703503f, 0.0137744f, + -0.0934259f, 0.174234f, -0.148618f, -0.0360558f, -0.0986598f, + -0.138502f, -0.0770713f, 0.122922f, -0.00784415f, 0.0953234f, + -0.255754f, -0.310967f, 0.185306f, 0.464554f, 0.147338f, + -0.0612304f, 0.164783f, 0.301097f, 0.161364f, -0.12723f, + -0.0265984f, -0.471361f, 0.0578776f, -0.362865f, 0.425789f, + 0.402758f, -0.190235f, 0.00549738f, -0.570908f, 1.27206f, + 0.048868f, -0.0097675f, 0.0708324f, 0.0456103f, 0.0149062f, + -0.563032f, -0.420573f, 0.107278f, 0.0938258f, 0.142712f, + -0.00251036f, -0.250583f, 0.522272f, 0.0113175f, 0.126751f, + -0.433028f, -0.035542f, -0.536686f, -0.0668722f, 0.253094f, + 0.254007f, -0.435505f, 0.343001f, 0.0531542f, -0.361914f, + -0.102664f, 0.0404874f, 0.132686f, 0.0762298f, 0.0236971f, + -0.419454f, 0.230877f, -0.223714f, 0.037813f, 0.0818604f, + 0.383705f, -0.235028f, -0.0554801f, 0.429851f, 0.0845829f, + 0.166295f, 0.355111f, -0.421197f, 0.298949f, 0.0218224f, + 0.445705f, -0.392217f, -0.429578f, -0.076276f, -0.0963531f, + -0.631425f, -0.225977f, 8.06349e-06f, 0.0676679f, 0.0779651f, + 0.0706891f, 0.101377f, 0.517103f, 0.0945502f, -0.52522f, + -0.312022f, 0.0358089f, 0.616509f, -0.0507444f, -0.465814f, + -0.0326024f, 0.591298f, 0.188544f, -0.0633316f, -0.199987f, + 0.403118f, -0.511281f, -0.696263f, 0.112996f, 0.103875f, + 0.0495595f, -0.0107449f, 0.521539f, -0.0123823f, -0.0642751f, + 0.08548f, -0.0679207f, 0.526558f, 0.0651114f, -0.342643f, + -0.349934f, 0.307437f, 0.368763f, -0.194851f, -0.134117f, + 0.102448f, -0.0520666f, 0.0415824f, -0.175085f, 0.272685f, + 0.0675856f, 0.120627f, 0.391408f, -0.135249f, -0.357024f, + 0.019666f, -0.0622677f, 0.407427f, 0.22655f, -0.129432f, + -0.165327f, 0.004893f, 0.5479f, 0.0613981f, -0.479682f, + -0.144228f, -0.130106f, 0.206458f, -0.342086f, 0.12691f, + -0.113554f, 0.231164f, -0.051419f, 0.0401286f, -0.560429f, + -0.070609f, 0.420232f, 0.442465f, -0.237501f, -0.000293732f, + -1.017f, -0.210222f, 0.0157063f, 0.0488178f, 0.0734721f, + -0.52626f, -0.276441f, -0.521579f, 0.443532f, -0.0819051f, + -0.0732633f, -0.17999f, 0.258525f, -0.0374872f, 0.150115f, + 0.0510939f, 0.168116f, 0.473372f, 0.824489f, 0.302195f, + -0.348613f, 0.238569f, 0.176444f, -0.633945f, -0.0567195f, + -0.0305827f, -0.0551851f, 0.85822f, -0.0628099f, 0.0364294f, + -0.234823f, 0.179067f, 0.143208f, -0.0511014f, -0.404191f, + 0.428035f, 0.0235506f, 0.371991f, -0.312909f, 0.550933f, + -0.389265f, -0.271813f, -0.293461f, -0.583752f, 0.179991f, + 0.191698f, 0.659094f, 1.07941f, -0.509555f, -0.100638f, + 0.079988f, -0.0519107f, -0.112723f, -0.0663326f, 0.0353569f, + -0.795055f, -0.465999f, 0.283579f, 0.340913f, 0.152738f, + 0.294664f, 0.527839f, 0.187735f, 0.359461f, 0.164629f, + 0.107512f, 0.390402f, 0.236702f, 0.114674f, -0.525655f, + -0.555476f, -0.6589f, -0.266601f, -0.0946547f, 0.6306f, + 0.0248513f, 0.038497f, 0.432706f, -0.0715465f, 0.0410172f, + -0.115313f, -0.428684f, 0.136283f, 0.0913185f, 0.11277f, + 0.0968689f, -0.00437052f, 0.0888981f, 0.10304f, 0.02442f, + -0.211315f, 0.00981596f, -0.0974827f, 0.208611f, 0.140644f, + 0.0315567f, 0.350332f, -0.291049f, -0.0715449f, -0.352992f, + -0.858004f, 0.828658f, 0.439092f, 0.0151291f, 0.0503828f, + 0.0656112f, -0.710749f, -0.0951757f, 0.193908f, 0.00908018f, + 0.141486f, -0.0657711f, 0.099791f, 0.153729f, -0.419576f, + -0.892636f, -0.0449268f, -0.170786f, -0.156564f, 0.384511f, + 0.296565f, 0.0569815f, -0.103938f, 1.27479f, -0.0406475f, + 0.154083f, -0.186442f, 0.0282588f, 0.0312102f, -0.188994f, + 
0.284243f, -0.564693f, 0.425525f, -0.00924596f, 0.810003f, + 0.233812f, -0.0180273f, 0.121082f, -0.209096f, 0.151437f, + 0.286921f, -0.348095f, 0.174813f, -0.413798f, 0.108994f, + -0.34266f, -0.0337981f, -0.459f, -0.409812f, -0.0890104f, + 0.0834802f, -0.00259191f, -0.105914f, -0.164207f, 0.0697689f, + -0.312098f, -0.00650536f, -0.486758f, -0.248486f, 0.24314f, + -0.0857144f, 0.0884781f, -0.65615f, -0.121744f, 0.0709335f, + -0.0237193f, 0.10764f, -0.0409452f, -0.0824305f, 0.42329f, + 0.138258f, 0.502607f, 0.228545f, 0.0687789f, 0.0361586f, + 0.39074f, 0.0722654f, -0.0133148f, 0.283278f, 0.0743384f, + 0.310292f, -0.297675f, -0.359935f, 0.521021f, -0.10082f, + -0.272333f, 0.0120283f, 0.138118f, -0.123711f, -0.0711386f, + 0.0170747f, 0.831039f, 0.0509626f, 0.790608f, -0.0863406f, + -0.31962f, 0.0631013f, 0.0873453f, -0.472331f, -0.0826027f, + -0.241722f, 0.148835f, -0.131611f, 0.000195347f, -0.0615804f, + -0.838663f, -0.586979f, 0.247713f, 0.362254f, 0.492727f, + -0.132163f, 0.0516545f, 0.477838f, -0.0395182f, 0.0124993f, + -0.771514f, 0.0386912f, -0.118525f, -0.346172f, -0.265905f, + -0.175257f, -0.406287f, 0.393837f, 0.409096f, -0.408501f, + -0.0207146f, 0.0487809f, 0.0636982f, 0.0276368f, 0.0878249f, + 0.0425889f, 0.0868633f, 0.17423f, -0.128217f, -0.477068f, + -0.321294f, 0.0393771f, 0.00812823f, -0.350529f, -0.129012f, + 0.439953f, 0.396662f, 0.410475f, -0.123129f, -0.565966f, + 0.0298635f, -0.614611f, -0.477514f, 0.453651f, 0.0617068f, + 0.0530563f, 0.0479074f, 0.213551f, 0.039034f, 0.0449095f, + -1.06868f, -1.2654f, -0.175482f, 0.595068f, -0.230095f, + 0.719838f, -0.272148f, 0.696564f, 0.0485396f, 0.468584f, + 0.0695439f, -0.0842122f, -0.228978f, 0.161397f, -0.000441421f, + -0.0297514f, -0.250599f, 0.196656f, 0.608423f, -0.0112096f, + 0.0236881f, -0.00167311f, 0.0040709f, 0.015495f, 0.00757698f, + -0.165886f, 0.359767f, -0.0214696f, 0.377208f, 0.0303547f, + 0.0657094f, 0.140775f, 0.21867f, -0.203922f, 0.263878f, + -0.0529099f, 0.202438f, -0.243226f, 0.156659f, -0.627056f, + -0.845036f, -0.500873f, 0.172588f, 0.402972f, -0.147734f, + 0.151792f, -0.075579f, 0.443519f, 0.0311335f, -0.0328222f, + -0.0299781f, 0.435956f, -0.0987376f, 0.288402f, 0.135902f, + -0.173584f, -0.186255f, 0.224524f, -0.249645f, 0.123702f, + -0.0846244f, 0.491317f, 0.544846f, 0.338677f, -0.258885f, + -0.617434f, -0.629003f, -0.347233f, 0.181262f, -0.0606015f, + -0.537766f, 0.215089f, -0.334527f, 0.0488534f, 0.0577997f, + -1.12431f, -0.932292f, -0.11559f, 0.573715f, 0.151128f, + 0.693818f, -0.16956f, 0.802591f, -0.231531f, 1.04318f, + -0.476417f, 0.293452f, -0.610136f, 0.27506f, -0.384012f, + 0.305366f, -0.0540464f, -0.337583f, -0.174285f, 0.157248f, + 0.0477345f, -0.0229535f, 0.0475766f, -0.00603319f, 0.00856119f, + -0.702893f, -0.0579673f, 0.183024f, -0.166222f, 0.109763f, + -0.148019f, -0.258873f, -0.0820157f, -0.186716f, -0.449265f, + -0.0534138f, 0.15732f, 0.46357f, 0.00502591f, -0.0282085f, + 0.152277f, -0.855199f, -0.357115f, 0.0366159f, 0.0131101f, + -0.0407758f, 0.0462835f, 0.146309f, -0.00276278f, -0.0591814f, + -0.109437f, 0.506764f, -0.044421f, 0.465907f, 0.114444f, + -0.241053f, -0.362649f, -0.432615f, 0.199989f, -0.00635866f, + -0.521886f, 0.0958924f, -0.485725f, 0.0430527f, 0.069746f, + 0.681091f, -0.288144f, 0.505671f, 0.0489065f, -0.0373836f, + 0.266079f, 0.145173f, -0.011481f, -0.225074f, -0.754501f, + -0.122939f, -0.294213f, 0.334738f, 0.281561f, 0.558977f, + -0.21551f, -0.346507f, -0.0625635f, 0.0782034f, -0.236999f, + -0.803783f, -0.601117f, 0.091192f, 0.636122f, -0.250626f, + 0.0354961f, 
0.103915f, 0.508571f, 0.329911f, -0.0425999f, + -0.0867587f, -0.0385824f, 1.13914f, -0.0261992f, 0.00484478f, + 0.124603f, -0.012173f, -0.377358f, -0.243563f, 0.236094f, + 0.145663f, -0.132752f, 0.347497f, -0.529315f, 0.271632f, + -0.372805f, 0.0261836f, 0.126169f, 0.0941008f, 0.283773f, + 0.765701f, -0.226477f, -0.181549f, -0.306896f, 0.110165f, + -0.0784234f, -0.0827892f, -0.0374252f, -0.0950872f, -0.451015f, + -0.995793f, -0.452663f, 0.293338f, -0.380865f, 0.032683f, + 0.0178248f, 0.0699194f, -0.0811722f, -0.0866096f, 0.139289f, + 0.296604f, 0.192293f, -0.0589607f, -0.179878f, 0.00360266f, + -0.0905794f, 0.136744f, -0.191555f, 1.31877f, -0.0592033f, + -0.158766f, 0.0214746f, -0.190113f, -0.116671f, 0.0449292f, + -0.109533f, -0.709307f, 0.386424f, 0.40201f, 0.262211f, + -0.155244f, 0.233988f, -0.0166317f, 0.462665f, 0.0484462f, + 0.210902f, -0.352798f, 0.38698f, -0.228261f, -0.084309f, + -0.220751f, -0.170879f, -0.352617f, -1.24277f, 0.266004f, + -0.0125749f, -0.0380073f, 0.101838f, -0.0483024f, -0.0629178f, + -0.0695577f, -0.103439f, 0.242131f, -0.0796858f, 0.349718f, + -0.332045f, 0.0138352f, -0.380235f, -0.28717f, -0.176276f, + 0.865903f, 0.36593f, 0.243925f, -0.422289f, -0.117327f, + 0.21876f, 0.245393f, -0.426134f, -0.186077f, 0.0352515f, + -0.123742f, 0.249376f, 1.3281f, 0.0707771f, 0.071415f, + -0.286827f, -0.131691f, -0.270881f, -0.434378f, 0.376064f, + 0.35966f, 0.513374f, 0.439378f, -0.222716f, -0.5874f, + 0.487997f, -0.293271f, -0.184245f, -0.037256f, 0.17723f, + -0.438651f, 0.428184f, 0.112983f, -0.449287f, -0.0451963f, + 0.0854929f, 0.0735442f, -0.0148642f, -0.0586782f, -0.176455f, + -0.438979f, -0.127109f, 0.211478f, 0.388035f, -0.0372021f, + 0.220575f, 0.382144f, 0.302121f, 0.0857121f, 0.193445f, + -0.488858f, -0.195288f, -0.316184f, -0.314026f, -0.111956f, + 0.0744768f, 0.292709f, 0.30187f, -0.285506f, -0.105006f, + 0.0851402f, -0.082318f, 0.277518f, 0.725294f, -0.756304f, + 0.0155309f, -0.378542f, 0.293377f, -0.347252f, -0.338458f, + 0.221449f, -0.176443f, -0.131972f, 0.0129163f, -0.290649f, + 0.198596f, -0.0721333f, 0.620591f, 0.568736f, 0.174001f, + -0.205186f, -0.265606f, -0.249155f, 0.299163f, 1.11842f, + 0.17423f, 0.196417f, -0.014484f, 0.0735422f, 0.26329f, + 0.12284f, -0.750305f, -0.351337f, 0.121994f, -0.00542878f, + -0.295707f, -0.094124f, 0.300993f, 0.412408f, -0.170761f, + -0.0676329f, -0.106638f, -0.419785f, -0.43878f, 0.22421f, + 0.0339903f, 0.619851f, 0.0615381f, 0.514631f, 1.35424f, + -0.0679228f, -0.203457f, 0.131948f, -0.0041251f, -0.209054f +}; + +static const float av1_simple_motion_search_prune_rect_logits_bias_16[] = { + 0.304025f, 0.131887f, 0.259279f, -0.561564f, -0.161729f, + -0.208036f, 0.102206f, -0.162937f, -1.42311f, -0.708305f +}; + +static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_16 = { + NUM_FEATURES_16, + NUM_LOGITS_16, + NUM_HIDDEN_LAYERS_16, + { + NUM_LAYER_0_UNITS_16, + }, + { + av1_simple_motion_search_prune_rect_layer_0_kernel_16, + av1_simple_motion_search_prune_rect_logits_kernel_16, + }, + { + av1_simple_motion_search_prune_rect_layer_0_bias_16, + av1_simple_motion_search_prune_rect_logits_bias_16, + }, +}; + +#undef NUM_HIDDEN_LAYERS_16 +#undef NUM_FEATURES_16 +#undef NUM_LAYER_0_UNITS_16 +#undef NUM_LOGITS_16 + +#define NUM_HIDDEN_LAYERS_8 1 +#define NUM_FEATURES_8 25 +#define NUM_LAYER_0_UNITS_8 32 +#define NUM_LOGITS_8 4 + +static const float av1_simple_motion_search_prune_rect_logits_kernel_8[] = { + -0.266303f, -0.387676f, 0.204501f, -0.120842f, -0.0752326f, 0.0337739f, + 0.0243477f, -0.356748f, 
0.0143051f, -0.16403f, -0.139013f, 0.175003f, + -0.206754f, 0.349059f, 0.181763f, 0.212768f, -0.313783f, 0.182829f, + 0.00205376f, -0.939525f, -0.0992424f, 0.306254f, 0.083329f, -0.133137f, + -0.179022f, -0.0237902f, 0.0601026f, -0.216698f, -0.551149f, 0.081711f, + -0.442191f, 0.0680832f, -0.0353678f, 0.237704f, 0.23155f, -0.36097f, + 0.123389f, -0.288927f, 0.178133f, -0.152222f, -0.235648f, -0.0495293f, + -0.316522f, 0.034207f, 0.0463139f, -0.817825f, 0.417443f, -0.110984f, + -0.402371f, 0.0341694f, -0.37383f, 0.414532f, 0.093993f, 0.0039505f, + 0.0803175f, -0.511859f, -0.0154802f, 0.0979595f, 0.0909049f, -0.120938f, + -0.577382f, -0.155041f, -0.404295f, 0.122223f, -0.084703f, 0.00415336f, + 0.149135f, 0.113219f, 0.124236f, -0.240905f, 0.163909f, -0.154202f, + -0.208917f, 0.00200158f, -0.71796f, 0.105984f, -0.131996f, -0.539603f, + 0.223768f, -0.0710733f, -0.346679f, -0.0745909f, 0.171032f, 0.215701f, + 0.218519f, 0.105981f, -0.096209f, -0.166453f, -0.468894f, -0.401578f, + -0.239222f, 0.111382f, 0.38747f, -0.164734f, -0.175955f, 0.336621f, + -0.0305501f, -0.0576765f, 0.0672671f, -0.183692f, 0.412082f, -0.262951f, + -0.153429f, -0.128589f, -0.530472f, 0.0936412f, -1.08296f, -0.45147f, + 0.0714904f, -3.96842f, 0.438125f, -0.313945f, 0.231104f, -0.00183851f, + -0.0192768f, -0.637531f, -0.109296f, 0.0531702f, 0.00262162f, -0.615951f, + -0.546241f, -0.635305f, -0.0762367f, 0.0122019f, 0.423693f, -0.129142f, + -0.112242f, 0.295184f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_bias_8[] = { + -2.16023f, -3.12831f, -0.213206f, -2.97875f, -1.83791f, -2.84713f, + -0.909636f, -2.05893f, 0.00525274f, -1.51672f, -3.95017f, 1.82847f, + -0.853224f, -3.29503f, -0.537517f, 0.923106f, -3.18665f, -1.29905f, + 1.64506f, -1.99848f, -2.24315f, 0.408613f, 0.503671f, -3.83393f, + -2.88388f, -3.52337f, 1.46818f, -1.67169f, -3.83253f, 1.52644f, + -0.490783f, -0.415782f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_kernel_8[] = { + -0.702198f, -0.102148f, 0.0564545f, -0.0555548f, 0.16184f, + 0.0950792f, 0.136974f, -0.00824146f, 0.05746f, 0.0447542f, + 0.145978f, 0.0855769f, -0.041449f, 0.301347f, -0.0206691f, + -0.0662514f, -0.0525079f, -0.0998387f, -0.0891438f, 0.110545f, + -0.863098f, -1.83798f, 0.238818f, 0.127797f, 0.116872f, + -0.270655f, -0.21057f, 0.197013f, -0.123332f, 0.137104f, + -0.174766f, -0.00803025f, 0.0234369f, -0.0894175f, -0.0380927f, + 0.00827928f, -0.134148f, 0.110575f, -0.250173f, 0.116273f, + 0.0197749f, 0.270391f, 0.108437f, 0.173197f, -0.0650348f, + 0.0884626f, 0.262792f, 0.0649228f, 0.5573f, -2.81315f, + -0.479801f, -1.15825f, 0.0807932f, -0.19144f, 0.404016f, + -0.211521f, 0.233269f, -0.391414f, 0.160381f, -0.277233f, + 0.426354f, 0.156839f, 0.494315f, -0.214259f, -0.0132062f, + 0.148628f, -0.0899568f, 0.161845f, 0.467689f, 0.229474f, + 0.590634f, -0.705793f, -0.0486113f, -0.439088f, 0.994566f, + 0.679065f, 0.777869f, -0.225291f, -0.0303006f, -0.638782f, + -0.0824632f, -0.128561f, -0.327603f, 0.105624f, 0.567581f, + -0.396135f, -0.471028f, 0.181286f, 0.274604f, 0.180169f, + 0.0612144f, -0.865004f, 0.0306804f, 0.142985f, -0.0914358f, + -0.243284f, 0.358359f, -0.443847f, -0.371978f, 0.606933f, + -0.900408f, -0.52076f, 0.472118f, 0.0610973f, 0.152526f, + -0.550379f, 0.309331f, -0.141573f, 0.203046f, -0.231485f, + 0.505156f, 0.393224f, 0.435487f, -0.218681f, 0.123707f, + -0.270383f, -0.033565f, 0.210373f, -2.33967f, 0.367434f, + 0.0308118f, -0.205771f, 0.546141f, 0.19837f, 0.035648f, + -0.467007f, -1.50995f, -0.0314176f, 0.11762f, -0.15307f, + 
0.618257f, -0.139502f, 0.303386f, -0.00758681f, 0.228107f, + -0.594499f, -0.201984f, -0.239666f, 0.114878f, -0.922174f, + -0.530137f, -0.379366f, -0.319582f, 0.0889624f, -0.00544663f, + 0.316264f, -0.204262f, -0.0959358f, 0.23552f, 0.141369f, + -0.207129f, -1.04067f, -0.0780501f, 0.226768f, -0.246752f, + 0.0823105f, 0.114783f, 0.49315f, 0.0197732f, 0.705433f, + 0.158076f, -0.250584f, -0.157326f, -0.0439547f, -0.139047f, + 0.090531f, -0.38833f, 0.743143f, -1.47418f, -0.155009f, + 0.511466f, -0.726716f, -0.181075f, 0.450133f, -0.390204f, + 0.292725f, 0.00811462f, -0.347738f, 0.613381f, -0.237124f, + 0.750748f, -0.383123f, 0.410309f, -0.204166f, 0.667199f, + -0.313197f, 0.436059f, -0.607571f, 0.193681f, 0.409399f, + 0.631747f, -0.0454149f, 0.198232f, 0.345591f, -0.0137374f, + -0.307014f, -0.535515f, 0.764678f, -0.225686f, -0.451621f, + -2.75564f, -1.52877f, 0.0511933f, 0.905979f, 0.145029f, + 0.759615f, 0.130166f, 0.83827f, 0.0655081f, 1.07555f, + -0.529777f, 0.682967f, -0.412052f, 0.611947f, -0.83676f, + 0.940695f, -0.465681f, 0.51505f, -0.883659f, -0.105524f, + -0.0344173f, -0.0683618f, -0.00698688f, -0.139349f, 0.135741f, + -0.294455f, -0.377834f, -0.602084f, -1.00128f, 0.483291f, + 1.25327f, 0.178987f, 0.75068f, -0.520731f, -0.325517f, + 0.272032f, 0.144144f, -0.279453f, 0.564907f, 0.144036f, + 0.297448f, -0.504243f, -0.250508f, -1.26395f, 0.4816f, + 0.392771f, -0.389961f, -0.261585f, -0.127124f, -0.202945f, + -0.709716f, -0.174719f, 0.113613f, 0.477753f, -0.226659f, + 0.0697828f, -0.177994f, 0.300726f, -0.185504f, 0.339424f, + -0.316746f, 0.369693f, -0.339723f, -0.143886f, -0.0326589f, + -0.268761f, -0.241094f, 0.284876f, -0.0270867f, -0.207397f, + -1.42738f, 0.495612f, -0.0277732f, 0.199675f, 1.48638f, + -0.659257f, -1.28199f, 0.498702f, 0.140695f, 0.571152f, + 0.416368f, 0.14153f, 0.126876f, 0.521114f, -0.00150571f, + 0.375581f, 0.00537624f, 0.1286f, -0.332227f, 0.417663f, + -0.539023f, 0.217124f, -0.787111f, -0.0335266f, 1.56751f, + 0.0640563f, -0.158791f, 0.118195f, 0.000970493f, -0.0403852f, + -0.0572557f, -0.0201181f, -0.10255f, 0.63237f, 0.156662f, + 0.418696f, -0.274802f, -0.663923f, -0.375232f, -0.40846f, + 0.462092f, 1.2176f, -0.301532f, -0.779704f, -0.112876f, + 0.0806591f, -0.0141923f, 0.00960801f, -0.663557f, 0.0979948f, + -0.0575999f, -0.012847f, 0.0403853f, -0.133666f, -0.00330217f, + -0.931518f, -0.774599f, -0.21391f, 0.377601f, -0.183365f, + 0.299094f, 0.0238552f, 0.206716f, -0.18959f, 0.346013f, + -0.150991f, -0.192817f, -0.293962f, -0.0537604f, -0.0648171f, + -0.275941f, -0.144854f, -0.224092f, 2.43113f, 0.0422494f, + -0.047236f, -0.0262028f, 0.0282119f, -0.175553f, 0.0888502f, + 0.580682f, 0.951055f, -0.284441f, -0.120133f, -0.268058f, + -0.312083f, -0.411556f, 0.21431f, -0.28033f, 0.324851f, + -1.02787f, -0.936816f, -0.577628f, 0.544743f, 0.295807f, + 0.406157f, 0.447927f, 0.25369f, -0.811421f, -0.0424979f, + -0.189867f, 0.00778673f, -0.113587f, -0.116175f, -0.0542222f, + -1.80089f, -1.44175f, -0.35332f, 0.191314f, -0.236691f, + -0.0261926f, -0.502363f, 0.252278f, -0.485478f, 0.296495f, + 0.455612f, -0.0489631f, 0.227255f, 0.170975f, 0.473487f, + 0.257812f, 0.178048f, 0.2506f, 2.04637f, -0.173857f, + 0.0583379f, 0.00765589f, -0.025772f, -0.162666f, -0.016214f, + -0.607486f, -0.0808025f, 0.0551611f, -0.0772291f, 0.126421f, + 0.10869f, -0.0877463f, -0.111527f, -0.0775766f, 0.503886f, + -0.002757f, -0.0421354f, -0.247857f, 0.140827f, 0.383576f, + 0.228232f, -0.157877f, -0.0927911f, 0.344687f, 0.191181f, + 0.236533f, 0.00102869f, -0.0184502f, -1.4509f, -1.15945f, + 
-0.521978f, -0.643225f, 0.133139f, 0.0660321f, 0.0851957f, + 0.0303648f, 0.0296239f, 0.0455713f, 0.175647f, 0.080532f, + 0.0445691f, -0.257356f, -0.125602f, -0.138829f, -0.167057f, + -0.0992552f, -0.13944f, 0.507531f, 0.444997f, 0.221452f, + -0.308384f, -0.327554f, 0.13235f, 2.1487f, -1.15453f, + -0.280239f, -0.363582f, -0.00358745f, 0.012866f, 0.251088f, + 0.0676416f, 0.178492f, -0.136631f, 0.197938f, -0.078198f, + 0.812439f, 1.1173f, 0.712113f, 1.10124f, -0.836503f, + -1.22433f, -1.07894f, -1.29215f, 0.56057f, 2.23928f, + -0.419029f, 0.282178f, -0.0719266f, -0.172192f, 0.28034f, + -2.99124f, -2.01481f, 0.0688982f, 0.697466f, 0.00635555f, + 0.566069f, 0.047534f, 0.507755f, -0.00690707f, 0.712594f, + -0.191467f, 0.355733f, -0.480016f, 0.664669f, -0.390619f, + 0.351199f, -0.482342f, 0.325005f, 1.9089f, 0.155987f, + 0.17032f, 0.132729f, 0.0402649f, 0.146991f, 0.0314905f, + -0.775316f, -0.208892f, -0.105993f, 0.0181653f, -0.12735f, + 0.0897852f, 0.0470231f, 0.25807f, 0.127406f, -0.0893252f, + -0.279776f, 0.190844f, 0.110384f, -0.148833f, 0.025293f, + 0.239838f, 0.00932245f, 0.35103f, -0.128268f, -0.0536754f, + 0.506899f, -0.16793f, 0.0955582f, -2.01108f, 0.721433f, + -2.31413f, -2.08646f, 0.033315f, 0.689828f, -0.271213f, + 0.790425f, -0.114234f, 0.755325f, -0.211533f, 0.774544f, + -0.263268f, 0.795762f, -0.551455f, 0.953602f, -0.168454f, + 0.529055f, -0.768991f, 0.882371f, 0.29763f, -0.155017f, + 0.00464101f, 0.121093f, 0.948271f, 0.113138f, -0.110332f, + -2.0492f, -1.31322f, -0.129212f, 0.464778f, -0.181465f, + 0.618403f, 0.0627984f, 0.465228f, 0.165729f, 0.278277f, + -0.563276f, -0.358358f, -0.590638f, 0.0104993f, 0.731206f, + 0.752569f, 0.631615f, 0.811822f, 0.129804f, -0.0558327f, + 0.570081f, -0.417922f, -0.168275f, 0.0703671f, 0.269127f, + 0.240457f, -0.197159f, -0.00179261f, 0.220065f, 0.463511f, + 0.0714626f, -0.716477f, -0.441865f, -0.717028f, -0.149176f, + 0.452182f, 0.662699f, -0.906534f, -0.817133f, 0.237747f, + 0.26024f, -7.7441e-05f, 0.0934616f, 0.824641f, -0.0404494f, + -0.088297f, -0.157899f, 0.037408f, 0.132435f, -0.316155f, + -0.276785f, 0.0117868f, 0.185008f, 0.32369f, -0.465855f, + -0.302127f, 0.303289f, 0.338597f, -0.665408f, -0.507594f, + 0.526979f, 0.532091f, 0.234395f, 0.754063f, 0.116769f, + 0.0800309f, -0.939344f, -1.51269f, 1.4583f, 0.178444f, + 0.0106756f, -0.213468f, -0.00369439f, 0.071015f, -0.192798f, + -0.0933147f, -0.129901f, -0.368279f, -0.246564f, 0.126966f, + 0.478565f, -0.476246f, -0.762863f, 0.168883f, 0.536136f, + -0.272969f, 0.2573f, -0.161577f, 0.311428f, -0.777994f, + -1.29752f, 0.216046f, 0.329016f, 1.57265f, 0.168075f, + -0.192518f, 0.0829308f, -0.073533f, -0.0202034f, 0.114716f, + -0.34888f, -0.519215f, 0.190809f, 0.0138507f, 0.133635f, + 0.14194f, 0.410618f, -0.165106f, 0.214438f, 0.0438265f, + -0.8481f, -1.19182f, -1.07878f, -0.882217f, 0.45616f, + 0.977385f, 0.74929f, 0.918466f, 0.904704f, 0.041938f, + 0.0362776f, 0.0757255f, 1.14007f, 0.0516825f, -0.160068f, + 0.219535f, 0.638634f, -0.0284544f, -0.222849f, -0.0344915f, + -0.0350256f, -0.0504452f, -0.0458416f, 0.146099f, 0.0783083f, + 0.206579f, 0.241264f, 0.28401f, 0.0425312f, -0.802049f, + -0.746271f, -0.578969f, -0.078218f, 0.436176f, -0.281465f, + -2.5539f, 0.237868f, -0.121796f, 0.0715619f, 0.106992f, + -0.621862f, -0.167142f, 0.153716f, 0.0570912f, -0.06525f, + -0.923773f, 0.130759f, 0.0517066f, 0.0729862f, -0.873064f, + 0.0403328f, -0.186499f, -0.0831918f, -0.223723f, 0.144697f, + 0.212845f, 0.416876f, 0.361598f, 0.138229f, 0.0728777f, + -1.95419f, -0.00382816f, -0.0440387f, 0.433627f, 
0.44781f, + -1.05229f, -1.54506f, 0.564827f, -0.263456f, 0.296105f, + -0.158055f, 0.388274f, -0.366639f, 0.212006f, -0.245619f, + 0.593064f, 0.088727f, 0.410632f, -0.263462f, 0.507075f, + -0.0974155f, 0.275268f, -0.1293f, 0.136679f, 1.98276f, + 0.411766f, 0.391987f, 0.34283f, -0.114077f, 0.258462f, + -0.302443f, 0.301138f, -0.00726621f, 0.276441f, -0.291582f, + 0.66498f, -0.321451f, -0.332805f, 0.0943272f, 0.572253f, + -0.45818f, -0.0219593f, -0.151679f, 0.402033f, -1.15502f, + -0.882955f, 0.772904f, 0.88126f, -0.149555f, 0.709525f, + 0.350116f, -0.21531f, 0.797893f, 0.0230234f, 0.0203034f, + 0.2744f, 1.08273f, 0.039349f, 0.503909f, -0.45892f, + -0.579516f, -0.344058f, 0.390628f, -0.386941f, -0.430317f, + -0.0807066f, 0.435906f, 0.522996f, 0.724476f, -0.74371f, + -0.05376f, -0.340898f, -0.962646f, -0.0278005f, 0.0981149f, + -0.0811161f, 0.00237994f, 0.850042f, 0.0665473f, 0.134413f +}; + +static const float av1_simple_motion_search_prune_rect_logits_bias_8[] = { + 1.63404f, -0.715866f, -1.0132f, -2.08745f +}; + +static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_8 = { + NUM_FEATURES_8, + NUM_LOGITS_8, + NUM_HIDDEN_LAYERS_8, + { + NUM_LAYER_0_UNITS_8, + }, + { + av1_simple_motion_search_prune_rect_layer_0_kernel_8, + av1_simple_motion_search_prune_rect_logits_kernel_8, + }, + { + av1_simple_motion_search_prune_rect_layer_0_bias_8, + av1_simple_motion_search_prune_rect_logits_bias_8, + }, +}; + +#undef NUM_HIDDEN_LAYERS_8 +#undef NUM_FEATURES_8 +#undef NUM_LAYER_0_UNITS_8 +#undef NUM_LOGITS_8 + +static const NN_CONFIG + *const av1_simple_motion_search_prune_rect_nn_config[5] = { + &av1_simple_motion_search_prune_rect_nn_config_128, + &av1_simple_motion_search_prune_rect_nn_config_64, + &av1_simple_motion_search_prune_rect_nn_config_32, + &av1_simple_motion_search_prune_rect_nn_config_16, + &av1_simple_motion_search_prune_rect_nn_config_8, + }; + +// nn model for predicting max square partition level of a superblock +#define NUM_HIDDEN_LAYERS 1 +#define NUM_FEATURES 13 +#define NUM_LAYER_0_UNITS 48 +#define NUM_LOGITS 4 + +static const float av1_max_part_pred_logits_kernel[] = { + -0.304561f, 0.0885596f, -0.988539f, 1.08147f, 0.215213f, + 0.202965f, -0.828457f, -0.233945f, -0.0866977f, -0.115521f, + 0.02079f, 0.196491f, -0.0285075f, 0.05067f, -0.00872862f, + 0.00281844f, -0.238954f, 0.0253801f, 0.0257775f, 0.339269f, + 0.176174f, -0.152545f, -0.0588704f, -1.62275f, -0.189329f, + 0.0808033f, 0.233844f, -4.53798f, 0.674968f, -0.0361688f, + -0.0754075f, 1.16129f, -0.0188879f, 0.113255f, -3.04378f, + 0.814728f, -0.568517f, -0.00179383f, -3.61223f, -1.67535f, + -2.20417f, -0.197196f, 0.0507745f, -0.0909394f, -0.0507879f, + -1.27999f, -0.055623f, 0.0318497f, 0.192867f, 0.138726f, + 0.0443392f, -0.595075f, -0.166774f, 0.0882958f, -0.348161f, + 0.0214428f, -0.0599275f, -0.0995385f, -0.82358f, 0.141205f, + -0.053232f, 0.00508296f, -1.90872f, 1.15004f, -0.194219f, + 0.0229019f, -0.00354318f, 0.22016f, 0.154101f, -0.159231f, + -0.0446647f, -0.197503f, 0.0408453f, 0.197659f, 0.797858f, + -0.189722f, 0.343653f, 0.124666f, -1.03083f, 0.603059f, + 0.101565f, 0.0932993f, 0.462484f, 0.295984f, 1.11198f, + 0.143709f, -0.846232f, -0.464392f, -1.06058f, -0.124889f, + 0.0727475f, 1.18446f, -0.100302f, 0.0641918f, -0.101622f, + 0.10219f, 0.130189f, 0.0915623f, -0.166904f, -1.10606f, + -0.16726f, -0.146152f, 0.145443f, -0.177091f, -0.0215214f, + 0.0158506f, -0.553294f, 0.0784749f, -0.0416628f, -0.027785f, + 0.280027f, 0.484898f, -0.164225f, 0.0238317f, -0.0345254f, + 0.0410244f, 0.131529f, 
0.0239622f, -0.0749436f, -0.0224914f, + 0.128926f, 0.224539f, 0.413297f, 0.0638572f, 0.103308f, + 0.0913242f, -0.119274f, 0.0163103f, 0.113828f, 0.119809f, + 0.297057f, -0.124889f, -0.533108f, -0.181408f, -0.129896f, + 0.0221064f, -0.0773281f, -0.0386467f, 0.0342961f, 0.126575f, + -0.24114f, 0.0735576f, 0.0524791f, 0.246896f, -0.130674f, + -0.03979f, 0.173639f, 1.95193f, -0.113029f, -0.0305852f, + -0.00671737f, 0.157159f, -0.00102858f, -0.543688f, 0.566772f, + 0.124124f, -0.0294064f, -0.0699021f, -0.0704103f, -0.766097f, + -0.0625802f, -0.0906173f, -0.0520414f, -0.0272724f, 0.283064f, + 0.236213f, -0.127319f, 0.019392f, 0.170042f, -0.0214542f, + 0.0740938f, 0.356578f, -0.236257f, 0.269021f, 0.114759f, + -0.641166f, 0.136308f, -0.0386959f, -0.112024f, -0.361209f, + 0.686095f, 0.183906f, 0.288656f, 0.182007f, 0.337458f, + 0.058974f, -0.305512f, -0.841708f, -0.243779f, -0.0614058f, + 0.208747f, 0.448697f +}; + +static const float av1_max_part_pred_layer_0_bias[] = { + -0.776544f, -2.0022f, -0.330294f, 2.47665f, 1.90206f, -1.61571f, + 0.536246f, 1.00455f, 5.24561f, 1.55111f, -0.816399f, -4.88703f, + -1.06417f, -1.15359f, -0.145289f, 1.91831f, 0.630915f, -1.94256f, + -3.35239f, -1.05007f, -1.05186f, 1.36824f, -5.2878f, 1.10482f, + -5.00077f, -0.0445198f, 3.41427f, 2.3439f, -0.413306f, -1.88152f, + -2.28638f, 8.24783f, -1.91961f, -1.49324f, 1.96599f, -6.32309f, + -0.332426f, -0.425506f, 4.06511f, 5.84386f, 4.15747f, 1.22402f, + 2.8512f, 2.53027f, 0.0170272f, -1.43966f, -0.997785f, 5.43064f +}; + +static const float av1_max_part_pred_logits_bias[] = { -4.25432f, 0.144758f, + 1.96217f, 0.728905f }; + +static const float av1_max_part_pred_layer_0_kernel[] = { + 0.992471f, 0.533006f, 0.143743f, -2.51788f, -0.468337f, + -0.201376f, -0.151834f, 0.479883f, 1.16061f, -0.278878f, + -0.814954f, -0.152405f, -0.0521608f, 0.797104f, -2.08912f, + 0.385839f, -2.22889f, -0.106858f, -0.239766f, -0.951128f, + -0.698753f, 0.0831051f, 1.1702f, 0.342834f, -0.0352795f, + -0.0847639f, -0.802086f, 0.258982f, 1.14174f, 0.645885f, + -1.19226f, -0.592888f, -0.343659f, 1.1912f, 1.45411f, + -1.22927f, 0.152858f, 0.00373585f, -1.60637f, 0.592611f, + 0.0857475f, -0.346147f, -0.150784f, -0.0817408f, -0.189918f, + -0.804952f, -1.33036f, -1.03307f, 0.0248769f, 0.16607f, + -2.896f, -2.1293f, 0.12293f, -0.173179f, -0.212128f, + -6.76221f, 0.033188f, 0.0231787f, 0.905957f, 0.0551327f, + -0.356276f, 0.0181795f, 0.0977523f, -0.0352873f, -0.0396386f, + 2.3241f, 0.0632874f, -0.11804f, -6.32521f, 0.0224659f, + -0.00188896f, 0.267992f, 0.272337f, 0.00936963f, 0.659969f, + -2.25707f, -0.0278229f, -0.0185089f, -1.14466f, 0.104827f, + 0.0435885f, 0.558586f, -0.00697004f, 0.0312611f, 0.540574f, + -0.568625f, 0.218608f, 0.378911f, -0.0289192f, -0.0734742f, + -1.08782f, -2.42069f, -0.0127239f, 0.0493651f, -1.15837f, + 0.261831f, 0.401824f, -1.04545f, 0.284173f, 0.784972f, + -0.511243f, -0.982599f, -0.106134f, -0.325964f, -1.44107f, + -1.42434f, -1.02402f, -1.52034f, 0.0737116f, 0.0462242f, + 0.628722f, -1.0405f, -0.113718f, 2.20573f, -4.33951f, + -0.0192695f, -0.0229314f, -1.89156f, 0.645942f, 0.375708f, + -1.97447f, -0.267014f, 0.0989443f, -0.450534f, -1.01737f, + -0.642416f, -0.0897288f, -2.08724f, -0.190965f, -0.279135f, + -0.830178f, 0.808754f, -0.139091f, 1.11004f, -0.454439f, + -0.479238f, -1.44001f, 0.0888059f, 0.885689f, -0.642505f, + -0.00773651f, -0.0265721f, -0.906346f, 1.68504f, 0.084257f, + -0.951101f, -8.06495f, 0.19231f, 0.16389f, -0.193678f, + 0.729837f, -1.98392f, -5.98513f, 3.32638f, -0.0658378f, + -0.0910426f, -0.666567f, 
-0.315339f, 0.123124f, -2.66375f, + -0.714852f, -0.136176f, -0.460166f, -0.567551f, -1.06193f, + -1.21389f, -0.83865f, 0.00280695f, -0.199519f, -0.534704f, + 0.419311f, -0.149008f, -3.68707f, 0.00285113f, -0.0718198f, + -1.41026f, -1.34155f, -0.538687f, -0.623666f, -2.56462f, + -0.0183333f, -0.323532f, -1.27141f, -0.0212039f, 0.198633f, + 0.459554f, -4.65103f, -1.01293f, -1.39512f, -0.289026f, + 0.208724f, -0.665226f, 1.13369f, -1.96734f, -1.45442f, + -3.46172f, 0.810681f, -0.603973f, 0.842764f, -3.90371f, + -0.394561f, -3.61363f, -2.88085f, 0.031645f, -0.23125f, + -2.63898f, -1.35314f, -0.46726f, 1.33145f, 1.20269f, + 1.38682f, -0.331637f, 0.069021f, 0.149523f, -1.24957f, + -0.878857f, -0.200368f, 0.465744f, 1.01365f, -0.0122221f, + -0.550586f, -1.12581f, -0.422132f, -0.0744868f, -2.4804f, + -1.07072f, -0.479006f, 0.101817f, -0.118947f, 0.341576f, + -1.0538f, -0.812346f, -1.13727f, -0.00939806f, 10.1571f, + -0.0441302f, 0.00280407f, -21.5044f, 0.0181152f, -0.0143246f, + 3.23462f, -1.38624f, -1.80416f, 4.89763f, -2.67364f, + 2.31771e-05f, 0.000393989f, 0.352204f, -0.193455f, 0.531455f, + 0.488757f, -0.442555f, -0.518528f, 0.431482f, -2.67727f, + -2.00626f, -0.39729f, -0.221494f, -0.0188888f, -0.0377649f, + -1.80169f, 0.0810332f, -0.0408335f, -1.28675f, -0.0353824f, + -0.666723f, -1.07281f, 0.252912f, -1.24547f, -1.7831f, + -1.14354f, -0.137662f, 0.00230182f, 0.736862f, 0.175872f, + -0.187556f, 0.43963f, -0.796524f, 0.056219f, -0.387874f, + 0.0710224f, -0.16548f, -0.100993f, 0.931481f, -3.20738f, + -0.0197576f, 0.266148f, -0.173909f, -0.337795f, -0.0682381f, + 0.176844f, 0.140286f, 1.12033f, 0.429064f, -2.24192f, + -1.54682f, 2.23646f, -0.0371138f, -0.0475339f, -3.21766f, + 0.0412858f, 0.387811f, 6.6711f, 0.140649f, 0.0559547f, + -0.802839f, 0.599977f, 0.64552f, -2.08103f, -0.503401f, + -0.0407036f, -0.0299199f, 0.0849445f, -0.111657f, -1.63462f, + 3.33762f, 0.0441394f, 0.0466889f, -0.951806f, 0.0723954f, + 0.00348661f, -1.36903f, 2.24625f, -0.0348915f, -0.0508893f, + -0.240891f, -0.120143f, -0.17991f, -2.09137f, 0.0150871f, + 0.0480333f, 1.72012f, 0.0309551f, -0.0370507f, -0.377075f, + 0.103916f, -0.0169255f, -0.0145395f, -4.02144f, 0.83193f, + -0.316502f, 6.3832f, -1.70038f, -1.97215f, -1.94501f, + 1.45479f, 0.711725f, -0.348496f, -0.279056f, -1.13396f, + -1.51744f, -0.853307f, 1.53131f, -0.0032358f, 1.41808f, + -1.32989f, -0.245221f, -0.161614f, -0.500845f, -0.449252f, + 0.0724151f, -0.116333f, -0.0946182f, -2.0945f, 0.0564572f, + 0.393261f, -1.06861f, -0.111458f, -0.839943f, -0.0880348f, + 0.0365742f, 0.415339f, -1.57494f, -0.713697f, 1.02349f, + -0.221371f, -0.0446281f, 1.89223f, -0.0811754f, -0.402773f, + -0.930987f, 0.0243194f, 0.0678332f, -0.0233014f, 0.165372f, + -0.44083f, -1.2404f, 0.35675f, -0.040916f, -0.0512548f, + -2.9071f, 0.861174f, -0.778133f, 2.14436f, -0.688427f, + -0.480371f, -1.69032f, 0.706687f, -0.281982f, -2.30451f, + 1.61541f, -0.0213638f, -0.740509f, -0.266677f, 0.0268434f, + -0.0116908f, -3.17595f, 0.0114825f, 0.0196997f, -0.144005f, + 0.0550181f, -0.851459f, -0.000285073f, -0.538441f, -0.0254868f, + -0.0104454f, -0.0661998f, -0.196469f, -0.346372f, -5.52892f, + -0.643683f, -0.622224f, -0.31463f, -0.555956f, -0.520132f, + -0.843166f, -2.59479f, -0.750195f, 0.00635995f, -0.338615f, + -0.216676f, -0.391544f, -1.62185f, -0.718471f, -0.475406f, + -0.782041f, -0.608824f, -1.09633f, -1.27308f, -0.560719f, + -0.207539f, -0.0196445f, -1.05519f, -0.575249f, -1.0642f, + 1.01615f, -0.873633f, -0.417953f, -0.428051f, 0.350259f, + -2.53833f, -2.72203f, 0.672846f, -0.503094f, 
-1.1374f, + 0.214291f, 0.013305f, 0.0112064f, 1.10532f, 0.030455f, + 0.0239614f, 0.628072f, 0.0539135f, -0.472441f, -0.688439f, + -0.32044f, -0.0234867f, -0.0158436f, -0.949314f, -0.0453161f, + -1.18306f, 0.626845f, -0.426925f, -0.688371f, 0.415062f, + 0.0640985f, -0.638387f, -2.01399f, -0.209744f, -0.762892f, + -0.0753296f, -0.879315f, -0.520433f, -0.111375f, 0.389742f, + -0.398862f, -0.643227f, -0.246396f, 0.0317051f, 1.06973f, + 0.413617f, 0.180506f, -0.0507897f, -0.00650435f, 0.620892f, + 0.046312f, 0.475032f, 0.906993f, -0.0388061f, -0.256271f, + -1.03323f, 0.0125266f, -0.31116f, -0.377611f, -0.0386407f, + -0.0232745f, -0.353644f, -2.27289f, 0.0571779f, -0.00865006f, + 1.65101f, 0.0175711f, 0.0184585f, 0.558458f, 0.2213f, + -0.285089f, 0.433445f, -0.427177f, -0.0103682f, -0.0101273f, + 0.214085f, -0.0459885f, 0.00761981f, 0.836381f, 0.0175293f, + 0.02508f, -1.51778f, 0.0143956f, -0.162589f, 0.595418f, + 0.21445f, -0.0335848f, -0.0136684f, -0.16686f, -0.14612f, + 0.0816238f, 0.499636f, 0.12458f, -2.41673f, -0.261721f, + -0.676805f, -1.88366f, 0.730462f, 0.69196f, -0.0288489f, + -2.38272f, 0.329876f, 0.014517f, -0.115145f, -3.48151f, + -0.00209072f, -0.0732377f, 0.820443f, -0.0118701f, 0.112145f, + 0.272315f, 0.137531f, -0.0200997f, -0.0397883f, -2.19458f, + 0.183554f, -0.639716f, 0.481605f, -0.621639f, -0.0980299f, + -0.710534f, -0.143105f, -6.77626f, -1.65139f, -2.37718f, + -0.533127f, -1.12574f, 3.34182f, -0.0758663f, 0.0334238f, + -9.48647f, 0.0674974f, 0.0507665f, 0.523007f, -0.0668f, + 0.5736f, -0.589761f, -1.1692f, -0.0236497f, -0.00828928f, + -0.265823f, 1.15284f, 0.307927f, -0.695308f, 0.13725f, + -0.20394f, -0.363965f, -0.331159f, -1.50927f, -1.20051f, + -0.0205825f, -0.0381859f, -0.0579876f, -1.6913f, -1.94626f, + 3.4214f, 3.3922f, -2.13798f, -0.679848f, -0.890735f, + 0.235017f, -0.253202f, -1.0571f, 1.40354f, 0.00719052f, + -1.54365f, -0.7289f, -1.05492f, 0.0238169f, -0.00543592f, + -0.0510353f, -0.175386f, -0.724207f, -0.788936f, 0.039976f, + 1.36966f, 0.869475f, -0.0302774f, -0.0537556f +}; + +static const NN_CONFIG av1_max_part_pred_nn_config = { + NUM_FEATURES, + NUM_LOGITS, + NUM_HIDDEN_LAYERS, + { + NUM_LAYER_0_UNITS, + }, + { + av1_max_part_pred_layer_0_kernel, + av1_max_part_pred_logits_kernel, + }, + { + av1_max_part_pred_layer_0_bias, + av1_max_part_pred_logits_bias, + }, +}; + +#undef NUM_HIDDEN_LAYERS +#undef NUM_FEATURES +#undef NUM_LAYER_0_UNITS +#undef NUM_LOGITS + +// Early termination in second pass +static const float av1_simple_motion_search_term_none_mean_128[28] = { + 12.661922f, 12.638062f, 10.896497f, 10.865719f, 10.978963f, 10.940105f, + 11.012235f, 10.972760f, 11.069924f, 11.018533f, 11.773865f, 11.747426f, + 11.891315f, 11.858107f, 11.793916f, 11.766356f, 11.874997f, 11.840164f, + 5.940535f, 0.770746f, 4.292692f, 4.309581f, 0.848423f, 4.292334f, + 4.298179f, 8.514713f, 14.911736f, 19.825352f, +}; + +static const float av1_simple_motion_search_term_none_std_128[28] = { + 1.796731f, 1.797056f, 1.898383f, 1.900753f, 1.846624f, 1.846953f, 1.906632f, + 1.908089f, 1.836533f, 1.835967f, 1.840262f, 1.840671f, 1.816836f, 1.817103f, + 1.879846f, 1.881333f, 1.803102f, 1.802654f, 2.263402f, 0.420354f, 1.117165f, + 1.083779f, 0.358611f, 1.101183f, 1.084938f, 2.462638f, 1.577009f, 1.574711f, +}; + +static const float av1_simple_motion_search_term_none_mean_64[28] = { + 10.904455f, 10.853546f, 9.247903f, 9.184479f, 9.251985f, 9.186686f, + 9.253490f, 9.190190f, 9.270079f, 9.204357f, 10.086511f, 10.031060f, + 10.100875f, 10.045429f, 10.069688f, 10.013173f, 
10.082980f, 10.024640f, + 4.888378f, 0.878113f, 3.598450f, 3.628491f, 0.925833f, 3.560971f, + 3.573322f, 8.807137f, 13.348477f, 18.269117f, +}; + +static const float av1_simple_motion_search_term_none_std_64[28] = { + 1.789300f, 1.787061f, 1.823519f, 1.820226f, 1.794643f, 1.788620f, 1.797194f, + 1.795135f, 1.777795f, 1.773634f, 1.794000f, 1.790377f, 1.772197f, 1.769692f, + 1.819050f, 1.817139f, 1.793577f, 1.789333f, 1.998251f, 0.327156f, 0.885748f, + 0.853767f, 0.262043f, 0.902435f, 0.860033f, 1.224865f, 1.603411f, 1.589296f, +}; + +static const float av1_simple_motion_search_term_none_mean_32[28] = { + 9.818970f, 9.751199f, 8.015079f, 7.927318f, 8.029113f, 7.938330f, 8.012570f, + 7.923719f, 8.033508f, 7.941911f, 8.933057f, 8.857422f, 8.935639f, 8.859187f, + 8.905495f, 8.829741f, 8.929428f, 8.851351f, 4.114069f, 0.954752f, 2.645082f, + 2.709703f, 0.964678f, 2.652077f, 2.673393f, 9.430499f, 11.922798f, 16.942251f, +}; + +static const float av1_simple_motion_search_term_none_std_32[28] = { + 1.737107f, 1.734327f, 1.727923f, 1.720244f, 1.721570f, 1.712775f, 1.718028f, + 1.710370f, 1.711612f, 1.702596f, 1.754856f, 1.748855f, 1.741871f, 1.736304f, + 1.722428f, 1.717380f, 1.713563f, 1.707582f, 1.761170f, 0.207847f, 0.900058f, + 0.862356f, 0.184593f, 0.903822f, 0.856120f, 1.529199f, 1.412085f, 1.453153f, +}; + +static const float av1_simple_motion_search_term_none_mean_16[28] = { + 8.998877f, 8.912468f, 7.085255f, 6.953476f, 7.086386f, 6.954091f, 7.088727f, + 6.955747f, 7.093955f, 6.960635f, 8.065050f, 7.961432f, 8.071631f, 7.967233f, + 8.041699f, 7.937715f, 8.046791f, 7.942183f, 3.833521f, 0.978421f, 1.901347f, + 1.950124f, 0.979418f, 1.928000f, 1.936727f, 9.773951f, 10.735227f, 15.949769f, +}; + +static const float av1_simple_motion_search_term_none_std_16[28] = { + 1.641193f, 1.640172f, 1.614794f, 1.608906f, 1.609571f, 1.603580f, 1.606928f, + 1.601246f, 1.599230f, 1.593529f, 1.633747f, 1.630219f, 1.625695f, 1.622547f, + 1.633827f, 1.630182f, 1.626607f, 1.622777f, 1.548838f, 0.145303f, 0.744550f, + 0.736552f, 0.141980f, 0.742979f, 0.736977f, 1.366255f, 1.258794f, 1.294309f, +}; + +static const float av1_simple_motion_search_term_none_model_128[] = { + -0.6106842357f, -1.0402954455f, 0.6054417656f, -0.2116623578f, + 0.2447714930f, 0.3782256209f, 0.5095592479f, -0.3275620904f, + 0.3886188013f, 0.2629499420f, -0.1979599415f, -0.5389565605f, + 0.1209207902f, -0.4913347466f, 0.3798542731f, -0.2812861709f, + -0.1049824167f, -0.1088672020f, 0.4059596517f, -0.1347896613f, + 0.2276868621f, 0.0506386970f, 0.0071088411f, 0.0467952100f, + 0.2091247458f, -0.7371964736f, 0.1368935545f, 0.3175247786f, + -0.5493146094f, +}; + +static const float av1_simple_motion_search_term_none_model_64[] = { + -0.4150046575f, -0.3954358561f, 0.1997997444f, 0.3395826831f, + 0.2827215753f, 0.3395683652f, 0.2483140395f, 0.2722216476f, + 0.2610308009f, 0.3724974359f, -0.0551479654f, -0.1721616359f, + -0.3459358629f, -0.0952524186f, -0.1428993840f, -0.0415654914f, + -0.3169539902f, -0.0269429900f, 0.9891530919f, -0.0125084982f, + 0.0972182377f, 0.0008889801f, 0.0205418050f, 0.0057237854f, + 0.1005222691f, -0.2851321920f, -1.5150336445f, 0.1893942436f, + -0.4337360901f, +}; + +static const float av1_simple_motion_search_term_none_model_32[] = { + -0.4667392852f, -0.3893302767f, 0.1603498635f, 0.2304974726f, + 0.1404975592f, 0.2505516225f, 0.1423053884f, 0.2189318406f, + 0.1379765409f, 0.2638241296f, -0.1342865463f, -0.0549054345f, + -0.1925223436f, -0.1142702769f, 0.0127811659f, 0.0868639997f, + -0.0643197251f, 0.0279496470f, 
0.9904395769f, -0.0095178685f, + 0.1179410649f, -0.0013411972f, 0.0095060660f, 0.0195730400f, + 0.0779717771f, -0.2498860763f, -0.8168817125f, -0.4798397348f, + -0.6609679881f, +}; + +static const float av1_simple_motion_search_term_none_model_16[] = { + -0.3021081992f, -0.4620153673f, 0.0448577479f, 0.1738455035f, + 0.0663209177f, 0.1629614573f, 0.0555168744f, 0.1631870212f, + 0.0425805150f, 0.1688564954f, 0.0434083772f, -0.0046603915f, + -0.0271580056f, -0.0183879127f, 0.1073730471f, 0.0314201476f, + 0.0576891756f, 0.0119723753f, 0.9084332022f, -0.0188429077f, + 0.0755089811f, -0.0172550234f, 0.0037663075f, 0.0022094472f, + 0.0500247894f, -0.2944572004f, -0.8908521199f, -0.2555515792f, + -0.5396254205f, +}; + +#define FEATURES 31 +#define HIDDEN_NODES 32 +static const float av1_early_term_after_split_nn_weights_64_layer0[] = { + -0.306296f, -0.691664f, 0.335148f, -0.298465f, -0.509241f, -0.632796f, + -0.527979f, -0.009904f, -0.503646f, -0.494002f, -0.575101f, 0.239911f, + -0.413312f, -0.622825f, -0.405448f, -0.419103f, -0.505903f, -0.392550f, + -0.240293f, 0.121749f, -0.489777f, -0.756647f, 0.001047f, -0.016528f, + 0.145714f, 0.172910f, 0.086197f, 0.162882f, -0.070588f, -0.077104f, + 0.502730f, -0.244954f, 0.265605f, -0.323994f, 0.223397f, -1.086453f, + 0.391886f, 0.200343f, 0.253878f, 0.018925f, 0.201819f, -0.205136f, + 0.427314f, 0.041155f, 0.070484f, 0.159925f, -0.057095f, -0.146544f, + -0.073792f, 0.152628f, 0.003986f, -0.515965f, -0.209754f, 0.037457f, + 0.070622f, -0.143571f, -0.059602f, 0.111734f, 0.319674f, 0.149894f, + -0.219883f, 0.206678f, 0.015809f, -0.210549f, 0.130156f, -0.189502f, + -0.850392f, -0.156363f, -0.060354f, 0.189044f, 0.266495f, 0.151305f, + -0.563677f, -0.354896f, 0.300637f, 0.257568f, -0.008359f, -0.535497f, + -0.003127f, 0.293054f, -0.020212f, -0.157278f, 0.229972f, -0.309799f, + -0.329927f, -0.077140f, 0.001177f, -0.024415f, 0.134044f, -0.181587f, + -0.135380f, 0.230989f, -0.281451f, 0.912282f, 0.511562f, -3.900779f, + -0.039917f, 1.956406f, -0.357589f, 0.292998f, -0.950158f, 0.422041f, + 0.526572f, 0.605746f, -0.147110f, 0.256576f, 0.090010f, 0.221641f, + 0.029763f, 0.351592f, 0.458324f, -0.005888f, 0.010521f, -0.389326f, + -0.094006f, -0.171489f, -0.013153f, 0.026333f, -0.454571f, -1.932891f, + -0.168211f, 0.051298f, -0.258061f, -0.028936f, -0.555937f, -0.475566f, + -0.304046f, -0.318113f, 0.099697f, -0.217145f, 0.139433f, -0.203986f, + -0.164012f, 0.051527f, 0.138603f, -0.085100f, -0.082887f, -0.242955f, + -0.663410f, -0.535772f, -0.181665f, -0.197883f, 0.071319f, 0.135086f, + 0.146200f, 0.184827f, -0.199041f, 0.162570f, -0.300167f, 0.017748f, + -0.140111f, 0.103553f, 0.206929f, 0.193446f, 0.123141f, -1.201898f, + -0.052254f, -0.750121f, 0.111741f, 0.204092f, -0.166266f, 0.124008f, + -0.455496f, 0.306035f, 0.275903f, 0.193599f, -0.730011f, 0.126808f, + 0.051059f, 0.103634f, -0.044334f, 0.048889f, 0.405228f, 0.574099f, + 0.061167f, 0.260576f, 0.070032f, -0.038040f, 0.229183f, -0.243269f, + -0.130116f, -0.538563f, -0.070199f, -0.129249f, -0.205153f, -0.268530f, + -0.290828f, -0.233006f, 0.068712f, 0.618085f, -0.407008f, 0.686868f, + 0.172247f, 0.826287f, -0.002672f, 0.239825f, -0.051548f, 0.420773f, + 0.218747f, 0.041057f, -0.071189f, 0.286987f, -0.113915f, 0.122561f, + 0.013979f, -0.049046f, 0.148175f, 0.031313f, -0.248601f, 0.209488f, + 0.069008f, 0.072763f, 0.332475f, 0.079986f, -0.151042f, -0.205110f, + -0.155550f, -0.510408f, 0.330429f, 0.577729f, 0.266524f, -0.378489f, + 0.228204f, 0.055318f, 0.117583f, -0.588557f, -0.778201f, 0.434622f, + -0.227820f, 
0.611642f, 0.170548f, 0.817761f, 0.006642f, -1.005794f, + -0.911490f, 1.633684f, -0.290664f, 0.308128f, 0.295986f, 0.243377f, + -0.001275f, -0.131156f, 0.275205f, -0.041865f, -0.201951f, -0.016380f, + 0.336604f, -0.258118f, 0.890810f, 0.441065f, -0.968006f, 0.135989f, + -1.447191f, 0.353426f, -0.343235f, 0.376837f, -0.071602f, -0.319639f, + -0.072347f, 0.547450f, -0.215380f, 0.182141f, -0.066186f, 0.033787f, + 0.257482f, 0.217428f, -0.130249f, 0.057525f, 0.263991f, 0.230664f, + -0.245113f, 0.048610f, -0.079955f, 0.251737f, -0.070368f, -0.017968f, + -0.151815f, 0.025945f, -0.257769f, 0.299735f, 0.077263f, -0.565526f, + 0.326263f, 0.096429f, 0.113414f, 0.092754f, -0.141908f, 0.172060f, + 0.393117f, -0.216755f, 0.331051f, -0.363369f, -0.113363f, -0.095164f, + -0.072784f, 0.214572f, 0.010993f, 0.209456f, 0.260381f, -0.314747f, + -0.422173f, -0.189963f, -0.225130f, 0.339448f, 0.153814f, 0.265616f, + -0.103575f, -0.123841f, -0.106236f, 0.155894f, -0.156264f, -1.361406f, + -0.040736f, -0.614998f, -0.468200f, -0.266505f, -0.342786f, -0.908088f, + 0.105758f, 0.040788f, -0.313589f, -1.359318f, 0.071329f, 0.176404f, + -0.476141f, 0.010108f, -0.201440f, -0.221167f, -0.197448f, -0.013927f, + -0.610270f, -0.607285f, 0.178070f, 0.174320f, 0.313115f, 0.026191f, + -0.112330f, 0.122338f, -0.367751f, 0.196794f, 0.153709f, -0.205454f, + -0.397471f, -1.879336f, -0.030129f, 0.143429f, -0.079832f, 0.435259f, + -1.729539f, 0.518301f, -0.141393f, 0.199399f, -1.914601f, 0.142865f, + -0.219899f, 0.508458f, 0.086365f, -0.220740f, -0.012507f, 1.263320f, + 0.042136f, 0.050922f, -0.329644f, -0.188198f, 0.251522f, 0.394731f, + -0.047866f, -0.260853f, -0.267207f, -0.248489f, 0.146474f, 0.359257f, + -0.427732f, -0.100652f, 0.192129f, 0.075572f, 0.916708f, 0.255747f, + 0.486384f, 0.127989f, -0.556449f, -0.484913f, 0.392298f, 0.045401f, + -0.839551f, -0.703619f, 0.069263f, -0.040720f, 0.542265f, 0.443739f, + 0.862552f, -0.021726f, 0.230858f, -0.261004f, -0.125697f, -0.106435f, + 0.002341f, 0.013904f, 0.011034f, 0.542296f, -0.284325f, 0.135736f, + 0.113882f, 0.040610f, -0.255485f, 0.224061f, -0.087140f, 0.127872f, + -0.002638f, 0.164889f, -0.335958f, -0.031166f, -0.393581f, 0.075455f, + 0.055995f, 0.087934f, -0.133859f, -0.342187f, 0.002492f, -0.340722f, + 0.058304f, 0.104165f, -0.142136f, -0.351111f, -0.158037f, -0.079924f, + -0.253209f, -0.092840f, -0.174646f, -0.202772f, -0.353438f, -0.031111f, + 0.076088f, -0.232091f, -0.070052f, 0.097595f, 0.063173f, -0.211195f, + 0.126478f, -0.178828f, 0.278723f, -0.070807f, -0.179783f, 0.034123f, + 0.035721f, -0.200431f, 0.170640f, 0.107933f, 0.226594f, -0.301499f, + -0.291096f, 0.228076f, -0.272951f, 0.002490f, -0.210707f, -0.128033f, + -0.194009f, -0.011347f, -0.256694f, -0.011841f, -0.005167f, -0.163203f, + -0.253796f, -0.198877f, -0.055827f, -0.882685f, -0.443471f, 0.349601f, + 0.749334f, -1.161845f, 0.505480f, 0.221733f, 0.210490f, -0.234984f, + 0.014183f, -0.510401f, 0.238692f, -0.134111f, 0.083844f, -0.478751f, + -0.088434f, 0.304063f, 0.150336f, -0.749682f, -0.081999f, 0.729739f, + 0.412508f, 0.132571f, 0.058306f, -0.047451f, -0.117435f, -0.445395f, + -0.005182f, -0.025757f, 0.175051f, -0.258194f, -0.150311f, -0.196533f, + -1.314316f, -0.428627f, 0.512451f, 0.045138f, -0.200925f, 0.081538f, + -0.346151f, -0.358197f, -0.422258f, -0.028542f, -0.383534f, -0.026163f, + -0.419858f, -0.154321f, 0.376970f, 0.094017f, 0.783520f, 0.110641f, + 0.077966f, -0.093064f, 0.160522f, -0.863041f, 0.086210f, 0.560764f, + 0.057032f, 0.159224f, 0.323068f, -0.173109f, 0.014042f, -0.126856f, + 
-0.128237f, -0.245273f, -0.317312f, -0.257597f, -0.181977f, 0.259485f, + -0.215834f, 0.062076f, -0.270596f, 0.271581f, -0.153486f, -0.247165f, + 0.079737f, -0.157049f, -0.027459f, -0.299397f, 0.136729f, -0.334192f, + -0.191722f, 0.145865f, -0.031324f, -0.307165f, -0.244923f, -0.228027f, + 0.063807f, 0.054965f, -0.005709f, -0.041977f, -0.276245f, 0.020003f, + 0.133323f, -0.145992f, -0.951030f, 0.414083f, -1.063323f, 0.137872f, + 0.104732f, -0.123728f, 0.542532f, 0.213654f, 0.542954f, 0.155619f, + 0.543072f, 0.399067f, 0.191402f, -0.102552f, -0.176734f, -0.136776f, + -0.012814f, -0.021298f, -0.802467f, -0.957481f, -0.238787f, -0.138482f, + 0.058331f, 0.126601f, 0.104420f, -0.148684f, 0.343218f, 0.093604f, + -0.055642f, -0.383918f, -0.045250f, -0.090480f, -0.155464f, 0.278299f, + 0.042791f, -0.029084f, -0.373861f, -0.073233f, -0.085172f, 0.186841f, + -0.070898f, -0.156415f, 0.112831f, -0.065931f, -0.353007f, 0.058453f, + -0.136982f, 0.233393f, 0.017240f, -0.018428f, 0.229104f, -0.371440f, + -0.262212f, 0.203075f, -0.263293f, 0.034413f, -0.299354f, 0.227269f, + 0.204977f, -0.118107f, -0.359832f, -0.068252f, 0.480105f, -0.214711f, + -0.614381f, 0.209048f, -0.456014f, -0.188819f, -0.220995f, -0.322104f, + -0.191457f, 0.420874f, -0.454919f, 0.023119f, 0.291700f, -0.532885f, + -0.032642f, 0.043271f, 0.133974f, 0.002399f, -0.179899f, -0.044158f, + -0.027078f, -0.350075f, 0.236766f, 0.346771f, -0.118534f, -0.421221f, + 0.019544f, 0.109349f, 0.141517f, 0.403561f, 0.409102f, 0.054555f, + -0.561751f, 0.577183f, -0.705156f, -0.231188f, -1.969772f, 0.172289f, + -0.048122f, 0.205671f, -0.667130f, -0.066870f, 0.202838f, -0.095538f, + -0.842651f, 0.254170f, 0.046256f, -0.271891f, -0.369254f, 0.492101f, + 0.001189f, -0.186525f, 0.188470f, -0.207072f, 0.030086f, -0.132904f, + 0.127001f, 0.116662f, -0.079246f, 0.227241f, -0.462178f, 0.446304f, + -1.660753f, 0.241832f, -0.288040f, 0.054663f, -0.435804f, 0.296782f, + -0.026421f, -0.115618f, 0.163416f, 0.834001f, 0.008019f, -0.014243f, + 0.524658f, 0.067894f, -0.253936f, -0.100657f, 1.285389f, -0.005952f, + 0.087134f, -0.088375f, -0.121866f, -0.171172f, 0.279463f, -0.598593f, + -0.727761f, 0.189831f, -0.822575f, -0.291141f, -0.012410f, -0.069999f, + 0.098842f, -0.218513f, 0.009494f, 0.100106f, -0.402884f, -0.299236f, + -0.345668f, -0.057739f, -0.213248f, -0.426661f, -0.360268f, -0.349860f, + -0.382177f, -0.357802f, -0.032030f, -0.110597f, -0.155442f, -0.418794f, + -0.012113f, -0.032962f, -0.450648f, 0.129060f, -0.135227f, -0.298593f, + 0.001435f, 0.278790f, -0.272945f, 0.162759f, -0.290208f, 0.058481f, + -0.490971f, 0.019630f, -0.210347f, 0.000520f, -0.340413f, 0.641562f, + 0.023104f, 0.194832f, -0.441894f, -0.253538f, -0.228332f, 0.423264f, + -1.094073f, -0.475657f, -0.238752f, 0.033910f, 0.440425f, 0.036320f, + 0.566989f, -0.065326f, -0.297939f, 0.406098f, 0.529561f, -0.113084f, + 0.141472f, -0.024462f, -0.179212f, 0.187801f, -0.235787f, -0.229624f, + 0.357791f, 0.061110f, -0.607788f, -1.713694f, -0.651041f, 1.734283f, + -0.334701f, 0.161687f, 0.010215f, 0.320708f, 0.169447f, 0.513558f, + 0.488340f, -0.619036f, -0.525441f, -1.144352f, -0.546154f, 0.669973f, + 0.327028f, -0.100539f, 0.012048f, -0.223013f, -0.239680f, 0.323035f, + 0.165950f, -0.155110f, 0.128664f, -0.157378f, -0.124490f, 0.291553f, + 0.055849f, -0.221664f, 0.077770f, -0.350658f, -0.181939f, 0.110230f, + -0.078219f, 0.007472f, -0.031620f, 0.007708f, -0.201794f, 0.017594f, + -0.027480f, 0.058884f, -0.369166f, -0.369770f, 0.181635f, -0.183318f, + -0.389184f, -0.256661f, 0.160107f, 0.037127f, 
-0.082573f, -0.095815f, + -0.322782f, 0.072528f, -0.348875f, 0.216247f, -0.161757f, -0.385502f, + -0.315738f, 0.020123f, -0.155609f, 0.114403f, -0.383232f, 0.629529f, + 0.066142f, 0.448392f, -0.389557f, -0.083315f, 0.829535f, -0.015531f, + -0.050728f, -0.325127f, 0.812992f, -0.196780f, 0.021060f, -0.952647f, + 0.006687f, -0.512715f, -0.066778f, 0.410067f, -0.116945f, -0.288283f, + 0.189334f, -0.083153f, 0.159980f, -0.068208f, 0.107358f, -0.154411f, + -0.068914f, 0.186816f, 0.032251f, 0.109242f, 0.134825f, 0.035101f, + -0.253175f, 0.157309f, -0.363597f, -0.138176f, -0.334141f, -0.172697f, + 0.045800f, -0.286057f, 0.173403f, -0.172444f, -0.117996f, -0.383848f, + -0.173303f, -0.258482f, -0.021404f, -0.017898f, -0.001970f, 0.003273f, + 0.056121f, 0.155046f, 0.044708f, -0.295609f, -0.211688f, -0.233229f, + -0.264980f, 0.145549f, 0.045323f, -0.027112f, 0.175638f, -0.207251f, + -0.055274f, 0.092706f, 0.086200f, -0.241340f, -0.147416f, 0.024510f, + -0.357194f, -0.181944f, -0.050104f, -0.079024f, -0.290473f, -0.169790f, + -0.277982f, -0.017781f, -0.004854f, -0.094132f, -0.348555f, 0.199291f, + -0.343989f, -0.319299f, -0.268935f, -0.021208f, 0.020938f, -0.090609f, + 0.006595f, -0.200790f, 0.171856f, -0.027766f, -0.032017f, -0.006745f, + 0.566426f, -0.096850f, 0.727633f, -0.408065f, -0.012436f, 0.005646f, + -0.305148f, -0.095075f, -0.391549f, -0.020378f, -0.236498f, -0.252773f, + -0.231385f, -0.203175f, 0.041903f, -0.373694f, 0.058239f, -0.101116f, + 0.183772f, 0.164523f, -0.099046f, -0.201272f, -0.394523f, -0.157517f, + 0.032079f, -0.381173f, -0.238496f, -0.037990f, -0.294553f, 0.141473f, + 0.100268f, -0.023806f, 0.004978f, 0.184916f, 0.142699f, -0.113240f, + -0.213364f, -0.160059f, -0.216263f, -0.406387f, -0.301140f, -0.406355f, + -0.113085f, -0.279699f, -0.267434f, 0.126263f, -0.260527f, -0.153904f, + -0.494653f, -0.355144f, 0.030549f, -0.216400f, -0.123363f, 0.189090f, + 0.219122f, 0.096677f, -0.202037f, -0.014489f, -0.137859f, -0.114184f, + -0.279423f, -0.270683f, +}; + +static const float av1_early_term_after_split_nn_bias_64_layer0[] = { + -0.491455f, 0.464538f, -0.005742f, -0.219951f, -0.073682f, 0.102027f, + 0.567071f, 0.441402f, 0.277521f, 0.314498f, -0.448199f, -0.065032f, + 0.488139f, -0.079632f, 0.000000f, 0.521555f, -0.151950f, -0.034616f, + 0.393438f, -0.072242f, -0.087343f, -0.571308f, 0.017372f, -0.126144f, + 0.372261f, -0.451537f, -0.140238f, -0.092377f, -0.074475f, -0.068879f, + -0.109614f, -0.164492f, +}; + +static const float av1_early_term_after_split_nn_weights_64_layer1[] = { + -0.373195f, -0.283141f, 0.416113f, 0.483659f, 0.230583f, 0.349197f, + -0.168582f, -0.813338f, -0.472369f, -0.173872f, 1.297845f, 0.339355f, + -0.828033f, 0.019617f, 0.118757f, -0.619360f, 0.282295f, -0.054116f, + -0.730596f, 0.068567f, -0.248707f, 0.461225f, 0.330224f, -0.287080f, + -0.458103f, 0.591852f, -0.008491f, 0.632119f, -0.007872f, 0.007869f, + -0.230698f, -0.011437f, +}; + +static const float av1_early_term_after_split_nn_bias_64_layer1[] = { + -0.55403697f, +}; + +static const NN_CONFIG av1_early_term_after_split_nnconfig_64 = { + FEATURES, + 1, + 1, + { + HIDDEN_NODES, + }, + { + av1_early_term_after_split_nn_weights_64_layer0, + av1_early_term_after_split_nn_weights_64_layer1, + }, + { + av1_early_term_after_split_nn_bias_64_layer0, + av1_early_term_after_split_nn_bias_64_layer1, + }, +}; + +static const float av1_early_term_after_split_nn_weights_32_layer0[] = { + 0.026050f, -0.226531f, 0.308107f, -0.083744f, 0.201785f, 0.098562f, + 0.147595f, -0.495771f, -0.245741f, 0.201616f, -0.272070f, 
-0.579545f, + -0.127261f, -0.229588f, 0.250831f, -0.176929f, -0.031689f, 0.284718f, + 0.085845f, -0.285027f, 0.012304f, 0.382402f, -0.204591f, 0.272514f, + -0.065854f, -0.054228f, -0.231174f, -0.174504f, 0.258287f, 0.195689f, + 0.242530f, 0.023528f, -0.294242f, -0.272132f, 0.460180f, -0.731281f, + -0.208103f, 0.208204f, 0.348250f, 0.016328f, 0.043707f, -0.169551f, + 0.108521f, 0.226895f, -0.020471f, 0.102443f, 0.429640f, -0.252555f, + -0.218434f, -0.163665f, 0.175531f, 0.101588f, -0.135798f, -0.158102f, + 0.142565f, 0.128277f, 0.174985f, -0.100073f, 0.113967f, 0.223682f, + -0.145576f, -0.008443f, 0.112748f, -0.037845f, 0.076954f, -0.287137f, + -0.518185f, -0.106833f, 0.175359f, 0.031408f, 0.219069f, -0.294440f, + 0.007766f, 0.067754f, -0.049168f, -0.212368f, -0.261708f, 0.309252f, + 0.220859f, -0.274852f, -0.653157f, 0.083438f, -0.265386f, 0.174429f, + -0.116931f, -0.091594f, -0.244897f, -0.089015f, 0.274453f, 0.212890f, + 0.272053f, -0.425315f, -0.107726f, 0.294444f, -0.354629f, 0.104402f, + -0.307663f, 0.558430f, 0.140334f, -0.054831f, -0.449456f, 0.058274f, + -0.033768f, -0.354117f, -0.331618f, -0.411772f, 0.232064f, -0.079297f, + -0.638571f, 0.181823f, -0.039611f, 0.206310f, -0.659157f, -0.102930f, + -0.067303f, -0.176881f, -0.001038f, 0.091835f, 0.079739f, -0.121923f, + 0.211070f, 0.362719f, -0.154915f, -0.151876f, -0.165460f, 0.023469f, + -0.251036f, 0.210014f, -0.537125f, 0.156832f, -0.216987f, 0.062975f, + -0.198462f, 0.329123f, 0.125870f, 0.225830f, 0.086377f, -0.128773f, + -0.179673f, -0.074612f, 0.456645f, 0.021905f, -0.243140f, 0.059145f, + -0.273942f, -0.277822f, 0.154556f, -0.025459f, 0.227614f, -0.313076f, + 0.044705f, -0.019017f, 0.108999f, -0.020243f, -0.016373f, 0.560270f, + -0.064818f, 0.050880f, -0.218458f, 0.825699f, -0.534056f, -0.258253f, + 0.222073f, 0.013295f, 0.477870f, -0.386727f, 0.388509f, 0.004128f, + 0.451388f, -0.175788f, 0.264093f, -0.109812f, 0.358132f, 0.500992f, + -0.446933f, -0.222397f, 0.345834f, 0.370943f, -0.233115f, -0.047005f, + -0.111335f, -0.111586f, 0.026975f, -0.052191f, -0.111800f, -0.129782f, + 0.225132f, 0.102524f, 0.544557f, -0.111674f, -0.857884f, 0.133258f, + 0.310001f, 0.043829f, 0.104143f, 0.256493f, 0.242520f, -0.342082f, + 0.421447f, 0.124227f, 0.061542f, -0.090206f, 0.316681f, 0.353452f, + -0.918408f, -0.001903f, -0.052303f, -0.004816f, -0.446393f, -0.053038f, + 0.255725f, -0.126346f, 0.034095f, -0.240276f, -0.135918f, 0.095682f, + -0.147457f, -0.338216f, -0.200426f, 0.010265f, -0.243915f, -0.231375f, + -0.323924f, -0.014353f, 0.150252f, -0.264346f, 0.205303f, -0.194610f, + -0.282527f, 0.180555f, -0.000087f, 0.027240f, -0.000903f, -0.345877f, + -0.353274f, -0.311829f, 0.172985f, -0.111748f, -0.309380f, 0.108110f, + -0.260914f, -0.164990f, 0.183625f, -0.319692f, -0.096988f, 0.094147f, + -0.047062f, -0.080978f, 0.227387f, -0.000450f, -0.220159f, -0.211448f, + -0.020885f, -0.139646f, -0.086721f, 0.067928f, -0.033084f, -0.251996f, + 0.090317f, 0.086313f, -0.228420f, -0.111356f, -0.314304f, -0.223664f, + 0.188176f, -0.002360f, -0.029491f, -0.006000f, -0.075343f, 0.173699f, + -0.272800f, -0.238507f, -0.272071f, -0.015000f, -0.215305f, -0.192943f, + -0.038595f, 0.119537f, 0.260477f, -0.168014f, -0.172751f, 0.532861f, + -0.753250f, -0.017485f, -0.115541f, -0.109291f, -1.098943f, 0.418559f, + -0.532110f, 0.359323f, -0.254786f, 0.471316f, -0.545024f, 0.291912f, + -0.836939f, 0.443427f, -0.441709f, 0.168866f, -0.140372f, 0.546607f, + -0.315465f, 0.023328f, 0.137709f, -0.083492f, -0.049986f, -0.071302f, + -0.293680f, -0.105049f, 0.315317f, 
0.279569f, 0.220762f, 0.088161f, + -0.756456f, -0.074512f, 0.958318f, -0.332924f, -0.004906f, -0.629271f, + 0.212050f, 0.279123f, 0.311523f, -0.599580f, 0.516150f, 0.456952f, + 0.020255f, 0.247290f, -0.182670f, -0.335554f, 0.021203f, 0.131081f, + -0.208584f, 0.112530f, -0.198980f, 0.211583f, -0.101271f, -0.206453f, + -0.502688f, -0.294976f, -0.187019f, -0.114473f, 0.282050f, -0.165483f, + 0.094953f, -0.182578f, 0.055068f, 0.135605f, -0.266941f, -0.297556f, + 0.199181f, 0.015979f, -0.158659f, -0.226841f, 0.171306f, 0.013438f, + -0.286309f, -0.071753f, -0.170300f, -0.238188f, 0.093572f, -0.026230f, + -0.254502f, -0.297786f, -0.063480f, -0.300799f, -0.065644f, 0.074710f, + 0.248576f, -0.144425f, -0.113948f, -0.247297f, 0.276682f, 0.010963f, + -0.737786f, 0.026347f, 0.007830f, 0.753543f, 0.371904f, 0.305614f, + 0.105028f, 0.073530f, -0.119137f, 0.102352f, -0.080523f, 0.176366f, + -0.159457f, -0.339948f, 0.360131f, -0.007051f, -0.388378f, -0.101695f, + 0.663041f, -0.234486f, -0.142536f, -0.099931f, 0.041478f, 0.230425f, + 0.005743f, 0.154060f, 0.056233f, -0.080668f, -0.009754f, -0.194356f, + 0.185474f, -0.296474f, 0.192700f, 0.257767f, 0.348529f, 0.458265f, + 0.060276f, -0.130473f, 0.139889f, 0.310073f, -0.306869f, -0.272922f, + -0.259862f, 0.409207f, 0.431991f, -0.100357f, -0.050415f, -0.071830f, + -0.239665f, 0.153399f, 0.177192f, -0.611644f, -0.176114f, -0.022694f, + -0.033701f, -0.345842f, 0.015660f, 0.158931f, -0.097586f, 0.222001f, + 0.257887f, -0.171307f, -0.222607f, -0.245508f, -0.145742f, -0.096461f, + -0.010895f, 0.052815f, -0.265306f, -0.081059f, 0.219162f, -0.256084f, + -0.372676f, 0.148977f, 0.174831f, 0.086980f, 0.108518f, 0.074011f, + 0.038032f, -0.070856f, -0.109407f, 0.126174f, 0.022341f, -0.249786f, + -0.356164f, -0.202841f, -0.087437f, -0.133740f, 0.090956f, -0.017953f, + -0.028353f, 0.233621f, 0.109426f, 0.232798f, -0.104950f, -0.241798f, + -0.018995f, -0.167954f, 0.002473f, 0.060418f, -0.232717f, -0.195980f, + -0.283971f, -0.371881f, 0.219728f, 0.018072f, -0.166694f, -0.083301f, + -0.000616f, -0.212641f, -0.173158f, 0.222739f, -0.235302f, 0.237624f, + 0.222232f, -0.041235f, -0.342411f, 0.121194f, 0.211291f, -0.032237f, + -0.249401f, -0.291668f, 0.206055f, -0.148200f, 0.011824f, -0.272728f, + -0.194854f, 0.367175f, -0.257243f, 0.103433f, -0.231077f, 0.236734f, + 0.135733f, -0.362845f, 0.197147f, 0.242782f, -0.135289f, 0.123311f, + 0.259420f, -0.116278f, 0.127287f, 0.236789f, -0.097438f, 0.118073f, + 0.112796f, -0.035949f, 0.184408f, 0.200948f, -0.008859f, 0.195989f, + 0.161970f, -0.295320f, -0.330389f, 0.141034f, 0.066081f, -0.707857f, + 0.357037f, 0.149633f, 0.679877f, 0.548674f, 0.469076f, 0.194123f, + -0.209872f, -0.071764f, -0.126960f, 0.199420f, 0.327116f, -0.169053f, + -0.429156f, 0.443429f, -0.225530f, -0.130738f, -0.028351f, 0.644393f, + 0.049606f, -0.243602f, -0.409920f, 0.117028f, -0.258557f, 0.073865f, + -0.200454f, -0.139957f, -0.031314f, 0.162325f, 0.247221f, 0.071909f, + -0.336276f, 0.079922f, 0.192780f, -0.148882f, 0.133192f, -0.143177f, + -0.121327f, 0.126221f, -0.089521f, -0.181826f, 0.149923f, -0.280682f, + 0.391572f, 0.108990f, -0.445494f, -0.170787f, 0.225182f, 0.223313f, + -0.234828f, -0.071072f, -0.072673f, -0.093686f, 0.223892f, -0.049377f, + 0.057976f, 0.033558f, 0.068733f, -0.283353f, 0.217877f, 0.158093f, + -0.276761f, -0.097049f, -0.351913f, -0.383604f, 0.002863f, -0.474510f, + -0.096738f, 0.256940f, 0.234203f, -0.226667f, -0.260576f, -0.183403f, + -0.035578f, 0.141570f, 0.078764f, -0.028086f, 0.155800f, -0.251115f, + -0.286703f, -0.014739f, 
-0.072621f, -0.311506f, -0.048639f, 0.081621f, + 0.043057f, 0.068136f, -0.179903f, 0.143699f, -0.002571f, 0.239012f, + 0.197456f, 0.035745f, -0.311927f, 0.220320f, 0.102687f, -0.294105f, + 0.426740f, 0.209050f, 0.211907f, 0.083453f, 0.006578f, -0.143338f, + 0.003157f, 0.040295f, 0.234497f, 0.035344f, -0.163909f, 0.411115f, + 0.289453f, -0.075357f, -0.008884f, 0.469798f, -0.033304f, -0.153293f, + -0.229322f, -0.004162f, 0.113363f, 0.395381f, 0.067414f, -0.188966f, + -0.117424f, -0.166423f, 0.066839f, 0.595641f, -0.204782f, -0.451727f, + 0.198509f, -0.921583f, -0.246765f, -0.153411f, 0.046491f, 0.365906f, + 0.376710f, -0.017355f, -0.035232f, 0.138785f, -0.163918f, -0.283449f, + -0.094340f, 0.192127f, 0.154815f, 0.035787f, -0.029087f, 0.115649f, + -0.220133f, -0.452741f, 0.311667f, 0.157666f, 0.091401f, 0.236040f, + -0.168523f, 0.122176f, -0.219016f, -0.214856f, 0.172824f, -0.091810f, + 0.031520f, -0.857420f, 0.643446f, -0.017471f, 0.206082f, -0.933517f, + -0.020070f, -0.065091f, -0.117680f, -1.271870f, -0.069177f, -0.149409f, + 0.289970f, -0.889775f, -0.044741f, 0.232647f, -0.319416f, 0.073030f, + 0.278549f, 0.238782f, -0.202206f, 0.272540f, 0.201412f, 0.175574f, + -0.127971f, -0.253164f, -0.086352f, -0.005381f, 0.114714f, 0.505169f, + -0.175049f, -1.534280f, -0.320666f, -2.119298f, -0.023075f, -0.021259f, + -0.161019f, 0.344837f, 0.361958f, -0.097050f, 0.014375f, 0.267110f, + 0.341442f, -0.016688f, 0.073393f, 0.131500f, 0.246331f, 0.011059f, + 0.033597f, 0.014779f, -0.269366f, -0.504788f, 0.048651f, 0.295682f, + 0.237363f, 0.227484f, -0.235814f, -0.160530f, 0.182682f, -0.172999f, + -0.126630f, 0.168357f, -0.078729f, 0.052805f, 0.377021f, -0.004727f, + 0.230415f, -0.876673f, 0.458457f, 0.099401f, -0.019616f, 0.611982f, + -0.231508f, -0.070894f, -0.056142f, 0.548969f, -0.376599f, -0.600428f, + 0.241930f, -0.592893f, 0.189371f, 0.488651f, -0.092446f, -0.272569f, + 0.251643f, 0.315945f, -0.301468f, 0.112961f, 0.052119f, -0.066076f, + -0.082249f, 0.252805f, -0.195539f, 0.150386f, -0.865534f, 0.673447f, + 0.030177f, -0.438528f, -1.006174f, 0.575176f, -0.271656f, 0.035835f, + -1.056916f, 0.495267f, -0.092428f, -0.109511f, -0.192359f, 0.166669f, + -0.624326f, -0.000354f, -0.089075f, 0.176279f, -0.289347f, 0.021346f, + 0.020375f, 0.255282f, -0.045588f, 0.173675f, 0.100957f, -0.294373f, + 0.049303f, -0.134132f, -0.255731f, -0.025559f, -0.307463f, -0.205100f, + 0.079024f, 0.101113f, 0.135742f, -0.348869f, -0.026759f, -0.134155f, + -0.179275f, -0.054297f, -0.054948f, 0.029351f, 0.190560f, 0.102476f, + -0.025785f, 0.169442f, -0.271303f, 0.200667f, 0.099063f, 0.074767f, + -0.326533f, 0.044426f, -0.290251f, -0.082443f, -0.164482f, -0.349412f, + 0.045109f, -0.157330f, 0.165935f, 0.012672f, -0.059818f, 0.399140f, + -0.316620f, 0.386638f, -0.285399f, -0.296777f, -0.200473f, -0.144232f, + 0.251851f, -0.203768f, 0.001071f, -0.179063f, 0.248952f, -0.143029f, + 0.010423f, -0.030293f, -0.046786f, -0.196195f, -0.016845f, 0.295023f, + 0.322825f, 0.133683f, 0.017388f, 0.142467f, 0.221320f, 0.004059f, + -0.115770f, 0.143363f, 0.137972f, -0.272584f, 0.489366f, -0.091828f, + -0.014703f, 0.082332f, -0.476226f, -0.202859f, 0.356094f, -0.283049f, + 0.218086f, 0.202015f, 0.201724f, 0.012617f, 0.050720f, 0.255695f, + 0.244653f, 0.111296f, -0.151450f, -0.056210f, -0.757348f, 0.441724f, + -0.022455f, -0.244662f, 0.296205f, -0.421883f, -0.217386f, -0.254301f, + 0.409105f, -0.031309f, 0.050147f, -0.337170f, -0.106620f, -0.606455f, + 0.308024f, 0.298144f, 0.363993f, 0.704870f, -0.047292f, 0.166901f, + 0.105991f, -0.536757f, 
-0.424031f, -0.226034f, 0.213635f, -0.526754f, + 0.310990f, -0.116038f, 0.007775f, 0.538330f, -0.177912f, 0.445357f, + -0.290365f, 0.451169f, 0.030931f, 0.033388f, 0.209905f, -0.244492f, + -0.097792f, -0.246042f, 0.132047f, 0.032576f, 0.115516f, 0.022890f, + 0.093508f, -0.071840f, 0.362948f, -0.135245f, 0.659911f, -0.321413f, + 0.193118f, -0.795001f, -0.218311f, 0.024862f, 0.206172f, -0.832878f, + -0.255670f, 0.343402f, -0.275211f, -0.898363f, -0.025172f, 0.158565f, + 0.171347f, -0.127518f, -0.215156f, -0.159198f, 0.250355f, -0.132452f, + 0.061254f, -0.097544f, -0.223246f, 0.013183f, 0.239468f, 0.259017f, + -0.217739f, -0.032263f, 0.123755f, -0.701777f, 0.150049f, -0.555293f, + 0.062430f, -0.260304f, 0.494894f, -0.168702f, -0.134829f, -0.113989f, + 0.150092f, -0.060248f, 0.115711f, -0.277202f, 0.499811f, 0.417116f, + 0.191081f, -0.376432f, -0.321092f, 0.033992f, 0.057193f, 0.127077f, + -0.009042f, 0.014443f, 0.142808f, -0.124349f, 0.213087f, -0.381686f, + 0.129726f, -0.038396f, +}; + +static const float av1_early_term_after_split_nn_bias_32_layer0[] = { + -0.107171f, 0.060848f, -0.069480f, -0.121982f, 0.037637f, -0.291839f, + 0.102257f, -0.065889f, -0.032452f, 0.034171f, -0.073984f, -0.005236f, + 0.218820f, 0.132123f, -0.089621f, -0.067679f, 0.049368f, 0.329444f, + -0.184729f, 0.031702f, 0.009735f, -0.039964f, -0.018024f, -0.073031f, + -0.030166f, -0.191037f, -0.074862f, -0.076548f, 0.076537f, 0.216609f, + -0.078358f, -0.007740f, +}; + +static const float av1_early_term_after_split_nn_weights_32_layer1[] = { + 0.047869f, -0.231773f, -0.185663f, 0.460676f, -0.208182f, 0.590555f, + -0.622627f, 0.279377f, 0.351681f, 0.633504f, 1.069884f, 0.332449f, + -0.457703f, -0.435817f, -0.028853f, 0.327490f, -0.282469f, -0.975792f, + -0.062975f, -0.147187f, 0.348340f, -1.207116f, 0.516159f, -1.509626f, + -0.805072f, 0.522999f, 0.143671f, 0.304246f, -0.360720f, -0.612472f, + 0.260045f, -0.223243f, +}; + +static const float av1_early_term_after_split_nn_bias_32_layer1[] = { + -0.07571174f, +}; + +static const NN_CONFIG av1_early_term_after_split_nnconfig_32 = { + FEATURES, + 1, + 1, + { + HIDDEN_NODES, + }, + { + av1_early_term_after_split_nn_weights_32_layer0, + av1_early_term_after_split_nn_weights_32_layer1, + }, + { + av1_early_term_after_split_nn_bias_32_layer0, + av1_early_term_after_split_nn_bias_32_layer1, + }, +}; + +static const float av1_early_term_after_split_nn_weights_16_layer0[] = { + -0.113798f, 0.053357f, -0.037947f, -0.477171f, 0.276517f, -0.349252f, + -0.177284f, 0.189597f, 0.141744f, 0.230207f, -0.328104f, 0.074328f, + 0.247717f, 0.233533f, 0.145167f, 0.018029f, -0.398725f, -0.226199f, + -0.309724f, 0.125279f, 0.194759f, 0.025531f, 0.349714f, -0.273944f, + 0.186871f, 0.181735f, -0.520614f, -0.264076f, 0.308207f, 0.157438f, + -0.137791f, -0.054582f, 0.125879f, 0.796218f, -0.897562f, 0.885439f, + 0.381640f, 0.106625f, -2.027456f, 0.000874f, 0.179581f, 0.013287f, + -2.329439f, -0.163169f, -0.136191f, 0.320108f, -2.318779f, -0.196722f, + -0.295721f, 0.203658f, -0.182275f, 0.615941f, 0.015762f, 0.257181f, + -0.115297f, 0.295774f, -0.026144f, -0.022686f, -0.219423f, -0.042861f, + 0.207647f, -0.057791f, 0.201671f, -0.169569f, 0.291492f, -0.994991f, + 0.137473f, 0.230948f, 0.505626f, -1.065860f, 0.275225f, -0.250861f, + 0.519466f, -1.217242f, -0.087384f, 0.053441f, 0.030729f, -1.702304f, + -0.034635f, 0.010177f, -0.035422f, -0.749979f, 0.355499f, 0.408166f, + -0.086883f, 0.017203f, 0.195706f, -0.218056f, -0.029153f, 0.367335f, + -0.061732f, -0.241068f, 0.078496f, -0.370346f, -0.124223f, 
-0.172708f, + 0.037971f, 0.038875f, -0.282489f, -0.266323f, -0.210864f, 0.214714f, + 0.234695f, -0.045625f, 0.015357f, -0.007464f, -0.362003f, -0.113465f, + 0.145141f, 0.238470f, -0.202664f, -0.286587f, -0.347112f, 0.054501f, + -0.190290f, -0.283256f, 0.062179f, 0.041165f, -0.006935f, -0.220351f, + -0.088800f, 0.220924f, -0.200982f, 0.058493f, -0.225175f, 0.057175f, + -0.618187f, 0.761023f, -0.743774f, -0.500599f, -0.584999f, 1.545211f, + 0.123055f, -0.106848f, -0.353057f, 1.552187f, 0.174104f, 0.068060f, + -0.449859f, 1.254299f, -0.161716f, -0.060630f, -0.230721f, 0.165976f, + -0.101582f, -0.422415f, 0.110384f, -0.130098f, 0.104428f, 0.083518f, + 0.031626f, 0.083048f, 0.158877f, 0.173340f, 0.063962f, 0.427845f, + 0.663268f, 0.376996f, 0.146435f, -0.091329f, 0.443447f, 0.518432f, + -0.182777f, -0.091313f, 0.331229f, 0.532604f, -0.187001f, 0.054774f, + 0.298068f, 0.502295f, -0.362378f, 0.054283f, 0.292806f, 0.168901f, + -0.214787f, 0.025637f, 0.458009f, -0.322714f, -0.264059f, 0.140313f, + -0.102696f, -0.431208f, -0.134450f, -0.545415f, 0.253851f, -0.009061f, + -0.050681f, 0.108681f, 0.043272f, -1.073133f, 0.206410f, 0.469576f, + 0.291494f, -2.021244f, -0.001183f, -0.067542f, 0.364907f, -2.470543f, + 0.049147f, -0.018868f, 0.658500f, -2.531048f, 0.275433f, -0.034224f, + -0.171386f, 0.096369f, 0.728069f, 0.272332f, 0.222255f, -0.030426f, + 0.026994f, 0.208928f, -0.173943f, -0.227581f, -0.214798f, 0.079341f, + 0.032344f, -0.253575f, -0.044353f, -0.239265f, -0.055852f, -0.162582f, + -0.086592f, 0.066487f, 0.337353f, -0.168704f, 0.015702f, 0.022607f, + 0.286647f, 0.218106f, 0.193319f, -0.358714f, 0.030796f, 0.007646f, + -0.045617f, 0.165007f, -0.284641f, -0.291812f, 0.207544f, 0.082823f, + -0.141907f, -0.331336f, -0.052908f, 0.120716f, 0.202521f, 0.232782f, + -0.348141f, -0.017332f, 1.191126f, -0.391987f, -0.154537f, -0.206551f, + -2.378690f, 0.057918f, -0.328183f, 2.151556f, 0.238803f, 0.164880f, + -0.480039f, 1.616200f, 0.260243f, 0.083704f, -0.174461f, 1.804634f, + 0.194810f, 0.223837f, 0.550107f, -0.068171f, -0.293435f, -0.186770f, + -0.364846f, 0.127181f, 0.105556f, -0.016202f, 0.278403f, -0.344995f, + -0.009761f, -0.082555f, 0.046731f, -0.301452f, 0.604259f, 0.055895f, + 0.049862f, 0.314249f, -0.305811f, -0.112937f, 0.658787f, -0.549288f, + -0.307567f, -0.460650f, -0.840643f, 0.082576f, 0.373711f, 0.138318f, + 0.336901f, 0.284984f, -0.281400f, 0.408210f, -0.449858f, 0.461054f, + 0.227629f, -0.131705f, 0.301769f, -0.278540f, 0.189290f, -0.269041f, + 0.111350f, -0.300257f, 0.436858f, -0.265920f, -0.211938f, 0.272631f, + 0.206291f, 0.253273f, -0.229776f, -0.031112f, -0.171183f, -0.109676f, + -0.202390f, -0.068857f, 0.182125f, -0.140523f, -0.308742f, -0.045840f, + 0.256545f, -0.262405f, 0.225951f, -0.287463f, -0.189203f, -0.055552f, + -0.052448f, -0.242839f, -0.278877f, 0.140920f, -0.175755f, 0.215402f, + -0.248841f, -0.264080f, -0.178303f, 0.147777f, 0.049460f, -0.279877f, + -0.539725f, -0.004622f, 0.182874f, 0.338814f, 0.265974f, 0.249851f, + -0.141154f, 0.157228f, -0.090972f, 0.179444f, 0.305255f, 0.127788f, + 0.123270f, 0.355320f, 0.076797f, 0.263495f, 0.235965f, -0.133816f, + 0.243624f, 0.227062f, -0.213629f, 0.002075f, 0.061203f, -0.077820f, + -0.008807f, -0.247324f, -0.051464f, -0.191894f, -0.238713f, -0.389526f, + -0.274248f, 0.053950f, -0.225750f, -0.367097f, -0.122391f, 0.181212f, + -0.411824f, -0.084241f, -0.302288f, 0.077860f, -0.187443f, -0.300262f, + 0.083156f, -0.392461f, -0.332320f, -0.346474f, 0.140658f, -0.283656f, + 0.120714f, -0.056577f, -0.280968f, 0.017795f, -0.024686f, 
0.073113f, + -0.346637f, 0.082567f, -0.036556f, -0.369730f, 0.081225f, -0.005211f, + 0.144886f, -0.003544f, 0.178307f, -0.366035f, -0.063887f, -0.191767f, + 0.105835f, -0.273978f, -0.266532f, -0.023984f, 0.039166f, 0.065848f, + -0.026802f, -0.268923f, 0.189659f, 0.086300f, 0.030718f, 0.216565f, + -0.130025f, -0.215687f, 0.146341f, -0.286438f, -0.394226f, -0.181509f, + -0.005612f, 0.186040f, 0.133491f, 0.032096f, -0.261609f, 0.074007f, + -0.042929f, -0.234479f, 0.189704f, 0.088395f, -0.003671f, -0.125055f, + -0.252418f, -0.086387f, 0.111197f, -0.297071f, -0.018793f, -0.031902f, + -0.333191f, -0.186279f, 0.039868f, 0.091419f, -0.264438f, -0.216150f, + -0.212550f, 0.203412f, -0.113028f, -0.197169f, -0.346771f, 0.086066f, + 0.091443f, -0.128507f, -0.007281f, -0.118389f, 0.003370f, -0.338661f, + 0.026739f, -0.063571f, -0.281567f, -0.166824f, 0.167455f, 0.216173f, + 0.199163f, 0.256314f, -0.222679f, 0.040282f, -0.154808f, -0.133943f, + -0.270163f, -0.357398f, 0.260373f, 0.176950f, -0.125162f, -0.085050f, + 0.226376f, -0.124585f, -0.324804f, 0.035536f, -0.133600f, 0.173450f, + 0.068107f, -0.337442f, 0.169629f, 0.047223f, 0.057878f, 0.055555f, + -0.317449f, -0.103768f, 0.080899f, -0.194759f, -1.137593f, 0.508999f, + 0.045372f, 1.746454f, 1.250347f, -0.342930f, -0.127821f, -0.220175f, + -0.417649f, -0.480595f, 0.071902f, 0.050231f, -0.562554f, -0.677866f, + -0.121416f, -0.247558f, -0.483876f, -0.504157f, 1.731953f, 0.572936f, + 0.047325f, 0.050619f, 0.112611f, -0.035393f, 0.052585f, -0.071076f, + -0.015798f, -0.050228f, -0.142875f, 0.189329f, 0.048833f, 0.503633f, + 0.249588f, 0.175492f, -0.137664f, -0.018533f, 0.288453f, -0.025644f, + 0.079131f, 0.195096f, -0.154039f, -0.104220f, -0.224072f, 0.095946f, + -0.208424f, 0.214745f, 0.056468f, 0.182603f, 0.341784f, -0.134664f, + -0.194050f, 0.058532f, -0.107336f, -0.087783f, -0.238795f, -0.387212f, + 0.049055f, -0.127417f, -0.299919f, -0.094371f, -0.011735f, -0.264753f, + 0.407375f, -0.462654f, -0.609488f, 0.027742f, -0.985512f, -0.109154f, + -0.423276f, 2.347960f, 0.129240f, 0.187610f, -0.057081f, 2.424892f, + 0.087666f, 0.106716f, -0.039379f, 2.764866f, 0.113309f, 0.028196f, + -0.582789f, 0.335385f, -0.538029f, -0.477337f, -0.114207f, 0.178829f, + 0.006276f, 0.123179f, 0.095101f, 0.139898f, -0.372074f, -0.111010f, + 0.136330f, 0.272900f, 0.126737f, -0.097808f, -0.363697f, 0.108665f, + -0.227749f, -0.083421f, 1.714677f, 0.451943f, 0.107931f, -0.392281f, + 1.615846f, 0.022307f, -0.247011f, 0.257703f, 1.039134f, 0.537789f, + 0.022177f, -0.271532f, 0.351350f, -0.399205f, -0.240534f, -0.315399f, + 0.026928f, -0.005618f, 0.053179f, -0.010277f, 0.000501f, 0.040896f, + -0.109160f, 0.018282f, 0.003887f, 0.199599f, 0.095349f, -0.337284f, + 0.169929f, -0.109409f, -0.166983f, 0.059908f, -0.226574f, -0.120114f, + 0.077329f, -0.333133f, -0.220936f, 0.114309f, -0.233965f, -0.281551f, + 0.042948f, 0.100940f, 0.116037f, -0.313122f, 0.215149f, -0.309057f, + -0.341052f, -0.294417f, -0.179722f, 0.010795f, 0.192053f, -0.275261f, + -0.033077f, 0.117348f, 0.090206f, 0.781573f, 0.602456f, -0.220296f, + 0.172159f, 0.758513f, 0.157910f, -0.217897f, -0.372659f, 0.031935f, + 0.791463f, 0.267195f, 0.931593f, -0.057349f, 0.405512f, -0.058512f, + -0.641663f, -0.076592f, 0.550227f, -0.024094f, 0.048218f, -0.289971f, + 0.180940f, 0.167533f, 0.052711f, -0.360726f, 0.019210f, -0.488879f, + 0.380498f, 0.151608f, -0.276895f, -0.596554f, 0.106076f, -0.245833f, + -0.048783f, 0.073823f, 0.098780f, 0.000211f, 0.113958f, -0.068964f, + -0.265533f, -0.185457f, 0.175586f, -0.163621f, -0.204919f, 
0.145802f, + -0.163421f, 0.129576f, -0.153486f, -0.105573f, 0.067289f, -0.213120f, + -0.286103f, 0.249543f, -0.044970f, -0.170464f, -0.105501f, -0.094765f, + -0.050734f, -0.369468f, 0.180020f, -0.363328f, -0.151654f, -0.262550f, + -0.424503f, 0.829032f, -0.559452f, 0.506837f, 0.143823f, 0.276660f, + -1.808608f, -0.259517f, -0.053945f, 0.035676f, -1.842195f, -0.065960f, + -0.069285f, 0.462022f, -2.319453f, -0.370299f, 0.183329f, -0.146412f, + -0.563875f, 0.305068f, 0.480904f, 0.044319f, -0.016098f, 0.168516f, + 0.114874f, -0.097621f, -0.030373f, 0.177700f, 0.181591f, -0.146003f, + -0.330853f, -0.259200f, 0.779319f, -1.517524f, 0.178781f, 0.135451f, + 0.088784f, -2.076089f, 0.628717f, -0.048685f, 0.281327f, -2.341596f, + 0.422171f, 0.006135f, 0.367096f, -1.663118f, 0.365253f, -0.072884f, + -0.197620f, -0.688634f, 0.477354f, 0.395841f, -0.098505f, 0.208709f, + -0.027523f, 0.127119f, 0.106274f, 0.114424f, -0.122877f, -0.087245f, + 0.086923f, -0.527398f, -0.342062f, -0.764662f, 0.713094f, -0.626453f, + -0.081454f, -0.087683f, 0.885047f, 0.323440f, -0.018579f, -0.217166f, + 1.617984f, -0.159038f, 0.265991f, -0.390313f, 1.933182f, -0.032431f, + -0.057513f, -0.300841f, 0.461248f, -0.072147f, -0.287052f, -0.078056f, + 0.011734f, 0.044013f, 0.177174f, 0.093400f, 0.028819f, 0.193686f, + -0.224853f, 0.268321f, -0.075059f, 0.074526f, -0.015618f, 0.165615f, + -0.276780f, -0.063908f, -0.369264f, -0.171497f, -0.173624f, -0.130743f, + -0.224625f, -0.124980f, -0.104482f, 0.076864f, -0.009631f, -0.164682f, + 0.150480f, -0.111880f, -0.260425f, 0.086234f, -0.176936f, -0.136771f, + -0.168867f, -0.405626f, -0.288716f, -0.128950f, -0.207327f, 0.015581f, + -0.109061f, -0.098970f, 0.090792f, -0.109623f, 0.349851f, 0.266341f, + -0.088602f, -0.108071f, 0.082519f, 0.472650f, -1.838758f, 0.456694f, + 0.119927f, 0.461077f, -2.860022f, 0.231495f, 0.235771f, 0.256424f, + -1.938516f, -0.188202f, -0.000832f, -0.518206f, 0.194644f, 0.505510f, + 0.615657f, 0.193760f, 0.224600f, 0.265732f, -0.121553f, -0.354597f, + -0.242414f, -0.276639f, -0.057591f, 0.026369f, -0.261148f, -0.356155f, + -0.149178f, -0.353566f, -0.340835f, -0.141776f, 0.076535f, 0.221299f, + -0.108857f, -0.156514f, 0.050901f, 0.058541f, -0.077141f, 0.071515f, + -0.333283f, -0.181489f, -0.212900f, -0.224698f, -0.174693f, -0.178665f, + -0.143374f, -0.091811f, 0.165161f, 0.060156f, -0.086103f, -0.039031f, + -0.377759f, -0.370533f, 0.074431f, 0.064192f, 0.186576f, 0.447858f, + -0.082260f, -0.020268f, -0.123089f, -0.402017f, 0.080500f, 0.176286f, + 2.850013f, 0.019385f, -0.225361f, -0.235315f, 1.654694f, -0.073978f, + -0.341412f, -1.187575f, 2.815900f, -0.228063f, -0.174547f, 0.623825f, + -0.010676f, 0.157189f, 0.111879f, -0.198965f, 0.051851f, 0.158396f, + 0.045194f, 0.293531f, -0.246714f, -0.351493f, 0.026954f, 0.076233f, + 0.420367f, 0.168154f, -0.131450f, 0.134487f, -0.288851f, -0.134553f, + 0.014902f, 0.756381f, 0.277713f, 0.190080f, -0.020869f, 1.446672f, + 0.029792f, -0.025927f, 0.060640f, 0.559864f, 0.422229f, 0.198459f, + 0.036167f, 0.029432f, 0.001882f, 0.038480f, -0.160528f, -0.288855f, + -0.310886f, 0.291296f, 0.190558f, -0.182816f, -0.002252f, 0.073101f, + -0.172245f, -0.305980f, 0.112492f, -0.422839f, -0.295999f, -0.078160f, + -0.173405f, -0.032819f, 0.373774f, -0.715223f, 0.018911f, 0.131753f, + -0.237364f, -0.128499f, -0.228406f, 0.341619f, 0.343552f, -0.521581f, + -0.263790f, 0.362502f, -0.018450f, 0.054233f, 0.183068f, 0.382772f, + 0.188811f, -0.627287f, 0.040399f, -0.487338f, -0.192591f, 0.247426f, + 0.154372f, -0.483994f, +}; + +static const 
float av1_early_term_after_split_nn_bias_16_layer0[] = { + -0.173976f, 0.305495f, 0.250981f, -0.067127f, -0.313100f, 0.242464f, + 0.315196f, -0.056052f, -0.241227f, -0.253308f, -0.002697f, 0.003687f, + -0.124421f, -0.090383f, -0.070366f, -0.064074f, -0.056115f, 0.123313f, + -0.239698f, -0.182082f, -0.065296f, 0.021503f, -0.036787f, 0.311861f, + 0.118135f, -0.320456f, -0.110719f, 0.220692f, -0.071727f, -0.088226f, + -0.110874f, -0.111671f, +}; + +static const float av1_early_term_after_split_nn_weights_16_layer1[] = { + -0.338573f, 0.398159f, 0.314774f, -0.037448f, -0.271950f, -0.774991f, + 0.950901f, -0.225380f, -1.841906f, -0.350379f, -0.079350f, 0.383148f, + -0.183676f, -0.313132f, -0.340820f, -0.309401f, -1.050540f, -0.432267f, + -0.657195f, 0.927632f, -0.040150f, 0.578920f, 0.212301f, 0.292495f, + 0.563590f, -0.205735f, 0.195877f, 0.582122f, -0.217860f, 1.613379f, + 0.313278f, -0.555802f, +}; + +static const float av1_early_term_after_split_nn_bias_16_layer1[] = { + 0.16553f, +}; + +static const NN_CONFIG av1_early_term_after_split_nnconfig_16 = { + FEATURES, + 1, + 1, + { + HIDDEN_NODES, + }, + { + av1_early_term_after_split_nn_weights_16_layer0, + av1_early_term_after_split_nn_weights_16_layer1, + }, + { + av1_early_term_after_split_nn_bias_16_layer0, + av1_early_term_after_split_nn_bias_16_layer1, + }, +}; + +static const float av1_early_term_after_split_nn_weights_8_layer0[] = { + -0.719472f, 0.305806f, 0.855829f, 0.100094f, 0.412517f, 1.254673f, + 1.552105f, -5.890773f, -0.089957f, -0.016736f, 1.418074f, -5.393506f, + -0.028214f, 0.117758f, 1.479209f, -5.299794f, 0.171585f, -0.084182f, + -0.162105f, 0.388577f, -0.044319f, -0.025861f, 0.251782f, -0.181462f, + -0.101545f, -0.079999f, -0.033014f, -0.191627f, -0.032802f, -0.053404f, + 0.038038f, -0.119492f, 0.049104f, -0.344384f, -0.354513f, 0.036977f, + 0.017513f, -0.004025f, -0.163212f, -0.261999f, 0.146575f, 0.207541f, + 0.130365f, -0.252127f, 0.097419f, -0.231057f, -0.309421f, 0.347866f, + -0.064670f, -0.283171f, -0.244193f, -0.193323f, -0.226954f, -0.276194f, + -0.233553f, 0.156354f, -0.184009f, 0.344289f, -0.308058f, -0.205202f, + -0.325068f, 0.183820f, -0.361667f, -0.069559f, -0.121834f, -0.038357f, + -0.210043f, -0.266129f, 0.003188f, 0.074902f, -0.328843f, 0.293679f, + -0.234698f, -0.428268f, -0.308772f, -0.136538f, -0.008384f, -0.078227f, + 0.166074f, -0.262899f, 0.102114f, -0.323420f, 0.057064f, -0.203318f, + -0.397413f, -0.317324f, -0.307093f, 0.020574f, -0.188627f, 0.132529f, + 0.118992f, -0.487387f, -0.282975f, 0.573231f, -0.266071f, 0.125140f, + -0.970034f, 1.424008f, -0.487366f, -0.196415f, 3.680273f, -0.008407f, + 0.081109f, -0.187479f, 3.876021f, 0.159168f, 0.111721f, -0.337423f, + 3.901760f, 0.261268f, -0.245555f, -0.187632f, -0.324298f, 0.167234f, + 0.170986f, -0.473055f, 0.087016f, -0.003469f, 0.051035f, 0.251794f, + 0.153549f, 0.217609f, -0.326870f, -0.175511f, 0.637341f, -0.694837f, + -0.873487f, -0.186614f, -1.089884f, -0.607316f, -0.523519f, 5.256331f, + 0.071414f, 0.215265f, -0.835999f, 5.735746f, 0.300101f, 0.089626f, + -0.450261f, 5.608051f, 0.190491f, 0.110220f, -0.595360f, -0.446324f, + 0.311380f, 0.268812f, -0.339656f, -0.008708f, 0.011111f, -0.027557f, + 0.171534f, 0.000676f, 0.227232f, 0.033993f, 0.146684f, 0.094817f, + -0.175381f, -0.211927f, -0.362471f, 0.168834f, 0.264149f, -0.350538f, + -0.463249f, -0.288105f, 0.347155f, 0.183231f, -0.229732f, -0.252202f, + -0.218074f, -0.008769f, -0.156103f, 0.181233f, -0.354736f, 0.263270f, + -0.106636f, 0.081057f, 0.060634f, -0.046887f, 0.050468f, 0.071259f, + 
0.221287f, 0.199071f, -0.180185f, -0.406902f, -0.239351f, -0.034957f, + 0.369140f, 0.864600f, 0.233798f, 0.423612f, -0.468918f, 0.976987f, + 0.691198f, -1.597908f, 0.102926f, 0.305546f, 0.391196f, -3.909059f, + 0.333635f, 0.311561f, 0.738886f, -4.002001f, 0.236394f, -0.233141f, + 0.263342f, 0.679898f, 0.136233f, 0.254743f, -0.367571f, 0.066412f, + 0.001606f, -0.059542f, 0.051726f, -0.347145f, -0.045501f, -0.313847f, + -0.021952f, 1.386316f, -0.579139f, -1.275844f, -0.003493f, -1.716577f, + 0.250209f, 0.192086f, 4.177055f, 0.351835f, 0.338177f, 0.140163f, + 4.099592f, 0.321866f, -0.128153f, -0.360414f, 4.350767f, 0.025943f, + -0.116740f, -0.664107f, -0.064558f, -0.039553f, -0.208186f, -0.678774f, + 0.149441f, -0.019823f, 0.012759f, 0.404442f, -0.108881f, 0.067974f, + -0.188278f, 0.136327f, 0.109927f, -0.179270f, -0.272342f, 0.018064f, + -0.304216f, -0.469470f, 0.109310f, -0.326214f, 0.061909f, -0.278997f, + -0.352329f, -0.333770f, -0.186522f, -0.328567f, -0.206211f, -0.008804f, + 0.042441f, -0.126699f, -0.420399f, -0.033842f, 0.016773f, -0.273789f, + 0.081928f, -0.191552f, -0.179533f, -0.263070f, -0.471807f, 0.062601f, + -0.232576f, 0.082955f, -0.490080f, 0.073820f, -0.090384f, 0.035781f, + -0.158880f, -0.506793f, -0.069132f, 0.047602f, -0.349640f, -0.058389f, + -0.017387f, -0.194636f, -0.457227f, -0.143105f, 0.222045f, -0.548909f, + -0.131561f, 0.247196f, -0.207923f, 0.133056f, -0.509854f, -0.193685f, + -0.181327f, -0.242442f, 0.091821f, 0.114430f, -0.375233f, -0.015254f, + -0.336632f, -0.060279f, -0.169169f, -0.429914f, -0.036563f, -0.400560f, + -0.076332f, -0.186232f, -0.268491f, 0.075561f, -0.389082f, -0.077435f, + 0.352562f, -0.020086f, -0.338181f, -0.404629f, 0.254983f, 0.150477f, + -0.265903f, 0.003341f, 0.099969f, -0.211964f, -0.129372f, -0.166366f, + 0.327712f, -0.276234f, 0.140675f, -0.433677f, -0.163050f, -0.143578f, + -0.397840f, -0.422130f, -0.293835f, -0.075362f, -0.468375f, 1.021238f, + 1.394155f, -0.922486f, -1.350222f, 2.030201f, 0.057717f, 0.227650f, + -0.193179f, 0.037224f, 0.065555f, 0.020558f, -0.059205f, -0.023690f, + -0.008718f, 0.095976f, -0.549587f, -0.321164f, -0.243728f, 1.344381f, + -1.254107f, 0.294244f, -0.154737f, -0.152597f, 0.342419f, 0.301883f, + 0.069866f, -0.327766f, 0.209323f, -0.364913f, -0.005530f, -0.558972f, + 0.057684f, -0.309357f, -0.283325f, -0.278445f, -0.420115f, -0.418457f, + -0.391481f, -0.418460f, -0.003897f, -0.023744f, -0.312330f, -0.366213f, + 0.269628f, -0.274877f, -0.189988f, -0.419555f, -0.034033f, 0.192874f, + -0.135487f, -0.326108f, -0.039019f, 0.185029f, -0.264883f, -0.563447f, + -0.163532f, -0.447652f, -0.141851f, 0.001714f, -0.193184f, 0.032609f, + -0.112883f, 0.074599f, 0.490665f, 0.434764f, 0.021652f, -0.219618f, + 0.743267f, 0.147195f, -0.303479f, -0.097674f, 0.195813f, 0.704007f, + -1.290851f, 0.119701f, 0.224065f, 0.260246f, -0.580657f, -0.096201f, + -0.333214f, -0.586689f, 0.567178f, 0.157340f, -0.043184f, 0.194358f, + -0.026506f, -0.339894f, -0.571803f, -0.234828f, 0.147054f, -0.564178f, + -0.156933f, -0.366055f, -0.691687f, -0.187501f, 0.215834f, -0.346106f, + -0.256892f, 0.110915f, -0.337464f, -0.341474f, -0.216113f, 0.249445f, + -0.070175f, -0.412141f, 0.153458f, -0.081280f, 0.164669f, -0.356396f, + -0.294971f, -0.165121f, -0.133585f, -0.071467f, 0.295147f, -0.253233f, + -0.213833f, -0.343416f, -0.474344f, -0.304000f, -0.341379f, -0.331456f, + -0.393952f, -0.508004f, -0.569518f, -0.509864f, 0.121961f, 0.011957f, + 0.000498f, -0.201969f, -0.407195f, -0.414375f, -0.295846f, 0.247492f, + 0.124249f, -0.550804f, -0.420397f, 
-0.123462f, 0.333292f, -0.240230f,
+  -0.025604f, 0.337536f, -0.295006f, -0.272614f, -0.496850f, -0.278521f,
+  0.234591f, -0.052775f, -0.014052f, -0.260078f, -0.279128f, -0.036385f,
+  0.008714f, -0.064018f, -0.124873f, -0.334014f,
+};
+
+static const float av1_early_term_after_split_nn_bias_8_layer0[] = {
+  1.202379f, -0.117005f, -0.135527f, -0.262255f, -0.443658f, -0.078981f,
+  0.615653f, -0.124482f, -0.227768f, -0.227014f, -0.135898f, 0.143216f,
+  -0.225995f, 0.370877f, -0.214821f, -0.227752f,
+};
+
+static const float av1_early_term_after_split_nn_weights_8_layer1[] = {
+  0.376594f, 0.266703f, -0.039847f, 1.680142f, -0.879939f, 0.286806f,
+  -0.378223f, -0.405295f, -0.021107f, 0.039188f, 0.259308f, 0.193091f,
+  0.077994f, -0.269141f, 0.011180f, -0.019262f,
+};
+
+static const float av1_early_term_after_split_nn_bias_8_layer1[] = {
+  -1.29585564f,
+};
+
+static const NN_CONFIG av1_early_term_after_split_nnconfig_8 = {
+  FEATURES,
+  1,
+  1,
+  {
+      16,
+  },
+  {
+      av1_early_term_after_split_nn_weights_8_layer0,
+      av1_early_term_after_split_nn_weights_8_layer1,
+  },
+  {
+      av1_early_term_after_split_nn_bias_8_layer0,
+      av1_early_term_after_split_nn_bias_8_layer1,
+  },
+};
+#undef FEATURES
+#undef HIDDEN_NODES
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_
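(Editorial note, not part of the upstream patch: the four
av1_early_term_after_split_nnconfig_* models above share one shape -- FEATURES
inputs, a single output logit, and one hidden layer (32 nodes, or 16 for the
8x8 model). Each NN_CONFIG initializer lists, in order, the input count, the
output count, the number of hidden layers, the hidden-layer widths, and then
per-layer weight and bias arrays. The sketch below shows how a caller could
evaluate such a model; it assumes the av1_nn_predict() helper declared in
av1/encoder/ml.h, and the feature vector, model choice, and threshold are
placeholders for values that the partition search computes elsewhere.)

static inline int early_term_after_split_sketch(const float *features,
                                                float thresh) {
  // Runs the two fully-connected layers described by the NN_CONFIG
  // (ReLU on the hidden nodes, linear single-logit output).
  float logit = 0.0f;
  av1_nn_predict(features, &av1_early_term_after_split_nnconfig_64,
                 /*reduce_prec=*/1, &logit);
  // Illustrative decision rule: a sufficiently low logit suggests that
  // evaluating further splits is unlikely to pay off.
  return logit < thresh;
}

diff --git a/third_party/aom/av1/encoder/partition_search.c b/third_party/aom/av1/encoder/partition_search.c
new file mode 100644
index 0000000000..1c17b09ee1
--- /dev/null
+++ b/third_party/aom/av1/encoder/partition_search.c
@@ -0,0 +1,6263 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.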
+ */
+
+#include
+
+#include "aom_dsp/txfm_common.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/aq_complexity.h"
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodeframe_utils.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/intra_mode_search_utils.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/nonrd_opt.h"
+#include "av1/encoder/partition_search.h"
+#include "av1/encoder/partition_strategy.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/tokenize.h"
+#include "av1/encoder/var_based_part.h"
+#include "av1/encoder/av1_ml_partition_models.h"
+
+#if CONFIG_TUNE_VMAF
+#include "av1/encoder/tune_vmaf.h"
+#endif
+
+#define COLLECT_MOTION_SEARCH_FEATURE_SB 0
+
+void av1_reset_part_sf(PARTITION_SPEED_FEATURES *part_sf) {
+  part_sf->partition_search_type = SEARCH_PARTITION;
+  part_sf->less_rectangular_check_level = 0;
+  part_sf->use_square_partition_only_threshold = BLOCK_128X128;
+  part_sf->auto_max_partition_based_on_simple_motion = NOT_IN_USE;
+  part_sf->default_max_partition_size = BLOCK_LARGEST;
+  part_sf->default_min_partition_size = BLOCK_4X4;
+  part_sf->adjust_var_based_rd_partitioning = 0;
+  part_sf->max_intra_bsize = BLOCK_LARGEST;
+  // This setting only takes effect when partition_search_type is set
+  // to FIXED_PARTITION.
+  part_sf->fixed_partition_size = BLOCK_16X16;
+  // Recode loop tolerance %.
+  part_sf->partition_search_breakout_dist_thr = 0;
+  part_sf->partition_search_breakout_rate_thr = 0;
+  part_sf->prune_ext_partition_types_search_level = 0;
+  part_sf->prune_part4_search = 0;
+  part_sf->ml_prune_partition = 0;
+  part_sf->ml_early_term_after_part_split_level = 0;
+  for (int i = 0; i < PARTITION_BLOCK_SIZES; ++i) {
+    part_sf->ml_partition_search_breakout_thresh[i] =
+        -1;  // -1 means not enabled.
+  }
+  part_sf->simple_motion_search_prune_agg = SIMPLE_AGG_LVL0;
+  part_sf->simple_motion_search_split = 0;
+  part_sf->simple_motion_search_prune_rect = 0;
+  part_sf->simple_motion_search_early_term_none = 0;
+  part_sf->simple_motion_search_reduce_search_steps = 0;
+  part_sf->intra_cnn_based_part_prune_level = 0;
+  part_sf->ext_partition_eval_thresh = BLOCK_8X8;
+  part_sf->rect_partition_eval_thresh = BLOCK_128X128;
+  part_sf->ext_part_eval_based_on_cur_best = 0;
+  part_sf->prune_ext_part_using_split_info = 0;
+  part_sf->prune_rectangular_split_based_on_qidx = 0;
+  part_sf->early_term_after_none_split = 0;
+  part_sf->ml_predict_breakout_level = 0;
+  part_sf->prune_sub_8x8_partition_level = 0;
+  part_sf->simple_motion_search_rect_split = 0;
+  part_sf->reuse_prev_rd_results_for_part_ab = 0;
+  part_sf->reuse_best_prediction_for_part_ab = 0;
+  part_sf->use_best_rd_for_pruning = 0;
+  part_sf->skip_non_sq_part_based_on_none = 0;
+}
+
+// Reset speed features that work for the baseline encoding, but
+// block the external partition search.
+void av1_reset_sf_for_ext_part(AV1_COMP *const cpi) {
+  cpi->sf.inter_sf.prune_ref_frame_for_rect_partitions = 0;
+}
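(Editorial sketch, not part of the upstream patch: as the comment in
av1_reset_part_sf() above notes, fixed_partition_size only takes effect
together with FIXED_PARTITION, so a hypothetical override must set both
fields.)

static inline void force_fixed_partition_sketch(
    PARTITION_SPEED_FEATURES *part_sf) {
  part_sf->partition_search_type = FIXED_PARTITION;
  // Example size only; BLOCK_16X16 above is the default for this path.
  part_sf->fixed_partition_size = BLOCK_32X32;
}

+
+#if !CONFIG_REALTIME_ONLY
+// If input |features| is NULL, write tpl stats to file for each super block.
+// Otherwise, store tpl stats to |features|.
+// The tpl stats are computed in units of tpl_bsize_1d (16x16).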
+// When writing to text file:
+// The first row contains super block position, super block size,
+// tpl unit length, number of units in the super block.
+// The second row contains the intra prediction cost for each unit.
+// The third row contains the inter prediction cost for each unit.
+// The fourth row contains the motion compensated dependency cost for each unit.
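(Editorial illustration, not part of the upstream patch: for a full 64x64
super block -- bsize 12 -- the 16x16 tpl units form a 4x4 grid, so the file
might look like the following, with hypothetical costs; the header row is
mi_row, mi_col, bsize, tpl_bsize_1d, num_blocks.)

  0,0,12,16,16
  1452,1310,1575, ... ,1400        (16 intra costs)
  880,742,901, ... ,810            (16 inter costs)
  5321,4980,5876, ... ,5102        (16 mc_dep costs)

+static void collect_tpl_stats_sb(const AV1_COMP *const cpi,
+                                 const BLOCK_SIZE bsize, const int mi_row,
+                                 const int mi_col,
+                                 aom_partition_features_t *features) {
+  const AV1_COMMON *const cm = &cpi->common;
+  GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  if (gf_group->update_type[cpi->gf_frame_index] == INTNL_OVERLAY_UPDATE ||
+      gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE) {
+    return;
+  }
+
+  TplParams *const tpl_data = &cpi->ppi->tpl_data;
+  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[cpi->gf_frame_index];
+  TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+  // If tpl stats are not established, return early
+  if (!tpl_data->ready || gf_group->max_layer_depth_allowed == 0) {
+    if (features != NULL) features->sb_features.tpl_features.available = 0;
+    return;
+  }
+
+  const int tpl_stride = tpl_frame->stride;
+  const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
+  const int mi_width =
+      AOMMIN(mi_size_wide[bsize], cm->mi_params.mi_cols - mi_col);
+  const int mi_height =
+      AOMMIN(mi_size_high[bsize], cm->mi_params.mi_rows - mi_row);
+  const int col_steps = (mi_width / step) + ((mi_width % step) > 0);
+  const int row_steps = (mi_height / step) + ((mi_height % step) > 0);
+  const int num_blocks = col_steps * row_steps;
+
+  if (features == NULL) {
+    char filename[256];
+    snprintf(filename, sizeof(filename), "%s/tpl_feature_sb%d",
+             cpi->oxcf.partition_info_path, cpi->sb_counter);
+    FILE *pfile = fopen(filename, "w");
+    fprintf(pfile, "%d,%d,%d,%d,%d\n", mi_row, mi_col, bsize,
+            tpl_data->tpl_bsize_1d, num_blocks);
+    int count = 0;
+    for (int row = 0; row < mi_height; row += step) {
+      for (int col = 0; col < mi_width; col += step) {
+        TplDepStats *this_stats =
+            &tpl_stats[av1_tpl_ptr_pos(mi_row + row, mi_col + col, tpl_stride,
+                                       tpl_data->tpl_stats_block_mis_log2)];
+        fprintf(pfile, "%.0f", (double)this_stats->intra_cost);
+        if (count < num_blocks - 1) fprintf(pfile, ",");
+        ++count;
+      }
+    }
+    fprintf(pfile, "\n");
+    count = 0;
+    for (int row = 0; row < mi_height; row += step) {
+      for (int col = 0; col < mi_width; col += step) {
+        TplDepStats *this_stats =
+            &tpl_stats[av1_tpl_ptr_pos(mi_row + row, mi_col + col, tpl_stride,
+                                       tpl_data->tpl_stats_block_mis_log2)];
+        fprintf(pfile, "%.0f", (double)this_stats->inter_cost);
+        if (count < num_blocks - 1) fprintf(pfile, ",");
+        ++count;
+      }
+    }
+    fprintf(pfile, "\n");
+    count = 0;
+    for (int row = 0; row < mi_height; row += step) {
+      for (int col = 0; col < mi_width; col += step) {
+        TplDepStats *this_stats =
+            &tpl_stats[av1_tpl_ptr_pos(mi_row + row, mi_col + col, tpl_stride,
+                                       tpl_data->tpl_stats_block_mis_log2)];
+        const int64_t mc_dep_delta =
+            RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+                   this_stats->mc_dep_dist);
+        fprintf(pfile, "%.0f", (double)mc_dep_delta);
+        if (count < num_blocks - 1) fprintf(pfile, ",");
+        ++count;
+      }
+    }
+    fclose(pfile);
+  } else {
+    features->sb_features.tpl_features.available = 1;
+    features->sb_features.tpl_features.tpl_unit_length = tpl_data->tpl_bsize_1d;
+    features->sb_features.tpl_features.num_units = num_blocks;
+    int count = 0;
+    for (int row = 0; row < mi_height; row += step) {
+      for (int col = 0; col < mi_width; col += step) {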
+        TplDepStats *this_stats =
+            &tpl_stats[av1_tpl_ptr_pos(mi_row + row, mi_col + col, tpl_stride,
+                                       tpl_data->tpl_stats_block_mis_log2)];
+        const int64_t mc_dep_delta =
+            RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+                   this_stats->mc_dep_dist);
+        features->sb_features.tpl_features.intra_cost[count] =
+            this_stats->intra_cost;
+        features->sb_features.tpl_features.inter_cost[count] =
+            this_stats->inter_cost;
+        features->sb_features.tpl_features.mc_dep_cost[count] = mc_dep_delta;
+        ++count;
+      }
+    }
+  }
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+static void update_txfm_count(MACROBLOCK *x, MACROBLOCKD *xd,
+                              FRAME_COUNTS *counts, TX_SIZE tx_size, int depth,
+                              int blk_row, int blk_col,
+                              uint8_t allow_update_cdf) {
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const BLOCK_SIZE bsize = mbmi->bsize;
+  const int max_blocks_high = max_block_high(xd, bsize, 0);
+  const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+  int ctx = txfm_partition_context(xd->above_txfm_context + blk_col,
+                                   xd->left_txfm_context + blk_row, mbmi->bsize,
+                                   tx_size);
+  const int txb_size_index = av1_get_txb_size_index(bsize, blk_row, blk_col);
+  const TX_SIZE plane_tx_size = mbmi->inter_tx_size[txb_size_index];
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+  assert(tx_size > TX_4X4);
+
+  if (depth == MAX_VARTX_DEPTH) {
+    // Don't add to counts in this case
+    mbmi->tx_size = tx_size;
+    txfm_partition_update(xd->above_txfm_context + blk_col,
+                          xd->left_txfm_context + blk_row, tx_size, tx_size);
+    return;
+  }
+
+  if (tx_size == plane_tx_size) {
+#if CONFIG_ENTROPY_STATS
+    ++counts->txfm_partition[ctx][0];
+#endif
+    if (allow_update_cdf)
+      update_cdf(xd->tile_ctx->txfm_partition_cdf[ctx], 0, 2);
+    mbmi->tx_size = tx_size;
+    txfm_partition_update(xd->above_txfm_context + blk_col,
+                          xd->left_txfm_context + blk_row, tx_size, tx_size);
+  } else {
+    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+    const int bsw = tx_size_wide_unit[sub_txs];
+    const int bsh = tx_size_high_unit[sub_txs];
+
+#if CONFIG_ENTROPY_STATS
+    ++counts->txfm_partition[ctx][1];
+#endif
+    if (allow_update_cdf)
+      update_cdf(xd->tile_ctx->txfm_partition_cdf[ctx], 1, 2);
+    ++x->txfm_search_info.txb_split_count;
+
+    if (sub_txs == TX_4X4) {
+      mbmi->inter_tx_size[txb_size_index] = TX_4X4;
+      mbmi->tx_size = TX_4X4;
+      txfm_partition_update(xd->above_txfm_context + blk_col,
+                            xd->left_txfm_context + blk_row, TX_4X4, tx_size);
+      return;
+    }
+
+    for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+      for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+        int offsetr = row;
+        int offsetc = col;
+
+        update_txfm_count(x, xd, counts, sub_txs, depth + 1, blk_row + offsetr,
+                          blk_col + offsetc, allow_update_cdf);
+      }
+    }
+  }
+}
+
+static void tx_partition_count_update(const AV1_COMMON *const cm, MACROBLOCK *x,
+                                      BLOCK_SIZE plane_bsize,
+                                      FRAME_COUNTS *td_counts,
+                                      uint8_t allow_update_cdf) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const int mi_width = mi_size_wide[plane_bsize];
+  const int mi_height = mi_size_high[plane_bsize];
+  const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0);
+  const int bh = tx_size_high_unit[max_tx_size];
+  const int bw = tx_size_wide_unit[max_tx_size];
+
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[xd->tile.tile_row] + xd->mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (xd->mi_row & MAX_MIB_MASK);
+
+  for (int idy = 0; idy < mi_height; idy += bh) {
+    for (int idx = 0; idx < mi_width; idx += bw) {
+      update_txfm_count(x, xd, td_counts, max_tx_size, 0, idy, idx,
+                        allow_update_cdf);
+    }
+  }
+}
+
+static void set_txfm_context(MACROBLOCKD *xd, TX_SIZE tx_size, int blk_row,
+                             int blk_col) {
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const BLOCK_SIZE bsize = mbmi->bsize;
+  const int max_blocks_high = max_block_high(xd, bsize, 0);
+  const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+  const int txb_size_index = av1_get_txb_size_index(bsize, blk_row, blk_col);
+  const TX_SIZE plane_tx_size = mbmi->inter_tx_size[txb_size_index];
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+  if (tx_size == plane_tx_size) {
+    mbmi->tx_size = tx_size;
+    txfm_partition_update(xd->above_txfm_context + blk_col,
+                          xd->left_txfm_context + blk_row, tx_size, tx_size);
+
+  } else {
+    if (tx_size == TX_8X8) {
+      mbmi->inter_tx_size[txb_size_index] = TX_4X4;
+      mbmi->tx_size = TX_4X4;
+      txfm_partition_update(xd->above_txfm_context + blk_col,
+                            xd->left_txfm_context + blk_row, TX_4X4, tx_size);
+      return;
+    }
+    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+    const int bsw = tx_size_wide_unit[sub_txs];
+    const int bsh = tx_size_high_unit[sub_txs];
+    const int row_end =
+        AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row);
+    const int col_end =
+        AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col);
+    for (int row = 0; row < row_end; row += bsh) {
+      const int offsetr = blk_row + row;
+      for (int col = 0; col < col_end; col += bsw) {
+        const int offsetc = blk_col + col;
+        set_txfm_context(xd, sub_txs, offsetr, offsetc);
+      }
+    }
+  }
+}
+
+static void tx_partition_set_contexts(const AV1_COMMON *const cm,
+                                      MACROBLOCKD *xd, BLOCK_SIZE plane_bsize) {
+  const int mi_width = mi_size_wide[plane_bsize];
+  const int mi_height = mi_size_high[plane_bsize];
+  const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0);
+  const int bh = tx_size_high_unit[max_tx_size];
+  const int bw = tx_size_wide_unit[max_tx_size];
+
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[xd->tile.tile_row] + xd->mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (xd->mi_row & MAX_MIB_MASK);
+
+  for (int idy = 0; idy < mi_height; idy += bh) {
+    for (int idx = 0; idx < mi_width; idx += bw) {
+      set_txfm_context(xd, max_tx_size, idy, idx);
+    }
+  }
+}
+
+static void update_zeromv_cnt(const AV1_COMP *const cpi,
+                              const MB_MODE_INFO *const mi, int mi_row,
+                              int mi_col, BLOCK_SIZE bsize) {
+  if (mi->ref_frame[0] != LAST_FRAME || !is_inter_block(mi) ||
+      mi->segment_id > CR_SEGMENT_ID_BOOST2) {
+    return;
+  }
+  const AV1_COMMON *const cm = &cpi->common;
+  const MV mv = mi->mv[0].as_mv;
+  const int bw = mi_size_wide[bsize] >> 1;
+  const int bh = mi_size_high[bsize] >> 1;
+  const int xmis = AOMMIN((cm->mi_params.mi_cols - mi_col) >> 1, bw);
+  const int ymis = AOMMIN((cm->mi_params.mi_rows - mi_row) >> 1, bh);
+  const int block_index =
+      (mi_row >> 1) * (cm->mi_params.mi_cols >> 1) + (mi_col >> 1);
+  for (int y = 0; y < ymis; y++) {
+    for (int x = 0; x < xmis; x++) {
+      // consec_zero_mv is in the scale of 8x8 blocks
+      const int map_offset = block_index + y * (cm->mi_params.mi_cols >> 1) + x;
+      if (abs(mv.row) < 10 && abs(mv.col) < 10) {
+        if (cpi->consec_zero_mv[map_offset] < 255)
+          cpi->consec_zero_mv[map_offset]++;
+      } else {
+        cpi->consec_zero_mv[map_offset] = 0;
+      }
+    }
+  }
+}
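(Editorial illustration, not part of the upstream patch: consec_zero_mv keeps
one counter per 8x8 luma block, while mi units are 4x4, hence the >> 1 shifts
above. For a hypothetical frame with mi_cols = 128, a block at mi_row = 16,
mi_col = 8 starts at map entry (16 >> 1) * (128 >> 1) + (8 >> 1) = 516, and a
motion vector counts as "zero" when both components are below 10 in 1/8-pel
units, i.e. under 1.25 pixels.)

+
+static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data,
+                              ThreadData *td, TokenExtra **t, RUN_TYPE dry_run,
+                              BLOCK_SIZE bsize, int *rate) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);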
av1_num_planes(cm); + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO **mi_4x4 = xd->mi; + MB_MODE_INFO *mbmi = mi_4x4[0]; + const int seg_skip = + segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP); + const int mis = cm->mi_params.mi_stride; + const int mi_width = mi_size_wide[bsize]; + const int mi_height = mi_size_high[bsize]; + const int is_inter = is_inter_block(mbmi); + + // Initialize tx_mode and tx_size_search_method + TxfmSearchParams *txfm_params = &x->txfm_search_params; + set_tx_size_search_method( + cm, &cpi->winner_mode_params, txfm_params, + cpi->sf.winner_mode_sf.enable_winner_mode_for_tx_size_srch, 1); + + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + if (!is_inter) { + xd->cfl.store_y = store_cfl_required(cm, xd); + mbmi->skip_txfm = 1; + for (int plane = 0; plane < num_planes; ++plane) { + av1_encode_intra_block_plane(cpi, x, bsize, plane, dry_run, + cpi->optimize_seg_arr[mbmi->segment_id]); + } + + // If there is at least one lossless segment, force the skip for the intra + // block to be 0, in order to avoid the segment_id being changed in + // write_segment_id(). + if (!cpi->common.seg.segid_preskip && cpi->common.seg.update_map && + cpi->enc_seg.has_lossless_segment) + mbmi->skip_txfm = 0; + + xd->cfl.store_y = 0; + if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) { + for (int plane = 0; plane < AOMMIN(2, num_planes); ++plane) { + if (mbmi->palette_mode_info.palette_size[plane] > 0) { + if (!dry_run) { + av1_tokenize_color_map(x, plane, t, bsize, mbmi->tx_size, + PALETTE_MAP, tile_data->allow_update_cdf, + td->counts); + } else if (dry_run == DRY_RUN_COSTCOEFFS) { + *rate += + av1_cost_color_map(x, plane, bsize, mbmi->tx_size, PALETTE_MAP); + } + } + } + } + + av1_update_intra_mb_txb_context(cpi, td, dry_run, bsize, + tile_data->allow_update_cdf); + } else { + int ref; + const int is_compound = has_second_ref(mbmi); + + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + for (ref = 0; ref < 1 + is_compound; ++ref) { + const YV12_BUFFER_CONFIG *cfg = + get_ref_frame_yv12_buf(cm, mbmi->ref_frame[ref]); + assert(IMPLIES(!is_intrabc_block(mbmi), cfg)); + av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col, + xd->block_ref_scale_factors[ref], num_planes); + } + // The predicted samples of an inter mode (for the luma plane) cannot be + // reused if the nonrd_check_partition_split speed feature is enabled, since + // in such cases the buffer may not contain the predicted samples of the + // best mode. + const int start_plane = + (x->reuse_inter_pred && (!cpi->sf.rt_sf.nonrd_check_partition_split) && + cm->seq_params->bit_depth == AOM_BITS_8) + ?
1 + : 0; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + start_plane, av1_num_planes(cm) - 1); + if (mbmi->motion_mode == OBMC_CAUSAL) { + assert(cpi->oxcf.motion_mode_cfg.enable_obmc); + av1_build_obmc_inter_predictors_sb(cm, xd); + } + +#if CONFIG_MISMATCH_DEBUG + if (dry_run == OUTPUT_ENABLED) { + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblockd_plane *pd = &xd->plane[plane]; + int pixel_c, pixel_r; + mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0, + pd->subsampling_x, pd->subsampling_y); + if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x, + pd->subsampling_y)) + continue; + mismatch_record_block_pre(pd->dst.buf, pd->dst.stride, + cm->current_frame.order_hint, plane, pixel_c, + pixel_r, pd->width, pd->height, + xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); + } + } +#else + (void)num_planes; +#endif + + av1_encode_sb(cpi, x, bsize, dry_run); + av1_tokenize_sb_vartx(cpi, td, dry_run, bsize, rate, + tile_data->allow_update_cdf); + } + + if (!dry_run) { + if (av1_allow_intrabc(cm) && is_intrabc_block(mbmi)) td->intrabc_used = 1; + if (txfm_params->tx_mode_search_type == TX_MODE_SELECT && + !xd->lossless[mbmi->segment_id] && mbmi->bsize > BLOCK_4X4 && + !(is_inter && (mbmi->skip_txfm || seg_skip))) { + if (is_inter) { + tx_partition_count_update(cm, x, bsize, td->counts, + tile_data->allow_update_cdf); + } else { + if (mbmi->tx_size != max_txsize_rect_lookup[bsize]) + ++x->txfm_search_info.txb_split_count; + if (block_signals_txsize(bsize)) { + const int tx_size_ctx = get_tx_size_context(xd); + const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize); + const int depth = tx_size_to_depth(mbmi->tx_size, bsize); + const int max_depths = bsize_to_max_depth(bsize); + + if (tile_data->allow_update_cdf) + update_cdf(xd->tile_ctx->tx_size_cdf[tx_size_cat][tx_size_ctx], + depth, max_depths + 1); +#if CONFIG_ENTROPY_STATS + ++td->counts->intra_tx_size[tx_size_cat][tx_size_ctx][depth]; +#endif + } + } + assert(IMPLIES(is_rect_tx(mbmi->tx_size), is_rect_tx_allowed(xd, mbmi))); + } else { + int i, j; + TX_SIZE intra_tx_size; + // The new intra coding scheme requires no change of transform size + if (is_inter) { + if (xd->lossless[mbmi->segment_id]) { + intra_tx_size = TX_4X4; + } else { + intra_tx_size = + tx_size_from_tx_mode(bsize, txfm_params->tx_mode_search_type); + } + } else { + intra_tx_size = mbmi->tx_size; + } + + const int cols = AOMMIN(cm->mi_params.mi_cols - mi_col, mi_width); + const int rows = AOMMIN(cm->mi_params.mi_rows - mi_row, mi_height); + for (j = 0; j < rows; j++) { + for (i = 0; i < cols; i++) mi_4x4[mis * j + i]->tx_size = intra_tx_size; + } + + if (intra_tx_size != max_txsize_rect_lookup[bsize]) + ++x->txfm_search_info.txb_split_count; + } + } + + if (txfm_params->tx_mode_search_type == TX_MODE_SELECT && + block_signals_txsize(mbmi->bsize) && is_inter && + !(mbmi->skip_txfm || seg_skip) && !xd->lossless[mbmi->segment_id]) { + if (dry_run) tx_partition_set_contexts(cm, xd, bsize); + } else { + TX_SIZE tx_size = mbmi->tx_size; + // The new intra coding scheme requires no change of transform size + if (is_inter) { + if (xd->lossless[mbmi->segment_id]) { + tx_size = TX_4X4; + } else { + tx_size = tx_size_from_tx_mode(bsize, txfm_params->tx_mode_search_type); + } + } else { + tx_size = (bsize > BLOCK_4X4) ? 
tx_size : TX_4X4; + } + mbmi->tx_size = tx_size; + set_txfm_ctxs(tx_size, xd->width, xd->height, + (mbmi->skip_txfm || seg_skip) && is_inter_block(mbmi), xd); + } + + if (is_inter_block(mbmi) && !xd->is_chroma_ref && is_cfl_allowed(xd)) { + cfl_store_block(xd, mbmi->bsize, mbmi->tx_size); + } + if (!dry_run) { + if (cpi->oxcf.pass == AOM_RC_ONE_PASS && cpi->svc.temporal_layer_id == 0 && + cpi->sf.rt_sf.use_temporal_noise_estimate && + (!cpi->ppi->use_svc || + (cpi->ppi->use_svc && + !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1))) + update_zeromv_cnt(cpi, mbmi, mi_row, mi_col, bsize); + } +} + +static void setup_block_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, + int mi_row, int mi_col, BLOCK_SIZE bsize, + AQ_MODE aq_mode, MB_MODE_INFO *mbmi) { + x->rdmult = cpi->rd.RDMULT; + + if (aq_mode != NO_AQ) { + assert(mbmi != NULL); + if (aq_mode == VARIANCE_AQ) { + if (cpi->vaq_refresh) { + const int energy = bsize <= BLOCK_16X16 + ? x->mb_energy + : av1_log_block_var(cpi, x, bsize); + mbmi->segment_id = energy; + } + x->rdmult = set_rdmult(cpi, x, mbmi->segment_id); + } else if (aq_mode == COMPLEXITY_AQ) { + x->rdmult = set_rdmult(cpi, x, mbmi->segment_id); + } else if (aq_mode == CYCLIC_REFRESH_AQ) { + // If segment is boosted, use rdmult for that segment. + if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) + x->rdmult = av1_cyclic_refresh_get_rdmult(cpi->cyclic_refresh); + } + } + +#if !CONFIG_REALTIME_ONLY + if (cpi->common.delta_q_info.delta_q_present_flag && + !cpi->sf.rt_sf.use_nonrd_pick_mode) { + x->rdmult = av1_get_cb_rdmult(cpi, x, bsize, mi_row, mi_col); + } +#endif // !CONFIG_REALTIME_ONLY + + if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_SSIM) { + av1_set_ssim_rdmult(cpi, &x->errorperbit, bsize, mi_row, mi_col, + &x->rdmult); + } +#if CONFIG_SALIENCY_MAP + else if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_SALIENCY_MAP) { + av1_set_saliency_map_vmaf_rdmult(cpi, &x->errorperbit, + cpi->common.seq_params->sb_size, mi_row, + mi_col, &x->rdmult); + } +#endif +#if CONFIG_TUNE_VMAF + else if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING || + cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_MAX_GAIN || + cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) { + av1_set_vmaf_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult); + } +#endif +#if CONFIG_TUNE_BUTTERAUGLI + else if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI) { + av1_set_butteraugli_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult); + } +#endif + if (cpi->oxcf.mode == ALLINTRA) { + x->rdmult = (int)(((int64_t)x->rdmult * x->intra_sb_rdmult_modifier) >> 7); + } + + // Check to make sure that the adjustments above have not caused the + // rd multiplier to be truncated to 0. + x->rdmult = (x->rdmult > 0) ? 
x->rdmult : 1; +} + +void av1_set_offsets_without_segment_id(const AV1_COMP *const cpi, + const TileInfo *const tile, + MACROBLOCK *const x, int mi_row, + int mi_col, BLOCK_SIZE bsize) { + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + assert(bsize < BLOCK_SIZES_ALL); + const int mi_width = mi_size_wide[bsize]; + const int mi_height = mi_size_high[bsize]; + + set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd, + mi_row, mi_col); + + set_entropy_context(xd, mi_row, mi_col, num_planes); + xd->above_txfm_context = cm->above_contexts.txfm[tile->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + + // Set up destination pointers. + av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0, + num_planes); + + // Set up limit values for MV components. + // MVs beyond the range do not produce a new/different prediction block. + av1_set_mv_limits(&cm->mi_params, &x->mv_limits, mi_row, mi_col, mi_height, + mi_width, cpi->oxcf.border_in_pixels); + + set_plane_n4(xd, mi_width, mi_height, num_planes); + + // Set up distance of MB to edge of frame in 1/8th pel units. + assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1))); + set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width, + cm->mi_params.mi_rows, cm->mi_params.mi_cols); + + // Set up source buffers. + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize); + + // required by av1_append_sub8x8_mvs_for_idx() and av1_find_best_ref_mvs() + xd->tile = *tile; +} + +void av1_set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile, + MACROBLOCK *const x, int mi_row, int mi_col, + BLOCK_SIZE bsize) { + const AV1_COMMON *const cm = &cpi->common; + const struct segmentation *const seg = &cm->seg; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *mbmi; + + av1_set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize); + + // Set up the segment ID. + mbmi = xd->mi[0]; + mbmi->segment_id = 0; + if (seg->enabled) { + if (seg->enabled && !cpi->vaq_refresh) { + const uint8_t *const map = + seg->update_map ? cpi->enc_seg.map : cm->last_frame_seg_map; + mbmi->segment_id = + map ? get_segment_id(&cm->mi_params, map, bsize, mi_row, mi_col) : 0; + } + av1_init_plane_quantizers(cpi, x, mbmi->segment_id, 0); + } +#ifndef NDEBUG + x->last_set_offsets_loc.mi_row = mi_row; + x->last_set_offsets_loc.mi_col = mi_col; + x->last_set_offsets_loc.bsize = bsize; +#endif // NDEBUG +} + +/*!\brief Hybrid intra mode search. + * + * \ingroup intra_mode_search + * \callgraph + * \callergraph + * This is the top-level function for mode search for intra frames in the + * non-RD optimized case. Depending on the speed features and block size, it + * calls either the non-RD or the RD optimized intra mode search. + * + * \param[in] cpi Top-level encoder structure + * \param[in] x Pointer to structure holding all the data for + the current macroblock + * \param[in] rd_cost Struct to keep track of the RD information + * \param[in] bsize Current block size + * \param[in] ctx Structure to hold snapshot of coding context + during the mode picking process + * + * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x + * is modified to store information about the best mode computed + * in this function. The rd_cost struct is also updated with the RD stats + * corresponding to the best mode found.
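+ * + * Illustrative example (editorial note, not part of the upstream comment): + * with sf.rt_sf.hybrid_intra_pickmode == 2, an 8X8 block takes the RD search + * path when x->source_variance >= 101 (the var_thresh[] entry selected in the + * function body below) and the faster non-RD path otherwise; blocks of + * BLOCK_16X16 and larger always use the non-RD search.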
+ */ + +static AOM_INLINE void hybrid_intra_mode_search(AV1_COMP *cpi, + MACROBLOCK *const x, + RD_STATS *rd_cost, + BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx) { + int use_rdopt = 0; + const int hybrid_intra_pickmode = cpi->sf.rt_sf.hybrid_intra_pickmode; + // Use rd pick for intra mode search based on block size and variance. + if (hybrid_intra_pickmode && bsize < BLOCK_16X16) { + unsigned int var_thresh[3] = { 0, 101, 201 }; + assert(hybrid_intra_pickmode <= 3); + if (x->source_variance >= var_thresh[hybrid_intra_pickmode - 1]) + use_rdopt = 1; + } + + if (use_rdopt) + av1_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX); + else + av1_nonrd_pick_intra_mode(cpi, x, rd_cost, bsize, ctx); +} + +// For real time/allintra row-mt enabled multi-threaded encoding with cost +// update frequency set to COST_UPD_TILE/COST_UPD_OFF, the tile ctxt is not +// updated at superblock level. Thus, the encoding of the top-right superblock +// need not be complete for updating the tile ctxt. However, when encoding a +// block whose right edge is also the superblock edge, intra and inter mode +// evaluation (ref mv list population) requires the encoding of the top-right +// superblock to be complete. So, here, we delay the thread's wait until the +// data from the top-right superblock region is actually needed. +static AOM_INLINE void wait_for_top_right_sb( + AV1EncRowMultiThreadInfo *enc_row_mt, AV1EncRowMultiThreadSync *row_mt_sync, + TileInfo *tile_info, BLOCK_SIZE sb_size, int sb_mi_size_log2, + BLOCK_SIZE bsize, int mi_row, int mi_col) { + const int sb_size_in_mi = mi_size_wide[sb_size]; + const int bw_in_mi = mi_size_wide[bsize]; + const int blk_row_in_sb = mi_row & (sb_size_in_mi - 1); + const int blk_col_in_sb = mi_col & (sb_size_in_mi - 1); + const int top_right_block_in_sb = + (blk_row_in_sb == 0) && (blk_col_in_sb + bw_in_mi >= sb_size_in_mi); + + // Don't wait if the block is not the top-right block in the superblock. + if (!top_right_block_in_sb) return; + + // Wait for the top-right superblock to finish encoding. + const int sb_row_in_tile = + (mi_row - tile_info->mi_row_start) >> sb_mi_size_log2; + const int sb_col_in_tile = + (mi_col - tile_info->mi_col_start) >> sb_mi_size_log2; + + enc_row_mt->sync_read_ptr(row_mt_sync, sb_row_in_tile, sb_col_in_tile); +} + +/*!\brief Interface for AV1 mode search for an individual coding block + * + * \ingroup partition_search + * \callgraph + * \callergraph + * Searches prediction modes, transform, and coefficient coding modes for an + * individual coding block. This function is the top-level interface that + * directs the encoder to the proper mode search function, among those + * implemented for inter/intra + rd/non-rd + non-skip segment/skip segment.
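+ * + * Dispatch summary (editorial note): intra-only frames are handled by + * av1_rd_pick_intra_mode_sb(); blocks whose segment has SEG_LVL_SKIP active + * are handled by av1_rd_pick_inter_mode_sb_seg_skip(); all remaining inter + * blocks are handled by av1_rd_pick_inter_mode(), as seen in the function + * body below.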
+ * + * \param[in] cpi Top-level encoder structure + * \param[in] tile_data Pointer to struct holding adaptive + * data/contexts/models for the tile during + * encoding + * \param[in] x Pointer to structure holding all the data for + * the current macroblock + * \param[in] mi_row Row coordinate of the block in a step size of + * MI_SIZE + * \param[in] mi_col Column coordinate of the block in a step size of + * MI_SIZE + * \param[in] rd_cost Pointer to structure holding rate and distortion + * stats for the current block + * \param[in] partition Partition mode of the parent block + * \param[in] bsize Current block size + * \param[in] ctx Pointer to structure holding coding contexts and + * chosen modes for the current block + * \param[in] best_rd Upper bound of rd cost of a valid partition + * + * \remark Nothing is returned. Instead, the chosen modes and contexts necessary + * for reconstruction are stored in ctx, and the rate-distortion stats are + * stored in rd_cost. If no valid mode leading to rd_cost <= best_rd is found, + * this status is signalled by an INT64_MAX rd_cost->rdcost. + */ +static void pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data, + MACROBLOCK *const x, int mi_row, int mi_col, + RD_STATS *rd_cost, PARTITION_TYPE partition, + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, + RD_STATS best_rd) { + if (cpi->sf.part_sf.use_best_rd_for_pruning && best_rd.rdcost < 0) { + ctx->rd_stats.rdcost = INT64_MAX; + ctx->rd_stats.skip_txfm = 0; + av1_invalid_rd_stats(rd_cost); + return; + } + + av1_set_offsets(cpi, &tile_data->tile_info, x, mi_row, mi_col, bsize); + + if (cpi->sf.part_sf.reuse_prev_rd_results_for_part_ab && + ctx->rd_mode_is_ready) { + assert(ctx->mic.bsize == bsize); + assert(ctx->mic.partition == partition); + rd_cost->rate = ctx->rd_stats.rate; + rd_cost->dist = ctx->rd_stats.dist; + rd_cost->rdcost = ctx->rd_stats.rdcost; + return; + } + + AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *mbmi; + struct macroblock_plane *const p = x->plane; + struct macroblockd_plane *const pd = xd->plane; + const AQ_MODE aq_mode = cpi->oxcf.q_cfg.aq_mode; + TxfmSearchInfo *txfm_info = &x->txfm_search_info; + + int i; + + // This is only needed for real time/allintra row-mt enabled multi-threaded + // encoding with cost update frequency set to COST_UPD_TILE/COST_UPD_OFF. + wait_for_top_right_sb(&cpi->mt_info.enc_row_mt, &tile_data->row_mt_sync, + &tile_data->tile_info, cm->seq_params->sb_size, + cm->seq_params->mib_size_log2, bsize, mi_row, mi_col); + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, rd_pick_sb_modes_time); +#endif + + mbmi = xd->mi[0]; + mbmi->bsize = bsize; + mbmi->partition = partition; + +#if CONFIG_RD_DEBUG + mbmi->mi_row = mi_row; + mbmi->mi_col = mi_col; +#endif + + // Sets up the tx_type_map buffer in MACROBLOCKD. + xd->tx_type_map = txfm_info->tx_type_map_; + xd->tx_type_map_stride = mi_size_wide[bsize]; + + for (i = 0; i < num_planes; ++i) { + p[i].coeff = ctx->coeff[i]; + p[i].qcoeff = ctx->qcoeff[i]; + p[i].dqcoeff = ctx->dqcoeff[i]; + p[i].eobs = ctx->eobs[i]; + p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i]; + } + + for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i]; + + ctx->skippable = 0; + // Set to zero to make sure we do not use the previously encoded frame stats. + mbmi->skip_txfm = 0; + // Reset the skip mode flag.
+ mbmi->skip_mode = 0; + + x->source_variance = av1_get_perpixel_variance_facade( + cpi, xd, &x->plane[0].src, bsize, AOM_PLANE_Y); + + // Initialize default mode evaluation params + set_mode_eval_params(cpi, x, DEFAULT_EVAL); + + // Save rdmult before it might be changed, so it can be restored later. + const int orig_rdmult = x->rdmult; + setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, aq_mode, mbmi); + // Set error per bit for current rdmult + av1_set_error_per_bit(&x->errorperbit, x->rdmult); + av1_rd_cost_update(x->rdmult, &best_rd); + + // If best_rd.rdcost is set to INT64_MAX, the encoder will not use any + // previous rdcost information for the following mode search. + // Disabling this feature could yield some coding gain, at the cost of an + // encoder slowdown. + if (!cpi->sf.part_sf.use_best_rd_for_pruning) { + av1_invalid_rd_stats(&best_rd); + } + + // Find the best coding mode & reconstruct the MB so it is available + // as a predictor for MBs that follow in the SB. + if (frame_is_intra_only(cm)) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, av1_rd_pick_intra_mode_sb_time); +#endif + av1_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd.rdcost); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, av1_rd_pick_intra_mode_sb_time); +#endif + } else { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, av1_rd_pick_inter_mode_sb_time); +#endif + if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + av1_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, mi_row, mi_col, + rd_cost, bsize, ctx, best_rd.rdcost); + } else { + av1_rd_pick_inter_mode(cpi, tile_data, x, rd_cost, bsize, ctx, + best_rd.rdcost); + } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, av1_rd_pick_inter_mode_sb_time); +#endif + } + + // Examine the resulting rate and, for AQ mode 2, make a segment choice. + if (rd_cost->rate != INT_MAX && aq_mode == COMPLEXITY_AQ && + bsize >= BLOCK_16X16) { + av1_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate); + } + + x->rdmult = orig_rdmult; + + // TODO(jingning) The rate-distortion optimization flow needs to be + // refactored to provide proper exit/return handling.
+ if (rd_cost->rate == INT_MAX) rd_cost->rdcost = INT64_MAX; + + ctx->rd_stats.rate = rd_cost->rate; + ctx->rd_stats.dist = rd_cost->dist; + ctx->rd_stats.rdcost = rd_cost->rdcost; + +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, rd_pick_sb_modes_time); +#endif +} + +static void update_stats(const AV1_COMMON *const cm, ThreadData *td) { + MACROBLOCK *x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; + const CurrentFrame *const current_frame = &cm->current_frame; + const BLOCK_SIZE bsize = mbmi->bsize; + FRAME_CONTEXT *fc = xd->tile_ctx; + const int seg_ref_active = + segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME); + + if (current_frame->skip_mode_info.skip_mode_flag && !seg_ref_active && + is_comp_ref_allowed(bsize)) { + const int skip_mode_ctx = av1_get_skip_mode_context(xd); +#if CONFIG_ENTROPY_STATS + td->counts->skip_mode[skip_mode_ctx][mbmi->skip_mode]++; +#endif + update_cdf(fc->skip_mode_cdfs[skip_mode_ctx], mbmi->skip_mode, 2); + } + + if (!mbmi->skip_mode && !seg_ref_active) { + const int skip_ctx = av1_get_skip_txfm_context(xd); +#if CONFIG_ENTROPY_STATS + td->counts->skip_txfm[skip_ctx][mbmi->skip_txfm]++; +#endif + update_cdf(fc->skip_txfm_cdfs[skip_ctx], mbmi->skip_txfm, 2); + } + +#if CONFIG_ENTROPY_STATS + // delta quant applies to both intra and inter + const int super_block_upper_left = + ((xd->mi_row & (cm->seq_params->mib_size - 1)) == 0) && + ((xd->mi_col & (cm->seq_params->mib_size - 1)) == 0); + const DeltaQInfo *const delta_q_info = &cm->delta_q_info; + if (delta_q_info->delta_q_present_flag && + (bsize != cm->seq_params->sb_size || !mbmi->skip_txfm) && + super_block_upper_left) { + const int dq = (mbmi->current_qindex - xd->current_base_qindex) / + delta_q_info->delta_q_res; + const int absdq = abs(dq); + for (int i = 0; i < AOMMIN(absdq, DELTA_Q_SMALL); ++i) { + td->counts->delta_q[i][1]++; + } + if (absdq < DELTA_Q_SMALL) td->counts->delta_q[absdq][0]++; + if (delta_q_info->delta_lf_present_flag) { + if (delta_q_info->delta_lf_multi) { + const int frame_lf_count = + av1_num_planes(cm) > 1 ? 
FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { + const int delta_lf = (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) / + delta_q_info->delta_lf_res; + const int abs_delta_lf = abs(delta_lf); + for (int i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) { + td->counts->delta_lf_multi[lf_id][i][1]++; + } + if (abs_delta_lf < DELTA_LF_SMALL) + td->counts->delta_lf_multi[lf_id][abs_delta_lf][0]++; + } + } else { + const int delta_lf = + (mbmi->delta_lf_from_base - xd->delta_lf_from_base) / + delta_q_info->delta_lf_res; + const int abs_delta_lf = abs(delta_lf); + for (int i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) { + td->counts->delta_lf[i][1]++; + } + if (abs_delta_lf < DELTA_LF_SMALL) + td->counts->delta_lf[abs_delta_lf][0]++; + } + } + } +#endif + + if (!is_inter_block(mbmi)) { + av1_sum_intra_stats(cm, td->counts, xd, mbmi, xd->above_mbmi, xd->left_mbmi, + frame_is_intra_only(cm)); + } + + if (av1_allow_intrabc(cm)) { + const int is_intrabc = is_intrabc_block(mbmi); + update_cdf(fc->intrabc_cdf, is_intrabc, 2); +#if CONFIG_ENTROPY_STATS + ++td->counts->intrabc[is_intrabc]; +#endif // CONFIG_ENTROPY_STATS + if (is_intrabc) { + const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + const int_mv dv_ref = mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv; + av1_update_mv_stats(&mbmi->mv[0].as_mv, &dv_ref.as_mv, &fc->ndvc, + MV_SUBPEL_NONE); + } + } + + if (frame_is_intra_only(cm) || mbmi->skip_mode) return; + + FRAME_COUNTS *const counts = td->counts; + const int inter_block = is_inter_block(mbmi); + + if (!seg_ref_active) { +#if CONFIG_ENTROPY_STATS + counts->intra_inter[av1_get_intra_inter_context(xd)][inter_block]++; +#endif + update_cdf(fc->intra_inter_cdf[av1_get_intra_inter_context(xd)], + inter_block, 2); + // If the segment reference feature is enabled we have only a single + // reference frame allowed for the segment so exclude it from + // the reference frame counts used to work out probabilities. + if (inter_block) { + const MV_REFERENCE_FRAME ref0 = mbmi->ref_frame[0]; + const MV_REFERENCE_FRAME ref1 = mbmi->ref_frame[1]; + if (current_frame->reference_mode == REFERENCE_MODE_SELECT) { + if (is_comp_ref_allowed(bsize)) { +#if CONFIG_ENTROPY_STATS + counts->comp_inter[av1_get_reference_mode_context(xd)] + [has_second_ref(mbmi)]++; +#endif // CONFIG_ENTROPY_STATS + update_cdf(av1_get_reference_mode_cdf(xd), has_second_ref(mbmi), 2); + } + } + + if (has_second_ref(mbmi)) { + const COMP_REFERENCE_TYPE comp_ref_type = has_uni_comp_refs(mbmi) + ? 
UNIDIR_COMP_REFERENCE + : BIDIR_COMP_REFERENCE; + update_cdf(av1_get_comp_reference_type_cdf(xd), comp_ref_type, + COMP_REFERENCE_TYPES); +#if CONFIG_ENTROPY_STATS + counts->comp_ref_type[av1_get_comp_reference_type_context(xd)] + [comp_ref_type]++; +#endif // CONFIG_ENTROPY_STATS + + if (comp_ref_type == UNIDIR_COMP_REFERENCE) { + const int bit = (ref0 == BWDREF_FRAME); + update_cdf(av1_get_pred_cdf_uni_comp_ref_p(xd), bit, 2); +#if CONFIG_ENTROPY_STATS + counts + ->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p(xd)][0][bit]++; +#endif // CONFIG_ENTROPY_STATS + if (!bit) { + const int bit1 = (ref1 == LAST3_FRAME || ref1 == GOLDEN_FRAME); + update_cdf(av1_get_pred_cdf_uni_comp_ref_p1(xd), bit1, 2); +#if CONFIG_ENTROPY_STATS + counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p1(xd)][1] + [bit1]++; +#endif // CONFIG_ENTROPY_STATS + if (bit1) { + update_cdf(av1_get_pred_cdf_uni_comp_ref_p2(xd), + ref1 == GOLDEN_FRAME, 2); +#if CONFIG_ENTROPY_STATS + counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p2(xd)][2] + [ref1 == GOLDEN_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + } + } + } else { + const int bit = (ref0 == GOLDEN_FRAME || ref0 == LAST3_FRAME); + update_cdf(av1_get_pred_cdf_comp_ref_p(xd), bit, 2); +#if CONFIG_ENTROPY_STATS + counts->comp_ref[av1_get_pred_context_comp_ref_p(xd)][0][bit]++; +#endif // CONFIG_ENTROPY_STATS + if (!bit) { + update_cdf(av1_get_pred_cdf_comp_ref_p1(xd), ref0 == LAST2_FRAME, + 2); +#if CONFIG_ENTROPY_STATS + counts->comp_ref[av1_get_pred_context_comp_ref_p1(xd)][1] + [ref0 == LAST2_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + } else { + update_cdf(av1_get_pred_cdf_comp_ref_p2(xd), ref0 == GOLDEN_FRAME, + 2); +#if CONFIG_ENTROPY_STATS + counts->comp_ref[av1_get_pred_context_comp_ref_p2(xd)][2] + [ref0 == GOLDEN_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + } + update_cdf(av1_get_pred_cdf_comp_bwdref_p(xd), ref1 == ALTREF_FRAME, + 2); +#if CONFIG_ENTROPY_STATS + counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p(xd)][0] + [ref1 == ALTREF_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + if (ref1 != ALTREF_FRAME) { + update_cdf(av1_get_pred_cdf_comp_bwdref_p1(xd), + ref1 == ALTREF2_FRAME, 2); +#if CONFIG_ENTROPY_STATS + counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p1(xd)][1] + [ref1 == ALTREF2_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + } + } + } else { + const int bit = (ref0 >= BWDREF_FRAME); + update_cdf(av1_get_pred_cdf_single_ref_p1(xd), bit, 2); +#if CONFIG_ENTROPY_STATS + counts->single_ref[av1_get_pred_context_single_ref_p1(xd)][0][bit]++; +#endif // CONFIG_ENTROPY_STATS + if (bit) { + assert(ref0 <= ALTREF_FRAME); + update_cdf(av1_get_pred_cdf_single_ref_p2(xd), ref0 == ALTREF_FRAME, + 2); +#if CONFIG_ENTROPY_STATS + counts->single_ref[av1_get_pred_context_single_ref_p2(xd)][1] + [ref0 == ALTREF_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + if (ref0 != ALTREF_FRAME) { + update_cdf(av1_get_pred_cdf_single_ref_p6(xd), + ref0 == ALTREF2_FRAME, 2); +#if CONFIG_ENTROPY_STATS + counts->single_ref[av1_get_pred_context_single_ref_p6(xd)][5] + [ref0 == ALTREF2_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + } + } else { + const int bit1 = !(ref0 == LAST2_FRAME || ref0 == LAST_FRAME); + update_cdf(av1_get_pred_cdf_single_ref_p3(xd), bit1, 2); +#if CONFIG_ENTROPY_STATS + counts->single_ref[av1_get_pred_context_single_ref_p3(xd)][2][bit1]++; +#endif // CONFIG_ENTROPY_STATS + if (!bit1) { + update_cdf(av1_get_pred_cdf_single_ref_p4(xd), ref0 != LAST_FRAME, + 2); +#if CONFIG_ENTROPY_STATS + counts->single_ref[av1_get_pred_context_single_ref_p4(xd)][3] + 
[ref0 != LAST_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + } else { + update_cdf(av1_get_pred_cdf_single_ref_p5(xd), ref0 != LAST3_FRAME, + 2); +#if CONFIG_ENTROPY_STATS + counts->single_ref[av1_get_pred_context_single_ref_p5(xd)][4] + [ref0 != LAST3_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + } + } + } + + if (cm->seq_params->enable_interintra_compound && + is_interintra_allowed(mbmi)) { + const int bsize_group = size_group_lookup[bsize]; + if (mbmi->ref_frame[1] == INTRA_FRAME) { +#if CONFIG_ENTROPY_STATS + counts->interintra[bsize_group][1]++; +#endif + update_cdf(fc->interintra_cdf[bsize_group], 1, 2); +#if CONFIG_ENTROPY_STATS + counts->interintra_mode[bsize_group][mbmi->interintra_mode]++; +#endif + update_cdf(fc->interintra_mode_cdf[bsize_group], + mbmi->interintra_mode, INTERINTRA_MODES); + if (av1_is_wedge_used(bsize)) { +#if CONFIG_ENTROPY_STATS + counts->wedge_interintra[bsize][mbmi->use_wedge_interintra]++; +#endif + update_cdf(fc->wedge_interintra_cdf[bsize], + mbmi->use_wedge_interintra, 2); + if (mbmi->use_wedge_interintra) { +#if CONFIG_ENTROPY_STATS + counts->wedge_idx[bsize][mbmi->interintra_wedge_index]++; +#endif + update_cdf(fc->wedge_idx_cdf[bsize], mbmi->interintra_wedge_index, + 16); + } + } + } else { +#if CONFIG_ENTROPY_STATS + counts->interintra[bsize_group][0]++; +#endif + update_cdf(fc->interintra_cdf[bsize_group], 0, 2); + } + } + + const MOTION_MODE motion_allowed = + cm->features.switchable_motion_mode + ? motion_mode_allowed(xd->global_motion, xd, mbmi, + cm->features.allow_warped_motion) + : SIMPLE_TRANSLATION; + if (mbmi->ref_frame[1] != INTRA_FRAME) { + if (motion_allowed == WARPED_CAUSAL) { +#if CONFIG_ENTROPY_STATS + counts->motion_mode[bsize][mbmi->motion_mode]++; +#endif + update_cdf(fc->motion_mode_cdf[bsize], mbmi->motion_mode, + MOTION_MODES); + } else if (motion_allowed == OBMC_CAUSAL) { +#if CONFIG_ENTROPY_STATS + counts->obmc[bsize][mbmi->motion_mode == OBMC_CAUSAL]++; +#endif + update_cdf(fc->obmc_cdf[bsize], mbmi->motion_mode == OBMC_CAUSAL, 2); + } + } + + if (has_second_ref(mbmi)) { + assert(current_frame->reference_mode != SINGLE_REFERENCE && + is_inter_compound_mode(mbmi->mode) && + mbmi->motion_mode == SIMPLE_TRANSLATION); + + const int masked_compound_used = is_any_masked_compound_used(bsize) && + cm->seq_params->enable_masked_compound; + if (masked_compound_used) { + const int comp_group_idx_ctx = get_comp_group_idx_context(xd); +#if CONFIG_ENTROPY_STATS + ++counts->comp_group_idx[comp_group_idx_ctx][mbmi->comp_group_idx]; +#endif + update_cdf(fc->comp_group_idx_cdf[comp_group_idx_ctx], + mbmi->comp_group_idx, 2); + } + + if (mbmi->comp_group_idx == 0) { + const int comp_index_ctx = get_comp_index_context(cm, xd); +#if CONFIG_ENTROPY_STATS + ++counts->compound_index[comp_index_ctx][mbmi->compound_idx]; +#endif + update_cdf(fc->compound_index_cdf[comp_index_ctx], mbmi->compound_idx, + 2); + } else { + assert(masked_compound_used); + if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) { +#if CONFIG_ENTROPY_STATS + ++counts->compound_type[bsize][mbmi->interinter_comp.type - + COMPOUND_WEDGE]; +#endif + update_cdf(fc->compound_type_cdf[bsize], + mbmi->interinter_comp.type - COMPOUND_WEDGE, + MASKED_COMPOUND_TYPES); + } + } + } + if (mbmi->interinter_comp.type == COMPOUND_WEDGE) { + if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) { +#if CONFIG_ENTROPY_STATS + counts->wedge_idx[bsize][mbmi->interinter_comp.wedge_index]++; +#endif + update_cdf(fc->wedge_idx_cdf[bsize], + mbmi->interinter_comp.wedge_index, 16); + } + } + } + } + + 
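// Editorial note: throughout update_stats(), update_cdf(cdf, symbol, nsymbs) + // adapts the relevant entropy-coding CDF toward the symbol actually chosen, + // while the counts->... increments are compiled in only under + // CONFIG_ENTROPY_STATS and serve offline statistics collection. +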
if (inter_block && cm->features.interp_filter == SWITCHABLE && + av1_is_interp_needed(xd)) { + update_filter_type_cdf(xd, mbmi, cm->seq_params->enable_dual_filter); + } + if (inter_block && + !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + const PREDICTION_MODE mode = mbmi->mode; + const int16_t mode_ctx = + av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame); + if (has_second_ref(mbmi)) { +#if CONFIG_ENTROPY_STATS + ++counts->inter_compound_mode[mode_ctx][INTER_COMPOUND_OFFSET(mode)]; +#endif + update_cdf(fc->inter_compound_mode_cdf[mode_ctx], + INTER_COMPOUND_OFFSET(mode), INTER_COMPOUND_MODES); + } else { + av1_update_inter_mode_stats(fc, counts, mode, mode_ctx); + } + + const int new_mv = mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV; + if (new_mv) { + const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + for (int idx = 0; idx < 2; ++idx) { + if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { + const uint8_t drl_ctx = + av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx); + update_cdf(fc->drl_cdf[drl_ctx], mbmi->ref_mv_idx != idx, 2); +#if CONFIG_ENTROPY_STATS + ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx]; +#endif + if (mbmi->ref_mv_idx == idx) break; + } + } + } + + if (have_nearmv_in_inter_mode(mbmi->mode)) { + const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + for (int idx = 1; idx < 3; ++idx) { + if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { + const uint8_t drl_ctx = + av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx); + update_cdf(fc->drl_cdf[drl_ctx], mbmi->ref_mv_idx != idx - 1, 2); +#if CONFIG_ENTROPY_STATS + ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx - 1]; +#endif + if (mbmi->ref_mv_idx == idx - 1) break; + } + } + } + if (have_newmv_in_inter_mode(mbmi->mode)) { + const int allow_hp = cm->features.cur_frame_force_integer_mv + ? MV_SUBPEL_NONE + : cm->features.allow_high_precision_mv; + if (new_mv) { + for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { + const int_mv ref_mv = av1_get_ref_mv(x, ref); + av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc, + allow_hp); + } + } else if (mbmi->mode == NEAREST_NEWMV || mbmi->mode == NEAR_NEWMV) { + const int ref = 1; + const int_mv ref_mv = av1_get_ref_mv(x, ref); + av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc, + allow_hp); + } else if (mbmi->mode == NEW_NEARESTMV || mbmi->mode == NEW_NEARMV) { + const int ref = 0; + const int_mv ref_mv = av1_get_ref_mv(x, ref); + av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc, + allow_hp); + } + } + } +} + +/*!\brief Reconstructs an individual coding block + * + * \ingroup partition_search + * Reconstructs an individual coding block by applying the chosen modes stored + * in ctx, also updates mode counts and entropy models. 
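+ * + * Editorial note on dry_run, as used in this file: with !dry_run + * (OUTPUT_ENABLED) the block is encoded for output and the mode counts / + * entropy contexts are updated; DRY_RUN_NORMAL reconstructs without those + * updates, and DRY_RUN_COSTCOEFFS instead accumulates the token rate into + * *rate (see the palette handling in encode_superblock() above).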
+ * + * \param[in] cpi Top-level encoder structure + * \param[in] tile_data Pointer to struct holding adaptive + * data/contexts/models for the tile during encoding + * \param[in] td Pointer to thread data + * \param[in] tp Pointer to the starting token + * \param[in] mi_row Row coordinate of the block in a step size of MI_SIZE + * \param[in] mi_col Column coordinate of the block in a step size of + * MI_SIZE + * \param[in] dry_run A code indicating whether it is part of the final + * pass for reconstructing the superblock + * \param[in] bsize Current block size + * \param[in] partition Partition mode of the parent block + * \param[in] ctx Pointer to structure holding coding contexts and the + * chosen modes for the current block + * \param[in] rate Pointer to the total rate for the current block + * + * \remark Nothing is returned. Instead, reconstructions (w/o in-loop filters) + * will be updated in the pixel buffers in td->mb.e_mbd. Also, the chosen modes + * will be stored in the MB_MODE_INFO buffer td->mb.e_mbd.mi[0]. + */ +static void encode_b(const AV1_COMP *const cpi, TileDataEnc *tile_data, + ThreadData *td, TokenExtra **tp, int mi_row, int mi_col, + RUN_TYPE dry_run, BLOCK_SIZE bsize, + PARTITION_TYPE partition, PICK_MODE_CONTEXT *const ctx, + int *rate) { + const AV1_COMMON *const cm = &cpi->common; + TileInfo *const tile = &tile_data->tile_info; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + const int subsampling_x = cm->seq_params->subsampling_x; + const int subsampling_y = cm->seq_params->subsampling_y; + + av1_set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize); + const int origin_mult = x->rdmult; + setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL); + MB_MODE_INFO *mbmi = xd->mi[0]; + mbmi->partition = partition; + av1_update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run); + + if (!dry_run) { + set_cb_offsets(x->mbmi_ext_frame->cb_offset, x->cb_offset[PLANE_TYPE_Y], + x->cb_offset[PLANE_TYPE_UV]); + assert(x->cb_offset[PLANE_TYPE_Y] < + (1 << num_pels_log2_lookup[cpi->common.seq_params->sb_size])); + assert(x->cb_offset[PLANE_TYPE_UV] < + ((1 << num_pels_log2_lookup[cpi->common.seq_params->sb_size]) >> + (subsampling_x + subsampling_y))); + } + + encode_superblock(cpi, tile_data, td, tp, dry_run, bsize, rate); + + if (!dry_run) { + update_cb_offsets(x, bsize, subsampling_x, subsampling_y); + if (bsize == cpi->common.seq_params->sb_size && mbmi->skip_txfm == 1 && + cm->delta_q_info.delta_lf_present_flag) { + const int frame_lf_count = + av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) + mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id]; + mbmi->delta_lf_from_base = xd->delta_lf_from_base; + } + if (has_second_ref(mbmi)) { + if (mbmi->compound_idx == 0 || + mbmi->interinter_comp.type == COMPOUND_AVERAGE) + mbmi->comp_group_idx = 0; + else + mbmi->comp_group_idx = 1; + } + + // delta quant applies to both intra and inter + const int super_block_upper_left = + ((mi_row & (cm->seq_params->mib_size - 1)) == 0) && + ((mi_col & (cm->seq_params->mib_size - 1)) == 0); + const DeltaQInfo *const delta_q_info = &cm->delta_q_info; + if (delta_q_info->delta_q_present_flag && + (bsize != cm->seq_params->sb_size || !mbmi->skip_txfm) && + super_block_upper_left) { + xd->current_base_qindex = mbmi->current_qindex; + if (delta_q_info->delta_lf_present_flag) { + if (delta_q_info->delta_lf_multi) { + const int frame_lf_count = + av1_num_planes(cm) > 1 ? 
FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { + xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id]; + } + } else { + xd->delta_lf_from_base = mbmi->delta_lf_from_base; + } + } + } + + RD_COUNTS *rdc = &td->rd_counts; + if (mbmi->skip_mode) { + assert(!frame_is_intra_only(cm)); + rdc->skip_mode_used_flag = 1; + if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) { + assert(has_second_ref(mbmi)); + rdc->compound_ref_used_flag = 1; + } + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + } else { + const int seg_ref_active = + segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME); + if (!seg_ref_active) { + // If the segment reference feature is enabled we have only a single + // reference frame allowed for the segment so exclude it from + // the reference frame counts used to work out probabilities. + if (is_inter_block(mbmi)) { + av1_collect_neighbors_ref_counts(xd); + if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) { + if (has_second_ref(mbmi)) { + // This flag is also updated for 4x4 blocks + rdc->compound_ref_used_flag = 1; + } + } + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + } + } + } + + if (tile_data->allow_update_cdf) update_stats(&cpi->common, td); + + // Gather obmc and warped motion count to update the probability. + if ((cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 && + cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX) || + (cm->features.allow_warped_motion && + cpi->sf.inter_sf.prune_warped_prob_thresh > 0)) { + const int inter_block = is_inter_block(mbmi); + const int seg_ref_active = + segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME); + if (!seg_ref_active && inter_block) { + const MOTION_MODE motion_allowed = + cm->features.switchable_motion_mode + ? motion_mode_allowed(xd->global_motion, xd, mbmi, + cm->features.allow_warped_motion) + : SIMPLE_TRANSLATION; + + if (mbmi->ref_frame[1] != INTRA_FRAME) { + if (motion_allowed >= OBMC_CAUSAL) { + td->rd_counts.obmc_used[bsize][mbmi->motion_mode == OBMC_CAUSAL]++; + } + if (motion_allowed == WARPED_CAUSAL) { + td->rd_counts.warped_used[mbmi->motion_mode == WARPED_CAUSAL]++; + } + } + } + } + } + // TODO(Ravi/Remya): Move this copy function to a better logical place + // This function will copy the best mode information from block + // level (x->mbmi_ext) to frame level (cpi->mbmi_ext_info.frame_base). This + // frame level buffer (cpi->mbmi_ext_info.frame_base) will be used during + // bitstream preparation. + av1_copy_mbmi_ext_to_mbmi_ext_frame(x->mbmi_ext_frame, &x->mbmi_ext, + av1_ref_frame_type(xd->mi[0]->ref_frame)); + x->rdmult = origin_mult; +} + +/*!\brief Reconstructs a partition (may contain multiple coding blocks) + * + * \ingroup partition_search + * Reconstructs a sub-partition of the superblock by applying the chosen modes + * and partition trees stored in pc_tree. 
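+ * + * Editorial summary: for PARTITION_SPLIT the function recurses into the four + * quadrants; every other partition type is emitted as one encode_b() call per + * sub-block, mirroring the switch statement in the function body below.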
+ * + * \param[in] cpi Top-level encoder structure + * \param[in] td Pointer to thread data + * \param[in] tile_data Pointer to struct holding adaptive + * data/contexts/models for the tile during encoding + * \param[in] tp Pointer to the starting token + * \param[in] mi_row Row coordinate of the block in a step size of MI_SIZE + * \param[in] mi_col Column coordinate of the block in a step size of + * MI_SIZE + * \param[in] dry_run A code indicating whether it is part of the final + * pass for reconstructing the superblock + * \param[in] bsize Current block size + * \param[in] pc_tree Pointer to the PC_TREE node storing the picked + * partitions and mode info for the current block + * \param[in] rate Pointer to the total rate for the current block + * + * \remark Nothing is returned. Instead, reconstructions (w/o in-loop filters) + * will be updated in the pixel buffers in td->mb.e_mbd. + */ +static void encode_sb(const AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, TokenExtra **tp, int mi_row, + int mi_col, RUN_TYPE dry_run, BLOCK_SIZE bsize, + PC_TREE *pc_tree, int *rate) { + assert(bsize < BLOCK_SIZES_ALL); + const AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + assert(bsize < BLOCK_SIZES_ALL); + const int hbs = mi_size_wide[bsize] / 2; + const int is_partition_root = bsize >= BLOCK_8X8; + const int ctx = is_partition_root + ? partition_plane_context(xd, mi_row, mi_col, bsize) + : -1; + const PARTITION_TYPE partition = pc_tree->partitioning; + const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition); +#if !CONFIG_REALTIME_ONLY + int quarter_step = mi_size_wide[bsize] / 4; + int i; + BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT); +#endif + + if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return; + if (subsize == BLOCK_INVALID) return; + + if (!dry_run && ctx >= 0) { + const int has_rows = (mi_row + hbs) < mi_params->mi_rows; + const int has_cols = (mi_col + hbs) < mi_params->mi_cols; + + if (has_rows && has_cols) { +#if CONFIG_ENTROPY_STATS + td->counts->partition[ctx][partition]++; +#endif + + if (tile_data->allow_update_cdf) { + FRAME_CONTEXT *fc = xd->tile_ctx; + update_cdf(fc->partition_cdf[ctx], partition, + partition_cdf_length(bsize)); + } + } + } + + switch (partition) { + case PARTITION_NONE: + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize, + partition, pc_tree->none, rate); + break; + case PARTITION_VERT: + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize, + partition, pc_tree->vertical[0], rate); + if (mi_col + hbs < mi_params->mi_cols) { + encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, subsize, + partition, pc_tree->vertical[1], rate); + } + break; + case PARTITION_HORZ: + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize, + partition, pc_tree->horizontal[0], rate); + if (mi_row + hbs < mi_params->mi_rows) { + encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, subsize, + partition, pc_tree->horizontal[1], rate); + } + break; + case PARTITION_SPLIT: + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, dry_run, subsize, + pc_tree->split[0], rate); + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col + hbs, dry_run, subsize, + pc_tree->split[1], rate); + encode_sb(cpi, td, tile_data, tp, mi_row + hbs, mi_col, dry_run, subsize, + pc_tree->split[2], rate); + encode_sb(cpi, td, tile_data, tp, mi_row + hbs, mi_col + 
hbs, dry_run, + subsize, pc_tree->split[3], rate); + break; + +#if !CONFIG_REALTIME_ONLY + case PARTITION_HORZ_A: + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, bsize2, + partition, pc_tree->horizontala[0], rate); + encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, bsize2, + partition, pc_tree->horizontala[1], rate); + encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, subsize, + partition, pc_tree->horizontala[2], rate); + break; + case PARTITION_HORZ_B: + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize, + partition, pc_tree->horizontalb[0], rate); + encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, bsize2, + partition, pc_tree->horizontalb[1], rate); + encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col + hbs, dry_run, + bsize2, partition, pc_tree->horizontalb[2], rate); + break; + case PARTITION_VERT_A: + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, bsize2, + partition, pc_tree->verticala[0], rate); + encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, bsize2, + partition, pc_tree->verticala[1], rate); + encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, subsize, + partition, pc_tree->verticala[2], rate); + + break; + case PARTITION_VERT_B: + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize, + partition, pc_tree->verticalb[0], rate); + encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, bsize2, + partition, pc_tree->verticalb[1], rate); + encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col + hbs, dry_run, + bsize2, partition, pc_tree->verticalb[2], rate); + break; + case PARTITION_HORZ_4: + for (i = 0; i < SUB_PARTITIONS_PART4; ++i) { + int this_mi_row = mi_row + i * quarter_step; + if (i > 0 && this_mi_row >= mi_params->mi_rows) break; + + encode_b(cpi, tile_data, td, tp, this_mi_row, mi_col, dry_run, subsize, + partition, pc_tree->horizontal4[i], rate); + } + break; + case PARTITION_VERT_4: + for (i = 0; i < SUB_PARTITIONS_PART4; ++i) { + int this_mi_col = mi_col + i * quarter_step; + if (i > 0 && this_mi_col >= mi_params->mi_cols) break; + encode_b(cpi, tile_data, td, tp, mi_row, this_mi_col, dry_run, subsize, + partition, pc_tree->vertical4[i], rate); + } + break; +#endif + default: assert(0 && "Invalid partition type."); break; + } + + update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition); +} + +static AOM_INLINE int is_adjust_var_based_part_enabled( + AV1_COMMON *const cm, const PARTITION_SPEED_FEATURES *const part_sf, + BLOCK_SIZE bsize) { + if (part_sf->partition_search_type != VAR_BASED_PARTITION) return 0; + if (part_sf->adjust_var_based_rd_partitioning == 0 || + part_sf->adjust_var_based_rd_partitioning > 2) + return 0; + + if (bsize <= BLOCK_32X32) return 1; + if (part_sf->adjust_var_based_rd_partitioning == 2) { + const int is_larger_qindex = cm->quant_params.base_qindex > 190; + const int is_360p_or_larger = AOMMIN(cm->width, cm->height) >= 360; + return is_360p_or_larger && is_larger_qindex && bsize == BLOCK_64X64; + } + return 0; +} + +/*!\brief AV1 block partition search (partition estimation and partial search). +* +* \ingroup partition_search +* Encode the block by applying pre-calculated partition patterns that are +* represented by coding block sizes stored in the mbmi array. Minor partition +* adjustments are tested and applied if they lead to lower rd costs. The +* partition types are limited to a basic set: none, horz, vert, and split. 
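+* +* Editorial summary: the pre-calculated partition read from mib is first +* re-evaluated against a merged PARTITION_NONE encoding and, for some speed +* settings, against a forced PARTITION_SPLIT of the block; whichever candidate +* ends up with the lowest rdcost is kept (see the comparisons near the end of +* the function).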
+* +* \param[in] cpi Top-level encoder structure +* \param[in] td Pointer to thread data +* \param[in] tile_data Pointer to struct holding adaptive +data/contexts/models for the tile during encoding +* \param[in] mib Array representing MB_MODE_INFO pointers for mi +blocks starting from the first pixel of the current +block +* \param[in] tp Pointer to the starting token +* \param[in] mi_row Row coordinate of the block in a step size of MI_SIZE +* \param[in] mi_col Column coordinate of the block in a step size of +MI_SIZE +* \param[in] bsize Current block size +* \param[in] rate Pointer to the final rate for encoding the current +block +* \param[in] dist Pointer to the final distortion of the current block +* \param[in] do_recon Whether the reconstruction function needs to be run, +either for finalizing a superblock or providing +reference for future sub-partitions +* \param[in] pc_tree Pointer to the PC_TREE node holding the picked +partitions and mode info for the current block +* +* \remark Nothing is returned. The pc_tree struct is modified to store the +* picked partition and modes. The rate and dist are also updated with those +* corresponding to the best partition found. +*/ +void av1_rd_use_partition(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, + MB_MODE_INFO **mib, TokenExtra **tp, int mi_row, + int mi_col, BLOCK_SIZE bsize, int *rate, + int64_t *dist, int do_recon, PC_TREE *pc_tree) { + AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int num_planes = av1_num_planes(cm); + TileInfo *const tile_info = &tile_data->tile_info; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const ModeCosts *mode_costs = &x->mode_costs; + const int bs = mi_size_wide[bsize]; + const int hbs = bs / 2; + const int pl = (bsize >= BLOCK_8X8) + ? partition_plane_context(xd, mi_row, mi_col, bsize) + : 0; + const PARTITION_TYPE partition = + (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize) + : PARTITION_NONE; + const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition); + RD_SEARCH_MACROBLOCK_CONTEXT x_ctx; + RD_STATS last_part_rdc, none_rdc, chosen_rdc, invalid_rdc; + BLOCK_SIZE bs_type = mib[0]->bsize; + int use_partition_none = 0; + x->try_merge_partition = 0; + + if (pc_tree->none == NULL) { + pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf); + if (!pc_tree->none) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PICK_MODE_CONTEXT"); + } + PICK_MODE_CONTEXT *ctx_none = pc_tree->none; + + if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return; + + assert(mi_size_wide[bsize] == mi_size_high[bsize]); + // In rt mode, currently the min partition size is BLOCK_8X8. + assert(bsize >= cpi->sf.part_sf.default_min_partition_size); + + av1_invalid_rd_stats(&last_part_rdc); + av1_invalid_rd_stats(&none_rdc); + av1_invalid_rd_stats(&chosen_rdc); + av1_invalid_rd_stats(&invalid_rdc); + + pc_tree->partitioning = partition; + + xd->above_txfm_context = + cm->above_contexts.txfm[tile_info->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + + if (bsize == BLOCK_16X16 && cpi->vaq_refresh) { + av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); + x->mb_energy = av1_log_block_var(cpi, x, bsize); + } + + // Save rdmult before it might be changed, so it can be restored later. 
+ const int orig_rdmult = x->rdmult; + setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL); + + if (partition != PARTITION_NONE && + is_adjust_var_based_part_enabled(cm, &cpi->sf.part_sf, bsize) && + (mi_row + hbs < mi_params->mi_rows && + mi_col + hbs < mi_params->mi_cols)) { + assert(bsize > cpi->sf.part_sf.default_min_partition_size); + mib[0]->bsize = bsize; + pc_tree->partitioning = PARTITION_NONE; + x->try_merge_partition = 1; + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc, PARTITION_NONE, + bsize, ctx_none, invalid_rdc); + + if (none_rdc.rate < INT_MAX) { + none_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE]; + none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist); + } + + // Try to skip split partition evaluation based on none partition + // characteristics. + if (none_rdc.rate < INT_MAX && none_rdc.skip_txfm == 1) { + use_partition_none = 1; + } + + av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + mib[0]->bsize = bs_type; + pc_tree->partitioning = partition; + } + + for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) { + pc_tree->split[i] = av1_alloc_pc_tree_node(subsize); + if (!pc_tree->split[i]) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PC_TREE"); + pc_tree->split[i]->index = i; + } + switch (partition) { + case PARTITION_NONE: + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, + PARTITION_NONE, bsize, ctx_none, invalid_rdc); + break; + case PARTITION_HORZ: + if (use_partition_none) { + av1_invalid_rd_stats(&last_part_rdc); + break; + } + + for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) { + pc_tree->horizontal[i] = + av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf); + if (!pc_tree->horizontal[i]) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PICK_MODE_CONTEXT"); + } + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, + PARTITION_HORZ, subsize, pc_tree->horizontal[0], + invalid_rdc); + if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && + mi_row + hbs < mi_params->mi_rows) { + RD_STATS tmp_rdc; + const PICK_MODE_CONTEXT *const ctx_h = pc_tree->horizontal[0]; + av1_init_rd_stats(&tmp_rdc); + av1_update_state(cpi, td, ctx_h, mi_row, mi_col, subsize, 1); + encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize, + NULL); + pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc, + PARTITION_HORZ, subsize, pc_tree->horizontal[1], + invalid_rdc); + if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { + av1_invalid_rd_stats(&last_part_rdc); + break; + } + last_part_rdc.rate += tmp_rdc.rate; + last_part_rdc.dist += tmp_rdc.dist; + last_part_rdc.rdcost += tmp_rdc.rdcost; + } + break; + case PARTITION_VERT: + if (use_partition_none) { + av1_invalid_rd_stats(&last_part_rdc); + break; + } + + for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) { + pc_tree->vertical[i] = + av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf); + if (!pc_tree->vertical[i]) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PICK_MODE_CONTEXT"); + } + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, + PARTITION_VERT, subsize, pc_tree->vertical[0], invalid_rdc); + if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && + mi_col + hbs < mi_params->mi_cols) { + RD_STATS tmp_rdc; + const PICK_MODE_CONTEXT *const ctx_v = pc_tree->vertical[0]; + av1_init_rd_stats(&tmp_rdc); + av1_update_state(cpi, td, ctx_v, mi_row, mi_col, subsize, 1); + encode_superblock(cpi, tile_data, td, tp, 
DRY_RUN_NORMAL, subsize, + NULL); + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc, + PARTITION_VERT, subsize, + pc_tree->vertical[bsize > BLOCK_8X8], invalid_rdc); + if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { + av1_invalid_rd_stats(&last_part_rdc); + break; + } + last_part_rdc.rate += tmp_rdc.rate; + last_part_rdc.dist += tmp_rdc.dist; + last_part_rdc.rdcost += tmp_rdc.rdcost; + } + break; + case PARTITION_SPLIT: + if (use_partition_none) { + av1_invalid_rd_stats(&last_part_rdc); + break; + } + + last_part_rdc.rate = 0; + last_part_rdc.dist = 0; + last_part_rdc.rdcost = 0; + for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) { + int x_idx = (i & 1) * hbs; + int y_idx = (i >> 1) * hbs; + int jj = i >> 1, ii = i & 0x01; + RD_STATS tmp_rdc; + if ((mi_row + y_idx >= mi_params->mi_rows) || + (mi_col + x_idx >= mi_params->mi_cols)) + continue; + + av1_init_rd_stats(&tmp_rdc); + av1_rd_use_partition( + cpi, td, tile_data, + mib + jj * hbs * mi_params->mi_stride + ii * hbs, tp, + mi_row + y_idx, mi_col + x_idx, subsize, &tmp_rdc.rate, + &tmp_rdc.dist, i != (SUB_PARTITIONS_SPLIT - 1), pc_tree->split[i]); + if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { + av1_invalid_rd_stats(&last_part_rdc); + break; + } + last_part_rdc.rate += tmp_rdc.rate; + last_part_rdc.dist += tmp_rdc.dist; + } + break; + case PARTITION_VERT_A: + case PARTITION_VERT_B: + case PARTITION_HORZ_A: + case PARTITION_HORZ_B: + case PARTITION_HORZ_4: + case PARTITION_VERT_4: + assert(0 && "Cannot handle extended partition types"); + default: assert(0); break; + } + + if (last_part_rdc.rate < INT_MAX) { + last_part_rdc.rate += mode_costs->partition_cost[pl][partition]; + last_part_rdc.rdcost = + RDCOST(x->rdmult, last_part_rdc.rate, last_part_rdc.dist); + } + + if ((cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION && + cpi->sf.part_sf.adjust_var_based_rd_partitioning > 2) && + partition != PARTITION_SPLIT && bsize > BLOCK_8X8 && + (mi_row + bs < mi_params->mi_rows || + mi_row + hbs == mi_params->mi_rows) && + (mi_col + bs < mi_params->mi_cols || + mi_col + hbs == mi_params->mi_cols)) { + BLOCK_SIZE split_subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + chosen_rdc.rate = 0; + chosen_rdc.dist = 0; + + av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + pc_tree->partitioning = PARTITION_SPLIT; + + // Split partition. 
+    for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+      int x_idx = (i & 1) * hbs;
+      int y_idx = (i >> 1) * hbs;
+      RD_STATS tmp_rdc;
+
+      if ((mi_row + y_idx >= mi_params->mi_rows) ||
+          (mi_col + x_idx >= mi_params->mi_cols))
+        continue;
+
+      av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+      pc_tree->split[i]->partitioning = PARTITION_NONE;
+      if (pc_tree->split[i]->none == NULL)
+        pc_tree->split[i]->none =
+            av1_alloc_pmc(cpi, split_subsize, &td->shared_coeff_buf);
+      if (!pc_tree->split[i]->none)
+        aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                           "Failed to allocate PICK_MODE_CONTEXT");
+      pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx, &tmp_rdc,
+                    PARTITION_SPLIT, split_subsize, pc_tree->split[i]->none,
+                    invalid_rdc);
+
+      av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+      if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+        av1_invalid_rd_stats(&chosen_rdc);
+        break;
+      }
+
+      chosen_rdc.rate += tmp_rdc.rate;
+      chosen_rdc.dist += tmp_rdc.dist;
+
+      if (i != SUB_PARTITIONS_SPLIT - 1)
+        encode_sb(cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx,
+                  OUTPUT_ENABLED, split_subsize, pc_tree->split[i], NULL);
+
+      chosen_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE];
+    }
+    if (chosen_rdc.rate < INT_MAX) {
+      chosen_rdc.rate += mode_costs->partition_cost[pl][PARTITION_SPLIT];
+      chosen_rdc.rdcost = RDCOST(x->rdmult, chosen_rdc.rate, chosen_rdc.dist);
+    }
+  }
+
+  // If last_part is better, set the partitioning to that.
+  if (last_part_rdc.rdcost < chosen_rdc.rdcost) {
+    mib[0]->bsize = bs_type;
+    if (bsize >= BLOCK_8X8) pc_tree->partitioning = partition;
+
+    chosen_rdc = last_part_rdc;
+  }
+  // If none was better, set the partitioning to that. Subtracting
+  // (none_rdc.rdcost >> 9) scales the NONE cost by (1 - 1/512), i.e. a ~0.2%
+  // bias in favour of keeping the single large block.
+  if (none_rdc.rdcost < INT64_MAX &&
+      none_rdc.rdcost - (none_rdc.rdcost >> 9) < chosen_rdc.rdcost) {
+    mib[0]->bsize = bsize;
+    if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE;
+    chosen_rdc = none_rdc;
+  }
+
+  av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+  // We must have chosen a partitioning and encoding or we'll fail later on.
+  // No other opportunities for success.
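+  // (At bsize == sb_size, the assert below enforces this invariant: the
+  // search above must have produced a codable partitioning with finite rate
+  // and distortion, since a superblock cannot be left unencoded.)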
+  if (bsize == cm->seq_params->sb_size)
+    assert(chosen_rdc.rate < INT_MAX && chosen_rdc.dist < INT64_MAX);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, encode_sb_time);
+#endif
+  if (do_recon) {
+    if (bsize == cm->seq_params->sb_size) {
+      // NOTE: To get an estimate of the rate due to the tokens, use:
+      // int rate_coeffs = 0;
+      // encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS,
+      //           bsize, pc_tree, &rate_coeffs);
+      set_cb_offsets(x->cb_offset, 0, 0);
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+                pc_tree, NULL);
+    } else {
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+                pc_tree, NULL);
+    }
+  }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, encode_sb_time);
+#endif
+
+  *rate = chosen_rdc.rate;
+  *dist = chosen_rdc.dist;
+  x->rdmult = orig_rdmult;
+}
+
+static void encode_b_nonrd(const AV1_COMP *const cpi, TileDataEnc *tile_data,
+                           ThreadData *td, TokenExtra **tp, int mi_row,
+                           int mi_col, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+                           PARTITION_TYPE partition,
+                           PICK_MODE_CONTEXT *const ctx, int *rate) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing((AV1_COMP *)cpi, encode_b_nonrd_time);
+#endif
+  const AV1_COMMON *const cm = &cpi->common;
+  TileInfo *const tile = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *xd = &x->e_mbd;
+  av1_set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
+  const int origin_mult = x->rdmult;
+  setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  mbmi->partition = partition;
+  av1_update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run);
+  const int subsampling_x = cpi->common.seq_params->subsampling_x;
+  const int subsampling_y = cpi->common.seq_params->subsampling_y;
+  if (!dry_run) {
+    set_cb_offsets(x->mbmi_ext_frame->cb_offset, x->cb_offset[PLANE_TYPE_Y],
+                   x->cb_offset[PLANE_TYPE_UV]);
+    assert(x->cb_offset[PLANE_TYPE_Y] <
+           (1 << num_pels_log2_lookup[cpi->common.seq_params->sb_size]));
+    assert(x->cb_offset[PLANE_TYPE_UV] <
+           ((1 << num_pels_log2_lookup[cpi->common.seq_params->sb_size]) >>
+            (subsampling_x + subsampling_y)));
+  }
+
+  encode_superblock(cpi, tile_data, td, tp, dry_run, bsize, rate);
+  if (!dry_run) {
+    update_cb_offsets(x, bsize, subsampling_x, subsampling_y);
+    if (has_second_ref(mbmi)) {
+      if (mbmi->compound_idx == 0 ||
+          mbmi->interinter_comp.type == COMPOUND_AVERAGE)
+        mbmi->comp_group_idx = 0;
+      else
+        mbmi->comp_group_idx = 1;
+      mbmi->compound_idx = 1;
+    }
+    RD_COUNTS *const rdc = &td->rd_counts;
+    if (mbmi->skip_mode) {
+      assert(!frame_is_intra_only(cm));
+      rdc->skip_mode_used_flag = 1;
+      if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT &&
+          has_second_ref(mbmi)) {
+        rdc->compound_ref_used_flag = 1;
+      }
+      set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+    } else {
+      const int seg_ref_active =
+          segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME);
+      if (!seg_ref_active) {
+        // If the segment reference feature is enabled, we have only a single
+        // reference frame allowed for the segment, so exclude it from the
+        // reference frame counts used to work out probabilities.
+        if (is_inter_block(mbmi)) {
+          av1_collect_neighbors_ref_counts(xd);
+          if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT &&
+              has_second_ref(mbmi)) {
+            // This flag is also updated for 4x4 blocks
+            rdc->compound_ref_used_flag = 1;
+          }
+          set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+        }
+      }
+    }
+    if (cpi->oxcf.algo_cfg.loopfilter_control == LOOPFILTER_SELECTIVELY &&
+        (mbmi->mode == NEWMV || mbmi->mode < INTRA_MODE_END)) {
+      int32_t blocks = mi_size_high[bsize] * mi_size_wide[bsize];
+      rdc->newmv_or_intra_blocks += blocks;
+    }
+    if (tile_data->allow_update_cdf) update_stats(&cpi->common, td);
+  }
+  if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && mbmi->skip_txfm &&
+      !cpi->rc.rtc_external_ratectrl && cm->seg.enabled)
+    av1_cyclic_reset_segment_skip(cpi, x, mi_row, mi_col, bsize, dry_run);
+  // TODO(Ravi/Remya): Move this copy function to a better logical place
+  // This function will copy the best mode information from block
+  // level (x->mbmi_ext) to frame level (cpi->mbmi_ext_info.frame_base). This
+  // frame level buffer (cpi->mbmi_ext_info.frame_base) will be used during
+  // bitstream preparation.
+  av1_copy_mbmi_ext_to_mbmi_ext_frame(x->mbmi_ext_frame, &x->mbmi_ext,
+                                      av1_ref_frame_type(xd->mi[0]->ref_frame));
+  x->rdmult = origin_mult;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing((AV1_COMP *)cpi, encode_b_nonrd_time);
+#endif
+}
+
+static int get_force_zeromv_skip_flag_for_blk(const AV1_COMP *cpi,
+                                              const MACROBLOCK *x,
+                                              BLOCK_SIZE bsize) {
+  // Force zero MV skip based on the SB level decision.
+  if (x->force_zeromv_skip_for_sb < 2) return x->force_zeromv_skip_for_sb;
+
+  // For blocks of size equal to the superblock size, the decision has already
+  // been made at the superblock level, so the zeromv-skip decision is skipped
+  // here.
+  const AV1_COMMON *const cm = &cpi->common;
+  if (bsize == cm->seq_params->sb_size) return 0;
+
+  const int num_planes = av1_num_planes(cm);
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const unsigned int thresh_exit_part_y =
+      cpi->zeromv_skip_thresh_exit_part[bsize];
+  const unsigned int thresh_exit_part_uv =
+      CALC_CHROMA_THRESH_FOR_ZEROMV_SKIP(thresh_exit_part_y);
+  const unsigned int thresh_exit_part[MAX_MB_PLANE] = { thresh_exit_part_y,
+                                                        thresh_exit_part_uv,
+                                                        thresh_exit_part_uv };
+  const YV12_BUFFER_CONFIG *const yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME);
+  const struct scale_factors *const sf =
+      get_ref_scale_factors_const(cm, LAST_FRAME);
+
+  struct buf_2d yv12_mb[MAX_MB_PLANE];
+  av1_setup_pred_block(xd, yv12_mb, yv12, sf, sf, num_planes);
+
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const struct macroblock_plane *const p = &x->plane[plane];
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    const BLOCK_SIZE bs =
+        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+    const unsigned int plane_sad = cpi->ppi->fn_ptr[bs].sdf(
+        p->src.buf, p->src.stride, yv12_mb[plane].buf, yv12_mb[plane].stride);
+    assert(plane < MAX_MB_PLANE);
+    if (plane_sad >= thresh_exit_part[plane]) return 0;
+  }
+  return 1;
+}
+
+/*!\brief Top level function to pick block mode for non-RD optimized case
+ *
+ * \ingroup partition_search
+ * \callgraph
+ * \callergraph
+ * Searches prediction modes, transform, and coefficient coding modes for an
+ * individual coding block. This function is the top-level function that is
+ * used for non-RD optimized mode search (controlled by
+ * \c cpi->sf.rt_sf.use_nonrd_pick_mode).
+ * Depending on the frame type, it calls the inter/skip/hybrid-intra mode
+ * search functions.
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    tile_data      Pointer to struct holding adaptive
+ *                              data/contexts/models for the tile during
+ *                              encoding
+ * \param[in]    x              Pointer to structure holding all the data for
+ *                              the current macroblock
+ * \param[in]    mi_row         Row coordinate of the block in a step size of
+ *                              MI_SIZE
+ * \param[in]    mi_col         Column coordinate of the block in a step size
+ *                              of MI_SIZE
+ * \param[in]    rd_cost        Pointer to structure holding rate and
+ *                              distortion stats for the current block
+ * \param[in]    bsize          Current block size
+ * \param[in]    ctx            Pointer to structure holding coding contexts
+ *                              and chosen modes for the current block
+ *
+ * \remark Nothing is returned. Instead, the chosen modes and contexts
+ * necessary for reconstruction are stored in ctx, and the rate-distortion
+ * stats are stored in rd_cost. If no valid mode leading to
+ * rd_cost <= best_rd is found, the status is signalled by an INT64_MAX
+ * rd_cost->rdcost.
+ */
+static void pick_sb_modes_nonrd(AV1_COMP *const cpi, TileDataEnc *tile_data,
+                                MACROBLOCK *const x, int mi_row, int mi_col,
+                                RD_STATS *rd_cost, BLOCK_SIZE bsize,
+                                PICK_MODE_CONTEXT *ctx) {
+  // For nonrd mode, av1_set_offsets is already called at the superblock level
+  // in encode_nonrd_sb when we determine the partitioning.
+  if (bsize != cpi->common.seq_params->sb_size ||
+      cpi->sf.rt_sf.nonrd_check_partition_split == 1) {
+    av1_set_offsets(cpi, &tile_data->tile_info, x, mi_row, mi_col, bsize);
+  }
+  assert(x->last_set_offsets_loc.mi_row == mi_row &&
+         x->last_set_offsets_loc.mi_col == mi_col &&
+         x->last_set_offsets_loc.bsize == bsize);
+  AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  struct macroblock_plane *const p = x->plane;
+  struct macroblockd_plane *const pd = xd->plane;
+  const AQ_MODE aq_mode = cpi->oxcf.q_cfg.aq_mode;
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+  int i;
+
+  // This is only needed for real time/allintra row-mt enabled multi-threaded
+  // encoding with cost update frequency set to COST_UPD_TILE/COST_UPD_OFF.
+  wait_for_top_right_sb(&cpi->mt_info.enc_row_mt, &tile_data->row_mt_sync,
+                        &tile_data->tile_info, cm->seq_params->sb_size,
+                        cm->seq_params->mib_size_log2, bsize, mi_row, mi_col);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, pick_sb_modes_nonrd_time);
+#endif
+  // Sets up the tx_type_map buffer in MACROBLOCKD.
+  xd->tx_type_map = txfm_info->tx_type_map_;
+  xd->tx_type_map_stride = mi_size_wide[bsize];
+  for (i = 0; i < num_planes; ++i) {
+    p[i].coeff = ctx->coeff[i];
+    p[i].qcoeff = ctx->qcoeff[i];
+    p[i].dqcoeff = ctx->dqcoeff[i];
+    p[i].eobs = ctx->eobs[i];
+    p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+  }
+  for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
+
+  x->force_zeromv_skip_for_blk =
+      get_force_zeromv_skip_flag_for_blk(cpi, x, bsize);
+
+  // Source variance may already be computed at the superblock level, so there
+  // is no need to recompute it unless bsize < sb_size or source_variance is
+  // not yet set.
+  if (!x->force_zeromv_skip_for_blk &&
+      (x->source_variance == UINT_MAX || bsize < cm->seq_params->sb_size))
+    x->source_variance = av1_get_perpixel_variance_facade(
+        cpi, xd, &x->plane[0].src, bsize, AOM_PLANE_Y);
+
+  // Save rdmult before it might be changed, so it can be restored later.
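+  // Note: x->rdmult is the Lagrange multiplier that the RDCOST() macro uses
+  // to weight rate against distortion, so the per-block adjustment made by
+  // setup_block_rdmult() below changes how aggressively modes are priced for
+  // this block; the saved value is restored at the end of this function.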
+  const int orig_rdmult = x->rdmult;
+  setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, aq_mode, mbmi);
+  // Set error per bit for current rdmult
+  av1_set_error_per_bit(&x->errorperbit, x->rdmult);
+  // Find best coding mode & reconstruct the MB so it is available
+  // as a predictor for MBs that follow in the SB
+  if (frame_is_intra_only(cm)) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, hybrid_intra_mode_search_time);
+#endif
+    hybrid_intra_mode_search(cpi, x, rd_cost, bsize, ctx);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, hybrid_intra_mode_search_time);
+#endif
+  } else {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, nonrd_pick_inter_mode_sb_time);
+#endif
+    if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+      RD_STATS invalid_rd;
+      av1_invalid_rd_stats(&invalid_rd);
+      // TODO(kyslov): add av1_nonrd_pick_inter_mode_sb_seg_skip
+      av1_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, mi_row, mi_col,
+                                         rd_cost, bsize, ctx,
+                                         invalid_rd.rdcost);
+    } else {
+      av1_nonrd_pick_inter_mode_sb(cpi, tile_data, x, rd_cost, bsize, ctx);
+    }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, nonrd_pick_inter_mode_sb_time);
+#endif
+  }
+  if (cpi->sf.rt_sf.skip_cdef_sb) {
+    // cdef_strength is initialized to 1, which means skip_cdef, and is updated
+    // here. Check to see if skipping cdef is allowed.
+    const int allow_cdef_skipping =
+        cpi->rc.frames_since_key > 10 && !cpi->rc.high_source_sad &&
+        !(x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] ||
+          x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]);
+
+    // Find the corresponding 64x64 block. It'll be the 128x128 block if that's
+    // the block size.
+    const int mi_row_sb = mi_row - mi_row % MI_SIZE_64X64;
+    const int mi_col_sb = mi_col - mi_col % MI_SIZE_64X64;
+    MB_MODE_INFO **mi_sb =
+        cm->mi_params.mi_grid_base +
+        get_mi_grid_idx(&cm->mi_params, mi_row_sb, mi_col_sb);
+    // Do not skip if intra or new mv is picked, or color sensitivity is set.
+    // Never skip on slide/scene change.
+    if (cpi->sf.rt_sf.skip_cdef_sb >= 2) {
+      mi_sb[0]->cdef_strength =
+          mi_sb[0]->cdef_strength &&
+          (allow_cdef_skipping || x->source_variance == 0);
+    } else {
+      mi_sb[0]->cdef_strength =
+          mi_sb[0]->cdef_strength && allow_cdef_skipping &&
+          !(mbmi->mode < INTRA_MODES || mbmi->mode == NEWMV);
+    }
+    // Store in the pickmode context.
+    ctx->mic.cdef_strength = mi_sb[0]->cdef_strength;
+  }
+  x->rdmult = orig_rdmult;
+  ctx->rd_stats.rate = rd_cost->rate;
+  ctx->rd_stats.dist = rd_cost->dist;
+  ctx->rd_stats.rdcost = rd_cost->rdcost;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, pick_sb_modes_nonrd_time);
+#endif
+}
+
+static int try_split_partition(AV1_COMP *const cpi, ThreadData *const td,
+                               TileDataEnc *const tile_data,
+                               TileInfo *const tile_info, TokenExtra **tp,
+                               MACROBLOCK *const x, MACROBLOCKD *const xd,
+                               const CommonModeInfoParams *const mi_params,
+                               const int mi_row, const int mi_col,
+                               const BLOCK_SIZE bsize, const int pl,
+                               PC_TREE *pc_tree) {
+  AV1_COMMON *const cm = &cpi->common;
+  const ModeCosts *mode_costs = &x->mode_costs;
+  const int hbs = mi_size_wide[bsize] / 2;
+  if (mi_row + mi_size_high[bsize] >= mi_params->mi_rows ||
+      mi_col + mi_size_wide[bsize] >= mi_params->mi_cols)
+    return 0;
+  if (bsize <= BLOCK_8X8 || frame_is_intra_only(cm)) return 0;
+  if (x->content_state_sb.source_sad_nonrd <= kLowSad) return 0;
+
+  // Do not try the split partition when the source sad is small, or
+  // the prediction residual is small.
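+  // Illustrative figures for the gate computed below (derived from the code
+  // rather than an upstream comment): for a 64x64 luma block, blk_pix is
+  // 64 * 64 = 4096 and block_sad sums the SAD of all planes, so with
+  // threshold = 25 the split check proceeds only when the summed SAD
+  // averages at least 25 per luma pixel.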
+ const YV12_BUFFER_CONFIG *const yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME); + const struct scale_factors *const sf = + get_ref_scale_factors_const(cm, LAST_FRAME); + const int num_planes = av1_num_planes(cm); + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize); + av1_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, sf, num_planes); + int block_sad = 0; + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE bs = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + const unsigned int plane_sad = cpi->ppi->fn_ptr[bs].sdf( + p->src.buf, p->src.stride, pd->pre[0].buf, pd->pre[0].stride); + block_sad += plane_sad; + } + const int blk_pix = block_size_wide[bsize] * block_size_high[bsize]; + const int block_avg_sad = block_sad / blk_pix; + // TODO(chengchen): find a proper threshold. It might change according to + // q as well. + const int threshold = 25; + if (block_avg_sad < threshold) return 0; + + RD_SEARCH_MACROBLOCK_CONTEXT x_ctx; + RD_STATS split_rdc, none_rdc; + av1_invalid_rd_stats(&split_rdc); + av1_invalid_rd_stats(&none_rdc); + av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, 3); + xd->above_txfm_context = + cm->above_contexts.txfm[tile_info->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + + // Calculate rdcost for none partition + pc_tree->partitioning = PARTITION_NONE; + av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); + if (!pc_tree->none) { + pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf); + if (!pc_tree->none) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PICK_MODE_CONTEXT"); + } else { + av1_reset_pmc(pc_tree->none); + } + pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &none_rdc, bsize, + pc_tree->none); + none_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE]; + none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist); + av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3); + + // Calculate rdcost for split partition + pc_tree->partitioning = PARTITION_SPLIT; + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + av1_init_rd_stats(&split_rdc); + split_rdc.rate += mode_costs->partition_cost[pl][PARTITION_SPLIT]; + if (subsize >= BLOCK_8X8) { + split_rdc.rate += (mode_costs->partition_cost[pl][PARTITION_NONE] * 4); + } + for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) { + if (!pc_tree->split[i]) { + pc_tree->split[i] = av1_alloc_pc_tree_node(subsize); + if (!pc_tree->split[i]) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PC_TREE"); + } + pc_tree->split[i]->index = i; + } + for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) { + RD_STATS block_rdc; + av1_invalid_rd_stats(&block_rdc); + int x_idx = (i & 1) * hbs; + int y_idx = (i >> 1) * hbs; + if ((mi_row + y_idx >= mi_params->mi_rows) || + (mi_col + x_idx >= mi_params->mi_cols)) + continue; + xd->above_txfm_context = + cm->above_contexts.txfm[tile_info->tile_row] + mi_col + x_idx; + xd->left_txfm_context = + xd->left_txfm_context_buffer + ((mi_row + y_idx) & MAX_MIB_MASK); + if (!pc_tree->split[i]->none) { + pc_tree->split[i]->none = + av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf); + if (!pc_tree->split[i]->none) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PICK_MODE_CONTEXT"); + } else { + 
av1_reset_pmc(pc_tree->split[i]->none);
+    }
+    pc_tree->split[i]->partitioning = PARTITION_NONE;
+    pick_sb_modes_nonrd(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
+                        &block_rdc, subsize, pc_tree->split[i]->none);
+    split_rdc.rate += block_rdc.rate;
+    split_rdc.dist += block_rdc.dist;
+    av1_rd_cost_update(x->rdmult, &split_rdc);
+    if (none_rdc.rdcost < split_rdc.rdcost) break;
+    if (i != SUB_PARTITIONS_SPLIT - 1)
+      encode_b_nonrd(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 1,
+                     subsize, PARTITION_NONE, pc_tree->split[i]->none, NULL);
+  }
+  av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+  split_rdc.rdcost = RDCOST(x->rdmult, split_rdc.rate, split_rdc.dist);
+  const int split = split_rdc.rdcost < none_rdc.rdcost;
+
+  return split;
+}
+
+// Returns whether SPLIT partitions should be evaluated.
+static bool calc_do_split_flag(const AV1_COMP *cpi, const MACROBLOCK *x,
+                               const PC_TREE *pc_tree, const RD_STATS *none_rdc,
+                               const CommonModeInfoParams *mi_params,
+                               int mi_row, int mi_col, int hbs,
+                               BLOCK_SIZE bsize, PARTITION_TYPE partition) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int is_larger_qindex = cm->quant_params.base_qindex > 100;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  bool do_split =
+      (cpi->sf.rt_sf.nonrd_check_partition_merge_mode == 3)
+          ? (bsize <= BLOCK_32X32 || (is_larger_qindex && bsize <= BLOCK_64X64))
+          : true;
+  if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN ||
+      cpi->sf.rt_sf.nonrd_check_partition_merge_mode < 2 ||
+      cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) ||
+      !none_rdc->skip_txfm)
+    return do_split;
+
+  const int use_model_yrd_large = get_model_rd_flag(cpi, xd, bsize);
+
+  // When model based skip is not used (i.e., use_model_yrd_large = 0),
+  // skip_txfm would have been populated based on the Hadamard transform and
+  // the skip_txfm flag is more reliable. Hence SPLIT evaluation is disabled at
+  // all quantizers for 8x8 and 16x16 blocks.
+  // When model based skip is used (i.e., use_model_yrd_large = 1), skip_txfm
+  // may not be reliable. Hence SPLIT evaluation is disabled only at lower
+  // quantizers for blocks >= 32x32.
+  if ((!use_model_yrd_large) || (!is_larger_qindex)) return false;
+
+  // Use residual statistics to decide if the SPLIT partition should be
+  // evaluated for 32x32 blocks. The pruning logic is avoided for larger block
+  // sizes to avoid visual artifacts.
+  if (pc_tree->none->mic.mode == NEWMV && bsize == BLOCK_32X32 && do_split) {
+    const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+    assert(subsize < BLOCK_SIZES_ALL);
+    double min_per_pixel_error = DBL_MAX;
+    double max_per_pixel_error = 0.;
+    int i;
+    for (i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+      const int x_idx = (i & 1) * hbs;
+      const int y_idx = (i >> 1) * hbs;
+      if ((mi_row + y_idx >= mi_params->mi_rows) ||
+          (mi_col + x_idx >= mi_params->mi_cols)) {
+        break;
+      }
+
+      // Populate the appropriate buffer pointers.
+      // Pass scale factors as NULL as the base pointer of the block would have
+      // been calculated appropriately.
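+      // For reference: curr_per_pixel_error computed below is roughly the
+      // per-pixel standard deviation of the prediction residual in a quadrant
+      // (sqrt of the variance divided by the pixel count). SPLIT is pruned
+      // further down when the four quadrants have similar error
+      // (max - min <= 1.5), on the assumption that a uniform residual gains
+      // little from finer partitioning.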
+ struct buf_2d src_split_buf_2d, pred_split_buf_2d; + const struct buf_2d *src_none_buf_2d = &x->plane[AOM_PLANE_Y].src; + setup_pred_plane(&src_split_buf_2d, subsize, src_none_buf_2d->buf, + src_none_buf_2d->width, src_none_buf_2d->height, + src_none_buf_2d->stride, y_idx, x_idx, NULL, 0, 0); + const struct buf_2d *pred_none_buf_2d = &xd->plane[AOM_PLANE_Y].dst; + setup_pred_plane(&pred_split_buf_2d, subsize, pred_none_buf_2d->buf, + pred_none_buf_2d->width, pred_none_buf_2d->height, + pred_none_buf_2d->stride, y_idx, x_idx, NULL, 0, 0); + + unsigned int curr_uint_mse; + const unsigned int curr_uint_var = cpi->ppi->fn_ptr[subsize].vf( + src_split_buf_2d.buf, src_split_buf_2d.stride, pred_split_buf_2d.buf, + pred_split_buf_2d.stride, &curr_uint_mse); + const double curr_per_pixel_error = + sqrt((double)curr_uint_var / block_size_wide[subsize] / + block_size_high[subsize]); + if (curr_per_pixel_error < min_per_pixel_error) + min_per_pixel_error = curr_per_pixel_error; + if (curr_per_pixel_error > max_per_pixel_error) + max_per_pixel_error = curr_per_pixel_error; + } + + // Prune based on residual statistics only if all the sub-partitions are + // valid. + if (i == SUB_PARTITIONS_SPLIT) { + if (max_per_pixel_error - min_per_pixel_error <= 1.5) do_split = false; + } + } + + return do_split; +} + +static void try_merge(AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, MB_MODE_INFO **mib, + TokenExtra **tp, const int mi_row, const int mi_col, + const BLOCK_SIZE bsize, PC_TREE *const pc_tree, + const PARTITION_TYPE partition, const BLOCK_SIZE subsize, + const int pl) { + AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + TileInfo *const tile_info = &tile_data->tile_info; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const ModeCosts *mode_costs = &x->mode_costs; + const int num_planes = av1_num_planes(cm); + // Only square blocks from 8x8 to 128x128 are supported + assert(bsize >= BLOCK_8X8 && bsize <= BLOCK_128X128); + const int bs = mi_size_wide[bsize]; + const int hbs = bs / 2; + bool do_split = false; + RD_SEARCH_MACROBLOCK_CONTEXT x_ctx; + RD_STATS split_rdc, none_rdc; + av1_invalid_rd_stats(&split_rdc); + av1_invalid_rd_stats(&none_rdc); + av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + xd->above_txfm_context = + cm->above_contexts.txfm[tile_info->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + pc_tree->partitioning = PARTITION_NONE; + if (!pc_tree->none) { + pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf); + if (!pc_tree->none) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PICK_MODE_CONTEXT"); + } else { + av1_reset_pmc(pc_tree->none); + } + pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &none_rdc, bsize, + pc_tree->none); + none_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE]; + none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist); + av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + + if (cpi->sf.rt_sf.nonrd_check_partition_merge_mode < 2 || + none_rdc.skip_txfm != 1 || pc_tree->none->mic.mode == NEWMV) { + do_split = calc_do_split_flag(cpi, x, pc_tree, &none_rdc, mi_params, mi_row, + mi_col, hbs, bsize, partition); + if (do_split) { + av1_init_rd_stats(&split_rdc); + split_rdc.rate += mode_costs->partition_cost[pl][PARTITION_SPLIT]; + for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) { + RD_STATS block_rdc; + 
av1_invalid_rd_stats(&block_rdc);
+        int x_idx = (i & 1) * hbs;
+        int y_idx = (i >> 1) * hbs;
+        if ((mi_row + y_idx >= mi_params->mi_rows) ||
+            (mi_col + x_idx >= mi_params->mi_cols))
+          continue;
+        xd->above_txfm_context =
+            cm->above_contexts.txfm[tile_info->tile_row] + mi_col + x_idx;
+        xd->left_txfm_context =
+            xd->left_txfm_context_buffer + ((mi_row + y_idx) & MAX_MIB_MASK);
+        if (!pc_tree->split[i]->none) {
+          pc_tree->split[i]->none =
+              av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+          if (!pc_tree->split[i]->none)
+            aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                               "Failed to allocate PICK_MODE_CONTEXT");
+        } else {
+          av1_reset_pmc(pc_tree->split[i]->none);
+        }
+        pc_tree->split[i]->partitioning = PARTITION_NONE;
+        pick_sb_modes_nonrd(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
+                            &block_rdc, subsize, pc_tree->split[i]->none);
+        // TODO(yunqingwang): The rate here does not include the cost of
+        // signaling the PARTITION_NONE token in the sub-blocks.
+        split_rdc.rate += block_rdc.rate;
+        split_rdc.dist += block_rdc.dist;
+
+        av1_rd_cost_update(x->rdmult, &split_rdc);
+
+        if (none_rdc.rdcost < split_rdc.rdcost) {
+          break;
+        }
+
+        if (i != SUB_PARTITIONS_SPLIT - 1)
+          encode_b_nonrd(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx,
+                         1, subsize, PARTITION_NONE, pc_tree->split[i]->none,
+                         NULL);
+      }
+      av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+      split_rdc.rdcost = RDCOST(x->rdmult, split_rdc.rate, split_rdc.dist);
+    }
+  }
+
+  if (none_rdc.rdcost < split_rdc.rdcost) {
+    /* Predicted samples cannot be reused for PARTITION_NONE since the same
+     * buffer is being used to store the reconstructed samples of the
+     * PARTITION_SPLIT block. */
+    if (do_split) x->reuse_inter_pred = false;
+
+    mib[0]->bsize = bsize;
+    pc_tree->partitioning = PARTITION_NONE;
+    encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize, partition,
+                   pc_tree->none, NULL);
+  } else {
+    mib[0]->bsize = subsize;
+    pc_tree->partitioning = PARTITION_SPLIT;
+    /* Predicted samples cannot be reused for PARTITION_SPLIT since the same
+     * buffer is being used to write the reconstructed samples. */
+    // TODO(Cherma): Store and reuse predicted samples generated by
+    // encode_b_nonrd() in DRY_RUN_NORMAL mode.
+    x->reuse_inter_pred = false;
+
+    for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+      int x_idx = (i & 1) * hbs;
+      int y_idx = (i >> 1) * hbs;
+      if ((mi_row + y_idx >= mi_params->mi_rows) ||
+          (mi_col + x_idx >= mi_params->mi_cols))
+        continue;
+
+      // Note: We don't reset pc_tree->split[i]->none here because it
+      // could contain results from the additional check. Instead, it is
+      // reset before we enter the nonrd_check_partition_merge_mode
+      // condition.
+      if (!pc_tree->split[i]->none) {
+        pc_tree->split[i]->none =
+            av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+        if (!pc_tree->split[i]->none)
+          aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                             "Failed to allocate PICK_MODE_CONTEXT");
+      }
+      encode_b_nonrd(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 0,
+                     subsize, PARTITION_NONE, pc_tree->split[i]->none, NULL);
+    }
+  }
+}
+
+// Evaluate whether the sub-partitions can be merged directly into a large
+// partition without calculating the RD cost.
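+// For reference, merging is attempted only when all four sub-blocks agree on
+// a cheap-to-verify set of properties checked below: none is further split,
+// all skip the transform, all use the same single (non-intra, non-compound)
+// reference frame and the same NEARESTMV or GLOBALMV mode with identical
+// motion vectors, SIMPLE_TRANSLATION motion, identical interpolation filters,
+// and the same segment id; finally, the merged block's own NEARESTMV
+// candidate must reproduce the sub-blocks' motion vector.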
+static void direct_partition_merging(AV1_COMP *cpi, ThreadData *td,
+                                     TileDataEnc *tile_data, MB_MODE_INFO **mib,
+                                     int mi_row, int mi_col, BLOCK_SIZE bsize) {
+  AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  TileInfo *const tile_info = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int bs = mi_size_wide[bsize];
+  const int hbs = bs / 2;
+  const PARTITION_TYPE partition =
+      (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize)
+                           : PARTITION_NONE;
+  BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+
+  MB_MODE_INFO **b0 = mib;
+  MB_MODE_INFO **b1 = mib + hbs;
+  MB_MODE_INFO **b2 = mib + hbs * mi_params->mi_stride;
+  MB_MODE_INFO **b3 = mib + hbs * mi_params->mi_stride + hbs;
+
+  // Check if the following conditions are met. This can be updated
+  // later with more support added.
+  const int further_split = b0[0]->bsize < subsize || b1[0]->bsize < subsize ||
+                            b2[0]->bsize < subsize || b3[0]->bsize < subsize;
+  if (further_split) return;
+
+  const int no_skip = !b0[0]->skip_txfm || !b1[0]->skip_txfm ||
+                      !b2[0]->skip_txfm || !b3[0]->skip_txfm;
+  if (no_skip) return;
+
+  const int compound = (b0[0]->ref_frame[1] != b1[0]->ref_frame[1] ||
+                        b0[0]->ref_frame[1] != b2[0]->ref_frame[1] ||
+                        b0[0]->ref_frame[1] != b3[0]->ref_frame[1] ||
+                        b0[0]->ref_frame[1] > NONE_FRAME);
+  if (compound) return;
+
+  // Intra modes aren't considered here.
+  const int different_ref = (b0[0]->ref_frame[0] != b1[0]->ref_frame[0] ||
+                             b0[0]->ref_frame[0] != b2[0]->ref_frame[0] ||
+                             b0[0]->ref_frame[0] != b3[0]->ref_frame[0] ||
+                             b0[0]->ref_frame[0] <= INTRA_FRAME);
+  if (different_ref) return;
+
+  const int different_mode =
+      (b0[0]->mode != b1[0]->mode || b0[0]->mode != b2[0]->mode ||
+       b0[0]->mode != b3[0]->mode);
+  if (different_mode) return;
+
+  const int unsupported_mode =
+      (b0[0]->mode != NEARESTMV && b0[0]->mode != GLOBALMV);
+  if (unsupported_mode) return;
+
+  const int different_mv = (b0[0]->mv[0].as_int != b1[0]->mv[0].as_int ||
+                            b0[0]->mv[0].as_int != b2[0]->mv[0].as_int ||
+                            b0[0]->mv[0].as_int != b3[0]->mv[0].as_int);
+  if (different_mv) return;
+
+  const int unsupported_motion_mode =
+      (b0[0]->motion_mode != b1[0]->motion_mode ||
+       b0[0]->motion_mode != b2[0]->motion_mode ||
+       b0[0]->motion_mode != b3[0]->motion_mode ||
+       b0[0]->motion_mode != SIMPLE_TRANSLATION);
+  if (unsupported_motion_mode) return;
+
+  const int different_filter =
+      (b0[0]->interp_filters.as_int != b1[0]->interp_filters.as_int ||
+       b0[0]->interp_filters.as_int != b2[0]->interp_filters.as_int ||
+       b0[0]->interp_filters.as_int != b3[0]->interp_filters.as_int);
+  if (different_filter) return;
+
+  const int different_seg = (b0[0]->segment_id != b1[0]->segment_id ||
+                             b0[0]->segment_id != b2[0]->segment_id ||
+                             b0[0]->segment_id != b3[0]->segment_id);
+  if (different_seg) return;
+
+  // Evaluate the ref_mv.
+  MB_MODE_INFO **this_mi = mib;
+  BLOCK_SIZE orig_bsize = this_mi[0]->bsize;
+  const PARTITION_TYPE orig_partition = this_mi[0]->partition;
+
+  this_mi[0]->bsize = bsize;
+  this_mi[0]->partition = PARTITION_NONE;
+  this_mi[0]->skip_txfm = 1;
+
+  // TODO(yunqing): functions called below can be optimized by
+  // removing unrelated operations.
+ av1_set_offsets_without_segment_id(cpi, &tile_data->tile_info, x, mi_row, + mi_col, bsize); + + const MV_REFERENCE_FRAME ref_frame = this_mi[0]->ref_frame[0]; + int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES]; + struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]; + int force_skip_low_temp_var = 0; + int skip_pred_mv = 0; + bool use_scaled_ref; + + for (int i = 0; i < MB_MODE_COUNT; ++i) { + for (int j = 0; j < REF_FRAMES; ++j) { + frame_mv[i][j].as_int = INVALID_MV; + } + } + av1_copy(x->color_sensitivity, x->color_sensitivity_sb); + skip_pred_mv = (x->nonrd_prune_ref_frame_search > 2 && + x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] != 2 && + x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] != 2); + + find_predictors(cpi, x, ref_frame, frame_mv, yv12_mb, bsize, + force_skip_low_temp_var, skip_pred_mv, &use_scaled_ref); + + int continue_merging = 1; + if (frame_mv[NEARESTMV][ref_frame].as_mv.row != b0[0]->mv[0].as_mv.row || + frame_mv[NEARESTMV][ref_frame].as_mv.col != b0[0]->mv[0].as_mv.col) + continue_merging = 0; + + if (!continue_merging) { + this_mi[0]->bsize = orig_bsize; + this_mi[0]->partition = orig_partition; + + // TODO(yunqing): Store the results and restore here instead of + // calling find_predictors() again. + av1_set_offsets_without_segment_id(cpi, &tile_data->tile_info, x, mi_row, + mi_col, this_mi[0]->bsize); + find_predictors(cpi, x, ref_frame, frame_mv, yv12_mb, this_mi[0]->bsize, + force_skip_low_temp_var, skip_pred_mv, &use_scaled_ref); + } else { + struct scale_factors *sf = get_ref_scale_factors(cm, ref_frame); + const int is_scaled = av1_is_scaled(sf); + const int is_y_subpel_mv = (abs(this_mi[0]->mv[0].as_mv.row) % 8) || + (abs(this_mi[0]->mv[0].as_mv.col) % 8); + const int is_uv_subpel_mv = (abs(this_mi[0]->mv[0].as_mv.row) % 16) || + (abs(this_mi[0]->mv[0].as_mv.col) % 16); + + if (cpi->ppi->use_svc || is_scaled || is_y_subpel_mv || is_uv_subpel_mv) { + const int num_planes = av1_num_planes(cm); + set_ref_ptrs(cm, xd, ref_frame, this_mi[0]->ref_frame[1]); + const YV12_BUFFER_CONFIG *cfg = get_ref_frame_yv12_buf(cm, ref_frame); + av1_setup_pre_planes(xd, 0, cfg, mi_row, mi_col, + xd->block_ref_scale_factors[0], num_planes); + + if (!cpi->ppi->use_svc && !is_scaled && !is_y_subpel_mv) { + assert(is_uv_subpel_mv == 1); + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 1, + num_planes - 1); + } else { + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, + num_planes - 1); + } + } + + // Copy out mbmi_ext information. + MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; + MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame = x->mbmi_ext_frame; + av1_copy_mbmi_ext_to_mbmi_ext_frame( + mbmi_ext_frame, mbmi_ext, av1_ref_frame_type(this_mi[0]->ref_frame)); + + const BLOCK_SIZE this_subsize = + get_partition_subsize(bsize, this_mi[0]->partition); + // Update partition contexts. + update_ext_partition_context(xd, mi_row, mi_col, this_subsize, bsize, + this_mi[0]->partition); + + const int num_planes = av1_num_planes(cm); + av1_reset_entropy_context(xd, bsize, num_planes); + + // Note: use x->txfm_search_params.tx_mode_search_type instead of + // cm->features.tx_mode here. + TX_SIZE tx_size = + tx_size_from_tx_mode(bsize, x->txfm_search_params.tx_mode_search_type); + if (xd->lossless[this_mi[0]->segment_id]) tx_size = TX_4X4; + this_mi[0]->tx_size = tx_size; + memset(this_mi[0]->inter_tx_size, this_mi[0]->tx_size, + sizeof(this_mi[0]->inter_tx_size)); + + // Update txfm contexts. 
+ xd->above_txfm_context = + cm->above_contexts.txfm[tile_info->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + set_txfm_ctxs(this_mi[0]->tx_size, xd->width, xd->height, + this_mi[0]->skip_txfm && is_inter_block(this_mi[0]), xd); + + // Update mi for this partition block. + for (int y = 0; y < bs; y++) { + for (int x_idx = 0; x_idx < bs; x_idx++) { + this_mi[x_idx + y * mi_params->mi_stride] = this_mi[0]; + } + } + } +} + +/*!\brief AV1 block partition application (minimal RD search). +* +* \ingroup partition_search +* \callgraph +* \callergraph +* Encode the block by applying pre-calculated partition patterns that are +* represented by coding block sizes stored in the mbmi array. The only +* partition adjustment allowed is merging leaf split nodes if it leads to a +* lower rd cost. The partition types are limited to a basic set: none, horz, +* vert, and split. This function is only used in the real-time mode. +* +* \param[in] cpi Top-level encoder structure +* \param[in] td Pointer to thread data +* \param[in] tile_data Pointer to struct holding adaptive +data/contexts/models for the tile during encoding +* \param[in] mib Array representing MB_MODE_INFO pointers for mi +blocks starting from the first pixel of the current +block +* \param[in] tp Pointer to the starting token +* \param[in] mi_row Row coordinate of the block in a step size of MI_SIZE +* \param[in] mi_col Column coordinate of the block in a step size of +MI_SIZE +* \param[in] bsize Current block size +* \param[in] pc_tree Pointer to the PC_TREE node holding the picked +partitions and mode info for the current block +* +* \remark Nothing is returned. The pc_tree struct is modified to store the +* picked partition and modes. +*/ +void av1_nonrd_use_partition(AV1_COMP *cpi, ThreadData *td, + TileDataEnc *tile_data, MB_MODE_INFO **mib, + TokenExtra **tp, int mi_row, int mi_col, + BLOCK_SIZE bsize, PC_TREE *pc_tree) { + AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + TileInfo *const tile_info = &tile_data->tile_info; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const ModeCosts *mode_costs = &x->mode_costs; + // Only square blocks from 8x8 to 128x128 are supported + assert(bsize >= BLOCK_8X8 && bsize <= BLOCK_128X128); + const int bs = mi_size_wide[bsize]; + const int hbs = bs / 2; + PARTITION_TYPE partition = (bsize >= BLOCK_8X8) + ? get_partition(cm, mi_row, mi_col, bsize) + : PARTITION_NONE; + BLOCK_SIZE subsize = get_partition_subsize(bsize, partition); + assert(subsize <= BLOCK_LARGEST); + const int pl = (bsize >= BLOCK_8X8) + ? 
partition_plane_context(xd, mi_row, mi_col, bsize) + : 0; + + RD_STATS dummy_cost; + av1_invalid_rd_stats(&dummy_cost); + + if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return; + + assert(mi_size_wide[bsize] == mi_size_high[bsize]); + + xd->above_txfm_context = + cm->above_contexts.txfm[tile_info->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + + // Initialize default mode evaluation params + set_mode_eval_params(cpi, x, DEFAULT_EVAL); + + x->reuse_inter_pred = cpi->sf.rt_sf.reuse_inter_pred_nonrd; + + int change_none_to_split = 0; + if (partition == PARTITION_NONE && + cpi->sf.rt_sf.nonrd_check_partition_split == 1) { + change_none_to_split = + try_split_partition(cpi, td, tile_data, tile_info, tp, x, xd, mi_params, + mi_row, mi_col, bsize, pl, pc_tree); + if (change_none_to_split) { + partition = PARTITION_SPLIT; + subsize = get_partition_subsize(bsize, partition); + assert(subsize <= BLOCK_LARGEST); + } + } + + pc_tree->partitioning = partition; + + switch (partition) { + case PARTITION_NONE: + if (!pc_tree->none) { + pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf); + if (!pc_tree->none) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PICK_MODE_CONTEXT"); + } else { + av1_reset_pmc(pc_tree->none); + } + pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &dummy_cost, bsize, + pc_tree->none); + encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize, + partition, pc_tree->none, NULL); + break; + case PARTITION_VERT: + for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) { + if (!pc_tree->vertical[i]) { + pc_tree->vertical[i] = + av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf); + if (!pc_tree->vertical[i]) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PICK_MODE_CONTEXT"); + } else { + av1_reset_pmc(pc_tree->vertical[i]); + } + } + pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &dummy_cost, + subsize, pc_tree->vertical[0]); + encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, subsize, + PARTITION_VERT, pc_tree->vertical[0], NULL); + if (mi_col + hbs < mi_params->mi_cols && bsize > BLOCK_8X8) { + pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col + hbs, + &dummy_cost, subsize, pc_tree->vertical[1]); + encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col + hbs, 0, subsize, + PARTITION_VERT, pc_tree->vertical[1], NULL); + } + break; + case PARTITION_HORZ: + for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) { + if (!pc_tree->horizontal[i]) { + pc_tree->horizontal[i] = + av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf); + if (!pc_tree->horizontal[i]) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PICK_MODE_CONTEXT"); + } else { + av1_reset_pmc(pc_tree->horizontal[i]); + } + } + pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &dummy_cost, + subsize, pc_tree->horizontal[0]); + encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, subsize, + PARTITION_HORZ, pc_tree->horizontal[0], NULL); + + if (mi_row + hbs < mi_params->mi_rows && bsize > BLOCK_8X8) { + pick_sb_modes_nonrd(cpi, tile_data, x, mi_row + hbs, mi_col, + &dummy_cost, subsize, pc_tree->horizontal[1]); + encode_b_nonrd(cpi, tile_data, td, tp, mi_row + hbs, mi_col, 0, subsize, + PARTITION_HORZ, pc_tree->horizontal[1], NULL); + } + break; + case PARTITION_SPLIT: + for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) { + if (!pc_tree->split[i]) { + pc_tree->split[i] = av1_alloc_pc_tree_node(subsize); + if 
(!pc_tree->split[i]) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PC_TREE"); + } + pc_tree->split[i]->index = i; + } + if (cpi->sf.rt_sf.nonrd_check_partition_merge_mode && + av1_is_leaf_split_partition(cm, mi_row, mi_col, bsize) && + !frame_is_intra_only(cm) && bsize <= BLOCK_64X64) { + try_merge(cpi, td, tile_data, mib, tp, mi_row, mi_col, bsize, pc_tree, + partition, subsize, pl); + } else { + for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) { + int x_idx = (i & 1) * hbs; + int y_idx = (i >> 1) * hbs; + int jj = i >> 1, ii = i & 0x01; + if ((mi_row + y_idx >= mi_params->mi_rows) || + (mi_col + x_idx >= mi_params->mi_cols)) + continue; + av1_nonrd_use_partition( + cpi, td, tile_data, + mib + jj * hbs * mi_params->mi_stride + ii * hbs, tp, + mi_row + y_idx, mi_col + x_idx, subsize, pc_tree->split[i]); + } + + if (!change_none_to_split) { + // Note: Palette, cfl are not supported. + if (!frame_is_intra_only(cm) && !tile_data->allow_update_cdf && + cpi->sf.rt_sf.partition_direct_merging && + mode_costs->partition_cost[pl][PARTITION_NONE] < + mode_costs->partition_cost[pl][PARTITION_SPLIT] && + (mi_row + bs <= mi_params->mi_rows) && + (mi_col + bs <= mi_params->mi_cols)) { + direct_partition_merging(cpi, td, tile_data, mib, mi_row, mi_col, + bsize); + } + } + } + break; + case PARTITION_VERT_A: + case PARTITION_VERT_B: + case PARTITION_HORZ_A: + case PARTITION_HORZ_B: + case PARTITION_HORZ_4: + case PARTITION_VERT_4: + assert(0 && "Cannot handle extended partition types"); + default: assert(0); break; + } +} + +#if !CONFIG_REALTIME_ONLY +// Try searching for an encoding for the given subblock. Returns zero if the +// rdcost is already too high (to tell the caller not to bother searching for +// encodings of further subblocks). +static int rd_try_subblock(AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, TokenExtra **tp, int is_last, + int mi_row, int mi_col, BLOCK_SIZE subsize, + RD_STATS best_rdcost, RD_STATS *sum_rdc, + PARTITION_TYPE partition, + PICK_MODE_CONTEXT *this_ctx) { + MACROBLOCK *const x = &td->mb; + const int orig_mult = x->rdmult; + setup_block_rdmult(cpi, x, mi_row, mi_col, subsize, NO_AQ, NULL); + + av1_rd_cost_update(x->rdmult, &best_rdcost); + + RD_STATS rdcost_remaining; + av1_rd_stats_subtraction(x->rdmult, &best_rdcost, sum_rdc, &rdcost_remaining); + RD_STATS this_rdc; + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, partition, + subsize, this_ctx, rdcost_remaining); + + if (this_rdc.rate == INT_MAX) { + sum_rdc->rdcost = INT64_MAX; + } else { + sum_rdc->rate += this_rdc.rate; + sum_rdc->dist += this_rdc.dist; + av1_rd_cost_update(x->rdmult, sum_rdc); + } + + if (sum_rdc->rdcost >= best_rdcost.rdcost) { + x->rdmult = orig_mult; + return 0; + } + + if (!is_last) { + av1_update_state(cpi, td, this_ctx, mi_row, mi_col, subsize, 1); + encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize, NULL); + } + + x->rdmult = orig_mult; + return 1; +} + +// Tests an AB partition, and updates the encoder status, the pick mode +// contexts, the best rdcost, and the best partition. 
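+// For reference, each AB shape codes three sub-blocks (SUB_PARTITIONS_AB):
+//   HORZ_A: +---+---+    HORZ_B: +-------+
+//           | 0 | 1 |            |   0   |
+//           +---+---+            +---+---+
+//           |   2   |            | 1 | 2 |
+//           +-------+            +---+---+
+// VERT_A and VERT_B are the 90-degree counterparts, splitting the left or
+// the right half into two squares.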
+static bool rd_test_partition3(AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, TokenExtra **tp, + PC_TREE *pc_tree, RD_STATS *best_rdc, + int64_t *this_rdcost, + PICK_MODE_CONTEXT *ctxs[SUB_PARTITIONS_AB], + int mi_row, int mi_col, BLOCK_SIZE bsize, + PARTITION_TYPE partition, + const BLOCK_SIZE ab_subsize[SUB_PARTITIONS_AB], + const int ab_mi_pos[SUB_PARTITIONS_AB][2], + const MB_MODE_INFO **mode_cache) { + MACROBLOCK *const x = &td->mb; + const MACROBLOCKD *const xd = &x->e_mbd; + const int pl = partition_plane_context(xd, mi_row, mi_col, bsize); + RD_STATS sum_rdc; + av1_init_rd_stats(&sum_rdc); + sum_rdc.rate = x->mode_costs.partition_cost[pl][partition]; + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0); + // Loop over sub-partitions in AB partition type. + for (int i = 0; i < SUB_PARTITIONS_AB; i++) { + if (mode_cache && mode_cache[i]) { + x->use_mb_mode_cache = 1; + x->mb_mode_cache = mode_cache[i]; + } + const int mode_search_success = + rd_try_subblock(cpi, td, tile_data, tp, i == SUB_PARTITIONS_AB - 1, + ab_mi_pos[i][0], ab_mi_pos[i][1], ab_subsize[i], + *best_rdc, &sum_rdc, partition, ctxs[i]); + x->use_mb_mode_cache = 0; + x->mb_mode_cache = NULL; + if (!mode_search_success) { + return false; + } + } + + av1_rd_cost_update(x->rdmult, &sum_rdc); + *this_rdcost = sum_rdc.rdcost; + if (sum_rdc.rdcost >= best_rdc->rdcost) return false; + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); + *this_rdcost = sum_rdc.rdcost; + if (sum_rdc.rdcost >= best_rdc->rdcost) return false; + + *best_rdc = sum_rdc; + pc_tree->partitioning = partition; + return true; +} + +#if CONFIG_COLLECT_PARTITION_STATS +static void init_partition_block_timing_stats( + PartitionTimingStats *part_timing_stats) { + av1_zero(*part_timing_stats); +} + +static INLINE void start_partition_block_timer( + PartitionTimingStats *part_timing_stats, PARTITION_TYPE partition_type) { + assert(!part_timing_stats->timer_is_on); + part_timing_stats->partition_attempts[partition_type] += 1; + aom_usec_timer_start(&part_timing_stats->timer); + part_timing_stats->timer_is_on = 1; +} + +static INLINE void end_partition_block_timer( + PartitionTimingStats *part_timing_stats, PARTITION_TYPE partition_type, + int64_t rdcost) { + if (part_timing_stats->timer_is_on) { + aom_usec_timer_mark(&part_timing_stats->timer); + const int64_t time = aom_usec_timer_elapsed(&part_timing_stats->timer); + part_timing_stats->partition_times[partition_type] += time; + part_timing_stats->partition_rdcost[partition_type] = rdcost; + part_timing_stats->timer_is_on = 0; + } +} +static INLINE void print_partition_timing_stats_with_rdcost( + const PartitionTimingStats *part_timing_stats, int mi_row, int mi_col, + BLOCK_SIZE bsize, FRAME_UPDATE_TYPE frame_update_type, int frame_number, + const RD_STATS *best_rdc, const char *filename) { + FILE *f = fopen(filename, "a"); + fprintf(f, "%d,%d,%d,%d,%d,%d,%" PRId64 ",%" PRId64 ",", bsize, frame_number, + frame_update_type, mi_row, mi_col, best_rdc->rate, best_rdc->dist, + best_rdc->rdcost); + for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) { + fprintf(f, "%d,", part_timing_stats->partition_decisions[idx]); + } + for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) { + fprintf(f, "%d,", part_timing_stats->partition_attempts[idx]); + } + for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) { + fprintf(f, "%" PRId64 ",", part_timing_stats->partition_times[idx]); + } + for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) { + if (part_timing_stats->partition_rdcost[idx] == INT64_MAX) 
{ + fprintf(f, "%d,", -1); + } else { + fprintf(f, "%" PRId64 ",", part_timing_stats->partition_rdcost[idx]); + } + } + fprintf(f, "\n"); + fclose(f); +} + +static INLINE void print_partition_timing_stats( + const PartitionTimingStats *part_timing_stats, int intra_only, + int show_frame, const BLOCK_SIZE bsize, const char *filename) { + FILE *f = fopen(filename, "a"); + fprintf(f, "%d,%d,%d,", bsize, show_frame, intra_only); + for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) { + fprintf(f, "%d,", part_timing_stats->partition_decisions[idx]); + } + for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) { + fprintf(f, "%d,", part_timing_stats->partition_attempts[idx]); + } + for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) { + fprintf(f, "%" PRId64 ",", part_timing_stats->partition_times[idx]); + } + fprintf(f, "\n"); + fclose(f); +} + +static INLINE void accumulate_partition_timing_stats( + FramePartitionTimingStats *fr_part_timing_stats, + const PartitionTimingStats *part_timing_stats, BLOCK_SIZE bsize) { + const int bsize_idx = av1_get_bsize_idx_for_part_stats(bsize); + int *agg_attempts = fr_part_timing_stats->partition_attempts[bsize_idx]; + int *agg_decisions = fr_part_timing_stats->partition_decisions[bsize_idx]; + int64_t *agg_times = fr_part_timing_stats->partition_times[bsize_idx]; + for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) { + agg_attempts[idx] += part_timing_stats->partition_attempts[idx]; + agg_decisions[idx] += part_timing_stats->partition_decisions[idx]; + agg_times[idx] += part_timing_stats->partition_times[idx]; + } +} +#endif // CONFIG_COLLECT_PARTITION_STATS + +// Initialize state variables of partition search used in +// av1_rd_pick_partition(). +static void init_partition_search_state_params( + MACROBLOCK *x, AV1_COMP *const cpi, PartitionSearchState *part_search_state, + int mi_row, int mi_col, BLOCK_SIZE bsize) { + MACROBLOCKD *const xd = &x->e_mbd; + const AV1_COMMON *const cm = &cpi->common; + PartitionBlkParams *blk_params = &part_search_state->part_blk_params; + const CommonModeInfoParams *const mi_params = &cpi->common.mi_params; + + // Initialization of block size related parameters. + blk_params->mi_step = mi_size_wide[bsize] / 2; + blk_params->mi_row = mi_row; + blk_params->mi_col = mi_col; + blk_params->mi_row_edge = mi_row + blk_params->mi_step; + blk_params->mi_col_edge = mi_col + blk_params->mi_step; + blk_params->width = block_size_wide[bsize]; + blk_params->min_partition_size_1d = + block_size_wide[x->sb_enc.min_partition_size]; + blk_params->subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + blk_params->split_bsize2 = blk_params->subsize; + blk_params->bsize_at_least_8x8 = (bsize >= BLOCK_8X8); + blk_params->bsize = bsize; + + // Check if the partition corresponds to edge block. + blk_params->has_rows = (blk_params->mi_row_edge < mi_params->mi_rows); + blk_params->has_cols = (blk_params->mi_col_edge < mi_params->mi_cols); + + // Update intra partitioning related info. + part_search_state->intra_part_info = &x->part_search_info; + // Prepare for segmentation CNN-based partitioning for intra-frame. + if (frame_is_intra_only(cm) && bsize == BLOCK_64X64) { + part_search_state->intra_part_info->quad_tree_idx = 0; + part_search_state->intra_part_info->cnn_output_valid = 0; + } + + // Set partition plane context index. + part_search_state->pl_ctx_idx = + blk_params->bsize_at_least_8x8 + ? 
partition_plane_context(xd, mi_row, mi_col, bsize) + : 0; + + // Partition cost buffer update + ModeCosts *mode_costs = &x->mode_costs; + part_search_state->partition_cost = + mode_costs->partition_cost[part_search_state->pl_ctx_idx]; + + // Initialize HORZ and VERT win flags as true for all split partitions. + for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) { + part_search_state->split_part_rect_win[i].rect_part_win[HORZ] = true; + part_search_state->split_part_rect_win[i].rect_part_win[VERT] = true; + } + + // Initialize the rd cost. + av1_init_rd_stats(&part_search_state->this_rdc); + + // Initialize RD costs for partition types to 0. + part_search_state->none_rd = 0; + av1_zero(part_search_state->split_rd); + av1_zero(part_search_state->rect_part_rd); + + // Initialize SPLIT partition to be not ready. + av1_zero(part_search_state->is_split_ctx_is_ready); + // Initialize HORZ and VERT partitions to be not ready. + av1_zero(part_search_state->is_rect_ctx_is_ready); + + // Chroma subsampling. + part_search_state->ss_x = x->e_mbd.plane[1].subsampling_x; + part_search_state->ss_y = x->e_mbd.plane[1].subsampling_y; + + // Initialize partition search flags to defaults. + part_search_state->terminate_partition_search = 0; + part_search_state->do_square_split = blk_params->bsize_at_least_8x8; + part_search_state->do_rectangular_split = + cpi->oxcf.part_cfg.enable_rect_partitions && + blk_params->bsize_at_least_8x8; + av1_zero(part_search_state->prune_rect_part); + + // Initialize allowed partition types for the partition block. + part_search_state->partition_none_allowed = + av1_blk_has_rows_and_cols(blk_params); + part_search_state->partition_rect_allowed[HORZ] = + part_search_state->do_rectangular_split && blk_params->has_cols && + get_plane_block_size(get_partition_subsize(bsize, PARTITION_HORZ), + part_search_state->ss_x, + part_search_state->ss_y) != BLOCK_INVALID; + part_search_state->partition_rect_allowed[VERT] = + part_search_state->do_rectangular_split && blk_params->has_rows && + get_plane_block_size(get_partition_subsize(bsize, PARTITION_VERT), + part_search_state->ss_x, + part_search_state->ss_y) != BLOCK_INVALID; + + // Reset the flag indicating whether a partition leading to a rdcost lower + // than the bound best_rdc has been found. + part_search_state->found_best_partition = false; + +#if CONFIG_COLLECT_PARTITION_STATS + init_partition_block_timing_stats(&part_search_state->part_timing_stats); +#endif // CONFIG_COLLECT_PARTITION_STATS +} + +// Override partition cost buffer for the edge blocks. +static void set_partition_cost_for_edge_blk( + AV1_COMMON const *cm, PartitionSearchState *part_search_state) { + PartitionBlkParams blk_params = part_search_state->part_blk_params; + assert(blk_params.bsize_at_least_8x8 && part_search_state->pl_ctx_idx >= 0); + const aom_cdf_prob *partition_cdf = + cm->fc->partition_cdf[part_search_state->pl_ctx_idx]; + const int max_cost = av1_cost_symbol(0); + for (PARTITION_TYPE i = 0; i < PARTITION_TYPES; ++i) + part_search_state->tmp_partition_cost[i] = max_cost; + if (blk_params.has_cols) { + // At the bottom, the two possibilities are HORZ and SPLIT. + aom_cdf_prob bot_cdf[2]; + partition_gather_vert_alike(bot_cdf, partition_cdf, blk_params.bsize); + static const int bot_inv_map[2] = { PARTITION_HORZ, PARTITION_SPLIT }; + av1_cost_tokens_from_cdf(part_search_state->tmp_partition_cost, bot_cdf, + bot_inv_map); + } else if (blk_params.has_rows) { + // At the right, the two possibilities are VERT and SPLIT. 
+ aom_cdf_prob rhs_cdf[2]; + partition_gather_horz_alike(rhs_cdf, partition_cdf, blk_params.bsize); + static const int rhs_inv_map[2] = { PARTITION_VERT, PARTITION_SPLIT }; + av1_cost_tokens_from_cdf(part_search_state->tmp_partition_cost, rhs_cdf, + rhs_inv_map); + } else { + // At the bottom right, we always split. + part_search_state->tmp_partition_cost[PARTITION_SPLIT] = 0; + } + // Override the partition cost buffer. + part_search_state->partition_cost = part_search_state->tmp_partition_cost; +} + +// Reset the partition search state flags when +// must_find_valid_partition is equal to 1. +static AOM_INLINE void reset_part_limitations( + AV1_COMP *const cpi, PartitionSearchState *part_search_state) { + PartitionBlkParams blk_params = part_search_state->part_blk_params; + const int is_rect_part_allowed = + blk_params.bsize_at_least_8x8 && + cpi->oxcf.part_cfg.enable_rect_partitions && + (blk_params.width > blk_params.min_partition_size_1d); + part_search_state->do_square_split = + blk_params.bsize_at_least_8x8 && + (blk_params.width > blk_params.min_partition_size_1d); + part_search_state->partition_none_allowed = + av1_blk_has_rows_and_cols(&blk_params) && + (blk_params.width >= blk_params.min_partition_size_1d); + part_search_state->partition_rect_allowed[HORZ] = + blk_params.has_cols && is_rect_part_allowed && + get_plane_block_size( + get_partition_subsize(blk_params.bsize, PARTITION_HORZ), + part_search_state->ss_x, part_search_state->ss_y) != BLOCK_INVALID; + part_search_state->partition_rect_allowed[VERT] = + blk_params.has_rows && is_rect_part_allowed && + get_plane_block_size( + get_partition_subsize(blk_params.bsize, PARTITION_VERT), + part_search_state->ss_x, part_search_state->ss_y) != BLOCK_INVALID; + part_search_state->terminate_partition_search = 0; +} + +// Rectangular partitions evaluation at sub-block level. +static void rd_pick_rect_partition(AV1_COMP *const cpi, TileDataEnc *tile_data, + MACROBLOCK *x, + PICK_MODE_CONTEXT *cur_partition_ctx, + PartitionSearchState *part_search_state, + RD_STATS *best_rdc, const int idx, + int mi_row, int mi_col, BLOCK_SIZE bsize, + PARTITION_TYPE partition_type) { + // Obtain the remainder from the best rd cost + // for further processing of partition. + RD_STATS best_remain_rdcost; + av1_rd_stats_subtraction(x->rdmult, best_rdc, &part_search_state->sum_rdc, + &best_remain_rdcost); + + // Obtain the best mode for the partition sub-block. + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &part_search_state->this_rdc, + partition_type, bsize, cur_partition_ctx, best_remain_rdcost); + av1_rd_cost_update(x->rdmult, &part_search_state->this_rdc); + + // Update the partition rd cost with the current sub-block rd. + if (part_search_state->this_rdc.rate == INT_MAX) { + part_search_state->sum_rdc.rdcost = INT64_MAX; + } else { + part_search_state->sum_rdc.rate += part_search_state->this_rdc.rate; + part_search_state->sum_rdc.dist += part_search_state->this_rdc.dist; + av1_rd_cost_update(x->rdmult, &part_search_state->sum_rdc); + } + const RECT_PART_TYPE rect_part = + partition_type == PARTITION_HORZ ? HORZ : VERT; + part_search_state->rect_part_rd[rect_part][idx] = + part_search_state->this_rdc.rdcost; +} + +typedef int (*active_edge_info)(const AV1_COMP *cpi, int mi_col, int mi_step); + +// Checks if HORZ / VERT partition search is allowed. 
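+// A rectangular candidate is evaluated only when the search has not been
+// terminated, the shape is both allowed and not pruned, and either
+// rectangular splits are generally enabled or the block straddles an active
+// frame edge in that direction (the active_edge predicate below).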
+static AOM_INLINE int is_rect_part_allowed(
+    const AV1_COMP *cpi, const PartitionSearchState *part_search_state,
+    const active_edge_info *active_edge, RECT_PART_TYPE rect_part,
+    const int mi_pos) {
+  const PartitionBlkParams *blk_params = &part_search_state->part_blk_params;
+  const int is_part_allowed =
+      (!part_search_state->terminate_partition_search &&
+       part_search_state->partition_rect_allowed[rect_part] &&
+       !part_search_state->prune_rect_part[rect_part] &&
+       (part_search_state->do_rectangular_split ||
+        active_edge[rect_part](cpi, mi_pos, blk_params->mi_step)));
+  return is_part_allowed;
+}
+
+static void rectangular_partition_search(
+    AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+    TokenExtra **tp, MACROBLOCK *x, PC_TREE *pc_tree,
+    RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx,
+    PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+    RD_RECT_PART_WIN_INFO *rect_part_win_info, const RECT_PART_TYPE start_type,
+    const RECT_PART_TYPE end_type) {
+  const AV1_COMMON *const cm = &cpi->common;
+  PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  RD_STATS *sum_rdc = &part_search_state->sum_rdc;
+  const int rect_partition_type[NUM_RECT_PARTS] = { PARTITION_HORZ,
+                                                    PARTITION_VERT };
+
+  // mi_pos_rect[NUM_RECT_PARTS][SUB_PARTITIONS_RECT][0]: mi_row position of
+  // HORZ and VERT partition types.
+  // mi_pos_rect[NUM_RECT_PARTS][SUB_PARTITIONS_RECT][1]: mi_col position of
+  // HORZ and VERT partition types.
+  const int mi_pos_rect[NUM_RECT_PARTS][SUB_PARTITIONS_RECT][2] = {
+    { { blk_params.mi_row, blk_params.mi_col },
+      { blk_params.mi_row_edge, blk_params.mi_col } },
+    { { blk_params.mi_row, blk_params.mi_col },
+      { blk_params.mi_row, blk_params.mi_col_edge } }
+  };
+
+  // Initialize active edge_type function pointer
+  // for HORZ and VERT partition types.
+  active_edge_info active_edge_type[NUM_RECT_PARTS] = { av1_active_h_edge,
+                                                        av1_active_v_edge };
+
+  // Nonzero when the second HORZ / VERT sub-block lies inside the frame,
+  // i.e. the current block is not an edge block.
+  const int is_not_edge_block[NUM_RECT_PARTS] = { blk_params.has_rows,
+                                                  blk_params.has_cols };
+
+  // Initialize pc tree context for HORZ and VERT partition types.
+  PICK_MODE_CONTEXT **cur_ctx[NUM_RECT_PARTS][SUB_PARTITIONS_RECT] = {
+    { &pc_tree->horizontal[0], &pc_tree->horizontal[1] },
+    { &pc_tree->vertical[0], &pc_tree->vertical[1] }
+  };
+
+  // Loop over rectangular partition types.
+  for (RECT_PART_TYPE i = start_type; i <= end_type; i++) {
+    assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions,
+                   !part_search_state->partition_rect_allowed[i]));
+
+    // Check if the HORZ / VERT partition search is to be performed.
+    if (!is_rect_part_allowed(cpi, part_search_state, active_edge_type, i,
+                              mi_pos_rect[i][0][i]))
+      continue;
+
+    // Sub-partition idx.
+ int sub_part_idx = 0; + PARTITION_TYPE partition_type = rect_partition_type[i]; + blk_params.subsize = + get_partition_subsize(blk_params.bsize, partition_type); + assert(blk_params.subsize <= BLOCK_LARGEST); + av1_init_rd_stats(sum_rdc); + for (int j = 0; j < SUB_PARTITIONS_RECT; j++) { + if (cur_ctx[i][j][0] == NULL) { + cur_ctx[i][j][0] = + av1_alloc_pmc(cpi, blk_params.subsize, &td->shared_coeff_buf); + if (!cur_ctx[i][j][0]) + aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PICK_MODE_CONTEXT"); + } + } + sum_rdc->rate = part_search_state->partition_cost[partition_type]; + sum_rdc->rdcost = RDCOST(x->rdmult, sum_rdc->rate, 0); +#if CONFIG_COLLECT_PARTITION_STATS + PartitionTimingStats *part_timing_stats = + &part_search_state->part_timing_stats; + if (best_rdc->rdcost - sum_rdc->rdcost >= 0) { + start_partition_block_timer(part_timing_stats, partition_type); + } +#endif + + // First sub-partition evaluation in HORZ / VERT partition type. + rd_pick_rect_partition( + cpi, tile_data, x, cur_ctx[i][sub_part_idx][0], part_search_state, + best_rdc, 0, mi_pos_rect[i][sub_part_idx][0], + mi_pos_rect[i][sub_part_idx][1], blk_params.subsize, partition_type); + + // Start of second sub-partition evaluation. + // Evaluate second sub-partition if the first sub-partition cost + // is less than the best cost and if it is not an edge block. + if (sum_rdc->rdcost < best_rdc->rdcost && is_not_edge_block[i]) { + const MB_MODE_INFO *const mbmi = &cur_ctx[i][sub_part_idx][0]->mic; + const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + // Neither palette mode nor cfl predicted. + if (pmi->palette_size[PLANE_TYPE_Y] == 0 && + pmi->palette_size[PLANE_TYPE_UV] == 0) { + if (mbmi->uv_mode != UV_CFL_PRED) + part_search_state->is_rect_ctx_is_ready[i] = 1; + } + av1_update_state(cpi, td, cur_ctx[i][sub_part_idx][0], blk_params.mi_row, + blk_params.mi_col, blk_params.subsize, DRY_RUN_NORMAL); + encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, + blk_params.subsize, NULL); + + // Second sub-partition evaluation in HORZ / VERT partition type. + sub_part_idx = 1; + rd_pick_rect_partition( + cpi, tile_data, x, cur_ctx[i][sub_part_idx][0], part_search_state, + best_rdc, 1, mi_pos_rect[i][sub_part_idx][0], + mi_pos_rect[i][sub_part_idx][1], blk_params.subsize, partition_type); + } + // Update HORZ / VERT best partition. + if (sum_rdc->rdcost < best_rdc->rdcost) { + sum_rdc->rdcost = RDCOST(x->rdmult, sum_rdc->rate, sum_rdc->dist); + if (sum_rdc->rdcost < best_rdc->rdcost) { + *best_rdc = *sum_rdc; + part_search_state->found_best_partition = true; + pc_tree->partitioning = partition_type; + } + } else { + // Update HORZ / VERT win flag. + if (rect_part_win_info != NULL) + rect_part_win_info->rect_part_win[i] = false; + } +#if CONFIG_COLLECT_PARTITION_STATS + if (part_timing_stats->timer_is_on) { + end_partition_block_timer(part_timing_stats, partition_type, + sum_rdc->rdcost); + } +#endif + av1_restore_context(x, x_ctx, blk_params.mi_row, blk_params.mi_col, + blk_params.bsize, av1_num_planes(cm)); + } +} + +// AB partition type evaluation. 
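+// (Editor's aside, not upstream code: each AB shape splits a W x H parent
+// into three sub-blocks, two quarters plus one half:
+//   HORZ_A: two W/2 x H/2 blocks on top,      one W x H/2 block below
+//   HORZ_B: one W x H/2 block on top,         two W/2 x H/2 blocks below
+//   VERT_A: two W/2 x H/2 blocks on the left, one W/2 x H block on the right
+//   VERT_B: one W/2 x H block on the left,    two W/2 x H/2 blocks on the
+//           right
+// The ab_subsize[] and ab_mi_pos[] tables in ab_partitions_search() encode
+// exactly these shapes; rd_pick_ab_part() below evaluates one of them.)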
+static void rd_pick_ab_part( + AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, + TokenExtra **tp, MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx, + PC_TREE *pc_tree, PICK_MODE_CONTEXT *dst_ctxs[SUB_PARTITIONS_AB], + PartitionSearchState *part_search_state, RD_STATS *best_rdc, + const BLOCK_SIZE ab_subsize[SUB_PARTITIONS_AB], + const int ab_mi_pos[SUB_PARTITIONS_AB][2], const PARTITION_TYPE part_type, + const MB_MODE_INFO **mode_cache) { + const AV1_COMMON *const cm = &cpi->common; + PartitionBlkParams blk_params = part_search_state->part_blk_params; + const int mi_row = blk_params.mi_row; + const int mi_col = blk_params.mi_col; + const BLOCK_SIZE bsize = blk_params.bsize; + int64_t this_rdcost = 0; + +#if CONFIG_COLLECT_PARTITION_STATS + PartitionTimingStats *part_timing_stats = + &part_search_state->part_timing_stats; + { + RD_STATS tmp_sum_rdc; + av1_init_rd_stats(&tmp_sum_rdc); + tmp_sum_rdc.rate = part_search_state->partition_cost[part_type]; + tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0); + if (best_rdc->rdcost - tmp_sum_rdc.rdcost >= 0) { + start_partition_block_timer(part_timing_stats, part_type); + } + } +#endif + + // Test this partition and update the best partition. + const bool find_best_ab_part = rd_test_partition3( + cpi, td, tile_data, tp, pc_tree, best_rdc, &this_rdcost, dst_ctxs, mi_row, + mi_col, bsize, part_type, ab_subsize, ab_mi_pos, mode_cache); + part_search_state->found_best_partition |= find_best_ab_part; + +#if CONFIG_COLLECT_PARTITION_STATS + if (part_timing_stats->timer_is_on) { + if (!find_best_ab_part) this_rdcost = INT64_MAX; + end_partition_block_timer(part_timing_stats, part_type, this_rdcost); + } +#endif + av1_restore_context(x, x_ctx, mi_row, mi_col, bsize, av1_num_planes(cm)); +} + +// Set mode search context. 
+static AOM_INLINE void set_mode_search_ctx( + PC_TREE *pc_tree, const int is_ctx_ready[NUM_AB_PARTS][2], + PICK_MODE_CONTEXT **mode_srch_ctx[NUM_AB_PARTS][2]) { + mode_srch_ctx[HORZ_B][0] = &pc_tree->horizontal[0]; + mode_srch_ctx[VERT_B][0] = &pc_tree->vertical[0]; + + if (is_ctx_ready[HORZ_A][0]) + mode_srch_ctx[HORZ_A][0] = &pc_tree->split[0]->none; + + if (is_ctx_ready[VERT_A][0]) + mode_srch_ctx[VERT_A][0] = &pc_tree->split[0]->none; + + if (is_ctx_ready[HORZ_A][1]) + mode_srch_ctx[HORZ_A][1] = &pc_tree->split[1]->none; +} + +static AOM_INLINE void copy_partition_mode_from_mode_context( + const MB_MODE_INFO **dst_mode, const PICK_MODE_CONTEXT *ctx) { + if (ctx && ctx->rd_stats.rate < INT_MAX) { + *dst_mode = &ctx->mic; + } else { + *dst_mode = NULL; + } +} + +static AOM_INLINE void copy_partition_mode_from_pc_tree( + const MB_MODE_INFO **dst_mode, const PC_TREE *pc_tree) { + if (pc_tree) { + copy_partition_mode_from_mode_context(dst_mode, pc_tree->none); + } else { + *dst_mode = NULL; + } +} + +static AOM_INLINE void set_mode_cache_for_partition_ab( + const MB_MODE_INFO **mode_cache, const PC_TREE *pc_tree, + AB_PART_TYPE ab_part_type) { + switch (ab_part_type) { + case HORZ_A: + copy_partition_mode_from_pc_tree(&mode_cache[0], pc_tree->split[0]); + copy_partition_mode_from_pc_tree(&mode_cache[1], pc_tree->split[1]); + copy_partition_mode_from_mode_context(&mode_cache[2], + pc_tree->horizontal[1]); + break; + case HORZ_B: + copy_partition_mode_from_mode_context(&mode_cache[0], + pc_tree->horizontal[0]); + copy_partition_mode_from_pc_tree(&mode_cache[1], pc_tree->split[2]); + copy_partition_mode_from_pc_tree(&mode_cache[2], pc_tree->split[3]); + break; + case VERT_A: + copy_partition_mode_from_pc_tree(&mode_cache[0], pc_tree->split[0]); + copy_partition_mode_from_pc_tree(&mode_cache[1], pc_tree->split[2]); + copy_partition_mode_from_mode_context(&mode_cache[2], + pc_tree->vertical[1]); + break; + case VERT_B: + copy_partition_mode_from_mode_context(&mode_cache[0], + pc_tree->vertical[0]); + copy_partition_mode_from_pc_tree(&mode_cache[1], pc_tree->split[1]); + copy_partition_mode_from_pc_tree(&mode_cache[2], pc_tree->split[3]); + break; + default: assert(0 && "Invalid ab partition type!\n"); + } +} + +// AB Partitions type search. +static void ab_partitions_search( + AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, + TokenExtra **tp, MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx, + PC_TREE *pc_tree, PartitionSearchState *part_search_state, + RD_STATS *best_rdc, RD_RECT_PART_WIN_INFO *rect_part_win_info, + int pb_source_variance, int ext_partition_allowed, + const AB_PART_TYPE start_type, const AB_PART_TYPE end_type) { + PartitionBlkParams blk_params = part_search_state->part_blk_params; + const int mi_row = blk_params.mi_row; + const int mi_col = blk_params.mi_col; + const BLOCK_SIZE bsize = blk_params.bsize; + + if (part_search_state->terminate_partition_search) { + return; + } + + int ab_partitions_allowed[NUM_AB_PARTS]; + // Prune AB partitions + av1_prune_ab_partitions(cpi, x, pc_tree, pb_source_variance, best_rdc->rdcost, + rect_part_win_info, ext_partition_allowed, + part_search_state, ab_partitions_allowed); + + // Flags to indicate whether the mode search is done. 
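+  // (Editor's note: the reuse below works because some AB sub-blocks
+  // coincide with blocks searched earlier. E.g. the two top quarters of
+  // HORZ_A are the first two PARTITION_SPLIT sub-blocks, and the first
+  // sub-block of HORZ_B is pc_tree->horizontal[0]; when those earlier
+  // searches ended without palette or CFL prediction, their mode decisions
+  // can be copied instead of re-searched.)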
+  const int is_ctx_ready[NUM_AB_PARTS][2] = {
+    { part_search_state->is_split_ctx_is_ready[0],
+      part_search_state->is_split_ctx_is_ready[1] },
+    { part_search_state->is_rect_ctx_is_ready[HORZ], 0 },
+    { part_search_state->is_split_ctx_is_ready[0], 0 },
+    { part_search_state->is_rect_ctx_is_ready[VERT], 0 }
+  };
+
+  // Current partition context.
+  PICK_MODE_CONTEXT **cur_part_ctxs[NUM_AB_PARTS] = { pc_tree->horizontala,
+                                                      pc_tree->horizontalb,
+                                                      pc_tree->verticala,
+                                                      pc_tree->verticalb };
+
+  // Context of already evaluated partition types.
+  PICK_MODE_CONTEXT **mode_srch_ctx[NUM_AB_PARTS][2];
+  // Set context of already evaluated partition types.
+  set_mode_search_ctx(pc_tree, is_ctx_ready, mode_srch_ctx);
+
+  // Array of sub-partition sizes for AB partition types.
+  const BLOCK_SIZE ab_subsize[NUM_AB_PARTS][SUB_PARTITIONS_AB] = {
+    { blk_params.split_bsize2, blk_params.split_bsize2,
+      get_partition_subsize(bsize, PARTITION_HORZ_A) },
+    { get_partition_subsize(bsize, PARTITION_HORZ_B), blk_params.split_bsize2,
+      blk_params.split_bsize2 },
+    { blk_params.split_bsize2, blk_params.split_bsize2,
+      get_partition_subsize(bsize, PARTITION_VERT_A) },
+    { get_partition_subsize(bsize, PARTITION_VERT_B), blk_params.split_bsize2,
+      blk_params.split_bsize2 }
+  };
+
+  // Array of mi_row, mi_col positions corresponding to each sub-partition in
+  // AB partition types.
+  const int ab_mi_pos[NUM_AB_PARTS][SUB_PARTITIONS_AB][2] = {
+    { { mi_row, mi_col },
+      { mi_row, blk_params.mi_col_edge },
+      { blk_params.mi_row_edge, mi_col } },
+    { { mi_row, mi_col },
+      { blk_params.mi_row_edge, mi_col },
+      { blk_params.mi_row_edge, blk_params.mi_col_edge } },
+    { { mi_row, mi_col },
+      { blk_params.mi_row_edge, mi_col },
+      { mi_row, blk_params.mi_col_edge } },
+    { { mi_row, mi_col },
+      { mi_row, blk_params.mi_col_edge },
+      { blk_params.mi_row_edge, blk_params.mi_col_edge } }
+  };
+
+  // Loop over AB partition types.
+  for (AB_PART_TYPE ab_part_type = start_type; ab_part_type <= end_type;
+       ab_part_type++) {
+    const PARTITION_TYPE part_type = ab_part_type + PARTITION_HORZ_A;
+
+    // Check if the AB partition search is to be performed.
+    if (!ab_partitions_allowed[ab_part_type]) {
+      continue;
+    }
+
+    blk_params.subsize = get_partition_subsize(bsize, part_type);
+    for (int i = 0; i < SUB_PARTITIONS_AB; i++) {
+      // Set AB partition context.
+      cur_part_ctxs[ab_part_type][i] = av1_alloc_pmc(
+          cpi, ab_subsize[ab_part_type][i], &td->shared_coeff_buf);
+      if (!cur_part_ctxs[ab_part_type][i])
+        aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
+                           "Failed to allocate PICK_MODE_CONTEXT");
+      // Set mode as not ready.
+      cur_part_ctxs[ab_part_type][i]->rd_mode_is_ready = 0;
+    }
+
+    if (cpi->sf.part_sf.reuse_prev_rd_results_for_part_ab) {
+      // We can directly copy the mode search results if we have already
+      // searched the current block and the contexts match.
+      if (is_ctx_ready[ab_part_type][0]) {
+        av1_copy_tree_context(cur_part_ctxs[ab_part_type][0],
+                              mode_srch_ctx[ab_part_type][0][0]);
+        cur_part_ctxs[ab_part_type][0]->mic.partition = part_type;
+        cur_part_ctxs[ab_part_type][0]->rd_mode_is_ready = 1;
+        if (is_ctx_ready[ab_part_type][1]) {
+          av1_copy_tree_context(cur_part_ctxs[ab_part_type][1],
+                                mode_srch_ctx[ab_part_type][1][0]);
+          cur_part_ctxs[ab_part_type][1]->mic.partition = part_type;
+          cur_part_ctxs[ab_part_type][1]->rd_mode_is_ready = 1;
+        }
+      }
+    }
+
+    // Even if the contexts don't match, we can still speed up by reusing the
+    // previous prediction mode.
+ const MB_MODE_INFO *mode_cache[3] = { NULL, NULL, NULL }; + if (cpi->sf.part_sf.reuse_best_prediction_for_part_ab) { + set_mode_cache_for_partition_ab(mode_cache, pc_tree, ab_part_type); + } + + // Evaluation of AB partition type. + rd_pick_ab_part(cpi, td, tile_data, tp, x, x_ctx, pc_tree, + cur_part_ctxs[ab_part_type], part_search_state, best_rdc, + ab_subsize[ab_part_type], ab_mi_pos[ab_part_type], + part_type, mode_cache); + } +} + +// Set mi positions for HORZ4 / VERT4 sub-block partitions. +static void set_mi_pos_partition4(const int inc_step[NUM_PART4_TYPES], + int mi_pos[SUB_PARTITIONS_PART4][2], + const int mi_row, const int mi_col) { + for (PART4_TYPES i = 0; i < SUB_PARTITIONS_PART4; i++) { + mi_pos[i][0] = mi_row + i * inc_step[HORZ4]; + mi_pos[i][1] = mi_col + i * inc_step[VERT4]; + } +} + +// Set context and RD cost for HORZ4 / VERT4 partition types. +static void set_4_part_ctx_and_rdcost( + MACROBLOCK *x, const AV1_COMP *const cpi, ThreadData *td, + PICK_MODE_CONTEXT *cur_part_ctx[SUB_PARTITIONS_PART4], + PartitionSearchState *part_search_state, PARTITION_TYPE partition_type, + BLOCK_SIZE bsize) { + // Initialize sum_rdc RD cost structure. + av1_init_rd_stats(&part_search_state->sum_rdc); + const int subsize = get_partition_subsize(bsize, partition_type); + part_search_state->sum_rdc.rate = + part_search_state->partition_cost[partition_type]; + part_search_state->sum_rdc.rdcost = + RDCOST(x->rdmult, part_search_state->sum_rdc.rate, 0); + for (PART4_TYPES i = 0; i < SUB_PARTITIONS_PART4; ++i) { + cur_part_ctx[i] = av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf); + if (!cur_part_ctx[i]) + aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PICK_MODE_CONTEXT"); + } +} + +// Partition search of HORZ4 / VERT4 partition types. +static void rd_pick_4partition( + AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, + TokenExtra **tp, MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx, + PC_TREE *pc_tree, PICK_MODE_CONTEXT *cur_part_ctx[SUB_PARTITIONS_PART4], + PartitionSearchState *part_search_state, RD_STATS *best_rdc, + const int inc_step[NUM_PART4_TYPES], PARTITION_TYPE partition_type) { + const AV1_COMMON *const cm = &cpi->common; + PartitionBlkParams blk_params = part_search_state->part_blk_params; + // mi positions needed for HORZ4 and VERT4 partition types. + int mi_pos_check[NUM_PART4_TYPES] = { cm->mi_params.mi_rows, + cm->mi_params.mi_cols }; + const PART4_TYPES part4_idx = (partition_type != PARTITION_HORZ_4); + int mi_pos[SUB_PARTITIONS_PART4][2]; + + blk_params.subsize = get_partition_subsize(blk_params.bsize, partition_type); + // Set partition context and RD cost. + set_4_part_ctx_and_rdcost(x, cpi, td, cur_part_ctx, part_search_state, + partition_type, blk_params.bsize); + // Set mi positions for sub-block sizes. + set_mi_pos_partition4(inc_step, mi_pos, blk_params.mi_row, blk_params.mi_col); +#if CONFIG_COLLECT_PARTITION_STATS + PartitionTimingStats *part_timing_stats = + &part_search_state->part_timing_stats; + if (best_rdc->rdcost - part_search_state->sum_rdc.rdcost >= 0) { + start_partition_block_timer(part_timing_stats, partition_type); + } +#endif + // Loop over sub-block partitions. + for (PART4_TYPES i = 0; i < SUB_PARTITIONS_PART4; ++i) { + if (i > 0 && mi_pos[i][part4_idx] >= mi_pos_check[part4_idx]) break; + + // Sub-block evaluation of Horz4 / Vert4 partition type. 
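+    // (Editor's worked example: for PARTITION_HORZ_4 on a 64x64 block,
+    // mi_size_high[BLOCK_64X64] = 16 mi units, so inc_step[HORZ4] = 4 and
+    // set_mi_pos_partition4() places the four 64x16 strips at mi rows
+    // mi_row, mi_row + 4, mi_row + 8, mi_row + 12, while inc_step[VERT4]
+    // stays 0 and mi_col is fixed.)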
+    cur_part_ctx[i]->rd_mode_is_ready = 0;
+    if (!rd_try_subblock(
+            cpi, td, tile_data, tp, (i == SUB_PARTITIONS_PART4 - 1),
+            mi_pos[i][0], mi_pos[i][1], blk_params.subsize, *best_rdc,
+            &part_search_state->sum_rdc, partition_type, cur_part_ctx[i])) {
+      av1_invalid_rd_stats(&part_search_state->sum_rdc);
+      break;
+    }
+  }
+
+  // Calculate the total cost and update the best partition.
+  av1_rd_cost_update(x->rdmult, &part_search_state->sum_rdc);
+  if (part_search_state->sum_rdc.rdcost < best_rdc->rdcost) {
+    *best_rdc = part_search_state->sum_rdc;
+    part_search_state->found_best_partition = true;
+    pc_tree->partitioning = partition_type;
+  }
+#if CONFIG_COLLECT_PARTITION_STATS
+  if (part_timing_stats->timer_is_on) {
+    end_partition_block_timer(part_timing_stats, partition_type,
+                              part_search_state->sum_rdc.rdcost);
+  }
+#endif
+  av1_restore_context(x, x_ctx, blk_params.mi_row, blk_params.mi_col,
+                      blk_params.bsize, av1_num_planes(cm));
+}
+
+// Do not evaluate extended partitions if NONE partition is skippable.
+static INLINE int prune_ext_part_none_skippable(
+    PICK_MODE_CONTEXT *part_none, int must_find_valid_partition,
+    int skip_non_sq_part_based_on_none, BLOCK_SIZE bsize) {
+  if ((skip_non_sq_part_based_on_none >= 1) && (part_none != NULL)) {
+    if (part_none->skippable && !must_find_valid_partition &&
+        bsize >= BLOCK_16X16) {
+      return 1;
+    }
+  }
+  return 0;
+}
+
+// Allow ab partition search
+static int allow_ab_partition_search(PartitionSearchState *part_search_state,
+                                     PARTITION_SPEED_FEATURES *part_sf,
+                                     PARTITION_TYPE curr_best_part,
+                                     int must_find_valid_partition,
+                                     int prune_ext_part_state,
+                                     int64_t best_rdcost) {
+  const PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  const BLOCK_SIZE bsize = blk_params.bsize;
+
+  // Do not prune if there is no valid partition
+  if (best_rdcost == INT64_MAX) return 1;
+
+  // Determine bsize threshold to evaluate ab partitions
+  BLOCK_SIZE ab_bsize_thresh = part_sf->ext_partition_eval_thresh;
+  if (part_sf->ext_part_eval_based_on_cur_best && !must_find_valid_partition &&
+      !(curr_best_part == PARTITION_HORZ || curr_best_part == PARTITION_VERT))
+    ab_bsize_thresh = BLOCK_128X128;
+
+  // ab partitions are only allowed for square block sizes BLOCK_16X16 or
+  // higher, so ab_bsize_thresh must be large enough to exclude BLOCK_4X4 and
+  // BLOCK_8X8.
+  assert(ab_bsize_thresh >= BLOCK_8X8);
+
+  int ab_partition_allowed =
+      part_search_state->do_rectangular_split && bsize > ab_bsize_thresh &&
+      av1_blk_has_rows_and_cols(&blk_params) && !prune_ext_part_state;
+
+  return ab_partition_allowed;
+}
+
+// Prune 4-way partitions based on the number of horz/vert wins
+// in the current block and sub-blocks in PARTITION_SPLIT.
+static void prune_4_partition_using_split_info(
+    AV1_COMP *const cpi, MACROBLOCK *x, PartitionSearchState *part_search_state,
+    int part4_search_allowed[NUM_PART4_TYPES]) {
+  PART4_TYPES cur_part[NUM_PART4_TYPES] = { HORZ4, VERT4 };
+  // Count of child blocks in which HORZ or VERT partition has won
+  int num_child_rect_win[NUM_RECT_PARTS] = { 0, 0 };
+  // Prune HORZ4/VERT4 partitions based on number of HORZ/VERT winners of
+  // split partitions.
+  // Conservative pruning for high quantizers.
+  const int num_win_thresh = AOMMIN(3 * (MAXQ - x->qindex) / MAXQ + 1, 3);
+
+  for (RECT_PART_TYPE i = HORZ; i < NUM_RECT_PARTS; i++) {
+    if (!(cpi->sf.part_sf.prune_ext_part_using_split_info &&
+          part4_search_allowed[cur_part[i]]))
+      continue;
+    // Loop over split partitions.
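+    // (Editor's note on num_win_thresh above: with MAXQ = 255 it evaluates
+    // to 3 near qindex 0, 2 around mid qindex, and 1 at the highest qindex;
+    // at high quantizers a single child win therefore keeps HORZ4 / VERT4
+    // alive, which is the conservative behavior the comment above means.)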
+    // Get rectangular partitions winner info of split partitions.
+    for (int idx = 0; idx < SUB_PARTITIONS_SPLIT; idx++)
+      num_child_rect_win[i] +=
+          (part_search_state->split_part_rect_win[idx].rect_part_win[i]) ? 1
+                                                                         : 0;
+    if (num_child_rect_win[i] < num_win_thresh) {
+      part4_search_allowed[cur_part[i]] = 0;
+    }
+  }
+}
+
+// Prune 4-way partition search.
+static void prune_4_way_partition_search(
+    AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree,
+    PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+    int pb_source_variance, int prune_ext_part_state,
+    int part4_search_allowed[NUM_PART4_TYPES]) {
+  const PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  const BLOCK_SIZE bsize = blk_params.bsize;
+
+  // Do not prune if there is no valid partition
+  if (best_rdc->rdcost == INT64_MAX) return;
+
+  // Determine bsize threshold to evaluate 4-way partitions
+  BLOCK_SIZE part4_bsize_thresh = cpi->sf.part_sf.ext_partition_eval_thresh;
+  if (cpi->sf.part_sf.ext_part_eval_based_on_cur_best &&
+      !x->must_find_valid_partition && pc_tree->partitioning == PARTITION_NONE)
+    part4_bsize_thresh = BLOCK_128X128;
+
+  // 4-way partitions are only allowed for BLOCK_16X16, BLOCK_32X32, and
+  // BLOCK_64X64, so part4_bsize_thresh must be large enough to exclude
+  // BLOCK_4X4 and BLOCK_8X8.
+  assert(part4_bsize_thresh >= BLOCK_8X8);
+
+  bool partition4_allowed =
+      part_search_state->do_rectangular_split && bsize > part4_bsize_thresh &&
+      av1_blk_has_rows_and_cols(&blk_params) && !prune_ext_part_state;
+
+  // Disable 4-way partition search when the block width is below the scaled
+  // minimum partition size.
+  if (blk_params.width < (blk_params.min_partition_size_1d
+                          << cpi->sf.part_sf.prune_part4_search)) {
+    part4_search_allowed[HORZ4] = 0;
+    part4_search_allowed[VERT4] = 0;
+    return;
+  }
+
+  PARTITION_TYPE cur_part[NUM_PART4_TYPES] = { PARTITION_HORZ_4,
+                                               PARTITION_VERT_4 };
+  const PartitionCfg *const part_cfg = &cpi->oxcf.part_cfg;
+  // partition4_allowed is 1 if we can use a PARTITION_HORZ_4 or
+  // PARTITION_VERT_4 for this block. On top of the checks above, 1:4
+  // partitions must be enabled, and 128x32 or 32x128 blocks are not
+  // allowed, so we require that bsize is not BLOCK_128X128.
+  partition4_allowed &=
+      part_cfg->enable_1to4_partitions && bsize != BLOCK_128X128;
+
+  for (PART4_TYPES i = HORZ4; i < NUM_PART4_TYPES; i++) {
+    part4_search_allowed[i] =
+        partition4_allowed && part_search_state->partition_rect_allowed[i] &&
+        get_plane_block_size(get_partition_subsize(bsize, cur_part[i]),
+                             part_search_state->ss_x,
+                             part_search_state->ss_y) != BLOCK_INVALID;
+  }
+  // Pruning: pruning out 4-way partitions based on the current best partition.
+  if (cpi->sf.part_sf.prune_ext_partition_types_search_level == 2) {
+    part4_search_allowed[HORZ4] &= (pc_tree->partitioning == PARTITION_HORZ ||
+                                    pc_tree->partitioning == PARTITION_HORZ_A ||
+                                    pc_tree->partitioning == PARTITION_HORZ_B ||
+                                    pc_tree->partitioning == PARTITION_SPLIT ||
+                                    pc_tree->partitioning == PARTITION_NONE);
+    part4_search_allowed[VERT4] &= (pc_tree->partitioning == PARTITION_VERT ||
+                                    pc_tree->partitioning == PARTITION_VERT_A ||
+                                    pc_tree->partitioning == PARTITION_VERT_B ||
+                                    pc_tree->partitioning == PARTITION_SPLIT ||
+                                    pc_tree->partitioning == PARTITION_NONE);
+  }
+
+  // Pruning: pruning out some 4-way partitions using a DNN taking rd costs of
+  // sub-blocks from basic partition types.
+ if (cpi->sf.part_sf.ml_prune_partition && partition4_allowed && + part_search_state->partition_rect_allowed[HORZ] && + part_search_state->partition_rect_allowed[VERT]) { + av1_ml_prune_4_partition(cpi, x, pc_tree->partitioning, best_rdc->rdcost, + part_search_state, part4_search_allowed, + pb_source_variance); + } + + // Pruning: pruning out 4-way partitions based on the number of horz/vert wins + // in the current block and sub-blocks in PARTITION_SPLIT. + prune_4_partition_using_split_info(cpi, x, part_search_state, + part4_search_allowed); +} + +// Set params needed for PARTITION_NONE search. +static void set_none_partition_params(const AV1_COMP *const cpi, ThreadData *td, + MACROBLOCK *x, PC_TREE *pc_tree, + PartitionSearchState *part_search_state, + RD_STATS *best_remain_rdcost, + RD_STATS *best_rdc, int *pt_cost) { + PartitionBlkParams blk_params = part_search_state->part_blk_params; + RD_STATS partition_rdcost; + // Set PARTITION_NONE context. + if (pc_tree->none == NULL) + pc_tree->none = av1_alloc_pmc(cpi, blk_params.bsize, &td->shared_coeff_buf); + if (!pc_tree->none) + aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PICK_MODE_CONTEXT"); + + // Set PARTITION_NONE type cost. + if (part_search_state->partition_none_allowed) { + if (blk_params.bsize_at_least_8x8) { + *pt_cost = part_search_state->partition_cost[PARTITION_NONE] < INT_MAX + ? part_search_state->partition_cost[PARTITION_NONE] + : 0; + } + + // Initialize the RD stats structure. + av1_init_rd_stats(&partition_rdcost); + partition_rdcost.rate = *pt_cost; + av1_rd_cost_update(x->rdmult, &partition_rdcost); + av1_rd_stats_subtraction(x->rdmult, best_rdc, &partition_rdcost, + best_remain_rdcost); + } +} + +// Skip other partitions based on PARTITION_NONE rd cost. +static void prune_partitions_after_none(AV1_COMP *const cpi, MACROBLOCK *x, + SIMPLE_MOTION_DATA_TREE *sms_tree, + PICK_MODE_CONTEXT *ctx_none, + PartitionSearchState *part_search_state, + RD_STATS *best_rdc, + unsigned int *pb_source_variance) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + const PartitionBlkParams blk_params = part_search_state->part_blk_params; + RD_STATS *this_rdc = &part_search_state->this_rdc; + const BLOCK_SIZE bsize = blk_params.bsize; + assert(bsize < BLOCK_SIZES_ALL); + + if (!frame_is_intra_only(cm) && + (part_search_state->do_square_split || + part_search_state->do_rectangular_split) && + !x->e_mbd.lossless[xd->mi[0]->segment_id] && ctx_none->skippable) { + const int use_ml_based_breakout = + bsize <= cpi->sf.part_sf.use_square_partition_only_threshold && + bsize > BLOCK_4X4 && cpi->sf.part_sf.ml_predict_breakout_level >= 1; + if (use_ml_based_breakout) { + av1_ml_predict_breakout(cpi, x, this_rdc, *pb_source_variance, xd->bd, + part_search_state); + } + + // Adjust dist breakout threshold according to the partition size. + const int64_t dist_breakout_thr = + cpi->sf.part_sf.partition_search_breakout_dist_thr >> + ((2 * (MAX_SB_SIZE_LOG2 - 2)) - + (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize])); + const int rate_breakout_thr = + cpi->sf.part_sf.partition_search_breakout_rate_thr * + num_pels_log2_lookup[bsize]; + // If all y, u, v transform blocks in this partition are skippable, + // and the dist & rate are within the thresholds, the partition + // search is terminated for current branch of the partition search + // tree. The dist & rate thresholds are set to 0 at speed 0 to + // disable the early termination at that speed. 
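+    // (Editor's worked example, assuming MAX_SB_SIZE_LOG2 == 7: for
+    // BLOCK_128X128 the shift is 2 * (7 - 2) - (5 + 5) = 0, so the full
+    // distortion threshold applies, while for BLOCK_16X16 it is
+    // 10 - (2 + 2) = 6, shrinking the threshold to 1/64 of its value; the
+    // rate threshold scales with num_pels_log2_lookup[BLOCK_16X16] = 8.)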
+ if (best_rdc->dist < dist_breakout_thr && + best_rdc->rate < rate_breakout_thr) { + part_search_state->do_square_split = 0; + part_search_state->do_rectangular_split = 0; + } + } + + // Early termination: using simple_motion_search features and the + // rate, distortion, and rdcost of PARTITION_NONE, a DNN will make a + // decision on early terminating at PARTITION_NONE. + if (cpi->sf.part_sf.simple_motion_search_early_term_none && cm->show_frame && + !frame_is_intra_only(cm) && bsize >= BLOCK_16X16 && + av1_blk_has_rows_and_cols(&blk_params) && this_rdc->rdcost < INT64_MAX && + this_rdc->rdcost >= 0 && this_rdc->rate < INT_MAX && + this_rdc->rate >= 0 && + (part_search_state->do_square_split || + part_search_state->do_rectangular_split)) { + av1_simple_motion_search_early_term_none(cpi, x, sms_tree, this_rdc, + part_search_state); + } +} + +// Decide early termination and rectangular partition pruning +// based on PARTITION_NONE and PARTITION_SPLIT costs. +static void prune_partitions_after_split( + AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree, + PartitionSearchState *part_search_state, RD_STATS *best_rdc, + int64_t part_none_rd, int64_t part_split_rd) { + const AV1_COMMON *const cm = &cpi->common; + PartitionBlkParams blk_params = part_search_state->part_blk_params; + const int mi_row = blk_params.mi_row; + const int mi_col = blk_params.mi_col; + const BLOCK_SIZE bsize = blk_params.bsize; + assert(bsize < BLOCK_SIZES_ALL); + + // Early termination: using the rd costs of PARTITION_NONE and subblocks + // from PARTITION_SPLIT to determine an early breakout. + if (cpi->sf.part_sf.ml_early_term_after_part_split_level && + !frame_is_intra_only(cm) && + !part_search_state->terminate_partition_search && + part_search_state->do_rectangular_split && + (part_search_state->partition_rect_allowed[HORZ] || + part_search_state->partition_rect_allowed[VERT])) { + av1_ml_early_term_after_split( + cpi, x, sms_tree, best_rdc->rdcost, part_none_rd, part_split_rd, + part_search_state->split_rd, part_search_state); + } + + // Use the rd costs of PARTITION_NONE and subblocks from PARTITION_SPLIT + // to prune out rectangular partitions in some directions. + if (!cpi->sf.part_sf.ml_early_term_after_part_split_level && + cpi->sf.part_sf.ml_prune_partition && !frame_is_intra_only(cm) && + (part_search_state->partition_rect_allowed[HORZ] || + part_search_state->partition_rect_allowed[VERT]) && + !(part_search_state->prune_rect_part[HORZ] || + part_search_state->prune_rect_part[VERT]) && + !part_search_state->terminate_partition_search) { + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, av1_num_planes(cm), + bsize); + av1_ml_prune_rect_partition(cpi, x, best_rdc->rdcost, + part_search_state->none_rd, + part_search_state->split_rd, part_search_state); + } +} + +// Returns true if either of the left and top neighbor blocks is larger than +// the current block; false otherwise. 
+static AOM_INLINE bool is_neighbor_blk_larger_than_cur_blk( + const MACROBLOCKD *xd, BLOCK_SIZE bsize) { + const int cur_blk_area = (block_size_high[bsize] * block_size_wide[bsize]); + if (xd->left_available) { + const BLOCK_SIZE left_bsize = xd->left_mbmi->bsize; + if (block_size_high[left_bsize] * block_size_wide[left_bsize] > + cur_blk_area) + return true; + } + + if (xd->up_available) { + const BLOCK_SIZE above_bsize = xd->above_mbmi->bsize; + if (block_size_high[above_bsize] * block_size_wide[above_bsize] > + cur_blk_area) + return true; + } + return false; +} + +static AOM_INLINE void prune_rect_part_using_none_pred_mode( + const MACROBLOCKD *xd, PartitionSearchState *part_state, + PREDICTION_MODE mode, BLOCK_SIZE bsize) { + if (mode == DC_PRED || mode == SMOOTH_PRED) { + // If the prediction mode of NONE partition is either DC_PRED or + // SMOOTH_PRED, it indicates that the current block has less variation. In + // this case, HORZ and VERT partitions are pruned if at least one of left + // and top neighbor blocks is larger than the current block. + if (is_neighbor_blk_larger_than_cur_blk(xd, bsize)) { + part_state->prune_rect_part[HORZ] = 1; + part_state->prune_rect_part[VERT] = 1; + } + } else if (mode == D67_PRED || mode == V_PRED || mode == D113_PRED) { + // If the prediction mode chosen by NONE partition is close to 90 degrees, + // it implies a dominant vertical pattern, and the chance of choosing a + // vertical rectangular partition is high. Hence, horizontal partition is + // pruned in these cases. + part_state->prune_rect_part[HORZ] = 1; + } else if (mode == D157_PRED || mode == H_PRED || mode == D203_PRED) { + // If the prediction mode chosen by NONE partition is close to 180 degrees, + // it implies a dominant horizontal pattern, and the chance of choosing a + // horizontal rectangular partition is high. Hence, vertical partition is + // pruned in these cases. + part_state->prune_rect_part[VERT] = 1; + } +} + +// PARTITION_NONE search. +static void none_partition_search( + AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, MACROBLOCK *x, + PC_TREE *pc_tree, SIMPLE_MOTION_DATA_TREE *sms_tree, + RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx, + PartitionSearchState *part_search_state, RD_STATS *best_rdc, + unsigned int *pb_source_variance, int64_t *none_rd, int64_t *part_none_rd) { + const AV1_COMMON *const cm = &cpi->common; + PartitionBlkParams blk_params = part_search_state->part_blk_params; + RD_STATS *this_rdc = &part_search_state->this_rdc; + const int mi_row = blk_params.mi_row; + const int mi_col = blk_params.mi_col; + const BLOCK_SIZE bsize = blk_params.bsize; + assert(bsize < BLOCK_SIZES_ALL); + + if (part_search_state->terminate_partition_search || + !part_search_state->partition_none_allowed) + return; + + int pt_cost = 0; + RD_STATS best_remain_rdcost; + av1_invalid_rd_stats(&best_remain_rdcost); + + // Set PARTITION_NONE context and cost. + set_none_partition_params(cpi, td, x, pc_tree, part_search_state, + &best_remain_rdcost, best_rdc, &pt_cost); + +#if CONFIG_COLLECT_PARTITION_STATS + // Timer start for partition None. + PartitionTimingStats *part_timing_stats = + &part_search_state->part_timing_stats; + if (best_remain_rdcost.rdcost >= 0) { + start_partition_block_timer(part_timing_stats, PARTITION_NONE); + } +#endif + // PARTITION_NONE evaluation and cost update. 
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, this_rdc, PARTITION_NONE, + bsize, pc_tree->none, best_remain_rdcost); + + av1_rd_cost_update(x->rdmult, this_rdc); + +#if CONFIG_COLLECT_PARTITION_STATS + // Timer end for partition None. + if (part_timing_stats->timer_is_on) { + RD_STATS tmp_rdc; + av1_init_rd_stats(&tmp_rdc); + if (this_rdc->rate != INT_MAX) { + tmp_rdc.rate = this_rdc->rate; + tmp_rdc.dist = this_rdc->dist; + tmp_rdc.rdcost = this_rdc->rdcost; + if (blk_params.bsize_at_least_8x8) { + tmp_rdc.rate += pt_cost; + tmp_rdc.rdcost = RDCOST(x->rdmult, tmp_rdc.rate, tmp_rdc.dist); + } + } + end_partition_block_timer(part_timing_stats, PARTITION_NONE, + tmp_rdc.rdcost); + } +#endif + *pb_source_variance = x->source_variance; + if (none_rd) *none_rd = this_rdc->rdcost; + part_search_state->none_rd = this_rdc->rdcost; + if (this_rdc->rate != INT_MAX) { + // Record picked ref frame to prune ref frames for other partition types. + if (cpi->sf.inter_sf.prune_ref_frame_for_rect_partitions) { + const int ref_type = av1_ref_frame_type(pc_tree->none->mic.ref_frame); + av1_update_picked_ref_frames_mask( + x, ref_type, bsize, cm->seq_params->mib_size, mi_row, mi_col); + } + + // Calculate the total cost and update the best partition. + if (blk_params.bsize_at_least_8x8) { + this_rdc->rate += pt_cost; + this_rdc->rdcost = RDCOST(x->rdmult, this_rdc->rate, this_rdc->dist); + } + *part_none_rd = this_rdc->rdcost; + if (this_rdc->rdcost < best_rdc->rdcost) { + *best_rdc = *this_rdc; + part_search_state->found_best_partition = true; + if (blk_params.bsize_at_least_8x8) { + pc_tree->partitioning = PARTITION_NONE; + } + + // Disable split and rectangular partition search + // based on PARTITION_NONE cost. + prune_partitions_after_none(cpi, x, sms_tree, pc_tree->none, + part_search_state, best_rdc, + pb_source_variance); + } + + if (cpi->sf.part_sf.prune_rect_part_using_none_pred_mode) + prune_rect_part_using_none_pred_mode(&x->e_mbd, part_search_state, + pc_tree->none->mic.mode, bsize); + } + av1_restore_context(x, x_ctx, mi_row, mi_col, bsize, av1_num_planes(cm)); +} + +// PARTITION_SPLIT search. +static void split_partition_search( + AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, + TokenExtra **tp, MACROBLOCK *x, PC_TREE *pc_tree, + SIMPLE_MOTION_DATA_TREE *sms_tree, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx, + PartitionSearchState *part_search_state, RD_STATS *best_rdc, + SB_MULTI_PASS_MODE multi_pass_mode, int64_t *part_split_rd) { + const AV1_COMMON *const cm = &cpi->common; + PartitionBlkParams blk_params = part_search_state->part_blk_params; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int mi_row = blk_params.mi_row; + const int mi_col = blk_params.mi_col; + const BLOCK_SIZE bsize = blk_params.bsize; + assert(bsize < BLOCK_SIZES_ALL); + RD_STATS sum_rdc = part_search_state->sum_rdc; + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + + // Check if partition split is allowed. + if (part_search_state->terminate_partition_search || + !part_search_state->do_square_split) + return; + + for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) { + if (pc_tree->split[i] == NULL) + pc_tree->split[i] = av1_alloc_pc_tree_node(subsize); + if (!pc_tree->split[i]) + aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PC_TREE"); + pc_tree->split[i]->index = i; + } + + // Initialization of this partition RD stats. 
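+  // (Editor's note: sum_rdc starts out charged only with the rate of
+  // signaling PARTITION_SPLIT; every recursive av1_rd_pick_partition() call
+  // below then receives best_rdc minus what earlier sub-blocks already
+  // spent, so the recursion aborts as soon as the running total can no
+  // longer beat the best cost found so far.)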
+ av1_init_rd_stats(&sum_rdc); + sum_rdc.rate = part_search_state->partition_cost[PARTITION_SPLIT]; + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0); + + int idx; +#if CONFIG_COLLECT_PARTITION_STATS + PartitionTimingStats *part_timing_stats = + &part_search_state->part_timing_stats; + if (best_rdc->rdcost - sum_rdc.rdcost >= 0) { + start_partition_block_timer(part_timing_stats, PARTITION_SPLIT); + } +#endif + // Recursive partition search on 4 sub-blocks. + for (idx = 0; idx < SUB_PARTITIONS_SPLIT && sum_rdc.rdcost < best_rdc->rdcost; + ++idx) { + const int x_idx = (idx & 1) * blk_params.mi_step; + const int y_idx = (idx >> 1) * blk_params.mi_step; + + if (mi_row + y_idx >= mi_params->mi_rows || + mi_col + x_idx >= mi_params->mi_cols) + continue; + + pc_tree->split[idx]->index = idx; + int64_t *p_split_rd = &part_search_state->split_rd[idx]; + RD_STATS best_remain_rdcost; + av1_rd_stats_subtraction(x->rdmult, best_rdc, &sum_rdc, + &best_remain_rdcost); + + int curr_quad_tree_idx = 0; + if (frame_is_intra_only(cm) && bsize <= BLOCK_64X64) { + curr_quad_tree_idx = part_search_state->intra_part_info->quad_tree_idx; + part_search_state->intra_part_info->quad_tree_idx = + 4 * curr_quad_tree_idx + idx + 1; + } + // Split partition evaluation of corresponding idx. + // If the RD cost exceeds the best cost then do not + // evaluate other split sub-partitions. + SIMPLE_MOTION_DATA_TREE *const sms_tree_split = + (sms_tree == NULL) ? NULL : sms_tree->split[idx]; + if (!av1_rd_pick_partition( + cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, subsize, + &part_search_state->this_rdc, best_remain_rdcost, + pc_tree->split[idx], sms_tree_split, p_split_rd, multi_pass_mode, + &part_search_state->split_part_rect_win[idx])) { + av1_invalid_rd_stats(&sum_rdc); + break; + } + if (frame_is_intra_only(cm) && bsize <= BLOCK_64X64) { + part_search_state->intra_part_info->quad_tree_idx = curr_quad_tree_idx; + } + + sum_rdc.rate += part_search_state->this_rdc.rate; + sum_rdc.dist += part_search_state->this_rdc.dist; + av1_rd_cost_update(x->rdmult, &sum_rdc); + + // Set split ctx as ready for use. + if (idx <= 1 && (bsize <= BLOCK_8X8 || + pc_tree->split[idx]->partitioning == PARTITION_NONE)) { + const MB_MODE_INFO *const mbmi = &pc_tree->split[idx]->none->mic; + const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + // Neither palette mode nor cfl predicted. + if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) { + if (mbmi->uv_mode != UV_CFL_PRED) + part_search_state->is_split_ctx_is_ready[idx] = 1; + } + } + } +#if CONFIG_COLLECT_PARTITION_STATS + if (part_timing_stats->timer_is_on) { + end_partition_block_timer(part_timing_stats, PARTITION_SPLIT, + sum_rdc.rdcost); + } +#endif + const int reached_last_index = (idx == SUB_PARTITIONS_SPLIT); + + // Calculate the total cost and update the best partition. + *part_split_rd = sum_rdc.rdcost; + if (reached_last_index && sum_rdc.rdcost < best_rdc->rdcost) { + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); + if (sum_rdc.rdcost < best_rdc->rdcost) { + *best_rdc = sum_rdc; + part_search_state->found_best_partition = true; + pc_tree->partitioning = PARTITION_SPLIT; + } + } else if (cpi->sf.part_sf.less_rectangular_check_level > 0) { + // Skip rectangular partition test when partition type none gives better + // rd than partition type split. 
+    if (cpi->sf.part_sf.less_rectangular_check_level == 2 || idx <= 2) {
+      const int partition_none_valid = part_search_state->none_rd > 0;
+      const int partition_none_better =
+          part_search_state->none_rd < sum_rdc.rdcost;
+      part_search_state->do_rectangular_split &=
+          !(partition_none_valid && partition_none_better);
+    }
+  }
+  // Restore the context for the following cases:
+  // 1) The current block size is not above the maximum partition size, as a
+  //    dry-run encode happens for these cases.
+  // 2) The current block size equals the superblock size, as the final
+  //    encode happens for this case.
+  if (bsize <= x->sb_enc.max_partition_size || bsize == cm->seq_params->sb_size)
+    av1_restore_context(x, x_ctx, mi_row, mi_col, bsize, av1_num_planes(cm));
+}
+
+// The max number of nodes in the partition tree.
+// The number of leaf nodes is (128x128) / (4x4) = 1024.
+// The number of all possible parent nodes is 1 + 2 + ... + 512 = 1023.
+#define NUM_NODES 2048
+
+static void write_partition_tree(AV1_COMP *const cpi,
+                                 const PC_TREE *const pc_tree,
+                                 const BLOCK_SIZE bsize, const int mi_row,
+                                 const int mi_col) {
+  (void)mi_row;
+  (void)mi_col;
+  const char *path = cpi->oxcf.partition_info_path;
+  char filename[256];
+  snprintf(filename, sizeof(filename), "%s/partition_tree_sb%d_c%d", path,
+           cpi->sb_counter, 0);
+  FILE *pfile = fopen(filename, "w");
+  fprintf(pfile, "%d", bsize);
+
+  // Write partition types in BFS order.
+  const PC_TREE *tree_node_queue[NUM_NODES] = { NULL };
+  int q_idx = 0;
+  int last_idx = 1;
+  int num_nodes = 1;
+
+  // First traversal to get number of leaf nodes.
+  tree_node_queue[q_idx] = pc_tree;
+  while (num_nodes > 0) {
+    const PC_TREE *node = tree_node_queue[q_idx];
+    if (node->partitioning == PARTITION_SPLIT) {
+      for (int i = 0; i < 4; ++i) {
+        tree_node_queue[last_idx] = node->split[i];
+        ++last_idx;
+      }
+      num_nodes += 4;
+    }
+    --num_nodes;
+    ++q_idx;
+  }
+  const int num_leafs = last_idx;
+  fprintf(pfile, ",%d,%d", num_leafs, /*num_configs=*/1);
+
+  // Write partitions for each node.
+  q_idx = 0;
+  last_idx = 1;
+  num_nodes = 1;
+  tree_node_queue[q_idx] = pc_tree;
+  while (num_nodes > 0) {
+    const PC_TREE *node = tree_node_queue[q_idx];
+    fprintf(pfile, ",%d", node->partitioning);
+    if (node->partitioning == PARTITION_SPLIT) {
+      for (int i = 0; i < 4; ++i) {
+        tree_node_queue[last_idx] = node->split[i];
+        ++last_idx;
+      }
+      num_nodes += 4;
+    }
+    --num_nodes;
+    ++q_idx;
+  }
+  fprintf(pfile, "\n");
+
+  fclose(pfile);
+}
+
+#if CONFIG_PARTITION_SEARCH_ORDER
+static void verify_write_partition_tree(const AV1_COMP *const cpi,
+                                        const PC_TREE *const pc_tree,
+                                        const BLOCK_SIZE bsize,
+                                        const int config_id, const int mi_row,
+                                        const int mi_col) {
+  (void)mi_row;
+  (void)mi_col;
+  const char *path = cpi->oxcf.partition_info_path;
+  char filename[256];
+  snprintf(filename, sizeof(filename), "%s/verify_partition_tree_sb%d_c%d",
+           path, cpi->sb_counter, config_id);
+  FILE *pfile = fopen(filename, "w");
+  fprintf(pfile, "%d", bsize);
+
+  // Write partition types in BFS order.
+  const PC_TREE *tree_node_queue[NUM_NODES] = { NULL };
+  int q_idx = 0;
+  int last_idx = 1;
+  int num_nodes = 1;
+
+  // First traversal to get number of leaf nodes.
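+  // (Editor's example of the serialized format shared with
+  // write_partition_tree() above: a 128x128 superblock split once into four
+  // PARTITION_NONE 64x64 blocks is stored as
+  //   15,5,1,3,0,0,0,0
+  // i.e. bsize (BLOCK_128X128 == 15), the node count (the "num_leafs"
+  // counter also includes internal nodes), num_configs, then one
+  // PARTITION_TYPE per node in BFS order (PARTITION_SPLIT == 3,
+  // PARTITION_NONE == 0).)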
+ tree_node_queue[q_idx] = pc_tree; + while (num_nodes > 0) { + const PC_TREE *node = tree_node_queue[q_idx]; + if (node != NULL && node->partitioning == PARTITION_SPLIT) { + for (int i = 0; i < 4; ++i) { + tree_node_queue[last_idx] = node->split[i]; + ++last_idx; + } + num_nodes += 4; + } + --num_nodes; + ++q_idx; + } + const int num_leafs = last_idx; + fprintf(pfile, ",%d,%d", num_leafs, /*num_configs=*/1); + + // Write partitions for each node. + q_idx = 0; + last_idx = 1; + num_nodes = 1; + tree_node_queue[q_idx] = pc_tree; + while (num_nodes > 0) { + const PC_TREE *node = tree_node_queue[q_idx]; + if (node != NULL) { // suppress warning + fprintf(pfile, ",%d", node->partitioning); + if (node->partitioning == PARTITION_SPLIT) { + for (int i = 0; i < 4; ++i) { + tree_node_queue[last_idx] = node->split[i]; + ++last_idx; + } + num_nodes += 4; + } + } + --num_nodes; + ++q_idx; + } + fprintf(pfile, "\n"); + + fclose(pfile); +} + +static int read_partition_tree(AV1_COMP *const cpi, PC_TREE *const pc_tree, + struct aom_internal_error_info *error_info, + const int config_id) { + const AV1_COMMON *const cm = &cpi->common; + const char *path = cpi->oxcf.partition_info_path; + char filename[256]; + snprintf(filename, sizeof(filename), "%s/partition_tree_sb%d_c%d", path, + cpi->sb_counter, config_id); + FILE *pfile = fopen(filename, "r"); + if (pfile == NULL) { + aom_internal_error(cm->error, AOM_CODEC_ERROR, "Can't find input file: %s.", + filename); + } + + int read_bsize; + int num_nodes; + int num_configs; + fscanf(pfile, "%d,%d,%d", &read_bsize, &num_nodes, &num_configs); + assert(read_bsize == cpi->common.seq_params->sb_size); + BLOCK_SIZE bsize = (BLOCK_SIZE)read_bsize; + assert(bsize == pc_tree->block_size); + + PC_TREE *tree_node_queue[NUM_NODES] = { NULL }; + int last_idx = 1; + int q_idx = 0; + tree_node_queue[q_idx] = pc_tree; + while (num_nodes > 0) { + int partitioning; + fscanf(pfile, ",%d", &partitioning); + assert(partitioning >= PARTITION_NONE && + partitioning < EXT_PARTITION_TYPES); + PC_TREE *node = tree_node_queue[q_idx]; + if (node != NULL) { + node->partitioning = partitioning; + bsize = node->block_size; + } + if (partitioning == PARTITION_SPLIT) { + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + for (int i = 0; i < 4; ++i) { + if (node != NULL) { // Suppress warning + node->split[i] = av1_alloc_pc_tree_node(subsize); + if (!node->split[i]) + aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PC_TREE"); + node->split[i]->index = i; + tree_node_queue[last_idx] = node->split[i]; + ++last_idx; + } + } + } + --num_nodes; + ++q_idx; + } + fclose(pfile); + + return num_configs; +} + +static RD_STATS rd_search_for_fixed_partition( + AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, + TokenExtra **tp, SIMPLE_MOTION_DATA_TREE *sms_tree, int mi_row, int mi_col, + const BLOCK_SIZE bsize, PC_TREE *pc_tree) { + const PARTITION_TYPE partition = pc_tree->partitioning; + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + TileInfo *const tile_info = &tile_data->tile_info; + RD_STATS best_rdc; + av1_invalid_rd_stats(&best_rdc); + int sum_subblock_rate = 0; + int64_t sum_subblock_dist = 0; + PartitionSearchState part_search_state; + init_partition_search_state_params(x, cpi, &part_search_state, mi_row, mi_col, + bsize); + // Override partition costs at the edges of the frame in the same + // way as in read_partition (see 
decodeframe.c). + PartitionBlkParams blk_params = part_search_state.part_blk_params; + if (!av1_blk_has_rows_and_cols(&blk_params)) + set_partition_cost_for_edge_blk(cm, &part_search_state); + + av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); + + // Save rdmult before it might be changed, so it can be restored later. + const int orig_rdmult = x->rdmult; + setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL); + (void)orig_rdmult; + + // Set the context. + RD_SEARCH_MACROBLOCK_CONTEXT x_ctx; + xd->above_txfm_context = + cm->above_contexts.txfm[tile_info->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + + assert(bsize < BLOCK_SIZES_ALL); + unsigned int pb_source_variance = UINT_MAX; + int64_t part_none_rd = INT64_MAX; + int64_t none_rd = INT64_MAX; + int inc_step[NUM_PART4_TYPES] = { 0 }; + if (partition == PARTITION_HORZ_4) inc_step[HORZ4] = mi_size_high[bsize] / 4; + if (partition == PARTITION_VERT_4) inc_step[VERT4] = mi_size_wide[bsize] / 4; + + switch (partition) { + case PARTITION_NONE: + none_partition_search(cpi, td, tile_data, x, pc_tree, sms_tree, &x_ctx, + &part_search_state, &best_rdc, &pb_source_variance, + &none_rd, &part_none_rd); + break; + case PARTITION_HORZ: + rectangular_partition_search(cpi, td, tile_data, tp, x, pc_tree, &x_ctx, + &part_search_state, &best_rdc, NULL, HORZ, + HORZ); + break; + case PARTITION_VERT: + rectangular_partition_search(cpi, td, tile_data, tp, x, pc_tree, &x_ctx, + &part_search_state, &best_rdc, NULL, VERT, + VERT); + break; + case PARTITION_HORZ_A: + ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, + &part_search_state, &best_rdc, NULL, + pb_source_variance, 1, HORZ_A, HORZ_A); + break; + case PARTITION_HORZ_B: + ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, + &part_search_state, &best_rdc, NULL, + pb_source_variance, 1, HORZ_B, HORZ_B); + break; + case PARTITION_VERT_A: + ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, + &part_search_state, &best_rdc, NULL, + pb_source_variance, 1, VERT_A, VERT_A); + break; + case PARTITION_VERT_B: + ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, + &part_search_state, &best_rdc, NULL, + pb_source_variance, 1, VERT_B, VERT_B); + break; + case PARTITION_HORZ_4: + rd_pick_4partition(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, + pc_tree->horizontal4, &part_search_state, &best_rdc, + inc_step, PARTITION_HORZ_4); + break; + case PARTITION_VERT_4: + rd_pick_4partition(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, + pc_tree->vertical4, &part_search_state, &best_rdc, + inc_step, PARTITION_VERT_4); + break; + case PARTITION_SPLIT: + for (int idx = 0; idx < SUB_PARTITIONS_SPLIT; ++idx) { + const BLOCK_SIZE subsize = + get_partition_subsize(bsize, PARTITION_SPLIT); + assert(subsize < BLOCK_SIZES_ALL); + const int next_mi_row = + idx < 2 ? mi_row : mi_row + mi_size_high[subsize]; + const int next_mi_col = + idx % 2 == 0 ? 
mi_col : mi_col + mi_size_wide[subsize]; + if (next_mi_row >= cm->mi_params.mi_rows || + next_mi_col >= cm->mi_params.mi_cols) { + continue; + } + const RD_STATS subblock_rdc = rd_search_for_fixed_partition( + cpi, td, tile_data, tp, sms_tree->split[idx], next_mi_row, + next_mi_col, subsize, pc_tree->split[idx]); + sum_subblock_rate += subblock_rdc.rate; + sum_subblock_dist += subblock_rdc.dist; + } + best_rdc.rate = sum_subblock_rate; + best_rdc.rate += part_search_state.partition_cost[PARTITION_SPLIT]; + best_rdc.dist = sum_subblock_dist; + best_rdc.rdcost = RDCOST(x->rdmult, best_rdc.rate, best_rdc.dist); + break; + default: + assert(0 && "invalid partition type."); + aom_internal_error(cm->error, AOM_CODEC_ERROR, "Invalid partition type."); + } + // Note: it is necessary to restore context information. + av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + + if (bsize != cm->seq_params->sb_size) { + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, + pc_tree, NULL); + } + x->rdmult = orig_rdmult; + + return best_rdc; +} + +static void prepare_sb_features_before_search( + AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, int mi_row, + int mi_col, const BLOCK_SIZE bsize, aom_partition_features_t *features) { + av1_collect_motion_search_features_sb(cpi, td, tile_data, mi_row, mi_col, + bsize, features); + collect_tpl_stats_sb(cpi, bsize, mi_row, mi_col, features); +} + +static void update_partition_stats(const RD_STATS *const this_rdcost, + aom_partition_stats_t *stats) { + stats->rate = this_rdcost->rate; + stats->dist = this_rdcost->dist; + stats->rdcost = this_rdcost->rdcost; +} + +static void build_pc_tree_from_part_decision( + const aom_partition_decision_t *partition_decision, + const BLOCK_SIZE this_bsize, PC_TREE *pc_tree, + struct aom_internal_error_info *error_info) { + BLOCK_SIZE bsize = this_bsize; + int num_nodes = partition_decision->num_nodes; + PC_TREE *tree_node_queue[NUM_NODES] = { NULL }; + int last_idx = 1; + int q_idx = 0; + tree_node_queue[q_idx] = pc_tree; + while (num_nodes > 0) { + const int partitioning = partition_decision->partition_decision[q_idx]; + assert(partitioning >= PARTITION_NONE && + partitioning < EXT_PARTITION_TYPES); + PC_TREE *node = tree_node_queue[q_idx]; + if (node != NULL) { + node->partitioning = partitioning; + bsize = node->block_size; + } + if (partitioning == PARTITION_SPLIT) { + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + for (int i = 0; i < 4; ++i) { + if (node != NULL) { // Suppress warning + node->split[i] = av1_alloc_pc_tree_node(subsize); + if (!node->split[i]) + aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PC_TREE"); + node->split[i]->index = i; + tree_node_queue[last_idx] = node->split[i]; + ++last_idx; + } + } + } + --num_nodes; + ++q_idx; + } +} + +// The ML model needs to provide the whole decision tree for the superblock. 
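+// (Editor's sketch of the handshake implemented below; the helper names
+// here are hypothetical, the real entry points are
+// av1_ext_part_send_features(), av1_ext_part_get_partition_decision() and
+// av1_ext_part_send_partition_stats().)
+#if 0  // illustration only, kept out of the build
+static bool sketch_ml_partition_loop(void) {
+  send_features();                          // describe the superblock once
+  aom_partition_decision_t decision;
+  for (;;) {
+    if (!get_decision(&decision)) return false;  // model failed: fall back
+    const RD_STATS rd = dry_run_fixed_partition(&decision);
+    send_stats(&rd);                        // feedback for the model
+    if (decision.is_final_decision) break;  // last proposal: encode for real
+  }
+  return true;
+}
+#endif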
+static bool ml_partition_search_whole_tree(AV1_COMP *const cpi, ThreadData *td,
+                                           TileDataEnc *tile_data,
+                                           TokenExtra **tp,
+                                           SIMPLE_MOTION_DATA_TREE *sms_root,
+                                           int mi_row, int mi_col,
+                                           const BLOCK_SIZE bsize) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+  struct aom_internal_error_info *error_info = x->e_mbd.error_info;
+  aom_partition_features_t features;
+  prepare_sb_features_before_search(cpi, td, tile_data, mi_row, mi_col, bsize,
+                                    &features);
+  features.mi_row = mi_row;
+  features.mi_col = mi_col;
+  features.frame_width = cpi->frame_info.frame_width;
+  features.frame_height = cpi->frame_info.frame_height;
+  features.block_size = bsize;
+  av1_ext_part_send_features(ext_part_controller, &features);
+
+  // RD mode search (dry run) for a valid partition decision from the ML
+  // model.
+  aom_partition_decision_t partition_decision;
+  do {
+    const bool valid_decision = av1_ext_part_get_partition_decision(
+        ext_part_controller, &partition_decision);
+    if (!valid_decision) return false;
+
+    // First, let's take the easy approach: we require the ML model to
+    // provide partition decisions for the whole superblock.
+    td->pc_root = av1_alloc_pc_tree_node(bsize);
+    if (!td->pc_root)
+      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate PC_TREE");
+    build_pc_tree_from_part_decision(&partition_decision, bsize, td->pc_root,
+                                     error_info);
+
+    const RD_STATS this_rdcost = rd_search_for_fixed_partition(
+        cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize, td->pc_root);
+    aom_partition_stats_t stats;
+    update_partition_stats(&this_rdcost, &stats);
+    av1_ext_part_send_partition_stats(ext_part_controller, &stats);
+    if (!partition_decision.is_final_decision) {
+      av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
+                                 cpi->sf.part_sf.partition_search_type);
+      td->pc_root = NULL;
+    }
+  } while (!partition_decision.is_final_decision);
+
+  // Encode with the selected mode and partition.
+  set_cb_offsets(x->cb_offset, 0, 0);
+  encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+            td->pc_root, NULL);
+  av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
+                             cpi->sf.part_sf.partition_search_type);
+  td->pc_root = NULL;
+
+  return true;
+}
+
+// Use a bitmask to represent the valid partition types for the current
+// block. "1" means the corresponding partition type is valid.
+// The least significant bit represents "PARTITION_NONE"; the most
+// significant bit represents "PARTITION_VERT_4", following the enum order
+// for PARTITION_TYPE in "enums.h".
+static int get_valid_partition_types(
+    const AV1_COMP *const cpi,
+    const PartitionSearchState *const part_search_state,
+    const BLOCK_SIZE bsize) {
+  const PartitionCfg *const part_cfg = &cpi->oxcf.part_cfg;
+  const PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  int valid_types = 0;
+  // PARTITION_NONE
+  valid_types |= (part_search_state->partition_none_allowed << 0);
+  // PARTITION_HORZ
+  valid_types |= (part_search_state->partition_rect_allowed[HORZ] << 1);
+  // PARTITION_VERT
+  valid_types |= (part_search_state->partition_rect_allowed[VERT] << 2);
+  // PARTITION_SPLIT
+  valid_types |= (part_search_state->do_square_split << 3);
+  // PARTITION_HORZ_A
+  const int ext_partition_allowed = part_search_state->do_rectangular_split &&
+                                    av1_blk_has_rows_and_cols(&blk_params);
+  const int horzab_partition_allowed =
+      ext_partition_allowed && part_cfg->enable_ab_partitions &&
+      part_search_state->partition_rect_allowed[HORZ];
+  valid_types |= (horzab_partition_allowed << 4);
+  // PARTITION_HORZ_B
+  valid_types |= (horzab_partition_allowed << 5);
+  // PARTITION_VERT_A
+  const int vertab_partition_allowed =
+      ext_partition_allowed && part_cfg->enable_ab_partitions &&
+      part_search_state->partition_rect_allowed[VERT];
+  valid_types |= (vertab_partition_allowed << 6);
+  // PARTITION_VERT_B
+  valid_types |= (vertab_partition_allowed << 7);
+  // PARTITION_HORZ_4
+  const int partition4_allowed = part_cfg->enable_1to4_partitions &&
+                                 ext_partition_allowed &&
+                                 bsize != BLOCK_128X128;
+  const int horz4_allowed =
+      partition4_allowed && part_search_state->partition_rect_allowed[HORZ] &&
+      get_plane_block_size(get_partition_subsize(bsize, PARTITION_HORZ_4),
+                           part_search_state->ss_x,
+                           part_search_state->ss_y) != BLOCK_INVALID;
+  valid_types |= (horz4_allowed << 8);
+  // PARTITION_VERT_4
+  const int vert4_allowed =
+      partition4_allowed && part_search_state->partition_rect_allowed[VERT] &&
+      get_plane_block_size(get_partition_subsize(bsize, PARTITION_VERT_4),
+                           part_search_state->ss_x,
+                           part_search_state->ss_y) != BLOCK_INVALID;
+  valid_types |= (vert4_allowed << 9);
+
+  return valid_types;
+}
+
+static void prepare_tpl_stats_block(const AV1_COMP *const cpi,
+                                    const BLOCK_SIZE bsize, const int mi_row,
+                                    const int mi_col, int64_t *intra_cost,
+                                    int64_t *inter_cost, int64_t *mc_dep_cost) {
+  const AV1_COMMON *const cm = &cpi->common;
+  GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  if (gf_group->update_type[cpi->gf_frame_index] == INTNL_OVERLAY_UPDATE ||
+      gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE) {
+    return;
+  }
+
+  TplParams *const tpl_data = &cpi->ppi->tpl_data;
+  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[cpi->gf_frame_index];
+  TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+  // If tpl stats are not established, return early.
+  if (!tpl_data->ready || gf_group->max_layer_depth_allowed == 0) {
+    return;
+  }
+
+  const int tpl_stride = tpl_frame->stride;
+  const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
+  const int mi_width =
+      AOMMIN(mi_size_wide[bsize], cm->mi_params.mi_cols - mi_col);
+  const int mi_height =
+      AOMMIN(mi_size_high[bsize], cm->mi_params.mi_rows - mi_row);
+
+  int64_t sum_intra_cost = 0;
+  int64_t sum_inter_cost = 0;
+  int64_t sum_mc_dep_cost = 0;
+  for (int row = 0; row < mi_height; row += step) {
+    for (int col = 0; col < mi_width; col += step) {
+      TplDepStats *this_stats =
+          &tpl_stats[av1_tpl_ptr_pos(mi_row + row, mi_col + col, tpl_stride,
+                                     tpl_data->tpl_stats_block_mis_log2)];
+      sum_intra_cost += this_stats->intra_cost;
+      sum_inter_cost += this_stats->inter_cost;
+      const int64_t mc_dep_delta =
+          RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+                 this_stats->mc_dep_dist);
+      sum_mc_dep_cost += mc_dep_delta;
+    }
+  }
+
+  *intra_cost = sum_intra_cost;
+  *inter_cost = sum_inter_cost;
+  *mc_dep_cost = sum_mc_dep_cost;
+}
+
+static bool recursive_partition(AV1_COMP *const cpi, ThreadData *td,
+                                TileDataEnc *tile_data, TokenExtra **tp,
+                                SIMPLE_MOTION_DATA_TREE *sms_root,
+                                PC_TREE *pc_tree, int mi_row, int mi_col,
+                                const BLOCK_SIZE bsize, RD_STATS *this_rdcost) {
+  const AV1_COMMON *const cm = &cpi->common;
+  ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols) {
+    return false;
+  }
+  aom_partition_decision_t partition_decision;
+  do {
+    PartitionSearchState part_search_state;
+    // Initialization of state variables used in partition search.
+    // TODO(chengchen): check if there are hidden conditions that don't allow
+    // all possible partition types.
+    init_partition_search_state_params(x, cpi, &part_search_state, mi_row,
+                                       mi_col, bsize);
+    // Override partition costs at the edges of the frame in the same
+    // way as in read_partition (see decodeframe.c).
+    PartitionBlkParams blk_params = part_search_state.part_blk_params;
+    if (!av1_blk_has_rows_and_cols(&blk_params))
+      set_partition_cost_for_edge_blk(cm, &part_search_state);
+    const int orig_rdmult = x->rdmult;
+    setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+    const int valid_partition_types =
+        get_valid_partition_types(cpi, &part_search_state, bsize);
+    const FRAME_UPDATE_TYPE update_type =
+        get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+    const int qindex = av1_get_qindex(&cm->seg, xd->mi[0]->segment_id,
+                                      cm->quant_params.base_qindex);
+    // RD multiplier
+    const int rdmult = x->rdmult;
+    // Pyramid level
+    const int pyramid_level =
+        cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index];
+    x->rdmult = orig_rdmult;
+    // Neighbor information
+    const int has_above = !!xd->above_mbmi;
+    const int has_left = !!xd->left_mbmi;
+    const BLOCK_SIZE above_bsize =
+        has_above ? xd->above_mbmi->bsize : BLOCK_INVALID;
+    const BLOCK_SIZE left_bsize =
+        has_left ? xd->left_mbmi->bsize : BLOCK_INVALID;
+    const int above_block_width =
+        above_bsize == BLOCK_INVALID ? -1 : block_size_wide[above_bsize];
+    const int above_block_height =
+        above_bsize == BLOCK_INVALID ? -1 : block_size_high[above_bsize];
+    const int left_block_width =
+        left_bsize == BLOCK_INVALID ? -1 : block_size_wide[left_bsize];
+    const int left_block_height =
+        left_bsize == BLOCK_INVALID ?
-1 : block_size_high[left_bsize]; + // Prepare simple motion search stats as features + unsigned int block_sse = -1; + unsigned int block_var = -1; + unsigned int sub_block_sse[4] = { -1, -1, -1, -1 }; + unsigned int sub_block_var[4] = { -1, -1, -1, -1 }; + unsigned int horz_block_sse[2] = { -1, -1 }; + unsigned int horz_block_var[2] = { -1, -1 }; + unsigned int vert_block_sse[2] = { -1, -1 }; + unsigned int vert_block_var[2] = { -1, -1 }; + av1_prepare_motion_search_features_block( + cpi, td, tile_data, mi_row, mi_col, bsize, valid_partition_types, + &block_sse, &block_var, sub_block_sse, sub_block_var, horz_block_sse, + horz_block_var, vert_block_sse, vert_block_var); + // Prepare tpl stats for the current block as features + int64_t tpl_intra_cost = -1; + int64_t tpl_inter_cost = -1; + int64_t tpl_mc_dep_cost = -1; + prepare_tpl_stats_block(cpi, bsize, mi_row, mi_col, &tpl_intra_cost, + &tpl_inter_cost, &tpl_mc_dep_cost); + + aom_partition_features_t features; + features.mi_row = mi_row; + features.mi_col = mi_col; + features.frame_width = cpi->frame_info.frame_width; + features.frame_height = cpi->frame_info.frame_height; + features.block_size = bsize; + features.valid_partition_types = valid_partition_types; + features.update_type = update_type; + features.qindex = qindex; + features.rdmult = rdmult; + features.pyramid_level = pyramid_level; + features.has_above_block = has_above; + features.above_block_width = above_block_width; + features.above_block_height = above_block_height; + features.has_left_block = has_left; + features.left_block_width = left_block_width; + features.left_block_height = left_block_height; + features.block_sse = block_sse; + features.block_var = block_var; + for (int i = 0; i < 4; ++i) { + features.sub_block_sse[i] = sub_block_sse[i]; + features.sub_block_var[i] = sub_block_var[i]; + } + for (int i = 0; i < 2; ++i) { + features.horz_block_sse[i] = horz_block_sse[i]; + features.horz_block_var[i] = horz_block_var[i]; + features.vert_block_sse[i] = vert_block_sse[i]; + features.vert_block_var[i] = vert_block_var[i]; + } + features.tpl_intra_cost = tpl_intra_cost; + features.tpl_inter_cost = tpl_inter_cost; + features.tpl_mc_dep_cost = tpl_mc_dep_cost; + av1_ext_part_send_features(ext_part_controller, &features); + const bool valid_decision = av1_ext_part_get_partition_decision( + ext_part_controller, &partition_decision); + if (!valid_decision) return false; + pc_tree->partitioning = partition_decision.current_decision; + + av1_init_rd_stats(this_rdcost); + if (partition_decision.current_decision == PARTITION_SPLIT) { + assert(block_size_wide[bsize] >= 8 && block_size_high[bsize] >= 8); + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + RD_STATS split_rdc[SUB_PARTITIONS_SPLIT]; + for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) { + av1_init_rd_stats(&split_rdc[i]); + if (pc_tree->split[i] == NULL) + pc_tree->split[i] = av1_alloc_pc_tree_node(subsize); + if (!pc_tree->split[i]) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PC_TREE"); + pc_tree->split[i]->index = i; + } + const int orig_rdmult_tmp = x->rdmult; + setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL); + // TODO(chengchen): check boundary conditions + // top-left + recursive_partition(cpi, td, tile_data, tp, sms_root, pc_tree->split[0], + mi_row, mi_col, subsize, &split_rdc[0]); + // top-right + recursive_partition(cpi, td, tile_data, tp, sms_root, pc_tree->split[1], + mi_row, mi_col + mi_size_wide[subsize], subsize, + &split_rdc[1]); 
+      // bottom-left
+      recursive_partition(cpi, td, tile_data, tp, sms_root, pc_tree->split[2],
+                          mi_row + mi_size_high[subsize], mi_col, subsize,
+                          &split_rdc[2]);
+      // bottom-right
+      recursive_partition(cpi, td, tile_data, tp, sms_root, pc_tree->split[3],
+                          mi_row + mi_size_high[subsize],
+                          mi_col + mi_size_wide[subsize], subsize,
+                          &split_rdc[3]);
+      this_rdcost->rate += part_search_state.partition_cost[PARTITION_SPLIT];
+      // Known issue: the rdmult used here differs from the rdmult used in the
+      // sub-blocks, so the accumulated rd cost is not fully consistent.
+      for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+        this_rdcost->rate += split_rdc[i].rate;
+        this_rdcost->dist += split_rdc[i].dist;
+        av1_rd_cost_update(x->rdmult, this_rdcost);
+      }
+      x->rdmult = orig_rdmult_tmp;
+    } else {
+      *this_rdcost = rd_search_for_fixed_partition(
+          cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize, pc_tree);
+    }
+
+    aom_partition_stats_t stats;
+    update_partition_stats(this_rdcost, &stats);
+    av1_ext_part_send_partition_stats(ext_part_controller, &stats);
+    if (!partition_decision.is_final_decision) {
+      if (partition_decision.current_decision == PARTITION_SPLIT) {
+        for (int i = 0; i < 4; ++i) {
+          if (pc_tree->split[i] != NULL) {
+            av1_free_pc_tree_recursive(pc_tree->split[i], av1_num_planes(cm),
+                                       0, 0,
+                                       cpi->sf.part_sf.partition_search_type);
+            pc_tree->split[i] = NULL;
+          }
+        }
+      }
+    }
+  } while (!partition_decision.is_final_decision);
+
+  return true;
+}
+
+// The ML model only needs to make decisions for the current block each time.
+static bool ml_partition_search_partial(AV1_COMP *const cpi, ThreadData *td,
+                                        TileDataEnc *tile_data, TokenExtra **tp,
+                                        SIMPLE_MOTION_DATA_TREE *sms_root,
+                                        int mi_row, int mi_col,
+                                        const BLOCK_SIZE bsize) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+  aom_partition_features_t features;
+  prepare_sb_features_before_search(cpi, td, tile_data, mi_row, mi_col, bsize,
+                                    &features);
+  features.mi_row = mi_row;
+  features.mi_col = mi_col;
+  features.frame_width = cpi->frame_info.frame_width;
+  features.frame_height = cpi->frame_info.frame_height;
+  features.block_size = bsize;
+  av1_ext_part_send_features(ext_part_controller, &features);
+  td->pc_root = av1_alloc_pc_tree_node(bsize);
+  if (!td->pc_root)
+    aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate PC_TREE");
+
+  RD_STATS rdcost;
+  const bool valid_partition =
+      recursive_partition(cpi, td, tile_data, tp, sms_root, td->pc_root, mi_row,
+                          mi_col, bsize, &rdcost);
+  if (!valid_partition) {
+    return false;
+  }
+
+  // Encode with the selected mode and partition.
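+  // (At this point td->pc_root holds the final decision tree from the model;
+  // encode_sb() below consumes it before the tree is freed.)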
+ set_cb_offsets(x->cb_offset, 0, 0); + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize, + td->pc_root, NULL); + av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0, + cpi->sf.part_sf.partition_search_type); + td->pc_root = NULL; + + return true; +} + +bool av1_rd_partition_search(AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, TokenExtra **tp, + SIMPLE_MOTION_DATA_TREE *sms_root, int mi_row, + int mi_col, const BLOCK_SIZE bsize, + RD_STATS *best_rd_cost) { + AV1_COMMON *const cm = &cpi->common; + if (cpi->ext_part_controller.ready) { + bool valid_search = true; + const aom_ext_part_decision_mode_t decision_mode = + av1_get_ext_part_decision_mode(&cpi->ext_part_controller); + if (decision_mode == AOM_EXT_PART_WHOLE_TREE) { + valid_search = ml_partition_search_whole_tree( + cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize); + } else if (decision_mode == AOM_EXT_PART_RECURSIVE) { + valid_search = ml_partition_search_partial( + cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize); + } else { + assert(0 && "Unknown decision mode."); + return false; + } + if (!valid_search) { + aom_internal_error( + cm->error, AOM_CODEC_ERROR, + "Invalid search from ML model, partition search failed"); + } + return true; + } + + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + int best_idx = 0; + int64_t min_rdcost = INT64_MAX; + int num_configs; + int i = 0; + do { + td->pc_root = av1_alloc_pc_tree_node(bsize); + if (!td->pc_root) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PC_TREE"); + num_configs = read_partition_tree(cpi, td->pc_root, xd->error_info, i); + if (num_configs <= 0) { + av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0, + cpi->sf.part_sf.partition_search_type); + td->pc_root = NULL; + aom_internal_error(xd->error_info, AOM_CODEC_ERROR, "Invalid configs."); + } + verify_write_partition_tree(cpi, td->pc_root, bsize, i, mi_row, mi_col); + if (i == 0) { + AOM_CHECK_MEM_ERROR(xd->error_info, x->rdcost, + aom_calloc(num_configs, sizeof(*x->rdcost))); + } + // Encode the block with the given partition tree. Get rdcost and encoding + // time. + x->rdcost[i] = rd_search_for_fixed_partition( + cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize, td->pc_root); + + if (x->rdcost[i].rdcost < min_rdcost) { + min_rdcost = x->rdcost[i].rdcost; + best_idx = i; + *best_rd_cost = x->rdcost[i]; + } + av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0, + cpi->sf.part_sf.partition_search_type); + td->pc_root = NULL; + ++i; + } while (i < num_configs); + + aom_free(x->rdcost); + x->rdcost = NULL; + // Encode with the partition configuration with the smallest rdcost. 
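+  // (Each candidate tree was freed inside the loop above to bound memory use,
+  // so the best configuration has to be re-read and re-searched here.)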
+  td->pc_root = av1_alloc_pc_tree_node(bsize);
+  if (!td->pc_root)
+    aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate PC_TREE");
+  read_partition_tree(cpi, td->pc_root, xd->error_info, best_idx);
+  rd_search_for_fixed_partition(cpi, td, tile_data, tp, sms_root, mi_row,
+                                mi_col, bsize, td->pc_root);
+  set_cb_offsets(x->cb_offset, 0, 0);
+  encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+            td->pc_root, NULL);
+  av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
+                             cpi->sf.part_sf.partition_search_type);
+  td->pc_root = NULL;
+  ++cpi->sb_counter;
+
+  return true;
+}
+#endif  // CONFIG_PARTITION_SEARCH_ORDER
+
+static AOM_INLINE bool should_do_dry_run_encode_for_current_block(
+    BLOCK_SIZE sb_size, BLOCK_SIZE max_partition_size, int curr_block_index,
+    BLOCK_SIZE bsize) {
+  if (bsize > max_partition_size) return false;
+
+  // Enable the reconstruction with dry-run for the 4th sub-block only if its
+  // parent block's reconstruction with dry-run is skipped. If
+  // max_partition_size equals the size of an immediate split of the
+  // superblock, then avoid reconstruction of the 4th sub-block, as this data
+  // is not consumed.
+  if (curr_block_index != 3) return true;
+
+  const BLOCK_SIZE sub_sb_size =
+      get_partition_subsize(sb_size, PARTITION_SPLIT);
+  return bsize == max_partition_size && sub_sb_size != max_partition_size;
+}
+
+static void log_sub_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs,
+                              double *var_min, double *var_max) {
+  // This function returns the minimum and maximum log variances of the 4x4
+  // sub-blocks in the current block.
+
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const int is_hbd = is_cur_buf_hbd(xd);
+  const int right_overflow =
+      (xd->mb_to_right_edge < 0) ? ((-xd->mb_to_right_edge) >> 3) : 0;
+  const int bottom_overflow =
+      (xd->mb_to_bottom_edge < 0) ? ((-xd->mb_to_bottom_edge) >> 3) : 0;
+  const int bw = MI_SIZE * mi_size_wide[bs] - right_overflow;
+  const int bh = MI_SIZE * mi_size_high[bs] - bottom_overflow;
+
+  // Initialize minimum variance to a large value and maximum variance to 0.
+  double min_var_4x4 = (double)INT_MAX;
+  double max_var_4x4 = 0.0;
+
+  for (int i = 0; i < bh; i += MI_SIZE) {
+    for (int j = 0; j < bw; j += MI_SIZE) {
+      int var;
+      // Calculate the 4x4 sub-block variance.
+      var = av1_calc_normalized_variance(
+          cpi->ppi->fn_ptr[BLOCK_4X4].vf,
+          x->plane[0].src.buf + (i * x->plane[0].src.stride) + j,
+          x->plane[0].src.stride, is_hbd);
+
+      // Record min and max for the over-arching block.
+      min_var_4x4 = AOMMIN(min_var_4x4, var);
+      max_var_4x4 = AOMMAX(max_var_4x4, var);
+    }
+  }
+  *var_min = log1p(min_var_4x4 / 16.0);
+  *var_max = log1p(max_var_4x4 / 16.0);
+}
+
+static AOM_INLINE void set_sms_tree_partitioning(
+    SIMPLE_MOTION_DATA_TREE *sms_tree, PARTITION_TYPE partition) {
+  if (sms_tree == NULL) return;
+  sms_tree->partitioning = partition;
+}
+
+/*!\brief AV1 block partition search (full search).
+*
+* \ingroup partition_search
+* \callgraph
+* Searches for the best partition pattern for a block based on the
+* rate-distortion cost, and returns a bool value to indicate whether a valid
+* partition pattern is found. The partition can recursively go down to the
+* smallest block size.
+*
+* \param[in]    cpi                Top-level encoder structure
+* \param[in]    td                 Pointer to thread data
+* \param[in]    tile_data          Pointer to struct holding adaptive
+*                                  data/contexts/models for the tile during
+*                                  encoding
+* \param[in]    tp                 Pointer to the starting token
+* \param[in]    mi_row             Row coordinate of the block in a step size
+*                                  of MI_SIZE
+* \param[in]    mi_col             Column coordinate of the block in a step
+*                                  size of MI_SIZE
+* \param[in]    bsize              Current block size
+* \param[in]    rd_cost            Pointer to the final rd cost of the block
+* \param[in]    best_rdc           Upper bound of rd cost of a valid partition
+* \param[in]    pc_tree            Pointer to the PC_TREE node storing the
+*                                  picked partitions and mode info for the
+*                                  current block
+* \param[in]    sms_tree           Pointer to struct holding simple motion
+*                                  search data for the current block
+* \param[in]    none_rd            Pointer to the rd cost in the case of not
+*                                  splitting the current block
+* \param[in]    multi_pass_mode    SB_SINGLE_PASS/SB_DRY_PASS/SB_WET_PASS
+* \param[in]    rect_part_win_info Pointer to struct storing whether horz/vert
+*                                  partition outperforms previously tested
+*                                  partitions
+*
+* \return A bool value is returned indicating if a valid partition is found.
+* The pc_tree struct is modified to store the picked partition and modes.
+* The rd_cost struct is also updated with the RD stats corresponding to the
+* best partition found.
+*/
+bool av1_rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,
+                           TileDataEnc *tile_data, TokenExtra **tp, int mi_row,
+                           int mi_col, BLOCK_SIZE bsize, RD_STATS *rd_cost,
+                           RD_STATS best_rdc, PC_TREE *pc_tree,
+                           SIMPLE_MOTION_DATA_TREE *sms_tree, int64_t *none_rd,
+                           SB_MULTI_PASS_MODE multi_pass_mode,
+                           RD_RECT_PART_WIN_INFO *rect_part_win_info) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  TileInfo *const tile_info = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+  const TokenExtra *const tp_orig = *tp;
+  PartitionSearchState part_search_state;
+
+  // Initialization of state variables used in partition search.
+  init_partition_search_state_params(x, cpi, &part_search_state, mi_row,
+                                     mi_col, bsize);
+  PartitionBlkParams blk_params = part_search_state.part_blk_params;
+
+  set_sms_tree_partitioning(sms_tree, PARTITION_NONE);
+  if (best_rdc.rdcost < 0) {
+    av1_invalid_rd_stats(rd_cost);
+    return part_search_state.found_best_partition;
+  }
+  if (bsize == cm->seq_params->sb_size) x->must_find_valid_partition = 0;
+
+  // Override skipping rectangular partition operations for edge blocks.
+  if (none_rd) *none_rd = 0;
+  (void)*tp_orig;
+
+#if CONFIG_COLLECT_PARTITION_STATS
+  // Stats at the current quad tree
+  PartitionTimingStats *part_timing_stats =
+      &part_search_state.part_timing_stats;
+  // Stats aggregated at frame level
+  FramePartitionTimingStats *fr_part_timing_stats = &cpi->partition_stats;
+#endif  // CONFIG_COLLECT_PARTITION_STATS
+
+  // Override partition costs at the edges of the frame in the same
+  // way as in read_partition (see decodeframe.c).
+  if (!av1_blk_has_rows_and_cols(&blk_params))
+    set_partition_cost_for_edge_blk(cm, &part_search_state);
+
+  // Disable rectangular partitions for inner blocks when the current block is
+  // forced to only use square partitions.
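+  // (At frame edges, has_rows/has_cols is false and the corresponding
+  // rectangular partition stays available, since a forced split may still be
+  // needed to cover the partial block.)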
+  if (bsize > cpi->sf.part_sf.use_square_partition_only_threshold) {
+    part_search_state.partition_rect_allowed[HORZ] &= !blk_params.has_rows;
+    part_search_state.partition_rect_allowed[VERT] &= !blk_params.has_cols;
+  }
+
+#ifndef NDEBUG
+  // Nothing should rely on the default value of this array (which is just
+  // leftover from encoding the previous block). Set it to a fixed pattern
+  // when debugging.
+  // Bits 0, 1, 2 are the blk_skip flags of each plane.
+  // Bits 4, 5, 6 are the initialization checks of each plane.
+  memset(x->txfm_search_info.blk_skip, 0x77,
+         sizeof(x->txfm_search_info.blk_skip));
+#endif  // NDEBUG
+
+  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+
+  // Set buffers and offsets.
+  av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+
+  if (cpi->oxcf.mode == ALLINTRA) {
+    if (bsize == cm->seq_params->sb_size) {
+      double var_min, var_max;
+      log_sub_block_var(cpi, x, bsize, &var_min, &var_max);
+
+      x->intra_sb_rdmult_modifier = 128;
+      if ((var_min < 2.0) && (var_max > 4.0)) {
+        if ((var_max - var_min) > 8.0) {
+          x->intra_sb_rdmult_modifier -= 48;
+        } else {
+          x->intra_sb_rdmult_modifier -= (int)((var_max - var_min) * 6);
+        }
+      }
+    }
+  }
+
+  // Save rdmult before it might be changed, so it can be restored later.
+  const int orig_rdmult = x->rdmult;
+  setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+
+  // Apply simple motion search for the entire super block with fixed block
+  // size, e.g., 16x16, to collect features and write to files for the
+  // external ML model.
+  // TODO(chengchen): reduce motion search. This function is similar to
+  // av1_get_max_min_partition_features().
+  if (COLLECT_MOTION_SEARCH_FEATURE_SB && !frame_is_intra_only(cm) &&
+      bsize == cm->seq_params->sb_size) {
+    av1_collect_motion_search_features_sb(cpi, td, tile_data, mi_row, mi_col,
+                                          bsize, /*features=*/NULL);
+    collect_tpl_stats_sb(cpi, bsize, mi_row, mi_col, /*features=*/NULL);
+  }
+
+  // Update rd cost of the bound using the current multiplier.
+  av1_rd_cost_update(x->rdmult, &best_rdc);
+
+  if (bsize == BLOCK_16X16 && cpi->vaq_refresh)
+    x->mb_energy = av1_log_block_var(cpi, x, bsize);
+
+  // Set the context.
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+  av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, av1_prune_partitions_time);
+#endif
+  // Pruning: before searching any partition type, use the source and simple
+  // motion search results to prune out unlikely partitions.
+  av1_prune_partitions_before_search(cpi, x, sms_tree, &part_search_state);
+
+  // Pruning: eliminate partition types leading to coding block sizes outside
+  // the min and max bsize limitations set by the encoder.
+  av1_prune_partitions_by_max_min_bsize(&x->sb_enc, &part_search_state);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, av1_prune_partitions_time);
+#endif
+
+  // Partition search
+BEGIN_PARTITION_SEARCH:
+  // If a valid partition is required, usually when the first round cannot find
+  // a valid one under the cost limit after pruning, reset the limitations on
+  // partition types and intra cnn output.
+  if (x->must_find_valid_partition) {
+    reset_part_limitations(cpi, &part_search_state);
+    av1_prune_partitions_by_max_min_bsize(&x->sb_enc, &part_search_state);
+    // Invalidate intra cnn output for key frames.
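+    // (Clearing cnn_output_valid forces the 64x64 CNN features to be
+    // recomputed, so that pruning decisions from the failed round do not
+    // carry over into the retry.)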
+    if (frame_is_intra_only(cm) && bsize == BLOCK_64X64) {
+      part_search_state.intra_part_info->quad_tree_idx = 0;
+      part_search_state.intra_part_info->cnn_output_valid = 0;
+    }
+  }
+  // Partition block source pixel variance.
+  unsigned int pb_source_variance = UINT_MAX;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, none_partition_search_time);
+#endif
+
+  if (cpi->oxcf.mode == ALLINTRA) {
+    const bool bsize_at_least_16x16 = (bsize >= BLOCK_16X16);
+    const bool prune_rect_part_using_4x4_var_deviation =
+        (cpi->sf.part_sf.prune_rect_part_using_4x4_var_deviation &&
+         !x->must_find_valid_partition);
+
+    if (bsize_at_least_16x16 || prune_rect_part_using_4x4_var_deviation) {
+      double var_min, var_max;
+      log_sub_block_var(cpi, x, bsize, &var_min, &var_max);
+
+      // Further pruning, or in some cases reverse pruning, when allintra is
+      // set. This code improves visual quality (and in some cases metrics)
+      // where the current block comprises at least one very low variance
+      // sub-block and at least one where the variance is much higher.
+      //
+      // The idea is that in such cases there is a danger of ringing and other
+      // visual artifacts from a high variance feature such as an edge into a
+      // very low variance region.
+      //
+      // The approach taken is to force a break down / split to a smaller
+      // block size to try and separate out the low variance and well
+      // predicted blocks from the more complex ones and to prevent
+      // propagation of ringing over a large region.
+      if (bsize_at_least_16x16 && (var_min < 0.272) &&
+          ((var_max - var_min) > 3.0)) {
+        part_search_state.partition_none_allowed = 0;
+        part_search_state.terminate_partition_search = 0;
+        part_search_state.do_square_split = 1;
+      } else if (prune_rect_part_using_4x4_var_deviation &&
+                 (var_max - var_min < 3.0)) {
+        // Prune rectangular partitions if the variance deviation of 4x4
+        // sub-blocks within the block is less than a threshold (derived
+        // empirically).
+        part_search_state.do_rectangular_split = 0;
+      }
+    }
+  }
+
+  // PARTITION_NONE search stage.
+  int64_t part_none_rd = INT64_MAX;
+  none_partition_search(cpi, td, tile_data, x, pc_tree, sms_tree, &x_ctx,
+                        &part_search_state, &best_rdc, &pb_source_variance,
+                        none_rd, &part_none_rd);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, none_partition_search_time);
+#endif
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, split_partition_search_time);
+#endif
+  // PARTITION_SPLIT search stage.
+  int64_t part_split_rd = INT64_MAX;
+  split_partition_search(cpi, td, tile_data, tp, x, pc_tree, sms_tree, &x_ctx,
+                         &part_search_state, &best_rdc, multi_pass_mode,
+                         &part_split_rd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, split_partition_search_time);
+#endif
+  // Terminate the partition search for child partitions when both the NONE
+  // and SPLIT partition rd costs are INT64_MAX.
+  if (cpi->sf.part_sf.early_term_after_none_split &&
+      part_none_rd == INT64_MAX && part_split_rd == INT64_MAX &&
+      !x->must_find_valid_partition && (bsize != cm->seq_params->sb_size)) {
+    part_search_state.terminate_partition_search = 1;
+  }
+
+  // Do not evaluate non-square partitions if the NONE partition did not
+  // choose a newmv mode and is skippable.
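+  // (Such blocks are already well predicted at low cost, so rectangular
+  // refinements are unlikely to improve the rd cost.)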
+ if ((cpi->sf.part_sf.skip_non_sq_part_based_on_none >= 2) && + (pc_tree->none != NULL)) { + if (x->qindex <= 200 && is_inter_mode(pc_tree->none->mic.mode) && + !have_newmv_in_inter_mode(pc_tree->none->mic.mode) && + pc_tree->none->skippable && !x->must_find_valid_partition && + bsize >= BLOCK_16X16) + part_search_state.do_rectangular_split = 0; + } + + // Prune partitions based on PARTITION_NONE and PARTITION_SPLIT. + prune_partitions_after_split(cpi, x, sms_tree, &part_search_state, &best_rdc, + part_none_rd, part_split_rd); +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, rectangular_partition_search_time); +#endif + // Rectangular partitions search stage. + rectangular_partition_search(cpi, td, tile_data, tp, x, pc_tree, &x_ctx, + &part_search_state, &best_rdc, + rect_part_win_info, HORZ, VERT); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, rectangular_partition_search_time); +#endif + + if (pb_source_variance == UINT_MAX) { + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize); + pb_source_variance = av1_get_perpixel_variance_facade( + cpi, xd, &x->plane[0].src, bsize, AOM_PLANE_Y); + } + + assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions, + !part_search_state.do_rectangular_split)); + + const int prune_ext_part_state = prune_ext_part_none_skippable( + pc_tree->none, x->must_find_valid_partition, + cpi->sf.part_sf.skip_non_sq_part_based_on_none, bsize); + + const int ab_partition_allowed = allow_ab_partition_search( + &part_search_state, &cpi->sf.part_sf, pc_tree->partitioning, + x->must_find_valid_partition, prune_ext_part_state, best_rdc.rdcost); + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, ab_partitions_search_time); +#endif + // AB partitions search stage. + ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, + &part_search_state, &best_rdc, rect_part_win_info, + pb_source_variance, ab_partition_allowed, HORZ_A, + VERT_B); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, ab_partitions_search_time); +#endif + + // 4-way partitions search stage. + int part4_search_allowed[NUM_PART4_TYPES] = { 1, 1 }; + // Prune 4-way partition search. + prune_4_way_partition_search(cpi, x, pc_tree, &part_search_state, &best_rdc, + pb_source_variance, prune_ext_part_state, + part4_search_allowed); + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, rd_pick_4partition_time); +#endif + // PARTITION_HORZ_4 + assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions, + !part4_search_allowed[HORZ4])); + if (!part_search_state.terminate_partition_search && + part4_search_allowed[HORZ4]) { + const int inc_step[NUM_PART4_TYPES] = { mi_size_high[blk_params.bsize] / 4, + 0 }; + // Evaluation of Horz4 partition type. + rd_pick_4partition(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, + pc_tree->horizontal4, &part_search_state, &best_rdc, + inc_step, PARTITION_HORZ_4); + } + + // PARTITION_VERT_4 + assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions, + !part4_search_allowed[VERT4])); + if (!part_search_state.terminate_partition_search && + part4_search_allowed[VERT4] && blk_params.has_cols) { + const int inc_step[NUM_PART4_TYPES] = { 0, mi_size_wide[blk_params.bsize] / + 4 }; + // Evaluation of Vert4 partition type. 
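+    // (inc_step advances mi_col by a quarter of the block width for each of
+    // the four vertical sub-blocks; the HORZ4 case above steps mi_row
+    // instead.)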
+    rd_pick_4partition(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+                       pc_tree->vertical4, &part_search_state, &best_rdc,
+                       inc_step, PARTITION_VERT_4);
+  }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, rd_pick_4partition_time);
+#endif
+
+  if (bsize == cm->seq_params->sb_size &&
+      !part_search_state.found_best_partition) {
+    // Did not find a valid partition, go back and search again, with less
+    // constraint on which partition types to search.
+    x->must_find_valid_partition = 1;
+#if CONFIG_COLLECT_PARTITION_STATS
+    fr_part_timing_stats->partition_redo += 1;
+#endif  // CONFIG_COLLECT_PARTITION_STATS
+    goto BEGIN_PARTITION_SEARCH;
+  }
+
+  // Store the final rd cost
+  *rd_cost = best_rdc;
+
+  // Also record the best partition in the simple motion data tree because it
+  // is necessary for the related speed features.
+  set_sms_tree_partitioning(sms_tree, pc_tree->partitioning);
+
+#if CONFIG_COLLECT_PARTITION_STATS
+  if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX) {
+    part_timing_stats->partition_decisions[pc_tree->partitioning] += 1;
+  }
+
+  // If CONFIG_COLLECT_PARTITION_STATS is 1, then print out the stats for each
+  // prediction block.
+  print_partition_timing_stats_with_rdcost(
+      part_timing_stats, mi_row, mi_col, bsize,
+      cpi->ppi->gf_group.update_type[cpi->gf_frame_index],
+      cm->current_frame.frame_number, &best_rdc, "part_timing.csv");
+  const bool print_timing_stats = false;
+  if (print_timing_stats) {
+    print_partition_timing_stats(part_timing_stats, cm->show_frame,
+                                 frame_is_intra_only(cm), bsize,
+                                 "part_timing_data.csv");
+  }
+  // If CONFIG_COLLECT_PARTITION_STATS is 2, then print out the stats for the
+  // whole clip, so the information needs to be passed upstream to the encoder.
+  accumulate_partition_timing_stats(fr_part_timing_stats, part_timing_stats,
+                                    bsize);
+#endif  // CONFIG_COLLECT_PARTITION_STATS
+
+  // Reset the PC_TREE deallocation flag.
+  int pc_tree_dealloc = 0;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, encode_sb_time);
+#endif
+  if (part_search_state.found_best_partition) {
+    if (bsize == cm->seq_params->sb_size) {
+      // Encode the superblock.
+      const int emit_output = multi_pass_mode != SB_DRY_PASS;
+      const RUN_TYPE run_type = emit_output ? OUTPUT_ENABLED : DRY_RUN_NORMAL;
+
+      // Write the partition tree to file. Not used by default.
+      if (COLLECT_MOTION_SEARCH_FEATURE_SB) {
+        write_partition_tree(cpi, pc_tree, bsize, mi_row, mi_col);
+        ++cpi->sb_counter;
+      }
+
+      set_cb_offsets(x->cb_offset, 0, 0);
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, run_type, bsize,
+                pc_tree, NULL);
+      assert(pc_tree == td->pc_root);
+      // Dealloc the whole PC_TREE after a superblock is done.
+      av1_free_pc_tree_recursive(pc_tree, num_planes, 0, 0,
+                                 cpi->sf.part_sf.partition_search_type);
+      pc_tree = NULL;
+      td->pc_root = NULL;
+      pc_tree_dealloc = 1;
+    } else if (should_do_dry_run_encode_for_current_block(
+                   cm->seq_params->sb_size, x->sb_enc.max_partition_size,
+                   pc_tree->index, bsize)) {
+      // Encode the smaller blocks in DRY_RUN mode.
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+                pc_tree, NULL);
+    }
+  }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, encode_sb_time);
+#endif
+
+  // If the tree still exists (non-superblock), dealloc most nodes; only keep
+  // nodes for the best partition and PARTITION_NONE.
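+  // (The parent search may still reference the kept contexts when assembling
+  // its own best partition, so they cannot be freed yet.)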
+ if (pc_tree_dealloc == 0) + av1_free_pc_tree_recursive(pc_tree, num_planes, 1, 1, + cpi->sf.part_sf.partition_search_type); + + if (bsize == cm->seq_params->sb_size) { + assert(best_rdc.rate < INT_MAX); + assert(best_rdc.dist < INT64_MAX); + } else { + assert(tp_orig == *tp); + } + + // Restore the rd multiplier. + x->rdmult = orig_rdmult; + return part_search_state.found_best_partition; +} +#endif // !CONFIG_REALTIME_ONLY + +#undef COLLECT_MOTION_SEARCH_FEATURE_SB + +#if CONFIG_RT_ML_PARTITIONING +#define FEATURES 6 +#define LABELS 2 +static int ml_predict_var_partitioning(AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_row, + int mi_col) { + AV1_COMMON *const cm = &cpi->common; + const NN_CONFIG *nn_config = NULL; + const float *means = NULL; + const float *vars = NULL; + switch (bsize) { + case BLOCK_64X64: + nn_config = &av1_var_part_nnconfig_64; + means = av1_var_part_means_64; + vars = av1_var_part_vars_64; + break; + case BLOCK_32X32: + nn_config = &av1_var_part_nnconfig_32; + means = av1_var_part_means_32; + vars = av1_var_part_vars_32; + break; + case BLOCK_16X16: + nn_config = &av1_var_part_nnconfig_16; + means = av1_var_part_means_16; + vars = av1_var_part_vars_16; + break; + case BLOCK_8X8: + default: assert(0 && "Unexpected block size."); return -1; + } + + if (!nn_config) return -1; + + { + const float thresh = cpi->oxcf.speed <= 5 ? 1.25f : 0.0f; + float features[FEATURES] = { 0.0f }; + const int dc_q = av1_dc_quant_QTX(cm->quant_params.base_qindex, 0, + cm->seq_params->bit_depth); + int feature_idx = 0; + float score[LABELS]; + + features[feature_idx] = + (log1pf((float)(dc_q * dc_q) / 256.0f) - means[feature_idx]) / + sqrtf(vars[feature_idx]); + feature_idx++; + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, 1, bsize); + { + const int bs = block_size_wide[bsize]; + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + const int sb_offset_row = 4 * (mi_row & 15); + const int sb_offset_col = 4 * (mi_col & 15); + const uint8_t *pred = x->est_pred + sb_offset_row * 64 + sb_offset_col; + const uint8_t *src = x->plane[0].src.buf; + const int src_stride = x->plane[0].src.stride; + const int pred_stride = 64; + unsigned int sse; + int i; + // Variance of whole block. + const unsigned int var = + cpi->ppi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse); + const float factor = (var == 0) ? 1.0f : (1.0f / (float)var); + + features[feature_idx] = + (log1pf((float)var) - means[feature_idx]) / sqrtf(vars[feature_idx]); + feature_idx++; + for (i = 0; i < 4; ++i) { + const int x_idx = (i & 1) * bs / 2; + const int y_idx = (i >> 1) * bs / 2; + const int src_offset = y_idx * src_stride + x_idx; + const int pred_offset = y_idx * pred_stride + x_idx; + // Variance of quarter block. + const unsigned int sub_var = + cpi->ppi->fn_ptr[subsize].vf(src + src_offset, src_stride, + pred + pred_offset, pred_stride, &sse); + const float var_ratio = (var == 0) ? 
1.0f : factor * (float)sub_var;
+        features[feature_idx] =
+            (var_ratio - means[feature_idx]) / sqrtf(vars[feature_idx]);
+        feature_idx++;
+      }
+    }
+    // for (int i = 0; i < FEATURES; ++i) printf("feature[%d]: %f\n", i,
+    // features[i]);
+    av1_nn_predict(features, nn_config, 1, score);
+    if (score[0] > thresh) return PARTITION_SPLIT;
+    if (score[0] < -thresh) return PARTITION_NONE;
+    return -1;
+  }
+}
+#undef FEATURES
+#undef LABELS
+
+// Uncomment for collecting data for ML-based partitioning
+// #define _COLLECT_GROUND_TRUTH_
+
+#ifdef _COLLECT_GROUND_TRUTH_
+static int store_partition_data(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+                                int mi_row, int mi_col, PARTITION_TYPE part) {
+  AV1_COMMON *const cm = &cpi->common;
+  char fname[128];
+  switch (bsize) {
+    case BLOCK_64X64: sprintf(fname, "data_64x64.txt"); break;
+    case BLOCK_32X32: sprintf(fname, "data_32x32.txt"); break;
+    case BLOCK_16X16: sprintf(fname, "data_16x16.txt"); break;
+    case BLOCK_8X8: sprintf(fname, "data_8x8.txt"); break;
+    default: assert(0 && "Unexpected block size."); return -1;
+  }
+
+  float features[6];  // DC_Q, VAR, VAR_RATIO-0..3
+
+  FILE *f = fopen(fname, "a");
+
+  {
+    const int dc_q = av1_dc_quant_QTX(cm->quant_params.base_qindex, 0,
+                                      cm->seq_params->bit_depth);
+    int feature_idx = 0;
+
+    features[feature_idx++] = log1pf((float)(dc_q * dc_q) / 256.0f);
+    av1_setup_src_planes(x, cpi->source, mi_row, mi_col, 1, bsize);
+    {
+      const int bs = block_size_wide[bsize];
+      const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+      const int sb_offset_row = 4 * (mi_row & 15);
+      const int sb_offset_col = 4 * (mi_col & 15);
+      const uint8_t *pred = x->est_pred + sb_offset_row * 64 + sb_offset_col;
+      const uint8_t *src = x->plane[0].src.buf;
+      const int src_stride = x->plane[0].src.stride;
+      const int pred_stride = 64;
+      unsigned int sse;
+      int i;
+      // Variance of whole block.
+      /*
+      if (bs == 8)
+      {
+        int r, c;
+        printf("%d %d\n", mi_row, mi_col);
+        for (r = 0; r < bs; ++r) {
+          for (c = 0; c < bs; ++c) {
+            printf("%3d ",
+                   src[r * src_stride + c] - pred[64 * r + c]);
+          }
+          printf("\n");
+        }
+        printf("\n");
+      }
+      */
+      const unsigned int var =
+          cpi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse);
+      const float factor = (var == 0) ? 1.0f : (1.0f / (float)var);
+
+      features[feature_idx++] = log1pf((float)var);
+
+      fprintf(f, "%f,%f,", features[0], features[1]);
+      for (i = 0; i < 4; ++i) {
+        const int x_idx = (i & 1) * bs / 2;
+        const int y_idx = (i >> 1) * bs / 2;
+        const int src_offset = y_idx * src_stride + x_idx;
+        const int pred_offset = y_idx * pred_stride + x_idx;
+        // Variance of quarter block.
+        const unsigned int sub_var =
+            cpi->fn_ptr[subsize].vf(src + src_offset, src_stride,
+                                    pred + pred_offset, pred_stride, &sse);
+        const float var_ratio = (var == 0) ? 1.0f : factor * (float)sub_var;
+        features[feature_idx++] = var_ratio;
+        fprintf(f, "%f,", var_ratio);
+      }
+
+      fprintf(f, "%d\n", part == PARTITION_NONE ?
0 : 1); + } + + fclose(f); + return -1; + } +} +#endif + +static void duplicate_mode_info_in_sb(AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, + BLOCK_SIZE bsize) { + const int block_width = + AOMMIN(mi_size_wide[bsize], cm->mi_params.mi_cols - mi_col); + const int block_height = + AOMMIN(mi_size_high[bsize], cm->mi_params.mi_rows - mi_row); + const int mi_stride = xd->mi_stride; + MB_MODE_INFO *const src_mi = xd->mi[0]; + int i, j; + + for (j = 0; j < block_height; ++j) + for (i = 0; i < block_width; ++i) xd->mi[j * mi_stride + i] = src_mi; +} + +static INLINE void copy_mbmi_ext_frame_to_mbmi_ext( + MB_MODE_INFO_EXT *const mbmi_ext, + const MB_MODE_INFO_EXT_FRAME *mbmi_ext_best, uint8_t ref_frame_type) { + memcpy(mbmi_ext->ref_mv_stack[ref_frame_type], mbmi_ext_best->ref_mv_stack, + sizeof(mbmi_ext->ref_mv_stack[USABLE_REF_MV_STACK_SIZE])); + memcpy(mbmi_ext->weight[ref_frame_type], mbmi_ext_best->weight, + sizeof(mbmi_ext->weight[USABLE_REF_MV_STACK_SIZE])); + mbmi_ext->mode_context[ref_frame_type] = mbmi_ext_best->mode_context; + mbmi_ext->ref_mv_count[ref_frame_type] = mbmi_ext_best->ref_mv_count; + memcpy(mbmi_ext->global_mvs, mbmi_ext_best->global_mvs, + sizeof(mbmi_ext->global_mvs)); +} + +static void fill_mode_info_sb(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, + int mi_col, BLOCK_SIZE bsize, PC_TREE *pc_tree) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + int hbs = mi_size_wide[bsize] >> 1; + PARTITION_TYPE partition = pc_tree->partitioning; + BLOCK_SIZE subsize = get_partition_subsize(bsize, partition); + + assert(bsize >= BLOCK_8X8); + + if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols) + return; + + switch (partition) { + case PARTITION_NONE: + set_mode_info_offsets(&cm->mi_params, &cpi->mbmi_ext_info, x, xd, mi_row, + mi_col); + *(xd->mi[0]) = pc_tree->none->mic; + copy_mbmi_ext_frame_to_mbmi_ext( + &x->mbmi_ext, &pc_tree->none->mbmi_ext_best, LAST_FRAME); + duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize); + break; + case PARTITION_SPLIT: { + fill_mode_info_sb(cpi, x, mi_row, mi_col, subsize, pc_tree->split[0]); + fill_mode_info_sb(cpi, x, mi_row, mi_col + hbs, subsize, + pc_tree->split[1]); + fill_mode_info_sb(cpi, x, mi_row + hbs, mi_col, subsize, + pc_tree->split[2]); + fill_mode_info_sb(cpi, x, mi_row + hbs, mi_col + hbs, subsize, + pc_tree->split[3]); + break; + } + default: break; + } +} + +void av1_nonrd_pick_partition(AV1_COMP *cpi, ThreadData *td, + TileDataEnc *tile_data, TokenExtra **tp, + int mi_row, int mi_col, BLOCK_SIZE bsize, + RD_STATS *rd_cost, int do_recon, int64_t best_rd, + PC_TREE *pc_tree) { + AV1_COMMON *const cm = &cpi->common; + TileInfo *const tile_info = &tile_data->tile_info; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const int hbs = mi_size_wide[bsize] >> 1; + TokenExtra *tp_orig = *tp; + const ModeCosts *mode_costs = &x->mode_costs; + RD_STATS this_rdc, best_rdc; + RD_SEARCH_MACROBLOCK_CONTEXT x_ctx; + int do_split = bsize > BLOCK_8X8; + // Override skipping rectangular partition operations for edge blocks + const int force_horz_split = (mi_row + 2 * hbs > cm->mi_params.mi_rows); + const int force_vert_split = (mi_col + 2 * hbs > cm->mi_params.mi_cols); + + int partition_none_allowed = !force_horz_split && !force_vert_split; + + assert(mi_size_wide[bsize] == mi_size_high[bsize]); // Square partition only + assert(cm->seq_params->sb_size == BLOCK_64X64); // Small SB so far + + (void)*tp_orig; + + av1_invalid_rd_stats(&best_rdc); + best_rdc.rdcost = 
best_rd; +#ifndef _COLLECT_GROUND_TRUTH_ + if (partition_none_allowed && do_split) { + const int ml_predicted_partition = + ml_predict_var_partitioning(cpi, x, bsize, mi_row, mi_col); + if (ml_predicted_partition == PARTITION_NONE) do_split = 0; + if (ml_predicted_partition == PARTITION_SPLIT) partition_none_allowed = 0; + } +#endif + + xd->above_txfm_context = + cm->above_contexts.txfm[tile_info->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, 3); + + // PARTITION_NONE + if (partition_none_allowed) { + pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf); + if (!pc_tree->none) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PICK_MODE_CONTEXT"); + PICK_MODE_CONTEXT *ctx = pc_tree->none; + +// Flip for RDO based pick mode +#if 0 + RD_STATS dummy; + av1_invalid_rd_stats(&dummy); + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, + PARTITION_NONE, bsize, ctx, dummy); +#else + pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &this_rdc, bsize, + ctx); +#endif + if (this_rdc.rate != INT_MAX) { + const int pl = partition_plane_context(xd, mi_row, mi_col, bsize); + + this_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE]; + this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist); + if (this_rdc.rdcost < best_rdc.rdcost) { + best_rdc = this_rdc; + if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE; + } + } + } + + // PARTITION_SPLIT + if (do_split) { + RD_STATS sum_rdc; + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + + av1_init_rd_stats(&sum_rdc); + + for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) { + pc_tree->split[i] = av1_alloc_pc_tree_node(subsize); + if (!pc_tree->split[i]) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PC_TREE"); + pc_tree->split[i]->index = i; + } + + int pl = partition_plane_context(xd, mi_row, mi_col, bsize); + sum_rdc.rate += mode_costs->partition_cost[pl][PARTITION_SPLIT]; + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); + for (int i = 0; + i < SUB_PARTITIONS_SPLIT && sum_rdc.rdcost < best_rdc.rdcost; ++i) { + const int x_idx = (i & 1) * hbs; + const int y_idx = (i >> 1) * hbs; + + if (mi_row + y_idx >= cm->mi_params.mi_rows || + mi_col + x_idx >= cm->mi_params.mi_cols) + continue; + av1_nonrd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx, + mi_col + x_idx, subsize, &this_rdc, i < 3, + best_rdc.rdcost - sum_rdc.rdcost, + pc_tree->split[i]); + + if (this_rdc.rate == INT_MAX) { + av1_invalid_rd_stats(&sum_rdc); + } else { + sum_rdc.rate += this_rdc.rate; + sum_rdc.dist += this_rdc.dist; + sum_rdc.rdcost += this_rdc.rdcost; + } + } + if (sum_rdc.rdcost < best_rdc.rdcost) { + best_rdc = sum_rdc; + pc_tree->partitioning = PARTITION_SPLIT; + } + } + +#ifdef _COLLECT_GROUND_TRUTH_ + store_partition_data(cpi, x, bsize, mi_row, mi_col, pc_tree->partitioning); +#endif + + *rd_cost = best_rdc; + + av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3); + + if (best_rdc.rate == INT_MAX) { + av1_invalid_rd_stats(rd_cost); + return; + } + + // update mode info array + fill_mode_info_sb(cpi, x, mi_row, mi_col, bsize, pc_tree); + + if (do_recon) { + if (bsize == cm->seq_params->sb_size) { + // NOTE: To get estimate for rate due to the tokens, use: + // int rate_coeffs = 0; + // encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS, + // bsize, pc_tree, &rate_coeffs); + 
set_cb_offsets(x->cb_offset, 0, 0); + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize, + pc_tree, NULL); + } else { + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, + pc_tree, NULL); + } + } + + if (bsize == BLOCK_64X64 && do_recon) { + assert(best_rdc.rate < INT_MAX); + assert(best_rdc.dist < INT64_MAX); + } else { + assert(tp_orig == *tp); + } +} +#endif // CONFIG_RT_ML_PARTITIONING diff --git a/third_party/aom/av1/encoder/partition_search.h b/third_party/aom/av1/encoder/partition_search.h new file mode 100644 index 0000000000..1b5d71b7da --- /dev/null +++ b/third_party/aom/av1/encoder/partition_search.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_PARTITION_SEARCH_H_ +#define AOM_AV1_ENCODER_PARTITION_SEARCH_H_ + +#include "av1/encoder/block.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/encodeframe.h" +#include "av1/encoder/tokenize.h" + +void av1_set_offsets_without_segment_id(const AV1_COMP *const cpi, + const TileInfo *const tile, + MACROBLOCK *const x, int mi_row, + int mi_col, BLOCK_SIZE bsize); +void av1_set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile, + MACROBLOCK *const x, int mi_row, int mi_col, + BLOCK_SIZE bsize); +void av1_rd_use_partition(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, + MB_MODE_INFO **mib, TokenExtra **tp, int mi_row, + int mi_col, BLOCK_SIZE bsize, int *rate, + int64_t *dist, int do_recon, PC_TREE *pc_tree); +void av1_nonrd_use_partition(AV1_COMP *cpi, ThreadData *td, + TileDataEnc *tile_data, MB_MODE_INFO **mib, + TokenExtra **tp, int mi_row, int mi_col, + BLOCK_SIZE bsize, PC_TREE *pc_tree); +#if CONFIG_RT_ML_PARTITIONING +void av1_nonrd_pick_partition(AV1_COMP *cpi, ThreadData *td, + TileDataEnc *tile_data, TokenExtra **tp, + int mi_row, int mi_col, BLOCK_SIZE bsize, + RD_STATS *rd_cost, int do_recon, int64_t best_rd, + PC_TREE *pc_tree); +#endif +void av1_reset_part_sf(PARTITION_SPEED_FEATURES *part_sf); +void av1_reset_sf_for_ext_part(AV1_COMP *const cpi); + +#if CONFIG_PARTITION_SEARCH_ORDER +bool av1_rd_partition_search(AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, TokenExtra **tp, + SIMPLE_MOTION_DATA_TREE *sms_root, int mi_row, + int mi_col, BLOCK_SIZE bsize, + RD_STATS *best_rd_cost); +#endif + +bool av1_rd_pick_partition(AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, TokenExtra **tp, int mi_row, + int mi_col, BLOCK_SIZE bsize, RD_STATS *rd_cost, + RD_STATS best_rdc, PC_TREE *pc_tree, + SIMPLE_MOTION_DATA_TREE *sms_tree, int64_t *none_rd, + SB_MULTI_PASS_MODE multi_pass_mode, + RD_RECT_PART_WIN_INFO *rect_part_win_info); + +static AOM_INLINE void set_cb_offsets(uint16_t *cb_offset, + const uint16_t cb_offset_y, + const uint16_t cb_offset_uv) { + cb_offset[PLANE_TYPE_Y] = cb_offset_y; + cb_offset[PLANE_TYPE_UV] = cb_offset_uv; +} + +static AOM_INLINE void update_cb_offsets(MACROBLOCK *x, const BLOCK_SIZE bsize, + const int subsampling_x, + const int subsampling_y) { + x->cb_offset[PLANE_TYPE_Y] += 
block_size_wide[bsize] * block_size_high[bsize]; + if (x->e_mbd.is_chroma_ref) { + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, subsampling_x, subsampling_y); + assert(plane_bsize != BLOCK_INVALID); + x->cb_offset[PLANE_TYPE_UV] += + block_size_wide[plane_bsize] * block_size_high[plane_bsize]; + } +} + +#endif // AOM_AV1_ENCODER_PARTITION_SEARCH_H_ diff --git a/third_party/aom/av1/encoder/partition_strategy.c b/third_party/aom/av1/encoder/partition_strategy.c new file mode 100644 index 0000000000..ce06313579 --- /dev/null +++ b/third_party/aom/av1/encoder/partition_strategy.c @@ -0,0 +1,2573 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "av1/encoder/encodeframe_utils.h" +#include "av1/encoder/thirdpass.h" +#include "config/aom_dsp_rtcd.h" + +#include "av1/common/enums.h" +#include "av1/common/reconinter.h" + +#if !CONFIG_REALTIME_ONLY +#include "av1/encoder/cnn.h" +#include "av1/encoder/partition_model_weights.h" +#include "av1/encoder/partition_cnn_weights.h" +#endif +#include "av1/encoder/encoder.h" + +#include "av1/encoder/motion_search_facade.h" +#include "av1/encoder/partition_strategy.h" +#include "av1/encoder/partition_search.h" +#include "av1/encoder/rdopt.h" + +#if !CONFIG_REALTIME_ONLY +static AOM_INLINE void simple_motion_search_prune_part_features( + AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree, + int mi_row, int mi_col, BLOCK_SIZE bsize, float *features, + int features_to_get); + +static bool ext_ml_model_decision_before_none( + AV1_COMP *cpi, const float features_from_motion[FEATURE_SIZE_SMS_SPLIT], + int *partition_none_allowed, int *partition_horz_allowed, + int *partition_vert_allowed, int *do_rectangular_split, + int *do_square_split); + +static bool ext_ml_model_decision_before_none_part2( + AV1_COMP *cpi, + const float features_from_motion[FEATURE_SIZE_SMS_PRUNE_PART], + int *prune_horz, int *prune_vert); + +static bool ext_ml_model_decision_after_none( + ExtPartController *const ext_part_controller, const int is_intra_frame, + const float *const features_after_none, int *do_square_split, + int *do_rectangular_split); + +static bool ext_ml_model_decision_after_none_part2( + AV1_COMP *const cpi, const float *const features_terminate, + int *terminate_partition_search); + +static bool ext_ml_model_decision_after_split( + AV1_COMP *const cpi, const float *const features_terminate, + int *terminate_partition_search); + +static bool ext_ml_model_decision_after_split_part2( + ExtPartController *const ext_part_controller, const int is_intra_frame, + const float *const features_prune, int *prune_rect_part_horz, + int *prune_rect_part_vert); + +static bool ext_ml_model_decision_after_rect( + ExtPartController *const ext_part_controller, const int is_intra_frame, + const float *const features_after_rect, int *horza_partition_allowed, + int *horzb_partition_allowed, int *verta_partition_allowed, + int *vertb_partition_allowed); + +static bool ext_ml_model_decision_after_part_ab( + AV1_COMP *const cpi, MACROBLOCK *const x, 
BLOCK_SIZE bsize, int part_ctx, + int64_t best_rd, int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT], + int64_t split_rd[SUB_PARTITIONS_SPLIT], int *const partition_horz4_allowed, + int *const partition_vert4_allowed, unsigned int pb_source_variance, + int mi_row, int mi_col); + +static INLINE int convert_bsize_to_idx(BLOCK_SIZE bsize) { + switch (bsize) { + case BLOCK_128X128: return 0; + case BLOCK_64X64: return 1; + case BLOCK_32X32: return 2; + case BLOCK_16X16: return 3; + case BLOCK_8X8: return 4; + default: assert(0 && "Invalid bsize"); return -1; + } +} + +static char *get_feature_file_name(int id) { + static char *feature_file_names[] = { + "feature_before_partition_none", + "feature_before_partition_none_prune_rect", + "feature_after_partition_none_prune", + "feature_after_partition_none_terminate", + "feature_after_partition_split_terminate", + "feature_after_partition_split_prune_rect", + "feature_after_partition_rect", + "feature_after_partition_ab", + }; + + return feature_file_names[id]; +} + +static void write_features_to_file(const char *const path, + const bool is_test_mode, + const float *features, + const int feature_size, const int id, + const BLOCK_SIZE bsize, const int mi_row, + const int mi_col) { + if (!WRITE_FEATURE_TO_FILE && !is_test_mode) return; + + char filename[256]; + snprintf(filename, sizeof(filename), "%s/%s", path, + get_feature_file_name(id)); + FILE *pfile = fopen(filename, "a"); + if (pfile == NULL) return; + if (!is_test_mode) { + fprintf(pfile, "%d,%d,%d,%d,%d\n", id, (int)bsize, mi_row, mi_col, + feature_size); + } + for (int i = 0; i < feature_size; ++i) { + fprintf(pfile, "%.6f", features[i]); + if (i < feature_size - 1) fprintf(pfile, ","); + } + fprintf(pfile, "\n"); + fclose(pfile); +} + +// TODO(chiyotsai@google.com): This is very much a work in progress. 
We still
+// need to do the following:
+// -- add support for hdres
+// -- add support for pruning rectangular partitions
+// -- use reconstructed pixels instead of source pixels for padding
+// -- use chroma pixels in addition to luma pixels
+void av1_intra_mode_cnn_partition(const AV1_COMMON *const cm, MACROBLOCK *x,
+                                  int quad_tree_idx,
+                                  int intra_cnn_based_part_prune_level,
+                                  PartitionSearchState *part_state) {
+  assert(cm->seq_params->sb_size >= BLOCK_64X64 &&
+         "Invalid sb_size for intra_cnn!");
+  const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+  const BLOCK_SIZE bsize = blk_params->bsize;
+
+  const int bsize_idx = convert_bsize_to_idx(bsize);
+
+  if (bsize == BLOCK_128X128) {
+    return;
+  }
+
+  PartitionSearchInfo *part_info = &x->part_search_info;
+
+  // Precompute the CNN part and cache the result in MACROBLOCK
+  if (bsize == BLOCK_64X64 && !part_info->cnn_output_valid) {
+    const CNN_CONFIG *cnn_config = &av1_intra_mode_cnn_partition_cnn_config;
+
+    // Prepare the output
+    const CNN_THREAD_DATA thread_data = { .num_workers = 1, .workers = NULL };
+    const int num_outputs = 4;
+    const int output_dims[4] = { 1, 2, 4, 8 };
+    const int out_chs[4] = { CNN_BRANCH_0_OUT_CH, CNN_BRANCH_1_OUT_CH,
+                             CNN_BRANCH_2_OUT_CH, CNN_BRANCH_3_OUT_CH };
+    float *output_buffer[CNN_TOT_OUT_CH];
+
+    float **cur_output_buf = output_buffer;
+    float *curr_buf_ptr = part_info->cnn_buffer;
+    for (int output_idx = 0; output_idx < num_outputs; output_idx++) {
+      const int num_chs = out_chs[output_idx];
+      const int ch_size = output_dims[output_idx] * output_dims[output_idx];
+      for (int ch = 0; ch < num_chs; ch++) {
+        cur_output_buf[ch] = curr_buf_ptr;
+        curr_buf_ptr += ch_size;
+      }
+      cur_output_buf += num_chs;
+    }
+
+    CNN_MULTI_OUT output = {
+      .num_outputs = 4,
+      .output_channels = out_chs,
+      .output_strides = output_dims,
+      .output_buffer = output_buffer,
+    };
+
+    // Prepare the input
+    const MACROBLOCKD *xd = &x->e_mbd;
+    const int bit_depth = xd->bd;
+    const int dc_q =
+        av1_dc_quant_QTX(x->qindex, 0, bit_depth) >> (bit_depth - 8);
+    part_info->log_q = log1pf((float)(dc_q * dc_q) / 256.0f);
+    part_info->log_q =
+        (part_info->log_q - av1_intra_mode_cnn_partition_mean[0]) /
+        av1_intra_mode_cnn_partition_std[0];
+
+    const int width = 65, height = 65,
+              stride = x->plane[AOM_PLANE_Y].src.stride;
+
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      uint16_t *image[1] = {
+        CONVERT_TO_SHORTPTR(x->plane[AOM_PLANE_Y].src.buf) - stride - 1
+      };
+
+      if (!av1_cnn_predict_img_multi_out_highbd(image, width, height, stride,
+                                                cnn_config, &thread_data,
+                                                bit_depth, &output)) {
+        aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                           "Error allocating CNN data");
+        return;
+      }
+    } else {
+      uint8_t *image[1] = { x->plane[AOM_PLANE_Y].src.buf - stride - 1 };
+
+      if (!av1_cnn_predict_img_multi_out(image, width, height, stride,
+                                         cnn_config, &thread_data, &output)) {
+        aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                           "Error allocating CNN data");
+        return;
+      }
+    }
+
+    part_info->cnn_output_valid = 1;
+  }
+
+  if (!part_info->cnn_output_valid) {
+    return;
+  }
+
+  const NN_CONFIG *dnn_configs[5] = {
+    NULL,
+    &av1_intra_mode_cnn_partition_branch_0_dnn_config,
+    &av1_intra_mode_cnn_partition_branch_1_dnn_config,
+    &av1_intra_mode_cnn_partition_branch_2_dnn_config,
+    &av1_intra_mode_cnn_partition_branch_3_dnn_config,
+  };
+
+  const NN_CONFIG *dnn_config = dnn_configs[bsize_idx];
+
+  float dnn_features[100];
+  float logits[4] = { 0.0f };
+
+  const float *branch_0 = part_info->cnn_buffer;
+  const float
*branch_1 = branch_0 + CNN_BRANCH_0_OUT_SIZE; + const float *branch_2 = branch_1 + CNN_BRANCH_1_OUT_SIZE; + const float *branch_3 = branch_2 + CNN_BRANCH_2_OUT_SIZE; + + if (bsize == BLOCK_64X64) { + int f_idx = 0; + for (int ch_idx = 0; ch_idx < CNN_BRANCH_0_OUT_CH; ch_idx++) { + dnn_features[f_idx++] = branch_0[ch_idx]; + } + + const int spa_stride = 2 * 2; + for (int lin_idx = 0; lin_idx < spa_stride; lin_idx++) { + for (int ch_idx = 0; ch_idx < CNN_BRANCH_1_OUT_CH; ch_idx++) { + dnn_features[f_idx++] = branch_1[lin_idx + ch_idx * spa_stride]; + } + } + dnn_features[f_idx++] = part_info->log_q; + } else if (bsize == BLOCK_32X32) { + int f_idx = 0; + for (int idx = 0; idx < CNN_BRANCH_0_OUT_CH; idx++) { + dnn_features[f_idx++] = branch_0[idx]; + } + + const int curr_lin_idx = quad_to_linear_1[quad_tree_idx - 1]; + const int spa_stride = 2 * 2; + for (int ch_idx = 0; ch_idx < CNN_BRANCH_1_OUT_CH; ch_idx++) { + dnn_features[f_idx++] = branch_1[curr_lin_idx + ch_idx * spa_stride]; + } + dnn_features[f_idx++] = part_info->log_q; + } else if (bsize == BLOCK_16X16) { + int f_idx = 0; + const int prev_quad_idx = (quad_tree_idx - 1) / 4; + const int prev_lin_idx = quad_to_linear_1[prev_quad_idx - 1]; + const int prev_spa_stride = 2 * 2; + for (int ch_idx = 0; ch_idx < CNN_BRANCH_1_OUT_CH; ch_idx++) { + dnn_features[f_idx++] = branch_1[prev_lin_idx + ch_idx * prev_spa_stride]; + } + + const int curr_lin_idx = quad_to_linear_2[quad_tree_idx - 5]; + const int spa_stride = 4 * 4; + for (int ch_idx = 0; ch_idx < CNN_BRANCH_2_OUT_CH; ch_idx++) { + dnn_features[f_idx++] = branch_2[curr_lin_idx + ch_idx * spa_stride]; + } + dnn_features[f_idx++] = part_info->log_q; + } else if (bsize == BLOCK_8X8) { + int f_idx = 0; + const int prev_quad_idx = (quad_tree_idx - 1) / 4; + const int prev_lin_idx = quad_to_linear_2[prev_quad_idx - 5]; + const int prev_spa_stride = 4 * 4; + for (int ch_idx = 0; ch_idx < CNN_BRANCH_2_OUT_CH; ch_idx++) { + dnn_features[f_idx++] = branch_2[prev_lin_idx + ch_idx * prev_spa_stride]; + } + + const int curr_lin_idx = quad_to_linear_3[quad_tree_idx - 21]; + const int spa_stride = 8 * 8; + for (int ch_idx = 0; ch_idx < CNN_BRANCH_3_OUT_CH; ch_idx++) { + dnn_features[f_idx++] = branch_3[curr_lin_idx + ch_idx * spa_stride]; + } + dnn_features[f_idx++] = part_info->log_q; + } else { + assert(0 && "Invalid bsize in intra_cnn partition"); + } + + // Make decision + av1_nn_predict(dnn_features, dnn_config, 1, logits); + + const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720; + const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480; + float split_only_thresh = 100.0f, no_split_thresh = -100.0f; + if (is_720p_or_larger) { + split_only_thresh = + av1_intra_mode_cnn_partition_split_thresh_hdres[bsize_idx]; + no_split_thresh = + av1_intra_mode_cnn_partition_no_split_thresh_hdres[bsize_idx]; + } else if (is_480p_or_larger) { + split_only_thresh = + av1_intra_mode_cnn_partition_split_thresh_midres[bsize_idx]; + no_split_thresh = + av1_intra_mode_cnn_partition_no_split_thresh_midres[bsize_idx]; + } else { + split_only_thresh = + av1_intra_mode_cnn_partition_split_thresh_lowres[bsize_idx]; + no_split_thresh = + av1_intra_mode_cnn_partition_no_split_thresh_lowres[bsize_idx]; + } + + if (logits[0] > split_only_thresh) { + // As screen contents tend to choose larger partitions, do not prune + // PARTITION_NONE when intra_cnn_based_part_prune_level=1. 
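+    // (At prune level 1 only the rectangular partitions are pruned here; at
+    // prune level 2 PARTITION_NONE is ruled out as well, leaving just the
+    // square split.)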
+ if (intra_cnn_based_part_prune_level != 1) { + part_state->partition_none_allowed = 0; + } + part_state->do_square_split = 1; + av1_disable_rect_partitions(part_state); + } + + if (logits[0] < no_split_thresh) { + av1_disable_square_split_partition(part_state); + } +} + +static INLINE int get_simple_motion_search_prune_agg(int qindex, + int prune_level, + int is_rect_part) { + assert(prune_level < TOTAL_AGG_LVLS); + if (prune_level == NO_PRUNING) { + return -1; + } + + // Aggressiveness value for SIMPLE_MOTION_SEARCH_PRUNE_LEVEL except + // QIDX_BASED_AGG_LVL + const int sms_prune_agg_levels[TOTAL_SIMPLE_AGG_LVLS] = { 0, 1, 2, 3 }; + if (prune_level < TOTAL_SIMPLE_AGG_LVLS) { + return sms_prune_agg_levels[prune_level]; + } + + // Map the QIDX_BASED_AGG_LVL to corresponding aggressiveness value. + // Aggressive pruning for lower quantizers in non-boosted frames to prune + // rectangular partitions. + const int qband = is_rect_part ? (qindex <= 90 ? 1 : 0) : 0; + const int sms_prune_agg_qindex_based[2] = { 1, 2 }; + return sms_prune_agg_qindex_based[qband]; +} + +void av1_simple_motion_search_based_split(AV1_COMP *const cpi, MACROBLOCK *x, + SIMPLE_MOTION_DATA_TREE *sms_tree, + PartitionSearchState *part_state) { + const AV1_COMMON *const cm = &cpi->common; + const PartitionBlkParams *blk_params = &part_state->part_blk_params; + const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col; + const BLOCK_SIZE bsize = blk_params->bsize; + + const int bsize_idx = convert_bsize_to_idx(bsize); + const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720; + const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480; + // res_idx is 0 for res < 480p, 1 for 480p, 2 for 720p+ + const int res_idx = is_480p_or_larger + is_720p_or_larger; + + assert(bsize_idx >= 0 && bsize_idx <= 4 && + "Invalid bsize in simple_motion_search_based_split"); + + const float *ml_mean = av1_simple_motion_search_split_mean[bsize_idx]; + const float *ml_std = av1_simple_motion_search_split_std[bsize_idx]; + const NN_CONFIG *nn_config = + av1_simple_motion_search_split_nn_config[bsize_idx]; + + const int agg = get_simple_motion_search_prune_agg( + x->qindex, cpi->sf.part_sf.simple_motion_search_prune_agg, 0); + if (agg < 0) { + return; + } + + const float split_only_thresh = + av1_simple_motion_search_split_thresh[agg][res_idx][bsize_idx]; + const float no_split_thresh = + av1_simple_motion_search_no_split_thresh[agg][res_idx][bsize_idx]; + + float features[FEATURE_SIZE_SMS_SPLIT] = { 0.0f }; + simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col, + bsize, features, + FEATURE_SMS_SPLIT_MODEL_FLAG); + + // Write features to file + write_features_to_file(cpi->oxcf.partition_info_path, + cpi->ext_part_controller.test_mode, features, + FEATURE_SIZE_SMS_SPLIT, 0, bsize, mi_row, mi_col); + + // Note: it is intended to not normalize the features here, to keep it + // consistent for all features collected and passed to the external model. 
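+  // The unnormalized values below feed the external partition model; the
+  // internal neural net further down consumes the mean/std-normalized copies.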
+  if (ext_ml_model_decision_before_none(
+          cpi, features, &part_state->partition_none_allowed,
+          &part_state->partition_rect_allowed[HORZ],
+          &part_state->partition_rect_allowed[VERT],
+          &part_state->do_rectangular_split, &part_state->do_square_split)) {
+    return;
+  }
+
+  for (int idx = 0; idx < FEATURE_SIZE_SMS_SPLIT; idx++) {
+    features[idx] = (features[idx] - ml_mean[idx]) / ml_std[idx];
+  }
+
+  float score = 0.0f;
+
+  av1_nn_predict(features, nn_config, 1, &score);
+
+  if (score > split_only_thresh) {
+    av1_set_square_split_only(part_state);
+  }
+
+  if (cpi->sf.part_sf.simple_motion_search_split >= 2 &&
+      score < no_split_thresh) {
+    av1_disable_square_split_partition(part_state);
+  }
+
+  // If the score is very low, prune rectangular split since it is unlikely to
+  // occur.
+  if (cpi->sf.part_sf.simple_motion_search_rect_split) {
+    const float scale = res_idx >= 2 ? 3.0f : 2.0f;
+    const float rect_split_thresh =
+        scale * av1_simple_motion_search_no_split_thresh
+                    [cpi->sf.part_sf.simple_motion_search_rect_split][res_idx]
+                    [bsize_idx];
+    if (score < rect_split_thresh) {
+      part_state->do_rectangular_split = 0;
+    }
+  }
+}
+
+// Given a list of ref frames in refs, performs simple_motion_search on each of
+// the refs and returns the ref with the smallest sse. Returns -1 if none of
+// the refs in the list are available. Also stores the best sse and var in
+// best_sse and best_var, respectively. If save_mv is 0, don't update start_mvs
+// in sms_tree. If save_mv is 1, update start_mvs in sms_tree and its subtrees.
+static int simple_motion_search_get_best_ref(
+    AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
+    int mi_row, int mi_col, BLOCK_SIZE bsize, const int *const refs,
+    int num_refs, int use_subpixel, int save_mv, unsigned int *best_sse,
+    unsigned int *best_var) {
+  const AV1_COMMON *const cm = &cpi->common;
+  int best_ref = -1;
+
+  if (mi_col >= cm->mi_params.mi_cols || mi_row >= cm->mi_params.mi_rows) {
+    // If the whole block is outside of the image, set the var and sse to 0.
+    *best_var = 0;
+    *best_sse = 0;
+
+    return best_ref;
+  }
+
+  // Otherwise, loop through the reference frames and find the one with the
+  // minimum SSE.
+  const int num_planes = 1;
+
+  *best_sse = INT_MAX;
+
+  for (int ref_idx = 0; ref_idx < num_refs; ref_idx++) {
+    const int ref = refs[ref_idx];
+
+    if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref]) {
+      const FULLPEL_MV *start_mvs = sms_tree->start_mvs;
+      unsigned int curr_sse = 0, curr_var = 0;
+      const int_mv best_mv = av1_simple_motion_search_sse_var(
+          cpi, x, mi_row, mi_col, bsize, ref, start_mvs[ref], num_planes,
+          use_subpixel, &curr_sse, &curr_var);
+      if (curr_sse < *best_sse) {
+        *best_sse = curr_sse;
+        *best_var = curr_var;
+        best_ref = ref;
+      }
+
+      if (save_mv) {
+        sms_tree->start_mvs[ref].row = best_mv.as_mv.row / 8;
+        sms_tree->start_mvs[ref].col = best_mv.as_mv.col / 8;
+
+        if (bsize >= BLOCK_8X8) {
+          for (int r_idx = 0; r_idx < SUB_PARTITIONS_SPLIT; r_idx++) {
+            // Propagate the new motion vectors to a lower level
+            SIMPLE_MOTION_DATA_TREE *sub_tree = sms_tree->split[r_idx];
+            sub_tree->start_mvs[ref] = sms_tree->start_mvs[ref];
+          }
+        }
+      }
+    }
+  }
+
+  return best_ref;
+}
+
+// Collects features using simple_motion_search and stores them in features.
+// The features are also cached in SIMPLE_MOTION_DATA_TREE. By default, the
+// features collected are the sse and var from the subblocks flagged by
+// features_to_get.
+// Furthermore, if features is not NULL, then 7 more features are appended to
+// the end of features:
+//  - log(1.0 + (dc_q * dc_q) / 256)
+//  - whether an above macroblock exists
+//  - width of above macroblock
+//  - height of above macroblock
+//  - whether a left macroblock exists
+//  - width of left macroblock
+//  - height of left macroblock
+static AOM_INLINE void simple_motion_search_prune_part_features(
+    AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
+    int mi_row, int mi_col, BLOCK_SIZE bsize, float *features,
+    int features_to_get) {
+  const int w_mi = mi_size_wide[bsize];
+  const int h_mi = mi_size_high[bsize];
+  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+  assert(bsize >= BLOCK_8X8);
+  assert(cpi->ref_frame_flags & av1_ref_frame_flag_list[LAST_FRAME] ||
+         cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]);
+
+  // Setting up motion search
+  const int ref_list[] = { cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME
+                                                        : LAST_FRAME };
+  const int num_refs = 1;
+  const int use_subpixel = 1;
+
+  // Do the whole block first to update the mv.
+  if (!sms_tree->sms_none_valid && features_to_get & FEATURE_SMS_NONE_FLAG) {
+    simple_motion_search_get_best_ref(cpi, x, sms_tree, mi_row, mi_col, bsize,
+                                      ref_list, num_refs, use_subpixel, 1,
+                                      &sms_tree->sms_none_feat[0],
+                                      &sms_tree->sms_none_feat[1]);
+    sms_tree->sms_none_valid = 1;
+  }
+
+  // Split subblocks
+  if (features_to_get & FEATURE_SMS_SPLIT_FLAG) {
+    const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+    for (int r_idx = 0; r_idx < SUB_PARTITIONS_SPLIT; r_idx++) {
+      const int sub_mi_col = mi_col + (r_idx & 1) * w_mi / 2;
+      const int sub_mi_row = mi_row + (r_idx >> 1) * h_mi / 2;
+      SIMPLE_MOTION_DATA_TREE *sub_tree = sms_tree->split[r_idx];
+
+      if (!sub_tree->sms_none_valid) {
+        simple_motion_search_get_best_ref(
+            cpi, x, sub_tree, sub_mi_row, sub_mi_col, subsize, ref_list,
+            num_refs, use_subpixel, 1, &sub_tree->sms_none_feat[0],
+            &sub_tree->sms_none_feat[1]);
+        sub_tree->sms_none_valid = 1;
+      }
+    }
+  }
+
+  // Rectangular subblocks
+  if (!sms_tree->sms_rect_valid && features_to_get & FEATURE_SMS_RECT_FLAG) {
+    // Horz subblock
+    BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_HORZ);
+    for (int r_idx = 0; r_idx < SUB_PARTITIONS_RECT; r_idx++) {
+      const int sub_mi_col = mi_col + 0;
+      const int sub_mi_row = mi_row + r_idx * h_mi / 2;
+
+      simple_motion_search_get_best_ref(
+          cpi, x, sms_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs,
+          use_subpixel, 0, &sms_tree->sms_rect_feat[2 * r_idx],
+          &sms_tree->sms_rect_feat[2 * r_idx + 1]);
+    }
+
+    // Vert subblock
+    subsize = get_partition_subsize(bsize, PARTITION_VERT);
+    for (int r_idx = 0; r_idx < SUB_PARTITIONS_RECT; r_idx++) {
+      const int sub_mi_col = mi_col + r_idx * w_mi / 2;
+      const int sub_mi_row = mi_row + 0;
+
+      simple_motion_search_get_best_ref(
+          cpi, x, sms_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs,
+          use_subpixel, 0, &sms_tree->sms_rect_feat[4 + 2 * r_idx],
+          &sms_tree->sms_rect_feat[4 + 2 * r_idx + 1]);
+    }
+    sms_tree->sms_rect_valid = 1;
+  }
+
+  if (!features) return;
+
+  int f_idx = 0;
+  if (features_to_get & FEATURE_SMS_NONE_FLAG) {
+    for (int sub_idx = 0; sub_idx < 2; sub_idx++) {
+      features[f_idx++] = log1pf((float)sms_tree->sms_none_feat[sub_idx]);
+    }
+  }
+
+  if (features_to_get & FEATURE_SMS_SPLIT_FLAG) {
+    for (int sub_idx = 0; sub_idx < SUB_PARTITIONS_SPLIT; sub_idx++) {
+      SIMPLE_MOTION_DATA_TREE *sub_tree = sms_tree->split[sub_idx];
+      features[f_idx++] = log1pf((float)sub_tree->sms_none_feat[0]);
+      features[f_idx++] = log1pf((float)sub_tree->sms_none_feat[1]);
+    }
+  }
+
+  if (features_to_get & FEATURE_SMS_RECT_FLAG) {
+    for (int sub_idx = 0; sub_idx < 8; sub_idx++) {
+      features[f_idx++] = log1pf((float)sms_tree->sms_rect_feat[sub_idx]);
+    }
+  }
+
+  const MACROBLOCKD *xd = &x->e_mbd;
+  set_offsets_for_motion_search(cpi, x, mi_row, mi_col, bsize);
+
+  // Q_INDEX
+  const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8);
+  features[f_idx++] = log1pf((float)(dc_q * dc_q) / 256.0f);
+
+  // Neighboring block features
+  const int has_above = !!xd->above_mbmi;
+  const int has_left = !!xd->left_mbmi;
+  const BLOCK_SIZE above_bsize = has_above ? xd->above_mbmi->bsize : bsize;
+  const BLOCK_SIZE left_bsize = has_left ? xd->left_mbmi->bsize : bsize;
+  features[f_idx++] = (float)has_above;
+  features[f_idx++] = (float)mi_size_wide_log2[above_bsize];
+  features[f_idx++] = (float)mi_size_high_log2[above_bsize];
+  features[f_idx++] = (float)has_left;
+  features[f_idx++] = (float)mi_size_wide_log2[left_bsize];
+  features[f_idx++] = (float)mi_size_high_log2[left_bsize];
+}
+
+void av1_simple_motion_search_prune_rect(AV1_COMP *const cpi, MACROBLOCK *x,
+                                         SIMPLE_MOTION_DATA_TREE *sms_tree,
+                                         PartitionSearchState *part_state) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+  const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col;
+  const BLOCK_SIZE bsize = blk_params->bsize;
+
+  const int bsize_idx = convert_bsize_to_idx(bsize);
+  const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+  const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
+  // res_idx is 0 for lowres, 1 for 480p, 2 for 720p+
+  const int res_idx = is_480p_or_larger + is_720p_or_larger;
+
+  // Get model parameters
+  const NN_CONFIG *nn_config =
+      av1_simple_motion_search_prune_rect_nn_config[bsize_idx];
+  const float *ml_mean = av1_simple_motion_search_prune_rect_mean[bsize_idx],
+              *ml_std = av1_simple_motion_search_prune_rect_std[bsize_idx];
+
+  const int agg = get_simple_motion_search_prune_agg(
+      x->qindex, cpi->sf.part_sf.simple_motion_search_prune_agg, 1);
+  if (agg < 0) {
+    return;
+  }
+
+  const float prune_thresh =
+      av1_simple_motion_search_prune_rect_thresh[agg][res_idx][bsize_idx];
+
+  // If there is no valid threshold, return immediately.
+  if (!nn_config || prune_thresh == 0.0f) {
+    return;
+  }
+
+  // Get features
+  float features[FEATURE_SIZE_SMS_PRUNE_PART] = { 0.0f };
+  simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col,
+                                           bsize, features,
+                                           FEATURE_SMS_PRUNE_PART_FLAG);
+
+  // Note: it is intended to not normalize the features here, to keep it
+  // consistent for all features collected and passed to the external model.
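+  // As above, the external model receives the raw values, while the internal
+  // neural net below consumes the normalized ones.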
+ if (cpi->sf.part_sf.simple_motion_search_prune_rect && + !frame_is_intra_only(cm) && + (part_state->partition_rect_allowed[HORZ] || + part_state->partition_rect_allowed[VERT]) && + bsize >= BLOCK_8X8 && !av1_superres_scaled(cm)) { + // Write features to file + write_features_to_file( + cpi->oxcf.partition_info_path, cpi->ext_part_controller.test_mode, + features, FEATURE_SIZE_SMS_PRUNE_PART, 1, bsize, mi_row, mi_col); + + if (ext_ml_model_decision_before_none_part2( + cpi, features, &part_state->prune_rect_part[HORZ], + &part_state->prune_rect_part[VERT])) { + return; + } + } + + for (int f_idx = 0; f_idx < FEATURE_SIZE_SMS_PRUNE_PART; f_idx++) { + features[f_idx] = (features[f_idx] - ml_mean[f_idx]) / ml_std[f_idx]; + } + + // Get probabilities + float scores[EXT_PARTITION_TYPES] = { 0.0f }, + probs[EXT_PARTITION_TYPES] = { 0.0f }; + const int num_classes = (bsize == BLOCK_128X128 || bsize == BLOCK_8X8) + ? PARTITION_TYPES + : EXT_PARTITION_TYPES; + + av1_nn_predict(features, nn_config, 1, scores); + + av1_nn_softmax(scores, probs, num_classes); + + // Determine if we should prune rectangular partitions. + if (probs[PARTITION_HORZ] <= prune_thresh) { + part_state->prune_rect_part[HORZ] = 1; + } + if (probs[PARTITION_VERT] <= prune_thresh) { + part_state->prune_rect_part[VERT] = 1; + } +} + +// Early terminates PARTITION_NONE using simple_motion_search features and the +// rate, distortion, and rdcost of PARTITION_NONE. This is only called when: +// - The frame is a show frame +// - The frame is not intra only +// - The current bsize is > BLOCK_8X8 +// - blk_row + blk_height/2 < total_rows and blk_col + blk_width/2 < total_cols +void av1_simple_motion_search_early_term_none( + AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree, + const RD_STATS *none_rdc, PartitionSearchState *part_state) { + const PartitionBlkParams *blk_params = &part_state->part_blk_params; + const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col; + const BLOCK_SIZE bsize = blk_params->bsize; + + float features[FEATURE_SIZE_SMS_TERM_NONE] = { 0.0f }; + simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col, + bsize, features, + FEATURE_SMS_PRUNE_PART_FLAG); + int f_idx = FEATURE_SIZE_SMS_PRUNE_PART; + + features[f_idx++] = log1pf((float)none_rdc->rate); + features[f_idx++] = log1pf((float)none_rdc->dist); + features[f_idx++] = log1pf((float)none_rdc->rdcost); + + assert(f_idx == FEATURE_SIZE_SMS_TERM_NONE); + + const float *ml_mean = NULL; + const float *ml_std = NULL; + const float *ml_model = NULL; + + if (bsize == BLOCK_128X128) { + ml_mean = av1_simple_motion_search_term_none_mean_128; + ml_std = av1_simple_motion_search_term_none_std_128; + ml_model = av1_simple_motion_search_term_none_model_128; + } else if (bsize == BLOCK_64X64) { + ml_mean = av1_simple_motion_search_term_none_mean_64; + ml_std = av1_simple_motion_search_term_none_std_64; + ml_model = av1_simple_motion_search_term_none_model_64; + } else if (bsize == BLOCK_32X32) { + ml_mean = av1_simple_motion_search_term_none_mean_32; + ml_std = av1_simple_motion_search_term_none_std_32; + ml_model = av1_simple_motion_search_term_none_model_32; + } else if (bsize == BLOCK_16X16) { + ml_mean = av1_simple_motion_search_term_none_mean_16; + ml_std = av1_simple_motion_search_term_none_std_16; + ml_model = av1_simple_motion_search_term_none_model_16; + } else { + assert(0 && "Unexpected block size in simple_motion_term_none"); + } + + // Write features to file + write_features_to_file(cpi->oxcf.partition_info_path, 
+ cpi->ext_part_controller.test_mode, features, + FEATURE_SIZE_SMS_TERM_NONE, 3, bsize, mi_row, mi_col); + + if (ext_ml_model_decision_after_none_part2( + cpi, features, &part_state->terminate_partition_search)) { + return; + } + + if (ml_model) { + float score = 0.0f; + for (f_idx = 0; f_idx < FEATURE_SIZE_SMS_TERM_NONE; f_idx++) { + score += + ml_model[f_idx] * (features[f_idx] - ml_mean[f_idx]) / ml_std[f_idx]; + } + score += ml_model[FEATURE_SIZE_SMS_TERM_NONE]; + + if (score >= 0.0f) { + part_state->terminate_partition_search = 1; + } + } +} + +void av1_get_max_min_partition_features(AV1_COMP *const cpi, MACROBLOCK *x, + int mi_row, int mi_col, + float *features) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; + + // Currently this only allows 128X128 SB size. May extend it to 64X64 SB size. + assert(sb_size == BLOCK_128X128); + + int f_idx = 0; + + const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8); + const float log_q_sq = log1pf((float)(dc_q * dc_q) / 256.0f); + + // Perform full-pixel single motion search in Y plane of 16x16 mbs in the sb + float sum_mv_row_sq = 0; + float sum_mv_row = 0; + float min_abs_mv_row = FLT_MAX; + float max_abs_mv_row = 0; + + float sum_mv_col_sq = 0; + float sum_mv_col = 0; + float min_abs_mv_col = FLT_MAX; + float max_abs_mv_col = 0; + + float sum_log_sse_sq = 0; + float sum_log_sse = 0; + float min_log_sse = FLT_MAX; + float max_log_sse = 0; + + const BLOCK_SIZE mb_size = BLOCK_16X16; + const int mb_rows = block_size_high[sb_size] / block_size_high[mb_size]; + const int mb_cols = block_size_wide[sb_size] / block_size_wide[mb_size]; + const int mb_in_mi_size_high_log2 = mi_size_high_log2[mb_size]; + const int mb_in_mi_size_wide_log2 = mi_size_wide_log2[mb_size]; + + for (int mb_row = 0; mb_row < mb_rows; mb_row++) + for (int mb_col = 0; mb_col < mb_cols; mb_col++) { + const int this_mi_row = mi_row + (mb_row << mb_in_mi_size_high_log2); + const int this_mi_col = mi_col + (mb_col << mb_in_mi_size_wide_log2); + unsigned int sse = 0; + unsigned int var = 0; + const FULLPEL_MV start_mv = kZeroFullMv; + const MV_REFERENCE_FRAME ref = + cpi->rc.is_src_frame_alt_ref ? 
ALTREF_FRAME : LAST_FRAME; + const int_mv best_mv = av1_simple_motion_search_sse_var( + cpi, x, this_mi_row, this_mi_col, mb_size, ref, start_mv, 1, 0, &sse, + &var); + + const float mv_row = (float)(best_mv.as_mv.row / 8); + const float mv_col = (float)(best_mv.as_mv.col / 8); + const float log_sse = log1pf((float)sse); + const float abs_mv_row = fabsf(mv_row); + const float abs_mv_col = fabsf(mv_col); + + sum_mv_row_sq += mv_row * mv_row; + sum_mv_row += mv_row; + sum_mv_col_sq += mv_col * mv_col; + sum_mv_col += mv_col; + + if (abs_mv_row < min_abs_mv_row) min_abs_mv_row = abs_mv_row; + if (abs_mv_row > max_abs_mv_row) max_abs_mv_row = abs_mv_row; + if (abs_mv_col < min_abs_mv_col) min_abs_mv_col = abs_mv_col; + if (abs_mv_col > max_abs_mv_col) max_abs_mv_col = abs_mv_col; + + sum_log_sse_sq += log_sse * log_sse; + sum_log_sse += log_sse; + if (log_sse < min_log_sse) min_log_sse = log_sse; + if (log_sse > max_log_sse) max_log_sse = log_sse; + } + const int blks = mb_rows * mb_cols; + const float avg_mv_row = sum_mv_row / (float)blks; + const float var_mv_row = + sum_mv_row_sq / (float)blks - avg_mv_row * avg_mv_row; + + const float avg_mv_col = sum_mv_col / (float)blks; + const float var_mv_col = + sum_mv_col_sq / (float)blks - avg_mv_col * avg_mv_col; + + const float avg_log_sse = sum_log_sse / (float)blks; + const float var_log_sse = + sum_log_sse_sq / (float)blks - avg_log_sse * avg_log_sse; + + features[f_idx++] = avg_log_sse; + features[f_idx++] = avg_mv_col; + features[f_idx++] = avg_mv_row; + features[f_idx++] = log_q_sq; + features[f_idx++] = max_abs_mv_col; + features[f_idx++] = max_abs_mv_row; + features[f_idx++] = max_log_sse; + features[f_idx++] = min_abs_mv_col; + features[f_idx++] = min_abs_mv_row; + features[f_idx++] = min_log_sse; + features[f_idx++] = var_log_sse; + features[f_idx++] = var_mv_col; + features[f_idx++] = var_mv_row; + + assert(f_idx == FEATURE_SIZE_MAX_MIN_PART_PRED); +} + +// Convert result index to block size. 
+// result idx block size +// 0 BLOCK_16X16 +// 1 BLOCK_32X32 +// 2 BLOCK_64X64 +// 3 BLOCK_128X128 +static BLOCK_SIZE get_block_size(int idx) { + return (BLOCK_SIZE)((idx + 2) * 3); +} + +BLOCK_SIZE av1_predict_max_partition(const AV1_COMP *const cpi, + const MACROBLOCK *const x, + const float *features) { + float scores[MAX_NUM_CLASSES_MAX_MIN_PART_PRED] = { 0.0f }; + const NN_CONFIG *nn_config = &av1_max_part_pred_nn_config; + + assert(cpi->sf.part_sf.auto_max_partition_based_on_simple_motion != + NOT_IN_USE); + + av1_nn_predict(features, nn_config, 1, scores); + + int result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1; + if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion == + DIRECT_PRED) { + result = 0; + float max_score = scores[0]; + for (int i = 1; i < MAX_NUM_CLASSES_MAX_MIN_PART_PRED; ++i) { + if (scores[i] > max_score) { + max_score = scores[i]; + result = i; + } + } + return get_block_size(result); + } + + float probs[MAX_NUM_CLASSES_MAX_MIN_PART_PRED] = { 0.0f }; + av1_nn_softmax(scores, probs, MAX_NUM_CLASSES_MAX_MIN_PART_PRED); + + if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion == + RELAXED_PRED) { + for (result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1; result >= 0; + --result) { + if (result < MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1) { + probs[result] += probs[result + 1]; + } + if (probs[result] > 0.2) break; + } + } else if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion == + ADAPT_PRED) { + const BLOCK_SIZE sb_size = cpi->common.seq_params->sb_size; + // TODO(debargha): x->source_variance is unavailable at this point, + // so compute. The redundant recomputation later can be removed. + const unsigned int source_variance = av1_get_perpixel_variance_facade( + cpi, &x->e_mbd, &x->plane[0].src, sb_size, AOM_PLANE_Y); + if (source_variance > 16) { + const double thresh = source_variance < 128 ? 0.05 : 0.1; + for (result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1; result >= 0; + --result) { + if (result < MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1) { + probs[result] += probs[result + 1]; + } + if (probs[result] > thresh) break; + } + } + } + + return get_block_size(result); +} + +// Get the minimum partition block width and height(in log scale) under a +// SIMPLE_MOTION_DATA_TREE. +static AOM_INLINE void get_min_bsize(const SIMPLE_MOTION_DATA_TREE *sms_tree, + int *min_bw, int *min_bh) { + if (!sms_tree) return; + + const BLOCK_SIZE bsize = sms_tree->block_size; + if (bsize == BLOCK_4X4) { + *min_bw = 0; + *min_bh = 0; + return; + } + + PARTITION_TYPE part_type = sms_tree->partitioning; + if (part_type == PARTITION_INVALID) return; + + if (part_type == PARTITION_SPLIT) { + for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) { + get_min_bsize(sms_tree->split[i], min_bw, min_bh); + } + } else { + if (part_type == PARTITION_HORZ_A || part_type == PARTITION_HORZ_B || + part_type == PARTITION_VERT_A || part_type == PARTITION_VERT_B) + part_type = PARTITION_SPLIT; + const BLOCK_SIZE subsize = get_partition_subsize(bsize, part_type); + if (subsize != BLOCK_INVALID) { + *min_bw = AOMMIN(*min_bw, mi_size_wide_log2[subsize]); + *min_bh = AOMMIN(*min_bh, mi_size_high_log2[subsize]); + } + } +} + +static INLINE void add_rd_feature(int64_t rd, int64_t best_rd, float *features, + int *feature_idx) { + const int rd_valid = rd > 0 && rd < INT64_MAX; + const float rd_ratio = rd_valid ? 
(float)rd / best_rd : 1.0f; + features[(*feature_idx)++] = (float)rd_valid; + features[(*feature_idx)++] = rd_ratio; +} + +#define FEATURES 31 +void av1_ml_early_term_after_split(AV1_COMP *const cpi, MACROBLOCK *const x, + SIMPLE_MOTION_DATA_TREE *const sms_tree, + int64_t best_rd, int64_t part_none_rd, + int64_t part_split_rd, + int64_t *split_block_rd, + PartitionSearchState *part_state) { + const PartitionBlkParams *blk_params = &part_state->part_blk_params; + const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col; + const BLOCK_SIZE bsize = blk_params->bsize; + + if (best_rd <= 0 || best_rd == INT64_MAX || + part_state->terminate_partition_search) + return; + + const AV1_COMMON *const cm = &cpi->common; + const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480; + const NN_CONFIG *nn_config = NULL; + float thresh = -1e6; + switch (bsize) { + case BLOCK_128X128: break; + case BLOCK_64X64: + nn_config = &av1_early_term_after_split_nnconfig_64; + thresh = is_480p_or_larger ? -2.0f : -1.2f; + break; + case BLOCK_32X32: + nn_config = &av1_early_term_after_split_nnconfig_32; + thresh = is_480p_or_larger ? -2.6f : -2.3f; + break; + case BLOCK_16X16: + nn_config = &av1_early_term_after_split_nnconfig_16; + thresh = is_480p_or_larger ? -2.0f : -2.4f; + break; + case BLOCK_8X8: + nn_config = &av1_early_term_after_split_nnconfig_8; + thresh = is_480p_or_larger ? -1.0f : -1.4f; + break; + case BLOCK_4X4: break; + default: + assert(0 && "Invalid block size in av1_ml_early_term_after_split()."); + break; + } + if (!nn_config) return; + + // Use more conservative threshold for level 1. + if (cpi->sf.part_sf.ml_early_term_after_part_split_level < 2) thresh -= 0.3f; + + const MACROBLOCKD *const xd = &x->e_mbd; + const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8); + const int bs = block_size_wide[bsize]; + int f_idx = 0; + float features[FEATURES] = { 0.0f }; + + features[f_idx++] = log1pf((float)dc_q / 4.0f); + features[f_idx++] = log1pf((float)best_rd / bs / bs / 1024.0f); + + add_rd_feature(part_none_rd, best_rd, features, &f_idx); + add_rd_feature(part_split_rd, best_rd, features, &f_idx); + + for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) { + add_rd_feature(split_block_rd[i], best_rd, features, &f_idx); + int min_bw = MAX_SB_SIZE_LOG2; + int min_bh = MAX_SB_SIZE_LOG2; + get_min_bsize(sms_tree->split[i], &min_bw, &min_bh); + features[f_idx++] = (float)min_bw; + features[f_idx++] = (float)min_bh; + } + + simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col, + bsize, NULL, + FEATURE_SMS_PRUNE_PART_FLAG); + + features[f_idx++] = log1pf((float)sms_tree->sms_none_feat[1]); + + features[f_idx++] = log1pf((float)sms_tree->split[0]->sms_none_feat[1]); + features[f_idx++] = log1pf((float)sms_tree->split[1]->sms_none_feat[1]); + features[f_idx++] = log1pf((float)sms_tree->split[2]->sms_none_feat[1]); + features[f_idx++] = log1pf((float)sms_tree->split[3]->sms_none_feat[1]); + + features[f_idx++] = log1pf((float)sms_tree->sms_rect_feat[1]); + features[f_idx++] = log1pf((float)sms_tree->sms_rect_feat[3]); + features[f_idx++] = log1pf((float)sms_tree->sms_rect_feat[5]); + features[f_idx++] = log1pf((float)sms_tree->sms_rect_feat[7]); + + assert(f_idx == FEATURES); + + // Write features to file + write_features_to_file(cpi->oxcf.partition_info_path, + cpi->ext_part_controller.test_mode, features, FEATURES, + 4, bsize, mi_row, mi_col); + + if (ext_ml_model_decision_after_split( + cpi, features, &part_state->terminate_partition_search)) { + return; + } 
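+  // If the external model has made the decision, we returned above; otherwise
+  // fall back to the built-in neural net and the per-bsize threshold chosen
+  // earlier.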
+ + float score = 0.0f; + av1_nn_predict(features, nn_config, 1, &score); + // Score is indicator of confidence that we should NOT terminate. + if (score < thresh) { + part_state->terminate_partition_search = 1; + } +} +#undef FEATURES + +void av1_ml_prune_rect_partition(AV1_COMP *const cpi, const MACROBLOCK *const x, + int64_t best_rd, int64_t none_rd, + const int64_t *split_rd, + PartitionSearchState *part_state) { + const PartitionBlkParams *blk_params = &part_state->part_blk_params; + const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col; + const BLOCK_SIZE bsize = blk_params->bsize; + + if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return; + best_rd = AOMMAX(best_rd, 1); + const NN_CONFIG *nn_config = NULL; + const float prob_thresholds[5] = { 0.01f, 0.01f, 0.004f, 0.002f, 0.002f }; + float cur_thresh = 0.0f; + switch (bsize) { + case BLOCK_8X8: + nn_config = &av1_rect_partition_nnconfig_8; + cur_thresh = prob_thresholds[0]; + break; + case BLOCK_16X16: + nn_config = &av1_rect_partition_nnconfig_16; + cur_thresh = prob_thresholds[1]; + break; + case BLOCK_32X32: + nn_config = &av1_rect_partition_nnconfig_32; + cur_thresh = prob_thresholds[2]; + break; + case BLOCK_64X64: + nn_config = &av1_rect_partition_nnconfig_64; + cur_thresh = prob_thresholds[3]; + break; + case BLOCK_128X128: + nn_config = &av1_rect_partition_nnconfig_128; + cur_thresh = prob_thresholds[4]; + break; + default: assert(0 && "Unexpected bsize."); + } + if (!nn_config) return; + + // 1. Compute input features + float features[9]; + + // RD cost ratios + for (int i = 0; i < 5; i++) features[i] = 1.0f; + if (none_rd > 0 && none_rd < 1000000000) + features[0] = (float)none_rd / (float)best_rd; + for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) { + if (split_rd[i] > 0 && split_rd[i] < 1000000000) + features[1 + i] = (float)split_rd[i] / (float)best_rd; + } + + // Variance ratios + const MACROBLOCKD *const xd = &x->e_mbd; + int whole_block_variance; + whole_block_variance = av1_get_perpixel_variance_facade( + cpi, xd, &x->plane[0].src, bsize, AOM_PLANE_Y); + whole_block_variance = AOMMAX(whole_block_variance, 1); + + int split_variance[SUB_PARTITIONS_SPLIT]; + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + struct buf_2d buf; + buf.stride = x->plane[0].src.stride; + const int bw = block_size_wide[bsize]; + for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) { + const int x_idx = (i & 1) * bw / 2; + const int y_idx = (i >> 1) * bw / 2; + buf.buf = x->plane[0].src.buf + x_idx + y_idx * buf.stride; + split_variance[i] = + av1_get_perpixel_variance_facade(cpi, xd, &buf, subsize, AOM_PLANE_Y); + } + + for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) + features[5 + i] = (float)split_variance[i] / (float)whole_block_variance; + + // Write features to file + write_features_to_file(cpi->oxcf.partition_info_path, + cpi->ext_part_controller.test_mode, features, + /*feature_size=*/9, 5, bsize, mi_row, mi_col); + + if (ext_ml_model_decision_after_split_part2( + &cpi->ext_part_controller, frame_is_intra_only(&cpi->common), + features, &part_state->prune_rect_part[HORZ], + &part_state->prune_rect_part[VERT])) { + return; + } + + // 2. 
Do the prediction and prune 0-2 partitions based on their probabilities.
+  float raw_scores[3] = { 0.0f };
+  av1_nn_predict(features, nn_config, 1, raw_scores);
+  float probs[3] = { 0.0f };
+  av1_nn_softmax(raw_scores, probs, 3);
+
+  // probs[0] is the probability that both rectangular partitions are worse
+  // than the current best_rd.
+  if (probs[1] <= cur_thresh) part_state->prune_rect_part[HORZ] = 1;
+  if (probs[2] <= cur_thresh) part_state->prune_rect_part[VERT] = 1;
+}
+
+// Use a ML model to predict if horz_a, horz_b, vert_a, and vert_b should be
+// considered.
+void av1_ml_prune_ab_partition(AV1_COMP *const cpi, int part_ctx, int var_ctx,
+                               int64_t best_rd,
+                               PartitionSearchState *part_state,
+                               int *ab_partitions_allowed) {
+  const PartitionBlkParams blk_params = part_state->part_blk_params;
+  const int mi_row = blk_params.mi_row;
+  const int mi_col = blk_params.mi_col;
+  const BLOCK_SIZE bsize = blk_params.bsize;
+
+  if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return;
+  const NN_CONFIG *nn_config = NULL;
+  switch (bsize) {
+    case BLOCK_8X8: nn_config = NULL; break;
+    case BLOCK_16X16: nn_config = &av1_ab_partition_nnconfig_16; break;
+    case BLOCK_32X32: nn_config = &av1_ab_partition_nnconfig_32; break;
+    case BLOCK_64X64: nn_config = &av1_ab_partition_nnconfig_64; break;
+    case BLOCK_128X128: nn_config = &av1_ab_partition_nnconfig_128; break;
+    default: assert(0 && "Unexpected bsize.");
+  }
+  if (!nn_config) return;
+
+  // Generate features.
+  float features[10];
+  int feature_index = 0;
+  features[feature_index++] = (float)part_ctx;
+  features[feature_index++] = (float)var_ctx;
+  const int rdcost = (int)AOMMIN(INT_MAX, best_rd);
+  int sub_block_rdcost[8] = { 0 };
+  int rd_index = 0;
+  for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+    const int64_t *horz_rd = part_state->rect_part_rd[HORZ];
+    if (horz_rd[i] > 0 && horz_rd[i] < 1000000000)
+      sub_block_rdcost[rd_index] = (int)horz_rd[i];
+    ++rd_index;
+  }
+  for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+    const int64_t *vert_rd = part_state->rect_part_rd[VERT];
+    if (vert_rd[i] > 0 && vert_rd[i] < 1000000000)
+      sub_block_rdcost[rd_index] = (int)vert_rd[i];
+    ++rd_index;
+  }
+  for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+    const int64_t *split_rd = part_state->split_rd;
+    if (split_rd[i] > 0 && split_rd[i] < 1000000000)
+      sub_block_rdcost[rd_index] = (int)split_rd[i];
+    ++rd_index;
+  }
+  for (int i = 0; i < 8; ++i) {
+    // Ratio between the sub-block RD and the whole-block RD.
+    float rd_ratio = 1.0f;
+    if (sub_block_rdcost[i] > 0 && sub_block_rdcost[i] < rdcost)
+      rd_ratio = (float)sub_block_rdcost[i] / (float)rdcost;
+    features[feature_index++] = rd_ratio;
+  }
+  assert(feature_index == 10);
+
+  // Write features to file
+  if (!frame_is_intra_only(&cpi->common)) {
+    write_features_to_file(cpi->oxcf.partition_info_path,
+                           cpi->ext_part_controller.test_mode, features,
+                           /*feature_size=*/10, 6, bsize, mi_row, mi_col);
+  }
+
+  if (ext_ml_model_decision_after_rect(
+          &cpi->ext_part_controller, frame_is_intra_only(&cpi->common),
+          features, &ab_partitions_allowed[HORZ_A],
+          &ab_partitions_allowed[HORZ_B], &ab_partitions_allowed[VERT_A],
+          &ab_partitions_allowed[VERT_B])) {
+    return;
+  }
+
+  // Calculate scores using the NN model.
+  float score[16] = { 0.0f };
+  av1_nn_predict(features, nn_config, 1, score);
+  int int_score[16];
+  int max_score = -1000;
+  for (int i = 0; i < 16; ++i) {
+    int_score[i] = (int)(100 * score[i]);
+    max_score = AOMMAX(int_score[i], max_score);
+  }
+
+  // Make decisions based on the model scores.
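+  // Each of the 16 classes encodes a 4-bit mask of (HORZ_A, HORZ_B, VERT_A,
+  // VERT_B); any class whose score lands within the per-bsize margin of the
+  // best class contributes its partitions to the allowed set.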
+ int thresh = max_score; + switch (bsize) { + case BLOCK_16X16: thresh -= 150; break; + case BLOCK_32X32: thresh -= 100; break; + default: break; + } + av1_zero_array(ab_partitions_allowed, NUM_AB_PARTS); + for (int i = 0; i < 16; ++i) { + if (int_score[i] >= thresh) { + if ((i >> 0) & 1) ab_partitions_allowed[HORZ_A] = 1; + if ((i >> 1) & 1) ab_partitions_allowed[HORZ_B] = 1; + if ((i >> 2) & 1) ab_partitions_allowed[VERT_A] = 1; + if ((i >> 3) & 1) ab_partitions_allowed[VERT_B] = 1; + } + } +} + +#define FEATURES 18 +#define LABELS 4 +// Use a ML model to predict if horz4 and vert4 should be considered. +void av1_ml_prune_4_partition(AV1_COMP *const cpi, MACROBLOCK *const x, + int part_ctx, int64_t best_rd, + PartitionSearchState *part_state, + int *part4_allowed, + unsigned int pb_source_variance) { + const PartitionBlkParams blk_params = part_state->part_blk_params; + const int mi_row = blk_params.mi_row; + const int mi_col = blk_params.mi_col; + const BLOCK_SIZE bsize = blk_params.bsize; + + int64_t(*rect_part_rd)[SUB_PARTITIONS_RECT] = part_state->rect_part_rd; + int64_t *split_rd = part_state->split_rd; + if (ext_ml_model_decision_after_part_ab( + cpi, x, bsize, part_ctx, best_rd, rect_part_rd, split_rd, + &part4_allowed[HORZ4], &part4_allowed[VERT4], pb_source_variance, + mi_row, mi_col)) + return; + + if (best_rd >= 1000000000) return; + int64_t *horz_rd = rect_part_rd[HORZ4]; + int64_t *vert_rd = rect_part_rd[VERT4]; + const NN_CONFIG *nn_config = NULL; + // 4-way partitions are only allowed for these three square block sizes. + switch (bsize) { + case BLOCK_16X16: nn_config = &av1_4_partition_nnconfig_16; break; + case BLOCK_32X32: nn_config = &av1_4_partition_nnconfig_32; break; + case BLOCK_64X64: nn_config = &av1_4_partition_nnconfig_64; break; + default: assert(0 && "Unexpected bsize."); + } + if (!nn_config) return; + + // Generate features. + float features[FEATURES]; + int feature_index = 0; + features[feature_index++] = (float)part_ctx; + features[feature_index++] = (float)get_unsigned_bits(pb_source_variance); + + const int rdcost = (int)AOMMIN(INT_MAX, best_rd); + int sub_block_rdcost[8] = { 0 }; + int rd_index = 0; + for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) { + if (horz_rd[i] > 0 && horz_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)horz_rd[i]; + ++rd_index; + } + for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) { + if (vert_rd[i] > 0 && vert_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)vert_rd[i]; + ++rd_index; + } + for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) { + if (split_rd[i] > 0 && split_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)split_rd[i]; + ++rd_index; + } + for (int i = 0; i < 8; ++i) { + // Ratio between the sub-block RD and the whole-block RD. + float rd_ratio = 1.0f; + if (sub_block_rdcost[i] > 0 && sub_block_rdcost[i] < rdcost) + rd_ratio = (float)sub_block_rdcost[i] / (float)rdcost; + features[feature_index++] = rd_ratio; + } + + // Get variance of the 1:4 and 4:1 sub-blocks. 
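+  // Each of the four horizontal strips is a quarter of the block height, and
+  // each vertical strip a quarter of the block width; their variances are
+  // expressed below as ratios to the whole-block variance, clamped to
+  // [0.1, 10.0].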
+ unsigned int horz_4_source_var[SUB_PARTITIONS_PART4] = { 0 }; + unsigned int vert_4_source_var[SUB_PARTITIONS_PART4] = { 0 }; + { + BLOCK_SIZE horz_4_bs = get_partition_subsize(bsize, PARTITION_HORZ_4); + BLOCK_SIZE vert_4_bs = get_partition_subsize(bsize, PARTITION_VERT_4); + + assert(horz_4_bs != BLOCK_INVALID); + assert(vert_4_bs != BLOCK_INVALID); + + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, + av1_num_planes(&cpi->common), bsize); + const int src_stride = x->plane[0].src.stride; + uint8_t *src = x->plane[0].src.buf; + const MACROBLOCKD *const xd = &x->e_mbd; + + struct buf_2d horz_4_src, vert_4_src; + horz_4_src.stride = src_stride; + vert_4_src.stride = src_stride; + + for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) { + horz_4_src.buf = src + i * block_size_high[horz_4_bs] * src_stride; + vert_4_src.buf = src + i * block_size_wide[vert_4_bs]; + + horz_4_source_var[i] = av1_get_perpixel_variance_facade( + cpi, xd, &horz_4_src, horz_4_bs, AOM_PLANE_Y); + vert_4_source_var[i] = av1_get_perpixel_variance_facade( + cpi, xd, &vert_4_src, vert_4_bs, AOM_PLANE_Y); + } + } + + const float denom = (float)(pb_source_variance + 1); + const float low_b = 0.1f; + const float high_b = 10.0f; + for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) { + // Ratio between the 4:1 sub-block variance and the whole-block variance. + float var_ratio = (float)(horz_4_source_var[i] + 1) / denom; + if (var_ratio < low_b) var_ratio = low_b; + if (var_ratio > high_b) var_ratio = high_b; + features[feature_index++] = var_ratio; + } + for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) { + // Ratio between the 1:4 sub-block RD and the whole-block RD. + float var_ratio = (float)(vert_4_source_var[i] + 1) / denom; + if (var_ratio < low_b) var_ratio = low_b; + if (var_ratio > high_b) var_ratio = high_b; + features[feature_index++] = var_ratio; + } + assert(feature_index == FEATURES); + + // Write features to file + if (!frame_is_intra_only(&cpi->common)) { + write_features_to_file(cpi->oxcf.partition_info_path, + cpi->ext_part_controller.test_mode, features, + FEATURES, 7, bsize, mi_row, mi_col); + } + + // Calculate scores using the NN model. + float score[LABELS] = { 0.0f }; + av1_nn_predict(features, nn_config, 1, score); + int int_score[LABELS]; + int max_score = -1000; + for (int i = 0; i < LABELS; ++i) { + int_score[i] = (int)(100 * score[i]); + max_score = AOMMAX(int_score[i], max_score); + } + + // Make decisions based on the model scores. 
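+  // Here each of the 4 classes encodes a 2-bit mask of (HORZ4, VERT4);
+  // classes scoring within the per-bsize margin of the best class re-enable
+  // the corresponding 4-way partitions.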
+ int thresh = max_score; + switch (bsize) { + case BLOCK_16X16: thresh -= 500; break; + case BLOCK_32X32: thresh -= 500; break; + case BLOCK_64X64: thresh -= 200; break; + default: break; + } + av1_zero_array(part4_allowed, NUM_PART4_TYPES); + for (int i = 0; i < LABELS; ++i) { + if (int_score[i] >= thresh) { + if ((i >> 0) & 1) part4_allowed[HORZ4] = 1; + if ((i >> 1) & 1) part4_allowed[VERT4] = 1; + } + } +} +#undef FEATURES +#undef LABELS + +#define FEATURES 4 +void av1_ml_predict_breakout(AV1_COMP *const cpi, const MACROBLOCK *const x, + const RD_STATS *const rd_stats, + unsigned int pb_source_variance, int bit_depth, + PartitionSearchState *part_state) { + const PartitionBlkParams *blk_params = &part_state->part_blk_params; + const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col; + const BLOCK_SIZE bsize = blk_params->bsize; + + const NN_CONFIG *nn_config = NULL; + int thresh = 0; + switch (bsize) { + case BLOCK_8X8: + nn_config = &av1_partition_breakout_nnconfig_8; + thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[0]; + break; + case BLOCK_16X16: + nn_config = &av1_partition_breakout_nnconfig_16; + thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[1]; + break; + case BLOCK_32X32: + nn_config = &av1_partition_breakout_nnconfig_32; + thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[2]; + break; + case BLOCK_64X64: + nn_config = &av1_partition_breakout_nnconfig_64; + thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[3]; + break; + case BLOCK_128X128: + nn_config = &av1_partition_breakout_nnconfig_128; + thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[4]; + break; + default: assert(0 && "Unexpected bsize."); + } + if (!nn_config || thresh < 0) return; + + const float ml_predict_breakout_thresh_scale[3] = { 1.15f, 1.05f, 1.0f }; + thresh = (int)((float)thresh * + ml_predict_breakout_thresh_scale + [cpi->sf.part_sf.ml_predict_breakout_level - 1]); + + // Generate feature values. + float features[FEATURES]; + int feature_index = 0; + + const int num_pels_log2 = num_pels_log2_lookup[bsize]; + float rate_f = (float)AOMMIN(rd_stats->rate, INT_MAX); + rate_f = ((float)x->rdmult / 128.0f / 512.0f / (float)(1 << num_pels_log2)) * + rate_f; + features[feature_index++] = rate_f; + + const float dist_f = + (float)(AOMMIN(rd_stats->dist, INT_MAX) >> num_pels_log2); + features[feature_index++] = dist_f; + + features[feature_index++] = (float)pb_source_variance; + + const int dc_q = (int)x->plane[0].dequant_QTX[0] >> (bit_depth - 8); + features[feature_index++] = (float)(dc_q * dc_q) / 256.0f; + assert(feature_index == FEATURES); + + // Write features to file + write_features_to_file(cpi->oxcf.partition_info_path, + cpi->ext_part_controller.test_mode, features, FEATURES, + 2, bsize, mi_row, mi_col); + + if (ext_ml_model_decision_after_none(&cpi->ext_part_controller, + frame_is_intra_only(&cpi->common), + features, &part_state->do_square_split, + &part_state->do_rectangular_split)) { + return; + } + + // Calculate score using the NN model. + float score = 0.0f; + av1_nn_predict(features, nn_config, 1, &score); + + // Make decision. 
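+  // A score clearing the (speed-feature scaled) threshold indicates the
+  // PARTITION_NONE result is already good enough, so both square split and
+  // rectangular partitions are skipped.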
+  if ((int)(score * 100) >= thresh) {
+    part_state->do_square_split = 0;
+    part_state->do_rectangular_split = 0;
+  }
+}
+#undef FEATURES
+
+void av1_prune_partitions_before_search(AV1_COMP *const cpi,
+                                        MACROBLOCK *const x,
+                                        SIMPLE_MOTION_DATA_TREE *const sms_tree,
+                                        PartitionSearchState *part_state) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+
+  const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+  const BLOCK_SIZE bsize = blk_params->bsize;
+
+  if (cpi->third_pass_ctx) {
+    int mi_row = blk_params->mi_row;
+    int mi_col = blk_params->mi_col;
+    double ratio_h, ratio_w;
+    av1_get_third_pass_ratio(cpi->third_pass_ctx, 0, cm->height, cm->width,
+                             &ratio_h, &ratio_w);
+    THIRD_PASS_MI_INFO *this_mi = av1_get_third_pass_mi(
+        cpi->third_pass_ctx, 0, mi_row, mi_col, ratio_h, ratio_w);
+    BLOCK_SIZE third_pass_bsize =
+        av1_get_third_pass_adjusted_blk_size(this_mi, ratio_h, ratio_w);
+    // Check the actual partition of this block in the second pass.
+    PARTITION_TYPE third_pass_part =
+        av1_third_pass_get_sb_part_type(cpi->third_pass_ctx, this_mi);
+
+    int is_edge = (mi_row + mi_size_high[bsize] >= cm->mi_params.mi_rows) ||
+                  (mi_col + mi_size_wide[bsize] >= cm->mi_params.mi_cols);
+
+    if (!is_edge && block_size_wide[bsize] >= 16) {
+      // If the second pass used a rectangular partition, do not search for
+      // rectangular partitions in the other direction.
+      if (third_pass_part != PARTITION_NONE) {
+        if (third_pass_part == PARTITION_HORZ ||
+            third_pass_part == PARTITION_HORZ_4 ||
+            third_pass_part == PARTITION_HORZ_A ||
+            third_pass_part == PARTITION_HORZ_B) {
+          part_state->partition_rect_allowed[VERT] = 0;
+        } else if (third_pass_part == PARTITION_VERT ||
+                   third_pass_part == PARTITION_VERT_4 ||
+                   third_pass_part == PARTITION_VERT_A ||
+                   third_pass_part == PARTITION_VERT_B) {
+          part_state->partition_rect_allowed[HORZ] = 0;
+        }
+      }
+
+      int minSize = AOMMIN(block_size_wide[third_pass_bsize],
+                           block_size_high[third_pass_bsize]);
+      int maxSize = AOMMAX(block_size_wide[third_pass_bsize],
+                           block_size_high[third_pass_bsize]);
+      if (block_size_wide[bsize] < minSize / 4) {
+        // Current partition is too small, just terminate.
+        part_state->terminate_partition_search = 1;
+        return;
+      } else if (block_size_wide[bsize] < minSize / 2) {
+        if (third_pass_part != PARTITION_NONE) {
+          // Current partition is very small, and the second pass used a
+          // rectangular partition. Terminate the search here.
+          part_state->terminate_partition_search = 1;
+          return;
+        } else {
+          // Partition is small, but we still evaluate this partition; only
+          // disable further splits.
+          // TODO(any): check why this is not covered by the termination for <
+          // minSize/4.
+          av1_disable_square_split_partition(part_state);
+          av1_disable_rect_partitions(part_state);
+          return;
+        }
+      } else if (block_size_wide[bsize] > maxSize) {
+        // Partition is larger than in the second pass. Only allow split.
+        av1_set_square_split_only(part_state);
+        return;
+      } else if (block_size_wide[bsize] >= minSize &&
+                 block_size_wide[bsize] <= maxSize) {
+        // Partition is within a range where it is very likely to find a good
+        // choice, so do not prune anything.
+        return;
+      }
+    }
+  }
+
+  // Prune rectangular partitions for larger blocks.
+ if (bsize > cpi->sf.part_sf.rect_partition_eval_thresh) { + part_state->do_rectangular_split = 0; + part_state->partition_rect_allowed[HORZ] = 0; + part_state->partition_rect_allowed[VERT] = 0; + } + + // Prune rectangular, AB and 4-way partition based on q index and block size + if (cpi->sf.part_sf.prune_rectangular_split_based_on_qidx == 1) { + if (bsize == BLOCK_8X8 && x->qindex < 35) + av1_disable_rect_partitions(part_state); + + } else if (cpi->sf.part_sf.prune_rectangular_split_based_on_qidx == 2) { + // Enumeration difference between two square partitions + const int sqr_bsize_step = BLOCK_32X32 - BLOCK_16X16; + int max_bsize = + BLOCK_32X32 - (x->qindex * 3 / QINDEX_RANGE) * sqr_bsize_step; + max_bsize = AOMMAX(max_bsize, BLOCK_4X4); + const BLOCK_SIZE max_prune_bsize = + (BLOCK_SIZE)AOMMIN(max_bsize, BLOCK_32X32); + + // Prune partition + // qidx 0 to 85: prune bsize below BLOCK_32X32 + // qidx 86 to 170: prune bsize below BLOCK_16X16 + // qidx 171 to 255: prune bsize below BLOCK_8X8 + if (bsize < max_prune_bsize) { + av1_disable_rect_partitions(part_state); + } + } + + if (cpi->sf.part_sf.prune_sub_8x8_partition_level && (bsize == BLOCK_8X8)) { + const MACROBLOCKD *const xd = &x->e_mbd; + int prune_sub_8x8; + if (cpi->sf.part_sf.prune_sub_8x8_partition_level == 2) { + prune_sub_8x8 = 1; + } else { + assert(cpi->sf.part_sf.prune_sub_8x8_partition_level == 1); + // Prune if both neighbors are available and either is > BLOCK_8X8 + prune_sub_8x8 = xd->left_available && xd->up_available && + (xd->left_mbmi->bsize > BLOCK_8X8 || + xd->above_mbmi->bsize > BLOCK_8X8); + } + if (prune_sub_8x8) { + av1_disable_all_splits(part_state); + } + } + + // A CNN-based speed feature pruning out either split or all non-split + // partition in INTRA frame coding. + const int try_intra_cnn_based_part_prune = + frame_is_intra_only(cm) && + cpi->sf.part_sf.intra_cnn_based_part_prune_level && + cm->seq_params->sb_size >= BLOCK_64X64 && bsize <= BLOCK_64X64 && + blk_params->bsize_at_least_8x8 && + av1_is_whole_blk_in_frame(blk_params, mi_params); + + if (try_intra_cnn_based_part_prune) { + av1_intra_mode_cnn_partition( + &cpi->common, x, x->part_search_info.quad_tree_idx, + cpi->sf.part_sf.intra_cnn_based_part_prune_level, part_state); + } + + // Use simple motion search to prune out split or non-split partitions. This + // must be done prior to PARTITION_SPLIT to propagate the initial mvs to a + // smaller blocksize. + const int try_split_only = + cpi->sf.part_sf.simple_motion_search_split && + part_state->do_square_split && blk_params->bsize_at_least_8x8 && + av1_is_whole_blk_in_frame(blk_params, mi_params) && + !frame_is_intra_only(cm) && !av1_superres_scaled(cm); + + if (try_split_only) { + av1_simple_motion_search_based_split(cpi, x, sms_tree, part_state); + } + + // Use simple motion search to prune out rectangular partition in some + // direction. The results are stored in prune_horz and prune_vert in order to + // bypass future related pruning checks if a pruning decision has been made. + + // We want to search at least one partition mode, so don't prune if NONE and + // SPLIT are disabled. + const int non_rect_part_allowed = + part_state->do_square_split || part_state->partition_none_allowed; + // Only run the model if the partitions are not already pruned. 
+  const int rect_part_allowed = part_state->do_rectangular_split &&
+                                ((part_state->partition_rect_allowed[HORZ] &&
+                                  !part_state->prune_rect_part[HORZ]) ||
+                                 (part_state->partition_rect_allowed[VERT] &&
+                                  !part_state->prune_rect_part[VERT]));
+
+  const int try_prune_rect = cpi->sf.part_sf.simple_motion_search_prune_rect &&
+                             !frame_is_intra_only(cm) &&
+                             non_rect_part_allowed && rect_part_allowed &&
+                             !av1_superres_scaled(cm);
+
+  if (try_prune_rect) {
+    av1_simple_motion_search_prune_rect(cpi, x, sms_tree, part_state);
+  }
+}
+
+#ifndef NDEBUG
+static AOM_INLINE int is_bsize_square(BLOCK_SIZE bsize) {
+  return block_size_wide[bsize] == block_size_high[bsize];
+}
+#endif  // NDEBUG
+
+void av1_prune_partitions_by_max_min_bsize(SuperBlockEnc *sb_enc,
+                                           PartitionSearchState *part_state) {
+  assert(is_bsize_square(sb_enc->max_partition_size));
+  assert(is_bsize_square(sb_enc->min_partition_size));
+  assert(sb_enc->min_partition_size <= sb_enc->max_partition_size);
+  const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+  const BLOCK_SIZE bsize = blk_params->bsize;
+  assert(is_bsize_square(bsize));
+  const int max_partition_size_1d = block_size_wide[sb_enc->max_partition_size];
+  const int min_partition_size_1d = block_size_wide[sb_enc->min_partition_size];
+  const int bsize_1d = block_size_wide[bsize];
+  assert(min_partition_size_1d <= max_partition_size_1d);
+  const int is_le_min_sq_part = bsize_1d <= min_partition_size_1d;
+  const int is_gt_max_sq_part = bsize_1d > max_partition_size_1d;
+  if (is_gt_max_sq_part) {
+    // If current block size is larger than max, only allow split.
+    av1_set_square_split_only(part_state);
+  } else if (is_le_min_sq_part) {
+    // If the current block size is less than or equal to the min, only allow
+    // PARTITION_NONE when the block is fully inside the picture; only allow
+    // split otherwise.
+    av1_disable_rect_partitions(part_state);
+
+    // Only disable square split when the current block is not at the picture
+    // boundary. Otherwise, inherit the square split flag from the previous
+    // logic.
+    if (av1_blk_has_rows_and_cols(blk_params)) {
+      part_state->do_square_split = 0;
+    }
+    part_state->partition_none_allowed = !(part_state->do_square_split);
+  }
+}
+
+// Decide whether to evaluate the AB partition specified by rect_part based on
+// split and HORZ/VERT info.
+int evaluate_ab_partition_based_on_split(
+    const PC_TREE *pc_tree, PARTITION_TYPE rect_part,
+    const RD_RECT_PART_WIN_INFO *rect_part_win_info, int qindex, int split_idx1,
+    int split_idx2) {
+  int num_win = 0;
+  // Threshold for the number of winners.
+  // Conservative pruning for high quantizers.
+  const int num_win_thresh = AOMMIN(3 * (2 * (MAXQ - qindex) / MAXQ), 3);
+  int sub_part_win =
+      (rect_part_win_info == NULL)    ? (pc_tree->partitioning == rect_part)
+      : (rect_part == PARTITION_HORZ) ? rect_part_win_info->rect_part_win[HORZ]
+                                      : rect_part_win_info->rect_part_win[VERT];
+  num_win += (sub_part_win) ? 1 : 0;
+  if (pc_tree->split[split_idx1]) {
+    num_win +=
+        (pc_tree->split[split_idx1]->partitioning == PARTITION_NONE) ? 1 : 0;
+  } else {
+    num_win += 1;
+  }
+  if (pc_tree->split[split_idx2]) {
+    num_win +=
+        (pc_tree->split[split_idx2]->partitioning == PARTITION_NONE) ?
1 : 0; + } else { + num_win += 1; + } + if (num_win < num_win_thresh) { + return 0; + } + return 1; +} + +void av1_prune_ab_partitions(AV1_COMP *cpi, const MACROBLOCK *x, + const PC_TREE *pc_tree, int pb_source_variance, + int64_t best_rdcost, + const RD_RECT_PART_WIN_INFO *rect_part_win_info, + bool ext_partition_allowed, + PartitionSearchState *part_state, + int *ab_partitions_allowed) { + int64_t *horz_rd = part_state->rect_part_rd[HORZ]; + int64_t *vert_rd = part_state->rect_part_rd[VERT]; + int64_t *split_rd = part_state->split_rd; + const PartitionCfg *const part_cfg = &cpi->oxcf.part_cfg; + // The standard AB partitions are allowed initially if ext-partition-types are + // allowed. + int horzab_partition_allowed = ext_partition_allowed && + part_cfg->enable_ab_partitions && + part_state->partition_rect_allowed[HORZ]; + int vertab_partition_allowed = ext_partition_allowed && + part_cfg->enable_ab_partitions && + part_state->partition_rect_allowed[VERT]; + + // Pruning: pruning out AB partitions on one main direction based on the + // current best partition and source variance. + if (cpi->sf.part_sf.prune_ext_partition_types_search_level) { + if (cpi->sf.part_sf.prune_ext_partition_types_search_level == 1) { + // TODO(debargha,huisu@google.com): may need to tune the threshold for + // pb_source_variance. + horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ || + (pc_tree->partitioning == PARTITION_NONE && + pb_source_variance < 32) || + pc_tree->partitioning == PARTITION_SPLIT); + vertab_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT || + (pc_tree->partitioning == PARTITION_NONE && + pb_source_variance < 32) || + pc_tree->partitioning == PARTITION_SPLIT); + } else { + horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ || + pc_tree->partitioning == PARTITION_SPLIT); + vertab_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT || + pc_tree->partitioning == PARTITION_SPLIT); + } + horz_rd[0] = (horz_rd[0] < INT64_MAX ? horz_rd[0] : 0); + horz_rd[1] = (horz_rd[1] < INT64_MAX ? horz_rd[1] : 0); + vert_rd[0] = (vert_rd[0] < INT64_MAX ? vert_rd[0] : 0); + vert_rd[1] = (vert_rd[1] < INT64_MAX ? vert_rd[1] : 0); + split_rd[0] = (split_rd[0] < INT64_MAX ? split_rd[0] : 0); + split_rd[1] = (split_rd[1] < INT64_MAX ? split_rd[1] : 0); + split_rd[2] = (split_rd[2] < INT64_MAX ? split_rd[2] : 0); + split_rd[3] = (split_rd[3] < INT64_MAX ? split_rd[3] : 0); + } + + // Pruning: pruning out horz_a or horz_b if the combined rdcost of its + // subblocks estimated from previous partitions is much higher than the best + // rd so far. + ab_partitions_allowed[HORZ_A] = horzab_partition_allowed; + ab_partitions_allowed[HORZ_B] = horzab_partition_allowed; + if (cpi->sf.part_sf.prune_ext_partition_types_search_level) { + const int64_t horz_a_rd = horz_rd[1] + split_rd[0] + split_rd[1]; + const int64_t horz_b_rd = horz_rd[0] + split_rd[2] + split_rd[3]; + switch (cpi->sf.part_sf.prune_ext_partition_types_search_level) { + case 1: + ab_partitions_allowed[HORZ_A] &= (horz_a_rd / 16 * 14 < best_rdcost); + ab_partitions_allowed[HORZ_B] &= (horz_b_rd / 16 * 14 < best_rdcost); + break; + case 2: + default: + ab_partitions_allowed[HORZ_A] &= (horz_a_rd / 16 * 15 < best_rdcost); + ab_partitions_allowed[HORZ_B] &= (horz_b_rd / 16 * 15 < best_rdcost); + break; + } + } + + // Pruning: pruning out vert_a or vert_b if the combined rdcost of its + // subblocks estimated from previous partitions is much higher than the best + // rd so far. 
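+  // As with HORZ_A/HORZ_B: the summed sub-block estimate prunes the partition
+  // unless 14/16 (level 1) or 15/16 (level 2 and up) of it still beats the
+  // best rd cost so far.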
+ ab_partitions_allowed[VERT_A] = vertab_partition_allowed; + ab_partitions_allowed[VERT_B] = vertab_partition_allowed; + if (cpi->sf.part_sf.prune_ext_partition_types_search_level) { + const int64_t vert_a_rd = vert_rd[1] + split_rd[0] + split_rd[2]; + const int64_t vert_b_rd = vert_rd[0] + split_rd[1] + split_rd[3]; + switch (cpi->sf.part_sf.prune_ext_partition_types_search_level) { + case 1: + ab_partitions_allowed[VERT_A] &= (vert_a_rd / 16 * 14 < best_rdcost); + ab_partitions_allowed[VERT_B] &= (vert_b_rd / 16 * 14 < best_rdcost); + break; + case 2: + default: + ab_partitions_allowed[VERT_A] &= (vert_a_rd / 16 * 15 < best_rdcost); + ab_partitions_allowed[VERT_B] &= (vert_b_rd / 16 * 15 < best_rdcost); + break; + } + } + + // Pruning: pruning out some ab partitions using a DNN taking rd costs of + // sub-blocks from previous basic partition types. + if (cpi->sf.part_sf.ml_prune_partition && ext_partition_allowed && + part_state->partition_rect_allowed[HORZ] && + part_state->partition_rect_allowed[VERT]) { + // TODO(huisu@google.com): x->source_variance may not be the current + // block's variance. The correct one to use is pb_source_variance. Need to + // re-train the model to fix it. + av1_ml_prune_ab_partition(cpi, pc_tree->partitioning, + get_unsigned_bits(x->source_variance), + best_rdcost, part_state, ab_partitions_allowed); + } + + // Pruning: pruning AB partitions based on the number of horz/vert wins + // in the current block and sub-blocks in PARTITION_SPLIT. + if (cpi->sf.part_sf.prune_ext_part_using_split_info >= 2 && + ab_partitions_allowed[HORZ_A]) { + ab_partitions_allowed[HORZ_A] &= evaluate_ab_partition_based_on_split( + pc_tree, PARTITION_HORZ, rect_part_win_info, x->qindex, 0, 1); + } + if (cpi->sf.part_sf.prune_ext_part_using_split_info >= 2 && + ab_partitions_allowed[HORZ_B]) { + ab_partitions_allowed[HORZ_B] &= evaluate_ab_partition_based_on_split( + pc_tree, PARTITION_HORZ, rect_part_win_info, x->qindex, 2, 3); + } + if (cpi->sf.part_sf.prune_ext_part_using_split_info >= 2 && + ab_partitions_allowed[VERT_A]) { + ab_partitions_allowed[VERT_A] &= evaluate_ab_partition_based_on_split( + pc_tree, PARTITION_VERT, rect_part_win_info, x->qindex, 0, 2); + } + if (cpi->sf.part_sf.prune_ext_part_using_split_info >= 2 && + ab_partitions_allowed[VERT_B]) { + ab_partitions_allowed[VERT_B] &= evaluate_ab_partition_based_on_split( + pc_tree, PARTITION_VERT, rect_part_win_info, x->qindex, 1, 3); + } +} + +// Prepare features for the external model. Specifically, features after +// ab partition is searched. +static void prepare_features_after_part_ab( + const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, + int part_ctx, int64_t best_rd, + int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT], + int64_t split_rd[SUB_PARTITIONS_SPLIT], unsigned int pb_source_variance, + int mi_row, int mi_col, aom_partition_features_t *const features) { + int64_t *horz_rd = rect_part_rd[HORZ]; + int64_t *vert_rd = rect_part_rd[VERT]; + + // Generate features. 
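+  // The 18 features are laid out as: [0] the partition context, [1] the
+  // number of bits needed to represent the whole-block source variance,
+  // [2..9] the eight sub-block/whole-block rd ratios (2 horz, 2 vert,
+  // 4 split), and [10..17] the 4:1 and 1:4 sub-block/whole-block variance
+  // ratios computed further below.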
+  int feature_index = 0;
+  features->after_part_ab.f[feature_index++] = (float)part_ctx;
+  features->after_part_ab.f[feature_index++] =
+      (float)get_unsigned_bits(pb_source_variance);
+
+  const int rdcost = (int)AOMMIN(INT_MAX, best_rd);
+  int sub_block_rdcost[8] = { 0 };
+  int rd_index = 0;
+  for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+    if (horz_rd[i] > 0 && horz_rd[i] < 1000000000)
+      sub_block_rdcost[rd_index] = (int)horz_rd[i];
+    ++rd_index;
+  }
+  for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+    if (vert_rd[i] > 0 && vert_rd[i] < 1000000000)
+      sub_block_rdcost[rd_index] = (int)vert_rd[i];
+    ++rd_index;
+  }
+  for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+    if (split_rd[i] > 0 && split_rd[i] < 1000000000)
+      sub_block_rdcost[rd_index] = (int)split_rd[i];
+    ++rd_index;
+  }
+  for (int i = 0; i < 8; ++i) {
+    // Ratio between the sub-block RD and the whole-block RD.
+    float rd_ratio = 1.0f;
+    if (sub_block_rdcost[i] > 0 && sub_block_rdcost[i] < rdcost)
+      rd_ratio = (float)sub_block_rdcost[i] / (float)rdcost;
+    features->after_part_ab.f[feature_index++] = rd_ratio;
+  }
+
+  // 4-way partitions are only allowed for these three square block sizes.
+  assert(bsize == BLOCK_16X16 || bsize == BLOCK_32X32 || bsize == BLOCK_64X64);
+
+  // Get variance of the 1:4 and 4:1 sub-blocks.
+  unsigned int horz_4_source_var[SUB_PARTITIONS_PART4] = { 0 };
+  unsigned int vert_4_source_var[SUB_PARTITIONS_PART4] = { 0 };
+  {
+    BLOCK_SIZE horz_4_bs = get_partition_subsize(bsize, PARTITION_HORZ_4);
+    BLOCK_SIZE vert_4_bs = get_partition_subsize(bsize, PARTITION_VERT_4);
+
+    assert(horz_4_bs != BLOCK_INVALID);
+    assert(vert_4_bs != BLOCK_INVALID);
+
+    av1_setup_src_planes(x, cpi->source, mi_row, mi_col,
+                         av1_num_planes(&cpi->common), bsize);
+    const int src_stride = x->plane[0].src.stride;
+    uint8_t *src = x->plane[0].src.buf;
+    const MACROBLOCKD *const xd = &x->e_mbd;
+
+    struct buf_2d horz_4_src, vert_4_src;
+    horz_4_src.stride = src_stride;
+    vert_4_src.stride = src_stride;
+
+    for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+      horz_4_src.buf = src + i * block_size_high[horz_4_bs] * src_stride;
+      vert_4_src.buf = src + i * block_size_wide[vert_4_bs];
+
+      horz_4_source_var[i] = av1_get_perpixel_variance_facade(
+          cpi, xd, &horz_4_src, horz_4_bs, AOM_PLANE_Y);
+      vert_4_source_var[i] = av1_get_perpixel_variance_facade(
+          cpi, xd, &vert_4_src, vert_4_bs, AOM_PLANE_Y);
+    }
+  }
+
+  const float denom = (float)(pb_source_variance + 1);
+  const float low_b = 0.1f;
+  const float high_b = 10.0f;
+  for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+    // Ratio between the 4:1 sub-block variance and the whole-block variance.
+    float var_ratio = (float)(horz_4_source_var[i] + 1) / denom;
+    if (var_ratio < low_b) var_ratio = low_b;
+    if (var_ratio > high_b) var_ratio = high_b;
+    features->after_part_ab.f[feature_index++] = var_ratio;
+  }
+  for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+    // Ratio between the 1:4 sub-block variance and the whole-block variance.
+    float var_ratio = (float)(vert_4_source_var[i] + 1) / denom;
+    if (var_ratio < low_b) var_ratio = low_b;
+    if (var_ratio > high_b) var_ratio = high_b;
+    features->after_part_ab.f[feature_index++] = var_ratio;
+  }
+  assert(feature_index == 18);
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions before partition none.
Specifically, these parameters: +// partition_none_allowed +// partition_horz_allowed +// partition_vert_allowed +// do_rectangular_split +// do_square_split +static bool ext_ml_model_decision_before_none( + AV1_COMP *cpi, const float features_from_motion[FEATURE_SIZE_SMS_SPLIT], + int *partition_none_allowed, int *partition_horz_allowed, + int *partition_vert_allowed, int *do_rectangular_split, + int *do_square_split) { + ExtPartController *const ext_part_controller = &cpi->ext_part_controller; + if (!ext_part_controller->ready) return false; + + // Setup features. + aom_partition_features_t features; + features.id = AOM_EXT_PART_FEATURE_BEFORE_NONE; + for (int i = 0; i < FEATURE_SIZE_SMS_SPLIT; ++i) { + features.before_part_none.f[i] = features_from_motion[i]; + } + + // Send necessary features to the external model. + av1_ext_part_send_features(ext_part_controller, &features); + + // Get partition decisions from the external model. + aom_partition_decision_t decision; + const bool valid_decision = + av1_ext_part_get_partition_decision(ext_part_controller, &decision); + if (!valid_decision) return false; + + // Populate decisions + *partition_none_allowed = decision.partition_none_allowed; + *partition_horz_allowed = decision.partition_rect_allowed[HORZ]; + *partition_vert_allowed = decision.partition_rect_allowed[VERT]; + *do_rectangular_split = decision.do_rectangular_split; + *do_square_split = decision.do_square_split; + + return true; +} + +// If the external partition model is used, we let it determine partition +// decisions before partition none. Specifically, these parameters: +// prune_horz +// prune_vert +static bool ext_ml_model_decision_before_none_part2( + AV1_COMP *cpi, + const float features_from_motion[FEATURE_SIZE_SMS_PRUNE_PART], + int *prune_horz, int *prune_vert) { + ExtPartController *const ext_part_controller = &cpi->ext_part_controller; + if (!ext_part_controller->ready) return false; + + // Setup features. + aom_partition_features_t features; + features.id = AOM_EXT_PART_FEATURE_BEFORE_NONE_PART2; + for (int i = 0; i < FEATURE_SIZE_SMS_PRUNE_PART; ++i) { + features.before_part_none.f_part2[i] = features_from_motion[i]; + } + + // Send necessary features to the external model. + av1_ext_part_send_features(ext_part_controller, &features); + + // Get partition decisions from the external model. + aom_partition_decision_t decision; + const bool valid_decision = + av1_ext_part_get_partition_decision(ext_part_controller, &decision); + if (!valid_decision) return false; + + // Populate decisions + *prune_horz = decision.prune_rect_part[HORZ]; + *prune_vert = decision.prune_rect_part[VERT]; + + return true; +} + +// If the external partition model is used, we let it determine partition +// decisions after none partition. Specifically, these parameters: +// do_square_split +// do_rectangular_split +bool ext_ml_model_decision_after_none( + ExtPartController *const ext_part_controller, const int is_intra_frame, + const float *const features_after_none, int *do_square_split, + int *do_rectangular_split) { + if (!ext_part_controller->ready || is_intra_frame) return false; + + // Setup features. + aom_partition_features_t features; + features.id = AOM_EXT_PART_FEATURE_AFTER_NONE; + for (int i = 0; i < 4; ++i) { + features.after_part_none.f[i] = features_after_none[i]; + } + + // Send necessary features to the external model. + av1_ext_part_send_features(ext_part_controller, &features); + + // Get partition decisions from the external model. 
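+  // As with the other hooks in this file, the exchange is a simple
+  // request/response: pack the stage-specific features into an
+  // aom_partition_features_t, send them with av1_ext_part_send_features(),
+  // and read back an aom_partition_decision_t. Returning false on a failed
+  // read leaves the decision to the encoder's built-in heuristics.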
+  aom_partition_decision_t decision;
+  const bool valid_decision =
+      av1_ext_part_get_partition_decision(ext_part_controller, &decision);
+  if (!valid_decision) return false;
+
+  // Populate decisions
+  *do_square_split = decision.do_square_split;
+  *do_rectangular_split = decision.do_rectangular_split;
+
+  return true;
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions after none partition. Specifically, this parameter:
+// terminate_partition_search
+bool ext_ml_model_decision_after_none_part2(
+    AV1_COMP *const cpi, const float *const features_terminate,
+    int *terminate_partition_search) {
+  AV1_COMMON *const cm = &cpi->common;
+  ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+  if (!ext_part_controller->ready || frame_is_intra_only(cm)) return false;
+
+  // Setup features.
+  aom_partition_features_t features;
+  features.id = AOM_EXT_PART_FEATURE_AFTER_NONE_PART2;
+  for (int i = 0; i < FEATURE_SIZE_SMS_TERM_NONE; ++i) {
+    features.after_part_none.f_terminate[i] = features_terminate[i];
+  }
+
+  // Send necessary features to the external model.
+  av1_ext_part_send_features(ext_part_controller, &features);
+
+  // Get partition decisions from the external model.
+  aom_partition_decision_t decision;
+  const bool valid_decision =
+      av1_ext_part_get_partition_decision(ext_part_controller, &decision);
+  if (!valid_decision) return false;
+
+  // Populate decisions
+  *terminate_partition_search = decision.terminate_partition_search;
+
+  return true;
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions after split partition. Specifically, this parameter:
+// terminate_partition_search
+bool ext_ml_model_decision_after_split(AV1_COMP *const cpi,
+                                       const float *const features_terminate,
+                                       int *terminate_partition_search) {
+  const AV1_COMMON *const cm = &cpi->common;
+  ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+  if (frame_is_intra_only(cm) || !cpi->ext_part_controller.ready) {
+    return false;
+  }
+
+  // Setup features.
+  aom_partition_features_t features;
+  features.id = AOM_EXT_PART_FEATURE_AFTER_SPLIT;
+  for (int i = 0; i < 31; ++i) {
+    features.after_part_split.f_terminate[i] = features_terminate[i];
+  }
+
+  // Send necessary features to the external model.
+  av1_ext_part_send_features(ext_part_controller, &features);
+
+  // Get partition decisions from the external model.
+  aom_partition_decision_t decision;
+  const bool valid_decision =
+      av1_ext_part_get_partition_decision(ext_part_controller, &decision);
+  if (!valid_decision) return false;
+
+  // Populate decisions
+  *terminate_partition_search = decision.terminate_partition_search;
+
+  return true;
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions after split partition. Specifically, these parameters:
+// prune_rect_part[HORZ]
+// prune_rect_part[VERT]
+bool ext_ml_model_decision_after_split_part2(
+    ExtPartController *const ext_part_controller, const int is_intra_frame,
+    const float *const features_prune, int *prune_rect_part_horz,
+    int *prune_rect_part_vert) {
+  if (is_intra_frame || !ext_part_controller->ready) {
+    return false;
+  }
+
+  // Setup features.
+  aom_partition_features_t features;
+  features.id = AOM_EXT_PART_FEATURE_AFTER_SPLIT_PART2;
+  for (int i = 0; i < 9; ++i) {
+    features.after_part_split.f_prune_rect[i] = features_prune[i];
+  }
+
+  // Send necessary features to the external model.
+ av1_ext_part_send_features(ext_part_controller, &features); + + // Get partition decisions from the external model. + aom_partition_decision_t decision; + const bool valid_decision = + av1_ext_part_get_partition_decision(ext_part_controller, &decision); + if (!valid_decision) return false; + + // Populate decisions + *prune_rect_part_horz = decision.prune_rect_part[0]; + *prune_rect_part_vert = decision.prune_rect_part[1]; + + return true; +} + +// If the external partition model is used, we let it determine partition +// decisions after rectangular partition. Specifically, these parameters: +// horza_partition_allowed +// horzb_partition_allowed +// verta_partition_allowed +// vertb_partition_allowed +static bool ext_ml_model_decision_after_rect( + ExtPartController *const ext_part_controller, const int is_intra_frame, + const float *const features_after_rect, int *horza_partition_allowed, + int *horzb_partition_allowed, int *verta_partition_allowed, + int *vertb_partition_allowed) { + if (is_intra_frame || !ext_part_controller->ready) return false; + + // Setup features. + aom_partition_features_t features; + features.id = AOM_EXT_PART_FEATURE_AFTER_RECT; + for (int i = 0; i < 10; ++i) { + features.after_part_rect.f[i] = features_after_rect[i]; + } + + // Send necessary features to the external model. + av1_ext_part_send_features(ext_part_controller, &features); + + // Get partition decisions from the external model. + aom_partition_decision_t decision; + const bool valid_decision = + av1_ext_part_get_partition_decision(ext_part_controller, &decision); + if (!valid_decision) return false; + + // Populate decisions + *horza_partition_allowed = decision.horza_partition_allowed; + *horzb_partition_allowed = decision.horzb_partition_allowed; + *verta_partition_allowed = decision.verta_partition_allowed; + *vertb_partition_allowed = decision.vertb_partition_allowed; + + return true; +} + +// If the external partition model is used, we let it determine partition +// decisions after AB partition. Specifically, these parameters: +// partition_vert4_allowed +// partition_horz4_allowed +static bool ext_ml_model_decision_after_part_ab( + AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int part_ctx, + int64_t best_rd, int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT], + int64_t split_rd[SUB_PARTITIONS_SPLIT], int *const partition_horz4_allowed, + int *const partition_vert4_allowed, unsigned int pb_source_variance, + int mi_row, int mi_col) { + const AV1_COMMON *const cm = &cpi->common; + ExtPartController *const ext_part_controller = &cpi->ext_part_controller; + + if (!frame_is_intra_only(cm) && ext_part_controller->ready) { + // Setup features. + aom_partition_features_t features; + features.id = AOM_EXT_PART_FEATURE_AFTER_AB; + prepare_features_after_part_ab(cpi, x, bsize, part_ctx, best_rd, + rect_part_rd, split_rd, pb_source_variance, + mi_row, mi_col, &features); + + // Send necessary features to the external model. + av1_ext_part_send_features(ext_part_controller, &features); + + // Get partition decisions from the external model. 
+ aom_partition_decision_t decision; + const bool valid_decision = + av1_ext_part_get_partition_decision(ext_part_controller, &decision); + if (!valid_decision) return false; + + // Populate decisions + *partition_horz4_allowed = decision.partition_horz4_allowed; + *partition_vert4_allowed = decision.partition_vert4_allowed; + + return true; + } + + return false; +} + +// This function resembles "av1_setup_sms_tree()" in context_tree.c +// with function signature change. +static SIMPLE_MOTION_DATA_TREE *setup_sms_tree( + AV1_COMP *const cpi, SIMPLE_MOTION_DATA_TREE *sms_tree) { + AV1_COMMON *const cm = &cpi->common; + const int stat_generation_stage = is_stat_generation_stage(cpi); + const int is_sb_size_128 = cm->seq_params->sb_size == BLOCK_128X128; + const int tree_nodes = + av1_get_pc_tree_nodes(is_sb_size_128, stat_generation_stage); + int sms_tree_index = 0; + SIMPLE_MOTION_DATA_TREE *this_sms; + int square_index = 1; + int nodes; + this_sms = &sms_tree[0]; + + if (!stat_generation_stage) { + const int leaf_factor = is_sb_size_128 ? 4 : 1; + const int leaf_nodes = 256 * leaf_factor; + + // Sets up all the leaf nodes in the tree. + for (sms_tree_index = 0; sms_tree_index < leaf_nodes; ++sms_tree_index) { + SIMPLE_MOTION_DATA_TREE *const tree = &sms_tree[sms_tree_index]; + tree->block_size = square[0]; + } + + // Each node has 4 leaf nodes, fill each block_size level of the tree + // from leafs to the root. + for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) { + for (int i = 0; i < nodes; ++i) { + SIMPLE_MOTION_DATA_TREE *const tree = &sms_tree[sms_tree_index]; + tree->block_size = square[square_index]; + for (int j = 0; j < 4; j++) tree->split[j] = this_sms++; + ++sms_tree_index; + } + ++square_index; + } + } else { + // Allocation for firstpass/LAP stage + // TODO(Mufaddal): refactor square_index to use a common block_size macro + // from firstpass.c + SIMPLE_MOTION_DATA_TREE *const tree = &sms_tree[sms_tree_index]; + square_index = 2; + tree->block_size = square[square_index]; + } + + // Set up the root node for the largest superblock size + return &sms_tree[tree_nodes - 1]; +} + +static void write_motion_feature_to_file( + const char *const path, const int sb_counter, const unsigned int *block_sse, + const unsigned int *block_var, const int num_blocks, const BLOCK_SIZE bsize, + const BLOCK_SIZE fixed_block_size, const int mi_row, const int mi_col) { + char filename[256]; + snprintf(filename, sizeof(filename), "%s/motion_search_feature_sb%d", path, + sb_counter); + FILE *pfile = fopen(filename, "w"); + fprintf(pfile, "%d,%d,%d,%d,%d\n", mi_row, mi_col, bsize, + block_size_wide[fixed_block_size], num_blocks); + for (int i = 0; i < num_blocks; ++i) { + fprintf(pfile, "%d", block_sse[i]); + if (i < num_blocks - 1) fprintf(pfile, ","); + } + fprintf(pfile, "\n"); + for (int i = 0; i < num_blocks; ++i) { + fprintf(pfile, "%d", block_var[i]); + if (i < num_blocks - 1) fprintf(pfile, ","); + } + fprintf(pfile, "\n"); + fclose(pfile); +} + +void av1_collect_motion_search_features_sb(AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, + const int mi_row, const int mi_col, + const BLOCK_SIZE bsize, + aom_partition_features_t *features) { + const AV1_COMMON *const cm = &cpi->common; + if (frame_is_intra_only(cm)) return; + + MACROBLOCK *const x = &td->mb; + const BLOCK_SIZE fixed_block_size = BLOCK_16X16; + const int col_step = mi_size_wide[fixed_block_size]; + const int row_step = mi_size_high[fixed_block_size]; + SIMPLE_MOTION_DATA_TREE *sms_tree = NULL; + const int 
stat_generation_stage = is_stat_generation_stage(cpi); + const int is_sb_size_128 = cm->seq_params->sb_size == BLOCK_128X128; + const int tree_nodes = + av1_get_pc_tree_nodes(is_sb_size_128, stat_generation_stage); + CHECK_MEM_ERROR(cm, sms_tree, aom_calloc(tree_nodes, sizeof(*sms_tree))); + SIMPLE_MOTION_DATA_TREE *sms_root = setup_sms_tree(cpi, sms_tree); + TileInfo *const tile_info = &tile_data->tile_info; + av1_set_offsets_without_segment_id(cpi, tile_info, x, mi_row, mi_col, bsize); + av1_init_simple_motion_search_mvs_for_sb(cpi, NULL, x, sms_root, mi_row, + mi_col); + av1_reset_simple_motion_tree_partition(sms_root, bsize); + const int ref_list[] = { cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME + : LAST_FRAME }; + const int mi_width = + AOMMIN(mi_size_wide[bsize], cm->mi_params.mi_cols - mi_col); + const int mi_height = + AOMMIN(mi_size_high[bsize], cm->mi_params.mi_rows - mi_row); + const int col_steps = (mi_width / col_step) + ((mi_width % col_step) > 0); + const int row_steps = (mi_height / row_step) + ((mi_height % row_step) > 0); + const int num_blocks = col_steps * row_steps; + unsigned int *block_sse = aom_calloc(num_blocks, sizeof(*block_sse)); + unsigned int *block_var = aom_calloc(num_blocks, sizeof(*block_var)); + if (!(block_sse && block_var)) { + aom_free(sms_tree); + aom_free(block_sse); + aom_free(block_var); + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Error allocating block_sse & block_var"); + } + int idx = 0; + + for (int row = mi_row; + row < AOMMIN(mi_row + mi_size_high[bsize], cm->mi_params.mi_rows); + row += row_step) { + for (int col = mi_col; + col < AOMMIN(mi_col + mi_size_wide[bsize], cm->mi_params.mi_cols); + col += col_step) { + simple_motion_search_get_best_ref( + cpi, x, sms_root, row, col, fixed_block_size, ref_list, + /*num_refs=*/1, /*use_subpixel=*/1, + /*save_mv=*/1, &block_sse[idx], &block_var[idx]); + ++idx; + } + } + if (features == NULL) { + write_motion_feature_to_file(cpi->oxcf.partition_info_path, cpi->sb_counter, + block_sse, block_var, idx, bsize, + fixed_block_size, mi_row, mi_col); + } else { + features->sb_features.motion_features.unit_length = + block_size_wide[fixed_block_size]; + features->sb_features.motion_features.num_units = idx; + for (int i = 0; i < idx; ++i) { + features->sb_features.motion_features.block_sse[i] = block_sse[i]; + features->sb_features.motion_features.block_var[i] = block_var[i]; + } + } + + aom_free(block_sse); + aom_free(block_var); + aom_free(sms_tree); +} + +void av1_prepare_motion_search_features_block( + AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, + const int mi_row, const int mi_col, const BLOCK_SIZE bsize, + const int valid_partition_types, unsigned int *block_sse, + unsigned int *block_var, unsigned int sub_block_sse[4], + unsigned int sub_block_var[4], unsigned int horz_block_sse[2], + unsigned int horz_block_var[2], unsigned int vert_block_sse[2], + unsigned int vert_block_var[2]) { + const AV1_COMMON *const cm = &cpi->common; + if (frame_is_intra_only(cm)) return; + MACROBLOCK *const x = &td->mb; + SIMPLE_MOTION_DATA_TREE *sms_tree = NULL; + const int stat_generation_stage = is_stat_generation_stage(cpi); + const int is_sb_size_128 = cm->seq_params->sb_size == BLOCK_128X128; + const int tree_nodes = + av1_get_pc_tree_nodes(is_sb_size_128, stat_generation_stage); + CHECK_MEM_ERROR(cm, sms_tree, aom_calloc(tree_nodes, sizeof(*sms_tree))); + SIMPLE_MOTION_DATA_TREE *sms_root = setup_sms_tree(cpi, sms_tree); + TileInfo *const tile_info = &tile_data->tile_info; + 
av1_set_offsets_without_segment_id(cpi, tile_info, x, mi_row, mi_col, bsize); + av1_reset_simple_motion_tree_partition(sms_root, bsize); + const int ref_list[] = { cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME + : LAST_FRAME }; + const int sub_mi_width = mi_size_wide[bsize] / 2; + const int sub_mi_height = sub_mi_width; + simple_motion_search_get_best_ref( + cpi, x, sms_root, mi_row, mi_col, bsize, ref_list, /*num_refs=*/1, + /*use_subpixel=*/1, /*save_mv=*/1, block_sse, block_var); + // Split to 4 sub blocks. + if (valid_partition_types & (1 << PARTITION_SPLIT)) { + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + for (int i = 0; i < 4; ++i) { + const int row = mi_row + (i >> 1) * sub_mi_height; + const int col = mi_col + (i & 1) * sub_mi_width; + simple_motion_search_get_best_ref(cpi, x, sms_root, row, col, subsize, + ref_list, /*num_refs=*/1, + /*use_subpixel=*/1, /*save_mv=*/1, + &sub_block_sse[i], &sub_block_var[i]); + } + } + // Horizontal split + if (valid_partition_types & (1 << PARTITION_HORZ)) { + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_HORZ); + for (int i = 0; i < 2; ++i) { + const int row = mi_row + (i & 1) * sub_mi_height; + const int col = mi_col; + simple_motion_search_get_best_ref(cpi, x, sms_root, row, col, subsize, + ref_list, /*num_refs=*/1, + /*use_subpixel=*/1, /*save_mv=*/1, + &horz_block_sse[i], &horz_block_var[i]); + } + } + // Vertical split + if (valid_partition_types & (1 << PARTITION_VERT)) { + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_VERT); + for (int i = 0; i < 2; ++i) { + const int row = mi_row; + const int col = mi_col + (i & 1) * sub_mi_width; + simple_motion_search_get_best_ref(cpi, x, sms_root, row, col, subsize, + ref_list, /*num_refs=*/1, + /*use_subpixel=*/1, /*save_mv=*/1, + &vert_block_sse[i], &vert_block_var[i]); + } + } + + aom_free(sms_tree); +} +#endif // !CONFIG_REALTIME_ONLY + +static INLINE void init_simple_motion_search_mvs( + SIMPLE_MOTION_DATA_TREE *sms_tree, const FULLPEL_MV *start_mvs) { + memcpy(sms_tree->start_mvs, start_mvs, sizeof(sms_tree->start_mvs)); + av1_zero(sms_tree->sms_none_feat); + av1_zero(sms_tree->sms_rect_feat); + av1_zero(sms_tree->sms_none_valid); + av1_zero(sms_tree->sms_rect_valid); + + if (sms_tree->block_size >= BLOCK_8X8) { + init_simple_motion_search_mvs(sms_tree->split[0], start_mvs); + init_simple_motion_search_mvs(sms_tree->split[1], start_mvs); + init_simple_motion_search_mvs(sms_tree->split[2], start_mvs); + init_simple_motion_search_mvs(sms_tree->split[3], start_mvs); + } +} + +void av1_init_simple_motion_search_mvs_for_sb(const AV1_COMP *cpi, + const TileInfo *tile_info, + MACROBLOCK *x, + SIMPLE_MOTION_DATA_TREE *sms_root, + int mi_row, int mi_col) { + // Use the NEARESTMV of the sb as the start mv + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + FULLPEL_MV ref_mvs[REF_FRAMES]; + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; + av1_zero(ref_mvs); + // If tile_info is NULL, assume that the offsets have already been set. + if (tile_info) { + av1_set_offsets_without_segment_id(cpi, tile_info, x, mi_row, mi_col, + sb_size); + } + + MB_MODE_INFO_EXT mbmi_ext; + const int ref_frame = + cpi->rc.is_src_frame_alt_ref ? 
ALTREF_FRAME : LAST_FRAME;
+  av1_find_mv_refs(cm, xd, xd->mi[0], ref_frame, mbmi_ext.ref_mv_count,
+                   xd->ref_mv_stack, xd->weight, NULL, mbmi_ext.global_mvs,
+                   mbmi_ext.mode_context);
+  if (mbmi_ext.ref_mv_count[ref_frame] > 0) {
+    ref_mvs[ref_frame] =
+        get_fullmv_from_mv(&xd->ref_mv_stack[ref_frame][0].this_mv.as_mv);
+  } else {
+    ref_mvs[ref_frame] =
+        get_fullmv_from_mv(&mbmi_ext.global_mvs[ref_frame].as_mv);
+  }
+
+  init_simple_motion_search_mvs(sms_root, ref_mvs);
+}
diff --git a/third_party/aom/av1/encoder/partition_strategy.h b/third_party/aom/av1/encoder/partition_strategy.h
new file mode 100644
index 0000000000..84683f5fd4
--- /dev/null
+++ b/third_party/aom/av1/encoder/partition_strategy.h
@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_PARTITION_STRATEGY_H_
+#define AOM_AV1_ENCODER_PARTITION_STRATEGY_H_
+
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodeframe_utils.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encoder.h"
+
+void av1_intra_mode_cnn_partition(const AV1_COMMON *const cm, MACROBLOCK *x,
+                                  int label_idx,
+                                  int intra_cnn_based_part_prune_level,
+                                  PartitionSearchState *part_state);
+
+// Performs a simple_motion_search with a single reference frame and extracts
+// the variance of residues. Then uses the features to determine whether we
+// want to go straight to splitting without trying PARTITION_NONE.
+void av1_simple_motion_search_based_split(AV1_COMP *const cpi, MACROBLOCK *x,
+                                          SIMPLE_MOTION_DATA_TREE *sms_tree,
+                                          PartitionSearchState *part_state);
+
+// Performs a simple_motion_search with two reference frames and extracts
+// the variance of residues. Then uses the features to determine whether we
+// want to prune some partitions.
+void av1_simple_motion_search_prune_rect(AV1_COMP *const cpi, MACROBLOCK *x,
+                                         SIMPLE_MOTION_DATA_TREE *sms_tree,
+                                         PartitionSearchState *part_state);
+
+#if !CONFIG_REALTIME_ONLY
+// Early terminates PARTITION_NONE using simple_motion_search features and the
+// rate, distortion, and rdcost of PARTITION_NONE. This is only called when:
+// - The frame is a show frame
+// - The frame is not intra only
+// - The current bsize is > BLOCK_8X8
+// - blk_row + blk_height/2 < total_rows and blk_col + blk_width/2 < total_cols
+void av1_simple_motion_search_early_term_none(AV1_COMP *const cpi,
+                                              MACROBLOCK *x,
+                                              SIMPLE_MOTION_DATA_TREE *sms_tree,
+                                              const RD_STATS *none_rdc,
+                                              PartitionSearchState *part_state);
+
+// Get the features for selecting the max and min partition size. Currently
+// this performs simple_motion_search on 16X16 subblocks of the current
+// superblock, and then extracts the statistics of sse and motion vectors as
+// features.
+void av1_get_max_min_partition_features(AV1_COMP *const cpi, MACROBLOCK *x,
+                                        int mi_row, int mi_col,
+                                        float *features);
+
+// Predict the maximum BLOCK_SIZE to be used to encode the current superblock.
+BLOCK_SIZE av1_predict_max_partition(const AV1_COMP *const cpi,
+                                     const MACROBLOCK *const x,
+                                     const float *features);
+
+// Attempts an early termination after PARTITION_SPLIT.
+void av1_ml_early_term_after_split(AV1_COMP *const cpi, MACROBLOCK *const x,
+                                   SIMPLE_MOTION_DATA_TREE *const sms_tree,
+                                   int64_t best_rd, int64_t part_none_rd,
+                                   int64_t part_split_rd,
+                                   int64_t *split_block_rd,
+                                   PartitionSearchState *part_state);
+
+// Use the rdcost ratio and source var ratio to prune PARTITION_HORZ and
+// PARTITION_VERT.
+// TODO(chiyotsai@google.com): Currently this model does not use q value and
+// has no information about rectangular partitions. Preliminary experiments
+// suggest that we can get better performance by adding in q_index and
+// rectangular sse/var from SMS. We should retrain and tune this model later.
+void av1_ml_prune_rect_partition(AV1_COMP *const cpi, const MACROBLOCK *const x,
+                                 int64_t best_rd, int64_t none_rd,
+                                 const int64_t *split_rd,
+                                 PartitionSearchState *part_state);
+
+// Use an ML model to predict if horz_a, horz_b, vert_a, and vert_b should be
+// considered.
+void av1_ml_prune_ab_partition(AV1_COMP *const cpi, int part_ctx, int var_ctx,
+                               int64_t best_rd,
+                               PartitionSearchState *part_state,
+                               int *ab_partitions_allowed);
+
+// Use an ML model to predict if horz4 and vert4 should be considered.
+void av1_ml_prune_4_partition(AV1_COMP *const cpi, MACROBLOCK *const x,
+                              int part_ctx, int64_t best_rd,
+                              PartitionSearchState *part_state,
+                              int *part4_allowed,
+                              unsigned int pb_source_variance);
+
+// ML-based partition search breakout after PARTITION_NONE.
+void av1_ml_predict_breakout(AV1_COMP *const cpi, const MACROBLOCK *const x,
+                             const RD_STATS *const rd_stats,
+                             unsigned int pb_source_variance, int bit_depth,
+                             PartitionSearchState *part_state);
+
+// The first round of partition pruning determined before any partition
+// has been tested. The decisions will be updated and passed back
+// to the partition search function.
+void av1_prune_partitions_before_search(AV1_COMP *const cpi,
+                                        MACROBLOCK *const x,
+                                        SIMPLE_MOTION_DATA_TREE *const sms_tree,
+                                        PartitionSearchState *part_state);
+
+// Prune out partitions that lead to coding block sizes outside the min and max
+// bsizes set by the encoder. Max and min square partition levels are defined as
+// the partition nodes that the recursive function rd_pick_partition() can
+// reach. To implement this: only PARTITION_NONE is allowed if the current node
+// equals min_partition_size, only PARTITION_SPLIT is allowed if the current
+// node exceeds max_partition_size.
+void av1_prune_partitions_by_max_min_bsize(SuperBlockEnc *sb_enc,
+                                           PartitionSearchState *part_state);
+
+// Prune out AB partitions based on rd decisions made from testing the
+// basic partitions.
+void av1_prune_ab_partitions(AV1_COMP *cpi, const MACROBLOCK *x, + const PC_TREE *pc_tree, int pb_source_variance, + int64_t best_rdcost, + const RD_RECT_PART_WIN_INFO *rect_part_win_info, + bool ext_partition_allowed, + PartitionSearchState *part_state, + int *ab_partitions_allowed); + +void av1_collect_motion_search_features_sb(AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, + const int mi_row, const int mi_col, + const BLOCK_SIZE bsize, + aom_partition_features_t *features); +void av1_prepare_motion_search_features_block( + AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, + const int mi_row, const int mi_col, const BLOCK_SIZE bsize, + const int valid_partition_types, unsigned int *block_sse, + unsigned int *block_var, unsigned int sub_block_sse[4], + unsigned int sub_block_var[4], unsigned int horz_block_sse[2], + unsigned int horz_block_var[2], unsigned int vert_block_sse[2], + unsigned int vert_block_var[2]); +#endif // !CONFIG_REALTIME_ONLY + +// A simplified version of set_offsets meant to be used for +// simple_motion_search. +static INLINE void set_offsets_for_motion_search(const AV1_COMP *const cpi, + MACROBLOCK *const x, + int mi_row, int mi_col, + BLOCK_SIZE bsize) { + const AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + const int mi_width = mi_size_wide[bsize]; + const int mi_height = mi_size_high[bsize]; + + set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd, + mi_row, mi_col); + + // Set up destination pointers. + av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0, + num_planes); + + // Set up limit values for MV components. + // Mv beyond the range do not produce new/different prediction block. + av1_set_mv_limits(mi_params, &x->mv_limits, mi_row, mi_col, mi_height, + mi_width, cpi->oxcf.border_in_pixels); + + set_plane_n4(xd, mi_width, mi_height, num_planes); + + xd->mi_row = mi_row; + xd->mi_col = mi_col; + + // Set up distance of MB to edge of frame in 1/8th pel units. + assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1))); + xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE); + xd->mb_to_bottom_edge = + GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE); + xd->mb_to_left_edge = -GET_MV_SUBPEL(mi_col * MI_SIZE); + xd->mb_to_right_edge = + GET_MV_SUBPEL((mi_params->mi_cols - mi_width - mi_col) * MI_SIZE); + + // Set up source buffers. + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize); +} + +void av1_init_simple_motion_search_mvs_for_sb(const AV1_COMP *cpi, + const TileInfo *tile_info, + MACROBLOCK *x, + SIMPLE_MOTION_DATA_TREE *sms_root, + int mi_row, int mi_col); + +static INLINE int is_full_sb(const CommonModeInfoParams *const mi_params, + int mi_row, int mi_col, BLOCK_SIZE sb_size) { + const int sb_mi_wide = mi_size_wide[sb_size]; + const int sb_mi_high = mi_size_high[sb_size]; + + return (mi_row + sb_mi_high) <= mi_params->mi_rows && + (mi_col + sb_mi_wide) <= mi_params->mi_cols; +} + +#if !CONFIG_REALTIME_ONLY +// Do not use this criteria for screen content videos. +// Since screen content videos could often find good predictors and the largest +// block size is likely to be used. 
+static INLINE int use_auto_max_partition(const AV1_COMP *const cpi, + BLOCK_SIZE sb_size, int mi_row, + int mi_col) { + assert(IMPLIES(cpi->ppi->gf_group.size > 0, + cpi->gf_frame_index < cpi->ppi->gf_group.size)); + const AV1_COMMON *const cm = &cpi->common; + return !frame_is_intra_only(cm) && !cpi->use_screen_content_tools && + cpi->sf.part_sf.auto_max_partition_based_on_simple_motion != + NOT_IN_USE && + sb_size == BLOCK_128X128 && + is_full_sb(&cm->mi_params, mi_row, mi_col, sb_size) && + cpi->ppi->gf_group.update_type[cpi->gf_frame_index] != + OVERLAY_UPDATE && + cpi->ppi->gf_group.update_type[cpi->gf_frame_index] != + INTNL_OVERLAY_UPDATE; +} + +static BLOCK_SIZE dim_to_size(int dim) { + switch (dim) { + case 4: return BLOCK_4X4; + case 8: return BLOCK_8X8; + case 16: return BLOCK_16X16; + case 32: return BLOCK_32X32; + case 64: return BLOCK_64X64; + case 128: return BLOCK_128X128; + default: assert(0); return 0; + } +} + +static AOM_INLINE void set_max_min_partition_size(SuperBlockEnc *sb_enc, + AV1_COMP *cpi, MACROBLOCK *x, + const SPEED_FEATURES *sf, + BLOCK_SIZE sb_size, + int mi_row, int mi_col) { + const AV1_COMMON *cm = &cpi->common; + + sb_enc->max_partition_size = + AOMMIN(sf->part_sf.default_max_partition_size, + dim_to_size(cpi->oxcf.part_cfg.max_partition_size)); + sb_enc->min_partition_size = + AOMMAX(sf->part_sf.default_min_partition_size, + dim_to_size(cpi->oxcf.part_cfg.min_partition_size)); + sb_enc->max_partition_size = + AOMMIN(sb_enc->max_partition_size, cm->seq_params->sb_size); + sb_enc->min_partition_size = + AOMMIN(sb_enc->min_partition_size, cm->seq_params->sb_size); + + if (use_auto_max_partition(cpi, sb_size, mi_row, mi_col)) { + float features[FEATURE_SIZE_MAX_MIN_PART_PRED] = { 0.0f }; + + av1_get_max_min_partition_features(cpi, x, mi_row, mi_col, features); + sb_enc->max_partition_size = + AOMMAX(AOMMIN(av1_predict_max_partition(cpi, x, features), + sb_enc->max_partition_size), + sb_enc->min_partition_size); + } +} +#endif // !CONFIG_REALTIME_ONLY +#endif // AOM_AV1_ENCODER_PARTITION_STRATEGY_H_ diff --git a/third_party/aom/av1/encoder/pass2_strategy.c b/third_party/aom/av1/encoder/pass2_strategy.c new file mode 100644 index 0000000000..a9442ffc1a --- /dev/null +++ b/third_party/aom/av1/encoder/pass2_strategy.c @@ -0,0 +1,4488 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*!\defgroup gf_group_algo Golden Frame Group + * \ingroup high_level_algo + * Algorithms regarding determining the length of GF groups and defining GF + * group structures. + * @{ + */ +/*! 
@} - end defgroup gf_group_algo */
+
+#include <math.h>
+#include <stdint.h>
+
+#include "aom_mem/aom_mem.h"
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_encoder.h"
+
+#include "av1/common/av1_common_int.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/gop_structure.h"
+#include "av1/encoder/pass2_strategy.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rc_utils.h"
+#include "av1/encoder/temporal_filter.h"
+#include "av1/encoder/thirdpass.h"
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/encode_strategy.h"
+
+#define DEFAULT_KF_BOOST 2300
+#define DEFAULT_GF_BOOST 2000
+#define GROUP_ADAPTIVE_MAXQ 1
+
+static void init_gf_stats(GF_GROUP_STATS *gf_stats);
+static int define_gf_group_pass3(AV1_COMP *cpi, EncodeFrameParams *frame_params,
+                                 int is_final_pass);
+
+// Calculate an active area of the image that discounts formatting
+// bars and partially discounts other 0 energy areas.
+#define MIN_ACTIVE_AREA 0.5
+#define MAX_ACTIVE_AREA 1.0
+static double calculate_active_area(const FRAME_INFO *frame_info,
+                                    const FIRSTPASS_STATS *this_frame) {
+  const double active_pct =
+      1.0 -
+      ((this_frame->intra_skip_pct / 2) +
+       ((this_frame->inactive_zone_rows * 2) / (double)frame_info->mb_rows));
+  return fclamp(active_pct, MIN_ACTIVE_AREA, MAX_ACTIVE_AREA);
+}
+
+// Calculate a modified error used in distributing bits between easier and
+// harder frames.
+#define ACT_AREA_CORRECTION 0.5
+static double calculate_modified_err_new(const FRAME_INFO *frame_info,
+                                         const FIRSTPASS_STATS *total_stats,
+                                         const FIRSTPASS_STATS *this_stats,
+                                         int vbrbias, double modified_error_min,
+                                         double modified_error_max) {
+  if (total_stats == NULL) {
+    return 0;
+  }
+  const double av_weight = total_stats->weight / total_stats->count;
+  const double av_err =
+      (total_stats->coded_error * av_weight) / total_stats->count;
+  double modified_error =
+      av_err * pow(this_stats->coded_error * this_stats->weight /
+                       DOUBLE_DIVIDE_CHECK(av_err),
+                   vbrbias / 100.0);
+
+  // Correction for active area. Frames with a reduced active area
+  // (eg due to formatting bars) have a higher error per mb for the
+  // remaining active MBs. The correction here assumes that coding
+  // 0.5N blocks of complexity 2X is a little easier than coding N
+  // blocks of complexity X.
+  modified_error *=
+      pow(calculate_active_area(frame_info, this_stats), ACT_AREA_CORRECTION);
+
+  return fclamp(modified_error, modified_error_min, modified_error_max);
+}
+
+static double calculate_modified_err(const FRAME_INFO *frame_info,
+                                     const TWO_PASS *twopass,
+                                     const AV1EncoderConfig *oxcf,
+                                     const FIRSTPASS_STATS *this_frame) {
+  const FIRSTPASS_STATS *total_stats = twopass->stats_buf_ctx->total_stats;
+  return calculate_modified_err_new(
+      frame_info, total_stats, this_frame, oxcf->rc_cfg.vbrbias,
+      twopass->modified_error_min, twopass->modified_error_max);
+}
+
+// Resets the first pass file to the given position using a relative seek from
+// the current position.
+static void reset_fpf_position(TWO_PASS_FRAME *p_frame, + const FIRSTPASS_STATS *position) { + p_frame->stats_in = position; +} + +static int input_stats(TWO_PASS *p, TWO_PASS_FRAME *p_frame, + FIRSTPASS_STATS *fps) { + if (p_frame->stats_in >= p->stats_buf_ctx->stats_in_end) return EOF; + + *fps = *p_frame->stats_in; + ++p_frame->stats_in; + return 1; +} + +static int input_stats_lap(TWO_PASS *p, TWO_PASS_FRAME *p_frame, + FIRSTPASS_STATS *fps) { + if (p_frame->stats_in >= p->stats_buf_ctx->stats_in_end) return EOF; + + *fps = *p_frame->stats_in; + /* Move old stats[0] out to accommodate for next frame stats */ + memmove(p->frame_stats_arr[0], p->frame_stats_arr[1], + (p->stats_buf_ctx->stats_in_end - p_frame->stats_in - 1) * + sizeof(FIRSTPASS_STATS)); + p->stats_buf_ctx->stats_in_end--; + return 1; +} + +// Read frame stats at an offset from the current position. +static const FIRSTPASS_STATS *read_frame_stats(const TWO_PASS *p, + const TWO_PASS_FRAME *p_frame, + int offset) { + if ((offset >= 0 && + p_frame->stats_in + offset >= p->stats_buf_ctx->stats_in_end) || + (offset < 0 && + p_frame->stats_in + offset < p->stats_buf_ctx->stats_in_start)) { + return NULL; + } + + return &p_frame->stats_in[offset]; +} + +// This function returns the maximum target rate per frame. +static int frame_max_bits(const RATE_CONTROL *rc, + const AV1EncoderConfig *oxcf) { + int64_t max_bits = ((int64_t)rc->avg_frame_bandwidth * + (int64_t)oxcf->rc_cfg.vbrmax_section) / + 100; + if (max_bits < 0) + max_bits = 0; + else if (max_bits > rc->max_frame_bandwidth) + max_bits = rc->max_frame_bandwidth; + + return (int)max_bits; +} + +static const double q_pow_term[(QINDEX_RANGE >> 5) + 1] = { 0.65, 0.70, 0.75, + 0.80, 0.85, 0.90, + 0.95, 0.95, 0.95 }; +#define ERR_DIVISOR 96.0 +static double calc_correction_factor(double err_per_mb, int q) { + const double error_term = err_per_mb / ERR_DIVISOR; + const int index = q >> 5; + // Adjustment to power term based on qindex + const double power_term = + q_pow_term[index] + + (((q_pow_term[index + 1] - q_pow_term[index]) * (q % 32)) / 32.0); + assert(error_term >= 0.0); + return fclamp(pow(error_term, power_term), 0.05, 5.0); +} + +// Based on history adjust expectations of bits per macroblock. +static void twopass_update_bpm_factor(AV1_COMP *cpi, int rate_err_tol) { + TWO_PASS *const twopass = &cpi->ppi->twopass; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + + // Based on recent history adjust expectations of bits per macroblock. 
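+  // Worked example: with rate_err_tol = 25, damp_fac = max(5.0, 2.5) = 5.0
+  // and adj_limit = max(0.2, (100 - 25) / 200.0) = 0.375, so bpm_factor is
+  // confined to [0.625, 1.375] and, when the damped path below is taken, only
+  // 1/5 of a GOP's measured rate error feeds back into the factor.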
+ double damp_fac = AOMMAX(5.0, rate_err_tol / 10.0); + double rate_err_factor = 1.0; + const double adj_limit = AOMMAX(0.2, (double)(100 - rate_err_tol) / 200.0); + const double min_fac = 1.0 - adj_limit; + const double max_fac = 1.0 + adj_limit; + + if (cpi->third_pass_ctx && cpi->third_pass_ctx->frame_info_count > 0) { + int64_t actual_bits = 0; + int64_t target_bits = 0; + double factor = 0.0; + int count = 0; + for (int i = 0; i < cpi->third_pass_ctx->frame_info_count; i++) { + actual_bits += cpi->third_pass_ctx->frame_info[i].actual_bits; + target_bits += cpi->third_pass_ctx->frame_info[i].bits_allocated; + factor += cpi->third_pass_ctx->frame_info[i].bpm_factor; + count++; + } + + if (count == 0) { + factor = 1.0; + } else { + factor /= (double)count; + } + + factor *= (double)actual_bits / DOUBLE_DIVIDE_CHECK((double)target_bits); + + if ((twopass->bpm_factor <= 1 && factor < twopass->bpm_factor) || + (twopass->bpm_factor >= 1 && factor > twopass->bpm_factor)) { + twopass->bpm_factor = factor; + twopass->bpm_factor = + AOMMAX(min_fac, AOMMIN(max_fac, twopass->bpm_factor)); + } + } + + int err_estimate = p_rc->rate_error_estimate; + int64_t bits_left = twopass->bits_left; + int64_t total_actual_bits = p_rc->total_actual_bits; + int64_t bits_off_target = p_rc->vbr_bits_off_target; + double rolling_arf_group_actual_bits = + (double)twopass->rolling_arf_group_actual_bits; + double rolling_arf_group_target_bits = + (double)twopass->rolling_arf_group_target_bits; + +#if CONFIG_FPMT_TEST + const int is_parallel_frame = + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 ? 1 : 0; + const int simulate_parallel_frame = + cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE + ? is_parallel_frame + : 0; + total_actual_bits = simulate_parallel_frame ? p_rc->temp_total_actual_bits + : p_rc->total_actual_bits; + bits_off_target = simulate_parallel_frame ? p_rc->temp_vbr_bits_off_target + : p_rc->vbr_bits_off_target; + bits_left = + simulate_parallel_frame ? p_rc->temp_bits_left : twopass->bits_left; + rolling_arf_group_target_bits = + (double)(simulate_parallel_frame + ? p_rc->temp_rolling_arf_group_target_bits + : twopass->rolling_arf_group_target_bits); + rolling_arf_group_actual_bits = + (double)(simulate_parallel_frame + ? p_rc->temp_rolling_arf_group_actual_bits + : twopass->rolling_arf_group_actual_bits); + err_estimate = simulate_parallel_frame ? p_rc->temp_rate_error_estimate + : p_rc->rate_error_estimate; +#endif + + if (p_rc->bits_off_target && total_actual_bits > 0) { + if (cpi->ppi->lap_enabled) { + rate_err_factor = rolling_arf_group_actual_bits / + DOUBLE_DIVIDE_CHECK(rolling_arf_group_target_bits); + } else { + rate_err_factor = 1.0 - ((double)(bits_off_target) / + AOMMAX(total_actual_bits, bits_left)); + } + + // Adjustment is damped if this is 1 pass with look ahead processing + // (as there are only ever a few frames of data) and for all but the first + // GOP in normal two pass. + if ((twopass->bpm_factor != 1.0) || cpi->ppi->lap_enabled) { + rate_err_factor = 1.0 + ((rate_err_factor - 1.0) / damp_fac); + } + rate_err_factor = AOMMAX(min_fac, AOMMIN(max_fac, rate_err_factor)); + } + + // Is the rate control trending in the right direction. Only make + // an adjustment if things are getting worse. 
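+  // In other words, a correction is applied only when its direction agrees
+  // with the sign of the running error estimate: a downward rate_err_factor
+  // needs err_estimate >= 0 and an upward one needs err_estimate <= 0, so a
+  // factor that would push an already-recovering rate further off target is
+  // dropped.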
+  if ((rate_err_factor < 1.0 && err_estimate >= 0) ||
+      (rate_err_factor > 1.0 && err_estimate <= 0)) {
+    twopass->bpm_factor *= rate_err_factor;
+    if (rate_err_tol >= 100) {
+      twopass->bpm_factor =
+          AOMMAX(min_fac, AOMMIN(max_fac, twopass->bpm_factor));
+    } else {
+      twopass->bpm_factor = AOMMAX(0.1, AOMMIN(10.0, twopass->bpm_factor));
+    }
+  }
+}
+
+static int qbpm_enumerator(int rate_err_tol) {
+  return 1200000 + ((300000 * AOMMIN(75, AOMMAX(rate_err_tol - 25, 0))) / 75);
+}
+
+// Similar to find_qindex_by_rate() function in ratectrl.c, but includes
+// calculation of a correction_factor.
+static int find_qindex_by_rate_with_correction(
+    int desired_bits_per_mb, aom_bit_depth_t bit_depth, double error_per_mb,
+    double group_weight_factor, int rate_err_tol, int best_qindex,
+    int worst_qindex) {
+  assert(best_qindex <= worst_qindex);
+  int low = best_qindex;
+  int high = worst_qindex;
+
+  while (low < high) {
+    const int mid = (low + high) >> 1;
+    const double mid_factor = calc_correction_factor(error_per_mb, mid);
+    const double q = av1_convert_qindex_to_q(mid, bit_depth);
+    const int enumerator = qbpm_enumerator(rate_err_tol);
+    const int mid_bits_per_mb =
+        (int)((enumerator * mid_factor * group_weight_factor) / q);
+
+    if (mid_bits_per_mb > desired_bits_per_mb) {
+      low = mid + 1;
+    } else {
+      high = mid;
+    }
+  }
+  return low;
+}
+
+/*!\brief Choose a target maximum Q for a group of frames
+ *
+ * \ingroup rate_control
+ *
+ * This function is used to estimate a suitable maximum Q for a
+ * group of frames. Initially it is called to get a crude estimate
+ * for the whole clip. It is then called for each ARF/GF group to get
+ * a revised estimate for that group.
+ *
+ * \param[in]    cpi                 Top-level encoder structure
+ * \param[in]    av_frame_err        The average per frame coded error score
+ *                                   for frames making up this section/group.
+ * \param[in]    inactive_zone       Used to mask off / ignore part of the
+ *                                   frame. The most common use case is where
+ *                                   a wide format video (e.g. 16:9) is
+ *                                   letter-boxed into a more square format.
+ *                                   Here we want to ignore the bands at the
+ *                                   top and bottom.
+ * \param[in]    av_target_bandwidth The target bits per frame
+ *
+ * \return The maximum Q for frames in the group.
+ */
+static int get_twopass_worst_quality(AV1_COMP *cpi, const double av_frame_err,
+                                     double inactive_zone,
+                                     int av_target_bandwidth) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+  inactive_zone = fclamp(inactive_zone, 0.0, 0.9999);
+
+  if (av_target_bandwidth <= 0) {
+    return rc->worst_quality;  // Highest value allowed
+  } else {
+    const int num_mbs = (oxcf->resize_cfg.resize_mode != RESIZE_NONE)
+                            ? cpi->initial_mbs
+                            : cpi->common.mi_params.MBs;
+    const int active_mbs = AOMMAX(1, num_mbs - (int)(num_mbs * inactive_zone));
+    const double av_err_per_mb = av_frame_err / (1.0 - inactive_zone);
+    const int target_norm_bits_per_mb =
+        (int)((uint64_t)av_target_bandwidth << BPER_MB_NORMBITS) / active_mbs;
+    int rate_err_tol = AOMMIN(rc_cfg->under_shoot_pct, rc_cfg->over_shoot_pct);
+
+    // Update bpm correction factor based on previous GOP rate error.
+    twopass_update_bpm_factor(cpi, rate_err_tol);
+
+    // Try and pick a max Q that will be high enough to encode the
+    // content at the given rate.
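+    // find_qindex_by_rate_with_correction() relies on the predicted
+    // bits-per-mb, enumerator * factor / q, falling as qindex rises: the
+    // binary search returns the lowest qindex whose prediction no longer
+    // exceeds target_norm_bits_per_mb, i.e. the mildest quantizer that
+    // should still hit the section's rate budget.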
+    int q = find_qindex_by_rate_with_correction(
+        target_norm_bits_per_mb, cpi->common.seq_params->bit_depth,
+        av_err_per_mb, cpi->ppi->twopass.bpm_factor, rate_err_tol,
+        rc->best_quality, rc->worst_quality);
+
+    // Restriction on active max q for constrained quality mode.
+    if (rc_cfg->mode == AOM_CQ) q = AOMMAX(q, rc_cfg->cq_level);
+    return q;
+  }
+}
+
+#define INTRA_PART 0.005
+#define DEFAULT_DECAY_LIMIT 0.75
+#define LOW_SR_DIFF_TRHESH 0.01
+#define NCOUNT_FRAME_II_THRESH 5.0
+#define LOW_CODED_ERR_PER_MB 0.01
+
+/* This function considers how the quality of prediction may be deteriorating
+ * with distance. It compares the coded error for the last frame and the
+ * second reference frame (usually two frames old) and also applies a factor
+ * based on the extent of INTRA coding.
+ *
+ * The decay factor is then used to reduce the contribution of frames further
+ * from the alt-ref or golden frame, to the frame boost calculation for that
+ * alt-ref or golden frame.
+ */
+static double get_sr_decay_rate(const FIRSTPASS_STATS *frame) {
+  double sr_diff = (frame->sr_coded_error - frame->coded_error);
+  double sr_decay = 1.0;
+  double modified_pct_inter;
+  double modified_pcnt_intra;
+
+  modified_pct_inter = frame->pcnt_inter;
+  if ((frame->coded_error > LOW_CODED_ERR_PER_MB) &&
+      ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) <
+       (double)NCOUNT_FRAME_II_THRESH)) {
+    modified_pct_inter = frame->pcnt_inter - frame->pcnt_neutral;
+  }
+  modified_pcnt_intra = 100 * (1.0 - modified_pct_inter);
+
+  if ((sr_diff > LOW_SR_DIFF_TRHESH)) {
+    double sr_diff_part = ((sr_diff * 0.25) / frame->intra_error);
+    sr_decay = 1.0 - sr_diff_part - (INTRA_PART * modified_pcnt_intra);
+  }
+  return AOMMAX(sr_decay, DEFAULT_DECAY_LIMIT);
+}
+
+// This function gives an estimate of how badly we believe the prediction
+// quality is decaying from frame to frame.
+static double get_zero_motion_factor(const FIRSTPASS_STATS *frame) {
+  const double zero_motion_pct = frame->pcnt_inter - frame->pcnt_motion;
+  double sr_decay = get_sr_decay_rate(frame);
+  return AOMMIN(sr_decay, zero_motion_pct);
+}
+
+#define DEFAULT_ZM_FACTOR 0.5
+static double get_prediction_decay_rate(const FIRSTPASS_STATS *frame_stats) {
+  const double sr_decay_rate = get_sr_decay_rate(frame_stats);
+  double zero_motion_factor =
+      DEFAULT_ZM_FACTOR * (frame_stats->pcnt_inter - frame_stats->pcnt_motion);
+
+  // Clamp value to range 0.0 to 1.0.
+  // This should happen anyway if input values are sensibly clamped but checked
+  // here just in case.
+  if (zero_motion_factor > 1.0)
+    zero_motion_factor = 1.0;
+  else if (zero_motion_factor < 0.0)
+    zero_motion_factor = 0.0;
+
+  return AOMMAX(zero_motion_factor,
+                (sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor)));
+}
+
+// Function to test for a condition where a complex transition is followed
+// by a static section. For example in slide shows where there is a fade
+// between slides. This is to help with more optimal kf and gf positioning.
+static int detect_transition_to_still(const FIRSTPASS_INFO *firstpass_info,
+                                      int next_stats_index,
+                                      const int min_gf_interval,
+                                      const int frame_interval,
+                                      const int still_interval,
+                                      const double loop_decay_rate,
+                                      const double last_decay_rate) {
+  // Break clause to detect very still sections after motion.
+  // For example a static image after a fade or other transition
+  // instead of a clean scene cut.
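+  // Concretely, the gate below requires: enough distance from the last GF
+  // (frame_interval > min_gf_interval), prediction that is now near-perfect
+  // (loop_decay_rate >= 0.999), and prediction that was still clearly
+  // decaying a frame earlier (last_decay_rate < 0.9), i.e. motion that has
+  // just settled into a static section.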
+ if (frame_interval > min_gf_interval && loop_decay_rate >= 0.999 && + last_decay_rate < 0.9) { + int stats_left = + av1_firstpass_info_future_count(firstpass_info, next_stats_index); + if (stats_left >= still_interval) { + int j; + // Look ahead a few frames to see if static condition persists... + for (j = 0; j < still_interval; ++j) { + const FIRSTPASS_STATS *stats = + av1_firstpass_info_peek(firstpass_info, next_stats_index + j); + if (stats->pcnt_inter - stats->pcnt_motion < 0.999) break; + } + // Only if it does do we signal a transition to still. + return j == still_interval; + } + } + return 0; +} + +// This function detects a flash through the high relative pcnt_second_ref +// score in the frame following a flash frame. The offset passed in should +// reflect this. +static int detect_flash(const TWO_PASS *twopass, + const TWO_PASS_FRAME *twopass_frame, const int offset) { + const FIRSTPASS_STATS *const next_frame = + read_frame_stats(twopass, twopass_frame, offset); + + // What we are looking for here is a situation where there is a + // brief break in prediction (such as a flash) but subsequent frames + // are reasonably well predicted by an earlier (pre flash) frame. + // The recovery after a flash is indicated by a high pcnt_second_ref + // compared to pcnt_inter. + return next_frame != NULL && + next_frame->pcnt_second_ref > next_frame->pcnt_inter && + next_frame->pcnt_second_ref >= 0.5; +} + +// Update the motion related elements to the GF arf boost calculation. +static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats, + GF_GROUP_STATS *gf_stats, double f_w, + double f_h) { + const double pct = stats->pcnt_motion; + + // Accumulate Motion In/Out of frame stats. + gf_stats->this_frame_mv_in_out = stats->mv_in_out_count * pct; + gf_stats->mv_in_out_accumulator += gf_stats->this_frame_mv_in_out; + gf_stats->abs_mv_in_out_accumulator += fabs(gf_stats->this_frame_mv_in_out); + + // Accumulate a measure of how uniform (or conversely how random) the motion + // field is (a ratio of abs(mv) / mv). + if (pct > 0.05) { + const double mvr_ratio = + fabs(stats->mvr_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVr)); + const double mvc_ratio = + fabs(stats->mvc_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVc)); + + gf_stats->mv_ratio_accumulator += + pct * + (mvr_ratio < stats->mvr_abs * f_h ? mvr_ratio : stats->mvr_abs * f_h); + gf_stats->mv_ratio_accumulator += + pct * + (mvc_ratio < stats->mvc_abs * f_w ? 
+ }
+}
+
+static void accumulate_this_frame_stats(const FIRSTPASS_STATS *stats,
+ const double mod_frame_err,
+ GF_GROUP_STATS *gf_stats) {
+ gf_stats->gf_group_err += mod_frame_err;
+#if GROUP_ADAPTIVE_MAXQ
+ gf_stats->gf_group_raw_error += stats->coded_error;
+#endif
+ gf_stats->gf_group_skip_pct += stats->intra_skip_pct;
+ gf_stats->gf_group_inactive_zone_rows += stats->inactive_zone_rows;
+}
+
+static void accumulate_next_frame_stats(const FIRSTPASS_STATS *stats,
+ const int flash_detected,
+ const int frames_since_key,
+ const int cur_idx,
+ GF_GROUP_STATS *gf_stats, int f_w,
+ int f_h) {
+ accumulate_frame_motion_stats(stats, gf_stats, f_w, f_h);
+ // sum up the metric values of current gf group
+ gf_stats->avg_sr_coded_error += stats->sr_coded_error;
+ gf_stats->avg_pcnt_second_ref += stats->pcnt_second_ref;
+ gf_stats->avg_new_mv_count += stats->new_mv_count;
+ gf_stats->avg_wavelet_energy += stats->frame_avg_wavelet_energy;
+ if (fabs(stats->raw_error_stdev) > 0.000001) {
+ gf_stats->non_zero_stdev_count++;
+ gf_stats->avg_raw_err_stdev += stats->raw_error_stdev;
+ }
+
+ // Accumulate the effect of prediction quality decay
+ if (!flash_detected) {
+ gf_stats->last_loop_decay_rate = gf_stats->loop_decay_rate;
+ gf_stats->loop_decay_rate = get_prediction_decay_rate(stats);
+
+ gf_stats->decay_accumulator =
+ gf_stats->decay_accumulator * gf_stats->loop_decay_rate;
+
+ // Monitor for static sections.
+ if ((frames_since_key + cur_idx - 1) > 1) {
+ gf_stats->zero_motion_accumulator = AOMMIN(
+ gf_stats->zero_motion_accumulator, get_zero_motion_factor(stats));
+ }
+ }
+}
+
+static void average_gf_stats(const int total_frame, GF_GROUP_STATS *gf_stats) {
+ if (total_frame) {
+ gf_stats->avg_sr_coded_error /= total_frame;
+ gf_stats->avg_pcnt_second_ref /= total_frame;
+ gf_stats->avg_new_mv_count /= total_frame;
+ gf_stats->avg_wavelet_energy /= total_frame;
+ }
+
+ if (gf_stats->non_zero_stdev_count)
+ gf_stats->avg_raw_err_stdev /= gf_stats->non_zero_stdev_count;
+}
+
+#define BOOST_FACTOR 12.5
+static double baseline_err_per_mb(const FRAME_INFO *frame_info) {
+ unsigned int screen_area = frame_info->frame_height * frame_info->frame_width;
+
+ // Use a different error per mb factor for calculating boost for
+ // different formats.
+ if (screen_area <= 640 * 360) {
+ return 500.0;
+ } else {
+ return 1000.0;
+ }
+}
+
+static double calc_frame_boost(const PRIMARY_RATE_CONTROL *p_rc,
+ const FRAME_INFO *frame_info,
+ const FIRSTPASS_STATS *this_frame,
+ double this_frame_mv_in_out, double max_boost) {
+ double frame_boost;
+ const double lq = av1_convert_qindex_to_q(p_rc->avg_frame_qindex[INTER_FRAME],
+ frame_info->bit_depth);
+ const double boost_q_correction = AOMMIN((0.5 + (lq * 0.015)), 1.5);
+ const double active_area = calculate_active_area(frame_info, this_frame);
+
+ // Underlying boost factor is based on inter error ratio.
+ frame_boost = AOMMAX(baseline_err_per_mb(frame_info) * active_area,
+ this_frame->intra_error * active_area) /
+ DOUBLE_DIVIDE_CHECK(this_frame->coded_error);
+ frame_boost = frame_boost * BOOST_FACTOR * boost_q_correction;
+
+ // Increase boost for frames where new data is coming into the frame
+ // (e.g., zoom out). Slightly reduce boost if there is a net balance of
+ // motion out of the frame (zoom in). The range for this_frame_mv_in_out
+ // is -1.0 to +1.0.
+ if (this_frame_mv_in_out > 0.0)
+ frame_boost += frame_boost * (this_frame_mv_in_out * 2.0);
+ // In the extreme case the boost is halved.
+ else
+ frame_boost += frame_boost * (this_frame_mv_in_out / 2.0);
+
+ return AOMMIN(frame_boost, max_boost * boost_q_correction);
+}
+
+static double calc_kf_frame_boost(const PRIMARY_RATE_CONTROL *p_rc,
+ const FRAME_INFO *frame_info,
+ const FIRSTPASS_STATS *this_frame,
+ double *sr_accumulator, double max_boost) {
+ double frame_boost;
+ const double lq = av1_convert_qindex_to_q(p_rc->avg_frame_qindex[INTER_FRAME],
+ frame_info->bit_depth);
+ const double boost_q_correction = AOMMIN((0.50 + (lq * 0.015)), 2.00);
+ const double active_area = calculate_active_area(frame_info, this_frame);
+
+ // Underlying boost factor is based on inter error ratio.
+ frame_boost = AOMMAX(baseline_err_per_mb(frame_info) * active_area,
+ this_frame->intra_error * active_area) /
+ DOUBLE_DIVIDE_CHECK(
+ (this_frame->coded_error + *sr_accumulator) * active_area);
+
+ // Update the accumulator for second ref error difference.
+ // This is intended to give an indication of how much the coded error is
+ // increasing over time.
+ *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error);
+ *sr_accumulator = AOMMAX(0.0, *sr_accumulator);
+
+ // Q correction and scaling
+ // The 40.0 value here is an experimentally derived baseline minimum.
+ // This value is in line with the minimum per frame boost in the alt_ref
+ // boost calculation.
+ frame_boost = ((frame_boost + 40.0) * boost_q_correction);
+
+ return AOMMIN(frame_boost, max_boost * boost_q_correction);
+}
+
+static int get_projected_gfu_boost(const PRIMARY_RATE_CONTROL *p_rc,
+ int gfu_boost, int frames_to_project,
+ int num_stats_used_for_gfu_boost) {
+ /*
+ * If frames_to_project is equal to num_stats_used_for_gfu_boost,
+ * it means that gfu_boost was calculated over frames_to_project to
+ * begin with (i.e., all stats required were available), hence return
+ * the original boost.
+ */
+ if (num_stats_used_for_gfu_boost >= frames_to_project) return gfu_boost;
+
+ double min_boost_factor = sqrt(p_rc->baseline_gf_interval);
+ // Get the current tpl factor (number of frames = frames_to_project).
+ double tpl_factor = av1_get_gfu_boost_projection_factor(
+ min_boost_factor, MAX_GFUBOOST_FACTOR, frames_to_project);
+ // Get the tpl factor when number of frames = num_stats_used_for_gfu_boost.
+ double tpl_factor_num_stats = av1_get_gfu_boost_projection_factor(
+ min_boost_factor, MAX_GFUBOOST_FACTOR, num_stats_used_for_gfu_boost);
+ int projected_gfu_boost =
+ (int)rint((tpl_factor * gfu_boost) / tpl_factor_num_stats);
+ return projected_gfu_boost;
+}
+
+#define GF_MAX_BOOST 90.0
+#define GF_MIN_BOOST 50
+#define MIN_DECAY_FACTOR 0.01
+int av1_calc_arf_boost(const TWO_PASS *twopass,
+ const TWO_PASS_FRAME *twopass_frame,
+ const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info,
+ int offset, int f_frames, int b_frames,
+ int *num_fpstats_used, int *num_fpstats_required,
+ int project_gfu_boost) {
+ int i;
+ GF_GROUP_STATS gf_stats;
+ init_gf_stats(&gf_stats);
+ double boost_score = (double)NORMAL_BOOST;
+ int arf_boost;
+ int flash_detected = 0;
+ if (num_fpstats_used) *num_fpstats_used = 0;
+
+ // Search forward from the proposed arf/next gf position.
+ for (i = 0; i < f_frames; ++i) {
+ const FIRSTPASS_STATS *this_frame =
+ read_frame_stats(twopass, twopass_frame, i + offset);
+ if (this_frame == NULL) break;
+
+ // Update the motion-related elements of the boost calculation.
+ accumulate_frame_motion_stats(this_frame, &gf_stats,
+ frame_info->frame_width,
+ frame_info->frame_height);
+
+ // We want to discount the flash frame itself and the recovery
+ // frame that follows as both will have poor scores.
+ flash_detected = detect_flash(twopass, twopass_frame, i + offset) ||
+ detect_flash(twopass, twopass_frame, i + offset + 1);
+
+ // Accumulate the effect of prediction quality decay.
+ if (!flash_detected) {
+ gf_stats.decay_accumulator *= get_prediction_decay_rate(this_frame);
+ gf_stats.decay_accumulator = gf_stats.decay_accumulator < MIN_DECAY_FACTOR
+ ? MIN_DECAY_FACTOR
+ : gf_stats.decay_accumulator;
+ }
+
+ boost_score +=
+ gf_stats.decay_accumulator *
+ calc_frame_boost(p_rc, frame_info, this_frame,
+ gf_stats.this_frame_mv_in_out, GF_MAX_BOOST);
+ if (num_fpstats_used) (*num_fpstats_used)++;
+ }
+
+ arf_boost = (int)boost_score;
+
+ // Reset for backward looking loop.
+ boost_score = 0.0;
+ init_gf_stats(&gf_stats);
+ // Search backward towards last gf position.
+ for (i = -1; i >= -b_frames; --i) {
+ const FIRSTPASS_STATS *this_frame =
+ read_frame_stats(twopass, twopass_frame, i + offset);
+ if (this_frame == NULL) break;
+
+ // Update the motion-related elements of the boost calculation.
+ accumulate_frame_motion_stats(this_frame, &gf_stats,
+ frame_info->frame_width,
+ frame_info->frame_height);
+
+ // We want to discount the flash frame itself and the recovery
+ // frame that follows as both will have poor scores.
+ flash_detected = detect_flash(twopass, twopass_frame, i + offset) ||
+ detect_flash(twopass, twopass_frame, i + offset + 1);
+
+ // Cumulative effect of prediction quality decay.
+ if (!flash_detected) {
+ gf_stats.decay_accumulator *= get_prediction_decay_rate(this_frame);
+ gf_stats.decay_accumulator = gf_stats.decay_accumulator < MIN_DECAY_FACTOR
+ ? MIN_DECAY_FACTOR
+ : gf_stats.decay_accumulator;
+ }
+
+ boost_score +=
+ gf_stats.decay_accumulator *
+ calc_frame_boost(p_rc, frame_info, this_frame,
+ gf_stats.this_frame_mv_in_out, GF_MAX_BOOST);
+ if (num_fpstats_used) (*num_fpstats_used)++;
+ }
+ arf_boost += (int)boost_score;
+
+ if (project_gfu_boost) {
+ assert(num_fpstats_required != NULL);
+ assert(num_fpstats_used != NULL);
+ *num_fpstats_required = f_frames + b_frames;
+ arf_boost = get_projected_gfu_boost(p_rc, arf_boost, *num_fpstats_required,
+ *num_fpstats_used);
+ }
+
+ if (arf_boost < ((b_frames + f_frames) * GF_MIN_BOOST))
+ arf_boost = ((b_frames + f_frames) * GF_MIN_BOOST);
+
+ return arf_boost;
+}
+
+// Calculate a section intra ratio used in setting max loop filter.
+static int calculate_section_intra_ratio(const FIRSTPASS_STATS *begin,
+ const FIRSTPASS_STATS *end,
+ int section_length) {
+ const FIRSTPASS_STATS *s = begin;
+ double intra_error = 0.0;
+ double coded_error = 0.0;
+ int i = 0;
+
+ while (s < end && i < section_length) {
+ intra_error += s->intra_error;
+ coded_error += s->coded_error;
+ ++s;
+ ++i;
+ }
+
+ return (int)(intra_error / DOUBLE_DIVIDE_CHECK(coded_error));
+}
+
+/*!\brief Calculates the bit target for this GF/ARF group
+ *
+ * \ingroup rate_control
+ *
+ * Calculates the total bits to allocate in this GF/ARF group.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] gf_group_err Cumulative coded error score for the
+ * frames making up this group.
+ *
+ * \return The target total number of bits for this GF/ARF group.
+ */ +static int64_t calculate_total_gf_group_bits(AV1_COMP *cpi, + double gf_group_err) { + const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const TWO_PASS *const twopass = &cpi->ppi->twopass; + const int max_bits = frame_max_bits(rc, &cpi->oxcf); + int64_t total_group_bits; + + // Calculate the bits to be allocated to the group as a whole. + if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0)) { + total_group_bits = (int64_t)(twopass->kf_group_bits * + (gf_group_err / twopass->kf_group_error_left)); + } else { + total_group_bits = 0; + } + + // Clamp odd edge cases. + total_group_bits = (total_group_bits < 0) ? 0 + : (total_group_bits > twopass->kf_group_bits) + ? twopass->kf_group_bits + : total_group_bits; + + // Clip based on user supplied data rate variability limit. + if (total_group_bits > (int64_t)max_bits * p_rc->baseline_gf_interval) + total_group_bits = (int64_t)max_bits * p_rc->baseline_gf_interval; + + return total_group_bits; +} + +// Calculate the number of bits to assign to boosted frames in a group. +static int calculate_boost_bits(int frame_count, int boost, + int64_t total_group_bits) { + int allocation_chunks; + + // return 0 for invalid inputs (could arise e.g. through rounding errors) + if (!boost || (total_group_bits <= 0)) return 0; + + if (frame_count <= 0) return (int)(AOMMIN(total_group_bits, INT_MAX)); + + allocation_chunks = (frame_count * 100) + boost; + + // Prevent overflow. + if (boost > 1023) { + int divisor = boost >> 10; + boost /= divisor; + allocation_chunks /= divisor; + } + + // Calculate the number of extra bits for use in the boosted frame or frames. + return AOMMAX((int)(((int64_t)boost * total_group_bits) / allocation_chunks), + 0); +} + +// Calculate the boost factor based on the number of bits assigned, i.e. the +// inverse of calculate_boost_bits(). +static int calculate_boost_factor(int frame_count, int bits, + int64_t total_group_bits) { + return (int)(100.0 * frame_count * bits / (total_group_bits - bits)); +} + +// Reduce the number of bits assigned to keyframe or arf if necessary, to +// prevent bitrate spikes that may break level constraints. +// frame_type: 0: keyframe; 1: arf. +static int adjust_boost_bits_for_target_level(const AV1_COMP *const cpi, + RATE_CONTROL *const rc, + int bits_assigned, + int64_t group_bits, + int frame_type) { + const AV1_COMMON *const cm = &cpi->common; + const SequenceHeader *const seq_params = cm->seq_params; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const int temporal_layer_id = cm->temporal_layer_id; + const int spatial_layer_id = cm->spatial_layer_id; + for (int index = 0; index < seq_params->operating_points_cnt_minus_1 + 1; + ++index) { + if (!is_in_operating_point(seq_params->operating_point_idc[index], + temporal_layer_id, spatial_layer_id)) { + continue; + } + + const AV1_LEVEL target_level = + cpi->ppi->level_params.target_seq_level_idx[index]; + if (target_level >= SEQ_LEVELS) continue; + + assert(is_valid_seq_level_idx(target_level)); + + const double level_bitrate_limit = av1_get_max_bitrate_for_level( + target_level, seq_params->tier[0], seq_params->profile); + const int target_bits_per_frame = + (int)(level_bitrate_limit / cpi->framerate); + if (frame_type == 0) { + // Maximum bits for keyframe is 8 times the target_bits_per_frame. 
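+ // For example (hypothetical numbers): a level capped at 24 Mbit/s and
+ // encoded at 30 fps gives target_bits_per_frame = 800000, so a keyframe
+ // may be assigned at most 6.4 Mbit before kf_boost is recomputed below
+ // to fit under that cap.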
+ const int level_enforced_max_kf_bits = target_bits_per_frame * 8;
+ if (bits_assigned > level_enforced_max_kf_bits) {
+ const int frames = rc->frames_to_key - 1;
+ p_rc->kf_boost = calculate_boost_factor(
+ frames, level_enforced_max_kf_bits, group_bits);
+ bits_assigned =
+ calculate_boost_bits(frames, p_rc->kf_boost, group_bits);
+ }
+ } else if (frame_type == 1) {
+ // Maximum bits for arf is 4 times the target_bits_per_frame.
+ const int level_enforced_max_arf_bits = target_bits_per_frame * 4;
+ if (bits_assigned > level_enforced_max_arf_bits) {
+ p_rc->gfu_boost =
+ calculate_boost_factor(p_rc->baseline_gf_interval,
+ level_enforced_max_arf_bits, group_bits);
+ bits_assigned = calculate_boost_bits(p_rc->baseline_gf_interval,
+ p_rc->gfu_boost, group_bits);
+ }
+ } else {
+ assert(0);
+ }
+ }
+
+ return bits_assigned;
+}
+
+// Allocate bits to each frame in a GF / ARF group.
+double layer_fraction[MAX_ARF_LAYERS + 1] = { 1.0, 0.70, 0.55, 0.60,
+ 0.60, 1.0, 1.0 };
+static void allocate_gf_group_bits(GF_GROUP *gf_group,
+ PRIMARY_RATE_CONTROL *const p_rc,
+ RATE_CONTROL *const rc,
+ int64_t gf_group_bits, int gf_arf_bits,
+ int key_frame, int use_arf) {
+ int64_t total_group_bits = gf_group_bits;
+ int base_frame_bits;
+ const int gf_group_size = gf_group->size;
+ int layer_frames[MAX_ARF_LAYERS + 1] = { 0 };
+
+ // For key frames the frame target rate is already set and it
+ // is also the golden frame.
+ // === [frame_index == 0] ===
+ int frame_index = !!key_frame;
+
+ // Subtract the extra bits set aside for ARF frames from the Group Total
+ if (use_arf) total_group_bits -= gf_arf_bits;
+
+ int num_frames =
+ AOMMAX(1, p_rc->baseline_gf_interval - (rc->frames_since_key == 0));
+ base_frame_bits = (int)(total_group_bits / num_frames);
+
+ // Check the number of frames in each layer in case we have a
+ // non-standard group length.
+ int max_arf_layer = gf_group->max_layer_depth - 1;
+ for (int idx = frame_index; idx < gf_group_size; ++idx) {
+ if ((gf_group->update_type[idx] == ARF_UPDATE) ||
+ (gf_group->update_type[idx] == INTNL_ARF_UPDATE)) {
+ layer_frames[gf_group->layer_depth[idx]]++;
+ }
+ }
+
+ // Allocate extra bits to each ARF layer
+ int i;
+ int layer_extra_bits[MAX_ARF_LAYERS + 1] = { 0 };
+ assert(max_arf_layer <= MAX_ARF_LAYERS);
+ for (i = 1; i <= max_arf_layer; ++i) {
+ double fraction = (i == max_arf_layer) ? 1.0 : layer_fraction[i];
+ layer_extra_bits[i] =
+ (int)((gf_arf_bits * fraction) / AOMMAX(1, layer_frames[i]));
+ gf_arf_bits -= (int)(gf_arf_bits * fraction);
+ }
+
+ // Now combine ARF layer and baseline bits to give total bits for each frame.
+ int arf_extra_bits;
+ for (int idx = frame_index; idx < gf_group_size; ++idx) {
+ switch (gf_group->update_type[idx]) {
+ case ARF_UPDATE:
+ case INTNL_ARF_UPDATE:
+ arf_extra_bits = layer_extra_bits[gf_group->layer_depth[idx]];
+ gf_group->bit_allocation[idx] = base_frame_bits + arf_extra_bits;
+ break;
+ case INTNL_OVERLAY_UPDATE:
+ case OVERLAY_UPDATE: gf_group->bit_allocation[idx] = 0; break;
+ default: gf_group->bit_allocation[idx] = base_frame_bits; break;
+ }
+ }
+
+ // Set the frame following the current GOP to a 0 bit allocation. For ARF
+ // groups, this next frame will be an overlay frame, which is the first
+ // frame in the next GOP. For a GF group, the next GOP will overwrite the
+ // rate allocation. Setting this frame to use 0 bits (out of the current
+ // GOP budget) simplifies the logic in reference frame management.
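+ // For illustration (hypothetical numbers): with a 10-frame group and
+ // 1,000,000 total_group_bits left after the ARF share is subtracted,
+ // every frame starts from a base of 100,000 bits, ARF frames add their
+ // layer's share of gf_arf_bits on top, and overlay frames receive 0.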
+ if (gf_group_size < MAX_STATIC_GF_GROUP_LENGTH)
+ gf_group->bit_allocation[gf_group_size] = 0;
+}
+
+// Returns true if KF group and GF group both are almost completely static.
+static INLINE int is_almost_static(double gf_zero_motion, int kf_zero_motion,
+ int is_lap_enabled) {
+ if (is_lap_enabled) {
+ /*
+ * When LAP is enabled kf_zero_motion is not reliable, so use a strict
+ * constraint on gf_zero_motion.
+ */
+ return (gf_zero_motion >= 0.999);
+ } else {
+ return (gf_zero_motion >= 0.995) &&
+ (kf_zero_motion >= STATIC_KF_GROUP_THRESH);
+ }
+}
+
+#define ARF_ABS_ZOOM_THRESH 4.4
+static INLINE int detect_gf_cut(AV1_COMP *cpi, int frame_index, int cur_start,
+ int flash_detected, int active_max_gf_interval,
+ int active_min_gf_interval,
+ GF_GROUP_STATS *gf_stats) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ AV1_COMMON *const cm = &cpi->common;
+ // Motion breakout threshold for loop below depends on image size.
+ const double mv_ratio_accumulator_thresh = (cm->height + cm->width) / 4.0;
+
+ if (!flash_detected) {
+ // Break clause to detect very still sections after motion. For example,
+ // a static image after a fade or other transition.
+
+ // TODO(angiebird): This is a temporary change, we will avoid using
+ // twopass_frame.stats_in in the follow-up CL
+ int index = (int)(cpi->twopass_frame.stats_in -
+ twopass->stats_buf_ctx->stats_in_start);
+ if (detect_transition_to_still(&twopass->firstpass_info, index,
+ rc->min_gf_interval, frame_index - cur_start,
+ 5, gf_stats->loop_decay_rate,
+ gf_stats->last_loop_decay_rate)) {
+ return 1;
+ }
+ }
+
+ // Some conditions to break out after the minimum interval.
+ if (frame_index - cur_start >= active_min_gf_interval &&
+ // If possible don't break very close to a kf
+ (rc->frames_to_key - frame_index >= rc->min_gf_interval) &&
+ ((frame_index - cur_start) & 0x01) && !flash_detected &&
+ (gf_stats->mv_ratio_accumulator > mv_ratio_accumulator_thresh ||
+ gf_stats->abs_mv_in_out_accumulator > ARF_ABS_ZOOM_THRESH)) {
+ return 1;
+ }
+
+ // If almost totally static, we will not use the max GF length later,
+ // so we can continue for more frames.
+ if (((frame_index - cur_start) >= active_max_gf_interval + 1) &&
+ !is_almost_static(gf_stats->zero_motion_accumulator,
+ twopass->kf_zeromotion_pct, cpi->ppi->lap_enabled)) {
+ return 1;
+ }
+ return 0;
+}
+
+static int is_shorter_gf_interval_better(
+ AV1_COMP *cpi, const EncodeFrameParams *frame_params) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ int gop_length_decision_method = cpi->sf.tpl_sf.gop_length_decision_method;
+ int shorten_gf_interval;
+
+ av1_tpl_preload_rc_estimate(cpi, frame_params);
+
+ if (gop_length_decision_method == 2) {
+ // GF group length is decided based on GF boost and tpl stats of ARFs from
+ // base layer, (base+1) layer.
+ shorten_gf_interval =
+ (p_rc->gfu_boost <
+ p_rc->num_stats_used_for_gfu_boost * GF_MIN_BOOST * 1.4) &&
+ !av1_tpl_setup_stats(cpi, 3, frame_params);
+ } else {
+ int do_complete_tpl = 1;
+ GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ int is_temporal_filter_enabled =
+ (rc->frames_since_key > 0 && gf_group->arf_index > -1);
+
+ if (gop_length_decision_method == 1) {
+ // Check if tpl stats of ARFs from base layer, (base+1) layer,
+ // (base+2) layer can decide the GF group length.
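+ // Reading the logic below: av1_tpl_setup_stats() returning 2 leaves the
+ // decision open and falls through to the complete tpl pass; a return of
+ // 0 requests a shorter GF interval and 1 keeps the current length.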
+ int gop_length_eval = av1_tpl_setup_stats(cpi, 2, frame_params);
+
+ if (gop_length_eval != 2) {
+ do_complete_tpl = 0;
+ shorten_gf_interval = !gop_length_eval;
+ }
+ }
+
+ if (do_complete_tpl) {
+ // Decide GF group length based on complete tpl stats.
+ shorten_gf_interval = !av1_tpl_setup_stats(cpi, 1, frame_params);
+ // Tpl stats are reused when the ARF is temporally filtered and the GF
+ // interval is not shortened.
+ if (is_temporal_filter_enabled && !shorten_gf_interval) {
+ cpi->skip_tpl_setup_stats = 1;
+#if CONFIG_BITRATE_ACCURACY && !CONFIG_THREE_PASS
+ assert(cpi->gf_frame_index == 0);
+ av1_vbr_rc_update_q_index_list(&cpi->vbr_rc_info, &cpi->ppi->tpl_data,
+ gf_group,
+ cpi->common.seq_params->bit_depth);
+#endif // CONFIG_BITRATE_ACCURACY
+ }
+ }
+ }
+ return shorten_gf_interval;
+}
+
+#define MIN_SHRINK_LEN 6 // the minimum length of gf if we are shrinking
+#define SMOOTH_FILT_LEN 7
+#define HALF_FILT_LEN (SMOOTH_FILT_LEN / 2)
+#define WINDOW_SIZE 7
+#define HALF_WIN (WINDOW_SIZE / 2)
+// A 7-tap Gaussian smoothing filter
+const double smooth_filt[SMOOTH_FILT_LEN] = { 0.006, 0.061, 0.242, 0.383,
+ 0.242, 0.061, 0.006 };
+
+// Smooth filter intra_error and coded_error in firstpass stats.
+// If stats[i].is_flash==1, the ith element should not be used in the filtering.
+static void smooth_filter_stats(const FIRSTPASS_STATS *stats, int start_idx,
+ int last_idx, double *filt_intra_err,
+ double *filt_coded_err) {
+ int i, j;
+ for (i = start_idx; i <= last_idx; i++) {
+ double total_wt = 0;
+ for (j = -HALF_FILT_LEN; j <= HALF_FILT_LEN; j++) {
+ int idx = AOMMIN(AOMMAX(i + j, start_idx), last_idx);
+ if (stats[idx].is_flash) continue;
+
+ filt_intra_err[i] +=
+ smooth_filt[j + HALF_FILT_LEN] * stats[idx].intra_error;
+ total_wt += smooth_filt[j + HALF_FILT_LEN];
+ }
+ if (total_wt > 0.01) {
+ filt_intra_err[i] /= total_wt;
+ } else {
+ filt_intra_err[i] = stats[i].intra_error;
+ }
+ }
+ for (i = start_idx; i <= last_idx; i++) {
+ double total_wt = 0;
+ for (j = -HALF_FILT_LEN; j <= HALF_FILT_LEN; j++) {
+ int idx = AOMMIN(AOMMAX(i + j, start_idx), last_idx);
+ // Coded error involves idx and idx - 1.
+ if (stats[idx].is_flash || (idx > 0 && stats[idx - 1].is_flash)) continue;
+
+ filt_coded_err[i] +=
+ smooth_filt[j + HALF_FILT_LEN] * stats[idx].coded_error;
+ total_wt += smooth_filt[j + HALF_FILT_LEN];
+ }
+ if (total_wt > 0.01) {
+ filt_coded_err[i] /= total_wt;
+ } else {
+ filt_coded_err[i] = stats[i].coded_error;
+ }
+ }
+}
+
+// Calculate gradient
+static void get_gradient(const double *values, int start, int last,
+ double *grad) {
+ if (start == last) {
+ grad[start] = 0;
+ return;
+ }
+ for (int i = start; i <= last; i++) {
+ int prev = AOMMAX(i - 1, start);
+ int next = AOMMIN(i + 1, last);
+ grad[i] = (values[next] - values[prev]) / (next - prev);
+ }
+}
+
+static int find_next_scenecut(const FIRSTPASS_STATS *const stats_start,
+ int first, int last) {
+ // Identify unstable areas caused by scenecuts.
+ // Compare each frame's coded/intra error ratio against the maximum ratios
+ // found in the frames just before and after it. A single frame whose ratio
+ // and coded error tower over both neighborhoods is likely a scenecut.
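+ // In particular, a frame is flagged when its ratio reaches twice the
+ // largest ratio within HALF_WIN (3) frames on either side, or its coded
+ // error reaches twice the largest neighboring coded error, unless the
+ // second-reference error pattern suggests the spike is only a flash.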
+ double this_ratio, max_prev_ratio, max_next_ratio, max_prev_coded,
+ max_next_coded;
+
+ if (last - first == 0) return -1;
+
+ for (int i = first; i <= last; i++) {
+ if (stats_start[i].is_flash || (i > 0 && stats_start[i - 1].is_flash))
+ continue;
+ double temp_intra = AOMMAX(stats_start[i].intra_error, 0.01);
+ this_ratio = stats_start[i].coded_error / temp_intra;
+ // find the max ratio in the preceding neighborhood
+ max_prev_ratio = 0;
+ max_prev_coded = 0;
+ for (int j = AOMMAX(first, i - HALF_WIN); j < i; j++) {
+ if (stats_start[j].is_flash || (j > 0 && stats_start[j - 1].is_flash))
+ continue;
+ temp_intra = AOMMAX(stats_start[j].intra_error, 0.01);
+ double temp_ratio = stats_start[j].coded_error / temp_intra;
+ if (temp_ratio > max_prev_ratio) {
+ max_prev_ratio = temp_ratio;
+ }
+ if (stats_start[j].coded_error > max_prev_coded) {
+ max_prev_coded = stats_start[j].coded_error;
+ }
+ }
+ // find the max ratio in the following neighborhood
+ max_next_ratio = 0;
+ max_next_coded = 0;
+ for (int j = i + 1; j <= AOMMIN(i + HALF_WIN, last); j++) {
+ if (stats_start[j].is_flash || (j > 0 && stats_start[j - 1].is_flash))
+ continue;
+ temp_intra = AOMMAX(stats_start[j].intra_error, 0.01);
+ double temp_ratio = stats_start[j].coded_error / temp_intra;
+ if (temp_ratio > max_next_ratio) {
+ max_next_ratio = temp_ratio;
+ }
+ if (stats_start[j].coded_error > max_next_coded) {
+ max_next_coded = stats_start[j].coded_error;
+ }
+ }
+
+ if (max_prev_ratio < 0.001 && max_next_ratio < 0.001) {
+ // the ratios are very small, only check a small fixed threshold
+ if (this_ratio < 0.02) continue;
+ } else {
+ // check if this frame has a larger ratio than the neighborhood
+ double max_sr = stats_start[i].sr_coded_error;
+ if (i < last) max_sr = AOMMAX(max_sr, stats_start[i + 1].sr_coded_error);
+ double max_sr_fr_ratio =
+ max_sr / AOMMAX(stats_start[i].coded_error, 0.01);
+
+ if (max_sr_fr_ratio > 1.2) continue;
+ if (this_ratio < 2 * AOMMAX(max_prev_ratio, max_next_ratio) &&
+ stats_start[i].coded_error <
+ 2 * AOMMAX(max_prev_coded, max_next_coded)) {
+ continue;
+ }
+ }
+ return i;
+ }
+ return -1;
+}
+
+// Remove the region with index next_region.
+// parameter merge: 0: merge with previous; 1: merge with next; 2:
+// merge with both, take type from previous if possible
+// After removing, next_region will be the index of the next region.
+static void remove_region(int merge, REGIONS *regions, int *num_regions,
+ int *next_region) {
+ int k = *next_region;
+ assert(k < *num_regions);
+ if (*num_regions == 1) {
+ *num_regions = 0;
+ return;
+ }
+ if (k == 0) {
+ merge = 1;
+ } else if (k == *num_regions - 1) {
+ merge = 0;
+ }
+ int num_merge = (merge == 2) ? 2 : 1;
+ switch (merge) {
+ case 0:
+ regions[k - 1].last = regions[k].last;
+ *next_region = k;
+ break;
+ case 1:
+ regions[k + 1].start = regions[k].start;
+ *next_region = k + 1;
+ break;
+ case 2:
+ regions[k - 1].last = regions[k + 1].last;
+ *next_region = k;
+ break;
+ default: assert(0);
+ }
+ *num_regions -= num_merge;
+ for (k = *next_region - (merge == 1); k < *num_regions; k++) {
+ regions[k] = regions[k + num_merge];
+ }
+}
+
+// Insert a region in the cur_region_idx. The start and last should both be in
+// the current region. After insertion, the cur_region_idx will point to the
+// last region that was split from the original region.
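+ // For example, inserting [a, b] with a new type into an existing region
+ // [s, e] (s < a, b < e) yields three regions [s, a-1], [a, b] and
+ // [b+1, e]; the first and last keep the original type, and cur_region_idx
+ // is left pointing at [b+1, e].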
+static void insert_region(int start, int last, REGION_TYPES type,
+ REGIONS *regions, int *num_regions,
+ int *cur_region_idx) {
+ int k = *cur_region_idx;
+ REGION_TYPES this_region_type = regions[k].type;
+ int this_region_last = regions[k].last;
+ int num_add = (start != regions[k].start) + (last != regions[k].last);
+ // move the following regions further to the back
+ for (int r = *num_regions - 1; r > k; r--) {
+ regions[r + num_add] = regions[r];
+ }
+ *num_regions += num_add;
+ if (start > regions[k].start) {
+ regions[k].last = start - 1;
+ k++;
+ regions[k].start = start;
+ }
+ regions[k].type = type;
+ if (last < this_region_last) {
+ regions[k].last = last;
+ k++;
+ regions[k].start = last + 1;
+ regions[k].last = this_region_last;
+ regions[k].type = this_region_type;
+ } else {
+ regions[k].last = this_region_last;
+ }
+ *cur_region_idx = k;
+}
+
+// Get the average of stats inside a region.
+static void analyze_region(const FIRSTPASS_STATS *stats, int k,
+ REGIONS *regions) {
+ int i;
+ regions[k].avg_cor_coeff = 0;
+ regions[k].avg_sr_fr_ratio = 0;
+ regions[k].avg_intra_err = 0;
+ regions[k].avg_coded_err = 0;
+ // Reset the noise average as well; this function may be called repeatedly
+ // on the same region, so every accumulated field must start from zero.
+ regions[k].avg_noise_var = 0;
+
+ int check_first_sr = (k != 0);
+
+ for (i = regions[k].start; i <= regions[k].last; i++) {
+ if (i > regions[k].start || check_first_sr) {
+ double num_frames =
+ (double)(regions[k].last - regions[k].start + check_first_sr);
+ double max_coded_error =
+ AOMMAX(stats[i].coded_error, stats[i - 1].coded_error);
+ double this_ratio =
+ stats[i].sr_coded_error / AOMMAX(max_coded_error, 0.001);
+ regions[k].avg_sr_fr_ratio += this_ratio / num_frames;
+ }
+
+ regions[k].avg_intra_err +=
+ stats[i].intra_error / (double)(regions[k].last - regions[k].start + 1);
+ regions[k].avg_coded_err +=
+ stats[i].coded_error / (double)(regions[k].last - regions[k].start + 1);
+
+ regions[k].avg_cor_coeff +=
+ AOMMAX(stats[i].cor_coeff, 0.001) /
+ (double)(regions[k].last - regions[k].start + 1);
+ regions[k].avg_noise_var +=
+ AOMMAX(stats[i].noise_var, 0.001) /
+ (double)(regions[k].last - regions[k].start + 1);
+ }
+}
+
+// Calculate the stats of every region.
+static void get_region_stats(const FIRSTPASS_STATS *stats, REGIONS *regions,
+ int num_regions) {
+ for (int k = 0; k < num_regions; k++) {
+ analyze_region(stats, k, regions);
+ }
+}
+
+// Find tentative stable regions
+static int find_stable_regions(const FIRSTPASS_STATS *stats,
+ const double *grad_coded, int this_start,
+ int this_last, REGIONS *regions) {
+ int i, j, k = 0;
+ regions[k].start = this_start;
+ for (i = this_start; i <= this_last; i++) {
+ // Check mean and variance of stats in a window
+ double mean_intra = 0.001, var_intra = 0.001;
+ double mean_coded = 0.001, var_coded = 0.001;
+ int count = 0;
+ for (j = -HALF_WIN; j <= HALF_WIN; j++) {
+ int idx = AOMMIN(AOMMAX(i + j, this_start), this_last);
+ if (stats[idx].is_flash || (idx > 0 && stats[idx - 1].is_flash)) continue;
+ mean_intra += stats[idx].intra_error;
+ var_intra += stats[idx].intra_error * stats[idx].intra_error;
+ mean_coded += stats[idx].coded_error;
+ var_coded += stats[idx].coded_error * stats[idx].coded_error;
+ count++;
+ }
+
+ REGION_TYPES cur_type;
+ if (count > 0) {
+ mean_intra /= (double)count;
+ var_intra /= (double)count;
+ mean_coded /= (double)count;
+ var_coded /= (double)count;
+ int is_intra_stable = (var_intra / (mean_intra * mean_intra) < 1.03);
+ int is_coded_stable = (var_coded / (mean_coded * mean_coded) < 1.04 &&
+ fabs(grad_coded[i]) / mean_coded < 0.05) ||
+ mean_coded / mean_intra < 0.05;
+ int is_coded_small = mean_coded < 0.5 * mean_intra;
+ cur_type = (is_intra_stable && is_coded_stable && is_coded_small)
+ ? STABLE_REGION
+ : HIGH_VAR_REGION;
+ } else {
+ cur_type = HIGH_VAR_REGION;
+ }
+
+ // mark a new region if type changes
+ if (i == regions[k].start) {
+ // first frame in the region
+ regions[k].type = cur_type;
+ } else if (cur_type != regions[k].type) {
+ // Append a new region
+ regions[k].last = i - 1;
+ regions[k + 1].start = i;
+ regions[k + 1].type = cur_type;
+ k++;
+ }
+ }
+ regions[k].last = this_last;
+ return k + 1;
+}
+
+// Clean up regions that should be removed or merged.
+static void cleanup_regions(REGIONS *regions, int *num_regions) {
+ int k = 0;
+ while (k < *num_regions) {
+ if ((k > 0 && regions[k - 1].type == regions[k].type &&
+ regions[k].type != SCENECUT_REGION) ||
+ regions[k].last < regions[k].start) {
+ remove_region(0, regions, num_regions, &k);
+ } else {
+ k++;
+ }
+ }
+}
+
+// Remove regions of the given type that are shorter than |length| frames,
+// merging them with their neighboring regions.
+static void remove_short_regions(REGIONS *regions, int *num_regions,
+ REGION_TYPES type, int length) {
+ int k = 0;
+ while (k < *num_regions && (*num_regions) > 1) {
+ if ((regions[k].last - regions[k].start + 1 < length &&
+ regions[k].type == type)) {
+ // merge current region with the previous and next regions
+ remove_region(2, regions, num_regions, &k);
+ } else {
+ k++;
+ }
+ }
+ cleanup_regions(regions, num_regions);
+}
+
+static void adjust_unstable_region_bounds(const FIRSTPASS_STATS *stats,
+ REGIONS *regions, int *num_regions) {
+ int i, j, k;
+ // Remove regions that are too short. Likely noise.
+ remove_short_regions(regions, num_regions, STABLE_REGION, HALF_WIN);
+ remove_short_regions(regions, num_regions, HIGH_VAR_REGION, HALF_WIN);
+
+ get_region_stats(stats, regions, *num_regions);
+
+ // Adjust region boundaries. The thresholds are empirically obtained, but
+ // overall the performance is not very sensitive to small changes to them.
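+ // Roughly, a frame is pulled into the neighboring stable region when its
+ // intra error is within 10% of the neighborhood average, its coded error
+ // is below 10% of that average, and its correlation coefficient stays
+ // above 0.995 (one violation is tolerated when extending backwards).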
+ for (k = 0; k < *num_regions; k++) {
+ if (regions[k].type == STABLE_REGION) continue;
+ if (k > 0) {
+ // Adjust previous boundary.
+ // First find the average intra/coded error in the previous
+ // neighborhood.
+ double avg_intra_err = 0;
+ const int starti = AOMMAX(regions[k - 1].last - WINDOW_SIZE + 1,
+ regions[k - 1].start + 1);
+ const int lasti = regions[k - 1].last;
+ int counti = 0;
+ for (i = starti; i <= lasti; i++) {
+ avg_intra_err += stats[i].intra_error;
+ counti++;
+ }
+ if (counti > 0) {
+ avg_intra_err = AOMMAX(avg_intra_err / (double)counti, 0.001);
+ int count_coded = 0, count_grad = 0;
+ for (j = lasti + 1; j <= regions[k].last; j++) {
+ const int intra_close =
+ fabs(stats[j].intra_error - avg_intra_err) / avg_intra_err < 0.1;
+ const int coded_small = stats[j].coded_error / avg_intra_err < 0.1;
+ const int coeff_close = stats[j].cor_coeff > 0.995;
+ if (!coeff_close || !coded_small) count_coded--;
+ if (intra_close && count_coded >= 0 && count_grad >= 0) {
+ // this frame probably belongs to the previous stable region
+ regions[k - 1].last = j;
+ regions[k].start = j + 1;
+ } else {
+ break;
+ }
+ }
+ }
+ } // if k > 0
+ if (k < *num_regions - 1) {
+ // Adjust next boundary.
+ // First find the average intra/coded error in the next neighborhood.
+ double avg_intra_err = 0;
+ const int starti = regions[k + 1].start;
+ const int lasti = AOMMIN(regions[k + 1].last - 1,
+ regions[k + 1].start + WINDOW_SIZE - 1);
+ int counti = 0;
+ for (i = starti; i <= lasti; i++) {
+ avg_intra_err += stats[i].intra_error;
+ counti++;
+ }
+ if (counti > 0) {
+ avg_intra_err = AOMMAX(avg_intra_err / (double)counti, 0.001);
+ // At the boundary the coded error is large, but the frame may still
+ // be stable.
+ int count_coded = 1, count_grad = 1;
+ for (j = starti - 1; j >= regions[k].start; j--) {
+ const int intra_close =
+ fabs(stats[j].intra_error - avg_intra_err) / avg_intra_err < 0.1;
+ const int coded_small =
+ stats[j + 1].coded_error / avg_intra_err < 0.1;
+ const int coeff_close = stats[j].cor_coeff > 0.995;
+ if (!coeff_close || !coded_small) count_coded--;
+ if (intra_close && count_coded >= 0 && count_grad >= 0) {
+ // this frame probably belongs to the next stable region
+ regions[k + 1].start = j;
+ regions[k].last = j - 1;
+ } else {
+ break;
+ }
+ }
+ }
+ } // if k < *num_regions - 1
+ } // end of loop over all regions
+
+ cleanup_regions(regions, num_regions);
+ remove_short_regions(regions, num_regions, HIGH_VAR_REGION, HALF_WIN);
+ get_region_stats(stats, regions, *num_regions);
+
+ // If a stable region has a higher error than its neighboring high variance
+ // regions, or a lower average correlation, it should be merged with them.
+ k = 0;
+ while (k < *num_regions && (*num_regions) > 1) {
+ if (regions[k].type == STABLE_REGION &&
+ (regions[k].last - regions[k].start + 1) < 2 * WINDOW_SIZE &&
+ ((k > 0 && // previous regions
+ (regions[k].avg_coded_err > regions[k - 1].avg_coded_err * 1.01 ||
+ regions[k].avg_cor_coeff < regions[k - 1].avg_cor_coeff * 0.999)) &&
+ (k < *num_regions - 1 && // next region
+ (regions[k].avg_coded_err > regions[k + 1].avg_coded_err * 1.01 ||
+ regions[k].avg_cor_coeff < regions[k + 1].avg_cor_coeff * 0.999)))) {
+ // merge current region with the previous and next regions
+ remove_region(2, regions, num_regions, &k);
+ analyze_region(stats, k - 1, regions);
+ } else if (regions[k].type == HIGH_VAR_REGION &&
+ (regions[k].last - regions[k].start + 1) < 2 * WINDOW_SIZE &&
+ ((k > 0 && // previous regions
+ (regions[k].avg_coded_err <
+ regions[k - 1].avg_coded_err * 0.99 ||
+ regions[k].avg_cor_coeff >
+ regions[k - 1].avg_cor_coeff * 1.001)) &&
+ (k < *num_regions - 1 && // next region
+ (regions[k].avg_coded_err <
+ regions[k + 1].avg_coded_err * 0.99 ||
+ regions[k].avg_cor_coeff >
+ regions[k + 1].avg_cor_coeff * 1.001)))) {
+ // merge current region with the previous and next regions
+ remove_region(2, regions, num_regions, &k);
+ analyze_region(stats, k - 1, regions);
+ } else {
+ k++;
+ }
+ }
+
+ remove_short_regions(regions, num_regions, STABLE_REGION, WINDOW_SIZE);
+ remove_short_regions(regions, num_regions, HIGH_VAR_REGION, HALF_WIN);
+}
+
+// Identify blending regions.
+static void find_blending_regions(const FIRSTPASS_STATS *stats,
+ REGIONS *regions, int *num_regions) {
+ int i, k = 0;
+ // Blending regions will have large content change, therefore will have a
+ // large consistent change in intra error.
+ int count_stable = 0;
+ while (k < *num_regions) {
+ if (regions[k].type == STABLE_REGION) {
+ k++;
+ count_stable++;
+ continue;
+ }
+ int dir = 0;
+ int start = 0, last;
+ for (i = regions[k].start; i <= regions[k].last; i++) {
+ // First mark the regions that have a consistent, large change of intra
+ // error.
+ if (k == 0 && i == regions[k].start) continue;
+ if (stats[i].is_flash || (i > 0 && stats[i - 1].is_flash)) continue;
+ double grad = stats[i].intra_error - stats[i - 1].intra_error;
+ int large_change = fabs(grad) / AOMMAX(stats[i].intra_error, 0.01) > 0.05;
+ int this_dir = 0;
+ if (large_change) {
+ this_dir = (grad > 0) ? 1 : -1;
+ }
+ // the current trend continues
+ if (dir == this_dir) continue;
+ if (dir != 0) {
+ // Mark the end of a new large change group and add it
+ last = i - 1;
+ insert_region(start, last, BLENDING_REGION, regions, num_regions, &k);
+ }
+ dir = this_dir;
+ if (k == 0 && i == regions[k].start + 1) {
+ start = i - 1;
+ } else {
+ start = i;
+ }
+ }
+ if (dir != 0) {
+ last = regions[k].last;
+ insert_region(start, last, BLENDING_REGION, regions, num_regions, &k);
+ }
+ k++;
+ }
+
+ // If the blending region has very low correlation, mark it as high variance
+ // since we probably cannot benefit from it anyway.
+ get_region_stats(stats, regions, *num_regions);
+ for (k = 0; k < *num_regions; k++) {
+ if (regions[k].type != BLENDING_REGION) continue;
+ if (regions[k].last == regions[k].start || regions[k].avg_cor_coeff < 0.6 ||
+ count_stable == 0)
+ regions[k].type = HIGH_VAR_REGION;
+ }
+ get_region_stats(stats, regions, *num_regions);
+
+ // It is possible for blending to result in a "dip" in intra error (first
+ // decrease then increase). Therefore we need to find the dip and combine the
+ // two regions.
+ k = 1;
+ while (k < *num_regions) {
+ if (k < *num_regions - 1 && regions[k].type == HIGH_VAR_REGION) {
+ // Check if this short high variance region is actually in the middle of
+ // a blending region.
+ if (regions[k - 1].type == BLENDING_REGION &&
+ regions[k + 1].type == BLENDING_REGION &&
+ regions[k].last - regions[k].start < 3) {
+ int prev_dir = (stats[regions[k - 1].last].intra_error -
+ stats[regions[k - 1].last - 1].intra_error) > 0
+ ? 1
+ : -1;
+ int next_dir = (stats[regions[k + 1].last].intra_error -
+ stats[regions[k + 1].last - 1].intra_error) > 0
+ ? 1
+ : -1;
+ if (prev_dir < 0 && next_dir > 0) {
+ // This is possibly a mid region of blending. Check the ratios.
+ double ratio_thres = AOMMIN(regions[k - 1].avg_sr_fr_ratio,
+ regions[k + 1].avg_sr_fr_ratio) *
+ 0.95;
+ if (regions[k].avg_sr_fr_ratio > ratio_thres) {
+ regions[k].type = BLENDING_REGION;
+ remove_region(2, regions, num_regions, &k);
+ analyze_region(stats, k - 1, regions);
+ continue;
+ }
+ }
+ }
+ }
+ // Check if we have a pair of consecutive blending regions.
+ if (regions[k - 1].type == BLENDING_REGION &&
+ regions[k].type == BLENDING_REGION) {
+ int prev_dir = (stats[regions[k - 1].last].intra_error -
+ stats[regions[k - 1].last - 1].intra_error) > 0
+ ? 1
+ : -1;
+ int next_dir = (stats[regions[k].last].intra_error -
+ stats[regions[k].last - 1].intra_error) > 0
+ ? 1
+ : -1;
+
+ // If both are too short, there is no need to check.
+ int total_length = regions[k].last - regions[k - 1].start + 1;
+ if (total_length < 4) {
+ regions[k - 1].type = HIGH_VAR_REGION;
+ k++;
+ continue;
+ }
+
+ int to_merge = 0;
+ if (prev_dir < 0 && next_dir > 0) {
+ // In this case we check the last frame in the previous region.
+ double prev_length =
+ (double)(regions[k - 1].last - regions[k - 1].start + 1);
+ double last_ratio, ratio_thres;
+ if (prev_length < 2.01) {
+ // if the previous region is very short
+ double max_coded_error =
+ AOMMAX(stats[regions[k - 1].last].coded_error,
+ stats[regions[k - 1].last - 1].coded_error);
+ last_ratio = stats[regions[k - 1].last].sr_coded_error /
+ AOMMAX(max_coded_error, 0.001);
+ ratio_thres = regions[k].avg_sr_fr_ratio * 0.95;
+ } else {
+ double max_coded_error =
+ AOMMAX(stats[regions[k - 1].last].coded_error,
+ stats[regions[k - 1].last - 1].coded_error);
+ last_ratio = stats[regions[k - 1].last].sr_coded_error /
+ AOMMAX(max_coded_error, 0.001);
+ double prev_ratio =
+ (regions[k - 1].avg_sr_fr_ratio * prev_length - last_ratio) /
+ (prev_length - 1.0);
+ ratio_thres = AOMMIN(prev_ratio, regions[k].avg_sr_fr_ratio) * 0.95;
+ }
+ if (last_ratio > ratio_thres) {
+ to_merge = 1;
+ }
+ }
+
+ if (to_merge) {
+ remove_region(0, regions, num_regions, &k);
+ analyze_region(stats, k - 1, regions);
+ continue;
+ } else {
+ // These are possibly two separate blending regions. Mark the boundary
+ // frame as HIGH_VAR_REGION to separate the two.
+ int prev_k = k - 1;
+ insert_region(regions[prev_k].last, regions[prev_k].last,
+ HIGH_VAR_REGION, regions, num_regions, &prev_k);
+ analyze_region(stats, prev_k, regions);
+ k = prev_k + 1;
+ analyze_region(stats, k, regions);
+ }
+ }
+ k++;
+ }
+ cleanup_regions(regions, num_regions);
+}
+
+// Clean up the blending decisions. Remove blending regions that are too
+// short. Also, if a very short high var region sits between a blending and a
+// stable region, merge it with one of them.
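+ // "Too short" here means fewer than 5 frames; the merge direction is
+ // chosen by whichever neighbor has the closer avg_cor_coeff.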
+static void cleanup_blendings(REGIONS *regions, int *num_regions) {
+ int k = 0;
+ while (k < *num_regions && *num_regions > 1) {
+ int is_short_blending = regions[k].type == BLENDING_REGION &&
+ regions[k].last - regions[k].start + 1 < 5;
+ int is_short_hv = regions[k].type == HIGH_VAR_REGION &&
+ regions[k].last - regions[k].start + 1 < 5;
+ int has_stable_neighbor =
+ ((k > 0 && regions[k - 1].type == STABLE_REGION) ||
+ (k < *num_regions - 1 && regions[k + 1].type == STABLE_REGION));
+ int has_blend_neighbor =
+ ((k > 0 && regions[k - 1].type == BLENDING_REGION) ||
+ (k < *num_regions - 1 && regions[k + 1].type == BLENDING_REGION));
+ int total_neighbors = (k > 0) + (k < *num_regions - 1);
+
+ if (is_short_blending ||
+ (is_short_hv &&
+ has_stable_neighbor + has_blend_neighbor >= total_neighbors)) {
+ // Remove this region. Try to determine whether to combine it with the
+ // previous or next region.
+ int merge;
+ double prev_diff =
+ (k > 0)
+ ? fabs(regions[k].avg_cor_coeff - regions[k - 1].avg_cor_coeff)
+ : 1;
+ double next_diff =
+ (k < *num_regions - 1)
+ ? fabs(regions[k].avg_cor_coeff - regions[k + 1].avg_cor_coeff)
+ : 1;
+ // merge == 0 means to merge with previous, 1 means to merge with next
+ merge = prev_diff > next_diff;
+ remove_region(merge, regions, num_regions, &k);
+ } else {
+ k++;
+ }
+ }
+ cleanup_regions(regions, num_regions);
+}
+
+static void free_firstpass_stats_buffers(REGIONS *temp_regions,
+ double *filt_intra_err,
+ double *filt_coded_err,
+ double *grad_coded) {
+ aom_free(temp_regions);
+ aom_free(filt_intra_err);
+ aom_free(filt_coded_err);
+ aom_free(grad_coded);
+}
+
+// Identify stable and unstable regions from first pass stats.
+// stats_start points to the first frame to analyze.
+// |offset| is the offset from the current frame to the frame stats_start is
+// pointing to.
+// Returns 0 on success, -1 on memory allocation failure.
+static int identify_regions(const FIRSTPASS_STATS *const stats_start,
+ int total_frames, int offset, REGIONS *regions,
+ int *total_regions) {
+ int k;
+ if (total_frames <= 1) return 0;
+
+ // store the initial decisions
+ REGIONS *temp_regions =
+ (REGIONS *)aom_malloc(total_frames * sizeof(temp_regions[0]));
+ // buffers for filtered stats
+ double *filt_intra_err =
+ (double *)aom_calloc(total_frames, sizeof(*filt_intra_err));
+ double *filt_coded_err =
+ (double *)aom_calloc(total_frames, sizeof(*filt_coded_err));
+ double *grad_coded = (double *)aom_calloc(total_frames, sizeof(*grad_coded));
+ if (!(temp_regions && filt_intra_err && filt_coded_err && grad_coded)) {
+ free_firstpass_stats_buffers(temp_regions, filt_intra_err, filt_coded_err,
+ grad_coded);
+ return -1;
+ }
+ av1_zero_array(temp_regions, total_frames);
+
+ int cur_region = 0, this_start = 0, this_last;
+
+ int next_scenecut = -1;
+ do {
+ // first get the obvious scenecuts
+ next_scenecut =
+ find_next_scenecut(stats_start, this_start, total_frames - 1);
+ this_last = (next_scenecut >= 0) ?
(next_scenecut - 1) : total_frames - 1; + + // low-pass filter the needed stats + smooth_filter_stats(stats_start, this_start, this_last, filt_intra_err, + filt_coded_err); + get_gradient(filt_coded_err, this_start, this_last, grad_coded); + + // find tentative stable regions and unstable regions + int num_regions = find_stable_regions(stats_start, grad_coded, this_start, + this_last, temp_regions); + + adjust_unstable_region_bounds(stats_start, temp_regions, &num_regions); + + get_region_stats(stats_start, temp_regions, num_regions); + + // Try to identify blending regions in the unstable regions + find_blending_regions(stats_start, temp_regions, &num_regions); + cleanup_blendings(temp_regions, &num_regions); + + // The flash points should all be considered high variance points + k = 0; + while (k < num_regions) { + if (temp_regions[k].type != STABLE_REGION) { + k++; + continue; + } + int start = temp_regions[k].start; + int last = temp_regions[k].last; + for (int i = start; i <= last; i++) { + if (stats_start[i].is_flash) { + insert_region(i, i, HIGH_VAR_REGION, temp_regions, &num_regions, &k); + } + } + k++; + } + cleanup_regions(temp_regions, &num_regions); + + // copy the regions in the scenecut group + for (k = 0; k < num_regions; k++) { + if (temp_regions[k].last < temp_regions[k].start && + k == num_regions - 1) { + num_regions--; + break; + } + regions[k + cur_region] = temp_regions[k]; + } + cur_region += num_regions; + + // add the scenecut region + if (next_scenecut > -1) { + // add the scenecut region, and find the next scenecut + regions[cur_region].type = SCENECUT_REGION; + regions[cur_region].start = next_scenecut; + regions[cur_region].last = next_scenecut; + cur_region++; + this_start = next_scenecut + 1; + } + } while (next_scenecut >= 0); + + *total_regions = cur_region; + get_region_stats(stats_start, regions, *total_regions); + + for (k = 0; k < *total_regions; k++) { + // If scenecuts are very minor, mark them as high variance. + if (regions[k].type != SCENECUT_REGION || + regions[k].avg_cor_coeff * + (1 - stats_start[regions[k].start].noise_var / + regions[k].avg_intra_err) < + 0.8) { + continue; + } + regions[k].type = HIGH_VAR_REGION; + } + cleanup_regions(regions, total_regions); + get_region_stats(stats_start, regions, *total_regions); + + for (k = 0; k < *total_regions; k++) { + regions[k].start += offset; + regions[k].last += offset; + } + + free_firstpass_stats_buffers(temp_regions, filt_intra_err, filt_coded_err, + grad_coded); + return 0; +} + +static int find_regions_index(const REGIONS *regions, int num_regions, + int frame_idx) { + for (int k = 0; k < num_regions; k++) { + if (regions[k].start <= frame_idx && regions[k].last >= frame_idx) { + return k; + } + } + return -1; +} + +/*!\brief Determine the length of future GF groups. + * + * \ingroup gf_group_algo + * This function decides the gf group length of future frames in batch + * + * \param[in] cpi Top-level encoder structure + * \param[in] max_gop_length Maximum length of the GF group + * \param[in] max_intervals Maximum number of intervals to decide + * + * \remark Nothing is returned. Instead, cpi->ppi->rc.gf_intervals is + * changed to store the decided GF group lengths. 
+ */ +static void calculate_gf_length(AV1_COMP *cpi, int max_gop_length, + int max_intervals) { + RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + TWO_PASS *const twopass = &cpi->ppi->twopass; + FIRSTPASS_STATS next_frame; + const FIRSTPASS_STATS *const start_pos = cpi->twopass_frame.stats_in; + const FIRSTPASS_STATS *const stats = start_pos - (rc->frames_since_key == 0); + + const int f_w = cpi->common.width; + const int f_h = cpi->common.height; + int i; + + int flash_detected; + + av1_zero(next_frame); + + if (has_no_stats_stage(cpi)) { + for (i = 0; i < MAX_NUM_GF_INTERVALS; i++) { + p_rc->gf_intervals[i] = AOMMIN(rc->max_gf_interval, max_gop_length); + } + p_rc->cur_gf_index = 0; + rc->intervals_till_gf_calculate_due = MAX_NUM_GF_INTERVALS; + return; + } + + // TODO(urvang): Try logic to vary min and max interval based on q. + const int active_min_gf_interval = rc->min_gf_interval; + const int active_max_gf_interval = + AOMMIN(rc->max_gf_interval, max_gop_length); + const int min_shrink_int = AOMMAX(MIN_SHRINK_LEN, active_min_gf_interval); + + i = (rc->frames_since_key == 0); + max_intervals = cpi->ppi->lap_enabled ? 1 : max_intervals; + int count_cuts = 1; + // If cpi->gf_state.arf_gf_boost_lst is 0, we are starting with a KF or GF. + int cur_start = -1 + !cpi->ppi->gf_state.arf_gf_boost_lst, cur_last; + int cut_pos[MAX_NUM_GF_INTERVALS + 1] = { -1 }; + int cut_here; + GF_GROUP_STATS gf_stats; + init_gf_stats(&gf_stats); + while (count_cuts < max_intervals + 1) { + // reaches next key frame, break here + if (i >= rc->frames_to_key) { + cut_here = 2; + } else if (i - cur_start >= rc->static_scene_max_gf_interval) { + // reached maximum len, but nothing special yet (almost static) + // let's look at the next interval + cut_here = 1; + } else if (EOF == input_stats(twopass, &cpi->twopass_frame, &next_frame)) { + // reaches last frame, break + cut_here = 2; + } else { + // Test for the case where there is a brief flash but the prediction + // quality back to an earlier frame is then restored. + flash_detected = detect_flash(twopass, &cpi->twopass_frame, 0); + // TODO(bohanli): remove redundant accumulations here, or unify + // this and the ones in define_gf_group + accumulate_next_frame_stats(&next_frame, flash_detected, + rc->frames_since_key, i, &gf_stats, f_w, f_h); + + cut_here = detect_gf_cut(cpi, i, cur_start, flash_detected, + active_max_gf_interval, active_min_gf_interval, + &gf_stats); + } + if (cut_here) { + cur_last = i - 1; // the current last frame in the gf group + int ori_last = cur_last; + // The region frame idx does not start from the same frame as cur_start + // and cur_last. Need to offset them. + int offset = rc->frames_since_key - p_rc->regions_offset; + REGIONS *regions = p_rc->regions; + int num_regions = p_rc->num_regions; + + int scenecut_idx = -1; + // only try shrinking if interval smaller than active_max_gf_interval + if (cur_last - cur_start <= active_max_gf_interval && + cur_last > cur_start) { + // find the region indices of where the first and last frame belong. 
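+ // (cur_start and cur_last are relative to the current GOP, while region
+ // frame indices are relative to where the regions were computed, hence
+ // the "+ offset" below.)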
+ int k_start =
+ find_regions_index(regions, num_regions, cur_start + offset);
+ int k_last =
+ find_regions_index(regions, num_regions, cur_last + offset);
+ if (cur_start + offset == 0) k_start = 0;
+
+ // See if we have a scenecut in between
+ for (int r = k_start + 1; r <= k_last; r++) {
+ if (regions[r].type == SCENECUT_REGION &&
+ regions[r].last - offset - cur_start > active_min_gf_interval) {
+ scenecut_idx = r;
+ break;
+ }
+ }
+
+ // if the found scenecut is very close to the end, ignore it.
+ if (scenecut_idx >= 0 &&
+ regions[num_regions - 1].last - regions[scenecut_idx].last < 4) {
+ scenecut_idx = -1;
+ }
+
+ if (scenecut_idx != -1) {
+ // If we have a scenecut, then stop at it.
+ // TODO(bohanli): add logic here to stop before the scenecut and for
+ // the next gop start from the scenecut with GF
+ int is_minor_sc =
+ (regions[scenecut_idx].avg_cor_coeff *
+ (1 - stats[regions[scenecut_idx].start - offset].noise_var /
+ regions[scenecut_idx].avg_intra_err) >
+ 0.6);
+ cur_last = regions[scenecut_idx].last - offset - !is_minor_sc;
+ } else {
+ int is_last_analysed = (k_last == num_regions - 1) &&
+ (cur_last + offset == regions[k_last].last);
+ int not_enough_regions =
+ k_last - k_start <=
+ 1 + (regions[k_start].type == SCENECUT_REGION);
+ // if we are very close to the end, then do not shrink since it may
+ // introduce intervals that are too short
+ if (!(is_last_analysed && not_enough_regions)) {
+ const double arf_length_factor = 0.1;
+ double best_score = 0;
+ int best_j = -1;
+ const int first_frame = regions[0].start - offset;
+ const int last_frame = regions[num_regions - 1].last - offset;
+ // score of how much the arf helps the whole GOP
+ double base_score = 0.0;
+ // Accumulate base_score over the frames that must remain in the
+ // GOP (those before the minimum shrink length).
+ for (int j = cur_start + 1; j < cur_start + min_shrink_int; j++) {
+ if (stats + j >= twopass->stats_buf_ctx->stats_in_end) break;
+ base_score = (base_score + 1.0) * stats[j].cor_coeff;
+ }
+ int met_blending = 0; // Whether we have met blending areas before
+ int last_blending = 0; // Whether the previous frame is blending
+ for (int j = cur_start + min_shrink_int; j <= cur_last; j++) {
+ if (stats + j >= twopass->stats_buf_ctx->stats_in_end) break;
+ base_score = (base_score + 1.0) * stats[j].cor_coeff;
+ int this_reg =
+ find_regions_index(regions, num_regions, j + offset);
+ if (this_reg < 0) continue;
+ // A GOP should include at most 1 blending region.
+ if (regions[this_reg].type == BLENDING_REGION) {
+ last_blending = 1;
+ if (met_blending) {
+ break;
+ } else {
+ base_score = 0;
+ continue;
+ }
+ } else {
+ if (last_blending) met_blending = 1;
+ last_blending = 0;
+ }
+
+ // Add the factor of how good the neighborhood is for this
+ // candidate arf.
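+ // Roughly, for a candidate arf at j:
+ // this_score = arf_length_factor * base_score
+ // + sum_n [ (product of cor_coeff between j and n) *
+ // sqrt(max(0.5, 1 - noise_var(n) / intra_error(n))) ]
+ // taken over up to 3 frames after j and enough frames at or before j to
+ // bring the total examined to about 6.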
+ double this_score = arf_length_factor * base_score;
+ double temp_accu_coeff = 1.0;
+ // following frames
+ int count_f = 0;
+ for (int n = j + 1; n <= j + 3 && n <= last_frame; n++) {
+ if (stats + n >= twopass->stats_buf_ctx->stats_in_end) break;
+ temp_accu_coeff *= stats[n].cor_coeff;
+ this_score +=
+ temp_accu_coeff *
+ sqrt(AOMMAX(0.5,
+ 1 - stats[n].noise_var /
+ AOMMAX(stats[n].intra_error, 0.001)));
+ count_f++;
+ }
+ // preceding frames
+ temp_accu_coeff = 1.0;
+ for (int n = j; n > j - 3 * 2 + count_f && n > first_frame; n--) {
+ if (stats + n < twopass->stats_buf_ctx->stats_in_start) break;
+ temp_accu_coeff *= stats[n].cor_coeff;
+ this_score +=
+ temp_accu_coeff *
+ sqrt(AOMMAX(0.5,
+ 1 - stats[n].noise_var /
+ AOMMAX(stats[n].intra_error, 0.001)));
+ }
+
+ if (this_score > best_score) {
+ best_score = this_score;
+ best_j = j;
+ }
+ }
+
+ // For blending areas, move one more frame in case we missed the
+ // first blending frame.
+ int best_reg =
+ find_regions_index(regions, num_regions, best_j + offset);
+ if (best_reg < num_regions - 1 && best_reg > 0) {
+ if (regions[best_reg - 1].type == BLENDING_REGION &&
+ regions[best_reg + 1].type == BLENDING_REGION) {
+ if (best_j + offset == regions[best_reg].start &&
+ best_j + offset < regions[best_reg].last) {
+ best_j += 1;
+ } else if (best_j + offset == regions[best_reg].last &&
+ best_j + offset > regions[best_reg].start) {
+ best_j -= 1;
+ }
+ }
+ }
+
+ if (cur_last - best_j < 2) best_j = cur_last;
+ if (best_j > 0 && best_score > 0.1) cur_last = best_j;
+ // If we cannot find anything, just cut at the original place.
+ }
+ }
+ }
+ cut_pos[count_cuts] = cur_last;
+ count_cuts++;
+
+ // reset pointers to the shrunken location
+ cpi->twopass_frame.stats_in = start_pos + cur_last;
+ cur_start = cur_last;
+ int cur_region_idx =
+ find_regions_index(regions, num_regions, cur_start + 1 + offset);
+ if (cur_region_idx >= 0)
+ if (regions[cur_region_idx].type == SCENECUT_REGION) cur_start++;
+
+ i = cur_last;
+
+ if (cut_here > 1 && cur_last == ori_last) break;
+
+ // reset accumulators
+ init_gf_stats(&gf_stats);
+ }
+ ++i;
+ }
+
+ // save intervals
+ rc->intervals_till_gf_calculate_due = count_cuts - 1;
+ for (int n = 1; n < count_cuts; n++) {
+ p_rc->gf_intervals[n - 1] = cut_pos[n] - cut_pos[n - 1];
+ }
+ p_rc->cur_gf_index = 0;
+ cpi->twopass_frame.stats_in = start_pos;
+}
+
+static void correct_frames_to_key(AV1_COMP *cpi) {
+ int lookahead_size =
+ (int)av1_lookahead_depth(cpi->ppi->lookahead, cpi->compressor_stage);
+ if (lookahead_size <
+ av1_lookahead_pop_sz(cpi->ppi->lookahead, cpi->compressor_stage)) {
+ assert(
+ IMPLIES(cpi->oxcf.pass != AOM_RC_ONE_PASS && cpi->ppi->frames_left > 0,
+ lookahead_size == cpi->ppi->frames_left));
+ cpi->rc.frames_to_key = AOMMIN(cpi->rc.frames_to_key, lookahead_size);
+ } else if (cpi->ppi->frames_left > 0) {
+ // Correct frames to key based on limit
+ cpi->rc.frames_to_key =
+ AOMMIN(cpi->rc.frames_to_key, cpi->ppi->frames_left);
+ }
+}
+
+/*!\brief Define a GF group in one pass mode when no look ahead stats are
+ * available.
+ *
+ * \ingroup gf_group_algo
+ * This function defines the structure of a GF group, along with various
+ * parameters regarding bit-allocation and quality setup in the special
+ * case of one pass encoding where no lookahead stats are available.
+ *
+ * \param[in] cpi Top-level encoder structure
+ *
+ * \remark Nothing is returned. Instead, cpi->ppi->gf_group is changed.
+ */ +static void define_gf_group_pass0(AV1_COMP *cpi) { + RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + const GFConfig *const gf_cfg = &oxcf->gf_cfg; + int target; + + if (oxcf->q_cfg.aq_mode == CYCLIC_REFRESH_AQ) { + av1_cyclic_refresh_set_golden_update(cpi); + } else { + p_rc->baseline_gf_interval = p_rc->gf_intervals[p_rc->cur_gf_index]; + rc->intervals_till_gf_calculate_due--; + p_rc->cur_gf_index++; + } + + // correct frames_to_key when lookahead queue is flushing + correct_frames_to_key(cpi); + + if (p_rc->baseline_gf_interval > rc->frames_to_key) + p_rc->baseline_gf_interval = rc->frames_to_key; + + p_rc->gfu_boost = DEFAULT_GF_BOOST; + p_rc->constrained_gf_group = + (p_rc->baseline_gf_interval >= rc->frames_to_key) ? 1 : 0; + + gf_group->max_layer_depth_allowed = oxcf->gf_cfg.gf_max_pyr_height; + + // Rare case when the look-ahead is less than the target GOP length, can't + // generate ARF frame. + if (p_rc->baseline_gf_interval > gf_cfg->lag_in_frames || + !is_altref_enabled(gf_cfg->lag_in_frames, gf_cfg->enable_auto_arf) || + p_rc->baseline_gf_interval < rc->min_gf_interval) + gf_group->max_layer_depth_allowed = 0; + + // Set up the structure of this Group-Of-Pictures (same as GF_GROUP) + av1_gop_setup_structure(cpi); + + // Allocate bits to each of the frames in the GF group. + // TODO(sarahparker) Extend this to work with pyramid structure. + for (int cur_index = 0; cur_index < gf_group->size; ++cur_index) { + const FRAME_UPDATE_TYPE cur_update_type = gf_group->update_type[cur_index]; + if (oxcf->rc_cfg.mode == AOM_CBR) { + if (cur_update_type == KF_UPDATE) { + target = av1_calc_iframe_target_size_one_pass_cbr(cpi); + } else { + target = av1_calc_pframe_target_size_one_pass_cbr(cpi, cur_update_type); + } + } else { + if (cur_update_type == KF_UPDATE) { + target = av1_calc_iframe_target_size_one_pass_vbr(cpi); + } else { + target = av1_calc_pframe_target_size_one_pass_vbr(cpi, cur_update_type); + } + } + gf_group->bit_allocation[cur_index] = target; + } +} + +static INLINE void set_baseline_gf_interval(PRIMARY_RATE_CONTROL *p_rc, + int arf_position) { + p_rc->baseline_gf_interval = arf_position; +} + +// initialize GF_GROUP_STATS +static void init_gf_stats(GF_GROUP_STATS *gf_stats) { + gf_stats->gf_group_err = 0.0; + gf_stats->gf_group_raw_error = 0.0; + gf_stats->gf_group_skip_pct = 0.0; + gf_stats->gf_group_inactive_zone_rows = 0.0; + + gf_stats->mv_ratio_accumulator = 0.0; + gf_stats->decay_accumulator = 1.0; + gf_stats->zero_motion_accumulator = 1.0; + gf_stats->loop_decay_rate = 1.0; + gf_stats->last_loop_decay_rate = 1.0; + gf_stats->this_frame_mv_in_out = 0.0; + gf_stats->mv_in_out_accumulator = 0.0; + gf_stats->abs_mv_in_out_accumulator = 0.0; + + gf_stats->avg_sr_coded_error = 0.0; + gf_stats->avg_pcnt_second_ref = 0.0; + gf_stats->avg_new_mv_count = 0.0; + gf_stats->avg_wavelet_energy = 0.0; + gf_stats->avg_raw_err_stdev = 0.0; + gf_stats->non_zero_stdev_count = 0; +} + +static void accumulate_gop_stats(AV1_COMP *cpi, int is_intra_only, int f_w, + int f_h, FIRSTPASS_STATS *next_frame, + const FIRSTPASS_STATS *start_pos, + GF_GROUP_STATS *gf_stats, int *idx) { + int i, flash_detected; + TWO_PASS *const twopass = &cpi->ppi->twopass; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + RATE_CONTROL *const rc = &cpi->rc; + FRAME_INFO *frame_info = &cpi->frame_info; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + + 
init_gf_stats(gf_stats); + av1_zero(*next_frame); + + // If this is a key frame or the overlay from a previous arf then + // the error score / cost of this frame has already been accounted for. + i = is_intra_only; + // get the determined gf group length from p_rc->gf_intervals + while (i < p_rc->gf_intervals[p_rc->cur_gf_index]) { + // read in the next frame + if (EOF == input_stats(twopass, &cpi->twopass_frame, next_frame)) break; + // Accumulate error score of frames in this gf group. + double mod_frame_err = + calculate_modified_err(frame_info, twopass, oxcf, next_frame); + // accumulate stats for this frame + accumulate_this_frame_stats(next_frame, mod_frame_err, gf_stats); + ++i; + } + + reset_fpf_position(&cpi->twopass_frame, start_pos); + + i = is_intra_only; + input_stats(twopass, &cpi->twopass_frame, next_frame); + while (i < p_rc->gf_intervals[p_rc->cur_gf_index]) { + // read in the next frame + if (EOF == input_stats(twopass, &cpi->twopass_frame, next_frame)) break; + + // Test for the case where there is a brief flash but the prediction + // quality back to an earlier frame is then restored. + flash_detected = detect_flash(twopass, &cpi->twopass_frame, 0); + + // accumulate stats for next frame + accumulate_next_frame_stats(next_frame, flash_detected, + rc->frames_since_key, i, gf_stats, f_w, f_h); + + ++i; + } + + i = p_rc->gf_intervals[p_rc->cur_gf_index]; + average_gf_stats(i, gf_stats); + + *idx = i; +} + +static void update_gop_length(RATE_CONTROL *rc, PRIMARY_RATE_CONTROL *p_rc, + int idx, int is_final_pass) { + if (is_final_pass) { + rc->intervals_till_gf_calculate_due--; + p_rc->cur_gf_index++; + } + + // Was the group length constrained by the requirement for a new KF? + p_rc->constrained_gf_group = (idx >= rc->frames_to_key) ? 1 : 0; + + set_baseline_gf_interval(p_rc, idx); + rc->frames_till_gf_update_due = p_rc->baseline_gf_interval; +} + +#define MAX_GF_BOOST 5400 +#define REDUCE_GF_LENGTH_THRESH 4 +#define REDUCE_GF_LENGTH_TO_KEY_THRESH 9 +#define REDUCE_GF_LENGTH_BY 1 +static void set_gop_bits_boost(AV1_COMP *cpi, int i, int is_intra_only, + int is_final_pass, int use_alt_ref, + int alt_offset, const FIRSTPASS_STATS *start_pos, + GF_GROUP_STATS *gf_stats) { + // Should we use the alternate reference frame. + AV1_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + TWO_PASS *const twopass = &cpi->ppi->twopass; + GF_GROUP *gf_group = &cpi->ppi->gf_group; + FRAME_INFO *frame_info = &cpi->frame_info; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + const RateControlCfg *const rc_cfg = &oxcf->rc_cfg; + + int ext_len = i - is_intra_only; + if (use_alt_ref) { + const int forward_frames = (rc->frames_to_key - i >= ext_len) + ? ext_len + : AOMMAX(0, rc->frames_to_key - i); + + // Calculate the boost for alt ref. 
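+    // NOTE (editorial, illustrative): forward_frames above bounds the
+    // look-forward window of the boost calculation. Worked example: with
+    // frames_to_key = 40, i = 16 and is_intra_only = 0, ext_len = 16 and
+    // frames_to_key - i = 24 >= ext_len, so forward_frames = 16; with
+    // frames_to_key = 20 instead, the window is clamped to
+    // AOMMAX(0, 20 - 16) = 4.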
+ p_rc->gfu_boost = av1_calc_arf_boost( + twopass, &cpi->twopass_frame, p_rc, frame_info, alt_offset, + forward_frames, ext_len, &p_rc->num_stats_used_for_gfu_boost, + &p_rc->num_stats_required_for_gfu_boost, cpi->ppi->lap_enabled); + } else { + reset_fpf_position(&cpi->twopass_frame, start_pos); + p_rc->gfu_boost = AOMMIN( + MAX_GF_BOOST, + av1_calc_arf_boost( + twopass, &cpi->twopass_frame, p_rc, frame_info, alt_offset, ext_len, + 0, &p_rc->num_stats_used_for_gfu_boost, + &p_rc->num_stats_required_for_gfu_boost, cpi->ppi->lap_enabled)); + } + +#define LAST_ALR_BOOST_FACTOR 0.2f + p_rc->arf_boost_factor = 1.0; + if (use_alt_ref && !is_lossless_requested(rc_cfg)) { + // Reduce the boost of altref in the last gf group + if (rc->frames_to_key - ext_len == REDUCE_GF_LENGTH_BY || + rc->frames_to_key - ext_len == 0) { + p_rc->arf_boost_factor = LAST_ALR_BOOST_FACTOR; + } + } + + // Reset the file position. + reset_fpf_position(&cpi->twopass_frame, start_pos); + if (cpi->ppi->lap_enabled) { + // Since we don't have enough stats to know the actual error of the + // gf group, we assume error of each frame to be equal to 1 and set + // the error of the group as baseline_gf_interval. + gf_stats->gf_group_err = p_rc->baseline_gf_interval; + } + // Calculate the bits to be allocated to the gf/arf group as a whole + p_rc->gf_group_bits = + calculate_total_gf_group_bits(cpi, gf_stats->gf_group_err); + +#if GROUP_ADAPTIVE_MAXQ + // Calculate an estimate of the maxq needed for the group. + // We are more aggressive about correcting for sections + // where there could be significant overshoot than for easier + // sections where we do not wish to risk creating an overshoot + // of the allocated bit budget. + if ((rc_cfg->mode != AOM_Q) && (p_rc->baseline_gf_interval > 1) && + is_final_pass) { + const int vbr_group_bits_per_frame = + (int)(p_rc->gf_group_bits / p_rc->baseline_gf_interval); + const double group_av_err = + gf_stats->gf_group_raw_error / p_rc->baseline_gf_interval; + const double group_av_skip_pct = + gf_stats->gf_group_skip_pct / p_rc->baseline_gf_interval; + const double group_av_inactive_zone = + ((gf_stats->gf_group_inactive_zone_rows * 2) / + (p_rc->baseline_gf_interval * (double)cm->mi_params.mb_rows)); + + int tmp_q; + tmp_q = get_twopass_worst_quality( + cpi, group_av_err, (group_av_skip_pct + group_av_inactive_zone), + vbr_group_bits_per_frame); + rc->active_worst_quality = AOMMAX(tmp_q, rc->active_worst_quality >> 1); + } +#endif + + // Adjust KF group bits and error remaining. + if (is_final_pass) twopass->kf_group_error_left -= gf_stats->gf_group_err; + + // Reset the file position. + reset_fpf_position(&cpi->twopass_frame, start_pos); + + // Calculate a section intra ratio used in setting max loop filter. + if (rc->frames_since_key != 0) { + twopass->section_intra_rating = calculate_section_intra_ratio( + start_pos, twopass->stats_buf_ctx->stats_in_end, + p_rc->baseline_gf_interval); + } + + av1_gop_bit_allocation(cpi, rc, gf_group, rc->frames_since_key == 0, + use_alt_ref, p_rc->gf_group_bits); + + // TODO(jingning): Generalize this condition. + if (is_final_pass) { + cpi->ppi->gf_state.arf_gf_boost_lst = use_alt_ref; + + // Reset rolling actual and target bits counters for ARF groups. + twopass->rolling_arf_group_target_bits = 1; + twopass->rolling_arf_group_actual_bits = 1; + } +#if CONFIG_BITRATE_ACCURACY + if (is_final_pass) { + av1_vbr_rc_set_gop_bit_budget(&cpi->vbr_rc_info, + p_rc->baseline_gf_interval); + } +#endif +} + +/*!\brief Define a GF group. 
+ * + * \ingroup gf_group_algo + * This function defines the structure of a GF group, along with various + * parameters regarding bit-allocation and quality setup. + * + * \param[in] cpi Top-level encoder structure + * \param[in] frame_params Structure with frame parameters + * \param[in] is_final_pass Whether this is the final pass for the + * GF group, or a trial (non-zero) + * + * \remark Nothing is returned. Instead, cpi->ppi->gf_group is changed. + */ +static void define_gf_group(AV1_COMP *cpi, EncodeFrameParams *frame_params, + int is_final_pass) { + AV1_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + TWO_PASS *const twopass = &cpi->ppi->twopass; + FIRSTPASS_STATS next_frame; + const FIRSTPASS_STATS *const start_pos = cpi->twopass_frame.stats_in; + GF_GROUP *gf_group = &cpi->ppi->gf_group; + const GFConfig *const gf_cfg = &oxcf->gf_cfg; + const RateControlCfg *const rc_cfg = &oxcf->rc_cfg; + const int f_w = cm->width; + const int f_h = cm->height; + int i; + const int is_intra_only = rc->frames_since_key == 0; + + cpi->ppi->internal_altref_allowed = (gf_cfg->gf_max_pyr_height > 1); + + // Reset the GF group data structures unless this is a key + // frame in which case it will already have been done. + if (!is_intra_only) { + av1_zero(cpi->ppi->gf_group); + cpi->gf_frame_index = 0; + } + + if (has_no_stats_stage(cpi)) { + define_gf_group_pass0(cpi); + return; + } + + if (cpi->third_pass_ctx && oxcf->pass == AOM_RC_THIRD_PASS) { + int ret = define_gf_group_pass3(cpi, frame_params, is_final_pass); + if (ret == 0) return; + + av1_free_thirdpass_ctx(cpi->third_pass_ctx); + cpi->third_pass_ctx = NULL; + } + + // correct frames_to_key when lookahead queue is emptying + if (cpi->ppi->lap_enabled) { + correct_frames_to_key(cpi); + } + + GF_GROUP_STATS gf_stats; + accumulate_gop_stats(cpi, is_intra_only, f_w, f_h, &next_frame, start_pos, + &gf_stats, &i); + + const int can_disable_arf = !gf_cfg->gf_min_pyr_height; + + // If this is a key frame or the overlay from a previous arf then + // the error score / cost of this frame has already been accounted for. + const int active_min_gf_interval = rc->min_gf_interval; + + // Disable internal ARFs for "still" gf groups. + // zero_motion_accumulator: minimum percentage of (0,0) motion; + // avg_sr_coded_error: average of the SSE per pixel of each frame; + // avg_raw_err_stdev: average of the standard deviation of (0,0) + // motion error per block of each frame. + const int can_disable_internal_arfs = gf_cfg->gf_min_pyr_height <= 1; + if (can_disable_internal_arfs && + gf_stats.zero_motion_accumulator > MIN_ZERO_MOTION && + gf_stats.avg_sr_coded_error < MAX_SR_CODED_ERROR && + gf_stats.avg_raw_err_stdev < MAX_RAW_ERR_VAR) { + cpi->ppi->internal_altref_allowed = 0; + } + + int use_alt_ref; + if (can_disable_arf) { + use_alt_ref = + !is_almost_static(gf_stats.zero_motion_accumulator, + twopass->kf_zeromotion_pct, cpi->ppi->lap_enabled) && + p_rc->use_arf_in_this_kf_group && (i < gf_cfg->lag_in_frames) && + (i >= MIN_GF_INTERVAL); + } else { + use_alt_ref = p_rc->use_arf_in_this_kf_group && + (i < gf_cfg->lag_in_frames) && (i > 2); + } + if (use_alt_ref) { + gf_group->max_layer_depth_allowed = gf_cfg->gf_max_pyr_height; + } else { + gf_group->max_layer_depth_allowed = 0; + } + + int alt_offset = 0; + // The length reduction strategy is tweaked for certain cases, and doesn't + // work well for certain other cases. 
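+  // NOTE (editorial, illustrative): as coded below, shrinking is only
+  // attempted outside lossless mode, and only when either (a) AOM_Q mode is
+  // in use with cq_level <= 128, or (b) internal alt-refs are disallowed for
+  // this group.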
+  const int allow_gf_length_reduction =
+      ((rc_cfg->mode == AOM_Q && rc_cfg->cq_level <= 128) ||
+       !cpi->ppi->internal_altref_allowed) &&
+      !is_lossless_requested(rc_cfg);
+
+  if (allow_gf_length_reduction && use_alt_ref) {
+    // Adjust the length of this gf group if one of the following conditions
+    // is met:
+    // 1: only one overlay frame left and this gf is too long
+    // 2: next gf group is too short to have arf compared to the current gf
+
+    // maximum length of next gf group
+    const int next_gf_len = rc->frames_to_key - i;
+    const int single_overlay_left =
+        next_gf_len == 0 && i > REDUCE_GF_LENGTH_THRESH;
+    // the next gf is probably going to have an ARF but it will be shorter
+    // than this gf
+    const int unbalanced_gf =
+        i > REDUCE_GF_LENGTH_TO_KEY_THRESH &&
+        next_gf_len + 1 < REDUCE_GF_LENGTH_TO_KEY_THRESH &&
+        next_gf_len + 1 >= rc->min_gf_interval;
+
+    if (single_overlay_left || unbalanced_gf) {
+      const int roll_back = REDUCE_GF_LENGTH_BY;
+      // Reduce length only if active_min_gf_interval will be respected later.
+      if (i - roll_back >= active_min_gf_interval + 1) {
+        alt_offset = -roll_back;
+        i -= roll_back;
+        if (is_final_pass) rc->intervals_till_gf_calculate_due = 0;
+        p_rc->gf_intervals[p_rc->cur_gf_index] -= roll_back;
+        reset_fpf_position(&cpi->twopass_frame, start_pos);
+        accumulate_gop_stats(cpi, is_intra_only, f_w, f_h, &next_frame,
+                             start_pos, &gf_stats, &i);
+      }
+    }
+  }
+
+  update_gop_length(rc, p_rc, i, is_final_pass);
+
+  // Set up the structure of this Group-Of-Pictures (same as GF_GROUP)
+  av1_gop_setup_structure(cpi);
+
+  set_gop_bits_boost(cpi, i, is_intra_only, is_final_pass, use_alt_ref,
+                     alt_offset, start_pos, &gf_stats);
+
+  frame_params->frame_type =
+      rc->frames_since_key == 0 ? KEY_FRAME : INTER_FRAME;
+  frame_params->show_frame =
+      !(gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE ||
+        gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE);
+}
+
+/*!\brief Define a GF group for the third pass.
+ *
+ * \ingroup gf_group_algo
+ * This function defines the structure of a GF group for the third pass, along
+ * with various parameters regarding bit-allocation and quality setup based on
+ * the two-pass bitstream.
+ * Much of the function still uses the strategies used for the second pass and
+ * relies on first pass statistics. It is expected that over time these
+ * portions would be replaced with strategies specific to the third pass.
+ *
+ * \param[in]    cpi             Top-level encoder structure
+ * \param[in]    frame_params    Structure with frame parameters
+ * \param[in]    is_final_pass   Whether this is the final pass for the
+ *                               GF group, or a trial (non-zero)
+ *
+ * \return       0: Success;
+ *               -1: There are conflicts between the bitstream and the current
+ *                   config. The values in cpi->ppi->gf_group are also changed.
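+ *
+ * NOTE (editorial, illustrative; not upstream documentation): the caller in
+ * define_gf_group() treats a nonzero return as "third pass info unusable"
+ * and falls back to the regular two pass path:
+ * \code
+ *   int ret = define_gf_group_pass3(cpi, frame_params, is_final_pass);
+ *   if (ret == 0) return;
+ *   av1_free_thirdpass_ctx(cpi->third_pass_ctx);
+ *   cpi->third_pass_ctx = NULL;
+ * \endcode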
+ */
+static int define_gf_group_pass3(AV1_COMP *cpi,
+                                 EncodeFrameParams *frame_params,
+                                 int is_final_pass) {
+  if (!cpi->third_pass_ctx) return -1;
+  AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  FIRSTPASS_STATS next_frame;
+  const FIRSTPASS_STATS *const start_pos = cpi->twopass_frame.stats_in;
+  GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  const GFConfig *const gf_cfg = &oxcf->gf_cfg;
+  const int f_w = cm->width;
+  const int f_h = cm->height;
+  int i;
+  const int is_intra_only = rc->frames_since_key == 0;
+
+  cpi->ppi->internal_altref_allowed = (gf_cfg->gf_max_pyr_height > 1);
+
+  // Reset the GF group data structures unless this is a key
+  // frame in which case it will already have been done.
+  if (!is_intra_only) {
+    av1_zero(cpi->ppi->gf_group);
+    cpi->gf_frame_index = 0;
+  }
+
+  GF_GROUP_STATS gf_stats;
+  accumulate_gop_stats(cpi, is_intra_only, f_w, f_h, &next_frame, start_pos,
+                       &gf_stats, &i);
+
+  const int can_disable_arf = !gf_cfg->gf_min_pyr_height;
+
+  // TODO(any): set cpi->ppi->internal_altref_allowed accordingly;
+
+  int use_alt_ref = av1_check_use_arf(cpi->third_pass_ctx);
+  if (use_alt_ref == 0 && !can_disable_arf) return -1;
+  if (use_alt_ref) {
+    gf_group->max_layer_depth_allowed = gf_cfg->gf_max_pyr_height;
+  } else {
+    gf_group->max_layer_depth_allowed = 0;
+  }
+
+  update_gop_length(rc, p_rc, i, is_final_pass);
+
+  // Set up the structure of this Group-Of-Pictures (same as GF_GROUP)
+  av1_gop_setup_structure(cpi);
+
+  set_gop_bits_boost(cpi, i, is_intra_only, is_final_pass, use_alt_ref, 0,
+                     start_pos, &gf_stats);
+
+  frame_params->frame_type = cpi->third_pass_ctx->frame_info[0].frame_type;
+  frame_params->show_frame = cpi->third_pass_ctx->frame_info[0].is_show_frame;
+  return 0;
+}
+
+// #define FIXED_ARF_BITS
+#ifdef FIXED_ARF_BITS
+#define ARF_BITS_FRACTION 0.75
+#endif
+void av1_gop_bit_allocation(const AV1_COMP *cpi, RATE_CONTROL *const rc,
+                            GF_GROUP *gf_group, int is_key_frame, int use_arf,
+                            int64_t gf_group_bits) {
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  // Calculate the extra bits to be used for boosted frame(s)
+#ifdef FIXED_ARF_BITS
+  int gf_arf_bits = (int)(ARF_BITS_FRACTION * gf_group_bits);
+#else
+  int gf_arf_bits = calculate_boost_bits(
+      p_rc->baseline_gf_interval - (rc->frames_since_key == 0),
+      p_rc->gfu_boost, gf_group_bits);
+#endif
+
+  gf_arf_bits = adjust_boost_bits_for_target_level(cpi, rc, gf_arf_bits,
+                                                   gf_group_bits, 1);
+
+  // Allocate bits to each of the frames in the GF group.
+  allocate_gf_group_bits(gf_group, p_rc, rc, gf_group_bits, gf_arf_bits,
+                         is_key_frame, use_arf);
+}
+
+// Minimum % intra coding observed in first pass (1.0 = 100%)
+#define MIN_INTRA_LEVEL 0.25
+// Minimum ratio between the % of intra coding and inter coding in the first
+// pass after discounting neutral blocks (discounting neutral blocks in this
+// way helps catch scene cuts in clips with very flat areas or letter box
+// format clips with image padding).
+#define INTRA_VS_INTER_THRESH 2.0
+// Hard threshold where the first pass chooses intra for almost all blocks.
+// In such a case, even if the frame is not a scene cut, coding a key frame
+// may be a good option.
+#define VERY_LOW_INTER_THRESH 0.05
+// Maximum threshold for the relative ratio of intra error score vs best
+// inter error score.
+#define KF_II_ERR_THRESHOLD 1.9
+// In real scene cuts there is almost always a sharp change in the intra
+// or inter error score.
+#define ERR_CHANGE_THRESHOLD 0.4
+// For real scene cuts we expect an improvement in the intra inter error
+// ratio in the next frame.
+#define II_IMPROVEMENT_THRESHOLD 3.5
+#define KF_II_MAX 128.0
+// Intra / Inter threshold very low
+#define VERY_LOW_II 1.5
+// For clean slide transitions we expect a sharp single frame spike in error.
+#define ERROR_SPIKE 5.0
+
+// Slide show transition detection.
+// Tests for the case where there is very low error on either side of the
+// current frame but much higher just for this frame. This can help detect
+// key frames in slide shows even where the slides are pictures of different
+// sizes.
+// Also requires that intra and inter errors are very similar to help
+// eliminate harmful false positives.
+// It will not help if the transition is a fade or other multi-frame effect.
+static int slide_transition(const FIRSTPASS_STATS *this_frame,
+                            const FIRSTPASS_STATS *last_frame,
+                            const FIRSTPASS_STATS *next_frame) {
+  return (this_frame->intra_error < (this_frame->coded_error * VERY_LOW_II)) &&
+         (this_frame->coded_error > (last_frame->coded_error * ERROR_SPIKE)) &&
+         (this_frame->coded_error > (next_frame->coded_error * ERROR_SPIKE));
+}
+
+// Threshold for use of the lagging second reference frame. High second ref
+// usage may point to a transient event like a flash or occlusion rather than
+// a real scene cut.
+// We adapt the threshold based on the number of frames in this key-frame
+// group so far.
+static double get_second_ref_usage_thresh(int frame_count_so_far) {
+  const int adapt_upto = 32;
+  const double min_second_ref_usage_thresh = 0.085;
+  const double second_ref_usage_thresh_max_delta = 0.035;
+  if (frame_count_so_far >= adapt_upto) {
+    return min_second_ref_usage_thresh + second_ref_usage_thresh_max_delta;
+  }
+  return min_second_ref_usage_thresh +
+         ((double)frame_count_so_far / (adapt_upto - 1)) *
+             second_ref_usage_thresh_max_delta;
+}
+
+static int test_candidate_kf(const FIRSTPASS_INFO *firstpass_info,
+                             int this_stats_index, int frame_count_so_far,
+                             enum aom_rc_mode rc_mode, int scenecut_mode,
+                             int num_mbs) {
+  const FIRSTPASS_STATS *last_stats =
+      av1_firstpass_info_peek(firstpass_info, this_stats_index - 1);
+  const FIRSTPASS_STATS *this_stats =
+      av1_firstpass_info_peek(firstpass_info, this_stats_index);
+  const FIRSTPASS_STATS *next_stats =
+      av1_firstpass_info_peek(firstpass_info, this_stats_index + 1);
+  if (last_stats == NULL || this_stats == NULL || next_stats == NULL) {
+    return 0;
+  }
+
+  int is_viable_kf = 0;
+  double pcnt_intra = 1.0 - this_stats->pcnt_inter;
+  double modified_pcnt_inter =
+      this_stats->pcnt_inter - this_stats->pcnt_neutral;
+  const double second_ref_usage_thresh =
+      get_second_ref_usage_thresh(frame_count_so_far);
+  int frames_to_test_after_candidate_key = SCENE_CUT_KEY_TEST_INTERVAL;
+  int count_for_tolerable_prediction = 3;
+
+  // We do "-1" because the candidate key is not counted.
+  int stats_after_this_stats =
+      av1_firstpass_info_future_count(firstpass_info, this_stats_index) - 1;
+
+  if (scenecut_mode == ENABLE_SCENECUT_MODE_1) {
+    if (stats_after_this_stats < 3) {
+      return 0;
+    } else {
+      frames_to_test_after_candidate_key = 3;
+      count_for_tolerable_prediction = 1;
+    }
+  }
+  // Make sure we have enough stats after the candidate key.
+ frames_to_test_after_candidate_key = + AOMMIN(frames_to_test_after_candidate_key, stats_after_this_stats); + + // Does the frame satisfy the primary criteria of a key frame? + // See above for an explanation of the test criteria. + // If so, then examine how well it predicts subsequent frames. + if (IMPLIES(rc_mode == AOM_Q, frame_count_so_far >= 3) && + (this_stats->pcnt_second_ref < second_ref_usage_thresh) && + (next_stats->pcnt_second_ref < second_ref_usage_thresh) && + ((this_stats->pcnt_inter < VERY_LOW_INTER_THRESH) || + slide_transition(this_stats, last_stats, next_stats) || + ((pcnt_intra > MIN_INTRA_LEVEL) && + (pcnt_intra > (INTRA_VS_INTER_THRESH * modified_pcnt_inter)) && + ((this_stats->intra_error / + DOUBLE_DIVIDE_CHECK(this_stats->coded_error)) < + KF_II_ERR_THRESHOLD) && + ((fabs(last_stats->coded_error - this_stats->coded_error) / + DOUBLE_DIVIDE_CHECK(this_stats->coded_error) > + ERR_CHANGE_THRESHOLD) || + (fabs(last_stats->intra_error - this_stats->intra_error) / + DOUBLE_DIVIDE_CHECK(this_stats->intra_error) > + ERR_CHANGE_THRESHOLD) || + ((next_stats->intra_error / + DOUBLE_DIVIDE_CHECK(next_stats->coded_error)) > + II_IMPROVEMENT_THRESHOLD))))) { + int i; + double boost_score = 0.0; + double old_boost_score = 0.0; + double decay_accumulator = 1.0; + + // Examine how well the key frame predicts subsequent frames. + for (i = 1; i <= frames_to_test_after_candidate_key; ++i) { + // Get the next frame details + const FIRSTPASS_STATS *local_next_frame = + av1_firstpass_info_peek(firstpass_info, this_stats_index + i); + double next_iiratio = + (BOOST_FACTOR * local_next_frame->intra_error / + DOUBLE_DIVIDE_CHECK(local_next_frame->coded_error)); + + if (next_iiratio > KF_II_MAX) next_iiratio = KF_II_MAX; + + // Cumulative effect of decay in prediction quality. + if (local_next_frame->pcnt_inter > 0.85) + decay_accumulator *= local_next_frame->pcnt_inter; + else + decay_accumulator *= (0.85 + local_next_frame->pcnt_inter) / 2.0; + + // Keep a running total. + boost_score += (decay_accumulator * next_iiratio); + + // Test various breakout clauses. + // TODO(any): Test of intra error should be normalized to an MB. + if ((local_next_frame->pcnt_inter < 0.05) || (next_iiratio < 1.5) || + (((local_next_frame->pcnt_inter - local_next_frame->pcnt_neutral) < + 0.20) && + (next_iiratio < 3.0)) || + ((boost_score - old_boost_score) < 3.0) || + (local_next_frame->intra_error < (200.0 / (double)num_mbs))) { + break; + } + + old_boost_score = boost_score; + } + + // If there is tolerable prediction for at least the next 3 frames then + // break out else discard this potential key frame and move on + if (boost_score > 30.0 && (i > count_for_tolerable_prediction)) { + is_viable_kf = 1; + } else { + is_viable_kf = 0; + } + } + return is_viable_kf; +} + +#define FRAMES_TO_CHECK_DECAY 8 +#define KF_MIN_FRAME_BOOST 80.0 +#define KF_MAX_FRAME_BOOST 128.0 +#define MIN_KF_BOOST 600 // Minimum boost for non-static KF interval +#define MAX_KF_BOOST 3200 +#define MIN_STATIC_KF_BOOST 5400 // Minimum boost for static KF interval + +static int detect_app_forced_key(AV1_COMP *cpi) { + int num_frames_to_app_forced_key = is_forced_keyframe_pending( + cpi->ppi->lookahead, cpi->ppi->lookahead->max_sz, cpi->compressor_stage); + return num_frames_to_app_forced_key; +} + +static int get_projected_kf_boost(AV1_COMP *cpi) { + /* + * If num_stats_used_for_kf_boost >= frames_to_key, then + * all stats needed for prior boost calculation are available. 
* Hence projecting the prior boost is not needed in this case.
+   */
+  if (cpi->ppi->p_rc.num_stats_used_for_kf_boost >= cpi->rc.frames_to_key)
+    return cpi->ppi->p_rc.kf_boost;
+
+  // Get the current tpl factor (number of frames = frames_to_key).
+  double tpl_factor =
+      av1_get_kf_boost_projection_factor(cpi->rc.frames_to_key);
+  // Get the tpl factor when number of frames = num_stats_used_for_kf_boost.
+  double tpl_factor_num_stats = av1_get_kf_boost_projection_factor(
+      cpi->ppi->p_rc.num_stats_used_for_kf_boost);
+  int projected_kf_boost =
+      (int)rint((tpl_factor * cpi->ppi->p_rc.kf_boost) / tpl_factor_num_stats);
+  return projected_kf_boost;
+}
+
+/*!\brief Determine the location of the next key frame
+ *
+ * \ingroup gf_group_algo
+ * This function decides the placement of the next key frame when a
+ * scenecut is detected or the maximum key frame distance is reached.
+ *
+ * \param[in]    cpi              Top-level encoder structure
+ * \param[in]    firstpass_info   struct for firstpass info
+ * \param[in]    num_frames_to_detect_scenecut Maximum lookahead frames.
+ * \param[in]    search_start_idx   The start index for searching the key
+ *                                  frame. Set it to one if we already know
+ *                                  the current frame is a key frame.
+ *                                  Otherwise, set it to zero.
+ *
+ * \return       Number of frames to the next key including the current frame.
+ */
+static int define_kf_interval(AV1_COMP *cpi,
+                              const FIRSTPASS_INFO *firstpass_info,
+                              int num_frames_to_detect_scenecut,
+                              int search_start_idx) {
+  const TWO_PASS *const twopass = &cpi->ppi->twopass;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const KeyFrameCfg *const kf_cfg = &oxcf->kf_cfg;
+  double recent_loop_decay[FRAMES_TO_CHECK_DECAY];
+  double decay_accumulator = 1.0;
+  int i = 0, j;
+  int frames_to_key = search_start_idx;
+  int frames_since_key = rc->frames_since_key + 1;
+  int scenecut_detected = 0;
+
+  int num_frames_to_next_key = detect_app_forced_key(cpi);
+
+  if (num_frames_to_detect_scenecut == 0) {
+    if (num_frames_to_next_key != -1)
+      return num_frames_to_next_key;
+    else
+      return rc->frames_to_key;
+  }
+
+  if (num_frames_to_next_key != -1)
+    num_frames_to_detect_scenecut =
+        AOMMIN(num_frames_to_detect_scenecut, num_frames_to_next_key);
+
+  // Initialize the decay rates for the recent frames to check
+  for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) recent_loop_decay[j] = 1.0;
+
+  i = 0;
+  const int num_mbs = (oxcf->resize_cfg.resize_mode != RESIZE_NONE)
+                          ? cpi->initial_mbs
+                          : cpi->common.mi_params.MBs;
+  const int future_stats_count =
+      av1_firstpass_info_future_count(firstpass_info, 0);
+  while (frames_to_key < future_stats_count &&
+         frames_to_key < num_frames_to_detect_scenecut) {
+    // Provided that we are not at the end of the file...
+    if ((cpi->ppi->p_rc.enable_scenecut_detection > 0) && kf_cfg->auto_key &&
+        frames_to_key + 1 < future_stats_count) {
+      double loop_decay_rate;
+
+      // Check for a scene cut.
+      if (frames_since_key >= kf_cfg->key_freq_min) {
+        scenecut_detected = test_candidate_kf(
+            &twopass->firstpass_info, frames_to_key, frames_since_key,
+            oxcf->rc_cfg.mode, cpi->ppi->p_rc.enable_scenecut_detection,
+            num_mbs);
+        if (scenecut_detected) {
+          break;
+        }
+      }
+
+      // How fast is the prediction quality decaying?
+      const FIRSTPASS_STATS *next_stats =
+          av1_firstpass_info_peek(firstpass_info, frames_to_key + 1);
+      loop_decay_rate = get_prediction_decay_rate(next_stats);
+
+      // We want to know something about the recent past... rather than
+      // as used elsewhere where we are concerned with decay in prediction
+      // quality since the last GF or KF.
+      recent_loop_decay[i % FRAMES_TO_CHECK_DECAY] = loop_decay_rate;
+      decay_accumulator = 1.0;
+      for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j)
+        decay_accumulator *= recent_loop_decay[j];
+
+      // Special check for transition or high motion followed by a
+      // static scene.
+      if (frames_since_key >= kf_cfg->key_freq_min) {
+        scenecut_detected = detect_transition_to_still(
+            firstpass_info, frames_to_key + 1, rc->min_gf_interval, i,
+            kf_cfg->key_freq_max - i, loop_decay_rate, decay_accumulator);
+        if (scenecut_detected) {
+          // In the case of a transition followed by a static scene, the key
+          // frame could be a good predictor for the following frames,
+          // therefore we do not use an arf.
+          p_rc->use_arf_in_this_kf_group = 0;
+          break;
+        }
+      }
+
+      // Step on to the next frame.
+      ++frames_to_key;
+      ++frames_since_key;
+
+      // If we don't have a real key frame within the next two
+      // key_freq_max intervals then break out of the loop.
+      if (frames_to_key >= 2 * kf_cfg->key_freq_max) {
+        break;
+      }
+    } else {
+      ++frames_to_key;
+      ++frames_since_key;
+    }
+    ++i;
+  }
+  if (cpi->ppi->lap_enabled && !scenecut_detected)
+    frames_to_key = num_frames_to_next_key;
+
+  return frames_to_key;
+}
+
+static double get_kf_group_avg_error(TWO_PASS *twopass,
+                                     TWO_PASS_FRAME *twopass_frame,
+                                     const FIRSTPASS_STATS *first_frame,
+                                     const FIRSTPASS_STATS *start_position,
+                                     int frames_to_key) {
+  FIRSTPASS_STATS cur_frame = *first_frame;
+  int num_frames, i;
+  double kf_group_avg_error = 0.0;
+
+  reset_fpf_position(twopass_frame, start_position);
+
+  for (i = 0; i < frames_to_key; ++i) {
+    kf_group_avg_error += cur_frame.coded_error;
+    if (EOF == input_stats(twopass, twopass_frame, &cur_frame)) break;
+  }
+  num_frames = i + 1;
+  num_frames = AOMMIN(num_frames, frames_to_key);
+  kf_group_avg_error = kf_group_avg_error / num_frames;
+
+  return (kf_group_avg_error);
+}
+
+static int64_t get_kf_group_bits(AV1_COMP *cpi, double kf_group_err,
+                                 double kf_group_avg_error) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
+  int64_t kf_group_bits;
+  if (cpi->ppi->lap_enabled) {
+    kf_group_bits = (int64_t)rc->frames_to_key * rc->avg_frame_bandwidth;
+    if (cpi->oxcf.rc_cfg.vbr_corpus_complexity_lap) {
+      double vbr_corpus_complexity_lap =
+          cpi->oxcf.rc_cfg.vbr_corpus_complexity_lap / 10.0;
+      /* Get the average corpus complexity of the frame */
+      kf_group_bits = (int64_t)(
+          kf_group_bits * (kf_group_avg_error / vbr_corpus_complexity_lap));
+    }
+  } else {
+    kf_group_bits = (int64_t)(twopass->bits_left *
+                              (kf_group_err / twopass->modified_error_left));
+  }
+
+  return kf_group_bits;
+}
+
+static int calc_avg_stats(AV1_COMP *cpi, FIRSTPASS_STATS *avg_frame_stat) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
+  FIRSTPASS_STATS cur_frame;
+  av1_zero(cur_frame);
+  int num_frames = 0;
+  // Accumulate total stat using available number of stats.
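+  // NOTE (editorial, illustrative): the loop below reads up to
+  // frames_to_key - 1 stats, stopping early at EOF, and each accumulated
+  // field is then normalized by the number of frames actually read. For
+  // example, with frames_to_key = 30 but only 20 stats left in the buffer,
+  // the averages are taken over those 20 frames.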
+ for (num_frames = 0; num_frames < (rc->frames_to_key - 1); ++num_frames) { + if (EOF == input_stats(twopass, &cpi->twopass_frame, &cur_frame)) break; + av1_accumulate_stats(avg_frame_stat, &cur_frame); + } + + if (num_frames < 2) { + return num_frames; + } + // Average the total stat + avg_frame_stat->weight = avg_frame_stat->weight / num_frames; + avg_frame_stat->intra_error = avg_frame_stat->intra_error / num_frames; + avg_frame_stat->frame_avg_wavelet_energy = + avg_frame_stat->frame_avg_wavelet_energy / num_frames; + avg_frame_stat->coded_error = avg_frame_stat->coded_error / num_frames; + avg_frame_stat->sr_coded_error = avg_frame_stat->sr_coded_error / num_frames; + avg_frame_stat->pcnt_inter = avg_frame_stat->pcnt_inter / num_frames; + avg_frame_stat->pcnt_motion = avg_frame_stat->pcnt_motion / num_frames; + avg_frame_stat->pcnt_second_ref = + avg_frame_stat->pcnt_second_ref / num_frames; + avg_frame_stat->pcnt_neutral = avg_frame_stat->pcnt_neutral / num_frames; + avg_frame_stat->intra_skip_pct = avg_frame_stat->intra_skip_pct / num_frames; + avg_frame_stat->inactive_zone_rows = + avg_frame_stat->inactive_zone_rows / num_frames; + avg_frame_stat->inactive_zone_cols = + avg_frame_stat->inactive_zone_cols / num_frames; + avg_frame_stat->MVr = avg_frame_stat->MVr / num_frames; + avg_frame_stat->mvr_abs = avg_frame_stat->mvr_abs / num_frames; + avg_frame_stat->MVc = avg_frame_stat->MVc / num_frames; + avg_frame_stat->mvc_abs = avg_frame_stat->mvc_abs / num_frames; + avg_frame_stat->MVrv = avg_frame_stat->MVrv / num_frames; + avg_frame_stat->MVcv = avg_frame_stat->MVcv / num_frames; + avg_frame_stat->mv_in_out_count = + avg_frame_stat->mv_in_out_count / num_frames; + avg_frame_stat->new_mv_count = avg_frame_stat->new_mv_count / num_frames; + avg_frame_stat->count = avg_frame_stat->count / num_frames; + avg_frame_stat->duration = avg_frame_stat->duration / num_frames; + + return num_frames; +} + +static double get_kf_boost_score(AV1_COMP *cpi, double kf_raw_err, + double *zero_motion_accumulator, + double *sr_accumulator, int use_avg_stat) { + RATE_CONTROL *const rc = &cpi->rc; + TWO_PASS *const twopass = &cpi->ppi->twopass; + FRAME_INFO *const frame_info = &cpi->frame_info; + FIRSTPASS_STATS frame_stat; + av1_zero(frame_stat); + int i = 0, num_stat_used = 0; + double boost_score = 0.0; + const double kf_max_boost = + cpi->oxcf.rc_cfg.mode == AOM_Q + ? AOMMIN(AOMMAX(rc->frames_to_key * 2.0, KF_MIN_FRAME_BOOST), + KF_MAX_FRAME_BOOST) + : KF_MAX_FRAME_BOOST; + + // Calculate the average using available number of stats. + if (use_avg_stat) num_stat_used = calc_avg_stats(cpi, &frame_stat); + + for (i = num_stat_used; i < (rc->frames_to_key - 1); ++i) { + if (!use_avg_stat && + EOF == input_stats(twopass, &cpi->twopass_frame, &frame_stat)) + break; + + // Monitor for static sections. + // For the first frame in kf group, the second ref indicator is invalid. + if (i > 0) { + *zero_motion_accumulator = + AOMMIN(*zero_motion_accumulator, get_zero_motion_factor(&frame_stat)); + } else { + *zero_motion_accumulator = frame_stat.pcnt_inter - frame_stat.pcnt_motion; + } + + // Not all frames in the group are necessarily used in calculating boost. + if ((*sr_accumulator < (kf_raw_err * 1.50)) && + (i <= rc->max_gf_interval * 2)) { + double frame_boost; + double zm_factor; + + // Factor 0.75-1.25 based on how much of frame is static. 
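+      // NOTE (editorial, illustrative worked example): zero_motion_accumulator
+      // lies in [0, 1], so zm_factor = 0.75 + zma / 2 spans [0.75, 1.25];
+      // e.g. a mostly static section with zma = 0.9 scales the frame boost
+      // by 1.2, while a high-motion section with zma = 0.1 scales it by 0.8.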
+ zm_factor = (0.75 + (*zero_motion_accumulator / 2.0)); + + if (i < 2) *sr_accumulator = 0.0; + frame_boost = + calc_kf_frame_boost(&cpi->ppi->p_rc, frame_info, &frame_stat, + sr_accumulator, kf_max_boost); + boost_score += frame_boost * zm_factor; + } + } + return boost_score; +} + +/*!\brief Interval(in seconds) to clip key-frame distance to in LAP. + */ +#define MAX_KF_BITS_INTERVAL_SINGLE_PASS 5 + +/*!\brief Determine the next key frame group + * + * \ingroup gf_group_algo + * This function decides the placement of the next key frame, and + * calculates the bit allocation of the KF group and the keyframe itself. + * + * \param[in] cpi Top-level encoder structure + * \param[in] this_frame Pointer to first pass stats + */ +static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { + RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + TWO_PASS *const twopass = &cpi->ppi->twopass; + GF_GROUP *const gf_group = &cpi->ppi->gf_group; + FRAME_INFO *const frame_info = &cpi->frame_info; + AV1_COMMON *const cm = &cpi->common; + CurrentFrame *const current_frame = &cm->current_frame; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + const KeyFrameCfg *const kf_cfg = &oxcf->kf_cfg; + const FIRSTPASS_STATS first_frame = *this_frame; + FIRSTPASS_STATS next_frame; + const FIRSTPASS_INFO *firstpass_info = &twopass->firstpass_info; + av1_zero(next_frame); + + rc->frames_since_key = 0; + // Use arfs if possible. + p_rc->use_arf_in_this_kf_group = is_altref_enabled( + oxcf->gf_cfg.lag_in_frames, oxcf->gf_cfg.enable_auto_arf); + + // Reset the GF group data structures. + av1_zero(*gf_group); + cpi->gf_frame_index = 0; + + // KF is always a GF so clear frames till next gf counter. + rc->frames_till_gf_update_due = 0; + + if (has_no_stats_stage(cpi)) { + int num_frames_to_app_forced_key = detect_app_forced_key(cpi); + p_rc->this_key_frame_forced = + current_frame->frame_number != 0 && rc->frames_to_key == 0; + if (num_frames_to_app_forced_key != -1) + rc->frames_to_key = num_frames_to_app_forced_key; + else + rc->frames_to_key = AOMMAX(1, kf_cfg->key_freq_max); + correct_frames_to_key(cpi); + p_rc->kf_boost = DEFAULT_KF_BOOST; + gf_group->update_type[0] = KF_UPDATE; + return; + } + int i; + const FIRSTPASS_STATS *const start_position = cpi->twopass_frame.stats_in; + int kf_bits = 0; + double zero_motion_accumulator = 1.0; + double boost_score = 0.0; + double kf_raw_err = 0.0; + double kf_mod_err = 0.0; + double sr_accumulator = 0.0; + double kf_group_avg_error = 0.0; + int frames_to_key, frames_to_key_clipped = INT_MAX; + int64_t kf_group_bits_clipped = INT64_MAX; + + // Is this a forced key frame by interval. + p_rc->this_key_frame_forced = p_rc->next_key_frame_forced; + + twopass->kf_group_bits = 0; // Total bits available to kf group + twopass->kf_group_error_left = 0; // Group modified error score. + + kf_raw_err = this_frame->intra_error; + kf_mod_err = calculate_modified_err(frame_info, twopass, oxcf, this_frame); + + // We assume the current frame is a key frame and we are looking for the next + // key frame. Therefore search_start_idx = 1 + frames_to_key = define_kf_interval(cpi, firstpass_info, kf_cfg->key_freq_max, + /*search_start_idx=*/1); + + if (frames_to_key != -1) { + rc->frames_to_key = AOMMIN(kf_cfg->key_freq_max, frames_to_key); + } else { + rc->frames_to_key = kf_cfg->key_freq_max; + } + + if (cpi->ppi->lap_enabled) correct_frames_to_key(cpi); + + // If there is a max kf interval set by the user we must obey it. 
+ // We already breakout of the loop above at 2x max. + // This code centers the extra kf if the actual natural interval + // is between 1x and 2x. + if (kf_cfg->auto_key && rc->frames_to_key > kf_cfg->key_freq_max) { + FIRSTPASS_STATS tmp_frame = first_frame; + + rc->frames_to_key /= 2; + + // Reset to the start of the group. + reset_fpf_position(&cpi->twopass_frame, start_position); + // Rescan to get the correct error data for the forced kf group. + for (i = 0; i < rc->frames_to_key; ++i) { + if (EOF == input_stats(twopass, &cpi->twopass_frame, &tmp_frame)) break; + } + p_rc->next_key_frame_forced = 1; + } else if ((cpi->twopass_frame.stats_in == + twopass->stats_buf_ctx->stats_in_end && + is_stat_consumption_stage_twopass(cpi)) || + rc->frames_to_key >= kf_cfg->key_freq_max) { + p_rc->next_key_frame_forced = 1; + } else { + p_rc->next_key_frame_forced = 0; + } + + double kf_group_err = 0; + for (i = 0; i < rc->frames_to_key; ++i) { + const FIRSTPASS_STATS *this_stats = + av1_firstpass_info_peek(&twopass->firstpass_info, i); + if (this_stats != NULL) { + // Accumulate kf group error. + kf_group_err += calculate_modified_err_new( + frame_info, &firstpass_info->total_stats, this_stats, + oxcf->rc_cfg.vbrbias, twopass->modified_error_min, + twopass->modified_error_max); + ++p_rc->num_stats_used_for_kf_boost; + } + } + + // Calculate the number of bits that should be assigned to the kf group. + if ((twopass->bits_left > 0 && twopass->modified_error_left > 0.0) || + (cpi->ppi->lap_enabled && oxcf->rc_cfg.mode != AOM_Q)) { + // Maximum number of bits for a single normal frame (not key frame). + const int max_bits = frame_max_bits(rc, oxcf); + + // Maximum number of bits allocated to the key frame group. + int64_t max_grp_bits; + + if (oxcf->rc_cfg.vbr_corpus_complexity_lap) { + kf_group_avg_error = + get_kf_group_avg_error(twopass, &cpi->twopass_frame, &first_frame, + start_position, rc->frames_to_key); + } + + // Default allocation based on bits left and relative + // complexity of the section. + twopass->kf_group_bits = + get_kf_group_bits(cpi, kf_group_err, kf_group_avg_error); + // Clip based on maximum per frame rate defined by the user. + max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key; + if (twopass->kf_group_bits > max_grp_bits) + twopass->kf_group_bits = max_grp_bits; + } else { + twopass->kf_group_bits = 0; + } + twopass->kf_group_bits = AOMMAX(0, twopass->kf_group_bits); + + if (cpi->ppi->lap_enabled) { + // In the case of single pass based on LAP, frames to key may have an + // inaccurate value, and hence should be clipped to an appropriate + // interval. + frames_to_key_clipped = + (int)(MAX_KF_BITS_INTERVAL_SINGLE_PASS * cpi->framerate); + + // This variable calculates the bits allocated to kf_group with a clipped + // frames_to_key. + if (rc->frames_to_key > frames_to_key_clipped) { + kf_group_bits_clipped = + (int64_t)((double)twopass->kf_group_bits * frames_to_key_clipped / + rc->frames_to_key); + } + } + + // Reset the first pass file position. + reset_fpf_position(&cpi->twopass_frame, start_position); + + // Scan through the kf group collating various stats used to determine + // how many bits to spend on it. + boost_score = get_kf_boost_score(cpi, kf_raw_err, &zero_motion_accumulator, + &sr_accumulator, 0); + reset_fpf_position(&cpi->twopass_frame, start_position); + // Store the zero motion percentage + twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0); + + // Calculate a section intra ratio used in setting max loop filter. 
+ twopass->section_intra_rating = calculate_section_intra_ratio( + start_position, twopass->stats_buf_ctx->stats_in_end, rc->frames_to_key); + + p_rc->kf_boost = (int)boost_score; + + if (cpi->ppi->lap_enabled) { + if (oxcf->rc_cfg.mode == AOM_Q) { + p_rc->kf_boost = get_projected_kf_boost(cpi); + } else { + // TODO(any): Explore using average frame stats for AOM_Q as well. + boost_score = get_kf_boost_score( + cpi, kf_raw_err, &zero_motion_accumulator, &sr_accumulator, 1); + reset_fpf_position(&cpi->twopass_frame, start_position); + p_rc->kf_boost += (int)boost_score; + } + } + + // Special case for static / slide show content but don't apply + // if the kf group is very short. + if ((zero_motion_accumulator > STATIC_KF_GROUP_FLOAT_THRESH) && + (rc->frames_to_key > 8)) { + p_rc->kf_boost = AOMMAX(p_rc->kf_boost, MIN_STATIC_KF_BOOST); + } else { + // Apply various clamps for min and max boost + p_rc->kf_boost = AOMMAX(p_rc->kf_boost, (rc->frames_to_key * 3)); + p_rc->kf_boost = AOMMAX(p_rc->kf_boost, MIN_KF_BOOST); +#ifdef STRICT_RC + p_rc->kf_boost = AOMMIN(p_rc->kf_boost, MAX_KF_BOOST); +#endif + } + + // Work out how many bits to allocate for the key frame itself. + // In case of LAP enabled for VBR, if the frames_to_key value is + // very high, we calculate the bits based on a clipped value of + // frames_to_key. + kf_bits = calculate_boost_bits( + AOMMIN(rc->frames_to_key, frames_to_key_clipped) - 1, p_rc->kf_boost, + AOMMIN(twopass->kf_group_bits, kf_group_bits_clipped)); + // printf("kf boost = %d kf_bits = %d kf_zeromotion_pct = %d\n", + // p_rc->kf_boost, + // kf_bits, twopass->kf_zeromotion_pct); + kf_bits = adjust_boost_bits_for_target_level(cpi, rc, kf_bits, + twopass->kf_group_bits, 0); + + twopass->kf_group_bits -= kf_bits; + + // Save the bits to spend on the key frame. + gf_group->bit_allocation[0] = kf_bits; + gf_group->update_type[0] = KF_UPDATE; + + // Note the total error score of the kf group minus the key frame itself. + if (cpi->ppi->lap_enabled) + // As we don't have enough stats to know the actual error of the group, + // we assume the complexity of each frame to be equal to 1, and set the + // error as the number of frames in the group(minus the keyframe). + twopass->kf_group_error_left = (double)(rc->frames_to_key - 1); + else + twopass->kf_group_error_left = kf_group_err - kf_mod_err; + + // Adjust the count of total modified error left. + // The count of bits left is adjusted elsewhere based on real coded frame + // sizes. 
+ twopass->modified_error_left -= kf_group_err; +} + +#define ARF_STATS_OUTPUT 0 +#if ARF_STATS_OUTPUT +unsigned int arf_count = 0; +#endif + +static int get_section_target_bandwidth(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + CurrentFrame *const current_frame = &cm->current_frame; + RATE_CONTROL *const rc = &cpi->rc; + TWO_PASS *const twopass = &cpi->ppi->twopass; + int section_target_bandwidth; + const int frames_left = (int)(twopass->stats_buf_ctx->total_stats->count - + current_frame->frame_number); + if (cpi->ppi->lap_enabled) + section_target_bandwidth = (int)rc->avg_frame_bandwidth; + else + section_target_bandwidth = (int)(twopass->bits_left / frames_left); + return section_target_bandwidth; +} + +static INLINE void set_twopass_params_based_on_fp_stats( + AV1_COMP *cpi, const FIRSTPASS_STATS *this_frame_ptr) { + if (this_frame_ptr == NULL) return; + + TWO_PASS_FRAME *twopass_frame = &cpi->twopass_frame; + // The multiplication by 256 reverses a scaling factor of (>> 8) + // applied when combining MB error values for the frame. + twopass_frame->mb_av_energy = log1p(this_frame_ptr->intra_error); + + const FIRSTPASS_STATS *const total_stats = + cpi->ppi->twopass.stats_buf_ctx->total_stats; + if (is_fp_wavelet_energy_invalid(total_stats) == 0) { + twopass_frame->frame_avg_haar_energy = + log1p(this_frame_ptr->frame_avg_wavelet_energy); + } + + // Set the frame content type flag. + if (this_frame_ptr->intra_skip_pct >= FC_ANIMATION_THRESH) + twopass_frame->fr_content_type = FC_GRAPHICS_ANIMATION; + else + twopass_frame->fr_content_type = FC_NORMAL; +} + +static void process_first_pass_stats(AV1_COMP *cpi, + FIRSTPASS_STATS *this_frame) { + AV1_COMMON *const cm = &cpi->common; + CurrentFrame *const current_frame = &cm->current_frame; + RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + TWO_PASS *const twopass = &cpi->ppi->twopass; + FIRSTPASS_STATS *total_stats = twopass->stats_buf_ctx->total_stats; + + if (cpi->oxcf.rc_cfg.mode != AOM_Q && current_frame->frame_number == 0 && + cpi->gf_frame_index == 0 && total_stats && + twopass->stats_buf_ctx->total_left_stats) { + if (cpi->ppi->lap_enabled) { + /* + * Accumulate total_stats using available limited number of stats, + * and assign it to total_left_stats. + */ + *twopass->stats_buf_ctx->total_left_stats = *total_stats; + } + // Special case code for first frame. 
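+    // NOTE (editorial, illustrative): the block below estimates a starting
+    // active worst quality for the whole clip from whatever first pass stats
+    // are available, using per-frame averages of the form
+    //   section_error = total coded_error / count,
+    // and feeding them, together with the per-frame bit budget, to
+    // get_twopass_worst_quality().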
+    const int section_target_bandwidth = get_section_target_bandwidth(cpi);
+    const double section_length =
+        twopass->stats_buf_ctx->total_left_stats->count;
+    const double section_error =
+        twopass->stats_buf_ctx->total_left_stats->coded_error / section_length;
+    const double section_intra_skip =
+        twopass->stats_buf_ctx->total_left_stats->intra_skip_pct /
+        section_length;
+    const double section_inactive_zone =
+        (twopass->stats_buf_ctx->total_left_stats->inactive_zone_rows * 2) /
+        ((double)cm->mi_params.mb_rows * section_length);
+    const int tmp_q = get_twopass_worst_quality(
+        cpi, section_error, section_intra_skip + section_inactive_zone,
+        section_target_bandwidth);
+
+    rc->active_worst_quality = tmp_q;
+    rc->ni_av_qi = tmp_q;
+    p_rc->last_q[INTER_FRAME] = tmp_q;
+    p_rc->avg_q = av1_convert_qindex_to_q(tmp_q, cm->seq_params->bit_depth);
+    p_rc->avg_frame_qindex[INTER_FRAME] = tmp_q;
+    p_rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.rc_cfg.best_allowed_q) / 2;
+    p_rc->avg_frame_qindex[KEY_FRAME] = p_rc->last_q[KEY_FRAME];
+  }
+
+  if (cpi->twopass_frame.stats_in < twopass->stats_buf_ctx->stats_in_end) {
+    *this_frame = *cpi->twopass_frame.stats_in;
+    ++cpi->twopass_frame.stats_in;
+  }
+  set_twopass_params_based_on_fp_stats(cpi, this_frame);
+}
+
+static void setup_target_rate(AV1_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+
+  int target_rate = gf_group->bit_allocation[cpi->gf_frame_index];
+
+  if (has_no_stats_stage(cpi)) {
+    av1_rc_set_frame_target(cpi, target_rate, cpi->common.width,
+                            cpi->common.height);
+  }
+
+  rc->base_frame_target = target_rate;
+}
+
+void av1_mark_flashes(FIRSTPASS_STATS *first_stats,
+                      FIRSTPASS_STATS *last_stats) {
+  FIRSTPASS_STATS *this_stats = first_stats, *next_stats;
+  while (this_stats < last_stats - 1) {
+    next_stats = this_stats + 1;
+    if (next_stats->pcnt_second_ref > next_stats->pcnt_inter &&
+        next_stats->pcnt_second_ref >= 0.5) {
+      this_stats->is_flash = 1;
+    } else {
+      this_stats->is_flash = 0;
+    }
+    this_stats = next_stats;
+  }
+  // We always treat the last one as non-flash.
+  if (last_stats - 1 >= first_stats) {
+    (last_stats - 1)->is_flash = 0;
+  }
+}
+
+// Smooth out the noise variance so it is more stable.
+// Returns 0 on success, -1 on memory allocation failure.
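+// NOTE (editorial, illustrative): smooth_filter_noise() below is a simple
+// moving average over a window of 2 * HALF_FILT_LEN + 1 frames, clamped at
+// the sequence ends and skipping frames marked as flashes; if every frame in
+// the window is a flash, the frame keeps its own noise_var.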
+// TODO(bohanli): Use a better low-pass filter than averaging +static int smooth_filter_noise(FIRSTPASS_STATS *first_stats, + FIRSTPASS_STATS *last_stats) { + int len = (int)(last_stats - first_stats); + double *smooth_noise = aom_malloc(len * sizeof(*smooth_noise)); + if (!smooth_noise) return -1; + + for (int i = 0; i < len; i++) { + double total_noise = 0; + double total_wt = 0; + for (int j = -HALF_FILT_LEN; j <= HALF_FILT_LEN; j++) { + int idx = AOMMIN(AOMMAX(i + j, 0), len - 1); + if (first_stats[idx].is_flash) continue; + + total_noise += first_stats[idx].noise_var; + total_wt += 1.0; + } + if (total_wt > 0.01) { + total_noise /= total_wt; + } else { + total_noise = first_stats[i].noise_var; + } + smooth_noise[i] = total_noise; + } + + for (int i = 0; i < len; i++) { + first_stats[i].noise_var = smooth_noise[i]; + } + + aom_free(smooth_noise); + return 0; +} + +// Estimate the noise variance of each frame from the first pass stats +void av1_estimate_noise(FIRSTPASS_STATS *first_stats, + FIRSTPASS_STATS *last_stats, + struct aom_internal_error_info *error_info) { + FIRSTPASS_STATS *this_stats, *next_stats; + double C1, C2, C3, noise; + for (this_stats = first_stats + 2; this_stats < last_stats; this_stats++) { + this_stats->noise_var = 0.0; + // flashes tend to have high correlation of innovations, so ignore them. + if (this_stats->is_flash || (this_stats - 1)->is_flash || + (this_stats - 2)->is_flash) + continue; + + C1 = (this_stats - 1)->intra_error * + (this_stats->intra_error - this_stats->coded_error); + C2 = (this_stats - 2)->intra_error * + ((this_stats - 1)->intra_error - (this_stats - 1)->coded_error); + C3 = (this_stats - 2)->intra_error * + (this_stats->intra_error - this_stats->sr_coded_error); + if (C1 <= 0 || C2 <= 0 || C3 <= 0) continue; + C1 = sqrt(C1); + C2 = sqrt(C2); + C3 = sqrt(C3); + + noise = (this_stats - 1)->intra_error - C1 * C2 / C3; + noise = AOMMAX(noise, 0.01); + this_stats->noise_var = noise; + } + + // Copy noise from the neighbor if the noise value is not trustworthy + for (this_stats = first_stats + 2; this_stats < last_stats; this_stats++) { + if (this_stats->is_flash || (this_stats - 1)->is_flash || + (this_stats - 2)->is_flash) + continue; + if (this_stats->noise_var < 1.0) { + int found = 0; + // TODO(bohanli): consider expanding to two directions at the same time + for (next_stats = this_stats + 1; next_stats < last_stats; next_stats++) { + if (next_stats->is_flash || (next_stats - 1)->is_flash || + (next_stats - 2)->is_flash || next_stats->noise_var < 1.0) + continue; + found = 1; + this_stats->noise_var = next_stats->noise_var; + break; + } + if (found) continue; + for (next_stats = this_stats - 1; next_stats >= first_stats + 2; + next_stats--) { + if (next_stats->is_flash || (next_stats - 1)->is_flash || + (next_stats - 2)->is_flash || next_stats->noise_var < 1.0) + continue; + this_stats->noise_var = next_stats->noise_var; + break; + } + } + } + + // copy the noise if this is a flash + for (this_stats = first_stats + 2; this_stats < last_stats; this_stats++) { + if (this_stats->is_flash || (this_stats - 1)->is_flash || + (this_stats - 2)->is_flash) { + int found = 0; + for (next_stats = this_stats + 1; next_stats < last_stats; next_stats++) { + if (next_stats->is_flash || (next_stats - 1)->is_flash || + (next_stats - 2)->is_flash) + continue; + found = 1; + this_stats->noise_var = next_stats->noise_var; + break; + } + if (found) continue; + for (next_stats = this_stats - 1; next_stats >= first_stats + 2; + next_stats--) { + if 
(next_stats->is_flash || (next_stats - 1)->is_flash ||
+            (next_stats - 2)->is_flash)
+          continue;
+        this_stats->noise_var = next_stats->noise_var;
+        break;
+      }
+    }
+  }
+
+  // If we are at the first 2 frames, copy the noise.
+  for (this_stats = first_stats;
+       this_stats < first_stats + 2 && (first_stats + 2) < last_stats;
+       this_stats++) {
+    this_stats->noise_var = (first_stats + 2)->noise_var;
+  }
+
+  if (smooth_filter_noise(first_stats, last_stats) == -1) {
+    aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
+                       "Error allocating buffers in smooth_filter_noise()");
+  }
+}
+
+// Estimate correlation coefficient of each frame with its previous frame.
+void av1_estimate_coeff(FIRSTPASS_STATS *first_stats,
+                        FIRSTPASS_STATS *last_stats) {
+  FIRSTPASS_STATS *this_stats;
+  for (this_stats = first_stats + 1; this_stats < last_stats; this_stats++) {
+    const double C =
+        sqrt(AOMMAX((this_stats - 1)->intra_error *
+                        (this_stats->intra_error - this_stats->coded_error),
+                    0.001));
+    const double cor_coeff =
+        C /
+        AOMMAX((this_stats - 1)->intra_error - this_stats->noise_var, 0.001);
+
+    this_stats->cor_coeff =
+        cor_coeff *
+        sqrt(AOMMAX((this_stats - 1)->intra_error - this_stats->noise_var,
+                    0.001) /
+             AOMMAX(this_stats->intra_error - this_stats->noise_var, 0.001));
+    // Clip correlation coefficient.
+    this_stats->cor_coeff = AOMMIN(AOMMAX(this_stats->cor_coeff, 0), 1);
+  }
+  first_stats->cor_coeff = 1.0;
+}
+
+void av1_get_second_pass_params(AV1_COMP *cpi,
+                                EncodeFrameParams *const frame_params,
+                                unsigned int frame_flags) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
+  GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+
+  if (cpi->use_ducky_encode &&
+      cpi->ducky_encode_info.frame_info.gop_mode == DUCKY_ENCODE_GOP_MODE_RCL) {
+    frame_params->frame_type = gf_group->frame_type[cpi->gf_frame_index];
+    frame_params->show_frame =
+        !(gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE ||
+          gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE);
+    if (cpi->gf_frame_index == 0) {
+      av1_tf_info_reset(&cpi->ppi->tf_info);
+      av1_tf_info_filtering(&cpi->ppi->tf_info, cpi, gf_group);
+    }
+    return;
+  }
+
+  const FIRSTPASS_STATS *const start_pos = cpi->twopass_frame.stats_in;
+  int update_total_stats = 0;
+
+  if (is_stat_consumption_stage(cpi) && !cpi->twopass_frame.stats_in) return;
+
+  // Check forced key frames.
+  const int frames_to_next_forced_key = detect_app_forced_key(cpi);
+  if (frames_to_next_forced_key == 0) {
+    rc->frames_to_key = 0;
+    frame_flags &= FRAMEFLAGS_KEY;
+  } else if (frames_to_next_forced_key > 0 &&
+             frames_to_next_forced_key < rc->frames_to_key) {
+    rc->frames_to_key = frames_to_next_forced_key;
+  }
+
+  assert(cpi->twopass_frame.stats_in != NULL);
+  const int update_type = gf_group->update_type[cpi->gf_frame_index];
+  frame_params->frame_type = gf_group->frame_type[cpi->gf_frame_index];
+
+  if (cpi->gf_frame_index < gf_group->size && !(frame_flags & FRAMEFLAGS_KEY)) {
+    assert(cpi->gf_frame_index < gf_group->size);
+
+    setup_target_rate(cpi);
+
+    // If this is an arf frame then we don't want to read the stats file or
+    // advance the input pointer as we already have what we need.
+    if (update_type == ARF_UPDATE || update_type == INTNL_ARF_UPDATE) {
+      const FIRSTPASS_STATS *const this_frame_ptr =
+          read_frame_stats(twopass, &cpi->twopass_frame,
+                           gf_group->arf_src_offset[cpi->gf_frame_index]);
+      set_twopass_params_based_on_fp_stats(cpi, this_frame_ptr);
+      return;
+    }
+  }
+
+  if (oxcf->rc_cfg.mode == AOM_Q)
+    rc->active_worst_quality = oxcf->rc_cfg.cq_level;
+
+  if (cpi->gf_frame_index == gf_group->size) {
+    if (cpi->ppi->lap_enabled && cpi->ppi->p_rc.enable_scenecut_detection) {
+      const int num_frames_to_detect_scenecut = MAX_GF_LENGTH_LAP + 1;
+      const int frames_to_key = define_kf_interval(
+          cpi, &twopass->firstpass_info, num_frames_to_detect_scenecut,
+          /*search_start_idx=*/0);
+      if (frames_to_key != -1)
+        rc->frames_to_key = AOMMIN(rc->frames_to_key, frames_to_key);
+    }
+  }
+
+  FIRSTPASS_STATS this_frame;
+  av1_zero(this_frame);
+  // Read and process the first pass stats for the current frame.
+  if (is_stat_consumption_stage(cpi)) {
+    if (cpi->gf_frame_index < gf_group->size || rc->frames_to_key == 0) {
+      process_first_pass_stats(cpi, &this_frame);
+      update_total_stats = 1;
+    }
+  } else {
+    rc->active_worst_quality = oxcf->rc_cfg.cq_level;
+  }
+
+  // Keyframe and section processing.
+  FIRSTPASS_STATS this_frame_copy;
+  this_frame_copy = this_frame;
+  if (rc->frames_to_key <= 0) {
+    assert(rc->frames_to_key == 0);
+    // Define next KF group and assign bits to it.
+    frame_params->frame_type = KEY_FRAME;
+    find_next_key_frame(cpi, &this_frame);
+    this_frame = this_frame_copy;
+  }
+
+  if (rc->frames_to_fwd_kf <= 0)
+    rc->frames_to_fwd_kf = oxcf->kf_cfg.fwd_kf_dist;
+
+  // Define a new GF/ARF group. (Should always enter here for key frames).
+  if (cpi->gf_frame_index == gf_group->size) {
+    av1_tf_info_reset(&cpi->ppi->tf_info);
+#if CONFIG_BITRATE_ACCURACY && !CONFIG_THREE_PASS
+    vbr_rc_reset_gop_data(&cpi->vbr_rc_info);
+#endif  // CONFIG_BITRATE_ACCURACY
+    int max_gop_length =
+        (oxcf->gf_cfg.lag_in_frames >= 32)
+            ? AOMMIN(MAX_GF_INTERVAL, oxcf->gf_cfg.lag_in_frames -
+                                          oxcf->algo_cfg.arnr_max_frames / 2)
+            : MAX_GF_LENGTH_LAP;
+
+    // Handle forward key frame when enabled.
+    if (oxcf->kf_cfg.fwd_kf_dist > 0)
+      max_gop_length = AOMMIN(rc->frames_to_fwd_kf + 1, max_gop_length);
+
+    // Use the provided gop size in low delay setting
+    if (oxcf->gf_cfg.lag_in_frames == 0) max_gop_length = rc->max_gf_interval;
+
+    // Limit the max gop length for the last gop in 1 pass setting.
+    max_gop_length = AOMMIN(max_gop_length, rc->frames_to_key);
+
+    // Identify regions if needed.
+    // TODO(bohanli): identify regions for all stats available.
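+    // Regions are recomputed at the start of a key frame group, or when the
+    // previously analyzed window would run out before the next GOP ends.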
+ if (rc->frames_since_key == 0 || rc->frames_since_key == 1 || + (p_rc->frames_till_regions_update - rc->frames_since_key < + rc->frames_to_key && + p_rc->frames_till_regions_update - rc->frames_since_key < + max_gop_length + 1)) { + // how many frames we can analyze from this frame + int rest_frames = + AOMMIN(rc->frames_to_key, MAX_FIRSTPASS_ANALYSIS_FRAMES); + rest_frames = + AOMMIN(rest_frames, (int)(twopass->stats_buf_ctx->stats_in_end - + cpi->twopass_frame.stats_in + + (rc->frames_since_key == 0))); + p_rc->frames_till_regions_update = rest_frames; + + int ret; + if (cpi->ppi->lap_enabled) { + av1_mark_flashes(twopass->stats_buf_ctx->stats_in_start, + twopass->stats_buf_ctx->stats_in_end); + av1_estimate_noise(twopass->stats_buf_ctx->stats_in_start, + twopass->stats_buf_ctx->stats_in_end, + cpi->common.error); + av1_estimate_coeff(twopass->stats_buf_ctx->stats_in_start, + twopass->stats_buf_ctx->stats_in_end); + ret = identify_regions(cpi->twopass_frame.stats_in, rest_frames, + (rc->frames_since_key == 0), p_rc->regions, + &p_rc->num_regions); + } else { + ret = identify_regions( + cpi->twopass_frame.stats_in - (rc->frames_since_key == 0), + rest_frames, 0, p_rc->regions, &p_rc->num_regions); + } + if (ret == -1) { + aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR, + "Error allocating buffers in identify_regions"); + } + } + + int cur_region_idx = + find_regions_index(p_rc->regions, p_rc->num_regions, + rc->frames_since_key - p_rc->regions_offset); + if ((cur_region_idx >= 0 && + p_rc->regions[cur_region_idx].type == SCENECUT_REGION) || + rc->frames_since_key == 0) { + // If we start from a scenecut, then the last GOP's arf boost is not + // needed for this GOP. + cpi->ppi->gf_state.arf_gf_boost_lst = 0; + } + + int need_gf_len = 1; + if (cpi->third_pass_ctx && oxcf->pass == AOM_RC_THIRD_PASS) { + // set up bitstream to read + if (!cpi->third_pass_ctx->input_file_name && oxcf->two_pass_output) { + cpi->third_pass_ctx->input_file_name = oxcf->two_pass_output; + } + av1_open_second_pass_log(cpi, 1); + THIRD_PASS_GOP_INFO *gop_info = &cpi->third_pass_ctx->gop_info; + // Read in GOP information from the second pass file. + av1_read_second_pass_gop_info(cpi->second_pass_log_stream, gop_info, + cpi->common.error); +#if CONFIG_BITRATE_ACCURACY + TPL_INFO *tpl_info; + AOM_CHECK_MEM_ERROR(cpi->common.error, tpl_info, + aom_malloc(sizeof(*tpl_info))); + av1_read_tpl_info(tpl_info, cpi->second_pass_log_stream, + cpi->common.error); + aom_free(tpl_info); +#if CONFIG_THREE_PASS + // TODO(angiebird): Put this part into a func + cpi->vbr_rc_info.cur_gop_idx++; +#endif // CONFIG_THREE_PASS +#endif // CONFIG_BITRATE_ACCURACY + // Read in third_pass_info from the bitstream. + av1_set_gop_third_pass(cpi->third_pass_ctx); + // Read in per-frame info from second-pass encoding + av1_read_second_pass_per_frame_info( + cpi->second_pass_log_stream, cpi->third_pass_ctx->frame_info, + gop_info->num_frames, cpi->common.error); + + p_rc->cur_gf_index = 0; + p_rc->gf_intervals[0] = cpi->third_pass_ctx->gop_info.gf_length; + need_gf_len = 0; + } + + if (need_gf_len) { + // If we cannot obtain GF group length from second_pass_file + // TODO(jingning): Resolve the redundant calls here. 
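+      // Note: the "|| 1" below makes the condition always true, so the GF
+      // group length is currently recalculated for every GOP.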
+      if (rc->intervals_till_gf_calculate_due == 0 || 1) {
+        calculate_gf_length(cpi, max_gop_length, MAX_NUM_GF_INTERVALS);
+      }
+
+      if (max_gop_length > 16 && oxcf->algo_cfg.enable_tpl_model &&
+          oxcf->gf_cfg.lag_in_frames >= 32 &&
+          cpi->sf.tpl_sf.gop_length_decision_method != 3) {
+        int this_idx = rc->frames_since_key +
+                       p_rc->gf_intervals[p_rc->cur_gf_index] -
+                       p_rc->regions_offset - 1;
+        int this_region =
+            find_regions_index(p_rc->regions, p_rc->num_regions, this_idx);
+        int next_region =
+            find_regions_index(p_rc->regions, p_rc->num_regions, this_idx + 1);
+        // TODO(angiebird): Figure out why this_region and next_region are -1 in
+        // unit test like AltRefFramePresenceTestLarge (aomedia:3134)
+        int is_last_scenecut =
+            p_rc->gf_intervals[p_rc->cur_gf_index] >= rc->frames_to_key ||
+            (this_region != -1 &&
+             p_rc->regions[this_region].type == SCENECUT_REGION) ||
+            (next_region != -1 &&
+             p_rc->regions[next_region].type == SCENECUT_REGION);
+
+        int ori_gf_int = p_rc->gf_intervals[p_rc->cur_gf_index];
+
+        if (p_rc->gf_intervals[p_rc->cur_gf_index] > 16 &&
+            rc->min_gf_interval <= 16) {
+          // calculate_gf_length() was previously called with
+          // max_gop_length = 32, using look-ahead gf intervals.
+          define_gf_group(cpi, frame_params, 0);
+          av1_tf_info_filtering(&cpi->ppi->tf_info, cpi, gf_group);
+          this_frame = this_frame_copy;
+
+          if (is_shorter_gf_interval_better(cpi, frame_params)) {
+            // A shorter gf interval is better.
+            // TODO(jingning): Remove redundant computations here.
+            max_gop_length = 16;
+            calculate_gf_length(cpi, max_gop_length, 1);
+            if (is_last_scenecut &&
+                (ori_gf_int - p_rc->gf_intervals[p_rc->cur_gf_index] < 4)) {
+              p_rc->gf_intervals[p_rc->cur_gf_index] = ori_gf_int;
+            }
+          }
+        }
+      }
+    }
+
+    define_gf_group(cpi, frame_params, 0);
+
+    if (gf_group->update_type[cpi->gf_frame_index] != ARF_UPDATE &&
+        rc->frames_since_key > 0)
+      process_first_pass_stats(cpi, &this_frame);
+
+    define_gf_group(cpi, frame_params, 1);
+
+    // Write GOP info if needed for the third pass. Per-frame info is written
+    // after each frame is encoded.
+    av1_write_second_pass_gop_info(cpi);
+
+    av1_tf_info_filtering(&cpi->ppi->tf_info, cpi, gf_group);
+
+    rc->frames_till_gf_update_due = p_rc->baseline_gf_interval;
+    assert(cpi->gf_frame_index == 0);
+#if ARF_STATS_OUTPUT
+    {
+      FILE *fpfile;
+      fpfile = fopen("arf.stt", "a");
+      ++arf_count;
+      fprintf(fpfile, "%10d %10d %10d %10d %10d\n",
+              cpi->common.current_frame.frame_number,
+              rc->frames_till_gf_update_due, cpi->ppi->p_rc.kf_boost, arf_count,
+              p_rc->gfu_boost);
+
+      fclose(fpfile);
+    }
+#endif
+  }
+  assert(cpi->gf_frame_index < gf_group->size);
+
+  if (gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE ||
+      gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE) {
+    reset_fpf_position(&cpi->twopass_frame, start_pos);
+
+    const FIRSTPASS_STATS *const this_frame_ptr =
+        read_frame_stats(twopass, &cpi->twopass_frame,
+                         gf_group->arf_src_offset[cpi->gf_frame_index]);
+    set_twopass_params_based_on_fp_stats(cpi, this_frame_ptr);
+  } else {
+    // Back up this frame's stats for updating total stats during post encode.
+    cpi->twopass_frame.this_frame = update_total_stats ?
start_pos : NULL; + } + + frame_params->frame_type = gf_group->frame_type[cpi->gf_frame_index]; + setup_target_rate(cpi); +} + +void av1_init_second_pass(AV1_COMP *cpi) { + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + TWO_PASS *const twopass = &cpi->ppi->twopass; + FRAME_INFO *const frame_info = &cpi->frame_info; + double frame_rate; + FIRSTPASS_STATS *stats; + + if (!twopass->stats_buf_ctx->stats_in_end) return; + + av1_mark_flashes(twopass->stats_buf_ctx->stats_in_start, + twopass->stats_buf_ctx->stats_in_end); + av1_estimate_noise(twopass->stats_buf_ctx->stats_in_start, + twopass->stats_buf_ctx->stats_in_end, cpi->common.error); + av1_estimate_coeff(twopass->stats_buf_ctx->stats_in_start, + twopass->stats_buf_ctx->stats_in_end); + + stats = twopass->stats_buf_ctx->total_stats; + + *stats = *twopass->stats_buf_ctx->stats_in_end; + *twopass->stats_buf_ctx->total_left_stats = *stats; + + frame_rate = 10000000.0 * stats->count / stats->duration; + // Each frame can have a different duration, as the frame rate in the source + // isn't guaranteed to be constant. The frame rate prior to the first frame + // encoded in the second pass is a guess. However, the sum duration is not. + // It is calculated based on the actual durations of all frames from the + // first pass. + av1_new_framerate(cpi, frame_rate); + twopass->bits_left = + (int64_t)(stats->duration * oxcf->rc_cfg.target_bandwidth / 10000000.0); + +#if CONFIG_BITRATE_ACCURACY + av1_vbr_rc_init(&cpi->vbr_rc_info, twopass->bits_left, + (int)round(stats->count)); +#endif + +#if CONFIG_RATECTRL_LOG + rc_log_init(&cpi->rc_log); +#endif + + // This variable monitors how far behind the second ref update is lagging. + twopass->sr_update_lag = 1; + + // Scan the first pass file and calculate a modified total error based upon + // the bias/power function used to allocate bits. + { + const double avg_error = + stats->coded_error / DOUBLE_DIVIDE_CHECK(stats->count); + const FIRSTPASS_STATS *s = cpi->twopass_frame.stats_in; + double modified_error_total = 0.0; + twopass->modified_error_min = + (avg_error * oxcf->rc_cfg.vbrmin_section) / 100; + twopass->modified_error_max = + (avg_error * oxcf->rc_cfg.vbrmax_section) / 100; + while (s < twopass->stats_buf_ctx->stats_in_end) { + modified_error_total += + calculate_modified_err(frame_info, twopass, oxcf, s); + ++s; + } + twopass->modified_error_left = modified_error_total; + } + + // Reset the vbr bits off target counters + cpi->ppi->p_rc.vbr_bits_off_target = 0; + cpi->ppi->p_rc.vbr_bits_off_target_fast = 0; + + cpi->ppi->p_rc.rate_error_estimate = 0; + + // Static sequence monitor variables. + twopass->kf_zeromotion_pct = 100; + twopass->last_kfgroup_zeromotion_pct = 100; + + // Initialize bits per macro_block estimate correction factor. + twopass->bpm_factor = 1.0; + // Initialize actual and target bits counters for ARF groups so that + // at the start we have a neutral bpm adjustment. + twopass->rolling_arf_group_target_bits = 1; + twopass->rolling_arf_group_actual_bits = 1; +} + +void av1_init_single_pass_lap(AV1_COMP *cpi) { + TWO_PASS *const twopass = &cpi->ppi->twopass; + + if (!twopass->stats_buf_ctx->stats_in_end) return; + + // This variable monitors how far behind the second ref update is lagging. 
+ twopass->sr_update_lag = 1; + + twopass->bits_left = 0; + twopass->modified_error_min = 0.0; + twopass->modified_error_max = 0.0; + twopass->modified_error_left = 0.0; + + // Reset the vbr bits off target counters + cpi->ppi->p_rc.vbr_bits_off_target = 0; + cpi->ppi->p_rc.vbr_bits_off_target_fast = 0; + + cpi->ppi->p_rc.rate_error_estimate = 0; + + // Static sequence monitor variables. + twopass->kf_zeromotion_pct = 100; + twopass->last_kfgroup_zeromotion_pct = 100; + + // Initialize bits per macro_block estimate correction factor. + twopass->bpm_factor = 1.0; + // Initialize actual and target bits counters for ARF groups so that + // at the start we have a neutral bpm adjustment. + twopass->rolling_arf_group_target_bits = 1; + twopass->rolling_arf_group_actual_bits = 1; +} + +#define MINQ_ADJ_LIMIT 48 +#define MINQ_ADJ_LIMIT_CQ 20 +#define HIGH_UNDERSHOOT_RATIO 2 +void av1_twopass_postencode_update(AV1_COMP *cpi) { + TWO_PASS *const twopass = &cpi->ppi->twopass; + RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg; + + // Increment the stats_in pointer. + if (is_stat_consumption_stage(cpi) && + !(cpi->use_ducky_encode && cpi->ducky_encode_info.frame_info.gop_mode == + DUCKY_ENCODE_GOP_MODE_RCL) && + (cpi->gf_frame_index < cpi->ppi->gf_group.size || + rc->frames_to_key == 0)) { + const int update_type = cpi->ppi->gf_group.update_type[cpi->gf_frame_index]; + if (update_type != ARF_UPDATE && update_type != INTNL_ARF_UPDATE) { + FIRSTPASS_STATS this_frame; + assert(cpi->twopass_frame.stats_in > + twopass->stats_buf_ctx->stats_in_start); + --cpi->twopass_frame.stats_in; + if (cpi->ppi->lap_enabled) { + input_stats_lap(twopass, &cpi->twopass_frame, &this_frame); + } else { + input_stats(twopass, &cpi->twopass_frame, &this_frame); + } + } else if (cpi->ppi->lap_enabled) { + cpi->twopass_frame.stats_in = twopass->stats_buf_ctx->stats_in_start; + } + } + + // VBR correction is done through rc->vbr_bits_off_target. Based on the + // sign of this value, a limited % adjustment is made to the target rate + // of subsequent frames, to try and push it back towards 0. This method + // is designed to prevent extreme behaviour at the end of a clip + // or group of frames. + p_rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size; + twopass->bits_left = AOMMAX(twopass->bits_left - rc->base_frame_target, 0); + + if (cpi->do_update_vbr_bits_off_target_fast) { + // Subtract current frame's fast_extra_bits. + p_rc->vbr_bits_off_target_fast -= rc->frame_level_fast_extra_bits; + rc->frame_level_fast_extra_bits = 0; + } + + // Target vs actual bits for this arf group. + twopass->rolling_arf_group_target_bits += rc->base_frame_target; + twopass->rolling_arf_group_actual_bits += rc->projected_frame_size; + + // Calculate the pct rc error. + if (p_rc->total_actual_bits) { + p_rc->rate_error_estimate = + (int)((p_rc->vbr_bits_off_target * 100) / p_rc->total_actual_bits); + p_rc->rate_error_estimate = clamp(p_rc->rate_error_estimate, -100, 100); + } else { + p_rc->rate_error_estimate = 0; + } + +#if CONFIG_FPMT_TEST + /* The variables temp_vbr_bits_off_target, temp_bits_left, + * temp_rolling_arf_group_target_bits, temp_rolling_arf_group_actual_bits + * temp_rate_error_estimate are introduced for quality simulation purpose, + * it retains the value previous to the parallel encode frames. The + * variables are updated based on the update flag. 
+ * + * If there exist show_existing_frames between parallel frames, then to + * retain the temp state do not update it. */ + const int simulate_parallel_frame = + cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE; + int show_existing_between_parallel_frames = + (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == + INTNL_OVERLAY_UPDATE && + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index + 1] == 2); + + if (cpi->do_frame_data_update && !show_existing_between_parallel_frames && + simulate_parallel_frame) { + cpi->ppi->p_rc.temp_vbr_bits_off_target = p_rc->vbr_bits_off_target; + cpi->ppi->p_rc.temp_bits_left = twopass->bits_left; + cpi->ppi->p_rc.temp_rolling_arf_group_target_bits = + twopass->rolling_arf_group_target_bits; + cpi->ppi->p_rc.temp_rolling_arf_group_actual_bits = + twopass->rolling_arf_group_actual_bits; + cpi->ppi->p_rc.temp_rate_error_estimate = p_rc->rate_error_estimate; + } +#endif + // Update the active best quality pyramid. + if (!rc->is_src_frame_alt_ref) { + const int pyramid_level = + cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index]; + int i; + for (i = pyramid_level; i <= MAX_ARF_LAYERS; ++i) { + p_rc->active_best_quality[i] = cpi->common.quant_params.base_qindex; +#if CONFIG_TUNE_VMAF + if (cpi->vmaf_info.original_qindex != -1 && + (cpi->oxcf.tune_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING && + cpi->oxcf.tune_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN)) { + p_rc->active_best_quality[i] = cpi->vmaf_info.original_qindex; + } +#endif + } + } + +#if 0 + { + AV1_COMMON *cm = &cpi->common; + FILE *fpfile; + fpfile = fopen("details.stt", "a"); + fprintf(fpfile, + "%10d %10d %10d %10" PRId64 " %10" PRId64 + " %10d %10d %10d %10.4lf %10.4lf %10.4lf %10.4lf\n", + cm->current_frame.frame_number, rc->base_frame_target, + rc->projected_frame_size, rc->total_actual_bits, + rc->vbr_bits_off_target, p_rc->rate_error_estimate, + twopass->rolling_arf_group_target_bits, + twopass->rolling_arf_group_actual_bits, + (double)twopass->rolling_arf_group_actual_bits / + (double)twopass->rolling_arf_group_target_bits, + twopass->bpm_factor, + av1_convert_qindex_to_q(cpi->common.quant_params.base_qindex, + cm->seq_params->bit_depth), + av1_convert_qindex_to_q(rc->active_worst_quality, + cm->seq_params->bit_depth)); + fclose(fpfile); + } +#endif + + if (cpi->common.current_frame.frame_type != KEY_FRAME) { + twopass->kf_group_bits -= rc->base_frame_target; + twopass->last_kfgroup_zeromotion_pct = twopass->kf_zeromotion_pct; + } + twopass->kf_group_bits = AOMMAX(twopass->kf_group_bits, 0); + + // If the rate control is drifting consider adjustment to min or maxq. + if ((rc_cfg->mode != AOM_Q) && !cpi->rc.is_src_frame_alt_ref) { + int minq_adj_limit; + int maxq_adj_limit; + minq_adj_limit = + (rc_cfg->mode == AOM_CQ ? 
MINQ_ADJ_LIMIT_CQ : MINQ_ADJ_LIMIT);
+    maxq_adj_limit = rc->worst_quality - rc->active_worst_quality;
+
+    // Undershoot
+    if ((rc_cfg->under_shoot_pct < 100) &&
+        (p_rc->rolling_actual_bits < p_rc->rolling_target_bits)) {
+      int pct_error =
+          ((p_rc->rolling_target_bits - p_rc->rolling_actual_bits) * 100) /
+          p_rc->rolling_target_bits;
+
+      if ((pct_error >= rc_cfg->under_shoot_pct) &&
+          (p_rc->rate_error_estimate > 0)) {
+        twopass->extend_minq += 1;
+      }
+      twopass->extend_maxq -= 1;
+      // Overshoot
+    } else if ((rc_cfg->over_shoot_pct < 100) &&
+               (p_rc->rolling_actual_bits > p_rc->rolling_target_bits)) {
+      int pct_error =
+          ((p_rc->rolling_actual_bits - p_rc->rolling_target_bits) * 100) /
+          p_rc->rolling_target_bits;
+
+      pct_error = clamp(pct_error, 0, 100);
+      if ((pct_error >= rc_cfg->over_shoot_pct) &&
+          (p_rc->rate_error_estimate < 0)) {
+        twopass->extend_maxq += 1;
+      }
+      twopass->extend_minq -= 1;
+    } else {
+      // Adjustment for extreme local overshoot.
+      // Only applies when normal adjustment above is not used (e.g.
+      // when threshold is set to 100).
+      if (rc->projected_frame_size > (2 * rc->base_frame_target) &&
+          rc->projected_frame_size > (2 * rc->avg_frame_bandwidth))
+        ++twopass->extend_maxq;
+      // Unwind extreme overshoot adjustment.
+      else if (p_rc->rolling_target_bits > p_rc->rolling_actual_bits)
+        --twopass->extend_maxq;
+    }
+    twopass->extend_minq =
+        clamp(twopass->extend_minq, -minq_adj_limit, minq_adj_limit);
+    twopass->extend_maxq = clamp(twopass->extend_maxq, 0, maxq_adj_limit);
+
+    // If there is a big and unexpected undershoot then feed the extra
+    // bits back in quickly. One situation where this may happen is if a
+    // frame is unexpectedly almost perfectly predicted by the ARF or GF
+    // but not very well predicted by the previous frame.
+    if (!frame_is_kf_gf_arf(cpi) && !cpi->rc.is_src_frame_alt_ref) {
+      int fast_extra_thresh = rc->base_frame_target / HIGH_UNDERSHOOT_RATIO;
+      if (rc->projected_frame_size < fast_extra_thresh) {
+        p_rc->vbr_bits_off_target_fast +=
+            fast_extra_thresh - rc->projected_frame_size;
+        p_rc->vbr_bits_off_target_fast = AOMMIN(p_rc->vbr_bits_off_target_fast,
+                                                (4 * rc->avg_frame_bandwidth));
+      }
+    }
+
+#if CONFIG_FPMT_TEST
+    if (cpi->do_frame_data_update && !show_existing_between_parallel_frames &&
+        simulate_parallel_frame) {
+      cpi->ppi->p_rc.temp_vbr_bits_off_target_fast =
+          p_rc->vbr_bits_off_target_fast;
+      cpi->ppi->p_rc.temp_extend_minq = twopass->extend_minq;
+      cpi->ppi->p_rc.temp_extend_maxq = twopass->extend_maxq;
+    }
+#endif
+  }
+
+  // Update the frame probabilities obtained from parallel encode frames
+  FrameProbInfo *const frame_probs = &cpi->ppi->frame_probs;
+#if CONFIG_FPMT_TEST
+  /* The variable temp_active_best_quality is introduced only for quality
+   * simulation purpose, it retains the value previous to the parallel
+   * encode frames. The variable is updated based on the update flag.
+   *
+   * If there exist show_existing_frames between parallel frames, then to
+   * retain the temp state do not update it. */
+  if (cpi->do_frame_data_update && !show_existing_between_parallel_frames &&
+      simulate_parallel_frame) {
+    int i;
+    const int pyramid_level =
+        cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index];
+    if (!rc->is_src_frame_alt_ref) {
+      for (i = pyramid_level; i <= MAX_ARF_LAYERS; ++i)
+        cpi->ppi->p_rc.temp_active_best_quality[i] =
+            p_rc->active_best_quality[i];
+    }
+  }
+
+  // Update the frame probabilities obtained from parallel encode frames
+  FrameProbInfo *const temp_frame_probs_simulation =
+      simulate_parallel_frame ?
&cpi->ppi->temp_frame_probs_simulation + : frame_probs; + FrameProbInfo *const temp_frame_probs = + simulate_parallel_frame ? &cpi->ppi->temp_frame_probs : NULL; +#endif + int i, j, loop; + // Sequentially do average on temp_frame_probs_simulation which holds + // probabilities of last frame before parallel encode + for (loop = 0; loop <= cpi->num_frame_recode; loop++) { + // Sequentially update tx_type_probs + if (cpi->do_update_frame_probs_txtype[loop] && + (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)) { + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + for (i = 0; i < TX_SIZES_ALL; i++) { + int left = 1024; + + for (j = TX_TYPES - 1; j >= 0; j--) { + const int new_prob = + cpi->frame_new_probs[loop].tx_type_probs[update_type][i][j]; +#if CONFIG_FPMT_TEST + int prob = + (temp_frame_probs_simulation->tx_type_probs[update_type][i][j] + + new_prob) >> + 1; + left -= prob; + if (j == 0) prob += left; + temp_frame_probs_simulation->tx_type_probs[update_type][i][j] = prob; +#else + int prob = + (frame_probs->tx_type_probs[update_type][i][j] + new_prob) >> 1; + left -= prob; + if (j == 0) prob += left; + frame_probs->tx_type_probs[update_type][i][j] = prob; +#endif + } + } + } + + // Sequentially update obmc_probs + if (cpi->do_update_frame_probs_obmc[loop] && + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) { + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + + for (i = 0; i < BLOCK_SIZES_ALL; i++) { + const int new_prob = + cpi->frame_new_probs[loop].obmc_probs[update_type][i]; +#if CONFIG_FPMT_TEST + temp_frame_probs_simulation->obmc_probs[update_type][i] = + (temp_frame_probs_simulation->obmc_probs[update_type][i] + + new_prob) >> + 1; +#else + frame_probs->obmc_probs[update_type][i] = + (frame_probs->obmc_probs[update_type][i] + new_prob) >> 1; +#endif + } + } + + // Sequentially update warped_probs + if (cpi->do_update_frame_probs_warp[loop] && + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) { + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + const int new_prob = cpi->frame_new_probs[loop].warped_probs[update_type]; +#if CONFIG_FPMT_TEST + temp_frame_probs_simulation->warped_probs[update_type] = + (temp_frame_probs_simulation->warped_probs[update_type] + new_prob) >> + 1; +#else + frame_probs->warped_probs[update_type] = + (frame_probs->warped_probs[update_type] + new_prob) >> 1; +#endif + } + + // Sequentially update switchable_interp_probs + if (cpi->do_update_frame_probs_interpfilter[loop] && + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) { + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { + int left = 1536; + + for (j = SWITCHABLE_FILTERS - 1; j >= 0; j--) { + const int new_prob = cpi->frame_new_probs[loop] + .switchable_interp_probs[update_type][i][j]; +#if CONFIG_FPMT_TEST + int prob = (temp_frame_probs_simulation + ->switchable_interp_probs[update_type][i][j] + + new_prob) >> + 1; + left -= prob; + if (j == 0) prob += left; + + temp_frame_probs_simulation + ->switchable_interp_probs[update_type][i][j] = prob; +#else + int prob = (frame_probs->switchable_interp_probs[update_type][i][j] + + new_prob) >> + 1; + left -= prob; + if (j == 0) prob += left; + frame_probs->switchable_interp_probs[update_type][i][j] = prob; 
+#endif + } + } + } + } + +#if CONFIG_FPMT_TEST + // Copying temp_frame_probs_simulation to temp_frame_probs based on + // the flag + if (cpi->do_frame_data_update && + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 && + simulate_parallel_frame) { + for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES; + update_type_idx++) { + for (i = 0; i < BLOCK_SIZES_ALL; i++) { + temp_frame_probs->obmc_probs[update_type_idx][i] = + temp_frame_probs_simulation->obmc_probs[update_type_idx][i]; + } + temp_frame_probs->warped_probs[update_type_idx] = + temp_frame_probs_simulation->warped_probs[update_type_idx]; + for (i = 0; i < TX_SIZES_ALL; i++) { + for (j = 0; j < TX_TYPES; j++) { + temp_frame_probs->tx_type_probs[update_type_idx][i][j] = + temp_frame_probs_simulation->tx_type_probs[update_type_idx][i][j]; + } + } + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { + for (j = 0; j < SWITCHABLE_FILTERS; j++) { + temp_frame_probs->switchable_interp_probs[update_type_idx][i][j] = + temp_frame_probs_simulation + ->switchable_interp_probs[update_type_idx][i][j]; + } + } + } + } +#endif + // Update framerate obtained from parallel encode frames + if (cpi->common.show_frame && + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) + cpi->framerate = cpi->new_framerate; +#if CONFIG_FPMT_TEST + // SIMULATION PURPOSE + int show_existing_between_parallel_frames_cndn = + (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == + INTNL_OVERLAY_UPDATE && + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index + 1] == 2); + if (cpi->common.show_frame && !show_existing_between_parallel_frames_cndn && + cpi->do_frame_data_update && simulate_parallel_frame) + cpi->temp_framerate = cpi->framerate; +#endif +} diff --git a/third_party/aom/av1/encoder/pass2_strategy.h b/third_party/aom/av1/encoder/pass2_strategy.h new file mode 100644 index 0000000000..5987a78a23 --- /dev/null +++ b/third_party/aom/av1/encoder/pass2_strategy.h @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_PASS2_STRATEGY_H_ +#define AOM_AV1_ENCODER_PASS2_STRATEGY_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +struct AV1_COMP; +struct EncodeFrameParams; + +#include "av1/encoder/encoder.h" + +/*! + * \brief accumulated stats and features in a gf group + */ +typedef struct { + /*!\cond */ + double gf_group_err; + double gf_group_raw_error; + double gf_group_skip_pct; + double gf_group_inactive_zone_rows; + + double mv_ratio_accumulator; + double decay_accumulator; + double zero_motion_accumulator; + double loop_decay_rate; + double last_loop_decay_rate; + double this_frame_mv_in_out; + double mv_in_out_accumulator; + double abs_mv_in_out_accumulator; + + double avg_sr_coded_error; + double avg_pcnt_second_ref; + double avg_new_mv_count; + double avg_wavelet_energy; + double avg_raw_err_stdev; + int non_zero_stdev_count; + /*!\endcond */ +} GF_GROUP_STATS; + +/*! 
+ * \brief accumulated stats and features for a frame
+ */
+typedef struct {
+  /*!\cond */
+  double frame_err;
+  double frame_coded_error;
+  double frame_sr_coded_error;
+  /*!\endcond */
+} GF_FRAME_STATS;
+/*!\cond */
+
+void av1_init_second_pass(struct AV1_COMP *cpi);
+
+void av1_init_single_pass_lap(AV1_COMP *cpi);
+
+/*!\endcond */
+/*!\brief Main per frame entry point for second pass of two pass encode
+ *
+ *\ingroup rate_control
+ *
+ * This function is called for each frame in the second pass of a two pass
+ * encode. It checks the frame type and if a new KF or GF/ARF is due.
+ * When a KF is due it calls find_next_key_frame() to work out how long
+ * this key frame group will be and assign bits to the key frame.
+ * At the start of a new GF/ARF group it calls calculate_gf_length()
+ * and define_gf_group() which are the main functions responsible for
+ * defining the size and structure of the new GF/ARF group.
+ *
+ * \param[in]    cpi            Top-level encoder instance structure
+ * \param[in]    frame_params   Per frame encoding parameters
+ * \param[in]    frame_flags    Frame type and coding flags
+ *
+ * \remark No return but analyses first pass stats and assigns a target
+ *         number of bits to the current frame and a target Q range.
+ */
+void av1_get_second_pass_params(struct AV1_COMP *cpi,
+                                struct EncodeFrameParams *const frame_params,
+                                unsigned int frame_flags);
+
+/*!\brief Adjustments to two pass and rate control after each frame.
+ *
+ *\ingroup rate_control
+ *
+ * This function is called after each frame to make adjustments to
+ * heuristics and data structures that relate to rate control.
+ *
+ * \param[in]    cpi    Top-level encoder instance structure
+ *
+ * \remark No return value but this function updates various rate control
+ *         related data structures that for example track overshoot and
+ *         undershoot.
+ */
+void av1_twopass_postencode_update(struct AV1_COMP *cpi);
+
+/*!\brief Distributes bits to frames in a group
+ *
+ *\ingroup rate_control
+ *
+ * This function decides on the allocation of bits between the different
+ * frames and types of frame in a GF/ARF group.
+ *
+ * \param[in]    cpi            Top-level encoder instance structure
+ * \param[in]    rc             Rate control data
+ * \param[in]    gf_group       GF/ARF group data structure
+ * \param[in]    is_key_frame   Indicates if the first frame in the group is
+ *                              also a key frame.
+ * \param[in]    use_arf        Are ARF frames enabled or is this a GF only
+ *                              uni-directional group.
+ * \param[in]    gf_group_bits  Bits available to be allocated.
+ *
+ * \remark No return but updates the rate control and group data structures
+ *         to reflect the allocation of bits.
+ */
+void av1_gop_bit_allocation(const AV1_COMP *cpi, RATE_CONTROL *const rc,
+                            GF_GROUP *gf_group, int is_key_frame, int use_arf,
+                            int64_t gf_group_bits);
+
+int av1_calc_arf_boost(const TWO_PASS *twopass,
+                       const TWO_PASS_FRAME *twopass_frame,
+                       const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info,
+                       int offset, int f_frames, int b_frames,
+                       int *num_fpstats_used, int *num_fpstats_required,
+                       int project_gfu_boost);
+
+void av1_mark_flashes(FIRSTPASS_STATS *first_stats,
+                      FIRSTPASS_STATS *last_stats);
+void av1_estimate_noise(FIRSTPASS_STATS *first_stats,
+                        FIRSTPASS_STATS *last_stats,
+                        struct aom_internal_error_info *error_info);
+void av1_estimate_coeff(FIRSTPASS_STATS *first_stats,
+                        FIRSTPASS_STATS *last_stats);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_PASS2_STRATEGY_H_
diff --git a/third_party/aom/av1/encoder/pickcdef.c b/third_party/aom/av1/encoder/pickcdef.c
new file mode 100644
index 0000000000..232a2f9edb
--- /dev/null
+++ b/third_party/aom/av1/encoder/pickcdef.c
@@ -0,0 +1,958 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdbool.h>
+#include <string.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/reconinter.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/pickcdef.h"
+#include "av1/encoder/mcomp.h"
+
+// Get primary and secondary filter strength for the given strength index and
+// search method
+static INLINE void get_cdef_filter_strengths(CDEF_PICK_METHOD pick_method,
+                                             int *pri_strength,
+                                             int *sec_strength,
+                                             int strength_idx) {
+  const int tot_sec_filter =
+      (pick_method == CDEF_FAST_SEARCH_LVL5)
+          ? REDUCED_SEC_STRENGTHS_LVL5
+          : ((pick_method >= CDEF_FAST_SEARCH_LVL3) ?
REDUCED_SEC_STRENGTHS_LVL3 + : CDEF_SEC_STRENGTHS); + const int pri_idx = strength_idx / tot_sec_filter; + const int sec_idx = strength_idx % tot_sec_filter; + *pri_strength = pri_idx; + *sec_strength = sec_idx; + if (pick_method == CDEF_FULL_SEARCH) return; + + switch (pick_method) { + case CDEF_FAST_SEARCH_LVL1: + assert(pri_idx < REDUCED_PRI_STRENGTHS_LVL1); + *pri_strength = priconv_lvl1[pri_idx]; + break; + case CDEF_FAST_SEARCH_LVL2: + assert(pri_idx < REDUCED_PRI_STRENGTHS_LVL2); + *pri_strength = priconv_lvl2[pri_idx]; + break; + case CDEF_FAST_SEARCH_LVL3: + assert(pri_idx < REDUCED_PRI_STRENGTHS_LVL2); + assert(sec_idx < REDUCED_SEC_STRENGTHS_LVL3); + *pri_strength = priconv_lvl2[pri_idx]; + *sec_strength = secconv_lvl3[sec_idx]; + break; + case CDEF_FAST_SEARCH_LVL4: + assert(pri_idx < REDUCED_PRI_STRENGTHS_LVL4); + assert(sec_idx < REDUCED_SEC_STRENGTHS_LVL3); + *pri_strength = priconv_lvl4[pri_idx]; + *sec_strength = secconv_lvl3[sec_idx]; + break; + case CDEF_FAST_SEARCH_LVL5: + assert(pri_idx < REDUCED_PRI_STRENGTHS_LVL4); + assert(sec_idx < REDUCED_SEC_STRENGTHS_LVL5); + *pri_strength = priconv_lvl5[pri_idx]; + *sec_strength = secconv_lvl5[sec_idx]; + break; + default: assert(0 && "Invalid CDEF search method"); + } +} + +// Store CDEF filter strength calculated from strength index for given search +// method +#define STORE_CDEF_FILTER_STRENGTH(cdef_strength, pick_method, strength_idx) \ + do { \ + get_cdef_filter_strengths((pick_method), &pri_strength, &sec_strength, \ + (strength_idx)); \ + cdef_strength = pri_strength * CDEF_SEC_STRENGTHS + sec_strength; \ + } while (0) + +/* Search for the best strength to add as an option, knowing we + already selected nb_strengths options. */ +static uint64_t search_one(int *lev, int nb_strengths, + uint64_t mse[][TOTAL_STRENGTHS], int sb_count, + CDEF_PICK_METHOD pick_method) { + uint64_t tot_mse[TOTAL_STRENGTHS]; + const int total_strengths = nb_cdef_strengths[pick_method]; + int i, j; + uint64_t best_tot_mse = (uint64_t)1 << 63; + int best_id = 0; + memset(tot_mse, 0, sizeof(tot_mse)); + for (i = 0; i < sb_count; i++) { + int gi; + uint64_t best_mse = (uint64_t)1 << 63; + /* Find best mse among already selected options. */ + for (gi = 0; gi < nb_strengths; gi++) { + if (mse[i][lev[gi]] < best_mse) { + best_mse = mse[i][lev[gi]]; + } + } + /* Find best mse when adding each possible new option. */ + for (j = 0; j < total_strengths; j++) { + uint64_t best = best_mse; + if (mse[i][j] < best) best = mse[i][j]; + tot_mse[j] += best; + } + } + for (j = 0; j < total_strengths; j++) { + if (tot_mse[j] < best_tot_mse) { + best_tot_mse = tot_mse[j]; + best_id = j; + } + } + lev[nb_strengths] = best_id; + return best_tot_mse; +} + +/* Search for the best luma+chroma strength to add as an option, knowing we + already selected nb_strengths options. */ +static uint64_t search_one_dual(int *lev0, int *lev1, int nb_strengths, + uint64_t (**mse)[TOTAL_STRENGTHS], int sb_count, + CDEF_PICK_METHOD pick_method) { + uint64_t tot_mse[TOTAL_STRENGTHS][TOTAL_STRENGTHS]; + int i, j; + uint64_t best_tot_mse = (uint64_t)1 << 63; + int best_id0 = 0; + int best_id1 = 0; + const int total_strengths = nb_cdef_strengths[pick_method]; + memset(tot_mse, 0, sizeof(tot_mse)); + for (i = 0; i < sb_count; i++) { + int gi; + uint64_t best_mse = (uint64_t)1 << 63; + /* Find best mse among already selected options. 
*/ + for (gi = 0; gi < nb_strengths; gi++) { + uint64_t curr = mse[0][i][lev0[gi]]; + curr += mse[1][i][lev1[gi]]; + if (curr < best_mse) { + best_mse = curr; + } + } + /* Find best mse when adding each possible new option. */ + for (j = 0; j < total_strengths; j++) { + int k; + for (k = 0; k < total_strengths; k++) { + uint64_t best = best_mse; + uint64_t curr = mse[0][i][j]; + curr += mse[1][i][k]; + if (curr < best) best = curr; + tot_mse[j][k] += best; + } + } + } + for (j = 0; j < total_strengths; j++) { + int k; + for (k = 0; k < total_strengths; k++) { + if (tot_mse[j][k] < best_tot_mse) { + best_tot_mse = tot_mse[j][k]; + best_id0 = j; + best_id1 = k; + } + } + } + lev0[nb_strengths] = best_id0; + lev1[nb_strengths] = best_id1; + return best_tot_mse; +} + +/* Search for the set of strengths that minimizes mse. */ +static uint64_t joint_strength_search(int *best_lev, int nb_strengths, + uint64_t mse[][TOTAL_STRENGTHS], + int sb_count, + CDEF_PICK_METHOD pick_method) { + uint64_t best_tot_mse; + int fast = (pick_method >= CDEF_FAST_SEARCH_LVL1 && + pick_method <= CDEF_FAST_SEARCH_LVL5); + int i; + best_tot_mse = (uint64_t)1 << 63; + /* Greedy search: add one strength options at a time. */ + for (i = 0; i < nb_strengths; i++) { + best_tot_mse = search_one(best_lev, i, mse, sb_count, pick_method); + } + /* Trying to refine the greedy search by reconsidering each + already-selected option. */ + if (!fast) { + for (i = 0; i < 4 * nb_strengths; i++) { + int j; + for (j = 0; j < nb_strengths - 1; j++) best_lev[j] = best_lev[j + 1]; + best_tot_mse = + search_one(best_lev, nb_strengths - 1, mse, sb_count, pick_method); + } + } + return best_tot_mse; +} + +/* Search for the set of luma+chroma strengths that minimizes mse. */ +static uint64_t joint_strength_search_dual(int *best_lev0, int *best_lev1, + int nb_strengths, + uint64_t (**mse)[TOTAL_STRENGTHS], + int sb_count, + CDEF_PICK_METHOD pick_method) { + uint64_t best_tot_mse; + int i; + best_tot_mse = (uint64_t)1 << 63; + /* Greedy search: add one strength options at a time. */ + for (i = 0; i < nb_strengths; i++) { + best_tot_mse = + search_one_dual(best_lev0, best_lev1, i, mse, sb_count, pick_method); + } + /* Trying to refine the greedy search by reconsidering each + already-selected option. */ + for (i = 0; i < 4 * nb_strengths; i++) { + int j; + for (j = 0; j < nb_strengths - 1; j++) { + best_lev0[j] = best_lev0[j + 1]; + best_lev1[j] = best_lev1[j + 1]; + } + best_tot_mse = search_one_dual(best_lev0, best_lev1, nb_strengths - 1, mse, + sb_count, pick_method); + } + return best_tot_mse; +} + +static INLINE void init_src_params(int *src_stride, int *width, int *height, + int *width_log2, int *height_log2, + BLOCK_SIZE bsize) { + *src_stride = block_size_wide[bsize]; + *width = block_size_wide[bsize]; + *height = block_size_high[bsize]; + *width_log2 = MI_SIZE_LOG2 + mi_size_wide_log2[bsize]; + *height_log2 = MI_SIZE_LOG2 + mi_size_wide_log2[bsize]; +} +#if CONFIG_AV1_HIGHBITDEPTH +/* Compute MSE only on the blocks we filtered. 
*/ +static uint64_t compute_cdef_dist_highbd(void *dst, int dstride, uint16_t *src, + cdef_list *dlist, int cdef_count, + BLOCK_SIZE bsize, int coeff_shift, + int row, int col) { + assert(bsize == BLOCK_4X4 || bsize == BLOCK_4X8 || bsize == BLOCK_8X4 || + bsize == BLOCK_8X8); + uint64_t sum = 0; + int bi, bx, by; + uint16_t *dst16 = CONVERT_TO_SHORTPTR((uint8_t *)dst); + uint16_t *dst_buff = &dst16[row * dstride + col]; + int src_stride, width, height, width_log2, height_log2; + init_src_params(&src_stride, &width, &height, &width_log2, &height_log2, + bsize); + for (bi = 0; bi < cdef_count; bi++) { + by = dlist[bi].by; + bx = dlist[bi].bx; + sum += aom_mse_wxh_16bit_highbd( + &dst_buff[(by << height_log2) * dstride + (bx << width_log2)], dstride, + &src[bi << (height_log2 + width_log2)], src_stride, width, height); + } + return sum >> 2 * coeff_shift; +} +#endif + +// Checks dual and quad block processing is applicable for block widths 8 and 4 +// respectively. +static INLINE int is_dual_or_quad_applicable(cdef_list *dlist, int width, + int cdef_count, int bi, int iter) { + assert(width == 8 || width == 4); + const int blk_offset = (width == 8) ? 1 : 3; + if ((iter + blk_offset) >= cdef_count) return 0; + + if (dlist[bi].by == dlist[bi + blk_offset].by && + dlist[bi].bx + blk_offset == dlist[bi + blk_offset].bx) + return 1; + + return 0; +} + +static uint64_t compute_cdef_dist(void *dst, int dstride, uint16_t *src, + cdef_list *dlist, int cdef_count, + BLOCK_SIZE bsize, int coeff_shift, int row, + int col) { + assert(bsize == BLOCK_4X4 || bsize == BLOCK_4X8 || bsize == BLOCK_8X4 || + bsize == BLOCK_8X8); + uint64_t sum = 0; + int bi, bx, by; + int iter = 0; + int inc = 1; + uint8_t *dst8 = (uint8_t *)dst; + uint8_t *dst_buff = &dst8[row * dstride + col]; + int src_stride, width, height, width_log2, height_log2; + init_src_params(&src_stride, &width, &height, &width_log2, &height_log2, + bsize); + + const int num_blks = 16 / width; + for (bi = 0; bi < cdef_count; bi += inc) { + by = dlist[bi].by; + bx = dlist[bi].bx; + uint16_t *src_tmp = &src[bi << (height_log2 + width_log2)]; + uint8_t *dst_tmp = + &dst_buff[(by << height_log2) * dstride + (bx << width_log2)]; + + if (is_dual_or_quad_applicable(dlist, width, cdef_count, bi, iter)) { + sum += aom_mse_16xh_16bit(dst_tmp, dstride, src_tmp, width, height); + iter += num_blks; + inc = num_blks; + } else { + sum += aom_mse_wxh_16bit(dst_tmp, dstride, src_tmp, src_stride, width, + height); + iter += 1; + inc = 1; + } + } + + return sum >> 2 * coeff_shift; +} + +// Fill the boundary regions of the block with CDEF_VERY_LARGE, only if the +// region is outside frame boundary +static INLINE void fill_borders_for_fbs_on_frame_boundary( + uint16_t *inbuf, int hfilt_size, int vfilt_size, + bool is_fb_on_frm_left_boundary, bool is_fb_on_frm_right_boundary, + bool is_fb_on_frm_top_boundary, bool is_fb_on_frm_bottom_boundary) { + if (!is_fb_on_frm_left_boundary && !is_fb_on_frm_right_boundary && + !is_fb_on_frm_top_boundary && !is_fb_on_frm_bottom_boundary) + return; + if (is_fb_on_frm_bottom_boundary) { + // Fill bottom region of the block + const int buf_offset = + (vfilt_size + CDEF_VBORDER) * CDEF_BSTRIDE + CDEF_HBORDER; + fill_rect(&inbuf[buf_offset], CDEF_BSTRIDE, CDEF_VBORDER, hfilt_size, + CDEF_VERY_LARGE); + } + if (is_fb_on_frm_bottom_boundary || is_fb_on_frm_left_boundary) { + const int buf_offset = (vfilt_size + CDEF_VBORDER) * CDEF_BSTRIDE; + // Fill bottom-left region of the block + fill_rect(&inbuf[buf_offset], CDEF_BSTRIDE, CDEF_VBORDER, 
CDEF_HBORDER, + CDEF_VERY_LARGE); + } + if (is_fb_on_frm_bottom_boundary || is_fb_on_frm_right_boundary) { + const int buf_offset = + (vfilt_size + CDEF_VBORDER) * CDEF_BSTRIDE + hfilt_size + CDEF_HBORDER; + // Fill bottom-right region of the block + fill_rect(&inbuf[buf_offset], CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, + CDEF_VERY_LARGE); + } + if (is_fb_on_frm_top_boundary) { + // Fill top region of the block + fill_rect(&inbuf[CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, hfilt_size, + CDEF_VERY_LARGE); + } + if (is_fb_on_frm_top_boundary || is_fb_on_frm_left_boundary) { + // Fill top-left region of the block + fill_rect(inbuf, CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE); + } + if (is_fb_on_frm_top_boundary || is_fb_on_frm_right_boundary) { + const int buf_offset = hfilt_size + CDEF_HBORDER; + // Fill top-right region of the block + fill_rect(&inbuf[buf_offset], CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, + CDEF_VERY_LARGE); + } + if (is_fb_on_frm_left_boundary) { + const int buf_offset = CDEF_VBORDER * CDEF_BSTRIDE; + // Fill left region of the block + fill_rect(&inbuf[buf_offset], CDEF_BSTRIDE, vfilt_size, CDEF_HBORDER, + CDEF_VERY_LARGE); + } + if (is_fb_on_frm_right_boundary) { + const int buf_offset = CDEF_VBORDER * CDEF_BSTRIDE; + // Fill right region of the block + fill_rect(&inbuf[buf_offset + hfilt_size + CDEF_HBORDER], CDEF_BSTRIDE, + vfilt_size, CDEF_HBORDER, CDEF_VERY_LARGE); + } +} + +// Calculate the number of 8x8/4x4 filter units for which SSE can be calculated +// after CDEF filtering in single function call +static AOM_FORCE_INLINE int get_error_calc_width_in_filt_units( + cdef_list *dlist, int cdef_count, int bi, int subsampling_x, + int subsampling_y) { + // TODO(Ranjit): Extend the optimization for 422 + if (subsampling_x != subsampling_y) return 1; + + // Combining more blocks seems to increase encode time due to increase in + // control code + if (bi + 3 < cdef_count && dlist[bi].by == dlist[bi + 3].by && + dlist[bi].bx + 3 == dlist[bi + 3].bx) { + /* Calculate error for four 8x8/4x4 blocks using 32x8/16x4 block specific + * logic if y co-ordinates match and x co-ordinates are + * separated by 3 for first and fourth 8x8/4x4 blocks in dlist[]. */ + return 4; + } + if (bi + 1 < cdef_count && dlist[bi].by == dlist[bi + 1].by && + dlist[bi].bx + 1 == dlist[bi + 1].bx) { + /* Calculate error for two 8x8/4x4 blocks using 16x8/8x4 block specific + * logic if their y co-ordinates match and x co-ordinates are + * separated by 1 for first and second 8x8/4x4 blocks in dlist[]. 
*/ + return 2; + } + return 1; +} + +// Returns the block error after CDEF filtering for a given strength +static INLINE uint64_t get_filt_error( + const CdefSearchCtx *cdef_search_ctx, const struct macroblockd_plane *pd, + cdef_list *dlist, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int *dirinit, + int var[CDEF_NBLOCKS][CDEF_NBLOCKS], uint16_t *in, uint8_t *ref_buffer, + int ref_stride, int row, int col, int pri_strength, int sec_strength, + int cdef_count, int pli, int coeff_shift, BLOCK_SIZE bs) { + uint64_t curr_sse = 0; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bs, pd->subsampling_x, pd->subsampling_y); + const int bw_log2 = 3 - pd->subsampling_x; + const int bh_log2 = 3 - pd->subsampling_y; + + // TODO(Ranjit): Extend this optimization for HBD + if (!cdef_search_ctx->use_highbitdepth) { + // If all 8x8/4x4 blocks in CDEF block need to be filtered, calculate the + // error at CDEF block level + const int tot_blk_count = + (block_size_wide[plane_bsize] * block_size_high[plane_bsize]) >> + (bw_log2 + bh_log2); + if (cdef_count == tot_blk_count) { + // Calculate the offset in the buffer based on block position + const FULLPEL_MV this_mv = { row, col }; + const int buf_offset = get_offset_from_fullmv(&this_mv, ref_stride); + if (pri_strength == 0 && sec_strength == 0) { + // When CDEF strength is zero, filtering is not applied. Hence + // error is calculated between source and unfiltered pixels + curr_sse = + aom_sse(&ref_buffer[buf_offset], ref_stride, + get_buf_from_fullmv(&pd->dst, &this_mv), pd->dst.stride, + block_size_wide[plane_bsize], block_size_high[plane_bsize]); + } else { + DECLARE_ALIGNED(32, uint8_t, tmp_dst8[1 << (MAX_SB_SIZE_LOG2 * 2)]); + + av1_cdef_filter_fb(tmp_dst8, NULL, (1 << MAX_SB_SIZE_LOG2), in, + cdef_search_ctx->xdec[pli], + cdef_search_ctx->ydec[pli], dir, dirinit, var, pli, + dlist, cdef_count, pri_strength, + sec_strength + (sec_strength == 3), + cdef_search_ctx->damping, coeff_shift); + curr_sse = + aom_sse(&ref_buffer[buf_offset], ref_stride, tmp_dst8, + (1 << MAX_SB_SIZE_LOG2), block_size_wide[plane_bsize], + block_size_high[plane_bsize]); + } + } else { + // If few 8x8/4x4 blocks in CDEF block need to be filtered, filtering + // functions produce 8-bit output and the error is calculated in 8-bit + // domain + if (pri_strength == 0 && sec_strength == 0) { + int num_error_calc_filt_units = 1; + for (int bi = 0; bi < cdef_count; bi = bi + num_error_calc_filt_units) { + const uint8_t by = dlist[bi].by; + const uint8_t bx = dlist[bi].bx; + const int16_t by_pos = (by << bh_log2); + const int16_t bx_pos = (bx << bw_log2); + // Calculate the offset in the buffer based on block position + const FULLPEL_MV this_mv = { row + by_pos, col + bx_pos }; + const int buf_offset = get_offset_from_fullmv(&this_mv, ref_stride); + num_error_calc_filt_units = get_error_calc_width_in_filt_units( + dlist, cdef_count, bi, pd->subsampling_x, pd->subsampling_y); + curr_sse += aom_sse( + &ref_buffer[buf_offset], ref_stride, + get_buf_from_fullmv(&pd->dst, &this_mv), pd->dst.stride, + num_error_calc_filt_units * (1 << bw_log2), (1 << bh_log2)); + } + } else { + DECLARE_ALIGNED(32, uint8_t, tmp_dst8[1 << (MAX_SB_SIZE_LOG2 * 2)]); + av1_cdef_filter_fb(tmp_dst8, NULL, (1 << MAX_SB_SIZE_LOG2), in, + cdef_search_ctx->xdec[pli], + cdef_search_ctx->ydec[pli], dir, dirinit, var, pli, + dlist, cdef_count, pri_strength, + sec_strength + (sec_strength == 3), + cdef_search_ctx->damping, coeff_shift); + int num_error_calc_filt_units = 1; + for (int bi = 0; bi < cdef_count; bi = bi + 
num_error_calc_filt_units) { + const uint8_t by = dlist[bi].by; + const uint8_t bx = dlist[bi].bx; + const int16_t by_pos = (by << bh_log2); + const int16_t bx_pos = (bx << bw_log2); + // Calculate the offset in the buffer based on block position + const FULLPEL_MV this_mv = { row + by_pos, col + bx_pos }; + const FULLPEL_MV tmp_buf_pos = { by_pos, bx_pos }; + const int buf_offset = get_offset_from_fullmv(&this_mv, ref_stride); + const int tmp_buf_offset = + get_offset_from_fullmv(&tmp_buf_pos, (1 << MAX_SB_SIZE_LOG2)); + num_error_calc_filt_units = get_error_calc_width_in_filt_units( + dlist, cdef_count, bi, pd->subsampling_x, pd->subsampling_y); + curr_sse += aom_sse( + &ref_buffer[buf_offset], ref_stride, &tmp_dst8[tmp_buf_offset], + (1 << MAX_SB_SIZE_LOG2), + num_error_calc_filt_units * (1 << bw_log2), (1 << bh_log2)); + } + } + } + } else { + DECLARE_ALIGNED(32, uint16_t, tmp_dst[1 << (MAX_SB_SIZE_LOG2 * 2)]); + + av1_cdef_filter_fb(NULL, tmp_dst, CDEF_BSTRIDE, in, + cdef_search_ctx->xdec[pli], cdef_search_ctx->ydec[pli], + dir, dirinit, var, pli, dlist, cdef_count, pri_strength, + sec_strength + (sec_strength == 3), + cdef_search_ctx->damping, coeff_shift); + curr_sse = cdef_search_ctx->compute_cdef_dist_fn( + ref_buffer, ref_stride, tmp_dst, dlist, cdef_count, + cdef_search_ctx->bsize[pli], coeff_shift, row, col); + } + return curr_sse; +} + +// Calculates MSE at block level. +// Inputs: +// cdef_search_ctx: Pointer to the structure containing parameters related to +// CDEF search context. +// fbr: Row index in units of 64x64 block +// fbc: Column index in units of 64x64 block +// Returns: +// Nothing will be returned. Contents of cdef_search_ctx will be modified. +void av1_cdef_mse_calc_block(CdefSearchCtx *cdef_search_ctx, + struct aom_internal_error_info *error_info, + int fbr, int fbc, int sb_count) { + // TODO(aomedia:3276): Pass error_info to the low-level functions as required + // in future to handle error propagation. + (void)error_info; + const CommonModeInfoParams *const mi_params = cdef_search_ctx->mi_params; + const YV12_BUFFER_CONFIG *ref = cdef_search_ctx->ref; + const int coeff_shift = cdef_search_ctx->coeff_shift; + const int *mi_wide_l2 = cdef_search_ctx->mi_wide_l2; + const int *mi_high_l2 = cdef_search_ctx->mi_high_l2; + + // Declare and initialize the temporary buffers. 
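+  // inbuf holds the bordered 16-bit pixels that feed the CDEF filter; dlist
+  // records the positions of the non-skip 8x8 units within this superblock.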
+ DECLARE_ALIGNED(32, uint16_t, inbuf[CDEF_INBUF_SIZE]); + cdef_list dlist[MI_SIZE_128X128 * MI_SIZE_128X128]; + int dir[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } }; + int var[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } }; + uint16_t *const in = inbuf + CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER; + int nhb = AOMMIN(MI_SIZE_64X64, mi_params->mi_cols - MI_SIZE_64X64 * fbc); + int nvb = AOMMIN(MI_SIZE_64X64, mi_params->mi_rows - MI_SIZE_64X64 * fbr); + int hb_step = 1, vb_step = 1; + BLOCK_SIZE bs; + + const MB_MODE_INFO *const mbmi = + mi_params->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride + + MI_SIZE_64X64 * fbc]; + + uint8_t *ref_buffer[MAX_MB_PLANE] = { ref->y_buffer, ref->u_buffer, + ref->v_buffer }; + int ref_stride[MAX_MB_PLANE] = { ref->y_stride, ref->uv_stride, + ref->uv_stride }; + + if (mbmi->bsize == BLOCK_128X128 || mbmi->bsize == BLOCK_128X64 || + mbmi->bsize == BLOCK_64X128) { + bs = mbmi->bsize; + if (bs == BLOCK_128X128 || bs == BLOCK_128X64) { + nhb = AOMMIN(MI_SIZE_128X128, mi_params->mi_cols - MI_SIZE_64X64 * fbc); + hb_step = 2; + } + if (bs == BLOCK_128X128 || bs == BLOCK_64X128) { + nvb = AOMMIN(MI_SIZE_128X128, mi_params->mi_rows - MI_SIZE_64X64 * fbr); + vb_step = 2; + } + } else { + bs = BLOCK_64X64; + } + // Get number of 8x8 blocks which are not skip. Cdef processing happens for + // 8x8 blocks which are not skip. + const int cdef_count = av1_cdef_compute_sb_list( + mi_params, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64, dlist, bs); + const bool is_fb_on_frm_left_boundary = (fbc == 0); + const bool is_fb_on_frm_right_boundary = + (fbc + hb_step == cdef_search_ctx->nhfb); + const bool is_fb_on_frm_top_boundary = (fbr == 0); + const bool is_fb_on_frm_bottom_boundary = + (fbr + vb_step == cdef_search_ctx->nvfb); + const int yoff = CDEF_VBORDER * (!is_fb_on_frm_top_boundary); + const int xoff = CDEF_HBORDER * (!is_fb_on_frm_left_boundary); + int dirinit = 0; + for (int pli = 0; pli < cdef_search_ctx->num_planes; pli++) { + /* We avoid filtering the pixels for which some of the pixels to + average are outside the frame. We could change the filter instead, + but it would add special cases for any future vectorization. 
*/ + const int hfilt_size = (nhb << mi_wide_l2[pli]); + const int vfilt_size = (nvb << mi_high_l2[pli]); + const int ysize = + vfilt_size + CDEF_VBORDER * (!is_fb_on_frm_bottom_boundary) + yoff; + const int xsize = + hfilt_size + CDEF_HBORDER * (!is_fb_on_frm_right_boundary) + xoff; + const int row = fbr * MI_SIZE_64X64 << mi_high_l2[pli]; + const int col = fbc * MI_SIZE_64X64 << mi_wide_l2[pli]; + struct macroblockd_plane pd = cdef_search_ctx->plane[pli]; + cdef_search_ctx->copy_fn(&in[(-yoff * CDEF_BSTRIDE - xoff)], CDEF_BSTRIDE, + pd.dst.buf, row - yoff, col - xoff, pd.dst.stride, + ysize, xsize); + fill_borders_for_fbs_on_frame_boundary( + inbuf, hfilt_size, vfilt_size, is_fb_on_frm_left_boundary, + is_fb_on_frm_right_boundary, is_fb_on_frm_top_boundary, + is_fb_on_frm_bottom_boundary); + for (int gi = 0; gi < cdef_search_ctx->total_strengths; gi++) { + int pri_strength, sec_strength; + get_cdef_filter_strengths(cdef_search_ctx->pick_method, &pri_strength, + &sec_strength, gi); + const uint64_t curr_mse = get_filt_error( + cdef_search_ctx, &pd, dlist, dir, &dirinit, var, in, ref_buffer[pli], + ref_stride[pli], row, col, pri_strength, sec_strength, cdef_count, + pli, coeff_shift, bs); + if (pli < 2) + cdef_search_ctx->mse[pli][sb_count][gi] = curr_mse; + else + cdef_search_ctx->mse[1][sb_count][gi] += curr_mse; + } + } + cdef_search_ctx->sb_index[sb_count] = + MI_SIZE_64X64 * fbr * mi_params->mi_stride + MI_SIZE_64X64 * fbc; +} + +// MSE calculation at frame level. +// Inputs: +// cdef_search_ctx: Pointer to the structure containing parameters related to +// CDEF search context. +// Returns: +// Nothing will be returned. Contents of cdef_search_ctx will be modified. +static void cdef_mse_calc_frame(CdefSearchCtx *cdef_search_ctx, + struct aom_internal_error_info *error_info) { + // Loop over each sb. + for (int fbr = 0; fbr < cdef_search_ctx->nvfb; ++fbr) { + for (int fbc = 0; fbc < cdef_search_ctx->nhfb; ++fbc) { + // Checks if cdef processing can be skipped for particular sb. + if (cdef_sb_skip(cdef_search_ctx->mi_params, fbr, fbc)) continue; + // Calculate mse for each sb and store the relevant sb index. + av1_cdef_mse_calc_block(cdef_search_ctx, error_info, fbr, fbc, + cdef_search_ctx->sb_count); + cdef_search_ctx->sb_count++; + } + } +} + +// Allocates memory for members of CdefSearchCtx. +// Inputs: +// cdef_search_ctx: Pointer to the structure containing parameters +// related to CDEF search context. +// Returns: +// Nothing will be returned. Contents of cdef_search_ctx will be modified. +static void cdef_alloc_data(AV1_COMMON *cm, CdefSearchCtx *cdef_search_ctx) { + const int nvfb = cdef_search_ctx->nvfb; + const int nhfb = cdef_search_ctx->nhfb; + CHECK_MEM_ERROR( + cm, cdef_search_ctx->sb_index, + aom_malloc(nvfb * nhfb * sizeof(cdef_search_ctx->sb_index[0]))); + cdef_search_ctx->sb_count = 0; + CHECK_MEM_ERROR(cm, cdef_search_ctx->mse[0], + aom_malloc(sizeof(**cdef_search_ctx->mse) * nvfb * nhfb)); + CHECK_MEM_ERROR(cm, cdef_search_ctx->mse[1], + aom_malloc(sizeof(**cdef_search_ctx->mse) * nvfb * nhfb)); +} + +// Deallocates the memory allocated for members of CdefSearchCtx. +// Inputs: +// cdef_search_ctx: Pointer to the structure containing parameters +// related to CDEF search context. +// Returns: +// Nothing will be returned. 
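+// Note: members are reset to NULL after being freed, so calling this function
+// repeatedly on the same context is safe.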
+void av1_cdef_dealloc_data(CdefSearchCtx *cdef_search_ctx) { + if (cdef_search_ctx) { + aom_free(cdef_search_ctx->mse[0]); + cdef_search_ctx->mse[0] = NULL; + aom_free(cdef_search_ctx->mse[1]); + cdef_search_ctx->mse[1] = NULL; + aom_free(cdef_search_ctx->sb_index); + cdef_search_ctx->sb_index = NULL; + } +} + +// Initialize the parameters related to CDEF search context. +// Inputs: +// frame: Pointer to compressed frame buffer +// ref: Pointer to the frame buffer holding the source frame +// cm: Pointer to top level common structure +// xd: Pointer to common current coding block structure +// cdef_search_ctx: Pointer to the structure containing parameters related to +// CDEF search context. +// pick_method: Search method used to select CDEF parameters +// Returns: +// Nothing will be returned. Contents of cdef_search_ctx will be modified. +static AOM_INLINE void cdef_params_init(const YV12_BUFFER_CONFIG *frame, + const YV12_BUFFER_CONFIG *ref, + AV1_COMMON *cm, MACROBLOCKD *xd, + CdefSearchCtx *cdef_search_ctx, + CDEF_PICK_METHOD pick_method) { + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int num_planes = av1_num_planes(cm); + cdef_search_ctx->mi_params = &cm->mi_params; + cdef_search_ctx->ref = ref; + cdef_search_ctx->nvfb = + (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + cdef_search_ctx->nhfb = + (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + cdef_search_ctx->coeff_shift = AOMMAX(cm->seq_params->bit_depth - 8, 0); + cdef_search_ctx->damping = 3 + (cm->quant_params.base_qindex >> 6); + cdef_search_ctx->total_strengths = nb_cdef_strengths[pick_method]; + cdef_search_ctx->num_planes = num_planes; + cdef_search_ctx->pick_method = pick_method; + cdef_search_ctx->sb_count = 0; + cdef_search_ctx->use_highbitdepth = cm->seq_params->use_highbitdepth; + av1_setup_dst_planes(xd->plane, cm->seq_params->sb_size, frame, 0, 0, 0, + num_planes); + // Initialize plane wise information. + for (int pli = 0; pli < num_planes; pli++) { + cdef_search_ctx->xdec[pli] = xd->plane[pli].subsampling_x; + cdef_search_ctx->ydec[pli] = xd->plane[pli].subsampling_y; + cdef_search_ctx->bsize[pli] = + cdef_search_ctx->ydec[pli] + ? (cdef_search_ctx->xdec[pli] ? BLOCK_4X4 : BLOCK_8X4) + : (cdef_search_ctx->xdec[pli] ? BLOCK_4X8 : BLOCK_8X8); + cdef_search_ctx->mi_wide_l2[pli] = + MI_SIZE_LOG2 - xd->plane[pli].subsampling_x; + cdef_search_ctx->mi_high_l2[pli] = + MI_SIZE_LOG2 - xd->plane[pli].subsampling_y; + cdef_search_ctx->plane[pli] = xd->plane[pli]; + } + // Function pointer initialization. +#if CONFIG_AV1_HIGHBITDEPTH + if (cm->seq_params->use_highbitdepth) { + cdef_search_ctx->copy_fn = av1_cdef_copy_sb8_16_highbd; + cdef_search_ctx->compute_cdef_dist_fn = compute_cdef_dist_highbd; + } else { + cdef_search_ctx->copy_fn = av1_cdef_copy_sb8_16_lowbd; + cdef_search_ctx->compute_cdef_dist_fn = compute_cdef_dist; + } +#else + cdef_search_ctx->copy_fn = av1_cdef_copy_sb8_16_lowbd; + cdef_search_ctx->compute_cdef_dist_fn = compute_cdef_dist; +#endif +} + +void av1_pick_cdef_from_qp(AV1_COMMON *const cm, int skip_cdef, + int is_screen_content) { + const int bd = cm->seq_params->bit_depth; + const int q = + av1_ac_quant_QTX(cm->quant_params.base_qindex, 0, bd) >> (bd - 8); + CdefInfo *const cdef_info = &cm->cdef_info; + // Check the speed feature to avoid extra signaling. 
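+  // Signaling sketch (nb_cdef_strengths is always 1 << cdef_bits): with
+  // skip_cdef, two strengths are coded and one bit per SB selects between
+  // them (the second strength is 0, i.e. no filtering); otherwise a single
+  // strength is coded and no per-SB bits are spent.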
+ if (skip_cdef) { + cdef_info->cdef_bits = 1; + cdef_info->nb_cdef_strengths = 2; + } else { + cdef_info->cdef_bits = 0; + cdef_info->nb_cdef_strengths = 1; + } + cdef_info->cdef_damping = 3 + (cm->quant_params.base_qindex >> 6); + + int predicted_y_f1 = 0; + int predicted_y_f2 = 0; + int predicted_uv_f1 = 0; + int predicted_uv_f2 = 0; + if (is_screen_content) { + predicted_y_f1 = + (int)(5.88217781e-06 * q * q + 6.10391455e-03 * q + 9.95043102e-02); + predicted_y_f2 = + (int)(-7.79934857e-06 * q * q + 6.58957830e-03 * q + 8.81045025e-01); + predicted_uv_f1 = + (int)(-6.79500136e-06 * q * q + 1.02695586e-02 * q + 1.36126802e-01); + predicted_uv_f2 = + (int)(-9.99613695e-08 * q * q - 1.79361339e-05 * q + 1.17022324e+0); + predicted_y_f1 = clamp(predicted_y_f1, 0, 15); + predicted_y_f2 = clamp(predicted_y_f2, 0, 3); + predicted_uv_f1 = clamp(predicted_uv_f1, 0, 15); + predicted_uv_f2 = clamp(predicted_uv_f2, 0, 3); + } else { + if (!frame_is_intra_only(cm)) { + predicted_y_f1 = clamp((int)roundf(q * q * -0.0000023593946f + + q * 0.0068615186f + 0.02709886f), + 0, 15); + predicted_y_f2 = clamp((int)roundf(q * q * -0.00000057629734f + + q * 0.0013993345f + 0.03831067f), + 0, 3); + predicted_uv_f1 = clamp((int)roundf(q * q * -0.0000007095069f + + q * 0.0034628846f + 0.00887099f), + 0, 15); + predicted_uv_f2 = clamp((int)roundf(q * q * 0.00000023874085f + + q * 0.00028223585f + 0.05576307f), + 0, 3); + } else { + predicted_y_f1 = clamp( + (int)roundf(q * q * 0.0000033731974f + q * 0.008070594f + 0.0187634f), + 0, 15); + predicted_y_f2 = clamp((int)roundf(q * q * 0.0000029167343f + + q * 0.0027798624f + 0.0079405f), + 0, 3); + predicted_uv_f1 = clamp((int)roundf(q * q * -0.0000130790995f + + q * 0.012892405f - 0.00748388f), + 0, 15); + predicted_uv_f2 = clamp((int)roundf(q * q * 0.0000032651783f + + q * 0.00035520183f + 0.00228092f), + 0, 3); + } + } + cdef_info->cdef_strengths[0] = + predicted_y_f1 * CDEF_SEC_STRENGTHS + predicted_y_f2; + cdef_info->cdef_uv_strengths[0] = + predicted_uv_f1 * CDEF_SEC_STRENGTHS + predicted_uv_f2; + + // mbmi->cdef_strength is already set in the encoding stage. We don't need to + // set it again here. + if (skip_cdef) { + cdef_info->cdef_strengths[1] = 0; + cdef_info->cdef_uv_strengths[1] = 0; + return; + } + + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + const int nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + MB_MODE_INFO **mbmi = mi_params->mi_grid_base; + // mbmi is NULL when real-time rate control library is used. 
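+  // (Without a mode-info grid there is nothing to reset. The loop below
+  // visits the top-left MB_MODE_INFO of each 64x64 filter block, stepping
+  // MI_SIZE_64X64 columns within a row and MI_SIZE_64X64 rows between rows,
+  // and zeroes its cdef_strength.)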
+ if (!mbmi) return; + for (int r = 0; r < nvfb; ++r) { + for (int c = 0; c < nhfb; ++c) { + MB_MODE_INFO *current_mbmi = mbmi[MI_SIZE_64X64 * c]; + current_mbmi->cdef_strength = 0; + } + mbmi += MI_SIZE_64X64 * mi_params->mi_stride; + } +} + +void av1_cdef_search(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + CDEF_CONTROL cdef_control = cpi->oxcf.tool_cfg.cdef_control; + + assert(cdef_control != CDEF_NONE); + if (cdef_control == CDEF_REFERENCE && cpi->ppi->rtc_ref.non_reference_frame) { + CdefInfo *const cdef_info = &cm->cdef_info; + cdef_info->nb_cdef_strengths = 1; + cdef_info->cdef_bits = 0; + cdef_info->cdef_strengths[0] = 0; + cdef_info->cdef_uv_strengths[0] = 0; + return; + } + + // Indicate if external RC is used for testing + const int rtc_ext_rc = cpi->rc.rtc_external_ratectrl; + if (rtc_ext_rc) { + av1_pick_cdef_from_qp(cm, 0, 0); + return; + } + CDEF_PICK_METHOD pick_method = cpi->sf.lpf_sf.cdef_pick_method; + if (pick_method == CDEF_PICK_FROM_Q) { + const int use_screen_content_model = + cm->quant_params.base_qindex > + AOMMAX(cpi->sf.rt_sf.screen_content_cdef_filter_qindex_thresh, + cpi->rc.best_quality + 5) && + cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN; + av1_pick_cdef_from_qp(cm, cpi->sf.rt_sf.skip_cdef_sb, + use_screen_content_model); + return; + } + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int damping = 3 + (cm->quant_params.base_qindex >> 6); + const int fast = (pick_method >= CDEF_FAST_SEARCH_LVL1 && + pick_method <= CDEF_FAST_SEARCH_LVL5); + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *xd = &cpi->td.mb.e_mbd; + + if (!cpi->cdef_search_ctx) + CHECK_MEM_ERROR(cm, cpi->cdef_search_ctx, + aom_malloc(sizeof(*cpi->cdef_search_ctx))); + CdefSearchCtx *cdef_search_ctx = cpi->cdef_search_ctx; + + // Initialize parameters related to CDEF search context. + cdef_params_init(&cm->cur_frame->buf, cpi->source, cm, xd, cdef_search_ctx, + pick_method); + // Allocate CDEF search context buffers. + cdef_alloc_data(cm, cdef_search_ctx); + // Frame level mse calculation. + if (cpi->mt_info.num_workers > 1) { + av1_cdef_mse_calc_frame_mt(cpi); + } else { + cdef_mse_calc_frame(cdef_search_ctx, cm->error); + } + + /* Search for different number of signaling bits. */ + int nb_strength_bits = 0; + uint64_t best_rd = UINT64_MAX; + CdefInfo *const cdef_info = &cm->cdef_info; + int sb_count = cdef_search_ctx->sb_count; + uint64_t(*mse[2])[TOTAL_STRENGTHS]; + mse[0] = cdef_search_ctx->mse[0]; + mse[1] = cdef_search_ctx->mse[1]; + /* Calculate the maximum number of bits required to signal CDEF strengths at + * block level */ + const int total_strengths = nb_cdef_strengths[pick_method]; + const int joint_strengths = + num_planes > 1 ? total_strengths * total_strengths : total_strengths; + const int max_signaling_bits = + joint_strengths == 1 ? 0 : get_msb(joint_strengths - 1) + 1; + int rdmult = cpi->td.mb.rdmult; + for (int i = 0; i <= 3; i++) { + if (i > max_signaling_bits) break; + int best_lev0[CDEF_MAX_STRENGTHS]; + int best_lev1[CDEF_MAX_STRENGTHS] = { 0 }; + const int nb_strengths = 1 << i; + uint64_t tot_mse; + if (num_planes > 1) { + tot_mse = joint_strength_search_dual(best_lev0, best_lev1, nb_strengths, + mse, sb_count, pick_method); + } else { + tot_mse = joint_strength_search(best_lev0, nb_strengths, mse[0], sb_count, + pick_method); + } + + const int total_bits = sb_count * i + nb_strengths * CDEF_STRENGTH_BITS * + (num_planes > 1 ? 
2 : 1); + const int rate_cost = av1_cost_literal(total_bits); + const uint64_t dist = tot_mse * 16; + const uint64_t rd = RDCOST(rdmult, rate_cost, dist); + if (rd < best_rd) { + best_rd = rd; + nb_strength_bits = i; + memcpy(cdef_info->cdef_strengths, best_lev0, + nb_strengths * sizeof(best_lev0[0])); + if (num_planes > 1) { + memcpy(cdef_info->cdef_uv_strengths, best_lev1, + nb_strengths * sizeof(best_lev1[0])); + } + } + } + + cdef_info->cdef_bits = nb_strength_bits; + cdef_info->nb_cdef_strengths = 1 << nb_strength_bits; + for (int i = 0; i < sb_count; i++) { + uint64_t best_mse = UINT64_MAX; + int best_gi = 0; + for (int gi = 0; gi < cdef_info->nb_cdef_strengths; gi++) { + uint64_t curr = mse[0][i][cdef_info->cdef_strengths[gi]]; + if (num_planes > 1) curr += mse[1][i][cdef_info->cdef_uv_strengths[gi]]; + if (curr < best_mse) { + best_gi = gi; + best_mse = curr; + } + } + mi_params->mi_grid_base[cdef_search_ctx->sb_index[i]]->cdef_strength = + best_gi; + } + if (fast) { + for (int j = 0; j < cdef_info->nb_cdef_strengths; j++) { + const int luma_strength = cdef_info->cdef_strengths[j]; + const int chroma_strength = cdef_info->cdef_uv_strengths[j]; + int pri_strength, sec_strength; + + STORE_CDEF_FILTER_STRENGTH(cdef_info->cdef_strengths[j], pick_method, + luma_strength); + STORE_CDEF_FILTER_STRENGTH(cdef_info->cdef_uv_strengths[j], pick_method, + chroma_strength); + } + } + + cdef_info->cdef_damping = damping; + // Deallocate CDEF search context buffers. + av1_cdef_dealloc_data(cdef_search_ctx); +} diff --git a/third_party/aom/av1/encoder/pickcdef.h b/third_party/aom/av1/encoder/pickcdef.h new file mode 100644 index 0000000000..192e734fb0 --- /dev/null +++ b/third_party/aom/av1/encoder/pickcdef.h @@ -0,0 +1,261 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AV1_ENCODER_PICKCDEF_H_ +#define AOM_AV1_ENCODER_PICKCDEF_H_ + +#include "av1/common/cdef.h" +#include "av1/encoder/speed_features.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*!\enum CDEF_CONTROL + * \brief This enum controls to which frames CDEF is applied. + */ +typedef enum { + CDEF_NONE = 0, /*!< Disable CDEF on all frames. */ + CDEF_ALL = 1, /*!< Enable CDEF for all frames. */ + CDEF_REFERENCE = 2, /*!< Disable CDEF on non reference frames. 
*/ +} CDEF_CONTROL; + +/*!\cond */ +struct MultiThreadInfo; + +#define REDUCED_PRI_STRENGTHS_LVL1 8 +#define REDUCED_PRI_STRENGTHS_LVL2 5 +#define REDUCED_SEC_STRENGTHS_LVL3 2 +#define REDUCED_SEC_STRENGTHS_LVL5 1 +#define REDUCED_PRI_STRENGTHS_LVL4 2 + +#define REDUCED_TOTAL_STRENGTHS_LVL1 \ + (REDUCED_PRI_STRENGTHS_LVL1 * CDEF_SEC_STRENGTHS) +#define REDUCED_TOTAL_STRENGTHS_LVL2 \ + (REDUCED_PRI_STRENGTHS_LVL2 * CDEF_SEC_STRENGTHS) +#define REDUCED_TOTAL_STRENGTHS_LVL3 \ + (REDUCED_PRI_STRENGTHS_LVL2 * REDUCED_SEC_STRENGTHS_LVL3) +#define REDUCED_TOTAL_STRENGTHS_LVL4 \ + (REDUCED_PRI_STRENGTHS_LVL4 * REDUCED_SEC_STRENGTHS_LVL3) +#define REDUCED_TOTAL_STRENGTHS_LVL5 \ + (REDUCED_PRI_STRENGTHS_LVL4 * REDUCED_SEC_STRENGTHS_LVL5) +#define TOTAL_STRENGTHS (CDEF_PRI_STRENGTHS * CDEF_SEC_STRENGTHS) + +static const int priconv_lvl1[REDUCED_PRI_STRENGTHS_LVL1] = { 0, 1, 2, 3, + 5, 7, 10, 13 }; +static const int priconv_lvl2[REDUCED_PRI_STRENGTHS_LVL2] = { 0, 2, 4, 8, 14 }; +static const int priconv_lvl4[REDUCED_PRI_STRENGTHS_LVL4] = { 0, 11 }; +static const int priconv_lvl5[REDUCED_PRI_STRENGTHS_LVL4] = { 0, 5 }; +static const int secconv_lvl3[REDUCED_SEC_STRENGTHS_LVL3] = { 0, 2 }; +static const int secconv_lvl5[REDUCED_SEC_STRENGTHS_LVL5] = { 0 }; +static const int nb_cdef_strengths[CDEF_PICK_METHODS] = { + TOTAL_STRENGTHS, + REDUCED_TOTAL_STRENGTHS_LVL1, + REDUCED_TOTAL_STRENGTHS_LVL2, + REDUCED_TOTAL_STRENGTHS_LVL3, + REDUCED_TOTAL_STRENGTHS_LVL4, + REDUCED_TOTAL_STRENGTHS_LVL5, + TOTAL_STRENGTHS +}; + +typedef void (*copy_fn_t)(uint16_t *dst, int dstride, const uint8_t *src, + int src_voffset, int src_hoffset, int sstride, + int vsize, int hsize); +typedef uint64_t (*compute_cdef_dist_t)(void *dst, int dstride, uint16_t *src, + cdef_list *dlist, int cdef_count, + BLOCK_SIZE bsize, int coeff_shift, + int row, int col); + +/*! \brief CDEF search context. + */ +typedef struct { + /*! + * Pointer to the frame buffer holding the source frame + */ + const YV12_BUFFER_CONFIG *ref; + /*! + * Pointer to params related to MB_MODE_INFO arrays and related info + */ + CommonModeInfoParams *mi_params; + /*! + * Info specific to each plane + */ + struct macroblockd_plane plane[MAX_MB_PLANE]; + /*! + * Function pointer of copy_fn + */ + copy_fn_t copy_fn; + /*! + * Function pointer of compute_cdef_dist_fn + */ + compute_cdef_dist_t compute_cdef_dist_fn; + /*! + * Number of strengths evaluated in CDEF filter search + */ + int total_strengths; + /*! + * Bit-depth dependent shift + */ + int coeff_shift; + /*! + * CDEF damping factor + */ + int damping; + /*! + * Search method used to select CDEF parameters + */ + int pick_method; + /*! + * Number of planes + */ + int num_planes; + /*! + * Log2 of width of the MI unit in pixels. mi_wide_l2[i] + * indicates the width of the MI unit in pixels for the ith plane + */ + int mi_wide_l2[MAX_MB_PLANE]; + /*! + * Log2 of height of the MI unit in pixels. mi_high_l2[i] + * indicates the height of the MI unit in pixels for the ith plane + */ + int mi_high_l2[MAX_MB_PLANE]; + /*! + * Subsampling in x direction. xdec[i] indicates the subsampling + * for the ith plane + */ + int xdec[MAX_MB_PLANE]; + /*! + * Subsampling in y direction. ydec[i] indicates the subsampling + * for the ith plane + */ + int ydec[MAX_MB_PLANE]; + /*! + * bsize[i] indicates the block size of ith plane + */ + int bsize[MAX_MB_PLANE]; + /*! + * Number of 64x64 blocks in vertical direction of a frame + */ + int nvfb; + /*! 
+ * Number of 64x64 blocks in horizontal direction of a frame + */ + int nhfb; + /*! + * Pointer to the mean squared error between the CDEF filtered block and the + * source block. mse[i][j][k] stores the MSE of the ith plane (i=0 corresponds + * to Y-plane, i=1 corresponds to U and V planes), jth block and kth strength + * index + */ + uint64_t (*mse[2])[TOTAL_STRENGTHS]; + /*! + * Holds the position (in units of mi's) of the cdef filtered + * block in raster scan order + */ + int *sb_index; + /*! + * Holds the count of cdef filtered blocks + */ + int sb_count; + /*! + * Indicates if 16-bit frame buffers are to be used, i.e., the content + * bit-depth is greater than 8 bits + */ + bool use_highbitdepth; +} CdefSearchCtx; + +static INLINE int sb_all_skip(const CommonModeInfoParams *const mi_params, + int mi_row, int mi_col) { + const int maxr = AOMMIN(mi_params->mi_rows - mi_row, MI_SIZE_64X64); + const int maxc = AOMMIN(mi_params->mi_cols - mi_col, MI_SIZE_64X64); + const int stride = mi_params->mi_stride; + MB_MODE_INFO **mbmi = mi_params->mi_grid_base + mi_row * stride + mi_col; + for (int r = 0; r < maxr; ++r, mbmi += stride) { + for (int c = 0; c < maxc; ++c) { + if (!mbmi[c]->skip_txfm) return 0; + } + } + return 1; +} + +// Check if cdef processing can be skipped for a particular sb. +// Inputs: +// mi_params: Pointer to the structure containing MB_MODE_INFO arrays and +// related info. +// fbr: Row index in units of 64x64 block +// fbc: Column index in units of 64x64 block +// Returns: +// 1 if cdef processing of the sb can be skipped, 0 otherwise. +static INLINE int cdef_sb_skip(const CommonModeInfoParams *const mi_params, + int fbr, int fbc) { + const MB_MODE_INFO *const mbmi = + mi_params->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride + + MI_SIZE_64X64 * fbc]; + // No filtering if the entire filter block is skipped. + if (sb_all_skip(mi_params, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64)) + return 1; + // Skip odd-numbered 64x64 block columns when bsize is BLOCK_128X128 or + // BLOCK_128X64, and odd-numbered rows when bsize is BLOCK_128X128 or + // BLOCK_64X128, as CDEF filtering for such blocks is done at the + // corresponding larger block size. + if (((fbc & 1) && + (mbmi->bsize == BLOCK_128X128 || mbmi->bsize == BLOCK_128X64)) || + ((fbr & 1) && + (mbmi->bsize == BLOCK_128X128 || mbmi->bsize == BLOCK_64X128))) + return 1; + return 0; +} + +void av1_cdef_dealloc_data(CdefSearchCtx *cdef_search_ctx); + +void av1_cdef_mse_calc_block(CdefSearchCtx *cdef_search_ctx, + struct aom_internal_error_info *error_info, + int fbr, int fbc, int sb_count); +/*!\endcond */ + +/*!\brief AV1 CDEF parameter search + * + * \ingroup in_loop_cdef + * + * Searches for optimal CDEF parameters for the frame + * + * \param[in,out] cpi Top level encoder structure + * + * \remark Nothing is returned. Instead, optimal CDEF parameters are stored + * in the \c cdef_info structure of type \ref CdefInfo inside \c cm: + * \arg \c cdef_bits: Bits of strength parameters + * \arg \c nb_cdef_strengths: Number of strength parameters + * \arg \c cdef_strengths: list of \c nb_cdef_strengths strength parameters + * for the luma plane. + * \arg \c cdef_uv_strengths: list of \c nb_cdef_strengths strength parameters + * for the chroma planes. + * \arg \c cdef_damping: CDEF damping factor. + * + */ +void av1_cdef_search(struct AV1_COMP *cpi); + +/*!\brief AV1 CDEF level from QP + * + * \ingroup in_loop_cdef + * + * Calculates CDEF levels from frame QP. Only used for speed 7+ with RT mode. 
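+ *
+ * Rough sketch of the model used here: each primary/secondary strength is
+ * predicted as a clamped quadratic in the AC quantizer q, e.g.
+ * f1 = clamp(a*q*q + b*q + c, 0, 15), and the luma/chroma strengths are
+ * packed as f1 * CDEF_SEC_STRENGTHS + f2 (see av1_pick_cdef_from_qp() in
+ * pickcdef.c; a, b, c stand in for the fitted coefficients there).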
+ * + * \param[in,out] cm Pointer to top level common structure + * \param[in] skip_cdef Flag to skip CDEF filtering + * \param[in] is_screen_content Flag indicating screen content + * + */ +void av1_pick_cdef_from_qp(AV1_COMMON *const cm, int skip_cdef, + int is_screen_content); + +#ifdef __cplusplus +} // extern "C" +#endif +#endif // AOM_AV1_ENCODER_PICKCDEF_H_ diff --git a/third_party/aom/av1/encoder/picklpf.c b/third_party/aom/av1/encoder/picklpf.c new file mode 100644 index 0000000000..9084d3f13a --- /dev/null +++ b/third_party/aom/av1/encoder/picklpf.c @@ -0,0 +1,339 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_scale_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/psnr.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" + +#include "av1/common/av1_common_int.h" +#include "av1/common/av1_loopfilter.h" +#include "av1/common/quant_common.h" + +#include "av1/encoder/av1_quantize.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/picklpf.h" + +static void yv12_copy_plane(const YV12_BUFFER_CONFIG *src_bc, + YV12_BUFFER_CONFIG *dst_bc, int plane) { + switch (plane) { + case 0: aom_yv12_copy_y(src_bc, dst_bc); break; + case 1: aom_yv12_copy_u(src_bc, dst_bc); break; + case 2: aom_yv12_copy_v(src_bc, dst_bc); break; + default: assert(plane >= 0 && plane <= 2); break; + } +} + +int av1_get_max_filter_level(const AV1_COMP *cpi) { + if (is_stat_consumption_stage_twopass(cpi)) { + return cpi->ppi->twopass.section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4 + : MAX_LOOP_FILTER; + } else { + return MAX_LOOP_FILTER; + } +} + +static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd, + AV1_COMP *const cpi, int filt_level, + int partial_frame, int plane, int dir) { + MultiThreadInfo *const mt_info = &cpi->mt_info; + int num_workers = mt_info->num_mod_workers[MOD_LPF]; + AV1_COMMON *const cm = &cpi->common; + int64_t filt_err; + + assert(plane >= 0 && plane <= 2); + int filter_level[2] = { filt_level, filt_level }; + if (plane == 0 && dir == 0) filter_level[1] = cm->lf.filter_level[1]; + if (plane == 0 && dir == 1) filter_level[0] = cm->lf.filter_level[0]; + + // set base filters for use of av1_get_filter_level when in DELTA_LF mode + switch (plane) { + case 0: + cm->lf.filter_level[0] = filter_level[0]; + cm->lf.filter_level[1] = filter_level[1]; + break; + case 1: cm->lf.filter_level_u = filter_level[0]; break; + case 2: cm->lf.filter_level_v = filter_level[0]; break; + } + + // lpf_opt_level = 1 : Enables dual/quad loop-filtering. 
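+  // Measurement sketch for this helper: filter the requested plane of
+  // cm->cur_frame->buf in place, take the plane SSE against the source
+  // frame sd, then restore the unfiltered pixels from cpi->last_frame_uf so
+  // the next candidate level starts from the same input.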
+ int lpf_opt_level = is_inter_tx_size_search_level_one(&cpi->sf.tx_sf); + + av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, &cpi->td.mb.e_mbd, plane, + plane + 1, partial_frame, mt_info->workers, + num_workers, &mt_info->lf_row_sync, lpf_opt_level); + + filt_err = aom_get_sse_plane(sd, &cm->cur_frame->buf, plane, + cm->seq_params->use_highbitdepth); + + // Re-instate the unfiltered frame + yv12_copy_plane(&cpi->last_frame_uf, &cm->cur_frame->buf, plane); + + return filt_err; +} + +static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, + int partial_frame, + const int *last_frame_filter_level, int plane, + int dir) { + const AV1_COMMON *const cm = &cpi->common; + const int min_filter_level = 0; + const int max_filter_level = av1_get_max_filter_level(cpi); + int filt_direction = 0; + int64_t best_err; + int filt_best; + + // Start the search at the previous frame filter level unless it is now out of + // range. + int lvl; + switch (plane) { + case 0: + switch (dir) { + case 2: + lvl = (last_frame_filter_level[0] + last_frame_filter_level[1] + 1) >> + 1; + break; + case 0: + case 1: lvl = last_frame_filter_level[dir]; break; + default: assert(dir >= 0 && dir <= 2); return 0; + } + break; + case 1: lvl = last_frame_filter_level[2]; break; + case 2: lvl = last_frame_filter_level[3]; break; + default: assert(plane >= 0 && plane <= 2); return 0; + } + int filt_mid = clamp(lvl, min_filter_level, max_filter_level); + int filter_step = filt_mid < 16 ? 4 : filt_mid / 4; + // Sum squared error at each filter level + int64_t ss_err[MAX_LOOP_FILTER + 1]; + + const int use_coarse_search = cpi->sf.lpf_sf.use_coarse_filter_level_search; + assert(use_coarse_search <= 1); + static const int min_filter_step_lookup[2] = { 0, 2 }; + // min_filter_step_thresh determines the stopping criteria for the search. + // The search is terminated when filter_step equals min_filter_step_thresh. + const int min_filter_step_thresh = min_filter_step_lookup[use_coarse_search]; + + // Set each entry to -1 + memset(ss_err, 0xFF, sizeof(ss_err)); + yv12_copy_plane(&cm->cur_frame->buf, &cpi->last_frame_uf, plane); + best_err = try_filter_frame(sd, cpi, filt_mid, partial_frame, plane, dir); + filt_best = filt_mid; + ss_err[filt_mid] = best_err; + + while (filter_step > min_filter_step_thresh) { + const int filt_high = AOMMIN(filt_mid + filter_step, max_filter_level); + const int filt_low = AOMMAX(filt_mid - filter_step, min_filter_level); + + // Bias against raising loop filter in favor of lowering it. + int64_t bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; + + if ((is_stat_consumption_stage_twopass(cpi)) && + (cpi->ppi->twopass.section_intra_rating < 20)) + bias = (bias * cpi->ppi->twopass.section_intra_rating) / 20; + + // yx, bias less for large block size + if (cm->features.tx_mode != ONLY_4X4) bias >>= 1; + + if (filt_direction <= 0 && filt_low != filt_mid) { + // Get Low filter error score + if (ss_err[filt_low] < 0) { + ss_err[filt_low] = + try_filter_frame(sd, cpi, filt_low, partial_frame, plane, dir); + } + // If the value is close to the best so far, bias towards a lower loop + // filter value. + if (ss_err[filt_low] < (best_err + bias)) { + // Was it actually better than the previous best? 
+ if (ss_err[filt_low] < best_err) { + best_err = ss_err[filt_low]; + } + filt_best = filt_low; + } + } + + // Now look at filt_high + if (filt_direction >= 0 && filt_high != filt_mid) { + if (ss_err[filt_high] < 0) { + ss_err[filt_high] = + try_filter_frame(sd, cpi, filt_high, partial_frame, plane, dir); + } + // Accept filt_high only if it is significantly better than the previous + // best; the bias works against raising the filter value. + if (ss_err[filt_high] < (best_err - bias)) { + best_err = ss_err[filt_high]; + filt_best = filt_high; + } + } + + // Halve the step distance if the best filter value was the same as last time + if (filt_best == filt_mid) { + filter_step /= 2; + filt_direction = 0; + } else { + filt_direction = (filt_best < filt_mid) ? -1 : 1; + filt_mid = filt_best; + } + } + + return filt_best; +} + +void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, + LPF_PICK_METHOD method) { + AV1_COMMON *const cm = &cpi->common; + const SequenceHeader *const seq_params = cm->seq_params; + const int num_planes = av1_num_planes(cm); + struct loopfilter *const lf = &cm->lf; + int disable_filter_rt_screen = 0; + (void)sd; + + lf->sharpness_level = 0; + + if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN && + cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && + cpi->sf.rt_sf.skip_lf_screen) + disable_filter_rt_screen = av1_cyclic_refresh_disable_lf_cdef(cpi); + + if (disable_filter_rt_screen || + cpi->oxcf.algo_cfg.loopfilter_control == LOOPFILTER_NONE || + (cpi->oxcf.algo_cfg.loopfilter_control == LOOPFILTER_REFERENCE && + cpi->ppi->rtc_ref.non_reference_frame)) { + lf->filter_level[0] = 0; + lf->filter_level[1] = 0; + return; + } + + if (method == LPF_PICK_MINIMAL_LPF) { + lf->filter_level[0] = 0; + lf->filter_level[1] = 0; + } else if (method >= LPF_PICK_FROM_Q) { + const int min_filter_level = 0; + const int max_filter_level = av1_get_max_filter_level(cpi); + const int q = av1_ac_quant_QTX(cm->quant_params.base_qindex, 0, + seq_params->bit_depth); + // Based on test results for the RTC test set: + // 0.04590 boosted or 0.02295 non-boosted, in 18-bit fixed point. + const int strength_boost_q_threshold = 0; + int inter_frame_multiplier = + (q > strength_boost_q_threshold || + (cpi->sf.rt_sf.use_nonrd_pick_mode && + cpi->common.width * cpi->common.height > 352 * 288)) + ? 12034 + : 6017; + // Increase strength on base TL0 for temporal layers, for low resolutions, + // based on frame source_sad. + if (cpi->svc.number_temporal_layers > 1 && + cpi->svc.temporal_layer_id == 0 && + cpi->common.width * cpi->common.height <= 352 * 288 && + cpi->sf.rt_sf.use_nonrd_pick_mode) { + if (cpi->rc.frame_source_sad > 100000) + inter_frame_multiplier = inter_frame_multiplier << 1; + else if (cpi->rc.frame_source_sad > 50000) + inter_frame_multiplier = 3 * (inter_frame_multiplier >> 1); + } + // These values were determined by linear fitting the result of the + // searched level for 8 bit depth: + // Keyframes: filt_guess = q * 0.06699 - 1.60817 + // Other frames: filt_guess = q * inter_frame_multiplier + 2.48225 + // + // And high bit depth separately: + // filt_guess = q * 0.316206 + 3.87252 + int filt_guess; + switch (seq_params->bit_depth) { + case AOM_BITS_8: + filt_guess = + (cm->current_frame.frame_type == KEY_FRAME) + ? 
ROUND_POWER_OF_TWO(q * 17563 - 421574, 18) + : ROUND_POWER_OF_TWO(q * inter_frame_multiplier + 650707, 18); + break; + case AOM_BITS_10: + filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20); + break; + case AOM_BITS_12: + filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 16242526, 22); + break; + default: + assert(0 && + "bit_depth should be AOM_BITS_8, AOM_BITS_10 " + "or AOM_BITS_12"); + return; + } + if (seq_params->bit_depth != AOM_BITS_8 && + cm->current_frame.frame_type == KEY_FRAME) + filt_guess -= 4; + // TODO(chengchen): retrain the model for Y, U, V filter levels + lf->filter_level[0] = clamp(filt_guess, min_filter_level, max_filter_level); + lf->filter_level[1] = clamp(filt_guess, min_filter_level, max_filter_level); + lf->filter_level_u = clamp(filt_guess, min_filter_level, max_filter_level); + lf->filter_level_v = clamp(filt_guess, min_filter_level, max_filter_level); + if (cpi->oxcf.algo_cfg.loopfilter_control == LOOPFILTER_SELECTIVELY && + !frame_is_intra_only(cm) && !cpi->rc.high_source_sad) { + if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) { + lf->filter_level[0] = 0; + lf->filter_level[1] = 0; + } else { + const int num4x4 = (cm->width >> 2) * (cm->height >> 2); + const int newmv_thresh = 7; + const int distance_since_key_thresh = 5; + if ((cpi->td.rd_counts.newmv_or_intra_blocks * 100 / num4x4) < + newmv_thresh && + cpi->rc.frames_since_key > distance_since_key_thresh) { + lf->filter_level[0] = 0; + lf->filter_level[1] = 0; + } + } + } + } else { + int last_frame_filter_level[4] = { 0 }; + if (!frame_is_intra_only(cm)) { + last_frame_filter_level[0] = cpi->ppi->filter_level[0]; + last_frame_filter_level[1] = cpi->ppi->filter_level[1]; + last_frame_filter_level[2] = cpi->ppi->filter_level_u; + last_frame_filter_level[3] = cpi->ppi->filter_level_v; + } + // The frame buffer last_frame_uf is used to store the non-loop filtered + // reconstructed frame in search_filter_level(). + if (aom_realloc_frame_buffer( + &cpi->last_frame_uf, cm->width, cm->height, + seq_params->subsampling_x, seq_params->subsampling_y, + seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, + cm->features.byte_alignment, NULL, NULL, NULL, 0, 0)) + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate last frame buffer"); + + lf->filter_level[0] = lf->filter_level[1] = + search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, + last_frame_filter_level, 0, 2); + if (method != LPF_PICK_FROM_FULL_IMAGE_NON_DUAL) { + lf->filter_level[0] = + search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, + last_frame_filter_level, 0, 0); + lf->filter_level[1] = + search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, + last_frame_filter_level, 0, 1); + } + + if (num_planes > 1) { + lf->filter_level_u = + search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, + last_frame_filter_level, 1, 0); + lf->filter_level_v = + search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, + last_frame_filter_level, 2, 0); + } + } +} diff --git a/third_party/aom/av1/encoder/picklpf.h b/third_party/aom/av1/encoder/picklpf.h new file mode 100644 index 0000000000..f567937c32 --- /dev/null +++ b/third_party/aom/av1/encoder/picklpf.h @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_PICKLPF_H_ +#define AOM_AV1_ENCODER_PICKLPF_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/encoder.h" + +struct yv12_buffer_config; +struct AV1_COMP; +int av1_get_max_filter_level(const AV1_COMP *cpi); + +/*!\brief Algorithm for AV1 loop filter level selection. + * + * \ingroup in_loop_filter + * This function determines the proper filter levels used for the in-loop + * filter (deblock filter). + * + * \param[in] sd Pointer to the source frame buffer + * \param[in] cpi Top-level encoder structure + * \param[in] method The method used to select filter levels + * + * \par + * method includes: + * \arg \c LPF_PICK_FROM_FULL_IMAGE: Try the full image with different values. + * \arg \c LPF_PICK_FROM_FULL_IMAGE_NON_DUAL: Try the full image filter search + * with non-dual filter only. + * \arg \c LPF_PICK_FROM_SUBIMAGE: Try a small portion of the image with + * different values. + * \arg \c LPF_PICK_FROM_Q: Estimate the level based on quantizer and frame type + * \arg \c LPF_PICK_MINIMAL_LPF: Pick 0 to disable LPF if LPF was enabled last + * frame + * + * \remark Nothing is returned. Instead, filter levels below are stored in the + * "loopfilter" structure inside "cpi": + * \arg \c filter_level[0]: the vertical filter level for Y plane + * \arg \c filter_level[1]: the horizontal filter level for Y plane + * \arg \c filter_level_u: the filter level for U plane + * \arg \c filter_level_v: the filter level for V plane + * + * \n + * \b Overview + * \par + * The workflow of the deblock filter is shown in Fig.1. \n + * Boundary pixels pass through a non-flatness check, followed by a step that + * determines smoothness and selects proper types of filters + * (4-, 6-, 8-, 14-tap filter). \n + * If the non-flatness criteria are not satisfied, the encoder will not apply + * deblock filtering on these boundary pixels. + * \image html filter_flow.png "Fig.1. The workflow of deblock filter" width=70% + * + * \par + * The non-flatness is determined by the boundary pixels and thresholds as shown + * in Fig.2. \n + * Filtering is applied when \n + * \f$|p_0-p_1| < thr_1\f$, \f$|q_0-q_1| < thr_1\f$ and + * \f$2|p_0-q_0| + |p_1-q_1|/2 < thr_2\f$, \n + * where thr_1 and thr_2 are thresholds derived from the filter level. + * + * \par + * For LPF_PICK_FROM_Q, the level is estimated from the quantizer q: \n + * For 8 bit: \n + * Keyframes: filt_guess = q * 0.06699 - 1.60817 \n + * Other frames: filt_guess = q * inter_frame_multiplier + 2.48225, \n + * inter_frame_multiplier = q > 700 ? 0.04590 : 0.02295 \n + * For 10 bit and 12 bit: \n + * filt_guess = q * 0.316206 + 3.87252 \n + * Then filter_level[0] = filter_level[1] = filter_level_u = filter_level_v = + * clamp(filt_guess, min_filter_level, max_filter_level) \n + * Where min_filter_level = 0, max_filter_level = 64 \n + * The equations were determined by linear fitting using filter levels + * generated by the "LPF_PICK_FROM_FULL_IMAGE" method. + * + */ +void av1_pick_filter_level(const struct yv12_buffer_config *sd, + struct AV1_COMP *cpi, LPF_PICK_METHOD method); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_PICKLPF_H_ diff --git a/third_party/aom/av1/encoder/pickrst.c b/third_party/aom/av1/encoder/pickrst.c new file mode 100644 index 0000000000..6429064175 --- /dev/null +++ b/third_party/aom/av1/encoder/pickrst.c @@ -0,0 +1,2217 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include +#include + +#include "config/aom_scale_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/binary_codes_writer.h" +#include "aom_dsp/mathutils.h" +#include "aom_dsp/psnr.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/quant_common.h" +#include "av1/common/restoration.h" + +#include "av1/encoder/av1_quantize.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/picklpf.h" +#include "av1/encoder/pickrst.h" + +// Number of Wiener iterations +#define NUM_WIENER_ITERS 5 + +// Penalty factor for use of dual sgr +#define DUAL_SGR_PENALTY_MULT 0.01 + +// Working precision for Wiener filter coefficients +#define WIENER_TAP_SCALE_FACTOR ((int64_t)1 << 16) + +#define SGRPROJ_EP_GRP1_START_IDX 0 +#define SGRPROJ_EP_GRP1_END_IDX 9 +#define SGRPROJ_EP_GRP1_SEARCH_COUNT 4 +#define SGRPROJ_EP_GRP2_3_SEARCH_COUNT 2 +static const int sgproj_ep_grp1_seed[SGRPROJ_EP_GRP1_SEARCH_COUNT] = { 0, 3, 6, + 9 }; +static const int sgproj_ep_grp2_3[SGRPROJ_EP_GRP2_3_SEARCH_COUNT][14] = { + { 10, 10, 11, 11, 12, 12, 13, 13, 13, 13, -1, -1, -1, -1 }, + { 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15 } +}; + +#if DEBUG_LR_COSTING +RestorationUnitInfo lr_ref_params[RESTORE_TYPES][MAX_MB_PLANE] + [MAX_LR_UNITS_W * MAX_LR_UNITS_H]; +#endif // DEBUG_LR_COSTING + +typedef int64_t (*sse_extractor_type)(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b); +typedef int64_t (*sse_part_extractor_type)(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, + int hstart, int width, int vstart, + int height); +typedef uint64_t (*var_part_extractor_type)(const YV12_BUFFER_CONFIG *a, + int hstart, int width, int vstart, + int height); + +#if CONFIG_AV1_HIGHBITDEPTH +#define NUM_EXTRACTORS (3 * (1 + 1)) +#else +#define NUM_EXTRACTORS 3 +#endif +static const sse_part_extractor_type sse_part_extractors[NUM_EXTRACTORS] = { + aom_get_y_sse_part, aom_get_u_sse_part, + aom_get_v_sse_part, +#if CONFIG_AV1_HIGHBITDEPTH + aom_highbd_get_y_sse_part, aom_highbd_get_u_sse_part, + aom_highbd_get_v_sse_part, +#endif +}; +static const var_part_extractor_type var_part_extractors[NUM_EXTRACTORS] = { + aom_get_y_var, aom_get_u_var, aom_get_v_var, +#if CONFIG_AV1_HIGHBITDEPTH + aom_highbd_get_y_var, aom_highbd_get_u_var, aom_highbd_get_v_var, +#endif +}; + +static int64_t sse_restoration_unit(const RestorationTileLimits *limits, + const YV12_BUFFER_CONFIG *src, + const YV12_BUFFER_CONFIG *dst, int plane, + int highbd) { + return sse_part_extractors[3 * highbd + plane]( + src, dst, limits->h_start, limits->h_end - limits->h_start, + limits->v_start, limits->v_end - limits->v_start); +} + +static uint64_t var_restoration_unit(const RestorationTileLimits *limits, + const YV12_BUFFER_CONFIG *src, int plane, + int highbd) { + return var_part_extractors[3 * highbd + plane]( + src, limits->h_start, limits->h_end - limits->h_start, limits->v_start, + limits->v_end - limits->v_start); +} + +typedef struct { + const YV12_BUFFER_CONFIG *src; + YV12_BUFFER_CONFIG *dst; + + const AV1_COMMON *cm; + const MACROBLOCK *x; + int plane; + int plane_w; + int plane_h; + 
RestUnitSearchInfo *rusi; + + // Speed features + const LOOP_FILTER_SPEED_FEATURES *lpf_sf; + + uint8_t *dgd_buffer; + int dgd_stride; + const uint8_t *src_buffer; + int src_stride; + + // SSE values for each restoration mode for the current RU + // These are saved by each search function for use in search_switchable() + int64_t sse[RESTORE_SWITCHABLE_TYPES]; + + // This flag will be set based on the speed feature + // 'prune_sgr_based_on_wiener'. 0 implies no pruning and 1 implies pruning. + uint8_t skip_sgr_eval; + + // Total rate and distortion so far for each restoration type + // These are initialised by reset_rsc in search_rest_type + int64_t total_sse[RESTORE_TYPES]; + int64_t total_bits[RESTORE_TYPES]; + + // Reference parameters for delta-coding + // + // For each restoration type, we need to store the latest parameter set which + // has been used, so that we can properly cost up the next parameter set. + // Note that we have two sets of these - one for the single-restoration-mode + // search (ie, frame_restoration_type = RESTORE_WIENER or RESTORE_SGRPROJ) + // and one for the switchable mode. This is because these two cases can lead + // to different sets of parameters being signaled, but we don't know which + // we will pick for sure until the end of the search process. + WienerInfo ref_wiener; + SgrprojInfo ref_sgrproj; + WienerInfo switchable_ref_wiener; + SgrprojInfo switchable_ref_sgrproj; + + // Buffers used to hold dgd-avg and src-avg data respectively during SIMD + // call of Wiener filter. + int16_t *dgd_avg; + int16_t *src_avg; +} RestSearchCtxt; + +static AOM_INLINE void rsc_on_tile(void *priv) { + RestSearchCtxt *rsc = (RestSearchCtxt *)priv; + set_default_wiener(&rsc->ref_wiener); + set_default_sgrproj(&rsc->ref_sgrproj); + set_default_wiener(&rsc->switchable_ref_wiener); + set_default_sgrproj(&rsc->switchable_ref_sgrproj); +} + +static AOM_INLINE void reset_rsc(RestSearchCtxt *rsc) { + memset(rsc->total_sse, 0, sizeof(rsc->total_sse)); + memset(rsc->total_bits, 0, sizeof(rsc->total_bits)); +} + +static AOM_INLINE void init_rsc(const YV12_BUFFER_CONFIG *src, + const AV1_COMMON *cm, const MACROBLOCK *x, + const LOOP_FILTER_SPEED_FEATURES *lpf_sf, + int plane, RestUnitSearchInfo *rusi, + YV12_BUFFER_CONFIG *dst, RestSearchCtxt *rsc) { + rsc->src = src; + rsc->dst = dst; + rsc->cm = cm; + rsc->x = x; + rsc->plane = plane; + rsc->rusi = rusi; + rsc->lpf_sf = lpf_sf; + + const YV12_BUFFER_CONFIG *dgd = &cm->cur_frame->buf; + const int is_uv = plane != AOM_PLANE_Y; + int plane_w, plane_h; + av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h); + assert(plane_w == src->crop_widths[is_uv]); + assert(plane_h == src->crop_heights[is_uv]); + assert(src->crop_widths[is_uv] == dgd->crop_widths[is_uv]); + assert(src->crop_heights[is_uv] == dgd->crop_heights[is_uv]); + + rsc->plane_w = plane_w; + rsc->plane_h = plane_h; + rsc->src_buffer = src->buffers[plane]; + rsc->src_stride = src->strides[is_uv]; + rsc->dgd_buffer = dgd->buffers[plane]; + rsc->dgd_stride = dgd->strides[is_uv]; +} + +static int64_t try_restoration_unit(const RestSearchCtxt *rsc, + const RestorationTileLimits *limits, + const RestorationUnitInfo *rui) { + const AV1_COMMON *const cm = rsc->cm; + const int plane = rsc->plane; + const int is_uv = plane > 0; + const RestorationInfo *rsi = &cm->rst_info[plane]; + RestorationLineBuffers rlbs; + const int bit_depth = cm->seq_params->bit_depth; + const int highbd = cm->seq_params->use_highbitdepth; + + const YV12_BUFFER_CONFIG *fts = &cm->cur_frame->buf; + // 
TODO(yunqing): For now, only use optimized LR filter in decoder. Can be + // also used in encoder. + const int optimized_lr = 0; + + av1_loop_restoration_filter_unit( + limits, rui, &rsi->boundaries, &rlbs, rsc->plane_w, rsc->plane_h, + is_uv && cm->seq_params->subsampling_x, + is_uv && cm->seq_params->subsampling_y, highbd, bit_depth, + fts->buffers[plane], fts->strides[is_uv], rsc->dst->buffers[plane], + rsc->dst->strides[is_uv], cm->rst_tmpbuf, optimized_lr, cm->error); + + return sse_restoration_unit(limits, rsc->src, rsc->dst, plane, highbd); +} + +int64_t av1_lowbd_pixel_proj_error_c(const uint8_t *src8, int width, int height, + int src_stride, const uint8_t *dat8, + int dat_stride, int32_t *flt0, + int flt0_stride, int32_t *flt1, + int flt1_stride, int xq[2], + const sgr_params_type *params) { + int i, j; + const uint8_t *src = src8; + const uint8_t *dat = dat8; + int64_t err = 0; + if (params->r[0] > 0 && params->r[1] > 0) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + assert(flt1[j] < (1 << 15) && flt1[j] > -(1 << 15)); + assert(flt0[j] < (1 << 15) && flt0[j] > -(1 << 15)); + const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS); + int32_t v = u << SGRPROJ_PRJ_BITS; + v += xq[0] * (flt0[j] - u) + xq[1] * (flt1[j] - u); + const int32_t e = + ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt0 += flt0_stride; + flt1 += flt1_stride; + } + } else if (params->r[0] > 0) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + assert(flt0[j] < (1 << 15) && flt0[j] > -(1 << 15)); + const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS); + int32_t v = u << SGRPROJ_PRJ_BITS; + v += xq[0] * (flt0[j] - u); + const int32_t e = + ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt0 += flt0_stride; + } + } else if (params->r[1] > 0) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + assert(flt1[j] < (1 << 15) && flt1[j] > -(1 << 15)); + const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS); + int32_t v = u << SGRPROJ_PRJ_BITS; + v += xq[1] * (flt1[j] - u); + const int32_t e = + ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt1 += flt1_stride; + } + } else { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int32_t e = (int32_t)(dat[j]) - src[j]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + } + } + + return err; +} + +#if CONFIG_AV1_HIGHBITDEPTH +int64_t av1_highbd_pixel_proj_error_c(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, int dat_stride, + int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int xq[2], + const sgr_params_type *params) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + int i, j; + int64_t err = 0; + const int32_t half = 1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1); + if (params->r[0] > 0 && params->r[1] > 0) { + int xq0 = xq[0]; + int xq1 = xq[1]; + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int32_t d = dat[j]; + const int32_t s = src[j]; + const int32_t u = (int32_t)(d << SGRPROJ_RST_BITS); + int32_t v0 = flt0[j] - u; + int32_t v1 = flt1[j] - u; + int32_t v = half; + v += xq0 * v0; + v += xq1 * v1; + const int32_t e = (v >> (SGRPROJ_RST_BITS + 
SGRPROJ_PRJ_BITS)) + d - s; + err += ((int64_t)e * e); + } + dat += dat_stride; + flt0 += flt0_stride; + flt1 += flt1_stride; + src += src_stride; + } + } else if (params->r[0] > 0 || params->r[1] > 0) { + int exq; + int32_t *flt; + int flt_stride; + if (params->r[0] > 0) { + exq = xq[0]; + flt = flt0; + flt_stride = flt0_stride; + } else { + exq = xq[1]; + flt = flt1; + flt_stride = flt1_stride; + } + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int32_t d = dat[j]; + const int32_t s = src[j]; + const int32_t u = (int32_t)(d << SGRPROJ_RST_BITS); + int32_t v = half; + v += exq * (flt[j] - u); + const int32_t e = (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + d - s; + err += ((int64_t)e * e); + } + dat += dat_stride; + flt += flt_stride; + src += src_stride; + } + } else { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int32_t d = dat[j]; + const int32_t s = src[j]; + const int32_t e = d - s; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + } + } + return err; +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static int64_t get_pixel_proj_error(const uint8_t *src8, int width, int height, + int src_stride, const uint8_t *dat8, + int dat_stride, int use_highbitdepth, + int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int *xqd, + const sgr_params_type *params) { + int xq[2]; + av1_decode_xq(xqd, xq, params); + +#if CONFIG_AV1_HIGHBITDEPTH + if (use_highbitdepth) { + return av1_highbd_pixel_proj_error(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, flt1, + flt1_stride, xq, params); + + } else { + return av1_lowbd_pixel_proj_error(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, flt1, + flt1_stride, xq, params); + } +#else + (void)use_highbitdepth; + return av1_lowbd_pixel_proj_error(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, flt1, + flt1_stride, xq, params); +#endif +} + +#define USE_SGRPROJ_REFINEMENT_SEARCH 1 +static int64_t finer_search_pixel_proj_error( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int use_highbitdepth, int32_t *flt0, + int flt0_stride, int32_t *flt1, int flt1_stride, int start_step, int *xqd, + const sgr_params_type *params) { + int64_t err = get_pixel_proj_error( + src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth, flt0, + flt0_stride, flt1, flt1_stride, xqd, params); + (void)start_step; +#if USE_SGRPROJ_REFINEMENT_SEARCH + int64_t err2; + int tap_min[] = { SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MIN1 }; + int tap_max[] = { SGRPROJ_PRJ_MAX0, SGRPROJ_PRJ_MAX1 }; + for (int s = start_step; s >= 1; s >>= 1) { + for (int p = 0; p < 2; ++p) { + if ((params->r[0] == 0 && p == 0) || (params->r[1] == 0 && p == 1)) { + continue; + } + int skip = 0; + do { + if (xqd[p] - s >= tap_min[p]) { + xqd[p] -= s; + err2 = + get_pixel_proj_error(src8, width, height, src_stride, dat8, + dat_stride, use_highbitdepth, flt0, + flt0_stride, flt1, flt1_stride, xqd, params); + if (err2 > err) { + xqd[p] += s; + } else { + err = err2; + skip = 1; + // At the highest step size continue moving in the same direction + if (s == start_step) continue; + } + } + break; + } while (1); + if (skip) break; + do { + if (xqd[p] + s <= tap_max[p]) { + xqd[p] += s; + err2 = + get_pixel_proj_error(src8, width, height, src_stride, dat8, + dat_stride, use_highbitdepth, flt0, + flt0_stride, flt1, flt1_stride, xqd, params); + if (err2 > err) { + xqd[p] -= s; + } else { + err = err2; + // At the 
highest step size continue moving in the same direction + if (s == start_step) continue; + } + } + break; + } while (1); + } + } +#endif // USE_SGRPROJ_REFINEMENT_SEARCH + return err; +} + +static int64_t signed_rounded_divide(int64_t dividend, int64_t divisor) { + if (dividend < 0) + return (dividend - divisor / 2) / divisor; + else + return (dividend + divisor / 2) / divisor; +} + +static AOM_INLINE void calc_proj_params_r0_r1_c( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint8_t *src = src8; + const uint8_t *dat = dat8; + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); + const int32_t s = + (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u; + const int32_t f1 = (int32_t)flt0[i * flt0_stride + j] - u; + const int32_t f2 = (int32_t)flt1[i * flt1_stride + j] - u; + H[0][0] += (int64_t)f1 * f1; + H[1][1] += (int64_t)f2 * f2; + H[0][1] += (int64_t)f1 * f2; + C[0] += (int64_t)f1 * s; + C[1] += (int64_t)f2 * s; + } + } + H[0][0] /= size; + H[0][1] /= size; + H[1][1] /= size; + H[1][0] = H[0][1]; + C[0] /= size; + C[1] /= size; +} + +static AOM_INLINE void calc_proj_params_r0_r1_high_bd_c( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); + const int32_t s = + (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u; + const int32_t f1 = (int32_t)flt0[i * flt0_stride + j] - u; + const int32_t f2 = (int32_t)flt1[i * flt1_stride + j] - u; + H[0][0] += (int64_t)f1 * f1; + H[1][1] += (int64_t)f2 * f2; + H[0][1] += (int64_t)f1 * f2; + C[0] += (int64_t)f1 * s; + C[1] += (int64_t)f2 * s; + } + } + H[0][0] /= size; + H[0][1] /= size; + H[1][1] /= size; + H[1][0] = H[0][1]; + C[0] /= size; + C[1] /= size; +} + +static AOM_INLINE void calc_proj_params_r0_c(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, + int dat_stride, int32_t *flt0, + int flt0_stride, int64_t H[2][2], + int64_t C[2]) { + const int size = width * height; + const uint8_t *src = src8; + const uint8_t *dat = dat8; + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); + const int32_t s = + (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u; + const int32_t f1 = (int32_t)flt0[i * flt0_stride + j] - u; + H[0][0] += (int64_t)f1 * f1; + C[0] += (int64_t)f1 * s; + } + } + H[0][0] /= size; + C[0] /= size; +} + +static AOM_INLINE void calc_proj_params_r0_high_bd_c( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + const int32_t u = (int32_t)(dat[i * 
dat_stride + j] << SGRPROJ_RST_BITS); + const int32_t s = + (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u; + const int32_t f1 = (int32_t)flt0[i * flt0_stride + j] - u; + H[0][0] += (int64_t)f1 * f1; + C[0] += (int64_t)f1 * s; + } + } + H[0][0] /= size; + C[0] /= size; +} + +static AOM_INLINE void calc_proj_params_r1_c(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, + int dat_stride, int32_t *flt1, + int flt1_stride, int64_t H[2][2], + int64_t C[2]) { + const int size = width * height; + const uint8_t *src = src8; + const uint8_t *dat = dat8; + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); + const int32_t s = + (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u; + const int32_t f2 = (int32_t)flt1[i * flt1_stride + j] - u; + H[1][1] += (int64_t)f2 * f2; + C[1] += (int64_t)f2 * s; + } + } + H[1][1] /= size; + C[1] /= size; +} + +static AOM_INLINE void calc_proj_params_r1_high_bd_c( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride, + int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); + const int32_t s = + (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u; + const int32_t f2 = (int32_t)flt1[i * flt1_stride + j] - u; + H[1][1] += (int64_t)f2 * f2; + C[1] += (int64_t)f2 * s; + } + } + H[1][1] /= size; + C[1] /= size; +} + +// The function calls 3 subfunctions for the following cases : +// 1) When params->r[0] > 0 and params->r[1] > 0. In this case all elements +// of C and H need to be computed. +// 2) When only params->r[0] > 0. In this case only H[0][0] and C[0] are +// non-zero and need to be computed. +// 3) When only params->r[1] > 0. In this case only H[1][1] and C[1] are +// non-zero and need to be computed. 
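+// The accumulated H (2x2) and C (2x1) form the normal equations of a least
+// squares fit of the filter outputs to the source; get_proj_subspace() below
+// solves them. Sketch for the full 2x2 case (ignoring the rounding and
+// overflow handling done in the real code):
+//   det   = H[0][0] * H[1][1] - H[0][1] * H[1][0];
+//   xq[0] = (H[1][1] * C[0] - H[0][1] * C[1]) * (1 << SGRPROJ_PRJ_BITS) / det;
+//   xq[1] = (H[0][0] * C[1] - H[1][0] * C[0]) * (1 << SGRPROJ_PRJ_BITS) / det;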
+void av1_calc_proj_params_c(const uint8_t *src8, int width, int height, + int src_stride, const uint8_t *dat8, int dat_stride, + int32_t *flt0, int flt0_stride, int32_t *flt1, + int flt1_stride, int64_t H[2][2], int64_t C[2], + const sgr_params_type *params) { + if ((params->r[0] > 0) && (params->r[1] > 0)) { + calc_proj_params_r0_r1_c(src8, width, height, src_stride, dat8, dat_stride, + flt0, flt0_stride, flt1, flt1_stride, H, C); + } else if (params->r[0] > 0) { + calc_proj_params_r0_c(src8, width, height, src_stride, dat8, dat_stride, + flt0, flt0_stride, H, C); + } else if (params->r[1] > 0) { + calc_proj_params_r1_c(src8, width, height, src_stride, dat8, dat_stride, + flt1, flt1_stride, H, C); + } +} + +void av1_calc_proj_params_high_bd_c(const uint8_t *src8, int width, int height, + int src_stride, const uint8_t *dat8, + int dat_stride, int32_t *flt0, + int flt0_stride, int32_t *flt1, + int flt1_stride, int64_t H[2][2], + int64_t C[2], + const sgr_params_type *params) { + if ((params->r[0] > 0) && (params->r[1] > 0)) { + calc_proj_params_r0_r1_high_bd_c(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, flt1, + flt1_stride, H, C); + } else if (params->r[0] > 0) { + calc_proj_params_r0_high_bd_c(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, H, C); + } else if (params->r[1] > 0) { + calc_proj_params_r1_high_bd_c(src8, width, height, src_stride, dat8, + dat_stride, flt1, flt1_stride, H, C); + } +} + +static AOM_INLINE void get_proj_subspace(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, int dat_stride, + int use_highbitdepth, int32_t *flt0, + int flt0_stride, int32_t *flt1, + int flt1_stride, int *xq, + const sgr_params_type *params) { + int64_t H[2][2] = { { 0, 0 }, { 0, 0 } }; + int64_t C[2] = { 0, 0 }; + + // Default values to be returned if the problem becomes ill-posed + xq[0] = 0; + xq[1] = 0; + + if (!use_highbitdepth) { + if ((width & 0x7) == 0) { + av1_calc_proj_params(src8, width, height, src_stride, dat8, dat_stride, + flt0, flt0_stride, flt1, flt1_stride, H, C, params); + } else { + av1_calc_proj_params_c(src8, width, height, src_stride, dat8, dat_stride, + flt0, flt0_stride, flt1, flt1_stride, H, C, + params); + } + } +#if CONFIG_AV1_HIGHBITDEPTH + else { // NOLINT + if ((width & 0x7) == 0) { + av1_calc_proj_params_high_bd(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, flt1, + flt1_stride, H, C, params); + } else { + av1_calc_proj_params_high_bd_c(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, flt1, + flt1_stride, H, C, params); + } + } +#endif + + if (params->r[0] == 0) { + // H matrix is now only the scalar H[1][1] + // C vector is now only the scalar C[1] + const int64_t Det = H[1][1]; + if (Det == 0) return; // ill-posed, return default values + xq[0] = 0; + xq[1] = (int)signed_rounded_divide(C[1] * (1 << SGRPROJ_PRJ_BITS), Det); + } else if (params->r[1] == 0) { + // H matrix is now only the scalar H[0][0] + // C vector is now only the scalar C[0] + const int64_t Det = H[0][0]; + if (Det == 0) return; // ill-posed, return default values + xq[0] = (int)signed_rounded_divide(C[0] * (1 << SGRPROJ_PRJ_BITS), Det); + xq[1] = 0; + } else { + const int64_t Det = H[0][0] * H[1][1] - H[0][1] * H[1][0]; + if (Det == 0) return; // ill-posed, return default values + + // If scaling up dividend would overflow, instead scale down the divisor + const int64_t div1 = H[1][1] * C[0] - H[0][1] * C[1]; + if ((div1 > 0 && INT64_MAX / (1 << 
SGRPROJ_PRJ_BITS) < div1) || + (div1 < 0 && INT64_MIN / (1 << SGRPROJ_PRJ_BITS) > div1)) + xq[0] = (int)signed_rounded_divide(div1, Det / (1 << SGRPROJ_PRJ_BITS)); + else + xq[0] = (int)signed_rounded_divide(div1 * (1 << SGRPROJ_PRJ_BITS), Det); + + const int64_t div2 = H[0][0] * C[1] - H[1][0] * C[0]; + if ((div2 > 0 && INT64_MAX / (1 << SGRPROJ_PRJ_BITS) < div2) || + (div2 < 0 && INT64_MIN / (1 << SGRPROJ_PRJ_BITS) > div2)) + xq[1] = (int)signed_rounded_divide(div2, Det / (1 << SGRPROJ_PRJ_BITS)); + else + xq[1] = (int)signed_rounded_divide(div2 * (1 << SGRPROJ_PRJ_BITS), Det); + } +} + +static AOM_INLINE void encode_xq(int *xq, int *xqd, + const sgr_params_type *params) { + if (params->r[0] == 0) { + xqd[0] = 0; + xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - xq[1], SGRPROJ_PRJ_MIN1, + SGRPROJ_PRJ_MAX1); + } else if (params->r[1] == 0) { + xqd[0] = clamp(xq[0], SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MAX0); + xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - xqd[0], SGRPROJ_PRJ_MIN1, + SGRPROJ_PRJ_MAX1); + } else { + xqd[0] = clamp(xq[0], SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MAX0); + xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - xqd[0] - xq[1], SGRPROJ_PRJ_MIN1, + SGRPROJ_PRJ_MAX1); + } +} + +// Apply the self-guided filter across an entire restoration unit. +static AOM_INLINE void apply_sgr(int sgr_params_idx, const uint8_t *dat8, + int width, int height, int dat_stride, + int use_highbd, int bit_depth, int pu_width, + int pu_height, int32_t *flt0, int32_t *flt1, + int flt_stride, + struct aom_internal_error_info *error_info) { + for (int i = 0; i < height; i += pu_height) { + const int h = AOMMIN(pu_height, height - i); + int32_t *flt0_row = flt0 + i * flt_stride; + int32_t *flt1_row = flt1 + i * flt_stride; + const uint8_t *dat8_row = dat8 + i * dat_stride; + + // Iterate over the stripe in blocks of width pu_width + for (int j = 0; j < width; j += pu_width) { + const int w = AOMMIN(pu_width, width - j); + if (av1_selfguided_restoration( + dat8_row + j, w, h, dat_stride, flt0_row + j, flt1_row + j, + flt_stride, sgr_params_idx, bit_depth, use_highbd) != 0) { + aom_internal_error( + error_info, AOM_CODEC_MEM_ERROR, + "Error allocating buffer in av1_selfguided_restoration"); + } + } + } +} + +static AOM_INLINE void compute_sgrproj_err( + const uint8_t *dat8, const int width, const int height, + const int dat_stride, const uint8_t *src8, const int src_stride, + const int use_highbitdepth, const int bit_depth, const int pu_width, + const int pu_height, const int ep, int32_t *flt0, int32_t *flt1, + const int flt_stride, int *exqd, int64_t *err, + struct aom_internal_error_info *error_info) { + int exq[2]; + apply_sgr(ep, dat8, width, height, dat_stride, use_highbitdepth, bit_depth, + pu_width, pu_height, flt0, flt1, flt_stride, error_info); + const sgr_params_type *const params = &av1_sgr_params[ep]; + get_proj_subspace(src8, width, height, src_stride, dat8, dat_stride, + use_highbitdepth, flt0, flt_stride, flt1, flt_stride, exq, + params); + encode_xq(exq, exqd, params); + *err = finer_search_pixel_proj_error( + src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth, flt0, + flt_stride, flt1, flt_stride, 2, exqd, params); +} + +static AOM_INLINE void get_best_error(int64_t *besterr, const int64_t err, + const int *exqd, int *bestxqd, + int *bestep, const int ep) { + if (*besterr == -1 || err < *besterr) { + *bestep = ep; + *besterr = err; + bestxqd[0] = exqd[0]; + bestxqd[1] = exqd[1]; + } +} + +static SgrprojInfo search_selfguided_restoration( + const uint8_t *dat8, int width, int height, int dat_stride, + 
const uint8_t *src8, int src_stride, int use_highbitdepth, int bit_depth,
+    int pu_width, int pu_height, int32_t *rstbuf, int enable_sgr_ep_pruning,
+    struct aom_internal_error_info *error_info) {
+  int32_t *flt0 = rstbuf;
+  int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
+  int ep, idx, bestep = 0;
+  int64_t besterr = -1;
+  int exqd[2], bestxqd[2] = { 0, 0 };
+  int flt_stride = ((width + 7) & ~7) + 8;
+  assert(pu_width == (RESTORATION_PROC_UNIT_SIZE >> 1) ||
+         pu_width == RESTORATION_PROC_UNIT_SIZE);
+  assert(pu_height == (RESTORATION_PROC_UNIT_SIZE >> 1) ||
+         pu_height == RESTORATION_PROC_UNIT_SIZE);
+  if (!enable_sgr_ep_pruning) {
+    for (ep = 0; ep < SGRPROJ_PARAMS; ep++) {
+      int64_t err;
+      compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride,
+                          use_highbitdepth, bit_depth, pu_width, pu_height, ep,
+                          flt0, flt1, flt_stride, exqd, &err, error_info);
+      get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep);
+    }
+  } else {
+    // Evaluate the first four seed ep values in the first group
+    for (idx = 0; idx < SGRPROJ_EP_GRP1_SEARCH_COUNT; idx++) {
+      ep = sgproj_ep_grp1_seed[idx];
+      int64_t err;
+      compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride,
+                          use_highbitdepth, bit_depth, pu_width, pu_height, ep,
+                          flt0, flt1, flt_stride, exqd, &err, error_info);
+      get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep);
+    }
+    // Evaluate the ep values to the left and right of the winning seed ep
+    int bestep_ref = bestep;
+    for (ep = bestep_ref - 1; ep < bestep_ref + 2; ep += 2) {
+      if (ep < SGRPROJ_EP_GRP1_START_IDX || ep > SGRPROJ_EP_GRP1_END_IDX)
+        continue;
+      int64_t err;
+      compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride,
+                          use_highbitdepth, bit_depth, pu_width, pu_height, ep,
+                          flt0, flt1, flt_stride, exqd, &err, error_info);
+      get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep);
+    }
+    // Evaluate the last two groups
+    for (idx = 0; idx < SGRPROJ_EP_GRP2_3_SEARCH_COUNT; idx++) {
+      ep = sgproj_ep_grp2_3[idx][bestep];
+      int64_t err;
+      compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride,
+                          use_highbitdepth, bit_depth, pu_width, pu_height, ep,
+                          flt0, flt1, flt_stride, exqd, &err, error_info);
+      get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep);
+    }
+  }
+
+  SgrprojInfo ret;
+  ret.ep = bestep;
+  ret.xqd[0] = bestxqd[0];
+  ret.xqd[1] = bestxqd[1];
+  return ret;
+}
+
+static int count_sgrproj_bits(SgrprojInfo *sgrproj_info,
+                              SgrprojInfo *ref_sgrproj_info) {
+  int bits = SGRPROJ_PARAMS_BITS;
+  const sgr_params_type *params = &av1_sgr_params[sgrproj_info->ep];
+  if (params->r[0] > 0)
+    bits += aom_count_primitive_refsubexpfin(
+        SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K,
+        ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0,
+        sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0);
+  if (params->r[1] > 0)
+    bits += aom_count_primitive_refsubexpfin(
+        SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K,
+        ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1,
+        sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1);
+  return bits;
+}
+
+static AOM_INLINE void search_sgrproj(
+    const RestorationTileLimits *limits, int rest_unit_idx, void *priv,
+    int32_t *tmpbuf, RestorationLineBuffers *rlbs,
+    struct aom_internal_error_info *error_info) {
+  (void)rlbs;
+  RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
+  RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx];
+
+  const MACROBLOCK *const x = rsc->x;
+  const AV1_COMMON *const cm = rsc->cm;
+  const int highbd = cm->seq_params->use_highbitdepth;
+  const int bit_depth = cm->seq_params->bit_depth;
+
+  const int64_t bits_none =
x->mode_costs.sgrproj_restore_cost[0]; + // Prune evaluation of RESTORE_SGRPROJ if 'skip_sgr_eval' is set + if (rsc->skip_sgr_eval) { + rsc->total_bits[RESTORE_SGRPROJ] += bits_none; + rsc->total_sse[RESTORE_SGRPROJ] += rsc->sse[RESTORE_NONE]; + rusi->best_rtype[RESTORE_SGRPROJ - 1] = RESTORE_NONE; + rsc->sse[RESTORE_SGRPROJ] = INT64_MAX; + return; + } + + uint8_t *dgd_start = + rsc->dgd_buffer + limits->v_start * rsc->dgd_stride + limits->h_start; + const uint8_t *src_start = + rsc->src_buffer + limits->v_start * rsc->src_stride + limits->h_start; + + const int is_uv = rsc->plane > 0; + const int ss_x = is_uv && cm->seq_params->subsampling_x; + const int ss_y = is_uv && cm->seq_params->subsampling_y; + const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x; + const int procunit_height = RESTORATION_PROC_UNIT_SIZE >> ss_y; + + rusi->sgrproj = search_selfguided_restoration( + dgd_start, limits->h_end - limits->h_start, + limits->v_end - limits->v_start, rsc->dgd_stride, src_start, + rsc->src_stride, highbd, bit_depth, procunit_width, procunit_height, + tmpbuf, rsc->lpf_sf->enable_sgr_ep_pruning, error_info); + + RestorationUnitInfo rui; + rui.restoration_type = RESTORE_SGRPROJ; + rui.sgrproj_info = rusi->sgrproj; + + rsc->sse[RESTORE_SGRPROJ] = try_restoration_unit(rsc, limits, &rui); + + const int64_t bits_sgr = + x->mode_costs.sgrproj_restore_cost[1] + + (count_sgrproj_bits(&rusi->sgrproj, &rsc->ref_sgrproj) + << AV1_PROB_COST_SHIFT); + double cost_none = RDCOST_DBL_WITH_NATIVE_BD_DIST( + x->rdmult, bits_none >> 4, rsc->sse[RESTORE_NONE], bit_depth); + double cost_sgr = RDCOST_DBL_WITH_NATIVE_BD_DIST( + x->rdmult, bits_sgr >> 4, rsc->sse[RESTORE_SGRPROJ], bit_depth); + if (rusi->sgrproj.ep < 10) + cost_sgr *= + (1 + DUAL_SGR_PENALTY_MULT * rsc->lpf_sf->dual_sgr_penalty_level); + + RestorationType rtype = + (cost_sgr < cost_none) ? RESTORE_SGRPROJ : RESTORE_NONE; + rusi->best_rtype[RESTORE_SGRPROJ - 1] = rtype; + +#if DEBUG_LR_COSTING + // Store ref params for later checking + lr_ref_params[RESTORE_SGRPROJ][rsc->plane][rest_unit_idx].sgrproj_info = + rsc->ref_sgrproj; +#endif // DEBUG_LR_COSTING + + rsc->total_sse[RESTORE_SGRPROJ] += rsc->sse[rtype]; + rsc->total_bits[RESTORE_SGRPROJ] += + (cost_sgr < cost_none) ? bits_sgr : bits_none; + if (cost_sgr < cost_none) rsc->ref_sgrproj = rusi->sgrproj; +} + +static void acc_stat_one_line(const uint8_t *dgd, const uint8_t *src, + int dgd_stride, int h_start, int h_end, + uint8_t avg, const int wiener_halfwin, + const int wiener_win2, int32_t *M_int32, + int32_t *H_int32, int count) { + int j, k, l; + int16_t Y[WIENER_WIN2]; + + for (j = h_start; j < h_end; j++) { + const int16_t X = (int16_t)src[j] - (int16_t)avg; + int idx = 0; + for (k = -wiener_halfwin; k <= wiener_halfwin; k++) { + for (l = -wiener_halfwin; l <= wiener_halfwin; l++) { + Y[idx] = + (int16_t)dgd[(count + l) * dgd_stride + (j + k)] - (int16_t)avg; + idx++; + } + } + assert(idx == wiener_win2); + for (k = 0; k < wiener_win2; ++k) { + M_int32[k] += (int32_t)Y[k] * X; + for (l = k; l < wiener_win2; ++l) { + // H is a symmetric matrix, so we only need to fill out the upper + // triangle here. We can copy it down to the lower triangle outside + // the (i, j) loops. 
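+        // The copy-down takes the form of the mirror loop
+        //   for (k = 0; k < wiener_win2; ++k)
+        //     for (l = k + 1; l < wiener_win2; ++l)
+        //       H[l * wiener_win2 + k] = H[k * wiener_win2 + l];
+        // as done at the end of av1_compute_stats_c() below.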
+ H_int32[k * wiener_win2 + l] += (int32_t)Y[k] * Y[l]; + } + } + } +} + +void av1_compute_stats_c(int wiener_win, const uint8_t *dgd, const uint8_t *src, + int16_t *dgd_avg, int16_t *src_avg, int h_start, + int h_end, int v_start, int v_end, int dgd_stride, + int src_stride, int64_t *M, int64_t *H, + int use_downsampled_wiener_stats) { + (void)dgd_avg; + (void)src_avg; + int i, k, l; + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + uint8_t avg = find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride); + int32_t M_row[WIENER_WIN2] = { 0 }; + int32_t H_row[WIENER_WIN2 * WIENER_WIN2] = { 0 }; + int downsample_factor = + use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; + + memset(M, 0, sizeof(*M) * wiener_win2); + memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2); + + for (i = v_start; i < v_end; i = i + downsample_factor) { + if (use_downsampled_wiener_stats && + (v_end - i < WIENER_STATS_DOWNSAMPLE_FACTOR)) { + downsample_factor = v_end - i; + } + + memset(M_row, 0, sizeof(int32_t) * WIENER_WIN2); + memset(H_row, 0, sizeof(int32_t) * WIENER_WIN2 * WIENER_WIN2); + acc_stat_one_line(dgd, src + i * src_stride, dgd_stride, h_start, h_end, + avg, wiener_halfwin, wiener_win2, M_row, H_row, i); + + for (k = 0; k < wiener_win2; ++k) { + // Scale M matrix based on the downsampling factor + M[k] += ((int64_t)M_row[k] * downsample_factor); + for (l = k; l < wiener_win2; ++l) { + // H is a symmetric matrix, so we only need to fill out the upper + // triangle here. We can copy it down to the lower triangle outside + // the (i, j) loops. + // Scale H Matrix based on the downsampling factor + H[k * wiener_win2 + l] += + ((int64_t)H_row[k * wiener_win2 + l] * downsample_factor); + } + } + } + + for (k = 0; k < wiener_win2; ++k) { + for (l = k + 1; l < wiener_win2; ++l) { + H[l * wiener_win2 + k] = H[k * wiener_win2 + l]; + } + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +void av1_compute_stats_highbd_c(int wiener_win, const uint8_t *dgd8, + const uint8_t *src8, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, + int src_stride, int64_t *M, int64_t *H, + aom_bit_depth_t bit_depth) { + int i, j, k, l; + int32_t Y[WIENER_WIN2]; + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8); + uint16_t avg = + find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + uint8_t bit_depth_divider = 1; + if (bit_depth == AOM_BITS_12) + bit_depth_divider = 16; + else if (bit_depth == AOM_BITS_10) + bit_depth_divider = 4; + + memset(M, 0, sizeof(*M) * wiener_win2); + memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2); + for (i = v_start; i < v_end; i++) { + for (j = h_start; j < h_end; j++) { + const int32_t X = (int32_t)src[i * src_stride + j] - (int32_t)avg; + int idx = 0; + for (k = -wiener_halfwin; k <= wiener_halfwin; k++) { + for (l = -wiener_halfwin; l <= wiener_halfwin; l++) { + Y[idx] = (int32_t)dgd[(i + l) * dgd_stride + (j + k)] - (int32_t)avg; + idx++; + } + } + assert(idx == wiener_win2); + for (k = 0; k < wiener_win2; ++k) { + M[k] += (int64_t)Y[k] * X; + for (l = k; l < wiener_win2; ++l) { + // H is a symmetric matrix, so we only need to fill out the upper + // triangle here. We can copy it down to the lower triangle outside + // the (i, j) loops. 
+ H[k * wiener_win2 + l] += (int64_t)Y[k] * Y[l]; + } + } + } + } + for (k = 0; k < wiener_win2; ++k) { + M[k] /= bit_depth_divider; + H[k * wiener_win2 + k] /= bit_depth_divider; + for (l = k + 1; l < wiener_win2; ++l) { + H[k * wiener_win2 + l] /= bit_depth_divider; + H[l * wiener_win2 + k] = H[k * wiener_win2 + l]; + } + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static INLINE int wrap_index(int i, int wiener_win) { + const int wiener_halfwin1 = (wiener_win >> 1) + 1; + return (i >= wiener_halfwin1 ? wiener_win - 1 - i : i); +} + +// Solve linear equations to find Wiener filter tap values +// Taps are output scaled by WIENER_FILT_STEP +static int linsolve_wiener(int n, int64_t *A, int stride, int64_t *b, + int64_t *x) { + for (int k = 0; k < n - 1; k++) { + // Partial pivoting: bring the row with the largest pivot to the top + for (int i = n - 1; i > k; i--) { + // If row i has a better (bigger) pivot than row (i-1), swap them + if (llabs(A[(i - 1) * stride + k]) < llabs(A[i * stride + k])) { + for (int j = 0; j < n; j++) { + const int64_t c = A[i * stride + j]; + A[i * stride + j] = A[(i - 1) * stride + j]; + A[(i - 1) * stride + j] = c; + } + const int64_t c = b[i]; + b[i] = b[i - 1]; + b[i - 1] = c; + } + } + + // b/278065963: The multiplies + // c / 256 * A[k * stride + j] / cd * 256 + // and + // c / 256 * b[k] / cd * 256 + // within Gaussian elimination can cause a signed integer overflow. Rework + // the multiplies so that larger scaling is used without significantly + // impacting the overall precision. + // + // Precision guidance: + // scale_threshold: Pick as high as possible. + // For max_abs_akj >= scale_threshold scenario: + // scaler_A: Pick as low as possible. Needed for A[(i + 1) * stride + j]. + // scaler_c: Pick as low as possible while maintaining scaler_c >= + // (1 << 7). Needed for A[(i + 1) * stride + j] and b[i + 1]. + int64_t max_abs_akj = 0; + for (int j = 0; j < n; j++) { + const int64_t abs_akj = llabs(A[k * stride + j]); + if (abs_akj > max_abs_akj) max_abs_akj = abs_akj; + } + const int scale_threshold = 1 << 22; + const int scaler_A = max_abs_akj < scale_threshold ? 1 : (1 << 5); + const int scaler_c = max_abs_akj < scale_threshold ? 1 : (1 << 7); + const int scaler = scaler_c * scaler_A; + + // Forward elimination (convert A to row-echelon form) + for (int i = k; i < n - 1; i++) { + if (A[k * stride + k] == 0) return 0; + const int64_t c = A[(i + 1) * stride + k] / scaler_c; + const int64_t cd = A[k * stride + k]; + for (int j = 0; j < n; j++) { + A[(i + 1) * stride + j] -= + A[k * stride + j] / scaler_A * c / cd * scaler; + } + b[i + 1] -= c * b[k] / cd * scaler_c; + } + } + // Back-substitution + for (int i = n - 1; i >= 0; i--) { + if (A[i * stride + i] == 0) return 0; + int64_t c = 0; + for (int j = i + 1; j <= n - 1; j++) { + c += A[i * stride + j] * x[j] / WIENER_TAP_SCALE_FACTOR; + } + // Store filter taps x in scaled form. 
+ x[i] = WIENER_TAP_SCALE_FACTOR * (b[i] - c) / A[i * stride + i]; + } + + return 1; +} + +// Fix vector b, update vector a +static AOM_INLINE void update_a_sep_sym(int wiener_win, int64_t **Mc, + int64_t **Hc, int32_t *a, int32_t *b) { + int i, j; + int64_t S[WIENER_WIN]; + int64_t A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1]; + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin1 = (wiener_win >> 1) + 1; + memset(A, 0, sizeof(A)); + memset(B, 0, sizeof(B)); + for (i = 0; i < wiener_win; i++) { + for (j = 0; j < wiener_win; ++j) { + const int jj = wrap_index(j, wiener_win); + A[jj] += Mc[i][j] * b[i] / WIENER_TAP_SCALE_FACTOR; + } + } + + // b/274668506: This is the dual branch for the issue in b/272139363. The fix + // is similar. See comments in update_b_sep_sym() below. + int32_t max_b_l = 0; + for (int l = 0; l < wiener_win; ++l) { + const int32_t abs_b_l = abs(b[l]); + if (abs_b_l > max_b_l) max_b_l = abs_b_l; + } + const int scale_threshold = 128 * WIENER_TAP_SCALE_FACTOR; + const int scaler = max_b_l < scale_threshold ? 1 : 4; + + for (i = 0; i < wiener_win; i++) { + for (j = 0; j < wiener_win; j++) { + int k, l; + for (k = 0; k < wiener_win; ++k) { + const int kk = wrap_index(k, wiener_win); + for (l = 0; l < wiener_win; ++l) { + const int ll = wrap_index(l, wiener_win); + B[ll * wiener_halfwin1 + kk] += + Hc[j * wiener_win + i][k * wiener_win2 + l] * b[i] / + (scaler * WIENER_TAP_SCALE_FACTOR) * b[j] / + (WIENER_TAP_SCALE_FACTOR / scaler); + } + } + } + } + // Normalization enforcement in the system of equations itself + for (i = 0; i < wiener_halfwin1 - 1; ++i) { + A[i] -= + A[wiener_halfwin1 - 1] * 2 + + B[i * wiener_halfwin1 + wiener_halfwin1 - 1] - + 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 + (wiener_halfwin1 - 1)]; + } + for (i = 0; i < wiener_halfwin1 - 1; ++i) { + for (j = 0; j < wiener_halfwin1 - 1; ++j) { + B[i * wiener_halfwin1 + j] -= + 2 * (B[i * wiener_halfwin1 + (wiener_halfwin1 - 1)] + + B[(wiener_halfwin1 - 1) * wiener_halfwin1 + j] - + 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 + + (wiener_halfwin1 - 1)]); + } + } + if (linsolve_wiener(wiener_halfwin1 - 1, B, wiener_halfwin1, A, S)) { + S[wiener_halfwin1 - 1] = WIENER_TAP_SCALE_FACTOR; + for (i = wiener_halfwin1; i < wiener_win; ++i) { + S[i] = S[wiener_win - 1 - i]; + S[wiener_halfwin1 - 1] -= 2 * S[i]; + } + for (i = 0; i < wiener_win; ++i) { + a[i] = (int32_t)CLIP(S[i], -(1 << (WIENER_FILT_BITS - 1)), + (1 << (WIENER_FILT_BITS - 1)) - 1); + } + } +} + +// Fix vector a, update vector b +static AOM_INLINE void update_b_sep_sym(int wiener_win, int64_t **Mc, + int64_t **Hc, int32_t *a, int32_t *b) { + int i, j; + int64_t S[WIENER_WIN]; + int64_t A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1]; + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin1 = (wiener_win >> 1) + 1; + memset(A, 0, sizeof(A)); + memset(B, 0, sizeof(B)); + for (i = 0; i < wiener_win; i++) { + const int ii = wrap_index(i, wiener_win); + for (j = 0; j < wiener_win; j++) { + A[ii] += Mc[i][j] * a[j] / WIENER_TAP_SCALE_FACTOR; + } + } + + // b/272139363: The computation, + // Hc[i * wiener_win + j][k * wiener_win2 + l] * a[k] / + // WIENER_TAP_SCALE_FACTOR * a[l] / WIENER_TAP_SCALE_FACTOR; + // may generate a signed-integer-overflow. Conditionally scale the terms to + // avoid a potential overflow. + // + // Hc contains accumulated correlation statistics and it is desired to leave + // as much room as possible for Hc. 
It was experimentally observed that the + // primary issue manifests itself with the second, a[l], multiply. For + // max_a_l < WIENER_TAP_SCALE_FACTOR the first multiply with a[k] should not + // increase dynamic range and the second multiply should hence be safe. + // Thereafter a safe scale_threshold depends on the actual operational range + // of Hc. The largest scale_threshold is expected to depend on bit-depth + // (av1_compute_stats_highbd_c() scales highbd to 8-bit) and maximum + // restoration-unit size (256), leading up to 32-bit positive numbers in Hc. + // Noting that the caller, wiener_decompose_sep_sym(), initializes a[...] + // to a range smaller than 16 bits, the scale_threshold is set as below for + // convenience. + int32_t max_a_l = 0; + for (int l = 0; l < wiener_win; ++l) { + const int32_t abs_a_l = abs(a[l]); + if (abs_a_l > max_a_l) max_a_l = abs_a_l; + } + const int scale_threshold = 128 * WIENER_TAP_SCALE_FACTOR; + const int scaler = max_a_l < scale_threshold ? 1 : 4; + + for (i = 0; i < wiener_win; i++) { + const int ii = wrap_index(i, wiener_win); + for (j = 0; j < wiener_win; j++) { + const int jj = wrap_index(j, wiener_win); + int k, l; + for (k = 0; k < wiener_win; ++k) { + for (l = 0; l < wiener_win; ++l) { + B[jj * wiener_halfwin1 + ii] += + Hc[i * wiener_win + j][k * wiener_win2 + l] * a[k] / + (scaler * WIENER_TAP_SCALE_FACTOR) * a[l] / + (WIENER_TAP_SCALE_FACTOR / scaler); + } + } + } + } + // Normalization enforcement in the system of equations itself + for (i = 0; i < wiener_halfwin1 - 1; ++i) { + A[i] -= + A[wiener_halfwin1 - 1] * 2 + + B[i * wiener_halfwin1 + wiener_halfwin1 - 1] - + 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 + (wiener_halfwin1 - 1)]; + } + for (i = 0; i < wiener_halfwin1 - 1; ++i) { + for (j = 0; j < wiener_halfwin1 - 1; ++j) { + B[i * wiener_halfwin1 + j] -= + 2 * (B[i * wiener_halfwin1 + (wiener_halfwin1 - 1)] + + B[(wiener_halfwin1 - 1) * wiener_halfwin1 + j] - + 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 + + (wiener_halfwin1 - 1)]); + } + } + if (linsolve_wiener(wiener_halfwin1 - 1, B, wiener_halfwin1, A, S)) { + S[wiener_halfwin1 - 1] = WIENER_TAP_SCALE_FACTOR; + for (i = wiener_halfwin1; i < wiener_win; ++i) { + S[i] = S[wiener_win - 1 - i]; + S[wiener_halfwin1 - 1] -= 2 * S[i]; + } + for (i = 0; i < wiener_win; ++i) { + b[i] = (int32_t)CLIP(S[i], -(1 << (WIENER_FILT_BITS - 1)), + (1 << (WIENER_FILT_BITS - 1)) - 1); + } + } +} + +static void wiener_decompose_sep_sym(int wiener_win, int64_t *M, int64_t *H, + int32_t *a, int32_t *b) { + static const int32_t init_filt[WIENER_WIN] = { + WIENER_FILT_TAP0_MIDV, WIENER_FILT_TAP1_MIDV, WIENER_FILT_TAP2_MIDV, + WIENER_FILT_TAP3_MIDV, WIENER_FILT_TAP2_MIDV, WIENER_FILT_TAP1_MIDV, + WIENER_FILT_TAP0_MIDV, + }; + int64_t *Hc[WIENER_WIN2]; + int64_t *Mc[WIENER_WIN]; + int i, j, iter; + const int plane_off = (WIENER_WIN - wiener_win) >> 1; + const int wiener_win2 = wiener_win * wiener_win; + for (i = 0; i < wiener_win; i++) { + a[i] = b[i] = + WIENER_TAP_SCALE_FACTOR / WIENER_FILT_STEP * init_filt[i + plane_off]; + } + for (i = 0; i < wiener_win; i++) { + Mc[i] = M + i * wiener_win; + for (j = 0; j < wiener_win; j++) { + Hc[i * wiener_win + j] = + H + i * wiener_win * wiener_win2 + j * wiener_win; + } + } + + iter = 1; + while (iter < NUM_WIENER_ITERS) { + update_a_sep_sym(wiener_win, Mc, Hc, a, b); + update_b_sep_sym(wiener_win, Mc, Hc, a, b); + iter++; + } +} + +// Computes the function x'*H*x - x'*M for the learned 2D filter x, and compares +// against identity filters; Final score 
is defined as the difference between +// the function values +static int64_t compute_score(int wiener_win, int64_t *M, int64_t *H, + InterpKernel vfilt, InterpKernel hfilt) { + int32_t ab[WIENER_WIN * WIENER_WIN]; + int16_t a[WIENER_WIN], b[WIENER_WIN]; + int64_t P = 0, Q = 0; + int64_t iP = 0, iQ = 0; + int64_t Score, iScore; + int i, k, l; + const int plane_off = (WIENER_WIN - wiener_win) >> 1; + const int wiener_win2 = wiener_win * wiener_win; + + a[WIENER_HALFWIN] = b[WIENER_HALFWIN] = WIENER_FILT_STEP; + for (i = 0; i < WIENER_HALFWIN; ++i) { + a[i] = a[WIENER_WIN - i - 1] = vfilt[i]; + b[i] = b[WIENER_WIN - i - 1] = hfilt[i]; + a[WIENER_HALFWIN] -= 2 * a[i]; + b[WIENER_HALFWIN] -= 2 * b[i]; + } + memset(ab, 0, sizeof(ab)); + for (k = 0; k < wiener_win; ++k) { + for (l = 0; l < wiener_win; ++l) + ab[k * wiener_win + l] = a[l + plane_off] * b[k + plane_off]; + } + for (k = 0; k < wiener_win2; ++k) { + P += ab[k] * M[k] / WIENER_FILT_STEP / WIENER_FILT_STEP; + for (l = 0; l < wiener_win2; ++l) { + Q += ab[k] * H[k * wiener_win2 + l] * ab[l] / WIENER_FILT_STEP / + WIENER_FILT_STEP / WIENER_FILT_STEP / WIENER_FILT_STEP; + } + } + Score = Q - 2 * P; + + iP = M[wiener_win2 >> 1]; + iQ = H[(wiener_win2 >> 1) * wiener_win2 + (wiener_win2 >> 1)]; + iScore = iQ - 2 * iP; + + return Score - iScore; +} + +static AOM_INLINE void finalize_sym_filter(int wiener_win, int32_t *f, + InterpKernel fi) { + int i; + const int wiener_halfwin = (wiener_win >> 1); + + for (i = 0; i < wiener_halfwin; ++i) { + const int64_t dividend = (int64_t)f[i] * WIENER_FILT_STEP; + const int64_t divisor = WIENER_TAP_SCALE_FACTOR; + // Perform this division with proper rounding rather than truncation + if (dividend < 0) { + fi[i] = (int16_t)((dividend - (divisor / 2)) / divisor); + } else { + fi[i] = (int16_t)((dividend + (divisor / 2)) / divisor); + } + } + // Specialize for 7-tap filter + if (wiener_win == WIENER_WIN) { + fi[0] = CLIP(fi[0], WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP0_MAXV); + fi[1] = CLIP(fi[1], WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP1_MAXV); + fi[2] = CLIP(fi[2], WIENER_FILT_TAP2_MINV, WIENER_FILT_TAP2_MAXV); + } else { + fi[2] = CLIP(fi[1], WIENER_FILT_TAP2_MINV, WIENER_FILT_TAP2_MAXV); + fi[1] = CLIP(fi[0], WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP1_MAXV); + fi[0] = 0; + } + // Satisfy filter constraints + fi[WIENER_WIN - 1] = fi[0]; + fi[WIENER_WIN - 2] = fi[1]; + fi[WIENER_WIN - 3] = fi[2]; + // The central element has an implicit +WIENER_FILT_STEP + fi[3] = -2 * (fi[0] + fi[1] + fi[2]); +} + +static int count_wiener_bits(int wiener_win, WienerInfo *wiener_info, + WienerInfo *ref_wiener_info) { + int bits = 0; + if (wiener_win == WIENER_WIN) + bits += aom_count_primitive_refsubexpfin( + WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1, + WIENER_FILT_TAP0_SUBEXP_K, + ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV, + wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV); + bits += aom_count_primitive_refsubexpfin( + WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1, + WIENER_FILT_TAP1_SUBEXP_K, + ref_wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV, + wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV); + bits += aom_count_primitive_refsubexpfin( + WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1, + WIENER_FILT_TAP2_SUBEXP_K, + ref_wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV, + wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV); + if (wiener_win == WIENER_WIN) + bits += aom_count_primitive_refsubexpfin( + WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1, + WIENER_FILT_TAP0_SUBEXP_K, + 
ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV, + wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV); + bits += aom_count_primitive_refsubexpfin( + WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1, + WIENER_FILT_TAP1_SUBEXP_K, + ref_wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV, + wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV); + bits += aom_count_primitive_refsubexpfin( + WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1, + WIENER_FILT_TAP2_SUBEXP_K, + ref_wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV, + wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV); + return bits; +} + +static int64_t finer_search_wiener(const RestSearchCtxt *rsc, + const RestorationTileLimits *limits, + RestorationUnitInfo *rui, int wiener_win) { + const int plane_off = (WIENER_WIN - wiener_win) >> 1; + int64_t err = try_restoration_unit(rsc, limits, rui); + + if (rsc->lpf_sf->disable_wiener_coeff_refine_search) return err; + + // Refinement search around the wiener filter coefficients. + int64_t err2; + int tap_min[] = { WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP1_MINV, + WIENER_FILT_TAP2_MINV }; + int tap_max[] = { WIENER_FILT_TAP0_MAXV, WIENER_FILT_TAP1_MAXV, + WIENER_FILT_TAP2_MAXV }; + + WienerInfo *plane_wiener = &rui->wiener_info; + + // printf("err pre = %"PRId64"\n", err); + const int start_step = 4; + for (int s = start_step; s >= 1; s >>= 1) { + for (int p = plane_off; p < WIENER_HALFWIN; ++p) { + int skip = 0; + do { + if (plane_wiener->hfilter[p] - s >= tap_min[p]) { + plane_wiener->hfilter[p] -= s; + plane_wiener->hfilter[WIENER_WIN - p - 1] -= s; + plane_wiener->hfilter[WIENER_HALFWIN] += 2 * s; + err2 = try_restoration_unit(rsc, limits, rui); + if (err2 > err) { + plane_wiener->hfilter[p] += s; + plane_wiener->hfilter[WIENER_WIN - p - 1] += s; + plane_wiener->hfilter[WIENER_HALFWIN] -= 2 * s; + } else { + err = err2; + skip = 1; + // At the highest step size continue moving in the same direction + if (s == start_step) continue; + } + } + break; + } while (1); + if (skip) break; + do { + if (plane_wiener->hfilter[p] + s <= tap_max[p]) { + plane_wiener->hfilter[p] += s; + plane_wiener->hfilter[WIENER_WIN - p - 1] += s; + plane_wiener->hfilter[WIENER_HALFWIN] -= 2 * s; + err2 = try_restoration_unit(rsc, limits, rui); + if (err2 > err) { + plane_wiener->hfilter[p] -= s; + plane_wiener->hfilter[WIENER_WIN - p - 1] -= s; + plane_wiener->hfilter[WIENER_HALFWIN] += 2 * s; + } else { + err = err2; + // At the highest step size continue moving in the same direction + if (s == start_step) continue; + } + } + break; + } while (1); + } + for (int p = plane_off; p < WIENER_HALFWIN; ++p) { + int skip = 0; + do { + if (plane_wiener->vfilter[p] - s >= tap_min[p]) { + plane_wiener->vfilter[p] -= s; + plane_wiener->vfilter[WIENER_WIN - p - 1] -= s; + plane_wiener->vfilter[WIENER_HALFWIN] += 2 * s; + err2 = try_restoration_unit(rsc, limits, rui); + if (err2 > err) { + plane_wiener->vfilter[p] += s; + plane_wiener->vfilter[WIENER_WIN - p - 1] += s; + plane_wiener->vfilter[WIENER_HALFWIN] -= 2 * s; + } else { + err = err2; + skip = 1; + // At the highest step size continue moving in the same direction + if (s == start_step) continue; + } + } + break; + } while (1); + if (skip) break; + do { + if (plane_wiener->vfilter[p] + s <= tap_max[p]) { + plane_wiener->vfilter[p] += s; + plane_wiener->vfilter[WIENER_WIN - p - 1] += s; + plane_wiener->vfilter[WIENER_HALFWIN] -= 2 * s; + err2 = try_restoration_unit(rsc, limits, rui); + if (err2 > err) { + plane_wiener->vfilter[p] -= s; + plane_wiener->vfilter[WIENER_WIN - p - 1] 
-= s; + plane_wiener->vfilter[WIENER_HALFWIN] += 2 * s; + } else { + err = err2; + // At the highest step size continue moving in the same direction + if (s == start_step) continue; + } + } + break; + } while (1); + } + } + // printf("err post = %"PRId64"\n", err); + return err; +} + +static AOM_INLINE void search_wiener( + const RestorationTileLimits *limits, int rest_unit_idx, void *priv, + int32_t *tmpbuf, RestorationLineBuffers *rlbs, + struct aom_internal_error_info *error_info) { + (void)tmpbuf; + (void)rlbs; + (void)error_info; + RestSearchCtxt *rsc = (RestSearchCtxt *)priv; + RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx]; + + const MACROBLOCK *const x = rsc->x; + const int64_t bits_none = x->mode_costs.wiener_restore_cost[0]; + + // Skip Wiener search for low variance contents + if (rsc->lpf_sf->prune_wiener_based_on_src_var) { + const int scale[3] = { 0, 1, 2 }; + // Obtain the normalized Qscale + const int qs = av1_dc_quant_QTX(rsc->cm->quant_params.base_qindex, 0, + rsc->cm->seq_params->bit_depth) >> + 3; + // Derive threshold as sqr(normalized Qscale) * scale / 16, + const uint64_t thresh = + (qs * qs * scale[rsc->lpf_sf->prune_wiener_based_on_src_var]) >> 4; + const int highbd = rsc->cm->seq_params->use_highbitdepth; + const uint64_t src_var = + var_restoration_unit(limits, rsc->src, rsc->plane, highbd); + // Do not perform Wiener search if source variance is lower than threshold + // or if the reconstruction error is zero + int prune_wiener = (src_var < thresh) || (rsc->sse[RESTORE_NONE] == 0); + if (prune_wiener) { + rsc->total_bits[RESTORE_WIENER] += bits_none; + rsc->total_sse[RESTORE_WIENER] += rsc->sse[RESTORE_NONE]; + rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE; + rsc->sse[RESTORE_WIENER] = INT64_MAX; + if (rsc->lpf_sf->prune_sgr_based_on_wiener == 2) rsc->skip_sgr_eval = 1; + return; + } + } + + const int wiener_win = + (rsc->plane == AOM_PLANE_Y) ? WIENER_WIN : WIENER_WIN_CHROMA; + + int reduced_wiener_win = wiener_win; + if (rsc->lpf_sf->reduce_wiener_window_size) { + reduced_wiener_win = + (rsc->plane == AOM_PLANE_Y) ? WIENER_WIN_REDUCED : WIENER_WIN_CHROMA; + } + + int64_t M[WIENER_WIN2]; + int64_t H[WIENER_WIN2 * WIENER_WIN2]; + int32_t vfilter[WIENER_WIN], hfilter[WIENER_WIN]; + +#if CONFIG_AV1_HIGHBITDEPTH + const AV1_COMMON *const cm = rsc->cm; + if (cm->seq_params->use_highbitdepth) { + // TODO(any) : Add support for use_downsampled_wiener_stats SF in HBD + // functions. Optimize intrinsics of HBD design similar to LBD (i.e., + // pre-calculate d and s buffers and avoid most of the C operations). 
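+    // The stats gathered here are the wiener_win2-element cross-correlation
+    // vector M and the wiener_win2 x wiener_win2 autocorrelation matrix H
+    // (wiener_win2 = reduced_wiener_win^2, at most WIENER_WIN2 = 49 for the
+    // 7x7 luma window).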
+    av1_compute_stats_highbd(reduced_wiener_win, rsc->dgd_buffer,
+                             rsc->src_buffer, limits->h_start, limits->h_end,
+                             limits->v_start, limits->v_end, rsc->dgd_stride,
+                             rsc->src_stride, M, H, cm->seq_params->bit_depth);
+  } else {
+    av1_compute_stats(reduced_wiener_win, rsc->dgd_buffer, rsc->src_buffer,
+                      rsc->dgd_avg, rsc->src_avg, limits->h_start,
+                      limits->h_end, limits->v_start, limits->v_end,
+                      rsc->dgd_stride, rsc->src_stride, M, H,
+                      rsc->lpf_sf->use_downsampled_wiener_stats);
+  }
+#else
+  av1_compute_stats(reduced_wiener_win, rsc->dgd_buffer, rsc->src_buffer,
+                    rsc->dgd_avg, rsc->src_avg, limits->h_start, limits->h_end,
+                    limits->v_start, limits->v_end, rsc->dgd_stride,
+                    rsc->src_stride, M, H,
+                    rsc->lpf_sf->use_downsampled_wiener_stats);
+#endif
+
+  wiener_decompose_sep_sym(reduced_wiener_win, M, H, vfilter, hfilter);
+
+  RestorationUnitInfo rui;
+  memset(&rui, 0, sizeof(rui));
+  rui.restoration_type = RESTORE_WIENER;
+  finalize_sym_filter(reduced_wiener_win, vfilter, rui.wiener_info.vfilter);
+  finalize_sym_filter(reduced_wiener_win, hfilter, rui.wiener_info.hfilter);
+
+  // The filter score computes the value of the function x'*H*x - 2*x'*M for
+  // the learned filter and compares it against the identity filter. If there
+  // is no reduction in the function value, the filter is reverted to identity.
+  if (compute_score(reduced_wiener_win, M, H, rui.wiener_info.vfilter,
+                    rui.wiener_info.hfilter) > 0) {
+    rsc->total_bits[RESTORE_WIENER] += bits_none;
+    rsc->total_sse[RESTORE_WIENER] += rsc->sse[RESTORE_NONE];
+    rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE;
+    rsc->sse[RESTORE_WIENER] = INT64_MAX;
+    if (rsc->lpf_sf->prune_sgr_based_on_wiener == 2) rsc->skip_sgr_eval = 1;
+    return;
+  }
+
+  rsc->sse[RESTORE_WIENER] =
+      finer_search_wiener(rsc, limits, &rui, reduced_wiener_win);
+  rusi->wiener = rui.wiener_info;
+
+  if (reduced_wiener_win != WIENER_WIN) {
+    assert(rui.wiener_info.vfilter[0] == 0 &&
+           rui.wiener_info.vfilter[WIENER_WIN - 1] == 0);
+    assert(rui.wiener_info.hfilter[0] == 0 &&
+           rui.wiener_info.hfilter[WIENER_WIN - 1] == 0);
+  }
+
+  const int64_t bits_wiener =
+      x->mode_costs.wiener_restore_cost[1] +
+      (count_wiener_bits(wiener_win, &rusi->wiener, &rsc->ref_wiener)
+       << AV1_PROB_COST_SHIFT);
+
+  double cost_none = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+      x->rdmult, bits_none >> 4, rsc->sse[RESTORE_NONE],
+      rsc->cm->seq_params->bit_depth);
+  double cost_wiener = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+      x->rdmult, bits_wiener >> 4, rsc->sse[RESTORE_WIENER],
+      rsc->cm->seq_params->bit_depth);
+
+  RestorationType rtype =
+      (cost_wiener < cost_none) ? RESTORE_WIENER : RESTORE_NONE;
+  rusi->best_rtype[RESTORE_WIENER - 1] = rtype;
+
+  // Set 'skip_sgr_eval' based on the rdcost ratio of RESTORE_WIENER and
+  // RESTORE_NONE, or based on best_rtype
+  if (rsc->lpf_sf->prune_sgr_based_on_wiener == 1) {
+    rsc->skip_sgr_eval = cost_wiener > (1.01 * cost_none);
+  } else if (rsc->lpf_sf->prune_sgr_based_on_wiener == 2) {
+    rsc->skip_sgr_eval = rusi->best_rtype[RESTORE_WIENER - 1] == RESTORE_NONE;
+  }
+
+#if DEBUG_LR_COSTING
+  // Store ref params for later checking
+  lr_ref_params[RESTORE_WIENER][rsc->plane][rest_unit_idx].wiener_info =
+      rsc->ref_wiener;
+#endif  // DEBUG_LR_COSTING
+
+  rsc->total_sse[RESTORE_WIENER] += rsc->sse[rtype];
+  rsc->total_bits[RESTORE_WIENER] +=
+      (cost_wiener < cost_none) ?
bits_wiener : bits_none;
+  if (cost_wiener < cost_none) rsc->ref_wiener = rusi->wiener;
+}
+
+static AOM_INLINE void search_norestore(
+    const RestorationTileLimits *limits, int rest_unit_idx, void *priv,
+    int32_t *tmpbuf, RestorationLineBuffers *rlbs,
+    struct aom_internal_error_info *error_info) {
+  (void)rest_unit_idx;
+  (void)tmpbuf;
+  (void)rlbs;
+  (void)error_info;
+
+  RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
+
+  const int highbd = rsc->cm->seq_params->use_highbitdepth;
+  rsc->sse[RESTORE_NONE] = sse_restoration_unit(
+      limits, rsc->src, &rsc->cm->cur_frame->buf, rsc->plane, highbd);
+
+  rsc->total_sse[RESTORE_NONE] += rsc->sse[RESTORE_NONE];
+}
+
+static AOM_INLINE void search_switchable(
+    const RestorationTileLimits *limits, int rest_unit_idx, void *priv,
+    int32_t *tmpbuf, RestorationLineBuffers *rlbs,
+    struct aom_internal_error_info *error_info) {
+  (void)limits;
+  (void)tmpbuf;
+  (void)rlbs;
+  (void)error_info;
+  RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
+  RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx];
+
+  const MACROBLOCK *const x = rsc->x;
+
+  const int wiener_win =
+      (rsc->plane == AOM_PLANE_Y) ? WIENER_WIN : WIENER_WIN_CHROMA;
+
+  double best_cost = 0;
+  int64_t best_bits = 0;
+  RestorationType best_rtype = RESTORE_NONE;
+
+  for (RestorationType r = 0; r < RESTORE_SWITCHABLE_TYPES; ++r) {
+    // If this restoration mode was skipped, or could not find a solution
+    // that was better than RESTORE_NONE, then we can't select it here either.
+    //
+    // Note: It is possible for the restoration search functions to find a
+    // filter which is better than RESTORE_NONE when looking purely at SSE, but
+    // for it to be rejected overall due to its rate cost. In this case, there
+    // is a chance that it may have a lower rate cost when looking at
+    // RESTORE_SWITCHABLE, and so it might be acceptable here.
+    //
+    // Therefore we prune based on SSE, rather than on whether or not the
+    // previous search function selected this mode.
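+    //
+    // As a sketch of why this matters: a Wiener candidate whose SSE is just
+    // below sse[RESTORE_NONE] may have lost on rate in search_wiener(), yet
+    // win here because switchable_restore_cost[] charges its signaling
+    // differently; pruning on SSE alone keeps that candidate alive.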
+ if (r > RESTORE_NONE) { + if (rsc->sse[r] > rsc->sse[RESTORE_NONE]) continue; + } + + const int64_t sse = rsc->sse[r]; + int64_t coeff_pcost = 0; + switch (r) { + case RESTORE_NONE: coeff_pcost = 0; break; + case RESTORE_WIENER: + coeff_pcost = count_wiener_bits(wiener_win, &rusi->wiener, + &rsc->switchable_ref_wiener); + break; + case RESTORE_SGRPROJ: + coeff_pcost = + count_sgrproj_bits(&rusi->sgrproj, &rsc->switchable_ref_sgrproj); + break; + default: assert(0); break; + } + const int64_t coeff_bits = coeff_pcost << AV1_PROB_COST_SHIFT; + const int64_t bits = x->mode_costs.switchable_restore_cost[r] + coeff_bits; + double cost = RDCOST_DBL_WITH_NATIVE_BD_DIST( + x->rdmult, bits >> 4, sse, rsc->cm->seq_params->bit_depth); + if (r == RESTORE_SGRPROJ && rusi->sgrproj.ep < 10) + cost *= (1 + DUAL_SGR_PENALTY_MULT * rsc->lpf_sf->dual_sgr_penalty_level); + if (r == 0 || cost < best_cost) { + best_cost = cost; + best_bits = bits; + best_rtype = r; + } + } + + rusi->best_rtype[RESTORE_SWITCHABLE - 1] = best_rtype; + +#if DEBUG_LR_COSTING + // Store ref params for later checking + lr_ref_params[RESTORE_SWITCHABLE][rsc->plane][rest_unit_idx].wiener_info = + rsc->switchable_ref_wiener; + lr_ref_params[RESTORE_SWITCHABLE][rsc->plane][rest_unit_idx].sgrproj_info = + rsc->switchable_ref_sgrproj; +#endif // DEBUG_LR_COSTING + + rsc->total_sse[RESTORE_SWITCHABLE] += rsc->sse[best_rtype]; + rsc->total_bits[RESTORE_SWITCHABLE] += best_bits; + if (best_rtype == RESTORE_WIENER) rsc->switchable_ref_wiener = rusi->wiener; + if (best_rtype == RESTORE_SGRPROJ) + rsc->switchable_ref_sgrproj = rusi->sgrproj; +} + +static AOM_INLINE void copy_unit_info(RestorationType frame_rtype, + const RestUnitSearchInfo *rusi, + RestorationUnitInfo *rui) { + assert(frame_rtype > 0); + rui->restoration_type = rusi->best_rtype[frame_rtype - 1]; + if (rui->restoration_type == RESTORE_WIENER) + rui->wiener_info = rusi->wiener; + else + rui->sgrproj_info = rusi->sgrproj; +} + +static void restoration_search(AV1_COMMON *cm, int plane, RestSearchCtxt *rsc, + bool *disable_lr_filter) { + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; + const int mib_size_log2 = cm->seq_params->mib_size_log2; + const CommonTileParams *tiles = &cm->tiles; + const int is_uv = plane > 0; + const int ss_y = is_uv && cm->seq_params->subsampling_y; + RestorationInfo *rsi = &cm->rst_info[plane]; + const int ru_size = rsi->restoration_unit_size; + const int ext_size = ru_size * 3 / 2; + + int plane_w, plane_h; + av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h); + + static const rest_unit_visitor_t funs[RESTORE_TYPES] = { + search_norestore, search_wiener, search_sgrproj, search_switchable + }; + + const int plane_num_units = rsi->num_rest_units; + const RestorationType num_rtypes = + (plane_num_units > 1) ? RESTORE_TYPES : RESTORE_SWITCHABLE_TYPES; + + reset_rsc(rsc); + + // Iterate over restoration units in encoding order, so that each RU gets + // the correct reference parameters when we cost it up. 
This is effectively
+  // a nested iteration over:
+  //  * Each tile, order does not matter
+  //  * Each superblock within that tile, in raster order
+  //  * Each LR unit which is coded within that superblock, in raster order
+  for (int tile_row = 0; tile_row < tiles->rows; tile_row++) {
+    int sb_row_start = tiles->row_start_sb[tile_row];
+    int sb_row_end = tiles->row_start_sb[tile_row + 1];
+    for (int tile_col = 0; tile_col < tiles->cols; tile_col++) {
+      int sb_col_start = tiles->col_start_sb[tile_col];
+      int sb_col_end = tiles->col_start_sb[tile_col + 1];
+
+      // Reset reference parameters for delta-coding at the start of each tile
+      rsc_on_tile(rsc);
+
+      for (int sb_row = sb_row_start; sb_row < sb_row_end; sb_row++) {
+        int mi_row = sb_row << mib_size_log2;
+        for (int sb_col = sb_col_start; sb_col < sb_col_end; sb_col++) {
+          int mi_col = sb_col << mib_size_log2;
+
+          int rcol0, rcol1, rrow0, rrow1;
+          int has_lr_info = av1_loop_restoration_corners_in_sb(
+              cm, plane, mi_row, mi_col, sb_size, &rcol0, &rcol1, &rrow0,
+              &rrow1);
+
+          if (!has_lr_info) continue;
+
+          RestorationTileLimits limits;
+          for (int rrow = rrow0; rrow < rrow1; rrow++) {
+            int y0 = rrow * ru_size;
+            int remaining_h = plane_h - y0;
+            int h = (remaining_h < ext_size) ? remaining_h : ru_size;
+
+            limits.v_start = y0;
+            limits.v_end = y0 + h;
+            assert(limits.v_end <= plane_h);
+            // Offset upwards to align with the restoration processing stripe
+            const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;
+            limits.v_start = AOMMAX(0, limits.v_start - voffset);
+            if (limits.v_end < plane_h) limits.v_end -= voffset;
+
+            for (int rcol = rcol0; rcol < rcol1; rcol++) {
+              int x0 = rcol * ru_size;
+              int remaining_w = plane_w - x0;
+              int w = (remaining_w < ext_size) ? remaining_w : ru_size;
+
+              limits.h_start = x0;
+              limits.h_end = x0 + w;
+              assert(limits.h_end <= plane_w);
+
+              const int unit_idx = rrow * rsi->horz_units + rcol;
+
+              rsc->skip_sgr_eval = 0;
+              for (RestorationType r = RESTORE_NONE; r < num_rtypes; r++) {
+                if (disable_lr_filter[r]) continue;
+
+                funs[r](&limits, unit_idx, rsc, rsc->cm->rst_tmpbuf, NULL,
+                        cm->error);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+static INLINE void av1_derive_flags_for_lr_processing(
+    const LOOP_FILTER_SPEED_FEATURES *lpf_sf, bool *disable_lr_filter) {
+  const bool is_wiener_disabled = lpf_sf->disable_wiener_filter;
+  const bool is_sgr_disabled = lpf_sf->disable_sgr_filter;
+
+  // Keep the None loop restoration type enabled if either of Wiener or
+  // Self-guided is enabled.
+  disable_lr_filter[RESTORE_NONE] = (is_wiener_disabled && is_sgr_disabled);
+
+  disable_lr_filter[RESTORE_WIENER] = is_wiener_disabled;
+  disable_lr_filter[RESTORE_SGRPROJ] = is_sgr_disabled;
+
+  // Enable the Switchable loop restoration type only if both Wiener and
+  // Self-guided are enabled.
+  disable_lr_filter[RESTORE_SWITCHABLE] =
+      (is_wiener_disabled || is_sgr_disabled);
+}
+
+#define COUPLED_CHROMA_FROM_LUMA_RESTORATION 0
+// Allocate both decoder-side and encoder-side info structs for a single plane.
+// The unit size passed in should be the minimum size which we are going to
+// search; before each search, set_restoration_unit_size() must be called to
+// configure the actual size.
+static RestUnitSearchInfo *allocate_search_structs(AV1_COMMON *cm,
+                                                   RestorationInfo *rsi,
+                                                   int is_uv,
+                                                   int min_luma_unit_size) {
+#if COUPLED_CHROMA_FROM_LUMA_RESTORATION
+  int sx = cm->seq_params.subsampling_x;
+  int sy = cm->seq_params.subsampling_y;
+  int s = (p > 0) ?
AOMMIN(sx, sy) : 0; +#else + int s = 0; +#endif // !COUPLED_CHROMA_FROM_LUMA_RESTORATION + int min_unit_size = min_luma_unit_size >> s; + + int plane_w, plane_h; + av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h); + + const int max_horz_units = av1_lr_count_units(min_unit_size, plane_w); + const int max_vert_units = av1_lr_count_units(min_unit_size, plane_h); + const int max_num_units = max_horz_units * max_vert_units; + + aom_free(rsi->unit_info); + CHECK_MEM_ERROR(cm, rsi->unit_info, + (RestorationUnitInfo *)aom_memalign( + 16, sizeof(*rsi->unit_info) * max_num_units)); + + RestUnitSearchInfo *rusi; + CHECK_MEM_ERROR( + cm, rusi, + (RestUnitSearchInfo *)aom_memalign(16, sizeof(*rusi) * max_num_units)); + + // If the restoration unit dimensions are not multiples of + // rsi->restoration_unit_size then some elements of the rusi array may be + // left uninitialised when we reach copy_unit_info(...). This is not a + // problem, as these elements are ignored later, but in order to quiet + // Valgrind's warnings we initialise the array below. + memset(rusi, 0, sizeof(*rusi) * max_num_units); + + return rusi; +} + +static void set_restoration_unit_size(AV1_COMMON *cm, RestorationInfo *rsi, + int is_uv, int luma_unit_size) { +#if COUPLED_CHROMA_FROM_LUMA_RESTORATION + int sx = cm->seq_params.subsampling_x; + int sy = cm->seq_params.subsampling_y; + int s = (p > 0) ? AOMMIN(sx, sy) : 0; +#else + int s = 0; +#endif // !COUPLED_CHROMA_FROM_LUMA_RESTORATION + int unit_size = luma_unit_size >> s; + + int plane_w, plane_h; + av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h); + + const int horz_units = av1_lr_count_units(unit_size, plane_w); + const int vert_units = av1_lr_count_units(unit_size, plane_h); + + rsi->restoration_unit_size = unit_size; + rsi->num_rest_units = horz_units * vert_units; + rsi->horz_units = horz_units; + rsi->vert_units = vert_units; +} + +void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->td.mb; + const SequenceHeader *const seq_params = cm->seq_params; + const LOOP_FILTER_SPEED_FEATURES *lpf_sf = &cpi->sf.lpf_sf; + const int num_planes = av1_num_planes(cm); + const int highbd = cm->seq_params->use_highbitdepth; + assert(!cm->features.all_lossless); + + av1_fill_lr_rates(&x->mode_costs, x->e_mbd.tile_ctx); + + // Select unit size based on speed feature settings, and allocate + // rui structs based on this size + int min_lr_unit_size = cpi->sf.lpf_sf.min_lr_unit_size; + int max_lr_unit_size = cpi->sf.lpf_sf.max_lr_unit_size; + + // The minimum allowed unit size at a syntax level is 1 superblock. + // Apply this constraint here so that the speed features code which sets + // cpi->sf.lpf_sf.min_lr_unit_size does not need to know the superblock size + min_lr_unit_size = + AOMMAX(min_lr_unit_size, block_size_wide[cm->seq_params->sb_size]); + + for (int plane = 0; plane < num_planes; ++plane) { + cpi->pick_lr_ctxt.rusi[plane] = allocate_search_structs( + cm, &cm->rst_info[plane], plane > 0, min_lr_unit_size); + } + + x->rdmult = cpi->rd.RDMULT; + + // Allocate the frame buffer trial_frame_rst, which is used to temporarily + // store the loop restored frame. 
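+  // Note the buffer is sized to the superres-upscaled dimensions: loop
+  // restoration is applied after any horizontal superres upscale, so the
+  // filter search also runs on the upscaled frame.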
+  if (aom_realloc_frame_buffer(
+          &cpi->trial_frame_rst, cm->superres_upscaled_width,
+          cm->superres_upscaled_height, seq_params->subsampling_x,
+          seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER,
+          cm->features.byte_alignment, NULL, NULL, NULL, 0, 0))
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate trial restored frame buffer");
+
+  RestSearchCtxt rsc;
+
+  // The buffers 'src_avg' and 'dgd_avg' are used to compute the H and M
+  // buffers. These buffers are only required for the AVX2 and NEON
+  // implementations of av1_compute_stats. The required buffer size is derived
+  // from the maximum width and height of an LRU allowed for Wiener filtering
+  // (i.e., from foreach_rest_unit_in_plane(), 1.5 times
+  // RESTORATION_UNITSIZE_MAX), with the width and height rounded up to a
+  // multiple of 16 for the intrinsics.
+  rsc.dgd_avg = NULL;
+  rsc.src_avg = NULL;
+#if HAVE_AVX2 || HAVE_NEON
+  // The buffers allocated below are used during Wiener filter processing in
+  // the low bitdepth path, so allocate them only when the Wiener filter is
+  // enabled in that path.
+  if (!cpi->sf.lpf_sf.disable_wiener_filter && !highbd) {
+    const int buf_size = sizeof(*cpi->pick_lr_ctxt.dgd_avg) * 6 *
+                         RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX;
+    CHECK_MEM_ERROR(cm, cpi->pick_lr_ctxt.dgd_avg,
+                    (int16_t *)aom_memalign(32, buf_size));
+
+    rsc.dgd_avg = cpi->pick_lr_ctxt.dgd_avg;
+    // When the LRU width isn't a multiple of 16, the 256-bit load instructions
+    // used in the AVX2 intrinsics can read data beyond the valid LRU. Hence,
+    // in order to silence Valgrind warnings this buffer is initialized with
+    // zeros. The overhead of this initialization is negligible since it is
+    // done at frame level.
+    memset(rsc.dgd_avg, 0, buf_size);
+    rsc.src_avg =
+        rsc.dgd_avg + 3 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX;
+    // Assert that the starting address of src_avg is always 32-byte aligned.
+    assert(!((intptr_t)rsc.src_avg % 32));
+  }
+#endif
+
+  // Initialize all planes, so that any planes we skip searching will still
+  // have valid data
+  for (int plane = 0; plane < num_planes; plane++) {
+    cm->rst_info[plane].frame_restoration_type = RESTORE_NONE;
+  }
+
+  // Decide which planes to search
+  int plane_start, plane_end;
+
+  if (lpf_sf->disable_loop_restoration_luma) {
+    plane_start = AOM_PLANE_U;
+  } else {
+    plane_start = AOM_PLANE_Y;
+  }
+
+  if (num_planes == 1 || lpf_sf->disable_loop_restoration_chroma) {
+    plane_end = AOM_PLANE_Y;
+  } else {
+    plane_end = AOM_PLANE_V;
+  }
+
+  // Derive the flags to enable/disable loop restoration filters based on the
+  // speed features 'disable_wiener_filter' and 'disable_sgr_filter'.
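+  // The searches that run for each flag combination are, in effect:
+  //   wiener on,  sgr on   -> NONE, WIENER, SGRPROJ and SWITCHABLE
+  //   wiener on,  sgr off  -> NONE and WIENER only
+  //   wiener off, sgr on   -> NONE and SGRPROJ only
+  //   wiener off, sgr off  -> no restoration search at all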
+ bool disable_lr_filter[RESTORE_TYPES] = { false }; + av1_derive_flags_for_lr_processing(lpf_sf, disable_lr_filter); + + for (int plane = plane_start; plane <= plane_end; plane++) { + const YV12_BUFFER_CONFIG *dgd = &cm->cur_frame->buf; + const int is_uv = plane != AOM_PLANE_Y; + int plane_w, plane_h; + av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h); + av1_extend_frame(dgd->buffers[plane], plane_w, plane_h, dgd->strides[is_uv], + RESTORATION_BORDER, RESTORATION_BORDER, highbd); + } + + double best_cost = DBL_MAX; + int best_luma_unit_size = max_lr_unit_size; + for (int luma_unit_size = max_lr_unit_size; + luma_unit_size >= min_lr_unit_size; luma_unit_size >>= 1) { + int64_t bits_this_size = 0; + int64_t sse_this_size = 0; + RestorationType best_rtype[MAX_MB_PLANE] = { RESTORE_NONE, RESTORE_NONE, + RESTORE_NONE }; + for (int plane = plane_start; plane <= plane_end; ++plane) { + set_restoration_unit_size(cm, &cm->rst_info[plane], plane > 0, + luma_unit_size); + init_rsc(src, &cpi->common, x, lpf_sf, plane, + cpi->pick_lr_ctxt.rusi[plane], &cpi->trial_frame_rst, &rsc); + + restoration_search(cm, plane, &rsc, disable_lr_filter); + + const int plane_num_units = cm->rst_info[plane].num_rest_units; + const RestorationType num_rtypes = + (plane_num_units > 1) ? RESTORE_TYPES : RESTORE_SWITCHABLE_TYPES; + double best_cost_this_plane = DBL_MAX; + for (RestorationType r = 0; r < num_rtypes; ++r) { + // Disable Loop restoration filter based on the flags set using speed + // feature 'disable_wiener_filter' and 'disable_sgr_filter'. + if (disable_lr_filter[r]) continue; + + double cost_this_plane = RDCOST_DBL_WITH_NATIVE_BD_DIST( + x->rdmult, rsc.total_bits[r] >> 4, rsc.total_sse[r], + cm->seq_params->bit_depth); + + if (cost_this_plane < best_cost_this_plane) { + best_cost_this_plane = cost_this_plane; + best_rtype[plane] = r; + } + } + + bits_this_size += rsc.total_bits[best_rtype[plane]]; + sse_this_size += rsc.total_sse[best_rtype[plane]]; + } + + double cost_this_size = RDCOST_DBL_WITH_NATIVE_BD_DIST( + x->rdmult, bits_this_size >> 4, sse_this_size, + cm->seq_params->bit_depth); + + if (cost_this_size < best_cost) { + best_cost = cost_this_size; + best_luma_unit_size = luma_unit_size; + // Copy parameters out of rusi struct, before we overwrite it at + // the start of the next iteration + bool all_none = true; + for (int plane = plane_start; plane <= plane_end; ++plane) { + cm->rst_info[plane].frame_restoration_type = best_rtype[plane]; + if (best_rtype[plane] != RESTORE_NONE) { + all_none = false; + const int plane_num_units = cm->rst_info[plane].num_rest_units; + for (int u = 0; u < plane_num_units; ++u) { + copy_unit_info(best_rtype[plane], &cpi->pick_lr_ctxt.rusi[plane][u], + &cm->rst_info[plane].unit_info[u]); + } + } + } + // Heuristic: If all best_rtype entries are RESTORE_NONE, this means we + // couldn't find any good filters at this size. 
So we likely won't find
+      // any good filters at a smaller size either, so skip
+      if (all_none) {
+        break;
+      }
+    } else {
+      // Heuristic: If this size is worse than the previous (larger) size, then
+      // the next size down will likely be even worse, so skip
+      break;
+    }
+  }
+
+  // Final fixup to set the correct unit size
+  // We set this for all planes, even ones we have skipped searching,
+  // so that other code does not need to care which planes were and weren't
+  // searched
+  for (int plane = 0; plane < num_planes; ++plane) {
+    set_restoration_unit_size(cm, &cm->rst_info[plane], plane > 0,
+                              best_luma_unit_size);
+  }
+
+#if HAVE_AVX2 || HAVE_NEON
+  // Must match the allocation guard above, which also uses
+  // HAVE_AVX2 || HAVE_NEON; otherwise dgd_avg would leak.
+  if (!cpi->sf.lpf_sf.disable_wiener_filter && !highbd) {
+    aom_free(cpi->pick_lr_ctxt.dgd_avg);
+    cpi->pick_lr_ctxt.dgd_avg = NULL;
+  }
+#endif
+  for (int plane = 0; plane < num_planes; plane++) {
+    aom_free(cpi->pick_lr_ctxt.rusi[plane]);
+    cpi->pick_lr_ctxt.rusi[plane] = NULL;
+  }
+}
diff --git a/third_party/aom/av1/encoder/pickrst.h b/third_party/aom/av1/encoder/pickrst.h
new file mode 100644
index 0000000000..d1d0b0cec6
--- /dev/null
+++ b/third_party/aom/av1/encoder/pickrst.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_ENCODER_PICKRST_H_
+#define AOM_AV1_ENCODER_PICKRST_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/encoder.h"
+
+struct yv12_buffer_config;
+struct AV1_COMP;
+
+// Enable extra debugging for loop restoration costing?
+//
+// If this is set to 1, then we record not just the selected LR parameters, but
+// also the values which the search process thinks they should be delta-coded
+// against. Then, when writing out the bitstream, we verify this information,
+// to help ensure that the search code is costing things properly
+#define DEBUG_LR_COSTING 0
+
+#if DEBUG_LR_COSTING
+#define MAX_LR_UNITS_W 64
+#define MAX_LR_UNITS_H 64
+
+// Storage for reference parameters.
+//
+// The storage size is determined by:
+// * This is always written and then checked within the same frame encode pass,
+//   so we do not need to buffer multiple frames of data
+// * The parameters can be different per plane within one frame
+// * The relevant set of ref parameters can differ between the search where
+//   we set the frame restoration mode to RESTORE_WIENER, and the search where
+//   we set it to RESTORE_SWITCHABLE.
+// So we need to store at least two sets of Wiener params and two sets of +// SGR params, and the easiest way to do this is to index by +// frame_restoration_type +extern RestorationUnitInfo lr_ref_params[RESTORE_TYPES][MAX_MB_PLANE] + [MAX_LR_UNITS_W * MAX_LR_UNITS_H]; +#endif // DEBUG_LR_COSTING + +static const uint8_t g_shuffle_stats_data[16] = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, +}; + +static const uint8_t g_shuffle_stats_highbd_data[32] = { + 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, + 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, +}; + +static INLINE uint8_t find_average(const uint8_t *src, int h_start, int h_end, + int v_start, int v_end, int stride) { + uint64_t sum = 0; + for (int i = v_start; i < v_end; i++) { + for (int j = h_start; j < h_end; j++) { + sum += src[i * stride + j]; + } + } + uint64_t avg = sum / ((v_end - v_start) * (h_end - h_start)); + return (uint8_t)avg; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE uint16_t find_average_highbd(const uint16_t *src, int h_start, + int h_end, int v_start, int v_end, + int stride) { + uint64_t sum = 0; + for (int i = v_start; i < v_end; i++) { + for (int j = h_start; j < h_end; j++) { + sum += src[i * stride + j]; + } + } + uint64_t avg = sum / ((v_end - v_start) * (h_end - h_start)); + return (uint16_t)avg; +} +#endif + +/*!\brief Algorithm for AV1 loop restoration search and estimation. + * + * \ingroup in_loop_restoration + * This function determines proper restoration filter types and + * associated parameters for each restoration unit in a frame. + * + * \param[in] sd Source frame buffer + * \param[in,out] cpi Top-level encoder structure + * + * \remark Nothing is returned. Instead, chosen restoration filter + * types and parameters are stored per plane in the \c rst_info structure + * of type \ref RestorationInfo inside \c cpi->common: + * \arg \c rst_info[ \c 0 ]: Chosen parameters for Y plane + * \arg \c rst_info[ \c 1 ]: Chosen parameters for U plane if it exists + * \arg \c rst_info[ \c 2 ]: Chosen parameters for V plane if it exists + * \par + * The following fields in each \c rst_info[ \c p], \c p = 0, 1, 2 + * are populated: + * \arg \c rst_info[ \c p ].\c frame_restoration_type + * \arg \c rst_info[ \c p ].\c unit_info[ \c u ], + * for each \c u in 0, 1, ..., \c n( \c p ) - 1, + * where \c n( \c p ) is the number of restoration units in plane \c p. + * \par + * The following fields in each \c rst_info[ \c p ].\c unit_info[ \c u ], + * \c p = 0, 1, 2 and \c u = 0, 1, ..., \c n( \c p ) - 1, of type + * \ref RestorationUnitInfo are populated: + * \arg \c rst_info[ \c p ].\c unit_info[ \c u ].\c restoration_type + * \arg \c rst_info[ \c p ].\c unit_info[ \c u ].\c wiener_info OR + * \c rst_info[ \c p ].\c unit_info[ \c u ].\c sgrproj_info OR + * neither, depending on + * \c rst_info[ \c p ].\c unit_info[ \c u ].\c restoration_type + * + */ +void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_PICKRST_H_ diff --git a/third_party/aom/av1/encoder/pustats.h b/third_party/aom/av1/encoder/pustats.h new file mode 100644 index 0000000000..2e8710108b --- /dev/null +++ b/third_party/aom/av1/encoder/pustats.h @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_PUSTATS_H_ +#define AOM_AV1_ENCODER_PUSTATS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/ml.h" + +#define NUM_FEATURES_PUSTATS 8 +#define NUM_HIDDEN_LAYERS 2 +#define HIDDEN_LAYERS_0_NODES 12 +#define HIDDEN_LAYERS_1_NODES 10 +#define LOGITS_NODES 1 + +static const float + av1_pustats_rate_hiddenlayer_0_kernel[NUM_FEATURES_PUSTATS * + HIDDEN_LAYERS_0_NODES] = { + -0.1758f, -0.0499f, -10.0069f, -2.2838f, -0.3359f, 0.3459f, -0.3285f, + -0.0515f, -0.5417f, 0.2357f, -0.0575f, -69.0782f, 0.5348f, 1.4068f, + 0.2213f, -1.0490f, -0.0636f, 0.1654f, 1.1002f, 33.4924f, 0.4358f, + 1.2499f, 0.1143f, 0.0592f, -1.6335f, -0.0092f, 1.2207f, -28.4543f, + -0.4973f, 0.4368f, 0.2341f, -0.1623f, -3.8986f, 0.1311f, -1.8789f, + -3.9079f, -0.8158f, -0.8420f, 1.4295f, -2.3629f, -1.4825f, 0.6498f, + -5.3669f, 6.4434f, 1.8393f, -35.0678f, 3.7459f, -2.8504f, 2.0502f, + -0.1812f, -3.9011f, -1.0155f, 1.8375f, -1.4517f, 1.3917f, 3.8664f, + 0.8345f, -0.3472f, 5.7740f, -1.1196f, -0.3264f, -1.2481f, -0.9284f, + -4.9657f, 2.2831f, 0.7337f, 2.3176f, 0.6416f, 0.8804f, 1.9988f, + -1.3426f, 1.2728f, 1.2249f, -0.1551f, 5.6045f, 0.2046f, -2.1464f, + -2.4922f, -0.5334f, 12.1055f, 7.2467f, -0.0070f, 0.0234f, 0.0021f, + 0.0215f, -0.0098f, -0.0682f, -6.1494f, -0.3176f, -1.6069f, -0.2119f, + -1.0533f, -0.3566f, 0.5294f, -0.4335f, 0.1626f, + }; + +static const float + av1_pustats_rate_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] = { + 10.5266f, 5.3268f, -1.0678f, 7.7411f, 8.7164f, -0.3235f, + 7.3028f, 9.0874f, -6.4594f, -1.0102f, -1.1146f, 10.8419f, + }; + +static const float + av1_pustats_rate_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES * + HIDDEN_LAYERS_1_NODES] = { + 10.5932f, 2.5192f, -0.0015f, 5.9479f, 5.2426f, -0.4091f, 5.3220f, + 6.0469f, 0.7200f, 3.3241f, 5.5006f, 12.8290f, -1.6396f, 0.5743f, + -0.8370f, 1.9956f, -4.9270f, -1.5295f, 2.1350f, -9.4415f, -0.7094f, + 5.1822f, 19.7287f, -3.0444f, -0.3320f, 0.0031f, -0.2709f, -0.5249f, + 0.3281f, -0.2240f, 0.2225f, -0.2386f, -0.4370f, -0.2438f, -0.4928f, + -0.2842f, -2.1772f, 9.2570f, -17.6655f, 3.5448f, -2.8394f, -1.0167f, + -0.5115f, -1.9260f, -0.2111f, -0.7528f, -1.2387f, -0.0401f, 5.0716f, + -3.3763f, -0.2898f, -0.4956f, -7.9993f, 0.1526f, -0.0242f, 0.7354f, + 6.0432f, 4.8043f, 7.4790f, -0.6295f, 1.7565f, 3.7197f, -2.3963f, + 6.8945f, 2.9717f, -3.1623f, 3.4241f, 4.4676f, -1.8154f, -2.9401f, + -8.5657f, -3.0240f, -1.4661f, 8.1145f, -12.7858f, 3.3624f, -1.0819f, + -4.2856f, 1.1801f, -0.5587f, -1.6062f, -1.1813f, -3.5882f, -0.2490f, + -24.9566f, -0.4140f, -0.1113f, 3.5537f, 4.4112f, 0.1367f, -1.5876f, + 1.6605f, 1.3903f, -0.0253f, -2.1419f, -2.2197f, -0.7659f, -0.4249f, + -0.0424f, 0.1486f, 0.4643f, -0.9068f, -0.3619f, -0.7624f, -0.9132f, + -0.4947f, -0.3527f, -0.5445f, -0.4768f, -1.7761f, -1.0686f, 0.5462f, + 1.3371f, 4.3116f, 0.0777f, -2.7216f, -1.8908f, 3.4989f, 7.7269f, + -2.7566f, + }; + +static const float + av1_pustats_rate_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] = { + 13.2435f, -8.5477f, -0.0998f, -1.5131f, -12.0187f, + 6.1715f, 0.5094f, 7.6433f, -0.3992f, -1.3555f, + }; + +static const float + av1_pustats_rate_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = { + 4.3078f, -17.3497f, 0.0195f, 
34.6032f, -5.0127f, + 5.3079f, 10.0077f, -13.129f, 0.0087f, -8.4009f, + }; + +static const float av1_pustats_rate_logits_bias[LOGITS_NODES] = { + 4.5103f, +}; + +static const NN_CONFIG av1_pustats_rate_nnconfig = { + NUM_FEATURES_PUSTATS, // num_inputs + LOGITS_NODES, // num_outputs + NUM_HIDDEN_LAYERS, // num_hidden_layers + { HIDDEN_LAYERS_0_NODES, HIDDEN_LAYERS_1_NODES }, // num_hidden_nodes + { + av1_pustats_rate_hiddenlayer_0_kernel, + av1_pustats_rate_hiddenlayer_1_kernel, + av1_pustats_rate_logits_kernel, + }, + { + av1_pustats_rate_hiddenlayer_0_bias, + av1_pustats_rate_hiddenlayer_1_bias, + av1_pustats_rate_logits_bias, + }, +}; + +static const float + av1_pustats_dist_hiddenlayer_0_kernel[NUM_FEATURES_PUSTATS * + HIDDEN_LAYERS_0_NODES] = { + -0.2560f, 0.1105f, -0.8434f, -0.0132f, -8.9371f, -1.1176f, -0.3655f, + 0.4885f, 1.7518f, 0.4985f, 0.5582f, -0.3739f, 0.9403f, 0.3874f, + 0.3265f, 1.7383f, 3.1747f, 0.0285f, 3.3942f, -0.0123f, 0.5057f, + 0.1584f, 0.2697f, 4.6151f, 3.6251f, -0.0121f, -1.0047f, -0.0037f, + 0.0127f, 0.1935f, -0.5277f, -2.7144f, 0.0729f, -0.1457f, -0.0816f, + -0.5462f, 0.4738f, 0.3599f, -0.0564f, 0.0910f, 0.0126f, -0.0310f, + -2.1311f, -0.4666f, -0.0074f, -0.0765f, 0.0287f, -0.2662f, -0.0999f, + -0.2983f, -0.4899f, -0.2314f, 0.2873f, -0.3614f, 0.1783f, -0.1210f, + 0.3569f, 0.5436f, -8.0536f, -0.0044f, -1.5255f, -0.8247f, -0.4556f, + 1.9045f, 0.5463f, 0.1102f, -0.9293f, -0.0185f, -0.8302f, -0.4378f, + -0.3531f, -1.3095f, 0.6099f, 0.7977f, 4.1950f, -0.0067f, -0.2762f, + -0.1574f, -0.2149f, 0.6104f, -1.7053f, 0.1904f, 4.2402f, -0.2671f, + 0.8940f, 0.6820f, 0.2241f, -0.9459f, 1.4571f, 0.5255f, 2.3352f, + -0.0806f, 0.5231f, 0.3928f, 0.4146f, 2.0956f, + }; + +static const float + av1_pustats_dist_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] = { + 1.1597f, 0.0836f, -0.7471f, -0.2439f, -0.0438f, 2.4626f, + 0.f, 1.1485f, 2.7085f, -4.7897f, 1.4093f, -1.657f, + }; + +static const float + av1_pustats_dist_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES * + HIDDEN_LAYERS_1_NODES] = { + -0.5203f, -1.3468f, 0.3865f, -0.6859f, 0.0058f, 4.0682f, 0.4807f, + -0.1380f, 0.6050f, 0.8958f, 0.7748f, -0.1311f, 1.7317f, 1.1265f, + 0.0827f, 0.1407f, -0.3605f, 0.5429f, 0.1880f, -0.1439f, 0.2837f, + 1.6477f, 0.0832f, 0.0593f, -1.8464f, -0.7241f, -1.0672f, -0.3546f, + -0.3842f, -2.3637f, 0.2514f, 0.8263f, -0.1872f, 0.5774f, -0.3610f, + -0.0205f, 1.3977f, -0.1083f, 0.6923f, 1.3039f, -0.2870f, 1.0622f, + -0.0566f, 0.2697f, -0.5429f, -0.6193f, 1.7559f, 0.3246f, 1.9159f, + 0.3744f, 0.0686f, 1.0191f, -0.4212f, 1.9591f, -0.0691f, -0.1085f, + -1.2034f, 0.0606f, 1.0116f, 0.5565f, -0.1874f, -0.7898f, 0.4796f, + 0.2290f, 0.4334f, -0.5817f, -0.2949f, 0.1367f, -0.2932f, -1.1265f, + 0.0133f, -0.5309f, -3.3191f, 0.0939f, 0.3895f, -2.5812f, -0.0066f, + -3.0063f, -0.2982f, 0.7309f, -0.2422f, -0.2770f, -0.7152f, 0.1700f, + 1.9630f, 0.1988f, 0.4194f, 0.8762f, 0.3402f, 0.1051f, -0.1598f, + 0.2405f, 0.0392f, 1.1256f, 1.5245f, 0.0950f, 0.2160f, -0.5023f, + 0.2584f, 0.2074f, 0.2218f, 0.3966f, -0.0921f, -0.2435f, -0.4560f, + -1.1923f, -0.3716f, -0.3286f, -1.3225f, 0.1896f, -0.3342f, -0.7888f, + -0.4488f, -1.7168f, 0.3341f, 0.1146f, 0.5226f, 0.2610f, -0.4574f, + -0.4164f, + }; + +static const float + av1_pustats_dist_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] = { + -2.3014f, -2.4292f, 1.3317f, -3.2361f, -1.918f, + 2.7149f, -2.5649f, 2.7765f, 2.9617f, 2.7684f, + }; + +static const float + av1_pustats_dist_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = { + -0.6868f, -0.6715f, 0.449f, -1.293f, 0.6214f, + 0.9894f, 
-0.4342f, 0.7002f, 1.4363f, 0.6951f,
+  };
+
+static const float av1_pustats_dist_logits_bias[LOGITS_NODES] = {
+  2.3371f,
+};
+
+static const NN_CONFIG av1_pustats_dist_nnconfig = {
+  NUM_FEATURES_PUSTATS,                             // num_inputs
+  LOGITS_NODES,                                     // num_outputs
+  NUM_HIDDEN_LAYERS,                                // num_hidden_layers
+  { HIDDEN_LAYERS_0_NODES, HIDDEN_LAYERS_1_NODES },  // num_hidden_nodes
+  {
+      av1_pustats_dist_hiddenlayer_0_kernel,
+      av1_pustats_dist_hiddenlayer_1_kernel,
+      av1_pustats_dist_logits_kernel,
+  },
+  {
+      av1_pustats_dist_hiddenlayer_0_bias,
+      av1_pustats_dist_hiddenlayer_1_bias,
+      av1_pustats_dist_logits_bias,
+  },
+};
+
+#undef NUM_HIDDEN_LAYERS
+#undef HIDDEN_LAYERS_0_NODES
+#undef HIDDEN_LAYERS_1_NODES
+#undef LOGITS_NODES
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_PUSTATS_H_
diff --git a/third_party/aom/av1/encoder/random.h b/third_party/aom/av1/encoder/random.h new file mode 100644 index 0000000000..efe909b6db --- /dev/null +++ b/third_party/aom/av1/encoder/random.h @@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RANDOM_H_
+#define AOM_AV1_ENCODER_RANDOM_H_
+
+#include <assert.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Advance the generator to its next state, and generate the next 32-bit
+// output.
+// Note that the low bits of this output are comparatively low-quality, so
+// users of this function should ensure that the high bits factor through to
+// their outputs.
+static INLINE uint32_t lcg_next(uint32_t *state) {
+  *state = (uint32_t)(*state * 1103515245ULL + 12345);
+  return *state;
+}
+
+// Generate a random number in the range [0, 32768).
+static INLINE uint32_t lcg_rand16(uint32_t *state) {
+  return (lcg_next(state) / 65536) % 32768;
+}
+
+// Generate a random number in the range [0, n).
+// This is implemented as (rand() * n) / 2^32 rather than rand() % n, for a
+// few reasons: This implementation is faster and less biased, and if n is a
+// power of 2, this uses the higher-quality top bits from the RNG output
+// rather than the lower-quality bottom bits.
+static INLINE uint32_t lcg_randint(uint32_t *state, uint32_t n) {
+  uint64_t v = ((uint64_t)lcg_next(state) * n) >> 32;
+  return (uint32_t)v;
+}
+
+// Generate a random number in the range [lo, hi)
+static INLINE uint32_t lcg_randrange(uint32_t *state, uint32_t lo,
+                                     uint32_t hi) {
+  assert(lo < hi);
+  return lo + lcg_randint(state, hi - lo);
+}
+
+// Pick k distinct numbers from the set {0, ..., n-1}
+// All possible sets of k numbers, and all possible orderings of those numbers,
+// are equally likely.
+//
+// Note: The algorithm used here uses resampling to avoid choosing repeated
+// values. This works well as long as n >> k, but can potentially lead to many
+// resampling attempts if n is equal to or only slightly larger than k.
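+//
+// A purely illustrative usage sketch (hypothetical values, not part of the
+// library):
+//   unsigned int seed = 5678;
+//   int out[3];
+//   lcg_pick(10, 3, out, &seed);  // out[] now holds 3 distinct values in
+//                                 // [0, 10)
+// Each call advances the seed state, so repeated calls give fresh picks.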
+static INLINE void lcg_pick(int n, int k, int *out, unsigned int *seed) {
+  assert(0 <= k && k <= n);
+  for (int i = 0; i < k; i++) {
+    int v;
+
+    // Inner resampling loop
+    // We have to use a goto here because C does not have a multi-level
+    // continue statement
+  resample:
+    v = (int)lcg_randint(seed, n);
+    for (int j = 0; j < i; j++) {
+      if (v == out[j]) {
+        // Repeated v, resample
+        goto resample;
+      }
+    }
+
+    // New v, accept
+    out[i] = v;
+  }
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_RANDOM_H_
diff --git a/third_party/aom/av1/encoder/ratectrl.c b/third_party/aom/av1/encoder/ratectrl.c new file mode 100644 index 0000000000..df86380272 --- /dev/null +++ b/third_party/aom/av1/encoder/ratectrl.c @@ -0,0 +1,3587 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/aom_once.h"
+
+#include "av1/common/alloccommon.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/common/common.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encode_strategy.h"
+#include "av1/encoder/gop_structure.h"
+#include "av1/encoder/random.h"
+#include "av1/encoder/ratectrl.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+#define USE_UNRESTRICTED_Q_IN_CQ_MODE 0
+
+// Max rate target for 1080P and below encodes under normal circumstances
+// (1920 * 1080 / (16 * 16)) * MAX_MB_RATE bits per MB
+#define MAX_MB_RATE 250
+#define MAXRATE_1080P 2025000
+
+#define MIN_BPB_FACTOR 0.005
+#define MAX_BPB_FACTOR 50
+
+#define SUPERRES_QADJ_PER_DENOM_KEYFRAME_SOLO 0
+#define SUPERRES_QADJ_PER_DENOM_KEYFRAME 2
+#define SUPERRES_QADJ_PER_DENOM_ARFFRAME 0
+
+#define FRAME_OVERHEAD_BITS 200
+#define ASSIGN_MINQ_TABLE(bit_depth, name)       \
+  do {                                           \
+    switch (bit_depth) {                         \
+      case AOM_BITS_8: name = name##_8; break;   \
+      case AOM_BITS_10: name = name##_10; break; \
+      case AOM_BITS_12: name = name##_12; break; \
+      default:                                   \
+        assert(0 &&                              \
+               "bit_depth should be AOM_BITS_8, AOM_BITS_10" \
+               " or AOM_BITS_12");               \
+        name = NULL;                             \
+    }                                            \
+  } while (0)
+
+// Tables relating active max Q to active min Q
+static int kf_low_motion_minq_8[QINDEX_RANGE];
+static int kf_high_motion_minq_8[QINDEX_RANGE];
+static int arfgf_low_motion_minq_8[QINDEX_RANGE];
+static int arfgf_high_motion_minq_8[QINDEX_RANGE];
+static int inter_minq_8[QINDEX_RANGE];
+static int rtc_minq_8[QINDEX_RANGE];
+
+static int kf_low_motion_minq_10[QINDEX_RANGE];
+static int kf_high_motion_minq_10[QINDEX_RANGE];
+static int arfgf_low_motion_minq_10[QINDEX_RANGE];
+static int arfgf_high_motion_minq_10[QINDEX_RANGE];
+static int inter_minq_10[QINDEX_RANGE];
+static int rtc_minq_10[QINDEX_RANGE];
+static int kf_low_motion_minq_12[QINDEX_RANGE];
+static int kf_high_motion_minq_12[QINDEX_RANGE];
+static int arfgf_low_motion_minq_12[QINDEX_RANGE];
+static int arfgf_high_motion_minq_12[QINDEX_RANGE];
+static int inter_minq_12[QINDEX_RANGE];
+static int rtc_minq_12[QINDEX_RANGE];
+
+static int gf_high = 2400;
+static int gf_low = 300;
+#ifdef STRICT_RC
+static int kf_high = 3200;
+#else
+static int kf_high = 5000;
+#endif
+static int kf_low = 400;
+
+// How many times fewer pixels there are to encode given the current scaling.
+// Temporary replacement for rcf_mult and rate_thresh_mult.
+static double resize_rate_factor(const FrameDimensionCfg *const frm_dim_cfg,
+                                 int width, int height) {
+  return (double)(frm_dim_cfg->width * frm_dim_cfg->height) / (width * height);
+}
+
+// Functions to compute the active minq lookup table entries based on a
+// formulaic approach to facilitate easier adjustment of the Q tables.
+// The formulae were derived from computing a 3rd order polynomial best
+// fit to the original data (after plotting real maxq vs minq (not q index))
+static int get_minq_index(double maxq, double x3, double x2, double x1,
+                          aom_bit_depth_t bit_depth) {
+  const double minqtarget = AOMMIN(((x3 * maxq + x2) * maxq + x1) * maxq, maxq);
+
+  // Special case handling to deal with the step from q2.0
+  // down to lossless mode represented by q 1.0.
+  if (minqtarget <= 2.0) return 0;
+
+  return av1_find_qindex(minqtarget, bit_depth, 0, QINDEX_RANGE - 1);
+}
+
+static void init_minq_luts(int *kf_low_m, int *kf_high_m, int *arfgf_low,
+                           int *arfgf_high, int *inter, int *rtc,
+                           aom_bit_depth_t bit_depth) {
+  int i;
+  for (i = 0; i < QINDEX_RANGE; i++) {
+    const double maxq = av1_convert_qindex_to_q(i, bit_depth);
+    kf_low_m[i] = get_minq_index(maxq, 0.000001, -0.0004, 0.150, bit_depth);
+    kf_high_m[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.45, bit_depth);
+    arfgf_low[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.30, bit_depth);
+    arfgf_high[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth);
+    inter[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.90, bit_depth);
+    rtc[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.70, bit_depth);
+  }
+}
+
+static void rc_init_minq_luts(void) {
+  init_minq_luts(kf_low_motion_minq_8, kf_high_motion_minq_8,
+                 arfgf_low_motion_minq_8, arfgf_high_motion_minq_8,
+                 inter_minq_8, rtc_minq_8, AOM_BITS_8);
+  init_minq_luts(kf_low_motion_minq_10, kf_high_motion_minq_10,
+                 arfgf_low_motion_minq_10, arfgf_high_motion_minq_10,
+                 inter_minq_10, rtc_minq_10, AOM_BITS_10);
+  init_minq_luts(kf_low_motion_minq_12, kf_high_motion_minq_12,
+                 arfgf_low_motion_minq_12, arfgf_high_motion_minq_12,
+                 inter_minq_12, rtc_minq_12, AOM_BITS_12);
+}
+
+void av1_rc_init_minq_luts(void) { aom_once(rc_init_minq_luts); }
+
+// These functions use formulaic calculations to make playing with the
+// quantizer tables easier. If necessary they can be replaced by lookup
+// tables if and when things settle down in the experimental bitstream
+double av1_convert_qindex_to_q(int qindex, aom_bit_depth_t bit_depth) {
+  // Convert the index to a real Q value (scaled down to match old Q values)
+  switch (bit_depth) {
+    case AOM_BITS_8: return av1_ac_quant_QTX(qindex, 0, bit_depth) / 4.0;
+    case AOM_BITS_10: return av1_ac_quant_QTX(qindex, 0, bit_depth) / 16.0;
+    case AOM_BITS_12: return av1_ac_quant_QTX(qindex, 0, bit_depth) / 64.0;
+    default:
+      assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+      return -1.0;
+  }
+}
+
+int av1_get_bpmb_enumerator(FRAME_TYPE frame_type,
+                            const int is_screen_content_type) {
+  int enumerator;
+
+  if (is_screen_content_type) {
+    enumerator = (frame_type == KEY_FRAME) ? 1000000 : 750000;
+  } else {
+    enumerator = (frame_type == KEY_FRAME) ? 2000000 : 1500000;
+  }
+
+  return enumerator;
+}
+
+static int get_init_ratio(double sse) { return (int)(300000 / sse); }
+
+int av1_rc_bits_per_mb(const AV1_COMP *cpi, FRAME_TYPE frame_type, int qindex,
+                       double correction_factor, int accurate_estimate) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int is_screen_content_type = cpi->is_screen_content_type;
+  const aom_bit_depth_t bit_depth = cm->seq_params->bit_depth;
+  const double q = av1_convert_qindex_to_q(qindex, bit_depth);
+  int enumerator = av1_get_bpmb_enumerator(frame_type, is_screen_content_type);
+
+  assert(correction_factor <= MAX_BPB_FACTOR &&
+         correction_factor >= MIN_BPB_FACTOR);
+
+  if (cpi->oxcf.rc_cfg.mode == AOM_CBR && frame_type != KEY_FRAME &&
+      accurate_estimate && cpi->rec_sse != UINT64_MAX) {
+    const int mbs = cm->mi_params.MBs;
+    const double sse_sqrt =
+        (double)((int)sqrt((double)(cpi->rec_sse)) << BPER_MB_NORMBITS) /
+        (double)mbs;
+    const int ratio = (cpi->rc.bit_est_ratio == 0) ? get_init_ratio(sse_sqrt)
+                                                   : cpi->rc.bit_est_ratio;
+    // Clamp the enumerator to lower the q fluctuations.
+    enumerator = AOMMIN(AOMMAX((int)(ratio * sse_sqrt), 20000), 170000);
+  }
+
+  // q based adjustment to baseline enumerator
+  return (int)(enumerator * correction_factor / q);
+}
+
+int av1_estimate_bits_at_q(const AV1_COMP *cpi, int q,
+                           double correction_factor) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+  const int mbs = cm->mi_params.MBs;
+  const int bpm =
+      (int)(av1_rc_bits_per_mb(cpi, frame_type, q, correction_factor,
+                               cpi->sf.hl_sf.accurate_bit_estimate));
+  return AOMMAX(FRAME_OVERHEAD_BITS,
+                (int)((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS);
+}
+
+int av1_rc_clamp_pframe_target_size(const AV1_COMP *const cpi, int target,
+                                    FRAME_UPDATE_TYPE frame_update_type) {
+  const RATE_CONTROL *rc = &cpi->rc;
+  const AV1EncoderConfig *oxcf = &cpi->oxcf;
+  const int min_frame_target =
+      AOMMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5);
+  // Clip the frame target to the minimum setup value.
+  if (frame_update_type == OVERLAY_UPDATE ||
+      frame_update_type == INTNL_OVERLAY_UPDATE) {
+    // If there is an active ARF at this location use the minimum
+    // bits on this frame even if it is a constructed arf.
+    // The active maximum quantizer ensures that an appropriate
+    // number of bits will be spent if needed for constructed ARFs.
+    target = min_frame_target;
+  } else if (target < min_frame_target) {
+    target = min_frame_target;
+  }
+
+  // Clip the frame target to the maximum allowed value.
+  if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth;
+  if (oxcf->rc_cfg.max_inter_bitrate_pct) {
+    const int max_rate =
+        rc->avg_frame_bandwidth * oxcf->rc_cfg.max_inter_bitrate_pct / 100;
+    target = AOMMIN(target, max_rate);
+  }
+
+  return target;
+}
+
+int av1_rc_clamp_iframe_target_size(const AV1_COMP *const cpi, int64_t target) {
+  const RATE_CONTROL *rc = &cpi->rc;
+  const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg;
+  if (rc_cfg->max_intra_bitrate_pct) {
+    const int64_t max_rate =
+        (int64_t)rc->avg_frame_bandwidth * rc_cfg->max_intra_bitrate_pct / 100;
+    target = AOMMIN(target, max_rate);
+  }
+  if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth;
+  return (int)target;
+}
+
+// Update the buffer level for higher temporal layers, given the encoded
+// current temporal layer.
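+// As a worked example with hypothetical numbers: a layer with a
+// target_bandwidth of 400000 bps at a framerate of 20 fps has a per-frame
+// budget of round(400000 / 20) = 20000 bits, so a frame that encodes to
+// 26000 bits moves that layer's bits_off_target down by 6000, while a
+// 14000-bit frame moves it up by 6000 (the result is then clipped to the
+// layer's maximum buffer size).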
+static void update_layer_buffer_level(SVC *svc, int encoded_frame_size,
+                                      bool is_screen) {
+  const int current_temporal_layer = svc->temporal_layer_id;
+  for (int i = current_temporal_layer + 1; i < svc->number_temporal_layers;
+       ++i) {
+    const int layer =
+        LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, svc->number_temporal_layers);
+    LAYER_CONTEXT *lc = &svc->layer_context[layer];
+    PRIMARY_RATE_CONTROL *lp_rc = &lc->p_rc;
+    lp_rc->bits_off_target +=
+        (int)round(lc->target_bandwidth / lc->framerate) - encoded_frame_size;
+    // Clip buffer level to maximum buffer size for the layer.
+    lp_rc->bits_off_target =
+        AOMMIN(lp_rc->bits_off_target, lp_rc->maximum_buffer_size);
+    lp_rc->buffer_level = lp_rc->bits_off_target;
+
+    // For screen-content mode: don't let the buffer level go below the
+    // threshold, given here as -rc->maximum_buffer_size, to allow the buffer
+    // to come back up sooner after a slide change with big overshoot.
+    if (is_screen) {
+      lp_rc->bits_off_target =
+          AOMMAX(lp_rc->bits_off_target, -lp_rc->maximum_buffer_size);
+      lp_rc->buffer_level = lp_rc->bits_off_target;
+    }
+  }
+}
+// Update the buffer level: leaky bucket model.
+static void update_buffer_level(AV1_COMP *cpi, int encoded_frame_size) {
+  const AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+
+  // Non-viewable frames are a special case and are treated as pure overhead.
+  if (!cm->show_frame)
+    p_rc->bits_off_target -= encoded_frame_size;
+  else
+    p_rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size;
+
+  // Clip the buffer level to the maximum specified buffer size.
+  p_rc->bits_off_target =
+      AOMMIN(p_rc->bits_off_target, p_rc->maximum_buffer_size);
+  // For screen-content mode: don't let the buffer level go below the
+  // threshold, given here as -rc->maximum_buffer_size, to allow the buffer
+  // to come back up sooner after a slide change with big overshoot.
+  if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN)
+    p_rc->bits_off_target =
+        AOMMAX(p_rc->bits_off_target, -p_rc->maximum_buffer_size);
+  p_rc->buffer_level = p_rc->bits_off_target;
+
+  if (cpi->ppi->use_svc)
+    update_layer_buffer_level(&cpi->svc, encoded_frame_size,
+                              cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN);
+
+#if CONFIG_FPMT_TEST
+  /* The variable temp_buffer_level is introduced for quality
+   * simulation purposes; it retains the value prior to the parallel
+   * encode frames. The variable is updated based on the update flag.
+   *
+   * If there are show_existing_frames between the parallel frames, do not
+   * update it, so that the temp state is retained.
*/ + int show_existing_between_parallel_frames = + (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == + INTNL_OVERLAY_UPDATE && + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index + 1] == 2); + + if (cpi->do_frame_data_update && !show_existing_between_parallel_frames && + cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) { + p_rc->temp_buffer_level = p_rc->buffer_level; + } +#endif +} + +int av1_rc_get_default_min_gf_interval(int width, int height, + double framerate) { + // Assume we do not need any constraint lower than 4K 20 fps + static const double factor_safe = 3840 * 2160 * 20.0; + const double factor = (double)width * height * framerate; + const int default_interval = + clamp((int)(framerate * 0.125), MIN_GF_INTERVAL, MAX_GF_INTERVAL); + + if (factor <= factor_safe) + return default_interval; + else + return AOMMAX(default_interval, + (int)(MIN_GF_INTERVAL * factor / factor_safe + 0.5)); + // Note this logic makes: + // 4K24: 5 + // 4K30: 6 + // 4K60: 12 +} + +int av1_rc_get_default_max_gf_interval(double framerate, int min_gf_interval) { + int interval = AOMMIN(MAX_GF_INTERVAL, (int)(framerate * 0.75)); + interval += (interval & 0x01); // Round to even value + interval = AOMMAX(MAX_GF_INTERVAL, interval); + return AOMMAX(interval, min_gf_interval); +} + +void av1_primary_rc_init(const AV1EncoderConfig *oxcf, + PRIMARY_RATE_CONTROL *p_rc) { + const RateControlCfg *const rc_cfg = &oxcf->rc_cfg; + + int worst_allowed_q = rc_cfg->worst_allowed_q; + + int min_gf_interval = oxcf->gf_cfg.min_gf_interval; + int max_gf_interval = oxcf->gf_cfg.max_gf_interval; + if (min_gf_interval == 0) + min_gf_interval = av1_rc_get_default_min_gf_interval( + oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height, + oxcf->input_cfg.init_framerate); + if (max_gf_interval == 0) + max_gf_interval = av1_rc_get_default_max_gf_interval( + oxcf->input_cfg.init_framerate, min_gf_interval); + p_rc->baseline_gf_interval = (min_gf_interval + max_gf_interval) / 2; + p_rc->this_key_frame_forced = 0; + p_rc->next_key_frame_forced = 0; + p_rc->ni_frames = 0; + + p_rc->tot_q = 0.0; + p_rc->total_actual_bits = 0; + p_rc->total_target_bits = 0; + p_rc->buffer_level = p_rc->starting_buffer_level; + + if (oxcf->target_seq_level_idx[0] < SEQ_LEVELS) { + worst_allowed_q = 255; + } + if (oxcf->pass == AOM_RC_ONE_PASS && rc_cfg->mode == AOM_CBR) { + p_rc->avg_frame_qindex[KEY_FRAME] = worst_allowed_q; + p_rc->avg_frame_qindex[INTER_FRAME] = worst_allowed_q; + } else { + p_rc->avg_frame_qindex[KEY_FRAME] = + (worst_allowed_q + rc_cfg->best_allowed_q) / 2; + p_rc->avg_frame_qindex[INTER_FRAME] = + (worst_allowed_q + rc_cfg->best_allowed_q) / 2; + } + p_rc->avg_q = av1_convert_qindex_to_q(rc_cfg->worst_allowed_q, + oxcf->tool_cfg.bit_depth); + p_rc->last_q[KEY_FRAME] = rc_cfg->best_allowed_q; + p_rc->last_q[INTER_FRAME] = rc_cfg->worst_allowed_q; + + for (int i = 0; i < RATE_FACTOR_LEVELS; ++i) { + p_rc->rate_correction_factors[i] = 0.7; + } + p_rc->rate_correction_factors[KF_STD] = 1.0; + p_rc->bits_off_target = p_rc->starting_buffer_level; + + p_rc->rolling_target_bits = + (int)(oxcf->rc_cfg.target_bandwidth / oxcf->input_cfg.init_framerate); + p_rc->rolling_actual_bits = + (int)(oxcf->rc_cfg.target_bandwidth / oxcf->input_cfg.init_framerate); +} + +void av1_rc_init(const AV1EncoderConfig *oxcf, RATE_CONTROL *rc) { + const RateControlCfg *const rc_cfg = &oxcf->rc_cfg; + + rc->frames_since_key = 8; // Sensible default for first frame. 
+  rc->frames_to_fwd_kf = oxcf->kf_cfg.fwd_kf_dist;
+
+  rc->frames_till_gf_update_due = 0;
+  rc->ni_av_qi = rc_cfg->worst_allowed_q;
+  rc->ni_tot_qi = 0;
+
+  rc->min_gf_interval = oxcf->gf_cfg.min_gf_interval;
+  rc->max_gf_interval = oxcf->gf_cfg.max_gf_interval;
+  if (rc->min_gf_interval == 0)
+    rc->min_gf_interval = av1_rc_get_default_min_gf_interval(
+        oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height,
+        oxcf->input_cfg.init_framerate);
+  if (rc->max_gf_interval == 0)
+    rc->max_gf_interval = av1_rc_get_default_max_gf_interval(
+        oxcf->input_cfg.init_framerate, rc->min_gf_interval);
+  rc->avg_frame_low_motion = 0;
+
+  rc->resize_state = ORIG;
+  rc->resize_avg_qp = 0;
+  rc->resize_buffer_underflow = 0;
+  rc->resize_count = 0;
+  rc->rtc_external_ratectrl = 0;
+  rc->frame_level_fast_extra_bits = 0;
+  rc->use_external_qp_one_pass = 0;
+}
+
+static bool check_buffer_below_thresh(AV1_COMP *cpi, int64_t buffer_level,
+                                      int drop_mark) {
+  SVC *svc = &cpi->svc;
+  if (!cpi->ppi->use_svc || cpi->svc.number_spatial_layers == 1 ||
+      cpi->svc.framedrop_mode == AOM_LAYER_DROP) {
+    return (buffer_level <= drop_mark);
+  } else {
+    // For SVC in the full superframe drop mode (AOM_FULL_SUPERFRAME_DROP):
+    // the condition on the buffer is checked on the current and upper
+    // spatial layers.
+    for (int i = svc->spatial_layer_id; i < svc->number_spatial_layers; ++i) {
+      const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id,
+                                         svc->number_temporal_layers);
+      LAYER_CONTEXT *lc = &svc->layer_context[layer];
+      PRIMARY_RATE_CONTROL *lrc = &lc->p_rc;
+      // Exclude check for layer whose bitrate is 0.
+      if (lc->target_bandwidth > 0) {
+        const int drop_thresh = cpi->oxcf.rc_cfg.drop_frames_water_mark;
+        const int drop_mark_layer =
+            (int)(drop_thresh * lrc->optimal_buffer_level / 100);
+        if (lrc->buffer_level <= drop_mark_layer) return true;
+      }
+    }
+    return false;
+  }
+}
+
+int av1_rc_drop_frame(AV1_COMP *cpi) {
+  const AV1EncoderConfig *oxcf = &cpi->oxcf;
+  RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+#if CONFIG_FPMT_TEST
+  const int simulate_parallel_frame =
+      cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+      cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+  int64_t buffer_level =
+      simulate_parallel_frame ? p_rc->temp_buffer_level : p_rc->buffer_level;
+#else
+  int64_t buffer_level = p_rc->buffer_level;
+#endif
+  // Never drop on a key frame, or for a frame whose base layer is a key frame.
+  // If drop_count_consec hits or exceeds max_consec_drop then don't drop.
+  if (cpi->common.current_frame.frame_type == KEY_FRAME ||
+      (cpi->ppi->use_svc &&
+       cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) ||
+      !oxcf->rc_cfg.drop_frames_water_mark ||
+      (rc->max_consec_drop > 0 &&
+       rc->drop_count_consec >= rc->max_consec_drop)) {
+    return 0;
+  } else {
+    SVC *svc = &cpi->svc;
+    // In the full_superframe framedrop mode for svc, if the previous spatial
+    // layer was dropped, drop the current spatial layer.
+    if (cpi->ppi->use_svc && svc->spatial_layer_id > 0 &&
+        svc->drop_spatial_layer[svc->spatial_layer_id - 1] &&
+        svc->framedrop_mode == AOM_FULL_SUPERFRAME_DROP)
+      return 1;
+    // -1 is passed here for drop_mark since we are checking if
+    // buffer goes below 0 (<= -1).
+    if (check_buffer_below_thresh(cpi, buffer_level, -1)) {
+      // Always drop if buffer is below 0.
+      rc->drop_count_consec++;
+      return 1;
+    } else {
+      // If buffer is below drop_mark, for now just drop every other frame
+      // (starting with the next frame) until it increases back over drop_mark.
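+      // For example (hypothetical values): with drop_frames_water_mark = 30
+      // and an optimal_buffer_level of 600000 bits, drop_mark below evaluates
+      // to 30 * 600000 / 100 = 180000 bits, i.e. dropping starts once the
+      // buffer falls under 30% of its optimal level.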
+      const int drop_mark = (int)(oxcf->rc_cfg.drop_frames_water_mark *
+                                  p_rc->optimal_buffer_level / 100);
+      const bool buffer_below_thresh =
+          check_buffer_below_thresh(cpi, buffer_level, drop_mark);
+      if (!buffer_below_thresh && rc->decimation_factor > 0) {
+        --rc->decimation_factor;
+      } else if (buffer_below_thresh && rc->decimation_factor == 0) {
+        rc->decimation_factor = 1;
+      }
+      if (rc->decimation_factor > 0) {
+        if (rc->decimation_count > 0) {
+          --rc->decimation_count;
+          rc->drop_count_consec++;
+          return 1;
+        } else {
+          rc->decimation_count = rc->decimation_factor;
+          return 0;
+        }
+      } else {
+        rc->decimation_count = 0;
+        return 0;
+      }
+    }
+  }
+}
+
+static int adjust_q_cbr(const AV1_COMP *cpi, int q, int active_worst_quality,
+                        int width, int height) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const AV1_COMMON *const cm = &cpi->common;
+  const SVC *const svc = &cpi->svc;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+  // Flag to indicate that the previous frame had overshoot, and that the
+  // buffer level for the current frame is low (less than ~half of optimal).
+  // For such (inter) frames, if the source_sad is non-zero, relax the
+  // max_delta_up and clamp applied below.
+  const bool overshoot_buffer_low =
+      cpi->rc.rc_1_frame == -1 && rc->frame_source_sad > 1000 &&
+      p_rc->buffer_level < (p_rc->optimal_buffer_level >> 1) &&
+      rc->frames_since_key > 4;
+  int max_delta_down;
+  int max_delta_up = overshoot_buffer_low ? 60 : 20;
+  const int change_avg_frame_bandwidth =
+      abs(rc->avg_frame_bandwidth - rc->prev_avg_frame_bandwidth) >
+      0.1 * (rc->avg_frame_bandwidth);
+
+  // Set the maximum adjustment down for Q for this frame.
+  if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
+      cpi->cyclic_refresh->apply_cyclic_refresh) {
+    // For static screen-type content, limit the Q drop until the start of
+    // the next refresh cycle.
+    if (cpi->is_screen_content_type &&
+        (cpi->cyclic_refresh->sb_index > cpi->cyclic_refresh->last_sb_index)) {
+      max_delta_down = AOMMIN(8, AOMMAX(1, rc->q_1_frame / 32));
+    } else {
+      max_delta_down = AOMMIN(16, AOMMAX(1, rc->q_1_frame / 8));
+    }
+    if (!cpi->ppi->use_svc && cpi->is_screen_content_type) {
+      // Link max_delta_up to max_delta_down and buffer status.
+      if (p_rc->buffer_level > p_rc->optimal_buffer_level) {
+        max_delta_up = AOMMAX(4, max_delta_down);
+      } else {
+        max_delta_up = AOMMAX(8, max_delta_down);
+      }
+    }
+  } else {
+    max_delta_down = (cpi->is_screen_content_type)
+                         ? AOMMIN(8, AOMMAX(1, rc->q_1_frame / 16))
+                         : AOMMIN(16, AOMMAX(1, rc->q_1_frame / 8));
+  }
+  // If the resolution changes or avg_frame_bandwidth significantly changed,
+  // then set this flag to indicate a change in target bits per macroblock.
+  const int change_target_bits_mb =
+      cm->prev_frame &&
+      (width != cm->prev_frame->width || height != cm->prev_frame->height ||
+       change_avg_frame_bandwidth);
+  // Apply some control/clamp to QP under certain conditions.
+  // Delay the use of the clamping for svc until after num_temporal_layers,
+  // to make sure they have been set for each temporal layer.
+  if (!frame_is_intra_only(cm) && rc->frames_since_key > 1 &&
+      (!cpi->ppi->use_svc ||
+       svc->current_superframe > (unsigned int)svc->number_temporal_layers) &&
+      !change_target_bits_mb && !cpi->rc.rtc_external_ratectrl &&
+      (!cpi->oxcf.rc_cfg.gf_cbr_boost_pct ||
+       !(refresh_frame->alt_ref_frame || refresh_frame->golden_frame))) {
+    // If in the previous two frames we have seen both overshoot and
+    // undershoot, clamp Q between the two.
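+    // For example (hypothetical values): if the last frame overshot at
+    // q_1_frame = 50 (rc_1_frame == -1) and the frame before undershot at
+    // q_2_frame = 70 (rc_2_frame == 1), a proposed q of 90 is clamped into
+    // [50, 70] and lands on 70.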
+    // Check for rc->q_1/2_frame > 0 in case they have
+    // not been set due to dropped frames.
+    if (rc->rc_1_frame * rc->rc_2_frame == -1 &&
+        rc->q_1_frame != rc->q_2_frame && rc->q_1_frame > 0 &&
+        rc->q_2_frame > 0 && !overshoot_buffer_low) {
+      int qclamp = clamp(q, AOMMIN(rc->q_1_frame, rc->q_2_frame),
+                         AOMMAX(rc->q_1_frame, rc->q_2_frame));
+      // If the previous frame had overshoot and the current q needs to
+      // increase above the clamped value, reduce the clamp for faster reaction
+      // to overshoot.
+      if (cpi->rc.rc_1_frame == -1 && q > qclamp && rc->frames_since_key > 10)
+        q = (q + qclamp) >> 1;
+      else
+        q = qclamp;
+    }
+    // Adjust Q based on source content change from scene detection.
+    if (cpi->sf.rt_sf.check_scene_detection && rc->prev_avg_source_sad > 0 &&
+        rc->frames_since_key > 10 && rc->frame_source_sad > 0 &&
+        !cpi->rc.rtc_external_ratectrl) {
+      const int bit_depth = cm->seq_params->bit_depth;
+      double delta =
+          (double)rc->avg_source_sad / (double)rc->prev_avg_source_sad - 1.0;
+      // Push Q downwards if the content change is decreasing and the buffer
+      // level is stable (at least 1/4-optimal level), so not overshooting.
+      // Do so only for high Q to avoid excess overshoot.
+      // Else reduce the decrease in Q from the previous frame if the content
+      // change is increasing and the buffer is below max (so not
+      // undershooting).
+      if (delta < 0.0 &&
+          p_rc->buffer_level > (p_rc->optimal_buffer_level >> 2) &&
+          q > (rc->worst_quality >> 1)) {
+        double q_adj_factor = 1.0 + 0.5 * tanh(4.0 * delta);
+        double q_val = av1_convert_qindex_to_q(q, bit_depth);
+        q += av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
+      } else if (rc->q_1_frame - q > 0 && delta > 0.1 &&
+                 p_rc->buffer_level < AOMMIN(p_rc->maximum_buffer_size,
+                                             p_rc->optimal_buffer_level << 1)) {
+        q = (3 * q + rc->q_1_frame) >> 2;
+      }
+    }
+    // Limit the decrease in Q from the previous frame.
+    if (rc->q_1_frame - q > max_delta_down) q = rc->q_1_frame - max_delta_down;
+    // Limit the increase in Q from the previous frame.
+    else if (q - rc->q_1_frame > max_delta_up)
+      q = rc->q_1_frame + max_delta_up;
+  }
+  // Adjustment for temporal layers.
+  if (svc->number_temporal_layers > 1 && svc->spatial_layer_id == 0 &&
+      !change_target_bits_mb && !cpi->rc.rtc_external_ratectrl &&
+      cpi->oxcf.resize_cfg.resize_mode != RESIZE_DYNAMIC) {
+    if (svc->temporal_layer_id > 0) {
+      // Constrain enhancement relative to the previous base TL0.
+      // Get base temporal layer TL0.
+      const int layer = LAYER_IDS_TO_IDX(0, 0, svc->number_temporal_layers);
+      LAYER_CONTEXT *lc = &svc->layer_context[layer];
+      // lc->rc.avg_frame_bandwidth and lc->p_rc.last_q correspond to the
+      // last TL0 frame.
+      if (rc->avg_frame_bandwidth < lc->rc.avg_frame_bandwidth &&
+          q < lc->p_rc.last_q[INTER_FRAME] - 4)
+        q = lc->p_rc.last_q[INTER_FRAME] - 4;
+    } else if (cpi->svc.temporal_layer_id == 0 &&
+               p_rc->buffer_level > (p_rc->optimal_buffer_level >> 2) &&
+               rc->frame_source_sad < 100000) {
+      // Push base TL0 Q down if the buffer is stable and frame_source_sad
+      // is below the threshold.
+      int delta = (svc->number_temporal_layers == 2) ? 4 : 10;
+      q = q - delta;
+    }
+  }
+  // For non-svc (single layer): if the resolution has increased, push q
+  // closer to the active_worst to avoid excess overshoot.
+  if (!cpi->ppi->use_svc && cm->prev_frame &&
+      (width * height > 1.5 * cm->prev_frame->width * cm->prev_frame->height))
+    q = (q + active_worst_quality) >> 1;
+  // For single layer RPS: Bias Q based on the distance of the closest
+  // reference.
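+  // For example, with a closest reference 4 frames away the bias below
+  // reduces q by 4; the reduction saturates at 20 for distant references.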
+  if (cpi->ppi->rtc_ref.bias_recovery_frame) {
+    const int min_dist = av1_svc_get_min_ref_dist(cpi);
+    q = q - AOMMIN(min_dist, 20);
+  }
+  return AOMMAX(AOMMIN(q, cpi->rc.worst_quality), cpi->rc.best_quality);
+}
+
+static const RATE_FACTOR_LEVEL rate_factor_levels[FRAME_UPDATE_TYPES] = {
+  KF_STD,        // KF_UPDATE
+  INTER_NORMAL,  // LF_UPDATE
+  GF_ARF_STD,    // GF_UPDATE
+  GF_ARF_STD,    // ARF_UPDATE
+  INTER_NORMAL,  // OVERLAY_UPDATE
+  INTER_NORMAL,  // INTNL_OVERLAY_UPDATE
+  GF_ARF_LOW,    // INTNL_ARF_UPDATE
+};
+
+static RATE_FACTOR_LEVEL get_rate_factor_level(const GF_GROUP *const gf_group,
+                                               int gf_frame_index) {
+  const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_frame_index];
+  assert(update_type < FRAME_UPDATE_TYPES);
+  return rate_factor_levels[update_type];
+}
+
+/*!\brief Gets a rate vs Q correction factor
+ *
+ * This function returns the current value of a correction factor used to
+ * dynamically adjust the relationship between Q and the expected number
+ * of bits for the frame.
+ *
+ * \ingroup rate_control
+ * \param[in]   cpi                 Top level encoder instance structure
+ * \param[in]   width               Frame width
+ * \param[in]   height              Frame height
+ *
+ * \return Returns a correction factor for the current frame
+ */
+static double get_rate_correction_factor(const AV1_COMP *cpi, int width,
+                                         int height) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+  double rcf;
+  double rate_correction_factors_kfstd;
+  double rate_correction_factors_gfarfstd;
+  double rate_correction_factors_internormal;
+
+  rate_correction_factors_kfstd =
+      (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
+          ? rc->frame_level_rate_correction_factors[KF_STD]
+          : p_rc->rate_correction_factors[KF_STD];
+  rate_correction_factors_gfarfstd =
+      (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
+          ? rc->frame_level_rate_correction_factors[GF_ARF_STD]
+          : p_rc->rate_correction_factors[GF_ARF_STD];
+  rate_correction_factors_internormal =
+      (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
+          ? rc->frame_level_rate_correction_factors[INTER_NORMAL]
+          : p_rc->rate_correction_factors[INTER_NORMAL];
+
+  if (cpi->common.current_frame.frame_type == KEY_FRAME) {
+    rcf = rate_correction_factors_kfstd;
+  } else if (is_stat_consumption_stage(cpi)) {
+    const RATE_FACTOR_LEVEL rf_lvl =
+        get_rate_factor_level(&cpi->ppi->gf_group, cpi->gf_frame_index);
+    double rate_correction_factors_rflvl =
+        (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
+            ? rc->frame_level_rate_correction_factors[rf_lvl]
+            : p_rc->rate_correction_factors[rf_lvl];
+    rcf = rate_correction_factors_rflvl;
+  } else {
+    if ((refresh_frame->alt_ref_frame || refresh_frame->golden_frame) &&
+        !rc->is_src_frame_alt_ref && !cpi->ppi->use_svc &&
+        (cpi->oxcf.rc_cfg.mode != AOM_CBR ||
+         cpi->oxcf.rc_cfg.gf_cbr_boost_pct > 20))
+      rcf = rate_correction_factors_gfarfstd;
+    else
+      rcf = rate_correction_factors_internormal;
+  }
+  rcf *= resize_rate_factor(&cpi->oxcf.frm_dim_cfg, width, height);
+  return fclamp(rcf, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
+}
+
+/*!\brief Sets a rate vs Q correction factor
+ *
+ * This function updates the current value of a correction factor used to
+ * dynamically adjust the relationship between Q and the expected number
+ * of bits for the frame.
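+ *
+ * As an illustration (hypothetical numbers): a stored factor of 1.25 means
+ * recent frames have come out roughly 25% larger than the pure Q-based
+ * model predicted, so subsequent bit estimates are scaled up accordingly.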
+ * + * \ingroup rate_control + * \param[in] cpi Top level encoder instance structure + * \param[in] is_encode_stage Indicates if recode loop or post-encode + * \param[in] factor New correction factor + * \param[in] width Frame width + * \param[in] height Frame height + * + * \remark Updates the rate correction factor for the + * current frame type in cpi->rc. + */ +static void set_rate_correction_factor(AV1_COMP *cpi, int is_encode_stage, + double factor, int width, int height) { + RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame; + int update_default_rcf = 1; + // Normalize RCF to account for the size-dependent scaling factor. + factor /= resize_rate_factor(&cpi->oxcf.frm_dim_cfg, width, height); + + factor = fclamp(factor, MIN_BPB_FACTOR, MAX_BPB_FACTOR); + + if (cpi->common.current_frame.frame_type == KEY_FRAME) { + p_rc->rate_correction_factors[KF_STD] = factor; + } else if (is_stat_consumption_stage(cpi)) { + const RATE_FACTOR_LEVEL rf_lvl = + get_rate_factor_level(&cpi->ppi->gf_group, cpi->gf_frame_index); + if (is_encode_stage && + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) { + rc->frame_level_rate_correction_factors[rf_lvl] = factor; + update_default_rcf = 0; + } + if (update_default_rcf) p_rc->rate_correction_factors[rf_lvl] = factor; + } else { + if ((refresh_frame->alt_ref_frame || refresh_frame->golden_frame) && + !rc->is_src_frame_alt_ref && !cpi->ppi->use_svc && + (cpi->oxcf.rc_cfg.mode != AOM_CBR || + cpi->oxcf.rc_cfg.gf_cbr_boost_pct > 20)) { + p_rc->rate_correction_factors[GF_ARF_STD] = factor; + } else { + if (is_encode_stage && + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) { + rc->frame_level_rate_correction_factors[INTER_NORMAL] = factor; + update_default_rcf = 0; + } + if (update_default_rcf) + p_rc->rate_correction_factors[INTER_NORMAL] = factor; + } + } +} + +void av1_rc_update_rate_correction_factors(AV1_COMP *cpi, int is_encode_stage, + int width, int height) { + const AV1_COMMON *const cm = &cpi->common; + double correction_factor = 1.0; + double rate_correction_factor = + get_rate_correction_factor(cpi, width, height); + double adjustment_limit; + int projected_size_based_on_q = 0; + int cyclic_refresh_active = + cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cpi->common.seg.enabled; + + // Do not update the rate factors for arf overlay frames. + if (cpi->rc.is_src_frame_alt_ref) return; + + // Don't update rate correction factors here on scene changes as + // it is already reset in av1_encodedframe_overshoot_cbr(), + // but reset variables related to previous frame q and size. + // Note that the counter of frames since the last scene change + // is only valid when cyclic refresh mode is enabled and that + // this break out only applies to scene changes that are not + // recorded as INTRA only key frames. + if ((cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ) && + (cpi->cyclic_refresh->counter_encode_maxq_scene_change == 0) && + !frame_is_intra_only(cm) && !cpi->ppi->use_svc) { + cpi->rc.q_2_frame = cm->quant_params.base_qindex; + cpi->rc.q_1_frame = cm->quant_params.base_qindex; + cpi->rc.rc_2_frame = 0; + cpi->rc.rc_1_frame = 0; + return; + } + + // Clear down mmx registers to allow floating point in what follows + + // Work out how big we would have expected the frame to be at this Q given + // the current correction factor. 
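+  // For example (hypothetical numbers): if projected_size_based_on_q comes
+  // out at 80000 bits below and the frame actually used 100000 bits, the
+  // size correction factor becomes 100000 / 80000 = 1.25, which also marks
+  // the frame as an overshoot (rc_1_frame = -1) since 1.25 > 1.1.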
+  // Stay in double to avoid int overflow when values are large
+  if (cyclic_refresh_active) {
+    projected_size_based_on_q =
+        av1_cyclic_refresh_estimate_bits_at_q(cpi, rate_correction_factor);
+  } else {
+    projected_size_based_on_q = av1_estimate_bits_at_q(
+        cpi, cm->quant_params.base_qindex, rate_correction_factor);
+  }
+  // Work out a size correction factor.
+  if (projected_size_based_on_q > FRAME_OVERHEAD_BITS)
+    correction_factor = (double)cpi->rc.projected_frame_size /
+                        (double)projected_size_based_on_q;
+
+  // Clamp correction factor to prevent anything too extreme
+  correction_factor = AOMMAX(correction_factor, 0.25);
+
+  cpi->rc.q_2_frame = cpi->rc.q_1_frame;
+  cpi->rc.q_1_frame = cm->quant_params.base_qindex;
+  cpi->rc.rc_2_frame = cpi->rc.rc_1_frame;
+  if (correction_factor > 1.1)
+    cpi->rc.rc_1_frame = -1;
+  else if (correction_factor < 0.9)
+    cpi->rc.rc_1_frame = 1;
+  else
+    cpi->rc.rc_1_frame = 0;
+
+  // Decide how heavily to dampen the adjustment
+  if (correction_factor > 0.0) {
+    if (cpi->is_screen_content_type) {
+      adjustment_limit =
+          0.25 + 0.5 * AOMMIN(0.5, fabs(log10(correction_factor)));
+    } else {
+      adjustment_limit =
+          0.25 + 0.75 * AOMMIN(0.5, fabs(log10(correction_factor)));
+    }
+  } else {
+    adjustment_limit = 0.75;
+  }
+
+  // Adjustment to the delta Q and the number of blocks updated in cyclic
+  // refresh, based on overshoot or undershoot of the target in the current
+  // frame.
+  if (cyclic_refresh_active && cpi->rc.this_frame_target > 0) {
+    CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+    if (correction_factor > 1.25) {
+      cr->percent_refresh_adjustment =
+          AOMMAX(cr->percent_refresh_adjustment - 1, -5);
+      cr->rate_ratio_qdelta_adjustment =
+          AOMMAX(cr->rate_ratio_qdelta_adjustment - 0.05, -0.0);
+    } else if (correction_factor < 0.5) {
+      cr->percent_refresh_adjustment =
+          AOMMIN(cr->percent_refresh_adjustment + 1, 5);
+      cr->rate_ratio_qdelta_adjustment =
+          AOMMIN(cr->rate_ratio_qdelta_adjustment + 0.05, 0.25);
+    }
+  }
+
+  if (correction_factor > 1.01) {
+    // We are not already at the worst allowable quality
+    correction_factor = (1.0 + ((correction_factor - 1.0) * adjustment_limit));
+    rate_correction_factor = rate_correction_factor * correction_factor;
+    // Keep rate_correction_factor within limits
+    if (rate_correction_factor > MAX_BPB_FACTOR)
+      rate_correction_factor = MAX_BPB_FACTOR;
+  } else if (correction_factor < 0.99) {
+    // We are not already at the best allowable quality
+    correction_factor = 1.0 / correction_factor;
+    correction_factor = (1.0 + ((correction_factor - 1.0) * adjustment_limit));
+    correction_factor = 1.0 / correction_factor;
+
+    rate_correction_factor = rate_correction_factor * correction_factor;
+
+    // Keep rate_correction_factor within limits
+    if (rate_correction_factor < MIN_BPB_FACTOR)
+      rate_correction_factor = MIN_BPB_FACTOR;
+  }
+
+  set_rate_correction_factor(cpi, is_encode_stage, rate_correction_factor,
+                             width, height);
+}
+
+// Calculate rate for the given 'q'.
+static int get_bits_per_mb(const AV1_COMP *cpi, int use_cyclic_refresh,
+                           double correction_factor, int q) {
+  const AV1_COMMON *const cm = &cpi->common;
+  return use_cyclic_refresh
+             ? av1_cyclic_refresh_rc_bits_per_mb(cpi, q, correction_factor)
+             : av1_rc_bits_per_mb(cpi, cm->current_frame.frame_type, q,
+                                  correction_factor,
+                                  cpi->sf.hl_sf.accurate_bit_estimate);
+}
+
+/*!\brief Searches for a Q index value predicted to give an average macro
+ * block rate closest to the target value.
+ *
+ * Similar to the find_qindex_by_rate() function, but returns a q index with a
+ * rate just above or below the desired rate, depending on which of the two
+ * rates is closer to the desired rate.
+ * Also, it respects the selected aq_mode when computing the rate.
+ *
+ * \ingroup rate_control
+ * \param[in]   desired_bits_per_mb   Target bits per mb
+ * \param[in]   cpi                   Top level encoder instance structure
+ * \param[in]   correction_factor     Current Q to rate correction factor
+ * \param[in]   best_qindex           Min allowed Q value.
+ * \param[in]   worst_qindex          Max allowed Q value.
+ *
+ * \return Returns the q index whose estimated rate is closest to the
+ * desired rate
+ */
+static int find_closest_qindex_by_rate(int desired_bits_per_mb,
+                                       const AV1_COMP *cpi,
+                                       double correction_factor,
+                                       int best_qindex, int worst_qindex) {
+  const int use_cyclic_refresh = cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
+                                 cpi->cyclic_refresh->apply_cyclic_refresh;
+
+  // Find 'qindex' based on 'desired_bits_per_mb'.
+  assert(best_qindex <= worst_qindex);
+  int low = best_qindex;
+  int high = worst_qindex;
+  while (low < high) {
+    const int mid = (low + high) >> 1;
+    const int mid_bits_per_mb =
+        get_bits_per_mb(cpi, use_cyclic_refresh, correction_factor, mid);
+    if (mid_bits_per_mb > desired_bits_per_mb) {
+      low = mid + 1;
+    } else {
+      high = mid;
+    }
+  }
+  assert(low == high);
+
+  // Calculate rate difference of this q index from the desired rate.
+  const int curr_q = low;
+  const int curr_bits_per_mb =
+      get_bits_per_mb(cpi, use_cyclic_refresh, correction_factor, curr_q);
+  const int curr_bit_diff = (curr_bits_per_mb <= desired_bits_per_mb)
+                                ? desired_bits_per_mb - curr_bits_per_mb
+                                : INT_MAX;
+  assert((curr_bit_diff != INT_MAX && curr_bit_diff >= 0) ||
+         curr_q == worst_qindex);
+
+  // Calculate rate difference for previous q index too.
+  const int prev_q = curr_q - 1;
+  int prev_bit_diff;
+  if (curr_bit_diff == INT_MAX || curr_q == best_qindex) {
+    prev_bit_diff = INT_MAX;
+  } else {
+    const int prev_bits_per_mb =
+        get_bits_per_mb(cpi, use_cyclic_refresh, correction_factor, prev_q);
+    assert(prev_bits_per_mb > desired_bits_per_mb);
+    prev_bit_diff = prev_bits_per_mb - desired_bits_per_mb;
+  }
+
+  // Pick one of the two q indices, depending on which one has rate closer to
+  // the desired rate.
+  return (curr_bit_diff <= prev_bit_diff) ?
curr_q : prev_q; +} + +int av1_rc_regulate_q(const AV1_COMP *cpi, int target_bits_per_frame, + int active_best_quality, int active_worst_quality, + int width, int height) { + const int MBs = av1_get_MBs(width, height); + const double correction_factor = + get_rate_correction_factor(cpi, width, height); + const int target_bits_per_mb = + (int)(((uint64_t)target_bits_per_frame << BPER_MB_NORMBITS) / MBs); + + int q = + find_closest_qindex_by_rate(target_bits_per_mb, cpi, correction_factor, + active_best_quality, active_worst_quality); + if (cpi->oxcf.rc_cfg.mode == AOM_CBR && has_no_stats_stage(cpi)) + return adjust_q_cbr(cpi, q, active_worst_quality, width, height); + + return q; +} + +static int get_active_quality(int q, int gfu_boost, int low, int high, + int *low_motion_minq, int *high_motion_minq) { + if (gfu_boost > high) { + return low_motion_minq[q]; + } else if (gfu_boost < low) { + return high_motion_minq[q]; + } else { + const int gap = high - low; + const int offset = high - gfu_boost; + const int qdiff = high_motion_minq[q] - low_motion_minq[q]; + const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap; + return low_motion_minq[q] + adjustment; + } +} + +static int get_kf_active_quality(const PRIMARY_RATE_CONTROL *const p_rc, int q, + aom_bit_depth_t bit_depth) { + int *kf_low_motion_minq; + int *kf_high_motion_minq; + ASSIGN_MINQ_TABLE(bit_depth, kf_low_motion_minq); + ASSIGN_MINQ_TABLE(bit_depth, kf_high_motion_minq); + return get_active_quality(q, p_rc->kf_boost, kf_low, kf_high, + kf_low_motion_minq, kf_high_motion_minq); +} + +static int get_gf_active_quality_no_rc(int gfu_boost, int q, + aom_bit_depth_t bit_depth) { + int *arfgf_low_motion_minq; + int *arfgf_high_motion_minq; + ASSIGN_MINQ_TABLE(bit_depth, arfgf_low_motion_minq); + ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq); + return get_active_quality(q, gfu_boost, gf_low, gf_high, + arfgf_low_motion_minq, arfgf_high_motion_minq); +} + +static int get_gf_active_quality(const PRIMARY_RATE_CONTROL *const p_rc, int q, + aom_bit_depth_t bit_depth) { + return get_gf_active_quality_no_rc(p_rc->gfu_boost, q, bit_depth); +} + +static int get_gf_high_motion_quality(int q, aom_bit_depth_t bit_depth) { + int *arfgf_high_motion_minq; + ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq); + return arfgf_high_motion_minq[q]; +} + +static int calc_active_worst_quality_no_stats_vbr(const AV1_COMP *cpi) { + const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame; + const unsigned int curr_frame = cpi->common.current_frame.frame_number; + int active_worst_quality; + int last_q_key_frame; + int last_q_inter_frame; +#if CONFIG_FPMT_TEST + const int simulate_parallel_frame = + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 && + cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE; + last_q_key_frame = simulate_parallel_frame ? p_rc->temp_last_q[KEY_FRAME] + : p_rc->last_q[KEY_FRAME]; + last_q_inter_frame = simulate_parallel_frame ? p_rc->temp_last_q[INTER_FRAME] + : p_rc->last_q[INTER_FRAME]; +#else + last_q_key_frame = p_rc->last_q[KEY_FRAME]; + last_q_inter_frame = p_rc->last_q[INTER_FRAME]; +#endif + + if (cpi->common.current_frame.frame_type == KEY_FRAME) { + active_worst_quality = + curr_frame == 0 ? 
rc->worst_quality : last_q_key_frame * 2; + } else { + if (!rc->is_src_frame_alt_ref && + (refresh_frame->golden_frame || refresh_frame->bwd_ref_frame || + refresh_frame->alt_ref_frame)) { + active_worst_quality = + curr_frame == 1 ? last_q_key_frame * 5 / 4 : last_q_inter_frame; + } else { + active_worst_quality = + curr_frame == 1 ? last_q_key_frame * 2 : last_q_inter_frame * 2; + } + } + return AOMMIN(active_worst_quality, rc->worst_quality); +} + +// Adjust active_worst_quality level based on buffer level. +static int calc_active_worst_quality_no_stats_cbr(const AV1_COMP *cpi) { + // Adjust active_worst_quality: If buffer is above the optimal/target level, + // bring active_worst_quality down depending on fullness of buffer. + // If buffer is below the optimal level, let the active_worst_quality go from + // ambient Q (at buffer = optimal level) to worst_quality level + // (at buffer = critical level). + const AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc; + const SVC *const svc = &cpi->svc; + unsigned int num_frames_weight_key = 5 * cpi->svc.number_temporal_layers; + // Buffer level below which we push active_worst to worst_quality. + int64_t critical_level = p_rc->optimal_buffer_level >> 3; + int64_t buff_lvl_step = 0; + int adjustment = 0; + int active_worst_quality; + int ambient_qp; + if (cm->current_frame.frame_type == KEY_FRAME) return rc->worst_quality; + // For ambient_qp we use minimum of avg_frame_qindex[KEY_FRAME/INTER_FRAME] + // for the first few frames following key frame. These are both initialized + // to worst_quality and updated with (3/4, 1/4) average in postencode_update. + // So for first few frames following key, the qp of that key frame is weighted + // into the active_worst_quality setting. For SVC the key frame should + // correspond to layer (0, 0), so use that for layer context. + int avg_qindex_key = p_rc->avg_frame_qindex[KEY_FRAME]; + if (svc->number_temporal_layers > 1) { + int layer = LAYER_IDS_TO_IDX(0, 0, svc->number_temporal_layers); + const LAYER_CONTEXT *lc = &svc->layer_context[layer]; + const PRIMARY_RATE_CONTROL *const lp_rc = &lc->p_rc; + avg_qindex_key = + AOMMIN(lp_rc->avg_frame_qindex[KEY_FRAME], lp_rc->last_q[KEY_FRAME]); + } + ambient_qp = (cm->current_frame.frame_number < num_frames_weight_key) + ? AOMMIN(p_rc->avg_frame_qindex[INTER_FRAME], avg_qindex_key) + : p_rc->avg_frame_qindex[INTER_FRAME]; + ambient_qp = AOMMIN(rc->worst_quality, ambient_qp); + + if (p_rc->buffer_level > p_rc->optimal_buffer_level) { + // Adjust down. + int max_adjustment_down; // Maximum adjustment down for Q + + if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && !cpi->ppi->use_svc && + (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN)) { + active_worst_quality = AOMMIN(rc->worst_quality, ambient_qp); + max_adjustment_down = AOMMIN(4, active_worst_quality / 16); + } else { + active_worst_quality = AOMMIN(rc->worst_quality, ambient_qp * 5 / 4); + max_adjustment_down = active_worst_quality / 3; + } + + if (max_adjustment_down) { + buff_lvl_step = + ((p_rc->maximum_buffer_size - p_rc->optimal_buffer_level) / + max_adjustment_down); + if (buff_lvl_step) + adjustment = (int)((p_rc->buffer_level - p_rc->optimal_buffer_level) / + buff_lvl_step); + active_worst_quality -= adjustment; + } + } else if (p_rc->buffer_level > critical_level) { + // Adjust up from ambient Q. 
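+ // The code below interpolates linearly between ambient_qp (buffer at
+ // the optimal level) and worst_quality (buffer drained to the critical
+ // level). As a purely illustrative example (not values from any real
+ // configuration): with optimal_buffer_level = 6000, critical_level =
+ // 750, ambient_qp = 80, worst_quality = 160 and buffer_level = 3375,
+ // adjustment = (160 - 80) * (6000 - 3375) / 5250 = 40, giving an
+ // active_worst_quality of 120.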
+ active_worst_quality = AOMMIN(rc->worst_quality, ambient_qp); + if (critical_level) { + buff_lvl_step = (p_rc->optimal_buffer_level - critical_level); + if (buff_lvl_step) { + adjustment = (int)((rc->worst_quality - ambient_qp) * + (p_rc->optimal_buffer_level - p_rc->buffer_level) / + buff_lvl_step); + } + active_worst_quality += adjustment; + } + } else { + // Set to worst_quality if buffer is below critical level. + active_worst_quality = rc->worst_quality; + } + return active_worst_quality; +} + +// Calculate the active_best_quality level. +static int calc_active_best_quality_no_stats_cbr(const AV1_COMP *cpi, + int active_worst_quality, + int width, int height) { + const AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame; + const CurrentFrame *const current_frame = &cm->current_frame; + int *rtc_minq; + const int bit_depth = cm->seq_params->bit_depth; + int active_best_quality = rc->best_quality; + ASSIGN_MINQ_TABLE(bit_depth, rtc_minq); + + if (frame_is_intra_only(cm)) { + // Handle the special case for key frames forced when we have reached + // the maximum key frame interval. Here force the Q to a range + // based on the ambient Q to reduce the risk of popping. + if (p_rc->this_key_frame_forced) { + int qindex = p_rc->last_boosted_qindex; + double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth); + int delta_qindex = av1_compute_qdelta(rc, last_boosted_q, + (last_boosted_q * 0.75), bit_depth); + active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); + } else if (current_frame->frame_number > 0) { + // not first frame of one pass and kf_boost is set + double q_adj_factor = 1.0; + double q_val; + active_best_quality = get_kf_active_quality( + p_rc, p_rc->avg_frame_qindex[KEY_FRAME], bit_depth); + // Allow somewhat lower kf minq with small image formats. + if ((width * height) <= (352 * 288)) { + q_adj_factor -= 0.25; + } + // Convert the adjustment factor to a qindex delta + // on active_best_quality. + q_val = av1_convert_qindex_to_q(active_best_quality, bit_depth); + active_best_quality += + av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth); + } + } else if (!rc->is_src_frame_alt_ref && !cpi->ppi->use_svc && + cpi->oxcf.rc_cfg.gf_cbr_boost_pct && + (refresh_frame->golden_frame || refresh_frame->alt_ref_frame)) { + // Use the lower of active_worst_quality and recent + // average Q as basis for GF/ARF best Q limit unless last frame was + // a key frame. + int q = active_worst_quality; + if (rc->frames_since_key > 1 && + p_rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) { + q = p_rc->avg_frame_qindex[INTER_FRAME]; + } + active_best_quality = get_gf_active_quality(p_rc, q, bit_depth); + } else { + // Use the lower of active_worst_quality and recent/average Q. + FRAME_TYPE frame_type = + (current_frame->frame_number > 1) ? 
INTER_FRAME : KEY_FRAME; + if (p_rc->avg_frame_qindex[frame_type] < active_worst_quality) + active_best_quality = rtc_minq[p_rc->avg_frame_qindex[frame_type]]; + else + active_best_quality = rtc_minq[active_worst_quality]; + } + return active_best_quality; +} + +#if RT_PASSIVE_STRATEGY +static int get_q_passive_strategy(const AV1_COMP *const cpi, + const int q_candidate, const int threshold) { + const AV1_COMMON *const cm = &cpi->common; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const CurrentFrame *const current_frame = &cm->current_frame; + int sum = 0; + int count = 0; + int i = 1; + while (i < MAX_Q_HISTORY) { + int frame_id = current_frame->frame_number - i; + if (frame_id <= 0) break; + sum += p_rc->q_history[frame_id % MAX_Q_HISTORY]; + ++count; + ++i; + } + if (count > 0) { + const int avg_q = sum / count; + if (abs(avg_q - q_candidate) <= threshold) return avg_q; + } + return q_candidate; +} +#endif // RT_PASSIVE_STRATEGY + +/*!\brief Picks q and q bounds given CBR rate control parameters in \c cpi->rc. + * + * Handles the special case when using: + * - Constant bit-rate mode: \c cpi->oxcf.rc_cfg.mode == \ref AOM_CBR, and + * - 1-pass encoding without LAP (look-ahead processing), so 1st pass stats are + * NOT available. + * + * \ingroup rate_control + * \param[in] cpi Top level encoder structure + * \param[in] width Coded frame width + * \param[in] height Coded frame height + * \param[out] bottom_index Bottom bound for q index (best quality) + * \param[out] top_index Top bound for q index (worst quality) + * \return Returns selected q index to be used for encoding this frame. + */ +static int rc_pick_q_and_bounds_no_stats_cbr(const AV1_COMP *cpi, int width, + int height, int *bottom_index, + int *top_index) { + const AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const CurrentFrame *const current_frame = &cm->current_frame; + int q; + int active_worst_quality = calc_active_worst_quality_no_stats_cbr(cpi); + int active_best_quality = calc_active_best_quality_no_stats_cbr( + cpi, active_worst_quality, width, height); + assert(has_no_stats_stage(cpi)); + assert(cpi->oxcf.rc_cfg.mode == AOM_CBR); + + // Clip the active best and worst quality values to limits + active_best_quality = + clamp(active_best_quality, rc->best_quality, rc->worst_quality); + active_worst_quality = + clamp(active_worst_quality, active_best_quality, rc->worst_quality); + + *top_index = active_worst_quality; + *bottom_index = active_best_quality; + + // Limit Q range for the adaptive loop. 
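+ // For non-forced key frames beyond the first, the cap is recomputed via
+ // av1_compute_qdelta_by_rate() with a rate ratio of 2.0; since double
+ // the rate maps to a lower q index, the resulting qdelta is negative and
+ // lowers (improves) the top of the range.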
+ if (current_frame->frame_type == KEY_FRAME && !p_rc->this_key_frame_forced && + current_frame->frame_number != 0) { + int qdelta = 0; + qdelta = av1_compute_qdelta_by_rate(cpi, current_frame->frame_type, + active_worst_quality, 2.0); + *top_index = active_worst_quality + qdelta; + *top_index = AOMMAX(*top_index, *bottom_index); + } + + q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality, + active_worst_quality, width, height); +#if RT_PASSIVE_STRATEGY + if (current_frame->frame_type != KEY_FRAME && + cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) { + q = get_q_passive_strategy(cpi, q, 50); + } +#endif // RT_PASSIVE_STRATEGY + if (q > *top_index) { + // Special case when we are targeting the max allowed rate + if (rc->this_frame_target >= rc->max_frame_bandwidth) + *top_index = q; + else + q = *top_index; + } + + assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality); + assert(*bottom_index <= rc->worst_quality && + *bottom_index >= rc->best_quality); + assert(q <= rc->worst_quality && q >= rc->best_quality); + return q; +} + +static int gf_group_pyramid_level(const GF_GROUP *gf_group, int gf_index) { + return gf_group->layer_depth[gf_index]; +} + +static int get_active_cq_level(const RATE_CONTROL *rc, + const PRIMARY_RATE_CONTROL *p_rc, + const AV1EncoderConfig *const oxcf, + int intra_only, aom_superres_mode superres_mode, + int superres_denom) { + const RateControlCfg *const rc_cfg = &oxcf->rc_cfg; + static const double cq_adjust_threshold = 0.1; + int active_cq_level = rc_cfg->cq_level; + if (rc_cfg->mode == AOM_CQ || rc_cfg->mode == AOM_Q) { + // printf("Superres %d %d %d = %d\n", superres_denom, intra_only, + // rc->frames_to_key, !(intra_only && rc->frames_to_key <= 1)); + if ((superres_mode == AOM_SUPERRES_QTHRESH || + superres_mode == AOM_SUPERRES_AUTO) && + superres_denom != SCALE_NUMERATOR) { + int mult = SUPERRES_QADJ_PER_DENOM_KEYFRAME_SOLO; + if (intra_only && rc->frames_to_key <= 1) { + mult = 0; + } else if (intra_only) { + mult = SUPERRES_QADJ_PER_DENOM_KEYFRAME; + } else { + mult = SUPERRES_QADJ_PER_DENOM_ARFFRAME; + } + active_cq_level = AOMMAX( + active_cq_level - ((superres_denom - SCALE_NUMERATOR) * mult), 0); + } + } + if (rc_cfg->mode == AOM_CQ && p_rc->total_target_bits > 0) { + const double x = (double)p_rc->total_actual_bits / p_rc->total_target_bits; + if (x < cq_adjust_threshold) { + active_cq_level = (int)(active_cq_level * x / cq_adjust_threshold); + } + } + return active_cq_level; +} + +/*!\brief Picks q and q bounds given non-CBR rate control params in \c cpi->rc. + * + * Handles the special case when using: + * - Any rate control other than constant bit-rate mode: + * \c cpi->oxcf.rc_cfg.mode != \ref AOM_CBR, and + * - 1-pass encoding without LAP (look-ahead processing), so 1st pass stats are + * NOT available. + * + * \ingroup rate_control + * \param[in] cpi Top level encoder structure + * \param[in] width Coded frame width + * \param[in] height Coded frame height + * \param[out] bottom_index Bottom bound for q index (best quality) + * \param[out] top_index Top bound for q index (worst quality) + * \return Returns selected q index to be used for encoding this frame. 
+ */
+static int rc_pick_q_and_bounds_no_stats(const AV1_COMP *cpi, int width,
+ int height, int *bottom_index,
+ int *top_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const CurrentFrame *const current_frame = &cm->current_frame;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+ const enum aom_rc_mode rc_mode = oxcf->rc_cfg.mode;
+
+ assert(has_no_stats_stage(cpi));
+ assert(rc_mode == AOM_VBR ||
+ (!USE_UNRESTRICTED_Q_IN_CQ_MODE && rc_mode == AOM_CQ) ||
+ rc_mode == AOM_Q);
+
+ const int cq_level =
+ get_active_cq_level(rc, p_rc, oxcf, frame_is_intra_only(cm),
+ cpi->superres_mode, cm->superres_scale_denominator);
+ const int bit_depth = cm->seq_params->bit_depth;
+
+ int active_best_quality;
+ int active_worst_quality = calc_active_worst_quality_no_stats_vbr(cpi);
+ int q;
+ int *inter_minq;
+ ASSIGN_MINQ_TABLE(bit_depth, inter_minq);
+
+ if (frame_is_intra_only(cm)) {
+ if (rc_mode == AOM_Q) {
+ const int qindex = cq_level;
+ const double q_val = av1_convert_qindex_to_q(qindex, bit_depth);
+ const int delta_qindex =
+ av1_compute_qdelta(rc, q_val, q_val * 0.25, bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else if (p_rc->this_key_frame_forced) {
+#if CONFIG_FPMT_TEST
+ const int simulate_parallel_frame =
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+ int qindex = simulate_parallel_frame ? p_rc->temp_last_boosted_qindex
+ : p_rc->last_boosted_qindex;
+#else
+ int qindex = p_rc->last_boosted_qindex;
+#endif
+ const double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
+ const int delta_qindex = av1_compute_qdelta(
+ rc, last_boosted_q, last_boosted_q * 0.75, bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else { // not first frame of one pass and kf_boost is set
+ double q_adj_factor = 1.0;
+
+ active_best_quality = get_kf_active_quality(
+ p_rc, p_rc->avg_frame_qindex[KEY_FRAME], bit_depth);
+
+ // Allow somewhat lower kf minq with small image formats.
+ if ((width * height) <= (352 * 288)) {
+ q_adj_factor -= 0.25;
+ }
+
+ // Convert the adjustment factor to a qindex delta on active_best_quality.
+ {
+ const double q_val =
+ av1_convert_qindex_to_q(active_best_quality, bit_depth);
+ active_best_quality +=
+ av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
+ }
+ }
+ } else if (!rc->is_src_frame_alt_ref &&
+ (refresh_frame->golden_frame || refresh_frame->alt_ref_frame)) {
+ // Use the lower of active_worst_quality and recent
+ // average Q as basis for GF/ARF best Q limit unless last frame was
+ // a key frame.
+ q = (rc->frames_since_key > 1 &&
+ p_rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality)
+ ? p_rc->avg_frame_qindex[INTER_FRAME]
+ : p_rc->avg_frame_qindex[KEY_FRAME];
+ // For constrained quality don't allow Q less than the cq level
+ if (rc_mode == AOM_CQ) {
+ if (q < cq_level) q = cq_level;
+ active_best_quality = get_gf_active_quality(p_rc, q, bit_depth);
+ // Constrained quality uses a slightly lower active best.
+ active_best_quality = active_best_quality * 15 / 16;
+ } else if (rc_mode == AOM_Q) {
+ const int qindex = cq_level;
+ const double q_val = av1_convert_qindex_to_q(qindex, bit_depth);
+ const int delta_qindex =
+ (refresh_frame->alt_ref_frame)
+ ?
av1_compute_qdelta(rc, q_val, q_val * 0.40, bit_depth) + : av1_compute_qdelta(rc, q_val, q_val * 0.50, bit_depth); + active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); + } else { + active_best_quality = get_gf_active_quality(p_rc, q, bit_depth); + } + } else { + if (rc_mode == AOM_Q) { + const int qindex = cq_level; + const double q_val = av1_convert_qindex_to_q(qindex, bit_depth); + const double delta_rate[FIXED_GF_INTERVAL] = { 0.50, 1.0, 0.85, 1.0, + 0.70, 1.0, 0.85, 1.0 }; + const int delta_qindex = av1_compute_qdelta( + rc, q_val, + q_val * delta_rate[current_frame->frame_number % FIXED_GF_INTERVAL], + bit_depth); + active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); + } else { + // Use the lower of active_worst_quality and recent/average Q. + active_best_quality = + (current_frame->frame_number > 1) + ? inter_minq[p_rc->avg_frame_qindex[INTER_FRAME]] + : inter_minq[p_rc->avg_frame_qindex[KEY_FRAME]]; + // For the constrained quality mode we don't want + // q to fall below the cq level. + if ((rc_mode == AOM_CQ) && (active_best_quality < cq_level)) { + active_best_quality = cq_level; + } + } + } + + // Clip the active best and worst quality values to limits + active_best_quality = + clamp(active_best_quality, rc->best_quality, rc->worst_quality); + active_worst_quality = + clamp(active_worst_quality, active_best_quality, rc->worst_quality); + + *top_index = active_worst_quality; + *bottom_index = active_best_quality; + + // Limit Q range for the adaptive loop. + { + int qdelta = 0; + if (current_frame->frame_type == KEY_FRAME && + !p_rc->this_key_frame_forced && current_frame->frame_number != 0) { + qdelta = av1_compute_qdelta_by_rate(cpi, current_frame->frame_type, + active_worst_quality, 2.0); + } else if (!rc->is_src_frame_alt_ref && + (refresh_frame->golden_frame || refresh_frame->alt_ref_frame)) { + qdelta = av1_compute_qdelta_by_rate(cpi, current_frame->frame_type, + active_worst_quality, 1.75); + } + *top_index = active_worst_quality + qdelta; + *top_index = AOMMAX(*top_index, *bottom_index); + } + + if (rc_mode == AOM_Q) { + q = active_best_quality; + // Special case code to try and match quality with forced key frames + } else if ((current_frame->frame_type == KEY_FRAME) && + p_rc->this_key_frame_forced) { +#if CONFIG_FPMT_TEST + const int simulate_parallel_frame = + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 && + cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE; + q = simulate_parallel_frame ? 
p_rc->temp_last_boosted_qindex
+ : p_rc->last_boosted_qindex;
+#else
+ q = p_rc->last_boosted_qindex;
+#endif
+ } else {
+ q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
+ active_worst_quality, width, height);
+ if (q > *top_index) {
+ // Special case when we are targeting the max allowed rate
+ if (rc->this_frame_target >= rc->max_frame_bandwidth)
+ *top_index = q;
+ else
+ q = *top_index;
+ }
+ }
+
+ assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);
+ assert(*bottom_index <= rc->worst_quality &&
+ *bottom_index >= rc->best_quality);
+ assert(q <= rc->worst_quality && q >= rc->best_quality);
+ return q;
+}
+
+static const double arf_layer_deltas[MAX_ARF_LAYERS + 1] = { 2.50, 2.00, 1.75,
+ 1.50, 1.25, 1.15,
+ 1.0 };
+int av1_frame_type_qdelta(const AV1_COMP *cpi, int q) {
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const RATE_FACTOR_LEVEL rf_lvl =
+ get_rate_factor_level(gf_group, cpi->gf_frame_index);
+ const FRAME_TYPE frame_type = gf_group->frame_type[cpi->gf_frame_index];
+ const int arf_layer = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6);
+ const double rate_factor =
+ (rf_lvl == INTER_NORMAL) ? 1.0 : arf_layer_deltas[arf_layer];
+
+ return av1_compute_qdelta_by_rate(cpi, frame_type, q, rate_factor);
+}
+
+// This unrestricted Q selection in CQ mode is useful when testing new
+// features, but may lead to Q being outside the range allowed by the current
+// RC restrictions.
+#if USE_UNRESTRICTED_Q_IN_CQ_MODE
+static int rc_pick_q_and_bounds_no_stats_cq(const AV1_COMP *cpi, int width,
+ int height, int *bottom_index,
+ int *top_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const int cq_level =
+ get_active_cq_level(rc, p_rc, oxcf, frame_is_intra_only(cm),
+ cpi->superres_mode, cm->superres_scale_denominator);
+ const int bit_depth = cm->seq_params->bit_depth;
+ const int q = (int)av1_convert_qindex_to_q(cq_level, bit_depth);
+ (void)width;
+ (void)height;
+ assert(has_no_stats_stage(cpi));
+ assert(cpi->oxcf.rc_cfg.mode == AOM_CQ);
+
+ *top_index = q;
+ *bottom_index = q;
+
+ return q;
+}
+#endif // USE_UNRESTRICTED_Q_IN_CQ_MODE
+
+#define STATIC_MOTION_THRESH 95
+static void get_intra_q_and_bounds(const AV1_COMP *cpi, int width, int height,
+ int *active_best, int *active_worst,
+ int cq_level) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ int active_best_quality;
+ int active_worst_quality = *active_worst;
+ const int bit_depth = cm->seq_params->bit_depth;
+
+ if (rc->frames_to_key <= 1 && oxcf->rc_cfg.mode == AOM_Q) {
+ // If the next frame is also a key frame or the current frame is the
+ // only frame in the sequence in AOM_Q mode, just use the cq_level
+ // as q.
+ active_best_quality = cq_level;
+ active_worst_quality = cq_level;
+ } else if (p_rc->this_key_frame_forced) {
+ // Handle the special case for key frames forced when we have reached
+ // the maximum key frame interval. Here force the Q to a range
+ // based on the ambient Q to reduce the risk of popping.
+ double last_boosted_q;
+ int delta_qindex;
+ int qindex;
+#if CONFIG_FPMT_TEST
+ const int simulate_parallel_frame =
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+ int last_boosted_qindex = simulate_parallel_frame
+ ?
p_rc->temp_last_boosted_qindex + : p_rc->last_boosted_qindex; +#else + int last_boosted_qindex = p_rc->last_boosted_qindex; +#endif + if (is_stat_consumption_stage_twopass(cpi) && + cpi->ppi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) { + qindex = AOMMIN(p_rc->last_kf_qindex, last_boosted_qindex); + active_best_quality = qindex; + last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth); + delta_qindex = av1_compute_qdelta(rc, last_boosted_q, + last_boosted_q * 1.25, bit_depth); + active_worst_quality = + AOMMIN(qindex + delta_qindex, active_worst_quality); + } else { + qindex = last_boosted_qindex; + last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth); + delta_qindex = av1_compute_qdelta(rc, last_boosted_q, + last_boosted_q * 0.50, bit_depth); + active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); + } + } else { + // Not forced keyframe. + double q_adj_factor = 1.0; + double q_val; + + // Baseline value derived from active_worst_quality and kf boost. + active_best_quality = + get_kf_active_quality(p_rc, active_worst_quality, bit_depth); + if (cpi->is_screen_content_type) { + active_best_quality /= 2; + } + + if (is_stat_consumption_stage_twopass(cpi) && + cpi->ppi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) { + active_best_quality /= 3; + } + + // Allow somewhat lower kf minq with small image formats. + if ((width * height) <= (352 * 288)) { + q_adj_factor -= 0.25; + } + + // Make a further adjustment based on the kf zero motion measure. + if (is_stat_consumption_stage_twopass(cpi)) + q_adj_factor += + 0.05 - (0.001 * (double)cpi->ppi->twopass.kf_zeromotion_pct); + + // Convert the adjustment factor to a qindex delta + // on active_best_quality. + q_val = av1_convert_qindex_to_q(active_best_quality, bit_depth); + active_best_quality += + av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth); + + // Tweak active_best_quality for AOM_Q mode when superres is on, as this + // will be used directly as 'q' later. + if (oxcf->rc_cfg.mode == AOM_Q && + (cpi->superres_mode == AOM_SUPERRES_QTHRESH || + cpi->superres_mode == AOM_SUPERRES_AUTO) && + cm->superres_scale_denominator != SCALE_NUMERATOR) { + active_best_quality = + AOMMAX(active_best_quality - + ((cm->superres_scale_denominator - SCALE_NUMERATOR) * + SUPERRES_QADJ_PER_DENOM_KEYFRAME), + 0); + } + } + *active_best = active_best_quality; + *active_worst = active_worst_quality; +} + +static void adjust_active_best_and_worst_quality(const AV1_COMP *cpi, + const int is_intrl_arf_boost, + int *active_worst, + int *active_best) { + const AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame; + int active_best_quality = *active_best; + int active_worst_quality = *active_worst; +#if CONFIG_FPMT_TEST + const int simulate_parallel_frame = + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 && + cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE; + int extend_minq = simulate_parallel_frame ? p_rc->temp_extend_minq + : cpi->ppi->twopass.extend_minq; + int extend_maxq = simulate_parallel_frame ? p_rc->temp_extend_maxq + : cpi->ppi->twopass.extend_maxq; +#endif + // Extension to max or min Q if undershoot or overshoot is outside + // the permitted range. 
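+ // extend_minq and extend_maxq are maintained by the two-pass code as a
+ // running record of recent undershoot/overshoot; here they widen the
+ // [active_best, active_worst] range, with leaf frames given the full
+ // extend_maxq on the worst-quality side and kf/gf/arf frames only half.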
+ if (cpi->oxcf.rc_cfg.mode != AOM_Q) { + if (frame_is_intra_only(cm) || + (!rc->is_src_frame_alt_ref && + (refresh_frame->golden_frame || is_intrl_arf_boost || + refresh_frame->alt_ref_frame))) { +#if CONFIG_FPMT_TEST + active_best_quality -= extend_minq; + active_worst_quality += (extend_maxq / 2); +#else + active_best_quality -= cpi->ppi->twopass.extend_minq / 4; + active_worst_quality += (cpi->ppi->twopass.extend_maxq / 2); +#endif + } else { +#if CONFIG_FPMT_TEST + active_best_quality -= extend_minq / 2; + active_worst_quality += extend_maxq; +#else + active_best_quality -= cpi->ppi->twopass.extend_minq / 4; + active_worst_quality += cpi->ppi->twopass.extend_maxq; +#endif + } + } + +#ifndef STRICT_RC + // Static forced key frames Q restrictions dealt with elsewhere. + if (!(frame_is_intra_only(cm)) || !p_rc->this_key_frame_forced || + (cpi->ppi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH)) { + const int qdelta = av1_frame_type_qdelta(cpi, active_worst_quality); + active_worst_quality = + AOMMAX(active_worst_quality + qdelta, active_best_quality); + } +#endif + + // Modify active_best_quality for downscaled normal frames. + if (av1_frame_scaled(cm) && !frame_is_kf_gf_arf(cpi)) { + int qdelta = av1_compute_qdelta_by_rate(cpi, cm->current_frame.frame_type, + active_best_quality, 2.0); + active_best_quality = + AOMMAX(active_best_quality + qdelta, rc->best_quality); + } + + active_best_quality = + clamp(active_best_quality, rc->best_quality, rc->worst_quality); + active_worst_quality = + clamp(active_worst_quality, active_best_quality, rc->worst_quality); + + *active_best = active_best_quality; + *active_worst = active_worst_quality; +} + +/*!\brief Gets a Q value to use for the current frame + * + * + * Selects a Q value from a permitted range that we estimate + * will result in approximately the target number of bits. + * + * \ingroup rate_control + * \param[in] cpi Top level encoder instance structure + * \param[in] width Width of frame + * \param[in] height Height of frame + * \param[in] active_worst_quality Max Q allowed + * \param[in] active_best_quality Min Q allowed + * + * \return The suggested Q for this frame. + */ +static int get_q(const AV1_COMP *cpi, const int width, const int height, + const int active_worst_quality, + const int active_best_quality) { + const AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + int q; +#if CONFIG_FPMT_TEST + const int simulate_parallel_frame = + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 && + cpi->ppi->fpmt_unit_test_cfg; + int last_boosted_qindex = simulate_parallel_frame + ? p_rc->temp_last_boosted_qindex + : p_rc->last_boosted_qindex; +#else + int last_boosted_qindex = p_rc->last_boosted_qindex; +#endif + + if (cpi->oxcf.rc_cfg.mode == AOM_Q || + (frame_is_intra_only(cm) && !p_rc->this_key_frame_forced && + cpi->ppi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH && + rc->frames_to_key > 1)) { + q = active_best_quality; + // Special case code to try and match quality with forced key frames. + } else if (frame_is_intra_only(cm) && p_rc->this_key_frame_forced) { + // If static since last kf use better of last boosted and last kf q. 
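+ // "Static" means the last kf group was at least STATIC_MOTION_THRESH
+ // (95) percent zero motion; reusing the smaller of the last kf q and
+ // the last boosted q avoids a visible quality step at the forced kf.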
+ if (cpi->ppi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
+ q = AOMMIN(p_rc->last_kf_qindex, last_boosted_qindex);
+ } else {
+ q = AOMMIN(last_boosted_qindex,
+ (active_best_quality + active_worst_quality) / 2);
+ }
+ q = clamp(q, active_best_quality, active_worst_quality);
+ } else {
+ q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
+ active_worst_quality, width, height);
+ if (q > active_worst_quality) {
+ // Special case when we are targeting the max allowed rate.
+ if (rc->this_frame_target < rc->max_frame_bandwidth) {
+ q = active_worst_quality;
+ }
+ }
+ q = AOMMAX(q, active_best_quality);
+ }
+ return q;
+}
+
+// Returns |active_best_quality| for an inter frame.
+// The |active_best_quality| depends on different rate control modes:
+// VBR, Q, CQ, CBR.
+// The returned active_best_quality may be further adjusted in
+// adjust_active_best_and_worst_quality().
+static int get_active_best_quality(const AV1_COMP *const cpi,
+ const int active_worst_quality,
+ const int cq_level, const int gf_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int bit_depth = cm->seq_params->bit_depth;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+ const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ const enum aom_rc_mode rc_mode = oxcf->rc_cfg.mode;
+ int *inter_minq;
+ ASSIGN_MINQ_TABLE(bit_depth, inter_minq);
+ int active_best_quality = 0;
+ const int is_intrl_arf_boost =
+ gf_group->update_type[gf_index] == INTNL_ARF_UPDATE;
+ int is_leaf_frame =
+ !(gf_group->update_type[gf_index] == ARF_UPDATE ||
+ gf_group->update_type[gf_index] == GF_UPDATE || is_intrl_arf_boost);
+
+ // TODO(jingning): Consider reworking this hack that covers issues incurred
+ // in the lightfield setting.
+ if (cm->tiles.large_scale) {
+ is_leaf_frame = !(refresh_frame->golden_frame ||
+ refresh_frame->alt_ref_frame || is_intrl_arf_boost);
+ }
+ const int is_overlay_frame = rc->is_src_frame_alt_ref;
+
+ if (is_leaf_frame || is_overlay_frame) {
+ if (rc_mode == AOM_Q) return cq_level;
+
+ active_best_quality = inter_minq[active_worst_quality];
+ // For the constrained quality mode we don't want
+ // q to fall below the cq level.
+ if ((rc_mode == AOM_CQ) && (active_best_quality < cq_level)) {
+ active_best_quality = cq_level;
+ }
+ return active_best_quality;
+ }
+
+ // Determine active_best_quality for frames that are not leaf or overlay.
+ int q = active_worst_quality;
+ // Use the lower of active_worst_quality and recent
+ // average Q as basis for GF/ARF best Q limit unless last frame was
+ // a key frame.
+ if (rc->frames_since_key > 1 &&
+ p_rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
+ q = p_rc->avg_frame_qindex[INTER_FRAME];
+ }
+ if (rc_mode == AOM_CQ && q < cq_level) q = cq_level;
+ active_best_quality = get_gf_active_quality(p_rc, q, bit_depth);
+ // Constrained quality uses a slightly lower active best.
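+ // (15/16 of the GF/ARF value, i.e. roughly a 6% lower q index.)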
+ if (rc_mode == AOM_CQ) active_best_quality = active_best_quality * 15 / 16; + const int min_boost = get_gf_high_motion_quality(q, bit_depth); + const int boost = min_boost - active_best_quality; + active_best_quality = min_boost - (int)(boost * p_rc->arf_boost_factor); + if (!is_intrl_arf_boost) return active_best_quality; + + if (rc_mode == AOM_Q || rc_mode == AOM_CQ) active_best_quality = p_rc->arf_q; + int this_height = gf_group_pyramid_level(gf_group, gf_index); + while (this_height > 1) { + active_best_quality = (active_best_quality + active_worst_quality + 1) / 2; + --this_height; + } + return active_best_quality; +} + +// Returns the q_index for a single frame in the GOP. +// This function assumes that rc_mode == AOM_Q mode. +int av1_q_mode_get_q_index(int base_q_index, int gf_update_type, + int gf_pyramid_level, int arf_q) { + const int is_intrl_arf_boost = gf_update_type == INTNL_ARF_UPDATE; + int is_leaf_or_overlay_frame = gf_update_type == LF_UPDATE || + gf_update_type == OVERLAY_UPDATE || + gf_update_type == INTNL_OVERLAY_UPDATE; + + if (is_leaf_or_overlay_frame) return base_q_index; + + if (!is_intrl_arf_boost) return arf_q; + + int active_best_quality = arf_q; + int active_worst_quality = base_q_index; + + while (gf_pyramid_level > 1) { + active_best_quality = (active_best_quality + active_worst_quality + 1) / 2; + --gf_pyramid_level; + } + return active_best_quality; +} + +// Returns the q_index for the ARF in the GOP. +int av1_get_arf_q_index(int base_q_index, int gfu_boost, int bit_depth, + double arf_boost_factor) { + int active_best_quality = + get_gf_active_quality_no_rc(gfu_boost, base_q_index, bit_depth); + const int min_boost = get_gf_high_motion_quality(base_q_index, bit_depth); + const int boost = min_boost - active_best_quality; + return min_boost - (int)(boost * arf_boost_factor); +} + +static int rc_pick_q_and_bounds_q_mode(const AV1_COMP *cpi, int width, + int height, int gf_index, + int *bottom_index, int *top_index) { + const AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + const int cq_level = + get_active_cq_level(rc, p_rc, oxcf, frame_is_intra_only(cm), + cpi->superres_mode, cm->superres_scale_denominator); + int active_best_quality = 0; + int active_worst_quality = rc->active_worst_quality; + int q; + + if (frame_is_intra_only(cm)) { + get_intra_q_and_bounds(cpi, width, height, &active_best_quality, + &active_worst_quality, cq_level); + } else { + // Active best quality limited by previous layer. + active_best_quality = + get_active_best_quality(cpi, active_worst_quality, cq_level, gf_index); + } + + if (cq_level > 0) active_best_quality = AOMMAX(1, active_best_quality); + + *top_index = active_worst_quality; + *bottom_index = active_best_quality; + + *top_index = AOMMAX(*top_index, rc->best_quality); + *top_index = AOMMIN(*top_index, rc->worst_quality); + + *bottom_index = AOMMAX(*bottom_index, rc->best_quality); + *bottom_index = AOMMIN(*bottom_index, rc->worst_quality); + + q = active_best_quality; + + q = AOMMAX(q, rc->best_quality); + q = AOMMIN(q, rc->worst_quality); + + assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality); + assert(*bottom_index <= rc->worst_quality && + *bottom_index >= rc->best_quality); + assert(q <= rc->worst_quality && q >= rc->best_quality); + + return q; +} + +/*!\brief Picks q and q bounds given rate control parameters in \c cpi->rc. 
+ *
+ * Handles the general cases not covered by
+ * \ref rc_pick_q_and_bounds_no_stats_cbr() and
+ * \ref rc_pick_q_and_bounds_no_stats()
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder structure
+ * \param[in] width Coded frame width
+ * \param[in] height Coded frame height
+ * \param[in] gf_index Index of this frame in the golden frame group
+ * \param[out] bottom_index Bottom bound for q index (best quality)
+ * \param[out] top_index Top bound for q index (worst quality)
+ * \return Returns selected q index to be used for encoding this frame.
+ */
+static int rc_pick_q_and_bounds(const AV1_COMP *cpi, int width, int height,
+ int gf_index, int *bottom_index,
+ int *top_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+ const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ assert(IMPLIES(has_no_stats_stage(cpi),
+ cpi->oxcf.rc_cfg.mode == AOM_Q &&
+ gf_group->update_type[gf_index] != ARF_UPDATE));
+ const int cq_level =
+ get_active_cq_level(rc, p_rc, oxcf, frame_is_intra_only(cm),
+ cpi->superres_mode, cm->superres_scale_denominator);
+
+ if (oxcf->rc_cfg.mode == AOM_Q) {
+ return rc_pick_q_and_bounds_q_mode(cpi, width, height, gf_index,
+ bottom_index, top_index);
+ }
+
+ int active_best_quality = 0;
+ int active_worst_quality = rc->active_worst_quality;
+ int q;
+
+ const int is_intrl_arf_boost =
+ gf_group->update_type[gf_index] == INTNL_ARF_UPDATE;
+
+ if (frame_is_intra_only(cm)) {
+ get_intra_q_and_bounds(cpi, width, height, &active_best_quality,
+ &active_worst_quality, cq_level);
+#ifdef STRICT_RC
+ active_best_quality = 0;
+#endif
+ } else {
+ // Active best quality limited by previous layer.
+ const int pyramid_level = gf_group_pyramid_level(gf_group, gf_index);
+
+ if ((pyramid_level <= 1) || (pyramid_level > MAX_ARF_LAYERS)) {
+ active_best_quality = get_active_best_quality(cpi, active_worst_quality,
+ cq_level, gf_index);
+ } else {
+#if CONFIG_FPMT_TEST
+ const int simulate_parallel_frame =
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+ int local_active_best_quality =
+ simulate_parallel_frame
+ ? p_rc->temp_active_best_quality[pyramid_level - 1]
+ : p_rc->active_best_quality[pyramid_level - 1];
+ active_best_quality = local_active_best_quality + 1;
+#else
+ active_best_quality = p_rc->active_best_quality[pyramid_level - 1] + 1;
+#endif
+
+ active_best_quality = AOMMIN(active_best_quality, active_worst_quality);
+#ifdef STRICT_RC
+ active_best_quality += (active_worst_quality - active_best_quality) / 16;
+#else
+ active_best_quality += (active_worst_quality - active_best_quality) / 2;
+#endif
+ }
+
+ // For alt_ref and GF frames (including internal arf frames) adjust the
+ // worst allowed quality as well. This ensures that even on hard
+ // sections we don't clamp the Q at the same value for arf frames and
+ // leaf (non arf) frames. This is important to the TPL model which assumes
+ // Q drops with each arf level.
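+ // The blend below pulls active_worst_quality a quarter of the way
+ // towards active_best_quality: worst = (best + 3 * worst + 2) / 4,
+ // rounded to nearest.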
+ if (!(rc->is_src_frame_alt_ref) && + (refresh_frame->golden_frame || refresh_frame->alt_ref_frame || + is_intrl_arf_boost)) { + active_worst_quality = + (active_best_quality + (3 * active_worst_quality) + 2) / 4; + } + } + + adjust_active_best_and_worst_quality( + cpi, is_intrl_arf_boost, &active_worst_quality, &active_best_quality); + q = get_q(cpi, width, height, active_worst_quality, active_best_quality); + + // Special case when we are targeting the max allowed rate. + if (rc->this_frame_target >= rc->max_frame_bandwidth && + q > active_worst_quality) { + active_worst_quality = q; + } + + *top_index = active_worst_quality; + *bottom_index = active_best_quality; + + assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality); + assert(*bottom_index <= rc->worst_quality && + *bottom_index >= rc->best_quality); + assert(q <= rc->worst_quality && q >= rc->best_quality); + + return q; +} + +static void rc_compute_variance_onepass_rt(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + YV12_BUFFER_CONFIG const *const unscaled_src = cpi->unscaled_source; + if (unscaled_src == NULL) return; + + const uint8_t *src_y = unscaled_src->y_buffer; + const int src_ystride = unscaled_src->y_stride; + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME); + const uint8_t *pre_y = yv12->buffers[0]; + const int pre_ystride = yv12->strides[0]; + + // TODO(yunqing): support scaled reference frames. + if (cpi->scaled_ref_buf[LAST_FRAME - 1]) return; + + for (int i = 0; i < 2; ++i) { + if (unscaled_src->widths[i] != yv12->widths[i] || + unscaled_src->heights[i] != yv12->heights[i]) { + return; + } + } + + const int num_mi_cols = cm->mi_params.mi_cols; + const int num_mi_rows = cm->mi_params.mi_rows; + const BLOCK_SIZE bsize = BLOCK_64X64; + int num_samples = 0; + // sse is computed on 64x64 blocks + const int sb_size_by_mb = (cm->seq_params->sb_size == BLOCK_128X128) + ? (cm->seq_params->mib_size >> 1) + : cm->seq_params->mib_size; + const int sb_cols = (num_mi_cols + sb_size_by_mb - 1) / sb_size_by_mb; + const int sb_rows = (num_mi_rows + sb_size_by_mb - 1) / sb_size_by_mb; + + uint64_t fsse = 0; + cpi->rec_sse = 0; + + for (int sbi_row = 0; sbi_row < sb_rows; ++sbi_row) { + for (int sbi_col = 0; sbi_col < sb_cols; ++sbi_col) { + unsigned int sse; + uint8_t src[64 * 64] = { 0 }; + // Apply 4x4 block averaging/denoising on source frame. + for (int i = 0; i < 64; i += 4) { + for (int j = 0; j < 64; j += 4) { + const unsigned int avg = + aom_avg_4x4(src_y + i * src_ystride + j, src_ystride); + + for (int m = 0; m < 4; ++m) { + for (int n = 0; n < 4; ++n) src[i * 64 + j + m * 64 + n] = avg; + } + } + } + + cpi->ppi->fn_ptr[bsize].vf(src, 64, pre_y, pre_ystride, &sse); + fsse += sse; + num_samples++; + src_y += 64; + pre_y += 64; + } + src_y += (src_ystride << 6) - (sb_cols << 6); + pre_y += (pre_ystride << 6) - (sb_cols << 6); + } + assert(num_samples > 0); + // Ensure rec_sse > 0 + if (num_samples > 0) cpi->rec_sse = fsse > 0 ? fsse : 1; +} + +int av1_rc_pick_q_and_bounds(AV1_COMP *cpi, int width, int height, int gf_index, + int *bottom_index, int *top_index) { + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + int q; + // TODO(sarahparker) merge no-stats vbr and altref q computation + // with rc_pick_q_and_bounds(). 
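+ // Dispatch summary: with no first-pass stats, non-AOM_Q modes (and ARF
+ // updates even in AOM_Q) use the dedicated no-stats CBR / CQ / VBR
+ // pickers below; every other case goes through rc_pick_q_and_bounds().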
+ const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ if ((cpi->oxcf.rc_cfg.mode != AOM_Q ||
+ gf_group->update_type[gf_index] == ARF_UPDATE) &&
+ has_no_stats_stage(cpi)) {
+ if (cpi->oxcf.rc_cfg.mode == AOM_CBR) {
+ // TODO(yunqing): the results could be used for encoder optimization.
+ cpi->rec_sse = UINT64_MAX;
+ if (cpi->sf.hl_sf.accurate_bit_estimate &&
+ cpi->common.current_frame.frame_type != KEY_FRAME)
+ rc_compute_variance_onepass_rt(cpi);
+
+ q = rc_pick_q_and_bounds_no_stats_cbr(cpi, width, height, bottom_index,
+ top_index);
+ // preserve copy of active worst quality selected.
+ cpi->rc.active_worst_quality = *top_index;
+
+#if USE_UNRESTRICTED_Q_IN_CQ_MODE
+ } else if (cpi->oxcf.rc_cfg.mode == AOM_CQ) {
+ q = rc_pick_q_and_bounds_no_stats_cq(cpi, width, height, bottom_index,
+ top_index);
+#endif // USE_UNRESTRICTED_Q_IN_CQ_MODE
+ } else {
+ q = rc_pick_q_and_bounds_no_stats(cpi, width, height, bottom_index,
+ top_index);
+ }
+ } else {
+ q = rc_pick_q_and_bounds(cpi, width, height, gf_index, bottom_index,
+ top_index);
+ }
+ if (gf_group->update_type[gf_index] == ARF_UPDATE) p_rc->arf_q = q;
+
+ return q;
+}
+
+void av1_rc_compute_frame_size_bounds(const AV1_COMP *cpi, int frame_target,
+ int *frame_under_shoot_limit,
+ int *frame_over_shoot_limit) {
+ if (cpi->oxcf.rc_cfg.mode == AOM_Q) {
+ *frame_under_shoot_limit = 0;
+ *frame_over_shoot_limit = INT_MAX;
+ } else {
+ // For very small rate targets where the fractional adjustment
+ // may be tiny make sure there is at least a minimum range.
+ assert(cpi->sf.hl_sf.recode_tolerance <= 100);
+ const int tolerance = (int)AOMMAX(
+ 100, ((int64_t)cpi->sf.hl_sf.recode_tolerance * frame_target) / 100);
+ *frame_under_shoot_limit = AOMMAX(frame_target - tolerance, 0);
+ *frame_over_shoot_limit =
+ AOMMIN(frame_target + tolerance, cpi->rc.max_frame_bandwidth);
+ }
+}
+
+void av1_rc_set_frame_target(AV1_COMP *cpi, int target, int width, int height) {
+ const AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+
+ rc->this_frame_target = target;
+
+ // Modify frame size target when down-scaled.
+ if (av1_frame_scaled(cm) && cpi->oxcf.rc_cfg.mode != AOM_CBR) {
+ rc->this_frame_target =
+ (int)(rc->this_frame_target *
+ resize_rate_factor(&cpi->oxcf.frm_dim_cfg, width, height));
+ }
+
+ // Target rate per SB64 (including partial SB64s).
+ rc->sb64_target_rate =
+ (int)(((int64_t)rc->this_frame_target << 12) / (width * height));
+}
+
+static void update_alt_ref_frame_stats(AV1_COMP *cpi) {
+ // A refresh by this frame means subsequent frames don't refresh, unless
+ // specified by the user.
+ RATE_CONTROL *const rc = &cpi->rc;
+ rc->frames_since_golden = 0;
+}
+
+static void update_golden_frame_stats(AV1_COMP *cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+
+ // Update the Golden frame usage counts.
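+ // frames_since_golden is reset whenever the golden frame is refreshed
+ // (or an alt-ref overlay stands in for it) and otherwise counts shown
+ // frames since the last refresh.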
+ if (cpi->refresh_frame.golden_frame || rc->is_src_frame_alt_ref) { + rc->frames_since_golden = 0; + } else if (cpi->common.show_frame) { + rc->frames_since_golden++; + } +} + +void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) { + const AV1_COMMON *const cm = &cpi->common; + const CurrentFrame *const current_frame = &cm->current_frame; + RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame; + + const int is_intrnl_arf = + gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE; + + const int qindex = cm->quant_params.base_qindex; + +#if RT_PASSIVE_STRATEGY + const int frame_number = current_frame->frame_number % MAX_Q_HISTORY; + p_rc->q_history[frame_number] = qindex; +#endif // RT_PASSIVE_STRATEGY + + // Update rate control heuristics + rc->projected_frame_size = (int)(bytes_used << 3); + + // Post encode loop adjustment of Q prediction. + av1_rc_update_rate_correction_factors(cpi, 0, cm->width, cm->height); + + // Update bit estimation ratio. + if (cpi->oxcf.rc_cfg.mode == AOM_CBR && + cm->current_frame.frame_type != KEY_FRAME && + cpi->sf.hl_sf.accurate_bit_estimate) { + const double q = av1_convert_qindex_to_q(cm->quant_params.base_qindex, + cm->seq_params->bit_depth); + const int this_bit_est_ratio = + (int)(rc->projected_frame_size * q / sqrt((double)cpi->rec_sse)); + cpi->rc.bit_est_ratio = + cpi->rc.bit_est_ratio == 0 + ? this_bit_est_ratio + : (7 * cpi->rc.bit_est_ratio + this_bit_est_ratio) / 8; + } + + // Keep a record of last Q and ambient average Q. + if (current_frame->frame_type == KEY_FRAME) { + p_rc->last_q[KEY_FRAME] = qindex; + p_rc->avg_frame_qindex[KEY_FRAME] = + ROUND_POWER_OF_TWO(3 * p_rc->avg_frame_qindex[KEY_FRAME] + qindex, 2); + } else { + if ((cpi->ppi->use_svc && cpi->oxcf.rc_cfg.mode == AOM_CBR) || + cpi->rc.rtc_external_ratectrl || + (!rc->is_src_frame_alt_ref && + !(refresh_frame->golden_frame || is_intrnl_arf || + refresh_frame->alt_ref_frame))) { + p_rc->last_q[INTER_FRAME] = qindex; + p_rc->avg_frame_qindex[INTER_FRAME] = ROUND_POWER_OF_TWO( + 3 * p_rc->avg_frame_qindex[INTER_FRAME] + qindex, 2); + p_rc->ni_frames++; + p_rc->tot_q += av1_convert_qindex_to_q(qindex, cm->seq_params->bit_depth); + p_rc->avg_q = p_rc->tot_q / p_rc->ni_frames; + // Calculate the average Q for normal inter frames (not key or GFU + // frames). + rc->ni_tot_qi += qindex; + rc->ni_av_qi = rc->ni_tot_qi / p_rc->ni_frames; + } + } + // Keep record of last boosted (KF/GF/ARF) Q value. + // If the current frame is coded at a lower Q then we also update it. + // If all mbs in this group are skipped only update if the Q value is + // better than that already stored. + // This is used to help set quality in forced key frames to reduce popping + if ((qindex < p_rc->last_boosted_qindex) || + (current_frame->frame_type == KEY_FRAME) || + (!p_rc->constrained_gf_group && + (refresh_frame->alt_ref_frame || is_intrnl_arf || + (refresh_frame->golden_frame && !rc->is_src_frame_alt_ref)))) { + p_rc->last_boosted_qindex = qindex; + } + if (current_frame->frame_type == KEY_FRAME) p_rc->last_kf_qindex = qindex; + + update_buffer_level(cpi, rc->projected_frame_size); + rc->prev_avg_frame_bandwidth = rc->avg_frame_bandwidth; + + // Rolling monitors of whether we are over or underspending used to help + // regulate min and Max Q in two pass. 
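+ // Both monitors below are exponential moving averages with a 3/4 decay:
+ // rolling = (3 * rolling + new_value + 2) >> 2.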
+ if (av1_frame_scaled(cm))
+ rc->this_frame_target = (int)(rc->this_frame_target /
+ resize_rate_factor(&cpi->oxcf.frm_dim_cfg,
+ cm->width, cm->height));
+ if (current_frame->frame_type != KEY_FRAME) {
+ p_rc->rolling_target_bits = (int)ROUND_POWER_OF_TWO_64(
+ p_rc->rolling_target_bits * 3 + rc->this_frame_target, 2);
+ p_rc->rolling_actual_bits = (int)ROUND_POWER_OF_TWO_64(
+ p_rc->rolling_actual_bits * 3 + rc->projected_frame_size, 2);
+ }
+
+ // Actual bits spent
+ p_rc->total_actual_bits += rc->projected_frame_size;
+ p_rc->total_target_bits += cm->show_frame ? rc->avg_frame_bandwidth : 0;
+
+ if (is_altref_enabled(cpi->oxcf.gf_cfg.lag_in_frames,
+ cpi->oxcf.gf_cfg.enable_auto_arf) &&
+ refresh_frame->alt_ref_frame &&
+ (current_frame->frame_type != KEY_FRAME && !frame_is_sframe(cm)))
+ // Update the alternate reference frame stats as appropriate.
+ update_alt_ref_frame_stats(cpi);
+ else
+ // Update the Golden frame stats as appropriate.
+ update_golden_frame_stats(cpi);
+
+#if CONFIG_FPMT_TEST
+ /* The variables temp_avg_frame_qindex, temp_last_q, temp_avg_q and
+ * temp_last_boosted_qindex are introduced only for quality simulation
+ * purposes; they retain the values from before the parallel encode frames.
+ * The variables are updated based on the update flag.
+ *
+ * If show_existing_frames occur between parallel frames, the temp state is
+ * left unchanged so that it is retained. */
+ int show_existing_between_parallel_frames =
+ (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] ==
+ INTNL_OVERLAY_UPDATE &&
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index + 1] == 2);
+
+ if (cpi->do_frame_data_update && !show_existing_between_parallel_frames &&
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+ for (int i = 0; i < FRAME_TYPES; i++) {
+ p_rc->temp_last_q[i] = p_rc->last_q[i];
+ }
+ p_rc->temp_avg_q = p_rc->avg_q;
+ p_rc->temp_last_boosted_qindex = p_rc->last_boosted_qindex;
+ p_rc->temp_total_actual_bits = p_rc->total_actual_bits;
+ p_rc->temp_projected_frame_size = rc->projected_frame_size;
+ for (int i = 0; i < RATE_FACTOR_LEVELS; i++)
+ p_rc->temp_rate_correction_factors[i] = p_rc->rate_correction_factors[i];
+ }
+#endif
+ if (current_frame->frame_type == KEY_FRAME) rc->frames_since_key = 0;
+ if (cpi->refresh_frame.golden_frame)
+ rc->frame_num_last_gf_refresh = current_frame->frame_number;
+ rc->prev_coded_width = cm->width;
+ rc->prev_coded_height = cm->height;
+ rc->frame_number_encoded++;
+ rc->prev_frame_is_dropped = 0;
+ rc->drop_count_consec = 0;
+}
+
+void av1_rc_postencode_update_drop_frame(AV1_COMP *cpi) {
+ // Update buffer level with zero size, update frame counters, and return.
+ update_buffer_level(cpi, 0);
+ if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) {
+ cpi->rc.frames_since_key++;
+ cpi->rc.frames_to_key--;
+ }
+ cpi->rc.rc_2_frame = 0;
+ cpi->rc.rc_1_frame = 0;
+ cpi->rc.prev_avg_frame_bandwidth = cpi->rc.avg_frame_bandwidth;
+ cpi->rc.prev_coded_width = cpi->common.width;
+ cpi->rc.prev_coded_height = cpi->common.height;
+ cpi->rc.prev_frame_is_dropped = 1;
+ // On a scene/slide change for dropped frame: reset the avg_source_sad to 0,
+ // otherwise the avg_source_sad can get too large and subsequent frames
+ // may miss the scene/slide detection.
+ if (cpi->rc.high_source_sad) cpi->rc.avg_source_sad = 0; + if (cpi->ppi->use_svc && cpi->svc.number_spatial_layers > 1) { + cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = true; + cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id] = true; + } +} + +int av1_find_qindex(double desired_q, aom_bit_depth_t bit_depth, + int best_qindex, int worst_qindex) { + assert(best_qindex <= worst_qindex); + int low = best_qindex; + int high = worst_qindex; + while (low < high) { + const int mid = (low + high) >> 1; + const double mid_q = av1_convert_qindex_to_q(mid, bit_depth); + if (mid_q < desired_q) { + low = mid + 1; + } else { + high = mid; + } + } + assert(low == high); + assert(av1_convert_qindex_to_q(low, bit_depth) >= desired_q || + low == worst_qindex); + return low; +} + +int av1_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget, + aom_bit_depth_t bit_depth) { + const int start_index = + av1_find_qindex(qstart, bit_depth, rc->best_quality, rc->worst_quality); + const int target_index = + av1_find_qindex(qtarget, bit_depth, rc->best_quality, rc->worst_quality); + return target_index - start_index; +} + +// Find q_index for the desired_bits_per_mb, within [best_qindex, worst_qindex], +// assuming 'correction_factor' is 1.0. +// To be precise, 'q_index' is the smallest integer, for which the corresponding +// bits per mb <= desired_bits_per_mb. +// If no such q index is found, returns 'worst_qindex'. +static int find_qindex_by_rate(const AV1_COMP *const cpi, + int desired_bits_per_mb, FRAME_TYPE frame_type, + int best_qindex, int worst_qindex) { + assert(best_qindex <= worst_qindex); + int low = best_qindex; + int high = worst_qindex; + while (low < high) { + const int mid = (low + high) >> 1; + const int mid_bits_per_mb = + av1_rc_bits_per_mb(cpi, frame_type, mid, 1.0, 0); + if (mid_bits_per_mb > desired_bits_per_mb) { + low = mid + 1; + } else { + high = mid; + } + } + assert(low == high); + assert(av1_rc_bits_per_mb(cpi, frame_type, low, 1.0, 0) <= + desired_bits_per_mb || + low == worst_qindex); + return low; +} + +int av1_compute_qdelta_by_rate(const AV1_COMP *cpi, FRAME_TYPE frame_type, + int qindex, double rate_target_ratio) { + const RATE_CONTROL *rc = &cpi->rc; + + // Look up the current projected bits per block for the base index + const int base_bits_per_mb = + av1_rc_bits_per_mb(cpi, frame_type, qindex, 1.0, 0); + + // Find the target bits per mb based on the base value and given ratio. 
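+ // E.g. a rate_target_ratio of 2.0 requests the q index whose estimated
+ // per-mb rate is double that of 'qindex'; since more bits map to a
+ // lower q index, the returned delta is then negative.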
+ const int target_bits_per_mb = (int)(rate_target_ratio * base_bits_per_mb);
+
+ const int target_index = find_qindex_by_rate(
+ cpi, target_bits_per_mb, frame_type, rc->best_quality, rc->worst_quality);
+ return target_index - qindex;
+}
+
+void av1_rc_set_gf_interval_range(const AV1_COMP *const cpi,
+ RATE_CONTROL *const rc) {
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+
+ // Special case code for 1 pass fixed Q mode tests
+ if ((has_no_stats_stage(cpi)) && (oxcf->rc_cfg.mode == AOM_Q)) {
+ rc->max_gf_interval = oxcf->gf_cfg.max_gf_interval;
+ rc->min_gf_interval = oxcf->gf_cfg.min_gf_interval;
+ rc->static_scene_max_gf_interval = rc->min_gf_interval + 1;
+ } else {
+ // Set Maximum gf/arf interval
+ rc->max_gf_interval = oxcf->gf_cfg.max_gf_interval;
+ rc->min_gf_interval = oxcf->gf_cfg.min_gf_interval;
+ if (rc->min_gf_interval == 0)
+ rc->min_gf_interval = av1_rc_get_default_min_gf_interval(
+ oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height, cpi->framerate);
+ if (rc->max_gf_interval == 0)
+ rc->max_gf_interval = av1_rc_get_default_max_gf_interval(
+ cpi->framerate, rc->min_gf_interval);
+ /*
+ * Extended max interval for genuinely static scenes like slide shows.
+ * The number of stats available in the case of LAP is limited,
+ * hence static_scene_max_gf_interval is set to max_gf_interval + 1.
+ */
+ if (cpi->ppi->lap_enabled)
+ rc->static_scene_max_gf_interval = rc->max_gf_interval + 1;
+ else
+ rc->static_scene_max_gf_interval = MAX_STATIC_GF_GROUP_LENGTH;
+
+ if (rc->max_gf_interval > rc->static_scene_max_gf_interval)
+ rc->max_gf_interval = rc->static_scene_max_gf_interval;
+
+ // Clamp min to max
+ rc->min_gf_interval = AOMMIN(rc->min_gf_interval, rc->max_gf_interval);
+ }
+}
+
+void av1_rc_update_framerate(AV1_COMP *cpi, int width, int height) {
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ RATE_CONTROL *const rc = &cpi->rc;
+ int vbr_max_bits;
+ const int MBs = av1_get_MBs(width, height);
+
+ rc->avg_frame_bandwidth =
+ (int)round(oxcf->rc_cfg.target_bandwidth / cpi->framerate);
+ rc->min_frame_bandwidth =
+ (int)(rc->avg_frame_bandwidth * oxcf->rc_cfg.vbrmin_section / 100);
+
+ rc->min_frame_bandwidth =
+ AOMMAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS);
+
+ // A maximum bitrate for a frame is defined.
+ // The baseline for this aligns with HW implementations that
+ // can support decode of 1080P content up to a bitrate of MAX_MB_RATE bits
+ // per 16x16 MB (averaged over a frame). However this limit is extended if
+ // a very high rate is given on the command line or the rate cannot
+ // be achieved because of a user-specified max q (e.g. when the user
+ // specifies a lossless encode).
+ vbr_max_bits =
+ (int)(((int64_t)rc->avg_frame_bandwidth * oxcf->rc_cfg.vbrmax_section) /
+ 100);
+ rc->max_frame_bandwidth =
+ AOMMAX(AOMMAX((MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits);
+
+ av1_rc_set_gf_interval_range(cpi, rc);
+}
+
+#define VBR_PCT_ADJUSTMENT_LIMIT 50
+// For VBR: adjustment to the frame target based on error from previous frames
+static void vbr_rate_correction(AV1_COMP *cpi, int *this_frame_target) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+#if CONFIG_FPMT_TEST
+ const int simulate_parallel_frame =
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+ int64_t vbr_bits_off_target = simulate_parallel_frame
+ ?
+                                    : p_rc->vbr_bits_off_target;
+#else
+  int64_t vbr_bits_off_target = p_rc->vbr_bits_off_target;
+#endif
+  const int stats_count =
+      cpi->ppi->twopass.stats_buf_ctx->total_stats != NULL
+          ? (int)cpi->ppi->twopass.stats_buf_ctx->total_stats->count
+          : 0;
+  const int frame_window = AOMMIN(
+      16, (int)(stats_count - (int)cpi->common.current_frame.frame_number));
+  assert(VBR_PCT_ADJUSTMENT_LIMIT <= 100);
+  if (frame_window > 0) {
+    const int max_delta = (int)AOMMIN(
+        abs((int)(vbr_bits_off_target / frame_window)),
+        ((int64_t)(*this_frame_target) * VBR_PCT_ADJUSTMENT_LIMIT) / 100);
+
+    // vbr_bits_off_target > 0 means we have extra bits to spend
+    // vbr_bits_off_target < 0 means we are currently overshooting
+    *this_frame_target += (vbr_bits_off_target >= 0) ? max_delta : -max_delta;
+  }
+
+#if CONFIG_FPMT_TEST
+  int64_t vbr_bits_off_target_fast =
+      simulate_parallel_frame ? cpi->ppi->p_rc.temp_vbr_bits_off_target_fast
+                              : p_rc->vbr_bits_off_target_fast;
+#endif
+  // Fast redistribution of bits arising from massive local undershoot.
+  // Don't do it for kf, arf, gf or overlay frames.
+  if (!frame_is_kf_gf_arf(cpi) &&
+#if CONFIG_FPMT_TEST
+      vbr_bits_off_target_fast &&
+#else
+      p_rc->vbr_bits_off_target_fast &&
+#endif
+      !rc->is_src_frame_alt_ref) {
+    int one_frame_bits = AOMMAX(rc->avg_frame_bandwidth, *this_frame_target);
+    int fast_extra_bits;
+#if CONFIG_FPMT_TEST
+    fast_extra_bits = (int)AOMMIN(vbr_bits_off_target_fast, one_frame_bits);
+    fast_extra_bits =
+        (int)AOMMIN(fast_extra_bits,
+                    AOMMAX(one_frame_bits / 8, vbr_bits_off_target_fast / 8));
+#else
+    fast_extra_bits =
+        (int)AOMMIN(p_rc->vbr_bits_off_target_fast, one_frame_bits);
+    fast_extra_bits = (int)AOMMIN(
+        fast_extra_bits,
+        AOMMAX(one_frame_bits / 8, p_rc->vbr_bits_off_target_fast / 8));
+#endif
+    if (fast_extra_bits > 0) {
+      // Update this_frame_target only if additional bits are available from
+      // local undershoot.
+      *this_frame_target += (int)fast_extra_bits;
+    }
+    // Store the fast_extra_bits of the frame and reduce it from
+    // vbr_bits_off_target_fast during postencode stage.
+    rc->frame_level_fast_extra_bits = fast_extra_bits;
+    // Retaining the condition to update during postencode stage since
+    // fast_extra_bits are calculated based on vbr_bits_off_target_fast.
+    cpi->do_update_vbr_bits_off_target_fast = 1;
+  }
+}
+
+void av1_set_target_rate(AV1_COMP *cpi, int width, int height) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  int target_rate = rc->base_frame_target;
+
+  // Correction to rate target based on prior over or under shoot.
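+  // A minimal worked example of the correction (hypothetical numbers): with
+  // vbr_bits_off_target = 160000 and a 16-frame window, vbr_rate_correction()
+  // computes max_delta = min(160000 / 16, 50% of the target); for a
+  // 40000-bit target that is 10000 extra bits, so the target becomes 50000.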
+ if (cpi->oxcf.rc_cfg.mode == AOM_VBR || cpi->oxcf.rc_cfg.mode == AOM_CQ) + vbr_rate_correction(cpi, &target_rate); + av1_rc_set_frame_target(cpi, target_rate, width, height); +} + +int av1_calc_pframe_target_size_one_pass_vbr( + const AV1_COMP *const cpi, FRAME_UPDATE_TYPE frame_update_type) { + static const int af_ratio = 10; + const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + int64_t target; +#if USE_ALTREF_FOR_ONE_PASS + if (frame_update_type == KF_UPDATE || frame_update_type == GF_UPDATE || + frame_update_type == ARF_UPDATE) { + target = ((int64_t)rc->avg_frame_bandwidth * p_rc->baseline_gf_interval * + af_ratio) / + (p_rc->baseline_gf_interval + af_ratio - 1); + } else { + target = ((int64_t)rc->avg_frame_bandwidth * p_rc->baseline_gf_interval) / + (p_rc->baseline_gf_interval + af_ratio - 1); + } + if (target > INT_MAX) target = INT_MAX; +#else + target = rc->avg_frame_bandwidth; +#endif + return av1_rc_clamp_pframe_target_size(cpi, (int)target, frame_update_type); +} + +int av1_calc_iframe_target_size_one_pass_vbr(const AV1_COMP *const cpi) { + static const int kf_ratio = 25; + const RATE_CONTROL *rc = &cpi->rc; + const int64_t target = (int64_t)rc->avg_frame_bandwidth * kf_ratio; + return av1_rc_clamp_iframe_target_size(cpi, target); +} + +int av1_calc_pframe_target_size_one_pass_cbr( + const AV1_COMP *cpi, FRAME_UPDATE_TYPE frame_update_type) { + const AV1EncoderConfig *oxcf = &cpi->oxcf; + const RATE_CONTROL *rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc; + const RateControlCfg *rc_cfg = &oxcf->rc_cfg; + const int64_t diff = p_rc->optimal_buffer_level - p_rc->buffer_level; + const int64_t one_pct_bits = 1 + p_rc->optimal_buffer_level / 100; + int min_frame_target = + AOMMAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS); + int target; + + if (rc_cfg->gf_cbr_boost_pct) { + const int af_ratio_pct = rc_cfg->gf_cbr_boost_pct + 100; + if (frame_update_type == GF_UPDATE || frame_update_type == OVERLAY_UPDATE) { + target = (rc->avg_frame_bandwidth * p_rc->baseline_gf_interval * + af_ratio_pct) / + (p_rc->baseline_gf_interval * 100 + af_ratio_pct - 100); + } else { + target = (rc->avg_frame_bandwidth * p_rc->baseline_gf_interval * 100) / + (p_rc->baseline_gf_interval * 100 + af_ratio_pct - 100); + } + } else { + target = rc->avg_frame_bandwidth; + } + if (cpi->ppi->use_svc) { + // Note that for layers, avg_frame_bandwidth is the cumulative + // per-frame-bandwidth. For the target size of this frame, use the + // layer average frame size (i.e., non-cumulative per-frame-bw). + int layer = + LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, cpi->svc.temporal_layer_id, + cpi->svc.number_temporal_layers); + const LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; + target = lc->avg_frame_size; + min_frame_target = AOMMAX(lc->avg_frame_size >> 4, FRAME_OVERHEAD_BITS); + } + if (diff > 0) { + // Lower the target bandwidth for this frame. + const int pct_low = + (int)AOMMIN(diff / one_pct_bits, rc_cfg->under_shoot_pct); + target -= (target * pct_low) / 200; + } else if (diff < 0) { + // Increase the target bandwidth for this frame. 
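+    // Illustrative numbers (not from the encoder): if the buffer sits 3%
+    // above the optimal level and over_shoot_pct is 25, pct_high below is 3
+    // and the target grows by 3/200, i.e. 1.5%.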
+ const int pct_high = + (int)AOMMIN(-diff / one_pct_bits, rc_cfg->over_shoot_pct); + target += (target * pct_high) / 200; + } + if (rc_cfg->max_inter_bitrate_pct) { + const int max_rate = + rc->avg_frame_bandwidth * rc_cfg->max_inter_bitrate_pct / 100; + target = AOMMIN(target, max_rate); + } + return AOMMAX(min_frame_target, target); +} + +int av1_calc_iframe_target_size_one_pass_cbr(const AV1_COMP *cpi) { + const RATE_CONTROL *rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc; + int64_t target; + if (cpi->common.current_frame.frame_number == 0) { + target = ((p_rc->starting_buffer_level / 2) > INT_MAX) + ? INT_MAX + : (int)(p_rc->starting_buffer_level / 2); + if (cpi->svc.number_temporal_layers > 1 && target < (INT_MAX >> 2)) { + target = target << AOMMIN(2, (cpi->svc.number_temporal_layers - 1)); + } + } else { + int kf_boost = 32; + int framerate = (int)round(cpi->framerate); + + kf_boost = AOMMAX(kf_boost, (int)(2 * framerate - 16)); + if (rc->frames_since_key < framerate / 2) { + kf_boost = (int)(kf_boost * rc->frames_since_key / (framerate / 2)); + } + target = ((16 + kf_boost) * rc->avg_frame_bandwidth) >> 4; + } + return av1_rc_clamp_iframe_target_size(cpi, target); +} + +static void set_golden_update(AV1_COMP *const cpi) { + RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + int divisor = 10; + if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ) + divisor = cpi->cyclic_refresh->percent_refresh; + + // Set minimum gf_interval for GF update to a multiple of the refresh period, + // with some max limit. Depending on past encoding stats, GF flag may be + // reset and update may not occur until next baseline_gf_interval. + const int gf_length_mult[2] = { 8, 4 }; + if (divisor > 0) + p_rc->baseline_gf_interval = + AOMMIN(gf_length_mult[cpi->sf.rt_sf.gf_length_lvl] * (100 / divisor), + MAX_GF_INTERVAL_RT); + else + p_rc->baseline_gf_interval = FIXED_GF_INTERVAL_RT; + if (rc->avg_frame_low_motion && rc->avg_frame_low_motion < 40) + p_rc->baseline_gf_interval = 16; +} + +static void set_baseline_gf_interval(AV1_COMP *cpi, FRAME_TYPE frame_type) { + RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + GF_GROUP *const gf_group = &cpi->ppi->gf_group; + + set_golden_update(cpi); + + if (p_rc->baseline_gf_interval > rc->frames_to_key && + cpi->oxcf.kf_cfg.auto_key) + p_rc->baseline_gf_interval = rc->frames_to_key; + p_rc->gfu_boost = DEFAULT_GF_BOOST_RT; + p_rc->constrained_gf_group = + (p_rc->baseline_gf_interval >= rc->frames_to_key && + cpi->oxcf.kf_cfg.auto_key) + ? 1 + : 0; + rc->frames_till_gf_update_due = p_rc->baseline_gf_interval; + cpi->gf_frame_index = 0; + // SVC does not use GF as periodic boost. + // TODO(marpan): Find better way to disable this for SVC. 
+  if (cpi->ppi->use_svc) {
+    SVC *const svc = &cpi->svc;
+    p_rc->baseline_gf_interval = MAX_STATIC_GF_GROUP_LENGTH - 1;
+    p_rc->gfu_boost = 1;
+    p_rc->constrained_gf_group = 0;
+    rc->frames_till_gf_update_due = p_rc->baseline_gf_interval;
+    for (int layer = 0;
+         layer < svc->number_spatial_layers * svc->number_temporal_layers;
+         ++layer) {
+      LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+      lc->p_rc.baseline_gf_interval = p_rc->baseline_gf_interval;
+      lc->p_rc.gfu_boost = p_rc->gfu_boost;
+      lc->p_rc.constrained_gf_group = p_rc->constrained_gf_group;
+      lc->rc.frames_till_gf_update_due = rc->frames_till_gf_update_due;
+      lc->group_index = 0;
+    }
+  }
+  gf_group->size = p_rc->baseline_gf_interval;
+  gf_group->update_type[0] = (frame_type == KEY_FRAME) ? KF_UPDATE : GF_UPDATE;
+  gf_group->refbuf_state[cpi->gf_frame_index] =
+      (frame_type == KEY_FRAME) ? REFBUF_RESET : REFBUF_UPDATE;
+}
+
+void av1_adjust_gf_refresh_qp_one_pass_rt(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+  const int resize_pending = is_frame_resize_pending(cpi);
+  if (!resize_pending && !rc->high_source_sad) {
+    // Check if we should disable GF refresh (if period is up),
+    // or force a GF refresh update (if we are at least halfway through
+    // period) based on QP. Look into adding info on segment deltaq.
+    PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc;
+    const int avg_qp = p_rc->avg_frame_qindex[INTER_FRAME];
+    const int allow_gf_update =
+        rc->frames_till_gf_update_due <= (p_rc->baseline_gf_interval - 10);
+    int gf_update_changed = 0;
+    int thresh = 87;
+    if ((cm->current_frame.frame_number - cpi->rc.frame_num_last_gf_refresh) <
+            FIXED_GF_INTERVAL_RT &&
+        rc->frames_till_gf_update_due == 1 &&
+        cm->quant_params.base_qindex > avg_qp) {
+      // Disable GF refresh since QP is above the running average QP.
+      rtc_ref->refresh[rtc_ref->gld_idx_1layer] = 0;
+      gf_update_changed = 1;
+      cpi->refresh_frame.golden_frame = 0;
+    } else if (allow_gf_update &&
+               ((cm->quant_params.base_qindex < thresh * avg_qp / 100) ||
+                (rc->avg_frame_low_motion && rc->avg_frame_low_motion < 20))) {
+      // Force refresh since QP is well below average QP or this is a high
+      // motion frame.
+      rtc_ref->refresh[rtc_ref->gld_idx_1layer] = 1;
+      gf_update_changed = 1;
+      cpi->refresh_frame.golden_frame = 1;
+    }
+    if (gf_update_changed) {
+      set_baseline_gf_interval(cpi, INTER_FRAME);
+      int refresh_mask = 0;
+      for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) {
+        int ref_frame_map_idx = rtc_ref->ref_idx[i];
+        refresh_mask |= rtc_ref->refresh[ref_frame_map_idx]
+                        << ref_frame_map_idx;
+      }
+      cm->current_frame.refresh_frame_flags = refresh_mask;
+    }
+  }
+}
+
+/*!\brief Setup the reference prediction structure for 1 pass real-time
+ *
+ * Set the reference prediction structure for 1 layer.
+ * Current structure is to use 3 references (LAST, GOLDEN, ALTREF),
+ * where ALT_REF is always behind the current frame by lag_alt frames, and
+ * GOLDEN is either updated on LAST with period baseline_gf_interval (fixed
+ * slot) or always behind the current frame by lag_gld (gld_fixed_slot = 0,
+ * lag_gld <= 7).
+ *
+ * \ingroup rate_control
+ * \param[in]       cpi          Top level encoder structure
+ * \param[in]       gf_update    Flag to indicate if GF is updated
+ *
+ * \remark Nothing is returned. Instead the settings for the prediction
+ * structure are set in \c cpi->ext_flags; and the buffer slot index
+ * (for each of 7 references) and refresh flags (for each of the 8 slots)
+ * are set in \c cpi->svc.ref_idx[] and \c cpi->svc.refresh[].
+ */
+void av1_set_rtc_reference_structure_one_layer(AV1_COMP *cpi, int gf_update) {
+  AV1_COMMON *const cm = &cpi->common;
+  ExternalFlags *const ext_flags = &cpi->ext_flags;
+  RATE_CONTROL *const rc = &cpi->rc;
+  ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags =
+      &ext_flags->refresh_frame;
+  RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+  unsigned int frame_number = (cpi->oxcf.rc_cfg.drop_frames_water_mark)
+                                  ? rc->frame_number_encoded
+                                  : cm->current_frame.frame_number;
+  unsigned int lag_alt = 4;
+  int last_idx = 0;
+  int last_idx_refresh = 0;
+  int gld_idx = 0;
+  int alt_ref_idx = 0;
+  int last2_idx = 0;
+  ext_refresh_frame_flags->update_pending = 1;
+  ext_flags->ref_frame_flags = 0;
+  ext_refresh_frame_flags->last_frame = 1;
+  ext_refresh_frame_flags->golden_frame = 0;
+  ext_refresh_frame_flags->alt_ref_frame = 0;
+  // Decide altref lag adaptively for rt
+  if (cpi->sf.rt_sf.sad_based_adp_altref_lag) {
+    lag_alt = 6;
+    const uint64_t th_frame_sad[4][3] = {
+      { 18000, 18000, 18000 },  // HDRES CPU 9
+      { 25000, 25000, 25000 },  // MIDRES CPU 9
+      { 40000, 30000, 20000 },  // HDRES CPU10
+      { 30000, 25000, 20000 }   // MIDRES CPU 10
+    };
+    int th_idx = cpi->sf.rt_sf.sad_based_adp_altref_lag - 1;
+    assert(th_idx < 4);
+    if (rc->avg_source_sad > th_frame_sad[th_idx][0])
+      lag_alt = 3;
+    else if (rc->avg_source_sad > th_frame_sad[th_idx][1])
+      lag_alt = 4;
+    else if (rc->avg_source_sad > th_frame_sad[th_idx][2])
+      lag_alt = 5;
+  }
+  // This defines the reference structure for 1 layer (non-svc) RTC encoding.
+  // To avoid the internal/default reference structure for non-realtime
+  // overwriting this behavior, we use the "svc" ref parameters from the
+  // external control SET_SVC_REF_FRAME_CONFIG.
+  // TODO(marpan): rename that control and the related internal parameters
+  // to rtc_ref.
+  for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) rtc_ref->ref_idx[i] = 7;
+  for (int i = 0; i < REF_FRAMES; ++i) rtc_ref->refresh[i] = 0;
+  // Set the reference frame flags.
+  ext_flags->ref_frame_flags ^= AOM_LAST_FLAG;
+  if (!cpi->sf.rt_sf.force_only_last_ref) {
+    ext_flags->ref_frame_flags ^= AOM_ALT_FLAG;
+    ext_flags->ref_frame_flags ^= AOM_GOLD_FLAG;
+    if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1])
+      ext_flags->ref_frame_flags ^= AOM_LAST2_FLAG;
+  }
+  const int sh = 6;
+  // Moving index slot for last: 0 - (sh - 1).
+  if (frame_number > 1) last_idx = ((frame_number - 1) % sh);
+  // Moving index for refresh of last: one ahead for next frame.
+  last_idx_refresh = (frame_number % sh);
+  gld_idx = 6;
+
+  // Moving index for alt_ref, lag behind LAST by lag_alt frames.
+  if (frame_number > lag_alt) alt_ref_idx = ((frame_number - lag_alt) % sh);
+  if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1]) {
+    // Moving index for LAST2, lag behind LAST by 2 frames.
+    if (frame_number > 2) last2_idx = ((frame_number - 2) % sh);
+  }
+  rtc_ref->ref_idx[0] = last_idx;          // LAST
+  rtc_ref->ref_idx[1] = last_idx_refresh;  // LAST2 (for refresh of last).
+  if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1]) {
+    rtc_ref->ref_idx[1] = last2_idx;         // LAST2
+    rtc_ref->ref_idx[2] = last_idx_refresh;  // LAST3 (for refresh of last).
+  }
+  rtc_ref->ref_idx[3] = gld_idx;      // GOLDEN
+  rtc_ref->ref_idx[6] = alt_ref_idx;  // ALT_REF
+  // Refresh this slot, which will become LAST on next frame.
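+  // Worked slot mapping under the scheme above (illustrative, assuming
+  // frame_number = 10 and lag_alt = 4, with sh = 6): last_idx = (10 - 1) % 6
+  // = 3, last_idx_refresh = 10 % 6 = 4, alt_ref_idx = (10 - 4) % 6 = 0, and
+  // GOLDEN stays in the fixed slot gld_idx = 6.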
+  rtc_ref->refresh[last_idx_refresh] = 1;
+  // Update GOLDEN on period for fixed slot case.
+  if (gf_update && cm->current_frame.frame_type != KEY_FRAME) {
+    ext_refresh_frame_flags->golden_frame = 1;
+    rtc_ref->refresh[gld_idx] = 1;
+  }
+  rtc_ref->gld_idx_1layer = gld_idx;
+  // Set the flag to reduce the number of reference frame buffers used.
+  // This assumes that slot 7 is never used.
+  cpi->rt_reduce_num_ref_buffers = 1;
+  cpi->rt_reduce_num_ref_buffers &= (rtc_ref->ref_idx[0] < 7);
+  cpi->rt_reduce_num_ref_buffers &= (rtc_ref->ref_idx[1] < 7);
+  cpi->rt_reduce_num_ref_buffers &= (rtc_ref->ref_idx[3] < 7);
+  cpi->rt_reduce_num_ref_buffers &= (rtc_ref->ref_idx[6] < 7);
+  if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1])
+    cpi->rt_reduce_num_ref_buffers &= (rtc_ref->ref_idx[2] < 7);
+}
+
+/*!\brief Check for scene detection, for 1 pass real-time mode.
+ *
+ * Compute average source sad (temporal sad: between current source and
+ * previous source) over a subset of superblocks. Use this to detect big
+ * changes in content and set the \c cpi->rc.high_source_sad flag.
+ *
+ * \ingroup rate_control
+ * \param[in]       cpi          Top level encoder structure
+ * \param[in]       frame_input  Current and last input source frames
+ *
+ * \remark Nothing is returned. Instead the flag \c cpi->rc.high_source_sad
+ * is set if scene change is detected, and \c cpi->rc.avg_source_sad is updated.
+ */
+static void rc_scene_detection_onepass_rt(AV1_COMP *cpi,
+                                          const EncodeFrameInput *frame_input) {
+  AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  YV12_BUFFER_CONFIG const *const unscaled_src = frame_input->source;
+  YV12_BUFFER_CONFIG const *const unscaled_last_src = frame_input->last_source;
+  uint8_t *src_y;
+  int src_ystride;
+  int src_width;
+  int src_height;
+  uint8_t *last_src_y;
+  int last_src_ystride;
+  int last_src_width;
+  int last_src_height;
+  int width = cm->width;
+  int height = cm->height;
+  if (cpi->svc.number_spatial_layers > 1) {
+    width = cpi->oxcf.frm_dim_cfg.width;
+    height = cpi->oxcf.frm_dim_cfg.height;
+  }
+  if (width != cm->render_width || height != cm->render_height ||
+      unscaled_src == NULL || unscaled_last_src == NULL) {
+    aom_free(cpi->src_sad_blk_64x64);
+    cpi->src_sad_blk_64x64 = NULL;
+  }
+  if (unscaled_src == NULL || unscaled_last_src == NULL) return;
+  src_y = unscaled_src->y_buffer;
+  src_ystride = unscaled_src->y_stride;
+  src_width = unscaled_src->y_width;
+  src_height = unscaled_src->y_height;
+  last_src_y = unscaled_last_src->y_buffer;
+  last_src_ystride = unscaled_last_src->y_stride;
+  last_src_width = unscaled_last_src->y_width;
+  last_src_height = unscaled_last_src->y_height;
+  if (src_width != last_src_width || src_height != last_src_height) {
+    aom_free(cpi->src_sad_blk_64x64);
+    cpi->src_sad_blk_64x64 = NULL;
+    return;
+  }
+  rc->high_source_sad = 0;
+  rc->percent_blocks_with_motion = 0;
+  rc->max_block_source_sad = 0;
+  rc->prev_avg_source_sad = rc->avg_source_sad;
+  int num_mi_cols = cm->mi_params.mi_cols;
+  int num_mi_rows = cm->mi_params.mi_rows;
+  if (cpi->svc.number_spatial_layers > 1) {
+    num_mi_cols = cpi->svc.mi_cols_full_resoln;
+    num_mi_rows = cpi->svc.mi_rows_full_resoln;
+  }
+  int num_zero_temp_sad = 0;
+  uint32_t min_thresh = 10000;
+  if (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN) {
+    min_thresh = cm->width * cm->height <= 320 * 240 && cpi->framerate < 10.0
+                     ? 50000
+                     : 100000;
+  }
+  const BLOCK_SIZE bsize = BLOCK_64X64;
+  // Loop over sub-sample of frame, compute average sad over 64x64 blocks.
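+  // Sketch of the decision applied after this loop (see below): the frame is
+  // flagged as a scene change when avg_sad exceeds
+  // max(min_thresh, thresh * avg_source_sad) and fewer than ~3/4 of the
+  // sampled blocks have zero temporal sad; avg_source_sad is then updated as
+  // a 3:1 running average.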
+  uint64_t avg_sad = 0;
+  uint64_t tmp_sad = 0;
+  int num_samples = 0;
+  const int thresh =
+      cm->width * cm->height <= 320 * 240 && cpi->framerate < 10.0 ? 5 : 6;
+  // SAD is computed on 64x64 blocks
+  const int sb_size_by_mb = (cm->seq_params->sb_size == BLOCK_128X128)
+                                ? (cm->seq_params->mib_size >> 1)
+                                : cm->seq_params->mib_size;
+  const int sb_cols = (num_mi_cols + sb_size_by_mb - 1) / sb_size_by_mb;
+  const int sb_rows = (num_mi_rows + sb_size_by_mb - 1) / sb_size_by_mb;
+  uint64_t sum_sq_thresh = 10000;  // sum = sqrt(sum_sq_thresh / (64 * 64)) ~1.5
+  int num_low_var_high_sumdiff = 0;
+  int light_change = 0;
+  // Flag to check light change or not.
+  const int check_light_change = 0;
+  // TODO(marpan): There seems to be some difference along the bottom border
+  // when using the source_last_tl0 for last_source (used for temporal layers
+  // or when previous frame is dropped).
+  // Remove this border parameter when the issue is resolved: the difference
+  // is that non-zero sad exists along the bottom border even though the
+  // source is static.
+  const int border =
+      rc->prev_frame_is_dropped || cpi->svc.number_temporal_layers > 1;
+  // Store blockwise SAD for later use.
+  if (width == cm->render_width && height == cm->render_height) {
+    if (cpi->src_sad_blk_64x64 == NULL) {
+      CHECK_MEM_ERROR(cm, cpi->src_sad_blk_64x64,
+                      (uint64_t *)aom_calloc(sb_cols * sb_rows,
+                                             sizeof(*cpi->src_sad_blk_64x64)));
+    }
+  }
+  // Avoid bottom and right border.
+  for (int sbi_row = 0; sbi_row < sb_rows - border; ++sbi_row) {
+    for (int sbi_col = 0; sbi_col < sb_cols; ++sbi_col) {
+      tmp_sad = cpi->ppi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y,
+                                            last_src_ystride);
+      if (cpi->src_sad_blk_64x64 != NULL)
+        cpi->src_sad_blk_64x64[sbi_col + sbi_row * sb_cols] = tmp_sad;
+      if (check_light_change) {
+        unsigned int sse, variance;
+        variance = cpi->ppi->fn_ptr[bsize].vf(src_y, src_ystride, last_src_y,
+                                              last_src_ystride, &sse);
+        // Note: sse - variance = ((sum * sum) >> 12)
+        // Detect large lighting change.
+        if (variance < (sse >> 1) && (sse - variance) > sum_sq_thresh) {
+          num_low_var_high_sumdiff++;
+        }
+      }
+      avg_sad += tmp_sad;
+      num_samples++;
+      if (tmp_sad == 0) num_zero_temp_sad++;
+      if (tmp_sad > rc->max_block_source_sad)
+        rc->max_block_source_sad = tmp_sad;
+
+      src_y += 64;
+      last_src_y += 64;
+    }
+    src_y += (src_ystride << 6) - (sb_cols << 6);
+    last_src_y += (last_src_ystride << 6) - (sb_cols << 6);
+  }
+  if (check_light_change && num_samples > 0 &&
+      num_low_var_high_sumdiff > (num_samples >> 1))
+    light_change = 1;
+  if (num_samples > 0) avg_sad = avg_sad / num_samples;
+  // Set high_source_sad flag if we detect very high increase in avg_sad
+  // between current and previous frame value(s). Use minimum threshold
+  // for cases where there is small change from content that is completely
+  // static.
+  if (!light_change &&
+      avg_sad >
+          AOMMAX(min_thresh, (unsigned int)(rc->avg_source_sad * thresh)) &&
+      rc->frames_since_key > 1 + cpi->svc.number_spatial_layers &&
+      num_zero_temp_sad < 3 * (num_samples >> 2))
+    rc->high_source_sad = 1;
+  else
+    rc->high_source_sad = 0;
+  rc->avg_source_sad = (3 * rc->avg_source_sad + avg_sad) >> 2;
+  rc->frame_source_sad = avg_sad;
+  if (num_samples > 0)
+    rc->percent_blocks_with_motion =
+        ((num_samples - num_zero_temp_sad) * 100) / num_samples;
+  // Scene detection is only done on the base spatial layer (SL0), using the
+  // full/original resolution.
+  // Pass the state to the upper spatial layers.
+  if (cpi->svc.number_spatial_layers > 1) {
+    SVC *svc = &cpi->svc;
+    for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
+      int tl = svc->temporal_layer_id;
+      const int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+      LAYER_CONTEXT *lc = &svc->layer_context[layer];
+      RATE_CONTROL *lrc = &lc->rc;
+      lrc->high_source_sad = rc->high_source_sad;
+      lrc->frame_source_sad = rc->frame_source_sad;
+      lrc->avg_source_sad = rc->avg_source_sad;
+      lrc->percent_blocks_with_motion = rc->percent_blocks_with_motion;
+      lrc->max_block_source_sad = rc->max_block_source_sad;
+    }
+  }
+}
+
+/*!\brief Set the GF baseline interval for 1 pass real-time mode.
+ *
+ *
+ * \ingroup rate_control
+ * \param[in]       cpi          Top level encoder structure
+ * \param[in]       frame_type   frame type
+ *
+ * \return Return GF update flag, and update the \c cpi->rc with
+ * the next GF interval settings.
+ */
+static int set_gf_interval_update_onepass_rt(AV1_COMP *cpi,
+                                             FRAME_TYPE frame_type) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  int gf_update = 0;
+  const int resize_pending = is_frame_resize_pending(cpi);
+  // GF update based on frames_till_gf_update_due; also force an update on a
+  // resize-pending frame or for a scene change.
+  if ((resize_pending || rc->high_source_sad ||
+       rc->frames_till_gf_update_due == 0) &&
+      cpi->svc.temporal_layer_id == 0 && cpi->svc.spatial_layer_id == 0) {
+    set_baseline_gf_interval(cpi, frame_type);
+    gf_update = 1;
+  }
+  return gf_update;
+}
+
+static void resize_reset_rc(AV1_COMP *cpi, int resize_width, int resize_height,
+                            int prev_width, int prev_height) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  SVC *const svc = &cpi->svc;
+  int target_bits_per_frame;
+  int active_worst_quality;
+  int qindex;
+  double tot_scale_change = (double)(resize_width * resize_height) /
+                            (double)(prev_width * prev_height);
+  // Disable the skip mv search for svc on resize frame.
+  svc->skip_mvsearch_last = 0;
+  svc->skip_mvsearch_gf = 0;
+  svc->skip_mvsearch_altref = 0;
+  // Reset buffer level to optimal, update target size.
+  p_rc->buffer_level = p_rc->optimal_buffer_level;
+  p_rc->bits_off_target = p_rc->optimal_buffer_level;
+  rc->this_frame_target =
+      av1_calc_pframe_target_size_one_pass_cbr(cpi, INTER_FRAME);
+  target_bits_per_frame = rc->this_frame_target;
+  if (tot_scale_change > 4.0)
+    p_rc->avg_frame_qindex[INTER_FRAME] = rc->worst_quality;
+  else if (tot_scale_change > 1.0)
+    p_rc->avg_frame_qindex[INTER_FRAME] =
+        (p_rc->avg_frame_qindex[INTER_FRAME] + rc->worst_quality) >> 1;
+  active_worst_quality = calc_active_worst_quality_no_stats_cbr(cpi);
+  qindex = av1_rc_regulate_q(cpi, target_bits_per_frame, rc->best_quality,
+                             active_worst_quality, resize_width, resize_height);
+  // If resize is down, check if projected q index is close to worst_quality,
+  // and if so, reduce the rate correction factor (since likely can afford
+  // lower q for resized frame).
+  if (tot_scale_change < 1.0 && qindex > 90 * rc->worst_quality / 100)
+    p_rc->rate_correction_factors[INTER_NORMAL] *= 0.85;
+  // If resize is back up: check if projected q index is too much above the
+  // previous index, and if so, reduce the rate correction factor (since we
+  // prefer to keep the q for the resized frame at least close to the previous
+  // q). Also check if the projected qindex is close to the previous qindex;
+  // if so, increase the correction factor (to push qindex higher and avoid
+  // overshoot).
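+  // Illustrative values of the area ratio above: resizing back up from 1/2
+  // to the original size gives tot_scale_change = 4.0, from 3/4 to the
+  // original ~1.78, and resizing down from the original to 3/4 gives ~0.56.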
+  if (tot_scale_change >= 1.0) {
+    if (tot_scale_change < 4.0 &&
+        qindex > 130 * p_rc->last_q[INTER_FRAME] / 100)
+      p_rc->rate_correction_factors[INTER_NORMAL] *= 0.8;
+    if (qindex <= 120 * p_rc->last_q[INTER_FRAME] / 100)
+      p_rc->rate_correction_factors[INTER_NORMAL] *= 1.5;
+  }
+  if (svc->number_temporal_layers > 1) {
+    // Apply the same rate control reset to all temporal layers.
+    for (int tl = 0; tl < svc->number_temporal_layers; tl++) {
+      LAYER_CONTEXT *lc = NULL;
+      lc = &svc->layer_context[svc->spatial_layer_id *
+                                   svc->number_temporal_layers +
+                               tl];
+      lc->rc.resize_state = rc->resize_state;
+      lc->p_rc.buffer_level = lc->p_rc.optimal_buffer_level;
+      lc->p_rc.bits_off_target = lc->p_rc.optimal_buffer_level;
+      lc->p_rc.rate_correction_factors[INTER_NORMAL] =
+          p_rc->rate_correction_factors[INTER_NORMAL];
+      lc->p_rc.avg_frame_qindex[INTER_FRAME] =
+          p_rc->avg_frame_qindex[INTER_FRAME];
+    }
+  }
+}
+
+/*!\brief Check for resize based on Q, for 1 pass real-time mode.
+ *
+ * Check if we should resize, based on the average QP from the past x frames.
+ * Only allow resizing down to at most 1/2 scale for now. The scaling factor
+ * for each step may be 3/4 or 1/2.
+ *
+ * \ingroup rate_control
+ * \param[in]       cpi          Top level encoder structure
+ *
+ * \remark Return resized width/height in \c cpi->resize_pending_params,
+ * and update some resize counters in \c rc.
+ */
+static void dynamic_resize_one_pass_cbr(AV1_COMP *cpi) {
+  const AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  RESIZE_ACTION resize_action = NO_RESIZE;
+  const int avg_qp_thr1 = 70;
+  const int avg_qp_thr2 = 50;
+  // Don't allow for resized frame to go below 160x90, resize in steps of 3/4.
+  const int min_width = (160 * 4) / 3;
+  const int min_height = (90 * 4) / 3;
+  int down_size_on = 1;
+  // Don't resize on key frame; reset the counters on key frame.
+  if (cm->current_frame.frame_type == KEY_FRAME) {
+    rc->resize_avg_qp = 0;
+    rc->resize_count = 0;
+    rc->resize_buffer_underflow = 0;
+    return;
+  }
+  // No resizing down if frame size is below some limit.
+  if ((cm->width * cm->height) < min_width * min_height) down_size_on = 0;
+
+  // Resize based on average buffer underflow and QP over some window.
+  // Ignore samples close to key frame, since QP is usually high after key.
+  if (cpi->rc.frames_since_key > cpi->framerate) {
+    const int window = AOMMIN(30, (int)(2 * cpi->framerate));
+    rc->resize_avg_qp += p_rc->last_q[INTER_FRAME];
+    if (cpi->ppi->p_rc.buffer_level <
+        (int)(30 * p_rc->optimal_buffer_level / 100))
+      ++rc->resize_buffer_underflow;
+    ++rc->resize_count;
+    // Check for resize action every "window" frames.
+    if (rc->resize_count >= window) {
+      int avg_qp = rc->resize_avg_qp / rc->resize_count;
+      // Resize down if the buffer level has underflowed a sufficient amount
+      // in the past window, and we are at the original or 3/4 of the original
+      // resolution. Resize back up if the average QP is low, and we are
+      // currently in a resized-down state, i.e. 1/2 or 3/4 of the original
+      // resolution. Currently, a flag is used to turn the 3/4 resizing
+      // feature on/off.
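+      // Resulting state machine (as coded below): ORIG -> THREE_QUARTER ->
+      // ONE_HALF on sustained underflow; ONE_HALF -> THREE_QUARTER -> ORIG
+      // as the average QP drops (with a direct jump to ORIG when avg_qp
+      // falls below the avg_qp_thr2 threshold).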
+      if (rc->resize_buffer_underflow > (rc->resize_count >> 2) &&
+          down_size_on) {
+        if (rc->resize_state == THREE_QUARTER) {
+          resize_action = DOWN_ONEHALF;
+          rc->resize_state = ONE_HALF;
+        } else if (rc->resize_state == ORIG) {
+          resize_action = DOWN_THREEFOUR;
+          rc->resize_state = THREE_QUARTER;
+        }
+      } else if (rc->resize_state != ORIG &&
+                 avg_qp < avg_qp_thr1 * cpi->rc.worst_quality / 100) {
+        if (rc->resize_state == THREE_QUARTER ||
+            avg_qp < avg_qp_thr2 * cpi->rc.worst_quality / 100) {
+          resize_action = UP_ORIG;
+          rc->resize_state = ORIG;
+        } else if (rc->resize_state == ONE_HALF) {
+          resize_action = UP_THREEFOUR;
+          rc->resize_state = THREE_QUARTER;
+        }
+      }
+      // Reset for next window measurement.
+      rc->resize_avg_qp = 0;
+      rc->resize_count = 0;
+      rc->resize_buffer_underflow = 0;
+    }
+  }
+  // If the decision is to resize, reset some quantities, and check if we
+  // should reduce the rate correction factor.
+  if (resize_action != NO_RESIZE) {
+    int resize_width = cpi->oxcf.frm_dim_cfg.width;
+    int resize_height = cpi->oxcf.frm_dim_cfg.height;
+    int resize_scale_num = 1;
+    int resize_scale_den = 1;
+    if (resize_action == DOWN_THREEFOUR || resize_action == UP_THREEFOUR) {
+      resize_scale_num = 3;
+      resize_scale_den = 4;
+    } else if (resize_action == DOWN_ONEHALF) {
+      resize_scale_num = 1;
+      resize_scale_den = 2;
+    }
+    resize_width = resize_width * resize_scale_num / resize_scale_den;
+    resize_height = resize_height * resize_scale_num / resize_scale_den;
+    resize_reset_rc(cpi, resize_width, resize_height, cm->width, cm->height);
+  }
+  return;
+}
+
+static INLINE int set_key_frame(AV1_COMP *cpi, unsigned int frame_flags) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  AV1_COMMON *const cm = &cpi->common;
+  SVC *const svc = &cpi->svc;
+
+  // Very first frame has to be key frame.
+  if (cm->current_frame.frame_number == 0) return 1;
+  // Set key frame if forced by frame flags.
+  if (frame_flags & FRAMEFLAGS_KEY) return 1;
+  if (!cpi->ppi->use_svc) {
+    // Non-SVC
+    if (cpi->oxcf.kf_cfg.auto_key && rc->frames_to_key == 0) return 1;
+  } else {
+    // SVC
+    if (svc->spatial_layer_id == 0 &&
+        (cpi->oxcf.kf_cfg.auto_key &&
+         (cpi->oxcf.kf_cfg.key_freq_max == 0 ||
+          svc->current_superframe % cpi->oxcf.kf_cfg.key_freq_max == 0)))
+      return 1;
+  }
+
+  return 0;
+}
+
+// Returns true if this frame is a recovery frame, for 1 layer RPS, in which
+// case some boost is applied (QP, adjusted speed features, etc.).
+// Recovery frame here means a frame whose closest reference suddenly
+// switched from the previous frame to one much further away.
+// TODO(marpan): Consider adding on/off flag to SVC_REF_FRAME_CONFIG to
+// allow more control for applications.
+static bool set_flag_rps_bias_recovery_frame(const AV1_COMP *const cpi) {
+  if (cpi->ppi->rtc_ref.set_ref_frame_config &&
+      cpi->svc.number_temporal_layers == 1 &&
+      cpi->svc.number_spatial_layers == 1 &&
+      cpi->ppi->rtc_ref.reference_was_previous_frame) {
+    int min_dist = av1_svc_get_min_ref_dist(cpi);
+    // Only consider boost for this frame if its closest reference is further
+    // than x frames away, using x = 4 for now.
+ if (min_dist != INT_MAX && min_dist > 4) return true; + } + return false; +} + +void av1_get_one_pass_rt_params(AV1_COMP *cpi, FRAME_TYPE *const frame_type, + const EncodeFrameInput *frame_input, + unsigned int frame_flags) { + RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + AV1_COMMON *const cm = &cpi->common; + GF_GROUP *const gf_group = &cpi->ppi->gf_group; + SVC *const svc = &cpi->svc; + ResizePendingParams *const resize_pending_params = + &cpi->resize_pending_params; + int target; + const int layer = + LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id, + svc->number_temporal_layers); + if (cpi->ppi->use_svc) { + av1_update_temporal_layer_framerate(cpi); + av1_restore_layer_context(cpi); + } + cpi->ppi->rtc_ref.bias_recovery_frame = set_flag_rps_bias_recovery_frame(cpi); + // Set frame type. + if (set_key_frame(cpi, frame_flags)) { + *frame_type = KEY_FRAME; + p_rc->this_key_frame_forced = + cm->current_frame.frame_number != 0 && rc->frames_to_key == 0; + rc->frames_to_key = cpi->oxcf.kf_cfg.key_freq_max; + p_rc->kf_boost = DEFAULT_KF_BOOST_RT; + gf_group->update_type[cpi->gf_frame_index] = KF_UPDATE; + gf_group->frame_type[cpi->gf_frame_index] = KEY_FRAME; + gf_group->refbuf_state[cpi->gf_frame_index] = REFBUF_RESET; + if (cpi->ppi->use_svc) { + if (cm->current_frame.frame_number > 0) + av1_svc_reset_temporal_layers(cpi, 1); + svc->layer_context[layer].is_key_frame = 1; + } + rc->frame_number_encoded = 0; + cpi->ppi->rtc_ref.non_reference_frame = 0; + } else { + *frame_type = INTER_FRAME; + gf_group->update_type[cpi->gf_frame_index] = LF_UPDATE; + gf_group->frame_type[cpi->gf_frame_index] = INTER_FRAME; + gf_group->refbuf_state[cpi->gf_frame_index] = REFBUF_UPDATE; + if (cpi->ppi->use_svc) { + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + lc->is_key_frame = + svc->spatial_layer_id == 0 + ? 0 + : svc->layer_context[svc->temporal_layer_id].is_key_frame; + // If the user is setting the reference structure with + // set_ref_frame_config and did not set any references, set the + // frame type to Intra-only. + if (cpi->ppi->rtc_ref.set_ref_frame_config) { + int no_references_set = 1; + for (int i = 0; i < INTER_REFS_PER_FRAME; i++) { + if (cpi->ppi->rtc_ref.reference[i]) { + no_references_set = 0; + break; + } + } + // Set to intra_only_frame if no references are set. + // The stream can start decoding on INTRA_ONLY_FRAME so long as the + // layer with the intra_only_frame doesn't signal a reference to a slot + // that hasn't been set yet. + if (no_references_set) *frame_type = INTRA_ONLY_FRAME; + } + } + } + // Check for scene change: for SVC check on base spatial layer only. + if (cpi->sf.rt_sf.check_scene_detection && svc->spatial_layer_id == 0) { + if (rc->prev_coded_width == cm->width && + rc->prev_coded_height == cm->height) { + rc_scene_detection_onepass_rt(cpi, frame_input); + } else { + aom_free(cpi->src_sad_blk_64x64); + cpi->src_sad_blk_64x64 = NULL; + } + } + // Check for dynamic resize, for single spatial layer for now. + // For temporal layers only check on base temporal layer. 
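+  // The pending 3/4 and 1/2 sizes computed below use rounded integer
+  // division; e.g. (illustrative) a 1280-wide frame gives
+  // (3 + 1280 * 3) >> 2 = 960 at 3/4 and (1 + 1280) >> 1 = 640 at 1/2.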
+  if (cpi->oxcf.resize_cfg.resize_mode == RESIZE_DYNAMIC) {
+    if (svc->number_spatial_layers == 1 && svc->temporal_layer_id == 0)
+      dynamic_resize_one_pass_cbr(cpi);
+    if (rc->resize_state == THREE_QUARTER) {
+      resize_pending_params->width = (3 + cpi->oxcf.frm_dim_cfg.width * 3) >> 2;
+      resize_pending_params->height =
+          (3 + cpi->oxcf.frm_dim_cfg.height * 3) >> 2;
+    } else if (rc->resize_state == ONE_HALF) {
+      resize_pending_params->width = (1 + cpi->oxcf.frm_dim_cfg.width) >> 1;
+      resize_pending_params->height = (1 + cpi->oxcf.frm_dim_cfg.height) >> 1;
+    } else {
+      resize_pending_params->width = cpi->oxcf.frm_dim_cfg.width;
+      resize_pending_params->height = cpi->oxcf.frm_dim_cfg.height;
+    }
+  } else if (is_frame_resize_pending(cpi)) {
+    resize_reset_rc(cpi, resize_pending_params->width,
+                    resize_pending_params->height, cm->width, cm->height);
+  }
+  // Set the GF interval and update flag.
+  if (!rc->rtc_external_ratectrl)
+    set_gf_interval_update_onepass_rt(cpi, *frame_type);
+  // Set target size.
+  if (cpi->oxcf.rc_cfg.mode == AOM_CBR) {
+    if (*frame_type == KEY_FRAME || *frame_type == INTRA_ONLY_FRAME) {
+      target = av1_calc_iframe_target_size_one_pass_cbr(cpi);
+    } else {
+      target = av1_calc_pframe_target_size_one_pass_cbr(
+          cpi, gf_group->update_type[cpi->gf_frame_index]);
+    }
+  } else {
+    if (*frame_type == KEY_FRAME || *frame_type == INTRA_ONLY_FRAME) {
+      target = av1_calc_iframe_target_size_one_pass_vbr(cpi);
+    } else {
+      target = av1_calc_pframe_target_size_one_pass_vbr(
+          cpi, gf_group->update_type[cpi->gf_frame_index]);
+    }
+  }
+  if (cpi->oxcf.rc_cfg.mode == AOM_Q)
+    rc->active_worst_quality = cpi->oxcf.rc_cfg.cq_level;
+
+  av1_rc_set_frame_target(cpi, target, cm->width, cm->height);
+  rc->base_frame_target = target;
+  cm->current_frame.frame_type = *frame_type;
+  // For fixed mode SVC: if KSVC is enabled, remove inter-layer prediction on
+  // spatial enhancement layer frames for frames whose base is not a KEY frame.
+  if (cpi->ppi->use_svc && !svc->use_flexible_mode && svc->ksvc_fixed_mode &&
+      svc->number_spatial_layers > 1 &&
+      !svc->layer_context[layer].is_key_frame) {
+    ExternalFlags *const ext_flags = &cpi->ext_flags;
+    ext_flags->ref_frame_flags ^= AOM_GOLD_FLAG;
+  }
+}
+
+int av1_encodedframe_overshoot_cbr(AV1_COMP *cpi, int *q) {
+  AV1_COMMON *const cm = &cpi->common;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  double rate_correction_factor =
+      cpi->ppi->p_rc.rate_correction_factors[INTER_NORMAL];
+  const int target_size = cpi->rc.avg_frame_bandwidth;
+  double new_correction_factor;
+  int target_bits_per_mb;
+  double q2;
+  int enumerator;
+  int is_screen_content = (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN);
+  *q = (3 * cpi->rc.worst_quality + *q) >> 2;
+  // For screen content use the max-q set by the user to allow for less
+  // overshoot on slide changes.
+  if (is_screen_content) *q = cpi->rc.worst_quality;
+  cpi->cyclic_refresh->counter_encode_maxq_scene_change = 0;
+  // Adjust avg_frame_qindex, buffer_level, and rate correction factors, as
+  // these parameters will affect QP selection for subsequent frames. If they
+  // have settled down to a very different (low QP) state, then not adjusting
+  // them may cause next frame to select low QP and overshoot again.
+  p_rc->avg_frame_qindex[INTER_FRAME] = *q;
+  p_rc->buffer_level = p_rc->optimal_buffer_level;
+  p_rc->bits_off_target = p_rc->optimal_buffer_level;
+  // Reset rate under/over-shoot flags.
+  cpi->rc.rc_1_frame = 0;
+  cpi->rc.rc_2_frame = 0;
+  // Adjust rate correction factor.
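+  // The reset below inverts the bits-per-mb model: av1_rc_bits_per_mb() is
+  // roughly enumerator * correction_factor / q, so the factor consistent
+  // with hitting target_bits_per_mb at the raised q is
+  // target_bits_per_mb * q / enumerator (new_correction_factor below).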
+ target_bits_per_mb = + (int)(((uint64_t)target_size << BPER_MB_NORMBITS) / cm->mi_params.MBs); + // Reset rate correction factor: for now base it on target_bits_per_mb + // and qp (==max_QP). This comes from the inverse computation of + // av1_rc_bits_per_mb(). + q2 = av1_convert_qindex_to_q(*q, cm->seq_params->bit_depth); + enumerator = av1_get_bpmb_enumerator(INTER_NORMAL, is_screen_content); + new_correction_factor = (double)target_bits_per_mb * q2 / enumerator; + if (new_correction_factor > rate_correction_factor) { + rate_correction_factor = + (new_correction_factor + rate_correction_factor) / 2.0; + if (rate_correction_factor > MAX_BPB_FACTOR) + rate_correction_factor = MAX_BPB_FACTOR; + cpi->ppi->p_rc.rate_correction_factors[INTER_NORMAL] = + rate_correction_factor; + } + // For temporal layers: reset the rate control parameters across all + // temporal layers. + if (cpi->svc.number_temporal_layers > 1) { + SVC *svc = &cpi->svc; + for (int tl = 0; tl < svc->number_temporal_layers; ++tl) { + int sl = svc->spatial_layer_id; + const int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + PRIMARY_RATE_CONTROL *lp_rc = &lc->p_rc; + lp_rc->avg_frame_qindex[INTER_FRAME] = *q; + lp_rc->buffer_level = lp_rc->optimal_buffer_level; + lp_rc->bits_off_target = lp_rc->optimal_buffer_level; + lrc->rc_1_frame = 0; + lrc->rc_2_frame = 0; + lp_rc->rate_correction_factors[INTER_NORMAL] = rate_correction_factor; + } + } + return 1; +} diff --git a/third_party/aom/av1/encoder/ratectrl.h b/third_party/aom/av1/encoder/ratectrl.h new file mode 100644 index 0000000000..6802ad42d0 --- /dev/null +++ b/third_party/aom/av1/encoder/ratectrl.h @@ -0,0 +1,864 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_RATECTRL_H_ +#define AOM_AV1_ENCODER_RATECTRL_H_ + +#include "aom/aom_codec.h" +#include "aom/aom_integer.h" + +#include "aom_ports/mem.h" + +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*!\cond */ + +// Bits Per MB at different Q (Multiplied by 512) +#define BPER_MB_NORMBITS 9 + +// Use this macro to turn on/off use of alt-refs in one-pass mode. +#define USE_ALTREF_FOR_ONE_PASS 1 + +// Threshold used to define if a KF group is static (e.g. a slide show). +// Essentially, this means that no frame in the group has more than 1% of MBs +// that are not marked as coded with 0,0 motion in the first pass. +#define STATIC_KF_GROUP_THRESH 99 +#define STATIC_KF_GROUP_FLOAT_THRESH 0.99 + +// The maximum duration of a GF group that is static (e.g. a slide show). 
+#define MAX_STATIC_GF_GROUP_LENGTH 250
+
+#define MIN_GF_INTERVAL 4
+#define MAX_GF_INTERVAL 32
+#define FIXED_GF_INTERVAL 16
+#define MAX_GF_LENGTH_LAP 16
+
+#define FIXED_GF_INTERVAL_RT 80
+#define MAX_GF_INTERVAL_RT 160
+
+#define MAX_NUM_GF_INTERVALS 15
+
+#define MAX_ARF_LAYERS 6
+// #define STRICT_RC
+
+#define DEFAULT_KF_BOOST_RT 2300
+#define DEFAULT_GF_BOOST_RT 2000
+
+// A passive rate control strategy for screen content type in real-time mode.
+// When it is turned on, the compression performance is improved by
+// 7.8% (overall_psnr), 5.0% (VMAF) on average. Some clips see gains of over
+// 20% on these metrics.
+// The downside is that it does not guarantee frame size.
+// Since RT mode has a tight restriction on buffer overflow control, we
+// turn it off by default.
+#define RT_PASSIVE_STRATEGY 0
+#define MAX_Q_HISTORY 1000
+
+typedef struct {
+  int resize_width;
+  int resize_height;
+  uint8_t superres_denom;
+} size_params_type;
+
+enum {
+  INTER_NORMAL,
+  GF_ARF_LOW,
+  GF_ARF_STD,
+  KF_STD,
+  RATE_FACTOR_LEVELS
+} UENUM1BYTE(RATE_FACTOR_LEVEL);
+
+enum {
+  KF_UPDATE,
+  LF_UPDATE,
+  GF_UPDATE,
+  ARF_UPDATE,
+  OVERLAY_UPDATE,
+  INTNL_OVERLAY_UPDATE,  // Internal Overlay Frame
+  INTNL_ARF_UPDATE,      // Internal Altref Frame
+  FRAME_UPDATE_TYPES
+} UENUM1BYTE(FRAME_UPDATE_TYPE);
+
+enum {
+  REFBUF_RESET,   // Clear reference frame buffer
+  REFBUF_UPDATE,  // Refresh reference frame buffer
+  REFBUF_STATES
+} UENUM1BYTE(REFBUF_STATE);
+
+typedef enum {
+  NO_RESIZE = 0,
+  DOWN_THREEFOUR = 1,  // From orig to 3/4.
+  DOWN_ONEHALF = 2,    // From orig or 3/4 to 1/2.
+  UP_THREEFOUR = -1,   // From 1/2 to 3/4.
+  UP_ORIG = -2,        // From 1/2 or 3/4 to orig.
+} RESIZE_ACTION;
+
+typedef enum { ORIG = 0, THREE_QUARTER = 1, ONE_HALF = 2 } RESIZE_STATE;
+
+#define MAX_FIRSTPASS_ANALYSIS_FRAMES 150
+typedef enum region_types {
+  STABLE_REGION = 0,
+  HIGH_VAR_REGION = 1,
+  SCENECUT_REGION = 2,
+  BLENDING_REGION = 3,
+} REGION_TYPES;
+
+typedef struct regions {
+  int start;
+  int last;
+  double avg_noise_var;
+  double avg_cor_coeff;
+  double avg_sr_fr_ratio;
+  double avg_intra_err;
+  double avg_coded_err;
+  REGION_TYPES type;
+} REGIONS;
+
+/*!\endcond */
+/*!
+ * \brief Rate Control parameters and status
+ */
+typedef struct {
+  // Rate targeting variables
+
+  /*!
+   * Baseline target rate for frame before adjustment for previous under or
+   * over shoot.
+   */
+  int base_frame_target;
+  /*!
+   * Target rate for frame after adjustment for previous under or over shoot.
+   */
+  int this_frame_target;  // Actual frame target after rc adjustment.
+
+  /*!
+   * Projected size for current frame
+   */
+  int projected_frame_size;
+
+  /*!
+   * Bit size of transform coefficient for current frame.
+   */
+  int coefficient_size;
+
+  /*!
+   * Super block rate target used with some adaptive quantization strategies.
+   */
+  int sb64_target_rate;
+
+  /*!
+   * Number of frames since the last ARF / GF.
+   */
+  int frames_since_golden;
+
+  /*!
+   * Number of frames till the next ARF / GF is due.
+   */
+  int frames_till_gf_update_due;
+
+  /*!
+   * Number of determined gf groups left
+   */
+  int intervals_till_gf_calculate_due;
+
+  /*!\cond */
+  int min_gf_interval;
+  int max_gf_interval;
+  int static_scene_max_gf_interval;
+  /*!\endcond */
+  /*!
+   * Frames before the next key frame
+   */
+  int frames_to_key;
+  /*!\cond */
+  int frames_since_key;
+  int frames_to_fwd_kf;
+  int is_src_frame_alt_ref;
+  int sframe_due;
+
+  int high_source_sad;
+  uint64_t avg_source_sad;
+  uint64_t prev_avg_source_sad;
+  uint64_t frame_source_sad;
+
+  int avg_frame_bandwidth;  // Average frame size target for clip
+  int min_frame_bandwidth;  // Minimum allocation used for any frame
+  int max_frame_bandwidth;  // Maximum burst rate allowed for a frame.
+  int prev_avg_frame_bandwidth;
+
+  int ni_av_qi;
+  int ni_tot_qi;
+
+  int decimation_factor;
+  int decimation_count;
+  int prev_frame_is_dropped;
+  int drop_count_consec;
+  int max_consec_drop;
+
+  /*!
+   * Frame number for encoded frames (non-dropped).
+   * Use for setting the rtc reference structure.
+   */
+  unsigned int frame_number_encoded;
+
+  /*!\endcond */
+  /*!
+   * User specified maximum Q allowed for current frame
+   */
+  int worst_quality;
+  /*!
+   * User specified minimum Q allowed for current frame
+   */
+  int best_quality;
+
+  /*!\cond */
+
+  // Rate control history for the last frame (1) and the frame before (2):
+  // -1: overshoot
+  //  1: undershoot
+  //  0: not initialized.
+  int rc_1_frame;
+  int rc_2_frame;
+  int q_1_frame;
+  int q_2_frame;
+
+  /*!\endcond */
+  /*!
+   * Proposed maximum allowed Q for current frame
+   */
+  int active_worst_quality;
+
+  /*!\cond */
+  // Track amount of low motion in scene
+  int avg_frame_low_motion;
+  int cnt_zeromv;
+
+  // Percent of blocks in the frame with motion (non-zero temporal sad).
+  int percent_blocks_with_motion;
+
+  // Maximum value of source sad across all blocks of frame.
+  uint64_t max_block_source_sad;
+
+  // For dynamic resize, 1 pass cbr.
+  RESIZE_STATE resize_state;
+  int resize_avg_qp;
+  int resize_buffer_underflow;
+  int resize_count;
+
+  // Flag to disable content related qp adjustment.
+  int rtc_external_ratectrl;
+
+  // Stores fast_extra_bits of the current frame.
+  int frame_level_fast_extra_bits;
+
+  double frame_level_rate_correction_factors[RATE_FACTOR_LEVELS];
+
+  int frame_num_last_gf_refresh;
+
+  int prev_coded_width;
+  int prev_coded_height;
+
+  // The ratio used for inter frames in bit estimation.
+  // TODO(yunqing): if golden frame is treated differently (e.g. gf_cbr_boost_
+  // pct > THR), consider adding bit_est_ratio_g for golden frames.
+  int bit_est_ratio;
+
+  // Whether to use a fixed qp for the frame, bypassing internal rate control.
+  // This flag will reset to 0 after every frame.
+  int use_external_qp_one_pass;
+  /*!\endcond */
+} RATE_CONTROL;
+
+/*!
+ * \brief Primary Rate Control parameters and status
+ */
+typedef struct {
+  // Sub-GOP level rate targeting variables
+
+  /*!
+   * Target bit budget for the current GF / ARF group of frames.
+   */
+  int64_t gf_group_bits;
+
+  /*!
+   * Boost factor used to calculate the extra bits allocated to the key frame
+   */
+  int kf_boost;
+
+  /*!
+   * Boost factor used to calculate the extra bits allocated to ARFs and GFs
+   */
+  int gfu_boost;
+
+  /*!
+   * Stores the determined gf group lengths for a set of gf groups
+   */
+  int gf_intervals[MAX_NUM_GF_INTERVALS];
+
+  /*!
+   * The current group's index into gf_intervals[]
+   */
+  int cur_gf_index;
+
+  /*!\cond */
+  int num_regions;
+
+  REGIONS regions[MAX_FIRSTPASS_ANALYSIS_FRAMES];
+  int regions_offset;  // offset of regions from the last keyframe
+  int frames_till_regions_update;
+
+  int baseline_gf_interval;
+
+  int constrained_gf_group;
+
+  int this_key_frame_forced;
+
+  int next_key_frame_forced;
+  /*!\endcond */
+
+  /*!
+   * Initial buffer level in ms for CBR / low delay encoding
+   */
+  int64_t starting_buffer_level;
+
+  /*!
+   * Optimum / target buffer level in ms for CBR / low delay encoding
+   */
+  int64_t optimal_buffer_level;
+
+  /*!
+   * Maximum target buffer level in ms for CBR / low delay encoding
+   */
+  int64_t maximum_buffer_size;
+
+  /*!
+   * Q index used for ALT frame
+   */
+  int arf_q;
+
+  /*!\cond */
+  float_t arf_boost_factor;
+
+  int base_layer_qp;
+
+  // Total number of stats used only for kf_boost calculation.
+  int num_stats_used_for_kf_boost;
+
+  // Total number of stats used only for gfu_boost calculation.
+  int num_stats_used_for_gfu_boost;
+
+  // Total number of stats required by gfu_boost calculation.
+  int num_stats_required_for_gfu_boost;
+
+  int enable_scenecut_detection;
+
+  int use_arf_in_this_kf_group;
+
+  int ni_frames;
+
+  double tot_q;
+  /*!\endcond */
+
+  /*!
+   * Q used for last boosted (non leaf) frame
+   */
+  int last_kf_qindex;
+
+  /*!
+   * Average of q index of previous encoded frames in a sequence.
+   */
+  int avg_frame_qindex[FRAME_TYPES];
+
+#if CONFIG_FPMT_TEST
+  /*!
+   * Temporary variable used in simulating the delayed update of
+   * active_best_quality.
+   */
+  int temp_active_best_quality[MAX_ARF_LAYERS + 1];
+
+  /*!
+   * Temporary variable used in simulating the delayed update of
+   * last_boosted_qindex.
+   */
+  int temp_last_boosted_qindex;
+
+  /*!
+   * Temporary variable used in simulating the delayed update of
+   * avg_q.
+   */
+  double temp_avg_q;
+
+  /*!
+   * Temporary variable used in simulating the delayed update of
+   * last_q.
+   */
+  int temp_last_q[FRAME_TYPES];
+
+  /*!
+   * Temporary variable used in simulating the delayed update of
+   * projected_frame_size.
+   */
+  int temp_projected_frame_size;
+
+  /*!
+   * Temporary variable used in simulating the delayed update of
+   * total_actual_bits.
+   */
+  int64_t temp_total_actual_bits;
+
+  /*!
+   * Temporary variable used in simulating the delayed update of
+   * buffer_level.
+   */
+  int64_t temp_buffer_level;
+
+  /*!
+   * Temporary variable used in simulating the delayed update of
+   * vbr_bits_off_target.
+   */
+  int64_t temp_vbr_bits_off_target;
+
+  /*!
+   * Temporary variable used in simulating the delayed update of
+   * vbr_bits_off_target_fast.
+   */
+  int64_t temp_vbr_bits_off_target_fast;
+
+  /*!
+   * Temporary variable used in simulating the delayed update of
+   * rate_correction_factors.
+   */
+  double temp_rate_correction_factors[RATE_FACTOR_LEVELS];
+
+  /*!
+   * Temporary variable used in simulating the delayed update of
+   * rate_error_estimate.
+   */
+  int temp_rate_error_estimate;
+
+  /*!
+   * Temporary variable used in simulating the delayed update of
+   * rolling_arf_group_target_bits.
+   */
+  int temp_rolling_arf_group_target_bits;
+
+  /*!
+   * Temporary variable used in simulating the delayed update of
+   * rolling_arf_group_actual_bits.
+   */
+  int temp_rolling_arf_group_actual_bits;
+
+  /*!
+   * Temporary variable used in simulating the delayed update of
+   * bits_left.
+   */
+  int64_t temp_bits_left;
+
+  /*!
+   * Temporary variable used in simulating the delayed update of
+   * extend_minq.
+   */
+  int temp_extend_minq;
+
+  /*!
+   * Temporary variable used in simulating the delayed update of
+   * extend_maxq.
+   */
+  int temp_extend_maxq;
+
+#endif
+  /*!
+   * Proposed minimum allowed Q for different layers in a coding pyramid
+   */
+  int active_best_quality[MAX_ARF_LAYERS + 1];
+
+  /*!
+   * Q used for last boosted (non leaf) frame (GF/KF/ARF)
+   */
+  int last_boosted_qindex;
+
+  /*!
+   * Average Q value of previous inter frames
+   */
+  double avg_q;
+
+  /*!
+   * Q used on last encoded frame of the given type.
+   */
+  int last_q[FRAME_TYPES];
+
+  /*!
+   * Correction factors used to adjust the q estimate for a given target rate
+   * in the encode loop.
+   */
+  double rate_correction_factors[RATE_FACTOR_LEVELS];
+
+  /*!
+   * Current total consumed bits.
+   */
+  int64_t total_actual_bits;
+
+  /*!
+   * Current total target bits.
+   */
+  int64_t total_target_bits;
+
+  /*!
+   * Current buffer level.
+   */
+  int64_t buffer_level;
+
+  /*!
+   * Percent rc error.
+   */
+  int rate_error_estimate;
+
+  /*!
+   * Error bits available from previously encoded frames.
+   */
+  int64_t vbr_bits_off_target;
+
+  /*!
+   * Error bits available from previously encoded frames undershoot.
+   */
+  int64_t vbr_bits_off_target_fast;
+
+  /*!
+   * Total bits deviated from the average frame target, from previously
+   * encoded frames.
+   */
+  int64_t bits_off_target;
+
+  /*!
+   * Rolling monitor target bits updated based on current frame target size.
+   */
+  int rolling_target_bits;
+
+  /*!
+   * Rolling monitor actual bits updated based on current frame final projected
+   * size.
+   */
+  int rolling_actual_bits;
+
+  /*!
+   * The history of qindex for each frame.
+   * Only used when RT_PASSIVE_STRATEGY = 1.
+   */
+  int q_history[MAX_Q_HISTORY];
+} PRIMARY_RATE_CONTROL;
+
+/*!\cond */
+
+struct AV1_COMP;
+struct AV1EncoderConfig;
+struct GF_GROUP;
+
+void av1_primary_rc_init(const struct AV1EncoderConfig *oxcf,
+                         PRIMARY_RATE_CONTROL *p_rc);
+
+void av1_rc_init(const struct AV1EncoderConfig *oxcf, RATE_CONTROL *rc);
+
+int av1_estimate_bits_at_q(const struct AV1_COMP *cpi, int q,
+                           double correction_factor);
+
+double av1_convert_qindex_to_q(int qindex, aom_bit_depth_t bit_depth);
+
+void av1_rc_init_minq_luts(void);
+
+int av1_rc_get_default_min_gf_interval(int width, int height, double framerate);
+// Note av1_rc_get_default_max_gf_interval() requires the min_gf_interval to
+// be passed in to ensure that the max_gf_interval returned is at least as big
+// as that.
+int av1_rc_get_default_max_gf_interval(double framerate, int min_gf_interval);
+
+// Generally at the high level, the following flow is expected
+// to be enforced for rate control:
+// First call per frame, one of:
+//   av1_get_one_pass_rt_params()
+//   av1_get_second_pass_params()
+// depending on the usage to set the rate control encode parameters desired.
+//
+// Then, call encode_frame_to_data_rate() to perform the
+// actual encode. This function will in turn call encode_frame()
+// one or more times, followed by:
+//   av1_rc_postencode_update_drop_frame()
+//
+// The majority of rate control parameters are only expected
+// to be set in the av1_get_..._params() functions and
+// updated during the av1_rc_postencode_update...() functions.
+// The only exceptions are av1_rc_drop_frame() and
+// av1_rc_update_rate_correction_factors() functions.
+
+// Functions to set parameters for encoding before the actual
+// encode_frame_to_data_rate() function.
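+// A minimal sketch of that flow for one frame in 1-pass real-time mode
+// (illustrative pseudo-code, not an upstream API; error handling omitted):
+//   av1_get_one_pass_rt_params(cpi, &frame_type, frame_input, frame_flags);
+//   ... encode_frame_to_data_rate() performs the actual encode ...
+//   if (frame_dropped) av1_rc_postencode_update_drop_frame(cpi);
+//   else               av1_rc_postencode_update(cpi, bytes_used);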
+struct EncodeFrameInput;
+
+// Post encode update of the rate control parameters based
+// on bytes used
+void av1_rc_postencode_update(struct AV1_COMP *cpi, uint64_t bytes_used);
+// Post encode update of the rate control parameters for dropped frames
+void av1_rc_postencode_update_drop_frame(struct AV1_COMP *cpi);
+
+/*!\endcond */
+/*!\brief Updates the rate correction factor linking Q to output bits
+ *
+ * This function updates the Q rate correction factor after an encode
+ * cycle depending on whether we overshot or undershot the target rate.
+ *
+ * \ingroup rate_control
+ * \param[in]   cpi                   Top level encoder instance structure
+ * \param[in]   is_encode_stage       Indicates if recode loop or post-encode
+ * \param[in]   width                 Frame width
+ * \param[in]   height                Frame height
+ *
+ * \remark Updates the relevant rate correction factor in cpi->rc
+ */
+void av1_rc_update_rate_correction_factors(struct AV1_COMP *cpi,
+                                           int is_encode_stage, int width,
+                                           int height);
+/*!\cond */
+
+// Decide if we should drop this frame: For 1-pass CBR.
+// Changes only the decimation count in the rate control structure
+int av1_rc_drop_frame(struct AV1_COMP *cpi);
+
+// Computes frame size bounds.
+void av1_rc_compute_frame_size_bounds(const struct AV1_COMP *cpi,
+                                      int this_frame_target,
+                                      int *frame_under_shoot_limit,
+                                      int *frame_over_shoot_limit);
+
+/*!\endcond */
+
+/*!\brief Picks q and q bounds given the rate control parameters in \c cpi->rc.
+ *
+ * \ingroup rate_control
+ * \param[in]       cpi          Top level encoder structure
+ * \param[in]       width        Coded frame width
+ * \param[in]       height       Coded frame height
+ * \param[in]       gf_index     Index of this frame in the golden frame group
+ * \param[out]      bottom_index Bottom bound for q index (best quality)
+ * \param[out]      top_index    Top bound for q index (worst quality)
+ * \return Returns selected q index to be used for encoding this frame.
+ * Also, updates \c rc->arf_q.
+ */
+int av1_rc_pick_q_and_bounds(struct AV1_COMP *cpi, int width, int height,
+                             int gf_index, int *bottom_index, int *top_index);
+
+/*!\brief Estimates q to achieve a target bits per frame
+ *
+ * \ingroup rate_control
+ * \param[in]   cpi                   Top level encoder instance structure
+ * \param[in]   target_bits_per_frame Frame rate target
+ * \param[in]   active_worst_quality  Max Q allowed
+ * \param[in]   active_best_quality   Min Q allowed
+ * \param[in]   width                 Frame width
+ * \param[in]   height                Frame height
+ *
+ * \return Returns a q index value
+ */
+int av1_rc_regulate_q(const struct AV1_COMP *cpi, int target_bits_per_frame,
+                      int active_best_quality, int active_worst_quality,
+                      int width, int height);
+
+/*!\cond */
+// Gets the appropriate bpmb enumerator based on the frame and content type
+int av1_get_bpmb_enumerator(FRAME_TYPE frame_type,
+                            const int is_screen_content_type);
+
+// Estimates bits per mb for a given qindex and correction factor.
+int av1_rc_bits_per_mb(const struct AV1_COMP *cpi, FRAME_TYPE frame_type,
+                       int qindex, double correction_factor,
+                       int accurate_estimate);
+
+// Clamping utilities for bitrate targets for iframes and pframes.
+int av1_rc_clamp_iframe_target_size(const struct AV1_COMP *const cpi,
+                                    int64_t target);
+int av1_rc_clamp_pframe_target_size(const struct AV1_COMP *const cpi,
+                                    int target, uint8_t frame_update_type);
+
+// Find q_index corresponding to desired_q, within [best_qindex, worst_qindex].
+// To be precise, 'q_index' is the smallest integer for which the corresponding
+// q >= desired_q.
+// If no such q index is found, returns 'worst_qindex'.
+int av1_find_qindex(double desired_q, aom_bit_depth_t bit_depth,
+                    int best_qindex, int worst_qindex);
+
+// Computes a q delta (in "q index" terms) to get from a starting q value
+// to a target q value.
+int av1_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
+                       aom_bit_depth_t bit_depth);
+
+// Computes a q delta (in "q index" terms) to get from a starting q value
+// to a value that should equate to the given rate ratio.
+int av1_compute_qdelta_by_rate(const struct AV1_COMP *cpi,
+                               FRAME_TYPE frame_type, int qindex,
+                               double rate_target_ratio);
+
+int av1_frame_type_qdelta(const struct AV1_COMP *cpi, int q);
+
+void av1_rc_update_framerate(struct AV1_COMP *cpi, int width, int height);
+
+void av1_rc_set_gf_interval_range(const struct AV1_COMP *const cpi,
+                                  RATE_CONTROL *const rc);
+
+void av1_set_target_rate(struct AV1_COMP *cpi, int width, int height);
+
+int av1_resize_one_pass_cbr(struct AV1_COMP *cpi);
+
+void av1_rc_set_frame_target(struct AV1_COMP *cpi, int target, int width,
+                             int height);
+
+void av1_adjust_gf_refresh_qp_one_pass_rt(struct AV1_COMP *cpi);
+
+void av1_set_rtc_reference_structure_one_layer(struct AV1_COMP *cpi,
+                                               int gf_update);
+
+/*!\endcond */
+/*!\brief Calculates how many bits to use for a P frame in one pass vbr
+ *
+ * \ingroup rate_control
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]  cpi                Top level encoder structure
+ * \param[in]  frame_update_type  Type of frame
+ *
+ * \return Returns the target number of bits for this frame.
+ */
+int av1_calc_pframe_target_size_one_pass_vbr(
+    const struct AV1_COMP *const cpi, FRAME_UPDATE_TYPE frame_update_type);
+
+/*!\brief Calculates how many bits to use for an i frame in one pass vbr
+ *
+ * \ingroup rate_control
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]  cpi  Top level encoder structure
+ *
+ * \return Returns the target number of bits for this frame.
+ */
+int av1_calc_iframe_target_size_one_pass_vbr(const struct AV1_COMP *const cpi);
+
+/*!\brief Calculates how many bits to use for a P frame in one pass cbr
+ *
+ * \ingroup rate_control
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]  cpi                Top level encoder structure
+ * \param[in]  frame_update_type  Type of frame
+ *
+ * \return Returns the target number of bits for this frame.
+ */
+int av1_calc_pframe_target_size_one_pass_cbr(
+    const struct AV1_COMP *cpi, FRAME_UPDATE_TYPE frame_update_type);
+
+/*!\brief Calculates how many bits to use for an i frame in one pass cbr
+ *
+ * \ingroup rate_control
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]  cpi  Top level encoder structure
+ *
+ * \return Returns the target number of bits for this frame.
+ */
+int av1_calc_iframe_target_size_one_pass_cbr(const struct AV1_COMP *cpi);
+
+/*!\brief Set up the rate control parameters for 1 pass real-time mode.
+ *
+ * - Sets the frame type and target frame size.
+ * - Sets the GF update.
+ * - Checks for scene change.
+ * - Sets the reference prediction structure for 1 layer (non-SVC).
+ * - Resets and updates are done for SVC.
+ *
+ * \ingroup rate_control
+ * \param[in]  cpi          Top level encoder structure
+ * \param[in]  frame_type   Encoder frame type
+ * \param[in]  frame_input  Current and last input source frames
+ * \param[in]  frame_flags  Encoder frame flags
+ *
+ * \remark Nothing is returned. Instead the settings computed in this
+ * function are set in: \c frame_params, \c cpi->common, \c cpi->rc,
+ * \c cpi->svc.
+ */
+void av1_get_one_pass_rt_params(struct AV1_COMP *cpi,
+                                FRAME_TYPE *const frame_type,
+                                const struct EncodeFrameInput *frame_input,
+                                unsigned int frame_flags);
+
+/*!\brief Increase q on expected encoder overshoot, for CBR mode.
+ *
+ * Handles the case when the encoder is expected to create a large frame:
+ * - q is increased to a value closer to \c cpi->rc.worst_quality
+ * - avg_frame_qindex is reset
+ * - buffer levels are reset
+ * - rate correction factor is adjusted
+ *
+ * \ingroup rate_control
+ * \param[in]      cpi  Top level encoder structure
+ * \param[in,out]  q    Current q index
+ *
+ * \return q is returned, and updates are done to \c cpi->rc.
+ */
+int av1_encodedframe_overshoot_cbr(struct AV1_COMP *cpi, int *q);
+
+/*!\brief Compute the q_index for a single frame.
+ *
+ * Intended to be used with AOM_Q mode.
+ *
+ * \param[in]  base_q_index      Base q index
+ * \param[in]  gf_update_type    GOP update type
+ * \param[in]  gf_pyramid_level  GOP level of the current frame
+ * \param[in]  arf_q             ARF q_index
+ *
+ * \return Returns the q_index for the current frame.
+ */
+int av1_q_mode_get_q_index(int base_q_index, int gf_update_type,
+                           int gf_pyramid_level, int arf_q);
+
+/*!\brief Compute the q_index for the ARF of a GOP.
+ *
+ * \param[in]  base_q_index      Base q index
+ * \param[in]  gfu_boost         GFU boost
+ * \param[in]  bit_depth         Bit depth
+ * \param[in]  arf_boost_factor  ARF boost factor
+ *
+ * \return Returns the q_index for the ARF frame.
+ */
+int av1_get_arf_q_index(int base_q_index, int gfu_boost, int bit_depth,
+                        double arf_boost_factor);
+
+#if !CONFIG_REALTIME_ONLY
+struct TplDepFrame;
+/*!\brief Compute the q_index for the ARF of a GOP in Q mode.
+ *
+ * \param[in]  cpi        Top level encoder structure
+ * \param[in]  tpl_frame  Tpl Frame stats
+ *
+ * \return Returns the q_index for the ARF frame.
+ */
+int av1_get_arf_q_index_q_mode(struct AV1_COMP *cpi,
+                               struct TplDepFrame *tpl_frame);
+#endif  // !CONFIG_REALTIME_ONLY
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_RATECTRL_H_
diff --git a/third_party/aom/av1/encoder/rc_utils.h b/third_party/aom/av1/encoder/rc_utils.h
new file mode 100644
index 0000000000..fe22ee5afb
--- /dev/null
+++ b/third_party/aom/av1/encoder/rc_utils.h
@@ -0,0 +1,469 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#ifndef AOM_AV1_ENCODER_RC_UTILS_H_ +#define AOM_AV1_ENCODER_RC_UTILS_H_ + +#include "av1/encoder/encoder.h" +#include "aom_dsp/psnr.h" + +#ifdef __cplusplus +extern "C" { +#endif + +static AOM_INLINE void check_reset_rc_flag(AV1_COMP *cpi) { + RATE_CONTROL *rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + if (cpi->common.current_frame.frame_number > + (unsigned int)cpi->svc.number_spatial_layers) { + if (cpi->ppi->use_svc) { + av1_svc_check_reset_layer_rc_flag(cpi); + } else { + if (rc->avg_frame_bandwidth > (3 * rc->prev_avg_frame_bandwidth >> 1) || + rc->avg_frame_bandwidth < (rc->prev_avg_frame_bandwidth >> 1)) { + rc->rc_1_frame = 0; + rc->rc_2_frame = 0; + p_rc->bits_off_target = p_rc->optimal_buffer_level; + p_rc->buffer_level = p_rc->optimal_buffer_level; + } + } + } +} + +static AOM_INLINE void set_primary_rc_buffer_sizes(const AV1EncoderConfig *oxcf, + AV1_PRIMARY *ppi) { + PRIMARY_RATE_CONTROL *p_rc = &ppi->p_rc; + const RateControlCfg *const rc_cfg = &oxcf->rc_cfg; + + const int64_t bandwidth = rc_cfg->target_bandwidth; + const int64_t starting = rc_cfg->starting_buffer_level_ms; + const int64_t optimal = rc_cfg->optimal_buffer_level_ms; + const int64_t maximum = rc_cfg->maximum_buffer_size_ms; + + p_rc->starting_buffer_level = starting * bandwidth / 1000; + p_rc->optimal_buffer_level = + (optimal == 0) ? bandwidth / 8 : optimal * bandwidth / 1000; + p_rc->maximum_buffer_size = + (maximum == 0) ? bandwidth / 8 : maximum * bandwidth / 1000; + + // Under a configuration change, where maximum_buffer_size may change, + // keep buffer level clipped to the maximum allowed buffer size. + p_rc->bits_off_target = + AOMMIN(p_rc->bits_off_target, p_rc->maximum_buffer_size); + p_rc->buffer_level = AOMMIN(p_rc->buffer_level, p_rc->maximum_buffer_size); +} + +static AOM_INLINE void config_target_level(AV1_COMP *const cpi, + AV1_LEVEL target_level, int tier) { + AV1EncoderConfig *const oxcf = &cpi->oxcf; + SequenceHeader *const seq_params = cpi->common.seq_params; + TileConfig *const tile_cfg = &oxcf->tile_cfg; + RateControlCfg *const rc_cfg = &oxcf->rc_cfg; + + // Adjust target bitrate to be no larger than 70% of level limit. + const BITSTREAM_PROFILE profile = seq_params->profile; + const double level_bitrate_limit = + av1_get_max_bitrate_for_level(target_level, tier, profile); + const int64_t max_bitrate = (int64_t)(level_bitrate_limit * 0.70); + rc_cfg->target_bandwidth = AOMMIN(rc_cfg->target_bandwidth, max_bitrate); + // Also need to update cpi->ppi->twopass.bits_left. + TWO_PASS *const twopass = &cpi->ppi->twopass; + FIRSTPASS_STATS *stats = twopass->stats_buf_ctx->total_stats; + if (stats != NULL) + cpi->ppi->twopass.bits_left = + (int64_t)(stats->duration * rc_cfg->target_bandwidth / 10000000.0); + + // Adjust max over-shoot percentage. + rc_cfg->over_shoot_pct = 0; + + // Adjust max quantizer. + rc_cfg->worst_allowed_q = 255; + + // Adjust number of tiles and tile columns to be under level limit. + int max_tiles, max_tile_cols; + av1_get_max_tiles_for_level(target_level, &max_tiles, &max_tile_cols); + while (tile_cfg->tile_columns > 0 && + (1 << tile_cfg->tile_columns) > max_tile_cols) { + --tile_cfg->tile_columns; + } + const int tile_cols = (1 << tile_cfg->tile_columns); + while (tile_cfg->tile_rows > 0 && + tile_cols * (1 << tile_cfg->tile_rows) > max_tiles) { + --tile_cfg->tile_rows; + } + + // Adjust min compression ratio. 
+ const int still_picture = seq_params->still_picture; + const double min_cr = + av1_get_min_cr_for_level(target_level, tier, still_picture); + rc_cfg->min_cr = AOMMAX(rc_cfg->min_cr, (unsigned int)(min_cr * 100)); +} + +#if !CONFIG_REALTIME_ONLY + +/*!\brief Function to test for conditions that indicate we should loop + * back and recode a frame. + * + * \ingroup rate_control + * + * \param[in] cpi Top-level encoder structure + * \param[in] high_limit Upper rate threshold + * \param[in] low_limit Lower rate threshold + * \param[in] q Current q index + * \param[in] maxq Maximum allowed q index + * \param[in] minq Minimum allowed q index + * + * \return Indicates if a recode is required. + * \retval 1 Recode Required + * \retval 0 No Recode required + */ +static AOM_INLINE int recode_loop_test(AV1_COMP *cpi, int high_limit, + int low_limit, int q, int maxq, + int minq) { + const RATE_CONTROL *const rc = &cpi->rc; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + const int frame_is_kfgfarf = frame_is_kf_gf_arf(cpi); + int force_recode = 0; + + if ((rc->projected_frame_size >= rc->max_frame_bandwidth) || + (cpi->sf.hl_sf.recode_loop == ALLOW_RECODE) || + (frame_is_kfgfarf && + (cpi->sf.hl_sf.recode_loop == ALLOW_RECODE_KFARFGF))) { + // TODO(agrange) high_limit could be greater than the scale-down threshold. + if ((rc->projected_frame_size > high_limit && q < maxq) || + (rc->projected_frame_size < low_limit && q > minq)) { + force_recode = 1; + } else if (cpi->oxcf.rc_cfg.mode == AOM_CQ) { + // Deal with frame undershoot and whether or not we are + // below the automatically set cq level. + if (q > oxcf->rc_cfg.cq_level && + rc->projected_frame_size < ((rc->this_frame_target * 7) >> 3)) { + force_recode = 1; + } + } + } + return force_recode; +} + +static AOM_INLINE double av1_get_gfu_boost_projection_factor(double min_factor, + double max_factor, + int frame_count) { + double factor = sqrt((double)frame_count); + factor = AOMMIN(factor, max_factor); + factor = AOMMAX(factor, min_factor); + factor = (200.0 + 10.0 * factor); + return factor; +} + +static AOM_INLINE int get_gfu_boost_from_r0_lap(double min_factor, + double max_factor, double r0, + int frames_to_key) { + double factor = av1_get_gfu_boost_projection_factor(min_factor, max_factor, + frames_to_key); + const int boost = (int)rint(factor / r0); + return boost; +} + +static AOM_INLINE double av1_get_kf_boost_projection_factor(int frame_count) { + double factor = sqrt((double)frame_count); + factor = AOMMIN(factor, 10.0); + factor = AOMMAX(factor, 4.0); + factor = (75.0 + 14.0 * factor); + return factor; +} + +static AOM_INLINE int get_regulated_q_overshoot(AV1_COMP *const cpi, + int is_encode_stage, int q_low, + int q_high, int top_index, + int bottom_index) { + const AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + + av1_rc_update_rate_correction_factors(cpi, is_encode_stage, cm->width, + cm->height); + + int q_regulated = + av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, + AOMMAX(q_high, top_index), cm->width, cm->height); + + int retries = 0; + while (q_regulated < q_low && retries < 10) { + av1_rc_update_rate_correction_factors(cpi, is_encode_stage, cm->width, + cm->height); + q_regulated = + av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, + AOMMAX(q_high, top_index), cm->width, cm->height); + retries++; + } + return q_regulated; +} + +static AOM_INLINE int get_regulated_q_undershoot(AV1_COMP *const cpi, + int is_encode_stage, + int q_high, int top_index, + int 
bottom_index) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+
+  av1_rc_update_rate_correction_factors(cpi, is_encode_stage, cm->width,
+                                        cm->height);
+  int q_regulated = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+                                      top_index, cm->width, cm->height);
+
+  int retries = 0;
+  while (q_regulated > q_high && retries < 10) {
+    av1_rc_update_rate_correction_factors(cpi, is_encode_stage, cm->width,
+                                          cm->height);
+    q_regulated = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+                                    top_index, cm->width, cm->height);
+    retries++;
+  }
+  return q_regulated;
+}
+
+/*!\brief Called after encode_with_recode_loop() has just encoded a frame.
+ * This function works out whether we undershot or overshot our bitrate
+ * target and adjusts q as appropriate. It also decides whether or not
+ * we need to recode the frame to get closer to the target rate.
+ *
+ * \ingroup rate_control
+ *
+ * \param[in]      cpi              Top-level encoder structure
+ * \param[out]     loop             Should we go around the recode loop again
+ * \param[in,out]  q                New q index value
+ * \param[in,out]  q_low            Low q index limit for this loop iteration
+ * \param[in,out]  q_high           High q index limit for this loop iteration
+ * \param[in]      top_index        Max permitted new value for q index
+ * \param[in]      bottom_index     Min permitted new value for q index
+ * \param[in,out]  undershoot_seen  Have we seen undershoot on this frame
+ * \param[in,out]  overshoot_seen   Have we seen overshoot on this frame
+ * \param[in,out]  low_cr_seen      Have we previously triggered a recode
+ *                                  because the compression ratio was less
+ *                                  than a given minimum threshold.
+ * \param[in]      loop_count       Loop iterations so far.
+ *
+ */
+static AOM_INLINE void recode_loop_update_q(
+    AV1_COMP *const cpi, int *const loop, int *const q, int *const q_low,
+    int *const q_high, const int top_index, const int bottom_index,
+    int *const undershoot_seen, int *const overshoot_seen,
+    int *const low_cr_seen, const int loop_count) {
+  AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg;
+  *loop = 0;
+
+  // Special case for overlay frame.
+  if (rc->is_src_frame_alt_ref &&
+      rc->projected_frame_size < rc->max_frame_bandwidth)
+    return;
+
+  const int min_cr = rc_cfg->min_cr;
+  if (min_cr > 0) {
+    const double compression_ratio =
+        av1_get_compression_ratio(cm, rc->projected_frame_size >> 3);
+    const double target_cr = min_cr / 100.0;
+    if (compression_ratio < target_cr) {
+      *low_cr_seen = 1;
+      if (*q < rc->worst_quality) {
+        const double cr_ratio = target_cr / compression_ratio;
+        const int projected_q = AOMMAX(*q + 1, (int)(*q * cr_ratio * cr_ratio));
+        *q = AOMMIN(AOMMIN(projected_q, *q + 32), rc->worst_quality);
+        *q_low = AOMMAX(*q, *q_low);
+        *q_high = AOMMAX(*q, *q_high);
+        *loop = 1;
+      }
+    }
+    if (*low_cr_seen) return;
+  }
+
+  if (cpi->ppi->level_params.keep_level_stats &&
+      !is_stat_generation_stage(cpi)) {
+    // Initialize level info at the beginning of each sequence.
+ if (cm->current_frame.frame_type == KEY_FRAME && + cpi->ppi->gf_group.refbuf_state[cpi->gf_frame_index] == REFBUF_RESET) { + av1_init_level_info(cpi); + } + const AV1LevelParams *const level_params = &cpi->ppi->level_params; + // TODO(any): currently only checking operating point 0 + const AV1LevelInfo *const level_info = level_params->level_info[0]; + const DECODER_MODEL *const decoder_models = level_info->decoder_models; + const AV1_LEVEL target_level = level_params->target_seq_level_idx[0]; + + if (target_level < SEQ_LEVELS && + decoder_models[target_level].status == DECODER_MODEL_OK) { + DECODER_MODEL_STATUS status = av1_decoder_model_try_smooth_buf( + cpi, rc->projected_frame_size, &decoder_models[target_level]); + + if ((status == SMOOTHING_BUFFER_UNDERFLOW || + status == SMOOTHING_BUFFER_OVERFLOW) && + *q < rc->worst_quality) { + *q = AOMMIN(*q + 10, rc->worst_quality); + *q_low = AOMMAX(*q, *q_low); + *q_high = AOMMAX(*q, *q_high); + *loop = 1; + return; + } + } + } + + if (rc_cfg->mode == AOM_Q) return; + + const int last_q = *q; + int frame_over_shoot_limit = 0, frame_under_shoot_limit = 0; + av1_rc_compute_frame_size_bounds(cpi, rc->this_frame_target, + &frame_under_shoot_limit, + &frame_over_shoot_limit); + if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1; + + if (cm->current_frame.frame_type == KEY_FRAME && + p_rc->this_key_frame_forced && + rc->projected_frame_size < rc->max_frame_bandwidth) { + int64_t kf_err; + const int64_t high_err_target = cpi->ambient_err; + const int64_t low_err_target = cpi->ambient_err >> 1; + +#if CONFIG_AV1_HIGHBITDEPTH + if (cm->seq_params->use_highbitdepth) { + kf_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf); + } else { + kf_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf); + } +#else + kf_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf); +#endif + // Prevent possible divide by zero error below for perfect KF + kf_err += !kf_err; + + // The key frame is not good enough or we can afford + // to make it better without undue risk of popping. + if ((kf_err > high_err_target && + rc->projected_frame_size <= frame_over_shoot_limit) || + (kf_err > low_err_target && + rc->projected_frame_size <= frame_under_shoot_limit)) { + // Lower q_high + *q_high = AOMMAX(*q - 1, *q_low); + + // Adjust Q + *q = (int)((*q * high_err_target) / kf_err); + *q = AOMMIN(*q, (*q_high + *q_low) >> 1); + } else if (kf_err < low_err_target && + rc->projected_frame_size >= frame_under_shoot_limit) { + // The key frame is much better than the previous frame + // Raise q_low + *q_low = AOMMIN(*q + 1, *q_high); + + // Adjust Q + *q = (int)((*q * low_err_target) / kf_err); + *q = AOMMIN(*q, (*q_high + *q_low + 1) >> 1); + } + + // Clamp Q to upper and lower limits: + *q = clamp(*q, *q_low, *q_high); + *loop = (*q != last_q); + return; + } + + if (recode_loop_test(cpi, frame_over_shoot_limit, frame_under_shoot_limit, *q, + AOMMAX(*q_high, top_index), bottom_index)) { + // Is the projected frame size out of range and are we allowed + // to attempt to recode. + + // Frame size out of permitted range: + // Update correction factor & compute new Q to try... + // Frame is too large + if (rc->projected_frame_size > rc->this_frame_target) { + // Special case if the projected size is > the max allowed. 
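+      // In that case q_high itself is raised first: the current q_high is
+      // converted to its real quantizer value and scaled up by the overshoot
+      // factor (projected_frame_size / max_frame_bandwidth), so e.g. a 2x
+      // overshoot roughly doubles the quantizer that q_high maps back to.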
+      if (*q == *q_high &&
+          rc->projected_frame_size >= rc->max_frame_bandwidth) {
+        const double q_val_high_current =
+            av1_convert_qindex_to_q(*q_high, cm->seq_params->bit_depth);
+        const double q_val_high_new =
+            q_val_high_current *
+            ((double)rc->projected_frame_size / rc->max_frame_bandwidth);
+        *q_high = av1_find_qindex(q_val_high_new, cm->seq_params->bit_depth,
+                                  rc->best_quality, rc->worst_quality);
+      }
+
+      // Raise q_low to at least the current value
+      *q_low = AOMMIN(*q + 1, *q_high);
+
+      if (*undershoot_seen || loop_count > 2 ||
+          (loop_count == 2 && !frame_is_intra_only(cm))) {
+        av1_rc_update_rate_correction_factors(cpi, 1, cm->width, cm->height);
+
+        *q = (*q_high + *q_low + 1) / 2;
+      } else if (loop_count == 2 && frame_is_intra_only(cm)) {
+        const int q_mid = (*q_high + *q_low + 1) / 2;
+        const int q_regulated = get_regulated_q_overshoot(
+            cpi, 1, *q_low, *q_high, top_index, bottom_index);
+        // Get 'q' in-between 'q_mid' and 'q_regulated' for a smooth
+        // transition between loop_count < 2 and loop_count > 2.
+        *q = (q_mid + q_regulated + 1) / 2;
+      } else {
+        *q = get_regulated_q_overshoot(cpi, 1, *q_low, *q_high, top_index,
+                                       bottom_index);
+      }
+
+      *overshoot_seen = 1;
+    } else {
+      // Frame is too small
+      *q_high = AOMMAX(*q - 1, *q_low);
+
+      if (*overshoot_seen || loop_count > 2 ||
+          (loop_count == 2 && !frame_is_intra_only(cm))) {
+        av1_rc_update_rate_correction_factors(cpi, 1, cm->width, cm->height);
+        *q = (*q_high + *q_low) / 2;
+      } else if (loop_count == 2 && frame_is_intra_only(cm)) {
+        const int q_mid = (*q_high + *q_low) / 2;
+        const int q_regulated = get_regulated_q_undershoot(
+            cpi, 1, *q_high, top_index, bottom_index);
+        // Get 'q' in-between 'q_mid' and 'q_regulated' for a smooth
+        // transition between loop_count < 2 and loop_count > 2.
+        *q = (q_mid + q_regulated) / 2;
+
+        // Special case reset for q_low for constrained quality.
+        // This should only trigger where there is very substantial
+        // undershoot on a frame and the auto cq level is above
+        // the value passed in by the user.
+        if (rc_cfg->mode == AOM_CQ && q_regulated < *q_low) {
+          *q_low = *q;
+        }
+      } else {
+        *q = get_regulated_q_undershoot(cpi, 1, *q_high, top_index,
+                                        bottom_index);
+
+        // Special case reset for q_low for constrained quality.
+        // This should only trigger where there is very substantial
+        // undershoot on a frame and the auto cq level is above
+        // the value passed in by the user.
+        if (rc_cfg->mode == AOM_CQ && *q < *q_low) {
+          *q_low = *q;
+        }
+      }
+
+      *undershoot_seen = 1;
+    }
+
+    // Clamp Q to upper and lower limits:
+    *q = clamp(*q, *q_low, *q_high);
+  }
+
+  *loop = (*q != last_q);
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_RC_UTILS_H_
diff --git a/third_party/aom/av1/encoder/rd.c b/third_party/aom/av1/encoder/rd.c
new file mode 100644
index 0000000000..c2d76e7a9a
--- /dev/null
+++ b/third_party/aom/av1/encoder/rd.c
@@ -0,0 +1,1580 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/bitops.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/aom_once.h"
+
+#include "av1/common/common.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/nonrd_opt.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+
+#define RD_THRESH_POW 1.25
+
+// The baseline rd thresholds for breaking out of the rd loop for
+// certain modes are assumed to be based on 8x8 blocks.
+// This table is used to correct for block size.
+// The factors here are << 2 (2 = x0.5, 32 = x8 etc).
+static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES_ALL] = {
+  2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32, 48, 48, 64, 4, 4, 8, 8, 16, 16
+};
+
+static const int use_intra_ext_tx_for_txsize[EXT_TX_SETS_INTRA]
+                                            [EXT_TX_SIZES] = {
+                                              { 1, 1, 1, 1 },  // unused
+                                              { 1, 1, 0, 0 },
+                                              { 0, 0, 1, 0 },
+                                            };
+
+static const int use_inter_ext_tx_for_txsize[EXT_TX_SETS_INTER]
+                                            [EXT_TX_SIZES] = {
+                                              { 1, 1, 1, 1 },  // unused
+                                              { 1, 1, 0, 0 },
+                                              { 0, 0, 1, 0 },
+                                              { 0, 1, 1, 1 },
+                                            };
+
+static const int av1_ext_tx_set_idx_to_type[2][AOMMAX(EXT_TX_SETS_INTRA,
+                                                      EXT_TX_SETS_INTER)] = {
+  {
+      // Intra
+      EXT_TX_SET_DCTONLY,
+      EXT_TX_SET_DTT4_IDTX_1DDCT,
+      EXT_TX_SET_DTT4_IDTX,
+  },
+  {
+      // Inter
+      EXT_TX_SET_DCTONLY,
+      EXT_TX_SET_ALL16,
+      EXT_TX_SET_DTT9_IDTX_1DDCT,
+      EXT_TX_SET_DCT_IDTX,
+  },
+};
+
+void av1_fill_mode_rates(AV1_COMMON *const cm, ModeCosts *mode_costs,
+                         FRAME_CONTEXT *fc) {
+  int i, j;
+
+  for (i = 0; i < PARTITION_CONTEXTS; ++i)
+    av1_cost_tokens_from_cdf(mode_costs->partition_cost[i],
+                             fc->partition_cdf[i], NULL);
+
+  if (cm->current_frame.skip_mode_info.skip_mode_flag) {
+    for (i = 0; i < SKIP_MODE_CONTEXTS; ++i) {
+      av1_cost_tokens_from_cdf(mode_costs->skip_mode_cost[i],
+                               fc->skip_mode_cdfs[i], NULL);
+    }
+  }
+
+  for (i = 0; i < SKIP_CONTEXTS; ++i) {
+    av1_cost_tokens_from_cdf(mode_costs->skip_txfm_cost[i],
+                             fc->skip_txfm_cdfs[i], NULL);
+  }
+
+  for (i = 0; i < KF_MODE_CONTEXTS; ++i)
+    for (j = 0; j < KF_MODE_CONTEXTS; ++j)
+      av1_cost_tokens_from_cdf(mode_costs->y_mode_costs[i][j],
+                               fc->kf_y_cdf[i][j], NULL);
+
+  for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
+    av1_cost_tokens_from_cdf(mode_costs->mbmode_cost[i], fc->y_mode_cdf[i],
+                             NULL);
+  for (i = 0; i < CFL_ALLOWED_TYPES; ++i)
+    for (j = 0; j < INTRA_MODES; ++j)
+      av1_cost_tokens_from_cdf(mode_costs->intra_uv_mode_cost[i][j],
+                               fc->uv_mode_cdf[i][j], NULL);
+
+  av1_cost_tokens_from_cdf(mode_costs->filter_intra_mode_cost,
+                           fc->filter_intra_mode_cdf, NULL);
+  for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
+    if (av1_filter_intra_allowed_bsize(cm, i))
+      av1_cost_tokens_from_cdf(mode_costs->filter_intra_cost[i],
+                               fc->filter_intra_cdfs[i], NULL);
+  }
+
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+    av1_cost_tokens_from_cdf(mode_costs->switchable_interp_costs[i],
+                             fc->switchable_interp_cdf[i], NULL);
+
+  for (i = 0; i < PALATTE_BSIZE_CTXS; ++i) {
+    av1_cost_tokens_from_cdf(mode_costs->palette_y_size_cost[i],
+                             fc->palette_y_size_cdf[i], NULL);
+    av1_cost_tokens_from_cdf(mode_costs->palette_uv_size_cost[i],
+                             fc->palette_uv_size_cdf[i], NULL);
+    for (j = 0; j < PALETTE_Y_MODE_CONTEXTS;
++j) { + av1_cost_tokens_from_cdf(mode_costs->palette_y_mode_cost[i][j], + fc->palette_y_mode_cdf[i][j], NULL); + } + } + + for (i = 0; i < PALETTE_UV_MODE_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(mode_costs->palette_uv_mode_cost[i], + fc->palette_uv_mode_cdf[i], NULL); + } + + for (i = 0; i < PALETTE_SIZES; ++i) { + for (j = 0; j < PALETTE_COLOR_INDEX_CONTEXTS; ++j) { + av1_cost_tokens_from_cdf(mode_costs->palette_y_color_cost[i][j], + fc->palette_y_color_index_cdf[i][j], NULL); + av1_cost_tokens_from_cdf(mode_costs->palette_uv_color_cost[i][j], + fc->palette_uv_color_index_cdf[i][j], NULL); + } + } + + int sign_cost[CFL_JOINT_SIGNS]; + av1_cost_tokens_from_cdf(sign_cost, fc->cfl_sign_cdf, NULL); + for (int joint_sign = 0; joint_sign < CFL_JOINT_SIGNS; joint_sign++) { + int *cost_u = mode_costs->cfl_cost[joint_sign][CFL_PRED_U]; + int *cost_v = mode_costs->cfl_cost[joint_sign][CFL_PRED_V]; + if (CFL_SIGN_U(joint_sign) == CFL_SIGN_ZERO) { + memset(cost_u, 0, CFL_ALPHABET_SIZE * sizeof(*cost_u)); + } else { + const aom_cdf_prob *cdf_u = fc->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)]; + av1_cost_tokens_from_cdf(cost_u, cdf_u, NULL); + } + if (CFL_SIGN_V(joint_sign) == CFL_SIGN_ZERO) { + memset(cost_v, 0, CFL_ALPHABET_SIZE * sizeof(*cost_v)); + } else { + const aom_cdf_prob *cdf_v = fc->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)]; + av1_cost_tokens_from_cdf(cost_v, cdf_v, NULL); + } + for (int u = 0; u < CFL_ALPHABET_SIZE; u++) + cost_u[u] += sign_cost[joint_sign]; + } + + for (i = 0; i < MAX_TX_CATS; ++i) + for (j = 0; j < TX_SIZE_CONTEXTS; ++j) + av1_cost_tokens_from_cdf(mode_costs->tx_size_cost[i][j], + fc->tx_size_cdf[i][j], NULL); + + for (i = 0; i < TXFM_PARTITION_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(mode_costs->txfm_partition_cost[i], + fc->txfm_partition_cdf[i], NULL); + } + + for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { + int s; + for (s = 1; s < EXT_TX_SETS_INTER; ++s) { + if (use_inter_ext_tx_for_txsize[s][i]) { + av1_cost_tokens_from_cdf( + mode_costs->inter_tx_type_costs[s][i], fc->inter_ext_tx_cdf[s][i], + av1_ext_tx_inv[av1_ext_tx_set_idx_to_type[1][s]]); + } + } + for (s = 1; s < EXT_TX_SETS_INTRA; ++s) { + if (use_intra_ext_tx_for_txsize[s][i]) { + for (j = 0; j < INTRA_MODES; ++j) { + av1_cost_tokens_from_cdf( + mode_costs->intra_tx_type_costs[s][i][j], + fc->intra_ext_tx_cdf[s][i][j], + av1_ext_tx_inv[av1_ext_tx_set_idx_to_type[0][s]]); + } + } + } + } + for (i = 0; i < DIRECTIONAL_MODES; ++i) { + av1_cost_tokens_from_cdf(mode_costs->angle_delta_cost[i], + fc->angle_delta_cdf[i], NULL); + } + av1_cost_tokens_from_cdf(mode_costs->intrabc_cost, fc->intrabc_cdf, NULL); + + for (i = 0; i < SPATIAL_PREDICTION_PROBS; ++i) { + av1_cost_tokens_from_cdf(mode_costs->spatial_pred_cost[i], + fc->seg.spatial_pred_seg_cdf[i], NULL); + } + + for (i = 0; i < SEG_TEMPORAL_PRED_CTXS; ++i) { + av1_cost_tokens_from_cdf(mode_costs->tmp_pred_cost[i], fc->seg.pred_cdf[i], + NULL); + } + + if (!frame_is_intra_only(cm)) { + for (i = 0; i < COMP_INTER_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(mode_costs->comp_inter_cost[i], + fc->comp_inter_cdf[i], NULL); + } + + for (i = 0; i < REF_CONTEXTS; ++i) { + for (j = 0; j < SINGLE_REFS - 1; ++j) { + av1_cost_tokens_from_cdf(mode_costs->single_ref_cost[i][j], + fc->single_ref_cdf[i][j], NULL); + } + } + + for (i = 0; i < COMP_REF_TYPE_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(mode_costs->comp_ref_type_cost[i], + fc->comp_ref_type_cdf[i], NULL); + } + + for (i = 0; i < UNI_COMP_REF_CONTEXTS; ++i) { + for (j = 0; j < UNIDIR_COMP_REFS - 1; ++j) { + 
av1_cost_tokens_from_cdf(mode_costs->uni_comp_ref_cost[i][j], + fc->uni_comp_ref_cdf[i][j], NULL); + } + } + + for (i = 0; i < REF_CONTEXTS; ++i) { + for (j = 0; j < FWD_REFS - 1; ++j) { + av1_cost_tokens_from_cdf(mode_costs->comp_ref_cost[i][j], + fc->comp_ref_cdf[i][j], NULL); + } + } + + for (i = 0; i < REF_CONTEXTS; ++i) { + for (j = 0; j < BWD_REFS - 1; ++j) { + av1_cost_tokens_from_cdf(mode_costs->comp_bwdref_cost[i][j], + fc->comp_bwdref_cdf[i][j], NULL); + } + } + + for (i = 0; i < INTRA_INTER_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(mode_costs->intra_inter_cost[i], + fc->intra_inter_cdf[i], NULL); + } + + for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(mode_costs->newmv_mode_cost[i], fc->newmv_cdf[i], + NULL); + } + + for (i = 0; i < GLOBALMV_MODE_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(mode_costs->zeromv_mode_cost[i], + fc->zeromv_cdf[i], NULL); + } + + for (i = 0; i < REFMV_MODE_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(mode_costs->refmv_mode_cost[i], fc->refmv_cdf[i], + NULL); + } + + for (i = 0; i < DRL_MODE_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(mode_costs->drl_mode_cost0[i], fc->drl_cdf[i], + NULL); + } + for (i = 0; i < INTER_MODE_CONTEXTS; ++i) + av1_cost_tokens_from_cdf(mode_costs->inter_compound_mode_cost[i], + fc->inter_compound_mode_cdf[i], NULL); + for (i = 0; i < BLOCK_SIZES_ALL; ++i) + av1_cost_tokens_from_cdf(mode_costs->compound_type_cost[i], + fc->compound_type_cdf[i], NULL); + for (i = 0; i < BLOCK_SIZES_ALL; ++i) { + if (av1_is_wedge_used(i)) { + av1_cost_tokens_from_cdf(mode_costs->wedge_idx_cost[i], + fc->wedge_idx_cdf[i], NULL); + } + } + for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) { + av1_cost_tokens_from_cdf(mode_costs->interintra_cost[i], + fc->interintra_cdf[i], NULL); + av1_cost_tokens_from_cdf(mode_costs->interintra_mode_cost[i], + fc->interintra_mode_cdf[i], NULL); + } + for (i = 0; i < BLOCK_SIZES_ALL; ++i) { + av1_cost_tokens_from_cdf(mode_costs->wedge_interintra_cost[i], + fc->wedge_interintra_cdf[i], NULL); + } + for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; i++) { + av1_cost_tokens_from_cdf(mode_costs->motion_mode_cost[i], + fc->motion_mode_cdf[i], NULL); + } + for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; i++) { + av1_cost_tokens_from_cdf(mode_costs->motion_mode_cost1[i], + fc->obmc_cdf[i], NULL); + } + for (i = 0; i < COMP_INDEX_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(mode_costs->comp_idx_cost[i], + fc->compound_index_cdf[i], NULL); + } + for (i = 0; i < COMP_GROUP_IDX_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(mode_costs->comp_group_idx_cost[i], + fc->comp_group_idx_cdf[i], NULL); + } + } +} + +void av1_fill_lr_rates(ModeCosts *mode_costs, FRAME_CONTEXT *fc) { + av1_cost_tokens_from_cdf(mode_costs->switchable_restore_cost, + fc->switchable_restore_cdf, NULL); + av1_cost_tokens_from_cdf(mode_costs->wiener_restore_cost, + fc->wiener_restore_cdf, NULL); + av1_cost_tokens_from_cdf(mode_costs->sgrproj_restore_cost, + fc->sgrproj_restore_cdf, NULL); +} + +// Values are now correlated to quantizer. +static int sad_per_bit_lut_8[QINDEX_RANGE]; +static int sad_per_bit_lut_10[QINDEX_RANGE]; +static int sad_per_bit_lut_12[QINDEX_RANGE]; + +static void init_me_luts_bd(int *bit16lut, int range, + aom_bit_depth_t bit_depth) { + int i; + // Initialize the sad lut tables using a formulaic calculation for now. + // This is to make it easier to resolve the impact of experimental changes + // to the quantizer tables. 
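+  // Under this linear fit the table value starts near 2.4 at q == 0 and
+  // grows by roughly 0.42 for every additional 10 units of real q.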
+ for (i = 0; i < range; i++) { + const double q = av1_convert_qindex_to_q(i, bit_depth); + bit16lut[i] = (int)(0.0418 * q + 2.4107); + } +} + +static void init_me_luts(void) { + init_me_luts_bd(sad_per_bit_lut_8, QINDEX_RANGE, AOM_BITS_8); + init_me_luts_bd(sad_per_bit_lut_10, QINDEX_RANGE, AOM_BITS_10); + init_me_luts_bd(sad_per_bit_lut_12, QINDEX_RANGE, AOM_BITS_12); +} + +void av1_init_me_luts(void) { aom_once(init_me_luts); } + +static const int rd_boost_factor[16] = { 64, 32, 32, 32, 24, 16, 12, 12, + 8, 8, 4, 4, 2, 2, 1, 0 }; + +static const int rd_layer_depth_factor[7] = { + 160, 160, 160, 160, 192, 208, 224 +}; + +// Returns the default rd multiplier for inter frames for a given qindex. +// The function here is a first pass estimate based on data from +// a previous Vizer run +static double def_inter_rd_multiplier(int qindex) { + return 3.2 + (0.0015 * (double)qindex); +} + +// Returns the default rd multiplier for ARF/Golden Frames for a given qindex. +// The function here is a first pass estimate based on data from +// a previous Vizer run +static double def_arf_rd_multiplier(int qindex) { + return 3.25 + (0.0015 * (double)qindex); +} + +// Returns the default rd multiplier for key frames for a given qindex. +// The function here is a first pass estimate based on data from +// a previous Vizer run +static double def_kf_rd_multiplier(int qindex) { + return 3.3 + (0.0015 * (double)qindex); +} + +int av1_compute_rd_mult_based_on_qindex(aom_bit_depth_t bit_depth, + FRAME_UPDATE_TYPE update_type, + int qindex) { + const int q = av1_dc_quant_QTX(qindex, 0, bit_depth); + int64_t rdmult = q * q; + if (update_type == KF_UPDATE) { + double def_rd_q_mult = def_kf_rd_multiplier(q); + rdmult = (int64_t)((double)rdmult * def_rd_q_mult); + } else if ((update_type == GF_UPDATE) || (update_type == ARF_UPDATE)) { + double def_rd_q_mult = def_arf_rd_multiplier(q); + rdmult = (int64_t)((double)rdmult * def_rd_q_mult); + } else { + double def_rd_q_mult = def_inter_rd_multiplier(q); + rdmult = (int64_t)((double)rdmult * def_rd_q_mult); + } + + switch (bit_depth) { + case AOM_BITS_8: break; + case AOM_BITS_10: rdmult = ROUND_POWER_OF_TWO(rdmult, 4); break; + case AOM_BITS_12: rdmult = ROUND_POWER_OF_TWO(rdmult, 8); break; + default: + assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); + return -1; + } + return rdmult > 0 ? 
(int)AOMMIN(rdmult, INT_MAX) : 1; +} + +int av1_compute_rd_mult(const int qindex, const aom_bit_depth_t bit_depth, + const FRAME_UPDATE_TYPE update_type, + const int layer_depth, const int boost_index, + const FRAME_TYPE frame_type, + const int use_fixed_qp_offsets, + const int is_stat_consumption_stage) { + int64_t rdmult = + av1_compute_rd_mult_based_on_qindex(bit_depth, update_type, qindex); + if (is_stat_consumption_stage && !use_fixed_qp_offsets && + (frame_type != KEY_FRAME)) { + // Layer depth adjustment + rdmult = (rdmult * rd_layer_depth_factor[layer_depth]) >> 7; + // ARF boost adjustment + rdmult += ((rdmult * rd_boost_factor[boost_index]) >> 7); + } + return (int)rdmult; +} + +int av1_get_deltaq_offset(aom_bit_depth_t bit_depth, int qindex, double beta) { + assert(beta > 0.0); + int q = av1_dc_quant_QTX(qindex, 0, bit_depth); + int newq = (int)rint(q / sqrt(beta)); + int orig_qindex = qindex; + if (newq == q) { + return 0; + } + if (newq < q) { + while (qindex > 0) { + qindex--; + q = av1_dc_quant_QTX(qindex, 0, bit_depth); + if (newq >= q) { + break; + } + } + } else { + while (qindex < MAXQ) { + qindex++; + q = av1_dc_quant_QTX(qindex, 0, bit_depth); + if (newq <= q) { + break; + } + } + } + return qindex - orig_qindex; +} + +int av1_adjust_q_from_delta_q_res(int delta_q_res, int prev_qindex, + int curr_qindex) { + curr_qindex = clamp(curr_qindex, delta_q_res, 256 - delta_q_res); + const int sign_deltaq_index = curr_qindex - prev_qindex >= 0 ? 1 : -1; + const int deltaq_deadzone = delta_q_res / 4; + const int qmask = ~(delta_q_res - 1); + int abs_deltaq_index = abs(curr_qindex - prev_qindex); + abs_deltaq_index = (abs_deltaq_index + deltaq_deadzone) & qmask; + int adjust_qindex = prev_qindex + sign_deltaq_index * abs_deltaq_index; + adjust_qindex = AOMMAX(adjust_qindex, MINQ + 1); + return adjust_qindex; +} + +int av1_get_adaptive_rdmult(const AV1_COMP *cpi, double beta) { + assert(beta > 0.0); + const AV1_COMMON *cm = &cpi->common; + + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100)); + const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6); + const FRAME_TYPE frame_type = cm->current_frame.frame_type; + + const int qindex_rdmult = cm->quant_params.base_qindex; + return (int)(av1_compute_rd_mult( + qindex_rdmult, cm->seq_params->bit_depth, + cpi->ppi->gf_group.update_type[cpi->gf_frame_index], + layer_depth, boost_index, frame_type, + cpi->oxcf.q_cfg.use_fixed_qp_offsets, + is_stat_consumption_stage(cpi)) / + beta); +} + +static int compute_rd_thresh_factor(int qindex, aom_bit_depth_t bit_depth) { + double q; + switch (bit_depth) { + case AOM_BITS_8: q = av1_dc_quant_QTX(qindex, 0, AOM_BITS_8) / 4.0; break; + case AOM_BITS_10: + q = av1_dc_quant_QTX(qindex, 0, AOM_BITS_10) / 16.0; + break; + case AOM_BITS_12: + q = av1_dc_quant_QTX(qindex, 0, AOM_BITS_12) / 64.0; + break; + default: + assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); + return -1; + } + // TODO(debargha): Adjust the function below. 
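+  // With RD_THRESH_POW == 1.25 this returns max(5.12 * q^1.25, 8), where q is
+  // the dc quantizer normalized back to an 8-bit scale (hence the divisors of
+  // 4, 16 and 64 above), so the thresholds stay comparable across bit depths.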
+ return AOMMAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8); +} + +void av1_set_sad_per_bit(const AV1_COMP *cpi, int *sadperbit, int qindex) { + switch (cpi->common.seq_params->bit_depth) { + case AOM_BITS_8: *sadperbit = sad_per_bit_lut_8[qindex]; break; + case AOM_BITS_10: *sadperbit = sad_per_bit_lut_10[qindex]; break; + case AOM_BITS_12: *sadperbit = sad_per_bit_lut_12[qindex]; break; + default: + assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); + } +} + +static void set_block_thresholds(const AV1_COMMON *cm, RD_OPT *rd, + int use_nonrd_pick_mode) { + int i, bsize, segment_id; + THR_MODES mode_indices[RTC_REFS * RTC_MODES] = { 0 }; + int num_modes_count = use_nonrd_pick_mode ? 0 : MAX_MODES; + + if (use_nonrd_pick_mode) { + for (int r_idx = 0; r_idx < RTC_REFS; r_idx++) { + const MV_REFERENCE_FRAME ref = real_time_ref_combos[r_idx][0]; + if (ref != INTRA_FRAME) { + for (i = 0; i < RTC_INTER_MODES; i++) + mode_indices[num_modes_count++] = + mode_idx[ref][mode_offset(inter_mode_list[i])]; + } else { + for (i = 0; i < RTC_INTRA_MODES; i++) + mode_indices[num_modes_count++] = + mode_idx[ref][mode_offset(intra_mode_list[i])]; + } + } + } + + for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) { + const int qindex = clamp( + av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex) + + cm->quant_params.y_dc_delta_q, + 0, MAXQ); + const int q = compute_rd_thresh_factor(qindex, cm->seq_params->bit_depth); + + for (bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) { + // Threshold here seems unnecessarily harsh but fine given actual + // range of values used for cpi->sf.thresh_mult[]. + const int t = q * rd_thresh_block_size_factor[bsize]; + const int thresh_max = INT_MAX / t; + + for (i = 0; i < num_modes_count; ++i) { + const int mode_index = use_nonrd_pick_mode ? mode_indices[i] : i; + rd->threshes[segment_id][bsize][mode_index] = + rd->thresh_mult[mode_index] < thresh_max + ? 
rd->thresh_mult[mode_index] * t / 4 + : INT_MAX; + } + } + } +} + +void av1_fill_coeff_costs(CoeffCosts *coeff_costs, FRAME_CONTEXT *fc, + const int num_planes) { + const int nplanes = AOMMIN(num_planes, PLANE_TYPES); + for (int eob_multi_size = 0; eob_multi_size < 7; ++eob_multi_size) { + for (int plane = 0; plane < nplanes; ++plane) { + LV_MAP_EOB_COST *pcost = &coeff_costs->eob_costs[eob_multi_size][plane]; + + for (int ctx = 0; ctx < 2; ++ctx) { + aom_cdf_prob *pcdf; + switch (eob_multi_size) { + case 0: pcdf = fc->eob_flag_cdf16[plane][ctx]; break; + case 1: pcdf = fc->eob_flag_cdf32[plane][ctx]; break; + case 2: pcdf = fc->eob_flag_cdf64[plane][ctx]; break; + case 3: pcdf = fc->eob_flag_cdf128[plane][ctx]; break; + case 4: pcdf = fc->eob_flag_cdf256[plane][ctx]; break; + case 5: pcdf = fc->eob_flag_cdf512[plane][ctx]; break; + case 6: + default: pcdf = fc->eob_flag_cdf1024[plane][ctx]; break; + } + av1_cost_tokens_from_cdf(pcost->eob_cost[ctx], pcdf, NULL); + } + } + } + for (int tx_size = 0; tx_size < TX_SIZES; ++tx_size) { + for (int plane = 0; plane < nplanes; ++plane) { + LV_MAP_COEFF_COST *pcost = &coeff_costs->coeff_costs[tx_size][plane]; + + for (int ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx) + av1_cost_tokens_from_cdf(pcost->txb_skip_cost[ctx], + fc->txb_skip_cdf[tx_size][ctx], NULL); + + for (int ctx = 0; ctx < SIG_COEF_CONTEXTS_EOB; ++ctx) + av1_cost_tokens_from_cdf(pcost->base_eob_cost[ctx], + fc->coeff_base_eob_cdf[tx_size][plane][ctx], + NULL); + for (int ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) + av1_cost_tokens_from_cdf(pcost->base_cost[ctx], + fc->coeff_base_cdf[tx_size][plane][ctx], NULL); + + for (int ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) { + pcost->base_cost[ctx][4] = 0; + pcost->base_cost[ctx][5] = pcost->base_cost[ctx][1] + + av1_cost_literal(1) - + pcost->base_cost[ctx][0]; + pcost->base_cost[ctx][6] = + pcost->base_cost[ctx][2] - pcost->base_cost[ctx][1]; + pcost->base_cost[ctx][7] = + pcost->base_cost[ctx][3] - pcost->base_cost[ctx][2]; + } + + for (int ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx) + av1_cost_tokens_from_cdf(pcost->eob_extra_cost[ctx], + fc->eob_extra_cdf[tx_size][plane][ctx], NULL); + + for (int ctx = 0; ctx < DC_SIGN_CONTEXTS; ++ctx) + av1_cost_tokens_from_cdf(pcost->dc_sign_cost[ctx], + fc->dc_sign_cdf[plane][ctx], NULL); + + for (int ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) { + int br_rate[BR_CDF_SIZE]; + int prev_cost = 0; + int i, j; + av1_cost_tokens_from_cdf( + br_rate, fc->coeff_br_cdf[AOMMIN(tx_size, TX_32X32)][plane][ctx], + NULL); + // printf("br_rate: "); + // for(j = 0; j < BR_CDF_SIZE; j++) + // printf("%4d ", br_rate[j]); + // printf("\n"); + for (i = 0; i < COEFF_BASE_RANGE; i += BR_CDF_SIZE - 1) { + for (j = 0; j < BR_CDF_SIZE - 1; j++) { + pcost->lps_cost[ctx][i + j] = prev_cost + br_rate[j]; + } + prev_cost += br_rate[j]; + } + pcost->lps_cost[ctx][i] = prev_cost; + // printf("lps_cost: %d %d %2d : ", tx_size, plane, ctx); + // for (i = 0; i <= COEFF_BASE_RANGE; i++) + // printf("%5d ", pcost->lps_cost[ctx][i]); + // printf("\n"); + } + for (int ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) { + pcost->lps_cost[ctx][0 + COEFF_BASE_RANGE + 1] = + pcost->lps_cost[ctx][0]; + for (int i = 1; i <= COEFF_BASE_RANGE; ++i) { + pcost->lps_cost[ctx][i + COEFF_BASE_RANGE + 1] = + pcost->lps_cost[ctx][i] - pcost->lps_cost[ctx][i - 1]; + } + } + } + } +} + +void av1_fill_mv_costs(const nmv_context *nmvc, int integer_mv, int usehp, + MvCosts *mv_costs) { + // Avoid accessing 'mv_costs' when it is not allocated. 
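+  // Each cost table is addressed through a pointer into the middle of its
+  // allocation, so a signed motion vector component v can be used directly
+  // as an index: nmv_cost[c][v] is valid for v in [-MV_MAX, MV_MAX].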
+  if (mv_costs == NULL) return;
+
+  mv_costs->nmv_cost[0] = &mv_costs->nmv_cost_alloc[0][MV_MAX];
+  mv_costs->nmv_cost[1] = &mv_costs->nmv_cost_alloc[1][MV_MAX];
+  mv_costs->nmv_cost_hp[0] = &mv_costs->nmv_cost_hp_alloc[0][MV_MAX];
+  mv_costs->nmv_cost_hp[1] = &mv_costs->nmv_cost_hp_alloc[1][MV_MAX];
+  if (integer_mv) {
+    mv_costs->mv_cost_stack = (int **)&mv_costs->nmv_cost;
+    av1_build_nmv_cost_table(mv_costs->nmv_joint_cost, mv_costs->mv_cost_stack,
+                             nmvc, MV_SUBPEL_NONE);
+  } else {
+    mv_costs->mv_cost_stack =
+        usehp ? mv_costs->nmv_cost_hp : mv_costs->nmv_cost;
+    av1_build_nmv_cost_table(mv_costs->nmv_joint_cost, mv_costs->mv_cost_stack,
+                             nmvc, usehp);
+  }
+}
+
+void av1_fill_dv_costs(const nmv_context *ndvc, IntraBCMVCosts *dv_costs) {
+  dv_costs->dv_costs[0] = &dv_costs->dv_costs_alloc[0][MV_MAX];
+  dv_costs->dv_costs[1] = &dv_costs->dv_costs_alloc[1][MV_MAX];
+  av1_build_nmv_cost_table(dv_costs->joint_mv, dv_costs->dv_costs, ndvc,
+                           MV_SUBPEL_NONE);
+}
+
+// Populates speed features based on codec control settings (of type
+// COST_UPDATE_TYPE) and expected speed feature settings (of type
+// INTERNAL_COST_UPDATE_TYPE) by considering the least frequent cost update.
+// The populated/updated speed features are used for cost updates in the
+// encoder.
+// WARNING: The population of the unified cost update frequency must be
+// updated accordingly if the enums COST_UPDATE_TYPE/INTERNAL_COST_UPDATE_TYPE
+// are modified or extended.
+static INLINE void populate_unified_cost_update_freq(
+    const CostUpdateFreq cost_upd_freq, SPEED_FEATURES *const sf) {
+  INTER_MODE_SPEED_FEATURES *const inter_sf = &sf->inter_sf;
+  // Mapping of entropy cost update frequency from the encoder's codec control
+  // settings of type COST_UPDATE_TYPE to speed features of type
+  // INTERNAL_COST_UPDATE_TYPE.
+  static const INTERNAL_COST_UPDATE_TYPE
+      map_cost_upd_to_internal_cost_upd[NUM_COST_UPDATE_TYPES] = {
+        INTERNAL_COST_UPD_SB, INTERNAL_COST_UPD_SBROW, INTERNAL_COST_UPD_TILE,
+        INTERNAL_COST_UPD_OFF
+      };
+
+  inter_sf->mv_cost_upd_level =
+      AOMMIN(inter_sf->mv_cost_upd_level,
+             map_cost_upd_to_internal_cost_upd[cost_upd_freq.mv]);
+  inter_sf->coeff_cost_upd_level =
+      AOMMIN(inter_sf->coeff_cost_upd_level,
+             map_cost_upd_to_internal_cost_upd[cost_upd_freq.coeff]);
+  inter_sf->mode_cost_upd_level =
+      AOMMIN(inter_sf->mode_cost_upd_level,
+             map_cost_upd_to_internal_cost_upd[cost_upd_freq.mode]);
+  sf->intra_sf.dv_cost_upd_level =
+      AOMMIN(sf->intra_sf.dv_cost_upd_level,
+             map_cost_upd_to_internal_cost_upd[cost_upd_freq.dv]);
+}
+
+// Checks if entropy costs should be initialized/updated at frame level or not.
+static INLINE int is_frame_level_cost_upd_freq_set(
+    const AV1_COMMON *const cm, const INTERNAL_COST_UPDATE_TYPE cost_upd_level,
+    const int use_nonrd_pick_mode, const int frames_since_key) {
+  const int fill_costs =
+      frame_is_intra_only(cm) ||
+      (use_nonrd_pick_mode ? frames_since_key < 2
+                           : (cm->current_frame.frame_number & 0x07) == 1);
+  return ((!use_nonrd_pick_mode && cost_upd_level != INTERNAL_COST_UPD_OFF) ||
+          cost_upd_level == INTERNAL_COST_UPD_TILE || fill_costs);
+}
+
+// Decide whether we want to update the mode entropy cost for the current
+// frame. The logic is currently inherited from selective_disable_cdf_rtc.
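+// For example, with cdf_update_mode == 2 the mode costs are refreshed only on
+// the frame immediately after a CDF update, while with cdf_update_mode == 1
+// (single layer) they are refreshed on intra frames, scene changes, pending
+// resizes, shortly after a key frame, and on every 8th frame.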
+static AOM_INLINE int should_force_mode_cost_update(const AV1_COMP *cpi) { + const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf; + if (!rt_sf->frame_level_mode_cost_update) { + return false; + } + + if (cpi->oxcf.algo_cfg.cdf_update_mode == 2) { + return cpi->frames_since_last_update == 1; + } else if (cpi->oxcf.algo_cfg.cdf_update_mode == 1) { + if (cpi->svc.number_spatial_layers == 1 && + cpi->svc.number_temporal_layers == 1) { + const AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + + return frame_is_intra_only(cm) || is_frame_resize_pending(cpi) || + rc->high_source_sad || rc->frames_since_key < 10 || + cpi->cyclic_refresh->counter_encode_maxq_scene_change < 10 || + cm->current_frame.frame_number % 8 == 0; + } else if (cpi->svc.number_temporal_layers > 1) { + return cpi->svc.temporal_layer_id != cpi->svc.number_temporal_layers - 1; + } + } + + return false; +} + +void av1_initialize_rd_consts(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->td.mb; + SPEED_FEATURES *const sf = &cpi->sf; + RD_OPT *const rd = &cpi->rd; + int use_nonrd_pick_mode = cpi->sf.rt_sf.use_nonrd_pick_mode; + int frames_since_key = cpi->rc.frames_since_key; + + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100)); + const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6); + const FRAME_TYPE frame_type = cm->current_frame.frame_type; + + const int qindex_rdmult = + cm->quant_params.base_qindex + cm->quant_params.y_dc_delta_q; + rd->RDMULT = av1_compute_rd_mult( + qindex_rdmult, cm->seq_params->bit_depth, + cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth, + boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets, + is_stat_consumption_stage(cpi)); +#if CONFIG_RD_COMMAND + if (cpi->oxcf.pass == 2) { + const RD_COMMAND *rd_command = &cpi->rd_command; + if (rd_command->option_ls[rd_command->frame_index] == + RD_OPTION_SET_Q_RDMULT) { + rd->RDMULT = rd_command->rdmult_ls[rd_command->frame_index]; + } + } +#endif // CONFIG_RD_COMMAND + + av1_set_error_per_bit(&x->errorperbit, rd->RDMULT); + + set_block_thresholds(cm, rd, cpi->sf.rt_sf.use_nonrd_pick_mode); + + populate_unified_cost_update_freq(cpi->oxcf.cost_upd_freq, sf); + const INTER_MODE_SPEED_FEATURES *const inter_sf = &cpi->sf.inter_sf; + // Frame level mv cost update + if (is_frame_level_cost_upd_freq_set(cm, inter_sf->mv_cost_upd_level, + use_nonrd_pick_mode, frames_since_key)) + av1_fill_mv_costs(&cm->fc->nmvc, cm->features.cur_frame_force_integer_mv, + cm->features.allow_high_precision_mv, x->mv_costs); + + // Frame level coefficient cost update + if (is_frame_level_cost_upd_freq_set(cm, inter_sf->coeff_cost_upd_level, + use_nonrd_pick_mode, frames_since_key)) + av1_fill_coeff_costs(&x->coeff_costs, cm->fc, av1_num_planes(cm)); + + // Frame level mode cost update + if (should_force_mode_cost_update(cpi) || + is_frame_level_cost_upd_freq_set(cm, inter_sf->mode_cost_upd_level, + use_nonrd_pick_mode, frames_since_key)) + av1_fill_mode_rates(cm, &x->mode_costs, cm->fc); + + // Frame level dv cost update + if (av1_need_dv_costs(cpi)) { + if (cpi->td.dv_costs_alloc == NULL) { + CHECK_MEM_ERROR( + cm, cpi->td.dv_costs_alloc, + (IntraBCMVCosts *)aom_malloc(sizeof(*cpi->td.dv_costs_alloc))); + cpi->td.mb.dv_costs = cpi->td.dv_costs_alloc; + } + av1_fill_dv_costs(&cm->fc->ndvc, x->dv_costs); + } +} + +static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) { + // NOTE: 
The tables below must be of the same size. + + // The functions described below are sampled at the four most significant + // bits of x^2 + 8 / 256. + + // Normalized rate: + // This table models the rate for a Laplacian source with given variance + // when quantized with a uniform quantizer with given stepsize. The + // closed form expression is: + // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)], + // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance), + // and H(x) is the binary entropy function. + static const int rate_tab_q10[] = { + 65536, 6086, 5574, 5275, 5063, 4899, 4764, 4651, 4553, 4389, 4255, 4142, + 4044, 3958, 3881, 3811, 3748, 3635, 3538, 3453, 3376, 3307, 3244, 3186, + 3133, 3037, 2952, 2877, 2809, 2747, 2690, 2638, 2589, 2501, 2423, 2353, + 2290, 2232, 2179, 2130, 2084, 2001, 1928, 1862, 1802, 1748, 1698, 1651, + 1608, 1530, 1460, 1398, 1342, 1290, 1243, 1199, 1159, 1086, 1021, 963, + 911, 864, 821, 781, 745, 680, 623, 574, 530, 490, 455, 424, + 395, 345, 304, 269, 239, 213, 190, 171, 154, 126, 104, 87, + 73, 61, 52, 44, 38, 28, 21, 16, 12, 10, 8, 6, + 5, 3, 2, 1, 1, 1, 0, 0, + }; + // Normalized distortion: + // This table models the normalized distortion for a Laplacian source + // with given variance when quantized with a uniform quantizer + // with given stepsize. The closed form expression is: + // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2)) + // where x = qpstep / sqrt(variance). + // Note the actual distortion is Dn * variance. + static const int dist_tab_q10[] = { + 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5, + 5, 6, 7, 7, 8, 9, 11, 12, 13, 15, 16, 17, + 18, 21, 24, 26, 29, 31, 34, 36, 39, 44, 49, 54, + 59, 64, 69, 73, 78, 88, 97, 106, 115, 124, 133, 142, + 151, 167, 184, 200, 215, 231, 245, 260, 274, 301, 327, 351, + 375, 397, 418, 439, 458, 495, 528, 559, 587, 613, 637, 659, + 680, 717, 749, 777, 801, 823, 842, 859, 874, 899, 919, 936, + 949, 960, 969, 977, 983, 994, 1001, 1006, 1010, 1013, 1015, 1017, + 1018, 1020, 1022, 1022, 1023, 1023, 1023, 1024, + }; + static const int xsq_iq_q10[] = { + 0, 4, 8, 12, 16, 20, 24, 28, 32, + 40, 48, 56, 64, 72, 80, 88, 96, 112, + 128, 144, 160, 176, 192, 208, 224, 256, 288, + 320, 352, 384, 416, 448, 480, 544, 608, 672, + 736, 800, 864, 928, 992, 1120, 1248, 1376, 1504, + 1632, 1760, 1888, 2016, 2272, 2528, 2784, 3040, 3296, + 3552, 3808, 4064, 4576, 5088, 5600, 6112, 6624, 7136, + 7648, 8160, 9184, 10208, 11232, 12256, 13280, 14304, 15328, + 16352, 18400, 20448, 22496, 24544, 26592, 28640, 30688, 32736, + 36832, 40928, 45024, 49120, 53216, 57312, 61408, 65504, 73696, + 81888, 90080, 98272, 106464, 114656, 122848, 131040, 147424, 163808, + 180192, 196576, 212960, 229344, 245728, + }; + const int tmp = (xsq_q10 >> 2) + 8; + const int k = get_msb(tmp) - 3; + const int xq = (k << 3) + ((tmp >> k) & 0x7); + const int one_q10 = 1 << 10; + const int a_q10 = ((xsq_q10 - xsq_iq_q10[xq]) << 10) >> (2 + k); + const int b_q10 = one_q10 - a_q10; + *r_q10 = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10; + *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10; +} + +void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n_log2, + unsigned int qstep, int *rate, + int64_t *dist) { + // This function models the rate and distortion for a Laplacian + // source with given variance when quantized with a uniform quantizer + // with given stepsize. 
The closed form expressions are in: + // Hang and Chen, "Source Model for transform video coder and its + // application - Part I: Fundamental Theory", IEEE Trans. Circ. + // Sys. for Video Tech., April 1997. + if (var == 0) { + *rate = 0; + *dist = 0; + } else { + int d_q10, r_q10; + static const uint32_t MAX_XSQ_Q10 = 245727; + const uint64_t xsq_q10_64 = + (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var; + const int xsq_q10 = (int)AOMMIN(xsq_q10_64, MAX_XSQ_Q10); + model_rd_norm(xsq_q10, &r_q10, &d_q10); + *rate = ROUND_POWER_OF_TWO(r_q10 << n_log2, 10 - AV1_PROB_COST_SHIFT); + *dist = (var * (int64_t)d_q10 + 512) >> 10; + } +} + +static double interp_cubic(const double *p, double x) { + return p[1] + 0.5 * x * + (p[2] - p[0] + + x * (2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3] + + x * (3.0 * (p[1] - p[2]) + p[3] - p[0]))); +} + +/* +static double interp_bicubic(const double *p, int p_stride, double x, + double y) { + double q[4]; + q[0] = interp_cubic(p, x); + q[1] = interp_cubic(p + p_stride, x); + q[2] = interp_cubic(p + 2 * p_stride, x); + q[3] = interp_cubic(p + 3 * p_stride, x); + return interp_cubic(q, y); +} +*/ + +static const uint8_t bsize_curvfit_model_cat_lookup[BLOCK_SIZES_ALL] = { + 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 1, 1, 2, 2, 3, 3 +}; + +static int sse_norm_curvfit_model_cat_lookup(double sse_norm) { + return (sse_norm > 16.0); +} + +// Models distortion by sse using a logistic function on +// l = log2(sse / q^2) as: +// dbysse = 16 / (1 + k exp(l + c)) +static double get_dbysse_logistic(double l, double c, double k) { + const double A = 16.0; + const double dbysse = A / (1 + k * exp(l + c)); + return dbysse; +} + +// Models rate using a clamped linear function on +// l = log2(sse / q^2) as: +// rate = max(0, a + b * l) +static double get_rate_clamplinear(double l, double a, double b) { + const double rate = a + b * l; + return (rate < 0 ? 
0 : rate); +} + +static const uint8_t bsize_surffit_model_cat_lookup[BLOCK_SIZES_ALL] = { + 0, 0, 0, 0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8, 0, 0, 2, 2, 4, 4 +}; + +static const double surffit_rate_params[9][4] = { + { + 638.390212, + 2.253108, + 166.585650, + -3.939401, + }, + { + 5.256905, + 81.997240, + -1.321771, + 17.694216, + }, + { + -74.193045, + 72.431868, + -19.033152, + 15.407276, + }, + { + 416.770113, + 14.794188, + 167.686830, + -6.997756, + }, + { + 378.511276, + 9.558376, + 154.658843, + -6.635663, + }, + { + 277.818787, + 4.413180, + 150.317637, + -9.893038, + }, + { + 142.212132, + 11.542038, + 94.393964, + -5.518517, + }, + { + 219.100256, + 4.007421, + 108.932852, + -6.981310, + }, + { + 222.261971, + 3.251049, + 95.972916, + -5.609789, + }, +}; + +static const double surffit_dist_params[7] = { 1.475844, 4.328362, -5.680233, + -0.500994, 0.554585, 4.839478, + -0.695837 }; + +static void rate_surffit_model_params_lookup(BLOCK_SIZE bsize, double xm, + double *rpar) { + const int cat = bsize_surffit_model_cat_lookup[bsize]; + rpar[0] = surffit_rate_params[cat][0] + surffit_rate_params[cat][1] * xm; + rpar[1] = surffit_rate_params[cat][2] + surffit_rate_params[cat][3] * xm; +} + +static void dist_surffit_model_params_lookup(BLOCK_SIZE bsize, double xm, + double *dpar) { + (void)bsize; + const double *params = surffit_dist_params; + dpar[0] = params[0] + params[1] / (1 + exp((xm + params[2]) * params[3])); + dpar[1] = params[4] + params[5] * exp(params[6] * xm); +} + +void av1_model_rd_surffit(BLOCK_SIZE bsize, double sse_norm, double xm, + double yl, double *rate_f, double *distbysse_f) { + (void)sse_norm; + double rpar[2], dpar[2]; + rate_surffit_model_params_lookup(bsize, xm, rpar); + dist_surffit_model_params_lookup(bsize, xm, dpar); + + *rate_f = get_rate_clamplinear(yl, rpar[0], rpar[1]); + *distbysse_f = get_dbysse_logistic(yl, dpar[0], dpar[1]); +} + +static const double interp_rgrid_curv[4][65] = { + { + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 118.257702, 120.210658, 121.434853, 122.100487, + 122.377758, 122.436865, 72.290102, 96.974289, 101.652727, + 126.830141, 140.417377, 157.644879, 184.315291, 215.823873, + 262.300169, 335.919859, 420.624173, 519.185032, 619.854243, + 726.053595, 827.663369, 933.127475, 1037.988755, 1138.839609, + 1233.342933, 1333.508064, 1428.760126, 1533.396364, 1616.952052, + 1744.539319, 1803.413586, 1951.466618, 1994.227838, 2086.031680, + 2148.635443, 2239.068450, 2222.590637, 2338.859809, 2402.929011, + 2418.727875, 2435.342670, 2471.159469, 2523.187446, 2591.183827, + 2674.905840, 2774.110714, 2888.555675, 3017.997952, 3162.194773, + 3320.903365, 3493.880956, 3680.884773, 3881.672045, 4096.000000, + }, + { + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 13.087244, 15.919735, 25.930313, 24.412411, + 28.567417, 29.924194, 30.857010, 32.742979, 36.382570, + 39.210386, 42.265690, 47.378572, 57.014850, 82.740067, + 137.346562, 219.968084, 316.781856, 415.643773, 516.706538, + 614.914364, 714.303763, 815.512135, 911.210485, 1008.501528, + 1109.787854, 1213.772279, 1322.922561, 1414.752579, 1510.505641, + 1615.741888, 1697.989032, 1780.123933, 1847.453790, 1913.742309, + 1960.828122, 2047.500168, 2085.454095, 2129.230668, 2158.171824, + 2182.231724, 2217.684864, 2269.589211, 2337.264824, 2420.618694, + 2519.557814, 2633.989178, 2763.819779, 2908.956609, 3069.306660, + 3244.776927, 3435.274401, 
3640.706076, 3860.978945, 4096.000000, + }, + { + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 4.656893, 5.123633, 5.594132, 6.162376, + 6.918433, 7.768444, 8.739415, 10.105862, 11.477328, + 13.236604, 15.421030, 19.093623, 25.801871, 46.724612, + 98.841054, 181.113466, 272.586364, 359.499769, 445.546343, + 525.944439, 605.188743, 681.793483, 756.668359, 838.486885, + 926.950356, 1015.482542, 1113.353926, 1204.897193, 1288.871992, + 1373.464145, 1455.746628, 1527.796460, 1588.475066, 1658.144771, + 1710.302500, 1807.563351, 1863.197608, 1927.281616, 1964.450872, + 2022.719898, 2100.041145, 2185.205712, 2280.993936, 2387.616216, + 2505.282950, 2634.204540, 2774.591385, 2926.653884, 3090.602436, + 3266.647443, 3454.999303, 3655.868416, 3869.465182, 4096.000000, + }, + { + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.337370, 0.391916, 0.468839, 0.566334, + 0.762564, 1.069225, 1.384361, 1.787581, 2.293948, + 3.251909, 4.412991, 8.050068, 11.606073, 27.668092, + 65.227758, 128.463938, 202.097653, 262.715851, 312.464873, + 355.601398, 400.609054, 447.201352, 495.761568, 552.871938, + 619.067625, 691.984883, 773.753288, 860.628503, 946.262808, + 1019.805896, 1106.061360, 1178.422145, 1244.852258, 1302.173987, + 1399.650266, 1548.092912, 1545.928652, 1670.817500, 1694.523823, + 1779.195362, 1882.155494, 1990.662097, 2108.325181, 2235.456119, + 2372.366287, 2519.367059, 2676.769812, 2844.885918, 3024.026754, + 3214.503695, 3416.628115, 3630.711389, 3857.064892, 4096.000000, + }, +}; + +static const double interp_dgrid_curv[3][65] = { + { + 16.000000, 15.962891, 15.925174, 15.886888, 15.848074, 15.808770, + 15.769015, 15.728850, 15.688313, 15.647445, 15.606284, 15.564870, + 15.525918, 15.483820, 15.373330, 15.126844, 14.637442, 14.184387, + 13.560070, 12.880717, 12.165995, 11.378144, 10.438769, 9.130790, + 7.487633, 5.688649, 4.267515, 3.196300, 2.434201, 1.834064, + 1.369920, 1.035921, 0.775279, 0.574895, 0.427232, 0.314123, + 0.233236, 0.171440, 0.128188, 0.092762, 0.067569, 0.049324, + 0.036330, 0.027008, 0.019853, 0.015539, 0.011093, 0.008733, + 0.007624, 0.008105, 0.005427, 0.004065, 0.003427, 0.002848, + 0.002328, 0.001865, 0.001457, 0.001103, 0.000801, 0.000550, + 0.000348, 0.000193, 0.000085, 0.000021, 0.000000, + }, + { + 16.000000, 15.996116, 15.984769, 15.966413, 15.941505, 15.910501, + 15.873856, 15.832026, 15.785466, 15.734633, 15.679981, 15.621967, + 15.560961, 15.460157, 15.288367, 15.052462, 14.466922, 13.921212, + 13.073692, 12.222005, 11.237799, 9.985848, 8.898823, 7.423519, + 5.995325, 4.773152, 3.744032, 2.938217, 2.294526, 1.762412, + 1.327145, 1.020728, 0.765535, 0.570548, 0.425833, 0.313825, + 0.232959, 0.171324, 0.128174, 0.092750, 0.067558, 0.049319, + 0.036330, 0.027008, 0.019853, 0.015539, 0.011093, 0.008733, + 0.007624, 0.008105, 0.005427, 0.004065, 0.003427, 0.002848, + 0.002328, 0.001865, 0.001457, 0.001103, 0.000801, 0.000550, + 0.000348, 0.000193, 0.000085, 0.000021, -0.000000, + }, +}; + +void av1_model_rd_curvfit(BLOCK_SIZE bsize, double sse_norm, double xqr, + double *rate_f, double *distbysse_f) { + const double x_start = -15.5; + const double x_end = 16.5; + const double x_step = 0.5; + const double epsilon = 1e-6; + const int rcat = bsize_curvfit_model_cat_lookup[bsize]; + const int dcat = sse_norm_curvfit_model_cat_lookup(sse_norm); + (void)x_end; + + xqr = AOMMAX(xqr, x_start + x_step + epsilon); + xqr = AOMMIN(xqr, x_end 
- x_step - epsilon); + const double x = (xqr - x_start) / x_step; + const int xi = (int)floor(x); + const double xo = x - xi; + + assert(xi > 0); + + const double *prate = &interp_rgrid_curv[rcat][(xi - 1)]; + *rate_f = interp_cubic(prate, xo); + const double *pdist = &interp_dgrid_curv[dcat][(xi - 1)]; + *distbysse_f = interp_cubic(pdist, xo); +} + +static void get_entropy_contexts_plane(BLOCK_SIZE plane_bsize, + const struct macroblockd_plane *pd, + ENTROPY_CONTEXT t_above[MAX_MIB_SIZE], + ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]) { + const int num_4x4_w = mi_size_wide[plane_bsize]; + const int num_4x4_h = mi_size_high[plane_bsize]; + const ENTROPY_CONTEXT *const above = pd->above_entropy_context; + const ENTROPY_CONTEXT *const left = pd->left_entropy_context; + + memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w); + memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h); +} + +void av1_get_entropy_contexts(BLOCK_SIZE plane_bsize, + const struct macroblockd_plane *pd, + ENTROPY_CONTEXT t_above[MAX_MIB_SIZE], + ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]) { + assert(plane_bsize < BLOCK_SIZES_ALL); + get_entropy_contexts_plane(plane_bsize, pd, t_above, t_left); +} + +// Special clamping used in the encoder when calculating a prediction +// +// Logically, all pixel fetches used for prediction are clamped against the +// edges of the frame. But doing this directly is slow, so instead we allocate +// a finite border around the frame and fill it with copies of the outermost +// pixels. +// +// Since this border is finite, we need to clamp the motion vector before +// prediction in order to avoid out-of-bounds reads. At the same time, this +// clamp must not change the prediction result. +// +// We can balance both of these concerns by calculating how far we would have +// to go in each direction before the extended prediction region (the current +// block + AOM_INTERP_EXTEND many pixels around the block) would be mapped +// so that it touches the frame only at one row or column. This is a special +// point because any more extreme MV will always lead to the same prediction. +// So it is safe to clamp at that point. +// +// In the worst case, this requires a border of +// max_block_width + 2*AOM_INTERP_EXTEND = 128 + 2*4 = 136 pixels +// around the frame edges. 
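+// As a worked example (values illustrative, using AOM_INTERP_EXTEND == 4
+// from the comment above): a 16x16 block whose left edge lies 32 px into
+// the frame gets col_min = -GET_MV_SUBPEL(32 + 16 + 4) = -416 in 1/8-pel
+// units. An MV at that bound maps the extended prediction region so that
+// it touches the frame at exactly one column; any larger leftward MV reads
+// only replicated border pixels and yields an identical prediction, so
+// clamping at this point cannot change the RD outcome.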
+static INLINE void enc_clamp_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd, + MV *mv) { + int bw = xd->width << MI_SIZE_LOG2; + int bh = xd->height << MI_SIZE_LOG2; + + int px_to_left_edge = xd->mi_col << MI_SIZE_LOG2; + int px_to_right_edge = (cm->mi_params.mi_cols - xd->mi_col) << MI_SIZE_LOG2; + int px_to_top_edge = xd->mi_row << MI_SIZE_LOG2; + int px_to_bottom_edge = (cm->mi_params.mi_rows - xd->mi_row) << MI_SIZE_LOG2; + + const SubpelMvLimits mv_limits = { + .col_min = -GET_MV_SUBPEL(px_to_left_edge + bw + AOM_INTERP_EXTEND), + .col_max = GET_MV_SUBPEL(px_to_right_edge + AOM_INTERP_EXTEND), + .row_min = -GET_MV_SUBPEL(px_to_top_edge + bh + AOM_INTERP_EXTEND), + .row_max = GET_MV_SUBPEL(px_to_bottom_edge + AOM_INTERP_EXTEND) + }; + clamp_mv(mv, &mv_limits); +} + +void av1_mv_pred(const AV1_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer, + int ref_y_stride, int ref_frame, BLOCK_SIZE block_size) { + const MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, NONE_FRAME }; + const int_mv ref_mv = + av1_get_ref_mv_from_stack(0, ref_frames, 0, &x->mbmi_ext); + const int_mv ref_mv1 = + av1_get_ref_mv_from_stack(0, ref_frames, 1, &x->mbmi_ext); + MV pred_mv[MAX_MV_REF_CANDIDATES + 1]; + int num_mv_refs = 0; + pred_mv[num_mv_refs++] = ref_mv.as_mv; + if (ref_mv.as_int != ref_mv1.as_int) { + pred_mv[num_mv_refs++] = ref_mv1.as_mv; + } + + assert(num_mv_refs <= (int)(sizeof(pred_mv) / sizeof(pred_mv[0]))); + + const uint8_t *const src_y_ptr = x->plane[0].src.buf; + int zero_seen = 0; + int best_sad = INT_MAX; + int max_mv = 0; + // Get the sad for each candidate reference mv. + for (int i = 0; i < num_mv_refs; ++i) { + MV *this_mv = &pred_mv[i]; + enc_clamp_mv(&cpi->common, &x->e_mbd, this_mv); + + const int fp_row = (this_mv->row + 3 + (this_mv->row >= 0)) >> 3; + const int fp_col = (this_mv->col + 3 + (this_mv->col >= 0)) >> 3; + max_mv = AOMMAX(max_mv, AOMMAX(abs(this_mv->row), abs(this_mv->col)) >> 3); + + if (fp_row == 0 && fp_col == 0 && zero_seen) continue; + zero_seen |= (fp_row == 0 && fp_col == 0); + + const uint8_t *const ref_y_ptr = + &ref_y_buffer[ref_y_stride * fp_row + fp_col]; + // Find sad for current vector. + const int this_sad = cpi->ppi->fn_ptr[block_size].sdf( + src_y_ptr, x->plane[0].src.stride, ref_y_ptr, ref_y_stride); + // Note if it is the best so far. + if (this_sad < best_sad) { + best_sad = this_sad; + } + if (i == 0) + x->pred_mv0_sad[ref_frame] = this_sad; + else if (i == 1) + x->pred_mv1_sad[ref_frame] = this_sad; + } + + // Note the index of the mv that worked best in the reference list. + x->max_mv_context[ref_frame] = max_mv; + x->pred_mv_sad[ref_frame] = best_sad; +} + +void av1_setup_pred_block(const MACROBLOCKD *xd, + struct buf_2d dst[MAX_MB_PLANE], + const YV12_BUFFER_CONFIG *src, + const struct scale_factors *scale, + const struct scale_factors *scale_uv, + const int num_planes) { + dst[0].buf = src->y_buffer; + dst[0].stride = src->y_stride; + dst[1].buf = src->u_buffer; + dst[2].buf = src->v_buffer; + dst[1].stride = dst[2].stride = src->uv_stride; + + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + for (int i = 0; i < num_planes; ++i) { + setup_pred_plane(dst + i, xd->mi[0]->bsize, dst[i].buf, + i ? src->uv_crop_width : src->y_crop_width, + i ? src->uv_crop_height : src->y_crop_height, + dst[i].stride, mi_row, mi_col, i ? 
scale_uv : scale, + xd->plane[i].subsampling_x, xd->plane[i].subsampling_y); + } +} + +YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const AV1_COMP *cpi, + int ref_frame) { + assert(ref_frame >= LAST_FRAME && ref_frame <= ALTREF_FRAME); + RefCntBuffer *const scaled_buf = cpi->scaled_ref_buf[ref_frame - 1]; + const RefCntBuffer *const ref_buf = + get_ref_frame_buf(&cpi->common, ref_frame); + return (scaled_buf != ref_buf && scaled_buf != NULL) ? &scaled_buf->buf + : NULL; +} + +int av1_get_switchable_rate(const MACROBLOCK *x, const MACROBLOCKD *xd, + InterpFilter interp_filter, int dual_filter) { + if (interp_filter == SWITCHABLE) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + int inter_filter_cost = 0; + for (int dir = 0; dir < 2; ++dir) { + if (dir && !dual_filter) break; + const int ctx = av1_get_pred_context_switchable_interp(xd, dir); + const InterpFilter filter = + av1_extract_interp_filter(mbmi->interp_filters, dir); + inter_filter_cost += x->mode_costs.switchable_interp_costs[ctx][filter]; + } + return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost; + } else { + return 0; + } +} + +void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { + RD_OPT *const rd = &cpi->rd; + + // Set baseline threshold values. + av1_zero(rd->thresh_mult); + + rd->thresh_mult[THR_NEARESTMV] = 300; + rd->thresh_mult[THR_NEARESTL2] = 300; + rd->thresh_mult[THR_NEARESTL3] = 300; + rd->thresh_mult[THR_NEARESTB] = 300; + rd->thresh_mult[THR_NEARESTA2] = 300; + rd->thresh_mult[THR_NEARESTA] = 300; + rd->thresh_mult[THR_NEARESTG] = 300; + + rd->thresh_mult[THR_NEWMV] = 1000; + rd->thresh_mult[THR_NEWL2] = 1000; + rd->thresh_mult[THR_NEWL3] = 1000; + rd->thresh_mult[THR_NEWB] = 1000; + rd->thresh_mult[THR_NEWA2] = 1100; + rd->thresh_mult[THR_NEWA] = 1000; + rd->thresh_mult[THR_NEWG] = 1000; + + rd->thresh_mult[THR_NEARMV] = 1000; + rd->thresh_mult[THR_NEARL2] = 1000; + rd->thresh_mult[THR_NEARL3] = 1000; + rd->thresh_mult[THR_NEARB] = 1000; + rd->thresh_mult[THR_NEARA2] = 1000; + rd->thresh_mult[THR_NEARA] = 1000; + rd->thresh_mult[THR_NEARG] = 1000; + + rd->thresh_mult[THR_GLOBALMV] = 2200; + rd->thresh_mult[THR_GLOBALL2] = 2000; + rd->thresh_mult[THR_GLOBALL3] = 2000; + rd->thresh_mult[THR_GLOBALB] = 2400; + rd->thresh_mult[THR_GLOBALA2] = 2000; + rd->thresh_mult[THR_GLOBALG] = 2000; + rd->thresh_mult[THR_GLOBALA] = 2400; + + rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA] = 1100; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A] = 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A] = 800; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA] = 900; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTLB] = 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2B] = 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3B] = 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTGB] = 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA2] = 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A2] = 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A2] = 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA2] = 1000; + + rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL2] = 2000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL3] = 2000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTLG] = 2000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTBA] = 2000; + + rd->thresh_mult[THR_COMP_NEAR_NEARLA] = 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWLA] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTLA] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWLA] = 1530; + rd->thresh_mult[THR_COMP_NEW_NEARLA] = 1870; + rd->thresh_mult[THR_COMP_NEW_NEWLA] = 2400; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLA] = 
2750; + + rd->thresh_mult[THR_COMP_NEAR_NEARL2A] = 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWL2A] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTL2A] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWL2A] = 1870; + rd->thresh_mult[THR_COMP_NEW_NEARL2A] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWL2A] = 1800; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2A] = 2500; + + rd->thresh_mult[THR_COMP_NEAR_NEARL3A] = 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWL3A] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTL3A] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWL3A] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEARL3A] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWL3A] = 2000; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3A] = 3000; + + rd->thresh_mult[THR_COMP_NEAR_NEARGA] = 1320; + rd->thresh_mult[THR_COMP_NEAREST_NEWGA] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTGA] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWGA] = 2040; + rd->thresh_mult[THR_COMP_NEW_NEARGA] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWGA] = 2000; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA] = 2250; + + rd->thresh_mult[THR_COMP_NEAR_NEARLB] = 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWLB] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTLB] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWLB] = 1360; + rd->thresh_mult[THR_COMP_NEW_NEARLB] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWLB] = 2400; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLB] = 2250; + + rd->thresh_mult[THR_COMP_NEAR_NEARL2B] = 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWL2B] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTL2B] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWL2B] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEARL2B] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWL2B] = 2000; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2B] = 2500; + + rd->thresh_mult[THR_COMP_NEAR_NEARL3B] = 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWL3B] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTL3B] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWL3B] = 1870; + rd->thresh_mult[THR_COMP_NEW_NEARL3B] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWL3B] = 2000; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3B] = 2500; + + rd->thresh_mult[THR_COMP_NEAR_NEARGB] = 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWGB] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTGB] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWGB] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEARGB] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWGB] = 2000; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGB] = 2500; + + rd->thresh_mult[THR_COMP_NEAR_NEARLA2] = 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWLA2] = 1800; + rd->thresh_mult[THR_COMP_NEW_NEARESTLA2] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWLA2] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEARLA2] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWLA2] = 2000; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLA2] = 2500; + + rd->thresh_mult[THR_COMP_NEAR_NEARL2A2] = 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWL2A2] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTL2A2] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWL2A2] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEARL2A2] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWL2A2] = 2000; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2A2] = 2500; + + rd->thresh_mult[THR_COMP_NEAR_NEARL3A2] = 1440; + rd->thresh_mult[THR_COMP_NEAREST_NEWL3A2] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTL3A2] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWL3A2] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEARL3A2] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWL3A2] = 2000; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3A2] = 2500; + + rd->thresh_mult[THR_COMP_NEAR_NEARGA2] = 1200; + 
rd->thresh_mult[THR_COMP_NEAREST_NEWGA2] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTGA2] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWGA2] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEARGA2] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWGA2] = 2000; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA2] = 2750; + + rd->thresh_mult[THR_COMP_NEAR_NEARLL2] = 1600; + rd->thresh_mult[THR_COMP_NEAREST_NEWLL2] = 2000; + rd->thresh_mult[THR_COMP_NEW_NEARESTLL2] = 2000; + rd->thresh_mult[THR_COMP_NEAR_NEWLL2] = 2640; + rd->thresh_mult[THR_COMP_NEW_NEARLL2] = 2200; + rd->thresh_mult[THR_COMP_NEW_NEWLL2] = 2400; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLL2] = 3200; + + rd->thresh_mult[THR_COMP_NEAR_NEARLL3] = 1600; + rd->thresh_mult[THR_COMP_NEAREST_NEWLL3] = 2000; + rd->thresh_mult[THR_COMP_NEW_NEARESTLL3] = 1800; + rd->thresh_mult[THR_COMP_NEAR_NEWLL3] = 2200; + rd->thresh_mult[THR_COMP_NEW_NEARLL3] = 2200; + rd->thresh_mult[THR_COMP_NEW_NEWLL3] = 2400; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLL3] = 3200; + + rd->thresh_mult[THR_COMP_NEAR_NEARLG] = 1760; + rd->thresh_mult[THR_COMP_NEAREST_NEWLG] = 2400; + rd->thresh_mult[THR_COMP_NEW_NEARESTLG] = 2000; + rd->thresh_mult[THR_COMP_NEAR_NEWLG] = 1760; + rd->thresh_mult[THR_COMP_NEW_NEARLG] = 2640; + rd->thresh_mult[THR_COMP_NEW_NEWLG] = 2400; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLG] = 3200; + + rd->thresh_mult[THR_COMP_NEAR_NEARBA] = 1600; + rd->thresh_mult[THR_COMP_NEAREST_NEWBA] = 2000; + rd->thresh_mult[THR_COMP_NEW_NEARESTBA] = 2000; + rd->thresh_mult[THR_COMP_NEAR_NEWBA] = 2200; + rd->thresh_mult[THR_COMP_NEW_NEARBA] = 1980; + rd->thresh_mult[THR_COMP_NEW_NEWBA] = 2640; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALBA] = 3200; + + rd->thresh_mult[THR_DC] = 1000; + rd->thresh_mult[THR_PAETH] = 1000; + rd->thresh_mult[THR_SMOOTH] = 2200; + rd->thresh_mult[THR_SMOOTH_V] = 2000; + rd->thresh_mult[THR_SMOOTH_H] = 2000; + rd->thresh_mult[THR_H_PRED] = 2000; + rd->thresh_mult[THR_V_PRED] = 1800; + rd->thresh_mult[THR_D135_PRED] = 2500; + rd->thresh_mult[THR_D203_PRED] = 2000; + rd->thresh_mult[THR_D157_PRED] = 2500; + rd->thresh_mult[THR_D67_PRED] = 2000; + rd->thresh_mult[THR_D113_PRED] = 2500; + rd->thresh_mult[THR_D45_PRED] = 2500; +} + +static INLINE void update_thr_fact(int (*factor_buf)[MAX_MODES], + THR_MODES best_mode_index, + THR_MODES mode_start, THR_MODES mode_end, + BLOCK_SIZE min_size, BLOCK_SIZE max_size, + int max_rd_thresh_factor) { + for (THR_MODES mode = mode_start; mode < mode_end; ++mode) { + for (BLOCK_SIZE bs = min_size; bs <= max_size; ++bs) { + int *const fact = &factor_buf[bs][mode]; + if (mode == best_mode_index) { + *fact -= (*fact >> RD_THRESH_LOG_DEC_FACTOR); + } else { + *fact = AOMMIN(*fact + RD_THRESH_INC, max_rd_thresh_factor); + } + } + } +} + +void av1_update_rd_thresh_fact( + const AV1_COMMON *const cm, int (*factor_buf)[MAX_MODES], + int use_adaptive_rd_thresh, BLOCK_SIZE bsize, THR_MODES best_mode_index, + THR_MODES inter_mode_start, THR_MODES inter_mode_end, + THR_MODES intra_mode_start, THR_MODES intra_mode_end) { + assert(use_adaptive_rd_thresh > 0); + const int max_rd_thresh_factor = use_adaptive_rd_thresh * RD_THRESH_MAX_FACT; + + const int bsize_is_1_to_4 = bsize > cm->seq_params->sb_size; + BLOCK_SIZE min_size, max_size; + if (bsize_is_1_to_4) { + // This part handles block sizes with 1:4 and 4:1 aspect ratios + // TODO(any): Experiment with threshold update for parent/child blocks + min_size = bsize; + max_size = bsize; + } else { + min_size = AOMMAX(bsize - 2, BLOCK_4X4); + max_size = AOMMIN(bsize + 2, 
(int)cm->seq_params->sb_size);
+  }
+
+  update_thr_fact(factor_buf, best_mode_index, inter_mode_start, inter_mode_end,
+                  min_size, max_size, max_rd_thresh_factor);
+  update_thr_fact(factor_buf, best_mode_index, intra_mode_start, intra_mode_end,
+                  min_size, max_size, max_rd_thresh_factor);
+}
+
+int av1_get_intra_cost_penalty(int qindex, int qdelta,
+                               aom_bit_depth_t bit_depth) {
+  const int q = av1_dc_quant_QTX(qindex, qdelta, bit_depth);
+  switch (bit_depth) {
+    case AOM_BITS_8: return 20 * q;
+    case AOM_BITS_10: return 5 * q;
+    case AOM_BITS_12: return ROUND_POWER_OF_TWO(5 * q, 2);
+    default:
+      assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+      return -1;
+  }
+}
diff --git a/third_party/aom/av1/encoder/rd.h b/third_party/aom/av1/encoder/rd.h
new file mode 100644
index 0000000000..b38d9ca542
--- /dev/null
+++ b/third_party/aom/av1/encoder/rd.h
@@ -0,0 +1,390 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RD_H_
+#define AOM_AV1_ENCODER_RD_H_
+
+#include <limits.h>
+
+#include "av1/common/blockd.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/ratectrl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RDDIV_BITS 7
+#define RD_EPB_SHIFT 6
+
+#define RDCOST(RM, R, D)                                            \
+  (ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), AV1_PROB_COST_SHIFT) + \
+   ((D) * (1 << RDDIV_BITS)))
+
+#define RDCOST_NEG_R(RM, R, D)                                      \
+  (((D) * (1 << RDDIV_BITS)) -                                      \
+   ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), AV1_PROB_COST_SHIFT))
+
+#define RDCOST_DBL_WITH_NATIVE_BD_DIST(RM, R, D, BD)               \
+  (((((double)(R)) * (RM)) / (double)(1 << AV1_PROB_COST_SHIFT)) + \
+   ((double)((D) >> (2 * (BD - 8))) * (1 << RDDIV_BITS)))
+
+#define QIDX_SKIP_THRESH 115
+
+#define MV_COST_WEIGHT 108
+#define MV_COST_WEIGHT_SUB 120
+
+// The fractional part of rd_thresh factor is stored with 5 bits. The maximum
+// factor that we allow is two, which is stored as 2 ** (5+1) = 64
+#define RD_THRESH_FAC_FRAC_BITS (5)
+#define RD_THRESH_FAC_FRAC_VAL (1 << (RD_THRESH_FAC_FRAC_BITS))
+#define RD_THRESH_MAX_FACT ((RD_THRESH_FAC_FRAC_VAL) << 1)
+#define RD_THRESH_LOG_DEC_FACTOR (4)
+#define RD_THRESH_INC (1)
+
+// Factor to weigh the rate for switchable interp filters.
+#define SWITCHABLE_INTERP_RATE_FACTOR 1
+
+// Macros for common video resolutions: width x height
+// For example, 720p represents video resolution of 1280x720 pixels.
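+// (Usage sketch, illustrative: a frame is typically classified with a
+// comparison such as `width * height <= RESOLUTION_720P`. Note that the
+// expansions below are bare `w * h` products, so parenthesize them if
+// they are ever combined with other operators.)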
+#define RESOLUTION_288P 352 * 288 +#define RESOLUTION_360P 640 * 360 +#define RESOLUTION_480P 640 * 480 +#define RESOLUTION_720P 1280 * 720 +#define RESOLUTION_1080P 1920 * 1080 +#define RESOLUTION_1440P 2560 * 1440 +#define RESOLUTION_4K 3840 * 2160 + +#define RTC_REFS 4 +static const MV_REFERENCE_FRAME real_time_ref_combos[RTC_REFS][2] = { + { LAST_FRAME, NONE_FRAME }, + { ALTREF_FRAME, NONE_FRAME }, + { GOLDEN_FRAME, NONE_FRAME }, + { INTRA_FRAME, NONE_FRAME } +}; + +static INLINE int mode_offset(const PREDICTION_MODE mode) { + if (mode >= NEARESTMV) { + return INTER_OFFSET(mode); + } else { + switch (mode) { + case DC_PRED: return 0; + case V_PRED: return 1; + case H_PRED: return 2; + case SMOOTH_PRED: return 3; + default: assert(0); return -1; + } + } +} + +enum { + // Default initialization when we are not using winner mode framework. e.g. + // intrabc + DEFAULT_EVAL = 0, + // Initialization for selecting winner mode + MODE_EVAL, + // Initialization for winner mode evaluation + WINNER_MODE_EVAL, + // All mode evaluation types + MODE_EVAL_TYPES, +} UENUM1BYTE(MODE_EVAL_TYPE); + +typedef struct RD_OPT { + // Thresh_mult is used to set a threshold for the rd score. A higher value + // means that we will accept the best mode so far more often. This number + // is used in combination with the current block size, and thresh_freq_fact + // to pick a threshold. + int thresh_mult[MAX_MODES]; + + int threshes[MAX_SEGMENTS][BLOCK_SIZES_ALL][MAX_MODES]; + + int RDMULT; + + double r0; +} RD_OPT; + +static INLINE void av1_init_rd_stats(RD_STATS *rd_stats) { +#if CONFIG_RD_DEBUG + int plane; +#endif + rd_stats->rate = 0; + rd_stats->dist = 0; + rd_stats->rdcost = 0; + rd_stats->sse = 0; + rd_stats->skip_txfm = 1; + rd_stats->zero_rate = 0; +#if CONFIG_RD_DEBUG + // This may run into problems when monochrome video is + // encoded, as there will only be 1 plane + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + rd_stats->txb_coeff_cost[plane] = 0; + } +#endif +} + +static INLINE void av1_invalid_rd_stats(RD_STATS *rd_stats) { +#if CONFIG_RD_DEBUG + int plane; +#endif + rd_stats->rate = INT_MAX; + rd_stats->dist = INT64_MAX; + rd_stats->rdcost = INT64_MAX; + rd_stats->sse = INT64_MAX; + rd_stats->skip_txfm = 0; + rd_stats->zero_rate = 0; +#if CONFIG_RD_DEBUG + // This may run into problems when monochrome video is + // encoded, as there will only be 1 plane + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + rd_stats->txb_coeff_cost[plane] = INT_MAX; + } +#endif +} + +static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst, + const RD_STATS *rd_stats_src) { + if (rd_stats_dst->rate == INT_MAX || rd_stats_src->rate == INT_MAX) { + // If rd_stats_dst or rd_stats_src has invalid rate, we will make + // rd_stats_dst invalid. 
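+    // (INT_MAX rate / INT64_MAX dist are the "invalid" sentinels set by
+    // av1_invalid_rd_stats() above; propagating them, instead of summing,
+    // avoids signed overflow in the accumulation below.)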
+ av1_invalid_rd_stats(rd_stats_dst); + return; + } + rd_stats_dst->rate = (int)AOMMIN( + ((int64_t)rd_stats_dst->rate + (int64_t)rd_stats_src->rate), INT_MAX); + if (!rd_stats_dst->zero_rate) + rd_stats_dst->zero_rate = rd_stats_src->zero_rate; + rd_stats_dst->dist += rd_stats_src->dist; + if (rd_stats_dst->sse < INT64_MAX && rd_stats_src->sse < INT64_MAX) { + rd_stats_dst->sse += rd_stats_src->sse; + } + rd_stats_dst->skip_txfm &= rd_stats_src->skip_txfm; +#if CONFIG_RD_DEBUG + // This may run into problems when monochrome video is + // encoded, as there will only be 1 plane + for (int plane = 0; plane < MAX_MB_PLANE; ++plane) { + rd_stats_dst->txb_coeff_cost[plane] += rd_stats_src->txb_coeff_cost[plane]; + } +#endif +} + +static INLINE void av1_accumulate_rd_stats(RD_STATS *rd_stats, int64_t dist, + int rate, int skip_txfm, int64_t sse, + int zero_rate) { + assert(rd_stats->rate != INT_MAX && rate != INT_MAX); + rd_stats->rate += rate; + if (!rd_stats->zero_rate) rd_stats->zero_rate = zero_rate; + rd_stats->dist += dist; + rd_stats->skip_txfm &= skip_txfm; + rd_stats->sse += sse; +} + +static INLINE int64_t av1_calculate_rd_cost(int mult, int rate, int64_t dist) { + assert(mult >= 0); + if (rate >= 0) { + return RDCOST(mult, rate, dist); + } + return RDCOST_NEG_R(mult, -rate, dist); +} + +static INLINE void av1_rd_cost_update(int mult, RD_STATS *rd_cost) { + if (rd_cost->rate < INT_MAX && rd_cost->dist < INT64_MAX && + rd_cost->rdcost < INT64_MAX) { + rd_cost->rdcost = av1_calculate_rd_cost(mult, rd_cost->rate, rd_cost->dist); + } else { + av1_invalid_rd_stats(rd_cost); + } +} + +static INLINE void av1_rd_stats_subtraction(int mult, + const RD_STATS *const left, + const RD_STATS *const right, + RD_STATS *result) { + if (left->rate == INT_MAX || right->rate == INT_MAX || + left->dist == INT64_MAX || right->dist == INT64_MAX || + left->rdcost == INT64_MAX || right->rdcost == INT64_MAX) { + av1_invalid_rd_stats(result); + } else { + result->rate = left->rate - right->rate; + result->dist = left->dist - right->dist; + result->rdcost = av1_calculate_rd_cost(mult, result->rate, result->dist); + } +} + +struct TileInfo; +struct TileDataEnc; +struct AV1_COMP; +struct macroblock; + +/*!\brief Compute rdmult based on q index and frame update type + * + * \param[in] bit_depth bit depth + * \param[in] update_type frame update type + * \param[in] qindex q index + * + * \return rdmult + */ +int av1_compute_rd_mult_based_on_qindex(aom_bit_depth_t bit_depth, + FRAME_UPDATE_TYPE update_type, + int qindex); + +int av1_compute_rd_mult(const int qindex, const aom_bit_depth_t bit_depth, + const FRAME_UPDATE_TYPE update_type, + const int layer_depth, const int boost_index, + const FRAME_TYPE frame_type, + const int use_fixed_qp_offsets, + const int is_stat_consumption_stage); + +void av1_initialize_rd_consts(struct AV1_COMP *cpi); + +// Sets the multiplier to convert mv cost to l1 error during motion search. 
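+// (Roughly: full-pel motion search minimizes costs of the form
+// SAD + mv_bits * sadperbit, so sadperbit is the exchange rate between
+// rate in bits and error in SAD units; the precise weighting, see
+// MV_COST_WEIGHT above, is applied in the motion search code.)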
+void av1_set_sad_per_bit(const struct AV1_COMP *cpi, int *sadperbit, + int qindex); + +void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n, + unsigned int qstep, int *rate, int64_t *dist); + +void av1_model_rd_curvfit(BLOCK_SIZE bsize, double sse_norm, double xqr, + double *rate_f, double *distbysse_f); +void av1_model_rd_surffit(BLOCK_SIZE bsize, double sse_norm, double xm, + double yl, double *rate_f, double *distbysse_f); + +int av1_get_switchable_rate(const MACROBLOCK *x, const MACROBLOCKD *xd, + InterpFilter interp_filter, int dual_filter); + +YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const struct AV1_COMP *cpi, + int ref_frame); + +void av1_init_me_luts(void); + +void av1_set_mvcost(MACROBLOCK *x, int ref, int ref_mv_idx); + +void av1_get_entropy_contexts(BLOCK_SIZE plane_bsize, + const struct macroblockd_plane *pd, + ENTROPY_CONTEXT t_above[MAX_MIB_SIZE], + ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]); + +void av1_set_rd_speed_thresholds(struct AV1_COMP *cpi); + +void av1_update_rd_thresh_fact(const AV1_COMMON *const cm, + int (*fact)[MAX_MODES], int rd_thresh, + BLOCK_SIZE bsize, THR_MODES best_mode_index, + THR_MODES inter_mode_start, + THR_MODES inter_mode_end, + THR_MODES intra_mode_start, + THR_MODES intra_mode_end); + +static INLINE void reset_thresh_freq_fact(MACROBLOCK *const x) { + for (int i = 0; i < BLOCK_SIZES_ALL; ++i) { + for (int j = 0; j < MAX_MODES; ++j) { + x->thresh_freq_fact[i][j] = RD_THRESH_FAC_FRAC_VAL; + } + } +} + +static INLINE int rd_less_than_thresh(int64_t best_rd, int64_t thresh, + int thresh_fact) { + return best_rd < (thresh * thresh_fact >> 5) || thresh == INT_MAX; +} + +void av1_mv_pred(const struct AV1_COMP *cpi, MACROBLOCK *x, + uint8_t *ref_y_buffer, int ref_y_stride, int ref_frame, + BLOCK_SIZE block_size); + +// Sets the multiplier to convert mv cost to l2 error during motion search. 
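+// (Sketch under the definitions above: with RD_EPB_SHIFT == 6, the function
+// below reduces to errorperbit = max(rdmult / 64, 1). The whole-block
+// analogue is RDCOST(): e.g. with rdmult = 512, rate = 100 bits and
+// dist = 1000, RDCOST = ROUND_POWER_OF_TWO(100 * 512, AV1_PROB_COST_SHIFT)
+// + (1000 << RDDIV_BITS) = 100 + 128000 = 128100, assuming
+// AV1_PROB_COST_SHIFT == 9 and RDDIV_BITS == 7.)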
+static INLINE void av1_set_error_per_bit(int *errorperbit, int rdmult) { + *errorperbit = AOMMAX(rdmult >> RD_EPB_SHIFT, 1); +} + +// Get the threshold for R-D optimization of coefficients depending upon mode +// decision/winner mode processing +static INLINE void get_rd_opt_coeff_thresh( + const uint32_t (*const coeff_opt_threshold)[2], + TxfmSearchParams *txfm_params, int enable_winner_mode_for_coeff_opt, + int is_winner_mode) { + if (!enable_winner_mode_for_coeff_opt) { + // Default initialization of threshold + txfm_params->coeff_opt_thresholds[0] = coeff_opt_threshold[DEFAULT_EVAL][0]; + txfm_params->coeff_opt_thresholds[1] = coeff_opt_threshold[DEFAULT_EVAL][1]; + return; + } + // TODO(any): Experiment with coeff_opt_dist_threshold values when + // enable_winner_mode_for_coeff_opt is ON + // TODO(any): Skip the winner mode processing for blocks with lower residual + // energy as R-D optimization of coefficients would have been enabled during + // mode decision + + // Use conservative threshold during mode decision and perform R-D + // optimization of coeffs always for winner modes + if (is_winner_mode) { + txfm_params->coeff_opt_thresholds[0] = + coeff_opt_threshold[WINNER_MODE_EVAL][0]; + txfm_params->coeff_opt_thresholds[1] = + coeff_opt_threshold[WINNER_MODE_EVAL][1]; + } else { + txfm_params->coeff_opt_thresholds[0] = coeff_opt_threshold[MODE_EVAL][0]; + txfm_params->coeff_opt_thresholds[1] = coeff_opt_threshold[MODE_EVAL][1]; + } +} + +// Used to reset the state of mb rd hash information +static INLINE void reset_mb_rd_record(MB_RD_RECORD *const mb_rd_record) { + if (!mb_rd_record) return; + + // Reset the state for use_mb_rd_hash + mb_rd_record->num = mb_rd_record->index_start = 0; +} + +void av1_setup_pred_block(const MACROBLOCKD *xd, + struct buf_2d dst[MAX_MB_PLANE], + const YV12_BUFFER_CONFIG *src, + const struct scale_factors *scale, + const struct scale_factors *scale_uv, + const int num_planes); + +int av1_get_intra_cost_penalty(int qindex, int qdelta, + aom_bit_depth_t bit_depth); + +void av1_fill_mode_rates(AV1_COMMON *const cm, ModeCosts *mode_costs, + FRAME_CONTEXT *fc); + +void av1_fill_lr_rates(ModeCosts *mode_costs, FRAME_CONTEXT *fc); + +void av1_fill_coeff_costs(CoeffCosts *coeff_costs, FRAME_CONTEXT *fc, + const int num_planes); + +void av1_fill_mv_costs(const nmv_context *nmvc, int integer_mv, int usehp, + MvCosts *mv_costs); + +void av1_fill_dv_costs(const nmv_context *ndvc, IntraBCMVCosts *dv_costs); + +int av1_get_adaptive_rdmult(const struct AV1_COMP *cpi, double beta); + +int av1_get_deltaq_offset(aom_bit_depth_t bit_depth, int qindex, double beta); + +/*!\brief Adjust current superblock's q_index based on delta q resolution + * + * \param[in] delta_q_res delta q resolution + * \param[in] prev_qindex previous superblock's q index + * \param[in] curr_qindex current superblock's q index + * + * \return the current superblock's adjusted q_index + */ +int av1_adjust_q_from_delta_q_res(int delta_q_res, int prev_qindex, + int curr_qindex); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_RD_H_ diff --git a/third_party/aom/av1/encoder/rdopt.c b/third_party/aom/av1/encoder/rdopt.c new file mode 100644 index 0000000000..c17fbccf8c --- /dev/null +++ b/third_party/aom/av1/encoder/rdopt.c @@ -0,0 +1,6598 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/blend.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/aom_timer.h" +#include "aom_ports/mem.h" + +#include "av1/common/av1_common_int.h" +#include "av1/common/cfl.h" +#include "av1/common/blockd.h" +#include "av1/common/common.h" +#include "av1/common/common_data.h" +#include "av1/common/entropy.h" +#include "av1/common/entropymode.h" +#include "av1/common/idct.h" +#include "av1/common/mvref_common.h" +#include "av1/common/obmc.h" +#include "av1/common/pred_common.h" +#include "av1/common/quant_common.h" +#include "av1/common/reconinter.h" +#include "av1/common/reconintra.h" +#include "av1/common/scan.h" +#include "av1/common/seg_common.h" +#include "av1/common/txb_common.h" +#include "av1/common/warped_motion.h" + +#include "av1/encoder/aq_variance.h" +#include "av1/encoder/av1_quantize.h" +#include "av1/encoder/cost.h" +#include "av1/encoder/compound_type.h" +#include "av1/encoder/encodemb.h" +#include "av1/encoder/encodemv.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/encodetxb.h" +#include "av1/encoder/hybrid_fwd_txfm.h" +#include "av1/encoder/interp_search.h" +#include "av1/encoder/intra_mode_search.h" +#include "av1/encoder/intra_mode_search_utils.h" +#include "av1/encoder/mcomp.h" +#include "av1/encoder/ml.h" +#include "av1/encoder/mode_prune_model_weights.h" +#include "av1/encoder/model_rd.h" +#include "av1/encoder/motion_search_facade.h" +#include "av1/encoder/palette.h" +#include "av1/encoder/pustats.h" +#include "av1/encoder/random.h" +#include "av1/encoder/ratectrl.h" +#include "av1/encoder/rd.h" +#include "av1/encoder/rdopt.h" +#include "av1/encoder/reconinter_enc.h" +#include "av1/encoder/tokenize.h" +#include "av1/encoder/tpl_model.h" +#include "av1/encoder/tx_search.h" +#include "av1/encoder/var_based_part.h" + +#define LAST_NEW_MV_INDEX 6 + +// Mode_threshold multiplication factor table for prune_inter_modes_if_skippable +// The values are kept in Q12 format and equation used to derive is +// (2.5 - ((float)x->qindex / MAXQ) * 1.5) +#define MODE_THRESH_QBITS 12 +static const int mode_threshold_mul_factor[QINDEX_RANGE] = { + 10240, 10216, 10192, 10168, 10144, 10120, 10095, 10071, 10047, 10023, 9999, + 9975, 9951, 9927, 9903, 9879, 9854, 9830, 9806, 9782, 9758, 9734, + 9710, 9686, 9662, 9638, 9614, 9589, 9565, 9541, 9517, 9493, 9469, + 9445, 9421, 9397, 9373, 9349, 9324, 9300, 9276, 9252, 9228, 9204, + 9180, 9156, 9132, 9108, 9083, 9059, 9035, 9011, 8987, 8963, 8939, + 8915, 8891, 8867, 8843, 8818, 8794, 8770, 8746, 8722, 8698, 8674, + 8650, 8626, 8602, 8578, 8553, 8529, 8505, 8481, 8457, 8433, 8409, + 8385, 8361, 8337, 8312, 8288, 8264, 8240, 8216, 8192, 8168, 8144, + 8120, 8096, 8072, 8047, 8023, 7999, 7975, 7951, 7927, 7903, 7879, + 7855, 7831, 7806, 7782, 7758, 7734, 7710, 7686, 7662, 7638, 7614, + 7590, 7566, 7541, 7517, 7493, 7469, 7445, 7421, 7397, 7373, 7349, + 7325, 7301, 7276, 7252, 7228, 7204, 7180, 7156, 7132, 7108, 7084, + 7060, 7035, 7011, 6987, 6963, 6939, 6915, 6891, 6867, 6843, 6819, + 6795, 6770, 6746, 6722, 6698, 6674, 6650, 
6626, 6602, 6578, 6554, + 6530, 6505, 6481, 6457, 6433, 6409, 6385, 6361, 6337, 6313, 6289, + 6264, 6240, 6216, 6192, 6168, 6144, 6120, 6096, 6072, 6048, 6024, + 5999, 5975, 5951, 5927, 5903, 5879, 5855, 5831, 5807, 5783, 5758, + 5734, 5710, 5686, 5662, 5638, 5614, 5590, 5566, 5542, 5518, 5493, + 5469, 5445, 5421, 5397, 5373, 5349, 5325, 5301, 5277, 5253, 5228, + 5204, 5180, 5156, 5132, 5108, 5084, 5060, 5036, 5012, 4987, 4963, + 4939, 4915, 4891, 4867, 4843, 4819, 4795, 4771, 4747, 4722, 4698, + 4674, 4650, 4626, 4602, 4578, 4554, 4530, 4506, 4482, 4457, 4433, + 4409, 4385, 4361, 4337, 4313, 4289, 4265, 4241, 4216, 4192, 4168, + 4144, 4120, 4096 +}; + +static const THR_MODES av1_default_mode_order[MAX_MODES] = { + THR_NEARESTMV, + THR_NEARESTL2, + THR_NEARESTL3, + THR_NEARESTB, + THR_NEARESTA2, + THR_NEARESTA, + THR_NEARESTG, + + THR_NEWMV, + THR_NEWL2, + THR_NEWL3, + THR_NEWB, + THR_NEWA2, + THR_NEWA, + THR_NEWG, + + THR_NEARMV, + THR_NEARL2, + THR_NEARL3, + THR_NEARB, + THR_NEARA2, + THR_NEARA, + THR_NEARG, + + THR_GLOBALMV, + THR_GLOBALL2, + THR_GLOBALL3, + THR_GLOBALB, + THR_GLOBALA2, + THR_GLOBALA, + THR_GLOBALG, + + THR_COMP_NEAREST_NEARESTLA, + THR_COMP_NEAREST_NEARESTL2A, + THR_COMP_NEAREST_NEARESTL3A, + THR_COMP_NEAREST_NEARESTGA, + THR_COMP_NEAREST_NEARESTLB, + THR_COMP_NEAREST_NEARESTL2B, + THR_COMP_NEAREST_NEARESTL3B, + THR_COMP_NEAREST_NEARESTGB, + THR_COMP_NEAREST_NEARESTLA2, + THR_COMP_NEAREST_NEARESTL2A2, + THR_COMP_NEAREST_NEARESTL3A2, + THR_COMP_NEAREST_NEARESTGA2, + THR_COMP_NEAREST_NEARESTLL2, + THR_COMP_NEAREST_NEARESTLL3, + THR_COMP_NEAREST_NEARESTLG, + THR_COMP_NEAREST_NEARESTBA, + + THR_COMP_NEAR_NEARLB, + THR_COMP_NEW_NEWLB, + THR_COMP_NEW_NEARESTLB, + THR_COMP_NEAREST_NEWLB, + THR_COMP_NEW_NEARLB, + THR_COMP_NEAR_NEWLB, + THR_COMP_GLOBAL_GLOBALLB, + + THR_COMP_NEAR_NEARLA, + THR_COMP_NEW_NEWLA, + THR_COMP_NEW_NEARESTLA, + THR_COMP_NEAREST_NEWLA, + THR_COMP_NEW_NEARLA, + THR_COMP_NEAR_NEWLA, + THR_COMP_GLOBAL_GLOBALLA, + + THR_COMP_NEAR_NEARL2A, + THR_COMP_NEW_NEWL2A, + THR_COMP_NEW_NEARESTL2A, + THR_COMP_NEAREST_NEWL2A, + THR_COMP_NEW_NEARL2A, + THR_COMP_NEAR_NEWL2A, + THR_COMP_GLOBAL_GLOBALL2A, + + THR_COMP_NEAR_NEARL3A, + THR_COMP_NEW_NEWL3A, + THR_COMP_NEW_NEARESTL3A, + THR_COMP_NEAREST_NEWL3A, + THR_COMP_NEW_NEARL3A, + THR_COMP_NEAR_NEWL3A, + THR_COMP_GLOBAL_GLOBALL3A, + + THR_COMP_NEAR_NEARGA, + THR_COMP_NEW_NEWGA, + THR_COMP_NEW_NEARESTGA, + THR_COMP_NEAREST_NEWGA, + THR_COMP_NEW_NEARGA, + THR_COMP_NEAR_NEWGA, + THR_COMP_GLOBAL_GLOBALGA, + + THR_COMP_NEAR_NEARL2B, + THR_COMP_NEW_NEWL2B, + THR_COMP_NEW_NEARESTL2B, + THR_COMP_NEAREST_NEWL2B, + THR_COMP_NEW_NEARL2B, + THR_COMP_NEAR_NEWL2B, + THR_COMP_GLOBAL_GLOBALL2B, + + THR_COMP_NEAR_NEARL3B, + THR_COMP_NEW_NEWL3B, + THR_COMP_NEW_NEARESTL3B, + THR_COMP_NEAREST_NEWL3B, + THR_COMP_NEW_NEARL3B, + THR_COMP_NEAR_NEWL3B, + THR_COMP_GLOBAL_GLOBALL3B, + + THR_COMP_NEAR_NEARGB, + THR_COMP_NEW_NEWGB, + THR_COMP_NEW_NEARESTGB, + THR_COMP_NEAREST_NEWGB, + THR_COMP_NEW_NEARGB, + THR_COMP_NEAR_NEWGB, + THR_COMP_GLOBAL_GLOBALGB, + + THR_COMP_NEAR_NEARLA2, + THR_COMP_NEW_NEWLA2, + THR_COMP_NEW_NEARESTLA2, + THR_COMP_NEAREST_NEWLA2, + THR_COMP_NEW_NEARLA2, + THR_COMP_NEAR_NEWLA2, + THR_COMP_GLOBAL_GLOBALLA2, + + THR_COMP_NEAR_NEARL2A2, + THR_COMP_NEW_NEWL2A2, + THR_COMP_NEW_NEARESTL2A2, + THR_COMP_NEAREST_NEWL2A2, + THR_COMP_NEW_NEARL2A2, + THR_COMP_NEAR_NEWL2A2, + THR_COMP_GLOBAL_GLOBALL2A2, + + THR_COMP_NEAR_NEARL3A2, + THR_COMP_NEW_NEWL3A2, + THR_COMP_NEW_NEARESTL3A2, + THR_COMP_NEAREST_NEWL3A2, + THR_COMP_NEW_NEARL3A2, + 
THR_COMP_NEAR_NEWL3A2, + THR_COMP_GLOBAL_GLOBALL3A2, + + THR_COMP_NEAR_NEARGA2, + THR_COMP_NEW_NEWGA2, + THR_COMP_NEW_NEARESTGA2, + THR_COMP_NEAREST_NEWGA2, + THR_COMP_NEW_NEARGA2, + THR_COMP_NEAR_NEWGA2, + THR_COMP_GLOBAL_GLOBALGA2, + + THR_COMP_NEAR_NEARLL2, + THR_COMP_NEW_NEWLL2, + THR_COMP_NEW_NEARESTLL2, + THR_COMP_NEAREST_NEWLL2, + THR_COMP_NEW_NEARLL2, + THR_COMP_NEAR_NEWLL2, + THR_COMP_GLOBAL_GLOBALLL2, + + THR_COMP_NEAR_NEARLL3, + THR_COMP_NEW_NEWLL3, + THR_COMP_NEW_NEARESTLL3, + THR_COMP_NEAREST_NEWLL3, + THR_COMP_NEW_NEARLL3, + THR_COMP_NEAR_NEWLL3, + THR_COMP_GLOBAL_GLOBALLL3, + + THR_COMP_NEAR_NEARLG, + THR_COMP_NEW_NEWLG, + THR_COMP_NEW_NEARESTLG, + THR_COMP_NEAREST_NEWLG, + THR_COMP_NEW_NEARLG, + THR_COMP_NEAR_NEWLG, + THR_COMP_GLOBAL_GLOBALLG, + + THR_COMP_NEAR_NEARBA, + THR_COMP_NEW_NEWBA, + THR_COMP_NEW_NEARESTBA, + THR_COMP_NEAREST_NEWBA, + THR_COMP_NEW_NEARBA, + THR_COMP_NEAR_NEWBA, + THR_COMP_GLOBAL_GLOBALBA, + + THR_DC, + THR_PAETH, + THR_SMOOTH, + THR_SMOOTH_V, + THR_SMOOTH_H, + THR_H_PRED, + THR_V_PRED, + THR_D135_PRED, + THR_D203_PRED, + THR_D157_PRED, + THR_D67_PRED, + THR_D113_PRED, + THR_D45_PRED, +}; + +/*!\cond */ +typedef struct SingleInterModeState { + int64_t rd; + MV_REFERENCE_FRAME ref_frame; + int valid; +} SingleInterModeState; + +typedef struct InterModeSearchState { + int64_t best_rd; + int64_t best_skip_rd[2]; + MB_MODE_INFO best_mbmode; + int best_rate_y; + int best_rate_uv; + int best_mode_skippable; + int best_skip2; + THR_MODES best_mode_index; + int num_available_refs; + int64_t dist_refs[REF_FRAMES]; + int dist_order_refs[REF_FRAMES]; + int64_t mode_threshold[MAX_MODES]; + int64_t best_intra_rd; + unsigned int best_pred_sse; + + /*! + * \brief Keep track of best intra rd for use in compound mode. + */ + int64_t best_pred_rd[REFERENCE_MODES]; + // Save a set of single_newmv for each checked ref_mv. 
+ int_mv single_newmv[MAX_REF_MV_SEARCH][REF_FRAMES]; + int single_newmv_rate[MAX_REF_MV_SEARCH][REF_FRAMES]; + int single_newmv_valid[MAX_REF_MV_SEARCH][REF_FRAMES]; + int64_t modelled_rd[MB_MODE_COUNT][MAX_REF_MV_SEARCH][REF_FRAMES]; + // The rd of simple translation in single inter modes + int64_t simple_rd[MB_MODE_COUNT][MAX_REF_MV_SEARCH][REF_FRAMES]; + int64_t best_single_rd[REF_FRAMES]; + PREDICTION_MODE best_single_mode[REF_FRAMES]; + + // Single search results by [directions][modes][reference frames] + SingleInterModeState single_state[2][SINGLE_INTER_MODE_NUM][FWD_REFS]; + int single_state_cnt[2][SINGLE_INTER_MODE_NUM]; + SingleInterModeState single_state_modelled[2][SINGLE_INTER_MODE_NUM] + [FWD_REFS]; + int single_state_modelled_cnt[2][SINGLE_INTER_MODE_NUM]; + MV_REFERENCE_FRAME single_rd_order[2][SINGLE_INTER_MODE_NUM][FWD_REFS]; + IntraModeSearchState intra_search_state; + RD_STATS best_y_rdcost; +} InterModeSearchState; +/*!\endcond */ + +void av1_inter_mode_data_init(TileDataEnc *tile_data) { + for (int i = 0; i < BLOCK_SIZES_ALL; ++i) { + InterModeRdModel *md = &tile_data->inter_mode_rd_models[i]; + md->ready = 0; + md->num = 0; + md->dist_sum = 0; + md->ld_sum = 0; + md->sse_sum = 0; + md->sse_sse_sum = 0; + md->sse_ld_sum = 0; + } +} + +static int get_est_rate_dist(const TileDataEnc *tile_data, BLOCK_SIZE bsize, + int64_t sse, int *est_residue_cost, + int64_t *est_dist) { + const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize]; + if (md->ready) { + if (sse < md->dist_mean) { + *est_residue_cost = 0; + *est_dist = sse; + } else { + *est_dist = (int64_t)round(md->dist_mean); + const double est_ld = md->a * sse + md->b; + // Clamp estimated rate cost by INT_MAX / 2. + // TODO(angiebird@google.com): find better solution than clamping. 
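+      // The fitted model is ld ~= a * sse + b, where ld is the per-bit
+      // distortion reduction (sse - dist) / residue_cost gathered by
+      // inter_mode_data_push(); inverting it gives the residue-rate
+      // estimate below, guarded against a near-zero slope.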
+ if (fabs(est_ld) < 1e-2) { + *est_residue_cost = INT_MAX / 2; + } else { + double est_residue_cost_dbl = ((sse - md->dist_mean) / est_ld); + if (est_residue_cost_dbl < 0) { + *est_residue_cost = 0; + } else { + *est_residue_cost = + (int)AOMMIN((int64_t)round(est_residue_cost_dbl), INT_MAX / 2); + } + } + if (*est_residue_cost <= 0) { + *est_residue_cost = 0; + *est_dist = sse; + } + } + return 1; + } + return 0; +} + +void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult) { + for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) { + const int block_idx = inter_mode_data_block_idx(bsize); + InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize]; + if (block_idx == -1) continue; + if ((md->ready == 0 && md->num < 200) || (md->ready == 1 && md->num < 64)) { + continue; + } else { + if (md->ready == 0) { + md->dist_mean = md->dist_sum / md->num; + md->ld_mean = md->ld_sum / md->num; + md->sse_mean = md->sse_sum / md->num; + md->sse_sse_mean = md->sse_sse_sum / md->num; + md->sse_ld_mean = md->sse_ld_sum / md->num; + } else { + const double factor = 3; + md->dist_mean = + (md->dist_mean * factor + (md->dist_sum / md->num)) / (factor + 1); + md->ld_mean = + (md->ld_mean * factor + (md->ld_sum / md->num)) / (factor + 1); + md->sse_mean = + (md->sse_mean * factor + (md->sse_sum / md->num)) / (factor + 1); + md->sse_sse_mean = + (md->sse_sse_mean * factor + (md->sse_sse_sum / md->num)) / + (factor + 1); + md->sse_ld_mean = + (md->sse_ld_mean * factor + (md->sse_ld_sum / md->num)) / + (factor + 1); + } + + const double my = md->ld_mean; + const double mx = md->sse_mean; + const double dx = sqrt(md->sse_sse_mean); + const double dxy = md->sse_ld_mean; + + md->a = (dxy - mx * my) / (dx * dx - mx * mx); + md->b = my - md->a * mx; + md->ready = 1; + + md->num = 0; + md->dist_sum = 0; + md->ld_sum = 0; + md->sse_sum = 0; + md->sse_sse_sum = 0; + md->sse_ld_sum = 0; + } + (void)rdmult; + } +} + +static AOM_INLINE void inter_mode_data_push(TileDataEnc *tile_data, + BLOCK_SIZE bsize, int64_t sse, + int64_t dist, int residue_cost) { + if (residue_cost == 0 || sse == dist) return; + const int block_idx = inter_mode_data_block_idx(bsize); + if (block_idx == -1) return; + InterModeRdModel *rd_model = &tile_data->inter_mode_rd_models[bsize]; + if (rd_model->num < INTER_MODE_RD_DATA_OVERALL_SIZE) { + const double ld = (sse - dist) * 1. / residue_cost; + ++rd_model->num; + rd_model->dist_sum += dist; + rd_model->ld_sum += ld; + rd_model->sse_sum += sse; + rd_model->sse_sse_sum += (double)sse * (double)sse; + rd_model->sse_ld_sum += sse * ld; + } +} + +static AOM_INLINE void inter_modes_info_push(InterModesInfo *inter_modes_info, + int mode_rate, int64_t sse, + int64_t rd, RD_STATS *rd_cost, + RD_STATS *rd_cost_y, + RD_STATS *rd_cost_uv, + const MB_MODE_INFO *mbmi) { + const int num = inter_modes_info->num; + assert(num < MAX_INTER_MODES); + inter_modes_info->mbmi_arr[num] = *mbmi; + inter_modes_info->mode_rate_arr[num] = mode_rate; + inter_modes_info->sse_arr[num] = sse; + inter_modes_info->est_rd_arr[num] = rd; + inter_modes_info->rd_cost_arr[num] = *rd_cost; + inter_modes_info->rd_cost_y_arr[num] = *rd_cost_y; + inter_modes_info->rd_cost_uv_arr[num] = *rd_cost_uv; + ++inter_modes_info->num; +} + +static int compare_rd_idx_pair(const void *a, const void *b) { + if (((RdIdxPair *)a)->rd == ((RdIdxPair *)b)->rd) { + // To avoid inconsistency in qsort() ordering when two elements are equal, + // using idx as tie breaker. 
Refer aomedia:2928 + if (((RdIdxPair *)a)->idx == ((RdIdxPair *)b)->idx) + return 0; + else if (((RdIdxPair *)a)->idx > ((RdIdxPair *)b)->idx) + return 1; + else + return -1; + } else if (((const RdIdxPair *)a)->rd > ((const RdIdxPair *)b)->rd) { + return 1; + } else { + return -1; + } +} + +static AOM_INLINE void inter_modes_info_sort( + const InterModesInfo *inter_modes_info, RdIdxPair *rd_idx_pair_arr) { + if (inter_modes_info->num == 0) { + return; + } + for (int i = 0; i < inter_modes_info->num; ++i) { + rd_idx_pair_arr[i].idx = i; + rd_idx_pair_arr[i].rd = inter_modes_info->est_rd_arr[i]; + } + qsort(rd_idx_pair_arr, inter_modes_info->num, sizeof(rd_idx_pair_arr[0]), + compare_rd_idx_pair); +} + +// Similar to get_horver_correlation, but also takes into account first +// row/column, when computing horizontal/vertical correlation. +void av1_get_horver_correlation_full_c(const int16_t *diff, int stride, + int width, int height, float *hcorr, + float *vcorr) { + // The following notation is used: + // x - current pixel + // y - left neighbor pixel + // z - top neighbor pixel + int64_t x_sum = 0, x2_sum = 0, xy_sum = 0, xz_sum = 0; + int64_t x_firstrow = 0, x_finalrow = 0, x_firstcol = 0, x_finalcol = 0; + int64_t x2_firstrow = 0, x2_finalrow = 0, x2_firstcol = 0, x2_finalcol = 0; + + // First, process horizontal correlation on just the first row + x_sum += diff[0]; + x2_sum += diff[0] * diff[0]; + x_firstrow += diff[0]; + x2_firstrow += diff[0] * diff[0]; + for (int j = 1; j < width; ++j) { + const int16_t x = diff[j]; + const int16_t y = diff[j - 1]; + x_sum += x; + x_firstrow += x; + x2_sum += x * x; + x2_firstrow += x * x; + xy_sum += x * y; + } + + // Process vertical correlation in the first column + x_firstcol += diff[0]; + x2_firstcol += diff[0] * diff[0]; + for (int i = 1; i < height; ++i) { + const int16_t x = diff[i * stride]; + const int16_t z = diff[(i - 1) * stride]; + x_sum += x; + x_firstcol += x; + x2_sum += x * x; + x2_firstcol += x * x; + xz_sum += x * z; + } + + // Now process horiz and vert correlation through the rest unit + for (int i = 1; i < height; ++i) { + for (int j = 1; j < width; ++j) { + const int16_t x = diff[i * stride + j]; + const int16_t y = diff[i * stride + j - 1]; + const int16_t z = diff[(i - 1) * stride + j]; + x_sum += x; + x2_sum += x * x; + xy_sum += x * y; + xz_sum += x * z; + } + } + + for (int j = 0; j < width; ++j) { + x_finalrow += diff[(height - 1) * stride + j]; + x2_finalrow += + diff[(height - 1) * stride + j] * diff[(height - 1) * stride + j]; + } + for (int i = 0; i < height; ++i) { + x_finalcol += diff[i * stride + width - 1]; + x2_finalcol += diff[i * stride + width - 1] * diff[i * stride + width - 1]; + } + + int64_t xhor_sum = x_sum - x_finalcol; + int64_t xver_sum = x_sum - x_finalrow; + int64_t y_sum = x_sum - x_firstcol; + int64_t z_sum = x_sum - x_firstrow; + int64_t x2hor_sum = x2_sum - x2_finalcol; + int64_t x2ver_sum = x2_sum - x2_finalrow; + int64_t y2_sum = x2_sum - x2_firstcol; + int64_t z2_sum = x2_sum - x2_firstrow; + + const float num_hor = (float)(height * (width - 1)); + const float num_ver = (float)((height - 1) * width); + + const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor; + const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver; + + const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor; + const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver; + + const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor; + const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver; + 
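+  // What follows is a plain Pearson correlation, cov(x, neighbor) /
+  // (sigma_x * sigma_neighbor), computed from the raw sums above; the
+  // first/final row and column terms are subtracted out so that each
+  // operand covers exactly the samples that take part in the pair sums.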
+ if (xhor_var_n > 0 && y_var_n > 0) { + *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n); + *hcorr = *hcorr < 0 ? 0 : *hcorr; + } else { + *hcorr = 1.0; + } + if (xver_var_n > 0 && z_var_n > 0) { + *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n); + *vcorr = *vcorr < 0 ? 0 : *vcorr; + } else { + *vcorr = 1.0; + } +} + +static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x, + int64_t *sse_y) { + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + const MACROBLOCKD *xd = &x->e_mbd; + const MB_MODE_INFO *mbmi = xd->mi[0]; + int64_t total_sse = 0; + for (int plane = 0; plane < num_planes; ++plane) { + if (plane && !xd->is_chroma_ref) break; + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE bs = + get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y); + unsigned int sse; + + cpi->ppi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride, &sse); + total_sse += sse; + if (!plane && sse_y) *sse_y = sse; + } + total_sse <<= 4; + return total_sse; +} + +int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz) { + int i; + int64_t error = 0, sqcoeff = 0; + + for (i = 0; i < block_size; i++) { + const int diff = coeff[i] - dqcoeff[i]; + error += diff * diff; + sqcoeff += coeff[i] * coeff[i]; + } + + *ssz = sqcoeff; + return error; +} + +int64_t av1_block_error_lp_c(const int16_t *coeff, const int16_t *dqcoeff, + intptr_t block_size) { + int64_t error = 0; + + for (int i = 0; i < block_size; i++) { + const int diff = coeff[i] - dqcoeff[i]; + error += diff * diff; + } + + return error; +} + +#if CONFIG_AV1_HIGHBITDEPTH +int64_t av1_highbd_block_error_c(const tran_low_t *coeff, + const tran_low_t *dqcoeff, intptr_t block_size, + int64_t *ssz, int bd) { + int i; + int64_t error = 0, sqcoeff = 0; + int shift = 2 * (bd - 8); + int rounding = shift > 0 ? 
1 << (shift - 1) : 0; + + for (i = 0; i < block_size; i++) { + const int64_t diff = coeff[i] - dqcoeff[i]; + error += diff * diff; + sqcoeff += (int64_t)coeff[i] * (int64_t)coeff[i]; + } + assert(error >= 0 && sqcoeff >= 0); + error = (error + rounding) >> shift; + sqcoeff = (sqcoeff + rounding) >> shift; + + *ssz = sqcoeff; + return error; +} +#endif + +static int conditional_skipintra(PREDICTION_MODE mode, + PREDICTION_MODE best_intra_mode) { + if (mode == D113_PRED && best_intra_mode != V_PRED && + best_intra_mode != D135_PRED) + return 1; + if (mode == D67_PRED && best_intra_mode != V_PRED && + best_intra_mode != D45_PRED) + return 1; + if (mode == D203_PRED && best_intra_mode != H_PRED && + best_intra_mode != D45_PRED) + return 1; + if (mode == D157_PRED && best_intra_mode != H_PRED && + best_intra_mode != D135_PRED) + return 1; + return 0; +} + +static int cost_mv_ref(const ModeCosts *const mode_costs, PREDICTION_MODE mode, + int16_t mode_context) { + if (is_inter_compound_mode(mode)) { + return mode_costs + ->inter_compound_mode_cost[mode_context][INTER_COMPOUND_OFFSET(mode)]; + } + + int mode_cost = 0; + int16_t mode_ctx = mode_context & NEWMV_CTX_MASK; + + assert(is_inter_mode(mode)); + + if (mode == NEWMV) { + mode_cost = mode_costs->newmv_mode_cost[mode_ctx][0]; + return mode_cost; + } else { + mode_cost = mode_costs->newmv_mode_cost[mode_ctx][1]; + mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; + + if (mode == GLOBALMV) { + mode_cost += mode_costs->zeromv_mode_cost[mode_ctx][0]; + return mode_cost; + } else { + mode_cost += mode_costs->zeromv_mode_cost[mode_ctx][1]; + mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK; + mode_cost += mode_costs->refmv_mode_cost[mode_ctx][mode != NEARESTMV]; + return mode_cost; + } + } +} + +static INLINE PREDICTION_MODE get_single_mode(PREDICTION_MODE this_mode, + int ref_idx) { + return ref_idx ? 
compound_ref1_mode(this_mode) + : compound_ref0_mode(this_mode); +} + +static AOM_INLINE void estimate_ref_frame_costs( + const AV1_COMMON *cm, const MACROBLOCKD *xd, const ModeCosts *mode_costs, + int segment_id, unsigned int *ref_costs_single, + unsigned int (*ref_costs_comp)[REF_FRAMES]) { + int seg_ref_active = + segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME); + if (seg_ref_active) { + memset(ref_costs_single, 0, REF_FRAMES * sizeof(*ref_costs_single)); + int ref_frame; + for (ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) + memset(ref_costs_comp[ref_frame], 0, + REF_FRAMES * sizeof((*ref_costs_comp)[0])); + } else { + int intra_inter_ctx = av1_get_intra_inter_context(xd); + ref_costs_single[INTRA_FRAME] = + mode_costs->intra_inter_cost[intra_inter_ctx][0]; + unsigned int base_cost = mode_costs->intra_inter_cost[intra_inter_ctx][1]; + + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) + ref_costs_single[i] = base_cost; + + const int ctx_p1 = av1_get_pred_context_single_ref_p1(xd); + const int ctx_p2 = av1_get_pred_context_single_ref_p2(xd); + const int ctx_p3 = av1_get_pred_context_single_ref_p3(xd); + const int ctx_p4 = av1_get_pred_context_single_ref_p4(xd); + const int ctx_p5 = av1_get_pred_context_single_ref_p5(xd); + const int ctx_p6 = av1_get_pred_context_single_ref_p6(xd); + + // Determine cost of a single ref frame, where frame types are represented + // by a tree: + // Level 0: add cost whether this ref is a forward or backward ref + ref_costs_single[LAST_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][0]; + ref_costs_single[LAST2_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][0]; + ref_costs_single[LAST3_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][0]; + ref_costs_single[GOLDEN_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][0]; + ref_costs_single[BWDREF_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][1]; + ref_costs_single[ALTREF2_FRAME] += + mode_costs->single_ref_cost[ctx_p1][0][1]; + ref_costs_single[ALTREF_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][1]; + + // Level 1: if this ref is forward ref, + // add cost whether it is last/last2 or last3/golden + ref_costs_single[LAST_FRAME] += mode_costs->single_ref_cost[ctx_p3][2][0]; + ref_costs_single[LAST2_FRAME] += mode_costs->single_ref_cost[ctx_p3][2][0]; + ref_costs_single[LAST3_FRAME] += mode_costs->single_ref_cost[ctx_p3][2][1]; + ref_costs_single[GOLDEN_FRAME] += mode_costs->single_ref_cost[ctx_p3][2][1]; + + // Level 1: if this ref is backward ref + // then add cost whether this ref is altref or backward ref + ref_costs_single[BWDREF_FRAME] += mode_costs->single_ref_cost[ctx_p2][1][0]; + ref_costs_single[ALTREF2_FRAME] += + mode_costs->single_ref_cost[ctx_p2][1][0]; + ref_costs_single[ALTREF_FRAME] += mode_costs->single_ref_cost[ctx_p2][1][1]; + + // Level 2: further add cost whether this ref is last or last2 + ref_costs_single[LAST_FRAME] += mode_costs->single_ref_cost[ctx_p4][3][0]; + ref_costs_single[LAST2_FRAME] += mode_costs->single_ref_cost[ctx_p4][3][1]; + + // Level 2: last3 or golden + ref_costs_single[LAST3_FRAME] += mode_costs->single_ref_cost[ctx_p5][4][0]; + ref_costs_single[GOLDEN_FRAME] += mode_costs->single_ref_cost[ctx_p5][4][1]; + + // Level 2: bwdref or altref2 + ref_costs_single[BWDREF_FRAME] += mode_costs->single_ref_cost[ctx_p6][5][0]; + ref_costs_single[ALTREF2_FRAME] += + mode_costs->single_ref_cost[ctx_p6][5][1]; + + if (cm->current_frame.reference_mode != SINGLE_REFERENCE) { + // Similar to single ref, determine cost of compound ref frames. 
+ // cost_compound_refs = cost_first_ref + cost_second_ref + const int bwdref_comp_ctx_p = av1_get_pred_context_comp_bwdref_p(xd); + const int bwdref_comp_ctx_p1 = av1_get_pred_context_comp_bwdref_p1(xd); + const int ref_comp_ctx_p = av1_get_pred_context_comp_ref_p(xd); + const int ref_comp_ctx_p1 = av1_get_pred_context_comp_ref_p1(xd); + const int ref_comp_ctx_p2 = av1_get_pred_context_comp_ref_p2(xd); + + const int comp_ref_type_ctx = av1_get_comp_reference_type_context(xd); + unsigned int ref_bicomp_costs[REF_FRAMES] = { 0 }; + + ref_bicomp_costs[LAST_FRAME] = ref_bicomp_costs[LAST2_FRAME] = + ref_bicomp_costs[LAST3_FRAME] = ref_bicomp_costs[GOLDEN_FRAME] = + base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][1]; + ref_bicomp_costs[BWDREF_FRAME] = ref_bicomp_costs[ALTREF2_FRAME] = 0; + ref_bicomp_costs[ALTREF_FRAME] = 0; + + // cost of first ref frame + ref_bicomp_costs[LAST_FRAME] += + mode_costs->comp_ref_cost[ref_comp_ctx_p][0][0]; + ref_bicomp_costs[LAST2_FRAME] += + mode_costs->comp_ref_cost[ref_comp_ctx_p][0][0]; + ref_bicomp_costs[LAST3_FRAME] += + mode_costs->comp_ref_cost[ref_comp_ctx_p][0][1]; + ref_bicomp_costs[GOLDEN_FRAME] += + mode_costs->comp_ref_cost[ref_comp_ctx_p][0][1]; + + ref_bicomp_costs[LAST_FRAME] += + mode_costs->comp_ref_cost[ref_comp_ctx_p1][1][0]; + ref_bicomp_costs[LAST2_FRAME] += + mode_costs->comp_ref_cost[ref_comp_ctx_p1][1][1]; + + ref_bicomp_costs[LAST3_FRAME] += + mode_costs->comp_ref_cost[ref_comp_ctx_p2][2][0]; + ref_bicomp_costs[GOLDEN_FRAME] += + mode_costs->comp_ref_cost[ref_comp_ctx_p2][2][1]; + + // cost of second ref frame + ref_bicomp_costs[BWDREF_FRAME] += + mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p][0][0]; + ref_bicomp_costs[ALTREF2_FRAME] += + mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p][0][0]; + ref_bicomp_costs[ALTREF_FRAME] += + mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p][0][1]; + + ref_bicomp_costs[BWDREF_FRAME] += + mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p1][1][0]; + ref_bicomp_costs[ALTREF2_FRAME] += + mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p1][1][1]; + + // cost: if one ref frame is forward ref, the other ref is backward ref + int ref0, ref1; + for (ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) { + for (ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1) { + ref_costs_comp[ref0][ref1] = + ref_bicomp_costs[ref0] + ref_bicomp_costs[ref1]; + } + } + + // cost: if both ref frames are the same side. 
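+      // In AV1 only four same-side (unidirectional) pairs can be signaled:
+      // {LAST, LAST2}, {LAST, LAST3}, {LAST, GOLDEN} and {BWDREF, ALTREF}.
+      // Their costs are assembled below from the uni_comp_ref bits.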
+ const int uni_comp_ref_ctx_p = av1_get_pred_context_uni_comp_ref_p(xd); + const int uni_comp_ref_ctx_p1 = av1_get_pred_context_uni_comp_ref_p1(xd); + const int uni_comp_ref_ctx_p2 = av1_get_pred_context_uni_comp_ref_p2(xd); + ref_costs_comp[LAST_FRAME][LAST2_FRAME] = + base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] + + mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] + + mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][0]; + ref_costs_comp[LAST_FRAME][LAST3_FRAME] = + base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] + + mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] + + mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] + + mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][0]; + ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] = + base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] + + mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] + + mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] + + mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][1]; + ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] = + base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] + + mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][1]; + } else { + int ref0, ref1; + for (ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) { + for (ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1) + ref_costs_comp[ref0][ref1] = 512; + } + ref_costs_comp[LAST_FRAME][LAST2_FRAME] = 512; + ref_costs_comp[LAST_FRAME][LAST3_FRAME] = 512; + ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] = 512; + ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] = 512; + } + } +} + +static AOM_INLINE void store_coding_context( +#if CONFIG_INTERNAL_STATS + MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int mode_index, +#else + MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, +#endif // CONFIG_INTERNAL_STATS + int skippable) { + MACROBLOCKD *const xd = &x->e_mbd; + + // Take a snapshot of the coding context so it can be + // restored if we decide to encode this way + ctx->rd_stats.skip_txfm = x->txfm_search_info.skip_txfm; + ctx->skippable = skippable; +#if CONFIG_INTERNAL_STATS + ctx->best_mode_index = mode_index; +#endif // CONFIG_INTERNAL_STATS + ctx->mic = *xd->mi[0]; + av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, &x->mbmi_ext, + av1_ref_frame_type(xd->mi[0]->ref_frame)); +} + +static AOM_INLINE void setup_buffer_ref_mvs_inter( + const AV1_COMP *const cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, + BLOCK_SIZE block_size, struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) { + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + const YV12_BUFFER_CONFIG *scaled_ref_frame = + av1_get_scaled_ref_frame(cpi, ref_frame); + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; + const struct scale_factors *const sf = + get_ref_scale_factors_const(cm, ref_frame); + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, ref_frame); + assert(yv12 != NULL); + + if (scaled_ref_frame) { + // Setup pred block based on scaled reference, because av1_mv_pred() doesn't + // support scaling. 
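+    // (The unscaled buffer is restored further below once av1_mv_pred() is
+    // done.)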
+ av1_setup_pred_block(xd, yv12_mb[ref_frame], scaled_ref_frame, NULL, NULL, + num_planes); + } else { + av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, sf, sf, num_planes); + } + + // Gets an initial list of candidate vectors from neighbours and orders them + av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count, + xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs, + mbmi_ext->mode_context); + // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and + // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs. + av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame); + // Further refinement that is encode side only to test the top few candidates + // in full and choose the best as the center point for subsequent searches. + // The current implementation doesn't support scaling. + av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12_mb[ref_frame][0].stride, + ref_frame, block_size); + + // Go back to unscaled reference. + if (scaled_ref_frame) { + // We had temporarily setup pred block based on scaled reference above. Go + // back to unscaled reference now, for subsequent use. + av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, sf, sf, num_planes); + } +} + +#define LEFT_TOP_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3) +#define RIGHT_BOTTOM_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3) + +// TODO(jingning): this mv clamping function should be block size dependent. +static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) { + const SubpelMvLimits mv_limits = { xd->mb_to_left_edge - LEFT_TOP_MARGIN, + xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN, + xd->mb_to_top_edge - LEFT_TOP_MARGIN, + xd->mb_to_bottom_edge + + RIGHT_BOTTOM_MARGIN }; + clamp_mv(mv, &mv_limits); +} + +/* If the current mode shares the same mv with other modes with higher cost, + * skip this mode. 
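+ * That is, when two modes would code an identical motion vector, only the
+ * cheaper one needs a full evaluation; the other can inherit its modelled RD
+ * and be skipped.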
*/ +static int skip_repeated_mv(const AV1_COMMON *const cm, + const MACROBLOCK *const x, + PREDICTION_MODE this_mode, + const MV_REFERENCE_FRAME ref_frames[2], + InterModeSearchState *search_state) { + const int is_comp_pred = ref_frames[1] > INTRA_FRAME; + const uint8_t ref_frame_type = av1_ref_frame_type(ref_frames); + const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; + const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type]; + PREDICTION_MODE compare_mode = MB_MODE_COUNT; + if (!is_comp_pred) { + if (this_mode == NEARMV) { + if (ref_mv_count == 0) { + // NEARMV has the same motion vector as NEARESTMV + compare_mode = NEARESTMV; + } + if (ref_mv_count == 1 && + cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) { + // NEARMV has the same motion vector as GLOBALMV + compare_mode = GLOBALMV; + } + } + if (this_mode == GLOBALMV) { + if (ref_mv_count == 0 && + cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) { + // GLOBALMV has the same motion vector as NEARESTMV + compare_mode = NEARESTMV; + } + if (ref_mv_count == 1) { + // GLOBALMV has the same motion vector as NEARMV + compare_mode = NEARMV; + } + } + + if (compare_mode != MB_MODE_COUNT) { + // Use modelled_rd to check whether compare mode was searched + if (search_state->modelled_rd[compare_mode][0][ref_frames[0]] != + INT64_MAX) { + const int16_t mode_ctx = + av1_mode_context_analyzer(mbmi_ext->mode_context, ref_frames); + const int compare_cost = + cost_mv_ref(&x->mode_costs, compare_mode, mode_ctx); + const int this_cost = cost_mv_ref(&x->mode_costs, this_mode, mode_ctx); + + // Only skip if the mode cost is larger than compare mode cost + if (this_cost > compare_cost) { + search_state->modelled_rd[this_mode][0][ref_frames[0]] = + search_state->modelled_rd[compare_mode][0][ref_frames[0]]; + return 1; + } + } + } + } + return 0; +} + +static INLINE int clamp_and_check_mv(int_mv *out_mv, int_mv in_mv, + const AV1_COMMON *cm, + const MACROBLOCK *x) { + const MACROBLOCKD *const xd = &x->e_mbd; + *out_mv = in_mv; + lower_mv_precision(&out_mv->as_mv, cm->features.allow_high_precision_mv, + cm->features.cur_frame_force_integer_mv); + clamp_mv2(&out_mv->as_mv, xd); + return av1_is_fullmv_in_range(&x->mv_limits, + get_fullmv_from_mv(&out_mv->as_mv)); +} + +// To use single newmv directly for compound modes, need to clamp the mv to the +// valid mv range. Without this, encoder would generate out of range mv, and +// this is seen in 8k encoding. +static INLINE void clamp_mv_in_range(MACROBLOCK *const x, int_mv *mv, + int ref_idx) { + const int_mv ref_mv = av1_get_ref_mv(x, ref_idx); + SubpelMvLimits mv_limits; + + av1_set_subpel_mv_search_range(&mv_limits, &x->mv_limits, &ref_mv.as_mv); + clamp_mv(&mv->as_mv, &mv_limits); +} + +static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, + const BLOCK_SIZE bsize, int_mv *cur_mv, + int *const rate_mv, HandleInterModeArgs *const args, + inter_mode_info *mode_info) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int is_comp_pred = has_second_ref(mbmi); + const PREDICTION_MODE this_mode = mbmi->mode; + const int refs[2] = { mbmi->ref_frame[0], + mbmi->ref_frame[1] < 0 ? 
0 : mbmi->ref_frame[1] }; + const int ref_mv_idx = mbmi->ref_mv_idx; + + if (is_comp_pred) { + const int valid_mv0 = args->single_newmv_valid[ref_mv_idx][refs[0]]; + const int valid_mv1 = args->single_newmv_valid[ref_mv_idx][refs[1]]; + if (this_mode == NEW_NEWMV) { + if (valid_mv0) { + cur_mv[0].as_int = args->single_newmv[ref_mv_idx][refs[0]].as_int; + clamp_mv_in_range(x, &cur_mv[0], 0); + } + if (valid_mv1) { + cur_mv[1].as_int = args->single_newmv[ref_mv_idx][refs[1]].as_int; + clamp_mv_in_range(x, &cur_mv[1], 1); + } + *rate_mv = 0; + for (int i = 0; i < 2; ++i) { + const int_mv ref_mv = av1_get_ref_mv(x, i); + *rate_mv += av1_mv_bit_cost(&cur_mv[i].as_mv, &ref_mv.as_mv, + x->mv_costs->nmv_joint_cost, + x->mv_costs->mv_cost_stack, MV_COST_WEIGHT); + } + } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) { + if (valid_mv1) { + cur_mv[1].as_int = args->single_newmv[ref_mv_idx][refs[1]].as_int; + clamp_mv_in_range(x, &cur_mv[1], 1); + } + const int_mv ref_mv = av1_get_ref_mv(x, 1); + *rate_mv = av1_mv_bit_cost(&cur_mv[1].as_mv, &ref_mv.as_mv, + x->mv_costs->nmv_joint_cost, + x->mv_costs->mv_cost_stack, MV_COST_WEIGHT); + } else { + assert(this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV); + if (valid_mv0) { + cur_mv[0].as_int = args->single_newmv[ref_mv_idx][refs[0]].as_int; + clamp_mv_in_range(x, &cur_mv[0], 0); + } + const int_mv ref_mv = av1_get_ref_mv(x, 0); + *rate_mv = av1_mv_bit_cost(&cur_mv[0].as_mv, &ref_mv.as_mv, + x->mv_costs->nmv_joint_cost, + x->mv_costs->mv_cost_stack, MV_COST_WEIGHT); + } + } else { + // Single ref case. + const int ref_idx = 0; + int search_range = INT_MAX; + + if (cpi->sf.mv_sf.reduce_search_range && mbmi->ref_mv_idx > 0) { + const MV ref_mv = av1_get_ref_mv(x, ref_idx).as_mv; + int min_mv_diff = INT_MAX; + int best_match = -1; + MV prev_ref_mv[2] = { { 0 } }; + for (int idx = 0; idx < mbmi->ref_mv_idx; ++idx) { + prev_ref_mv[idx] = av1_get_ref_mv_from_stack(ref_idx, mbmi->ref_frame, + idx, &x->mbmi_ext) + .as_mv; + const int ref_mv_diff = AOMMAX(abs(ref_mv.row - prev_ref_mv[idx].row), + abs(ref_mv.col - prev_ref_mv[idx].col)); + + if (min_mv_diff > ref_mv_diff) { + min_mv_diff = ref_mv_diff; + best_match = idx; + } + } + + if (min_mv_diff < (16 << 3)) { + if (args->single_newmv_valid[best_match][refs[0]]) { + search_range = min_mv_diff; + search_range += + AOMMAX(abs(args->single_newmv[best_match][refs[0]].as_mv.row - + prev_ref_mv[best_match].row), + abs(args->single_newmv[best_match][refs[0]].as_mv.col - + prev_ref_mv[best_match].col)); + // Get full pixel search range. + search_range = (search_range + 4) >> 3; + } + } + } + + int_mv best_mv; + av1_single_motion_search(cpi, x, bsize, ref_idx, rate_mv, search_range, + mode_info, &best_mv, args); + if (best_mv.as_int == INVALID_MV) return INT64_MAX; + + args->single_newmv[ref_mv_idx][refs[0]] = best_mv; + args->single_newmv_rate[ref_mv_idx][refs[0]] = *rate_mv; + args->single_newmv_valid[ref_mv_idx][refs[0]] = 1; + cur_mv[0].as_int = best_mv.as_int; + + // Return after single_newmv is set. 
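+    // Note that the cache above is filled before this early return so that
+    // compound modes evaluated later can still reuse the motion search result
+    // through single_newmv_valid.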
+ if (mode_info[mbmi->ref_mv_idx].skip) return INT64_MAX; + } + + return 0; +} + +static INLINE void update_mode_start_end_index( + const AV1_COMP *const cpi, const MB_MODE_INFO *const mbmi, + int *mode_index_start, int *mode_index_end, int last_motion_mode_allowed, + int interintra_allowed, int eval_motion_mode) { + *mode_index_start = (int)SIMPLE_TRANSLATION; + *mode_index_end = (int)last_motion_mode_allowed + interintra_allowed; + if (cpi->sf.winner_mode_sf.motion_mode_for_winner_cand) { + if (!eval_motion_mode) { + *mode_index_end = (int)SIMPLE_TRANSLATION; + } else { + // Set the start index appropriately to process motion modes other than + // simple translation + *mode_index_start = 1; + } + } + if (cpi->sf.inter_sf.extra_prune_warped && mbmi->bsize > BLOCK_16X16) + *mode_index_end = SIMPLE_TRANSLATION; +} + +/*!\brief AV1 motion mode search + * + * \ingroup inter_mode_search + * Function to search over and determine the motion mode. It will update + * mbmi->motion_mode to one of SIMPLE_TRANSLATION, OBMC_CAUSAL, or + * WARPED_CAUSAL and determine any necessary side information for the selected + * motion mode. It will also perform the full transform search, unless the + * input parameter do_tx_search indicates to do an estimation of the RD rather + * than an RD corresponding to a full transform search. It will return the + * RD for the final motion_mode. + * Do the RD search for a given inter mode and compute all information relevant + * to the input mode. It will compute the best MV, + * compound parameters (if the mode is a compound mode) and interpolation filter + * parameters. + * + * \param[in] cpi Top-level encoder structure. + * \param[in] tile_data Pointer to struct holding adaptive + * data/contexts/models for the tile during + * encoding. + * \param[in] x Pointer to struct holding all the data for + * the current macroblock. + * \param[in] bsize Current block size. + * \param[in,out] rd_stats Struct to keep track of the overall RD + * information. + * \param[in,out] rd_stats_y Struct to keep track of the RD information + * for only the Y plane. + * \param[in,out] rd_stats_uv Struct to keep track of the RD information + * for only the UV planes. + * \param[in] args HandleInterModeArgs struct holding + * miscellaneous arguments for inter mode + * search. See the documentation for this + * struct for a description of each member. + * \param[in] ref_best_rd Best RD found so far for this block. + * It is used for early termination of this + * search if the RD exceeds this value. + * \param[in,out] ref_skip_rd A length 2 array, where skip_rd[0] is the + * best total RD for a skip mode so far, and + * skip_rd[1] is the best RD for a skip mode so + * far in luma. This is used as a speed feature + * to skip the transform search if the computed + * skip RD for the current mode is not better + * than the best skip_rd so far. + * \param[in,out] rate_mv The rate associated with the motion vectors. + * This will be modified if a motion search is + * done in the motion mode search. + * \param[in,out] orig_dst A prediction buffer to hold a computed + * prediction. This will eventually hold the + * final prediction, and the tmp_dst info will + * be copied here. + * \param[in,out] best_est_rd Estimated RD for motion mode search if + * do_tx_search (see below) is 0. + * \param[in] do_tx_search Parameter to indicate whether or not to do + * a full transform search. 
This will compute
+ *                                an estimated RD for the modes without the
+ *                                transform search and later perform the full
+ *                                transform search on the best candidates.
+ * \param[in]     inter_modes_info InterModesInfo struct to hold inter mode
+ *                                information to perform a full transform
+ *                                search only on winning candidates searched
+ *                                with an estimate for transform coding RD.
+ * \param[in]     eval_motion_mode Boolean whether or not to evaluate
+ *                                motion modes other than SIMPLE_TRANSLATION.
+ * \param[out]    yrd             Stores the rdcost corresponding to encoding
+ *                                the luma plane.
+ * \return Returns INT64_MAX if the determined motion mode is invalid and the
+ * current motion mode being tested should be skipped. It returns 0 if the
+ * motion mode search is a success.
+ */
+static int64_t motion_mode_rd(
+    const AV1_COMP *const cpi, TileDataEnc *tile_data, MACROBLOCK *const x,
+    BLOCK_SIZE bsize, RD_STATS *rd_stats, RD_STATS *rd_stats_y,
+    RD_STATS *rd_stats_uv, HandleInterModeArgs *const args, int64_t ref_best_rd,
+    int64_t *ref_skip_rd, int *rate_mv, const BUFFER_SET *orig_dst,
+    int64_t *best_est_rd, int do_tx_search, InterModesInfo *inter_modes_info,
+    int eval_motion_mode, int64_t *yrd) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const FeatureFlags *const features = &cm->features;
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const int is_comp_pred = has_second_ref(mbmi);
+  const PREDICTION_MODE this_mode = mbmi->mode;
+  const int rate2_nocoeff = rd_stats->rate;
+  int best_xskip_txfm = 0;
+  RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv;
+  uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  const int rate_mv0 = *rate_mv;
+  const int interintra_allowed = cm->seq_params->enable_interintra_compound &&
+                                 is_interintra_allowed(mbmi) &&
+                                 mbmi->compound_idx;
+  WARP_SAMPLE_INFO *const warp_sample_info =
+      &x->warp_sample_info[mbmi->ref_frame[0]];
+  int *pts0 = warp_sample_info->pts;
+  int *pts_inref0 = warp_sample_info->pts_inref;
+
+  assert(mbmi->ref_frame[1] != INTRA_FRAME);
+  const MV_REFERENCE_FRAME ref_frame_1 = mbmi->ref_frame[1];
+  av1_invalid_rd_stats(&best_rd_stats);
+  mbmi->num_proj_ref = 1;  // assume num_proj_ref >=1
+  MOTION_MODE last_motion_mode_allowed = SIMPLE_TRANSLATION;
+  *yrd = INT64_MAX;
+  if (features->switchable_motion_mode) {
+    // Determine which motion modes to search if more than SIMPLE_TRANSLATION
+    // is allowed.
+    last_motion_mode_allowed = motion_mode_allowed(
+        xd->global_motion, xd, mbmi, features->allow_warped_motion);
+  }
+
+  if (last_motion_mode_allowed == WARPED_CAUSAL) {
+    // Collect projection samples used in least squares approximation of
+    // the warped motion parameters if WARPED_CAUSAL is going to be searched.
+    if (warp_sample_info->num < 0) {
+      warp_sample_info->num = av1_findSamples(cm, xd, pts0, pts_inref0);
+    }
+    mbmi->num_proj_ref = warp_sample_info->num;
+  }
+  const int total_samples = mbmi->num_proj_ref;
+  if (total_samples == 0) {
+    // Do not search WARPED_CAUSAL if there are no samples to use to determine
+    // warped parameters.
+    last_motion_mode_allowed = OBMC_CAUSAL;
+  }
+
+  const MB_MODE_INFO base_mbmi = *mbmi;
+  MB_MODE_INFO best_mbmi;
+  const int interp_filter = features->interp_filter;
+  const int switchable_rate =
+      av1_is_interp_needed(xd)
+          ?
av1_get_switchable_rate(x, xd, interp_filter, + cm->seq_params->enable_dual_filter) + : 0; + int64_t best_rd = INT64_MAX; + int best_rate_mv = rate_mv0; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + int mode_index_start, mode_index_end; + const int txfm_rd_gate_level = + get_txfm_rd_gate_level(cm->seq_params->enable_masked_compound, + cpi->sf.inter_sf.txfm_rd_gate_level, bsize, + TX_SEARCH_MOTION_MODE, eval_motion_mode); + + // Modify the start and end index according to speed features. For example, + // if SIMPLE_TRANSLATION has already been searched according to + // the motion_mode_for_winner_cand speed feature, update the mode_index_start + // to avoid searching it again. + update_mode_start_end_index(cpi, mbmi, &mode_index_start, &mode_index_end, + last_motion_mode_allowed, interintra_allowed, + eval_motion_mode); + // Main function loop. This loops over all of the possible motion modes and + // computes RD to determine the best one. This process includes computing + // any necessary side information for the motion mode and performing the + // transform search. + for (int mode_index = mode_index_start; mode_index <= mode_index_end; + mode_index++) { + if (args->skip_motion_mode && mode_index) continue; + int tmp_rate2 = rate2_nocoeff; + const int is_interintra_mode = mode_index > (int)last_motion_mode_allowed; + int tmp_rate_mv = rate_mv0; + + *mbmi = base_mbmi; + if (is_interintra_mode) { + // Only use SIMPLE_TRANSLATION for interintra + mbmi->motion_mode = SIMPLE_TRANSLATION; + } else { + mbmi->motion_mode = (MOTION_MODE)mode_index; + assert(mbmi->ref_frame[1] != INTRA_FRAME); + } + + // Do not search OBMC if the probability of selecting it is below a + // predetermined threshold for this update_type and block size. + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + int use_actual_frame_probs = 1; + int prune_obmc; +#if CONFIG_FPMT_TEST + use_actual_frame_probs = + (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 0 : 1; + if (!use_actual_frame_probs) { + prune_obmc = cpi->ppi->temp_frame_probs.obmc_probs[update_type][bsize] < + cpi->sf.inter_sf.prune_obmc_prob_thresh; + } +#endif + if (use_actual_frame_probs) { + prune_obmc = cpi->ppi->frame_probs.obmc_probs[update_type][bsize] < + cpi->sf.inter_sf.prune_obmc_prob_thresh; + } + if ((!cpi->oxcf.motion_mode_cfg.enable_obmc || prune_obmc) && + mbmi->motion_mode == OBMC_CAUSAL) + continue; + + if (mbmi->motion_mode == SIMPLE_TRANSLATION && !is_interintra_mode) { + // SIMPLE_TRANSLATION mode: no need to recalculate. 
+ // The prediction is calculated before motion_mode_rd() is called in + // handle_inter_mode() + } else if (mbmi->motion_mode == OBMC_CAUSAL) { + const uint32_t cur_mv = mbmi->mv[0].as_int; + // OBMC_CAUSAL not allowed for compound prediction + assert(!is_comp_pred); + if (have_newmv_in_inter_mode(this_mode)) { + av1_single_motion_search(cpi, x, bsize, 0, &tmp_rate_mv, INT_MAX, NULL, + &mbmi->mv[0], NULL); + tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv; + } + if ((mbmi->mv[0].as_int != cur_mv) || eval_motion_mode) { + // Build the predictor according to the current motion vector if it has + // not already been built + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + 0, av1_num_planes(cm) - 1); + } + // Build the inter predictor by blending the predictor corresponding to + // this MV, and the neighboring blocks using the OBMC model + av1_build_obmc_inter_prediction( + cm, xd, args->above_pred_buf, args->above_pred_stride, + args->left_pred_buf, args->left_pred_stride); +#if !CONFIG_REALTIME_ONLY + } else if (mbmi->motion_mode == WARPED_CAUSAL) { + int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; + mbmi->motion_mode = WARPED_CAUSAL; + mbmi->wm_params.wmtype = DEFAULT_WMTYPE; + mbmi->interp_filters = + av1_broadcast_interp_filter(av1_unswitchable_filter(interp_filter)); + + memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0)); + memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0)); + // Select the samples according to motion vector difference + if (mbmi->num_proj_ref > 1) { + mbmi->num_proj_ref = av1_selectSamples( + &mbmi->mv[0].as_mv, pts, pts_inref, mbmi->num_proj_ref, bsize); + } + + // Compute the warped motion parameters with a least squares fit + // using the collected samples + if (!av1_find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize, + mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col, + &mbmi->wm_params, mi_row, mi_col)) { + assert(!is_comp_pred); + if (have_newmv_in_inter_mode(this_mode)) { + // Refine MV for NEWMV mode + const int_mv mv0 = mbmi->mv[0]; + const WarpedMotionParams wm_params0 = mbmi->wm_params; + const int num_proj_ref0 = mbmi->num_proj_ref; + + const int_mv ref_mv = av1_get_ref_mv(x, 0); + SUBPEL_MOTION_SEARCH_PARAMS ms_params; + av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, + &ref_mv.as_mv, NULL); + + // Refine MV in a small range. + av1_refine_warped_mv(xd, cm, &ms_params, bsize, pts0, pts_inref0, + total_samples, cpi->sf.mv_sf.warp_search_method, + cpi->sf.mv_sf.warp_search_iters); + + if (mv0.as_int != mbmi->mv[0].as_int) { + // Keep the refined MV and WM parameters. + tmp_rate_mv = av1_mv_bit_cost( + &mbmi->mv[0].as_mv, &ref_mv.as_mv, x->mv_costs->nmv_joint_cost, + x->mv_costs->mv_cost_stack, MV_COST_WEIGHT); + tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv; + } else { + // Restore the old MV and WM parameters. 
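+            // The refined MV landed back on its starting point, so roll back
+            // to the MV, warp parameters and sample count saved before the
+            // refinement.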
+ mbmi->mv[0] = mv0; + mbmi->wm_params = wm_params0; + mbmi->num_proj_ref = num_proj_ref0; + } + } + + // Build the warped predictor + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, + av1_num_planes(cm) - 1); + } else { + continue; + } +#endif // !CONFIG_REALTIME_ONLY + } else if (is_interintra_mode) { + const int ret = + av1_handle_inter_intra_mode(cpi, x, bsize, mbmi, args, ref_best_rd, + &tmp_rate_mv, &tmp_rate2, orig_dst); + if (ret < 0) continue; + } + + // If we are searching newmv and the mv is the same as refmv, skip the + // current mode + if (!av1_check_newmv_joint_nonzero(cm, x)) continue; + + // Update rd_stats for the current motion mode + txfm_info->skip_txfm = 0; + rd_stats->dist = 0; + rd_stats->sse = 0; + rd_stats->skip_txfm = 1; + rd_stats->rate = tmp_rate2; + const ModeCosts *mode_costs = &x->mode_costs; + if (mbmi->motion_mode != WARPED_CAUSAL) rd_stats->rate += switchable_rate; + if (interintra_allowed) { + rd_stats->rate += + mode_costs->interintra_cost[size_group_lookup[bsize]] + [mbmi->ref_frame[1] == INTRA_FRAME]; + } + if ((last_motion_mode_allowed > SIMPLE_TRANSLATION) && + (mbmi->ref_frame[1] != INTRA_FRAME)) { + if (last_motion_mode_allowed == WARPED_CAUSAL) { + rd_stats->rate += + mode_costs->motion_mode_cost[bsize][mbmi->motion_mode]; + } else { + rd_stats->rate += + mode_costs->motion_mode_cost1[bsize][mbmi->motion_mode]; + } + } + + int64_t this_yrd = INT64_MAX; + + if (!do_tx_search) { + // Avoid doing a transform search here to speed up the overall mode + // search. It will be done later in the mode search if the current + // motion mode seems promising. + int64_t curr_sse = -1; + int64_t sse_y = -1; + int est_residue_cost = 0; + int64_t est_dist = 0; + int64_t est_rd = 0; + if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) { + curr_sse = get_sse(cpi, x, &sse_y); + const int has_est_rd = get_est_rate_dist(tile_data, bsize, curr_sse, + &est_residue_cost, &est_dist); + (void)has_est_rd; + assert(has_est_rd); + } else if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 2 || + cpi->sf.rt_sf.use_nonrd_pick_mode) { + model_rd_sb_fn[MODELRD_TYPE_MOTION_MODE_RD]( + cpi, bsize, x, xd, 0, num_planes - 1, &est_residue_cost, &est_dist, + NULL, &curr_sse, NULL, NULL, NULL); + sse_y = x->pred_sse[xd->mi[0]->ref_frame[0]]; + } + est_rd = RDCOST(x->rdmult, rd_stats->rate + est_residue_cost, est_dist); + if (est_rd * 0.80 > *best_est_rd) { + mbmi->ref_frame[1] = ref_frame_1; + continue; + } + const int mode_rate = rd_stats->rate; + rd_stats->rate += est_residue_cost; + rd_stats->dist = est_dist; + rd_stats->rdcost = est_rd; + if (rd_stats->rdcost < *best_est_rd) { + *best_est_rd = rd_stats->rdcost; + assert(sse_y >= 0); + ref_skip_rd[1] = txfm_rd_gate_level + ? 
RDCOST(x->rdmult, mode_rate, (sse_y << 4)) + : INT64_MAX; + } + if (cm->current_frame.reference_mode == SINGLE_REFERENCE) { + if (!is_comp_pred) { + assert(curr_sse >= 0); + inter_modes_info_push(inter_modes_info, mode_rate, curr_sse, + rd_stats->rdcost, rd_stats, rd_stats_y, + rd_stats_uv, mbmi); + } + } else { + assert(curr_sse >= 0); + inter_modes_info_push(inter_modes_info, mode_rate, curr_sse, + rd_stats->rdcost, rd_stats, rd_stats_y, + rd_stats_uv, mbmi); + } + mbmi->skip_txfm = 0; + } else { + // Perform full transform search + int64_t skip_rd = INT64_MAX; + int64_t skip_rdy = INT64_MAX; + if (txfm_rd_gate_level) { + // Check if the mode is good enough based on skip RD + int64_t sse_y = INT64_MAX; + int64_t curr_sse = get_sse(cpi, x, &sse_y); + skip_rd = RDCOST(x->rdmult, rd_stats->rate, curr_sse); + skip_rdy = RDCOST(x->rdmult, rd_stats->rate, (sse_y << 4)); + int eval_txfm = check_txfm_eval(x, bsize, ref_skip_rd[0], skip_rd, + txfm_rd_gate_level, 0); + if (!eval_txfm) continue; + } + + // Do transform search + const int mode_rate = rd_stats->rate; + if (!av1_txfm_search(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, + rd_stats->rate, ref_best_rd)) { + if (rd_stats_y->rate == INT_MAX && mode_index == 0) { + return INT64_MAX; + } + continue; + } + const int skip_ctx = av1_get_skip_txfm_context(xd); + const int y_rate = + rd_stats->skip_txfm + ? x->mode_costs.skip_txfm_cost[skip_ctx][1] + : (rd_stats_y->rate + x->mode_costs.skip_txfm_cost[skip_ctx][0]); + this_yrd = RDCOST(x->rdmult, y_rate + mode_rate, rd_stats_y->dist); + + const int64_t curr_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + if (curr_rd < ref_best_rd) { + ref_best_rd = curr_rd; + ref_skip_rd[0] = skip_rd; + ref_skip_rd[1] = skip_rdy; + } + if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) { + inter_mode_data_push( + tile_data, mbmi->bsize, rd_stats->sse, rd_stats->dist, + rd_stats_y->rate + rd_stats_uv->rate + + mode_costs->skip_txfm_cost[skip_ctx][mbmi->skip_txfm]); + } + } + + if (this_mode == GLOBALMV || this_mode == GLOBAL_GLOBALMV) { + if (is_nontrans_global_motion(xd, xd->mi[0])) { + mbmi->interp_filters = + av1_broadcast_interp_filter(av1_unswitchable_filter(interp_filter)); + } + } + + const int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + if (mode_index == 0) { + args->simple_rd[this_mode][mbmi->ref_mv_idx][mbmi->ref_frame[0]] = tmp_rd; + } + if (mode_index == 0 || tmp_rd < best_rd) { + // Update best_rd data if this is the best motion mode so far + best_mbmi = *mbmi; + best_rd = tmp_rd; + best_rd_stats = *rd_stats; + best_rd_stats_y = *rd_stats_y; + best_rate_mv = tmp_rate_mv; + *yrd = this_yrd; + if (num_planes > 1) best_rd_stats_uv = *rd_stats_uv; + memcpy(best_blk_skip, txfm_info->blk_skip, + sizeof(txfm_info->blk_skip[0]) * xd->height * xd->width); + av1_copy_array(best_tx_type_map, xd->tx_type_map, xd->height * xd->width); + best_xskip_txfm = mbmi->skip_txfm; + } + } + // Update RD and mbmi stats for selected motion mode + mbmi->ref_frame[1] = ref_frame_1; + *rate_mv = best_rate_mv; + if (best_rd == INT64_MAX || !av1_check_newmv_joint_nonzero(cm, x)) { + av1_invalid_rd_stats(rd_stats); + restore_dst_buf(xd, *orig_dst, num_planes); + return INT64_MAX; + } + *mbmi = best_mbmi; + *rd_stats = best_rd_stats; + *rd_stats_y = best_rd_stats_y; + if (num_planes > 1) *rd_stats_uv = best_rd_stats_uv; + memcpy(txfm_info->blk_skip, best_blk_skip, + sizeof(txfm_info->blk_skip[0]) * xd->height * xd->width); + av1_copy_array(xd->tx_type_map, best_tx_type_map, xd->height * 
xd->width); + txfm_info->skip_txfm = best_xskip_txfm; + + restore_dst_buf(xd, *orig_dst, num_planes); + return 0; +} + +static int64_t skip_mode_rd(RD_STATS *rd_stats, const AV1_COMP *const cpi, + MACROBLOCK *const x, BLOCK_SIZE bsize, + const BUFFER_SET *const orig_dst, int64_t best_rd) { + assert(bsize < BLOCK_SIZES_ALL); + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + int64_t total_sse = 0; + int64_t this_rd = INT64_MAX; + const int skip_mode_ctx = av1_get_skip_mode_context(xd); + rd_stats->rate = x->mode_costs.skip_mode_cost[skip_mode_ctx][1]; + + for (int plane = 0; plane < num_planes; ++plane) { + // Call av1_enc_build_inter_predictor() for one plane at a time. + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + plane, plane); + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + + av1_subtract_plane(x, plane_bsize, plane); + + int64_t sse = + av1_pixel_diff_dist(x, plane, 0, 0, plane_bsize, plane_bsize, NULL); + if (is_cur_buf_hbd(xd)) sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2); + sse <<= 4; + total_sse += sse; + // When current rd cost is more than the best rd, skip evaluation of + // remaining planes. + this_rd = RDCOST(x->rdmult, rd_stats->rate, total_sse); + if (this_rd > best_rd) break; + } + + rd_stats->dist = rd_stats->sse = total_sse; + rd_stats->rdcost = this_rd; + + restore_dst_buf(xd, *orig_dst, num_planes); + return 0; +} + +// Check NEARESTMV, NEARMV, GLOBALMV ref mvs for duplicate and skip the relevant +// mode +// Note(rachelbarker): This speed feature currently does not interact correctly +// with global motion. The issue is that, when global motion is used, GLOBALMV +// produces a different prediction to NEARESTMV/NEARMV even if the motion +// vectors are the same. Thus GLOBALMV should not be pruned in this case. 
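+// For example, with a ROTZOOM or AFFINE global model the GLOBALMV prediction
+// is warped per pixel, so it can differ from the NEARESTMV/NEARMV prediction
+// even when the coded motion vectors are identical. Compare skip_repeated_mv()
+// above, which only treats GLOBALMV as a duplicate when the global model's
+// wmtype is at most TRANSLATION.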
+static INLINE int check_repeat_ref_mv(const MB_MODE_INFO_EXT *mbmi_ext,
+                                      int ref_idx,
+                                      const MV_REFERENCE_FRAME *ref_frame,
+                                      PREDICTION_MODE single_mode) {
+  const uint8_t ref_frame_type = av1_ref_frame_type(ref_frame);
+  const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type];
+  assert(single_mode != NEWMV);
+  if (single_mode == NEARESTMV) {
+    return 0;
+  } else if (single_mode == NEARMV) {
+    // when ref_mv_count = 0, NEARESTMV and NEARMV are the same as GLOBALMV
+    // when ref_mv_count = 1, NEARMV is the same as GLOBALMV
+    if (ref_mv_count < 2) return 1;
+  } else if (single_mode == GLOBALMV) {
+    // when ref_mv_count == 0, GLOBALMV is the same as NEARESTMV
+    if (ref_mv_count == 0) return 1;
+    // when ref_mv_count == 1, NEARMV is the same as GLOBALMV
+    else if (ref_mv_count == 1)
+      return 0;
+
+    int stack_size = AOMMIN(USABLE_REF_MV_STACK_SIZE, ref_mv_count);
+    // Check whether GLOBALMV matches any mv in the ref_mv_stack
+    for (int ref_mv_idx = 0; ref_mv_idx < stack_size; ref_mv_idx++) {
+      int_mv this_mv;
+
+      if (ref_idx == 0)
+        this_mv = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
+      else
+        this_mv = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv;
+
+      if (this_mv.as_int == mbmi_ext->global_mvs[ref_frame[ref_idx]].as_int)
+        return 1;
+    }
+  }
+  return 0;
+}
+
+static INLINE int get_this_mv(int_mv *this_mv, PREDICTION_MODE this_mode,
+                              int ref_idx, int ref_mv_idx,
+                              int skip_repeated_ref_mv,
+                              const MV_REFERENCE_FRAME *ref_frame,
+                              const MB_MODE_INFO_EXT *mbmi_ext) {
+  const PREDICTION_MODE single_mode = get_single_mode(this_mode, ref_idx);
+  assert(is_inter_singleref_mode(single_mode));
+  if (single_mode == NEWMV) {
+    this_mv->as_int = INVALID_MV;
+  } else if (single_mode == GLOBALMV) {
+    if (skip_repeated_ref_mv &&
+        check_repeat_ref_mv(mbmi_ext, ref_idx, ref_frame, single_mode))
+      return 0;
+    *this_mv = mbmi_ext->global_mvs[ref_frame[ref_idx]];
+  } else {
+    assert(single_mode == NEARMV || single_mode == NEARESTMV);
+    const uint8_t ref_frame_type = av1_ref_frame_type(ref_frame);
+    const int ref_mv_offset = single_mode == NEARESTMV ? 0 : ref_mv_idx + 1;
+    if (ref_mv_offset < mbmi_ext->ref_mv_count[ref_frame_type]) {
+      assert(ref_mv_offset >= 0);
+      if (ref_idx == 0) {
+        *this_mv =
+            mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_offset].this_mv;
+      } else {
+        *this_mv =
+            mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_offset].comp_mv;
+      }
+    } else {
+      if (skip_repeated_ref_mv &&
+          check_repeat_ref_mv(mbmi_ext, ref_idx, ref_frame, single_mode))
+        return 0;
+      *this_mv = mbmi_ext->global_mvs[ref_frame[ref_idx]];
+    }
+  }
+  return 1;
+}
+
+// Skip NEARESTMV and NEARMV modes based on the ref mv weights computed
+// during ref mv list population
+static INLINE int skip_nearest_near_mv_using_refmv_weight(
+    const MACROBLOCK *const x, const PREDICTION_MODE this_mode,
+    const int8_t ref_frame_type, PREDICTION_MODE best_mode) {
+  if (this_mode != NEARESTMV && this_mode != NEARMV) return 0;
+  // Do not skip the mode if the current block has not yet obtained a valid
+  // inter mode.
+  if (!is_inter_mode(best_mode)) return 0;
+
+  const MACROBLOCKD *xd = &x->e_mbd;
+  // Do not skip the mode unless both the top and left neighboring blocks
+  // are available.
+  if (!xd->left_available || !xd->up_available) return 0;
+  const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+  const uint16_t *const ref_mv_weight = mbmi_ext->weight[ref_frame_type];
+  const int ref_mv_count =
+      AOMMIN(MAX_REF_MV_SEARCH, mbmi_ext->ref_mv_count[ref_frame_type]);
+
+  if (ref_mv_count == 0) return 0;
+  // If the ref mv list has at least one nearest candidate, do not prune
+  // NEARESTMV
+  if (this_mode == NEARESTMV && ref_mv_weight[0] >= REF_CAT_LEVEL) return 0;
+
+  // Count the number of ref mvs populated from nearest candidates
+  int nearest_refmv_count = 0;
+  for (int ref_mv_idx = 0; ref_mv_idx < ref_mv_count; ref_mv_idx++) {
+    if (ref_mv_weight[ref_mv_idx] >= REF_CAT_LEVEL) nearest_refmv_count++;
+  }
+
+  // nearest_refmv_count indicates how closely the block's motion matches that
+  // of its spatial neighbors. A smaller nearest_refmv_count relative to
+  // ref_mv_count means less correlation with the spatial neighbors, and hence
+  // less chance of NEARESTMV or NEARMV becoming the best mode, since these
+  // modes work well for blocks that share motion characteristics with their
+  // neighbors. Thus, NEARMV is pruned when nearest_refmv_count is relatively
+  // small compared to ref_mv_count, and NEARESTMV is pruned if none of the
+  // ref mvs are populated from nearest candidates.
+  const int prune_thresh = 1 + (ref_mv_count >= 2);
+  if (nearest_refmv_count < prune_thresh) return 1;
+  return 0;
+}
+
+// This function updates the non-NEWMV motion vectors for the current
+// prediction mode
+static INLINE int build_cur_mv(int_mv *cur_mv, PREDICTION_MODE this_mode,
+                               const AV1_COMMON *cm, const MACROBLOCK *x,
+                               int skip_repeated_ref_mv) {
+  const MACROBLOCKD *xd = &x->e_mbd;
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+  const int is_comp_pred = has_second_ref(mbmi);
+
+  int ret = 1;
+  for (int i = 0; i < is_comp_pred + 1; ++i) {
+    int_mv this_mv;
+    this_mv.as_int = INVALID_MV;
+    ret = get_this_mv(&this_mv, this_mode, i, mbmi->ref_mv_idx,
+                      skip_repeated_ref_mv, mbmi->ref_frame, &x->mbmi_ext);
+    if (!ret) return 0;
+    const PREDICTION_MODE single_mode = get_single_mode(this_mode, i);
+    if (single_mode == NEWMV) {
+      const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+      cur_mv[i] =
+          (i == 0) ?
x->mbmi_ext.ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx] + .this_mv + : x->mbmi_ext.ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx] + .comp_mv; + } else { + ret &= clamp_and_check_mv(cur_mv + i, this_mv, cm, x); + } + } + return ret; +} + +static INLINE int get_drl_cost(const MB_MODE_INFO *mbmi, + const MB_MODE_INFO_EXT *mbmi_ext, + const int (*const drl_mode_cost0)[2], + int8_t ref_frame_type) { + int cost = 0; + if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) { + for (int idx = 0; idx < 2; ++idx) { + if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { + uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx); + cost += drl_mode_cost0[drl_ctx][mbmi->ref_mv_idx != idx]; + if (mbmi->ref_mv_idx == idx) return cost; + } + } + return cost; + } + + if (have_nearmv_in_inter_mode(mbmi->mode)) { + for (int idx = 1; idx < 3; ++idx) { + if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { + uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx); + cost += drl_mode_cost0[drl_ctx][mbmi->ref_mv_idx != (idx - 1)]; + if (mbmi->ref_mv_idx == (idx - 1)) return cost; + } + } + return cost; + } + return cost; +} + +static INLINE int is_single_newmv_valid(const HandleInterModeArgs *const args, + const MB_MODE_INFO *const mbmi, + PREDICTION_MODE this_mode) { + for (int ref_idx = 0; ref_idx < 2; ++ref_idx) { + const PREDICTION_MODE single_mode = get_single_mode(this_mode, ref_idx); + const MV_REFERENCE_FRAME ref = mbmi->ref_frame[ref_idx]; + if (single_mode == NEWMV && + args->single_newmv_valid[mbmi->ref_mv_idx][ref] == 0) { + return 0; + } + } + return 1; +} + +static int get_drl_refmv_count(const MACROBLOCK *const x, + const MV_REFERENCE_FRAME *ref_frame, + PREDICTION_MODE mode) { + const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; + const int8_t ref_frame_type = av1_ref_frame_type(ref_frame); + const int has_nearmv = have_nearmv_in_inter_mode(mode) ? 1 : 0; + const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type]; + const int only_newmv = (mode == NEWMV || mode == NEW_NEWMV); + const int has_drl = + (has_nearmv && ref_mv_count > 2) || (only_newmv && ref_mv_count > 1); + const int ref_set = + has_drl ? AOMMIN(MAX_REF_MV_SEARCH, ref_mv_count - has_nearmv) : 1; + + return ref_set; +} + +// Checks if particular ref_mv_idx should be pruned. +static int prune_ref_mv_idx_using_qindex(const int reduce_inter_modes, + const int qindex, + const int ref_mv_idx) { + if (reduce_inter_modes >= 3) return 1; + // Q-index logic based pruning is enabled only for + // reduce_inter_modes = 2. + assert(reduce_inter_modes == 2); + // When reduce_inter_modes=2, pruning happens as below based on q index. + // For q index range between 0 and 85: prune if ref_mv_idx >= 1. + // For q index range between 86 and 170: prune if ref_mv_idx == 2. + // For q index range between 171 and 255: no pruning. + const int min_prune_ref_mv_idx = (qindex * 3 / QINDEX_RANGE) + 1; + return (ref_mv_idx >= min_prune_ref_mv_idx); +} + +// Whether this reference motion vector can be skipped, based on initial +// heuristics. 
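+// The breakouts below are, in order: a low ref mv weight for the LAST2/LAST3
+// references, q-index based pruning of higher ref_mv_idx values for refs other
+// than the nearest past/future ones, a compound mode whose cached single-ref
+// NEWMV is invalid, and an estimated rate cost that already exceeds
+// ref_best_rd on its own.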
+static bool ref_mv_idx_early_breakout( + const SPEED_FEATURES *const sf, + const RefFrameDistanceInfo *const ref_frame_dist_info, MACROBLOCK *x, + const HandleInterModeArgs *const args, int64_t ref_best_rd, + int ref_mv_idx) { + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; + const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + const int is_comp_pred = has_second_ref(mbmi); + if (sf->inter_sf.reduce_inter_modes && ref_mv_idx > 0) { + if (mbmi->ref_frame[0] == LAST2_FRAME || + mbmi->ref_frame[0] == LAST3_FRAME || + mbmi->ref_frame[1] == LAST2_FRAME || + mbmi->ref_frame[1] == LAST3_FRAME) { + const int has_nearmv = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0; + if (mbmi_ext->weight[ref_frame_type][ref_mv_idx + has_nearmv] < + REF_CAT_LEVEL) { + return true; + } + } + // TODO(any): Experiment with reduce_inter_modes for compound prediction + if (sf->inter_sf.reduce_inter_modes >= 2 && !is_comp_pred && + have_newmv_in_inter_mode(mbmi->mode)) { + if (mbmi->ref_frame[0] != ref_frame_dist_info->nearest_past_ref && + mbmi->ref_frame[0] != ref_frame_dist_info->nearest_future_ref) { + const int has_nearmv = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0; + const int do_prune = prune_ref_mv_idx_using_qindex( + sf->inter_sf.reduce_inter_modes, x->qindex, ref_mv_idx); + if (do_prune && + (mbmi_ext->weight[ref_frame_type][ref_mv_idx + has_nearmv] < + REF_CAT_LEVEL)) { + return true; + } + } + } + } + + mbmi->ref_mv_idx = ref_mv_idx; + if (is_comp_pred && (!is_single_newmv_valid(args, mbmi, mbmi->mode))) { + return true; + } + size_t est_rd_rate = args->ref_frame_cost + args->single_comp_cost; + const int drl_cost = get_drl_cost( + mbmi, mbmi_ext, x->mode_costs.drl_mode_cost0, ref_frame_type); + est_rd_rate += drl_cost; + if (RDCOST(x->rdmult, est_rd_rate, 0) > ref_best_rd && + mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) { + return true; + } + return false; +} + +// Compute the estimated RD cost for the motion vector with simple translation. 
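+// Only the luma predictor is built, using SIMPLE_TRANSLATION (and, for
+// compound blocks, plain COMPOUND_AVERAGE), and the residual cost comes from
+// the curve-fit model rather than a transform search, so this is a coarse but
+// cheap estimate.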
+static int64_t simple_translation_pred_rd(AV1_COMP *const cpi, MACROBLOCK *x,
+                                          RD_STATS *rd_stats,
+                                          HandleInterModeArgs *args,
+                                          int ref_mv_idx, int64_t ref_best_rd,
+                                          BLOCK_SIZE bsize) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+  const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+  const AV1_COMMON *cm = &cpi->common;
+  const int is_comp_pred = has_second_ref(mbmi);
+  const ModeCosts *mode_costs = &x->mode_costs;
+
+  struct macroblockd_plane *p = xd->plane;
+  const BUFFER_SET orig_dst = {
+    { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf },
+    { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride },
+  };
+  av1_init_rd_stats(rd_stats);
+
+  mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+  mbmi->comp_group_idx = 0;
+  mbmi->compound_idx = 1;
+  if (mbmi->ref_frame[1] == INTRA_FRAME) {
+    mbmi->ref_frame[1] = NONE_FRAME;
+  }
+  int16_t mode_ctx =
+      av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
+
+  mbmi->num_proj_ref = 0;
+  mbmi->motion_mode = SIMPLE_TRANSLATION;
+  mbmi->ref_mv_idx = ref_mv_idx;
+
+  rd_stats->rate += args->ref_frame_cost + args->single_comp_cost;
+  const int drl_cost =
+      get_drl_cost(mbmi, mbmi_ext, mode_costs->drl_mode_cost0, ref_frame_type);
+  rd_stats->rate += drl_cost;
+
+  int_mv cur_mv[2];
+  if (!build_cur_mv(cur_mv, mbmi->mode, cm, x, 0)) {
+    return INT64_MAX;
+  }
+  assert(have_nearmv_in_inter_mode(mbmi->mode));
+  for (int i = 0; i < is_comp_pred + 1; ++i) {
+    mbmi->mv[i].as_int = cur_mv[i].as_int;
+  }
+  const int ref_mv_cost = cost_mv_ref(mode_costs, mbmi->mode, mode_ctx);
+  rd_stats->rate += ref_mv_cost;
+
+  if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd) {
+    return INT64_MAX;
+  }
+
+  mbmi->motion_mode = SIMPLE_TRANSLATION;
+  mbmi->num_proj_ref = 0;
+  if (is_comp_pred) {
+    // Only compound_average
+    mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+    mbmi->comp_group_idx = 0;
+    mbmi->compound_idx = 1;
+  }
+  set_default_interp_filters(mbmi, cm->features.interp_filter);
+
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst, bsize,
+                                AOM_PLANE_Y, AOM_PLANE_Y);
+  int est_rate;
+  int64_t est_dist;
+  model_rd_sb_fn[MODELRD_CURVFIT](cpi, bsize, x, xd, 0, 0, &est_rate, &est_dist,
+                                  NULL, NULL, NULL, NULL, NULL);
+  return RDCOST(x->rdmult, rd_stats->rate + est_rate, est_dist);
+}
+
+// Represents a set of integers, from 0 to sizeof(int) * 8 - 1, as bits in
+// an integer. 0 for the i-th bit means that integer is excluded, 1 means
+// it is included.
+static INLINE void mask_set_bit(int *mask, int index) { *mask |= (1 << index); }
+
+static INLINE bool mask_check_bit(int mask, int index) {
+  return (mask >> index) & 0x1;
+}
+
+// Before performing the full MV search in handle_inter_mode, do a simple
+// translation search and see if we can eliminate any motion vectors.
+// Returns an integer where, if the i-th bit is set, it means that the i-th
+// motion vector should be searched. This is only set for NEARMV.
+static int ref_mv_idx_to_search(AV1_COMP *const cpi, MACROBLOCK *x,
+                                RD_STATS *rd_stats,
+                                HandleInterModeArgs *const args,
+                                int64_t ref_best_rd, BLOCK_SIZE bsize,
+                                const int ref_set) {
+  // If there is only one ref mv candidate, do not prune it; it is better
+  // to evaluate the single candidate than to prune it.
+ if (ref_set == 1) return 1; + AV1_COMMON *const cm = &cpi->common; + const MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const PREDICTION_MODE this_mode = mbmi->mode; + + // Only search indices if they have some chance of being good. + int good_indices = 0; + for (int i = 0; i < ref_set; ++i) { + if (ref_mv_idx_early_breakout(&cpi->sf, &cpi->ref_frame_dist_info, x, args, + ref_best_rd, i)) { + continue; + } + mask_set_bit(&good_indices, i); + } + + // Only prune in NEARMV mode, if the speed feature is set, and the block size + // is large enough. If these conditions are not met, return all good indices + // found so far. + if (!cpi->sf.inter_sf.prune_mode_search_simple_translation) + return good_indices; + if (!have_nearmv_in_inter_mode(this_mode)) return good_indices; + if (num_pels_log2_lookup[bsize] <= 6) return good_indices; + // Do not prune when there is internal resizing. TODO(elliottk) fix this + // so b/2384 can be resolved. + if (av1_is_scaled(get_ref_scale_factors(cm, mbmi->ref_frame[0])) || + (mbmi->ref_frame[1] > 0 && + av1_is_scaled(get_ref_scale_factors(cm, mbmi->ref_frame[1])))) { + return good_indices; + } + + // Calculate the RD cost for the motion vectors using simple translation. + int64_t idx_rdcost[] = { INT64_MAX, INT64_MAX, INT64_MAX }; + for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ++ref_mv_idx) { + // If this index is bad, ignore it. + if (!mask_check_bit(good_indices, ref_mv_idx)) { + continue; + } + idx_rdcost[ref_mv_idx] = simple_translation_pred_rd( + cpi, x, rd_stats, args, ref_mv_idx, ref_best_rd, bsize); + } + // Find the index with the best RD cost. + int best_idx = 0; + for (int i = 1; i < MAX_REF_MV_SEARCH; ++i) { + if (idx_rdcost[i] < idx_rdcost[best_idx]) { + best_idx = i; + } + } + // Only include indices that are good and within a % of the best. + const double dth = has_second_ref(mbmi) ? 1.05 : 1.001; + // If the simple translation cost is not within this multiple of the + // best RD, skip it. Note that the cutoff is derived experimentally. + const double ref_dth = 5; + int result = 0; + for (int i = 0; i < ref_set; ++i) { + if (mask_check_bit(good_indices, i) && + (1.0 * idx_rdcost[i]) / idx_rdcost[best_idx] < dth && + (1.0 * idx_rdcost[i]) / ref_best_rd < ref_dth) { + mask_set_bit(&result, i); + } + } + return result; +} + +/*!\brief Motion mode information for inter mode search speedup. + * + * Used in a speed feature to search motion modes other than + * SIMPLE_TRANSLATION only on winning candidates. + */ +typedef struct motion_mode_candidate { + /*! + * Mode info for the motion mode candidate. + */ + MB_MODE_INFO mbmi; + /*! + * Rate describing the cost of the motion vectors for this candidate. + */ + int rate_mv; + /*! + * Rate before motion mode search and transform coding is applied. + */ + int rate2_nocoeff; + /*! + * An integer value 0 or 1 which indicates whether or not to skip the motion + * mode search and default to SIMPLE_TRANSLATION as a speed feature for this + * candidate. + */ + int skip_motion_mode; + /*! + * Total RD cost for this candidate. 
+ */ + int64_t rd_cost; +} motion_mode_candidate; + +/*!\cond */ +typedef struct motion_mode_best_st_candidate { + motion_mode_candidate motion_mode_cand[MAX_WINNER_MOTION_MODES]; + int num_motion_mode_cand; +} motion_mode_best_st_candidate; + +// Checks if the current reference frame matches with neighbouring block's +// (top/left) reference frames +static AOM_INLINE int ref_match_found_in_nb_blocks(MB_MODE_INFO *cur_mbmi, + MB_MODE_INFO *nb_mbmi) { + MV_REFERENCE_FRAME nb_ref_frames[2] = { nb_mbmi->ref_frame[0], + nb_mbmi->ref_frame[1] }; + MV_REFERENCE_FRAME cur_ref_frames[2] = { cur_mbmi->ref_frame[0], + cur_mbmi->ref_frame[1] }; + const int is_cur_comp_pred = has_second_ref(cur_mbmi); + int match_found = 0; + + for (int i = 0; i < (is_cur_comp_pred + 1); i++) { + if ((cur_ref_frames[i] == nb_ref_frames[0]) || + (cur_ref_frames[i] == nb_ref_frames[1])) + match_found = 1; + } + return match_found; +} + +static AOM_INLINE int find_ref_match_in_above_nbs(const int total_mi_cols, + MACROBLOCKD *xd) { + if (!xd->up_available) return 1; + const int mi_col = xd->mi_col; + MB_MODE_INFO **cur_mbmi = xd->mi; + // prev_row_mi points into the mi array, starting at the beginning of the + // previous row. + MB_MODE_INFO **prev_row_mi = xd->mi - mi_col - 1 * xd->mi_stride; + const int end_col = AOMMIN(mi_col + xd->width, total_mi_cols); + uint8_t mi_step; + for (int above_mi_col = mi_col; above_mi_col < end_col; + above_mi_col += mi_step) { + MB_MODE_INFO **above_mi = prev_row_mi + above_mi_col; + mi_step = mi_size_wide[above_mi[0]->bsize]; + int match_found = 0; + if (is_inter_block(*above_mi)) + match_found = ref_match_found_in_nb_blocks(*cur_mbmi, *above_mi); + if (match_found) return 1; + } + return 0; +} + +static AOM_INLINE int find_ref_match_in_left_nbs(const int total_mi_rows, + MACROBLOCKD *xd) { + if (!xd->left_available) return 1; + const int mi_row = xd->mi_row; + MB_MODE_INFO **cur_mbmi = xd->mi; + // prev_col_mi points into the mi array, starting at the top of the + // previous column + MB_MODE_INFO **prev_col_mi = xd->mi - 1 - mi_row * xd->mi_stride; + const int end_row = AOMMIN(mi_row + xd->height, total_mi_rows); + uint8_t mi_step; + for (int left_mi_row = mi_row; left_mi_row < end_row; + left_mi_row += mi_step) { + MB_MODE_INFO **left_mi = prev_col_mi + left_mi_row * xd->mi_stride; + mi_step = mi_size_high[left_mi[0]->bsize]; + int match_found = 0; + if (is_inter_block(*left_mi)) + match_found = ref_match_found_in_nb_blocks(*cur_mbmi, *left_mi); + if (match_found) return 1; + } + return 0; +} +/*!\endcond */ + +/*! \brief Struct used to hold TPL data to + * narrow down parts of the inter mode search. + */ +typedef struct { + /*! + * The best inter cost out of all of the reference frames. + */ + int64_t best_inter_cost; + /*! + * The inter cost for each reference frame. 
+ */ + int64_t ref_inter_cost[INTER_REFS_PER_FRAME]; +} PruneInfoFromTpl; + +#if !CONFIG_REALTIME_ONLY +// TODO(Remya): Check if get_tpl_stats_b() can be reused +static AOM_INLINE void get_block_level_tpl_stats( + AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, int mi_col, int *valid_refs, + PruneInfoFromTpl *inter_cost_info_from_tpl) { + AV1_COMMON *const cm = &cpi->common; + + assert(IMPLIES(cpi->ppi->gf_group.size > 0, + cpi->gf_frame_index < cpi->ppi->gf_group.size)); + const int tpl_idx = cpi->gf_frame_index; + TplParams *const tpl_data = &cpi->ppi->tpl_data; + if (!av1_tpl_stats_ready(tpl_data, tpl_idx)) return; + const TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx]; + const TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + const int mi_wide = mi_size_wide[bsize]; + const int mi_high = mi_size_high[bsize]; + const int tpl_stride = tpl_frame->stride; + const int step = 1 << tpl_data->tpl_stats_block_mis_log2; + const int mi_col_sr = + coded_to_superres_mi(mi_col, cm->superres_scale_denominator); + const int mi_col_end_sr = + coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator); + const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); + + const int row_step = step; + const int col_step_sr = + coded_to_superres_mi(step, cm->superres_scale_denominator); + for (int row = mi_row; row < AOMMIN(mi_row + mi_high, cm->mi_params.mi_rows); + row += row_step) { + for (int col = mi_col_sr; col < AOMMIN(mi_col_end_sr, mi_cols_sr); + col += col_step_sr) { + const TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos( + row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)]; + + // Sums up the inter cost of corresponding ref frames + for (int ref_idx = 0; ref_idx < INTER_REFS_PER_FRAME; ref_idx++) { + inter_cost_info_from_tpl->ref_inter_cost[ref_idx] += + this_stats->pred_error[ref_idx]; + } + } + } + + // Computes the best inter cost (minimum inter_cost) + int64_t best_inter_cost = INT64_MAX; + for (int ref_idx = 0; ref_idx < INTER_REFS_PER_FRAME; ref_idx++) { + const int64_t cur_inter_cost = + inter_cost_info_from_tpl->ref_inter_cost[ref_idx]; + // For invalid ref frames, cur_inter_cost = 0 and has to be handled while + // calculating the minimum inter_cost + if (cur_inter_cost != 0 && (cur_inter_cost < best_inter_cost) && + valid_refs[ref_idx]) + best_inter_cost = cur_inter_cost; + } + inter_cost_info_from_tpl->best_inter_cost = best_inter_cost; +} +#endif + +static AOM_INLINE int prune_modes_based_on_tpl_stats( + PruneInfoFromTpl *inter_cost_info_from_tpl, const int *refs, int ref_mv_idx, + const PREDICTION_MODE this_mode, int prune_mode_level) { + const int have_newmv = have_newmv_in_inter_mode(this_mode); + if ((prune_mode_level < 2) && have_newmv) return 0; + + const int64_t best_inter_cost = inter_cost_info_from_tpl->best_inter_cost; + if (best_inter_cost == INT64_MAX) return 0; + + const int prune_level = prune_mode_level - 1; + int64_t cur_inter_cost; + + const int is_globalmv = + (this_mode == GLOBALMV) || (this_mode == GLOBAL_GLOBALMV); + const int prune_index = is_globalmv ? MAX_REF_MV_SEARCH : ref_mv_idx; + + // Thresholds used for pruning: + // Lower value indicates aggressive pruning and higher value indicates + // conservative pruning which is set based on ref_mv_idx and speed feature. + // 'prune_index' 0, 1, 2 corresponds to ref_mv indices 0, 1 and 2. 
prune_index
+ // 3 corresponds to GLOBALMV/GLOBAL_GLOBALMV.
+ // A mode is pruned when cur_inter_cost > (factor * best_inter_cost) >> 2,
+ // i.e. a factor of 6 tolerates up to 1.5x the best inter cost, 5 up to
+ // 1.25x, and 4 up to 1.0x.
+ static const int tpl_inter_mode_prune_mul_factor[3][MAX_REF_MV_SEARCH + 1] = {
+ { 6, 6, 6, 4 }, { 6, 4, 4, 4 }, { 5, 4, 4, 4 }
+ };
+
+ const int is_comp_pred = (refs[1] > INTRA_FRAME);
+ if (!is_comp_pred) {
+ cur_inter_cost = inter_cost_info_from_tpl->ref_inter_cost[refs[0] - 1];
+ } else {
+ const int64_t inter_cost_ref0 =
+ inter_cost_info_from_tpl->ref_inter_cost[refs[0] - 1];
+ const int64_t inter_cost_ref1 =
+ inter_cost_info_from_tpl->ref_inter_cost[refs[1] - 1];
+ // Choose the maximum inter_cost among inter_cost_ref0 and inter_cost_ref1
+ // for more aggressive pruning
+ cur_inter_cost = AOMMAX(inter_cost_ref0, inter_cost_ref1);
+ }
+
+ // Prune the mode if cur_inter_cost is greater than threshold times
+ // best_inter_cost
+ if (cur_inter_cost >
+ ((tpl_inter_mode_prune_mul_factor[prune_level][prune_index] *
+ best_inter_cost) >>
+ 2))
+ return 1;
+ return 0;
+}
+
+/*!\brief High level function to select parameters for compound mode.
+ *
+ * \ingroup inter_mode_search
+ * The main search functionality is done in the call to av1_compound_type_rd().
+ *
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in] x Pointer to struct holding all the data for
+ * the current macroblock.
+ * \param[in] args HandleInterModeArgs struct holding
+ * miscellaneous arguments for inter mode
+ * search. See the documentation for this
+ * struct for a description of each member.
+ * \param[in] ref_best_rd Best RD found so far for this block.
+ * It is used for early termination of this
+ * search if the RD exceeds this value.
+ * \param[in,out] cur_mv Current motion vector.
+ * \param[in] bsize Current block size.
+ * \param[in,out] compmode_interinter_cost RD of the selected interinter
+ * compound mode.
+ * \param[in,out] rd_buffers CompoundTypeRdBuffers struct to hold all
+ * allocated buffers for the compound
+ * predictors and masks in the compound type
+ * search.
+ * \param[in,out] orig_dst A prediction buffer to hold a computed
+ * prediction. This will eventually hold the
+ * final prediction, and the tmp_dst info will
+ * be copied here.
+ * \param[in] tmp_dst A temporary prediction buffer to hold a
+ * computed prediction.
+ * \param[in,out] rate_mv The rate associated with the motion vectors.
+ * This will be modified if a motion search is
+ * done in the motion mode search.
+ * \param[in,out] rd_stats Struct to keep track of the overall RD
+ * information.
+ * \param[in,out] skip_rd An array of length 2 where skip_rd[0] is the
+ * best total RD for a skip mode so far, and
+ * skip_rd[1] is the best RD for a skip mode so
+ * far in luma. This is used as a speed feature
+ * to skip the transform search if the computed
+ * skip RD for the current mode is not better
+ * than the best skip_rd so far.
+ * \param[in,out] skip_build_pred Indicates whether or not to build the inter
+ * predictor. If this is 0, the inter predictor
+ * has already been built and thus we can avoid
+ * repeating computation.
+ * \return Returns 1 if this mode is worse than one already seen and 0 if it is
+ * a viable candidate.
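+ * On early termination the original destination buffer is restored via
+ * restore_dst_buf() before returning.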
+ */ +static int process_compound_inter_mode( + AV1_COMP *const cpi, MACROBLOCK *x, HandleInterModeArgs *args, + int64_t ref_best_rd, int_mv *cur_mv, BLOCK_SIZE bsize, + int *compmode_interinter_cost, const CompoundTypeRdBuffers *rd_buffers, + const BUFFER_SET *orig_dst, const BUFFER_SET *tmp_dst, int *rate_mv, + RD_STATS *rd_stats, int64_t *skip_rd, int *skip_build_pred) { + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + const AV1_COMMON *cm = &cpi->common; + const int masked_compound_used = is_any_masked_compound_used(bsize) && + cm->seq_params->enable_masked_compound; + int mode_search_mask = (1 << COMPOUND_AVERAGE) | (1 << COMPOUND_DISTWTD) | + (1 << COMPOUND_WEDGE) | (1 << COMPOUND_DIFFWTD); + + const int num_planes = av1_num_planes(cm); + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + int is_luma_interp_done = 0; + set_default_interp_filters(mbmi, cm->features.interp_filter); + + int64_t best_rd_compound; + int64_t rd_thresh; + const int comp_type_rd_shift = COMP_TYPE_RD_THRESH_SHIFT; + const int comp_type_rd_scale = COMP_TYPE_RD_THRESH_SCALE; + rd_thresh = get_rd_thresh_from_best_rd(ref_best_rd, (1 << comp_type_rd_shift), + comp_type_rd_scale); + // Select compound type and any parameters related to that type + // (for example, the mask parameters if it is a masked mode) and compute + // the RD + *compmode_interinter_cost = av1_compound_type_rd( + cpi, x, args, bsize, cur_mv, mode_search_mask, masked_compound_used, + orig_dst, tmp_dst, rd_buffers, rate_mv, &best_rd_compound, rd_stats, + ref_best_rd, skip_rd[1], &is_luma_interp_done, rd_thresh); + if (ref_best_rd < INT64_MAX && + (best_rd_compound >> comp_type_rd_shift) * comp_type_rd_scale > + ref_best_rd) { + restore_dst_buf(xd, *orig_dst, num_planes); + return 1; + } + + // Build only uv predictor for COMPOUND_AVERAGE. + // Note there is no need to call av1_enc_build_inter_predictor + // for luma if COMPOUND_AVERAGE is selected because it is the first + // candidate in av1_compound_type_rd, which means it used the dst_buf + // rather than the tmp_buf. + if (mbmi->interinter_comp.type == COMPOUND_AVERAGE && is_luma_interp_done) { + if (num_planes > 1) { + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + AOM_PLANE_U, num_planes - 1); + } + *skip_build_pred = 1; + } + return 0; +} + +// Speed feature to prune out MVs that are similar to previous MVs if they +// don't achieve the best RD advantage. +static int prune_ref_mv_idx_search(int ref_mv_idx, int best_ref_mv_idx, + int_mv save_mv[MAX_REF_MV_SEARCH - 1][2], + MB_MODE_INFO *mbmi, int pruning_factor) { + int i; + const int is_comp_pred = has_second_ref(mbmi); + const int thr = (1 + is_comp_pred) << (pruning_factor + 1); + + // Skip the evaluation if an MV match is found. + if (ref_mv_idx > 0) { + for (int idx = 0; idx < ref_mv_idx; ++idx) { + if (save_mv[idx][0].as_int == INVALID_MV) continue; + + int mv_diff = 0; + for (i = 0; i < 1 + is_comp_pred; ++i) { + mv_diff += abs(save_mv[idx][i].as_mv.row - mbmi->mv[i].as_mv.row) + + abs(save_mv[idx][i].as_mv.col - mbmi->mv[i].as_mv.col); + } + + // If this mode is not the best one, and current MV is similar to + // previous stored MV, terminate this ref_mv_idx evaluation. 
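+ // mv_diff is measured in 1/8-pel units, summed over the row and column
+ // components of each predictor, so thr doubles for compound prediction and
+ // with each pruning_factor step. E.g. single prediction with
+ // pruning_factor 1 gives thr = (1 + 0) << 2 = 4.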
+ if (best_ref_mv_idx == -1 && mv_diff <= thr) return 1;
+ }
+ }
+
+ if (ref_mv_idx < MAX_REF_MV_SEARCH - 1) {
+ for (i = 0; i < is_comp_pred + 1; ++i)
+ save_mv[ref_mv_idx][i].as_int = mbmi->mv[i].as_int;
+ }
+
+ return 0;
+}
+
+/*!\brief Prunes ZeroMV Search Using Best NEWMV's SSE
+ *
+ * \ingroup inter_mode_search
+ *
+ * Compares the sse of the zero mv with the best sse found in the single
+ * new_mv search. If the sse of the zero_mv is higher, returns 1 to signal
+ * that zero_mv can be skipped. Else returns 0.
+ *
+ * Note that the sse here comes from single_motion_search, so it is computed
+ * with the interpolation filter used in motion search, not the actual
+ * interpolation filter used in encoding.
+ *
+ * \param[in] fn_ptr A table of function pointers to compute SSE.
+ * \param[in] x Pointer to struct holding all the data for
+ * the current macroblock.
+ * \param[in] bsize The current block_size.
+ * \param[in] args The args to handle_inter_mode, used to track
+ * the best SSE.
+ * \param[in] prune_zero_mv_with_sse Value of the speed feature
+ * prune_zero_mv_with_sse.
+ * \return Returns 1 if zero_mv is pruned, 0 otherwise.
+ */
+static AOM_INLINE int prune_zero_mv_with_sse(
+ const aom_variance_fn_ptr_t *fn_ptr, const MACROBLOCK *x, BLOCK_SIZE bsize,
+ const HandleInterModeArgs *args, int prune_zero_mv_with_sse) {
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+
+ const int is_comp_pred = has_second_ref(mbmi);
+ const MV_REFERENCE_FRAME *refs = mbmi->ref_frame;
+
+ for (int idx = 0; idx < 1 + is_comp_pred; idx++) {
+ if (xd->global_motion[refs[idx]].wmtype != IDENTITY) {
+ // Pruning logic only works for IDENTITY type models
+ // Note: In theory we could apply similar logic for TRANSLATION
+ // type models, but we do not code these due to a spec bug
+ // (see comments in gm_get_motion_vector() in av1/common/mv.h)
+ assert(xd->global_motion[refs[idx]].wmtype != TRANSLATION);
+ return 0;
+ }
+
+ // Don't prune if we have invalid data
+ assert(mbmi->mv[idx].as_int == 0);
+ if (args->best_single_sse_in_refs[refs[idx]] == INT32_MAX) {
+ return 0;
+ }
+ }
+
+ // Sum up the sse of ZEROMV and best NEWMV
+ unsigned int this_sse_sum = 0;
+ unsigned int best_sse_sum = 0;
+ for (int idx = 0; idx < 1 + is_comp_pred; idx++) {
+ const struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
+ const struct macroblockd_plane *pd = xd->plane;
+ const struct buf_2d *src_buf = &p->src;
+ const struct buf_2d *ref_buf = &pd->pre[idx];
+ const uint8_t *src = src_buf->buf;
+ const uint8_t *ref = ref_buf->buf;
+ const int src_stride = src_buf->stride;
+ const int ref_stride = ref_buf->stride;
+
+ unsigned int this_sse;
+ fn_ptr[bsize].vf(ref, ref_stride, src, src_stride, &this_sse);
+ this_sse_sum += this_sse;
+
+ const unsigned int best_sse = args->best_single_sse_in_refs[refs[idx]];
+ best_sse_sum += best_sse;
+ }
+
+ const double mul = prune_zero_mv_with_sse > 1 ? 1.00 : 1.25;
+ if ((double)this_sse_sum > (mul * (double)best_sse_sum)) {
+ return 1;
+ }
+
+ return 0;
+}
+
+/*!\brief Searches for interpolation filter in realtime mode during winner eval
+ *
+ * \ingroup inter_mode_search
+ *
+ * Does a simple interpolation filter search during winner mode evaluation. This
+ * is currently only used by realtime mode as \ref
+ * av1_interpolation_filter_search is not called during realtime encoding.
+ *
+ * This function only searches over two possible filters. EIGHTTAP_REGULAR is
+ * always searched. For lowres clips (<= 240p), MULTITAP_SHARP is also
+ * searched. For higher res clips (>240p), EIGHTTAP_SMOOTH is also searched.
+ *
+ * \param[in] cpi Pointer to the compressor. Used for feature
+ * flags.
+ * \param[in,out] x Pointer to macroblock. This is primarily
+ * used to access the buffers.
+ * \param[in] mi_row The current row in mi unit (4X4 pixels).
+ * \param[in] mi_col The current col in mi unit (4X4 pixels).
+ * \param[in] bsize The current block_size.
+ * \return Returns true if a predictor is built in xd->dst, false otherwise.
+ */
+static AOM_INLINE bool fast_interp_search(const AV1_COMP *cpi, MACROBLOCK *x,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ static const InterpFilters filters_ref_set[3] = {
+ { EIGHTTAP_REGULAR, EIGHTTAP_REGULAR },
+ { EIGHTTAP_SMOOTH, EIGHTTAP_SMOOTH },
+ { MULTITAP_SHARP, MULTITAP_SHARP }
+ };
+
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mi = xd->mi[0];
+ int64_t best_cost = INT64_MAX;
+ int best_filter_index = -1;
+ // dst_bufs[0] stores the new predictor, and dst_bufs[1] stores the best
+ // predictor seen so far.
+ const int num_planes = av1_num_planes(cm);
+ const int is_240p_or_lesser = AOMMIN(cm->width, cm->height) <= 240;
+ assert(is_inter_mode(mi->mode));
+ assert(mi->motion_mode == SIMPLE_TRANSLATION);
+ assert(!is_inter_compound_mode(mi->mode));
+
+ if (!av1_is_interp_needed(xd)) {
+ return false;
+ }
+
+ struct macroblockd_plane *pd = xd->plane;
+ const BUFFER_SET orig_dst = {
+ { pd[0].dst.buf, pd[1].dst.buf, pd[2].dst.buf },
+ { pd[0].dst.stride, pd[1].dst.stride, pd[2].dst.stride },
+ };
+ uint8_t *const tmp_buf = get_buf_by_bd(xd, x->tmp_pred_bufs[0]);
+ const BUFFER_SET tmp_dst = { { tmp_buf, tmp_buf + 1 * MAX_SB_SQUARE,
+ tmp_buf + 2 * MAX_SB_SQUARE },
+ { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE } };
+ const BUFFER_SET *dst_bufs[2] = { &orig_dst, &tmp_dst };
+
+ for (int i = 0; i < 3; ++i) {
+ if (is_240p_or_lesser) {
+ if (filters_ref_set[i].x_filter == EIGHTTAP_SMOOTH) {
+ continue;
+ }
+ } else {
+ if (filters_ref_set[i].x_filter == MULTITAP_SHARP) {
+ continue;
+ }
+ }
+ int64_t cost;
+ RD_STATS tmp_rd = { 0 };
+
+ mi->interp_filters.as_filters = filters_ref_set[i];
+ av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
+
+ model_rd_sb_fn[cpi->sf.rt_sf.use_simple_rd_model
+ ? MODELRD_LEGACY
+ : MODELRD_TYPE_INTERP_FILTER](
+ cpi, bsize, x, xd, AOM_PLANE_Y, AOM_PLANE_Y, &tmp_rd.rate, &tmp_rd.dist,
+ &tmp_rd.skip_txfm, &tmp_rd.sse, NULL, NULL, NULL);
+
+ tmp_rd.rate += av1_get_switchable_rate(x, xd, cm->features.interp_filter,
+ cm->seq_params->enable_dual_filter);
+ cost = RDCOST(x->rdmult, tmp_rd.rate, tmp_rd.dist);
+ if (cost < best_cost) {
+ best_filter_index = i;
+ best_cost = cost;
+ swap_dst_buf(xd, dst_bufs, num_planes);
+ }
+ }
+ assert(best_filter_index >= 0);
+
+ mi->interp_filters.as_filters = filters_ref_set[best_filter_index];
+
+ const bool is_best_pred_in_orig = &orig_dst == dst_bufs[1];
+
+ if (is_best_pred_in_orig) {
+ swap_dst_buf(xd, dst_bufs, num_planes);
+ } else {
+ // Note that xd->pd's buffers are kept in sync with dst_bufs[0]. So if
+ // is_best_pred_in_orig is false, that means the current buffer is the
+ // original one.
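+ // The best predictor therefore still lives in tmp_dst, and is copied
+ // back into the original destination buffer below.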
+ assert(&orig_dst == dst_bufs[0]); + assert(xd->plane[AOM_PLANE_Y].dst.buf == orig_dst.plane[AOM_PLANE_Y]); + const int width = block_size_wide[bsize]; + const int height = block_size_high[bsize]; +#if CONFIG_AV1_HIGHBITDEPTH + const bool is_hbd = is_cur_buf_hbd(xd); + if (is_hbd) { + aom_highbd_convolve_copy(CONVERT_TO_SHORTPTR(tmp_dst.plane[AOM_PLANE_Y]), + tmp_dst.stride[AOM_PLANE_Y], + CONVERT_TO_SHORTPTR(orig_dst.plane[AOM_PLANE_Y]), + orig_dst.stride[AOM_PLANE_Y], width, height); + } else { + aom_convolve_copy(tmp_dst.plane[AOM_PLANE_Y], tmp_dst.stride[AOM_PLANE_Y], + orig_dst.plane[AOM_PLANE_Y], + orig_dst.stride[AOM_PLANE_Y], width, height); + } +#else + aom_convolve_copy(tmp_dst.plane[AOM_PLANE_Y], tmp_dst.stride[AOM_PLANE_Y], + orig_dst.plane[AOM_PLANE_Y], orig_dst.stride[AOM_PLANE_Y], + width, height); +#endif + } + + // Build the YUV predictor. + if (num_planes > 1) { + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + AOM_PLANE_U, AOM_PLANE_V); + } + + return true; +} + +/*!\brief AV1 inter mode RD computation + * + * \ingroup inter_mode_search + * Do the RD search for a given inter mode and compute all information relevant + * to the input mode. It will compute the best MV, + * compound parameters (if the mode is a compound mode) and interpolation filter + * parameters. + * + * \param[in] cpi Top-level encoder structure. + * \param[in] tile_data Pointer to struct holding adaptive + * data/contexts/models for the tile during + * encoding. + * \param[in] x Pointer to structure holding all the data + * for the current macroblock. + * \param[in] bsize Current block size. + * \param[in,out] rd_stats Struct to keep track of the overall RD + * information. + * \param[in,out] rd_stats_y Struct to keep track of the RD information + * for only the Y plane. + * \param[in,out] rd_stats_uv Struct to keep track of the RD information + * for only the UV planes. + * \param[in] args HandleInterModeArgs struct holding + * miscellaneous arguments for inter mode + * search. See the documentation for this + * struct for a description of each member. + * \param[in] ref_best_rd Best RD found so far for this block. + * It is used for early termination of this + * search if the RD exceeds this value. + * \param[in] tmp_buf Temporary buffer used to hold predictors + * built in this search. + * \param[in,out] rd_buffers CompoundTypeRdBuffers struct to hold all + * allocated buffers for the compound + * predictors and masks in the compound type + * search. + * \param[in,out] best_est_rd Estimated RD for motion mode search if + * do_tx_search (see below) is 0. + * \param[in] do_tx_search Parameter to indicate whether or not to do + * a full transform search. This will compute + * an estimated RD for the modes without the + * transform search and later perform the full + * transform search on the best candidates. + * \param[in,out] inter_modes_info InterModesInfo struct to hold inter mode + * information to perform a full transform + * search only on winning candidates searched + * with an estimate for transform coding RD. + * \param[in,out] motion_mode_cand A motion_mode_candidate struct to store + * motion mode information used in a speed + * feature to search motion modes other than + * SIMPLE_TRANSLATION only on winning + * candidates. + * \param[in,out] skip_rd A length 2 array, where skip_rd[0] is the + * best total RD for a skip mode so far, and + * skip_rd[1] is the best RD for a skip mode so + * far in luma. 
This is used as a speed feature + * to skip the transform search if the computed + * skip RD for the current mode is not better + * than the best skip_rd so far. + * \param[in] inter_cost_info_from_tpl A PruneInfoFromTpl struct used to + * narrow down the search based on data + * collected in the TPL model. + * \param[out] yrd Stores the rdcost corresponding to encoding + * the luma plane. + * + * \return The RD cost for the mode being searched. + */ +static int64_t handle_inter_mode( + AV1_COMP *const cpi, TileDataEnc *tile_data, MACROBLOCK *x, + BLOCK_SIZE bsize, RD_STATS *rd_stats, RD_STATS *rd_stats_y, + RD_STATS *rd_stats_uv, HandleInterModeArgs *args, int64_t ref_best_rd, + uint8_t *const tmp_buf, const CompoundTypeRdBuffers *rd_buffers, + int64_t *best_est_rd, const int do_tx_search, + InterModesInfo *inter_modes_info, motion_mode_candidate *motion_mode_cand, + int64_t *skip_rd, PruneInfoFromTpl *inter_cost_info_from_tpl, + int64_t *yrd) { + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; + TxfmSearchInfo *txfm_info = &x->txfm_search_info; + const int is_comp_pred = has_second_ref(mbmi); + const PREDICTION_MODE this_mode = mbmi->mode; + +#if CONFIG_REALTIME_ONLY + const int prune_modes_based_on_tpl = 0; +#else // CONFIG_REALTIME_ONLY + const TplParams *const tpl_data = &cpi->ppi->tpl_data; + const int prune_modes_based_on_tpl = + cpi->sf.inter_sf.prune_inter_modes_based_on_tpl && + av1_tpl_stats_ready(tpl_data, cpi->gf_frame_index); +#endif // CONFIG_REALTIME_ONLY + int i; + // Reference frames for this mode + const int refs[2] = { mbmi->ref_frame[0], + (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) }; + int rate_mv = 0; + int64_t rd = INT64_MAX; + // Do first prediction into the destination buffer. Do the next + // prediction into a temporary buffer. Then keep track of which one + // of these currently holds the best predictor, and use the other + // one for future predictions. In the end, copy from tmp_buf to + // dst if necessary. + struct macroblockd_plane *pd = xd->plane; + const BUFFER_SET orig_dst = { + { pd[0].dst.buf, pd[1].dst.buf, pd[2].dst.buf }, + { pd[0].dst.stride, pd[1].dst.stride, pd[2].dst.stride }, + }; + const BUFFER_SET tmp_dst = { { tmp_buf, tmp_buf + 1 * MAX_SB_SQUARE, + tmp_buf + 2 * MAX_SB_SQUARE }, + { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE } }; + + int64_t ret_val = INT64_MAX; + const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv; + int64_t best_rd = INT64_MAX; + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; + int64_t best_yrd = INT64_MAX; + MB_MODE_INFO best_mbmi = *mbmi; + int best_xskip_txfm = 0; + int64_t newmv_ret_val = INT64_MAX; + inter_mode_info mode_info[MAX_REF_MV_SEARCH]; + + // Do not prune the mode based on inter cost from tpl if the current ref frame + // is the winner ref in neighbouring blocks. + int ref_match_found_in_above_nb = 0; + int ref_match_found_in_left_nb = 0; + if (prune_modes_based_on_tpl) { + ref_match_found_in_above_nb = + find_ref_match_in_above_nbs(cm->mi_params.mi_cols, xd); + ref_match_found_in_left_nb = + find_ref_match_in_left_nbs(cm->mi_params.mi_rows, xd); + } + + // First, perform a simple translation search for each of the indices. If + // an index performs well, it will be fully searched in the main loop + // of this function. 
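+ // The surviving indices come back as a bitmask (idx_mask below); bit i set
+ // means ref_mv_idx i is worth a full evaluation.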
+ const int ref_set = get_drl_refmv_count(x, mbmi->ref_frame, this_mode); + // Save MV results from first 2 ref_mv_idx. + int_mv save_mv[MAX_REF_MV_SEARCH - 1][2]; + int best_ref_mv_idx = -1; + const int idx_mask = + ref_mv_idx_to_search(cpi, x, rd_stats, args, ref_best_rd, bsize, ref_set); + const int16_t mode_ctx = + av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame); + const ModeCosts *mode_costs = &x->mode_costs; + const int ref_mv_cost = cost_mv_ref(mode_costs, this_mode, mode_ctx); + const int base_rate = + args->ref_frame_cost + args->single_comp_cost + ref_mv_cost; + + for (i = 0; i < MAX_REF_MV_SEARCH - 1; ++i) { + save_mv[i][0].as_int = INVALID_MV; + save_mv[i][1].as_int = INVALID_MV; + } + args->start_mv_cnt = 0; + + // Main loop of this function. This will iterate over all of the ref mvs + // in the dynamic reference list and do the following: + // 1.) Get the current MV. Create newmv MV if necessary + // 2.) Search compound type and parameters if applicable + // 3.) Do interpolation filter search + // 4.) Build the inter predictor + // 5.) Pick the motion mode (SIMPLE_TRANSLATION, OBMC_CAUSAL, + // WARPED_CAUSAL) + // 6.) Update stats if best so far + for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ++ref_mv_idx) { + mbmi->ref_mv_idx = ref_mv_idx; + + mode_info[ref_mv_idx].full_search_mv.as_int = INVALID_MV; + mode_info[ref_mv_idx].full_mv_bestsme = INT_MAX; + const int drl_cost = get_drl_cost( + mbmi, mbmi_ext, mode_costs->drl_mode_cost0, ref_frame_type); + mode_info[ref_mv_idx].drl_cost = drl_cost; + mode_info[ref_mv_idx].skip = 0; + + if (!mask_check_bit(idx_mask, ref_mv_idx)) { + // MV did not perform well in simple translation search. Skip it. + continue; + } + if (prune_modes_based_on_tpl && !ref_match_found_in_above_nb && + !ref_match_found_in_left_nb && (ref_best_rd != INT64_MAX)) { + // Skip mode if TPL model indicates it will not be beneficial. + if (prune_modes_based_on_tpl_stats( + inter_cost_info_from_tpl, refs, ref_mv_idx, this_mode, + cpi->sf.inter_sf.prune_inter_modes_based_on_tpl)) + continue; + } + av1_init_rd_stats(rd_stats); + + // Initialize compound mode data + mbmi->interinter_comp.type = COMPOUND_AVERAGE; + mbmi->comp_group_idx = 0; + mbmi->compound_idx = 1; + if (mbmi->ref_frame[1] == INTRA_FRAME) mbmi->ref_frame[1] = NONE_FRAME; + + mbmi->num_proj_ref = 0; + mbmi->motion_mode = SIMPLE_TRANSLATION; + + // Compute cost for signalling this DRL index + rd_stats->rate = base_rate; + rd_stats->rate += drl_cost; + + int rs = 0; + int compmode_interinter_cost = 0; + + int_mv cur_mv[2]; + + // TODO(Cherma): Extend this speed feature to support compound mode + int skip_repeated_ref_mv = + is_comp_pred ? 0 : cpi->sf.inter_sf.skip_repeated_ref_mv; + // Generate the current mv according to the prediction mode + if (!build_cur_mv(cur_mv, this_mode, cm, x, skip_repeated_ref_mv)) { + continue; + } + + // The above call to build_cur_mv does not handle NEWMV modes. Build + // the mv here if we have NEWMV for any predictors. 
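+ // handle_newmv() performs the actual motion search; a nonzero return value
+ // means the search failed, and this ref_mv_idx is skipped.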
+ if (have_newmv_in_inter_mode(this_mode)) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, handle_newmv_time); +#endif + newmv_ret_val = + handle_newmv(cpi, x, bsize, cur_mv, &rate_mv, args, mode_info); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, handle_newmv_time); +#endif + + if (newmv_ret_val != 0) continue; + + if (is_inter_singleref_mode(this_mode) && + cur_mv[0].as_int != INVALID_MV) { + const MV_REFERENCE_FRAME ref = refs[0]; + const unsigned int this_sse = x->pred_sse[ref]; + if (this_sse < args->best_single_sse_in_refs[ref]) { + args->best_single_sse_in_refs[ref] = this_sse; + } + + if (cpi->sf.rt_sf.skip_newmv_mode_based_on_sse) { + const int th_idx = cpi->sf.rt_sf.skip_newmv_mode_based_on_sse - 1; + const int pix_idx = num_pels_log2_lookup[bsize] - 4; + const double scale_factor[3][11] = { + { 0.7, 0.7, 0.7, 0.7, 0.7, 0.8, 0.8, 0.9, 0.9, 0.9, 0.9 }, + { 0.7, 0.7, 0.7, 0.7, 0.8, 0.8, 1, 1, 1, 1, 1 }, + { 0.7, 0.7, 0.7, 0.7, 1, 1, 1, 1, 1, 1, 1 } + }; + assert(pix_idx >= 0); + assert(th_idx <= 2); + if (args->best_pred_sse < scale_factor[th_idx][pix_idx] * this_sse) + continue; + } + } + + rd_stats->rate += rate_mv; + } + // Copy the motion vector for this mode into mbmi struct + for (i = 0; i < is_comp_pred + 1; ++i) { + mbmi->mv[i].as_int = cur_mv[i].as_int; + } + + if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd && + mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) { + continue; + } + + // Skip the rest of the search if prune_ref_mv_idx_search speed feature + // is enabled, and the current MV is similar to a previous one. + if (cpi->sf.inter_sf.prune_ref_mv_idx_search && is_comp_pred && + prune_ref_mv_idx_search(ref_mv_idx, best_ref_mv_idx, save_mv, mbmi, + cpi->sf.inter_sf.prune_ref_mv_idx_search)) + continue; + + if (cpi->sf.gm_sf.prune_zero_mv_with_sse && + (this_mode == GLOBALMV || this_mode == GLOBAL_GLOBALMV)) { + if (prune_zero_mv_with_sse(cpi->ppi->fn_ptr, x, bsize, args, + cpi->sf.gm_sf.prune_zero_mv_with_sse)) { + continue; + } + } + + int skip_build_pred = 0; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + // Handle a compound predictor, continue if it is determined this + // cannot be the best compound mode + if (is_comp_pred) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, compound_type_rd_time); +#endif + const int not_best_mode = process_compound_inter_mode( + cpi, x, args, ref_best_rd, cur_mv, bsize, &compmode_interinter_cost, + rd_buffers, &orig_dst, &tmp_dst, &rate_mv, rd_stats, skip_rd, + &skip_build_pred); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, compound_type_rd_time); +#endif + if (not_best_mode) continue; + } + + if (!args->skip_ifs) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, interpolation_filter_search_time); +#endif + // Determine the interpolation filter for this mode + ret_val = av1_interpolation_filter_search( + x, cpi, tile_data, bsize, &tmp_dst, &orig_dst, &rd, &rs, + &skip_build_pred, args, ref_best_rd); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, interpolation_filter_search_time); +#endif + if (args->modelled_rd != NULL && !is_comp_pred) { + args->modelled_rd[this_mode][ref_mv_idx][refs[0]] = rd; + } + if (ret_val != 0) { + restore_dst_buf(xd, orig_dst, num_planes); + continue; + } else if (cpi->sf.inter_sf.model_based_post_interp_filter_breakout && + ref_best_rd != INT64_MAX && (rd >> 3) * 3 > ref_best_rd) { + restore_dst_buf(xd, orig_dst, num_planes); + continue; + } + + // Compute modelled RD if enabled + if (args->modelled_rd != 
NULL) { + if (is_comp_pred) { + const int mode0 = compound_ref0_mode(this_mode); + const int mode1 = compound_ref1_mode(this_mode); + const int64_t mrd = + AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]], + args->modelled_rd[mode1][ref_mv_idx][refs[1]]); + if ((rd >> 3) * 6 > mrd && ref_best_rd < INT64_MAX) { + restore_dst_buf(xd, orig_dst, num_planes); + continue; + } + } + } + } + + rd_stats->rate += compmode_interinter_cost; + if (skip_build_pred != 1) { + // Build this inter predictor if it has not been previously built + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst, bsize, 0, + av1_num_planes(cm) - 1); + } + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, motion_mode_rd_time); +#endif + int rate2_nocoeff = rd_stats->rate; + // Determine the motion mode. This will be one of SIMPLE_TRANSLATION, + // OBMC_CAUSAL or WARPED_CAUSAL + int64_t this_yrd; + ret_val = motion_mode_rd(cpi, tile_data, x, bsize, rd_stats, rd_stats_y, + rd_stats_uv, args, ref_best_rd, skip_rd, &rate_mv, + &orig_dst, best_est_rd, do_tx_search, + inter_modes_info, 0, &this_yrd); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, motion_mode_rd_time); +#endif + assert( + IMPLIES(!av1_check_newmv_joint_nonzero(cm, x), ret_val == INT64_MAX)); + + if (ret_val != INT64_MAX) { + int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + const THR_MODES mode_enum = get_prediction_mode_idx( + mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]); + // Collect mode stats for multiwinner mode processing + store_winner_mode_stats(&cpi->common, x, mbmi, rd_stats, rd_stats_y, + rd_stats_uv, mode_enum, NULL, bsize, tmp_rd, + cpi->sf.winner_mode_sf.multi_winner_mode_type, + do_tx_search); + if (tmp_rd < best_rd) { + best_yrd = this_yrd; + // Update the best rd stats if we found the best mode so far + best_rd_stats = *rd_stats; + best_rd_stats_y = *rd_stats_y; + best_rd_stats_uv = *rd_stats_uv; + best_rd = tmp_rd; + best_mbmi = *mbmi; + best_xskip_txfm = txfm_info->skip_txfm; + memcpy(best_blk_skip, txfm_info->blk_skip, + sizeof(best_blk_skip[0]) * xd->height * xd->width); + av1_copy_array(best_tx_type_map, xd->tx_type_map, + xd->height * xd->width); + motion_mode_cand->rate_mv = rate_mv; + motion_mode_cand->rate2_nocoeff = rate2_nocoeff; + } + + if (tmp_rd < ref_best_rd) { + ref_best_rd = tmp_rd; + best_ref_mv_idx = ref_mv_idx; + } + } + restore_dst_buf(xd, orig_dst, num_planes); + } + + if (best_rd == INT64_MAX) return INT64_MAX; + + // re-instate status of the best choice + *rd_stats = best_rd_stats; + *rd_stats_y = best_rd_stats_y; + *rd_stats_uv = best_rd_stats_uv; + *yrd = best_yrd; + *mbmi = best_mbmi; + txfm_info->skip_txfm = best_xskip_txfm; + assert(IMPLIES(mbmi->comp_group_idx == 1, + mbmi->interinter_comp.type != COMPOUND_AVERAGE)); + memcpy(txfm_info->blk_skip, best_blk_skip, + sizeof(best_blk_skip[0]) * xd->height * xd->width); + av1_copy_array(xd->tx_type_map, best_tx_type_map, xd->height * xd->width); + + rd_stats->rdcost = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + + return rd_stats->rdcost; +} + +/*!\brief Search for the best intrabc predictor + * + * \ingroup intra_mode_search + * \callergraph + * This function performs a motion search to find the best intrabc predictor. + * + * \returns Returns the best overall rdcost (including the non-intrabc modes + * search before this function). 
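+ * Returns INT64_MAX immediately when intrabc is not allowed for the
+ * current frame or is disabled by configuration or speed features.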
+ */ +static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, + PICK_MODE_CONTEXT *ctx, + RD_STATS *rd_stats, BLOCK_SIZE bsize, + int64_t best_rd) { + const AV1_COMMON *const cm = &cpi->common; + if (!av1_allow_intrabc(cm) || !cpi->oxcf.kf_cfg.enable_intrabc || + !cpi->sf.mv_sf.use_intrabc || cpi->sf.rt_sf.use_nonrd_pick_mode) + return INT64_MAX; + const int num_planes = av1_num_planes(cm); + + MACROBLOCKD *const xd = &x->e_mbd; + const TileInfo *tile = &xd->tile; + MB_MODE_INFO *mbmi = xd->mi[0]; + TxfmSearchInfo *txfm_info = &x->txfm_search_info; + + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + const int w = block_size_wide[bsize]; + const int h = block_size_high[bsize]; + const int sb_row = mi_row >> cm->seq_params->mib_size_log2; + const int sb_col = mi_col >> cm->seq_params->mib_size_log2; + + MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; + const MV_REFERENCE_FRAME ref_frame = INTRA_FRAME; + av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count, + xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs, + mbmi_ext->mode_context); + // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and + // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs. + av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame); + int_mv nearestmv, nearmv; + av1_find_best_ref_mvs_from_stack(0, mbmi_ext, ref_frame, &nearestmv, &nearmv, + 0); + + if (nearestmv.as_int == INVALID_MV) { + nearestmv.as_int = 0; + } + if (nearmv.as_int == INVALID_MV) { + nearmv.as_int = 0; + } + + int_mv dv_ref = nearestmv.as_int == 0 ? nearmv : nearestmv; + if (dv_ref.as_int == 0) { + av1_find_ref_dv(&dv_ref, tile, cm->seq_params->mib_size, mi_row); + } + // Ref DV should not have sub-pel. + assert((dv_ref.as_mv.col & 7) == 0); + assert((dv_ref.as_mv.row & 7) == 0); + mbmi_ext->ref_mv_stack[INTRA_FRAME][0].this_mv = dv_ref; + + struct buf_2d yv12_mb[MAX_MB_PLANE]; + av1_setup_pred_block(xd, yv12_mb, xd->cur_buf, NULL, NULL, num_planes); + for (int i = 0; i < num_planes; ++i) { + xd->plane[i].pre[0] = yv12_mb[i]; + } + + enum IntrabcMotionDirection { + IBC_MOTION_ABOVE, + IBC_MOTION_LEFT, + IBC_MOTION_DIRECTIONS + }; + + MB_MODE_INFO best_mbmi = *mbmi; + RD_STATS best_rdstats = *rd_stats; + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE] = { 0 }; + uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; + av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); + + FULLPEL_MOTION_SEARCH_PARAMS fullms_params; + const SEARCH_METHODS search_method = + av1_get_default_mv_search_method(x, &cpi->sf.mv_sf, bsize); + const search_site_config *lookahead_search_sites = + cpi->mv_search_params.search_site_cfg[SS_CFG_LOOKAHEAD]; + const FULLPEL_MV start_mv = get_fullmv_from_mv(&dv_ref.as_mv); + av1_make_default_fullpel_ms_params(&fullms_params, cpi, x, bsize, + &dv_ref.as_mv, start_mv, + lookahead_search_sites, search_method, + /*fine_search_interval=*/0); + const IntraBCMVCosts *const dv_costs = x->dv_costs; + av1_set_ms_to_intra_mode(&fullms_params, dv_costs); + + for (enum IntrabcMotionDirection dir = IBC_MOTION_ABOVE; + dir < IBC_MOTION_DIRECTIONS; ++dir) { + switch (dir) { + case IBC_MOTION_ABOVE: + fullms_params.mv_limits.col_min = + (tile->mi_col_start - mi_col) * MI_SIZE; + fullms_params.mv_limits.col_max = + (tile->mi_col_end - mi_col) * MI_SIZE - w; + fullms_params.mv_limits.row_min = + (tile->mi_row_start - mi_row) * MI_SIZE; + fullms_params.mv_limits.row_max = + (sb_row * cm->seq_params->mib_size - mi_row) * MI_SIZE - h; + break; + case 
IBC_MOTION_LEFT:
+ fullms_params.mv_limits.col_min =
+ (tile->mi_col_start - mi_col) * MI_SIZE;
+ fullms_params.mv_limits.col_max =
+ (sb_col * cm->seq_params->mib_size - mi_col) * MI_SIZE - w;
+ // TODO(aconverse@google.com): Minimize the overlap between above and
+ // left areas.
+ fullms_params.mv_limits.row_min =
+ (tile->mi_row_start - mi_row) * MI_SIZE;
+ int bottom_coded_mi_edge =
+ AOMMIN((sb_row + 1) * cm->seq_params->mib_size, tile->mi_row_end);
+ fullms_params.mv_limits.row_max =
+ (bottom_coded_mi_edge - mi_row) * MI_SIZE - h;
+ break;
+ default: assert(0);
+ }
+
+ // Save the limits computed above so we can verify that
+ // av1_set_mv_search_range() only ever narrows the search range.
+ const FullMvLimits tmp_mv_limits = fullms_params.mv_limits;
+ av1_set_mv_search_range(&fullms_params.mv_limits, &dv_ref.as_mv);
+ assert(fullms_params.mv_limits.col_min >= tmp_mv_limits.col_min);
+ assert(fullms_params.mv_limits.col_max <= tmp_mv_limits.col_max);
+ assert(fullms_params.mv_limits.row_min >= tmp_mv_limits.row_min);
+ assert(fullms_params.mv_limits.row_max <= tmp_mv_limits.row_max);
+
+ if (fullms_params.mv_limits.col_max < fullms_params.mv_limits.col_min ||
+ fullms_params.mv_limits.row_max < fullms_params.mv_limits.row_min) {
+ continue;
+ }
+
+ const int step_param = cpi->mv_search_params.mv_step_param;
+ IntraBCHashInfo *intrabc_hash_info = &x->intrabc_hash_info;
+ int_mv best_mv, best_hash_mv;
+ FULLPEL_MV_STATS best_mv_stats;
+
+ int bestsme =
+ av1_full_pixel_search(start_mv, &fullms_params, step_param, NULL,
+ &best_mv.as_fullmv, &best_mv_stats, NULL);
+ const int hashsme = av1_intrabc_hash_search(
+ cpi, xd, &fullms_params, intrabc_hash_info, &best_hash_mv.as_fullmv);
+ if (hashsme < bestsme) {
+ best_mv = best_hash_mv;
+ bestsme = hashsme;
+ }
+
+ if (bestsme == INT_MAX) continue;
+ const MV dv = get_mv_from_fullmv(&best_mv.as_fullmv);
+ if (!av1_is_fullmv_in_range(&fullms_params.mv_limits,
+ get_fullmv_from_mv(&dv)))
+ continue;
+ if (!av1_is_dv_valid(dv, cm, xd, mi_row, mi_col, bsize,
+ cm->seq_params->mib_size_log2))
+ continue;
+
+ // DV should not have sub-pel.
+ assert((dv.col & 7) == 0);
+ assert((dv.row & 7) == 0);
+ memset(&mbmi->palette_mode_info, 0, sizeof(mbmi->palette_mode_info));
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+ mbmi->use_intrabc = 1;
+ mbmi->mode = DC_PRED;
+ mbmi->uv_mode = UV_DC_PRED;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ mbmi->mv[0].as_mv = dv;
+ mbmi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
+ mbmi->skip_txfm = 0;
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+ av1_num_planes(cm) - 1);
+
+ // TODO(aconverse@google.com): The full motion field defining discount
+ // in MV_COST_WEIGHT is too large. Explore other values.
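+ // The DV is coded at full-pel precision (asserted above), so its rate is
+ // computed from the dedicated intrabc DV costs rather than the regular
+ // inter MV costs.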
+ const int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, dv_costs->joint_mv,
+ dv_costs->dv_costs, MV_COST_WEIGHT_SUB);
+ const int rate_mode = x->mode_costs.intrabc_cost[1];
+ RD_STATS rd_stats_yuv, rd_stats_y, rd_stats_uv;
+ if (!av1_txfm_search(cpi, x, bsize, &rd_stats_yuv, &rd_stats_y,
+ &rd_stats_uv, rate_mode + rate_mv, INT64_MAX))
+ continue;
+ rd_stats_yuv.rdcost =
+ RDCOST(x->rdmult, rd_stats_yuv.rate, rd_stats_yuv.dist);
+ if (rd_stats_yuv.rdcost < best_rd) {
+ best_rd = rd_stats_yuv.rdcost;
+ best_mbmi = *mbmi;
+ best_rdstats = rd_stats_yuv;
+ memcpy(best_blk_skip, txfm_info->blk_skip,
+ sizeof(txfm_info->blk_skip[0]) * xd->height * xd->width);
+ av1_copy_array(best_tx_type_map, xd->tx_type_map, xd->height * xd->width);
+ }
+ }
+ *mbmi = best_mbmi;
+ *rd_stats = best_rdstats;
+ memcpy(txfm_info->blk_skip, best_blk_skip,
+ sizeof(txfm_info->blk_skip[0]) * xd->height * xd->width);
+ av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk);
+#if CONFIG_RD_DEBUG
+ mbmi->rd_stats = *rd_stats;
+#endif
+ return best_rd;
+}
+
+// TODO(chiyotsai@google.com): We are using the plain struct names instead of
+// their typedefs here because Doxygen doesn't know about the typedefs yet. So
+// using the typedefs would prevent doxygen from finding this function and
+// generating the callgraph. Once documents for AV1_COMP and MACROBLOCK are
+// added to doxygen, we can revert to using the typedefs.
+void av1_rd_pick_intra_mode_sb(const struct AV1_COMP *cpi, struct macroblock *x,
+ struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx, int64_t best_rd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int num_planes = av1_num_planes(cm);
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0;
+ uint8_t y_skip_txfm = 0, uv_skip_txfm = 0;
+ int64_t dist_y = 0, dist_uv = 0;
+
+ ctx->rd_stats.skip_txfm = 0;
+ mbmi->ref_frame[0] = INTRA_FRAME;
+ mbmi->ref_frame[1] = NONE_FRAME;
+ mbmi->use_intrabc = 0;
+ mbmi->mv[0].as_int = 0;
+ mbmi->skip_mode = 0;
+
+ const int64_t intra_yrd =
+ av1_rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, &dist_y,
+ &y_skip_txfm, bsize, best_rd, ctx);
+
+ // Initialize default mode evaluation params
+ set_mode_eval_params(cpi, x, DEFAULT_EVAL);
+
+ if (intra_yrd < best_rd) {
+ // Search intra modes for uv planes if needed
+ if (num_planes > 1) {
+ // Set up the tx variables for reproducing the y predictions in case we
+ // need it for chroma-from-luma.
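+ // Chroma-from-luma derives the chroma prediction from the reconstructed
+ // luma pixels, so the luma transform decisions made above are restored
+ // first.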
+ if (xd->is_chroma_ref && store_cfl_required_rdo(cm, x)) { + memcpy(txfm_info->blk_skip, ctx->blk_skip, + sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk); + av1_copy_array(xd->tx_type_map, ctx->tx_type_map, ctx->num_4x4_blk); + } + const TX_SIZE max_uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd); + av1_rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, + &dist_uv, &uv_skip_txfm, bsize, + max_uv_tx_size); + } + + // Intra block is always coded as non-skip + rd_cost->rate = + rate_y + rate_uv + + x->mode_costs.skip_txfm_cost[av1_get_skip_txfm_context(xd)][0]; + rd_cost->dist = dist_y + dist_uv; + rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist); + rd_cost->skip_txfm = 0; + } else { + rd_cost->rate = INT_MAX; + } + + if (rd_cost->rate != INT_MAX && rd_cost->rdcost < best_rd) + best_rd = rd_cost->rdcost; + if (rd_pick_intrabc_mode_sb(cpi, x, ctx, rd_cost, bsize, best_rd) < best_rd) { + ctx->rd_stats.skip_txfm = mbmi->skip_txfm; + memcpy(ctx->blk_skip, txfm_info->blk_skip, + sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk); + assert(rd_cost->rate != INT_MAX); + } + if (rd_cost->rate == INT_MAX) return; + + ctx->mic = *xd->mi[0]; + av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, &x->mbmi_ext, + av1_ref_frame_type(xd->mi[0]->ref_frame)); + av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); +} + +static AOM_INLINE void calc_target_weighted_pred( + const AV1_COMMON *cm, const MACROBLOCK *x, const MACROBLOCKD *xd, + const uint8_t *above, int above_stride, const uint8_t *left, + int left_stride); + +static AOM_INLINE void rd_pick_skip_mode( + RD_STATS *rd_cost, InterModeSearchState *search_state, + const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, + struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) { + const AV1_COMMON *const cm = &cpi->common; + const SkipModeInfo *const skip_mode_info = &cm->current_frame.skip_mode_info; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + + x->compound_idx = 1; // COMPOUND_AVERAGE + RD_STATS skip_mode_rd_stats; + av1_invalid_rd_stats(&skip_mode_rd_stats); + + if (skip_mode_info->ref_frame_idx_0 == INVALID_IDX || + skip_mode_info->ref_frame_idx_1 == INVALID_IDX) { + return; + } + + const MV_REFERENCE_FRAME ref_frame = + LAST_FRAME + skip_mode_info->ref_frame_idx_0; + const MV_REFERENCE_FRAME second_ref_frame = + LAST_FRAME + skip_mode_info->ref_frame_idx_1; + const PREDICTION_MODE this_mode = NEAREST_NEARESTMV; + const THR_MODES mode_index = + get_prediction_mode_idx(this_mode, ref_frame, second_ref_frame); + + if (mode_index == THR_INVALID) { + return; + } + + if ((!cpi->oxcf.ref_frm_cfg.enable_onesided_comp || + cpi->sf.inter_sf.disable_onesided_comp) && + cpi->all_one_sided_refs) { + return; + } + + mbmi->mode = this_mode; + mbmi->uv_mode = UV_DC_PRED; + mbmi->ref_frame[0] = ref_frame; + mbmi->ref_frame[1] = second_ref_frame; + const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + if (x->mbmi_ext.ref_mv_count[ref_frame_type] == UINT8_MAX) { + MB_MODE_INFO_EXT *mbmi_ext = &x->mbmi_ext; + if (mbmi_ext->ref_mv_count[ref_frame] == UINT8_MAX || + mbmi_ext->ref_mv_count[second_ref_frame] == UINT8_MAX) { + return; + } + av1_find_mv_refs(cm, xd, mbmi, ref_frame_type, mbmi_ext->ref_mv_count, + xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs, + mbmi_ext->mode_context); + // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and + // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs. 
+ av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame_type); + } + + assert(this_mode == NEAREST_NEARESTMV); + if (!build_cur_mv(mbmi->mv, this_mode, cm, x, 0)) { + return; + } + + mbmi->filter_intra_mode_info.use_filter_intra = 0; + mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1); + mbmi->comp_group_idx = 0; + mbmi->compound_idx = x->compound_idx; + mbmi->interinter_comp.type = COMPOUND_AVERAGE; + mbmi->motion_mode = SIMPLE_TRANSLATION; + mbmi->ref_mv_idx = 0; + mbmi->skip_mode = mbmi->skip_txfm = 1; + mbmi->palette_mode_info.palette_size[0] = 0; + mbmi->palette_mode_info.palette_size[1] = 0; + + set_default_interp_filters(mbmi, cm->features.interp_filter); + + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + for (int i = 0; i < num_planes; i++) { + xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i]; + xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i]; + } + + BUFFER_SET orig_dst; + for (int i = 0; i < num_planes; i++) { + orig_dst.plane[i] = xd->plane[i].dst.buf; + orig_dst.stride[i] = xd->plane[i].dst.stride; + } + + // Compare the use of skip_mode with the best intra/inter mode obtained. + const int skip_mode_ctx = av1_get_skip_mode_context(xd); + int64_t best_intra_inter_mode_cost = INT64_MAX; + if (rd_cost->dist < INT64_MAX && rd_cost->rate < INT32_MAX) { + const ModeCosts *mode_costs = &x->mode_costs; + best_intra_inter_mode_cost = RDCOST( + x->rdmult, rd_cost->rate + mode_costs->skip_mode_cost[skip_mode_ctx][0], + rd_cost->dist); + // Account for non-skip mode rate in total rd stats + rd_cost->rate += mode_costs->skip_mode_cost[skip_mode_ctx][0]; + av1_rd_cost_update(x->rdmult, rd_cost); + } + + // Obtain the rdcost for skip_mode. + skip_mode_rd(&skip_mode_rd_stats, cpi, x, bsize, &orig_dst, + best_intra_inter_mode_cost); + + if (skip_mode_rd_stats.rdcost <= best_intra_inter_mode_cost && + (!xd->lossless[mbmi->segment_id] || skip_mode_rd_stats.dist == 0)) { + assert(mode_index != THR_INVALID); + search_state->best_mbmode.skip_mode = 1; + search_state->best_mbmode = *mbmi; + memset(search_state->best_mbmode.inter_tx_size, + search_state->best_mbmode.tx_size, + sizeof(search_state->best_mbmode.inter_tx_size)); + set_txfm_ctxs(search_state->best_mbmode.tx_size, xd->width, xd->height, + search_state->best_mbmode.skip_txfm && is_inter_block(mbmi), + xd); + search_state->best_mode_index = mode_index; + + // Update rd_cost + rd_cost->rate = skip_mode_rd_stats.rate; + rd_cost->dist = rd_cost->sse = skip_mode_rd_stats.dist; + rd_cost->rdcost = skip_mode_rd_stats.rdcost; + + search_state->best_rd = rd_cost->rdcost; + search_state->best_skip2 = 1; + search_state->best_mode_skippable = 1; + + x->txfm_search_info.skip_txfm = 1; + } +} + +// Get winner mode stats of given mode index +static AOM_INLINE MB_MODE_INFO *get_winner_mode_stats( + MACROBLOCK *x, MB_MODE_INFO *best_mbmode, RD_STATS *best_rd_cost, + int best_rate_y, int best_rate_uv, THR_MODES *best_mode_index, + RD_STATS **winner_rd_cost, int *winner_rate_y, int *winner_rate_uv, + THR_MODES *winner_mode_index, MULTI_WINNER_MODE_TYPE multi_winner_mode_type, + int mode_idx) { + MB_MODE_INFO *winner_mbmi; + if (multi_winner_mode_type) { + assert(mode_idx >= 0 && mode_idx < x->winner_mode_count); + WinnerModeStats *winner_mode_stat = &x->winner_mode_stats[mode_idx]; + winner_mbmi = &winner_mode_stat->mbmi; + + *winner_rd_cost = &winner_mode_stat->rd_cost; + *winner_rate_y = winner_mode_stat->rate_y; + *winner_rate_uv = winner_mode_stat->rate_uv; + *winner_mode_index = winner_mode_stat->mode_index; + 
} else {
+ winner_mbmi = best_mbmode;
+ *winner_rd_cost = best_rd_cost;
+ *winner_rate_y = best_rate_y;
+ *winner_rate_uv = best_rate_uv;
+ *winner_mode_index = *best_mode_index;
+ }
+ return winner_mbmi;
+}
+
+// Speed feature: fast intra/inter transform type search
+// Used for speed >= 2.
+// When this speed feature is on, only DCT is used during the rd mode search.
+// After the mode is determined, this function is called to select the
+// transform types and obtain an accurate rdcost.
+static AOM_INLINE void refine_winner_mode_tx(
+ const AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx, THR_MODES *best_mode_index,
+ MB_MODE_INFO *best_mbmode, struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE],
+ int best_rate_y, int best_rate_uv, int *best_skip2, int winner_mode_count) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ int64_t best_rd;
+ const int num_planes = av1_num_planes(cm);
+
+ if (!is_winner_mode_processing_enabled(cpi, x, best_mbmode,
+ rd_cost->skip_txfm))
+ return;
+
+ // Set params for winner mode evaluation
+ set_mode_eval_params(cpi, x, WINNER_MODE_EVAL);
+
+ // No best mode identified so far
+ if (*best_mode_index == THR_INVALID) return;
+
+ best_rd = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist);
+ for (int mode_idx = 0; mode_idx < winner_mode_count; mode_idx++) {
+ RD_STATS *winner_rd_stats = NULL;
+ int winner_rate_y = 0, winner_rate_uv = 0;
+ THR_MODES winner_mode_index = 0;
+
+ // TODO(any): Combine best mode and multi-winner mode processing paths
+ // Get winner mode stats for current mode index
+ MB_MODE_INFO *winner_mbmi = get_winner_mode_stats(
+ x, best_mbmode, rd_cost, best_rate_y, best_rate_uv, best_mode_index,
+ &winner_rd_stats, &winner_rate_y, &winner_rate_uv, &winner_mode_index,
+ cpi->sf.winner_mode_sf.multi_winner_mode_type, mode_idx);
+
+ if (xd->lossless[winner_mbmi->segment_id] == 0 &&
+ winner_mode_index != THR_INVALID &&
+ is_winner_mode_processing_enabled(cpi, x, winner_mbmi,
+ rd_cost->skip_txfm)) {
+ RD_STATS rd_stats = *winner_rd_stats;
+ int skip_blk = 0;
+ RD_STATS rd_stats_y, rd_stats_uv;
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+
+ *mbmi = *winner_mbmi;
+
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+ // Select prediction reference frames.
+ for (int i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+ if (has_second_ref(mbmi))
+ xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+ }
+
+ if (is_inter_mode(mbmi->mode)) {
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ bool is_predictor_built = false;
+ const PREDICTION_MODE prediction_mode = mbmi->mode;
+ // Do interpolation filter search for realtime mode if applicable.
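+ // fast_interp_search() returns true only if it also built the predictor,
+ // in which case the generic predictor build below is skipped.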
+ if (cpi->sf.winner_mode_sf.winner_mode_ifs && + cpi->oxcf.mode == REALTIME && + cm->current_frame.reference_mode == SINGLE_REFERENCE && + is_inter_mode(prediction_mode) && + mbmi->motion_mode == SIMPLE_TRANSLATION && + !is_inter_compound_mode(prediction_mode)) { + is_predictor_built = + fast_interp_search(cpi, x, mi_row, mi_col, bsize); + } + if (!is_predictor_built) { + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, + av1_num_planes(cm) - 1); + } + if (mbmi->motion_mode == OBMC_CAUSAL) + av1_build_obmc_inter_predictors_sb(cm, xd); + + av1_subtract_plane(x, bsize, 0); + if (txfm_params->tx_mode_search_type == TX_MODE_SELECT && + !xd->lossless[mbmi->segment_id]) { + av1_pick_recursive_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, + INT64_MAX); + assert(rd_stats_y.rate != INT_MAX); + } else { + av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, + INT64_MAX); + memset(mbmi->inter_tx_size, mbmi->tx_size, + sizeof(mbmi->inter_tx_size)); + for (int i = 0; i < xd->height * xd->width; ++i) + set_blk_skip(txfm_info->blk_skip, 0, i, rd_stats_y.skip_txfm); + } + } else { + av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, + INT64_MAX); + } + + if (num_planes > 1) { + av1_txfm_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); + } else { + av1_init_rd_stats(&rd_stats_uv); + } + + const ModeCosts *mode_costs = &x->mode_costs; + if (is_inter_mode(mbmi->mode) && + RDCOST(x->rdmult, + mode_costs->skip_txfm_cost[skip_ctx][0] + rd_stats_y.rate + + rd_stats_uv.rate, + (rd_stats_y.dist + rd_stats_uv.dist)) > + RDCOST(x->rdmult, mode_costs->skip_txfm_cost[skip_ctx][1], + (rd_stats_y.sse + rd_stats_uv.sse))) { + skip_blk = 1; + rd_stats_y.rate = mode_costs->skip_txfm_cost[skip_ctx][1]; + rd_stats_uv.rate = 0; + rd_stats_y.dist = rd_stats_y.sse; + rd_stats_uv.dist = rd_stats_uv.sse; + } else { + skip_blk = 0; + rd_stats_y.rate += mode_costs->skip_txfm_cost[skip_ctx][0]; + } + int this_rate = rd_stats.rate + rd_stats_y.rate + rd_stats_uv.rate - + winner_rate_y - winner_rate_uv; + int64_t this_rd = + RDCOST(x->rdmult, this_rate, (rd_stats_y.dist + rd_stats_uv.dist)); + if (best_rd > this_rd) { + *best_mbmode = *mbmi; + *best_mode_index = winner_mode_index; + av1_copy_array(ctx->blk_skip, txfm_info->blk_skip, ctx->num_4x4_blk); + av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); + rd_cost->rate = this_rate; + rd_cost->dist = rd_stats_y.dist + rd_stats_uv.dist; + rd_cost->sse = rd_stats_y.sse + rd_stats_uv.sse; + rd_cost->rdcost = this_rd; + best_rd = this_rd; + *best_skip2 = skip_blk; + } + } + } +} + +/*!\cond */ +typedef struct { + // Mask for each reference frame, specifying which prediction modes to NOT try + // during search. + uint32_t pred_modes[REF_FRAMES]; + // If ref_combo[i][j + 1] is true, do NOT try prediction using combination of + // reference frames (i, j). + // Note: indexing with 'j + 1' is due to the fact that 2nd reference can be -1 + // (NONE_FRAME). + bool ref_combo[REF_FRAMES][REF_FRAMES + 1]; +} mode_skip_mask_t; +/*!\endcond */ + +// Update 'ref_combo' mask to disable given 'ref' in single and compound modes. +static AOM_INLINE void disable_reference( + MV_REFERENCE_FRAME ref, bool ref_combo[REF_FRAMES][REF_FRAMES + 1]) { + for (MV_REFERENCE_FRAME ref2 = NONE_FRAME; ref2 < REF_FRAMES; ++ref2) { + ref_combo[ref][ref2 + 1] = true; + } +} + +// Update 'ref_combo' mask to disable all inter references except ALTREF. 
+static AOM_INLINE void disable_inter_references_except_altref( + bool ref_combo[REF_FRAMES][REF_FRAMES + 1]) { + disable_reference(LAST_FRAME, ref_combo); + disable_reference(LAST2_FRAME, ref_combo); + disable_reference(LAST3_FRAME, ref_combo); + disable_reference(GOLDEN_FRAME, ref_combo); + disable_reference(BWDREF_FRAME, ref_combo); + disable_reference(ALTREF2_FRAME, ref_combo); +} + +static const MV_REFERENCE_FRAME reduced_ref_combos[][2] = { + { LAST_FRAME, NONE_FRAME }, { ALTREF_FRAME, NONE_FRAME }, + { LAST_FRAME, ALTREF_FRAME }, { GOLDEN_FRAME, NONE_FRAME }, + { INTRA_FRAME, NONE_FRAME }, { GOLDEN_FRAME, ALTREF_FRAME }, + { LAST_FRAME, GOLDEN_FRAME }, { LAST_FRAME, INTRA_FRAME }, + { LAST_FRAME, BWDREF_FRAME }, { LAST_FRAME, LAST3_FRAME }, + { GOLDEN_FRAME, BWDREF_FRAME }, { GOLDEN_FRAME, INTRA_FRAME }, + { BWDREF_FRAME, NONE_FRAME }, { BWDREF_FRAME, ALTREF_FRAME }, + { ALTREF_FRAME, INTRA_FRAME }, { BWDREF_FRAME, INTRA_FRAME }, +}; + +typedef enum { REF_SET_FULL, REF_SET_REDUCED, REF_SET_REALTIME } REF_SET; + +static AOM_INLINE void default_skip_mask(mode_skip_mask_t *mask, + REF_SET ref_set) { + if (ref_set == REF_SET_FULL) { + // Everything available by default. + memset(mask, 0, sizeof(*mask)); + } else { + // All modes available by default. + memset(mask->pred_modes, 0, sizeof(mask->pred_modes)); + // All references disabled first. + for (MV_REFERENCE_FRAME ref1 = INTRA_FRAME; ref1 < REF_FRAMES; ++ref1) { + for (MV_REFERENCE_FRAME ref2 = NONE_FRAME; ref2 < REF_FRAMES; ++ref2) { + mask->ref_combo[ref1][ref2 + 1] = true; + } + } + const MV_REFERENCE_FRAME(*ref_set_combos)[2]; + int num_ref_combos; + + // Then enable reduced set of references explicitly. + switch (ref_set) { + case REF_SET_REDUCED: + ref_set_combos = reduced_ref_combos; + num_ref_combos = + (int)sizeof(reduced_ref_combos) / sizeof(reduced_ref_combos[0]); + break; + case REF_SET_REALTIME: + ref_set_combos = real_time_ref_combos; + num_ref_combos = + (int)sizeof(real_time_ref_combos) / sizeof(real_time_ref_combos[0]); + break; + default: assert(0); num_ref_combos = 0; + } + + for (int i = 0; i < num_ref_combos; ++i) { + const MV_REFERENCE_FRAME *const this_combo = ref_set_combos[i]; + mask->ref_combo[this_combo[0]][this_combo[1] + 1] = false; + } + } +} + +static AOM_INLINE void init_mode_skip_mask(mode_skip_mask_t *mask, + const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize) { + const AV1_COMMON *const cm = &cpi->common; + const struct segmentation *const seg = &cm->seg; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + unsigned char segment_id = mbmi->segment_id; + const SPEED_FEATURES *const sf = &cpi->sf; + const INTER_MODE_SPEED_FEATURES *const inter_sf = &sf->inter_sf; + REF_SET ref_set = REF_SET_FULL; + + if (sf->rt_sf.use_real_time_ref_set) + ref_set = REF_SET_REALTIME; + else if (cpi->oxcf.ref_frm_cfg.enable_reduced_reference_set) + ref_set = REF_SET_REDUCED; + + default_skip_mask(mask, ref_set); + + int min_pred_mv_sad = INT_MAX; + MV_REFERENCE_FRAME ref_frame; + if (ref_set == REF_SET_REALTIME) { + // For real-time encoding, we only look at a subset of ref frames. So the + // threshold for pruning should be computed from this subset as well. 
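+ // Only the first (single) reference of each combo is considered here;
+ // INTRA_FRAME entries are skipped since they carry no MV SAD.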
+ const int num_rt_refs = + sizeof(real_time_ref_combos) / sizeof(*real_time_ref_combos); + for (int r_idx = 0; r_idx < num_rt_refs; r_idx++) { + const MV_REFERENCE_FRAME ref = real_time_ref_combos[r_idx][0]; + if (ref != INTRA_FRAME) { + min_pred_mv_sad = AOMMIN(min_pred_mv_sad, x->pred_mv_sad[ref]); + } + } + } else { + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) + min_pred_mv_sad = AOMMIN(min_pred_mv_sad, x->pred_mv_sad[ref_frame]); + } + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame])) { + // Skip checking missing reference in both single and compound reference + // modes. + disable_reference(ref_frame, mask->ref_combo); + } else { + // Skip fixed mv modes for poor references + if ((x->pred_mv_sad[ref_frame] >> 2) > min_pred_mv_sad) { + mask->pred_modes[ref_frame] |= INTER_NEAREST_NEAR_ZERO; + } + } + if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) { + // Reference not used for the segment. + disable_reference(ref_frame, mask->ref_combo); + } + } + // Note: We use the following drop-out only if the SEG_LVL_REF_FRAME feature + // is disabled for this segment. This is to prevent the possibility that we + // end up unable to pick any mode. + if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) { + // Only consider GLOBALMV/ALTREF_FRAME for alt ref frame, + // unless ARNR filtering is enabled in which case we want + // an unfiltered alternative. We allow near/nearest as well + // because they may result in zero-zero MVs but be cheaper. + if (cpi->rc.is_src_frame_alt_ref && + (cpi->oxcf.algo_cfg.arnr_max_frames == 0)) { + disable_inter_references_except_altref(mask->ref_combo); + + mask->pred_modes[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO; + const MV_REFERENCE_FRAME tmp_ref_frames[2] = { ALTREF_FRAME, NONE_FRAME }; + int_mv near_mv, nearest_mv, global_mv; + get_this_mv(&nearest_mv, NEARESTMV, 0, 0, 0, tmp_ref_frames, + &x->mbmi_ext); + get_this_mv(&near_mv, NEARMV, 0, 0, 0, tmp_ref_frames, &x->mbmi_ext); + get_this_mv(&global_mv, GLOBALMV, 0, 0, 0, tmp_ref_frames, &x->mbmi_ext); + + if (near_mv.as_int != global_mv.as_int) + mask->pred_modes[ALTREF_FRAME] |= (1 << NEARMV); + if (nearest_mv.as_int != global_mv.as_int) + mask->pred_modes[ALTREF_FRAME] |= (1 << NEARESTMV); + } + } + + if (cpi->rc.is_src_frame_alt_ref) { + if (inter_sf->alt_ref_search_fp && + (cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME])) { + mask->pred_modes[ALTREF_FRAME] = 0; + disable_inter_references_except_altref(mask->ref_combo); + disable_reference(INTRA_FRAME, mask->ref_combo); + } + } + + if (inter_sf->alt_ref_search_fp) { + if (!cm->show_frame && x->best_pred_mv_sad[0] < INT_MAX) { + int sad_thresh = x->best_pred_mv_sad[0] + (x->best_pred_mv_sad[0] >> 3); + // Conservatively skip the modes w.r.t. BWDREF, ALTREF2 and ALTREF, if + // those are past frames + MV_REFERENCE_FRAME start_frame = + inter_sf->alt_ref_search_fp == 1 ? ALTREF2_FRAME : BWDREF_FRAME; + for (ref_frame = start_frame; ref_frame <= ALTREF_FRAME; ref_frame++) { + if (cpi->ref_frame_dist_info.ref_relative_dist[ref_frame - LAST_FRAME] < + 0) { + // Prune inter modes when relative dist of ALTREF2 and ALTREF is close + // to the relative dist of LAST_FRAME. 
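+ // Here "close" means within 1.5x of LAST_FRAME's relative distance;
+ // references farther away than that are left unpruned by this rule.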
+ if (inter_sf->alt_ref_search_fp == 1 && + (abs(cpi->ref_frame_dist_info + .ref_relative_dist[ref_frame - LAST_FRAME]) > + 1.5 * abs(cpi->ref_frame_dist_info + .ref_relative_dist[LAST_FRAME - LAST_FRAME]))) { + continue; + } + if (x->pred_mv_sad[ref_frame] > sad_thresh) + mask->pred_modes[ref_frame] |= INTER_ALL; + } + } + } + } + + if (sf->rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad) { + if (x->best_pred_mv_sad[0] < INT_MAX) { + int sad_thresh = x->best_pred_mv_sad[0] + (x->best_pred_mv_sad[0] >> 1); + const int prune_ref_list[2] = { GOLDEN_FRAME, ALTREF_FRAME }; + + // Conservatively skip the modes w.r.t. GOLDEN and ALTREF references + for (int ref_idx = 0; ref_idx < 2; ref_idx++) { + ref_frame = prune_ref_list[ref_idx]; + if (x->pred_mv_sad[ref_frame] > sad_thresh) + mask->pred_modes[ref_frame] |= INTER_NEAREST_NEAR_ZERO; + } + } + } + + if (bsize > sf->part_sf.max_intra_bsize) { + disable_reference(INTRA_FRAME, mask->ref_combo); + } + + if (!cpi->oxcf.tool_cfg.enable_global_motion) { + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + mask->pred_modes[ref_frame] |= (1 << GLOBALMV); + mask->pred_modes[ref_frame] |= (1 << GLOBAL_GLOBALMV); + } + } + + mask->pred_modes[INTRA_FRAME] |= + ~(uint32_t)sf->intra_sf.intra_y_mode_mask[max_txsize_lookup[bsize]]; + + // Prune reference frames which are not the closest to the current + // frame and with large pred_mv_sad. + if (inter_sf->prune_single_ref) { + assert(inter_sf->prune_single_ref > 0 && inter_sf->prune_single_ref < 3); + const double prune_threshes[2] = { 1.20, 1.05 }; + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + const RefFrameDistanceInfo *const ref_frame_dist_info = + &cpi->ref_frame_dist_info; + const int is_closest_ref = + (ref_frame == ref_frame_dist_info->nearest_past_ref) || + (ref_frame == ref_frame_dist_info->nearest_future_ref); + + if (!is_closest_ref) { + const int dir = + (ref_frame_dist_info->ref_relative_dist[ref_frame - LAST_FRAME] < 0) + ? 
0 + : 1; + if (x->best_pred_mv_sad[dir] < INT_MAX && + x->pred_mv_sad[ref_frame] > + prune_threshes[inter_sf->prune_single_ref - 1] * + x->best_pred_mv_sad[dir]) + mask->pred_modes[ref_frame] |= INTER_SINGLE_ALL; + } + } + } +} + +static AOM_INLINE void init_neighbor_pred_buf( + const OBMCBuffer *const obmc_buffer, HandleInterModeArgs *const args, + int is_hbd) { + if (is_hbd) { + const int len = sizeof(uint16_t); + args->above_pred_buf[0] = CONVERT_TO_BYTEPTR(obmc_buffer->above_pred); + args->above_pred_buf[1] = CONVERT_TO_BYTEPTR(obmc_buffer->above_pred + + (MAX_SB_SQUARE >> 1) * len); + args->above_pred_buf[2] = + CONVERT_TO_BYTEPTR(obmc_buffer->above_pred + MAX_SB_SQUARE * len); + args->left_pred_buf[0] = CONVERT_TO_BYTEPTR(obmc_buffer->left_pred); + args->left_pred_buf[1] = + CONVERT_TO_BYTEPTR(obmc_buffer->left_pred + (MAX_SB_SQUARE >> 1) * len); + args->left_pred_buf[2] = + CONVERT_TO_BYTEPTR(obmc_buffer->left_pred + MAX_SB_SQUARE * len); + } else { + args->above_pred_buf[0] = obmc_buffer->above_pred; + args->above_pred_buf[1] = obmc_buffer->above_pred + (MAX_SB_SQUARE >> 1); + args->above_pred_buf[2] = obmc_buffer->above_pred + MAX_SB_SQUARE; + args->left_pred_buf[0] = obmc_buffer->left_pred; + args->left_pred_buf[1] = obmc_buffer->left_pred + (MAX_SB_SQUARE >> 1); + args->left_pred_buf[2] = obmc_buffer->left_pred + MAX_SB_SQUARE; + } +} + +static AOM_INLINE int prune_ref_frame(const AV1_COMP *cpi, const MACROBLOCK *x, + MV_REFERENCE_FRAME ref_frame) { + const AV1_COMMON *const cm = &cpi->common; + MV_REFERENCE_FRAME rf[2]; + av1_set_ref_frame(rf, ref_frame); + + if ((cpi->prune_ref_frame_mask >> ref_frame) & 1) return 1; + + if (prune_ref_by_selective_ref_frame(cpi, x, rf, + cm->cur_frame->ref_display_order_hint)) { + return 1; + } + + return 0; +} + +static AOM_INLINE int is_ref_frame_used_by_compound_ref( + int ref_frame, int skip_ref_frame_mask) { + for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) { + if (!(skip_ref_frame_mask & (1 << r))) { + const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES]; + if (rf[0] == ref_frame || rf[1] == ref_frame) { + return 1; + } + } + } + return 0; +} + +static AOM_INLINE int is_ref_frame_used_in_cache(MV_REFERENCE_FRAME ref_frame, + const MB_MODE_INFO *mi_cache) { + if (!mi_cache) { + return 0; + } + + if (ref_frame < REF_FRAMES) { + return (ref_frame == mi_cache->ref_frame[0] || + ref_frame == mi_cache->ref_frame[1]); + } + + // if we are here, then the current mode is compound. + MV_REFERENCE_FRAME cached_ref_type = av1_ref_frame_type(mi_cache->ref_frame); + return ref_frame == cached_ref_type; +} + +// Please add/modify parameter setting in this function, making it consistent +// and easy to read and maintain. 
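+// Summary (added): gathers the per-reference data needed by the inter mode
+// search: reference frame costs, MV candidate lists and prediction buffers
+// for each allowed reference, pred_mv_sad statistics, OBMC neighbor
+// predictions, and the mode skip mask.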
+static AOM_INLINE void set_params_rd_pick_inter_mode( + const AV1_COMP *cpi, MACROBLOCK *x, HandleInterModeArgs *args, + BLOCK_SIZE bsize, mode_skip_mask_t *mode_skip_mask, int skip_ref_frame_mask, + unsigned int *ref_costs_single, unsigned int (*ref_costs_comp)[REF_FRAMES], + struct buf_2d (*yv12_mb)[MAX_MB_PLANE]) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; + unsigned char segment_id = mbmi->segment_id; + + init_neighbor_pred_buf(&x->obmc_buffer, args, is_cur_buf_hbd(&x->e_mbd)); + av1_collect_neighbors_ref_counts(xd); + estimate_ref_frame_costs(cm, xd, &x->mode_costs, segment_id, ref_costs_single, + ref_costs_comp); + + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + x->best_pred_mv_sad[0] = INT_MAX; + x->best_pred_mv_sad[1] = INT_MAX; + + for (MV_REFERENCE_FRAME ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; + ++ref_frame) { + x->pred_mv_sad[ref_frame] = INT_MAX; + mbmi_ext->mode_context[ref_frame] = 0; + mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX; + if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) { + // Skip the ref frame if the mask says skip and the ref is not used by + // compound ref. + if (skip_ref_frame_mask & (1 << ref_frame) && + !is_ref_frame_used_by_compound_ref(ref_frame, skip_ref_frame_mask) && + !is_ref_frame_used_in_cache(ref_frame, x->mb_mode_cache)) { + continue; + } + assert(get_ref_frame_yv12_buf(cm, ref_frame) != NULL); + setup_buffer_ref_mvs_inter(cpi, x, ref_frame, bsize, yv12_mb); + } + if (cpi->sf.inter_sf.alt_ref_search_fp || + cpi->sf.inter_sf.prune_single_ref || + cpi->sf.rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad) { + // Store the best pred_mv_sad across all past frames + if (cpi->ref_frame_dist_info.ref_relative_dist[ref_frame - LAST_FRAME] < + 0) + x->best_pred_mv_sad[0] = + AOMMIN(x->best_pred_mv_sad[0], x->pred_mv_sad[ref_frame]); + else + // Store the best pred_mv_sad across all future frames + x->best_pred_mv_sad[1] = + AOMMIN(x->best_pred_mv_sad[1], x->pred_mv_sad[ref_frame]); + } + } + + if (!cpi->sf.rt_sf.use_real_time_ref_set && is_comp_ref_allowed(bsize)) { + // No second reference on RT ref set, so no need to initialize + for (MV_REFERENCE_FRAME ref_frame = EXTREF_FRAME; + ref_frame < MODE_CTX_REF_FRAMES; ++ref_frame) { + mbmi_ext->mode_context[ref_frame] = 0; + mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX; + const MV_REFERENCE_FRAME *rf = ref_frame_map[ref_frame - REF_FRAMES]; + if (!((cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[0]]) && + (cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[1]]))) { + continue; + } + + if (skip_ref_frame_mask & (1 << ref_frame) && + !is_ref_frame_used_in_cache(ref_frame, x->mb_mode_cache)) { + continue; + } + // Ref mv list population is not required, when compound references are + // pruned. + if (prune_ref_frame(cpi, x, ref_frame)) continue; + + av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count, + xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs, + mbmi_ext->mode_context); + // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and + // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs. 
+ av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame); + } + } + + av1_count_overlappable_neighbors(cm, xd); + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + int use_actual_frame_probs = 1; + int prune_obmc; +#if CONFIG_FPMT_TEST + use_actual_frame_probs = + (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 0 : 1; + if (!use_actual_frame_probs) { + prune_obmc = cpi->ppi->temp_frame_probs.obmc_probs[update_type][bsize] < + cpi->sf.inter_sf.prune_obmc_prob_thresh; + } +#endif + if (use_actual_frame_probs) { + prune_obmc = cpi->ppi->frame_probs.obmc_probs[update_type][bsize] < + cpi->sf.inter_sf.prune_obmc_prob_thresh; + } + if (cpi->oxcf.motion_mode_cfg.enable_obmc && !prune_obmc) { + if (check_num_overlappable_neighbors(mbmi) && + is_motion_variation_allowed_bsize(bsize)) { + int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1, + MAX_SB_SIZE >> 1 }; + int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1, + MAX_SB_SIZE >> 1 }; + int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + av1_build_prediction_by_above_preds(cm, xd, args->above_pred_buf, + dst_width1, dst_height1, + args->above_pred_stride); + av1_build_prediction_by_left_preds(cm, xd, args->left_pred_buf, + dst_width2, dst_height2, + args->left_pred_stride); + const int num_planes = av1_num_planes(cm); + av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, + mi_col, 0, num_planes); + calc_target_weighted_pred( + cm, x, xd, args->above_pred_buf[0], args->above_pred_stride[0], + args->left_pred_buf[0], args->left_pred_stride[0]); + } + } + + init_mode_skip_mask(mode_skip_mask, cpi, x, bsize); + + // Set params for mode evaluation + set_mode_eval_params(cpi, x, MODE_EVAL); + + x->comp_rd_stats_idx = 0; + + for (int idx = 0; idx < REF_FRAMES; idx++) { + args->best_single_sse_in_refs[idx] = INT32_MAX; + } +} + +static AOM_INLINE void init_single_inter_mode_search_state( + InterModeSearchState *search_state) { + for (int dir = 0; dir < 2; ++dir) { + for (int mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) { + for (int ref_frame = 0; ref_frame < FWD_REFS; ++ref_frame) { + SingleInterModeState *state; + + state = &search_state->single_state[dir][mode][ref_frame]; + state->ref_frame = NONE_FRAME; + state->rd = INT64_MAX; + + state = &search_state->single_state_modelled[dir][mode][ref_frame]; + state->ref_frame = NONE_FRAME; + state->rd = INT64_MAX; + + search_state->single_rd_order[dir][mode][ref_frame] = NONE_FRAME; + } + } + } + + for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) { + search_state->best_single_rd[ref_frame] = INT64_MAX; + search_state->best_single_mode[ref_frame] = PRED_MODE_INVALID; + } + av1_zero(search_state->single_state_cnt); + av1_zero(search_state->single_state_modelled_cnt); +} + +static AOM_INLINE void init_inter_mode_search_state( + InterModeSearchState *search_state, const AV1_COMP *cpi, + const MACROBLOCK *x, BLOCK_SIZE bsize, int64_t best_rd_so_far) { + init_intra_mode_search_state(&search_state->intra_search_state); + av1_invalid_rd_stats(&search_state->best_y_rdcost); + + search_state->best_rd = best_rd_so_far; + search_state->best_skip_rd[0] = INT64_MAX; + search_state->best_skip_rd[1] = INT64_MAX; + + av1_zero(search_state->best_mbmode); + + search_state->best_rate_y = INT_MAX; + + search_state->best_rate_uv = INT_MAX; + + search_state->best_mode_skippable = 0; + + 
search_state->best_skip2 = 0; + + search_state->best_mode_index = THR_INVALID; + + const MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const unsigned char segment_id = mbmi->segment_id; + + search_state->num_available_refs = 0; + memset(search_state->dist_refs, -1, sizeof(search_state->dist_refs)); + memset(search_state->dist_order_refs, -1, + sizeof(search_state->dist_order_refs)); + + for (int i = 0; i <= LAST_NEW_MV_INDEX; ++i) + search_state->mode_threshold[i] = 0; + const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize]; + for (int i = LAST_NEW_MV_INDEX + 1; i < SINGLE_REF_MODE_END; ++i) + search_state->mode_threshold[i] = + ((int64_t)rd_threshes[i] * x->thresh_freq_fact[bsize][i]) >> + RD_THRESH_FAC_FRAC_BITS; + + search_state->best_intra_rd = INT64_MAX; + + search_state->best_pred_sse = UINT_MAX; + + av1_zero(search_state->single_newmv); + av1_zero(search_state->single_newmv_rate); + av1_zero(search_state->single_newmv_valid); + for (int i = SINGLE_INTER_MODE_START; i < SINGLE_INTER_MODE_END; ++i) { + for (int j = 0; j < MAX_REF_MV_SEARCH; ++j) { + for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) { + search_state->modelled_rd[i][j][ref_frame] = INT64_MAX; + search_state->simple_rd[i][j][ref_frame] = INT64_MAX; + } + } + } + + for (int i = 0; i < REFERENCE_MODES; ++i) { + search_state->best_pred_rd[i] = INT64_MAX; + } + + if (cpi->common.current_frame.reference_mode != SINGLE_REFERENCE) { + for (int i = SINGLE_REF_MODE_END; i < THR_INTER_MODE_END; ++i) + search_state->mode_threshold[i] = + ((int64_t)rd_threshes[i] * x->thresh_freq_fact[bsize][i]) >> + RD_THRESH_FAC_FRAC_BITS; + + for (int i = COMP_INTER_MODE_START; i < COMP_INTER_MODE_END; ++i) { + for (int j = 0; j < MAX_REF_MV_SEARCH; ++j) { + for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) { + search_state->modelled_rd[i][j][ref_frame] = INT64_MAX; + search_state->simple_rd[i][j][ref_frame] = INT64_MAX; + } + } + } + + init_single_inter_mode_search_state(search_state); + } +} + +static bool mask_says_skip(const mode_skip_mask_t *mode_skip_mask, + const MV_REFERENCE_FRAME *ref_frame, + const PREDICTION_MODE this_mode) { + if (mode_skip_mask->pred_modes[ref_frame[0]] & (1 << this_mode)) { + return true; + } + + return mode_skip_mask->ref_combo[ref_frame[0]][ref_frame[1] + 1]; +} + +static int inter_mode_compatible_skip(const AV1_COMP *cpi, const MACROBLOCK *x, + BLOCK_SIZE bsize, + PREDICTION_MODE curr_mode, + const MV_REFERENCE_FRAME *ref_frames) { + const int comp_pred = ref_frames[1] > INTRA_FRAME; + if (comp_pred) { + if (!is_comp_ref_allowed(bsize)) return 1; + if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frames[1]])) { + return 1; + } + + const AV1_COMMON *const cm = &cpi->common; + if (frame_is_intra_only(cm)) return 1; + + const CurrentFrame *const current_frame = &cm->current_frame; + if (current_frame->reference_mode == SINGLE_REFERENCE) return 1; + + const struct segmentation *const seg = &cm->seg; + const unsigned char segment_id = x->e_mbd.mi[0]->segment_id; + // Do not allow compound prediction if the segment level reference frame + // feature is in use as in this case there can only be one reference. 
+    if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) return 1;
+  }
+
+  if (ref_frames[0] > INTRA_FRAME && ref_frames[1] == INTRA_FRAME) {
+    // Mode must be compatible
+    if (!is_interintra_allowed_bsize(bsize)) return 1;
+    if (!is_interintra_allowed_mode(curr_mode)) return 1;
+  }
+
+  return 0;
+}
+
+static int fetch_picked_ref_frames_mask(const MACROBLOCK *const x,
+                                        BLOCK_SIZE bsize, int mib_size) {
+  const int sb_size_mask = mib_size - 1;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  const int mi_row_in_sb = mi_row & sb_size_mask;
+  const int mi_col_in_sb = mi_col & sb_size_mask;
+  const int mi_w = mi_size_wide[bsize];
+  const int mi_h = mi_size_high[bsize];
+  int picked_ref_frames_mask = 0;
+  for (int i = mi_row_in_sb; i < mi_row_in_sb + mi_h; ++i) {
+    for (int j = mi_col_in_sb; j < mi_col_in_sb + mi_w; ++j) {
+      picked_ref_frames_mask |= x->picked_ref_frames_mask[i * 32 + j];
+    }
+  }
+  return picked_ref_frames_mask;
+}
+
+// Check if the reference frame pair of the current block matches that of the
+// given block.
+static INLINE int match_ref_frame_pair(const MB_MODE_INFO *mbmi,
+                                       const MV_REFERENCE_FRAME *ref_frames) {
+  return ((ref_frames[0] == mbmi->ref_frame[0]) &&
+          (ref_frames[1] == mbmi->ref_frame[1]));
+}
+
+// Case 1: return 0, means don't skip this mode
+// Case 2: return 1, means skip this mode completely
+// Case 3: return 2, means skip only the motion mode search; the mode is
+//         still evaluated with simple translation
+static int inter_mode_search_order_independent_skip(
+    const AV1_COMP *cpi, const MACROBLOCK *x, mode_skip_mask_t *mode_skip_mask,
+    InterModeSearchState *search_state, int skip_ref_frame_mask,
+    PREDICTION_MODE mode, const MV_REFERENCE_FRAME *ref_frame) {
+  if (mask_says_skip(mode_skip_mask, ref_frame, mode)) {
+    return 1;
+  }
+
+  const int ref_type = av1_ref_frame_type(ref_frame);
+  if (!cpi->sf.rt_sf.use_real_time_ref_set)
+    if (prune_ref_frame(cpi, x, ref_type)) return 1;
+
+  // This is only used in the motion vector unit test.
+  if (cpi->oxcf.unit_test_cfg.motion_vector_unit_test &&
+      ref_frame[0] == INTRA_FRAME)
+    return 1;
+
+  const AV1_COMMON *const cm = &cpi->common;
+  if (skip_repeated_mv(cm, x, mode, ref_frame, search_state)) {
+    return 1;
+  }
+
+  // Reuse the prediction mode from the cache.
+  if (x->use_mb_mode_cache) {
+    const MB_MODE_INFO *cached_mi = x->mb_mode_cache;
+    const PREDICTION_MODE cached_mode = cached_mi->mode;
+    const MV_REFERENCE_FRAME *cached_frame = cached_mi->ref_frame;
+    const int cached_mode_is_single = cached_frame[1] <= INTRA_FRAME;
+
+    // If the cached mode is intra, then we just need to match the mode.
+    if (is_mode_intra(cached_mode) && mode != cached_mode) {
+      return 1;
+    }
+
+    // If the cached mode is a single inter mode, then both the mode and the
+    // reference frame must match.
+    if (cached_mode_is_single) {
+      if (mode != cached_mode || ref_frame[0] != cached_frame[0]) {
+        return 1;
+      }
+    } else {
+      // If the cached mode is compound, then we need to consider several
+      // cases.
+      const int mode_is_single = ref_frame[1] <= INTRA_FRAME;
+      if (mode_is_single) {
+        // If the mode is single, we know the modes can't match. But we might
+        // still want to search it if the compound mode depends on the current
+        // mode.
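+        // Note (added): NEW_NEARMV/NEW_NEARESTMV take their NEWMV component
+        // from ref_frame[0], and NEAR_NEWMV/NEAREST_NEWMV take it from
+        // ref_frame[1]. When the cached compound mode draws that component
+        // from the current single reference, return 2 below so that only the
+        // motion mode search is skipped and simple translation is still
+        // evaluated.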
+        int skip_motion_mode_only = 0;
+        if (cached_mode == NEW_NEARMV || cached_mode == NEW_NEARESTMV) {
+          skip_motion_mode_only = (ref_frame[0] == cached_frame[0]);
+        } else if (cached_mode == NEAR_NEWMV || cached_mode == NEAREST_NEWMV) {
+          skip_motion_mode_only = (ref_frame[0] == cached_frame[1]);
+        } else if (cached_mode == NEW_NEWMV) {
+          skip_motion_mode_only = (ref_frame[0] == cached_frame[0] ||
+                                   ref_frame[0] == cached_frame[1]);
+        }
+
+        return 1 + skip_motion_mode_only;
+      } else {
+        // If both modes are compound, then everything must match.
+        if (mode != cached_mode || ref_frame[0] != cached_frame[0] ||
+            ref_frame[1] != cached_frame[1]) {
+          return 1;
+        }
+      }
+    }
+  }
+
+  const MB_MODE_INFO *const mbmi = x->e_mbd.mi[0];
+  // If no valid mode has been found so far in PARTITION_NONE when finding a
+  // valid partition is required, do not skip this mode.
+  if (search_state->best_rd == INT64_MAX && mbmi->partition == PARTITION_NONE &&
+      x->must_find_valid_partition)
+    return 0;
+
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  // Prune NEARMV and NEAR_NEARMV based on q index and the neighbors'
+  // reference frames.
+  if (sf->inter_sf.prune_nearmv_using_neighbors &&
+      (mode == NEAR_NEARMV || mode == NEARMV)) {
+    const MACROBLOCKD *const xd = &x->e_mbd;
+    if (search_state->best_rd != INT64_MAX && xd->left_available &&
+        xd->up_available) {
+      const int thresholds[PRUNE_NEARMV_MAX][3] = { { 1, 0, 0 },
+                                                    { 1, 1, 0 },
+                                                    { 2, 1, 0 } };
+      const int qindex_sub_range = x->qindex * 3 / QINDEX_RANGE;
+
+      assert(sf->inter_sf.prune_nearmv_using_neighbors <= PRUNE_NEARMV_MAX &&
+             qindex_sub_range < 3);
+      const int num_ref_frame_pair_match_thresh =
+          thresholds[sf->inter_sf.prune_nearmv_using_neighbors - 1]
+                    [qindex_sub_range];
+
+      assert(num_ref_frame_pair_match_thresh <= 2 &&
+             num_ref_frame_pair_match_thresh >= 0);
+      int num_ref_frame_pair_match = 0;
+
+      num_ref_frame_pair_match = match_ref_frame_pair(xd->left_mbmi, ref_frame);
+      num_ref_frame_pair_match +=
+          match_ref_frame_pair(xd->above_mbmi, ref_frame);
+
+      // Pruning based on ref frame pair match with neighbors.
+      if (num_ref_frame_pair_match < num_ref_frame_pair_match_thresh) return 1;
+    }
+  }
+
+  int skip_motion_mode = 0;
+  if (mbmi->partition != PARTITION_NONE) {
+    int skip_ref = skip_ref_frame_mask & (1 << ref_type);
+    if (ref_type <= ALTREF_FRAME && skip_ref) {
+      // Since the compound ref modes depend on the motion estimation results
+      // of two single ref modes (the best MVs of the single ref modes are used
+      // as the start points), if the current single ref mode is marked as
+      // skipped, we need to check whether it will be used in compound ref
+      // modes.
+      if (is_ref_frame_used_by_compound_ref(ref_type, skip_ref_frame_mask)) {
+        // Found a non-skipped compound ref mode which contains the current
+        // single ref, so this single ref can't be skipped completely. Just
+        // skip its motion mode search and still try its simple translation
+        // mode.
+        skip_motion_mode = 1;
+        skip_ref = 0;
+      }
+    }
+    // If we are reusing the prediction from the cache, and the current frame
+    // is required by the cache, then we cannot prune it.
+    if (is_ref_frame_used_in_cache(ref_type, x->mb_mode_cache)) {
+      skip_ref = 0;
+      // If the cache only needs the current reference type for compound
+      // prediction, then we can skip the motion mode search.
+ skip_motion_mode = (ref_type <= ALTREF_FRAME && + x->mb_mode_cache->ref_frame[1] > INTRA_FRAME); + } + if (skip_ref) return 1; + } + + if (ref_frame[0] == INTRA_FRAME) { + if (mode != DC_PRED) { + // Disable intra modes other than DC_PRED for blocks with low variance + // Threshold for intra skipping based on source variance + // TODO(debargha): Specialize the threshold for super block sizes + const unsigned int skip_intra_var_thresh = 64; + if ((sf->rt_sf.mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) && + x->source_variance < skip_intra_var_thresh) + return 1; + } + } + + if (skip_motion_mode) return 2; + + return 0; +} + +static INLINE void init_mbmi(MB_MODE_INFO *mbmi, PREDICTION_MODE curr_mode, + const MV_REFERENCE_FRAME *ref_frames, + const AV1_COMMON *cm) { + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + mbmi->ref_mv_idx = 0; + mbmi->mode = curr_mode; + mbmi->uv_mode = UV_DC_PRED; + mbmi->ref_frame[0] = ref_frames[0]; + mbmi->ref_frame[1] = ref_frames[1]; + pmi->palette_size[0] = 0; + pmi->palette_size[1] = 0; + mbmi->filter_intra_mode_info.use_filter_intra = 0; + mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0; + mbmi->motion_mode = SIMPLE_TRANSLATION; + mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1); + set_default_interp_filters(mbmi, cm->features.interp_filter); +} + +static AOM_INLINE void collect_single_states(MACROBLOCK *x, + InterModeSearchState *search_state, + const MB_MODE_INFO *const mbmi) { + int i, j; + const MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame[0]; + const PREDICTION_MODE this_mode = mbmi->mode; + const int dir = ref_frame <= GOLDEN_FRAME ? 0 : 1; + const int mode_offset = INTER_OFFSET(this_mode); + const int ref_set = get_drl_refmv_count(x, mbmi->ref_frame, this_mode); + + // Simple rd + int64_t simple_rd = search_state->simple_rd[this_mode][0][ref_frame]; + for (int ref_mv_idx = 1; ref_mv_idx < ref_set; ++ref_mv_idx) { + const int64_t rd = + search_state->simple_rd[this_mode][ref_mv_idx][ref_frame]; + if (rd < simple_rd) simple_rd = rd; + } + + // Insertion sort of single_state + const SingleInterModeState this_state_s = { simple_rd, ref_frame, 1 }; + SingleInterModeState *state_s = search_state->single_state[dir][mode_offset]; + i = search_state->single_state_cnt[dir][mode_offset]; + for (j = i; j > 0 && state_s[j - 1].rd > this_state_s.rd; --j) + state_s[j] = state_s[j - 1]; + state_s[j] = this_state_s; + search_state->single_state_cnt[dir][mode_offset]++; + + // Modelled rd + int64_t modelled_rd = search_state->modelled_rd[this_mode][0][ref_frame]; + for (int ref_mv_idx = 1; ref_mv_idx < ref_set; ++ref_mv_idx) { + const int64_t rd = + search_state->modelled_rd[this_mode][ref_mv_idx][ref_frame]; + if (rd < modelled_rd) modelled_rd = rd; + } + + // Insertion sort of single_state_modelled + const SingleInterModeState this_state_m = { modelled_rd, ref_frame, 1 }; + SingleInterModeState *state_m = + search_state->single_state_modelled[dir][mode_offset]; + i = search_state->single_state_modelled_cnt[dir][mode_offset]; + for (j = i; j > 0 && state_m[j - 1].rd > this_state_m.rd; --j) + state_m[j] = state_m[j - 1]; + state_m[j] = this_state_m; + search_state->single_state_modelled_cnt[dir][mode_offset]++; +} + +static AOM_INLINE void analyze_single_states( + const AV1_COMP *cpi, InterModeSearchState *search_state) { + const int prune_level = cpi->sf.inter_sf.prune_comp_search_by_single_result; + assert(prune_level >= 1); + int i, j, dir, mode; + + for (dir = 0; dir < 2; ++dir) { + int64_t best_rd; + SingleInterModeState(*state)[FWD_REFS]; 
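+    // Note (added): a state is invalidated below when
+    // (rd >> 3) * prune_factor > best_rd, i.e. when its rd exceeds roughly
+    // 1.6x best_rd (prune_factor 5) or 1.33x best_rd (prune_factor 6), so
+    // higher prune levels prune harder.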
+ const int prune_factor = prune_level >= 2 ? 6 : 5; + + // Use the best rd of GLOBALMV or NEWMV to prune the unlikely + // reference frames for all the modes (NEARESTMV and NEARMV may not + // have same motion vectors). Always keep the best of each mode + // because it might form the best possible combination with other mode. + state = search_state->single_state[dir]; + best_rd = AOMMIN(state[INTER_OFFSET(NEWMV)][0].rd, + state[INTER_OFFSET(GLOBALMV)][0].rd); + for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) { + for (i = 1; i < search_state->single_state_cnt[dir][mode]; ++i) { + if (state[mode][i].rd != INT64_MAX && + (state[mode][i].rd >> 3) * prune_factor > best_rd) { + state[mode][i].valid = 0; + } + } + } + + state = search_state->single_state_modelled[dir]; + best_rd = AOMMIN(state[INTER_OFFSET(NEWMV)][0].rd, + state[INTER_OFFSET(GLOBALMV)][0].rd); + for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) { + for (i = 1; i < search_state->single_state_modelled_cnt[dir][mode]; ++i) { + if (state[mode][i].rd != INT64_MAX && + (state[mode][i].rd >> 3) * prune_factor > best_rd) { + state[mode][i].valid = 0; + } + } + } + } + + // Ordering by simple rd first, then by modelled rd + for (dir = 0; dir < 2; ++dir) { + for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) { + const int state_cnt_s = search_state->single_state_cnt[dir][mode]; + const int state_cnt_m = + search_state->single_state_modelled_cnt[dir][mode]; + SingleInterModeState *state_s = search_state->single_state[dir][mode]; + SingleInterModeState *state_m = + search_state->single_state_modelled[dir][mode]; + int count = 0; + const int max_candidates = AOMMAX(state_cnt_s, state_cnt_m); + for (i = 0; i < state_cnt_s; ++i) { + if (state_s[i].rd == INT64_MAX) break; + if (state_s[i].valid) { + search_state->single_rd_order[dir][mode][count++] = + state_s[i].ref_frame; + } + } + if (count >= max_candidates) continue; + + for (i = 0; i < state_cnt_m && count < max_candidates; ++i) { + if (state_m[i].rd == INT64_MAX) break; + if (!state_m[i].valid) continue; + const int ref_frame = state_m[i].ref_frame; + int match = 0; + // Check if existing already + for (j = 0; j < count; ++j) { + if (search_state->single_rd_order[dir][mode][j] == ref_frame) { + match = 1; + break; + } + } + if (match) continue; + // Check if this ref_frame is removed in simple rd + int valid = 1; + for (j = 0; j < state_cnt_s; ++j) { + if (ref_frame == state_s[j].ref_frame) { + valid = state_s[j].valid; + break; + } + } + if (valid) { + search_state->single_rd_order[dir][mode][count++] = ref_frame; + } + } + } + } +} + +static int compound_skip_get_candidates( + const AV1_COMP *cpi, const InterModeSearchState *search_state, + const int dir, const PREDICTION_MODE mode) { + const int mode_offset = INTER_OFFSET(mode); + const SingleInterModeState *state = + search_state->single_state[dir][mode_offset]; + const SingleInterModeState *state_modelled = + search_state->single_state_modelled[dir][mode_offset]; + + int max_candidates = 0; + for (int i = 0; i < FWD_REFS; ++i) { + if (search_state->single_rd_order[dir][mode_offset][i] == NONE_FRAME) break; + max_candidates++; + } + + int candidates = max_candidates; + if (cpi->sf.inter_sf.prune_comp_search_by_single_result >= 2) { + candidates = AOMMIN(2, max_candidates); + } + if (cpi->sf.inter_sf.prune_comp_search_by_single_result >= 3) { + if (state[0].rd != INT64_MAX && state_modelled[0].rd != INT64_MAX && + state[0].ref_frame == state_modelled[0].ref_frame) + candidates = 1; + if (mode == NEARMV || mode == GLOBALMV) 
candidates = 1;
+  }
+
+  if (cpi->sf.inter_sf.prune_comp_search_by_single_result >= 4) {
+    // Limit the number of candidates to 1 in each direction for compound
+    // prediction.
+    candidates = AOMMIN(1, candidates);
+  }
+  return candidates;
+}
+
+static int compound_skip_by_single_states(
+    const AV1_COMP *cpi, const InterModeSearchState *search_state,
+    const PREDICTION_MODE this_mode, const MV_REFERENCE_FRAME ref_frame,
+    const MV_REFERENCE_FRAME second_ref_frame, const MACROBLOCK *x) {
+  const MV_REFERENCE_FRAME refs[2] = { ref_frame, second_ref_frame };
+  const int mode[2] = { compound_ref0_mode(this_mode),
+                        compound_ref1_mode(this_mode) };
+  const int mode_offset[2] = { INTER_OFFSET(mode[0]), INTER_OFFSET(mode[1]) };
+  const int mode_dir[2] = { refs[0] <= GOLDEN_FRAME ? 0 : 1,
+                            refs[1] <= GOLDEN_FRAME ? 0 : 1 };
+  int ref_searched[2] = { 0, 0 };
+  int ref_mv_match[2] = { 1, 1 };
+  int i, j;
+
+  for (i = 0; i < 2; ++i) {
+    const SingleInterModeState *state =
+        search_state->single_state[mode_dir[i]][mode_offset[i]];
+    const int state_cnt =
+        search_state->single_state_cnt[mode_dir[i]][mode_offset[i]];
+    for (j = 0; j < state_cnt; ++j) {
+      if (state[j].ref_frame == refs[i]) {
+        ref_searched[i] = 1;
+        break;
+      }
+    }
+  }
+
+  const int ref_set = get_drl_refmv_count(x, refs, this_mode);
+  for (i = 0; i < 2; ++i) {
+    if (!ref_searched[i] || (mode[i] != NEARESTMV && mode[i] != NEARMV)) {
+      continue;
+    }
+    const MV_REFERENCE_FRAME single_refs[2] = { refs[i], NONE_FRAME };
+    for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ref_mv_idx++) {
+      int_mv single_mv;
+      int_mv comp_mv;
+      get_this_mv(&single_mv, mode[i], 0, ref_mv_idx, 0, single_refs,
+                  &x->mbmi_ext);
+      get_this_mv(&comp_mv, this_mode, i, ref_mv_idx, 0, refs, &x->mbmi_ext);
+      if (single_mv.as_int != comp_mv.as_int) {
+        ref_mv_match[i] = 0;
+        break;
+      }
+    }
+  }
+
+  for (i = 0; i < 2; ++i) {
+    if (!ref_searched[i] || !ref_mv_match[i]) continue;
+    const int candidates =
+        compound_skip_get_candidates(cpi, search_state, mode_dir[i], mode[i]);
+    const MV_REFERENCE_FRAME *ref_order =
+        search_state->single_rd_order[mode_dir[i]][mode_offset[i]];
+    int match = 0;
+    for (j = 0; j < candidates; ++j) {
+      if (refs[i] == ref_order[j]) {
+        match = 1;
+        break;
+      }
+    }
+    if (!match) return 1;
+  }
+
+  return 0;
+}
+
+// Check if the ref frames of the current block match those of the given
+// block.
+static INLINE void match_ref_frame(const MB_MODE_INFO *const mbmi,
+                                   const MV_REFERENCE_FRAME *ref_frames,
+                                   int *const is_ref_match) {
+  if (is_inter_block(mbmi)) {
+    is_ref_match[0] |= ref_frames[0] == mbmi->ref_frame[0];
+    is_ref_match[1] |= ref_frames[1] == mbmi->ref_frame[0];
+    if (has_second_ref(mbmi)) {
+      is_ref_match[0] |= ref_frames[0] == mbmi->ref_frame[1];
+      is_ref_match[1] |= ref_frames[1] == mbmi->ref_frame[1];
+    }
+  }
+}
+
+// Prune compound modes using the ref frames of neighboring blocks.
+static INLINE int compound_skip_using_neighbor_refs(
+    MACROBLOCKD *const xd, const PREDICTION_MODE this_mode,
+    const MV_REFERENCE_FRAME *ref_frames, int prune_ext_comp_using_neighbors) {
+  // Exclude non-extended compound modes from pruning.
+  if (this_mode == NEAREST_NEARESTMV || this_mode == NEAR_NEARMV ||
+      this_mode == NEW_NEWMV || this_mode == GLOBAL_GLOBALMV)
+    return 0;
+
+  if (prune_ext_comp_using_neighbors >= 3) return 1;
+
+  int is_ref_match[2] = { 0 };  // 0 - match for forward refs
+                                // 1 - match for backward refs
+  // Check if the ref frames of this block match those of the left neighbor.
+  if (xd->left_available)
+    match_ref_frame(xd->left_mbmi, ref_frames, is_ref_match);
+
+  // Check if the ref frames of this block match those of the above neighbor.
+  if (xd->up_available)
+    match_ref_frame(xd->above_mbmi, ref_frames, is_ref_match);
+
+  // Combine the ref frame matches with neighbors in forward and backward
+  // refs.
+  const int track_ref_match = is_ref_match[0] + is_ref_match[1];
+
+  // Pruning based on ref frame match with neighbors.
+  if (track_ref_match >= prune_ext_comp_using_neighbors) return 0;
+  return 1;
+}
+
+// Update the best single mode for the given reference frame based on simple
+// rd.
+static INLINE void update_best_single_mode(InterModeSearchState *search_state,
+                                           const PREDICTION_MODE this_mode,
+                                           const MV_REFERENCE_FRAME ref_frame,
+                                           int64_t this_rd) {
+  if (this_rd < search_state->best_single_rd[ref_frame]) {
+    search_state->best_single_rd[ref_frame] = this_rd;
+    search_state->best_single_mode[ref_frame] = this_mode;
+  }
+}
+
+// Prune compound modes using the best single mode for the same reference.
+static INLINE int skip_compound_using_best_single_mode_ref(
+    const PREDICTION_MODE this_mode, const MV_REFERENCE_FRAME *ref_frames,
+    const PREDICTION_MODE *best_single_mode,
+    int prune_comp_using_best_single_mode_ref) {
+  // Exclude non-extended compound modes from pruning.
+  if (this_mode == NEAREST_NEARESTMV || this_mode == NEAR_NEARMV ||
+      this_mode == NEW_NEWMV || this_mode == GLOBAL_GLOBALMV)
+    return 0;
+
+  assert(this_mode >= NEAREST_NEWMV && this_mode <= NEW_NEARMV);
+  const PREDICTION_MODE comp_mode_ref0 = compound_ref0_mode(this_mode);
+  // Get the ref frame direction corresponding to NEWMV:
+  // 0 - NEWMV corresponds to the forward direction,
+  // 1 - NEWMV corresponds to the backward direction.
+  const int newmv_dir = comp_mode_ref0 != NEWMV;
+
+  // Avoid pruning the compound mode when the ref frame corresponding to NEWMV
+  // has NEWMV as its single mode winner.
+ // Example: For an extended-compound mode, + // {mode, {fwd_frame, bwd_frame}} = {NEAR_NEWMV, {LAST_FRAME, ALTREF_FRAME}} + // - Ref frame corresponding to NEWMV is ALTREF_FRAME + // - Avoid pruning this mode, if best single mode corresponding to ref frame + // ALTREF_FRAME is NEWMV + const PREDICTION_MODE single_mode = best_single_mode[ref_frames[newmv_dir]]; + if (single_mode == NEWMV) return 0; + + // Avoid pruning the compound mode when best single mode is not available + if (prune_comp_using_best_single_mode_ref == 1) + if (single_mode == MB_MODE_COUNT) return 0; + return 1; +} + +static int compare_int64(const void *a, const void *b) { + int64_t a64 = *((int64_t *)a); + int64_t b64 = *((int64_t *)b); + if (a64 < b64) { + return -1; + } else if (a64 == b64) { + return 0; + } else { + return 1; + } +} + +static INLINE void update_search_state( + InterModeSearchState *search_state, RD_STATS *best_rd_stats_dst, + PICK_MODE_CONTEXT *ctx, const RD_STATS *new_best_rd_stats, + const RD_STATS *new_best_rd_stats_y, const RD_STATS *new_best_rd_stats_uv, + THR_MODES new_best_mode, const MACROBLOCK *x, int txfm_search_done) { + const MACROBLOCKD *xd = &x->e_mbd; + const MB_MODE_INFO *mbmi = xd->mi[0]; + const int skip_ctx = av1_get_skip_txfm_context(xd); + const int skip_txfm = + mbmi->skip_txfm && !is_mode_intra(av1_mode_defs[new_best_mode].mode); + const TxfmSearchInfo *txfm_info = &x->txfm_search_info; + + search_state->best_rd = new_best_rd_stats->rdcost; + search_state->best_mode_index = new_best_mode; + *best_rd_stats_dst = *new_best_rd_stats; + search_state->best_mbmode = *mbmi; + search_state->best_skip2 = skip_txfm; + search_state->best_mode_skippable = new_best_rd_stats->skip_txfm; + // When !txfm_search_done, new_best_rd_stats won't provide correct rate_y and + // rate_uv because av1_txfm_search process is replaced by rd estimation. + // Therefore, we should avoid updating best_rate_y and best_rate_uv here. + // These two values will be updated when av1_txfm_search is called. + if (txfm_search_done) { + search_state->best_rate_y = + new_best_rd_stats_y->rate + + x->mode_costs.skip_txfm_cost[skip_ctx] + [new_best_rd_stats->skip_txfm || skip_txfm]; + search_state->best_rate_uv = new_best_rd_stats_uv->rate; + } + search_state->best_y_rdcost = *new_best_rd_stats_y; + memcpy(ctx->blk_skip, txfm_info->blk_skip, + sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk); + av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); +} + +// Find the best RD for a reference frame (among single reference modes) +// and store +10% of it in the 0-th element in ref_frame_rd. +static AOM_INLINE void find_top_ref(int64_t ref_frame_rd[REF_FRAMES]) { + assert(ref_frame_rd[0] == INT64_MAX); + int64_t ref_copy[REF_FRAMES - 1]; + memcpy(ref_copy, ref_frame_rd + 1, + sizeof(ref_frame_rd[0]) * (REF_FRAMES - 1)); + qsort(ref_copy, REF_FRAMES - 1, sizeof(int64_t), compare_int64); + + int64_t cutoff = ref_copy[0]; + // The cut-off is within 10% of the best. + if (cutoff != INT64_MAX) { + assert(cutoff < INT64_MAX / 200); + cutoff = (110 * cutoff) / 100; + } + ref_frame_rd[0] = cutoff; +} + +// Check if either frame is within the cutoff. 
+static INLINE bool in_single_ref_cutoff(int64_t ref_frame_rd[REF_FRAMES], + MV_REFERENCE_FRAME frame1, + MV_REFERENCE_FRAME frame2) { + assert(frame2 > 0); + return ref_frame_rd[frame1] <= ref_frame_rd[0] || + ref_frame_rd[frame2] <= ref_frame_rd[0]; +} + +static AOM_INLINE void evaluate_motion_mode_for_winner_candidates( + const AV1_COMP *const cpi, MACROBLOCK *const x, RD_STATS *const rd_cost, + HandleInterModeArgs *const args, TileDataEnc *const tile_data, + PICK_MODE_CONTEXT *const ctx, + struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE], + const motion_mode_best_st_candidate *const best_motion_mode_cands, + int do_tx_search, const BLOCK_SIZE bsize, int64_t *const best_est_rd, + InterModeSearchState *const search_state, int64_t *yrd) { + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + InterModesInfo *const inter_modes_info = x->inter_modes_info; + const int num_best_cand = best_motion_mode_cands->num_motion_mode_cand; + + for (int cand = 0; cand < num_best_cand; cand++) { + RD_STATS rd_stats; + RD_STATS rd_stats_y; + RD_STATS rd_stats_uv; + av1_init_rd_stats(&rd_stats); + av1_init_rd_stats(&rd_stats_y); + av1_init_rd_stats(&rd_stats_uv); + int rate_mv; + + rate_mv = best_motion_mode_cands->motion_mode_cand[cand].rate_mv; + args->skip_motion_mode = + best_motion_mode_cands->motion_mode_cand[cand].skip_motion_mode; + *mbmi = best_motion_mode_cands->motion_mode_cand[cand].mbmi; + rd_stats.rate = + best_motion_mode_cands->motion_mode_cand[cand].rate2_nocoeff; + + // Continue if the best candidate is compound. + if (!is_inter_singleref_mode(mbmi->mode)) continue; + + x->txfm_search_info.skip_txfm = 0; + struct macroblockd_plane *pd = xd->plane; + const BUFFER_SET orig_dst = { + { pd[0].dst.buf, pd[1].dst.buf, pd[2].dst.buf }, + { pd[0].dst.stride, pd[1].dst.stride, pd[2].dst.stride }, + }; + + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + // Initialize motion mode to simple translation + // Calculation of switchable rate depends on it. 
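+    // Note (added): SIMPLE_TRANSLATION is 0 in the MOTION_MODE enum.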
+ mbmi->motion_mode = 0; + const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME; + for (int i = 0; i < num_planes; i++) { + xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i]; + if (is_comp_pred) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i]; + } + + int64_t skip_rd[2] = { search_state->best_skip_rd[0], + search_state->best_skip_rd[1] }; + int64_t this_yrd = INT64_MAX; + int64_t ret_value = motion_mode_rd( + cpi, tile_data, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, args, + search_state->best_rd, skip_rd, &rate_mv, &orig_dst, best_est_rd, + do_tx_search, inter_modes_info, 1, &this_yrd); + + if (ret_value != INT64_MAX) { + rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist); + const THR_MODES mode_enum = get_prediction_mode_idx( + mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]); + // Collect mode stats for multiwinner mode processing + store_winner_mode_stats( + &cpi->common, x, mbmi, &rd_stats, &rd_stats_y, &rd_stats_uv, + mode_enum, NULL, bsize, rd_stats.rdcost, + cpi->sf.winner_mode_sf.multi_winner_mode_type, do_tx_search); + if (rd_stats.rdcost < search_state->best_rd) { + *yrd = this_yrd; + update_search_state(search_state, rd_cost, ctx, &rd_stats, &rd_stats_y, + &rd_stats_uv, mode_enum, x, do_tx_search); + if (do_tx_search) search_state->best_skip_rd[0] = skip_rd[0]; + } + } + } +} + +/*!\cond */ +// Arguments for speed feature pruning of inter mode search +typedef struct { + int *skip_motion_mode; + mode_skip_mask_t *mode_skip_mask; + InterModeSearchState *search_state; + int skip_ref_frame_mask; + int reach_first_comp_mode; + int mode_thresh_mul_fact; + int num_single_modes_processed; + int prune_cpd_using_sr_stats_ready; +} InterModeSFArgs; +/*!\endcond */ + +static int skip_inter_mode(AV1_COMP *cpi, MACROBLOCK *x, const BLOCK_SIZE bsize, + int64_t *ref_frame_rd, int midx, + InterModeSFArgs *args, int is_low_temp_var) { + const SPEED_FEATURES *const sf = &cpi->sf; + MACROBLOCKD *const xd = &x->e_mbd; + // Get the actual prediction mode we are trying in this iteration + const THR_MODES mode_enum = av1_default_mode_order[midx]; + const MODE_DEFINITION *mode_def = &av1_mode_defs[mode_enum]; + const PREDICTION_MODE this_mode = mode_def->mode; + const MV_REFERENCE_FRAME *ref_frames = mode_def->ref_frame; + const MV_REFERENCE_FRAME ref_frame = ref_frames[0]; + const MV_REFERENCE_FRAME second_ref_frame = ref_frames[1]; + const int comp_pred = second_ref_frame > INTRA_FRAME; + + if (ref_frame == INTRA_FRAME) return 1; + + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + if (sf->inter_sf.skip_arf_compound && update_type == ARF_UPDATE && + comp_pred) { + return 1; + } + + // This is for real time encoding. + if (is_low_temp_var && !comp_pred && ref_frame != LAST_FRAME && + this_mode != NEARESTMV) + return 1; + + // Check if this mode should be skipped because it is incompatible with the + // current frame + if (inter_mode_compatible_skip(cpi, x, bsize, this_mode, ref_frames)) + return 1; + const int ret = inter_mode_search_order_independent_skip( + cpi, x, args->mode_skip_mask, args->search_state, + args->skip_ref_frame_mask, this_mode, mode_def->ref_frame); + if (ret == 1) return 1; + *(args->skip_motion_mode) = (ret == 2); + + // We've reached the first compound prediction mode, get stats from the + // single reference predictors to help with pruning. 
+  // Disable this pruning logic if the interpolation filter search was skipped
+  // for single prediction modes, as it can result in aggressive pruning of
+  // compound prediction modes due to the absence of the modelled_rd populated
+  // by av1_interpolation_filter_search().
+  // TODO(Remya): Check the impact of the sf
+  // 'prune_comp_search_by_single_result' if compound prediction modes are
+  // enabled in future for REALTIME encode.
+  if (!sf->interp_sf.skip_interp_filter_search &&
+      sf->inter_sf.prune_comp_search_by_single_result > 0 && comp_pred &&
+      args->reach_first_comp_mode == 0) {
+    analyze_single_states(cpi, args->search_state);
+    args->reach_first_comp_mode = 1;
+  }
+
+  // Prune aggressively when the best mode so far is skippable.
+  int mul_fact = args->search_state->best_mode_skippable
+                     ? args->mode_thresh_mul_fact
+                     : (1 << MODE_THRESH_QBITS);
+  int64_t mode_threshold =
+      (args->search_state->mode_threshold[mode_enum] * mul_fact) >>
+      MODE_THRESH_QBITS;
+
+  if (args->search_state->best_rd < mode_threshold) return 1;
+
+  // Skip this compound mode based on the RD results from the single
+  // prediction modes.
+  if (!sf->interp_sf.skip_interp_filter_search &&
+      sf->inter_sf.prune_comp_search_by_single_result > 0 && comp_pred) {
+    if (compound_skip_by_single_states(cpi, args->search_state, this_mode,
+                                       ref_frame, second_ref_frame, x))
+      return 1;
+  }
+
+  if (sf->inter_sf.prune_compound_using_single_ref && comp_pred) {
+    // After we are done with the single reference modes, find the 2nd best RD
+    // for a reference frame. Only search compound modes that have a reference
+    // frame at least as good as the 2nd best.
+    if (!args->prune_cpd_using_sr_stats_ready &&
+        args->num_single_modes_processed == NUM_SINGLE_REF_MODES) {
+      find_top_ref(ref_frame_rd);
+      args->prune_cpd_using_sr_stats_ready = 1;
+    }
+    if (args->prune_cpd_using_sr_stats_ready &&
+        !in_single_ref_cutoff(ref_frame_rd, ref_frame, second_ref_frame))
+      return 1;
+  }
+
+  // Skip the NEW_NEARMV and NEAR_NEWMV extended compound modes.
+  if (sf->inter_sf.skip_ext_comp_nearmv_mode &&
+      (this_mode == NEW_NEARMV || this_mode == NEAR_NEWMV)) {
+    return 1;
+  }
+
+  if (sf->inter_sf.prune_ext_comp_using_neighbors && comp_pred) {
+    if (compound_skip_using_neighbor_refs(
+            xd, this_mode, ref_frames,
+            sf->inter_sf.prune_ext_comp_using_neighbors))
+      return 1;
+  }
+
+  if (sf->inter_sf.prune_comp_using_best_single_mode_ref && comp_pred) {
+    if (skip_compound_using_best_single_mode_ref(
+            this_mode, ref_frames, args->search_state->best_single_mode,
+            sf->inter_sf.prune_comp_using_best_single_mode_ref))
+      return 1;
+  }
+
+  if (sf->inter_sf.prune_nearest_near_mv_using_refmv_weight && !comp_pred) {
+    const int8_t ref_frame_type = av1_ref_frame_type(ref_frames);
+    if (skip_nearest_near_mv_using_refmv_weight(
+            x, this_mode, ref_frame_type,
+            args->search_state->best_mbmode.mode)) {
+      // Ensure the mode is pruned only when the current block has obtained a
+      // valid inter mode.
+ assert(is_inter_mode(args->search_state->best_mbmode.mode)); + return 1; + } + } + + if (sf->rt_sf.prune_inter_modes_with_golden_ref && + ref_frame == GOLDEN_FRAME && !comp_pred) { + const int subgop_size = AOMMIN(cpi->ppi->gf_group.size, FIXED_GF_INTERVAL); + if (cpi->rc.frames_since_golden > (subgop_size >> 2) && + args->search_state->best_mbmode.ref_frame[0] != GOLDEN_FRAME) { + if ((bsize > BLOCK_16X16 && this_mode == NEWMV) || this_mode == NEARMV) + return 1; + } + } + + return 0; +} + +static void record_best_compound(REFERENCE_MODE reference_mode, + RD_STATS *rd_stats, int comp_pred, int rdmult, + InterModeSearchState *search_state, + int compmode_cost) { + int64_t single_rd, hybrid_rd, single_rate, hybrid_rate; + + if (reference_mode == REFERENCE_MODE_SELECT) { + single_rate = rd_stats->rate - compmode_cost; + hybrid_rate = rd_stats->rate; + } else { + single_rate = rd_stats->rate; + hybrid_rate = rd_stats->rate + compmode_cost; + } + + single_rd = RDCOST(rdmult, single_rate, rd_stats->dist); + hybrid_rd = RDCOST(rdmult, hybrid_rate, rd_stats->dist); + + if (!comp_pred) { + if (single_rd < search_state->best_pred_rd[SINGLE_REFERENCE]) + search_state->best_pred_rd[SINGLE_REFERENCE] = single_rd; + } else { + if (single_rd < search_state->best_pred_rd[COMPOUND_REFERENCE]) + search_state->best_pred_rd[COMPOUND_REFERENCE] = single_rd; + } + if (hybrid_rd < search_state->best_pred_rd[REFERENCE_MODE_SELECT]) + search_state->best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd; +} + +// Does a transform search over a list of the best inter mode candidates. +// This is called if the original mode search computed an RD estimate +// for the transform search rather than doing a full search. +static void tx_search_best_inter_candidates( + AV1_COMP *cpi, TileDataEnc *tile_data, MACROBLOCK *x, + int64_t best_rd_so_far, BLOCK_SIZE bsize, + struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE], int mi_row, int mi_col, + InterModeSearchState *search_state, RD_STATS *rd_cost, + PICK_MODE_CONTEXT *ctx, int64_t *yrd) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + TxfmSearchInfo *txfm_info = &x->txfm_search_info; + const ModeCosts *mode_costs = &x->mode_costs; + const int num_planes = av1_num_planes(cm); + const int skip_ctx = av1_get_skip_txfm_context(xd); + MB_MODE_INFO *const mbmi = xd->mi[0]; + InterModesInfo *inter_modes_info = x->inter_modes_info; + inter_modes_info_sort(inter_modes_info, inter_modes_info->rd_idx_pair_arr); + search_state->best_rd = best_rd_so_far; + search_state->best_mode_index = THR_INVALID; + // Initialize best mode stats for winner mode processing + x->winner_mode_count = 0; + store_winner_mode_stats(&cpi->common, x, mbmi, NULL, NULL, NULL, THR_INVALID, + NULL, bsize, best_rd_so_far, + cpi->sf.winner_mode_sf.multi_winner_mode_type, 0); + inter_modes_info->num = + inter_modes_info->num < cpi->sf.rt_sf.num_inter_modes_for_tx_search + ? inter_modes_info->num + : cpi->sf.rt_sf.num_inter_modes_for_tx_search; + const int64_t top_est_rd = + inter_modes_info->num > 0 + ? inter_modes_info + ->est_rd_arr[inter_modes_info->rd_idx_pair_arr[0].idx] + : INT64_MAX; + *yrd = INT64_MAX; + int64_t best_rd_in_this_partition = INT64_MAX; + int num_inter_mode_cands = inter_modes_info->num; + int newmv_mode_evaled = 0; + int max_allowed_cands = INT_MAX; + if (cpi->sf.inter_sf.limit_inter_mode_cands) { + // The bound on the no. of inter mode candidates, beyond which the + // candidates are limited if a newmv mode got evaluated, is set as + // max_allowed_cands + 1. 
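+    // Note (added): the tx-search loop below breaks only after the candidate
+    // that pushes num_tx_cands past max_allowed_cands has been processed (and
+    // only once a NEWMV mode has been evaluated), hence the bound of
+    // max_allowed_cands + 1 stated above.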
+ const int num_allowed_cands[5] = { INT_MAX, 10, 9, 6, 2 }; + assert(cpi->sf.inter_sf.limit_inter_mode_cands <= 4); + max_allowed_cands = + num_allowed_cands[cpi->sf.inter_sf.limit_inter_mode_cands]; + } + + int num_mode_thresh = INT_MAX; + if (cpi->sf.inter_sf.limit_txfm_eval_per_mode) { + // Bound the no. of transform searches per prediction mode beyond a + // threshold. + const int num_mode_thresh_ary[4] = { INT_MAX, 4, 3, 0 }; + assert(cpi->sf.inter_sf.limit_txfm_eval_per_mode <= 3); + num_mode_thresh = + num_mode_thresh_ary[cpi->sf.inter_sf.limit_txfm_eval_per_mode]; + } + + int num_tx_cands = 0; + int num_tx_search_modes[INTER_MODE_END - INTER_MODE_START] = { 0 }; + // Iterate over best inter mode candidates and perform tx search + for (int j = 0; j < num_inter_mode_cands; ++j) { + const int data_idx = inter_modes_info->rd_idx_pair_arr[j].idx; + *mbmi = inter_modes_info->mbmi_arr[data_idx]; + const PREDICTION_MODE prediction_mode = mbmi->mode; + int64_t curr_est_rd = inter_modes_info->est_rd_arr[data_idx]; + if (curr_est_rd * 0.80 > top_est_rd) break; + + if (num_tx_cands > num_mode_thresh) { + if ((prediction_mode != NEARESTMV && + num_tx_search_modes[prediction_mode - INTER_MODE_START] >= 1) || + (prediction_mode == NEARESTMV && + num_tx_search_modes[prediction_mode - INTER_MODE_START] >= 2)) + continue; + } + + txfm_info->skip_txfm = 0; + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + + // Select prediction reference frames. + const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME; + for (int i = 0; i < num_planes; i++) { + xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i]; + if (is_comp_pred) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i]; + } + + bool is_predictor_built = false; + + // Initialize RD stats + RD_STATS rd_stats; + RD_STATS rd_stats_y; + RD_STATS rd_stats_uv; + const int mode_rate = inter_modes_info->mode_rate_arr[data_idx]; + int64_t skip_rd = INT64_MAX; + const int txfm_rd_gate_level = get_txfm_rd_gate_level( + cm->seq_params->enable_masked_compound, + cpi->sf.inter_sf.txfm_rd_gate_level, bsize, TX_SEARCH_DEFAULT, + /*eval_motion_mode=*/0); + if (txfm_rd_gate_level) { + // Check if the mode is good enough based on skip RD + int64_t curr_sse = inter_modes_info->sse_arr[data_idx]; + skip_rd = RDCOST(x->rdmult, mode_rate, curr_sse); + int eval_txfm = check_txfm_eval(x, bsize, search_state->best_skip_rd[0], + skip_rd, txfm_rd_gate_level, 0); + if (!eval_txfm) continue; + } + + // Build the prediction for this mode + if (!is_predictor_built) { + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, + av1_num_planes(cm) - 1); + } + if (mbmi->motion_mode == OBMC_CAUSAL) { + av1_build_obmc_inter_predictors_sb(cm, xd); + } + + num_tx_cands++; + if (have_newmv_in_inter_mode(prediction_mode)) newmv_mode_evaled = 1; + num_tx_search_modes[prediction_mode - INTER_MODE_START]++; + int64_t this_yrd = INT64_MAX; + // Do the transform search + if (!av1_txfm_search(cpi, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, + mode_rate, search_state->best_rd)) { + continue; + } else { + const int y_rate = + rd_stats.skip_txfm + ? 
mode_costs->skip_txfm_cost[skip_ctx][1] + : (rd_stats_y.rate + mode_costs->skip_txfm_cost[skip_ctx][0]); + this_yrd = RDCOST(x->rdmult, y_rate + mode_rate, rd_stats_y.dist); + + if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) { + inter_mode_data_push( + tile_data, mbmi->bsize, rd_stats.sse, rd_stats.dist, + rd_stats_y.rate + rd_stats_uv.rate + + mode_costs->skip_txfm_cost[skip_ctx][mbmi->skip_txfm]); + } + } + rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist); + if (rd_stats.rdcost < best_rd_in_this_partition) { + best_rd_in_this_partition = rd_stats.rdcost; + *yrd = this_yrd; + } + + const THR_MODES mode_enum = get_prediction_mode_idx( + prediction_mode, mbmi->ref_frame[0], mbmi->ref_frame[1]); + + // Collect mode stats for multiwinner mode processing + const int txfm_search_done = 1; + store_winner_mode_stats( + &cpi->common, x, mbmi, &rd_stats, &rd_stats_y, &rd_stats_uv, mode_enum, + NULL, bsize, rd_stats.rdcost, + cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done); + + if (rd_stats.rdcost < search_state->best_rd) { + update_search_state(search_state, rd_cost, ctx, &rd_stats, &rd_stats_y, + &rd_stats_uv, mode_enum, x, txfm_search_done); + search_state->best_skip_rd[0] = skip_rd; + // Limit the total number of modes to be evaluated if the first is valid + // and transform skip or compound + if (cpi->sf.inter_sf.inter_mode_txfm_breakout) { + if (!j && (search_state->best_mbmode.skip_txfm || rd_stats.skip_txfm)) { + // Evaluate more candidates at high quantizers where occurrence of + // transform skip is high. + const int max_cands_cap[5] = { 2, 3, 5, 7, 9 }; + const int qindex_band = (5 * x->qindex) >> QINDEX_BITS; + num_inter_mode_cands = + AOMMIN(max_cands_cap[qindex_band], inter_modes_info->num); + } else if (!j && has_second_ref(&search_state->best_mbmode)) { + const int aggr = cpi->sf.inter_sf.inter_mode_txfm_breakout - 1; + // Evaluate more candidates at low quantizers where occurrence of + // single reference mode is high. + const int max_cands_cap_cmp[2][4] = { { 10, 7, 5, 4 }, + { 10, 7, 5, 3 } }; + const int qindex_band_cmp = (4 * x->qindex) >> QINDEX_BITS; + num_inter_mode_cands = AOMMIN( + max_cands_cap_cmp[aggr][qindex_band_cmp], inter_modes_info->num); + } + } + } + // If the number of candidates evaluated exceeds max_allowed_cands, break if + // a newmv mode was evaluated already. + if ((num_tx_cands > max_allowed_cands) && newmv_mode_evaled) break; + } +} + +// Indicates number of winner simple translation modes to be used +static const unsigned int num_winner_motion_modes[3] = { 0, 10, 3 }; + +// Adds a motion mode to the candidate list for motion_mode_for_winner_cand +// speed feature. This list consists of modes that have only searched +// SIMPLE_TRANSLATION. The final list will be used to search other motion +// modes after the initial RD search. 
+static void handle_winner_cand(
+    MB_MODE_INFO *const mbmi,
+    motion_mode_best_st_candidate *best_motion_mode_cands,
+    int max_winner_motion_mode_cand, int64_t this_rd,
+    motion_mode_candidate *motion_mode_cand, int skip_motion_mode) {
+  // Number of current motion mode candidates in the list
+  const int num_motion_mode_cand = best_motion_mode_cands->num_motion_mode_cand;
+  int valid_motion_mode_cand_loc = num_motion_mode_cand;
+
+  // Find the best location to insert the new motion mode candidate.
+  for (int j = 0; j < num_motion_mode_cand; j++) {
+    if (this_rd < best_motion_mode_cands->motion_mode_cand[j].rd_cost) {
+      valid_motion_mode_cand_loc = j;
+      break;
+    }
+  }
+
+  // Insert the motion mode if a valid location was found.
+  if (valid_motion_mode_cand_loc < max_winner_motion_mode_cand) {
+    if (num_motion_mode_cand > 0 &&
+        valid_motion_mode_cand_loc < max_winner_motion_mode_cand - 1)
+      memmove(
+          &best_motion_mode_cands
+               ->motion_mode_cand[valid_motion_mode_cand_loc + 1],
+          &best_motion_mode_cands->motion_mode_cand[valid_motion_mode_cand_loc],
+          (AOMMIN(num_motion_mode_cand, max_winner_motion_mode_cand - 1) -
+           valid_motion_mode_cand_loc) *
+              sizeof(best_motion_mode_cands->motion_mode_cand[0]));
+    motion_mode_cand->mbmi = *mbmi;
+    motion_mode_cand->rd_cost = this_rd;
+    motion_mode_cand->skip_motion_mode = skip_motion_mode;
+    best_motion_mode_cands->motion_mode_cand[valid_motion_mode_cand_loc] =
+        *motion_mode_cand;
+    best_motion_mode_cands->num_motion_mode_cand =
+        AOMMIN(max_winner_motion_mode_cand,
+               best_motion_mode_cands->num_motion_mode_cand + 1);
+  }
+}
+
+/*!\brief Search intra modes in interframes
+ *
+ * \ingroup intra_mode_search
+ *
+ * This function searches for the best intra mode when the current frame is an
+ * interframe. This function however does *not* handle luma palette mode.
+ * Palette mode is currently handled by \ref av1_search_palette_mode.
+ *
+ * This function will first iterate through the luma mode candidates to find
+ * the best luma intra mode. Once the best luma mode is found, it will then
+ * search for the best chroma mode. Because palette mode is currently not
+ * handled here, a cache of the uv mode is stored in
+ * InterModeSearchState::intra_search_state so it can be reused later by \ref
+ * av1_search_palette_mode.
+ *
+ * \param[in,out]    search_state    Struct to keep track of the prediction
+ *                                   mode search state in interframe.
+ *
+ * \param[in]        cpi             Top-level encoder structure.
+ * \param[in,out]    x               Pointer to struct holding all the data
+ *                                   for the current prediction block.
+ * \param[out]       rd_cost         Stores the best rd_cost among all the
+ *                                   prediction modes searched.
+ * \param[in]        bsize           Current block size.
+ * \param[in,out]    ctx             Structure to hold the number of 4x4 blks
+ *                                   for which to copy the tx_type and
+ *                                   txfm_skip arrays, for the Y plane only.
+ * \param[in]        sf_args         Stores the list of intra mode candidates
+ *                                   to be searched.
+ * \param[in]        intra_ref_frame_cost  The entropy cost for signaling that
+ *                                   the current ref frame is an intra frame.
+ * \param[in]        yrd_threshold   The rdcost threshold for luma intra mode
+ *                                   to terminate chroma intra mode search.
+ *
+ * \remark If a new best mode is found, search_state and rd_cost are updated
+ * correspondingly. While x is also modified, it is only used as a temporary
+ * buffer, and the final decisions are stored in search_state.
+ */ +static AOM_INLINE void search_intra_modes_in_interframe( + InterModeSearchState *search_state, const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, + const InterModeSFArgs *sf_args, unsigned int intra_ref_frame_cost, + int64_t yrd_threshold) { + const AV1_COMMON *const cm = &cpi->common; + const SPEED_FEATURES *const sf = &cpi->sf; + const IntraModeCfg *const intra_mode_cfg = &cpi->oxcf.intra_mode_cfg; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + IntraModeSearchState *intra_search_state = &search_state->intra_search_state; + + int is_best_y_mode_intra = 0; + RD_STATS best_intra_rd_stats_y; + int64_t best_rd_y = INT64_MAX; + int best_mode_cost_y = -1; + MB_MODE_INFO best_mbmi = *xd->mi[0]; + THR_MODES best_mode_enum = THR_INVALID; + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; + const int num_4x4 = bsize_to_num_blk(bsize); + + // Performs luma search + int64_t best_model_rd = INT64_MAX; + int64_t top_intra_model_rd[TOP_INTRA_MODEL_COUNT]; + for (int i = 0; i < TOP_INTRA_MODEL_COUNT; i++) { + top_intra_model_rd[i] = INT64_MAX; + } + for (int mode_idx = 0; mode_idx < LUMA_MODE_COUNT; ++mode_idx) { + if (sf->intra_sf.skip_intra_in_interframe && + search_state->intra_search_state.skip_intra_modes) + break; + set_y_mode_and_delta_angle( + mode_idx, mbmi, sf->intra_sf.prune_luma_odd_delta_angles_in_intra); + assert(mbmi->mode < INTRA_MODE_END); + + // Use intra_y_mode_mask speed feature to skip intra mode evaluation. + if (sf_args->mode_skip_mask->pred_modes[INTRA_FRAME] & (1 << mbmi->mode)) + continue; + + const THR_MODES mode_enum = + get_prediction_mode_idx(mbmi->mode, INTRA_FRAME, NONE_FRAME); + if ((!intra_mode_cfg->enable_smooth_intra || + cpi->sf.intra_sf.disable_smooth_intra) && + (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED || + mbmi->mode == SMOOTH_V_PRED)) + continue; + if (!intra_mode_cfg->enable_paeth_intra && mbmi->mode == PAETH_PRED) + continue; + if (av1_is_directional_mode(mbmi->mode) && + !(av1_use_angle_delta(bsize) && intra_mode_cfg->enable_angle_delta) && + mbmi->angle_delta[PLANE_TYPE_Y] != 0) + continue; + const PREDICTION_MODE this_mode = mbmi->mode; + + assert(av1_mode_defs[mode_enum].ref_frame[0] == INTRA_FRAME); + assert(av1_mode_defs[mode_enum].ref_frame[1] == NONE_FRAME); + init_mbmi(mbmi, this_mode, av1_mode_defs[mode_enum].ref_frame, cm); + x->txfm_search_info.skip_txfm = 0; + + if (this_mode != DC_PRED) { + // Only search the oblique modes if the best so far is + // one of the neighboring directional modes + if ((sf->rt_sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) && + (this_mode >= D45_PRED && this_mode <= PAETH_PRED)) { + if (search_state->best_mode_index != THR_INVALID && + search_state->best_mbmode.ref_frame[0] > INTRA_FRAME) + continue; + } + if (sf->rt_sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) { + if (conditional_skipintra( + this_mode, search_state->intra_search_state.best_intra_mode)) + continue; + } + } + + RD_STATS intra_rd_stats_y; + int mode_cost_y; + int64_t intra_rd_y = INT64_MAX; + const int is_luma_result_valid = av1_handle_intra_y_mode( + intra_search_state, cpi, x, bsize, intra_ref_frame_cost, ctx, + &intra_rd_stats_y, search_state->best_rd, &mode_cost_y, &intra_rd_y, + &best_model_rd, top_intra_model_rd); + if (is_luma_result_valid && intra_rd_y < yrd_threshold) { + is_best_y_mode_intra = 1; + if (intra_rd_y < best_rd_y) { + best_intra_rd_stats_y = 
intra_rd_stats_y; + best_mode_cost_y = mode_cost_y; + best_rd_y = intra_rd_y; + best_mbmi = *mbmi; + best_mode_enum = mode_enum; + memcpy(best_blk_skip, x->txfm_search_info.blk_skip, + sizeof(best_blk_skip[0]) * num_4x4); + av1_copy_array(best_tx_type_map, xd->tx_type_map, num_4x4); + } + } + } + + if (!is_best_y_mode_intra) { + return; + } + + assert(best_rd_y < INT64_MAX); + + // Restores the best luma mode + *mbmi = best_mbmi; + memcpy(x->txfm_search_info.blk_skip, best_blk_skip, + sizeof(best_blk_skip[0]) * num_4x4); + av1_copy_array(xd->tx_type_map, best_tx_type_map, num_4x4); + + // Performs chroma search + RD_STATS intra_rd_stats, intra_rd_stats_uv; + av1_init_rd_stats(&intra_rd_stats); + av1_init_rd_stats(&intra_rd_stats_uv); + const int num_planes = av1_num_planes(cm); + if (num_planes > 1) { + const int intra_uv_mode_valid = av1_search_intra_uv_modes_in_interframe( + intra_search_state, cpi, x, bsize, &intra_rd_stats, + &best_intra_rd_stats_y, &intra_rd_stats_uv, search_state->best_rd); + + if (!intra_uv_mode_valid) { + return; + } + } + + // Merge the luma and chroma rd stats + assert(best_mode_cost_y >= 0); + intra_rd_stats.rate = best_intra_rd_stats_y.rate + best_mode_cost_y; + if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(bsize)) { + // av1_pick_uniform_tx_size_type_yrd above includes the cost of the tx_size + // in the tokenonly rate, but for intra blocks, tx_size is always coded + // (prediction granularity), so we account for it in the full rate, + // not the tokenonly rate. + best_intra_rd_stats_y.rate -= tx_size_cost(x, bsize, mbmi->tx_size); + } + + const ModeCosts *mode_costs = &x->mode_costs; + const PREDICTION_MODE mode = mbmi->mode; + if (num_planes > 1 && xd->is_chroma_ref) { + const int uv_mode_cost = + mode_costs->intra_uv_mode_cost[is_cfl_allowed(xd)][mode][mbmi->uv_mode]; + intra_rd_stats.rate += + intra_rd_stats_uv.rate + + intra_mode_info_cost_uv(cpi, x, mbmi, bsize, uv_mode_cost); + } + + // Intra block is always coded as non-skip + intra_rd_stats.skip_txfm = 0; + intra_rd_stats.dist = best_intra_rd_stats_y.dist + intra_rd_stats_uv.dist; + // Add in the cost of the no skip flag. + const int skip_ctx = av1_get_skip_txfm_context(xd); + intra_rd_stats.rate += mode_costs->skip_txfm_cost[skip_ctx][0]; + // Calculate the final RD estimate for this mode. + const int64_t this_rd = + RDCOST(x->rdmult, intra_rd_stats.rate, intra_rd_stats.dist); + // Keep record of best intra rd + if (this_rd < search_state->best_intra_rd) { + search_state->best_intra_rd = this_rd; + intra_search_state->best_intra_mode = mode; + } + + for (int i = 0; i < REFERENCE_MODES; ++i) { + search_state->best_pred_rd[i] = + AOMMIN(search_state->best_pred_rd[i], this_rd); + } + + intra_rd_stats.rdcost = this_rd; + + // Collect mode stats for multiwinner mode processing + const int txfm_search_done = 1; + store_winner_mode_stats( + &cpi->common, x, mbmi, &intra_rd_stats, &best_intra_rd_stats_y, + &intra_rd_stats_uv, best_mode_enum, NULL, bsize, intra_rd_stats.rdcost, + cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done); + if (intra_rd_stats.rdcost < search_state->best_rd) { + update_search_state(search_state, rd_cost, ctx, &intra_rd_stats, + &best_intra_rd_stats_y, &intra_rd_stats_uv, + best_mode_enum, x, txfm_search_done); + } +} + +#if !CONFIG_REALTIME_ONLY +// Prepare inter_cost and intra_cost from TPL stats, which are used as ML +// features in intra mode pruning. 
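Before the TPL helper below, here is a toy sketch of the strided window averaging it performs; the grid contents, stride, and window placement are illustrative and not part of the patch:

```c
/* Editorial sketch: mean of per-TPL-block costs over an nh x nw window in a
 * row-major grid, mirroring the accumulation in the function that follows. */
#include <stdint.h>
#include <stdio.h>

static int64_t mean_cost(const int64_t *grid, int stride, int start, int nw,
                         int nh) {
  int64_t sum = 0;
  for (int k = 0; k < nh; k++)
    for (int l = 0; l < nw; l++) sum += grid[start + k * stride + l];
  return sum / (nw * nh);  // integer average, as in the encoder code
}

int main(void) {
  // 2x4 grid of fake inter costs, stride 4; average the 2x2 window at col 1.
  const int64_t inter[8] = { 5, 7, 9, 11, 6, 8, 10, 12 };
  printf("%lld\n", (long long)mean_cost(inter, 4, 1, 2, 2));  // (7+9+8+10)/4 = 8
  return 0;
}
```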
+static AOM_INLINE void calculate_cost_from_tpl_data(
+    const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
+    int mi_col, int64_t *inter_cost, int64_t *intra_cost) {
+  const AV1_COMMON *const cm = &cpi->common;
+  // Only consider full SB.
+  const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+  const int tpl_bsize_1d = cpi->ppi->tpl_data.tpl_bsize_1d;
+  const int len = (block_size_wide[sb_size] / tpl_bsize_1d) *
+                  (block_size_high[sb_size] / tpl_bsize_1d);
+  SuperBlockEnc *sb_enc = &x->sb_enc;
+  if (sb_enc->tpl_data_count == len) {
+    const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(tpl_bsize_1d);
+    const int tpl_stride = sb_enc->tpl_stride;
+    const int tplw = mi_size_wide[tpl_bsize];
+    const int tplh = mi_size_high[tpl_bsize];
+    const int nw = mi_size_wide[bsize] / tplw;
+    const int nh = mi_size_high[bsize] / tplh;
+    if (nw >= 1 && nh >= 1) {
+      const int of_h = mi_row % mi_size_high[sb_size];
+      const int of_w = mi_col % mi_size_wide[sb_size];
+      const int start = of_h / tplh * tpl_stride + of_w / tplw;
+
+      for (int k = 0; k < nh; k++) {
+        for (int l = 0; l < nw; l++) {
+          *inter_cost += sb_enc->tpl_inter_cost[start + k * tpl_stride + l];
+          *intra_cost += sb_enc->tpl_intra_cost[start + k * tpl_stride + l];
+        }
+      }
+      *inter_cost /= nw * nh;
+      *intra_cost /= nw * nh;
+    }
+  }
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+// When the speed feature skip_intra_in_interframe > 0, enable an ML model to
+// prune the intra mode search.
+static AOM_INLINE void skip_intra_modes_in_interframe(
+    AV1_COMMON *const cm, struct macroblock *x, BLOCK_SIZE bsize,
+    InterModeSearchState *search_state, const SPEED_FEATURES *const sf,
+    int64_t inter_cost, int64_t intra_cost) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int comp_pred = search_state->best_mbmode.ref_frame[1] > INTRA_FRAME;
+  if (sf->rt_sf.prune_intra_mode_based_on_mv_range &&
+      bsize > sf->part_sf.max_intra_bsize && !comp_pred) {
+    const MV best_mv = search_state->best_mbmode.mv[0].as_mv;
+    const int mv_thresh = 16 << sf->rt_sf.prune_intra_mode_based_on_mv_range;
+    if (abs(best_mv.row) < mv_thresh && abs(best_mv.col) < mv_thresh &&
+        x->source_variance > 128) {
+      search_state->intra_search_state.skip_intra_modes = 1;
+      return;
+    }
+  }
+
+  const unsigned int src_var_thresh_intra_skip = 1;
+  const int skip_intra_in_interframe = sf->intra_sf.skip_intra_in_interframe;
+  if (!(skip_intra_in_interframe &&
+        (x->source_variance > src_var_thresh_intra_skip)))
+    return;
+
+  // Prune intra search based on the best inter mode being transform skip.
+  if ((skip_intra_in_interframe >= 2) && search_state->best_mbmode.skip_txfm) {
+    const int qindex_thresh[2] = { 200, MAXQ };
+    const int ind = (skip_intra_in_interframe >= 3) ? 1 : 0;
+    if (!have_newmv_in_inter_mode(search_state->best_mbmode.mode) &&
+        (x->qindex <= qindex_thresh[ind])) {
+      search_state->intra_search_state.skip_intra_modes = 1;
+      return;
+    } else if ((skip_intra_in_interframe >= 4) &&
+               (inter_cost < 0 || intra_cost < 0)) {
+      search_state->intra_search_state.skip_intra_modes = 1;
+      return;
+    }
+  }
+  // Use an ML model to prune the intra search.
+  if (inter_cost >= 0 && intra_cost >= 0) {
+    const NN_CONFIG *nn_config = (AOMMIN(cm->width, cm->height) <= 480)
+                                     ? &av1_intrap_nn_config
+                                     : &av1_intrap_hd_nn_config;
+    float nn_features[6];
+    float scores[2] = { 0.0f };
+
+    nn_features[0] = (float)search_state->best_mbmode.skip_txfm;
+    nn_features[1] = (float)mi_size_wide_log2[bsize];
+    nn_features[2] = (float)mi_size_high_log2[bsize];
+    nn_features[3] = (float)intra_cost;
+    nn_features[4] = (float)inter_cost;
+    const int ac_q = av1_ac_quant_QTX(x->qindex, 0, xd->bd);
+    const int ac_q_max = av1_ac_quant_QTX(255, 0, xd->bd);
+    nn_features[5] = (float)(ac_q_max / ac_q);
+
+    av1_nn_predict(nn_features, nn_config, 1, scores);
+
+    // For two parameters, the max prob returned from av1_nn_softmax equals
+    // 1.0 / (1.0 + e^(-|diff_score|)). Here, the scores are used directly to
+    // avoid calling av1_nn_softmax.
+    const float thresh[5] = { 1.4f, 1.4f, 1.4f, 1.4f, 1.4f };
+    assert(skip_intra_in_interframe <= 5);
+    if (scores[1] > scores[0] + thresh[skip_intra_in_interframe - 1]) {
+      search_state->intra_search_state.skip_intra_modes = 1;
+    }
+  }
+}
+
+static AOM_INLINE bool skip_interp_filter_search(const AV1_COMP *cpi,
+                                                 int is_single_pred) {
+  const MODE encoding_mode = cpi->oxcf.mode;
+  if (encoding_mode == REALTIME) {
+    return (cpi->common.current_frame.reference_mode == SINGLE_REFERENCE &&
+            (cpi->sf.interp_sf.skip_interp_filter_search ||
+             cpi->sf.winner_mode_sf.winner_mode_ifs));
+  } else if (encoding_mode == GOOD) {
+    // Skip interpolation filter search for single prediction modes.
+    return (cpi->sf.interp_sf.skip_interp_filter_search && is_single_pred);
+  }
+  return false;
+}
+
+static AOM_INLINE int get_block_temp_var(const AV1_COMP *cpi,
+                                         const MACROBLOCK *x,
+                                         BLOCK_SIZE bsize) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const SPEED_FEATURES *const sf = &cpi->sf;
+
+  if (sf->part_sf.partition_search_type != VAR_BASED_PARTITION ||
+      !sf->rt_sf.short_circuit_low_temp_var ||
+      !sf->rt_sf.prune_inter_modes_using_temp_var) {
+    return 0;
+  }
+
+  const int mi_row = x->e_mbd.mi_row;
+  const int mi_col = x->e_mbd.mi_col;
+  int is_low_temp_var = 0;
+
+  if (cm->seq_params->sb_size == BLOCK_64X64)
+    is_low_temp_var = av1_get_force_skip_low_temp_var_small_sb(
+        &x->part_search_info.variance_low[0], mi_row, mi_col, bsize);
+  else
+    is_low_temp_var = av1_get_force_skip_low_temp_var(
+        &x->part_search_info.variance_low[0], mi_row, mi_col, bsize);
+
+  return is_low_temp_var;
+}
+
+// TODO(chiyotsai@google.com): See the todo for av1_rd_pick_intra_mode_sb.
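The score-threshold shortcut in skip_intra_modes_in_interframe() above relies on a closed form for the two-way softmax. Here is a small standalone check of that identity; the sample scores are arbitrary and the snippet is editorial, not part of the patch:

```c
/* Editorial sketch: for two scores, the softmax winner's probability equals
 * 1 / (1 + e^(-|s1 - s0|)), so thresholding the raw score difference is
 * equivalent to thresholding the softmax output. Link with -lm. */
#include <math.h>
#include <stdio.h>

int main(void) {
  const double s0 = 0.3, s1 = 1.9;  // arbitrary raw network scores
  const double e0 = exp(s0), e1 = exp(s1);
  const double p_max = fmax(e0, e1) / (e0 + e1);           // softmax winner
  const double closed = 1.0 / (1.0 + exp(-fabs(s1 - s0))); // closed form
  printf("%f %f\n", p_max, closed);  // identical up to rounding
  return 0;
}
```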
+void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data,
+                            struct macroblock *x, struct RD_STATS *rd_cost,
+                            BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+                            int64_t best_rd_so_far) {
+  AV1_COMMON *const cm = &cpi->common;
+  const FeatureFlags *const features = &cm->features;
+  const int num_planes = av1_num_planes(cm);
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+  int i;
+  const ModeCosts *mode_costs = &x->mode_costs;
+  const int *comp_inter_cost =
+      mode_costs->comp_inter_cost[av1_get_reference_mode_context(xd)];
+
+  InterModeSearchState search_state;
+  init_inter_mode_search_state(&search_state, cpi, x, bsize, best_rd_so_far);
+  INTERINTRA_MODE interintra_modes[REF_FRAMES] = {
+    INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES,
+    INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES
+  };
+  HandleInterModeArgs args = { { NULL },
+                               { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE },
+                               { NULL },
+                               { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1,
+                                 MAX_SB_SIZE >> 1 },
+                               NULL,
+                               NULL,
+                               NULL,
+                               search_state.modelled_rd,
+                               INT_MAX,
+                               INT_MAX,
+                               search_state.simple_rd,
+                               0,
+                               false,
+                               interintra_modes,
+                               { { { 0 }, { { 0 } }, { 0 }, 0, 0, 0, 0 } },
+                               { { 0, 0 } },
+                               { 0 },
+                               0,
+                               0,
+                               -1,
+                               -1,
+                               -1,
+                               { 0 },
+                               { 0 },
+                               UINT_MAX };
+  // Currently, is_low_temp_var is used in real time encoding.
+  const int is_low_temp_var = get_block_temp_var(cpi, x, bsize);
+
+  for (i = 0; i < MODE_CTX_REF_FRAMES; ++i) args.cmp_mode[i] = -1;
+  // Indicates the appropriate number of simple translation winner modes for
+  // exhaustive motion mode evaluation
+  const int max_winner_motion_mode_cand =
+      num_winner_motion_modes[sf->winner_mode_sf.motion_mode_for_winner_cand];
+  assert(max_winner_motion_mode_cand <= MAX_WINNER_MOTION_MODES);
+  motion_mode_candidate motion_mode_cand;
+  motion_mode_best_st_candidate best_motion_mode_cands;
+  // Initializing the number of motion mode candidates to zero.
+  best_motion_mode_cands.num_motion_mode_cand = 0;
+  for (i = 0; i < MAX_WINNER_MOTION_MODES; ++i)
+    best_motion_mode_cands.motion_mode_cand[i].rd_cost = INT64_MAX;
+
+  for (i = 0; i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX;
+
+  av1_invalid_rd_stats(rd_cost);
+
+  for (i = 0; i < REF_FRAMES; ++i) {
+    x->warp_sample_info[i].num = -1;
+  }
+
+  // Ref frames that are selected by square partition blocks.
+  int picked_ref_frames_mask = 0;
+  if (sf->inter_sf.prune_ref_frame_for_rect_partitions &&
+      mbmi->partition != PARTITION_NONE) {
+    // prune_ref_frame_for_rect_partitions = 1 implies pruning for only
+    // extended partition blocks. prune_ref_frame_for_rect_partitions >= 2
+    // implies pruning for vert, horiz and extended partition blocks.
+    if ((mbmi->partition != PARTITION_VERT &&
+         mbmi->partition != PARTITION_HORZ) ||
+        sf->inter_sf.prune_ref_frame_for_rect_partitions >= 2) {
+      picked_ref_frames_mask =
+          fetch_picked_ref_frames_mask(x, bsize, cm->seq_params->mib_size);
+    }
+  }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, set_params_rd_pick_inter_mode_time);
+#endif
+  // Skip ref frames that were never selected by square blocks.
+  const int skip_ref_frame_mask =
+      picked_ref_frames_mask ?
~picked_ref_frames_mask : 0; + mode_skip_mask_t mode_skip_mask; + unsigned int ref_costs_single[REF_FRAMES]; + unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES]; + struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]; + // init params, set frame modes, speed features + set_params_rd_pick_inter_mode(cpi, x, &args, bsize, &mode_skip_mask, + skip_ref_frame_mask, ref_costs_single, + ref_costs_comp, yv12_mb); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, set_params_rd_pick_inter_mode_time); +#endif + + int64_t best_est_rd = INT64_MAX; + const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize]; + // If do_tx_search is 0, only estimated RD should be computed. + // If do_tx_search is 1, all modes have TX search performed. + const int do_tx_search = + !((sf->inter_sf.inter_mode_rd_model_estimation == 1 && md->ready) || + (sf->inter_sf.inter_mode_rd_model_estimation == 2 && + num_pels_log2_lookup[bsize] > 8)); + InterModesInfo *inter_modes_info = x->inter_modes_info; + inter_modes_info->num = 0; + + // Temporary buffers used by handle_inter_mode(). + uint8_t *const tmp_buf = get_buf_by_bd(xd, x->tmp_pred_bufs[0]); + + // The best RD found for the reference frame, among single reference modes. + // Note that the 0-th element will contain a cut-off that is later used + // to determine if we should skip a compound mode. + int64_t ref_frame_rd[REF_FRAMES] = { INT64_MAX, INT64_MAX, INT64_MAX, + INT64_MAX, INT64_MAX, INT64_MAX, + INT64_MAX, INT64_MAX }; + + // Prepared stats used later to check if we could skip intra mode eval. + int64_t inter_cost = -1; + int64_t intra_cost = -1; + // Need to tweak the threshold for hdres speed 0 & 1. + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + // Obtain the relevant tpl stats for pruning inter modes + PruneInfoFromTpl inter_cost_info_from_tpl; +#if !CONFIG_REALTIME_ONLY + if (sf->inter_sf.prune_inter_modes_based_on_tpl) { + // x->tpl_keep_ref_frame[id] = 1 => no pruning in + // prune_ref_by_selective_ref_frame() + // x->tpl_keep_ref_frame[id] = 0 => ref frame can be pruned in + // prune_ref_by_selective_ref_frame() + // Populating valid_refs[idx] = 1 ensures that + // 'inter_cost_info_from_tpl.best_inter_cost' does not correspond to a + // pruned ref frame. + int valid_refs[INTER_REFS_PER_FRAME]; + for (MV_REFERENCE_FRAME frame = LAST_FRAME; frame < REF_FRAMES; frame++) { + const MV_REFERENCE_FRAME refs[2] = { frame, NONE_FRAME }; + valid_refs[frame - 1] = + x->tpl_keep_ref_frame[frame] || + !prune_ref_by_selective_ref_frame( + cpi, x, refs, cm->cur_frame->ref_display_order_hint); + } + av1_zero(inter_cost_info_from_tpl); + get_block_level_tpl_stats(cpi, bsize, mi_row, mi_col, valid_refs, + &inter_cost_info_from_tpl); + } + + const int do_pruning = + (AOMMIN(cm->width, cm->height) > 480 && cpi->speed <= 1) ? 0 : 1; + if (do_pruning && sf->intra_sf.skip_intra_in_interframe && + cpi->oxcf.algo_cfg.enable_tpl_model) + calculate_cost_from_tpl_data(cpi, x, bsize, mi_row, mi_col, &inter_cost, + &intra_cost); +#endif // !CONFIG_REALTIME_ONLY + + // Initialize best mode stats for winner mode processing. 
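The do_tx_search flag above defers the expensive transform search to a later pass over the most promising candidates. Before the winner-mode initialization below, here is a generic sketch of that estimate-then-refine pattern; the cost functions, candidate count, and names are toy stand-ins, not the encoder's:

```c
/* Editorial sketch: rank candidates by a cheap estimated cost, then run the
 * expensive exact evaluation only on the best few survivors. */
#include <stdint.h>
#include <stdio.h>

#define NUM_CANDS 6
#define KEEP 2

static int64_t estimated_cost(int mode) { return 100 - 7 * mode; }  // cheap
static int64_t exact_cost(int mode) { return 95 - 6 * mode; }       // expensive

int main(void) {
  // Pass 1: cheap estimates for every candidate.
  int order[NUM_CANDS];
  int64_t est[NUM_CANDS];
  for (int m = 0; m < NUM_CANDS; m++) {
    order[m] = m;
    est[m] = estimated_cost(m);
  }
  // Sort candidate indices by estimated cost (tiny insertion sort).
  for (int i = 1; i < NUM_CANDS; i++)
    for (int j = i; j > 0 && est[order[j]] < est[order[j - 1]]; j--) {
      const int t = order[j];
      order[j] = order[j - 1];
      order[j - 1] = t;
    }
  // Pass 2: exact evaluation on the KEEP best estimates only.
  int best_mode = -1;
  int64_t best = INT64_MAX;
  for (int i = 0; i < KEEP; i++) {
    const int64_t c = exact_cost(order[i]);
    if (c < best) {
      best = c;
      best_mode = order[i];
    }
  }
  printf("best mode %d, cost %lld\n", best_mode, (long long)best);  // 5, 65
  return 0;
}
```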
+ const int max_winner_mode_count = + winner_mode_count_allowed[sf->winner_mode_sf.multi_winner_mode_type]; + zero_winner_mode_stats(bsize, max_winner_mode_count, x->winner_mode_stats); + x->winner_mode_count = 0; + store_winner_mode_stats(&cpi->common, x, mbmi, NULL, NULL, NULL, THR_INVALID, + NULL, bsize, best_rd_so_far, + sf->winner_mode_sf.multi_winner_mode_type, 0); + + int mode_thresh_mul_fact = (1 << MODE_THRESH_QBITS); + if (sf->inter_sf.prune_inter_modes_if_skippable) { + // Higher multiplication factor values for lower quantizers. + mode_thresh_mul_fact = mode_threshold_mul_factor[x->qindex]; + } + + // Initialize arguments for mode loop speed features + InterModeSFArgs sf_args = { &args.skip_motion_mode, + &mode_skip_mask, + &search_state, + skip_ref_frame_mask, + 0, + mode_thresh_mul_fact, + 0, + 0 }; + int64_t best_inter_yrd = INT64_MAX; + + // This is the main loop of this function. It loops over all possible inter + // modes and calls handle_inter_mode() to compute the RD for each. + // Here midx is just an iterator index that should not be used by itself + // except to keep track of the number of modes searched. It should be used + // with av1_default_mode_order to get the enum that defines the mode, which + // can be used with av1_mode_defs to get the prediction mode and the ref + // frames. + // TODO(yunqing, any): Setting mode_start and mode_end outside for-loop brings + // good speedup for real time case. If we decide to use compound mode in real + // time, maybe we can modify av1_default_mode_order table. + THR_MODES mode_start = THR_INTER_MODE_START; + THR_MODES mode_end = THR_INTER_MODE_END; + const CurrentFrame *const current_frame = &cm->current_frame; + if (current_frame->reference_mode == SINGLE_REFERENCE) { + mode_start = SINGLE_REF_MODE_START; + mode_end = SINGLE_REF_MODE_END; + } + + for (THR_MODES midx = mode_start; midx < mode_end; ++midx) { + // Get the actual prediction mode we are trying in this iteration + const THR_MODES mode_enum = av1_default_mode_order[midx]; + const MODE_DEFINITION *mode_def = &av1_mode_defs[mode_enum]; + const PREDICTION_MODE this_mode = mode_def->mode; + const MV_REFERENCE_FRAME *ref_frames = mode_def->ref_frame; + + const MV_REFERENCE_FRAME ref_frame = ref_frames[0]; + const MV_REFERENCE_FRAME second_ref_frame = ref_frames[1]; + const int is_single_pred = + ref_frame > INTRA_FRAME && second_ref_frame == NONE_FRAME; + const int comp_pred = second_ref_frame > INTRA_FRAME; + + init_mbmi(mbmi, this_mode, ref_frames, cm); + + txfm_info->skip_txfm = 0; + sf_args.num_single_modes_processed += is_single_pred; + set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, skip_inter_mode_time); +#endif + // Apply speed features to decide if this inter mode can be skipped + const int is_skip_inter_mode = skip_inter_mode( + cpi, x, bsize, ref_frame_rd, midx, &sf_args, is_low_temp_var); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, skip_inter_mode_time); +#endif + if (is_skip_inter_mode) continue; + + // Select prediction reference frames. 
+ for (i = 0; i < num_planes; i++) { + xd->plane[i].pre[0] = yv12_mb[ref_frame][i]; + if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i]; + } + + mbmi->angle_delta[PLANE_TYPE_Y] = 0; + mbmi->angle_delta[PLANE_TYPE_UV] = 0; + mbmi->filter_intra_mode_info.use_filter_intra = 0; + mbmi->ref_mv_idx = 0; + + const int64_t ref_best_rd = search_state.best_rd; + RD_STATS rd_stats, rd_stats_y, rd_stats_uv; + av1_init_rd_stats(&rd_stats); + + const int ref_frame_cost = comp_pred + ? ref_costs_comp[ref_frame][second_ref_frame] + : ref_costs_single[ref_frame]; + const int compmode_cost = + is_comp_ref_allowed(mbmi->bsize) ? comp_inter_cost[comp_pred] : 0; + const int real_compmode_cost = + cm->current_frame.reference_mode == REFERENCE_MODE_SELECT + ? compmode_cost + : 0; + // Point to variables that are maintained between loop iterations + args.single_newmv = search_state.single_newmv; + args.single_newmv_rate = search_state.single_newmv_rate; + args.single_newmv_valid = search_state.single_newmv_valid; + args.single_comp_cost = real_compmode_cost; + args.ref_frame_cost = ref_frame_cost; + args.best_pred_sse = search_state.best_pred_sse; + args.skip_ifs = skip_interp_filter_search(cpi, is_single_pred); + + int64_t skip_rd[2] = { search_state.best_skip_rd[0], + search_state.best_skip_rd[1] }; + int64_t this_yrd = INT64_MAX; +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, handle_inter_mode_time); +#endif + int64_t this_rd = handle_inter_mode( + cpi, tile_data, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, &args, + ref_best_rd, tmp_buf, &x->comp_rd_buffer, &best_est_rd, do_tx_search, + inter_modes_info, &motion_mode_cand, skip_rd, &inter_cost_info_from_tpl, + &this_yrd); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, handle_inter_mode_time); +#endif + if (current_frame->reference_mode != SINGLE_REFERENCE) { + if (!args.skip_ifs && + sf->inter_sf.prune_comp_search_by_single_result > 0 && + is_inter_singleref_mode(this_mode)) { + collect_single_states(x, &search_state, mbmi); + } + + if (sf->inter_sf.prune_comp_using_best_single_mode_ref > 0 && + is_inter_singleref_mode(this_mode)) + update_best_single_mode(&search_state, this_mode, ref_frame, this_rd); + } + + if (this_rd == INT64_MAX) continue; + + if (mbmi->skip_txfm) { + rd_stats_y.rate = 0; + rd_stats_uv.rate = 0; + } + + if (sf->inter_sf.prune_compound_using_single_ref && is_single_pred && + this_rd < ref_frame_rd[ref_frame]) { + ref_frame_rd[ref_frame] = this_rd; + } + + // Did this mode help, i.e., is it the new best mode + if (this_rd < search_state.best_rd) { + assert(IMPLIES(comp_pred, + cm->current_frame.reference_mode != SINGLE_REFERENCE)); + search_state.best_pred_sse = x->pred_sse[ref_frame]; + best_inter_yrd = this_yrd; + update_search_state(&search_state, rd_cost, ctx, &rd_stats, &rd_stats_y, + &rd_stats_uv, mode_enum, x, do_tx_search); + if (do_tx_search) search_state.best_skip_rd[0] = skip_rd[0]; + // skip_rd[0] is the best total rd for a skip mode so far. + // skip_rd[1] is the best total rd for a skip mode so far in luma. + // When do_tx_search = 1, both skip_rd[0] and skip_rd[1] are updated. + // When do_tx_search = 0, skip_rd[1] is updated. 
+      search_state.best_skip_rd[1] = skip_rd[1];
+    }
+    if (sf->winner_mode_sf.motion_mode_for_winner_cand) {
+      // Add this mode to the motion mode candidate list for motion mode
+      // search if using the motion_mode_for_winner_cand speed feature
+      handle_winner_cand(mbmi, &best_motion_mode_cands,
+                         max_winner_motion_mode_cand, this_rd,
+                         &motion_mode_cand, args.skip_motion_mode);
+    }
+
+    /* keep record of best compound/single-only prediction */
+    record_best_compound(cm->current_frame.reference_mode, &rd_stats, comp_pred,
+                         x->rdmult, &search_state, compmode_cost);
+  }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, evaluate_motion_mode_for_winner_candidates_time);
+#endif
+  if (sf->winner_mode_sf.motion_mode_for_winner_cand) {
+    // For the single ref winner candidates, evaluate other motion modes
+    // (non-simple-translation).
+    evaluate_motion_mode_for_winner_candidates(
+        cpi, x, rd_cost, &args, tile_data, ctx, yv12_mb,
+        &best_motion_mode_cands, do_tx_search, bsize, &best_est_rd,
+        &search_state, &best_inter_yrd);
+  }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, evaluate_motion_mode_for_winner_candidates_time);
+#endif
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, do_tx_search_time);
+#endif
+  if (do_tx_search != 1) {
+    // A full tx search has not yet been done; do tx search for the
+    // top mode candidates.
+    tx_search_best_inter_candidates(cpi, tile_data, x, best_rd_so_far, bsize,
+                                    yv12_mb, mi_row, mi_col, &search_state,
+                                    rd_cost, ctx, &best_inter_yrd);
+  }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, do_tx_search_time);
+#endif
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, handle_intra_mode_time);
+#endif
+  // Gate intra mode evaluation when the best inter mode is skip, except when
+  // the source variance is extremely low; evaluation is also gated on the max
+  // intra bsize.
+  skip_intra_modes_in_interframe(cm, x, bsize, &search_state, sf, inter_cost,
+                                 intra_cost);
+
+  const unsigned int intra_ref_frame_cost = ref_costs_single[INTRA_FRAME];
+  search_intra_modes_in_interframe(&search_state, cpi, x, rd_cost, bsize, ctx,
+                                   &sf_args, intra_ref_frame_cost,
+                                   best_inter_yrd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, handle_intra_mode_time);
+#endif
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, refine_winner_mode_tx_time);
+#endif
+  int winner_mode_count =
+      sf->winner_mode_sf.multi_winner_mode_type ? x->winner_mode_count : 1;
+  // In effect only when fast tx search speed features are enabled.
+  refine_winner_mode_tx(
+      cpi, x, rd_cost, bsize, ctx, &search_state.best_mode_index,
+      &search_state.best_mbmode, yv12_mb, search_state.best_rate_y,
+      search_state.best_rate_uv, &search_state.best_skip2, winner_mode_count);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, refine_winner_mode_tx_time);
+#endif
+
+  // Initialize default mode evaluation params
+  set_mode_eval_params(cpi, x, DEFAULT_EVAL);
+
+  // Only try palette mode when the best mode so far is an intra mode.
+ const int try_palette = + cpi->oxcf.tool_cfg.enable_palette && + av1_allow_palette(features->allow_screen_content_tools, mbmi->bsize) && + !is_inter_mode(search_state.best_mbmode.mode) && rd_cost->rate != INT_MAX; + RD_STATS this_rd_cost; + int this_skippable = 0; + if (try_palette) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, av1_search_palette_mode_time); +#endif + this_skippable = av1_search_palette_mode( + &search_state.intra_search_state, cpi, x, bsize, intra_ref_frame_cost, + ctx, &this_rd_cost, search_state.best_rd); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, av1_search_palette_mode_time); +#endif + if (this_rd_cost.rdcost < search_state.best_rd) { + search_state.best_mode_index = THR_DC; + mbmi->mv[0].as_int = 0; + rd_cost->rate = this_rd_cost.rate; + rd_cost->dist = this_rd_cost.dist; + rd_cost->rdcost = this_rd_cost.rdcost; + search_state.best_rd = rd_cost->rdcost; + search_state.best_mbmode = *mbmi; + search_state.best_skip2 = 0; + search_state.best_mode_skippable = this_skippable; + memcpy(ctx->blk_skip, txfm_info->blk_skip, + sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk); + av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); + } + } + + search_state.best_mbmode.skip_mode = 0; + if (cm->current_frame.skip_mode_info.skip_mode_flag && + is_comp_ref_allowed(bsize)) { + const struct segmentation *const seg = &cm->seg; + unsigned char segment_id = mbmi->segment_id; + if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) { + rd_pick_skip_mode(rd_cost, &search_state, cpi, x, bsize, yv12_mb); + } + } + + // Make sure that the ref_mv_idx is only nonzero when we're + // using a mode which can support ref_mv_idx + if (search_state.best_mbmode.ref_mv_idx != 0 && + !(search_state.best_mbmode.mode == NEWMV || + search_state.best_mbmode.mode == NEW_NEWMV || + have_nearmv_in_inter_mode(search_state.best_mbmode.mode))) { + search_state.best_mbmode.ref_mv_idx = 0; + } + + if (search_state.best_mode_index == THR_INVALID || + search_state.best_rd >= best_rd_so_far) { + rd_cost->rate = INT_MAX; + rd_cost->rdcost = INT64_MAX; + return; + } + + const InterpFilter interp_filter = features->interp_filter; + assert((interp_filter == SWITCHABLE) || + (interp_filter == + search_state.best_mbmode.interp_filters.as_filters.y_filter) || + !is_inter_block(&search_state.best_mbmode)); + assert((interp_filter == SWITCHABLE) || + (interp_filter == + search_state.best_mbmode.interp_filters.as_filters.x_filter) || + !is_inter_block(&search_state.best_mbmode)); + + if (!cpi->rc.is_src_frame_alt_ref && sf->inter_sf.adaptive_rd_thresh) { + av1_update_rd_thresh_fact( + cm, x->thresh_freq_fact, sf->inter_sf.adaptive_rd_thresh, bsize, + search_state.best_mode_index, mode_start, mode_end, THR_DC, MAX_MODES); + } + + // macroblock modes + *mbmi = search_state.best_mbmode; + txfm_info->skip_txfm |= search_state.best_skip2; + + // Note: this section is needed since the mode may have been forced to + // GLOBALMV by the all-zero mode handling of ref-mv. 
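The GLOBALMV fix-up just below compares packed filter pairs through a single integer view (interp_filters.as_int). Here is a sketch of that union idiom under an assumed field layout; the types and widths are stand-ins, not libaom's definitions:

```c
/* Editorial sketch: one integer view for cheap equality checks, a struct
 * view for per-axis access, as used by the as_int compare below. */
#include <stdint.h>
#include <stdio.h>

typedef union {
  uint32_t as_int;
  struct {
    uint16_t y_filter;  // vertical filter
    uint16_t x_filter;  // horizontal filter
  } as_filters;
} int_filters;

// Sets the same filter on both axes, analogous to a broadcast helper.
static int_filters broadcast(uint16_t f) {
  int_filters u;
  u.as_filters.y_filter = f;
  u.as_filters.x_filter = f;
  return u;
}

int main(void) {
  const int_filters a = broadcast(2), b = broadcast(2);
  // A single integer compare covers both axes at once.
  printf("%d\n", a.as_int == b.as_int);  // 1
  return 0;
}
```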
+ if (mbmi->mode == GLOBALMV || mbmi->mode == GLOBAL_GLOBALMV) { + // Correct the interp filters for GLOBALMV + if (is_nontrans_global_motion(xd, xd->mi[0])) { + int_interpfilters filters = + av1_broadcast_interp_filter(av1_unswitchable_filter(interp_filter)); + assert(mbmi->interp_filters.as_int == filters.as_int); + (void)filters; + } + } + + txfm_info->skip_txfm |= search_state.best_mode_skippable; + + assert(search_state.best_mode_index != THR_INVALID); + +#if CONFIG_INTERNAL_STATS + store_coding_context(x, ctx, search_state.best_mode_index, + search_state.best_mode_skippable); +#else + store_coding_context(x, ctx, search_state.best_mode_skippable); +#endif // CONFIG_INTERNAL_STATS + + if (mbmi->palette_mode_info.palette_size[1] > 0) { + assert(try_palette); + av1_restore_uv_color_map(cpi, x); + } +} + +void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, + TileDataEnc *tile_data, MACROBLOCK *x, + int mi_row, int mi_col, + RD_STATS *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, + int64_t best_rd_so_far) { + const AV1_COMMON *const cm = &cpi->common; + const FeatureFlags *const features = &cm->features; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + unsigned char segment_id = mbmi->segment_id; + const int comp_pred = 0; + int i; + unsigned int ref_costs_single[REF_FRAMES]; + unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES]; + const ModeCosts *mode_costs = &x->mode_costs; + const int *comp_inter_cost = + mode_costs->comp_inter_cost[av1_get_reference_mode_context(xd)]; + InterpFilter best_filter = SWITCHABLE; + int64_t this_rd = INT64_MAX; + int rate2 = 0; + const int64_t distortion2 = 0; + (void)mi_row; + (void)mi_col; + (void)tile_data; + + av1_collect_neighbors_ref_counts(xd); + + estimate_ref_frame_costs(cm, xd, mode_costs, segment_id, ref_costs_single, + ref_costs_comp); + + for (i = 0; i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX; + for (i = LAST_FRAME; i < REF_FRAMES; ++i) x->pred_mv_sad[i] = INT_MAX; + + rd_cost->rate = INT_MAX; + + assert(segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)); + + mbmi->palette_mode_info.palette_size[0] = 0; + mbmi->palette_mode_info.palette_size[1] = 0; + mbmi->filter_intra_mode_info.use_filter_intra = 0; + mbmi->mode = GLOBALMV; + mbmi->motion_mode = SIMPLE_TRANSLATION; + mbmi->uv_mode = UV_DC_PRED; + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) + mbmi->ref_frame[0] = get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME); + else + mbmi->ref_frame[0] = LAST_FRAME; + mbmi->ref_frame[1] = NONE_FRAME; + mbmi->mv[0].as_int = + gm_get_motion_vector(&cm->global_motion[mbmi->ref_frame[0]], + features->allow_high_precision_mv, bsize, mi_col, + mi_row, features->cur_frame_force_integer_mv) + .as_int; + mbmi->tx_size = max_txsize_lookup[bsize]; + x->txfm_search_info.skip_txfm = 1; + + mbmi->ref_mv_idx = 0; + + mbmi->motion_mode = SIMPLE_TRANSLATION; + av1_count_overlappable_neighbors(cm, xd); + if (is_motion_variation_allowed_bsize(bsize) && !has_second_ref(mbmi)) { + int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; + mbmi->num_proj_ref = av1_findSamples(cm, xd, pts, pts_inref); + // Select the samples according to motion vector difference + if (mbmi->num_proj_ref > 1) { + mbmi->num_proj_ref = av1_selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref, + mbmi->num_proj_ref, bsize); + } + } + + const InterpFilter interp_filter = features->interp_filter; + set_default_interp_filters(mbmi, interp_filter); + + if (interp_filter != SWITCHABLE) { + best_filter = interp_filter; + } else { 
+ best_filter = EIGHTTAP_REGULAR; + if (av1_is_interp_needed(xd)) { + int rs; + int best_rs = INT_MAX; + for (i = 0; i < SWITCHABLE_FILTERS; ++i) { + mbmi->interp_filters = av1_broadcast_interp_filter(i); + rs = av1_get_switchable_rate(x, xd, interp_filter, + cm->seq_params->enable_dual_filter); + if (rs < best_rs) { + best_rs = rs; + best_filter = mbmi->interp_filters.as_filters.y_filter; + } + } + } + } + // Set the appropriate filter + mbmi->interp_filters = av1_broadcast_interp_filter(best_filter); + rate2 += av1_get_switchable_rate(x, xd, interp_filter, + cm->seq_params->enable_dual_filter); + + if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) + rate2 += comp_inter_cost[comp_pred]; + + // Estimate the reference frame signaling cost and add it + // to the rolling cost variable. + rate2 += ref_costs_single[LAST_FRAME]; + this_rd = RDCOST(x->rdmult, rate2, distortion2); + + rd_cost->rate = rate2; + rd_cost->dist = distortion2; + rd_cost->rdcost = this_rd; + + if (this_rd >= best_rd_so_far) { + rd_cost->rate = INT_MAX; + rd_cost->rdcost = INT64_MAX; + return; + } + + assert((interp_filter == SWITCHABLE) || + (interp_filter == mbmi->interp_filters.as_filters.y_filter)); + + if (cpi->sf.inter_sf.adaptive_rd_thresh) { + av1_update_rd_thresh_fact(cm, x->thresh_freq_fact, + cpi->sf.inter_sf.adaptive_rd_thresh, bsize, + THR_GLOBALMV, THR_INTER_MODE_START, + THR_INTER_MODE_END, THR_DC, MAX_MODES); + } + +#if CONFIG_INTERNAL_STATS + store_coding_context(x, ctx, THR_GLOBALMV, 0); +#else + store_coding_context(x, ctx, 0); +#endif // CONFIG_INTERNAL_STATS +} + +/*!\cond */ +struct calc_target_weighted_pred_ctxt { + const OBMCBuffer *obmc_buffer; + const uint8_t *tmp; + int tmp_stride; + int overlap; +}; +/*!\endcond */ + +static INLINE void calc_target_weighted_pred_above( + MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *nb_mi, void *fun_ctxt, const int num_planes) { + (void)nb_mi; + (void)num_planes; + (void)rel_mi_row; + (void)dir; + + struct calc_target_weighted_pred_ctxt *ctxt = + (struct calc_target_weighted_pred_ctxt *)fun_ctxt; + + const int bw = xd->width << MI_SIZE_LOG2; + const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap); + + int32_t *wsrc = ctxt->obmc_buffer->wsrc + (rel_mi_col * MI_SIZE); + int32_t *mask = ctxt->obmc_buffer->mask + (rel_mi_col * MI_SIZE); + const uint8_t *tmp = ctxt->tmp + rel_mi_col * MI_SIZE; + const int is_hbd = is_cur_buf_hbd(xd); + + if (!is_hbd) { + for (int row = 0; row < ctxt->overlap; ++row) { + const uint8_t m0 = mask1d[row]; + const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0; + for (int col = 0; col < op_mi_size * MI_SIZE; ++col) { + wsrc[col] = m1 * tmp[col]; + mask[col] = m0; + } + wsrc += bw; + mask += bw; + tmp += ctxt->tmp_stride; + } + } else { + const uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp); + + for (int row = 0; row < ctxt->overlap; ++row) { + const uint8_t m0 = mask1d[row]; + const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0; + for (int col = 0; col < op_mi_size * MI_SIZE; ++col) { + wsrc[col] = m1 * tmp16[col]; + mask[col] = m0; + } + wsrc += bw; + mask += bw; + tmp16 += ctxt->tmp_stride; + } + } +} + +static INLINE void calc_target_weighted_pred_left( + MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *nb_mi, void *fun_ctxt, const int num_planes) { + (void)nb_mi; + (void)num_planes; + (void)rel_mi_col; + (void)dir; + + struct calc_target_weighted_pred_ctxt *ctxt = + (struct calc_target_weighted_pred_ctxt *)fun_ctxt; + + const int bw 
= xd->width << MI_SIZE_LOG2;
+  const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap);
+
+  int32_t *wsrc = ctxt->obmc_buffer->wsrc + (rel_mi_row * MI_SIZE * bw);
+  int32_t *mask = ctxt->obmc_buffer->mask + (rel_mi_row * MI_SIZE * bw);
+  const uint8_t *tmp = ctxt->tmp + (rel_mi_row * MI_SIZE * ctxt->tmp_stride);
+  const int is_hbd = is_cur_buf_hbd(xd);
+
+  if (!is_hbd) {
+    for (int row = 0; row < op_mi_size * MI_SIZE; ++row) {
+      for (int col = 0; col < ctxt->overlap; ++col) {
+        const uint8_t m0 = mask1d[col];
+        const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+        wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 +
+                    (tmp[col] << AOM_BLEND_A64_ROUND_BITS) * m1;
+        mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0;
+      }
+      wsrc += bw;
+      mask += bw;
+      tmp += ctxt->tmp_stride;
+    }
+  } else {
+    const uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp);
+
+    for (int row = 0; row < op_mi_size * MI_SIZE; ++row) {
+      for (int col = 0; col < ctxt->overlap; ++col) {
+        const uint8_t m0 = mask1d[col];
+        const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+        wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 +
+                    (tmp16[col] << AOM_BLEND_A64_ROUND_BITS) * m1;
+        mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0;
+      }
+      wsrc += bw;
+      mask += bw;
+      tmp16 += ctxt->tmp_stride;
+    }
+  }
+}
+
+// This function has a structure similar to av1_build_obmc_inter_prediction.
+//
+// The OBMC predictor is computed as:
+//
+//  PObmc(x,y) =
+//    AOM_BLEND_A64(Mh(x),
+//                  AOM_BLEND_A64(Mv(y), P(x,y), PAbove(x,y)),
+//                  PLeft(x, y))
+//
+// Scaling up by AOM_BLEND_A64_MAX_ALPHA ** 2 and omitting the intermediate
+// rounding, this can be written as:
+//
+//  AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA * Pobmc(x,y) =
+//    Mh(x) * Mv(y) * P(x,y) +
+//      Mh(x) * Cv(y) * Pabove(x,y) +
+//      AOM_BLEND_A64_MAX_ALPHA * Ch(x) * PLeft(x, y)
+//
+// Where:
+//
+//  Cv(y) = AOM_BLEND_A64_MAX_ALPHA - Mv(y)
+//  Ch(x) = AOM_BLEND_A64_MAX_ALPHA - Mh(x)
+//
+// This function computes 'wsrc' and 'mask' as:
+//
+//  wsrc(x, y) =
+//    AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA * src(x, y) -
+//      Mh(x) * Cv(y) * Pabove(x,y) -
+//      AOM_BLEND_A64_MAX_ALPHA * Ch(x) * PLeft(x, y)
+//
+//  mask(x, y) = Mh(x) * Mv(y)
+//
+// These can then be used to efficiently approximate the error for any
+// predictor P in the context of the provided neighbouring predictors by
+// computing:
+//
+//  error(x, y) =
+//    wsrc(x, y) - mask(x, y) * P(x, y) / (AOM_BLEND_A64_MAX_ALPHA ** 2)
+//
+static AOM_INLINE void calc_target_weighted_pred(
+    const AV1_COMMON *cm, const MACROBLOCK *x, const MACROBLOCKD *xd,
+    const uint8_t *above, int above_stride, const uint8_t *left,
+    int left_stride) {
+  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
+  const int bw = xd->width << MI_SIZE_LOG2;
+  const int bh = xd->height << MI_SIZE_LOG2;
+  const OBMCBuffer *obmc_buffer = &x->obmc_buffer;
+  int32_t *mask_buf = obmc_buffer->mask;
+  int32_t *wsrc_buf = obmc_buffer->wsrc;
+
+  const int is_hbd = is_cur_buf_hbd(xd);
+  const int src_scale = AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA;
+
+  // plane 0 should not be sub-sampled
+  assert(xd->plane[0].subsampling_x == 0);
+  assert(xd->plane[0].subsampling_y == 0);
+
+  av1_zero_array(wsrc_buf, bw * bh);
+  for (int i = 0; i < bw * bh; ++i) mask_buf[i] = AOM_BLEND_A64_MAX_ALPHA;
+
+  // handle above row
+  if (xd->up_available) {
+    const int overlap =
+        AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1;
+    struct calc_target_weighted_pred_ctxt ctxt = { obmc_buffer, above,
+                                                   above_stride, overlap };
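To make the wsrc/mask identity above concrete, here is a standalone check that runs the two blending passes for a single pixel. The constants mirror AOM_BLEND_A64_MAX_ALPHA = 64 and AOM_BLEND_A64_ROUND_BITS = 6; the sample values are arbitrary, and the snippet is editorial, not part of the patch:

```c
/* Editorial sketch: verifies for one pixel that the above/left passes yield
 * wsrc = 64^2*src - Mh*Cv*Pabove - 64*Ch*PLeft and mask = Mh*Mv. */
#include <assert.h>
#include <stdio.h>

int main(void) {
  const int MAX = 64, ROUND = 6;
  const int Mv = 40, Mh = 48, Pabove = 100, PLeft = 200, src = 150;
  // Above pass: wsrc accumulates Cv*Pabove, mask holds Mv.
  int wsrc = (MAX - Mv) * Pabove, mask = Mv;
  // Intermediate scaling by MAX.
  wsrc *= MAX;
  mask *= MAX;
  // Left pass: fold Mh into both accumulators and blend in Ch*PLeft.
  wsrc = (wsrc >> ROUND) * Mh + (PLeft << ROUND) * (MAX - Mh);
  mask = (mask >> ROUND) * Mh;
  // Final step: subtract from the scaled source.
  wsrc = MAX * MAX * src - wsrc;
  assert(wsrc == MAX * MAX * src - Mh * (MAX - Mv) * Pabove -
                     MAX * (MAX - Mh) * PLeft);
  assert(mask == Mh * Mv);
  printf("wsrc=%d mask=%d\n", wsrc, mask);  // wsrc=294400 mask=1920
  return 0;
}
```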
+    foreach_overlappable_nb_above(cm, (MACROBLOCKD *)xd,
+                                  max_neighbor_obmc[mi_size_wide_log2[bsize]],
+                                  calc_target_weighted_pred_above, &ctxt);
+  }
+
+  for (int i = 0; i < bw * bh; ++i) {
+    wsrc_buf[i] *= AOM_BLEND_A64_MAX_ALPHA;
+    mask_buf[i] *= AOM_BLEND_A64_MAX_ALPHA;
+  }
+
+  // handle left column
+  if (xd->left_available) {
+    const int overlap =
+        AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1;
+    struct calc_target_weighted_pred_ctxt ctxt = { obmc_buffer, left,
+                                                   left_stride, overlap };
+    foreach_overlappable_nb_left(cm, (MACROBLOCKD *)xd,
+                                 max_neighbor_obmc[mi_size_high_log2[bsize]],
+                                 calc_target_weighted_pred_left, &ctxt);
+  }
+
+  if (!is_hbd) {
+    const uint8_t *src = x->plane[0].src.buf;
+
+    for (int row = 0; row < bh; ++row) {
+      for (int col = 0; col < bw; ++col) {
+        wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col];
+      }
+      wsrc_buf += bw;
+      src += x->plane[0].src.stride;
+    }
+  } else {
+    const uint16_t *src = CONVERT_TO_SHORTPTR(x->plane[0].src.buf);
+
+    for (int row = 0; row < bh; ++row) {
+      for (int col = 0; col < bw; ++col) {
+        wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col];
+      }
+      wsrc_buf += bw;
+      src += x->plane[0].src.stride;
+    }
+  }
+}
diff --git a/third_party/aom/av1/encoder/rdopt.h b/third_party/aom/av1/encoder/rdopt.h
new file mode 100644
index 0000000000..efb797e5b5
--- /dev/null
+++ b/third_party/aom/av1/encoder/rdopt.h
@@ -0,0 +1,327 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RDOPT_H_
+#define AOM_AV1_ENCODER_RDOPT_H_
+
+#include <stdbool.h>
+
+#include "av1/common/blockd.h"
+#include "av1/common/txb_common.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/rdopt_utils.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define COMP_TYPE_RD_THRESH_SCALE 11
+#define COMP_TYPE_RD_THRESH_SHIFT 4
+#define MAX_WINNER_MOTION_MODES 10
+
+struct TileInfo;
+struct macroblock;
+struct RD_STATS;
+
+/*!\brief AV1 intra mode selection for intra frames.
+ *
+ * \ingroup intra_mode_search
+ * \callgraph
+ * Top level function for rd-based intra mode selection during intra frame
+ * encoding. This function will first search for the best luma prediction by
+ * calling av1_rd_pick_intra_sby_mode, then it searches for chroma prediction
+ * with av1_rd_pick_intra_sbuv_mode. If applicable, this function ends the
+ * search with an evaluation for intrabc.
+ *
+ * \param[in]    cpi            Top-level encoder structure.
+ * \param[in]    x              Pointer to structure holding all the data for
+                                the current macroblock.
+ * \param[in]    rd_cost        Struct to keep track of the RD information.
+ * \param[in]    bsize          Current block size.
+ * \param[in]    ctx            Structure to hold snapshot of coding context
+                                during the mode picking process.
+ * \param[in]    best_rd        Best RD seen for this block so far.
+ *
+ * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * is modified to store information about the best mode computed in this
+ * function. The rd_cost struct is also updated with the RD stats
+ * corresponding to the best mode found.
+ */
+void av1_rd_pick_intra_mode_sb(const struct AV1_COMP *cpi, struct macroblock *x,
+                               struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
+                               PICK_MODE_CONTEXT *ctx, int64_t best_rd);
+
+/*!\brief AV1 inter mode selection.
+ *
+ * \ingroup inter_mode_search
+ * \callgraph
+ * Top level function for inter mode selection. This function will loop over
+ * all possible inter modes and select the best one for the current block by
+ * computing the RD cost. The mode search and RD are computed in
+ * handle_inter_mode(), which is called from this function within the main
+ * loop.
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    tile_data      Pointer to struct holding adaptive
+                                data/contexts/models for the tile during
+                                encoding
+ * \param[in]    x              Pointer to structure holding all the data for
+                                the current macroblock
+ * \param[in]    rd_cost        Struct to keep track of the RD information
+ * \param[in]    bsize          Current block size
+ * \param[in]    ctx            Structure to hold snapshot of coding context
+                                during the mode picking process
+ * \param[in]    best_rd_so_far Best RD seen for this block so far
+ *
+ * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * is modified to store information about the best mode computed in this
+ * function. The rd_cost struct is also updated with the RD stats
+ * corresponding to the best mode found.
+ */
+void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data,
+                            struct macroblock *x, struct RD_STATS *rd_cost,
+                            BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+                            int64_t best_rd_so_far);
+
+/*!\brief AV1 intra mode selection based on Non-RD optimized model.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Top level function for Non-RD optimized intra mode selection.
+ * This function will loop over a subset of intra modes and select the best
+ * one based on the calculated modelled RD cost. Only 4 intra modes are
+ * checked as specified in \c intra_mode_list. When calculating the RD cost,
+ * a Hadamard transform of the residual is used to estimate the rate.
+ * Estimation of the RD cost is performed in \c av1_estimate_block_intra,
+ * which is called from this function.
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    x              Pointer to structure holding all the data for
+                                the current macroblock
+ * \param[in]    rd_cost        Struct to keep track of the RD information
+ * \param[in]    bsize          Current block size
+ * \param[in]    ctx            Structure to hold snapshot of coding context
+                                during the mode picking process
+ *
+ * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * is modified to store information about the best mode computed in this
+ * function. The rd_cost struct is also updated with the RD stats
+ * corresponding to the best mode found.
+ */
+void av1_nonrd_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost,
+                               BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx);
+
+/*!\brief AV1 inter mode selection based on Non-RD optimized model.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * Top level function for Non-RD optimized inter mode selection.
+ * This function will loop over a subset of inter modes and select the best
+ * one based on the calculated modelled RD cost. When deciding which modes to
+ * check, this function applies heuristics based on previously checked modes,
+ * block residual variance, block size, and other factors to prune certain
+ * modes and reference frames. Currently only single reference frame modes
+ * are checked. Additional heuristics are applied to decide if intra modes
+ * need to be checked.
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    tile_data      Pointer to struct holding adaptive
+                                data/contexts/models for the tile during
+                                encoding
+ * \param[in]    x              Pointer to structure holding all the data for
+                                the current macroblock
+ * \param[in]    rd_cost        Struct to keep track of the RD information
+ * \param[in]    bsize          Current block size
+ * \param[in]    ctx            Structure to hold snapshot of coding context
+                                during the mode picking process
+ *
+ * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * is modified to store information about the best mode computed in this
+ * function. The rd_cost struct is also updated with the RD stats
+ * corresponding to the best mode found.
+ */
+void av1_nonrd_pick_inter_mode_sb(struct AV1_COMP *cpi,
+                                  struct TileDataEnc *tile_data,
+                                  struct macroblock *x,
+                                  struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
+                                  PICK_MODE_CONTEXT *ctx);
+
+void av1_rd_pick_inter_mode_sb_seg_skip(
+    const struct AV1_COMP *cpi, struct TileDataEnc *tile_data,
+    struct macroblock *x, int mi_row, int mi_col, struct RD_STATS *rd_cost,
+    BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far);
+
+void av1_inter_mode_data_init(struct TileDataEnc *tile_data);
+void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult);
+
+static INLINE int coded_to_superres_mi(int mi_col, int denom) {
+  return (mi_col * denom + SCALE_NUMERATOR / 2) / SCALE_NUMERATOR;
+}
+
+static INLINE int av1_encoder_get_relative_dist(int a, int b) {
+  assert(a >= 0 && b >= 0);
+  return (a - b);
+}
+
+// This function will return the number of mi's in a superblock.
+static INLINE int av1_get_sb_mi_size(const AV1_COMMON *const cm) {
+  const int mi_alloc_size_1d = mi_size_wide[cm->mi_params.mi_alloc_bsize];
+  int sb_mi_rows =
+      (mi_size_wide[cm->seq_params->sb_size] + mi_alloc_size_1d - 1) /
+      mi_alloc_size_1d;
+  assert(mi_size_wide[cm->seq_params->sb_size] ==
+         mi_size_high[cm->seq_params->sb_size]);
+  int sb_mi_size = sb_mi_rows * sb_mi_rows;
+
+  return sb_mi_size;
+}
+
+// This function prunes the mode if either of the reference frames falls in
+// the pruning list
+static INLINE int prune_ref(const MV_REFERENCE_FRAME *const ref_frame,
+                            const unsigned int *const ref_display_order_hint,
+                            const unsigned int frame_display_order_hint,
+                            const int *ref_frame_list) {
+  for (int i = 0; i < 2; i++) {
+    if (ref_frame_list[i] == NONE_FRAME) continue;
+
+    if (ref_frame[0] == ref_frame_list[i] ||
+        ref_frame[1] == ref_frame_list[i]) {
+      if (av1_encoder_get_relative_dist(
+              ref_display_order_hint[ref_frame_list[i] - LAST_FRAME],
+              frame_display_order_hint) < 0)
+        return 1;
+    }
+  }
+  return 0;
+}
+
+static INLINE int has_closest_ref_frames(const MV_REFERENCE_FRAME *ref_frame,
+                                         int8_t closest_past_ref,
+                                         int8_t closest_future_ref) {
+  int has_closest_past_ref =
+      (ref_frame[0] == closest_past_ref) || (ref_frame[1] == closest_past_ref);
+  int has_closest_future_ref = (ref_frame[0] == closest_future_ref) ||
+                               (ref_frame[1] == closest_future_ref);
+  return (has_closest_past_ref && has_closest_future_ref);
+}
+
+static INLINE int has_best_pred_mv_sad(const MV_REFERENCE_FRAME *ref_frame,
+                                       const MACROBLOCK *const x) {
+  int has_best_past_pred_mv_sad = 0;
+  int has_best_future_pred_mv_sad = 0;
+  if (x->best_pred_mv_sad[0] < INT_MAX && x->best_pred_mv_sad[1] < INT_MAX) {
+    has_best_past_pred_mv_sad =
+        (x->pred_mv_sad[ref_frame[0]] ==
x->best_pred_mv_sad[0]) || + (x->pred_mv_sad[ref_frame[1]] == x->best_pred_mv_sad[0]); + has_best_future_pred_mv_sad = + (x->pred_mv_sad[ref_frame[0]] == x->best_pred_mv_sad[1]) || + (x->pred_mv_sad[ref_frame[1]] == x->best_pred_mv_sad[1]); + } + return (has_best_past_pred_mv_sad && has_best_future_pred_mv_sad); +} + +static INLINE int prune_ref_by_selective_ref_frame( + const AV1_COMP *const cpi, const MACROBLOCK *const x, + const MV_REFERENCE_FRAME *const ref_frame, + const unsigned int *const ref_display_order_hint) { + const SPEED_FEATURES *const sf = &cpi->sf; + if (!sf->inter_sf.selective_ref_frame) return 0; + + const int comp_pred = ref_frame[1] > INTRA_FRAME; + + if (sf->inter_sf.selective_ref_frame >= 2 || + (sf->inter_sf.selective_ref_frame == 1 && comp_pred)) { + int ref_frame_list[2] = { LAST3_FRAME, LAST2_FRAME }; + + if (x != NULL) { + // Disable pruning if either tpl suggests that we keep the frame or + // the pred_mv gives us the best sad + if (x->tpl_keep_ref_frame[LAST3_FRAME] || + x->pred_mv_sad[LAST3_FRAME] == x->best_pred_mv_sad[0]) { + ref_frame_list[0] = NONE_FRAME; + } + if (x->tpl_keep_ref_frame[LAST2_FRAME] || + x->pred_mv_sad[LAST2_FRAME] == x->best_pred_mv_sad[0]) { + ref_frame_list[1] = NONE_FRAME; + } + } + + if (prune_ref(ref_frame, ref_display_order_hint, + ref_display_order_hint[GOLDEN_FRAME - LAST_FRAME], + ref_frame_list)) + return 1; + } + + if (sf->inter_sf.selective_ref_frame >= 3) { + int ref_frame_list[2] = { ALTREF2_FRAME, BWDREF_FRAME }; + + if (x != NULL) { + // Disable pruning if either tpl suggests that we keep the frame or + // the pred_mv gives us the best sad + if (x->tpl_keep_ref_frame[ALTREF2_FRAME] || + x->pred_mv_sad[ALTREF2_FRAME] == x->best_pred_mv_sad[0]) { + ref_frame_list[0] = NONE_FRAME; + } + if (x->tpl_keep_ref_frame[BWDREF_FRAME] || + x->pred_mv_sad[BWDREF_FRAME] == x->best_pred_mv_sad[0]) { + ref_frame_list[1] = NONE_FRAME; + } + } + + if (prune_ref(ref_frame, ref_display_order_hint, + ref_display_order_hint[LAST_FRAME - LAST_FRAME], + ref_frame_list)) + return 1; + } + + if (x != NULL && sf->inter_sf.prune_comp_ref_frames && comp_pred) { + int closest_ref_frames = has_closest_ref_frames( + ref_frame, cpi->ref_frame_dist_info.nearest_past_ref, + cpi->ref_frame_dist_info.nearest_future_ref); + if (closest_ref_frames == 0) { + // Prune reference frames which are not the closest to the current frame. + if (sf->inter_sf.prune_comp_ref_frames >= 2) { + return 1; + } else if (sf->inter_sf.prune_comp_ref_frames == 1) { + // Prune reference frames with non minimum pred_mv_sad. + if (has_best_pred_mv_sad(ref_frame, x) == 0) return 1; + } + } + } + + return 0; +} + +// This function will copy the best reference mode information from +// MB_MODE_INFO_EXT to MB_MODE_INFO_EXT_FRAME. 
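prune_ref() above prunes a candidate whose display-order hint is strictly behind the anchor frame's hint. Before the copy helper below, here is a reduced sketch of that test; the hint values are illustrative and the snippet is editorial, not part of the patch:

```c
/* Editorial sketch: the display-order pruning rule from prune_ref() above.
 * A reference whose order hint is strictly behind the anchor is prunable. */
#include <stdio.h>

// Mirrors av1_encoder_get_relative_dist for non-negative hints.
static int relative_dist(int a, int b) { return a - b; }

int main(void) {
  const int anchor_hint = 20;               // e.g. the anchor frame's hint
  const int ref_hints[3] = { 18, 20, 23 };  // candidate reference hints
  for (int i = 0; i < 3; i++) {
    const int prunable = relative_dist(ref_hints[i], anchor_hint) < 0;
    printf("ref hint %d: %s\n", ref_hints[i], prunable ? "prune" : "keep");
  }
  return 0;  // prints: prune, keep, keep
}
```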
+static INLINE void av1_copy_mbmi_ext_to_mbmi_ext_frame( + MB_MODE_INFO_EXT_FRAME *mbmi_ext_best, + const MB_MODE_INFO_EXT *const mbmi_ext, uint8_t ref_frame_type) { + memcpy(mbmi_ext_best->ref_mv_stack, mbmi_ext->ref_mv_stack[ref_frame_type], + sizeof(mbmi_ext->ref_mv_stack[USABLE_REF_MV_STACK_SIZE])); + memcpy(mbmi_ext_best->weight, mbmi_ext->weight[ref_frame_type], + sizeof(mbmi_ext->weight[USABLE_REF_MV_STACK_SIZE])); + mbmi_ext_best->mode_context = mbmi_ext->mode_context[ref_frame_type]; + mbmi_ext_best->ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type]; + memcpy(mbmi_ext_best->global_mvs, mbmi_ext->global_mvs, + sizeof(mbmi_ext->global_mvs)); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_RDOPT_H_ diff --git a/third_party/aom/av1/encoder/rdopt_data_defs.h b/third_party/aom/av1/encoder/rdopt_data_defs.h new file mode 100644 index 0000000000..ca7ef810f3 --- /dev/null +++ b/third_party/aom/av1/encoder/rdopt_data_defs.h @@ -0,0 +1,294 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_RDOPT_DATA_DEFS_H_ +#define AOM_AV1_ENCODER_RDOPT_DATA_DEFS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +static const THR_MODES intra_to_mode_idx[INTRA_MODE_NUM] = { + THR_DC, // DC_PRED, + THR_V_PRED, // V_PRED, + THR_H_PRED, // H_PRED, + THR_D45_PRED, // D45_PRED, + THR_D135_PRED, // D135_PRED, + THR_D113_PRED, // D113_PRED, + THR_D157_PRED, // D157_PRED, + THR_D203_PRED, // D203_PRED, + THR_D67_PRED, // D67_PRED, + THR_SMOOTH, // SMOOTH_PRED, + THR_SMOOTH_V, // SMOOTH_V_PRED, + THR_SMOOTH_H, // SMOOTH_H_PRED, + THR_PAETH, // PAETH_PRED, +}; + +/* clang-format off */ +static const THR_MODES single_inter_to_mode_idx[SINGLE_INTER_MODE_NUM] + [REF_FRAMES] = { + // NEARESTMV, + { THR_INVALID, THR_NEARESTMV, THR_NEARESTL2, THR_NEARESTL3, + THR_NEARESTG, THR_NEARESTB, THR_NEARESTA2, THR_NEARESTA, }, + // NEARMV, + { THR_INVALID, THR_NEARMV, THR_NEARL2, THR_NEARL3, + THR_NEARG, THR_NEARB, THR_NEARA2, THR_NEARA, }, + // GLOBALMV, + { THR_INVALID, THR_GLOBALMV, THR_GLOBALL2, THR_GLOBALL3, + THR_GLOBALG, THR_GLOBALB, THR_GLOBALA2, THR_GLOBALA, }, + // NEWMV, + { THR_INVALID, THR_NEWMV, THR_NEWL2, THR_NEWL3, + THR_NEWG, THR_NEWB, THR_NEWA2, THR_NEWA, }, +}; +/* clang-format on */ + +/* clang-format off */ +static const THR_MODES comp_inter_to_mode_idx[COMP_INTER_MODE_NUM][REF_FRAMES] + [REF_FRAMES] = { + // NEAREST_NEARESTMV, + { + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, + THR_COMP_NEAREST_NEARESTLL2, THR_COMP_NEAREST_NEARESTLL3, + THR_COMP_NEAREST_NEARESTLG, THR_COMP_NEAREST_NEARESTLB, + THR_COMP_NEAREST_NEARESTLA2, THR_COMP_NEAREST_NEARESTLA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAREST_NEARESTL2B, + THR_COMP_NEAREST_NEARESTL2A2, THR_COMP_NEAREST_NEARESTL2A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAREST_NEARESTL3B, + THR_COMP_NEAREST_NEARESTL3A2, 
THR_COMP_NEAREST_NEARESTL3A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAREST_NEARESTGB, + THR_COMP_NEAREST_NEARESTGA2, THR_COMP_NEAREST_NEARESTGA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAREST_NEARESTBA, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + }, + // NEAR_NEARMV, + { + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, + THR_COMP_NEAR_NEARLL2, THR_COMP_NEAR_NEARLL3, + THR_COMP_NEAR_NEARLG, THR_COMP_NEAR_NEARLB, + THR_COMP_NEAR_NEARLA2, THR_COMP_NEAR_NEARLA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAR_NEARL2B, + THR_COMP_NEAR_NEARL2A2, THR_COMP_NEAR_NEARL2A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAR_NEARL3B, + THR_COMP_NEAR_NEARL3A2, THR_COMP_NEAR_NEARL3A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAR_NEARGB, + THR_COMP_NEAR_NEARGA2, THR_COMP_NEAR_NEARGA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAR_NEARBA, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + }, + // NEAREST_NEWMV, + { + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, + THR_COMP_NEAREST_NEWLL2, THR_COMP_NEAREST_NEWLL3, + THR_COMP_NEAREST_NEWLG, THR_COMP_NEAREST_NEWLB, + THR_COMP_NEAREST_NEWLA2, THR_COMP_NEAREST_NEWLA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAREST_NEWL2B, + THR_COMP_NEAREST_NEWL2A2, THR_COMP_NEAREST_NEWL2A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAREST_NEWL3B, + THR_COMP_NEAREST_NEWL3A2, THR_COMP_NEAREST_NEWL3A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAREST_NEWGB, + THR_COMP_NEAREST_NEWGA2, THR_COMP_NEAREST_NEWGA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAREST_NEWBA, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + }, + // NEW_NEARESTMV, + { + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, + THR_COMP_NEW_NEARESTLL2, THR_COMP_NEW_NEARESTLL3, + THR_COMP_NEW_NEARESTLG, THR_COMP_NEW_NEARESTLB, + THR_COMP_NEW_NEARESTLA2, THR_COMP_NEW_NEARESTLA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEARESTL2B, + THR_COMP_NEW_NEARESTL2A2, THR_COMP_NEW_NEARESTL2A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEARESTL3B, + THR_COMP_NEW_NEARESTL3A2, THR_COMP_NEW_NEARESTL3A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEARESTGB, + 
THR_COMP_NEW_NEARESTGA2, THR_COMP_NEW_NEARESTGA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEARESTBA, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + }, + // NEAR_NEWMV, + { + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, + THR_COMP_NEAR_NEWLL2, THR_COMP_NEAR_NEWLL3, + THR_COMP_NEAR_NEWLG, THR_COMP_NEAR_NEWLB, + THR_COMP_NEAR_NEWLA2, THR_COMP_NEAR_NEWLA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAR_NEWL2B, + THR_COMP_NEAR_NEWL2A2, THR_COMP_NEAR_NEWL2A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAR_NEWL3B, + THR_COMP_NEAR_NEWL3A2, THR_COMP_NEAR_NEWL3A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAR_NEWGB, + THR_COMP_NEAR_NEWGA2, THR_COMP_NEAR_NEWGA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAR_NEWBA, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + }, + // NEW_NEARMV, + { + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, + THR_COMP_NEW_NEARLL2, THR_COMP_NEW_NEARLL3, + THR_COMP_NEW_NEARLG, THR_COMP_NEW_NEARLB, + THR_COMP_NEW_NEARLA2, THR_COMP_NEW_NEARLA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEARL2B, + THR_COMP_NEW_NEARL2A2, THR_COMP_NEW_NEARL2A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEARL3B, + THR_COMP_NEW_NEARL3A2, THR_COMP_NEW_NEARL3A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEARGB, + THR_COMP_NEW_NEARGA2, THR_COMP_NEW_NEARGA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEARBA, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + }, + // GLOBAL_GLOBALMV, + { + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, + THR_COMP_GLOBAL_GLOBALLL2, THR_COMP_GLOBAL_GLOBALLL3, + THR_COMP_GLOBAL_GLOBALLG, THR_COMP_GLOBAL_GLOBALLB, + THR_COMP_GLOBAL_GLOBALLA2, THR_COMP_GLOBAL_GLOBALLA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_GLOBAL_GLOBALL2B, + THR_COMP_GLOBAL_GLOBALL2A2, THR_COMP_GLOBAL_GLOBALL2A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_GLOBAL_GLOBALL3B, + THR_COMP_GLOBAL_GLOBALL3A2, THR_COMP_GLOBAL_GLOBALL3A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_GLOBAL_GLOBALGB, + THR_COMP_GLOBAL_GLOBALGA2, THR_COMP_GLOBAL_GLOBALGA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_GLOBAL_GLOBALBA, }, + { THR_INVALID, 
THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + }, + // NEW_NEWMV, + { + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, + THR_COMP_NEW_NEWLL2, THR_COMP_NEW_NEWLL3, + THR_COMP_NEW_NEWLG, THR_COMP_NEW_NEWLB, + THR_COMP_NEW_NEWLA2, THR_COMP_NEW_NEWLA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEWL2B, + THR_COMP_NEW_NEWL2A2, THR_COMP_NEW_NEWL2A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEWL3B, + THR_COMP_NEW_NEWL3A2, THR_COMP_NEW_NEWL3A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEWGB, + THR_COMP_NEW_NEWGA2, THR_COMP_NEW_NEWGA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEWBA, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + }, +}; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_RDOPT_DATA_DEFS_H_ diff --git a/third_party/aom/av1/encoder/rdopt_utils.h b/third_party/aom/av1/encoder/rdopt_utils.h new file mode 100644 index 0000000000..b6bc4927e3 --- /dev/null +++ b/third_party/aom/av1/encoder/rdopt_utils.h @@ -0,0 +1,797 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AV1_ENCODER_RDOPT_UTILS_H_
+#define AOM_AV1_ENCODER_RDOPT_UTILS_H_
+
+#include "aom/aom_integer.h"
+#include "av1/encoder/block.h"
+#include "av1/common/cfl.h"
+#include "av1/common/pred_common.h"
+#include "av1/encoder/rdopt_data_defs.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_REF_MV_SEARCH 3
+#define MAX_TX_RD_GATE_LEVEL 5
+#define INTER_INTRA_RD_THRESH_SCALE 9
+#define INTER_INTRA_RD_THRESH_SHIFT 4
+
+typedef struct {
+  PREDICTION_MODE mode;
+  MV_REFERENCE_FRAME ref_frame[2];
+} MODE_DEFINITION;
+
+// This array defines the mapping from the enums in THR_MODES to the actual
+// prediction modes and reference frames.
+static const MODE_DEFINITION av1_mode_defs[MAX_MODES] = {
+  { NEARESTMV, { LAST_FRAME, NONE_FRAME } },
+  { NEARESTMV, { LAST2_FRAME, NONE_FRAME } },
+  { NEARESTMV, { LAST3_FRAME, NONE_FRAME } },
+  { NEARESTMV, { BWDREF_FRAME, NONE_FRAME } },
+  { NEARESTMV, { ALTREF2_FRAME, NONE_FRAME } },
+  { NEARESTMV, { ALTREF_FRAME, NONE_FRAME } },
+  { NEARESTMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+  { NEWMV, { LAST_FRAME, NONE_FRAME } },
+  { NEWMV, { LAST2_FRAME, NONE_FRAME } },
+  { NEWMV, { LAST3_FRAME, NONE_FRAME } },
+  { NEWMV, { BWDREF_FRAME, NONE_FRAME } },
+  { NEWMV, { ALTREF2_FRAME, NONE_FRAME } },
+  { NEWMV, { ALTREF_FRAME, NONE_FRAME } },
+  { NEWMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+  { NEARMV, { LAST_FRAME, NONE_FRAME } },
+  { NEARMV, { LAST2_FRAME, NONE_FRAME } },
+  { NEARMV, { LAST3_FRAME, NONE_FRAME } },
+  { NEARMV, { BWDREF_FRAME, NONE_FRAME } },
+  { NEARMV, { ALTREF2_FRAME, NONE_FRAME } },
+  { NEARMV, { ALTREF_FRAME, NONE_FRAME } },
+  { NEARMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+  { GLOBALMV, { LAST_FRAME, NONE_FRAME } },
+  { GLOBALMV, { LAST2_FRAME, NONE_FRAME } },
+  { GLOBALMV, { LAST3_FRAME, NONE_FRAME } },
+  { GLOBALMV, { BWDREF_FRAME, NONE_FRAME } },
+  { GLOBALMV, { ALTREF2_FRAME, NONE_FRAME } },
+  { GLOBALMV, { ALTREF_FRAME, NONE_FRAME } },
+  { GLOBALMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+  // TODO(zoeliu): May need to reconsider the order on the modes to check
+
+  { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
+  { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
+  { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
+  { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+  { NEAREST_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
+  { NEAREST_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
+  { NEAREST_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
+  { NEAREST_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+  { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } },
+  { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } },
+  { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } },
+  { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+
+  { NEAREST_NEARESTMV, { LAST_FRAME, LAST2_FRAME } },
+  { NEAREST_NEARESTMV, { LAST_FRAME, LAST3_FRAME } },
+  { NEAREST_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } },
+  { NEAREST_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } },
+
+  { NEAR_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
+  { NEW_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
+  { NEW_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
+  { NEAREST_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
+  { NEW_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
+  { NEAR_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
+  { GLOBAL_GLOBALMV, { LAST_FRAME, BWDREF_FRAME } },
+
+  { NEAR_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
+  { NEW_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
+  { NEW_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
+  { NEAREST_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
+  { NEW_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
+  {
NEAR_NEWMV, { LAST_FRAME, ALTREF_FRAME } }, + { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF_FRAME } }, + + { NEAR_NEARMV, { LAST2_FRAME, ALTREF_FRAME } }, + { NEW_NEWMV, { LAST2_FRAME, ALTREF_FRAME } }, + { NEW_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } }, + { NEAREST_NEWMV, { LAST2_FRAME, ALTREF_FRAME } }, + { NEW_NEARMV, { LAST2_FRAME, ALTREF_FRAME } }, + { NEAR_NEWMV, { LAST2_FRAME, ALTREF_FRAME } }, + { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF_FRAME } }, + + { NEAR_NEARMV, { LAST3_FRAME, ALTREF_FRAME } }, + { NEW_NEWMV, { LAST3_FRAME, ALTREF_FRAME } }, + { NEW_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } }, + { NEAREST_NEWMV, { LAST3_FRAME, ALTREF_FRAME } }, + { NEW_NEARMV, { LAST3_FRAME, ALTREF_FRAME } }, + { NEAR_NEWMV, { LAST3_FRAME, ALTREF_FRAME } }, + { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF_FRAME } }, + + { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + { NEW_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + { NEW_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + + { NEAR_NEARMV, { LAST2_FRAME, BWDREF_FRAME } }, + { NEW_NEWMV, { LAST2_FRAME, BWDREF_FRAME } }, + { NEW_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } }, + { NEAREST_NEWMV, { LAST2_FRAME, BWDREF_FRAME } }, + { NEW_NEARMV, { LAST2_FRAME, BWDREF_FRAME } }, + { NEAR_NEWMV, { LAST2_FRAME, BWDREF_FRAME } }, + { GLOBAL_GLOBALMV, { LAST2_FRAME, BWDREF_FRAME } }, + + { NEAR_NEARMV, { LAST3_FRAME, BWDREF_FRAME } }, + { NEW_NEWMV, { LAST3_FRAME, BWDREF_FRAME } }, + { NEW_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } }, + { NEAREST_NEWMV, { LAST3_FRAME, BWDREF_FRAME } }, + { NEW_NEARMV, { LAST3_FRAME, BWDREF_FRAME } }, + { NEAR_NEWMV, { LAST3_FRAME, BWDREF_FRAME } }, + { GLOBAL_GLOBALMV, { LAST3_FRAME, BWDREF_FRAME } }, + + { NEAR_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + { NEW_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + { NEW_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + { NEAREST_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + { NEW_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + { NEAR_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + { GLOBAL_GLOBALMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + + { NEAR_NEARMV, { LAST_FRAME, ALTREF2_FRAME } }, + { NEW_NEWMV, { LAST_FRAME, ALTREF2_FRAME } }, + { NEW_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } }, + { NEAREST_NEWMV, { LAST_FRAME, ALTREF2_FRAME } }, + { NEW_NEARMV, { LAST_FRAME, ALTREF2_FRAME } }, + { NEAR_NEWMV, { LAST_FRAME, ALTREF2_FRAME } }, + { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF2_FRAME } }, + + { NEAR_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } }, + { NEW_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } }, + { NEW_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } }, + { NEAREST_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } }, + { NEW_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } }, + { NEAR_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } }, + { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF2_FRAME } }, + + { NEAR_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } }, + { NEW_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } }, + { NEW_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } }, + { NEAREST_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } }, + { NEW_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } }, + { NEAR_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } }, + { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF2_FRAME } }, + + { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, + { NEW_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, + { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, + { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, + { 
NEW_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, + { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, + { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, + + { NEAR_NEARMV, { LAST_FRAME, LAST2_FRAME } }, + { NEW_NEWMV, { LAST_FRAME, LAST2_FRAME } }, + { NEW_NEARESTMV, { LAST_FRAME, LAST2_FRAME } }, + { NEAREST_NEWMV, { LAST_FRAME, LAST2_FRAME } }, + { NEW_NEARMV, { LAST_FRAME, LAST2_FRAME } }, + { NEAR_NEWMV, { LAST_FRAME, LAST2_FRAME } }, + { GLOBAL_GLOBALMV, { LAST_FRAME, LAST2_FRAME } }, + + { NEAR_NEARMV, { LAST_FRAME, LAST3_FRAME } }, + { NEW_NEWMV, { LAST_FRAME, LAST3_FRAME } }, + { NEW_NEARESTMV, { LAST_FRAME, LAST3_FRAME } }, + { NEAREST_NEWMV, { LAST_FRAME, LAST3_FRAME } }, + { NEW_NEARMV, { LAST_FRAME, LAST3_FRAME } }, + { NEAR_NEWMV, { LAST_FRAME, LAST3_FRAME } }, + { GLOBAL_GLOBALMV, { LAST_FRAME, LAST3_FRAME } }, + + { NEAR_NEARMV, { LAST_FRAME, GOLDEN_FRAME } }, + { NEW_NEWMV, { LAST_FRAME, GOLDEN_FRAME } }, + { NEW_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } }, + { NEAREST_NEWMV, { LAST_FRAME, GOLDEN_FRAME } }, + { NEW_NEARMV, { LAST_FRAME, GOLDEN_FRAME } }, + { NEAR_NEWMV, { LAST_FRAME, GOLDEN_FRAME } }, + { GLOBAL_GLOBALMV, { LAST_FRAME, GOLDEN_FRAME } }, + + { NEAR_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } }, + { NEW_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } }, + { NEW_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } }, + { NEAREST_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } }, + { NEW_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } }, + { NEAR_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } }, + { GLOBAL_GLOBALMV, { BWDREF_FRAME, ALTREF_FRAME } }, + + // intra modes + { DC_PRED, { INTRA_FRAME, NONE_FRAME } }, + { PAETH_PRED, { INTRA_FRAME, NONE_FRAME } }, + { SMOOTH_PRED, { INTRA_FRAME, NONE_FRAME } }, + { SMOOTH_V_PRED, { INTRA_FRAME, NONE_FRAME } }, + { SMOOTH_H_PRED, { INTRA_FRAME, NONE_FRAME } }, + { H_PRED, { INTRA_FRAME, NONE_FRAME } }, + { V_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D135_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D203_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D157_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D67_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D113_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D45_PRED, { INTRA_FRAME, NONE_FRAME } }, +}; + +// Number of winner modes allowed for different values of the speed feature +// multi_winner_mode_type. +static const int winner_mode_count_allowed[MULTI_WINNER_MODE_LEVELS] = { + 1, // MULTI_WINNER_MODE_OFF + 2, // MULTI_WINNER_MODE_FAST + 3 // MULTI_WINNER_MODE_DEFAULT +}; + +static AOM_INLINE void restore_dst_buf(MACROBLOCKD *xd, const BUFFER_SET dst, + const int num_planes) { + for (int i = 0; i < num_planes; i++) { + xd->plane[i].dst.buf = dst.plane[i]; + xd->plane[i].dst.stride = dst.stride[i]; + } +} + +static AOM_INLINE void swap_dst_buf(MACROBLOCKD *xd, + const BUFFER_SET *dst_bufs[2], + int num_planes) { + const BUFFER_SET *buf0 = dst_bufs[0]; + dst_bufs[0] = dst_bufs[1]; + dst_bufs[1] = buf0; + restore_dst_buf(xd, *dst_bufs[0], num_planes); +} + +/* clang-format on */ +// Calculate rd threshold based on ref best rd and relevant scaling factors +static AOM_INLINE int64_t get_rd_thresh_from_best_rd(int64_t ref_best_rd, + int mul_factor, + int div_factor) { + int64_t rd_thresh = ref_best_rd; + if (div_factor != 0) { + rd_thresh = ref_best_rd < (div_factor * (INT64_MAX / mul_factor)) + ? 
((ref_best_rd / div_factor) * mul_factor) + : INT64_MAX; + } + return rd_thresh; +} + +static AOM_INLINE THR_MODES +get_prediction_mode_idx(PREDICTION_MODE this_mode, MV_REFERENCE_FRAME ref_frame, + MV_REFERENCE_FRAME second_ref_frame) { + if (this_mode < INTRA_MODE_END) { + assert(ref_frame == INTRA_FRAME); + assert(second_ref_frame == NONE_FRAME); + return intra_to_mode_idx[this_mode - INTRA_MODE_START]; + } + if (this_mode >= SINGLE_INTER_MODE_START && + this_mode < SINGLE_INTER_MODE_END) { + assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME)); + return single_inter_to_mode_idx[this_mode - SINGLE_INTER_MODE_START] + [ref_frame]; + } + if (this_mode >= COMP_INTER_MODE_START && this_mode < COMP_INTER_MODE_END && + second_ref_frame != NONE_FRAME) { + assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME)); + assert((second_ref_frame > INTRA_FRAME) && + (second_ref_frame <= ALTREF_FRAME)); + return comp_inter_to_mode_idx[this_mode - COMP_INTER_MODE_START][ref_frame] + [second_ref_frame]; + } + assert(0); + return THR_INVALID; +} + +static AOM_INLINE int inter_mode_data_block_idx(BLOCK_SIZE bsize) { + if (bsize == BLOCK_4X4 || bsize == BLOCK_4X8 || bsize == BLOCK_8X4 || + bsize == BLOCK_4X16 || bsize == BLOCK_16X4) { + return -1; + } + return 1; +} + +// Get transform block visible dimensions cropped to the MI units. +static AOM_INLINE void get_txb_dimensions(const MACROBLOCKD *xd, int plane, + BLOCK_SIZE plane_bsize, int blk_row, + int blk_col, BLOCK_SIZE tx_bsize, + int *width, int *height, + int *visible_width, + int *visible_height) { + assert(tx_bsize <= plane_bsize); + const int txb_height = block_size_high[tx_bsize]; + const int txb_width = block_size_wide[tx_bsize]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + + // TODO(aconverse@google.com): Investigate using crop_width/height here rather + // than the MI size + if (xd->mb_to_bottom_edge >= 0) { + *visible_height = txb_height; + } else { + const int block_height = block_size_high[plane_bsize]; + const int block_rows = + (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)) + block_height; + *visible_height = + clamp(block_rows - (blk_row << MI_SIZE_LOG2), 0, txb_height); + } + if (height) *height = txb_height; + + if (xd->mb_to_right_edge >= 0) { + *visible_width = txb_width; + } else { + const int block_width = block_size_wide[plane_bsize]; + const int block_cols = + (xd->mb_to_right_edge >> (3 + pd->subsampling_x)) + block_width; + *visible_width = + clamp(block_cols - (blk_col << MI_SIZE_LOG2), 0, txb_width); + } + if (width) *width = txb_width; +} + +static AOM_INLINE int bsize_to_num_blk(BLOCK_SIZE bsize) { + int num_blk = 1 << (num_pels_log2_lookup[bsize] - 2 * MI_SIZE_LOG2); + return num_blk; +} + +static INLINE int check_txfm_eval(MACROBLOCK *const x, BLOCK_SIZE bsize, + int64_t best_skip_rd, int64_t skip_rd, + int level, int is_luma_only) { + int eval_txfm = 1; + // Derive aggressiveness factor for gating the transform search + // Lower value indicates more aggressiveness. 
Be more conservative (high + // value) for (i) low quantizers (ii) regions where prediction is poor + const int scale[MAX_TX_RD_GATE_LEVEL + 1] = { INT_MAX, 4, 3, 2, 2, 1 }; + const int qslope = 2 * (!is_luma_only); + const int level_to_qindex_map[MAX_TX_RD_GATE_LEVEL + 1] = { 0, 0, 0, + 80, 100, 140 }; + int aggr_factor = 4; + assert(level <= MAX_TX_RD_GATE_LEVEL); + const int pred_qindex_thresh = level_to_qindex_map[level]; + if (!is_luma_only && level <= 2) { + aggr_factor = 4 * AOMMAX(1, ROUND_POWER_OF_TWO((MAXQ - x->qindex) * qslope, + QINDEX_BITS)); + } + if ((best_skip_rd > + (x->source_variance << (num_pels_log2_lookup[bsize] + RDDIV_BITS))) && + (x->qindex >= pred_qindex_thresh)) + aggr_factor *= scale[level]; + // For level setting 1, be more conservative for non-luma-only case even when + // prediction is good. + else if ((level <= 1) && !is_luma_only) + aggr_factor = (aggr_factor >> 2) * 6; + + // Be more conservative for luma only cases (called from compound type rd) + // since best_skip_rd is computed after and skip_rd is computed (with 8-bit + // prediction signals blended for WEDGE/DIFFWTD rather than 16-bit) before + // interpolation filter search + const int luma_mul[MAX_TX_RD_GATE_LEVEL + 1] = { + INT_MAX, 32, 29, 17, 17, 17 + }; + int mul_factor = is_luma_only ? luma_mul[level] : 16; + int64_t rd_thresh = + (best_skip_rd == INT64_MAX) + ? best_skip_rd + : (int64_t)(best_skip_rd * aggr_factor * mul_factor >> 6); + if (skip_rd > rd_thresh) eval_txfm = 0; + return eval_txfm; +} + +static TX_MODE select_tx_mode( + const AV1_COMMON *cm, const TX_SIZE_SEARCH_METHOD tx_size_search_method) { + if (cm->features.coded_lossless) return ONLY_4X4; + if (tx_size_search_method == USE_LARGESTALL) { + return TX_MODE_LARGEST; + } else { + assert(tx_size_search_method == USE_FULL_RD || + tx_size_search_method == USE_FAST_RD); + return TX_MODE_SELECT; + } +} + +// Checks the conditions to disable winner mode processing +static INLINE int bypass_winner_mode_processing(const MACROBLOCK *const x, + const SPEED_FEATURES *sf, + int use_txfm_skip, + int actual_txfm_skip, + PREDICTION_MODE best_mode) { + const int prune_winner_mode_eval_level = + sf->winner_mode_sf.prune_winner_mode_eval_level; + + // Disable winner mode processing for blocks with low source variance. + // The aggressiveness of this pruning logic reduces as qindex increases. + // The threshold decreases linearly from 64 as qindex varies from 0 to 255. + if (prune_winner_mode_eval_level == 1) { + const unsigned int src_var_thresh = 64 - 48 * x->qindex / (MAXQ + 1); + if (x->source_variance < src_var_thresh) return 1; + } else if (prune_winner_mode_eval_level == 2) { + // Skip winner mode processing of blocks for which transform turns out to be + // skip due to nature of eob alone except NEWMV mode. + if (!have_newmv_in_inter_mode(best_mode) && actual_txfm_skip) return 1; + } else if (prune_winner_mode_eval_level == 3) { + // Skip winner mode processing of blocks for which transform turns out to be + // skip except NEWMV mode and considered based on the quantizer. + // At high quantizers: Take conservative approach by considering transform + // skip based on eob alone. + // At low quantizers: Consider transform skip based on eob nature or RD cost + // evaluation. + const int is_txfm_skip = + x->qindex > 127 ? 
actual_txfm_skip : actual_txfm_skip || use_txfm_skip; + + if (!have_newmv_in_inter_mode(best_mode) && is_txfm_skip) return 1; + } else if (prune_winner_mode_eval_level >= 4) { + // Do not skip winner mode evaluation at low quantizers if normal mode's + // transform search was too aggressive. + if (sf->rd_sf.perform_coeff_opt >= 5 && x->qindex <= 70) return 0; + + if (use_txfm_skip || actual_txfm_skip) return 1; + } + + return 0; +} + +// Checks the conditions to enable winner mode processing +static INLINE int is_winner_mode_processing_enabled(const struct AV1_COMP *cpi, + const MACROBLOCK *const x, + MB_MODE_INFO *const mbmi, + int actual_txfm_skip) { + const SPEED_FEATURES *sf = &cpi->sf; + const PREDICTION_MODE best_mode = mbmi->mode; + + if (bypass_winner_mode_processing(x, sf, mbmi->skip_txfm, actual_txfm_skip, + best_mode)) + return 0; + + // TODO(any): Move block independent condition checks to frame level + if (is_inter_block(mbmi)) { + if (is_inter_mode(best_mode) && + (sf->tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh != INT_MAX) && + !cpi->oxcf.txfm_cfg.use_inter_dct_only) + return 1; + } else { + if (sf->tx_sf.tx_type_search.fast_intra_tx_type_search && + !cpi->oxcf.txfm_cfg.use_intra_default_tx_only && + !cpi->oxcf.txfm_cfg.use_intra_dct_only) + return 1; + } + + // Check speed feature related to winner mode processing + if (sf->winner_mode_sf.enable_winner_mode_for_coeff_opt && + cpi->optimize_seg_arr[mbmi->segment_id] != NO_TRELLIS_OPT && + cpi->optimize_seg_arr[mbmi->segment_id] != FINAL_PASS_TRELLIS_OPT) + return 1; + if (sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch) return 1; + + return 0; +} + +static INLINE void set_tx_size_search_method( + const AV1_COMMON *cm, const WinnerModeParams *winner_mode_params, + TxfmSearchParams *txfm_params, int enable_winner_mode_for_tx_size_srch, + int is_winner_mode) { + // Populate transform size search method/transform mode appropriately + txfm_params->tx_size_search_method = + winner_mode_params->tx_size_search_methods[DEFAULT_EVAL]; + if (enable_winner_mode_for_tx_size_srch) { + if (is_winner_mode) + txfm_params->tx_size_search_method = + winner_mode_params->tx_size_search_methods[WINNER_MODE_EVAL]; + else + txfm_params->tx_size_search_method = + winner_mode_params->tx_size_search_methods[MODE_EVAL]; + } + txfm_params->tx_mode_search_type = + select_tx_mode(cm, txfm_params->tx_size_search_method); +} + +static INLINE void set_tx_type_prune(const SPEED_FEATURES *sf, + TxfmSearchParams *txfm_params, + int winner_mode_tx_type_pruning, + int is_winner_mode) { + // Populate prune transform mode appropriately + txfm_params->prune_2d_txfm_mode = sf->tx_sf.tx_type_search.prune_2d_txfm_mode; + if (!winner_mode_tx_type_pruning) return; + + const int prune_mode[4][2] = { { TX_TYPE_PRUNE_3, TX_TYPE_PRUNE_0 }, + { TX_TYPE_PRUNE_4, TX_TYPE_PRUNE_0 }, + { TX_TYPE_PRUNE_5, TX_TYPE_PRUNE_2 }, + { TX_TYPE_PRUNE_5, TX_TYPE_PRUNE_3 } }; + txfm_params->prune_2d_txfm_mode = + prune_mode[winner_mode_tx_type_pruning - 1][is_winner_mode]; +} + +static INLINE void set_tx_domain_dist_params( + const WinnerModeParams *winner_mode_params, TxfmSearchParams *txfm_params, + int enable_winner_mode_for_tx_domain_dist, int is_winner_mode) { + if (txfm_params->use_qm_dist_metric) { + // QM-weighted PSNR is computed in transform space, so we need to forcibly + // enable the use of tx domain distortion. 
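+    // Editor's note (added for clarity, not upstream text): transform-domain
+    // distortion approximates the pixel-domain SSE directly from the
+    // quantization error of the transform coefficients, roughly
+    //   dist ~= sum((coeff[i] - dqcoeff[i])^2) >> shift
+    // with the shift undoing the forward-transform scaling.  Since QM-PSNR is
+    // defined on those same weighted coefficients, it is forced on here.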
+ txfm_params->use_transform_domain_distortion = 1; + txfm_params->tx_domain_dist_threshold = 0; + return; + } + + if (!enable_winner_mode_for_tx_domain_dist) { + txfm_params->use_transform_domain_distortion = + winner_mode_params->use_transform_domain_distortion[DEFAULT_EVAL]; + txfm_params->tx_domain_dist_threshold = + winner_mode_params->tx_domain_dist_threshold[DEFAULT_EVAL]; + return; + } + + if (is_winner_mode) { + txfm_params->use_transform_domain_distortion = + winner_mode_params->use_transform_domain_distortion[WINNER_MODE_EVAL]; + txfm_params->tx_domain_dist_threshold = + winner_mode_params->tx_domain_dist_threshold[WINNER_MODE_EVAL]; + } else { + txfm_params->use_transform_domain_distortion = + winner_mode_params->use_transform_domain_distortion[MODE_EVAL]; + txfm_params->tx_domain_dist_threshold = + winner_mode_params->tx_domain_dist_threshold[MODE_EVAL]; + } +} + +// This function sets mode parameters for different mode evaluation stages +static INLINE void set_mode_eval_params(const struct AV1_COMP *cpi, + MACROBLOCK *x, + MODE_EVAL_TYPE mode_eval_type) { + const AV1_COMMON *cm = &cpi->common; + const SPEED_FEATURES *sf = &cpi->sf; + const WinnerModeParams *winner_mode_params = &cpi->winner_mode_params; + TxfmSearchParams *txfm_params = &x->txfm_search_params; + + txfm_params->use_qm_dist_metric = + cpi->oxcf.tune_cfg.dist_metric == AOM_DIST_METRIC_QM_PSNR; + + switch (mode_eval_type) { + case DEFAULT_EVAL: + txfm_params->default_inter_tx_type_prob_thresh = INT_MAX; + txfm_params->use_default_intra_tx_type = 0; + txfm_params->skip_txfm_level = + winner_mode_params->skip_txfm_level[DEFAULT_EVAL]; + txfm_params->predict_dc_level = + winner_mode_params->predict_dc_level[DEFAULT_EVAL]; + // Set default transform domain distortion type + set_tx_domain_dist_params(winner_mode_params, txfm_params, 0, 0); + + // Get default threshold for R-D optimization of coefficients + get_rd_opt_coeff_thresh(winner_mode_params->coeff_opt_thresholds, + txfm_params, 0, 0); + + // Set default transform size search method + set_tx_size_search_method(cm, winner_mode_params, txfm_params, 0, 0); + // Set default transform type prune + set_tx_type_prune(sf, txfm_params, 0, 0); + break; + case MODE_EVAL: + txfm_params->use_default_intra_tx_type = + (cpi->sf.tx_sf.tx_type_search.fast_intra_tx_type_search || + cpi->oxcf.txfm_cfg.use_intra_default_tx_only); + txfm_params->default_inter_tx_type_prob_thresh = + cpi->sf.tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh; + txfm_params->skip_txfm_level = + winner_mode_params->skip_txfm_level[MODE_EVAL]; + txfm_params->predict_dc_level = + winner_mode_params->predict_dc_level[MODE_EVAL]; + // Set transform domain distortion type for mode evaluation + set_tx_domain_dist_params( + winner_mode_params, txfm_params, + sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist, 0); + + // Get threshold for R-D optimization of coefficients during mode + // evaluation + get_rd_opt_coeff_thresh( + winner_mode_params->coeff_opt_thresholds, txfm_params, + sf->winner_mode_sf.enable_winner_mode_for_coeff_opt, 0); + + // Set the transform size search method for mode evaluation + set_tx_size_search_method( + cm, winner_mode_params, txfm_params, + sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch, 0); + // Set transform type prune for mode evaluation + set_tx_type_prune(sf, txfm_params, + sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning, + 0); + break; + case WINNER_MODE_EVAL: + txfm_params->default_inter_tx_type_prob_thresh = INT_MAX; + 
txfm_params->use_default_intra_tx_type = 0; + txfm_params->skip_txfm_level = + winner_mode_params->skip_txfm_level[WINNER_MODE_EVAL]; + txfm_params->predict_dc_level = + winner_mode_params->predict_dc_level[WINNER_MODE_EVAL]; + + // Set transform domain distortion type for winner mode evaluation + set_tx_domain_dist_params( + winner_mode_params, txfm_params, + sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist, 1); + + // Get threshold for R-D optimization of coefficients for winner mode + // evaluation + get_rd_opt_coeff_thresh( + winner_mode_params->coeff_opt_thresholds, txfm_params, + sf->winner_mode_sf.enable_winner_mode_for_coeff_opt, 1); + + // Set the transform size search method for winner mode evaluation + set_tx_size_search_method( + cm, winner_mode_params, txfm_params, + sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch, 1); + // Set default transform type prune mode for winner mode evaluation + set_tx_type_prune(sf, txfm_params, + sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning, + 1); + break; + default: assert(0); + } + + // Rd record collected at a specific mode evaluation stage can not be used + // across other evaluation stages as the transform parameters are different. + // Hence, reset mb rd record whenever mode evaluation stage type changes. + if (txfm_params->mode_eval_type != mode_eval_type) + reset_mb_rd_record(x->txfm_search_info.mb_rd_record); + + txfm_params->mode_eval_type = mode_eval_type; +} + +// Similar to store_cfl_required(), but for use during the RDO process, +// where we haven't yet determined whether this block uses CfL. +static INLINE CFL_ALLOWED_TYPE store_cfl_required_rdo(const AV1_COMMON *cm, + const MACROBLOCK *x) { + const MACROBLOCKD *xd = &x->e_mbd; + + if (cm->seq_params->monochrome || !xd->is_chroma_ref) return CFL_DISALLOWED; + + if (!xd->is_chroma_ref) { + // For non-chroma-reference blocks, we should always store the luma pixels, + // in case the corresponding chroma-reference block uses CfL. + // Note that this can only happen for block sizes which are <8 on + // their shortest side, as otherwise they would be chroma reference + // blocks. + return CFL_ALLOWED; + } + + // For chroma reference blocks, we should store data in the encoder iff we're + // allowed to try out CfL. + return is_cfl_allowed(xd); +} + +static AOM_INLINE void init_sbuv_mode(MB_MODE_INFO *const mbmi) { + mbmi->uv_mode = UV_DC_PRED; + mbmi->palette_mode_info.palette_size[1] = 0; +} + +// Store best mode stats for winner mode processing +static INLINE void store_winner_mode_stats( + const AV1_COMMON *const cm, MACROBLOCK *x, const MB_MODE_INFO *mbmi, + RD_STATS *rd_cost, RD_STATS *rd_cost_y, RD_STATS *rd_cost_uv, + THR_MODES mode_index, uint8_t *color_map, BLOCK_SIZE bsize, int64_t this_rd, + int multi_winner_mode_type, int txfm_search_done) { + WinnerModeStats *winner_mode_stats = x->winner_mode_stats; + int mode_idx = 0; + int is_palette_mode = mbmi->palette_mode_info.palette_size[PLANE_TYPE_Y] > 0; + // Mode stat is not required when multiwinner mode processing is disabled + if (multi_winner_mode_type == MULTI_WINNER_MODE_OFF) return; + // Ignore mode with maximum rd + if (this_rd == INT64_MAX) return; + // TODO(any): Winner mode processing is currently not applicable for palette + // mode in Inter frames. 
Clean-up the following code, once support is added + if (!frame_is_intra_only(cm) && is_palette_mode) return; + + int max_winner_mode_count = winner_mode_count_allowed[multi_winner_mode_type]; + assert(x->winner_mode_count >= 0 && + x->winner_mode_count <= max_winner_mode_count); + + if (x->winner_mode_count) { + // Find the mode which has higher rd cost than this_rd + for (mode_idx = 0; mode_idx < x->winner_mode_count; mode_idx++) + if (winner_mode_stats[mode_idx].rd > this_rd) break; + + if (mode_idx == max_winner_mode_count) { + // No mode has higher rd cost than this_rd + return; + } else if (mode_idx < max_winner_mode_count - 1) { + // Create a slot for current mode and move others to the next slot + memmove( + &winner_mode_stats[mode_idx + 1], &winner_mode_stats[mode_idx], + (max_winner_mode_count - mode_idx - 1) * sizeof(*winner_mode_stats)); + } + } + // Add a mode stat for winner mode processing + winner_mode_stats[mode_idx].mbmi = *mbmi; + winner_mode_stats[mode_idx].rd = this_rd; + winner_mode_stats[mode_idx].mode_index = mode_index; + + // Update rd stats required for inter frame + if (!frame_is_intra_only(cm) && rd_cost && rd_cost_y && rd_cost_uv) { + const MACROBLOCKD *xd = &x->e_mbd; + const int skip_ctx = av1_get_skip_txfm_context(xd); + const int is_intra_mode = av1_mode_defs[mode_index].mode < INTRA_MODE_END; + const int skip_txfm = mbmi->skip_txfm && !is_intra_mode; + + winner_mode_stats[mode_idx].rd_cost = *rd_cost; + if (txfm_search_done) { + winner_mode_stats[mode_idx].rate_y = + rd_cost_y->rate + + x->mode_costs + .skip_txfm_cost[skip_ctx][rd_cost->skip_txfm || skip_txfm]; + winner_mode_stats[mode_idx].rate_uv = rd_cost_uv->rate; + } + } + + if (color_map) { + // Store color_index_map for palette mode + const MACROBLOCKD *const xd = &x->e_mbd; + int block_width, block_height; + av1_get_block_dimensions(bsize, AOM_PLANE_Y, xd, &block_width, + &block_height, NULL, NULL); + memcpy(winner_mode_stats[mode_idx].color_index_map, color_map, + block_width * block_height * sizeof(color_map[0])); + } + + x->winner_mode_count = + AOMMIN(x->winner_mode_count + 1, max_winner_mode_count); +} + +unsigned int av1_get_perpixel_variance(const AV1_COMP *cpi, + const MACROBLOCKD *xd, + const struct buf_2d *ref, + BLOCK_SIZE bsize, int plane, + int use_hbd); + +unsigned int av1_get_perpixel_variance_facade(const struct AV1_COMP *cpi, + const MACROBLOCKD *xd, + const struct buf_2d *ref, + BLOCK_SIZE bsize, int plane); + +static INLINE int is_mode_intra(PREDICTION_MODE mode) { + return mode < INTRA_MODE_END; +} + +// This function will copy usable ref_mv_stack[ref_frame][4] and +// weight[ref_frame][4] information from ref_mv_stack[ref_frame][8] and +// weight[ref_frame][8]. +static INLINE void av1_copy_usable_ref_mv_stack_and_weight( + const MACROBLOCKD *xd, MB_MODE_INFO_EXT *const mbmi_ext, + MV_REFERENCE_FRAME ref_frame) { + memcpy(mbmi_ext->weight[ref_frame], xd->weight[ref_frame], + USABLE_REF_MV_STACK_SIZE * sizeof(xd->weight[0][0])); + memcpy(mbmi_ext->ref_mv_stack[ref_frame], xd->ref_mv_stack[ref_frame], + USABLE_REF_MV_STACK_SIZE * sizeof(xd->ref_mv_stack[0][0])); +} + +// Get transform rd gate level for the given transform search case. 
+static INLINE int get_txfm_rd_gate_level(
+    const int is_masked_compound_enabled,
+    const int txfm_rd_gate_level[TX_SEARCH_CASES], BLOCK_SIZE bsize,
+    TX_SEARCH_CASE tx_search_case, int eval_motion_mode) {
+  assert(tx_search_case < TX_SEARCH_CASES);
+  if (tx_search_case == TX_SEARCH_MOTION_MODE && !eval_motion_mode &&
+      num_pels_log2_lookup[bsize] > 8)
+    return txfm_rd_gate_level[TX_SEARCH_MOTION_MODE];
+  // Enable aggressive gating of transform search only when masked compound type
+  // is enabled.
+  else if (tx_search_case == TX_SEARCH_COMP_TYPE_MODE &&
+           is_masked_compound_enabled)
+    return txfm_rd_gate_level[TX_SEARCH_COMP_TYPE_MODE];
+
+  return txfm_rd_gate_level[TX_SEARCH_DEFAULT];
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_RDOPT_UTILS_H_
diff --git a/third_party/aom/av1/encoder/reconinter_enc.c b/third_party/aom/av1/encoder/reconinter_enc.c
new file mode 100644
index 0000000000..9b964113a5
--- /dev/null
+++ b/third_party/aom/av1/encoder/reconinter_enc.c
@@ -0,0 +1,701 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <limits.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/obmc.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/encoder/reconinter_enc.h"
+
+static AOM_INLINE void enc_calc_subpel_params(
+    const MV *const src_mv, InterPredParams *const inter_pred_params,
+    uint8_t **pre, SubpelParams *subpel_params, int *src_stride) {
+  struct buf_2d *pre_buf = &inter_pred_params->ref_frame_buf;
+  init_subpel_params(src_mv, inter_pred_params, subpel_params, pre_buf->width,
+                     pre_buf->height);
+  *pre = pre_buf->buf0 +
+         (subpel_params->pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
+         (subpel_params->pos_x >> SCALE_SUBPEL_BITS);
+  *src_stride = pre_buf->stride;
+}
+
+#define IS_DEC 0
+#include "av1/common/reconinter_template.inc"
+#undef IS_DEC
+
+void av1_enc_build_one_inter_predictor(uint8_t *dst, int dst_stride,
+                                       const MV *src_mv,
+                                       InterPredParams *inter_pred_params) {
+  build_one_inter_predictor(dst, dst_stride, src_mv, inter_pred_params);
+}
+
+static void enc_build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                       int plane, const MB_MODE_INFO *mi,
+                                       int bw, int bh, int mi_x, int mi_y) {
+  build_inter_predictors(cm, xd, plane, mi, /*build_for_obmc=*/0, bw, bh, mi_x,
+                         mi_y);
+}
+
+void av1_enc_build_inter_predictor_y(MACROBLOCKD *xd, int mi_row, int mi_col) {
+  const int mi_x = mi_col * MI_SIZE;
+  const int mi_y = mi_row * MI_SIZE;
+  struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+  InterPredParams inter_pred_params;
+
+  struct buf_2d *const dst_buf = &pd->dst;
+  uint8_t *const dst = dst_buf->buf;
+  const MV mv = xd->mi[0]->mv[0].as_mv;
+  const struct scale_factors *const sf = xd->block_ref_scale_factors[0];
+
+ 
av1_init_inter_params(&inter_pred_params, pd->width, pd->height, mi_y, mi_x, + pd->subsampling_x, pd->subsampling_y, xd->bd, + is_cur_buf_hbd(xd), false, sf, pd->pre, + xd->mi[0]->interp_filters); + + inter_pred_params.conv_params = get_conv_params_no_round( + 0, AOM_PLANE_Y, xd->tmp_conv_dst, MAX_SB_SIZE, false, xd->bd); + + inter_pred_params.conv_params.use_dist_wtd_comp_avg = 0; + av1_enc_build_one_inter_predictor(dst, dst_buf->stride, &mv, + &inter_pred_params); +} + +void av1_enc_build_inter_predictor_y_nonrd(MACROBLOCKD *xd, + InterPredParams *inter_pred_params, + const SubpelParams *subpel_params) { + struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y]; + + const MB_MODE_INFO *mbmi = xd->mi[0]; + struct buf_2d *const dst_buf = &pd->dst; + const struct buf_2d *pre_buf = &pd->pre[0]; + const uint8_t *src = + pre_buf->buf0 + + (subpel_params->pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride + + (subpel_params->pos_x >> SCALE_SUBPEL_BITS); + uint8_t *const dst = dst_buf->buf; + int src_stride = pre_buf->stride; + int dst_stride = dst_buf->stride; + inter_pred_params->ref_frame_buf = *pre_buf; + + // Initialize interp filter for single reference mode. + init_interp_filter_params(inter_pred_params->interp_filter_params, + &mbmi->interp_filters.as_filters, pd->width, + pd->height, /*is_intrabc=*/0); + + av1_make_inter_predictor(src, src_stride, dst, dst_stride, inter_pred_params, + subpel_params); +} + +void av1_enc_build_inter_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, + const BUFFER_SET *ctx, BLOCK_SIZE bsize, + int plane_from, int plane_to) { + for (int plane = plane_from; plane <= plane_to; ++plane) { + if (plane && !xd->is_chroma_ref) break; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + enc_build_inter_predictors(cm, xd, plane, xd->mi[0], xd->plane[plane].width, + xd->plane[plane].height, mi_x, mi_y); + + if (is_interintra_pred(xd->mi[0])) { + BUFFER_SET default_ctx = { + { xd->plane[0].dst.buf, xd->plane[1].dst.buf, xd->plane[2].dst.buf }, + { xd->plane[0].dst.stride, xd->plane[1].dst.stride, + xd->plane[2].dst.stride } + }; + if (!ctx) { + ctx = &default_ctx; + } + av1_build_interintra_predictor(cm, xd, xd->plane[plane].dst.buf, + xd->plane[plane].dst.stride, ctx, plane, + bsize); + } + } +} + +static void setup_address_for_obmc(MACROBLOCKD *xd, int mi_row_offset, + int mi_col_offset, MB_MODE_INFO *ref_mbmi, + struct build_prediction_ctxt *ctxt, + const int num_planes) { + const BLOCK_SIZE ref_bsize = AOMMAX(BLOCK_8X8, ref_mbmi->bsize); + const int ref_mi_row = xd->mi_row + mi_row_offset; + const int ref_mi_col = xd->mi_col + mi_col_offset; + + for (int plane = 0; plane < num_planes; ++plane) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + setup_pred_plane(&pd->dst, ref_bsize, ctxt->tmp_buf[plane], + ctxt->tmp_width[plane], ctxt->tmp_height[plane], + ctxt->tmp_stride[plane], mi_row_offset, mi_col_offset, + NULL, pd->subsampling_x, pd->subsampling_y); + } + + const MV_REFERENCE_FRAME frame = ref_mbmi->ref_frame[0]; + + const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame); + const struct scale_factors *const sf = + get_ref_scale_factors_const(ctxt->cm, frame); + + xd->block_ref_scale_factors[0] = sf; + if (!av1_is_valid_scale(sf)) + aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM, + "Reference frame has invalid dimensions"); + + av1_setup_pre_planes(xd, 0, &ref_buf->buf, ref_mi_row, ref_mi_col, sf, + num_planes); +} + +static INLINE void build_obmc_prediction(MACROBLOCKD 
*xd, int rel_mi_row, + int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *above_mbmi, + void *fun_ctxt, const int num_planes) { + struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt; + setup_address_for_obmc(xd, rel_mi_row, rel_mi_col, above_mbmi, ctxt, + num_planes); + + const int mi_x = (xd->mi_col + rel_mi_col) << MI_SIZE_LOG2; + const int mi_y = (xd->mi_row + rel_mi_row) << MI_SIZE_LOG2; + + const BLOCK_SIZE bsize = xd->mi[0]->bsize; + + InterPredParams inter_pred_params; + + for (int j = 0; j < num_planes; ++j) { + const struct macroblockd_plane *pd = &xd->plane[j]; + int bw = 0, bh = 0; + + if (dir) { + // prepare left reference block size + bw = clamp(block_size_wide[bsize] >> (pd->subsampling_x + 1), 4, + block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1)); + bh = (op_mi_size << MI_SIZE_LOG2) >> pd->subsampling_y; + } else { + // prepare above reference block size + bw = (op_mi_size * MI_SIZE) >> pd->subsampling_x; + bh = clamp(block_size_high[bsize] >> (pd->subsampling_y + 1), 4, + block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1)); + } + + if (av1_skip_u4x4_pred_in_obmc(bsize, pd, dir)) continue; + + const struct buf_2d *const pre_buf = &pd->pre[0]; + const MV mv = above_mbmi->mv[0].as_mv; + + av1_init_inter_params(&inter_pred_params, bw, bh, mi_y >> pd->subsampling_y, + mi_x >> pd->subsampling_x, pd->subsampling_x, + pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd), 0, + xd->block_ref_scale_factors[0], pre_buf, + above_mbmi->interp_filters); + inter_pred_params.conv_params = get_conv_params(0, j, xd->bd); + + av1_enc_build_one_inter_predictor(pd->dst.buf, pd->dst.stride, &mv, + &inter_pred_params); + } +} + +void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, + uint8_t *tmp_buf[MAX_MB_PLANE], + int tmp_width[MAX_MB_PLANE], + int tmp_height[MAX_MB_PLANE], + int tmp_stride[MAX_MB_PLANE]) { + if (!xd->up_available) return; + struct build_prediction_ctxt ctxt = { + cm, tmp_buf, tmp_width, tmp_height, tmp_stride, xd->mb_to_right_edge, NULL + }; + BLOCK_SIZE bsize = xd->mi[0]->bsize; + foreach_overlappable_nb_above(cm, xd, + max_neighbor_obmc[mi_size_wide_log2[bsize]], + build_obmc_prediction, &ctxt); +} + +void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, + uint8_t *tmp_buf[MAX_MB_PLANE], + int tmp_width[MAX_MB_PLANE], + int tmp_height[MAX_MB_PLANE], + int tmp_stride[MAX_MB_PLANE]) { + if (!xd->left_available) return; + struct build_prediction_ctxt ctxt = { + cm, tmp_buf, tmp_width, tmp_height, tmp_stride, xd->mb_to_bottom_edge, NULL + }; + BLOCK_SIZE bsize = xd->mi[0]->bsize; + foreach_overlappable_nb_left(cm, xd, + max_neighbor_obmc[mi_size_high_log2[bsize]], + build_obmc_prediction, &ctxt); +} + +void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd) { + const int num_planes = av1_num_planes(cm); + uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE]; + int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + + av1_setup_obmc_dst_bufs(xd, dst_buf1, dst_buf2); + + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + 
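+  // Editor's note (added for clarity, not upstream text): OBMC is assembled
+  // in three steps -- predict from the above neighbours' motion into
+  // dst_buf1, from the left neighbours' motion into dst_buf2, then re-point
+  // the destination planes at the frame buffer and blend both temporary
+  // predictions into the current block.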
av1_build_prediction_by_above_preds(cm, xd, dst_buf1, dst_width1, dst_height1, + dst_stride1); + av1_build_prediction_by_left_preds(cm, xd, dst_buf2, dst_width2, dst_height2, + dst_stride2); + av1_setup_dst_planes(xd->plane, xd->mi[0]->bsize, &cm->cur_frame->buf, mi_row, + mi_col, 0, num_planes); + av1_build_obmc_inter_prediction(cm, xd, dst_buf1, dst_stride1, dst_buf2, + dst_stride2); +} + +void av1_build_inter_predictors_for_planes_single_buf( + MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int ref, + uint8_t *ext_dst[], int ext_dst_stride[]) { + assert(bsize < BLOCK_SIZES_ALL); + const MB_MODE_INFO *mi = xd->mi[0]; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + WarpTypesAllowed warp_types; + const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]]; + warp_types.global_warp_allowed = is_global_mv_block(mi, wm->wmtype); + warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; + + for (int plane = plane_from; plane <= plane_to; ++plane) { + const struct macroblockd_plane *pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + + InterPredParams inter_pred_params; + + av1_init_inter_params(&inter_pred_params, bw, bh, mi_y >> pd->subsampling_y, + mi_x >> pd->subsampling_x, pd->subsampling_x, + pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd), 0, + xd->block_ref_scale_factors[ref], &pd->pre[ref], + mi->interp_filters); + inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); + av1_init_warp_params(&inter_pred_params, &warp_types, ref, xd, mi); + + uint8_t *const dst = get_buf_by_bd(xd, ext_dst[plane]); + const MV mv = mi->mv[ref].as_mv; + + av1_enc_build_one_inter_predictor(dst, ext_dst_stride[plane], &mv, + &inter_pred_params); + } +} + +static void build_masked_compound( + uint8_t *dst, int dst_stride, const uint8_t *src0, int src0_stride, + const uint8_t *src1, int src1_stride, + const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h, + int w) { + // Derive subsampling from h and w passed in. May be refactored to + // pass in subsampling factors directly. + const int subh = (2 << mi_size_high_log2[sb_type]) == h; + const int subw = (2 << mi_size_wide_log2[sb_type]) == w; + const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type); + aom_blend_a64_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride, + mask, block_size_wide[sb_type], w, h, subw, subh); +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void build_masked_compound_highbd( + uint8_t *dst_8, int dst_stride, const uint8_t *src0_8, int src0_stride, + const uint8_t *src1_8, int src1_stride, + const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h, + int w, int bd) { + // Derive subsampling from h and w passed in. May be refactored to + // pass in subsampling factors directly. 
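+  // Editor's note (added for clarity, not upstream text):
+  // block_size_high[sb_type] is 4 << mi_size_high_log2[sb_type], so the test
+  // below sets subh = 1 exactly when h is half the luma block height, i.e.
+  // when this call operates on a vertically subsampled chroma plane; subw is
+  // the horizontal analogue.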
+ const int subh = (2 << mi_size_high_log2[sb_type]) == h; + const int subw = (2 << mi_size_wide_log2[sb_type]) == w; + const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type); + // const uint8_t *mask = + // av1_get_contiguous_soft_mask(wedge_index, wedge_sign, sb_type); + aom_highbd_blend_a64_mask(dst_8, dst_stride, src0_8, src0_stride, src1_8, + src1_stride, mask, block_size_wide[sb_type], w, h, + subw, subh, bd); +} +#endif + +static void build_wedge_inter_predictor_from_buf( + MACROBLOCKD *xd, int plane, int x, int y, int w, int h, uint8_t *ext_dst0, + int ext_dst_stride0, uint8_t *ext_dst1, int ext_dst_stride1) { + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int is_compound = has_second_ref(mbmi); + MACROBLOCKD_PLANE *const pd = &xd->plane[plane]; + struct buf_2d *const dst_buf = &pd->dst; + uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x; + mbmi->interinter_comp.seg_mask = xd->seg_mask; + const INTERINTER_COMPOUND_DATA *comp_data = &mbmi->interinter_comp; + const int is_hbd = is_cur_buf_hbd(xd); + + if (is_compound && is_masked_compound_type(comp_data->type)) { + if (!plane && comp_data->type == COMPOUND_DIFFWTD) { +#if CONFIG_AV1_HIGHBITDEPTH + if (is_hbd) { + av1_build_compound_diffwtd_mask_highbd( + comp_data->seg_mask, comp_data->mask_type, + CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0, + CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, h, w, xd->bd); + } else { + av1_build_compound_diffwtd_mask( + comp_data->seg_mask, comp_data->mask_type, ext_dst0, + ext_dst_stride0, ext_dst1, ext_dst_stride1, h, w); + } +#else + (void)is_hbd; + av1_build_compound_diffwtd_mask(comp_data->seg_mask, comp_data->mask_type, + ext_dst0, ext_dst_stride0, ext_dst1, + ext_dst_stride1, h, w); +#endif // CONFIG_AV1_HIGHBITDEPTH + } +#if CONFIG_AV1_HIGHBITDEPTH + if (is_hbd) { + build_masked_compound_highbd( + dst, dst_buf->stride, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0, + CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, comp_data, mbmi->bsize, + h, w, xd->bd); + } else { + build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0, + ext_dst1, ext_dst_stride1, comp_data, mbmi->bsize, + h, w); + } +#else + build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0, + ext_dst1, ext_dst_stride1, comp_data, mbmi->bsize, h, + w); +#endif + } else { +#if CONFIG_AV1_HIGHBITDEPTH + if (is_hbd) { + aom_highbd_convolve_copy(CONVERT_TO_SHORTPTR(ext_dst0), ext_dst_stride0, + CONVERT_TO_SHORTPTR(dst), dst_buf->stride, w, h); + } else { + aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, w, h); + } +#else + aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, w, h); +#endif + } +} + +void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize, + int plane_from, int plane_to, + uint8_t *ext_dst0[], + int ext_dst_stride0[], + uint8_t *ext_dst1[], + int ext_dst_stride1[]) { + int plane; + assert(bsize < BLOCK_SIZES_ALL); + for (plane = plane_from; plane <= plane_to; ++plane) { + const BLOCK_SIZE plane_bsize = get_plane_block_size( + bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y); + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + build_wedge_inter_predictor_from_buf( + xd, plane, 0, 0, bw, bh, ext_dst0[plane], ext_dst_stride0[plane], + ext_dst1[plane], ext_dst_stride1[plane]); + } +} + +// Get pred block from up-sampled reference. 
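+// Editor's note (added for clarity, not upstream text): subpel_x_q3 and
+// subpel_y_q3 are 1/8-pel phases (0..7); they are doubled before the kernel
+// lookups because the filter tables are indexed in 1/16-pel steps.  The
+// two-dimensional path filters horizontally into a temporary buffer sized
+//   intermediate_height = (((height - 1) * 8 + subpel_y_q3) >> 3) + taps
+// e.g. height = 32, subpel_y_q3 = 4, taps = 8 gives ((31*8 + 4) >> 3) + 8 = 39
+// rows, enough context for the vertical taps that follow.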
+void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm, + int mi_row, int mi_col, const MV *const mv, + uint8_t *comp_pred, int width, int height, + int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, int subpel_search) { + // expect xd == NULL only in tests + if (xd != NULL) { + const MB_MODE_INFO *mi = xd->mi[0]; + const int ref_num = 0; + const int is_intrabc = is_intrabc_block(mi); + const struct scale_factors *const sf = + is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num]; + const int is_scaled = av1_is_scaled(sf); + + if (is_scaled) { + int plane = 0; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const struct buf_2d *const dst_buf = &pd->dst; + const struct buf_2d *const pre_buf = + is_intrabc ? dst_buf : &pd->pre[ref_num]; + + InterPredParams inter_pred_params; + inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); + const int_interpfilters filters = + av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + av1_init_inter_params( + &inter_pred_params, width, height, mi_y >> pd->subsampling_y, + mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y, + xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters); + av1_enc_build_one_inter_predictor(comp_pred, width, mv, + &inter_pred_params); + return; + } + } + + const InterpFilterParams *filter = av1_get_filter(subpel_search); + + if (!subpel_x_q3 && !subpel_y_q3) { + for (int i = 0; i < height; i++) { + memcpy(comp_pred, ref, width * sizeof(*comp_pred)); + comp_pred += width; + ref += ref_stride; + } + } else if (!subpel_y_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + aom_convolve8_horiz_c(ref, ref_stride, comp_pred, width, kernel, 16, NULL, + -1, width, height); + } else if (!subpel_x_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + aom_convolve8_vert_c(ref, ref_stride, comp_pred, width, NULL, -1, kernel, + 16, width, height); + } else { + DECLARE_ALIGNED(16, uint8_t, + temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]); + const int16_t *const kernel_x = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + const int16_t *const kernel_y = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + const int intermediate_height = + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps; + assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); + aom_convolve8_horiz_c(ref - ref_stride * ((filter->taps >> 1) - 1), + ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, + width, intermediate_height); + aom_convolve8_vert_c(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1), + MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16, + width, height); + } +} + +void aom_comp_avg_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm, + int mi_row, int mi_col, const MV *const mv, + uint8_t *comp_pred, const uint8_t *pred, + int width, int height, int subpel_x_q3, + int subpel_y_q3, const uint8_t *ref, + int ref_stride, int subpel_search) { + int i, j; + + aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride, + subpel_search); + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + comp_pred[j] = ROUND_POWER_OF_TWO(comp_pred[j] + pred[j], 1); + } + comp_pred += width; + pred += width; + } +} + +void aom_comp_mask_upsampled_pred(MACROBLOCKD *xd, const AV1_COMMON 
*const cm, + int mi_row, int mi_col, const MV *const mv, + uint8_t *comp_pred, const uint8_t *pred, + int width, int height, int subpel_x_q3, + int subpel_y_q3, const uint8_t *ref, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask, + int subpel_search) { + if (subpel_x_q3 | subpel_y_q3) { + aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride, + subpel_search); + ref = comp_pred; + ref_stride = width; + } + aom_comp_mask_pred(comp_pred, pred, width, height, ref, ref_stride, mask, + mask_stride, invert_mask); +} + +void aom_dist_wtd_comp_avg_upsampled_pred_c( + MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) { + int i, j; + const int fwd_offset = jcp_param->fwd_offset; + const int bck_offset = jcp_param->bck_offset; + + aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride, + subpel_search); + + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset; + tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS); + comp_pred[j] = (uint8_t)tmp; + } + comp_pred += width; + pred += width; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd, + const struct AV1Common *const cm, int mi_row, + int mi_col, const MV *const mv, + uint8_t *comp_pred8, int width, int height, + int subpel_x_q3, int subpel_y_q3, + const uint8_t *ref8, int ref_stride, int bd, + int subpel_search) { + // expect xd == NULL only in tests + if (xd != NULL) { + const MB_MODE_INFO *mi = xd->mi[0]; + const int ref_num = 0; + const int is_intrabc = is_intrabc_block(mi); + const struct scale_factors *const sf = + is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num]; + const int is_scaled = av1_is_scaled(sf); + + if (is_scaled) { + int plane = 0; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const struct buf_2d *const dst_buf = &pd->dst; + const struct buf_2d *const pre_buf = + is_intrabc ? 
dst_buf : &pd->pre[ref_num]; + + InterPredParams inter_pred_params; + inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); + const int_interpfilters filters = + av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + av1_init_inter_params( + &inter_pred_params, width, height, mi_y >> pd->subsampling_y, + mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y, + xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters); + av1_enc_build_one_inter_predictor(comp_pred8, width, mv, + &inter_pred_params); + return; + } + } + + const InterpFilterParams *filter = av1_get_filter(subpel_search); + + if (!subpel_x_q3 && !subpel_y_q3) { + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + for (int i = 0; i < height; i++) { + memcpy(comp_pred, ref, width * sizeof(*comp_pred)); + comp_pred += width; + ref += ref_stride; + } + } else if (!subpel_y_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + aom_highbd_convolve8_horiz_c(ref8, ref_stride, comp_pred8, width, kernel, + 16, NULL, -1, width, height, bd); + } else if (!subpel_x_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + aom_highbd_convolve8_vert_c(ref8, ref_stride, comp_pred8, width, NULL, -1, + kernel, 16, width, height, bd); + } else { + DECLARE_ALIGNED(16, uint16_t, + temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]); + const int16_t *const kernel_x = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + const int16_t *const kernel_y = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + const int intermediate_height = + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps; + assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); + aom_highbd_convolve8_horiz_c(ref8 - ref_stride * ((filter->taps >> 1) - 1), + ref_stride, CONVERT_TO_BYTEPTR(temp), + MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, + intermediate_height, bd); + aom_highbd_convolve8_vert_c( + CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)), + MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height, + bd); + } +} + +void aom_highbd_comp_avg_upsampled_pred_c( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, int bd, int subpel_search) { + int i, j; + + const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, + height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, + bd, subpel_search); + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + comp_pred[j] = ROUND_POWER_OF_TWO(pred[j] + comp_pred[j], 1); + } + comp_pred += width; + pred += width; + } +} + +void aom_highbd_dist_wtd_comp_avg_upsampled_pred_c( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param, + int subpel_search) { + int i, j; + const int fwd_offset = jcp_param->fwd_offset; + const int bck_offset = jcp_param->bck_offset; + const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + 
aom_highbd_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred8, width, + height, subpel_x_q3, subpel_y_q3, ref8, + ref_stride, bd, subpel_search); + + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset; + tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS); + comp_pred[j] = (uint16_t)tmp; + } + comp_pred += width; + pred += width; + } +} + +void aom_highbd_comp_mask_upsampled_pred( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, + int bd, int subpel_search) { + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, + height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, + bd, subpel_search); + aom_highbd_comp_mask_pred(comp_pred8, pred8, width, height, comp_pred8, width, + mask, mask_stride, invert_mask); +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/third_party/aom/av1/encoder/reconinter_enc.h b/third_party/aom/av1/encoder/reconinter_enc.h new file mode 100644 index 0000000000..16932f37a0 --- /dev/null +++ b/third_party/aom/av1/encoder/reconinter_enc.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_RECONINTER_ENC_H_ +#define AOM_AV1_ENCODER_RECONINTER_ENC_H_ + +#include "aom/aom_integer.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" +#include "av1/common/convolve.h" +#include "av1/common/filter.h" +#include "av1/common/reconinter.h" +#include "av1/common/warped_motion.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void aom_comp_mask_upsampled_pred(MACROBLOCKD *xd, const AV1_COMMON *const cm, + int mi_row, int mi_col, const MV *const mv, + uint8_t *comp_pred, const uint8_t *pred, + int width, int height, int subpel_x_q3, + int subpel_y_q3, const uint8_t *ref, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask, + int subpel_search); + +void aom_highbd_comp_mask_upsampled_pred( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, + int bd, int subpel_search); + +// Build single or compound reference inter predictors for all planes. +// Can build inter-intra predictors, masked predictors etc as well. +void av1_enc_build_inter_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, + const BUFFER_SET *ctx, BLOCK_SIZE bsize, + int plane_from, int plane_to); + +void av1_enc_build_inter_predictor_y(MACROBLOCKD *xd, int mi_row, int mi_col); + +void av1_enc_build_inter_predictor_y_nonrd(MACROBLOCKD *xd, + InterPredParams *inter_pred_params, + const SubpelParams *subpel_params); + +// Build one inter predictor. 
It is called to build the predictor for a single
+// reference case, or for just the 1st or 2nd reference in a compound
+// reference case. Can build both regular and masked predictors.
+void av1_enc_build_one_inter_predictor(uint8_t *dst, int dst_stride,
+                                       const MV *src_mv,
+                                       InterPredParams *inter_pred_params);
+
+void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                         uint8_t *tmp_buf[MAX_MB_PLANE],
+                                         int tmp_width[MAX_MB_PLANE],
+                                         int tmp_height[MAX_MB_PLANE],
+                                         int tmp_stride[MAX_MB_PLANE]);
+
+void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                        uint8_t *tmp_buf[MAX_MB_PLANE],
+                                        int tmp_width[MAX_MB_PLANE],
+                                        int tmp_height[MAX_MB_PLANE],
+                                        int tmp_stride[MAX_MB_PLANE]);
+
+void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd);
+
+// |ext_dst*| are indexed from |plane_from| to |plane_to| inclusive.
+void av1_build_inter_predictors_for_planes_single_buf(
+    MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int ref,
+    uint8_t *ext_dst[], int ext_dst_stride[]);
+
+// |ext_dst*| are indexed from |plane_from| to |plane_to| inclusive.
+void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+                                              int plane_from, int plane_to,
+                                              uint8_t *ext_dst0[],
+                                              int ext_dst_stride0[],
+                                              uint8_t *ext_dst1[],
+                                              int ext_dst_stride1[]);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_RECONINTER_ENC_H_
diff --git a/third_party/aom/av1/encoder/saliency_map.c b/third_party/aom/av1/encoder/saliency_map.c
new file mode 100644
index 0000000000..30019bbec0
--- /dev/null
+++ b/third_party/aom/av1/encoder/saliency_map.c
@@ -0,0 +1,1414 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <float.h>
+#include <math.h>
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encoder_utils.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/saliency_map.h"
+
+// The Gabor filter is generated by setting the parameters as:
+// ksize = 9
+// sigma = 1
+// theta = y*np.pi/4, where y \in {0, 1, 2, 3}, i.e., 0, 45, 90, 135 degrees
+// lambda1 = 1
+// gamma = 0.8
+// phi = 0
+// (An illustrative regeneration sketch follows the table.)
+static const double kGaborFilter[4][9][9] = {  // [angle: 0, 45, 90, 135
+                                               // degree][ksize][ksize]
+  { { 2.0047323e-06, 6.6387620e-05, 8.0876675e-04, 3.6246411e-03, 5.9760227e-03,
+      3.6246411e-03, 8.0876675e-04, 6.6387620e-05, 2.0047323e-06 },
+    { 1.8831115e-05, 6.2360091e-04, 7.5970138e-03, 3.4047455e-02, 5.6134764e-02,
+      3.4047455e-02, 7.5970138e-03, 6.2360091e-04, 1.8831115e-05 },
+    { 9.3271126e-05, 3.0887155e-03, 3.7628256e-02, 1.6863814e-01, 2.7803731e-01,
+      1.6863814e-01, 3.7628256e-02, 3.0887155e-03, 9.3271126e-05 },
+    { 2.4359586e-04, 8.0667874e-03, 9.8273583e-02, 4.4043165e-01, 7.2614902e-01,
+      4.4043165e-01, 9.8273583e-02, 8.0667874e-03, 2.4359586e-04 },
+    { 3.3546262e-04, 1.1108996e-02, 1.3533528e-01, 6.0653067e-01, 1.0000000e+00,
+      6.0653067e-01, 1.3533528e-01, 1.1108996e-02, 3.3546262e-04 },
+    { 2.4359586e-04, 8.0667874e-03, 9.8273583e-02, 4.4043165e-01, 7.2614902e-01,
+      4.4043165e-01, 9.8273583e-02, 8.0667874e-03, 2.4359586e-04 },
+    { 9.3271126e-05, 3.0887155e-03, 3.7628256e-02, 1.6863814e-01, 2.7803731e-01,
+      1.6863814e-01, 3.7628256e-02, 3.0887155e-03, 9.3271126e-05 },
+    { 1.8831115e-05, 6.2360091e-04, 7.5970138e-03, 3.4047455e-02, 5.6134764e-02,
+      3.4047455e-02, 7.5970138e-03, 6.2360091e-04, 1.8831115e-05 },
+    { 2.0047323e-06, 6.6387620e-05, 8.0876675e-04, 3.6246411e-03, 5.9760227e-03,
+      3.6246411e-03, 8.0876675e-04, 6.6387620e-05, 2.0047323e-06 } },
+
+  { { -6.2165498e-08, 3.8760313e-06, 3.0079011e-06, -4.4602581e-04,
+      6.6981313e-04, 1.3962291e-03, -9.9486928e-04, -8.1631159e-05,
+      3.5712848e-05 },
+    { 3.8760313e-06, 5.7044272e-06, -1.6041942e-03, 4.5687673e-03,
+      1.8061366e-02, -2.4406660e-02, -3.7979286e-03, 3.1511115e-03,
+      -8.1631159e-05 },
+    { 3.0079011e-06, -1.6041942e-03, 8.6645801e-03, 6.4960226e-02,
+      -1.6647682e-01, -4.9129307e-02, 7.7304743e-02, -3.7979286e-03,
+      -9.9486928e-04 },
+    { -4.4602581e-04, 4.5687673e-03, 6.4960226e-02, -3.1572008e-01,
+      -1.7670043e-01, 5.2729243e-01, -4.9129307e-02, -2.4406660e-02,
+      1.3962291e-03 },
+    { 6.6981313e-04, 1.8061366e-02, -1.6647682e-01, -1.7670043e-01,
+      1.0000000e+00, -1.7670043e-01, -1.6647682e-01, 1.8061366e-02,
+      6.6981313e-04 },
+    { 1.3962291e-03, -2.4406660e-02, -4.9129307e-02, 5.2729243e-01,
+      -1.7670043e-01, -3.1572008e-01, 6.4960226e-02, 4.5687673e-03,
+      -4.4602581e-04 },
+    { -9.9486928e-04, -3.7979286e-03, 7.7304743e-02, -4.9129307e-02,
+      -1.6647682e-01, 6.4960226e-02, 8.6645801e-03, -1.6041942e-03,
+      3.0079011e-06 },
+    { -8.1631159e-05, 3.1511115e-03, -3.7979286e-03, -2.4406660e-02,
+      1.8061366e-02, 4.5687673e-03, -1.6041942e-03, 5.7044272e-06,
+      3.8760313e-06 },
+    { 3.5712848e-05, -8.1631159e-05, -9.9486928e-04, 1.3962291e-03,
+      6.6981313e-04, -4.4602581e-04, 3.0079011e-06, 3.8760313e-06,
+      -6.2165498e-08 } },
+
+  { { 2.0047323e-06, 1.8831115e-05, 9.3271126e-05, 2.4359586e-04, 3.3546262e-04,
+      2.4359586e-04, 9.3271126e-05, 1.8831115e-05, 2.0047323e-06 },
+    { 6.6387620e-05, 6.2360091e-04, 3.0887155e-03, 8.0667874e-03, 1.1108996e-02,
+      8.0667874e-03, 3.0887155e-03, 6.2360091e-04, 6.6387620e-05 },
+    { 8.0876675e-04, 7.5970138e-03, 3.7628256e-02, 9.8273583e-02, 1.3533528e-01,
+      9.8273583e-02, 3.7628256e-02, 7.5970138e-03, 8.0876675e-04 },
+    { 3.6246411e-03, 3.4047455e-02, 1.6863814e-01, 4.4043165e-01, 6.0653067e-01,
+      4.4043165e-01, 1.6863814e-01, 3.4047455e-02, 3.6246411e-03 },
+    { 5.9760227e-03, 5.6134764e-02, 2.7803731e-01, 7.2614902e-01, 1.0000000e+00,
+      7.2614902e-01, 2.7803731e-01, 5.6134764e-02, 5.9760227e-03 },
+    { 3.6246411e-03, 3.4047455e-02, 1.6863814e-01, 4.4043165e-01, 6.0653067e-01,
+      4.4043165e-01, 1.6863814e-01, 3.4047455e-02, 3.6246411e-03 },
+    { 8.0876675e-04, 7.5970138e-03, 3.7628256e-02, 9.8273583e-02, 1.3533528e-01,
+      9.8273583e-02, 3.7628256e-02, 7.5970138e-03, 8.0876675e-04 },
+    { 6.6387620e-05, 6.2360091e-04, 3.0887155e-03, 8.0667874e-03, 1.1108996e-02,
+      8.0667874e-03, 3.0887155e-03, 6.2360091e-04, 6.6387620e-05 },
+    { 2.0047323e-06, 1.8831115e-05, 9.3271126e-05, 2.4359586e-04, 3.3546262e-04,
+      2.4359586e-04, 9.3271126e-05, 1.8831115e-05, 2.0047323e-06 } },
+
+  { { 3.5712848e-05, -8.1631159e-05, -9.9486928e-04, 1.3962291e-03,
+      6.6981313e-04, -4.4602581e-04, 3.0079011e-06, 3.8760313e-06,
+      -6.2165498e-08 },
+    { -8.1631159e-05, 3.1511115e-03, -3.7979286e-03, -2.4406660e-02,
+      1.8061366e-02, 4.5687673e-03, -1.6041942e-03, 5.7044272e-06,
+      3.8760313e-06 },
+    { -9.9486928e-04, -3.7979286e-03, 7.7304743e-02, -4.9129307e-02,
+      -1.6647682e-01, 6.4960226e-02, 8.6645801e-03, -1.6041942e-03,
+      3.0079011e-06 },
+    { 1.3962291e-03, -2.4406660e-02, -4.9129307e-02, 5.2729243e-01,
+      -1.7670043e-01, -3.1572008e-01, 6.4960226e-02, 4.5687673e-03,
+      -4.4602581e-04 },
+    { 6.6981313e-04, 1.8061366e-02, -1.6647682e-01, -1.7670043e-01,
+      1.0000000e+00, -1.7670043e-01, -1.6647682e-01, 1.8061366e-02,
+      6.6981313e-04 },
+    { -4.4602581e-04, 4.5687673e-03, 6.4960226e-02, -3.1572008e-01,
+      -1.7670043e-01, 5.2729243e-01, -4.9129307e-02, -2.4406660e-02,
+      1.3962291e-03 },
+    { 3.0079011e-06, -1.6041942e-03, 8.6645801e-03, 6.4960226e-02,
+      -1.6647682e-01, -4.9129307e-02, 7.7304743e-02, -3.7979286e-03,
+      -9.9486928e-04 },
+    { 3.8760313e-06, 5.7044272e-06, -1.6041942e-03, 4.5687673e-03,
+      1.8061366e-02, -2.4406660e-02, -3.7979286e-03, 3.1511115e-03,
+      -8.1631159e-05 },
+    { -6.2165498e-08, 3.8760313e-06, 3.0079011e-06, -4.4602581e-04,
+      6.6981313e-04, 1.3962291e-03, -9.9486928e-04, -8.1631159e-05,
+      3.5712848e-05 } }
+};
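+
+// Illustrative sketch (an editorial addition, not part of libaom): with row
+// index y + 4 and column index x + 4 for x, y in [-4, 4], the table above is
+// consistent with the standard Gabor expression
+//   g(x, y) = exp(-(x'^2 + gamma^2 * y'^2) / (2 * sigma^2))
+//             * cos(2 * pi * x' / lambda1 + phi),
+// where x' = x*cos(theta) + y*sin(theta) and y' = -x*sin(theta) + y*cos(theta).
+// A minimal regeneration sketch in C, using only <math.h>:
+//
+//   static void gen_gabor_kernel(double theta, double kernel[9][9]) {
+//     const double pi = 3.14159265358979323846;
+//     const double sigma = 1.0, lambda1 = 1.0, gamma = 0.8, phi = 0.0;
+//     for (int y = -4; y <= 4; ++y) {
+//       for (int x = -4; x <= 4; ++x) {
+//         const double xr = x * cos(theta) + y * sin(theta);
+//         const double yr = -x * sin(theta) + y * cos(theta);
+//         kernel[y + 4][x + 4] =
+//             exp(-(xr * xr + gamma * gamma * yr * yr) /
+//                 (2.0 * sigma * sigma)) *
+//             cos(2.0 * pi * xr / lambda1 + phi);
+//       }
+//     }
+//   }
+//
+// For theta = 0 this reproduces the first block, e.g. exp(-0.5) * cos(2 * pi)
+// ~= 0.60653067 at (x, y) = (1, 0).
+
+// This function extracts the red/green/blue channels and calculates intensity
+// = (r+g+b)/3. Note that it only handles the 8-bit case for now.
+// TODO(linzhen): add high bitdepth support.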
+static void get_color_intensity(const YV12_BUFFER_CONFIG *src, + int subsampling_x, int subsampling_y, + double *cr, double *cg, double *cb, + double *intensity) { + const uint8_t *y = src->buffers[0]; + const uint8_t *u = src->buffers[1]; + const uint8_t *v = src->buffers[2]; + + const int y_height = src->crop_heights[0]; + const int y_width = src->crop_widths[0]; + const int y_stride = src->strides[0]; + const int c_stride = src->strides[1]; + + for (int i = 0; i < y_height; ++i) { + for (int j = 0; j < y_width; ++j) { + cr[i * y_width + j] = + fclamp((double)y[i * y_stride + j] + + 1.370 * (double)(v[(i >> subsampling_y) * c_stride + + (j >> subsampling_x)] - + 128), + 0, 255); + cg[i * y_width + j] = + fclamp((double)y[i * y_stride + j] - + 0.698 * (double)(u[(i >> subsampling_y) * c_stride + + (j >> subsampling_x)] - + 128) - + 0.337 * (double)(v[(i >> subsampling_y) * c_stride + + (j >> subsampling_x)] - + 128), + 0, 255); + cb[i * y_width + j] = + fclamp((double)y[i * y_stride + j] + + 1.732 * (double)(u[(i >> subsampling_y) * c_stride + + (j >> subsampling_x)] - + 128), + 0, 255); + + intensity[i * y_width + j] = + (cr[i * y_width + j] + cg[i * y_width + j] + cb[i * y_width + j]) / + 3.0; + assert(intensity[i * y_width + j] >= 0 && + intensity[i * y_width + j] <= 255); + + intensity[i * y_width + j] /= 256; + cr[i * y_width + j] /= 256; + cg[i * y_width + j] /= 256; + cb[i * y_width + j] /= 256; + } + } +} + +static INLINE double convolve_map(const double *filter, const double *map, + const int size) { + double result = 0; + for (int i = 0; i < size; ++i) { + result += filter[i] * map[i]; // symmetric filter is used + } + return result; +} + +// This function is to decimate the map by half, and apply Gaussian filter on +// top of the downsampled map. +static INLINE void decimate_map(const double *map, int height, int width, + int stride, double *downsampled_map) { + const int new_width = width / 2; + const int window_size = 5; + const double gaussian_filter[25] = { + 1. / 256, 1.0 / 64, 3. / 128, 1. / 64, 1. / 256, 1. / 64, 1. / 16, + 3. / 32, 1. / 16, 1. / 64, 3. / 128, 3. / 32, 9. / 64, 3. / 32, + 3. / 128, 1. / 64, 1. / 16, 3. / 32, 1. / 16, 1. / 64, 1. / 256, + 1. / 64, 3. / 128, 1. / 64, 1. / 256 + }; + + double map_region[25]; + for (int y = 0; y < height - 1; y += 2) { + for (int x = 0; x < width - 1; x += 2) { + int i = 0; + for (int yy = y - window_size / 2; yy <= y + window_size / 2; ++yy) { + for (int xx = x - window_size / 2; xx <= x + window_size / 2; ++xx) { + int yvalue = clamp(yy, 0, height - 1); + int xvalue = clamp(xx, 0, width - 1); + map_region[i++] = map[yvalue * stride + xvalue]; + } + } + downsampled_map[(y / 2) * new_width + (x / 2)] = + convolve_map(gaussian_filter, map_region, window_size * window_size); + } + } +} + +// This function is to upscale the map from in_level size to out_level size. +// Note that the map at "level-1" will upscale the map at "level" by x2. +static INLINE int upscale_map(const double *input, int in_level, int out_level, + int height[9], int width[9], double *output) { + for (int level = in_level; level > out_level; level--) { + const int cur_width = width[level]; + const int cur_height = height[level]; + const int cur_stride = width[level]; + + double *original = (level == in_level) ? 
(double *)input : output;
+
+    assert(level > 0);
+
+    const int h_upscale = height[level - 1];
+    const int w_upscale = width[level - 1];
+    const int s_upscale = width[level - 1];
+
+    double *upscale = aom_malloc(h_upscale * w_upscale * sizeof(*upscale));
+
+    if (!upscale) {
+      return 0;
+    }
+
+    for (int i = 0; i < h_upscale; ++i) {
+      for (int j = 0; j < w_upscale; ++j) {
+        const int ii = clamp((i >> 1), 0, cur_height - 1);
+        const int jj = clamp((j >> 1), 0, cur_width - 1);
+        upscale[j + i * s_upscale] = (double)original[jj + ii * cur_stride];
+      }
+    }
+    memcpy(output, upscale, h_upscale * w_upscale * sizeof(double));
+    aom_free(upscale);
+  }
+
+  return 1;
+}
+
+// This function calculates the differences between a fine scale c and a
+// coarser scale s, yielding the feature maps. c \in {2, 3, 4}, and s = c +
+// delta, where delta \in {3, 4}.
+static int center_surround_diff(const double *input[9], int height[9],
+                                int width[9], saliency_feature_map *output[6]) {
+  int j = 0;
+  for (int k = 2; k < 5; ++k) {
+    int cur_height = height[k];
+    int cur_width = width[k];
+
+    if (upscale_map(input[k + 3], k + 3, k, height, width, output[j]->buf) ==
+        0) {
+      return 0;
+    }
+
+    for (int r = 0; r < cur_height; ++r) {
+      for (int c = 0; c < cur_width; ++c) {
+        output[j]->buf[r * cur_width + c] =
+            fabs((double)(input[k][r * cur_width + c] -
+                          output[j]->buf[r * cur_width + c]));
+      }
+    }
+
+    if (upscale_map(input[k + 4], k + 4, k, height, width,
+                    output[j + 1]->buf) == 0) {
+      return 0;
+    }
+
+    for (int r = 0; r < cur_height; ++r) {
+      for (int c = 0; c < cur_width; ++c) {
+        output[j + 1]->buf[r * cur_width + c] =
+            fabs(input[k][r * cur_width + c] -
+                 output[j + 1]->buf[r * cur_width + c]);
+      }
+    }
+
+    j += 2;
+  }
+  return 1;
+}
+
+// For color channels, the difference is calculated based on "color
+// double-opponency". For example, the RG feature map is constructed between a
+// fine scale c of the R-G component and a coarser scale s of the G-R
+// component.
+static int center_surround_diff_rgb(const double *input_1[9],
+                                    const double *input_2[9], int height[9],
+                                    int width[9],
+                                    saliency_feature_map *output[6]) {
+  int j = 0;
+  for (int k = 2; k < 5; ++k) {
+    int cur_height = height[k];
+    int cur_width = width[k];
+
+    if (upscale_map(input_2[k + 3], k + 3, k, height, width, output[j]->buf) ==
+        0) {
+      return 0;
+    }
+
+    for (int r = 0; r < cur_height; ++r) {
+      for (int c = 0; c < cur_width; ++c) {
+        output[j]->buf[r * cur_width + c] =
+            fabs((double)(input_1[k][r * cur_width + c] -
+                          output[j]->buf[r * cur_width + c]));
+      }
+    }
+
+    if (upscale_map(input_2[k + 4], k + 4, k, height, width,
+                    output[j + 1]->buf) == 0) {
+      return 0;
+    }
+
+    for (int r = 0; r < cur_height; ++r) {
+      for (int c = 0; c < cur_width; ++c) {
+        output[j + 1]->buf[r * cur_width + c] =
+            fabs(input_1[k][r * cur_width + c] -
+                 output[j + 1]->buf[r * cur_width + c]);
+      }
+    }
+
+    j += 2;
+  }
+  return 1;
+}
+
+// This function generates the Gaussian pyramid images with indexes from 0 to
+// 8, and constructs the feature maps by calculating the center-surround
+// differences.
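+// For example (frame size hypothetical), a 1920x1080 source produces the nine
+// pyramid levels 1920x1080, 960x540, 480x270, 240x135, 120x67, 60x33, 30x16,
+// 15x8 and 7x4, since av1_set_saliency_map() halves each dimension with
+// integer division at every level.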
+static int gaussian_pyramid(const double *src, int width[9], int height[9], + saliency_feature_map *dst[6]) { + double *gaussian_map[9]; // scale = 9 + gaussian_map[0] = + (double *)aom_malloc(width[0] * height[0] * sizeof(*gaussian_map[0])); + if (!gaussian_map[0]) { + return 0; + } + + memcpy(gaussian_map[0], src, width[0] * height[0] * sizeof(double)); + + for (int i = 1; i < 9; ++i) { + int stride = width[i - 1]; + int new_width = width[i]; + int new_height = height[i]; + + gaussian_map[i] = + (double *)aom_malloc(new_width * new_height * sizeof(*gaussian_map[i])); + + if (!gaussian_map[i]) { + for (int l = 0; l < i; ++l) { + aom_free(gaussian_map[l]); + } + return 0; + } + + memset(gaussian_map[i], 0, new_width * new_height * sizeof(double)); + + decimate_map(gaussian_map[i - 1], height[i - 1], width[i - 1], stride, + gaussian_map[i]); + } + + if (center_surround_diff((const double **)gaussian_map, height, width, dst) == + 0) { + for (int l = 0; l < 9; ++l) { + aom_free(gaussian_map[l]); + } + return 0; + } + + for (int i = 0; i < 9; ++i) { + aom_free(gaussian_map[i]); + } + return 1; +} + +static int gaussian_pyramid_rgb(double *src_1, double *src_2, int width[9], + int height[9], saliency_feature_map *dst[6]) { + double *gaussian_map[2][9]; // scale = 9 + double *src[2]; + + src[0] = src_1; + src[1] = src_2; + + for (int k = 0; k < 2; ++k) { + gaussian_map[k][0] = (double *)aom_malloc(width[0] * height[0] * + sizeof(*gaussian_map[k][0])); + if (!gaussian_map[k][0]) { + for (int l = 0; l < k; ++l) { + aom_free(gaussian_map[l][0]); + } + return 0; + } + memcpy(gaussian_map[k][0], src[k], width[0] * height[0] * sizeof(double)); + + for (int i = 1; i < 9; ++i) { + int stride = width[i - 1]; + int new_width = width[i]; + int new_height = height[i]; + + gaussian_map[k][i] = (double *)aom_malloc(new_width * new_height * + sizeof(*gaussian_map[k][i])); + if (!gaussian_map[k][i]) { + for (int l = 0; l < k; ++l) { + aom_free(gaussian_map[l][i]); + } + return 0; + } + memset(gaussian_map[k][i], 0, new_width * new_height * sizeof(double)); + decimate_map(gaussian_map[k][i - 1], height[i - 1], width[i - 1], stride, + gaussian_map[k][i]); + } + } + + if (center_surround_diff_rgb((const double **)gaussian_map[0], + (const double **)gaussian_map[1], height, width, + dst) == 0) { + for (int l = 0; l < 2; ++l) { + for (int i = 0; i < 9; ++i) { + aom_free(gaussian_map[l][i]); + } + } + return 0; + } + + for (int l = 0; l < 2; ++l) { + for (int i = 0; i < 9; ++i) { + aom_free(gaussian_map[l][i]); + } + } + return 1; +} + +static int get_feature_map_intensity(double *intensity, int width[9], + int height[9], + saliency_feature_map *i_map[6]) { + if (gaussian_pyramid(intensity, width, height, i_map) == 0) { + return 0; + } + return 1; +} + +static int get_feature_map_rgb(double *cr, double *cg, double *cb, int width[9], + int height[9], saliency_feature_map *rg_map[6], + saliency_feature_map *by_map[6]) { + double *rg_mat = aom_malloc(height[0] * width[0] * sizeof(*rg_mat)); + double *by_mat = aom_malloc(height[0] * width[0] * sizeof(*by_mat)); + double *gr_mat = aom_malloc(height[0] * width[0] * sizeof(*gr_mat)); + double *yb_mat = aom_malloc(height[0] * width[0] * sizeof(*yb_mat)); + + if (!rg_mat || !by_mat || !gr_mat || !yb_mat) { + aom_free(rg_mat); + aom_free(by_mat); + aom_free(gr_mat); + aom_free(yb_mat); + return 0; + } + + double r, g, b, y; + for (int i = 0; i < height[0]; ++i) { + for (int j = 0; j < width[0]; ++j) { + r = AOMMAX(0, cr[i * width[0] + j] - + (cg[i * width[0] + j] + cb[i * 
width[0] + j]) / 2); + g = AOMMAX(0, cg[i * width[0] + j] - + (cr[i * width[0] + j] + cb[i * width[0] + j]) / 2); + b = AOMMAX(0, cb[i * width[0] + j] - + (cr[i * width[0] + j] + cg[i * width[0] + j]) / 2); + y = AOMMAX(0, (cr[i * width[0] + j] + cg[i * width[0] + j]) / 2 - + fabs(cr[i * width[0] + j] - cg[i * width[0] + j]) / 2 - + cb[i * width[0] + j]); + + rg_mat[i * width[0] + j] = r - g; + by_mat[i * width[0] + j] = b - y; + gr_mat[i * width[0] + j] = g - r; + yb_mat[i * width[0] + j] = y - b; + } + } + + if (gaussian_pyramid_rgb(rg_mat, gr_mat, width, height, rg_map) == 0 || + gaussian_pyramid_rgb(by_mat, yb_mat, width, height, by_map) == 0) { + aom_free(rg_mat); + aom_free(by_mat); + aom_free(gr_mat); + aom_free(yb_mat); + return 0; + } + + aom_free(rg_mat); + aom_free(by_mat); + aom_free(gr_mat); + aom_free(yb_mat); + return 1; +} + +static INLINE void filter2d(const double *input, const double kernel[9][9], + int width, int height, double *output) { + const int window_size = 9; + double map_section[81]; + for (int y = 0; y <= height - 1; ++y) { + for (int x = 0; x <= width - 1; ++x) { + int i = 0; + for (int yy = y - window_size / 2; yy <= y + window_size / 2; ++yy) { + for (int xx = x - window_size / 2; xx <= x + window_size / 2; ++xx) { + int yvalue = clamp(yy, 0, height - 1); + int xvalue = clamp(xx, 0, width - 1); + map_section[i++] = input[yvalue * width + xvalue]; + } + } + + output[y * width + x] = 0; + for (int k = 0; k < window_size; ++k) { + for (int l = 0; l < window_size; ++l) { + output[y * width + x] += + kernel[k][l] * map_section[k * window_size + l]; + } + } + } + } +} + +static int get_feature_map_orientation(const double *intensity, int width[9], + int height[9], + saliency_feature_map *dst[24]) { + double *gaussian_map[9]; + + gaussian_map[0] = + (double *)aom_malloc(width[0] * height[0] * sizeof(*gaussian_map[0])); + if (!gaussian_map[0]) { + return 0; + } + memcpy(gaussian_map[0], intensity, width[0] * height[0] * sizeof(double)); + + for (int i = 1; i < 9; ++i) { + int stride = width[i - 1]; + int new_width = width[i]; + int new_height = height[i]; + + gaussian_map[i] = + (double *)aom_malloc(new_width * new_height * sizeof(*gaussian_map[i])); + if (!gaussian_map[i]) { + for (int l = 0; l < i; ++l) { + aom_free(gaussian_map[l]); + } + return 0; + } + memset(gaussian_map[i], 0, new_width * new_height * sizeof(double)); + decimate_map(gaussian_map[i - 1], height[i - 1], width[i - 1], stride, + gaussian_map[i]); + } + + double *tempGaborOutput[4][9]; //[angle: 0, 45, 90, 135 degree][filter_size] + + for (int i = 2; i < 9; ++i) { + const int cur_height = height[i]; + const int cur_width = width[i]; + for (int j = 0; j < 4; ++j) { + tempGaborOutput[j][i] = (double *)aom_malloc( + cur_height * cur_width * sizeof(*tempGaborOutput[j][i])); + if (!tempGaborOutput[j][i]) { + for (int l = 0; l < 9; ++l) { + aom_free(gaussian_map[l]); + } + for (int h = 0; h < 4; ++h) { + for (int g = 2; g < 9; ++g) { + aom_free(tempGaborOutput[h][g]); + } + } + return 0; + } + filter2d(gaussian_map[i], kGaborFilter[j], cur_width, cur_height, + tempGaborOutput[j][i]); + } + } + + for (int i = 0; i < 9; ++i) { + aom_free(gaussian_map[i]); + } + + saliency_feature_map + *tmp[4][6]; //[angle: 0, 45, 90, 135 degree][filter_size] + + for (int i = 0; i < 6; ++i) { + for (int j = 0; j < 4; ++j) { + tmp[j][i] = dst[j * 6 + i]; + } + } + + for (int j = 0; j < 4; ++j) { + if (center_surround_diff((const double **)tempGaborOutput[j], height, width, + tmp[j]) == 0) { + for (int h = 0; h < 4; ++h) { 
+        for (int g = 2; g < 9; ++g) {
+          aom_free(tempGaborOutput[h][g]);
+        }
+      }
+      return 0;
+    }
+  }
+
+  for (int i = 2; i < 9; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      aom_free(tempGaborOutput[j][i]);
+    }
+  }
+
+  return 1;
+}
+
+static INLINE void find_min_max(const saliency_feature_map *input,
+                                double *max_value, double *min_value) {
+  assert(input && input->buf);
+  *min_value = DBL_MAX;
+  *max_value = 0.0;
+
+  for (int i = 0; i < input->height; ++i) {
+    for (int j = 0; j < input->width; ++j) {
+      assert(input->buf[i * input->width + j] >= 0.0);
+      *min_value = fmin(input->buf[i * input->width + j], *min_value);
+      *max_value = fmax(input->buf[i * input->width + j], *max_value);
+    }
+  }
+}
+
+static INLINE double average_local_max(const saliency_feature_map *input,
+                                       int stepsize) {
+  int numlocal = 0;
+  double lmaxmean = 0, lmax = 0, dummy = 0;
+  saliency_feature_map local_map;
+  local_map.height = stepsize;
+  local_map.width = stepsize;
+  local_map.buf =
+      (double *)aom_malloc(stepsize * stepsize * sizeof(*local_map.buf));
+
+  if (!local_map.buf) {
+    return -1;
+  }
+
+  for (int y = 0; y < input->height - stepsize; y += stepsize) {
+    for (int x = 0; x < input->width - stepsize; x += stepsize) {
+      for (int i = 0; i < stepsize; ++i) {
+        for (int j = 0; j < stepsize; ++j) {
+          local_map.buf[i * stepsize + j] =
+              input->buf[(y + i) * input->width + x + j];
+        }
+      }
+
+      find_min_max(&local_map, &lmax, &dummy);
+      lmaxmean += lmax;
+      numlocal++;
+    }
+  }
+
+  aom_free(local_map.buf);
+
+  return lmaxmean / numlocal;
+}
+
+// Linearly normalize the values in the map to [0, 1]. Note that
+// x / (max - min) + min / (min - max) is simply (x - min) / (max - min).
+static void minmax_normalize(saliency_feature_map *input) {
+  double max_value, min_value;
+  find_min_max(input, &max_value, &min_value);
+
+  for (int i = 0; i < input->height; ++i) {
+    for (int j = 0; j < input->width; ++j) {
+      if (max_value != min_value) {
+        input->buf[i * input->width + j] =
+            input->buf[i * input->width + j] / (max_value - min_value) +
+            min_value / (min_value - max_value);
+      } else {
+        input->buf[i * input->width + j] -= min_value;
+      }
+    }
+  }
+}
+
+// This function promotes meaningful "activation spots" in the map and
+// suppresses homogeneous areas.
+static int nomalization_operator(saliency_feature_map *input, int stepsize) {
+  minmax_normalize(input);
+  double lmaxmean = average_local_max(input, stepsize);
+  if (lmaxmean < 0) {
+    return 0;
+  }
+  double normCoeff = (1 - lmaxmean) * (1 - lmaxmean);
+
+  for (int i = 0; i < input->height; ++i) {
+    for (int j = 0; j < input->width; ++j) {
+      input->buf[i * input->width + j] *= normCoeff;
+    }
+  }
+
+  return 1;
+}
+
+// Normalize the values in feature maps to [0,1], and then upscale all maps to
+// the original frame size.
+static int normalize_fm(saliency_feature_map *input[6], int width[9],
+                        int height[9], int num_fm,
+                        saliency_feature_map *output[6]) {
+  // Feature maps (FM) are generated by function "center_surround_diff()". The
+  // difference is between a fine scale c and a coarser scale s, where c \in {2,
+  // 3, 4}, and s = c + delta, where delta \in {3, 4}, and the FM size is scale
+  // c. Specifically, i=0: c=2 and s=5, i=1: c=2 and s=6, i=2: c=3 and s=6, i=3:
+  // c=3 and s=7, i=4: c=4 and s=7, i=5: c=4 and s=8.
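+  // For example (an editorial note): the i=0 and i=1 maps live at scale
+  // c = 2, i.e. one quarter of the frame size per axis, which is why the
+  // upscale_map() call below uses (i / 2) + 2 as the input level and 0 as the
+  // output level.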
+ for (int i = 0; i < num_fm; ++i) { + if (nomalization_operator(input[i], 8) == 0) { + return 0; + } + + // Upscale FM to original frame size + if (upscale_map(input[i]->buf, (i / 2) + 2, 0, height, width, + output[i]->buf) == 0) { + return 0; + } + } + return 1; +} + +// Combine feature maps with the same category (intensity, color, or +// orientation) into one conspicuity map. +static int normalized_map(saliency_feature_map *input[6], int width[9], + int height[9], saliency_feature_map *output) { + int num_fm = 6; + + saliency_feature_map *n_input[6]; + for (int i = 0; i < 6; ++i) { + n_input[i] = (saliency_feature_map *)aom_malloc(sizeof(*n_input[i])); + if (!n_input[i]) { + return 0; + } + n_input[i]->buf = + (double *)aom_malloc(width[0] * height[0] * sizeof(*n_input[i]->buf)); + if (!n_input[i]->buf) { + aom_free(n_input[i]); + return 0; + } + n_input[i]->height = height[0]; + n_input[i]->width = width[0]; + } + + if (normalize_fm(input, width, height, num_fm, n_input) == 0) { + for (int i = 0; i < num_fm; ++i) { + aom_free(n_input[i]->buf); + aom_free(n_input[i]); + } + return 0; + } + + // Add up all normalized feature maps with the same category into one map. + for (int i = 0; i < num_fm; ++i) { + for (int r = 0; r < height[0]; ++r) { + for (int c = 0; c < width[0]; ++c) { + output->buf[r * width[0] + c] += n_input[i]->buf[r * width[0] + c]; + } + } + } + + for (int i = 0; i < num_fm; ++i) { + aom_free(n_input[i]->buf); + aom_free(n_input[i]); + } + + nomalization_operator(output, 8); + return 1; +} + +static int normalized_map_rgb(saliency_feature_map *rg_map[6], + saliency_feature_map *by_map[6], int width[9], + int height[9], saliency_feature_map *output) { + saliency_feature_map *color_cm[2]; // 0: color_cm_rg, 1: color_cm_by + for (int i = 0; i < 2; ++i) { + color_cm[i] = aom_malloc(sizeof(*color_cm[i])); + if (!color_cm[i]) { + return 0; + } + color_cm[i]->buf = + (double *)aom_malloc(width[0] * height[0] * sizeof(*color_cm[i]->buf)); + if (!color_cm[i]->buf) { + for (int l = 0; l < i; ++l) { + aom_free(color_cm[l]->buf); + } + aom_free(color_cm[i]); + return 0; + } + + color_cm[i]->width = width[0]; + color_cm[i]->height = height[0]; + memset(color_cm[i]->buf, 0, + width[0] * height[0] * sizeof(*color_cm[i]->buf)); + } + + if (normalized_map(rg_map, width, height, color_cm[0]) == 0 || + normalized_map(by_map, width, height, color_cm[1]) == 0) { + for (int i = 0; i < 2; ++i) { + aom_free(color_cm[i]->buf); + aom_free(color_cm[i]); + } + return 0; + } + + for (int r = 0; r < height[0]; ++r) { + for (int c = 0; c < width[0]; ++c) { + output->buf[r * width[0] + c] = color_cm[0]->buf[r * width[0] + c] + + color_cm[1]->buf[r * width[0] + c]; + } + } + + for (int i = 0; i < 2; ++i) { + aom_free(color_cm[i]->buf); + aom_free(color_cm[i]); + } + + nomalization_operator(output, 8); + return 1; +} + +static int normalized_map_orientation(saliency_feature_map *orientation_map[24], + int width[9], int height[9], + saliency_feature_map *output) { + int num_fms_per_angle = 6; + + saliency_feature_map *ofm[4][6]; + for (int i = 0; i < num_fms_per_angle; ++i) { + for (int j = 0; j < 4; ++j) { + ofm[j][i] = orientation_map[j * num_fms_per_angle + i]; + } + } + + // extract conspicuity map for each angle + saliency_feature_map *nofm = aom_malloc(sizeof(*nofm)); + if (!nofm) { + return 0; + } + nofm->buf = (double *)aom_malloc(width[0] * height[0] * sizeof(*nofm->buf)); + if (!nofm->buf) { + aom_free(nofm); + return 0; + } + nofm->height = height[0]; + nofm->width = width[0]; + + for (int i = 
0; i < 4; ++i) { + memset(nofm->buf, 0, width[0] * height[0] * sizeof(*nofm->buf)); + if (normalized_map(ofm[i], width, height, nofm) == 0) { + aom_free(nofm->buf); + aom_free(nofm); + return 0; + } + + for (int r = 0; r < height[0]; ++r) { + for (int c = 0; c < width[0]; ++c) { + output->buf[r * width[0] + c] += nofm->buf[r * width[0] + c]; + } + } + } + + aom_free(nofm->buf); + aom_free(nofm); + + nomalization_operator(output, 8); + return 1; +} + +// Set pixel level saliency mask based on Itti-Koch algorithm +int av1_set_saliency_map(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + + int frm_width = cm->width; + int frm_height = cm->height; + + int pyr_height[9]; + int pyr_width[9]; + + pyr_height[0] = frm_height; + pyr_width[0] = frm_width; + + for (int i = 1; i < 9; ++i) { + pyr_width[i] = pyr_width[i - 1] / 2; + pyr_height[i] = pyr_height[i - 1] / 2; + } + + double *cr = aom_malloc(frm_width * frm_height * sizeof(*cr)); + double *cg = aom_malloc(frm_width * frm_height * sizeof(*cg)); + double *cb = aom_malloc(frm_width * frm_height * sizeof(*cb)); + double *intensity = aom_malloc(frm_width * frm_height * sizeof(*intensity)); + + if (!cr || !cg || !cb || !intensity) { + aom_free(cr); + aom_free(cg); + aom_free(cb); + aom_free(intensity); + return 0; + } + + // Extract red / green / blue channels and intensity component + get_color_intensity(cpi->source, cm->seq_params->subsampling_x, + cm->seq_params->subsampling_y, cr, cg, cb, intensity); + + // Feature Map Extraction + // intensity map + saliency_feature_map *i_map[6]; + for (int i = 0; i < 6; ++i) { + int cur_height = pyr_height[(i / 2) + 2]; + int cur_width = pyr_width[(i / 2) + 2]; + + i_map[i] = (saliency_feature_map *)aom_malloc(sizeof(*i_map[i])); + if (!i_map[i]) { + aom_free(cr); + aom_free(cg); + aom_free(cb); + aom_free(intensity); + for (int l = 0; l < i; ++l) { + aom_free(i_map[l]); + } + return 0; + } + i_map[i]->buf = + (double *)aom_malloc(cur_height * cur_width * sizeof(*i_map[i]->buf)); + if (!i_map[i]->buf) { + aom_free(cr); + aom_free(cg); + aom_free(cb); + aom_free(intensity); + for (int l = 0; l < i; ++l) { + aom_free(i_map[l]->buf); + aom_free(i_map[l]); + } + return 0; + } + i_map[i]->height = cur_height; + i_map[i]->width = cur_width; + } + + if (get_feature_map_intensity(intensity, pyr_width, pyr_height, i_map) == 0) { + aom_free(cr); + aom_free(cg); + aom_free(cb); + aom_free(intensity); + for (int l = 0; l < 6; ++l) { + aom_free(i_map[l]->buf); + aom_free(i_map[l]); + } + return 0; + } + + // RGB map + saliency_feature_map *rg_map[6], *by_map[6]; + for (int i = 0; i < 6; ++i) { + int cur_height = pyr_height[(i / 2) + 2]; + int cur_width = pyr_width[(i / 2) + 2]; + rg_map[i] = (saliency_feature_map *)aom_malloc(sizeof(*rg_map[i])); + by_map[i] = (saliency_feature_map *)aom_malloc(sizeof(*by_map[i])); + if (!rg_map[i] || !by_map[i]) { + aom_free(cr); + aom_free(cg); + aom_free(cb); + aom_free(intensity); + for (int l = 0; l < 6; ++l) { + aom_free(i_map[l]->buf); + aom_free(i_map[l]); + aom_free(rg_map[l]); + aom_free(by_map[l]); + } + return 0; + } + rg_map[i]->buf = + (double *)aom_malloc(cur_height * cur_width * sizeof(*rg_map[i]->buf)); + by_map[i]->buf = + (double *)aom_malloc(cur_height * cur_width * sizeof(*by_map[i]->buf)); + if (!by_map[i]->buf || !rg_map[i]->buf) { + aom_free(cr); + aom_free(cg); + aom_free(cb); + aom_free(intensity); + for (int l = 0; l < 6; ++l) { + aom_free(i_map[l]->buf); + aom_free(i_map[l]); + } + for (int l = 0; l < i; ++l) { + aom_free(rg_map[l]->buf); + 
aom_free(by_map[l]->buf);
+        aom_free(rg_map[l]);
+        aom_free(by_map[l]);
+      }
+      return 0;
+    }
+    rg_map[i]->height = cur_height;
+    rg_map[i]->width = cur_width;
+    by_map[i]->height = cur_height;
+    by_map[i]->width = cur_width;
+  }
+
+  if (get_feature_map_rgb(cr, cg, cb, pyr_width, pyr_height, rg_map, by_map) ==
+      0) {
+    aom_free(cr);
+    aom_free(cg);
+    aom_free(cb);
+    aom_free(intensity);
+    for (int l = 0; l < 6; ++l) {
+      aom_free(i_map[l]->buf);
+      aom_free(rg_map[l]->buf);
+      aom_free(by_map[l]->buf);
+      aom_free(i_map[l]);
+      aom_free(rg_map[l]);
+      aom_free(by_map[l]);
+    }
+    return 0;
+  }
+
+  // Orientation map
+  saliency_feature_map *orientation_map[24];
+  for (int i = 0; i < 24; ++i) {
+    int cur_height = pyr_height[((i % 6) / 2) + 2];
+    int cur_width = pyr_width[((i % 6) / 2) + 2];
+
+    orientation_map[i] =
+        (saliency_feature_map *)aom_malloc(sizeof(*orientation_map[i]));
+    if (!orientation_map[i]) {
+      aom_free(cr);
+      aom_free(cg);
+      aom_free(cb);
+      aom_free(intensity);
+      for (int l = 0; l < 6; ++l) {
+        aom_free(i_map[l]->buf);
+        aom_free(rg_map[l]->buf);
+        aom_free(by_map[l]->buf);
+        aom_free(i_map[l]);
+        aom_free(rg_map[l]);
+        aom_free(by_map[l]);
+      }
+      for (int h = 0; h < i; ++h) {
+        aom_free(orientation_map[h]);
+      }
+      return 0;
+    }
+
+    orientation_map[i]->buf = (double *)aom_malloc(
+        cur_height * cur_width * sizeof(*orientation_map[i]->buf));
+    if (!orientation_map[i]->buf) {
+      aom_free(cr);
+      aom_free(cg);
+      aom_free(cb);
+      aom_free(intensity);
+      for (int l = 0; l < 6; ++l) {
+        aom_free(i_map[l]->buf);
+        aom_free(rg_map[l]->buf);
+        aom_free(by_map[l]->buf);
+        aom_free(i_map[l]);
+        aom_free(rg_map[l]);
+        aom_free(by_map[l]);
+      }
+
+      for (int h = 0; h < i; ++h) {
+        aom_free(orientation_map[h]->buf);
+        aom_free(orientation_map[h]);
+      }
+      return 0;
+    }
+
+    orientation_map[i]->height = cur_height;
+    orientation_map[i]->width = cur_width;
+  }
+
+  if (get_feature_map_orientation(intensity, pyr_width, pyr_height,
+                                  orientation_map) == 0) {
+    aom_free(cr);
+    aom_free(cg);
+    aom_free(cb);
+    aom_free(intensity);
+    for (int l = 0; l < 6; ++l) {
+      aom_free(i_map[l]->buf);
+      aom_free(rg_map[l]->buf);
+      aom_free(by_map[l]->buf);
+      aom_free(i_map[l]);
+      aom_free(rg_map[l]);
+      aom_free(by_map[l]);
+    }
+    for (int h = 0; h < 24; ++h) {
+      aom_free(orientation_map[h]->buf);
+      aom_free(orientation_map[h]);
+    }
+    return 0;
+  }
+
+  aom_free(cr);
+  aom_free(cg);
+  aom_free(cb);
+  aom_free(intensity);
+
+  saliency_feature_map
+      *normalized_maps[3];  // 0: intensity, 1: color, 2: orientation
+
+  for (int i = 0; i < 3; ++i) {
+    normalized_maps[i] = aom_malloc(sizeof(*normalized_maps[i]));
+    if (!normalized_maps[i]) {
+      for (int l = 0; l < 6; ++l) {
+        aom_free(i_map[l]->buf);
+        aom_free(rg_map[l]->buf);
+        aom_free(by_map[l]->buf);
+        aom_free(i_map[l]);
+        aom_free(rg_map[l]);
+        aom_free(by_map[l]);
+      }
+
+      for (int h = 0; h < 24; ++h) {
+        aom_free(orientation_map[h]->buf);
+        aom_free(orientation_map[h]);
+      }
+
+      for (int l = 0; l < i; ++l) {
+        aom_free(normalized_maps[l]);
+      }
+      return 0;
+    }
+    normalized_maps[i]->buf = (double *)aom_malloc(
+        frm_width * frm_height * sizeof(*normalized_maps[i]->buf));
+    if (!normalized_maps[i]->buf) {
+      for (int l = 0; l < 6; ++l) {
+        aom_free(i_map[l]->buf);
+        aom_free(rg_map[l]->buf);
+        aom_free(by_map[l]->buf);
+        aom_free(i_map[l]);
+        aom_free(rg_map[l]);
+        aom_free(by_map[l]);
+      }
+      for (int h = 0; h < 24; ++h) {
+        aom_free(orientation_map[h]->buf);
+        aom_free(orientation_map[h]);
+      }
+      for (int l = 0; l < i;
++l) { + aom_free(normalized_maps[l]->buf); + aom_free(normalized_maps[l]); + } + return 0; + } + normalized_maps[i]->width = frm_width; + normalized_maps[i]->height = frm_height; + memset(normalized_maps[i]->buf, 0, + frm_width * frm_height * sizeof(*normalized_maps[i]->buf)); + } + + // Conspicuity map generation + if (normalized_map(i_map, pyr_width, pyr_height, normalized_maps[0]) == 0 || + normalized_map_rgb(rg_map, by_map, pyr_width, pyr_height, + normalized_maps[1]) == 0 || + normalized_map_orientation(orientation_map, pyr_width, pyr_height, + normalized_maps[2]) == 0) { + for (int i = 0; i < 6; ++i) { + aom_free(i_map[i]->buf); + aom_free(rg_map[i]->buf); + aom_free(by_map[i]->buf); + aom_free(i_map[i]); + aom_free(rg_map[i]); + aom_free(by_map[i]); + } + + for (int i = 0; i < 24; ++i) { + aom_free(orientation_map[i]->buf); + aom_free(orientation_map[i]); + } + + for (int i = 0; i < 3; ++i) { + aom_free(normalized_maps[i]->buf); + aom_free(normalized_maps[i]); + } + return 0; + } + + for (int i = 0; i < 6; ++i) { + aom_free(i_map[i]->buf); + aom_free(rg_map[i]->buf); + aom_free(by_map[i]->buf); + aom_free(i_map[i]); + aom_free(rg_map[i]); + aom_free(by_map[i]); + } + + for (int i = 0; i < 24; ++i) { + aom_free(orientation_map[i]->buf); + aom_free(orientation_map[i]); + } + + // Pixel level saliency map + saliency_feature_map *combined_saliency_map = + aom_malloc(sizeof(*combined_saliency_map)); + if (!combined_saliency_map) { + for (int i = 0; i < 3; ++i) { + aom_free(normalized_maps[i]->buf); + aom_free(normalized_maps[i]); + } + return 0; + } + + combined_saliency_map->buf = (double *)aom_malloc( + frm_width * frm_height * sizeof(*combined_saliency_map->buf)); + if (!combined_saliency_map->buf) { + for (int i = 0; i < 3; ++i) { + aom_free(normalized_maps[i]->buf); + aom_free(normalized_maps[i]); + } + + aom_free(combined_saliency_map); + return 0; + } + combined_saliency_map->height = frm_height; + combined_saliency_map->width = frm_width; + + double w_intensity, w_color, w_orient; + + w_intensity = w_color = w_orient = (double)1 / 3; + + for (int r = 0; r < frm_height; ++r) { + for (int c = 0; c < frm_width; ++c) { + combined_saliency_map->buf[r * frm_width + c] = + (w_intensity * normalized_maps[0]->buf[r * frm_width + c] + + w_color * normalized_maps[1]->buf[r * frm_width + c] + + w_orient * normalized_maps[2]->buf[r * frm_width + c]); + } + } + + for (int r = 0; r < frm_height; ++r) { + for (int c = 0; c < frm_width; ++c) { + int index = r * frm_width + c; + cpi->saliency_map[index] = + (uint8_t)(combined_saliency_map->buf[index] * 255); + } + } + + for (int i = 0; i < 3; ++i) { + aom_free(normalized_maps[i]->buf); + aom_free(normalized_maps[i]); + } + + aom_free(combined_saliency_map->buf); + aom_free(combined_saliency_map); + + return 1; +} + +// Set superblock level saliency mask for rdmult scaling +int av1_setup_sm_rdmult_scaling_factor(AV1_COMP *cpi, double motion_ratio) { + AV1_COMMON *cm = &cpi->common; + + saliency_feature_map *sb_saliency_map = + aom_malloc(sizeof(saliency_feature_map)); + + if (sb_saliency_map == NULL) { + return 0; + } + + const BLOCK_SIZE bsize = cm->seq_params->sb_size; + const int num_mi_w = mi_size_wide[bsize]; + const int num_mi_h = mi_size_high[bsize]; + const int block_width = block_size_wide[bsize]; + const int block_height = block_size_high[bsize]; + const int num_sb_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w; + const int num_sb_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h; + + sb_saliency_map->height = 
num_sb_rows; + sb_saliency_map->width = num_sb_cols; + sb_saliency_map->buf = (double *)aom_malloc(num_sb_rows * num_sb_cols * + sizeof(*sb_saliency_map->buf)); + + if (sb_saliency_map->buf == NULL) { + aom_free(sb_saliency_map); + return 0; + } + + for (int row = 0; row < num_sb_rows; ++row) { + for (int col = 0; col < num_sb_cols; ++col) { + const int index = row * num_sb_cols + col; + double total_pixel = 0; + double total_weight = 0; + + for (int i = 0; i < block_height; i++) { + for (int j = 0; j < block_width; j++) { + if ((row * block_height + i) >= cpi->common.height || + (col * block_width + j) >= cpi->common.width) + continue; + total_pixel++; + total_weight += + cpi->saliency_map[(row * block_height + i) * cpi->common.width + + col * block_width + j]; + } + } + + assert(total_pixel > 0); + + // Calculate the superblock level saliency map from pixel level saliency + // map + sb_saliency_map->buf[index] = total_weight / total_pixel; + + // Further lower the superblock saliency score for boundary superblocks. + if (row < 1 || row > num_sb_rows - 2 || col < 1 || + col > num_sb_cols - 2) { + sb_saliency_map->buf[index] /= 5; + } + } + } + + // superblock level saliency map finalization + minmax_normalize(sb_saliency_map); + + double log_sum = 0.0; + double sum = 0.0; + int block_count = 0; + + // Calculate the average superblock sm_scaling_factor for a frame, to be used + // for clamping later. + for (int row = 0; row < num_sb_rows; ++row) { + for (int col = 0; col < num_sb_cols; ++col) { + const int index = row * num_sb_cols + col; + const double saliency = sb_saliency_map->buf[index]; + + cpi->sm_scaling_factor[index] = 1 - saliency; + sum += cpi->sm_scaling_factor[index]; + block_count++; + } + } + assert(block_count > 0); + sum /= block_count; + + // Calculate the geometric mean of superblock sm_scaling_factor for a frame, + // to be used for normalization. + for (int row = 0; row < num_sb_rows; ++row) { + for (int col = 0; col < num_sb_cols; ++col) { + const int index = row * num_sb_cols + col; + log_sum += log(fmax(cpi->sm_scaling_factor[index], 0.001)); + cpi->sm_scaling_factor[index] = + fmax(cpi->sm_scaling_factor[index], 0.8 * sum); + } + } + + log_sum = exp(log_sum / block_count); + + // Normalize the sm_scaling_factor by geometric mean. + for (int row = 0; row < num_sb_rows; ++row) { + for (int col = 0; col < num_sb_cols; ++col) { + const int index = row * num_sb_cols + col; + assert(log_sum > 0); + cpi->sm_scaling_factor[index] /= log_sum; + + // Modulate the sm_scaling_factor by frame basis motion factor + cpi->sm_scaling_factor[index] = + cpi->sm_scaling_factor[index] * motion_ratio; + } + } + + aom_free(sb_saliency_map->buf); + aom_free(sb_saliency_map); + return 1; +} + +// av1_setup_motion_ratio() is only enabled when CONFIG_REALTIME_ONLY is 0, +// because the computations need to access the first pass stats which are +// only available when CONFIG_REALTIME_ONLY is equal to 0. +#if !CONFIG_REALTIME_ONLY +// Set motion_ratio that reflects the motion quantities between two consecutive +// frames. Motion_ratio will be used to set up saliency_map based rdmult scaling +// factor, i.e., the less the motion quantities are, the more bits will be spent +// on this frame, and vice versa. 
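+// For example (numbers hypothetical): when the first-pass intra-error spread
+// is wide enough (stdev above 10% of the average intra error), a frame whose
+// coded_error is twice the clip's average inter error gets
+// min(2.0, 1.5) = 1.5, a frame at half the average gets max(0.5, 0.8) = 0.8,
+// and quiet clips keep the default motion_ratio of 1.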
+double av1_setup_motion_ratio(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + int frames_since_key = + cm->current_frame.display_order_hint - cpi->rc.frames_since_key; + const FIRSTPASS_STATS *cur_stats = av1_firstpass_info_peek( + &cpi->ppi->twopass.firstpass_info, frames_since_key); + assert(cur_stats != NULL); + assert(cpi->ppi->twopass.firstpass_info.total_stats.count > 0); + + const double avg_intra_error = + exp(cpi->ppi->twopass.firstpass_info.total_stats.log_intra_error / + cpi->ppi->twopass.firstpass_info.total_stats.count); + const double avg_inter_error = + exp(cpi->ppi->twopass.firstpass_info.total_stats.log_coded_error / + cpi->ppi->twopass.firstpass_info.total_stats.count); + + double inter_error = cur_stats->coded_error; + double error_stdev = 0; + const double avg_error = + cpi->ppi->twopass.firstpass_info.total_stats.intra_error / + cpi->ppi->twopass.firstpass_info.total_stats.count; + for (int i = 0; i < cpi->ppi->twopass.firstpass_info.total_stats.count; i++) { + const FIRSTPASS_STATS *stats = + &cpi->ppi->twopass.firstpass_info.stats_buf[i]; + error_stdev += + (stats->intra_error - avg_error) * (stats->intra_error - avg_error); + } + error_stdev = + sqrt(error_stdev / cpi->ppi->twopass.firstpass_info.total_stats.count); + + double motion_ratio = 1; + if (error_stdev / fmax(avg_intra_error, 1) > 0.1) { + motion_ratio = inter_error / fmax(1, avg_inter_error); + motion_ratio = AOMMIN(motion_ratio, 1.5); + motion_ratio = AOMMAX(motion_ratio, 0.8); + } + + return motion_ratio; +} +#endif // !CONFIG_REALTIME_ONLY diff --git a/third_party/aom/av1/encoder/saliency_map.h b/third_party/aom/av1/encoder/saliency_map.h new file mode 100644 index 0000000000..0d27f83633 --- /dev/null +++ b/third_party/aom/av1/encoder/saliency_map.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_SALIENCY_MAP_H_ +#define AOM_AV1_ENCODER_SALIENCY_MAP_H_ +#include "av1/encoder/encoder.h" + +typedef struct saliency_feature_map { + double *buf; // stores values of the map in 1D array + int height; + int width; +} saliency_feature_map; + +int av1_set_saliency_map(AV1_COMP *cpi); +#if !CONFIG_REALTIME_ONLY +double av1_setup_motion_ratio(AV1_COMP *cpi); +#endif +int av1_setup_sm_rdmult_scaling_factor(AV1_COMP *cpi, double motion_ratio); + +#endif // AOM_AV1_ENCODER_SALIENCY_MAP_H_ diff --git a/third_party/aom/av1/encoder/segmentation.c b/third_party/aom/av1/encoder/segmentation.c new file mode 100644 index 0000000000..4b4e78779c --- /dev/null +++ b/third_party/aom/av1/encoder/segmentation.c @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/pred_common.h"
+#include "av1/common/tile_common.h"
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/segmentation.h"
+
+void av1_enable_segmentation(struct segmentation *seg) {
+  seg->enabled = 1;
+  seg->update_map = 1;
+  seg->update_data = 1;
+  seg->temporal_update = 0;
+}
+
+void av1_disable_segmentation(struct segmentation *seg) {
+  seg->enabled = 0;
+  seg->update_map = 0;
+  seg->update_data = 0;
+  seg->temporal_update = 0;
+}
+
+void av1_disable_segfeature(struct segmentation *seg, int segment_id,
+                            SEG_LVL_FEATURES feature_id) {
+  seg->feature_mask[segment_id] &= ~(1u << feature_id);
+}
+
+void av1_clear_segdata(struct segmentation *seg, int segment_id,
+                       SEG_LVL_FEATURES feature_id) {
+  seg->feature_data[segment_id][feature_id] = 0;
+}
+
+void av1_reset_segment_features(AV1_COMMON *cm) {
+  struct segmentation *seg = &cm->seg;
+
+  // Set up default state for MB feature flags
+  seg->enabled = 0;
+  seg->update_map = 0;
+  seg->update_data = 0;
+  av1_clearall_segfeatures(seg);
+}
diff --git a/third_party/aom/av1/encoder/segmentation.h b/third_party/aom/av1/encoder/segmentation.h
new file mode 100644
index 0000000000..1ad13d66a9
--- /dev/null
+++ b/third_party/aom/av1/encoder/segmentation.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_SEGMENTATION_H_
+#define AOM_AV1_ENCODER_SEGMENTATION_H_
+
+#include "av1/common/blockd.h"
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_enable_segmentation(struct segmentation *seg);
+void av1_disable_segmentation(struct segmentation *seg);
+
+void av1_disable_segfeature(struct segmentation *seg, int segment_id,
+                            SEG_LVL_FEATURES feature_id);
+void av1_clear_segdata(struct segmentation *seg, int segment_id,
+                       SEG_LVL_FEATURES feature_id);
+
+void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd);
+
+void av1_reset_segment_features(AV1_COMMON *cm);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_SEGMENTATION_H_
diff --git a/third_party/aom/av1/encoder/sorting_network.h b/third_party/aom/av1/encoder/sorting_network.h
new file mode 100644
index 0000000000..54f4c19dcd
--- /dev/null
+++ b/third_party/aom/av1/encoder/sorting_network.h
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * This file contains several utility functions used to sort small arrays with
+ * sorting networks.
+ *
+ * A sorting network is a (potentially branchless) way to quickly sort small
+ * arrays of known size. For more details, consult
+ * (https://en.wikipedia.org/wiki/Sorting_network).
+ */
+#ifndef AOM_AV1_ENCODER_SORTING_NETWORK_H_
+#define AOM_AV1_ENCODER_SORTING_NETWORK_H_
+
+#include "aom/aom_integer.h"
+
+#define SWAP(i, j)                                   \
+  do {                                               \
+    const float maxf = (k[i] >= k[j]) ? k[i] : k[j]; \
+    const float minf = (k[i] >= k[j]) ? k[j] : k[i]; \
+    const int maxi = (k[i] >= k[j]) ? v[i] : v[j];   \
+    const int mini = (k[i] >= k[j]) ? v[j] : v[i];   \
+    k[i] = maxf;                                     \
+    k[j] = minf;                                     \
+    v[i] = maxi;                                     \
+    v[j] = mini;                                     \
+  } while (0)
+
+/*!\brief Sorts a length-16 key array and its parallel value array in
+ * descending order of the keys.
+ *
+ * \param[in,out]    k    A length-16 array of floats serving as the keys.
+ * \param[in,out]    v    A length-16 array of int32 serving as the values.
+ */
+static AOM_INLINE void av1_sort_fi32_16(float k[], int32_t v[]) {
+  SWAP(0, 1);
+  SWAP(2, 3);
+  SWAP(4, 5);
+  SWAP(6, 7);
+  SWAP(8, 9);
+  SWAP(10, 11);
+  SWAP(12, 13);
+  SWAP(14, 15);
+  SWAP(0, 2);
+  SWAP(1, 3);
+  SWAP(4, 6);
+  SWAP(5, 7);
+  SWAP(8, 10);
+  SWAP(9, 11);
+  SWAP(12, 14);
+  SWAP(13, 15);
+  SWAP(1, 2);
+  SWAP(5, 6);
+  SWAP(0, 4);
+  SWAP(3, 7);
+  SWAP(9, 10);
+  SWAP(13, 14);
+  SWAP(8, 12);
+  SWAP(11, 15);
+  SWAP(1, 5);
+  SWAP(2, 6);
+  SWAP(9, 13);
+  SWAP(10, 14);
+  SWAP(0, 8);
+  SWAP(7, 15);
+  SWAP(1, 4);
+  SWAP(3, 6);
+  SWAP(9, 12);
+  SWAP(11, 14);
+  SWAP(2, 4);
+  SWAP(3, 5);
+  SWAP(10, 12);
+  SWAP(11, 13);
+  SWAP(1, 9);
+  SWAP(6, 14);
+  SWAP(3, 4);
+  SWAP(11, 12);
+  SWAP(1, 8);
+  SWAP(2, 10);
+  SWAP(5, 13);
+  SWAP(7, 14);
+  SWAP(3, 11);
+  SWAP(2, 8);
+  SWAP(4, 12);
+  SWAP(7, 13);
+  SWAP(3, 10);
+  SWAP(5, 12);
+  SWAP(3, 9);
+  SWAP(6, 12);
+  SWAP(3, 8);
+  SWAP(7, 12);
+  SWAP(5, 9);
+  SWAP(6, 10);
+  SWAP(4, 8);
+  SWAP(7, 11);
+  SWAP(5, 8);
+  SWAP(7, 10);
+  SWAP(6, 8);
+  SWAP(7, 9);
+  SWAP(7, 8);
+}
+
+/*!\brief Sorts a length-8 key array and its parallel value array in
+ * descending order of the keys.
+ *
+ * \param[in,out]    k    A length-8 array of floats serving as the keys.
+ * \param[in,out]    v    A length-8 array of int32 serving as the values.
+ */
+static AOM_INLINE void av1_sort_fi32_8(float k[], int32_t v[]) {
+  SWAP(0, 1);
+  SWAP(2, 3);
+  SWAP(4, 5);
+  SWAP(6, 7);
+  SWAP(0, 2);
+  SWAP(1, 3);
+  SWAP(4, 6);
+  SWAP(5, 7);
+  SWAP(1, 2);
+  SWAP(5, 6);
+  SWAP(0, 4);
+  SWAP(3, 7);
+  SWAP(1, 5);
+  SWAP(2, 6);
+  SWAP(1, 4);
+  SWAP(3, 6);
+  SWAP(2, 4);
+  SWAP(3, 5);
+  SWAP(3, 4);
+}
+ */
+#include "av1/common/av1_common_int.h"
+#include "av1/encoder/sparse_linear_solver.h"
+#include "config/aom_config.h"
+#include "aom_mem/aom_mem.h"
+#include "av1/common/alloccommon.h"
+
+#if CONFIG_OPTICAL_FLOW_API
+/*
+ * Input:
+ * rows: array of row positions
+ * cols: array of column positions
+ * values: array of element values
+ * num_elem: total number of (non-zero) elements in the matrix
+ * num_rows: number of rows in the matrix
+ * num_cols: number of columns in the matrix
+ *
+ * Output:
+ * sm: pointer to the sparse matrix to be initialized
+ *
+ * Return: 0 - success
+ *         -1 - failed
+ */
+int av1_init_sparse_mtx(const int *rows, const int *cols, const double *values,
+                        int num_elem, int num_rows, int num_cols,
+                        SPARSE_MTX *sm) {
+  sm->n_elem = num_elem;
+  sm->n_rows = num_rows;
+  sm->n_cols = num_cols;
+  if (num_elem == 0) {
+    sm->row_pos = NULL;
+    sm->col_pos = NULL;
+    sm->value = NULL;
+    return 0;
+  }
+  sm->row_pos = aom_calloc(num_elem, sizeof(*sm->row_pos));
+  sm->col_pos = aom_calloc(num_elem, sizeof(*sm->col_pos));
+  sm->value = aom_calloc(num_elem, sizeof(*sm->value));
+
+  if (!sm->row_pos || !sm->col_pos || !sm->value) {
+    av1_free_sparse_mtx_elems(sm);
+    return -1;
+  }
+
+  memcpy(sm->row_pos, rows, num_elem * sizeof(*sm->row_pos));
+  memcpy(sm->col_pos, cols, num_elem * sizeof(*sm->col_pos));
+  memcpy(sm->value, values, num_elem * sizeof(*sm->value));
+
+  return 0;
+}
+
+/*
+ * Combines two sparse matrices (allocating new space).
+ *
+ * Input:
+ * sm1, sm2: matrices to be combined
+ * row_offset1, row_offset2: row offset of each matrix in the new matrix
+ * col_offset1, col_offset2: column offset of each matrix in the new matrix
+ * new_n_rows, new_n_cols: number of rows and columns in the new matrix
+ *
+ * Output:
+ * sm: the combined matrix
+ *
+ * Return: 0 - success
+ *         -1 - failed
+ */
+int av1_init_combine_sparse_mtx(const SPARSE_MTX *sm1, const SPARSE_MTX *sm2,
+                                SPARSE_MTX *sm, int row_offset1,
+                                int col_offset1, int row_offset2,
+                                int col_offset2, int new_n_rows,
+                                int new_n_cols) {
+  sm->n_elem = sm1->n_elem + sm2->n_elem;
+  sm->n_cols = new_n_cols;
+  sm->n_rows = new_n_rows;
+
+  if (sm->n_elem == 0) {
+    sm->row_pos = NULL;
+    sm->col_pos = NULL;
+    sm->value = NULL;
+    return 0;
+  }
+
+  sm->row_pos = aom_calloc(sm->n_elem, sizeof(*sm->row_pos));
+  sm->col_pos = aom_calloc(sm->n_elem, sizeof(*sm->col_pos));
+  sm->value = aom_calloc(sm->n_elem, sizeof(*sm->value));
+
+  if (!sm->row_pos || !sm->col_pos || !sm->value) {
+    av1_free_sparse_mtx_elems(sm);
+    return -1;
+  }
+
+  for (int i = 0; i < sm1->n_elem; i++) {
+    sm->row_pos[i] = sm1->row_pos[i] + row_offset1;
+    sm->col_pos[i] = sm1->col_pos[i] + col_offset1;
+  }
+  memcpy(sm->value, sm1->value, sm1->n_elem * sizeof(*sm1->value));
+  int n_elem1 = sm1->n_elem;
+  for (int i = 0; i < sm2->n_elem; i++) {
+    sm->row_pos[n_elem1 + i] = sm2->row_pos[i] + row_offset2;
+    sm->col_pos[n_elem1 + i] = sm2->col_pos[i] + col_offset2;
+  }
+  memcpy(sm->value + n_elem1, sm2->value, sm2->n_elem * sizeof(*sm2->value));
+  return 0;
+}
+
+void av1_free_sparse_mtx_elems(SPARSE_MTX *sm) {
+  sm->n_cols = 0;
+  sm->n_rows = 0;
+  if (sm->n_elem != 0) {
+    aom_free(sm->row_pos);
+    aom_free(sm->col_pos);
+    aom_free(sm->value);
+  }
+  sm->n_elem = 0;
+}
+
+/*
+ * Calculate matrix and vector multiplication: A*b
+ *
+ * Input:
+ * sm: matrix A
+ * srcv: the vector b to be multiplied
+ * dstl: the length of the vectors
+ *
+ * Output:
+ * dstv: pointer to the resulting vector
+ */
+void av1_mtx_vect_multi_right(const SPARSE_MTX *sm, const double *srcv,
+                              double *dstv, int dstl) {
+  memset(dstv, 0, sizeof(*dstv) * dstl);
+  for (int i = 0; i < sm->n_elem; i++) {
+    dstv[sm->row_pos[i]] += srcv[sm->col_pos[i]] * sm->value[i];
+  }
+}
+/*
+ * Calculate matrix and vector multiplication: b*A
+ *
+ * Input:
+ * sm: matrix A
+ * srcv: the vector b to be multiplied
+ * dstl: the length of the vectors
+ *
+ * Output:
+ * dstv: pointer to the resulting vector
+ */
+void av1_mtx_vect_multi_left(const SPARSE_MTX *sm, const double *srcv,
+                             double *dstv, int dstl) {
+  memset(dstv, 0, sizeof(*dstv) * dstl);
+  for (int i = 0; i < sm->n_elem; i++) {
+    dstv[sm->col_pos[i]] += srcv[sm->row_pos[i]] * sm->value[i];
+  }
+}
+
+/*
+ * Calculate inner product of two vectors
+ *
+ * Input:
+ * src1, src2: the vectors to be multiplied
+ * src1l: length of the vectors
+ *
+ * Output:
+ * the inner product
+ */
+double av1_vect_vect_multi(const double *src1, int src1l, const double *src2) {
+  double result = 0;
+  for (int i = 0; i < src1l; i++) {
+    result += src1[i] * src2[i];
+  }
+  return result;
+}
+
+/*
+ * Multiply each element in the matrix sm by a constant c
+ */
+void av1_constant_multiply_sparse_matrix(SPARSE_MTX *sm, double c) {
+  for (int i = 0; i < sm->n_elem; i++) {
+    sm->value[i] *= c;
+  }
+}
+
+static INLINE void free_solver_local_buf(double *buf1, double *buf2,
+                                         double *buf3, double *buf4,
+                                         double *buf5, double *buf6,
+                                         double *buf7) {
+  aom_free(buf1);
+  aom_free(buf2);
+  aom_free(buf3);
+  aom_free(buf4);
+  aom_free(buf5);
+  aom_free(buf6);
+  aom_free(buf7);
+}
+
+/*
+ * Solve for Ax = b
+ * (no special requirement on A)
+ *
+ * Input:
+ * A: the sparse matrix
+ * b: the vector b
+ * bl: length of b
+ * x: the vector x
+ *
+ * Output:
+ * x: pointer to the solution vector
+ *
+ * Return: 0 - success
+ *         -1 - failed
+ */
+int av1_bi_conjugate_gradient_sparse(const SPARSE_MTX *A, const double *b,
+                                     int bl, double *x) {
+  double *r = NULL, *r_hat = NULL, *p = NULL, *p_hat = NULL, *Ap = NULL,
+         *p_hatA = NULL, *x_hat = NULL;
+  double alpha, beta, rtr, r_norm_2;
+  double denormtemp;
+
+  // initialize
+  r = aom_calloc(bl, sizeof(*r));
+  r_hat = aom_calloc(bl, sizeof(*r_hat));
+  p = aom_calloc(bl, sizeof(*p));
+  p_hat = aom_calloc(bl, sizeof(*p_hat));
+  Ap = aom_calloc(bl, sizeof(*Ap));
+  p_hatA = aom_calloc(bl, sizeof(*p_hatA));
+  x_hat = aom_calloc(bl, sizeof(*x_hat));
+  if (!r || !r_hat || !p || !p_hat || !Ap || !p_hatA || !x_hat) {
+    free_solver_local_buf(r, r_hat, p, p_hat, Ap, p_hatA, x_hat);
+    return -1;
+  }
+
+  int i;
+  for (i = 0; i < bl; i++) {
+    r[i] = b[i];
+    r_hat[i] = b[i];
+    p[i] = r[i];
+    p_hat[i] = r_hat[i];
+    x[i] = 0;
+    x_hat[i] = 0;
+  }
+  r_norm_2 = av1_vect_vect_multi(r_hat, bl, r);
+  for (int k = 0; k < MAX_CG_SP_ITER; k++) {
+    rtr = r_norm_2;
+    av1_mtx_vect_multi_right(A, p, Ap, bl);
+    av1_mtx_vect_multi_left(A, p_hat, p_hatA, bl);
+
+    denormtemp = av1_vect_vect_multi(p_hat, bl, Ap);
+    if (denormtemp < 1e-10) break;
+    alpha = rtr / denormtemp;
+    r_norm_2 = 0;
+    for (i = 0; i < bl; i++) {
+      x[i] += alpha * p[i];
+      x_hat[i] += alpha * p_hat[i];
+      r[i] -= alpha * Ap[i];
+      r_hat[i] -= alpha * p_hatA[i];
+      r_norm_2 += r_hat[i] * r[i];
+    }
+    if (sqrt(r_norm_2) < 1e-2) {
+      break;
+    }
+    if (rtr < 1e-10) break;
+    beta = r_norm_2 / rtr;
+    for (i = 0; i < bl; i++) {
+      p[i] = r[i] + beta * p[i];
+      p_hat[i] = r_hat[i] + beta * p_hat[i];
+    }
+  }
+  // free
+  free_solver_local_buf(r, r_hat, p, p_hat, Ap, p_hatA, x_hat);
+  return 0;
+}
+
+/*
+ * Solve for Ax = b when A is symmetric and positive definite
+ *
+ * Input:
+ * A: the sparse matrix
+ * b: the vector b
+ * bl: length of b
+ * x: the vector x
+ *
+ * Output:
+ * x: pointer to the solution vector
+ *
+ * Return: 0 - success
+ *         -1 - failed
+ */
+int av1_conjugate_gradient_sparse(const SPARSE_MTX *A, const double *b, int bl,
+                                  double *x) {
+  double *r = NULL, *p = NULL, *Ap = NULL;
+  double alpha, beta, rtr, r_norm_2;
+  double denormtemp;
+
+  // initialize
+  r = aom_calloc(bl, sizeof(*r));
+  p = aom_calloc(bl, sizeof(*p));
+  Ap = aom_calloc(bl, sizeof(*Ap));
+  if (!r || !p || !Ap) {
+    free_solver_local_buf(r, p, Ap, NULL, NULL, NULL, NULL);
+    return -1;
+  }
+
+  int i;
+  for (i = 0; i < bl; i++) {
+    r[i] = b[i];
+    p[i] = r[i];
+    x[i] = 0;
+  }
+  r_norm_2 = av1_vect_vect_multi(r, bl, r);
+  int k;
+  for (k = 0; k < MAX_CG_SP_ITER; k++) {
+    rtr = r_norm_2;
+    av1_mtx_vect_multi_right(A, p, Ap, bl);
+    denormtemp = av1_vect_vect_multi(p, bl, Ap);
+    if (denormtemp < 1e-10) break;
+    alpha = rtr / denormtemp;
+    r_norm_2 = 0;
+    for (i = 0; i < bl; i++) {
+      x[i] += alpha * p[i];
+      r[i] -= alpha * Ap[i];
+      r_norm_2 += r[i] * r[i];
+    }
+    if (r_norm_2 < 1e-8 * bl) break;
+    if (rtr < 1e-10) break;
+    beta = r_norm_2 / rtr;
+    for (i = 0; i < bl; i++) {
+      p[i] = r[i] + beta * p[i];
+    }
+  }
+  // free
+  free_solver_local_buf(r, p, Ap, NULL, NULL, NULL, NULL);
+
+  return 0;
+}
+
+/*
+ * Solve for Ax = b using the Jacobi method
+ *
+ * Input:
+ * A: the sparse matrix
+ * b: the vector b
+ * bl: length of b
+ * x: the vector x
+ *
+ * Output:
+ * x: pointer to the solution vector
+ *
+ * Return: 0 - success
+ *         -1 - failed
+ */
+int av1_jacobi_sparse(const SPARSE_MTX *A, const double *b, int bl, double *x) {
+  double *diags = NULL, *Rx = NULL, *x_last = NULL, *x_cur = NULL,
+         *tempx = NULL;
+  double resi2;
+
+  diags = aom_calloc(bl, sizeof(*diags));
+  Rx = aom_calloc(bl, sizeof(*Rx));
+  x_last = aom_calloc(bl, sizeof(*x_last));
+  x_cur = aom_calloc(bl, sizeof(*x_cur));
+
+  if (!diags || !Rx || !x_last || !x_cur) {
+    free_solver_local_buf(diags, Rx, x_last, x_cur, NULL, NULL, NULL);
+    return -1;
+  }
+
+  int i;
+  memset(x_last, 0, sizeof(*x_last) * bl);
+  // get the diagonals of A
+  memset(diags, 0, sizeof(*diags) * bl);
+  for (int c = 0; c < A->n_elem; c++) {
+    if (A->row_pos[c] != A->col_pos[c]) continue;
+    diags[A->row_pos[c]] = A->value[c];
+  }
+  int k;
+  for (k = 0; k < MAX_CG_SP_ITER; k++) {
+    // R = A - diag(diags)
+    // get R*x_last
+    memset(Rx, 0, sizeof(*Rx) * bl);
+    for (int c = 0; c < A->n_elem; c++) {
+      if (A->row_pos[c] == A->col_pos[c]) continue;
+      Rx[A->row_pos[c]] += x_last[A->col_pos[c]] * A->value[c];
+    }
+    resi2 = 0;
+    for (i = 0; i < bl; i++) {
+      x_cur[i] = (b[i] - Rx[i]) / diags[i];
+      resi2 += (x_last[i] - x_cur[i]) * (x_last[i] - x_cur[i]);
+    }
+    if (resi2 <= 1e-10 * bl) break;
+    // swap last & cur buffer ptrs
+    tempx = x_last;
+    x_last = x_cur;
+    x_cur = tempx;
+  }
+  printf("\n numiter: %d\n", k);
+  for (i = 0; i < bl; i++) {
+    x[i] = x_cur[i];
+  }
+  free_solver_local_buf(diags, Rx, x_last, x_cur, NULL, NULL, NULL);
+  return 0;
+}
+
+/*
+ * Solve for Ax = b using the steepest descent method
+ *
+ * Input:
+ * A: the sparse matrix
+ * b: the vector b
+ * bl: length of b
+ * x: the vector x
+ *
+ * Output:
+ * x: pointer to the solution vector
+ *
+ * Return: 0 - success
+ *         -1 - failed
+ */
+int av1_steepest_descent_sparse(const SPARSE_MTX *A, const double *b, int bl,
+                                double *x) {
+  double *d = NULL, *Ad = NULL, *Ax = NULL;
+  double resi2, resi2_last, dAd, temp;
+
+  d = aom_calloc(bl, sizeof(*d));
+  Ax = aom_calloc(bl, sizeof(*Ax));
+  Ad = aom_calloc(bl, sizeof(*Ad));
+
+  if (!d || !Ax || !Ad) {
+    free_solver_local_buf(d, Ax, Ad, NULL, NULL, NULL, NULL);
+    return -1;
+  }
+
+  int i;
+  // initialize with 0s
+  resi2 = 0;
+  for (i = 0; i < bl; i++) {
+    x[i] = 0;
+    d[i] = b[i];
+    resi2 += d[i] * d[i] / bl;
+  }
+  int k;
+  for (k = 0; k < MAX_CG_SP_ITER; k++) {
+    // get A*d
+    av1_mtx_vect_multi_right(A, d, Ad, bl);
+    dAd = resi2 * bl / av1_vect_vect_multi(d, bl, Ad);
+    for (i = 0; i < bl; i++) {
+      temp = dAd * d[i];
+      x[i] = x[i] + temp;
+    }
+    av1_mtx_vect_multi_right(A, x, Ax, bl);
+    resi2_last = resi2;
+    resi2 = 0;
+    for (i = 0; i < bl; i++) {
+      d[i] = b[i] - Ax[i];
+      resi2 += d[i] * d[i] / bl;
+    }
+    if (resi2 <= 1e-8) break;
+    if (resi2_last - resi2 < 1e-8) {
+      break;
+    }
+  }
+  free_solver_local_buf(d, Ax, Ad, NULL, NULL, NULL, NULL);
+
+  return 0;
+}
+
+#endif  // CONFIG_OPTICAL_FLOW_API
diff --git a/third_party/aom/av1/encoder/sparse_linear_solver.h b/third_party/aom/av1/encoder/sparse_linear_solver.h
new file mode 100644
index 0000000000..f30fc0f5b1
--- /dev/null
+++ b/third_party/aom/av1/encoder/sparse_linear_solver.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_SPARSE_LINEAR_SOLVER_H_
+#define AOM_AV1_ENCODER_SPARSE_LINEAR_SOLVER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "config/aom_config.h"
+
+#if CONFIG_OPTICAL_FLOW_API
+
+// Maximum number of iterations for the iterative linear solvers.
+#define MAX_CG_SP_ITER 100
+
+typedef struct {
+  int n_elem;  // number of non-zero elements
+  int n_rows;
+  int n_cols;
+  // Using arrays to represent the non-zero elements.
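+  // Illustration: the 2x2 matrix [[1, 0], [0, 2]] can be stored as
+  // n_elem = 2, row_pos = {0, 1}, col_pos = {0, 1}, value = {1.0, 2.0},
+  // i.e. a coordinate-list (COO) layout.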
+  int *col_pos;
+  int *row_pos;  // starts with 0
+  double *value;
+} SPARSE_MTX;
+
+int av1_init_sparse_mtx(const int *rows, const int *cols, const double *values,
+                        int num_elem, int num_rows, int num_cols,
+                        SPARSE_MTX *sm);
+int av1_init_combine_sparse_mtx(const SPARSE_MTX *sm1, const SPARSE_MTX *sm2,
+                                SPARSE_MTX *sm, int row_offset1,
+                                int col_offset1, int row_offset2,
+                                int col_offset2, int new_n_rows,
+                                int new_n_cols);
+void av1_free_sparse_mtx_elems(SPARSE_MTX *sm);
+
+void av1_mtx_vect_multi_right(const SPARSE_MTX *sm, const double *srcv,
+                              double *dstv, int dstl);
+void av1_mtx_vect_multi_left(const SPARSE_MTX *sm, const double *srcv,
+                             double *dstv, int dstl);
+double av1_vect_vect_multi(const double *src1, int src1l, const double *src2);
+void av1_constant_multiply_sparse_matrix(SPARSE_MTX *sm, double c);
+
+int av1_conjugate_gradient_sparse(const SPARSE_MTX *A, const double *b, int bl,
+                                  double *x);
+int av1_bi_conjugate_gradient_sparse(const SPARSE_MTX *A, const double *b,
+                                     int bl, double *x);
+int av1_jacobi_sparse(const SPARSE_MTX *A, const double *b, int bl, double *x);
+int av1_steepest_descent_sparse(const SPARSE_MTX *A, const double *b, int bl,
+                                double *x);
+
+#endif  // CONFIG_OPTICAL_FLOW_API
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif /* AOM_AV1_ENCODER_SPARSE_LINEAR_SOLVER_H_ */
diff --git a/third_party/aom/av1/encoder/speed_features.c b/third_party/aom/av1/encoder/speed_features.c
new file mode 100644
index 0000000000..a6c0971096
--- /dev/null
+++ b/third_party/aom/av1/encoder/speed_features.c
@@ -0,0 +1,2715 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/speed_features.h"
+#include "av1/encoder/rdopt.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+
+#define MAX_MESH_SPEED 5  // Max speed setting for mesh motion method
+// Max speed setting for tx domain evaluation
+#define MAX_TX_DOMAIN_EVAL_SPEED 5
+static MESH_PATTERN
+    good_quality_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = {
+      { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } },
+      { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } },
+      { { 64, 8 }, { 14, 2 }, { 7, 1 }, { 7, 1 } },
+      { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } },
+      { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } },
+      { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } },
+    };
+
+// TODO(huisu@google.com): These settings are pretty relaxed, tune them for
+// each speed setting.
+static MESH_PATTERN intrabc_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = {
+  { { 256, 1 }, { 256, 1 }, { 0, 0 }, { 0, 0 } },
+  { { 256, 1 }, { 256, 1 }, { 0, 0 }, { 0, 0 } },
+  { { 64, 1 }, { 64, 1 }, { 0, 0 }, { 0, 0 } },
+  { { 64, 1 }, { 64, 1 }, { 0, 0 }, { 0, 0 } },
+  { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } },
+  { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } },
+};
+
+// Threshold values to be used for pruning the txfm_domain_distortion
+// based on block MSE.
+// Index 0: Default mode evaluation, winner mode processing is not
+// applicable (e.g. IntraBc).
+// Index 1: Mode evaluation.
+// Index 2: Winner mode evaluation.
+// Indices 1 and 2 are applicable when the
+// enable_winner_mode_for_use_tx_domain_dist speed feature is ON.
+// TODO(any): Experiment with the threshold logic based on a variance metric.
+static unsigned int tx_domain_dist_thresholds[4][MODE_EVAL_TYPES] = {
+  { UINT_MAX, UINT_MAX, UINT_MAX },
+  { 22026, 22026, 22026 },
+  { 1377, 1377, 1377 },
+  { 0, 0, 0 }
+};
+
+// Number of different levels of aggressiveness in using transform domain
+// distortion during the R-D evaluation based on the speed feature
+// tx_domain_dist_level.
+#define TX_DOMAIN_DIST_LEVELS 4
+
+// Transform domain distortion type to be used for default, mode and winner
+// mode evaluation.
+// Index 0: Default mode evaluation, winner mode processing is not
+// applicable (e.g. IntraBc).
+// Index 1: Mode evaluation.
+// Index 2: Winner mode evaluation.
+// Indices 1 and 2 are applicable when the
+// enable_winner_mode_for_use_tx_domain_dist speed feature is ON.
+static unsigned int
+    tx_domain_dist_types[TX_DOMAIN_DIST_LEVELS][MODE_EVAL_TYPES] = {
+      { 0, 2, 0 }, { 1, 2, 0 }, { 2, 2, 0 }, { 2, 2, 2 }
+    };
+
+// Threshold values to be used for disabling coeff RD-optimization
+// based on block MSE / qstep^2.
+// TODO(any): Experiment with the threshold logic based on a variance metric.
+// Each entry holds a dist threshold at innermost index 0 and a satd threshold
+// at innermost index 1.
+// For each row, the indices are as follows.
+// Index 0: Default mode evaluation, winner mode processing is not applicable
+// (e.g. IntraBc).
+// Index 1: Mode evaluation.
+// Index 2: Winner mode evaluation.
+// Indices 1 and 2 are applicable when the enable_winner_mode_for_coeff_opt
+// speed feature is ON.
+// There are 9 levels with increasing speed, mapping to vertical indices.
+static unsigned int coeff_opt_thresholds[9][MODE_EVAL_TYPES][2] = {
+  { { UINT_MAX, UINT_MAX }, { UINT_MAX, UINT_MAX }, { UINT_MAX, UINT_MAX } },
+  { { 3200, UINT_MAX }, { 250, UINT_MAX }, { UINT_MAX, UINT_MAX } },
+  { { 1728, UINT_MAX }, { 142, UINT_MAX }, { UINT_MAX, UINT_MAX } },
+  { { 864, UINT_MAX }, { 142, UINT_MAX }, { UINT_MAX, UINT_MAX } },
+  { { 432, UINT_MAX }, { 86, UINT_MAX }, { UINT_MAX, UINT_MAX } },
+  { { 864, 97 }, { 142, 16 }, { UINT_MAX, UINT_MAX } },
+  { { 432, 97 }, { 86, 16 }, { UINT_MAX, UINT_MAX } },
+  { { 216, 25 }, { 86, 10 }, { UINT_MAX, UINT_MAX } },
+  { { 216, 25 }, { 0, 10 }, { UINT_MAX, UINT_MAX } }
+};
+
+// Transform size to be used for default, mode and winner mode evaluation.
+// Index 0: Default mode evaluation, winner mode processing is not applicable
+// (e.g. IntraBc).
+// Index 1: Mode evaluation.
+// Index 2: Winner mode evaluation.
+// Indices 1 and 2 are applicable when the enable_winner_mode_for_tx_size_srch
+// speed feature is ON.
+static TX_SIZE_SEARCH_METHOD tx_size_search_methods[4][MODE_EVAL_TYPES] = {
+  { USE_FULL_RD, USE_LARGESTALL, USE_FULL_RD },
+  { USE_FAST_RD, USE_LARGESTALL, USE_FULL_RD },
+  { USE_LARGESTALL, USE_LARGESTALL, USE_FULL_RD },
+  { USE_LARGESTALL, USE_LARGESTALL, USE_LARGESTALL }
+};
+
+// Predict transform skip levels to be used for default, mode and winner mode
+// evaluation. Index 0: Default mode evaluation, winner mode processing is not
+// applicable. Index 1: Mode evaluation. Index 2: Winner mode evaluation.
+// Values indicate the aggressiveness of skip flag prediction.
+// 0 : no early skip prediction +// 1 : conservative early skip prediction using DCT_DCT +// 2 : early skip prediction based on SSE +static unsigned int predict_skip_levels[3][MODE_EVAL_TYPES] = { { 0, 0, 0 }, + { 1, 1, 1 }, + { 1, 2, 1 } }; + +// Predict skip or DC block level used during transform type search. It is +// indexed using the following: +// First index : Speed feature 'dc_blk_pred_level' (0 to 3) +// Second index : Mode evaluation type (DEFAULT_EVAL, MODE_EVAL and +// WINNER_MODE_EVAL). +// +// The values of predict_dc_levels[][] indicate the aggressiveness of predicting +// a block as transform skip or DC only. +// Type 0 : No skip block or DC only block prediction +// Type 1 : Prediction of skip block based on residual mean and variance +// Type 2 : Prediction of skip block or DC only block based on residual mean and +// variance +static unsigned int predict_dc_levels[4][MODE_EVAL_TYPES] = { + { 0, 0, 0 }, { 1, 1, 0 }, { 2, 2, 0 }, { 2, 2, 2 } +}; + +#if !CONFIG_FPMT_TEST +// This table holds the maximum number of reference frames for global motion. +// The table is indexed as per the speed feature 'gm_search_type'. +// 0 : All reference frames are allowed. +// 1 : All reference frames except L2 and L3 are allowed. +// 2 : All reference frames except L2, L3 and ARF2 are allowed. +// 3 : No reference frame is allowed. +static int gm_available_reference_frames[GM_DISABLE_SEARCH + 1] = { + INTER_REFS_PER_FRAME, INTER_REFS_PER_FRAME - 2, INTER_REFS_PER_FRAME - 3, 0 +}; +#endif + +// Qindex threshold levels used for selecting full-pel motion search. +// ms_qthresh[i][j][k] indicates the qindex boundary value for 'k'th qindex band +// for resolution index 'j' for aggressiveness level 'i'. +// Aggressiveness increases from i = 0 to 2. +// j = 0: lower than 720p resolution, j = 1: 720p or larger resolution. +// Currently invoked only for speed 0, 1 and 2. +static int ms_qindex_thresh[3][2][2] = { { { 200, 70 }, { MAXQ, 200 } }, + { { 170, 50 }, { MAXQ, 200 } }, + { { 170, 40 }, { 200, 40 } } }; + +// Full-pel search methods for aggressive search based on qindex. +// Index 0 is for resolutions lower than 720p, index 1 for 720p or larger +// resolutions. Currently invoked only for speed 1 and 2. +static SEARCH_METHODS motion_search_method[2] = { CLAMPED_DIAMOND, DIAMOND }; + +// Intra only frames, golden frames (except alt ref overlays) and +// alt ref frames tend to be coded at a higher than ambient quality +static int frame_is_boosted(const AV1_COMP *cpi) { + return frame_is_kf_gf_arf(cpi); +} + +// Set transform rd gate level for all transform search cases. 
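+// For example, set_txfm_rd_gate_level(sf->inter_sf.txfm_rd_gate_level, 2)
+// assigns gating level 2 to every entry of txfm_rd_gate_level[], whereas the
+// higher speed presets below also set individual entries such as
+// txfm_rd_gate_level[TX_SEARCH_DEFAULT] directly.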
+static AOM_INLINE void set_txfm_rd_gate_level( + int txfm_rd_gate_level[TX_SEARCH_CASES], int level) { + assert(level <= MAX_TX_RD_GATE_LEVEL); + for (int idx = 0; idx < TX_SEARCH_CASES; idx++) + txfm_rd_gate_level[idx] = level; +} + +static void set_allintra_speed_feature_framesize_dependent( + const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) { + const AV1_COMMON *const cm = &cpi->common; + const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480; + const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720; + const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080; + const int is_4k_or_larger = AOMMIN(cm->width, cm->height) >= 2160; + const bool use_hbd = cpi->oxcf.use_highbitdepth; + + if (is_480p_or_larger) { + sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128; + if (is_720p_or_larger) + sf->part_sf.auto_max_partition_based_on_simple_motion = ADAPT_PRED; + else + sf->part_sf.auto_max_partition_based_on_simple_motion = RELAXED_PRED; + } else { + sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64; + sf->part_sf.auto_max_partition_based_on_simple_motion = DIRECT_PRED; + if (use_hbd) sf->tx_sf.prune_tx_size_level = 1; + } + + if (is_4k_or_larger) { + sf->part_sf.default_min_partition_size = BLOCK_8X8; + } + + // TODO(huisu@google.com): train models for 720P and above. + if (!is_720p_or_larger) { + sf->part_sf.ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8 + sf->part_sf.ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16 + sf->part_sf.ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32 + sf->part_sf.ml_partition_search_breakout_thresh[3] = 500; // BLOCK_64X64 + sf->part_sf.ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128 + sf->part_sf.ml_early_term_after_part_split_level = 1; + } + + if (is_720p_or_larger) { + // TODO(chiyotsai@google.com): make this speed feature adaptive based on + // current block's vertical texture instead of hardcoded with resolution + sf->mv_sf.use_downsampled_sad = 2; + } + + if (speed >= 1) { + if (is_720p_or_larger) { + sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128; + } else if (is_480p_or_larger) { + sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64; + } else { + sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32; + } + + if (!is_720p_or_larger) { + sf->part_sf.ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8 + sf->part_sf.ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16 + sf->part_sf.ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32 + sf->part_sf.ml_partition_search_breakout_thresh[3] = 300; // BLOCK_64X64 + sf->part_sf.ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128 + } + sf->part_sf.ml_early_term_after_part_split_level = 2; + } + + if (speed >= 2) { + if (is_720p_or_larger) { + sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64; + } else if (is_480p_or_larger) { + sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32; + } else { + sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32; + } + + if (is_720p_or_larger) { + sf->part_sf.partition_search_breakout_dist_thr = (1 << 24); + sf->part_sf.partition_search_breakout_rate_thr = 120; + } else { + sf->part_sf.partition_search_breakout_dist_thr = (1 << 22); + sf->part_sf.partition_search_breakout_rate_thr = 100; + } + + if (is_480p_or_larger) { + sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 1; + if (use_hbd) sf->tx_sf.prune_tx_size_level = 2; + } else { + if (use_hbd) 
sf->tx_sf.prune_tx_size_level = 3; + } + } + + if (speed >= 3) { + sf->part_sf.ml_early_term_after_part_split_level = 0; + + if (is_720p_or_larger) { + sf->part_sf.partition_search_breakout_dist_thr = (1 << 25); + sf->part_sf.partition_search_breakout_rate_thr = 200; + } else { + sf->part_sf.max_intra_bsize = BLOCK_32X32; + sf->part_sf.partition_search_breakout_dist_thr = (1 << 23); + sf->part_sf.partition_search_breakout_rate_thr = 120; + } + if (use_hbd) sf->tx_sf.prune_tx_size_level = 3; + } + + if (speed >= 4) { + if (is_720p_or_larger) { + sf->part_sf.partition_search_breakout_dist_thr = (1 << 26); + } else { + sf->part_sf.partition_search_breakout_dist_thr = (1 << 24); + } + + if (is_480p_or_larger) { + sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 2; + } + } + + if (speed >= 6) { + if (is_720p_or_larger) { + sf->part_sf.auto_max_partition_based_on_simple_motion = NOT_IN_USE; + } else if (is_480p_or_larger) { + sf->part_sf.auto_max_partition_based_on_simple_motion = DIRECT_PRED; + } + + if (is_1080p_or_larger) { + sf->part_sf.default_min_partition_size = BLOCK_8X8; + } + + sf->part_sf.use_square_partition_only_threshold = BLOCK_16X16; + } + + if (speed >= 7) { + // TODO(kyslov): add more speed features to control speed/quality + } + + if (speed >= 8) { + if (!is_480p_or_larger) { + sf->rt_sf.nonrd_check_partition_merge_mode = 2; + } + if (is_720p_or_larger) { + sf->rt_sf.force_large_partition_blocks_intra = 1; + } + } + + if (speed >= 9) { + // TODO(kyslov): add more speed features to control speed/quality + if (!is_4k_or_larger) { + // In av1_select_sb_size(), superblock size is set to 64x64 only for + // resolutions less than 4k in speed>=9, to improve the multithread + // performance. If cost update levels are set to INTERNAL_COST_UPD_OFF + // for resolutions >= 4k, the SB size setting can be modified for these + // resolutions as well. + sf->inter_sf.coeff_cost_upd_level = INTERNAL_COST_UPD_OFF; + sf->inter_sf.mode_cost_upd_level = INTERNAL_COST_UPD_OFF; + } + } +} + +static void set_allintra_speed_features_framesize_independent( + const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) { + const AV1_COMMON *const cm = &cpi->common; + const int allow_screen_content_tools = + cm->features.allow_screen_content_tools; + const int use_hbd = cpi->oxcf.use_highbitdepth; + + sf->part_sf.less_rectangular_check_level = 1; + sf->part_sf.ml_prune_partition = 1; + sf->part_sf.prune_ext_partition_types_search_level = 1; + sf->part_sf.prune_part4_search = 2; + sf->part_sf.simple_motion_search_prune_rect = 1; + sf->part_sf.ml_predict_breakout_level = use_hbd ? 
1 : 3; + sf->part_sf.reuse_prev_rd_results_for_part_ab = 1; + sf->part_sf.use_best_rd_for_pruning = 1; + + sf->intra_sf.intra_pruning_with_hog = 1; + sf->intra_sf.prune_luma_palette_size_search_level = 1; + sf->intra_sf.dv_cost_upd_level = INTERNAL_COST_UPD_OFF; + sf->intra_sf.early_term_chroma_palette_size_search = 1; + + sf->tx_sf.adaptive_txb_search_level = 1; + sf->tx_sf.intra_tx_size_search_init_depth_sqr = 1; + sf->tx_sf.model_based_prune_tx_search_level = 1; + sf->tx_sf.tx_type_search.use_reduced_intra_txset = 1; + + sf->rt_sf.use_nonrd_pick_mode = 0; + sf->rt_sf.use_real_time_ref_set = 0; + + if (cpi->twopass_frame.fr_content_type == FC_GRAPHICS_ANIMATION || + cpi->use_screen_content_tools) { + sf->mv_sf.exhaustive_searches_thresh = (1 << 20); + } else { + sf->mv_sf.exhaustive_searches_thresh = (1 << 25); + } + + sf->rd_sf.perform_coeff_opt = 1; + sf->hl_sf.superres_auto_search_type = SUPERRES_AUTO_DUAL; + + if (speed >= 1) { + sf->part_sf.intra_cnn_based_part_prune_level = + allow_screen_content_tools ? 0 : 2; + sf->part_sf.simple_motion_search_early_term_none = 1; + // TODO(Venkat): Clean-up frame type dependency for + // simple_motion_search_split in partition search function and set the + // speed feature accordingly + sf->part_sf.simple_motion_search_split = allow_screen_content_tools ? 1 : 2; + sf->part_sf.ml_predict_breakout_level = use_hbd ? 2 : 3; + sf->part_sf.reuse_best_prediction_for_part_ab = 1; + + sf->mv_sf.exhaustive_searches_thresh <<= 1; + + sf->intra_sf.prune_palette_search_level = 1; + sf->intra_sf.prune_luma_palette_size_search_level = 2; + sf->intra_sf.top_intra_model_count_allowed = 3; + + sf->tx_sf.adaptive_txb_search_level = 2; + sf->tx_sf.inter_tx_size_search_init_depth_rect = 1; + sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1; + sf->tx_sf.intra_tx_size_search_init_depth_rect = 1; + sf->tx_sf.model_based_prune_tx_search_level = 0; + sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000; + sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_2; + sf->tx_sf.tx_type_search.skip_tx_search = 1; + + sf->rd_sf.perform_coeff_opt = 2; + sf->rd_sf.tx_domain_dist_level = 1; + sf->rd_sf.tx_domain_dist_thres_level = 1; + + sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL1; + sf->lpf_sf.dual_sgr_penalty_level = 1; + sf->lpf_sf.enable_sgr_ep_pruning = 1; + } + + if (speed >= 2) { + sf->mv_sf.auto_mv_step_size = 1; + + sf->intra_sf.disable_smooth_intra = 1; + sf->intra_sf.intra_pruning_with_hog = 2; + sf->intra_sf.prune_filter_intra_level = 1; + + sf->rd_sf.perform_coeff_opt = 3; + + sf->lpf_sf.prune_wiener_based_on_src_var = 1; + sf->lpf_sf.prune_sgr_based_on_wiener = 1; + } + + if (speed >= 3) { + sf->hl_sf.high_precision_mv_usage = CURRENT_Q; + sf->hl_sf.recode_loop = ALLOW_RECODE_KFARFGF; + + sf->part_sf.less_rectangular_check_level = 2; + sf->part_sf.simple_motion_search_prune_agg = SIMPLE_AGG_LVL1; + sf->part_sf.prune_ext_part_using_split_info = 1; + + sf->mv_sf.full_pixel_search_level = 1; + sf->mv_sf.search_method = DIAMOND; + + // TODO(chiyotsai@google.com): the thresholds chosen for intra hog are + // inherited directly from luma hog with some minor tweaking. Eventually we + // should run this with a bayesian optimizer to find the Pareto frontier. 
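+    // ("hog" in these feature names refers to the histogram of oriented
+    // gradients computed from the source block, which is used to prune
+    // unlikely directional intra modes.)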
+    sf->intra_sf.chroma_intra_pruning_with_hog = 2;
+    sf->intra_sf.intra_pruning_with_hog = 3;
+    sf->intra_sf.prune_palette_search_level = 2;
+
+    sf->tx_sf.adaptive_txb_search_level = 2;
+    sf->tx_sf.tx_type_search.use_skip_flag_prediction = 2;
+    sf->tx_sf.use_rd_based_breakout_for_intra_tx_search = true;
+
+    // TODO(any): evaluate if these lpf features can be moved to speed 2.
+    // For screen content, "prune_sgr_based_on_wiener = 2" causes large
+    // quality loss.
+    sf->lpf_sf.prune_sgr_based_on_wiener = allow_screen_content_tools ? 1 : 2;
+    sf->lpf_sf.disable_loop_restoration_chroma = 0;
+    sf->lpf_sf.reduce_wiener_window_size = 1;
+    sf->lpf_sf.prune_wiener_based_on_src_var = 2;
+  }
+
+  if (speed >= 4) {
+    sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
+
+    sf->part_sf.simple_motion_search_prune_agg = SIMPLE_AGG_LVL2;
+    sf->part_sf.simple_motion_search_reduce_search_steps = 4;
+    sf->part_sf.prune_ext_part_using_split_info = 2;
+    sf->part_sf.early_term_after_none_split = 1;
+    sf->part_sf.ml_predict_breakout_level = 3;
+
+    sf->intra_sf.prune_chroma_modes_using_luma_winner = 1;
+
+    sf->mv_sf.simple_motion_subpel_force_stop = HALF_PEL;
+
+    sf->tpl_sf.prune_starting_mv = 2;
+    sf->tpl_sf.subpel_force_stop = HALF_PEL;
+    sf->tpl_sf.search_method = FAST_BIGDIA;
+
+    sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 2;
+    sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1;
+    sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_3;
+    sf->tx_sf.tx_type_search.prune_tx_type_est_rd = 1;
+
+    sf->rd_sf.perform_coeff_opt = 5;
+    sf->rd_sf.tx_domain_dist_thres_level = 3;
+
+    sf->lpf_sf.lpf_pick = LPF_PICK_FROM_FULL_IMAGE_NON_DUAL;
+    sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL3;
+
+    sf->mv_sf.reduce_search_range = 1;
+
+    sf->winner_mode_sf.enable_winner_mode_for_coeff_opt = 1;
+    sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist = 1;
+    sf->winner_mode_sf.multi_winner_mode_type = MULTI_WINNER_MODE_DEFAULT;
+    sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = 1;
+  }
+
+  if (speed >= 5) {
+    sf->part_sf.simple_motion_search_prune_agg = SIMPLE_AGG_LVL3;
+    sf->part_sf.ext_partition_eval_thresh =
+        allow_screen_content_tools ? BLOCK_8X8 : BLOCK_16X16;
+    sf->part_sf.intra_cnn_based_part_prune_level =
+        allow_screen_content_tools ? 1 : 2;
+
+    sf->intra_sf.chroma_intra_pruning_with_hog = 3;
+
+    sf->lpf_sf.use_coarse_filter_level_search = 0;
+    // Disable Wiener and Self-guided Loop restoration filters.
+    sf->lpf_sf.disable_wiener_filter = true;
+    sf->lpf_sf.disable_sgr_filter = true;
+
+    sf->mv_sf.prune_mesh_search = PRUNE_MESH_SEARCH_LVL_2;
+
+    sf->winner_mode_sf.multi_winner_mode_type = MULTI_WINNER_MODE_FAST;
+  }
+
+  if (speed >= 6) {
+    sf->intra_sf.prune_smooth_intra_mode_for_chroma = 1;
+    sf->intra_sf.prune_filter_intra_level = 2;
+    sf->intra_sf.chroma_intra_pruning_with_hog = 4;
+    sf->intra_sf.intra_pruning_with_hog = 4;
+    sf->intra_sf.cfl_search_range = 1;
+    sf->intra_sf.top_intra_model_count_allowed = 2;
+    sf->intra_sf.adapt_top_model_rd_count_using_neighbors = 1;
+    sf->intra_sf.prune_luma_odd_delta_angles_in_intra = 1;
+
+    sf->part_sf.prune_rectangular_split_based_on_qidx =
+        allow_screen_content_tools ? 0 : 2;
+    sf->part_sf.prune_rect_part_using_4x4_var_deviation = true;
+    sf->part_sf.prune_rect_part_using_none_pred_mode = true;
+    sf->part_sf.prune_sub_8x8_partition_level =
+        allow_screen_content_tools ? 0 : 1;
+    sf->part_sf.prune_part4_search = 3;
+    // TODO(jingning): This might not be a good trade off if the
+    // target image quality is very low.
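+    // Note that capping default_max_partition_size at BLOCK_32X32 means no
+    // partition larger than 32x32 is evaluated at this speed level.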
+ sf->part_sf.default_max_partition_size = BLOCK_32X32; + + sf->mv_sf.use_bsize_dependent_search_method = 1; + + sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 3; + sf->tx_sf.tx_type_search.prune_tx_type_est_rd = 0; + sf->tx_sf.prune_intra_tx_depths_using_nn = true; + + sf->rd_sf.perform_coeff_opt = 6; + sf->rd_sf.tx_domain_dist_level = 3; + + sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL4; + sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q; + + sf->winner_mode_sf.multi_winner_mode_type = MULTI_WINNER_MODE_OFF; + sf->winner_mode_sf.prune_winner_mode_eval_level = 1; + sf->winner_mode_sf.dc_blk_pred_level = 1; + } + // The following should make all-intra mode speed 7 approximately equal + // to real-time speed 6, + // all-intra speed 8 close to real-time speed 7, and all-intra speed 9 + // close to real-time speed 8 + if (speed >= 7) { + sf->part_sf.default_min_partition_size = BLOCK_8X8; + sf->part_sf.partition_search_type = VAR_BASED_PARTITION; + sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q; + sf->rt_sf.mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH; + sf->rt_sf.var_part_split_threshold_shift = 7; + } + + if (speed >= 8) { + sf->rt_sf.hybrid_intra_pickmode = 1; + sf->rt_sf.use_nonrd_pick_mode = 1; + sf->rt_sf.nonrd_check_partition_merge_mode = 1; + sf->rt_sf.var_part_split_threshold_shift = 8; + // Set mask for intra modes. + for (int i = 0; i < BLOCK_SIZES; ++i) + if (i >= BLOCK_32X32) + sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC; + else + // Use DC, H, V intra mode for block sizes < 32X32. + sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC_H_V; + } + + if (speed >= 9) { + sf->inter_sf.coeff_cost_upd_level = INTERNAL_COST_UPD_SBROW; + sf->inter_sf.mode_cost_upd_level = INTERNAL_COST_UPD_SBROW; + + sf->rt_sf.nonrd_check_partition_merge_mode = 0; + sf->rt_sf.hybrid_intra_pickmode = 0; + sf->rt_sf.var_part_split_threshold_shift = 9; + sf->rt_sf.vbp_prune_16x16_split_using_min_max_sub_blk_var = true; + sf->rt_sf.prune_h_pred_using_best_mode_so_far = true; + sf->rt_sf.enable_intra_mode_pruning_using_neighbors = true; + sf->rt_sf.prune_intra_mode_using_best_sad_so_far = true; + } + + // As the speed feature prune_chroma_modes_using_luma_winner already + // constrains the number of chroma directional mode evaluations to a maximum + // of 1, the HOG computation and the associated pruning logic does not seem to + // help speed-up the chroma mode evaluations. Hence disable the speed feature + // chroma_intra_pruning_with_hog when prune_chroma_modes_using_luma_winner is + // enabled. + if (sf->intra_sf.prune_chroma_modes_using_luma_winner) + sf->intra_sf.chroma_intra_pruning_with_hog = 0; +} + +static void set_good_speed_feature_framesize_dependent( + const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) { + const AV1_COMMON *const cm = &cpi->common; + const int is_480p_or_lesser = AOMMIN(cm->width, cm->height) <= 480; + const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480; + const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720; + const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080; + const int is_4k_or_larger = AOMMIN(cm->width, cm->height) >= 2160; + const bool use_hbd = cpi->oxcf.use_highbitdepth; + // Speed features applicable for temporal filtering and tpl modules may be + // changed based on frame type at places where the sf is applied (Example : + // use_downsampled_sad). This is because temporal filtering and tpl modules + // are called before this function (except for the first key frame). 
+ // TODO(deepa.kg@ittiam.com): For the speed features applicable to temporal + // filtering and tpl modules, modify the sf initialization appropriately + // before calling the modules. + const int boosted = frame_is_boosted(cpi); + const int is_boosted_arf2_bwd_type = + boosted || + cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE; + const int is_lf_frame = + cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == LF_UPDATE; + const int allow_screen_content_tools = + cm->features.allow_screen_content_tools; + + if (is_480p_or_larger) { + sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128; + if (is_720p_or_larger) + sf->part_sf.auto_max_partition_based_on_simple_motion = ADAPT_PRED; + else + sf->part_sf.auto_max_partition_based_on_simple_motion = RELAXED_PRED; + } else { + sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64; + sf->part_sf.auto_max_partition_based_on_simple_motion = DIRECT_PRED; + if (use_hbd) sf->tx_sf.prune_tx_size_level = 1; + } + + if (is_4k_or_larger) { + sf->part_sf.default_min_partition_size = BLOCK_8X8; + } + + // TODO(huisu@google.com): train models for 720P and above. + if (!is_720p_or_larger) { + sf->part_sf.ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8 + sf->part_sf.ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16 + sf->part_sf.ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32 + sf->part_sf.ml_partition_search_breakout_thresh[3] = 500; // BLOCK_64X64 + sf->part_sf.ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128 + sf->part_sf.ml_early_term_after_part_split_level = 1; + } + + if (is_720p_or_larger) { + // TODO(chiyotsai@google.com): make this speed feature adaptive based on + // current block's vertical texture instead of hardcoded with resolution + sf->mv_sf.use_downsampled_sad = 2; + } + + if (!is_720p_or_larger) { + const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg; + const int rate_tolerance = + AOMMIN(rc_cfg->under_shoot_pct, rc_cfg->over_shoot_pct); + sf->hl_sf.recode_tolerance = 25 + (rate_tolerance >> 2); + } + + if (speed >= 1) { + if (is_480p_or_lesser) sf->inter_sf.skip_newmv_in_drl = 1; + + if (is_720p_or_larger) { + sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128; + } else if (is_480p_or_larger) { + sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64; + } else { + sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32; + } + + if (!is_720p_or_larger) { + sf->part_sf.ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8 + sf->part_sf.ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16 + sf->part_sf.ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32 + sf->part_sf.ml_partition_search_breakout_thresh[3] = 300; // BLOCK_64X64 + sf->part_sf.ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128 + } + sf->part_sf.ml_early_term_after_part_split_level = 2; + + sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL1; + } + + if (speed >= 2) { + if (is_720p_or_larger) { + sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64; + } else if (is_480p_or_larger) { + sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32; + } else { + sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32; + } + + if (is_720p_or_larger) { + sf->part_sf.partition_search_breakout_dist_thr = (1 << 24); + sf->part_sf.partition_search_breakout_rate_thr = 120; + } else { + sf->part_sf.partition_search_breakout_dist_thr = (1 << 22); + sf->part_sf.partition_search_breakout_rate_thr = 100; + } + + if 
(is_720p_or_larger) { + sf->inter_sf.prune_obmc_prob_thresh = 16; + } else { + sf->inter_sf.prune_obmc_prob_thresh = 8; + } + + if (is_480p_or_larger) { + sf->inter_sf.disable_interintra_wedge_var_thresh = 100; + } else { + sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX; + } + + if (is_480p_or_lesser) sf->inter_sf.skip_ext_comp_nearmv_mode = 1; + + if (is_720p_or_larger) { + sf->inter_sf.limit_inter_mode_cands = is_lf_frame ? 1 : 0; + } else { + sf->inter_sf.limit_inter_mode_cands = is_lf_frame ? 2 : 0; + } + + if (is_480p_or_larger) { + sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 1; + if (use_hbd) sf->tx_sf.prune_tx_size_level = 2; + } else { + if (use_hbd) sf->tx_sf.prune_tx_size_level = 3; + sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = boosted ? 0 : 1; + sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = boosted ? 0 : 1; + } + + if (!is_720p_or_larger) { + sf->mv_sf.disable_second_mv = 1; + sf->mv_sf.auto_mv_step_size = 2; + } else { + sf->mv_sf.disable_second_mv = boosted ? 0 : 2; + sf->mv_sf.auto_mv_step_size = 1; + } + + if (!is_720p_or_larger) { + sf->hl_sf.recode_tolerance = 50; + sf->inter_sf.disable_interinter_wedge_newmv_search = + is_boosted_arf2_bwd_type ? 0 : 1; + sf->inter_sf.enable_fast_wedge_mask_search = 1; + } + } + + if (speed >= 3) { + sf->inter_sf.enable_fast_wedge_mask_search = 1; + sf->inter_sf.skip_newmv_in_drl = 2; + sf->inter_sf.skip_ext_comp_nearmv_mode = 1; + sf->inter_sf.limit_inter_mode_cands = is_lf_frame ? 3 : 0; + sf->inter_sf.disable_interinter_wedge_newmv_search = boosted ? 0 : 1; + sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 1; + sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = + frame_is_intra_only(&cpi->common) ? 0 : 1; + + sf->part_sf.ml_early_term_after_part_split_level = 0; + + if (is_720p_or_larger) { + sf->part_sf.partition_search_breakout_dist_thr = (1 << 25); + sf->part_sf.partition_search_breakout_rate_thr = 200; + sf->part_sf.skip_non_sq_part_based_on_none = is_lf_frame ? 2 : 0; + } else { + sf->part_sf.max_intra_bsize = BLOCK_32X32; + sf->part_sf.partition_search_breakout_dist_thr = (1 << 23); + sf->part_sf.partition_search_breakout_rate_thr = 120; + sf->part_sf.skip_non_sq_part_based_on_none = is_lf_frame ? 1 : 0; + } + if (use_hbd) sf->tx_sf.prune_tx_size_level = 3; + + if (is_480p_or_larger) { + sf->part_sf.early_term_after_none_split = 1; + } else { + sf->part_sf.early_term_after_none_split = 0; + } + if (is_720p_or_larger) { + sf->intra_sf.skip_intra_in_interframe = boosted ? 1 : 2; + } else { + sf->intra_sf.skip_intra_in_interframe = boosted ? 1 : 3; + } + + if (is_720p_or_larger) { + sf->inter_sf.disable_interinter_wedge_var_thresh = 100; + sf->inter_sf.limit_txfm_eval_per_mode = boosted ? 0 : 1; + } else { + sf->inter_sf.disable_interinter_wedge_var_thresh = UINT_MAX; + sf->inter_sf.limit_txfm_eval_per_mode = boosted ? 0 : 2; + sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL2; + } + + sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX; + } + + if (speed >= 4) { + sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 2; + sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = 1; + if (is_720p_or_larger) { + sf->part_sf.partition_search_breakout_dist_thr = (1 << 26); + } else { + sf->part_sf.partition_search_breakout_dist_thr = (1 << 24); + } + sf->part_sf.early_term_after_none_split = 1; + + if (is_480p_or_larger) { + sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 2; + } else { + sf->mv_sf.skip_fullpel_search_using_startmv = boosted ? 
0 : 1; + } + + sf->inter_sf.disable_interinter_wedge_var_thresh = UINT_MAX; + sf->inter_sf.prune_obmc_prob_thresh = INT_MAX; + sf->inter_sf.limit_txfm_eval_per_mode = boosted ? 0 : 2; + if (is_480p_or_lesser) sf->inter_sf.skip_newmv_in_drl = 3; + + if (is_720p_or_larger) { + sf->inter_sf.prune_comp_ref_frames = 1; + } else if (is_480p_or_larger) { + sf->inter_sf.prune_comp_ref_frames = is_boosted_arf2_bwd_type ? 0 : 1; + } + + if (is_720p_or_larger) + sf->hl_sf.recode_tolerance = 32; + else + sf->hl_sf.recode_tolerance = 55; + + sf->intra_sf.skip_intra_in_interframe = 4; + + sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL3; + } + + if (speed >= 5) { + if (is_720p_or_larger) { + sf->inter_sf.prune_warped_prob_thresh = 16; + } else if (is_480p_or_larger) { + sf->inter_sf.prune_warped_prob_thresh = 8; + } + if (is_720p_or_larger) sf->hl_sf.recode_tolerance = 40; + + sf->inter_sf.skip_newmv_in_drl = 4; + sf->inter_sf.prune_comp_ref_frames = 1; + sf->mv_sf.skip_fullpel_search_using_startmv = boosted ? 0 : 1; + + if (!is_720p_or_larger) { + sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW_SET; + sf->inter_sf.prune_nearest_near_mv_using_refmv_weight = + (boosted || allow_screen_content_tools) ? 0 : 1; + sf->mv_sf.use_downsampled_sad = 1; + } + + if (!is_480p_or_larger) { + sf->part_sf.partition_search_breakout_dist_thr = (1 << 26); + } + + if (is_480p_or_lesser) { + sf->inter_sf.prune_nearmv_using_neighbors = PRUNE_NEARMV_LEVEL1; + } else { + sf->inter_sf.prune_nearmv_using_neighbors = PRUNE_NEARMV_LEVEL2; + } + + if (is_720p_or_larger) + sf->part_sf.ext_part_eval_based_on_cur_best = + (allow_screen_content_tools || frame_is_intra_only(cm)) ? 0 : 1; + + if (is_480p_or_larger) { + sf->tpl_sf.reduce_num_frames = 1; + } + } + + if (speed >= 6) { + sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 4; + sf->inter_sf.prune_nearmv_using_neighbors = PRUNE_NEARMV_LEVEL3; + sf->inter_sf.prune_comp_ref_frames = 2; + sf->inter_sf.prune_nearest_near_mv_using_refmv_weight = + (boosted || allow_screen_content_tools) ? 0 : 1; + sf->mv_sf.skip_fullpel_search_using_startmv = boosted ? 0 : 2; + + if (is_720p_or_larger) { + sf->part_sf.auto_max_partition_based_on_simple_motion = NOT_IN_USE; + } else if (is_480p_or_larger) { + sf->part_sf.auto_max_partition_based_on_simple_motion = DIRECT_PRED; + } + + if (is_480p_or_larger) { + sf->hl_sf.allow_sub_blk_me_in_tf = 1; + } + + if (is_1080p_or_larger) { + sf->part_sf.default_min_partition_size = BLOCK_8X8; + } + + if (is_720p_or_larger) { + sf->inter_sf.disable_masked_comp = 1; + } + + if (!is_720p_or_larger) { + sf->inter_sf.coeff_cost_upd_level = INTERNAL_COST_UPD_SBROW; + sf->inter_sf.mode_cost_upd_level = INTERNAL_COST_UPD_SBROW; + } + + if (is_720p_or_larger) { + sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32; + sf->part_sf.partition_search_breakout_dist_thr = (1 << 28); + } else { + sf->part_sf.use_square_partition_only_threshold = BLOCK_16X16; + sf->part_sf.partition_search_breakout_dist_thr = (1 << 26); + } + + if (is_720p_or_larger) { + sf->inter_sf.prune_ref_mv_idx_search = 2; + } else { + sf->inter_sf.prune_ref_mv_idx_search = 1; + } + + if (!is_720p_or_larger) { + sf->tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh = + is_boosted_arf2_bwd_type ? 
450 : 150; + } + + sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL4; + + sf->hl_sf.recode_tolerance = 55; + } +} + +static void set_good_speed_features_framesize_independent( + const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) { + const AV1_COMMON *const cm = &cpi->common; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const int boosted = frame_is_boosted(cpi); + const int is_boosted_arf2_bwd_type = + boosted || gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE; + const int is_inter_frame = + gf_group->frame_type[cpi->gf_frame_index] == INTER_FRAME; + const int allow_screen_content_tools = + cm->features.allow_screen_content_tools; + const int use_hbd = cpi->oxcf.use_highbitdepth; + if (!cpi->oxcf.tile_cfg.enable_large_scale_tile) { + sf->hl_sf.high_precision_mv_usage = LAST_MV_DATA; + } + + // Speed 0 for all speed features that give neutral coding performance change. + sf->gm_sf.gm_search_type = boosted ? GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2 + : GM_SEARCH_CLOSEST_REFS_ONLY; + sf->gm_sf.prune_ref_frame_for_gm_search = boosted ? 0 : 1; + sf->gm_sf.disable_gm_search_based_on_stats = 1; + + sf->part_sf.less_rectangular_check_level = 1; + sf->part_sf.ml_prune_partition = 1; + sf->part_sf.prune_ext_partition_types_search_level = 1; + sf->part_sf.prune_part4_search = 2; + sf->part_sf.simple_motion_search_prune_rect = 1; + sf->part_sf.ml_predict_breakout_level = use_hbd ? 1 : 3; + sf->part_sf.reuse_prev_rd_results_for_part_ab = 1; + sf->part_sf.use_best_rd_for_pruning = 1; + sf->part_sf.simple_motion_search_prune_agg = + allow_screen_content_tools ? NO_PRUNING : SIMPLE_AGG_LVL0; + + // TODO(debargha): Test, tweak and turn on either 1 or 2 + sf->inter_sf.inter_mode_rd_model_estimation = 1; + sf->inter_sf.model_based_post_interp_filter_breakout = 1; + sf->inter_sf.prune_compound_using_single_ref = 1; + sf->inter_sf.prune_mode_search_simple_translation = 1; + sf->inter_sf.prune_ref_frame_for_rect_partitions = + (boosted || (allow_screen_content_tools)) + ? 0 + : (is_boosted_arf2_bwd_type ? 1 : 2); + sf->inter_sf.reduce_inter_modes = boosted ? 1 : 2; + sf->inter_sf.selective_ref_frame = 1; + sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_SKIP_MV_SEARCH; + + sf->interp_sf.use_fast_interpolation_filter_search = 1; + + sf->intra_sf.intra_pruning_with_hog = 1; + + sf->tx_sf.adaptive_txb_search_level = 1; + sf->tx_sf.intra_tx_size_search_init_depth_sqr = 1; + sf->tx_sf.model_based_prune_tx_search_level = 1; + sf->tx_sf.tx_type_search.use_reduced_intra_txset = 1; + + sf->tpl_sf.search_method = NSTEP_8PT; + + sf->rt_sf.use_nonrd_pick_mode = 0; + sf->rt_sf.use_real_time_ref_set = 0; + + if (cpi->twopass_frame.fr_content_type == FC_GRAPHICS_ANIMATION || + cpi->use_screen_content_tools) { + sf->mv_sf.exhaustive_searches_thresh = (1 << 20); + } else { + sf->mv_sf.exhaustive_searches_thresh = (1 << 25); + } + + sf->rd_sf.perform_coeff_opt = 1; + sf->hl_sf.superres_auto_search_type = SUPERRES_AUTO_DUAL; + + if (speed >= 1) { + sf->hl_sf.adjust_num_frames_for_arf_filtering = + allow_screen_content_tools ? 0 : 1; + + sf->part_sf.intra_cnn_based_part_prune_level = + allow_screen_content_tools ? 0 : 2; + sf->part_sf.simple_motion_search_early_term_none = 1; + // TODO(Venkat): Clean-up frame type dependency for + // simple_motion_search_split in partition search function and set the + // speed feature accordingly + sf->part_sf.simple_motion_search_split = allow_screen_content_tools ? 1 : 2; + sf->part_sf.ml_predict_breakout_level = use_hbd ? 
2 : 3; + + sf->mv_sf.exhaustive_searches_thresh <<= 1; + sf->mv_sf.obmc_full_pixel_search_level = 1; + sf->mv_sf.use_accurate_subpel_search = USE_4_TAPS; + sf->mv_sf.disable_extensive_joint_motion_search = 1; + + sf->inter_sf.prune_comp_search_by_single_result = boosted ? 2 : 1; + sf->inter_sf.prune_comp_type_by_comp_avg = 1; + sf->inter_sf.prune_comp_type_by_model_rd = boosted ? 0 : 1; + sf->inter_sf.prune_ref_frame_for_rect_partitions = + (frame_is_intra_only(&cpi->common) || (allow_screen_content_tools)) + ? 0 + : (boosted ? 1 : 2); + sf->inter_sf.reduce_inter_modes = boosted ? 1 : 3; + sf->inter_sf.reuse_inter_intra_mode = 1; + sf->inter_sf.selective_ref_frame = 2; + sf->inter_sf.skip_arf_compound = 1; + + sf->interp_sf.use_interp_filter = 1; + + sf->intra_sf.prune_palette_search_level = 1; + + sf->tx_sf.adaptive_txb_search_level = 2; + sf->tx_sf.inter_tx_size_search_init_depth_rect = 1; + sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1; + sf->tx_sf.intra_tx_size_search_init_depth_rect = 1; + sf->tx_sf.model_based_prune_tx_search_level = 0; + sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000; + sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_2; + sf->tx_sf.tx_type_search.skip_tx_search = 1; + + sf->rd_sf.perform_coeff_opt = boosted ? 2 : 3; + sf->rd_sf.tx_domain_dist_level = boosted ? 1 : 2; + sf->rd_sf.tx_domain_dist_thres_level = 1; + + sf->lpf_sf.dual_sgr_penalty_level = 1; + sf->lpf_sf.enable_sgr_ep_pruning = 1; + + // TODO(any, yunqing): move this feature to speed 0. + sf->tpl_sf.skip_alike_starting_mv = 1; + } + + if (speed >= 2) { + sf->hl_sf.recode_loop = ALLOW_RECODE_KFARFGF; + + sf->fp_sf.skip_motion_search_threshold = 25; + + sf->gm_sf.num_refinement_steps = 2; + + sf->part_sf.reuse_best_prediction_for_part_ab = + !frame_is_intra_only(&cpi->common); + + sf->mv_sf.simple_motion_subpel_force_stop = QUARTER_PEL; + sf->mv_sf.subpel_iters_per_step = 1; + sf->mv_sf.reduce_search_range = 1; + + // TODO(chiyotsai@google.com): We can get 10% speed up if we move + // adaptive_rd_thresh to speed 1. But currently it performs poorly on some + // clips (e.g. 5% loss on dinner_1080p). We need to examine the sequence a + // bit more closely to figure out why. + sf->inter_sf.adaptive_rd_thresh = 1; + sf->inter_sf.disable_interinter_wedge_var_thresh = 100; + sf->inter_sf.fast_interintra_wedge_search = 1; + sf->inter_sf.prune_comp_search_by_single_result = boosted ? 4 : 1; + sf->inter_sf.prune_ext_comp_using_neighbors = 1; + sf->inter_sf.prune_comp_using_best_single_mode_ref = 2; + sf->inter_sf.prune_comp_type_by_comp_avg = 2; + sf->inter_sf.selective_ref_frame = 3; + sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_DISABLED; + sf->inter_sf.enable_fast_compound_mode_search = 1; + sf->inter_sf.reuse_mask_search_results = 1; + set_txfm_rd_gate_level(sf->inter_sf.txfm_rd_gate_level, boosted ? 0 : 1); + sf->inter_sf.inter_mode_txfm_breakout = boosted ? 0 : 1; + sf->inter_sf.alt_ref_search_fp = 1; + + sf->interp_sf.adaptive_interp_filter_search = 1; + sf->interp_sf.disable_dual_filter = 1; + + sf->intra_sf.disable_smooth_intra = + !frame_is_intra_only(&cpi->common) || (cpi->rc.frames_to_key > 1); + sf->intra_sf.intra_pruning_with_hog = 2; + sf->intra_sf.skip_intra_in_interframe = is_inter_frame ? 2 : 1; + sf->intra_sf.skip_filter_intra_in_inter_frames = 1; + + sf->tpl_sf.prune_starting_mv = 1; + sf->tpl_sf.search_method = DIAMOND; + + sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 
3 : 4; + sf->rd_sf.use_mb_rd_hash = 1; + + sf->lpf_sf.prune_wiener_based_on_src_var = 1; + sf->lpf_sf.prune_sgr_based_on_wiener = 1; + sf->lpf_sf.disable_loop_restoration_chroma = boosted ? 0 : 1; + sf->lpf_sf.reduce_wiener_window_size = boosted ? 0 : 1; + + // TODO(any): Re-evaluate setting this feature to 1 at speed 2. + sf->tpl_sf.allow_compound_pred = 0; + sf->tpl_sf.prune_ref_frames_in_tpl = 1; + } + + if (speed >= 3) { + sf->hl_sf.high_precision_mv_usage = CURRENT_Q; + + sf->gm_sf.prune_ref_frame_for_gm_search = 1; + sf->gm_sf.prune_zero_mv_with_sse = 1; + sf->gm_sf.num_refinement_steps = 0; + + sf->part_sf.less_rectangular_check_level = 2; + sf->part_sf.simple_motion_search_prune_agg = + allow_screen_content_tools + ? SIMPLE_AGG_LVL0 + : (boosted ? SIMPLE_AGG_LVL1 : QIDX_BASED_AGG_LVL1); + sf->part_sf.prune_ext_part_using_split_info = 1; + sf->part_sf.simple_motion_search_rect_split = 1; + + sf->mv_sf.full_pixel_search_level = 1; + sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED; + sf->mv_sf.search_method = DIAMOND; + sf->mv_sf.disable_second_mv = 2; + sf->mv_sf.prune_mesh_search = PRUNE_MESH_SEARCH_LVL_1; + sf->mv_sf.use_intrabc = 0; + + sf->inter_sf.disable_interinter_wedge_newmv_search = boosted ? 0 : 1; + sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW; + sf->inter_sf.disable_onesided_comp = 1; + sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX; + // TODO(any): Experiment with the early exit mechanism for speeds 0, 1 and 2 + // and clean up the speed feature + sf->inter_sf.perform_best_rd_based_gating_for_chroma = 1; + sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 1; + sf->inter_sf.prune_comp_search_by_single_result = boosted ? 4 : 2; + sf->inter_sf.selective_ref_frame = 5; + sf->inter_sf.reuse_compound_type_decision = 1; + set_txfm_rd_gate_level(sf->inter_sf.txfm_rd_gate_level, + boosted ? 0 : (is_boosted_arf2_bwd_type ? 1 : 2)); + sf->inter_sf.inter_mode_txfm_breakout = boosted ? 0 : 2; + + sf->interp_sf.adaptive_interp_filter_search = 2; + + // TODO(chiyotsai@google.com): the thresholds chosen for intra hog are + // inherited directly from luma hog with some minor tweaking. Eventually we + // should run this with a Bayesian optimizer to find the Pareto frontier. + sf->intra_sf.chroma_intra_pruning_with_hog = 2; + sf->intra_sf.intra_pruning_with_hog = 3; + sf->intra_sf.prune_palette_search_level = 2; + sf->intra_sf.top_intra_model_count_allowed = 2; + + sf->tpl_sf.prune_starting_mv = 2; + sf->tpl_sf.skip_alike_starting_mv = 2; + sf->tpl_sf.prune_intra_modes = 1; + sf->tpl_sf.reduce_first_step_size = 6; + sf->tpl_sf.subpel_force_stop = QUARTER_PEL; + sf->tpl_sf.gop_length_decision_method = 1; + + sf->tx_sf.adaptive_txb_search_level = boosted ? 2 : 3; + sf->tx_sf.tx_type_search.use_skip_flag_prediction = 2; + sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_3; + + // TODO(any): Refactor the code related to the following winner mode speed + // features + sf->winner_mode_sf.enable_winner_mode_for_coeff_opt = 1; + sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist = 1; + sf->winner_mode_sf.motion_mode_for_winner_cand = + boosted ? 0 + : gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE ? 1 + : 2; + sf->winner_mode_sf.prune_winner_mode_eval_level = boosted ? 0 : 4; + + // For screen content, "prune_sgr_based_on_wiener = 2" causes a large + // quality loss. + sf->lpf_sf.prune_sgr_based_on_wiener = allow_screen_content_tools ?
1 : 2; + sf->lpf_sf.prune_wiener_based_on_src_var = 2; + sf->lpf_sf.use_coarse_filter_level_search = + frame_is_intra_only(&cpi->common) ? 0 : 1; + sf->lpf_sf.use_downsampled_wiener_stats = 1; + } + + if (speed >= 4) { + sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE; + + sf->gm_sf.prune_zero_mv_with_sse = 2; + + sf->part_sf.simple_motion_search_prune_agg = + allow_screen_content_tools ? SIMPLE_AGG_LVL0 : SIMPLE_AGG_LVL2; + sf->part_sf.simple_motion_search_reduce_search_steps = 4; + sf->part_sf.prune_ext_part_using_split_info = 2; + sf->part_sf.ml_predict_breakout_level = 3; + sf->part_sf.prune_rectangular_split_based_on_qidx = + (allow_screen_content_tools || frame_is_intra_only(&cpi->common)) ? 0 + : 1; + + sf->inter_sf.alt_ref_search_fp = 2; + sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_DEFAULT] = boosted ? 0 : 3; + sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_MOTION_MODE] = boosted ? 0 : 5; + sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_COMP_TYPE_MODE] = boosted ? 0 : 3; + + sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 2; + sf->inter_sf.prune_ext_comp_using_neighbors = 2; + sf->inter_sf.prune_obmc_prob_thresh = INT_MAX; + sf->inter_sf.disable_interinter_wedge_var_thresh = UINT_MAX; + + sf->interp_sf.cb_pred_filter_search = 1; + sf->interp_sf.skip_sharp_interp_filter_search = 1; + sf->interp_sf.use_interp_filter = 2; + + sf->intra_sf.intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V_CFL; + sf->intra_sf.intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V_CFL; + sf->intra_sf.intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC_H_V_CFL; + // TODO(any): "intra_y_mode_mask" doesn't help much at speed 4. + // sf->intra_sf.intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; + // sf->intra_sf.intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; + // sf->intra_sf.intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V; + sf->intra_sf.skip_intra_in_interframe = 4; + + sf->mv_sf.simple_motion_subpel_force_stop = HALF_PEL; + sf->mv_sf.prune_mesh_search = PRUNE_MESH_SEARCH_LVL_2; + + sf->tpl_sf.subpel_force_stop = HALF_PEL; + sf->tpl_sf.search_method = FAST_BIGDIA; + sf->tpl_sf.use_sad_for_mode_decision = 1; + + sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1; + + sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 5 : 7; + + // TODO(any): Extend multi-winner mode processing support for inter frames + sf->winner_mode_sf.multi_winner_mode_type = + frame_is_intra_only(&cpi->common) ? MULTI_WINNER_MODE_DEFAULT + : MULTI_WINNER_MODE_OFF; + sf->winner_mode_sf.dc_blk_pred_level = boosted ? 0 : 2; + + sf->lpf_sf.lpf_pick = LPF_PICK_FROM_FULL_IMAGE_NON_DUAL; + } + + if (speed >= 5) { + sf->hl_sf.weight_calc_level_in_tf = 1; + sf->hl_sf.adjust_num_frames_for_arf_filtering = + allow_screen_content_tools ? 0 : 2; + + sf->fp_sf.reduce_mv_step_param = 4; + + sf->gm_sf.gm_search_type = GM_DISABLE_SEARCH; + + sf->part_sf.simple_motion_search_prune_agg = + allow_screen_content_tools ? SIMPLE_AGG_LVL0 : SIMPLE_AGG_LVL3; + sf->part_sf.ext_partition_eval_thresh = + allow_screen_content_tools ? BLOCK_8X8 : BLOCK_16X16; + sf->part_sf.prune_sub_8x8_partition_level = + allow_screen_content_tools ? 1 : 2; + + sf->mv_sf.warp_search_method = WARP_SEARCH_DIAMOND; + + sf->inter_sf.prune_inter_modes_if_skippable = 1; + sf->inter_sf.prune_single_ref = is_boosted_arf2_bwd_type ? 0 : 1; + sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_DEFAULT] = boosted ? 0 : 4; + sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_COMP_TYPE_MODE] = boosted ? 0 : 5; + sf->inter_sf.enable_fast_compound_mode_search = 2; + + sf->interp_sf.skip_interp_filter_search = boosted ? 
0 : 1; + + sf->intra_sf.chroma_intra_pruning_with_hog = 3; + + // TODO(any): Extend multi-winner mode processing support for inter frames + sf->winner_mode_sf.multi_winner_mode_type = + frame_is_intra_only(&cpi->common) ? MULTI_WINNER_MODE_FAST + : MULTI_WINNER_MODE_OFF; + + // Disable Self-guided Loop restoration filter. + sf->lpf_sf.disable_sgr_filter = true; + sf->lpf_sf.disable_wiener_coeff_refine_search = true; + + sf->tpl_sf.prune_starting_mv = 3; + sf->tpl_sf.use_y_only_rate_distortion = 1; + sf->tpl_sf.subpel_force_stop = FULL_PEL; + sf->tpl_sf.gop_length_decision_method = 2; + sf->tpl_sf.use_sad_for_mode_decision = 2; + + sf->winner_mode_sf.dc_blk_pred_level = 2; + + sf->fp_sf.disable_recon = 1; + } + + if (speed >= 6) { + sf->hl_sf.disable_extra_sc_testing = 1; + sf->hl_sf.second_alt_ref_filtering = 0; + + sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 3; + sf->inter_sf.selective_ref_frame = 6; + sf->inter_sf.prune_single_ref = is_boosted_arf2_bwd_type ? 0 : 2; + sf->inter_sf.prune_ext_comp_using_neighbors = 3; + + sf->intra_sf.chroma_intra_pruning_with_hog = 4; + sf->intra_sf.intra_pruning_with_hog = 4; + sf->intra_sf.intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC; + sf->intra_sf.intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC; + sf->intra_sf.intra_y_mode_mask[TX_32X32] = INTRA_DC; + sf->intra_sf.intra_y_mode_mask[TX_64X64] = INTRA_DC; + sf->intra_sf.early_term_chroma_palette_size_search = 1; + + sf->part_sf.prune_rectangular_split_based_on_qidx = + boosted || allow_screen_content_tools ? 0 : 2; + + sf->part_sf.prune_part4_search = 3; + + sf->mv_sf.simple_motion_subpel_force_stop = FULL_PEL; + sf->mv_sf.use_bsize_dependent_search_method = 1; + + sf->tpl_sf.gop_length_decision_method = 3; + + sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 6 : 8; + + sf->winner_mode_sf.dc_blk_pred_level = 3; + sf->winner_mode_sf.multi_winner_mode_type = MULTI_WINNER_MODE_OFF; + + sf->fp_sf.skip_zeromv_motion_search = 1; + } +} + +static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi, + SPEED_FEATURES *const sf, + int speed) { + const AV1_COMMON *const cm = &cpi->common; + const int boosted = frame_is_boosted(cpi); + const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080; + const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720; + const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480; + const int is_360p_or_larger = AOMMIN(cm->width, cm->height) >= 360; + + if (!is_360p_or_larger) { + sf->rt_sf.prune_intra_mode_based_on_mv_range = 1; + sf->rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad = 1; + if (speed >= 6) + sf->winner_mode_sf.prune_winner_mode_eval_level = boosted ? 0 : 2; + if (speed == 7) sf->rt_sf.prefer_large_partition_blocks = 2; + if (speed >= 7) { + sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q; + sf->rt_sf.check_only_zero_zeromv_on_large_blocks = true; + sf->rt_sf.use_rtc_tf = 2; + } + if (speed == 8) sf->rt_sf.prefer_large_partition_blocks = 1; + if (speed >= 8) { + sf->rt_sf.use_nonrd_filter_search = 1; + sf->rt_sf.tx_size_level_based_on_qstep = 1; + } + if (speed >= 9) { + sf->rt_sf.use_comp_ref_nonrd = 0; + sf->rt_sf.nonrd_aggressive_skip = 1; + sf->rt_sf.skip_intra_pred = 1; + // Only turn on enable_ref_short_signaling for low resolution when only + // LAST and GOLDEN ref frames are used. 
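+ // The check below mirrors this: short signaling is enabled only when the + // non-rd ALTREF is off and no compound pair other than + // ref_frame_comp_nonrd[0] (the LAST+GOLDEN pair) is in use.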
+ sf->rt_sf.enable_ref_short_signaling = + (!sf->rt_sf.use_nonrd_altref_frame && + (!sf->rt_sf.use_comp_ref_nonrd || + (!sf->rt_sf.ref_frame_comp_nonrd[1] && + !sf->rt_sf.ref_frame_comp_nonrd[2]))); + +// TODO(kyslov) Re-enable when AV1 models are trained +#if 0 +#if CONFIG_RT_ML_PARTITIONING + if (!frame_is_intra_only(cm)) { + sf->part_sf.partition_search_type = ML_BASED_PARTITION; + sf->rt_sf.reuse_inter_pred_nonrd = 0; + } +#endif +#endif + sf->rt_sf.use_adaptive_subpel_search = false; + } + if (speed >= 10) { + // TODO(yunqingwang@google.com): To be conservative, disable + // sf->rt_sf.estimate_motion_for_var_based_partition = 3 for speed 10/qvga + // for now. May enable it in the future. + sf->rt_sf.estimate_motion_for_var_based_partition = 0; + sf->rt_sf.skip_intra_pred = 2; + sf->rt_sf.hybrid_intra_pickmode = 3; + sf->rt_sf.reduce_mv_pel_precision_lowcomplex = 1; + sf->rt_sf.reduce_mv_pel_precision_highmotion = 2; + sf->rt_sf.use_nonrd_filter_search = 0; + } + } else { + sf->rt_sf.prune_intra_mode_based_on_mv_range = 2; + sf->intra_sf.skip_filter_intra_in_inter_frames = 1; + if (speed <= 5) { + sf->tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh = + boosted ? INT_MAX : 350; + sf->winner_mode_sf.prune_winner_mode_eval_level = boosted ? 0 : 2; + } + if (speed == 6) sf->part_sf.disable_8x8_part_based_on_qidx = 1; + if (speed >= 6) sf->rt_sf.skip_newmv_mode_based_on_sse = 2; + if (speed == 7) { + sf->rt_sf.prefer_large_partition_blocks = 1; + // Enable this feature for [360p, 720p] resolution range initially. + // Only enable for low bitdepth to mitigate issue: b/303023614. + if (!cpi->rc.rtc_external_ratectrl && + AOMMIN(cm->width, cm->height) <= 720 && !cpi->oxcf.use_highbitdepth) + sf->hl_sf.accurate_bit_estimate = cpi->oxcf.q_cfg.aq_mode == NO_AQ; + } + if (speed >= 7) { + sf->rt_sf.use_rtc_tf = 1; + } + if (speed == 8 && !cpi->ppi->use_svc) { + sf->rt_sf.short_circuit_low_temp_var = 0; + sf->rt_sf.use_nonrd_altref_frame = 1; + } + if (speed >= 8) sf->rt_sf.tx_size_level_based_on_qstep = 2; + if (speed >= 9) { + sf->rt_sf.gf_length_lvl = 1; + sf->rt_sf.skip_cdef_sb = 1; + sf->rt_sf.sad_based_adp_altref_lag = 2; + sf->rt_sf.reduce_mv_pel_precision_highmotion = 2; + sf->rt_sf.use_adaptive_subpel_search = true; + sf->interp_sf.cb_pred_filter_search = 1; + } + if (speed >= 10) { + sf->rt_sf.hybrid_intra_pickmode = 2; + sf->rt_sf.sad_based_adp_altref_lag = 4; + sf->rt_sf.tx_size_level_based_on_qstep = 0; + sf->rt_sf.reduce_mv_pel_precision_highmotion = 3; + sf->rt_sf.use_adaptive_subpel_search = false; + sf->interp_sf.cb_pred_filter_search = 2; + } + } + if (!is_480p_or_larger) { + if (speed == 7) { + sf->rt_sf.nonrd_check_partition_merge_mode = 2; + } + } + if (!is_720p_or_larger) { + if (speed >= 9) { + sf->rt_sf.force_large_partition_blocks_intra = 1; + } + } else { + if (speed >= 6) sf->rt_sf.skip_newmv_mode_based_on_sse = 3; + if (speed == 7) sf->rt_sf.prefer_large_partition_blocks = 0; + if (speed >= 7) { + sf->rt_sf.reduce_mv_pel_precision_lowcomplex = 2; + sf->rt_sf.reduce_mv_pel_precision_highmotion = 1; + } + if (speed >= 9) { + sf->rt_sf.sad_based_adp_altref_lag = 1; + sf->rt_sf.reduce_mv_pel_precision_lowcomplex = 0; + sf->rt_sf.reduce_mv_pel_precision_highmotion = 2; + } + if (speed >= 10) { + sf->rt_sf.sad_based_adp_altref_lag = 3; + sf->rt_sf.reduce_mv_pel_precision_highmotion = 3; + } + } + // TODO(Any): Check/Tune settings of other sfs for 1080p. 
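+ // For 1080p and above, keep full mv precision under high motion and skip + // the adaptive subpel search; lower resolutions instead get a faster CDEF + // method at speed 9 and aggressive non-rd skip at speed 10.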
+ if (is_1080p_or_larger) { + if (speed >= 7) { + sf->rt_sf.reduce_mv_pel_precision_highmotion = 0; + sf->rt_sf.use_adaptive_subpel_search = 0; + } + if (speed >= 9) sf->interp_sf.cb_pred_filter_search = 0; + } else { + if (speed >= 9) sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q; + if (speed >= 10) sf->rt_sf.nonrd_aggressive_skip = 1; + } + // TODO(marpan): Tune settings for speed 11 video mode, + // for resolutions below 720p. + if (speed >= 11 && !is_720p_or_larger && + cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN) { + sf->rt_sf.skip_cdef_sb = 2; + sf->rt_sf.force_only_last_ref = 1; + sf->rt_sf.selective_cdf_update = 1; + sf->rt_sf.use_nonrd_filter_search = 0; + if (is_360p_or_larger) { + sf->part_sf.fixed_partition_size = BLOCK_32X32; + sf->rt_sf.use_fast_fixed_part = 1; + } + sf->rt_sf.increase_source_sad_thresh = 1; + sf->rt_sf.part_early_exit_zeromv = 2; + sf->rt_sf.set_zeromv_skip_based_on_source_sad = 2; + for (int i = 0; i < BLOCK_SIZES; ++i) { + sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC; + } + } + // Setting for SVC, or when the ref_frame_config control is + // used to set the reference structure. + if (cpi->ppi->use_svc || cpi->ppi->rtc_ref.set_ref_frame_config) { + const RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref; + // For SVC: for two or more temporal layers, use better mv search on the + // base temporal layer, and only on the base spatial layer if the highest + // resolution is above 640x360. + if (cpi->svc.number_temporal_layers >= 2 && + cpi->svc.temporal_layer_id == 0 && + (cpi->svc.spatial_layer_id == 0 || + cpi->oxcf.frm_dim_cfg.width * cpi->oxcf.frm_dim_cfg.height <= + 640 * 360)) { + sf->mv_sf.search_method = NSTEP; + sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED; + sf->rt_sf.fullpel_search_step_param = 10; + sf->rt_sf.reduce_mv_pel_precision_highmotion = 0; + if (cm->width * cm->height <= 352 * 288) + sf->rt_sf.nonrd_prune_ref_frame_search = 2; + sf->rt_sf.force_large_partition_blocks_intra = 0; + } + if (speed >= 8) { + if (cpi->svc.number_temporal_layers > 2) + sf->rt_sf.disable_cdf_update_non_reference_frame = true; + sf->rt_sf.reduce_mv_pel_precision_highmotion = 3; + if (rtc_ref->non_reference_frame) { + sf->rt_sf.nonrd_aggressive_skip = 1; + sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE; + } + } + if (speed <= 9 && cpi->svc.number_temporal_layers > 2 && + cpi->svc.temporal_layer_id == 0) + sf->rt_sf.check_only_zero_zeromv_on_large_blocks = false; + else + sf->rt_sf.check_only_zero_zeromv_on_large_blocks = true; + sf->rt_sf.frame_level_mode_cost_update = false; + + // Compound mode enabling. + if (rtc_ref->ref_frame_comp[0] || rtc_ref->ref_frame_comp[1] || + rtc_ref->ref_frame_comp[2]) { + sf->rt_sf.use_comp_ref_nonrd = 1; + sf->rt_sf.ref_frame_comp_nonrd[0] = + rtc_ref->ref_frame_comp[0] && rtc_ref->reference[GOLDEN_FRAME - 1]; + sf->rt_sf.ref_frame_comp_nonrd[1] = + rtc_ref->ref_frame_comp[1] && rtc_ref->reference[LAST2_FRAME - 1]; + sf->rt_sf.ref_frame_comp_nonrd[2] = + rtc_ref->ref_frame_comp[2] && rtc_ref->reference[ALTREF_FRAME - 1]; + } else { + sf->rt_sf.use_comp_ref_nonrd = 0; + } + + if (cpi->svc.number_spatial_layers > 1 || + cpi->svc.number_temporal_layers > 1) + sf->hl_sf.accurate_bit_estimate = 0; + + sf->rt_sf.estimate_motion_for_var_based_partition = 1; + + // For single-layer RPS: bias/adjustment for the recovery frame.
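+ // A recovery frame gets a more thorough motion search (NSTEP with the + // full subpel tree) and has aggressive non-rd skipping turned off.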
+ if (cpi->ppi->rtc_ref.bias_recovery_frame) { + sf->mv_sf.search_method = NSTEP; + sf->mv_sf.subpel_search_method = SUBPEL_TREE; + sf->rt_sf.fullpel_search_step_param = 8; + sf->rt_sf.nonrd_aggressive_skip = 0; + } + } + // Screen settings. + if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) { + // TODO(marpan): Check settings for speed 7 and 8. + if (speed >= 7) { + sf->rt_sf.reduce_mv_pel_precision_highmotion = 1; + sf->mv_sf.use_bsize_dependent_search_method = 0; + sf->rt_sf.skip_cdef_sb = 1; + sf->rt_sf.increase_color_thresh_palette = 1; + if (!frame_is_intra_only(cm)) sf->rt_sf.dct_only_palette_nonrd = 1; + } + if (speed >= 8) { + sf->rt_sf.nonrd_check_partition_merge_mode = 3; + sf->rt_sf.nonrd_prune_ref_frame_search = 1; + sf->rt_sf.use_nonrd_filter_search = 0; + sf->rt_sf.prune_hv_pred_modes_using_src_sad = false; + } + if (speed >= 9) { + sf->rt_sf.prune_idtx_nonrd = 1; + sf->rt_sf.part_early_exit_zeromv = 2; + sf->rt_sf.skip_lf_screen = 1; + sf->rt_sf.nonrd_prune_ref_frame_search = 3; + sf->rt_sf.var_part_split_threshold_shift = 10; + sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE; + sf->rt_sf.reduce_mv_pel_precision_highmotion = 3; + sf->rt_sf.reduce_mv_pel_precision_lowcomplex = 1; + sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q; + sf->rt_sf.nonrd_check_partition_merge_mode = 0; + sf->interp_sf.cb_pred_filter_search = 0; + } + if (speed >= 10) { + if (cm->width * cm->height > 1920 * 1080) + sf->part_sf.disable_8x8_part_based_on_qidx = 1; + sf->rt_sf.screen_content_cdef_filter_qindex_thresh = 80; + sf->rt_sf.part_early_exit_zeromv = 1; + sf->rt_sf.nonrd_aggressive_skip = 1; + } + if (speed >= 11) { + sf->rt_sf.skip_lf_screen = 2; + sf->rt_sf.skip_cdef_sb = 2; + sf->rt_sf.part_early_exit_zeromv = 2; + sf->rt_sf.prune_palette_nonrd = 1; + sf->rt_sf.set_zeromv_skip_based_on_source_sad = 2; + sf->rt_sf.increase_color_thresh_palette = 0; + } + sf->rt_sf.use_nonrd_altref_frame = 0; + sf->rt_sf.use_rtc_tf = 0; + sf->rt_sf.use_comp_ref_nonrd = 0; + sf->rt_sf.source_metrics_sb_nonrd = 1; + if (cpi->rc.high_source_sad == 1) { + sf->rt_sf.prefer_large_partition_blocks = 0; + sf->part_sf.max_intra_bsize = BLOCK_128X128; + for (int i = 0; i < BLOCK_SIZES; ++i) { + if (i > BLOCK_32X32) + sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC; + else + sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC_H_V; + } + } + if (cpi->rc.max_block_source_sad > 20000 && + cpi->rc.frame_source_sad > 100 && speed >= 6 && + (cpi->rc.percent_blocks_with_motion > 1 || + cpi->svc.last_layer_dropped[0])) { + sf->mv_sf.search_method = NSTEP; + sf->rt_sf.fullpel_search_step_param = 2; + } + sf->rt_sf.partition_direct_merging = 0; + sf->hl_sf.accurate_bit_estimate = 0; + // This feature is for nonrd_pickmode. + if (sf->rt_sf.use_nonrd_pick_mode) + sf->rt_sf.estimate_motion_for_var_based_partition = 1; + else + sf->rt_sf.estimate_motion_for_var_based_partition = 0; + } + if (is_lossless_requested(&cpi->oxcf.rc_cfg)) { + sf->rt_sf.use_rtc_tf = 0; + // TODO(aomedia:3412): The setting accurate_bit_estimate = 0 + // can be removed once it's fixed for lossless mode. + sf->hl_sf.accurate_bit_estimate = 0; + } + if (cpi->oxcf.use_highbitdepth) { + // Disable for use_highbitdepth = 1 to mitigate issue: b/303023614. 
+ sf->rt_sf.estimate_motion_for_var_based_partition = 0; + } + if (cpi->oxcf.superres_cfg.enable_superres) { + sf->rt_sf.use_rtc_tf = 0; + sf->rt_sf.nonrd_prune_ref_frame_search = 1; + } +} + +// TODO(kyslov): now this is very similar to +// set_good_speed_features_framesize_independent +// except that it sets the non-rd flag at speed 8. This function will likely +// be modified in the future with RT-specific speed features. +static void set_rt_speed_features_framesize_independent(AV1_COMP *cpi, + SPEED_FEATURES *sf, + int speed) { + AV1_COMMON *const cm = &cpi->common; + const int boosted = frame_is_boosted(cpi); + + // Currently, rt speeds 0, 1, 2, 3, 4 and 5 are the same. + // The following set of speed features does not impact the encoder's + // decisions, as the relevant tools are disabled by default. + sf->gm_sf.gm_search_type = GM_DISABLE_SEARCH; + sf->hl_sf.recode_loop = ALLOW_RECODE_KFARFGF; + sf->inter_sf.reuse_inter_intra_mode = 1; + sf->inter_sf.prune_compound_using_single_ref = 0; + sf->inter_sf.prune_comp_search_by_single_result = 2; + sf->inter_sf.prune_comp_type_by_comp_avg = 2; + sf->inter_sf.fast_wedge_sign_estimate = 1; + sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_DISABLED; + sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW; + sf->inter_sf.disable_interinter_wedge_var_thresh = 100; + sf->interp_sf.cb_pred_filter_search = 0; + sf->interp_sf.skip_interp_filter_search = 1; + sf->part_sf.ml_prune_partition = 1; + sf->part_sf.reuse_prev_rd_results_for_part_ab = 1; + sf->part_sf.prune_ext_partition_types_search_level = 2; + sf->part_sf.less_rectangular_check_level = 2; + sf->mv_sf.obmc_full_pixel_search_level = 1; + sf->intra_sf.dv_cost_upd_level = INTERNAL_COST_UPD_OFF; + sf->tx_sf.model_based_prune_tx_search_level = 0; + sf->lpf_sf.dual_sgr_penalty_level = 1; + // Disable Wiener and Self-guided Loop restoration filters. + sf->lpf_sf.disable_wiener_filter = true; + sf->lpf_sf.disable_sgr_filter = true; + sf->intra_sf.prune_palette_search_level = 2; + sf->intra_sf.prune_luma_palette_size_search_level = 2; + sf->intra_sf.early_term_chroma_palette_size_search = 1; + + // End of set + + // TODO(any, yunqing): tune these features for real-time use cases. + sf->hl_sf.superres_auto_search_type = SUPERRES_AUTO_SOLO; + sf->hl_sf.frame_parameter_update = 0; + + sf->inter_sf.model_based_post_interp_filter_breakout = 1; + // TODO(any): As per the experiments, this speed feature is doing redundant + // computation since the model rd based pruning logic is similar to model rd + // based gating when inter_mode_rd_model_estimation = 2. Enable this SF if + // any of the following conditions becomes true: + // (1) inter_mode_rd_model_estimation != 2 + // (2) skip_interp_filter_search == 0 + // (3) Motion mode or compound mode is enabled + sf->inter_sf.prune_mode_search_simple_translation = 0; + sf->inter_sf.prune_ref_frame_for_rect_partitions = !boosted; + sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX; + sf->inter_sf.selective_ref_frame = 4; + sf->inter_sf.alt_ref_search_fp = 2; + set_txfm_rd_gate_level(sf->inter_sf.txfm_rd_gate_level, boosted ? 0 : 4); + sf->inter_sf.limit_txfm_eval_per_mode = 3; + + sf->inter_sf.adaptive_rd_thresh = 4; + sf->inter_sf.inter_mode_rd_model_estimation = 2; + sf->inter_sf.prune_inter_modes_if_skippable = 1; + sf->inter_sf.prune_nearmv_using_neighbors = PRUNE_NEARMV_LEVEL3; + sf->inter_sf.reduce_inter_modes = boosted ?
1 : 3; + sf->inter_sf.skip_newmv_in_drl = 4; + + sf->interp_sf.use_fast_interpolation_filter_search = 1; + sf->interp_sf.use_interp_filter = 1; + sf->interp_sf.adaptive_interp_filter_search = 1; + sf->interp_sf.disable_dual_filter = 1; + + sf->part_sf.default_max_partition_size = BLOCK_128X128; + sf->part_sf.default_min_partition_size = BLOCK_8X8; + sf->part_sf.use_best_rd_for_pruning = 1; + sf->part_sf.early_term_after_none_split = 1; + sf->part_sf.partition_search_breakout_dist_thr = (1 << 25); + sf->part_sf.max_intra_bsize = BLOCK_16X16; + sf->part_sf.partition_search_breakout_rate_thr = 500; + sf->part_sf.partition_search_type = VAR_BASED_PARTITION; + sf->part_sf.adjust_var_based_rd_partitioning = 2; + + sf->mv_sf.full_pixel_search_level = 1; + sf->mv_sf.exhaustive_searches_thresh = INT_MAX; + sf->mv_sf.auto_mv_step_size = 1; + sf->mv_sf.subpel_iters_per_step = 1; + sf->mv_sf.use_accurate_subpel_search = USE_2_TAPS; + sf->mv_sf.search_method = FAST_DIAMOND; + sf->mv_sf.subpel_force_stop = EIGHTH_PEL; + sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED; + + for (int i = 0; i < TX_SIZES; ++i) { + sf->intra_sf.intra_y_mode_mask[i] = INTRA_DC; + sf->intra_sf.intra_uv_mode_mask[i] = UV_INTRA_DC_CFL; + } + sf->intra_sf.skip_intra_in_interframe = 5; + sf->intra_sf.disable_smooth_intra = 1; + sf->intra_sf.skip_filter_intra_in_inter_frames = 1; + + sf->tx_sf.intra_tx_size_search_init_depth_sqr = 1; + sf->tx_sf.tx_type_search.use_reduced_intra_txset = 1; + sf->tx_sf.adaptive_txb_search_level = 2; + sf->tx_sf.intra_tx_size_search_init_depth_rect = 1; + sf->tx_sf.tx_size_search_lgr_block = 1; + sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000; + sf->tx_sf.tx_type_search.skip_tx_search = 1; + sf->tx_sf.inter_tx_size_search_init_depth_rect = 1; + sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1; + sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_3; + sf->tx_sf.refine_fast_tx_search_results = 0; + sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1; + sf->tx_sf.tx_type_search.use_skip_flag_prediction = 2; + sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 4; + + sf->rd_sf.optimize_coefficients = NO_TRELLIS_OPT; + sf->rd_sf.simple_model_rd_from_var = 1; + sf->rd_sf.tx_domain_dist_level = 2; + sf->rd_sf.tx_domain_dist_thres_level = 2; + + sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL4; + sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q; + + sf->winner_mode_sf.dc_blk_pred_level = frame_is_intra_only(cm) ? 0 : 3; + sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = 1; + sf->winner_mode_sf.tx_size_search_level = 1; + sf->winner_mode_sf.winner_mode_ifs = 1; + + sf->rt_sf.check_intra_pred_nonrd = 1; + sf->rt_sf.estimate_motion_for_var_based_partition = 2; + sf->rt_sf.hybrid_intra_pickmode = 1; + sf->rt_sf.use_comp_ref_nonrd = 0; + sf->rt_sf.ref_frame_comp_nonrd[0] = 0; + sf->rt_sf.ref_frame_comp_nonrd[1] = 0; + sf->rt_sf.ref_frame_comp_nonrd[2] = 0; + sf->rt_sf.use_nonrd_filter_search = 1; + sf->rt_sf.mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH; + sf->rt_sf.num_inter_modes_for_tx_search = 5; + sf->rt_sf.prune_inter_modes_using_temp_var = 1; + sf->rt_sf.use_real_time_ref_set = 1; + sf->rt_sf.use_simple_rd_model = 1; + sf->rt_sf.prune_inter_modes_with_golden_ref = boosted ? 0 : 1; + // TODO(any): This sf could be removed. 
+ sf->rt_sf.short_circuit_low_temp_var = 1; + sf->rt_sf.check_scene_detection = 1; + if (cpi->rc.rtc_external_ratectrl) sf->rt_sf.check_scene_detection = 0; + if (cm->current_frame.frame_type != KEY_FRAME && + cpi->oxcf.rc_cfg.mode == AOM_CBR) + sf->rt_sf.overshoot_detection_cbr = FAST_DETECTION_MAXQ; + // Enable noise estimation only for high resolutions for now. + // + // Since use_temporal_noise_estimate has no effect for all-intra frame + // encoding, it is disabled for this case. + if (cpi->oxcf.kf_cfg.key_freq_max != 0 && cm->width * cm->height > 640 * 480) + sf->rt_sf.use_temporal_noise_estimate = 1; + sf->rt_sf.skip_tx_no_split_var_based_partition = 1; + sf->rt_sf.skip_newmv_mode_based_on_sse = 1; + sf->rt_sf.mode_search_skip_flags = + (cm->current_frame.frame_type == KEY_FRAME) + ? 0 + : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | + FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR | + FLAG_EARLY_TERMINATE; + sf->rt_sf.var_part_split_threshold_shift = 5; + if (!frame_is_intra_only(&cpi->common)) sf->rt_sf.var_part_based_on_qidx = 1; + sf->rt_sf.use_fast_fixed_part = 0; + sf->rt_sf.increase_source_sad_thresh = 0; + + if (speed >= 6) { + sf->mv_sf.use_fullpel_costlist = 1; + + sf->rd_sf.tx_domain_dist_thres_level = 3; + + sf->tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh = 0; + sf->inter_sf.limit_inter_mode_cands = 4; + sf->inter_sf.prune_warped_prob_thresh = 8; + sf->inter_sf.extra_prune_warped = 1; + + sf->rt_sf.gf_refresh_based_on_qp = 1; + sf->rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad = 1; + sf->rt_sf.var_part_split_threshold_shift = 7; + if (!frame_is_intra_only(&cpi->common)) + sf->rt_sf.var_part_based_on_qidx = 2; + + sf->winner_mode_sf.prune_winner_mode_eval_level = boosted ? 0 : 3; + } + + if (speed >= 7) { + sf->rt_sf.sse_early_term_inter_search = EARLY_TERM_IDX_1; + sf->rt_sf.use_comp_ref_nonrd = 1; + sf->rt_sf.ref_frame_comp_nonrd[2] = 1; // LAST_ALTREF + sf->tx_sf.intra_tx_size_search_init_depth_sqr = 2; + sf->part_sf.partition_search_type = VAR_BASED_PARTITION; + sf->part_sf.max_intra_bsize = BLOCK_32X32; + + sf->mv_sf.search_method = FAST_DIAMOND; + sf->mv_sf.subpel_force_stop = QUARTER_PEL; + + sf->inter_sf.inter_mode_rd_model_estimation = 2; + // This sf is not applicable in non-rd path. + sf->inter_sf.skip_newmv_in_drl = 0; + + sf->interp_sf.skip_interp_filter_search = 0; + + // Disable intra_y_mode_mask pruning since the performance at speed 7 isn't + // good. May need more study. + for (int i = 0; i < TX_SIZES; ++i) { + sf->intra_sf.intra_y_mode_mask[i] = INTRA_ALL; + } + + sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q; + sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL5; + + sf->rt_sf.mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH; + sf->rt_sf.nonrd_prune_ref_frame_search = 1; + // This is for rd path only. + sf->rt_sf.prune_inter_modes_using_temp_var = 0; + sf->rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad = 0; + sf->rt_sf.prune_intra_mode_based_on_mv_range = 0; +#if !CONFIG_REALTIME_ONLY + sf->rt_sf.reuse_inter_pred_nonrd = + (cpi->oxcf.motion_mode_cfg.enable_warped_motion == 0); +#else + sf->rt_sf.reuse_inter_pred_nonrd = 1; +#endif +#if CONFIG_AV1_TEMPORAL_DENOISING + sf->rt_sf.reuse_inter_pred_nonrd = (cpi->oxcf.noise_sensitivity == 0); +#endif + sf->rt_sf.short_circuit_low_temp_var = 0; + // For spatial layers, only LAST and GOLDEN are currently used in the SVC + // for nonrd. The flag use_nonrd_altref_frame can disable GOLDEN in the + // get_ref_frame_flags() for some patterns, so disable it here for + // spatial layers. 
+ sf->rt_sf.use_nonrd_altref_frame = + (cpi->svc.number_spatial_layers > 1) ? 0 : 1; + sf->rt_sf.use_nonrd_pick_mode = 1; + sf->rt_sf.nonrd_check_partition_merge_mode = 3; + sf->rt_sf.skip_intra_pred = 1; + sf->rt_sf.source_metrics_sb_nonrd = 1; + // Set mask for intra modes. + for (int i = 0; i < BLOCK_SIZES; ++i) + if (i >= BLOCK_32X32) + sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC; + else + // Use DC, H, V intra mode for block sizes < 32X32. + sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC_H_V; + + sf->winner_mode_sf.dc_blk_pred_level = 0; + sf->rt_sf.var_part_based_on_qidx = 3; + sf->rt_sf.prune_compoundmode_with_singlecompound_var = true; + sf->rt_sf.prune_compoundmode_with_singlemode_var = true; + sf->rt_sf.skip_compound_based_on_var = true; + sf->rt_sf.use_adaptive_subpel_search = true; + } + + if (speed >= 8) { + sf->rt_sf.sse_early_term_inter_search = EARLY_TERM_IDX_2; + sf->intra_sf.intra_pruning_with_hog = 1; + sf->rt_sf.short_circuit_low_temp_var = 1; + sf->rt_sf.use_nonrd_altref_frame = 0; + sf->rt_sf.nonrd_prune_ref_frame_search = 2; + sf->rt_sf.nonrd_check_partition_merge_mode = 0; + sf->rt_sf.var_part_split_threshold_shift = 8; + sf->rt_sf.var_part_based_on_qidx = 4; + sf->rt_sf.partition_direct_merging = 1; + sf->rt_sf.prune_compoundmode_with_singlemode_var = false; + sf->mv_sf.use_bsize_dependent_search_method = 2; + sf->rt_sf.prune_hv_pred_modes_using_src_sad = true; + } + if (speed >= 9) { + sf->rt_sf.sse_early_term_inter_search = EARLY_TERM_IDX_3; + sf->rt_sf.estimate_motion_for_var_based_partition = 3; + sf->rt_sf.prefer_large_partition_blocks = 3; + sf->rt_sf.skip_intra_pred = 2; + sf->rt_sf.var_part_split_threshold_shift = 9; + for (int i = 0; i < BLOCK_SIZES; ++i) + sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC; + sf->rt_sf.var_part_based_on_qidx = 0; + sf->rt_sf.frame_level_mode_cost_update = true; + sf->rt_sf.check_only_zero_zeromv_on_large_blocks = true; + sf->rt_sf.reduce_mv_pel_precision_highmotion = 0; + sf->rt_sf.use_adaptive_subpel_search = true; + sf->mv_sf.use_bsize_dependent_search_method = 0; + } + if (speed >= 10) { + sf->rt_sf.sse_early_term_inter_search = EARLY_TERM_IDX_4; + sf->rt_sf.nonrd_prune_ref_frame_search = 3; + sf->rt_sf.var_part_split_threshold_shift = 10; + sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE; + } + if (speed >= 11 && !frame_is_intra_only(cm) && + cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) { + sf->winner_mode_sf.dc_blk_pred_level = 3; + } +} + +static AOM_INLINE void init_hl_sf(HIGH_LEVEL_SPEED_FEATURES *hl_sf) { + // best quality defaults + hl_sf->frame_parameter_update = 1; + hl_sf->recode_loop = ALLOW_RECODE; + // Recode loop tolerance %. 
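+ // (i.e., roughly how far, in percent, the projected frame size may + // deviate from the target before a recode is considered)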
+ hl_sf->recode_tolerance = 25; + hl_sf->high_precision_mv_usage = CURRENT_Q; + hl_sf->superres_auto_search_type = SUPERRES_AUTO_ALL; + hl_sf->disable_extra_sc_testing = 0; + hl_sf->second_alt_ref_filtering = 1; + hl_sf->adjust_num_frames_for_arf_filtering = 0; + hl_sf->accurate_bit_estimate = 0; + hl_sf->weight_calc_level_in_tf = 0; + hl_sf->allow_sub_blk_me_in_tf = 0; +} + +static AOM_INLINE void init_fp_sf(FIRST_PASS_SPEED_FEATURES *fp_sf) { + fp_sf->reduce_mv_step_param = 3; + fp_sf->skip_motion_search_threshold = 0; + fp_sf->disable_recon = 0; + fp_sf->skip_zeromv_motion_search = 0; +} + +static AOM_INLINE void init_tpl_sf(TPL_SPEED_FEATURES *tpl_sf) { + tpl_sf->gop_length_decision_method = 0; + tpl_sf->prune_intra_modes = 0; + tpl_sf->prune_starting_mv = 0; + tpl_sf->reduce_first_step_size = 0; + tpl_sf->skip_alike_starting_mv = 0; + tpl_sf->subpel_force_stop = EIGHTH_PEL; + tpl_sf->search_method = NSTEP; + tpl_sf->prune_ref_frames_in_tpl = 0; + tpl_sf->allow_compound_pred = 1; + tpl_sf->use_y_only_rate_distortion = 0; + tpl_sf->use_sad_for_mode_decision = 0; + tpl_sf->reduce_num_frames = 0; +} + +static AOM_INLINE void init_gm_sf(GLOBAL_MOTION_SPEED_FEATURES *gm_sf) { + gm_sf->gm_search_type = GM_FULL_SEARCH; + gm_sf->prune_ref_frame_for_gm_search = 0; + gm_sf->prune_zero_mv_with_sse = 0; + gm_sf->disable_gm_search_based_on_stats = 0; + gm_sf->num_refinement_steps = GM_MAX_REFINEMENT_STEPS; +} + +static AOM_INLINE void init_part_sf(PARTITION_SPEED_FEATURES *part_sf) { + part_sf->partition_search_type = SEARCH_PARTITION; + part_sf->less_rectangular_check_level = 0; + part_sf->use_square_partition_only_threshold = BLOCK_128X128; + part_sf->auto_max_partition_based_on_simple_motion = NOT_IN_USE; + part_sf->default_max_partition_size = BLOCK_LARGEST; + part_sf->default_min_partition_size = BLOCK_4X4; + part_sf->adjust_var_based_rd_partitioning = 0; + part_sf->max_intra_bsize = BLOCK_LARGEST; + // This setting only takes effect when partition_search_type is set + // to FIXED_PARTITION. + part_sf->fixed_partition_size = BLOCK_16X16; + part_sf->partition_search_breakout_dist_thr = 0; + part_sf->partition_search_breakout_rate_thr = 0; + part_sf->prune_ext_partition_types_search_level = 0; + part_sf->prune_part4_search = 0; + part_sf->ml_prune_partition = 0; + part_sf->ml_early_term_after_part_split_level = 0; + for (int i = 0; i < PARTITION_BLOCK_SIZES; ++i) { + part_sf->ml_partition_search_breakout_thresh[i] = + -1; // -1 means not enabled.
+ } + part_sf->simple_motion_search_prune_agg = SIMPLE_AGG_LVL0; + part_sf->simple_motion_search_split = 0; + part_sf->simple_motion_search_prune_rect = 0; + part_sf->simple_motion_search_early_term_none = 0; + part_sf->simple_motion_search_reduce_search_steps = 0; + part_sf->intra_cnn_based_part_prune_level = 0; + part_sf->ext_partition_eval_thresh = BLOCK_8X8; + part_sf->rect_partition_eval_thresh = BLOCK_128X128; + part_sf->ext_part_eval_based_on_cur_best = 0; + part_sf->prune_ext_part_using_split_info = 0; + part_sf->prune_rectangular_split_based_on_qidx = 0; + part_sf->prune_rect_part_using_4x4_var_deviation = false; + part_sf->prune_rect_part_using_none_pred_mode = false; + part_sf->early_term_after_none_split = 0; + part_sf->ml_predict_breakout_level = 0; + part_sf->prune_sub_8x8_partition_level = 0; + part_sf->simple_motion_search_rect_split = 0; + part_sf->reuse_prev_rd_results_for_part_ab = 0; + part_sf->reuse_best_prediction_for_part_ab = 0; + part_sf->use_best_rd_for_pruning = 0; + part_sf->skip_non_sq_part_based_on_none = 0; + part_sf->disable_8x8_part_based_on_qidx = 0; +} + +static AOM_INLINE void init_mv_sf(MV_SPEED_FEATURES *mv_sf) { + mv_sf->full_pixel_search_level = 0; + mv_sf->auto_mv_step_size = 0; + mv_sf->exhaustive_searches_thresh = 0; + mv_sf->obmc_full_pixel_search_level = 0; + mv_sf->prune_mesh_search = PRUNE_MESH_SEARCH_DISABLED; + mv_sf->reduce_search_range = 0; + mv_sf->search_method = NSTEP; + mv_sf->simple_motion_subpel_force_stop = EIGHTH_PEL; + mv_sf->subpel_force_stop = EIGHTH_PEL; + mv_sf->subpel_iters_per_step = 2; + mv_sf->subpel_search_method = SUBPEL_TREE; + mv_sf->use_accurate_subpel_search = USE_8_TAPS; + mv_sf->use_bsize_dependent_search_method = 0; + mv_sf->use_fullpel_costlist = 0; + mv_sf->use_downsampled_sad = 0; + mv_sf->disable_extensive_joint_motion_search = 0; + mv_sf->disable_second_mv = 0; + mv_sf->skip_fullpel_search_using_startmv = 0; + mv_sf->warp_search_method = WARP_SEARCH_SQUARE; + mv_sf->warp_search_iters = 8; + mv_sf->use_intrabc = 1; +} + +static AOM_INLINE void init_inter_sf(INTER_MODE_SPEED_FEATURES *inter_sf) { + inter_sf->adaptive_rd_thresh = 0; + inter_sf->model_based_post_interp_filter_breakout = 0; + inter_sf->reduce_inter_modes = 0; + inter_sf->alt_ref_search_fp = 0; + inter_sf->prune_single_ref = 0; + inter_sf->prune_comp_ref_frames = 0; + inter_sf->selective_ref_frame = 0; + inter_sf->prune_ref_frame_for_rect_partitions = 0; + inter_sf->fast_wedge_sign_estimate = 0; + inter_sf->use_dist_wtd_comp_flag = DIST_WTD_COMP_ENABLED; + inter_sf->reuse_inter_intra_mode = 0; + inter_sf->mv_cost_upd_level = INTERNAL_COST_UPD_SB; + inter_sf->coeff_cost_upd_level = INTERNAL_COST_UPD_SB; + inter_sf->mode_cost_upd_level = INTERNAL_COST_UPD_SB; + inter_sf->prune_inter_modes_based_on_tpl = 0; + inter_sf->prune_nearmv_using_neighbors = PRUNE_NEARMV_OFF; + inter_sf->prune_comp_search_by_single_result = 0; + inter_sf->skip_repeated_ref_mv = 0; + inter_sf->skip_newmv_in_drl = 0; + inter_sf->inter_mode_rd_model_estimation = 0; + inter_sf->prune_compound_using_single_ref = 0; + inter_sf->prune_ext_comp_using_neighbors = 0; + inter_sf->skip_ext_comp_nearmv_mode = 0; + inter_sf->prune_comp_using_best_single_mode_ref = 0; + inter_sf->prune_nearest_near_mv_using_refmv_weight = 0; + inter_sf->disable_onesided_comp = 0; + inter_sf->prune_mode_search_simple_translation = 0; + inter_sf->prune_comp_type_by_comp_avg = 0; + inter_sf->disable_interinter_wedge_newmv_search = 0; + inter_sf->fast_interintra_wedge_search = 0; + 
inter_sf->prune_comp_type_by_model_rd = 0; + inter_sf->perform_best_rd_based_gating_for_chroma = 0; + inter_sf->prune_obmc_prob_thresh = 0; + inter_sf->disable_interinter_wedge_var_thresh = 0; + inter_sf->disable_interintra_wedge_var_thresh = 0; + inter_sf->prune_ref_mv_idx_search = 0; + inter_sf->prune_warped_prob_thresh = 0; + inter_sf->reuse_compound_type_decision = 0; + inter_sf->prune_inter_modes_if_skippable = 0; + inter_sf->disable_masked_comp = 0; + inter_sf->enable_fast_compound_mode_search = 0; + inter_sf->reuse_mask_search_results = 0; + inter_sf->enable_fast_wedge_mask_search = 0; + inter_sf->inter_mode_txfm_breakout = 0; + inter_sf->limit_inter_mode_cands = 0; + inter_sf->limit_txfm_eval_per_mode = 0; + inter_sf->skip_arf_compound = 0; + set_txfm_rd_gate_level(inter_sf->txfm_rd_gate_level, 0); +} + +static AOM_INLINE void init_interp_sf(INTERP_FILTER_SPEED_FEATURES *interp_sf) { + interp_sf->adaptive_interp_filter_search = 0; + interp_sf->cb_pred_filter_search = 0; + interp_sf->disable_dual_filter = 0; + interp_sf->skip_sharp_interp_filter_search = 0; + interp_sf->use_fast_interpolation_filter_search = 0; + interp_sf->use_interp_filter = 0; + interp_sf->skip_interp_filter_search = 0; +} + +static AOM_INLINE void init_intra_sf(INTRA_MODE_SPEED_FEATURES *intra_sf) { + intra_sf->dv_cost_upd_level = INTERNAL_COST_UPD_SB; + intra_sf->skip_intra_in_interframe = 1; + intra_sf->intra_pruning_with_hog = 0; + intra_sf->chroma_intra_pruning_with_hog = 0; + intra_sf->prune_palette_search_level = 0; + intra_sf->prune_luma_palette_size_search_level = 0; + + for (int i = 0; i < TX_SIZES; i++) { + intra_sf->intra_y_mode_mask[i] = INTRA_ALL; + intra_sf->intra_uv_mode_mask[i] = UV_INTRA_ALL; + } + intra_sf->disable_smooth_intra = 0; + intra_sf->prune_smooth_intra_mode_for_chroma = 0; + intra_sf->prune_filter_intra_level = 0; + intra_sf->prune_chroma_modes_using_luma_winner = 0; + intra_sf->cfl_search_range = 3; + intra_sf->top_intra_model_count_allowed = TOP_INTRA_MODEL_COUNT; + intra_sf->adapt_top_model_rd_count_using_neighbors = 0; + intra_sf->early_term_chroma_palette_size_search = 0; + intra_sf->skip_filter_intra_in_inter_frames = 0; + intra_sf->prune_luma_odd_delta_angles_in_intra = 0; +} + +static AOM_INLINE void init_tx_sf(TX_SPEED_FEATURES *tx_sf) { + tx_sf->inter_tx_size_search_init_depth_sqr = 0; + tx_sf->inter_tx_size_search_init_depth_rect = 0; + tx_sf->intra_tx_size_search_init_depth_rect = 0; + tx_sf->intra_tx_size_search_init_depth_sqr = 0; + tx_sf->tx_size_search_lgr_block = 0; + tx_sf->model_based_prune_tx_search_level = 0; + tx_sf->tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_1; + tx_sf->tx_type_search.ml_tx_split_thresh = 8500; + tx_sf->tx_type_search.use_skip_flag_prediction = 1; + tx_sf->tx_type_search.use_reduced_intra_txset = 0; + tx_sf->tx_type_search.fast_intra_tx_type_search = 0; + tx_sf->tx_type_search.fast_inter_tx_type_prob_thresh = INT_MAX; + tx_sf->tx_type_search.skip_tx_search = 0; + tx_sf->tx_type_search.prune_tx_type_using_stats = 0; + tx_sf->tx_type_search.prune_tx_type_est_rd = 0; + tx_sf->tx_type_search.winner_mode_tx_type_pruning = 0; + tx_sf->txb_split_cap = 1; + tx_sf->adaptive_txb_search_level = 0; + tx_sf->refine_fast_tx_search_results = 1; + tx_sf->prune_tx_size_level = 0; + tx_sf->prune_intra_tx_depths_using_nn = false; + tx_sf->use_rd_based_breakout_for_intra_tx_search = false; +} + +static AOM_INLINE void init_rd_sf(RD_CALC_SPEED_FEATURES *rd_sf, + const AV1EncoderConfig *oxcf) { + const int disable_trellis_quant = 
oxcf->algo_cfg.disable_trellis_quant; + if (disable_trellis_quant == 3) { + rd_sf->optimize_coefficients = !is_lossless_requested(&oxcf->rc_cfg) + ? NO_ESTIMATE_YRD_TRELLIS_OPT + : NO_TRELLIS_OPT; + } else if (disable_trellis_quant == 2) { + rd_sf->optimize_coefficients = !is_lossless_requested(&oxcf->rc_cfg) + ? FINAL_PASS_TRELLIS_OPT + : NO_TRELLIS_OPT; + } else if (disable_trellis_quant == 0) { + if (is_lossless_requested(&oxcf->rc_cfg)) { + rd_sf->optimize_coefficients = NO_TRELLIS_OPT; + } else { + rd_sf->optimize_coefficients = FULL_TRELLIS_OPT; + } + } else if (disable_trellis_quant == 1) { + rd_sf->optimize_coefficients = NO_TRELLIS_OPT; + } else { + assert(0 && "Invalid disable_trellis_quant value"); + } + rd_sf->use_mb_rd_hash = 0; + rd_sf->simple_model_rd_from_var = 0; + rd_sf->tx_domain_dist_level = 0; + rd_sf->tx_domain_dist_thres_level = 0; + rd_sf->perform_coeff_opt = 0; +} + +static AOM_INLINE void init_winner_mode_sf( + WINNER_MODE_SPEED_FEATURES *winner_mode_sf) { + winner_mode_sf->motion_mode_for_winner_cand = 0; + // Set this at the appropriate speed levels + winner_mode_sf->tx_size_search_level = 0; + winner_mode_sf->enable_winner_mode_for_coeff_opt = 0; + winner_mode_sf->enable_winner_mode_for_tx_size_srch = 0; + winner_mode_sf->enable_winner_mode_for_use_tx_domain_dist = 0; + winner_mode_sf->multi_winner_mode_type = 0; + winner_mode_sf->dc_blk_pred_level = 0; + winner_mode_sf->winner_mode_ifs = 0; + winner_mode_sf->prune_winner_mode_eval_level = 0; +} + +static AOM_INLINE void init_lpf_sf(LOOP_FILTER_SPEED_FEATURES *lpf_sf) { + lpf_sf->disable_loop_restoration_chroma = 0; + lpf_sf->disable_loop_restoration_luma = 0; + lpf_sf->min_lr_unit_size = RESTORATION_PROC_UNIT_SIZE; + lpf_sf->max_lr_unit_size = RESTORATION_UNITSIZE_MAX; + lpf_sf->prune_wiener_based_on_src_var = 0; + lpf_sf->prune_sgr_based_on_wiener = 0; + lpf_sf->enable_sgr_ep_pruning = 0; + lpf_sf->reduce_wiener_window_size = 0; + lpf_sf->lpf_pick = LPF_PICK_FROM_FULL_IMAGE; + lpf_sf->use_coarse_filter_level_search = 0; + lpf_sf->cdef_pick_method = CDEF_FULL_SEARCH; + // Set the decoder-side speed feature to use fewer dual sgr modes + lpf_sf->dual_sgr_penalty_level = 0; + // Enable Wiener and Self-guided Loop restoration filters by default.
+ lpf_sf->disable_wiener_filter = false; + lpf_sf->disable_sgr_filter = false; + lpf_sf->disable_wiener_coeff_refine_search = false; + lpf_sf->use_downsampled_wiener_stats = 0; +} + +static AOM_INLINE void init_rt_sf(REAL_TIME_SPEED_FEATURES *rt_sf) { + rt_sf->check_intra_pred_nonrd = 0; + rt_sf->skip_intra_pred = 0; + rt_sf->estimate_motion_for_var_based_partition = 0; + rt_sf->nonrd_check_partition_merge_mode = 0; + rt_sf->nonrd_check_partition_split = 0; + rt_sf->mode_search_skip_flags = 0; + rt_sf->nonrd_prune_ref_frame_search = 0; + rt_sf->use_nonrd_pick_mode = 0; + rt_sf->use_nonrd_altref_frame = 0; + rt_sf->use_comp_ref_nonrd = 0; + rt_sf->use_real_time_ref_set = 0; + rt_sf->short_circuit_low_temp_var = 0; + rt_sf->reuse_inter_pred_nonrd = 0; + rt_sf->num_inter_modes_for_tx_search = INT_MAX; + rt_sf->use_nonrd_filter_search = 0; + rt_sf->use_simple_rd_model = 0; + rt_sf->hybrid_intra_pickmode = 0; + rt_sf->source_metrics_sb_nonrd = 0; + rt_sf->overshoot_detection_cbr = NO_DETECTION; + rt_sf->check_scene_detection = 0; + rt_sf->prefer_large_partition_blocks = 0; + rt_sf->use_temporal_noise_estimate = 0; + rt_sf->fullpel_search_step_param = 0; + for (int i = 0; i < BLOCK_SIZES; ++i) + rt_sf->intra_y_mode_bsize_mask_nrd[i] = INTRA_ALL; + rt_sf->prune_hv_pred_modes_using_src_sad = false; + rt_sf->nonrd_aggressive_skip = 0; + rt_sf->skip_cdef_sb = 0; + rt_sf->force_large_partition_blocks_intra = 0; + rt_sf->skip_tx_no_split_var_based_partition = 0; + rt_sf->skip_newmv_mode_based_on_sse = 0; + rt_sf->gf_length_lvl = 0; + rt_sf->prune_inter_modes_with_golden_ref = 0; + rt_sf->prune_inter_modes_wrt_gf_arf_based_on_sad = 0; + rt_sf->prune_inter_modes_using_temp_var = 0; + rt_sf->reduce_mv_pel_precision_highmotion = 0; + rt_sf->reduce_mv_pel_precision_lowcomplex = 0; + rt_sf->prune_intra_mode_based_on_mv_range = 0; + rt_sf->var_part_split_threshold_shift = 7; + rt_sf->gf_refresh_based_on_qp = 0; + rt_sf->use_rtc_tf = 0; + rt_sf->prune_idtx_nonrd = 0; + rt_sf->prune_palette_nonrd = 0; + rt_sf->dct_only_palette_nonrd = 0; + rt_sf->part_early_exit_zeromv = 0; + rt_sf->sse_early_term_inter_search = EARLY_TERM_DISABLED; + rt_sf->skip_lf_screen = 0; + rt_sf->sad_based_adp_altref_lag = 0; + rt_sf->partition_direct_merging = 0; + rt_sf->var_part_based_on_qidx = 0; + rt_sf->tx_size_level_based_on_qstep = 0; + rt_sf->vbp_prune_16x16_split_using_min_max_sub_blk_var = false; + rt_sf->prune_compoundmode_with_singlecompound_var = false; + rt_sf->frame_level_mode_cost_update = false; + rt_sf->prune_h_pred_using_best_mode_so_far = false; + rt_sf->enable_intra_mode_pruning_using_neighbors = false; + rt_sf->prune_intra_mode_using_best_sad_so_far = false; + rt_sf->check_only_zero_zeromv_on_large_blocks = false; + rt_sf->disable_cdf_update_non_reference_frame = false; + rt_sf->prune_compoundmode_with_singlemode_var = false; + rt_sf->skip_compound_based_on_var = false; + rt_sf->set_zeromv_skip_based_on_source_sad = 1; + rt_sf->use_adaptive_subpel_search = false; + rt_sf->screen_content_cdef_filter_qindex_thresh = 0; + rt_sf->enable_ref_short_signaling = false; + rt_sf->check_globalmv_on_single_ref = true; + rt_sf->increase_color_thresh_palette = false; + rt_sf->selective_cdf_update = 0; + rt_sf->force_only_last_ref = 0; +} + +static fractional_mv_step_fp + *const fractional_mv_search[SUBPEL_SEARCH_METHODS] = { + av1_find_best_sub_pixel_tree, // SUBPEL_TREE = 0 + av1_find_best_sub_pixel_tree_pruned, // SUBPEL_TREE_PRUNED = 1 + av1_find_best_sub_pixel_tree_pruned_more // SUBPEL_TREE_PRUNED_MORE = 2 + }; + +// 
Populate appropriate sub-pel search method based on speed feature and user +// specified settings +static void set_subpel_search_method( + MotionVectorSearchParams *mv_search_params, + unsigned int motion_vector_unit_test, + SUBPEL_SEARCH_METHOD subpel_search_method) { + assert(subpel_search_method <= SUBPEL_TREE_PRUNED_MORE); + mv_search_params->find_fractional_mv_step = + fractional_mv_search[subpel_search_method]; + + // This is only used in motion vector unit test. + if (motion_vector_unit_test == 1) + mv_search_params->find_fractional_mv_step = av1_return_max_sub_pixel_mv; + else if (motion_vector_unit_test == 2) + mv_search_params->find_fractional_mv_step = av1_return_min_sub_pixel_mv; +} + +void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi, int speed) { + SPEED_FEATURES *const sf = &cpi->sf; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + + switch (oxcf->mode) { + case GOOD: + set_good_speed_feature_framesize_dependent(cpi, sf, speed); + break; + case ALLINTRA: + set_allintra_speed_feature_framesize_dependent(cpi, sf, speed); + break; + case REALTIME: + set_rt_speed_feature_framesize_dependent(cpi, sf, speed); + break; + } + + if (!cpi->ppi->seq_params_locked) { + cpi->common.seq_params->enable_masked_compound &= + !sf->inter_sf.disable_masked_comp; + cpi->common.seq_params->enable_interintra_compound &= + (sf->inter_sf.disable_interintra_wedge_var_thresh != UINT_MAX); + } + + set_subpel_search_method(&cpi->mv_search_params, + cpi->oxcf.unit_test_cfg.motion_vector_unit_test, + sf->mv_sf.subpel_search_method); + + // For multi-thread use case with row_mt enabled, cost update for a set of + // SB rows is not desirable. Hence, the sf mv_cost_upd_level is set to + // INTERNAL_COST_UPD_SBROW in such cases. + if ((cpi->oxcf.row_mt == 1) && (cpi->mt_info.num_workers > 1)) { + if (sf->inter_sf.mv_cost_upd_level == INTERNAL_COST_UPD_SBROW_SET) { + // Set mv_cost_upd_level to use row level update. + sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW; + } + } +} + +void av1_set_speed_features_framesize_independent(AV1_COMP *cpi, int speed) { + SPEED_FEATURES *const sf = &cpi->sf; + WinnerModeParams *const winner_mode_params = &cpi->winner_mode_params; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + int i; + + init_hl_sf(&sf->hl_sf); + init_fp_sf(&sf->fp_sf); + init_tpl_sf(&sf->tpl_sf); + init_gm_sf(&sf->gm_sf); + init_part_sf(&sf->part_sf); + init_mv_sf(&sf->mv_sf); + init_inter_sf(&sf->inter_sf); + init_interp_sf(&sf->interp_sf); + init_intra_sf(&sf->intra_sf); + init_tx_sf(&sf->tx_sf); + init_rd_sf(&sf->rd_sf, oxcf); + init_winner_mode_sf(&sf->winner_mode_sf); + init_lpf_sf(&sf->lpf_sf); + init_rt_sf(&sf->rt_sf); + + switch (oxcf->mode) { + case GOOD: + set_good_speed_features_framesize_independent(cpi, sf, speed); + break; + case ALLINTRA: + set_allintra_speed_features_framesize_independent(cpi, sf, speed); + break; + case REALTIME: + set_rt_speed_features_framesize_independent(cpi, sf, speed); + break; + } + + // Note: when use_nonrd_pick_mode is true, the transform size is the + // minimum of 16x16 and the largest possible size of the current block, + // which conflicts with the speed feature "enable_tx_size_search". + if (!oxcf->txfm_cfg.enable_tx_size_search && + sf->rt_sf.use_nonrd_pick_mode == 0) { + sf->winner_mode_sf.tx_size_search_level = 3; + } + + if (cpi->mt_info.num_workers > 1) { + // Loop restoration stage is conditionally disabled for speed 5, 6 when + // num_workers > 1. 
Since av1_pick_filter_restoration() is not + // multi-threaded, enabling the Loop restoration stage will cause an + // increase in encode time (a 3% to 7% increase, depending on the frame + // resolution). + // TODO(aomedia:3446): Implement multi-threading of + // av1_pick_filter_restoration() and enable the Wiener filter for speeds 5 + // and 6, similar to the single-thread encoding path. + if (speed >= 5) { + sf->lpf_sf.disable_sgr_filter = true; + sf->lpf_sf.disable_wiener_filter = true; + } + } + + if (!cpi->ppi->seq_params_locked) { + cpi->common.seq_params->order_hint_info.enable_dist_wtd_comp &= + (sf->inter_sf.use_dist_wtd_comp_flag != DIST_WTD_COMP_DISABLED); + cpi->common.seq_params->enable_dual_filter &= + !sf->interp_sf.disable_dual_filter; + // Set the flag 'enable_restoration', if one of the Loop restoration + // filters (i.e., Wiener or Self-guided) is enabled. + cpi->common.seq_params->enable_restoration &= + (!sf->lpf_sf.disable_wiener_filter || !sf->lpf_sf.disable_sgr_filter); + + cpi->common.seq_params->enable_interintra_compound &= + (sf->inter_sf.disable_interintra_wedge_var_thresh != UINT_MAX); + } + + const int mesh_speed = AOMMIN(speed, MAX_MESH_SPEED); + for (i = 0; i < MAX_MESH_STEP; ++i) { + sf->mv_sf.mesh_patterns[i].range = + good_quality_mesh_patterns[mesh_speed][i].range; + sf->mv_sf.mesh_patterns[i].interval = + good_quality_mesh_patterns[mesh_speed][i].interval; + } + + // Update the mesh pattern of exhaustive motion search for intraBC. + // Though the intraBC mesh pattern is populated for all frame types, it is + // used only for intra frames of screen content. + for (i = 0; i < MAX_MESH_STEP; ++i) { + sf->mv_sf.intrabc_mesh_patterns[i].range = + intrabc_mesh_patterns[mesh_speed][i].range; + sf->mv_sf.intrabc_mesh_patterns[i].interval = + intrabc_mesh_patterns[mesh_speed][i].interval; + } + + // Slow quant, DCT and trellis are not worthwhile for the first pass, + // so make sure they are always turned off. + if (is_stat_generation_stage(cpi)) + sf->rd_sf.optimize_coefficients = NO_TRELLIS_OPT; + + // No recode for 1 pass.
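+ // (a recode would require running the encode loop again, which a one-pass + // configuration cannot afford)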
+ if (oxcf->pass == AOM_RC_ONE_PASS && has_no_stats_stage(cpi)) + sf->hl_sf.recode_loop = DISALLOW_RECODE; + + set_subpel_search_method(&cpi->mv_search_params, + cpi->oxcf.unit_test_cfg.motion_vector_unit_test, + sf->mv_sf.subpel_search_method); + + // assert ensures that tx_domain_dist_thres_level is accessed correctly + assert(cpi->sf.rd_sf.tx_domain_dist_thres_level >= 0 && + cpi->sf.rd_sf.tx_domain_dist_thres_level < 4); + memcpy(winner_mode_params->tx_domain_dist_threshold, + tx_domain_dist_thresholds[cpi->sf.rd_sf.tx_domain_dist_thres_level], + sizeof(winner_mode_params->tx_domain_dist_threshold)); + + assert(cpi->sf.rd_sf.tx_domain_dist_level >= 0 && + cpi->sf.rd_sf.tx_domain_dist_level < TX_DOMAIN_DIST_LEVELS); + memcpy(winner_mode_params->use_transform_domain_distortion, + tx_domain_dist_types[cpi->sf.rd_sf.tx_domain_dist_level], + sizeof(winner_mode_params->use_transform_domain_distortion)); + + // assert ensures that coeff_opt_thresholds is accessed correctly + assert(cpi->sf.rd_sf.perform_coeff_opt >= 0 && + cpi->sf.rd_sf.perform_coeff_opt < 9); + memcpy(winner_mode_params->coeff_opt_thresholds, + &coeff_opt_thresholds[cpi->sf.rd_sf.perform_coeff_opt], + sizeof(winner_mode_params->coeff_opt_thresholds)); + + // assert ensures that predict_skip_levels is accessed correctly + assert(cpi->sf.tx_sf.tx_type_search.use_skip_flag_prediction >= 0 && + cpi->sf.tx_sf.tx_type_search.use_skip_flag_prediction < 3); + memcpy(winner_mode_params->skip_txfm_level, + predict_skip_levels[cpi->sf.tx_sf.tx_type_search + .use_skip_flag_prediction], + sizeof(winner_mode_params->skip_txfm_level)); + + // assert ensures that tx_size_search_level is accessed correctly + assert(cpi->sf.winner_mode_sf.tx_size_search_level >= 0 && + cpi->sf.winner_mode_sf.tx_size_search_level <= 3); + memcpy(winner_mode_params->tx_size_search_methods, + tx_size_search_methods[cpi->sf.winner_mode_sf.tx_size_search_level], + sizeof(winner_mode_params->tx_size_search_methods)); + memcpy(winner_mode_params->predict_dc_level, + predict_dc_levels[cpi->sf.winner_mode_sf.dc_blk_pred_level], + sizeof(winner_mode_params->predict_dc_level)); + + if (cpi->oxcf.row_mt == 1 && (cpi->mt_info.num_workers > 1)) { + if (sf->inter_sf.inter_mode_rd_model_estimation == 1) { + // Revert to type 2 + sf->inter_sf.inter_mode_rd_model_estimation = 2; + } + +#if !CONFIG_FPMT_TEST + // Disable the speed feature 'prune_ref_frame_for_gm_search' to achieve + // better parallelism when the number of available threads is greater + // than or equal to the maximum number of reference frames allowed for + // global motion. + if (sf->gm_sf.gm_search_type != GM_DISABLE_SEARCH && + (cpi->mt_info.num_workers >= + gm_available_reference_frames[sf->gm_sf.gm_search_type])) + sf->gm_sf.prune_ref_frame_for_gm_search = 0; +#endif + } + + // This only applies to real-time mode. Adaptive gf refresh is disabled if + // the gf_cbr_boost_pct set by the user is larger than 0.
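+ // (presumably because a user-forced golden-frame boost would conflict + // with QP-driven refresh decisions)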
+ if (cpi->oxcf.rc_cfg.gf_cbr_boost_pct > 0) + sf->rt_sf.gf_refresh_based_on_qp = 0; +} + +// Override some speed features based on qindex +void av1_set_speed_features_qindex_dependent(AV1_COMP *cpi, int speed) { + AV1_COMMON *const cm = &cpi->common; + SPEED_FEATURES *const sf = &cpi->sf; + WinnerModeParams *const winner_mode_params = &cpi->winner_mode_params; + const int boosted = frame_is_boosted(cpi); + const int is_480p_or_lesser = AOMMIN(cm->width, cm->height) <= 480; + const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480; + const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720; + const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080; + const int is_1440p_or_larger = AOMMIN(cm->width, cm->height) >= 1440; + const int is_arf2_bwd_type = + cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE; + + if (cpi->oxcf.mode == REALTIME) { + if (speed >= 6) { + const int qindex_thresh = boosted ? 190 : (is_720p_or_larger ? 120 : 150); + sf->part_sf.adjust_var_based_rd_partitioning = + frame_is_intra_only(cm) + ? 0 + : cm->quant_params.base_qindex > qindex_thresh; + } + return; + } + + if (speed == 0) { + // qindex_thresh for resolution < 720p + const int qindex_thresh = boosted ? 70 : (is_arf2_bwd_type ? 110 : 140); + if (!is_720p_or_larger && cm->quant_params.base_qindex <= qindex_thresh) { + sf->part_sf.simple_motion_search_split = + cm->features.allow_screen_content_tools ? 1 : 2; + sf->part_sf.simple_motion_search_early_term_none = 1; + sf->tx_sf.model_based_prune_tx_search_level = 0; + } + + if (is_720p_or_larger && cm->quant_params.base_qindex <= 128) { + sf->rd_sf.perform_coeff_opt = 2 + is_1080p_or_larger; + memcpy(winner_mode_params->coeff_opt_thresholds, + &coeff_opt_thresholds[sf->rd_sf.perform_coeff_opt], + sizeof(winner_mode_params->coeff_opt_thresholds)); + sf->part_sf.simple_motion_search_split = + cm->features.allow_screen_content_tools ? 1 : 2; + sf->tx_sf.inter_tx_size_search_init_depth_rect = 1; + sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1; + sf->tx_sf.intra_tx_size_search_init_depth_rect = 1; + sf->tx_sf.model_based_prune_tx_search_level = 0; + + if (is_1080p_or_larger && cm->quant_params.base_qindex <= 108) { + sf->inter_sf.selective_ref_frame = 2; + sf->rd_sf.tx_domain_dist_level = boosted ? 1 : 2; + sf->rd_sf.tx_domain_dist_thres_level = 1; + sf->part_sf.simple_motion_search_early_term_none = 1; + sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000; + sf->interp_sf.cb_pred_filter_search = 0; + sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_2; + sf->tx_sf.tx_type_search.skip_tx_search = 1; + } + } + } + + if (speed >= 2) { + // Disable extended partitions for lower quantizers + const int aggr = AOMMIN(4, speed - 2); + const int qindex_thresh1[4] = { 50, 50, 80, 100 }; + const int qindex_thresh2[4] = { 80, 100, 120, 160 }; + int qindex_thresh; + if (aggr <= 1) { + const int qthresh2 = + (!aggr && !is_480p_or_larger) ? 70 : qindex_thresh2[aggr]; + qindex_thresh = cm->features.allow_screen_content_tools + ? qindex_thresh1[aggr] + : qthresh2; + if (cm->quant_params.base_qindex <= qindex_thresh && !boosted) + sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128; + } else if (aggr <= 2) { + qindex_thresh = boosted ? 
qindex_thresh1[aggr] : qindex_thresh2[aggr]; + if (cm->quant_params.base_qindex <= qindex_thresh && + !frame_is_intra_only(cm)) + sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128; + } else if (aggr <= 3) { + if (!is_480p_or_larger) { + sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128; + } else if (!is_720p_or_larger && !frame_is_intra_only(cm) && + !cm->features.allow_screen_content_tools) { + sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128; + } else { + qindex_thresh = boosted ? qindex_thresh1[aggr] : qindex_thresh2[aggr]; + if (cm->quant_params.base_qindex <= qindex_thresh && + !frame_is_intra_only(cm)) + sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128; + } + } else { + sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128; + } + } + + if (speed >= 4) { + // Disable rectangular partitions for lower quantizers + const int aggr = AOMMIN(1, speed - 4); + const int qindex_thresh[2] = { 65, 80 }; + int disable_rect_part; + disable_rect_part = !boosted; + if (cm->quant_params.base_qindex <= qindex_thresh[aggr] && + disable_rect_part && is_480p_or_larger) { + sf->part_sf.rect_partition_eval_thresh = BLOCK_8X8; + } + } + + if (speed <= 2) { + if (!is_stat_generation_stage(cpi)) { + // Use faster full-pel motion search for high quantizers. + // Also use reduced total search range for low resolutions at high + // quantizers. + const int aggr = speed; + const int qindex_thresh1 = ms_qindex_thresh[aggr][is_720p_or_larger][0]; + const int qindex_thresh2 = ms_qindex_thresh[aggr][is_720p_or_larger][1]; + const SEARCH_METHODS search_method = + motion_search_method[is_720p_or_larger]; + if (cm->quant_params.base_qindex > qindex_thresh1) { + sf->mv_sf.search_method = search_method; + sf->tpl_sf.search_method = search_method; + } else if (cm->quant_params.base_qindex > qindex_thresh2) { + sf->mv_sf.search_method = NSTEP_8PT; + } + } + } + + if (speed >= 4) { + // Disable LR search at low and high quantizers and enable only for + // mid-quantizer range. + if (!boosted && !is_arf2_bwd_type) { + const int qindex_low[2] = { 100, 60 }; + const int qindex_high[2] = { 180, 160 }; + if (cm->quant_params.base_qindex <= qindex_low[is_720p_or_larger] || + cm->quant_params.base_qindex > qindex_high[is_720p_or_larger]) { + sf->lpf_sf.disable_loop_restoration_luma = 1; + } + } + } + + if (speed == 1) { + // Reuse interinter wedge mask search from first search for non-boosted + // non-internal-arf frames, except at very high quantizers. + if (cm->quant_params.base_qindex <= 200) { + if (!boosted && !is_arf2_bwd_type) + sf->inter_sf.reuse_mask_search_results = 1; + } + } + + if (speed == 5) { + if (!(frame_is_intra_only(&cpi->common) || + cm->features.allow_screen_content_tools)) { + const int qindex[2] = { 256, 128 }; + // Set the sf value as 3 for low resolution and + // for higher resolutions with low quantizers. + if (cm->quant_params.base_qindex < qindex[is_480p_or_larger]) + sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 3; + } + } + + if (speed >= 5) { + // Disable the sf for low quantizers in case of low resolution screen + // contents. 
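+    // (Worked example with the constants in the check below: a 640x480
+    // screen-content frame at base_qindex 100 is "480p or lesser" with a
+    // low quantizer, so prune_sub_8x8_partition_level is reset to 0 and the
+    // sub-8x8 partitions remain searchable; at base_qindex 150 the pruning
+    // configured for this speed elsewhere is kept.)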
+ if (cm->features.allow_screen_content_tools && + cm->quant_params.base_qindex < 128 && is_480p_or_lesser) { + sf->part_sf.prune_sub_8x8_partition_level = 0; + } + } + + // Loop restoration size search + // At speed 0, always search all available sizes for the maximum possible gain + sf->lpf_sf.min_lr_unit_size = RESTORATION_PROC_UNIT_SIZE; + sf->lpf_sf.max_lr_unit_size = RESTORATION_UNITSIZE_MAX; + + if (speed >= 1) { + // For large frames, small restoration units are almost never useful, + // so prune them away + if (is_1440p_or_larger) { + sf->lpf_sf.min_lr_unit_size = RESTORATION_UNITSIZE_MAX; + } else if (is_720p_or_larger) { + sf->lpf_sf.min_lr_unit_size = RESTORATION_UNITSIZE_MAX >> 1; + } + } + + if (speed >= 3 || (cpi->oxcf.mode == ALLINTRA && speed >= 1)) { + // At this speed, a full search is too expensive. Instead, pick a single + // size based on size and qindex. Note that, in general, higher quantizers + // (== lower quality) and larger frames generally want to use larger + // restoration units. + int qindex_thresh = 96; + if (cm->quant_params.base_qindex <= qindex_thresh && !is_1440p_or_larger) { + sf->lpf_sf.min_lr_unit_size = RESTORATION_UNITSIZE_MAX >> 1; + sf->lpf_sf.max_lr_unit_size = RESTORATION_UNITSIZE_MAX >> 1; + } else { + sf->lpf_sf.min_lr_unit_size = RESTORATION_UNITSIZE_MAX; + sf->lpf_sf.max_lr_unit_size = RESTORATION_UNITSIZE_MAX; + } + } + + set_subpel_search_method(&cpi->mv_search_params, + cpi->oxcf.unit_test_cfg.motion_vector_unit_test, + sf->mv_sf.subpel_search_method); +} diff --git a/third_party/aom/av1/encoder/speed_features.h b/third_party/aom/av1/encoder/speed_features.h new file mode 100644 index 0000000000..60c000e4f4 --- /dev/null +++ b/third_party/aom/av1/encoder/speed_features.h @@ -0,0 +1,2025 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_SPEED_FEATURES_H_ +#define AOM_AV1_ENCODER_SPEED_FEATURES_H_ + +#include "av1/common/enums.h" +#include "av1/encoder/enc_enums.h" +#include "av1/encoder/mcomp.h" +#include "av1/encoder/encodemb.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*! 
@file */ + +/*!\cond */ +#define MAX_MESH_STEP 4 + +typedef struct MESH_PATTERN { + int range; + int interval; +} MESH_PATTERN; + +enum { + GM_FULL_SEARCH, + GM_REDUCED_REF_SEARCH_SKIP_L2_L3, + GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2, + + // Same as GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2 but with extra filtering + // to keep at most two ref frames + GM_SEARCH_CLOSEST_REFS_ONLY, + + GM_DISABLE_SEARCH +} UENUM1BYTE(GM_SEARCH_TYPE); + +enum { + DIST_WTD_COMP_ENABLED, + DIST_WTD_COMP_SKIP_MV_SEARCH, + DIST_WTD_COMP_DISABLED, +} UENUM1BYTE(DIST_WTD_COMP_FLAG); + +enum { + INTRA_ALL = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED) | (1 << D45_PRED) | + (1 << D135_PRED) | (1 << D113_PRED) | (1 << D157_PRED) | + (1 << D203_PRED) | (1 << D67_PRED) | (1 << SMOOTH_PRED) | + (1 << SMOOTH_V_PRED) | (1 << SMOOTH_H_PRED) | (1 << PAETH_PRED), + UV_INTRA_ALL = + (1 << UV_DC_PRED) | (1 << UV_V_PRED) | (1 << UV_H_PRED) | + (1 << UV_D45_PRED) | (1 << UV_D135_PRED) | (1 << UV_D113_PRED) | + (1 << UV_D157_PRED) | (1 << UV_D203_PRED) | (1 << UV_D67_PRED) | + (1 << UV_SMOOTH_PRED) | (1 << UV_SMOOTH_V_PRED) | + (1 << UV_SMOOTH_H_PRED) | (1 << UV_PAETH_PRED) | (1 << UV_CFL_PRED), + UV_INTRA_DC = (1 << UV_DC_PRED), + UV_INTRA_DC_CFL = (1 << UV_DC_PRED) | (1 << UV_CFL_PRED), + UV_INTRA_DC_TM = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED), + UV_INTRA_DC_PAETH_CFL = + (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) | (1 << UV_CFL_PRED), + UV_INTRA_DC_H_V = (1 << UV_DC_PRED) | (1 << UV_V_PRED) | (1 << UV_H_PRED), + UV_INTRA_DC_H_V_CFL = (1 << UV_DC_PRED) | (1 << UV_V_PRED) | + (1 << UV_H_PRED) | (1 << UV_CFL_PRED), + UV_INTRA_DC_PAETH_H_V = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) | + (1 << UV_V_PRED) | (1 << UV_H_PRED), + UV_INTRA_DC_PAETH_H_V_CFL = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) | + (1 << UV_V_PRED) | (1 << UV_H_PRED) | + (1 << UV_CFL_PRED), + INTRA_DC = (1 << DC_PRED), + INTRA_DC_TM = (1 << DC_PRED) | (1 << PAETH_PRED), + INTRA_DC_H_V = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED), + INTRA_DC_H_V_SMOOTH = + (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED) | (1 << SMOOTH_PRED), + INTRA_DC_PAETH_H_V = + (1 << DC_PRED) | (1 << PAETH_PRED) | (1 << V_PRED) | (1 << H_PRED) +}; + +enum { + INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) | + (1 << NEWMV) | (1 << NEAREST_NEARESTMV) | (1 << NEAR_NEARMV) | + (1 << NEW_NEWMV) | (1 << NEAREST_NEWMV) | (1 << NEAR_NEWMV) | + (1 << NEW_NEARMV) | (1 << NEW_NEARESTMV) | (1 << GLOBAL_GLOBALMV), + INTER_NEAREST_NEAR_ZERO = (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) | + (1 << NEAREST_NEARESTMV) | (1 << GLOBAL_GLOBALMV) | + (1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV) | + (1 << NEW_NEARMV) | (1 << NEAR_NEWMV) | + (1 << NEAR_NEARMV), + INTER_SINGLE_ALL = + (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) | (1 << NEWMV), +}; + +enum { + DISABLE_ALL_INTER_SPLIT = (1 << THR_COMP_GA) | (1 << THR_COMP_LA) | + (1 << THR_ALTR) | (1 << THR_GOLD) | (1 << THR_LAST), + + DISABLE_ALL_SPLIT = (1 << THR_INTRA) | DISABLE_ALL_INTER_SPLIT, + + DISABLE_COMPOUND_SPLIT = (1 << THR_COMP_GA) | (1 << THR_COMP_LA), + + LAST_AND_INTRA_SPLIT_ONLY = (1 << THR_COMP_GA) | (1 << THR_COMP_LA) | + (1 << THR_ALTR) | (1 << THR_GOLD) +}; + +enum { + TXFM_CODING_SF = 1, + INTER_PRED_SF = 2, + INTRA_PRED_SF = 4, + PARTITION_SF = 8, + LOOP_FILTER_SF = 16, + RD_SKIP_SF = 32, + RESERVE_2_SF = 64, + RESERVE_3_SF = 128, +} UENUM1BYTE(DEV_SPEED_FEATURES); + +/* This enumeration defines when the rate control recode loop will be + * enabled. 
+ */
+enum {
+  /*
+   * No recodes allowed
+   */
+  DISALLOW_RECODE = 0,
+  /*
+   * Allow recode only for KF/ARF/GF frames
+   */
+  ALLOW_RECODE_KFARFGF = 1,
+  /*
+   * Allow recode for all frame types based on bitrate constraints.
+   */
+  ALLOW_RECODE = 2,
+} UENUM1BYTE(RECODE_LOOP_TYPE);
+
+enum {
+  SUBPEL_TREE = 0,
+  SUBPEL_TREE_PRUNED = 1,       // Prunes 1/2-pel searches
+  SUBPEL_TREE_PRUNED_MORE = 2,  // Prunes 1/2-pel searches more aggressively
+  SUBPEL_SEARCH_METHODS
+} UENUM1BYTE(SUBPEL_SEARCH_METHOD);
+
+enum {
+  // Try the full image with different values.
+  LPF_PICK_FROM_FULL_IMAGE,
+  // Try the full image filter search with non-dual filter only.
+  LPF_PICK_FROM_FULL_IMAGE_NON_DUAL,
+  // Try a small portion of the image with different values.
+  LPF_PICK_FROM_SUBIMAGE,
+  // Estimate the level based on quantizer and frame type.
+  LPF_PICK_FROM_Q,
+  // Pick 0 to disable LPF if LPF was enabled last frame.
+  LPF_PICK_MINIMAL_LPF
+} UENUM1BYTE(LPF_PICK_METHOD);
+/*!\endcond */
+
+/*!\enum CDEF_PICK_METHOD
+ * \brief This enumeration defines a variety of CDEF pick methods
+ */
+typedef enum {
+  CDEF_FULL_SEARCH,      /**< Full search */
+  CDEF_FAST_SEARCH_LVL1, /**< Search among a subset of all possible filters. */
+  CDEF_FAST_SEARCH_LVL2, /**< Search a smaller subset of filters than
+                              Level 1. */
+  CDEF_FAST_SEARCH_LVL3, /**< Search a smaller subset of secondary filters
+                              than Level 2. */
+  CDEF_FAST_SEARCH_LVL4, /**< Search a smaller subset of filters than
+                              Level 3. */
+  CDEF_FAST_SEARCH_LVL5, /**< Search a smaller subset of filters than
+                              Level 4. */
+  CDEF_PICK_FROM_Q,      /**< Estimate filter strength based on quantizer. */
+  CDEF_PICK_METHODS
+} CDEF_PICK_METHOD;
+
+/*!\cond */
+enum {
+  // Terminate search early based on distortion so far compared to
+  // qp step, distortion in the neighborhood of the frame, etc.
+  FLAG_EARLY_TERMINATE = 1 << 0,
+
+  // Skips comp inter modes if the best so far is an intra mode.
+  FLAG_SKIP_COMP_BESTINTRA = 1 << 1,
+
+  // Skips oblique intra modes if the best so far is an inter mode.
+  FLAG_SKIP_INTRA_BESTINTER = 1 << 3,
+
+  // Skips oblique intra modes at angles 27, 63, 117, 153 if the best
+  // intra so far is not one of the neighboring directions.
+  FLAG_SKIP_INTRA_DIRMISMATCH = 1 << 4,
+
+  // Skips intra modes other than DC_PRED if the source variance is small.
+  FLAG_SKIP_INTRA_LOWVAR = 1 << 5,
+} UENUM1BYTE(MODE_SEARCH_SKIP_LOGIC);
+
+enum {
+  // No tx type pruning.
+  TX_TYPE_PRUNE_0 = 0,
+  // Adaptively prunes the least promising tx types out of all 16
+  // (tuned to provide negligible quality loss).
+  TX_TYPE_PRUNE_1 = 1,
+  // Similar, but applies much more aggressive pruning to get better speed-up.
+  TX_TYPE_PRUNE_2 = 2,
+  TX_TYPE_PRUNE_3 = 3,
+  // More aggressive pruning based on tx type score and allowed tx count.
+  TX_TYPE_PRUNE_4 = 4,
+  TX_TYPE_PRUNE_5 = 5,
+} UENUM1BYTE(TX_TYPE_PRUNE_MODE);
+
+enum {
+  // No reaction to rate control on a detected slide/scene change.
+  NO_DETECTION = 0,
+
+  // Set to larger Q based only on the detected slide/scene change and
+  // current/past Q.
+  FAST_DETECTION_MAXQ = 1,
+} UENUM1BYTE(OVERSHOOT_DETECTION_CBR);
+
+enum {
+  // Turns off multi-winner mode, so txfm search is done either on all modes
+  // (if winner mode is off) or only on the single winner mode.
+  MULTI_WINNER_MODE_OFF = 0,
+
+  // Limits the number of winner modes to at most 2.
+  MULTI_WINNER_MODE_FAST = 1,
+
+  // Uses the default number of winner modes, which is 3 for intra mode, and 1
+  // for inter mode.
+ MULTI_WINNER_MODE_DEFAULT = 2, + + // Maximum number of winner modes allowed. + MULTI_WINNER_MODE_LEVELS, +} UENUM1BYTE(MULTI_WINNER_MODE_TYPE); + +enum { + PRUNE_NEARMV_OFF = 0, // Turn off nearmv pruning + PRUNE_NEARMV_LEVEL1 = 1, // Prune nearmv for qindex (0-85) + PRUNE_NEARMV_LEVEL2 = 2, // Prune nearmv for qindex (0-170) + PRUNE_NEARMV_LEVEL3 = 3, // Prune nearmv more aggressively for qindex (0-170) + PRUNE_NEARMV_MAX = PRUNE_NEARMV_LEVEL3, +} UENUM1BYTE(PRUNE_NEARMV_LEVEL); + +enum { + // Default transform search used in evaluation of best inter candidates + // (MODE_EVAL stage) and motion mode winner processing (WINNER_MODE_EVAL + // stage). + TX_SEARCH_DEFAULT = 0, + // Transform search in motion mode rd during MODE_EVAL stage. + TX_SEARCH_MOTION_MODE, + // Transform search in compound type mode rd during MODE_EVAL stage. + TX_SEARCH_COMP_TYPE_MODE, + // All transform search cases + TX_SEARCH_CASES +} UENUM1BYTE(TX_SEARCH_CASE); + +typedef struct { + TX_TYPE_PRUNE_MODE prune_2d_txfm_mode; + int fast_intra_tx_type_search; + + // INT_MAX: Disable fast search. + // 1 - 1024: Probability threshold used for conditionally forcing tx type, + // during mode search. + // 0: Force tx type to be DCT_DCT unconditionally, during + // mode search. + int fast_inter_tx_type_prob_thresh; + + // Prune less likely chosen transforms for each intra mode. The speed + // feature ranges from 0 to 2, for different speed / compression trade offs. + int use_reduced_intra_txset; + + // Use a skip flag prediction model to detect blocks with skip = 1 early + // and avoid doing full TX type search for such blocks. + int use_skip_flag_prediction; + + // Threshold used by the ML based method to predict TX block split decisions. + int ml_tx_split_thresh; + + // skip remaining transform type search when we found the rdcost of skip is + // better than applying transform + int skip_tx_search; + + // Prune tx type search using previous frame stats. + int prune_tx_type_using_stats; + // Prune tx type search using estimated RDcost + int prune_tx_type_est_rd; + + // Flag used to control the winner mode processing for tx type pruning for + // inter blocks. It enables further tx type mode pruning based on ML model for + // mode evaluation and disables tx type mode pruning for winner mode + // processing. + int winner_mode_tx_type_pruning; +} TX_TYPE_SEARCH; + +enum { + // Search partitions using RD criterion + SEARCH_PARTITION, + + // Always use a fixed size partition + FIXED_PARTITION, + + // Partition using source variance + VAR_BASED_PARTITION, + +#if CONFIG_RT_ML_PARTITIONING + // Partition using ML model + ML_BASED_PARTITION +#endif +} UENUM1BYTE(PARTITION_SEARCH_TYPE); + +enum { + NOT_IN_USE, + DIRECT_PRED, + RELAXED_PRED, + ADAPT_PRED +} UENUM1BYTE(MAX_PART_PRED_MODE); + +enum { + LAST_MV_DATA, + CURRENT_Q, + QTR_ONLY, +} UENUM1BYTE(MV_PREC_LOGIC); + +enum { + SUPERRES_AUTO_ALL, // Tries all possible superres ratios + SUPERRES_AUTO_DUAL, // Tries no superres and q-based superres ratios + SUPERRES_AUTO_SOLO, // Only apply the q-based superres ratio +} UENUM1BYTE(SUPERRES_AUTO_SEARCH_TYPE); +/*!\endcond */ + +/*!\enum INTERNAL_COST_UPDATE_TYPE + * \brief This enum decides internally how often to update the entropy costs + * + * INTERNAL_COST_UPD_TYPE is similar to \ref COST_UPDATE_TYPE but has slightly + * more flexibility in update frequency. 
This enum is separate from \ref + * COST_UPDATE_TYPE because although \ref COST_UPDATE_TYPE is not exposed, its + * values are public so it cannot be modified without breaking public API. + * Due to the use of AOMMIN() in populate_unified_cost_update_freq() to + * compute the unified cost update frequencies (out of COST_UPDATE_TYPE and + * INTERNAL_COST_UPDATE_TYPE), the values of this enum type must be listed in + * the order of increasing frequencies. + * + * \warning In case of any updates/modifications to the enum COST_UPDATE_TYPE, + * update the enum INTERNAL_COST_UPDATE_TYPE as well. + */ +typedef enum { + INTERNAL_COST_UPD_OFF, /*!< Turn off cost updates. */ + INTERNAL_COST_UPD_TILE, /*!< Update every tile. */ + INTERNAL_COST_UPD_SBROW_SET, /*!< Update every row_set of height 256 pixs. */ + INTERNAL_COST_UPD_SBROW, /*!< Update every sb rows inside a tile. */ + INTERNAL_COST_UPD_SB, /*!< Update every sb. */ +} INTERNAL_COST_UPDATE_TYPE; + +/*!\enum SIMPLE_MOTION_SEARCH_PRUNE_LEVEL + * \brief This enumeration defines a variety of simple motion search based + * partition prune levels + */ +typedef enum { + NO_PRUNING = -1, + SIMPLE_AGG_LVL0, /*!< Simple prune aggressiveness level 0. */ + SIMPLE_AGG_LVL1, /*!< Simple prune aggressiveness level 1. */ + SIMPLE_AGG_LVL2, /*!< Simple prune aggressiveness level 2. */ + SIMPLE_AGG_LVL3, /*!< Simple prune aggressiveness level 3. */ + QIDX_BASED_AGG_LVL1, /*!< Qindex based prune aggressiveness level, aggressive + level maps to simple agg level 1 or 2 based on qindex. + */ + TOTAL_SIMPLE_AGG_LVLS = QIDX_BASED_AGG_LVL1, /*!< Total number of simple prune + aggressiveness levels. */ + TOTAL_QINDEX_BASED_AGG_LVLS = + QIDX_BASED_AGG_LVL1 - + SIMPLE_AGG_LVL3, /*!< Total number of qindex based simple prune + aggressiveness levels. */ + TOTAL_AGG_LVLS = TOTAL_SIMPLE_AGG_LVLS + + TOTAL_QINDEX_BASED_AGG_LVLS, /*!< Total number of levels. */ +} SIMPLE_MOTION_SEARCH_PRUNE_LEVEL; + +/*!\enum PRUNE_MESH_SEARCH_LEVEL + * \brief This enumeration defines a variety of mesh search prune levels. + */ +typedef enum { + PRUNE_MESH_SEARCH_DISABLED = 0, /*!< Prune mesh search level 0. */ + PRUNE_MESH_SEARCH_LVL_1 = 1, /*!< Prune mesh search level 1. */ + PRUNE_MESH_SEARCH_LVL_2 = 2, /*!< Prune mesh search level 2. */ +} PRUNE_MESH_SEARCH_LEVEL; + +/*!\enum INTER_SEARCH_EARLY_TERM_IDX + * \brief This enumeration defines inter search early termination index in + * non-rd path based on sse value. + */ +typedef enum { + EARLY_TERM_DISABLED = + 0, /*!< Early terminate inter mode search based on sse disabled. */ + EARLY_TERM_IDX_1 = + 1, /*!< Early terminate inter mode search based on sse, index 1. */ + EARLY_TERM_IDX_2 = + 2, /*!< Early terminate inter mode search based on sse, index 2. */ + EARLY_TERM_IDX_3 = + 3, /*!< Early terminate inter mode search based on sse, index 3. */ + EARLY_TERM_IDX_4 = + 4, /*!< Early terminate inter mode search based on sse, index 4. */ + EARLY_TERM_INDICES, /*!< Total number of early terminate indices */ +} INTER_SEARCH_EARLY_TERM_IDX; + +/*! + * \brief Sequence/frame level speed vs quality features + */ +typedef struct HIGH_LEVEL_SPEED_FEATURES { + /*! Frame level coding parameter update. */ + int frame_parameter_update; + + /*! + * Cases and frame types for which the recode loop is enabled. + */ + RECODE_LOOP_TYPE recode_loop; + + /*! + * Controls the tolerance vs target rate used in deciding whether to + * recode a frame. It has no meaning if recode is disabled. + */ + int recode_tolerance; + + /*! 
+   * Determine how motion vector precision is chosen. The possibilities are:
+   * LAST_MV_DATA: use the mv data from the last coded frame
+   * CURRENT_Q: use the current q as a threshold
+   * QTR_ONLY: use quarter pel precision only.
+   */
+  MV_PREC_LOGIC high_precision_mv_usage;
+
+  /*!
+   * Always set to 0. If on, it enables zero-cost background transmission
+   * (except for the initial transmission of the segmentation). The feature is
+   * disabled because the addition of very large block sizes makes the
+   * background very cheap to encode, and the segmentation we have adds
+   * overhead.
+   */
+  int static_segmentation;
+
+  /*!
+   * Superres-auto mode search type.
+   */
+  SUPERRES_AUTO_SEARCH_TYPE superres_auto_search_type;
+
+  /*!
+   * Enable/disable extra screen content test by encoding key frame twice.
+   */
+  int disable_extra_sc_testing;
+
+  /*!
+   * Enable/disable second_alt_ref temporal filtering.
+   */
+  int second_alt_ref_filtering;
+
+  /*!
+   * The number of frames to be used during temporal filtering of an ARF frame
+   * is adjusted based on the noise level of the current frame. The sf has
+   * three levels to decide the number of frames to be considered for
+   * filtering:
+   * 0 : Use the default number of frames
+   * 1 and 2 : Reduce the number of frames based on noise level with varied
+   * aggressiveness
+   */
+  int adjust_num_frames_for_arf_filtering;
+
+  /*!
+   * Decide the bit estimation approach used in qindex decision.
+   * 0: estimate bits based on a constant value;
+   * 1: estimate bits more accurately based on the frame complexity.
+   */
+  int accurate_bit_estimate;
+
+  /*!
+   * Decide the approach for weight calculation during temporal filtering.
+   * 0: Calculate weight using exp()
+   * 1: Calculate weight using a lookup table that approximates exp().
+   */
+  int weight_calc_level_in_tf;
+
+  /*!
+   * Decide whether to perform motion estimation at split block (i.e. 16x16)
+   * level or not.
+   * 0: Always allow motion estimation.
+   * 1: Conditionally allow motion estimation based on 4x4 sub-block variance.
+   */
+  int allow_sub_blk_me_in_tf;
+} HIGH_LEVEL_SPEED_FEATURES;
+
+/*!
+ * Speed features for the first pass.
+ */
+typedef struct FIRST_PASS_SPEED_FEATURES {
+  /*!
+   * \brief Reduces the mv search window.
+   * By default, the initial search window is around
+   * MIN(MIN(dims), MAX_FULL_PEL_VAL) = MIN(MIN(dims), 1023).
+   * Each step of reduction decreases the window size by about a factor of 2.
+   */
+  int reduce_mv_step_param;
+
+  /*!
+   * \brief Skips the motion search when the zero mv has small sse.
+   */
+  int skip_motion_search_threshold;
+
+  /*!
+   * \brief Skips reconstruction by using source buffers for prediction.
+   */
+  int disable_recon;
+
+  /*!
+   * \brief Skips the motion search centered on the 0,0 mv.
+   */
+  int skip_zeromv_motion_search;
+} FIRST_PASS_SPEED_FEATURES;
+
+/*!\cond */
+typedef struct TPL_SPEED_FEATURES {
+  // GOP length adaptive decision.
+  // If set to 0, the tpl model decides whether a shorter gf interval is
+  // better.
+  // If set to 1, tpl stats of ARFs from the base layer, (base+1) layer and
+  // (base+2) layer decide whether a shorter gf interval is better.
+  // If set to 2, tpl stats of ARFs from the base layer, (base+1) layer and
+  // GF boost decide whether a shorter gf interval is better.
+  // If set to 3, gop length adaptive decision is disabled.
+  int gop_length_decision_method;
+
+  // Prune the intra mode search by tpl.
+  // If set to 0, we will search all intra modes from DC_PRED to PAETH_PRED.
+  // If set to 1, we only search DC_PRED, V_PRED, and H_PRED.
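+  // (With prune_intra_modes = 1 the tpl pass thus evaluates 3 of the 13 luma
+  // intra modes; tpl only needs a coarse intra cost for its propagation
+  // model, so the cheap DC/V/H subset is usually adequate.)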
+ int prune_intra_modes; + // This parameter controls which step in the n-step process we start at. + int reduce_first_step_size; + // Skip motion estimation based on the precision of center MVs and the + // difference between center MVs. + // If set to 0, motion estimation is skipped for duplicate center MVs + // (default). If set to 1, motion estimation is skipped for duplicate + // full-pixel center MVs. If set to 2, motion estimation is skipped if the + // difference between center MVs is less than the threshold. + int skip_alike_starting_mv; + + // When to stop subpel search. + SUBPEL_FORCE_STOP subpel_force_stop; + + // Which search method to use. + SEARCH_METHODS search_method; + + // Prune starting mvs in TPL based on sad scores. + int prune_starting_mv; + + // Prune reference frames in TPL. + int prune_ref_frames_in_tpl; + + // Support compound predictions. + int allow_compound_pred; + + // Calculate rate and distortion based on Y plane only. + int use_y_only_rate_distortion; + + // Use SAD instead of SATD during intra/inter mode search. + // If set to 0, use SATD always. + // If set to 1, use SAD during intra/inter mode search for frames in the + // higher temporal layers of the hierarchical prediction structure. + // If set to 2, use SAD during intra/inter mode search for all frames. + // This sf is disabled for the first GF group of the key-frame interval, + // i.e., SATD is used during intra/inter mode search of the first GF group. + int use_sad_for_mode_decision; + + // Skip tpl processing for frames of type LF_UPDATE. + // This sf is disabled for the first GF group of the key-frame interval. + int reduce_num_frames; +} TPL_SPEED_FEATURES; + +typedef struct GLOBAL_MOTION_SPEED_FEATURES { + GM_SEARCH_TYPE gm_search_type; + + // During global motion estimation, prune remaining reference frames in a + // given direction(past/future), if the evaluated ref_frame in that direction + // yields gm_type as INVALID/TRANSLATION/IDENTITY + int prune_ref_frame_for_gm_search; + + // When the current GM type is set to ZEROMV, prune ZEROMV if its performance + // is worse than NEWMV under SSE metric. + // 0 : no pruning + // 1 : conservative pruning + // 2 : aggressive pruning + int prune_zero_mv_with_sse; + + // Disable global motion estimation based on stats of previous frames in the + // GF group + int disable_gm_search_based_on_stats; + + // Number of refinement steps to apply after initial model generation + int num_refinement_steps; +} GLOBAL_MOTION_SPEED_FEATURES; + +typedef struct PARTITION_SPEED_FEATURES { + PARTITION_SEARCH_TYPE partition_search_type; + + // Used if partition_search_type = FIXED_PARTITION + BLOCK_SIZE fixed_partition_size; + + // Prune extended partition types search based on the current best partition + // and the combined rdcost of the subblocks estimated from previous + // partitions. Can take values 0 - 2, 0 referring to no pruning, and 1 - 2 + // increasing aggressiveness of pruning in order. + int prune_ext_partition_types_search_level; + + // Prune part4 based on block size + int prune_part4_search; + + // Use a ML model to prune rectangular, ab and 4-way horz + // and vert partitions + int ml_prune_partition; + + // Use a ML model to adaptively terminate partition search after trying + // PARTITION_SPLIT. Can take values 0 - 2, 0 meaning not being enabled, and + // 1 - 2 increasing aggressiveness in order. 
+  int ml_early_term_after_part_split_level;
+
+  // Skip the rectangular partition test when partition type none gives better
+  // rd than partition type split. Can take values 0 - 2, 0 referring to no
+  // skipping, and 1 - 2 increasing aggressiveness of skipping in order.
+  int less_rectangular_check_level;
+
+  // Use square partition only beyond this block size.
+  BLOCK_SIZE use_square_partition_only_threshold;
+
+  // Sets max square partition levels for this superblock based on the
+  // motion vector and prediction error distribution produced from a 16x16
+  // simple motion search.
+  MAX_PART_PRED_MODE auto_max_partition_based_on_simple_motion;
+
+  // Min and max square partition sizes we enable (block_size) as per auto
+  // min-max; also used by adjust partitioning and pick partitioning.
+  BLOCK_SIZE default_min_partition_size;
+  BLOCK_SIZE default_max_partition_size;
+
+  // Sets the level of adjustment of variance-based partitioning during
+  // rd_use_partition: 0 - no partition adjustment, 1 - try to merge
+  // partitions for small blocks and high QP, 2 - try to merge partitions,
+  // 3 - try to merge and split leaf partitions, with aggressiveness
+  // increasing from 0 to 3.
+  int adjust_var_based_rd_partitioning;
+
+  // Partition search early breakout thresholds.
+  int64_t partition_search_breakout_dist_thr;
+  int partition_search_breakout_rate_thr;
+
+  // Thresholds for ML based partition search breakout.
+  int ml_partition_search_breakout_thresh[PARTITION_BLOCK_SIZES];
+
+  // Aggressiveness levels for pruning split and rectangular partitions based
+  // on simple_motion_search. SIMPLE_AGG_LVL0 to SIMPLE_AGG_LVL3 correspond to
+  // simple motion search based pruning. QIDX_BASED_AGG_LVL1 corresponds to
+  // qindex based and simple motion search based pruning.
+  int simple_motion_search_prune_agg;
+
+  // Perform simple_motion_search on each possible subblock and use it to
+  // prune PARTITION_HORZ and PARTITION_VERT.
+  int simple_motion_search_prune_rect;
+
+  // Perform simple motion search before none_partition to decide if we
+  // want to remove all partitions other than PARTITION_SPLIT. If set to 0,
+  // this model is disabled. If set to 1, the model attempts to perform
+  // PARTITION_SPLIT only. If set to 2, the model also attempts to prune
+  // PARTITION_SPLIT.
+  int simple_motion_search_split;
+
+  // Use features from simple_motion_search to terminate prediction block
+  // partition after PARTITION_NONE.
+  int simple_motion_search_early_term_none;
+
+  // Controls whether to reduce the number of motion search steps. If this is
+  // 0, then simple_motion_search has the same number of steps as
+  // single_motion_search (assuming no other speed features). Otherwise,
+  // reduce the number of steps by the value contained in this variable.
+  int simple_motion_search_reduce_search_steps;
+
+  // This variable controls the maximum block size where intra blocks can be
+  // used in inter frames.
+  // TODO(aconverse): Fold this into one of the other many mode skips
+  BLOCK_SIZE max_intra_bsize;
+
+  // Use a CNN with luma pixels of the source frame on each 64x64 subblock to
+  // perform partition pruning in intra frames.
+  // 0: No Pruning
+  // 1: Prune split and rectangular partitions only
+  // 2: Prune none, split and rectangular partitions
+  int intra_cnn_based_part_prune_level;
+
+  // Evaluate extended partitions only if the current bsize is greater than
+  // this threshold; setting it to BLOCK_128X128 therefore disables extended
+  // partition search entirely. Must be a square block size BLOCK_8X8 or
+  // higher.
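+  // (For example, av1_set_speed_features_qindex_dependent() sets this to
+  // BLOCK_128X128 at low quantizers, which turns extended partition search
+  // off for every block size.)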
+  BLOCK_SIZE ext_partition_eval_thresh;
+
+  // Use best partition decision so far to tune 'ext_partition_eval_thresh'
+  int ext_part_eval_based_on_cur_best;
+
+  // Disable rectangular partitions for larger block sizes.
+  int rect_partition_eval_thresh;
+
+  // Prune extended partition search based on whether the split/rect
+  // partitions provided an improvement in the previous search.
+  // 0 : no pruning
+  // 1 : prune 1:4 partition search using winner info from split partitions
+  // 2 : prune 1:4 and AB partition search using split and HORZ/VERT info
+  int prune_ext_part_using_split_info;
+
+  // Prune rectangular, AB and 4-way partitions based on q index and block
+  // size
+  // 0 : no pruning
+  // 1 : prune sub_8x8 at very low quantizers
+  // 2 : prune all block sizes based on qindex
+  int prune_rectangular_split_based_on_qidx;
+
+  // Prune rectangular partitions based on 4x4 sub-block variance
+  // false : no pruning
+  // true : prune rectangular partitions based on 4x4 sub-block variance
+  // deviation
+  //
+  // For allintra encode, this speed feature reduces instruction count by 6.4%
+  // for speed=6 with coding performance change less than 0.24%. For AVIF
+  // image encode, this speed feature reduces encode time by 8.14% for speed 6
+  // on a typical image dataset with coding performance change less than
+  // 0.16%. This speed feature is not applicable to speed >= 7.
+  bool prune_rect_part_using_4x4_var_deviation;
+
+  // Prune rectangular partitions based on the prediction mode chosen by the
+  // NONE partition.
+  // false : no pruning
+  // true : prunes rectangular partitions as described below
+  // If the prediction mode chosen by the NONE partition is
+  // DC_PRED or SMOOTH_PRED: Prunes both horizontal and vertical partitions if
+  // at least one of the left and top neighbor blocks is larger than the
+  // current block.
+  // Directional Mode: Prunes either the horizontal or the vertical partition
+  // based on the center angle of the prediction mode chosen by the NONE
+  // partition. For example, the vertical partition is pruned if the center
+  // angle of that mode is close to 180 degrees (i.e. horizontal direction)
+  // and vice versa.
+  // For allintra encode, this speed feature reduces instruction count by 5.1%
+  // for speed=6 with coding performance change less than 0.22%. For AVIF
+  // image encode, this speed feature reduces encode time by 4.44% for speed 6
+  // on a typical image dataset with coding performance change less than
+  // 0.15%.
+  // For speed >= 7, variance-based logic is used to determine the partition
+  // structure instead of recursive partition search. Therefore, this speed
+  // feature is not applicable in such cases.
+  bool prune_rect_part_using_none_pred_mode;
+
+  // Terminate partition search for the child partition when NONE and SPLIT
+  // partition rd_costs are INT64_MAX.
+  int early_term_after_none_split;
+
+  // Level used to adjust the threshold for av1_ml_predict_breakout(). At
+  // lower levels, a more conservative threshold is used, and a value of 0
+  // indicates av1_ml_predict_breakout() is disabled. A value of 3 corresponds
+  // to the default case with no adjustment to lbd thresholds.
+  int ml_predict_breakout_level;
+
+  // Prune sub_8x8 (BLOCK_4X4, BLOCK_4X8 and BLOCK_8X4) partitions.
+  // 0 : no pruning
+  // 1 : pruning based on neighbour block information
+  // 2 : prune always
+  int prune_sub_8x8_partition_level;
+
+  // Prune rectangular split based on simple motion search split/no_split
+  // score.
+ // 0: disable pruning, 1: enable pruning + int simple_motion_search_rect_split; + + // The current encoder adopts a DFS search for block partitions. + // Therefore the mode selection and associated rdcost is ready for smaller + // blocks before the mode selection for some partition types. + // AB partition could use previous rd information and skip mode search. + // An example is: + // + // current block + // +---+---+ + // | | + // + + + // | | + // +-------+ + // + // SPLIT partition has been searched first before trying HORZ_A + // +---+---+ + // | R | R | + // +---+---+ + // | R | R | + // +---+---+ + // + // HORZ_A + // +---+---+ + // | | | + // +---+---+ + // | | + // +-------+ + // + // With this speed feature, the top two sub blocks can directly use rdcost + // searched in split partition, and the mode info is also copied from + // saved info. Similarly, the bottom rectangular block can also use + // the available information from previous rectangular search. + int reuse_prev_rd_results_for_part_ab; + + // Reuse the best prediction modes found in PARTITION_SPLIT and PARTITION_RECT + // when encoding PARTITION_AB. + int reuse_best_prediction_for_part_ab; + + // The current partition search records the best rdcost so far and uses it + // in mode search and transform search to early skip when some criteria is + // met. For example, when the current rdcost is larger than the best rdcost, + // or the model rdcost is larger than the best rdcost times some thresholds. + // By default, this feature is turned on to speed up the encoder partition + // search. + // If disabling it, at speed 0, 30 frames, we could get + // about -0.25% quality gain (psnr, ssim, vmaf), with about 13% slowdown. + int use_best_rd_for_pruning; + + // Skip evaluation of non-square partitions based on the corresponding NONE + // partition. + // 0: no pruning + // 1: prune extended partitions if NONE is skippable + // 2: on top of 1, prune rectangular partitions if NONE is inter, not a newmv + // mode and skippable + int skip_non_sq_part_based_on_none; + + // Disables 8x8 and below partitions for low quantizers. + int disable_8x8_part_based_on_qidx; +} PARTITION_SPEED_FEATURES; + +typedef struct MV_SPEED_FEATURES { + // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc). + SEARCH_METHODS search_method; + + // Enable the use of faster, less accurate mv search method + // 0: disable, 1: if bsize >= BLOCK_32X32, 2: based on bsize, SAD and qp + // TODO(chiyotsai@google.com): Take the clip's resolution and mv activity into + // account. + int use_bsize_dependent_search_method; + + // If this is set to 1, we limit the motion search range to 2 times the + // largest motion vector found in the last frame. + int auto_mv_step_size; + + // Subpel_search_method can only be subpel_tree which does a subpixel + // logarithmic search that keeps stepping at 1/2 pixel units until + // you stop getting a gain, and then goes on to 1/4 and repeats + // the same process. Along the way it skips many diagonals. + SUBPEL_SEARCH_METHOD subpel_search_method; + + // Maximum number of steps in logarithmic subpel search before giving up. + int subpel_iters_per_step; + + // When to stop subpel search. + SUBPEL_FORCE_STOP subpel_force_stop; + + // When to stop subpel search in simple motion search. + SUBPEL_FORCE_STOP simple_motion_subpel_force_stop; + + // If true, sub-pixel search uses the exact convolve function used for final + // encoding and decoding; otherwise, it uses bilinear interpolation. 
+ SUBPEL_SEARCH_TYPE use_accurate_subpel_search; + + // Threshold for allowing exhaustive motion search. + int exhaustive_searches_thresh; + + // Pattern to be used for any exhaustive mesh searches (except intraBC ME). + MESH_PATTERN mesh_patterns[MAX_MESH_STEP]; + + // Pattern to be used for exhaustive mesh searches of intraBC ME. + MESH_PATTERN intrabc_mesh_patterns[MAX_MESH_STEP]; + + // Reduce single motion search range based on MV result of prior ref_mv_idx. + int reduce_search_range; + + // Prune mesh search. + PRUNE_MESH_SEARCH_LEVEL prune_mesh_search; + + // Use the rd cost around the best FULLPEL_MV to speed up subpel search + int use_fullpel_costlist; + + // Set the full pixel search level of obmc + // 0: obmc_full_pixel_diamond + // 1: obmc_refining_search_sad (faster) + int obmc_full_pixel_search_level; + + // Accurate full pixel motion search based on TPL stats. + int full_pixel_search_level; + + // Allow intrabc motion search + int use_intrabc; + + // Whether to downsample the rows in sad calculation during motion search. + // This is only active when there are at least 16 rows. When this sf is + // active, if there is a large discrepancy in the SAD values for the final + // motion vector between skipping vs not skipping, motion search is redone + // with skip row features off. + // 0: Disabled (do not downsample rows) + // 1: Skip SAD calculation of odd rows if the SAD deviation of the even and + // odd rows for the starting MV is small. Redo motion search with sf off + // when SAD deviation is high for the final motion vector. + // 2: Skip SAD calculation of odd rows. SAD deviation is not tested for the + // start MV and tested only for the final MV. + int use_downsampled_sad; + + // Enable/disable extensive joint motion search. + int disable_extensive_joint_motion_search; + + // Enable second best mv check in joint mv search. + // 0: allow second MV (use rd cost as the metric) + // 1: use var as the metric + // 2: disable second MV + int disable_second_mv; + + // Skips full pixel search based on start mv of prior ref_mv_idx. + // 0: Disabled + // 1: Skips the full pixel search upto 4 neighbor full-pel MV positions. + // 2: Skips the full pixel search upto 8 neighbor full-pel MV positions. + int skip_fullpel_search_using_startmv; + + // Method to use for refining WARPED_CAUSAL motion vectors + // TODO(rachelbarker): Can this be unified with OBMC in some way? + WARP_SEARCH_METHOD warp_search_method; + + // Maximum number of iterations in WARPED_CAUSAL refinement search + int warp_search_iters; +} MV_SPEED_FEATURES; + +typedef struct INTER_MODE_SPEED_FEATURES { + // 2-pass inter mode model estimation where the preliminary pass skips + // transform search and uses a model to estimate rd, while the final pass + // computes the full transform search. Two types of models are supported: + // 0: not used + // 1: used with online dynamic rd model + // 2: used with static rd model + int inter_mode_rd_model_estimation; + + // Bypass transform search based on skip rd at following stages + // i. Compound type mode search + // ii. Motion mode search (mode evaluation and winner motion mode stage) + // iii. Transform search for best inter candidates + int txfm_rd_gate_level[TX_SEARCH_CASES]; + + // Limit the inter mode tested in the RD loop + int reduce_inter_modes; + + // This variable is used to cap the maximum number of times we skip testing a + // mode to be evaluated. A high value means we will be faster. 
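+  // A hedged sketch of the capping idea (identifier names here are
+  // illustrative, not necessarily the actual implementation): a per-mode
+  // factor grows each time the mode is skipped, inflating its rd threshold,
+  // and adaptive_rd_thresh bounds that growth, e.g.
+  //   fact = AOMMIN(fact + inc, adaptive_rd_thresh * max_fact);
+  // so a larger value lets modes stay skipped for longer before a retest.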
+ int adaptive_rd_thresh; + + // Aggressively prune inter modes when best mode is skippable. + int prune_inter_modes_if_skippable; + + // Drop less likely to be picked reference frames in the RD search. + // Has seven levels for now: 0, 1, 2, 3, 4, 5 and 6 where higher levels prune + // more aggressively than lower ones. (0 means no pruning). + int selective_ref_frame; + + // Prune reference frames for rectangular partitions. + // 0 implies no pruning + // 1 implies prune for extended partition + // 2 implies prune horiz, vert and extended partition + int prune_ref_frame_for_rect_partitions; + + // Prune inter modes w.r.t past reference frames + // 0 no pruning + // 1 prune inter modes w.r.t ALTREF2 and ALTREF reference frames + // 2 prune inter modes w.r.t BWDREF, ALTREF2 and ALTREF reference frames + int alt_ref_search_fp; + + // Prune reference frames for single prediction modes based on temporal + // distance and pred MV SAD. Feasible values are 0, 1, 2. The feature is + // disabled for 0. An increasing value indicates more aggressive pruning + // threshold. + int prune_single_ref; + + // Prune compound reference frames + // 0 no pruning + // 1 prune compound references which do not satisfy the two conditions: + // a) The references are at a nearest distance from the current frame in + // both past and future direction. + // b) The references have minimum pred_mv_sad in both past and future + // direction. + // 2 prune compound references except the one with nearest distance from the + // current frame in both past and future direction. + int prune_comp_ref_frames; + + // Skip the current ref_mv in NEW_MV mode based on mv, rate cost, etc. + // This speed feature equaling 0 means no skipping. + // If the speed feature equals 1 or 2, skip the current ref_mv in NEW_MV mode + // if we have already encountered ref_mv in the drl such that: + // 1. The other drl has the same mv during the SIMPLE_TRANSLATION search + // process as the current mv. + // 2. The rate needed to encode the current mv is larger than that for the + // other ref_mv. + // The speed feature equaling 1 means using subpel mv in the comparison. + // The speed feature equaling 2 means using fullpel mv in the comparison. + // If the speed feature >= 3, skip the current ref_mv in NEW_MV mode based on + // known full_mv bestsme and drl cost. + int skip_newmv_in_drl; + + // This speed feature checks duplicate ref MVs among NEARESTMV, NEARMV, + // GLOBALMV and skips NEARMV or GLOBALMV (in order) if a duplicate is found + // TODO(any): Instead of skipping repeated ref mv, use the recalculated + // rd-cost based on mode rate and skip the mode evaluation + int skip_repeated_ref_mv; + + // Flag used to control the ref_best_rd based gating for chroma + int perform_best_rd_based_gating_for_chroma; + + // Reuse the inter_intra_mode search result from NEARESTMV mode to other + // single ref modes + int reuse_inter_intra_mode; + + // prune wedge and compound segment approximate rd evaluation based on + // compound average modeled rd + int prune_comp_type_by_model_rd; + + // prune wedge and compound segment approximate rd evaluation based on + // compound average rd/ref_best_rd + int prune_comp_type_by_comp_avg; + + // Skip some ref frames in compound motion search by single motion search + // result. Has three levels for now: 0 referring to no skipping, and 1 - 3 + // increasing aggressiveness of skipping in order. + // Note: The search order might affect the result. 
It assumes that the single + // reference modes are searched before compound modes. It is better to search + // same single inter mode as a group. + int prune_comp_search_by_single_result; + + // Instead of performing a full MV search, do a simple translation first + // and only perform a full MV search on the motion vectors that performed + // well. + int prune_mode_search_simple_translation; + + // Only search compound modes with at least one "good" reference frame. + // A reference frame is good if, after looking at its performance among + // the single reference modes, it is one of the two best performers. + int prune_compound_using_single_ref; + + // Skip extended compound mode (NEAREST_NEWMV, NEW_NEARESTMV, NEAR_NEWMV, + // NEW_NEARMV) using ref frames of above and left neighbor + // blocks. + // 0 : no pruning + // 1 : prune ext compound modes using neighbor blocks (less aggressiveness) + // 2 : prune ext compound modes using neighbor blocks (high aggressiveness) + // 3 : prune ext compound modes unconditionally (highest aggressiveness) + int prune_ext_comp_using_neighbors; + + // Skip NEW_NEARMV and NEAR_NEWMV extended compound modes + int skip_ext_comp_nearmv_mode; + + // Skip extended compound mode when ref frame corresponding to NEWMV does not + // have NEWMV as single mode winner. + // 0 : no pruning + // 1 : prune extended compound mode (less aggressiveness) + // 2 : prune extended compound mode (high aggressiveness) + int prune_comp_using_best_single_mode_ref; + + // Skip NEARESTMV and NEARMV using weight computed in ref mv list population + // + // Pruning is enabled only when both the top and left neighbor blocks are + // available and when the current block already has a valid inter prediction. + int prune_nearest_near_mv_using_refmv_weight; + + // Based on previous ref_mv_idx search result, prune the following search. + int prune_ref_mv_idx_search; + + // Disable one sided compound modes. + int disable_onesided_comp; + + // Prune obmc search using previous frame stats. + // INT_MAX : disable obmc search + int prune_obmc_prob_thresh; + + // Prune warped motion search using previous frame stats. + int prune_warped_prob_thresh; + + // Variance threshold to enable/disable Interintra wedge search + unsigned int disable_interintra_wedge_var_thresh; + + // Variance threshold to enable/disable Interinter wedge search + unsigned int disable_interinter_wedge_var_thresh; + + // De-couple wedge and mode search during interintra RDO. + int fast_interintra_wedge_search; + + // Whether fast wedge sign estimate is used + int fast_wedge_sign_estimate; + + // Enable/disable ME for interinter wedge search. + int disable_interinter_wedge_newmv_search; + + // Decide when and how to use joint_comp. + DIST_WTD_COMP_FLAG use_dist_wtd_comp_flag; + + // Clip the frequency of updating the mv cost. + INTERNAL_COST_UPDATE_TYPE mv_cost_upd_level; + + // Clip the frequency of updating the coeff cost. + INTERNAL_COST_UPDATE_TYPE coeff_cost_upd_level; + + // Clip the frequency of updating the mode cost. + INTERNAL_COST_UPDATE_TYPE mode_cost_upd_level; + + // Prune inter modes based on tpl stats + // 0 : no pruning + // 1 - 3 indicate increasing aggressiveness in order. + int prune_inter_modes_based_on_tpl; + + // Skip NEARMV and NEAR_NEARMV modes using ref frames of above and left + // neighbor blocks and qindex. 
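+  // (See the PRUNE_NEARMV_LEVEL enum above: level 1 applies the pruning for
+  // qindex 0-85, levels 2 and 3 extend it to qindex 0-170, with level 3
+  // pruning the most aggressively.)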
+ PRUNE_NEARMV_LEVEL prune_nearmv_using_neighbors; + + // Model based breakout after interpolation filter search + // 0: no breakout + // 1: use model based rd breakout + int model_based_post_interp_filter_breakout; + + // Reuse compound type rd decision when exact match is found + // 0: No reuse + // 1: Reuse the compound type decision + int reuse_compound_type_decision; + + // Enable/disable masked compound. + int disable_masked_comp; + + // Enable/disable MV refinement for compound modes corresponds to compound + // types COMPOUND_AVERAGE, COMPOUND_DISTWTD (currently, this compound type + // is disabled for speeds >= 2 using the sf 'use_dist_wtd_comp_flag') and + // COMPOUND_DIFFWTD based on the availability. Levels 0 to 3 indicate + // increasing order of aggressiveness to disable MV refinement. + // 0: MV Refinement is enabled and for NEW_NEWMV mode used two iterations of + // refinement in av1_joint_motion_search(). + // 1: MV Refinement is disabled for COMPOUND_DIFFWTD and enabled for + // COMPOUND_AVERAGE & COMPOUND_DISTWTD. + // 2: MV Refinement is enabled for COMPOUND_AVERAGE & COMPOUND_DISTWTD for + // NEW_NEWMV mode with one iteration of refinement in + // av1_joint_motion_search() and MV Refinement is disabled for other compound + // type modes. + // 3: MV Refinement is disabled. + int enable_fast_compound_mode_search; + + // Reuse masked compound type search results + int reuse_mask_search_results; + + // Enable/disable fast search for wedge masks + int enable_fast_wedge_mask_search; + + // Early breakout from transform search of inter modes + int inter_mode_txfm_breakout; + + // Limit number of inter modes for txfm search if a newmv mode gets + // evaluated among the top modes. + // 0: no pruning + // 1 to 3 indicate increasing order of aggressiveness + int limit_inter_mode_cands; + + // Cap the no. of txfm searches for a given prediction mode. + // 0: no cap, 1: cap beyond first 4 searches, 2: cap beyond first 3 searches. + int limit_txfm_eval_per_mode; + + // Prune warped motion search based on block size. + int extra_prune_warped; + + // Do not search compound modes for ARF. + // The intuition is that ARF is predicted by frames far away from it, + // whose temporal correlations with the ARF are likely low. + // It is therefore likely that compound modes do not work as well for ARF + // as other inter frames. + // Speed/quality impact: + // Speed 1: 12% faster, 0.1% psnr loss. + // Speed 2: 2% faster, 0.05% psnr loss. + // No change for speed 3 and up, because |disable_onesided_comp| is true. + int skip_arf_compound; +} INTER_MODE_SPEED_FEATURES; + +typedef struct INTERP_FILTER_SPEED_FEATURES { + // Do limited interpolation filter search for dual filters, since best choice + // usually includes EIGHTTAP_REGULAR. + int use_fast_interpolation_filter_search; + + // Disable dual filter + int disable_dual_filter; + + // Save results of av1_interpolation_filter_search for a block + // Check mv and ref_frames before search, if they are very close with previous + // saved results, filter search can be skipped. + int use_interp_filter; + + // skip sharp_filter evaluation based on regular and smooth filter rd for + // dual_filter=0 case + int skip_sharp_interp_filter_search; + + // skip interpolation filter search for a block in chessboard pattern + int cb_pred_filter_search; + + // adaptive interp_filter search to allow skip of certain filter types. 
+ int adaptive_interp_filter_search; + + // Forces interpolation filter to EIGHTTAP_REGULAR and skips interpolation + // filter search. + int skip_interp_filter_search; +} INTERP_FILTER_SPEED_FEATURES; + +typedef struct INTRA_MODE_SPEED_FEATURES { + // These bit masks allow you to enable or disable intra modes for each + // transform size separately. + int intra_y_mode_mask[TX_SIZES]; + int intra_uv_mode_mask[TX_SIZES]; + + // flag to allow skipping intra mode for inter frame prediction + int skip_intra_in_interframe; + + // Prune intra mode candidates based on source block histogram of gradient. + // Applies to luma plane only. + // Feasible values are 0..4. The feature is disabled for 0. An increasing + // value indicates more aggressive pruning threshold. + int intra_pruning_with_hog; + + // Prune intra mode candidates based on source block histogram of gradient. + // Applies to chroma plane only. + // Feasible values are 0..4. The feature is disabled for 0. An increasing + // value indicates more aggressive pruning threshold. + int chroma_intra_pruning_with_hog; + + // Enable/disable smooth intra modes. + int disable_smooth_intra; + + // Prune UV_SMOOTH_PRED mode for chroma based on chroma source variance. + // false : No pruning + // true : Prune UV_SMOOTH_PRED mode based on chroma source variance + // + // For allintra encode, this speed feature reduces instruction count + // by 1.90%, 2.21% and 1.97% for speed 6, 7 and 8 with coding performance + // change less than 0.04%. For AVIF image encode, this speed feature reduces + // encode time by 1.56%, 2.14% and 0.90% for speed 6, 7 and 8 on a typical + // image dataset with coding performance change less than 0.05%. + bool prune_smooth_intra_mode_for_chroma; + + // Prune filter intra modes in intra frames. + // 0 : No pruning + // 1 : Evaluate applicable filter intra modes based on best intra mode so far + // 2 : Do not evaluate filter intra modes + int prune_filter_intra_level; + + // prune palette search + // 0: No pruning + // 1: Perform coarse search to prune the palette colors. For winner colors, + // neighbors are also evaluated using a finer search. + // 2: Perform 2 way palette search from max colors to min colors (and min + // colors to remaining colors) and terminate the search if current number of + // palette colors is not the winner. + int prune_palette_search_level; + + // Terminate early in luma palette_size search. Speed feature values indicate + // increasing level of pruning. + // 0: No early termination + // 1: Terminate early for higher luma palette_size, if header rd cost of lower + // palette_size is more than 2 * best_rd. This level of pruning is more + // conservative when compared to sf level 2 as the cases which will get pruned + // with sf level 1 is a subset of the cases which will get pruned with sf + // level 2. + // 2: Terminate early for higher luma palette_size, if header rd cost of lower + // palette_size is more than best_rd. + // For allintra encode, this sf reduces instruction count by 2.49%, 1.07%, + // 2.76%, 2.30%, 1.84%, 2.69%, 2.04%, 2.05% and 1.44% for speed 0, 1, 2, 3, 4, + // 5, 6, 7 and 8 on screen content set with coding performance change less + // than 0.01% for speed <= 2 and less than 0.03% for speed >= 3. For AVIF + // image encode, this sf reduces instruction count by 1.94%, 1.13%, 1.29%, + // 0.93%, 0.89%, 1.03%, 1.07%, 1.20% and 0.18% for speed 0, 1, 2, 3, 4, 5, 6, + // 7 and 8 on a typical image dataset with coding performance change less than + // 0.01%. 
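+  // In sketch form (hypothetical helper name): the luma palette_size search
+  // walks from smaller to larger sizes and, at level 1, stops once
+  //   palette_header_rd(smaller_size) > 2 * best_rd,
+  // while level 2 tightens the factor to 1 and therefore terminates earlier.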
+  int prune_luma_palette_size_search_level;
+
+  // Prune chroma intra modes based on the luma intra mode winner.
+  // 0: No pruning
+  // 1: Prune chroma intra modes other than UV_DC_PRED, UV_SMOOTH_PRED,
+  // UV_CFL_PRED and the mode that corresponds to the luma intra mode winner.
+  int prune_chroma_modes_using_luma_winner;
+
+  // Clip the frequency of updating the mv cost for intrabc.
+  INTERNAL_COST_UPDATE_TYPE dv_cost_upd_level;
+
+  // We use a DCT_DCT transform followed by computing SATD (Sum of Absolute
+  // Transformed Differences) as an estimate of the RD score to quickly find
+  // the best possible Chroma from Luma (CFL) parameter. Then we do a full RD
+  // search near the best possible parameter. The search range is set here.
+  // The range of cfl_search_range should be [1, 33], and the following are
+  // the recommended values.
+  // 1: Fastest mode.
+  // 3: Default mode that provides good speedup without losing compression
+  // performance at speed 0.
+  // 33: Exhaustive rd search (33 == CFL_MAGS_SIZE). This mode should only
+  // be used for debugging purposes.
+  int cfl_search_range;
+
+  // TOP_INTRA_MODEL_COUNT (4) is the number of top model rd costs stored in
+  // intra mode decision. This speed feature reduces that number for higher
+  // speeds.
+  int top_intra_model_count_allowed;
+
+  // Adapt top_intra_model_count_allowed locally to prune luma intra modes
+  // using neighbor block and quantizer information.
+  int adapt_top_model_rd_count_using_neighbors;
+
+  // Prune the evaluation of odd delta angles of directional luma intra modes
+  // by using the rdcosts of neighbouring delta angles.
+  // For allintra encode, this speed feature reduces instruction count
+  // by 4.461%, 3.699% and 3.536% for speed 6, 7 and 8 on a typical video
+  // dataset with coding performance change less than 0.26%. For AVIF image
+  // encode, this speed feature reduces encode time by 2.849%, 2.471%,
+  // and 2.051% for speed 6, 7 and 8 on a typical image dataset with coding
+  // performance change less than 0.27%.
+  int prune_luma_odd_delta_angles_in_intra;
+
+  // Terminate early in the chroma palette_size search.
+  // 0: No early termination
+  // 1: Terminate early for higher palette_size, if the header rd cost of a
+  // lower palette_size is more than best_rd.
+  // For allintra encode, this sf reduces instruction count by 0.45%,
+  // 0.62%, 1.73%, 2.50%, 2.89%, 3.09% and 3.86% for speed 0 to 6 on screen
+  // content set with coding performance change less than 0.01%.
+  // For AVIF image encode, this sf reduces instruction count by 0.45%, 0.81%,
+  // 0.85%, 1.05%, 1.45%, 1.66% and 1.95% for speed 0 to 6 on a typical image
+  // dataset with no quality drop.
+  int early_term_chroma_palette_size_search;
+
+  // Skips the evaluation of filter intra modes in inter frames if the rd
+  // evaluation of the luma intra dc mode results in invalid rd stats.
+  int skip_filter_intra_in_inter_frames;
+} INTRA_MODE_SPEED_FEATURES;
+
+typedef struct TX_SPEED_FEATURES {
+  // Init search depth for square and rectangular transform partitions.
+  // Values:
+  // 0 - search full tree, 1: search 1 level, 2: search the highest level only
+  int inter_tx_size_search_init_depth_sqr;
+  int inter_tx_size_search_init_depth_rect;
+  int intra_tx_size_search_init_depth_sqr;
+  int intra_tx_size_search_init_depth_rect;
+
+  // If any dimension of a coding block is above 64, always search the
+  // largest transform only, since the largest transform block size is 64x64.
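+  // E.g., a 128x128 or 128x64 coding block is searched directly with 64x64
+  // transforms, without evaluating smaller transform partitions.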
+  int tx_size_search_lgr_block;
+
+  TX_TYPE_SEARCH tx_type_search;
+
+  // Skip split transform block partition when the collocated bigger block
+  // is selected as all zero coefficients.
+  int txb_split_cap;
+
+  // Shortcut the transform block partition and type search when the target
+  // rdcost is relatively low.
+  // Values are 0 (not used), or 1 - 2 with progressively increasing
+  // aggressiveness
+  int adaptive_txb_search_level;
+
+  // Prune level for tx_size_type search for inter based on rd model
+  // 0: no pruning
+  // 1-2: progressively increasing aggressiveness of pruning
+  int model_based_prune_tx_search_level;
+
+  // Refine TX type after fast TX search.
+  int refine_fast_tx_search_results;
+
+  // Prune transform split/no_split eval based on residual properties. A value
+  // of 0 indicates no pruning, and the aggressiveness of pruning progressively
+  // increases from levels 1 to 3.
+  int prune_tx_size_level;
+
+  // Prune the evaluation of transform depths as decided by the NN model.
+  // false: No pruning.
+  // true : Avoid the evaluation of specific transform depths using NN model.
+  //
+  // For allintra encode, this speed feature reduces instruction count
+  // by 4.76%, 8.92% and 11.28% for speed 6, 7 and 8 with coding performance
+  // change less than 0.32%. For AVIF image encode, this speed feature reduces
+  // encode time by 4.65%, 9.16% and 10.45% for speed 6, 7 and 8 on a typical
+  // image dataset with coding performance change less than 0.19%.
+  bool prune_intra_tx_depths_using_nn;
+
+  // Enable/disable early breakout during transform search of intra modes, by
+  // using the minimum rd cost possible. By using this approach, the rd
+  // evaluation of applicable transform blocks (in the current block) can be
+  // avoided as
+  // 1) best_rd evolves during the search in choose_tx_size_type_from_rd()
+  // 2) appropriate ref_best_rd is passed in intra_block_yrd()
+  //
+  // For allintra encode, this speed feature reduces instruction count
+  // by 1.11%, 1.08%, 1.02% and 0.93% for speed 3, 6, 7 and 8 with coding
+  // performance change less than 0.02%. For AVIF image encode, this speed
+  // feature reduces encode time by 0.93%, 1.46%, 1.07%, 0.84%, 0.99% and 0.73%
+  // for speed 3, 4, 5, 6, 7 and 8 on a typical image dataset with coding
+  // performance change less than 0.004%.
+  bool use_rd_based_breakout_for_intra_tx_search;
+} TX_SPEED_FEATURES;
+
+typedef struct RD_CALC_SPEED_FEATURES {
+  // Fast approximation of av1_model_rd_from_var_lapndz
+  int simple_model_rd_from_var;
+
+  // Perform faster distortion computation during the R-D evaluation by trying
+  // to approximate the prediction error with transform coefficients (faster
+  // but less accurate) rather than computing distortion in the pixel domain
+  // (slower but more accurate). The following methods are used for distortion
+  // computation:
+  // Method 0: Always compute distortion in the pixel domain
+  // Method 1: Based on block error, try using transform domain distortion for
+  // tx_type search and compute distortion in pixel domain for final RD_STATS
+  // Method 2: Based on block error, try to compute distortion in transform
+  // domain
+  // Methods 1 and 2 may fall back to computing distortion in the pixel domain
+  // in case the block error is less than the threshold, which is controlled
+  // by the speed feature tx_domain_dist_thres_level.
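+  //
+  // As a rough sketch (illustrative only, not the exact libaom logic; the
+  // names below are placeholders), the per-block fallback for Methods 1 and 2
+  // behaves like:
+  //   if (block_error < tx_domain_dist_threshold)
+  //     distortion = pixel_domain_dist(...);  // more accurate fallback
+  //   else
+  //     distortion = tx_domain_dist(...);     // faster approximation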
+  //
+  // The speed feature tx_domain_dist_level decides which of the above methods
+  // needs to be used across different mode evaluation stages as described
+  // below:
+  // Eval type:   Default     Mode      Winner
+  // Level 0  :   Method 0    Method 2  Method 0
+  // Level 1  :   Method 1    Method 2  Method 0
+  // Level 2  :   Method 2    Method 2  Method 0
+  // Level 3  :   Method 2    Method 2  Method 2
+  int tx_domain_dist_level;
+
+  // Transform domain distortion threshold level
+  int tx_domain_dist_thres_level;
+
+  // Trellis (dynamic programming) optimization of quantized values
+  TRELLIS_OPT_TYPE optimize_coefficients;
+
+  // Use hash table to store macroblock RD search results
+  // to avoid repeated search on the same residue signal.
+  int use_mb_rd_hash;
+
+  // Flag used to control the extent of coeff R-D optimization
+  int perform_coeff_opt;
+} RD_CALC_SPEED_FEATURES;
+
+typedef struct WINNER_MODE_SPEED_FEATURES {
+  // Flag used to control the winner mode processing for better R-D
+  // optimization of quantized coeffs
+  int enable_winner_mode_for_coeff_opt;
+
+  // Flag used to control the winner mode processing for the transform size
+  // search method
+  int enable_winner_mode_for_tx_size_srch;
+
+  // Control transform size search level
+  // Eval type:   Default       Mode         Winner
+  // Level 0  :   FULL RD       LARGEST ALL  FULL RD
+  // Level 1  :   FAST RD       LARGEST ALL  FULL RD
+  // Level 2  :   LARGEST ALL   LARGEST ALL  FULL RD
+  // Level 3  :   LARGEST ALL   LARGEST ALL  LARGEST ALL
+  int tx_size_search_level;
+
+  // Flag used to control the winner mode processing for the use of transform
+  // domain distortion
+  int enable_winner_mode_for_use_tx_domain_dist;
+
+  // Flag used to enable processing of multiple winner modes
+  MULTI_WINNER_MODE_TYPE multi_winner_mode_type;
+
+  // Motion mode for winner candidates:
+  // 0: speed feature OFF
+  // 1 / 2 : Use configured number of winner candidates
+  int motion_mode_for_winner_cand;
+
+  // Controls the prediction of transform skip block or DC only block.
+  //
+  // Different speed feature values (0 to 3) decide the aggressiveness of
+  // prediction (refer to predict_dc_levels[][] in speed_features.c) to be used
+  // during different mode evaluation stages.
+  int dc_blk_pred_level;
+
+  // If on, disables interpolation filter search in handle_inter_mode loop, and
+  // performs it during winner mode processing by \ref
+  // tx_search_best_inter_candidates.
+  int winner_mode_ifs;
+
+  // Controls the disabling of winner mode processing. Speed feature levels
+  // are ordered in increasing aggressiveness of pruning. The method considered
+  // for disabling depends on the sf level value, as described below.
+  // 0: Do not disable
+  // 1: Disable for blocks with low source variance.
+  // 2: Disable for blocks which turn out to be transform skip (skipped based
+  // on eob) during MODE_EVAL stage except NEWMV mode.
+  // 3: Disable for blocks which turn out to be transform skip during MODE_EVAL
+  // stage except NEWMV mode. For high quantizers, prune conservatively based
+  // on transform skip (skipped based on eob) except for NEWMV mode.
+  // 4: Disable for blocks which turn out to be transform skip during MODE_EVAL
+  // stage.
+  int prune_winner_mode_eval_level;
+} WINNER_MODE_SPEED_FEATURES;
+
+typedef struct LOOP_FILTER_SPEED_FEATURES {
+  // This feature controls how the loop filter level is determined.
+  LPF_PICK_METHOD lpf_pick;
+
+  // Skip some final iterations in the determination of the best loop filter
+  // level.
+  int use_coarse_filter_level_search;
+
+  // Control how the CDEF strength is determined.
+  CDEF_PICK_METHOD cdef_pick_method;
+
+  // Decoder side speed feature to add penalty for use of dual-sgr filters.
+  // Takes values 0 - 10, 0 indicating no penalty and each additional level
+  // adding a penalty of 1%
+  int dual_sgr_penalty_level;
+
+  // Prune sgr ep using a binary-search-like mechanism
+  int enable_sgr_ep_pruning;
+
+  // Disable loop restoration for the chroma planes
+  int disable_loop_restoration_chroma;
+
+  // Disable loop restoration for the luma plane
+  int disable_loop_restoration_luma;
+
+  // Range of loop restoration unit sizes to search
+  // The minimum size is clamped against the superblock size in
+  // av1_pick_filter_restoration, so that the code which sets this value does
+  // not need to know the superblock size ahead of time.
+  int min_lr_unit_size;
+  int max_lr_unit_size;
+
+  // Prune RESTORE_WIENER evaluation based on source variance
+  // 0 : no pruning
+  // 1 : conservative pruning
+  // 2 : aggressive pruning
+  int prune_wiener_based_on_src_var;
+
+  // Prune self-guided loop restoration based on wiener search results
+  // 0 : no pruning
+  // 1 : pruning based on rdcost ratio of RESTORE_WIENER and RESTORE_NONE
+  // 2 : pruning based on winner restoration type among RESTORE_WIENER and
+  // RESTORE_NONE
+  int prune_sgr_based_on_wiener;
+
+  // Reduce the wiener filter window size for luma
+  int reduce_wiener_window_size;
+
+  // Flag to disable the Wiener loop restoration filter.
+  bool disable_wiener_filter;
+
+  // Flag to disable the self-guided loop restoration filter.
+  bool disable_sgr_filter;
+
+  // Disable the refinement search around the wiener filter coefficients.
+  bool disable_wiener_coeff_refine_search;
+
+  // Whether to downsample the rows in computation of wiener stats.
+  int use_downsampled_wiener_stats;
+} LOOP_FILTER_SPEED_FEATURES;
+
+typedef struct REAL_TIME_SPEED_FEATURES {
+  // Check intra prediction for non-RD mode.
+  int check_intra_pred_nonrd;
+
+  // Skip checking intra prediction.
+  // 0 - don't skip
+  // 1 - skip if TX is skipped and best mode is not NEWMV
+  // 2 - skip if TX is skipped
+  // Skipping aggressiveness increases from level 1 to 2.
+  int skip_intra_pred;
+
+  // Estimate motion before calculating variance in variance-based partition
+  // 0 - Only use zero MV
+  // 1 - perform coarse ME
+  // 2 - perform coarse ME, and also use neighbours' MVs
+  // 3 - use neighbours' MVs without performing coarse ME
+  int estimate_motion_for_var_based_partition;
+
+  // For nonrd_use_partition: mode of extra check of leaf partition
+  // 0 - don't check merge
+  // 1 - always check merge
+  // 2 - check merge and prune checking final split
+  // 3 - check merge and prune checking final split based on bsize and qindex
+  int nonrd_check_partition_merge_mode;
+
+  // For nonrd_use_partition: check of leaf partition extra split
+  int nonrd_check_partition_split;
+
+  // Implements various heuristics to skip searching modes
+  // The heuristics selected are based on flags
+  // defined in the MODE_SEARCH_SKIP_HEURISTICS enum
+  unsigned int mode_search_skip_flags;
+
+  // For nonrd: Reduces ref frame search.
+  // 0 - low level of search prune in non last frames
+  // 1 - pruned search in non last frames
+  // 2 - more pruned search in non last frames
+  int nonrd_prune_ref_frame_search;
+
+  // This flag controls the use of non-RD mode decision.
+  int use_nonrd_pick_mode;
+
+  // Use ALTREF frame in non-RD mode decision.
+  int use_nonrd_altref_frame;
+
+  // Use compound reference for non-RD mode.
+  int use_comp_ref_nonrd;
+
+  // Reference frames for compound prediction for nonrd pickmode:
+  // LAST_GOLDEN (0), LAST_LAST2 (1), or LAST_ALTREF (2).
+  int ref_frame_comp_nonrd[3];
+
+  // Use reduced ref set for real-time mode
+  int use_real_time_ref_set;
+
+  // Skip a number of expensive mode evaluations for blocks with very low
+  // temporal variance.
+  int short_circuit_low_temp_var;
+
+  // Reuse inter prediction in fast non-rd mode.
+  int reuse_inter_pred_nonrd;
+
+  // Number of best inter modes to search transform. INT_MAX - search all.
+  int num_inter_modes_for_tx_search;
+
+  // Use interpolation filter search in non-RD mode decision.
+  int use_nonrd_filter_search;
+
+  // Use simplified RD model for interpolation search and intra
+  int use_simple_rd_model;
+
+  // For nonrd mode: use hybrid intra mode search for intra only frames based
+  // on block properties.
+  // 0 : use nonrd pick intra for all blocks
+  // 1 : use rd for bsize < 16x16, nonrd otherwise
+  // 2 : use rd for bsize < 16x16 and src var >= 101, nonrd otherwise
+  int hybrid_intra_pickmode;
+
+  // Compute variance/sse on source difference, prior to encoding superblock.
+  int source_metrics_sb_nonrd;
+
+  // Flag to indicate process for handling overshoot on slide/scene change,
+  // for real-time CBR mode.
+  OVERSHOOT_DETECTION_CBR overshoot_detection_cbr;
+
+  // Check for scene/content change detection on every frame before encoding.
+  int check_scene_detection;
+
+  // For nonrd mode: Prefer larger partition blocks in variance based
+  // partitioning. 0: disabled, 1-3: increasing aggressiveness
+  int prefer_large_partition_blocks;
+
+  // Uses results of temporal noise estimate
+  int use_temporal_noise_estimate;
+
+  // Parameter indicating initial search window to be used in full-pixel search
+  // for nonrd_pickmode. Range [0, MAX_MVSEARCH_STEPS - 1]. Lower value
+  // indicates larger window. If set to 0, step_param is set based on internal
+  // logic in set_mv_search_params().
+  int fullpel_search_step_param;
+
+  // Bit mask to enable or disable intra modes for each prediction block size
+  // separately, for nonrd_pickmode. Currently, the sf is not respected when
+  // 'force_intra_check' is true in 'av1_estimate_intra_mode()' function. Also,
+  // H and V pred modes allowed through this sf can be further pruned when
+  // 'prune_hv_pred_modes_using_src_sad' sf is true.
+  int intra_y_mode_bsize_mask_nrd[BLOCK_SIZES];
+
+  // Prune H and V intra prediction modes evaluation in inter frames.
+  // The sf does not have any impact:
+  // i.   when frame_source_sad is 1.1 times greater than avg_source_sad
+  // ii.  when cyclic_refresh_segment_id_boosted is enabled
+  // iii. when SB level source sad is greater than kMedSad
+  // iv.  when color sensitivity is nonzero for both the chroma channels
+  bool prune_hv_pred_modes_using_src_sad;
+
+  // Skips mode checks more aggressively in nonRD mode
+  int nonrd_aggressive_skip;
+
+  // Skip cdef on 64x64 blocks.
+  // 0: disabled
+  // 1: skip when NEWMV or INTRA is not picked or color sensitivity is off.
+  // When color sensitivity is on for a superblock, all 64x64 blocks within
+  // will not skip.
+  // 2: more aggressive mode where skip is done for all frames where
+  // rc->high_source_sad = 0 (non slide-changes), and color sensitivity off.
+  int skip_cdef_sb;
+
+  // Force selective cdf update.
+  int selective_cdf_update;
+
+  // Force only single reference (LAST) for prediction.
+  int force_only_last_ref;
+
+  // Forces larger partition blocks in variance based partitioning for intra
+  // frames
+  int force_large_partition_blocks_intra;
+
+  // Use fixed partition for superblocks based on source_sad.
+  // 0: disabled
+  // 1: enabled
+  int use_fast_fixed_part;
+
+  // Increase source_sad thresholds in nonrd pickmode.
+  int increase_source_sad_thresh;
+
+  // Skip evaluation of no split in tx size selection for merge partition
+  int skip_tx_no_split_var_based_partition;
+
+  // Intermediate termination of newMV mode evaluation based on the best mode
+  // sse so far
+  int skip_newmv_mode_based_on_sse;
+
+  // Define gf length multiplier.
+  // Level 0: use large multiplier, level 1: use medium multiplier.
+  int gf_length_lvl;
+
+  // Prune inter modes with golden frame as reference for NEARMV and NEWMV
+  // modes
+  int prune_inter_modes_with_golden_ref;
+
+  // Prune inter modes w.r.t golden or alt-ref frame based on sad
+  int prune_inter_modes_wrt_gf_arf_based_on_sad;
+
+  // Prune inter mode search in rd path based on current block's temporal
+  // variance wrt LAST reference.
+  int prune_inter_modes_using_temp_var;
+
+  // Reduce MV precision to halfpel for higher int MV value & frame-level
+  // motion
+  // 0: disabled
+  // 1-2: Reduce precision to halfpel, fullpel based on conservative
+  // thresholds, aggressiveness increases with increase in level
+  // 3: Reduce precision to halfpel using more aggressive thresholds
+  int reduce_mv_pel_precision_highmotion;
+
+  // Reduce MV precision for low complexity blocks
+  // 0: disabled
+  // 1: Reduce the mv resolution for zero mv if the variance is low
+  // 2: Switch to halfpel, fullpel based on low block spatial-temporal
+  // complexity.
+  int reduce_mv_pel_precision_lowcomplex;
+
+  // Prune intra mode evaluation in inter frames based on mv range.
+  BLOCK_SIZE prune_intra_mode_based_on_mv_range;
+
+  // The number of times to left shift the splitting thresholds in variance
+  // based partitioning. The minimum value should be 7 to avoid left shifting
+  // by a negative number.
+  int var_part_split_threshold_shift;
+
+  // Qindex based variance partition threshold index, which determines
+  // the aggressiveness of partition pruning
+  // 0: disabled for speeds 9,10
+  // 1,2: (rd-path) lowers qindex thresholds conditionally (for low SAD sb)
+  // 3,4: (non-rd path) uses pre-tuned qindex thresholds
+  int var_part_based_on_qidx;
+
+  // Enable GF refresh based on Q value.
+  int gf_refresh_based_on_qp;
+
+  // Temporal filtering.
+  // The value can be 1 or 2, which indicates the threshold to use.
+  // Must be off for lossless mode.
+  int use_rtc_tf;
+
+  // Prune the use of the identity transform in nonrd_pickmode,
+  // used for screen content mode: only for smaller blocks
+  // and higher spatial variance, and when skip_txfm is not
+  // already set.
+  int prune_idtx_nonrd;
+
+  // Prune the use of palette mode in nonrd pickmode.
+  int prune_palette_nonrd;
+
+  // Force to only use dct for palette search in nonrd pickmode.
+  int dct_only_palette_nonrd;
+
+  // Skip loopfilter, for static content after slide change
+  // or key frame, once quality has ramped up.
+  // 0: disabled
+  // 1: skip only after quality is ramped up.
+  // 2: aggressive mode, where skip is done for all frames where
+  // rc->high_source_sad = 0 (no slide-changes).
+  int skip_lf_screen;
+
+  // For nonrd: early exit out of variance partition that sets the
+  // block size to superblock size, and sets mode to zeromv-last skip.
+  // 0: disabled
+  // 1: zeromv-skip is enabled at SB level only
+  // 2: zeromv-skip is enabled at SB level and coding block level
+  int part_early_exit_zeromv;
+
+  // Early terminate inter mode search based on sse in non-rd path.
+  INTER_SEARCH_EARLY_TERM_IDX sse_early_term_inter_search;
+
+  // SAD based adaptive altref selection
+  int sad_based_adp_altref_lag;
+
+  // Enable/disable partition direct merging.
+  int partition_direct_merging;
+
+  // Level of aggressiveness for obtaining tx size based on qstep
+  int tx_size_level_based_on_qstep;
+
+  // Avoid the partitioning of a 16x16 block in variance based partitioning
+  // (VBP) by making use of minimum and maximum sub-block variances.
+  // For allintra encode, this speed feature reduces instruction count by 5.39%
+  // for speed 9 on a typical video dataset with coding performance gain
+  // of 1.44%.
+  // For AVIF image encode, this speed feature reduces encode time
+  // by 8.44% for speed 9 on a typical image dataset with coding performance
+  // gain of 0.78%.
+  bool vbp_prune_16x16_split_using_min_max_sub_blk_var;
+
+  // A qindex threshold that determines whether to use qindex based CDEF filter
+  // strength estimation for screen content types. The strength estimation
+  // model used for screen contents prefers to allow cdef filtering for more
+  // frames. This sf is used to limit the frames which go through cdef
+  // filtering, and the following explains the setting of the same.
+  // MAXQ (255): This disables the usage of this sf. Here, the frame does not
+  // use a screen content model, thus reducing the number of frames that go
+  // through cdef filtering.
+  // MINQ (0): Frames always use the screen content model, thus increasing the
+  // number of frames that go through cdef filtering.
+  // This speed feature has a substantial gain on coding metrics, with a
+  // moderate increase in encoding time. Select the threshold based on the
+  // speed vs quality trade-off.
+  int screen_content_cdef_filter_qindex_thresh;
+
+  // Prune compound mode if its variance is higher than the variance of single
+  // modes.
+  bool prune_compoundmode_with_singlecompound_var;
+
+  // Allow mode cost update at frame level every couple of frames. This
+  // overrides the command line setting --mode-cost-upd-freq=3 (never update
+  // except on key frame and first delta).
+  bool frame_level_mode_cost_update;
+
+  // Prune H_PRED during intra mode evaluation in the nonrd path based on the
+  // best mode so far.
+  //
+  // For allintra encode, this speed feature reduces instruction count by 1.10%
+  // for speed 9 with coding performance change less than 0.04%.
+  // For AVIF image encode, this speed feature reduces encode time by 1.03% for
+  // speed 9 on a typical image dataset with coding performance change less
+  // than 0.08%.
+  bool prune_h_pred_using_best_mode_so_far;
+
+  // Enable pruning of intra mode evaluations in nonrd path based on source
+  // variance and best mode so far. The pruning logic is enabled only if the
+  // mode is not a winner mode of both the neighboring blocks (left/top).
+  //
+  // For allintra encode, this speed feature reduces instruction count by 3.96%
+  // for speed 9 with coding performance change less than 0.38%.
+  // For AVIF image encode, this speed feature reduces encode time by 3.46% for
+  // speed 9 on a typical image dataset with coding performance change less
+  // than -0.06%.
+  bool enable_intra_mode_pruning_using_neighbors;
+
+  // Prune intra mode evaluations in nonrd path based on the best sad so far.
+  //
+  // For allintra encode, this speed feature reduces instruction count by 3.05%
+  // for speed 9 with coding performance change less than 0.24%.
+  // For AVIF image encode, this speed feature reduces encode time by 1.87% for
+  // speed 9 on a typical image dataset with coding performance change less
+  // than 0.16%.
+  bool prune_intra_mode_using_best_sad_so_far;
+
+  // If compound is enabled, and the current block size is >= BLOCK_16X16,
+  // limit the compound modes to GLOBAL_GLOBALMV. This does not apply to the
+  // base layer of svc.
+  bool check_only_zero_zeromv_on_large_blocks;
+
+  // Allow for disabling cdf update for non reference frames in svc mode.
+  bool disable_cdf_update_non_reference_frame;
+
+  // Prune compound modes if the single modes variances do not perform well.
+  bool prune_compoundmode_with_singlemode_var;
+
+  // Skip searching all compound modes if the variance of the single_mode
+  // residue is sufficiently low.
+  bool skip_compound_based_on_var;
+
+  // Sets force_zeromv_skip based on the source sad available. Aggressiveness
+  // increases with the level set for the speed feature.
+  // 0: No setting
+  // 1: If source sad is kZeroSad
+  // 2: If source sad <= kVeryLowSad
+  int set_zeromv_skip_based_on_source_sad;
+
+  // Downgrades the block-level subpel motion search to
+  // av1_find_best_sub_pixel_tree_pruned_more for higher QP and when the
+  // fullpel search performed well, zeromv has low sad, or source_var is low
+  bool use_adaptive_subpel_search;
+
+  // A flag used in RTC case to control frame_refs_short_signaling. Note that
+  // the final decision is made in check_frame_refs_short_signaling(). The flag
+  // can only be turned on when res < 360p and speed >= 9, in which case only
+  // LAST and GOLDEN ref frames are used.
+  bool enable_ref_short_signaling;
+
+  // A flag that controls whether we check or bypass GLOBALMV in the rtc
+  // single ref frame case.
+  bool check_globalmv_on_single_ref;
+
+  // Allows for increasing the color_threshold for palette prediction.
+  // This generally leads to better coding efficiency but with some speed loss.
+  // Only used for screen content and for nonrd_pickmode.
+  bool increase_color_thresh_palette;
+} REAL_TIME_SPEED_FEATURES;
+
+/*!\endcond */
+
+/*!
+ * \brief Top level speed vs quality trade off data structure.
+ */
+typedef struct SPEED_FEATURES {
+  /*!
+   * Sequence/frame level speed features:
+   */
+  HIGH_LEVEL_SPEED_FEATURES hl_sf;
+
+  /*!
+   * Speed features for the first pass.
+   */
+  FIRST_PASS_SPEED_FEATURES fp_sf;
+
+  /*!
+   * Speed features related to how tpl's searches are done.
+   */
+  TPL_SPEED_FEATURES tpl_sf;
+
+  /*!
+   * Global motion speed features:
+   */
+  GLOBAL_MOTION_SPEED_FEATURES gm_sf;
+
+  /*!
+   * Partition search speed features:
+   */
+  PARTITION_SPEED_FEATURES part_sf;
+
+  /*!
+   * Motion search speed features:
+   */
+  MV_SPEED_FEATURES mv_sf;
+
+  /*!
+   * Inter mode search speed features:
+   */
+  INTER_MODE_SPEED_FEATURES inter_sf;
+
+  /*!
+   * Interpolation filter search speed features:
+   */
+  INTERP_FILTER_SPEED_FEATURES interp_sf;
+
+  /*!
+   * Intra mode search speed features:
+   */
+  INTRA_MODE_SPEED_FEATURES intra_sf;
+
+  /*!
+   * Transform size/type search speed features:
+   */
+  TX_SPEED_FEATURES tx_sf;
+
+  /*!
+   * RD calculation speed features:
+   */
+  RD_CALC_SPEED_FEATURES rd_sf;
+
+  /*!
+   * Two-pass mode evaluation features:
+   */
+  WINNER_MODE_SPEED_FEATURES winner_mode_sf;
+
+  /*!
+   * In-loop filter speed features:
+   */
+  LOOP_FILTER_SPEED_FEATURES lpf_sf;
+
+  /*!
+   * Real-time mode speed features:
+   */
+  REAL_TIME_SPEED_FEATURES rt_sf;
+} SPEED_FEATURES;
+/*!\cond */
+
+struct AV1_COMP;
+
+/*!\endcond */
+/*!\brief Frame size independent speed vs quality trade off flags
+ *
+ *\ingroup speed_features
+ *
+ * \param[in]    cpi     Top-level encoder instance structure
+ * \param[in]    speed   Speed setting passed in from the command line
+ *
+ * \remark No return value but configures the various speed trade off flags
+ *         based on the passed in speed setting. (Higher speed gives lower
+ *         quality)
+ */
+void av1_set_speed_features_framesize_independent(struct AV1_COMP *cpi,
+                                                  int speed);
+
+/*!\brief Frame size dependent speed vs quality trade off flags
+ *
+ *\ingroup speed_features
+ *
+ * \param[in]    cpi     Top-level encoder instance structure
+ * \param[in]    speed   Speed setting passed in from the command line
+ *
+ * \remark No return value but configures the various speed trade off flags
+ *         based on the passed in speed setting and frame size. (Higher speed
+ *         corresponds to lower quality)
+ */
+void av1_set_speed_features_framesize_dependent(struct AV1_COMP *cpi,
+                                                int speed);
+/*!\brief Q index dependent speed vs quality trade off flags
+ *
+ *\ingroup speed_features
+ *
+ * \param[in]    cpi     Top-level encoder instance structure
+ * \param[in]    speed   Speed setting passed in from the command line
+ *
+ * \remark No return value but configures the various speed trade off flags
+ *         based on the passed in speed setting and current frame's Q index.
+ *         (Higher speed corresponds to lower quality)
+ */
+void av1_set_speed_features_qindex_dependent(struct AV1_COMP *cpi, int speed);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_SPEED_FEATURES_H_
diff --git a/third_party/aom/av1/encoder/superres_scale.c b/third_party/aom/av1/encoder/superres_scale.c
new file mode 100644
index 0000000000..3b47909b15
--- /dev/null
+++ b/third_party/aom/av1/encoder/superres_scale.c
@@ -0,0 +1,423 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/encoder_alloc.h"
+#include "av1/encoder/superres_scale.h"
+#include "av1/encoder/random.h"
+
+// Compute the horizontal frequency components' energy in a frame
+// by calculating the 16x4 horizontal DCT. This is to be used to
+// decide the superresolution parameters.
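+// On return, energy[k] is the average energy in horizontal frequency bin k
+// over all 16x4 tiles, accumulated into a cumulative sum so that energy[k]
+// covers every bin at or above k (see the "Convert to cumulative energy"
+// loop at the end of the function).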
+static void analyze_hor_freq(const AV1_COMP *cpi, double *energy) { + uint64_t freq_energy[16] = { 0 }; + const YV12_BUFFER_CONFIG *buf = cpi->source; + const int bd = cpi->td.mb.e_mbd.bd; + const int width = buf->y_crop_width; + const int height = buf->y_crop_height; + DECLARE_ALIGNED(16, int32_t, coeff[16 * 4]); + int n = 0; + memset(freq_energy, 0, sizeof(freq_energy)); + if (buf->flags & YV12_FLAG_HIGHBITDEPTH) { + const int16_t *src16 = (const int16_t *)CONVERT_TO_SHORTPTR(buf->y_buffer); + for (int i = 0; i < height - 4; i += 4) { + for (int j = 0; j < width - 16; j += 16) { + av1_fwd_txfm2d_16x4(src16 + i * buf->y_stride + j, coeff, buf->y_stride, + H_DCT, bd); + for (int k = 1; k < 16; ++k) { + const uint64_t this_energy = + ((int64_t)coeff[k] * coeff[k]) + + ((int64_t)coeff[k + 16] * coeff[k + 16]) + + ((int64_t)coeff[k + 32] * coeff[k + 32]) + + ((int64_t)coeff[k + 48] * coeff[k + 48]); + freq_energy[k] += ROUND_POWER_OF_TWO(this_energy, 2 + 2 * (bd - 8)); + } + n++; + } + } + } else { + assert(bd == 8); + DECLARE_ALIGNED(16, int16_t, src16[16 * 4]); + for (int i = 0; i < height - 4; i += 4) { + for (int j = 0; j < width - 16; j += 16) { + for (int ii = 0; ii < 4; ++ii) + for (int jj = 0; jj < 16; ++jj) + src16[ii * 16 + jj] = + buf->y_buffer[(i + ii) * buf->y_stride + (j + jj)]; + av1_fwd_txfm2d_16x4(src16, coeff, 16, H_DCT, bd); + for (int k = 1; k < 16; ++k) { + const uint64_t this_energy = + ((int64_t)coeff[k] * coeff[k]) + + ((int64_t)coeff[k + 16] * coeff[k + 16]) + + ((int64_t)coeff[k + 32] * coeff[k + 32]) + + ((int64_t)coeff[k + 48] * coeff[k + 48]); + freq_energy[k] += ROUND_POWER_OF_TWO(this_energy, 2); + } + n++; + } + } + } + if (n) { + for (int k = 1; k < 16; ++k) energy[k] = (double)freq_energy[k] / n; + // Convert to cumulative energy + for (int k = 14; k > 0; --k) energy[k] += energy[k + 1]; + } else { + for (int k = 1; k < 16; ++k) energy[k] = 1e+20; + } +} + +static uint8_t calculate_next_resize_scale(const AV1_COMP *cpi) { + // Choose an arbitrary random number + static unsigned int seed = 56789; + const ResizeCfg *resize_cfg = &cpi->oxcf.resize_cfg; + if (is_stat_generation_stage(cpi)) return SCALE_NUMERATOR; + uint8_t new_denom = SCALE_NUMERATOR; + + if (cpi->common.seq_params->reduced_still_picture_hdr) return SCALE_NUMERATOR; + switch (resize_cfg->resize_mode) { + case RESIZE_NONE: new_denom = SCALE_NUMERATOR; break; + case RESIZE_FIXED: + if (cpi->common.current_frame.frame_type == KEY_FRAME) + new_denom = resize_cfg->resize_kf_scale_denominator; + else + new_denom = resize_cfg->resize_scale_denominator; + break; + case RESIZE_RANDOM: new_denom = lcg_rand16(&seed) % 9 + 8; break; + default: assert(0); + } + return new_denom; +} + +int av1_superres_in_recode_allowed(const AV1_COMP *const cpi) { + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + // Empirically found to not be beneficial for image coding. + return oxcf->superres_cfg.superres_mode == AOM_SUPERRES_AUTO && + cpi->sf.hl_sf.superres_auto_search_type != SUPERRES_AUTO_SOLO && + cpi->rc.frames_to_key > 1; +} + +#define SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME_SOLO 0.012 +#define SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME 0.008 +#define SUPERRES_ENERGY_BY_Q2_THRESH_ARFFRAME 0.008 +#define SUPERRES_ENERGY_BY_AC_THRESH 0.2 + +static double get_energy_by_q2_thresh(const GF_GROUP *gf_group, + const RATE_CONTROL *rc, + int gf_frame_index) { + // TODO(now): Return keyframe thresh * factor based on frame type / pyramid + // level. 
+ if (gf_group->update_type[gf_frame_index] == ARF_UPDATE) { + return SUPERRES_ENERGY_BY_Q2_THRESH_ARFFRAME; + } else if (gf_group->update_type[gf_frame_index] == KF_UPDATE) { + if (rc->frames_to_key <= 1) + return SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME_SOLO; + else + return SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME; + } else { + assert(0); + } + return 0; +} + +static uint8_t get_superres_denom_from_qindex_energy(int qindex, double *energy, + double threshq, + double threshp) { + const double q = av1_convert_qindex_to_q(qindex, AOM_BITS_8); + const double tq = threshq * q * q; + const double tp = threshp * energy[1]; + const double thresh = AOMMIN(tq, tp); + int k; + for (k = SCALE_NUMERATOR * 2; k > SCALE_NUMERATOR; --k) { + if (energy[k - 1] > thresh) break; + } + return 3 * SCALE_NUMERATOR - k; +} + +static uint8_t get_superres_denom_for_qindex(const AV1_COMP *cpi, int qindex, + int sr_kf, int sr_arf) { + // Use superres for Key-frames and Alt-ref frames only. + const GF_GROUP *gf_group = &cpi->ppi->gf_group; + if (gf_group->update_type[cpi->gf_frame_index] != KF_UPDATE && + gf_group->update_type[cpi->gf_frame_index] != ARF_UPDATE) { + return SCALE_NUMERATOR; + } + if (gf_group->update_type[cpi->gf_frame_index] == KF_UPDATE && !sr_kf) { + return SCALE_NUMERATOR; + } + if (gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE && !sr_arf) { + return SCALE_NUMERATOR; + } + + double energy[16]; + analyze_hor_freq(cpi, energy); + + const double energy_by_q2_thresh = + get_energy_by_q2_thresh(gf_group, &cpi->rc, cpi->gf_frame_index); + int denom = get_superres_denom_from_qindex_energy( + qindex, energy, energy_by_q2_thresh, SUPERRES_ENERGY_BY_AC_THRESH); + /* + printf("\nenergy = ["); + for (int k = 1; k < 16; ++k) printf("%f, ", energy[k]); + printf("]\n"); + printf("boost = %d\n", + (gf_group->update_type[cpi->gf_frame_index] == KF_UPDATE) + ? cpi->ppi->p_rc.kf_boost + : cpi->rc.gfu_boost); + printf("denom = %d\n", denom); + */ + if (av1_superres_in_recode_allowed(cpi)) { + assert(cpi->superres_mode != AOM_SUPERRES_NONE); + // Force superres to be tried in the recode loop, as full-res is also going + // to be tried anyway. + denom = AOMMAX(denom, SCALE_NUMERATOR + 1); + } + return denom; +} + +static uint8_t calculate_next_superres_scale(AV1_COMP *cpi) { + // Choose an arbitrary random number + static unsigned int seed = 34567; + const AV1EncoderConfig *oxcf = &cpi->oxcf; + const SuperResCfg *const superres_cfg = &oxcf->superres_cfg; + const FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg; + const RateControlCfg *const rc_cfg = &oxcf->rc_cfg; + + if (is_stat_generation_stage(cpi)) return SCALE_NUMERATOR; + uint8_t new_denom = SCALE_NUMERATOR; + + // Make sure that superres mode of the frame is consistent with the + // sequence-level flag. + assert(IMPLIES(superres_cfg->superres_mode != AOM_SUPERRES_NONE, + cpi->common.seq_params->enable_superres)); + assert(IMPLIES(!cpi->common.seq_params->enable_superres, + superres_cfg->superres_mode == AOM_SUPERRES_NONE)); + // Make sure that superres mode for current encoding is consistent with user + // provided superres mode. + assert(IMPLIES(superres_cfg->superres_mode != AOM_SUPERRES_AUTO, + cpi->superres_mode == superres_cfg->superres_mode)); + + // Note: we must look at the current superres_mode to be tried in 'cpi' here, + // not the user given mode in 'oxcf'. 
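+  // With SCALE_NUMERATOR == 8 (its value in libaom), the denominators chosen
+  // below lie in [8, 16]: 8 means no superres, 16 means the encoded width is
+  // half of the upscaled output width. E.g., AOM_SUPERRES_RANDOM draws
+  // lcg_rand16(&seed) % 9 + 8, which covers that same range.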
+ switch (cpi->superres_mode) { + case AOM_SUPERRES_NONE: new_denom = SCALE_NUMERATOR; break; + case AOM_SUPERRES_FIXED: + if (cpi->common.current_frame.frame_type == KEY_FRAME) + new_denom = superres_cfg->superres_kf_scale_denominator; + else + new_denom = superres_cfg->superres_scale_denominator; + break; + case AOM_SUPERRES_RANDOM: new_denom = lcg_rand16(&seed) % 9 + 8; break; + case AOM_SUPERRES_QTHRESH: { + // Do not use superres when screen content tools are used. + if (cpi->common.features.allow_screen_content_tools) break; + if (rc_cfg->mode == AOM_VBR || rc_cfg->mode == AOM_CQ) + av1_set_target_rate(cpi, frm_dim_cfg->width, frm_dim_cfg->height); + + // Now decide the use of superres based on 'q'. + int bottom_index, top_index; + const int q = av1_rc_pick_q_and_bounds( + cpi, frm_dim_cfg->width, frm_dim_cfg->height, cpi->gf_frame_index, + &bottom_index, &top_index); + + const int qthresh = (frame_is_intra_only(&cpi->common)) + ? superres_cfg->superres_kf_qthresh + : superres_cfg->superres_qthresh; + if (q <= qthresh) { + new_denom = SCALE_NUMERATOR; + } else { + new_denom = get_superres_denom_for_qindex(cpi, q, 1, 1); + } + break; + } + case AOM_SUPERRES_AUTO: { + if (cpi->common.features.allow_screen_content_tools) break; + if (rc_cfg->mode == AOM_VBR || rc_cfg->mode == AOM_CQ) + av1_set_target_rate(cpi, frm_dim_cfg->width, frm_dim_cfg->height); + + // Now decide the use of superres based on 'q'. + int bottom_index, top_index; + const int q = av1_rc_pick_q_and_bounds( + cpi, frm_dim_cfg->width, frm_dim_cfg->height, cpi->gf_frame_index, + &bottom_index, &top_index); + + const SUPERRES_AUTO_SEARCH_TYPE sr_search_type = + cpi->sf.hl_sf.superres_auto_search_type; + const int qthresh = (sr_search_type == SUPERRES_AUTO_SOLO) ? 128 : 0; + if (q <= qthresh) { + new_denom = SCALE_NUMERATOR; // Don't use superres. + } else { + if (sr_search_type == SUPERRES_AUTO_ALL) { + if (cpi->common.current_frame.frame_type == KEY_FRAME) + new_denom = superres_cfg->superres_kf_scale_denominator; + else + new_denom = superres_cfg->superres_scale_denominator; + } else { + new_denom = get_superres_denom_for_qindex(cpi, q, 1, 1); + } + } + break; + } + default: assert(0); + } + return new_denom; +} + +static int dimension_is_ok(int orig_dim, int resized_dim, int denom) { + return (resized_dim * SCALE_NUMERATOR >= orig_dim * denom / 2); +} + +static int dimensions_are_ok(int owidth, int oheight, size_params_type *rsz) { + // Only need to check the width, as scaling is horizontal only. + (void)oheight; + return dimension_is_ok(owidth, rsz->resize_width, rsz->superres_denom); +} + +static int validate_size_scales(RESIZE_MODE resize_mode, + aom_superres_mode superres_mode, int owidth, + int oheight, size_params_type *rsz) { + if (dimensions_are_ok(owidth, oheight, rsz)) { // Nothing to do. + return 1; + } + + // Calculate current resize scale. + int resize_denom = + AOMMAX(DIVIDE_AND_ROUND(owidth * SCALE_NUMERATOR, rsz->resize_width), + DIVIDE_AND_ROUND(oheight * SCALE_NUMERATOR, rsz->resize_height)); + + if (resize_mode != RESIZE_RANDOM && superres_mode == AOM_SUPERRES_RANDOM) { + // Alter superres scale as needed to enforce conformity. + rsz->superres_denom = + (2 * SCALE_NUMERATOR * SCALE_NUMERATOR) / resize_denom; + if (!dimensions_are_ok(owidth, oheight, rsz)) { + if (rsz->superres_denom > SCALE_NUMERATOR) --rsz->superres_denom; + } + } else if (resize_mode == RESIZE_RANDOM && + superres_mode != AOM_SUPERRES_RANDOM) { + // Alter resize scale as needed to enforce conformity. 
+ resize_denom = + (2 * SCALE_NUMERATOR * SCALE_NUMERATOR) / rsz->superres_denom; + rsz->resize_width = owidth; + rsz->resize_height = oheight; + av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height, + resize_denom); + if (!dimensions_are_ok(owidth, oheight, rsz)) { + if (resize_denom > SCALE_NUMERATOR) { + --resize_denom; + rsz->resize_width = owidth; + rsz->resize_height = oheight; + av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height, + resize_denom); + } + } + } else if (resize_mode == RESIZE_RANDOM && + superres_mode == AOM_SUPERRES_RANDOM) { + // Alter both resize and superres scales as needed to enforce conformity. + do { + if (resize_denom > rsz->superres_denom) + --resize_denom; + else + --rsz->superres_denom; + rsz->resize_width = owidth; + rsz->resize_height = oheight; + av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height, + resize_denom); + } while (!dimensions_are_ok(owidth, oheight, rsz) && + (resize_denom > SCALE_NUMERATOR || + rsz->superres_denom > SCALE_NUMERATOR)); + } else { // We are allowed to alter neither resize scale nor superres + // scale. + return 0; + } + return dimensions_are_ok(owidth, oheight, rsz); +} + +// Calculates resize and superres params for next frame +static size_params_type calculate_next_size_params(AV1_COMP *cpi) { + const AV1EncoderConfig *oxcf = &cpi->oxcf; + ResizePendingParams *resize_pending_params = &cpi->resize_pending_params; + const FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg; + size_params_type rsz = { frm_dim_cfg->width, frm_dim_cfg->height, + SCALE_NUMERATOR }; + int resize_denom = SCALE_NUMERATOR; + if (has_no_stats_stage(cpi) && cpi->ppi->use_svc && + (cpi->common.width != cpi->oxcf.frm_dim_cfg.width || + cpi->common.height != cpi->oxcf.frm_dim_cfg.height)) { + rsz.resize_width = cpi->common.width; + rsz.resize_height = cpi->common.height; + return rsz; + } + if (is_stat_generation_stage(cpi)) return rsz; + if (resize_pending_params->width && resize_pending_params->height) { + rsz.resize_width = resize_pending_params->width; + rsz.resize_height = resize_pending_params->height; + resize_pending_params->width = resize_pending_params->height = 0; + if (oxcf->superres_cfg.superres_mode == AOM_SUPERRES_NONE) return rsz; + } else { + resize_denom = calculate_next_resize_scale(cpi); + rsz.resize_width = frm_dim_cfg->width; + rsz.resize_height = frm_dim_cfg->height; + av1_calculate_scaled_size(&rsz.resize_width, &rsz.resize_height, + resize_denom); + } + rsz.superres_denom = calculate_next_superres_scale(cpi); + if (!validate_size_scales(oxcf->resize_cfg.resize_mode, cpi->superres_mode, + frm_dim_cfg->width, frm_dim_cfg->height, &rsz)) + assert(0 && "Invalid scale parameters"); + return rsz; +} + +static void setup_frame_size_from_params(AV1_COMP *cpi, + const size_params_type *rsz) { + int encode_width = rsz->resize_width; + int encode_height = rsz->resize_height; + + AV1_COMMON *cm = &cpi->common; + cm->superres_upscaled_width = encode_width; + cm->superres_upscaled_height = encode_height; + cm->superres_scale_denominator = rsz->superres_denom; + av1_calculate_scaled_superres_size(&encode_width, &encode_height, + rsz->superres_denom); + av1_set_frame_size(cpi, encode_width, encode_height); +} + +void av1_setup_frame_size(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + // Reset superres params from previous frame. 
+ cm->superres_scale_denominator = SCALE_NUMERATOR; + const size_params_type rsz = calculate_next_size_params(cpi); + setup_frame_size_from_params(cpi, &rsz); + + assert(av1_is_min_tile_width_satisfied(cm)); +} + +void av1_superres_post_encode(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + + assert(cpi->oxcf.superres_cfg.enable_superres); + assert(!is_lossless_requested(&cpi->oxcf.rc_cfg)); + assert(!cm->features.all_lossless); + + av1_superres_upscale(cm, NULL, cpi->image_pyramid_levels); + + // If regular resizing is occurring the source will need to be downscaled to + // match the upscaled superres resolution. Otherwise the original source is + // used. + if (!av1_resize_scaled(cm)) { + cpi->source = cpi->unscaled_source; + if (cpi->last_source != NULL) cpi->last_source = cpi->unscaled_last_source; + } else { + assert(cpi->unscaled_source->y_crop_width != cm->superres_upscaled_width); + assert(cpi->unscaled_source->y_crop_height != cm->superres_upscaled_height); + // Do downscale. cm->(width|height) has been updated by + // av1_superres_upscale + cpi->source = realloc_and_scale_source(cpi, cm->superres_upscaled_width, + cm->superres_upscaled_height); + } +} diff --git a/third_party/aom/av1/encoder/superres_scale.h b/third_party/aom/av1/encoder/superres_scale.h new file mode 100644 index 0000000000..450a4ed902 --- /dev/null +++ b/third_party/aom/av1/encoder/superres_scale.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_SUPERRES_SCALE_H_ +#define AOM_AV1_ENCODER_SUPERRES_SCALE_H_ + +#include "av1/encoder/encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +int av1_superres_in_recode_allowed(const AV1_COMP *const cpi); +void av1_superres_post_encode(AV1_COMP *cpi); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_SUPERRES_SCALE_H_ diff --git a/third_party/aom/av1/encoder/svc_layercontext.c b/third_party/aom/av1/encoder/svc_layercontext.c new file mode 100644 index 0000000000..2c99cb89b8 --- /dev/null +++ b/third_party/aom/av1/encoder/svc_layercontext.c @@ -0,0 +1,701 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encoder_alloc.h"
+
+static void swap_ptr(void *a, void *b) {
+  void **a_p = (void **)a;
+  void **b_p = (void **)b;
+  void *c = *a_p;
+  *a_p = *b_p;
+  *b_p = c;
+}
+
+void av1_init_layer_context(AV1_COMP *const cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  SVC *const svc = &cpi->svc;
+  int mi_rows = cpi->common.mi_params.mi_rows;
+  int mi_cols = cpi->common.mi_params.mi_cols;
+  svc->base_framerate = 30.0;
+  svc->current_superframe = 0;
+  svc->force_zero_mode_spatial_ref = 1;
+  svc->num_encoded_top_layer = 0;
+  svc->use_flexible_mode = 0;
+  svc->has_lower_quality_layer = 0;
+
+  for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
+    for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
+      int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+      LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+      RATE_CONTROL *const lrc = &lc->rc;
+      PRIMARY_RATE_CONTROL *const lp_rc = &lc->p_rc;
+      lrc->ni_av_qi = oxcf->rc_cfg.worst_allowed_q;
+      lp_rc->total_actual_bits = 0;
+      lrc->ni_tot_qi = 0;
+      lp_rc->tot_q = 0.0;
+      lp_rc->avg_q = 0.0;
+      lp_rc->ni_frames = 0;
+      lrc->decimation_count = 0;
+      lrc->decimation_factor = 0;
+      lrc->worst_quality = av1_quantizer_to_qindex(lc->max_q);
+      lrc->best_quality = av1_quantizer_to_qindex(lc->min_q);
+      lrc->rtc_external_ratectrl = 0;
+      for (int i = 0; i < RATE_FACTOR_LEVELS; ++i) {
+        lp_rc->rate_correction_factors[i] = 1.0;
+      }
+      lc->target_bandwidth = lc->layer_target_bitrate;
+      lp_rc->last_q[INTER_FRAME] = lrc->worst_quality;
+      lp_rc->avg_frame_qindex[INTER_FRAME] = lrc->worst_quality;
+      lp_rc->avg_frame_qindex[KEY_FRAME] = lrc->worst_quality;
+      lp_rc->buffer_level =
+          oxcf->rc_cfg.starting_buffer_level_ms * lc->target_bandwidth / 1000;
+      lp_rc->bits_off_target = lp_rc->buffer_level;
+      // Initialize the cyclic refresh parameters. If spatial layers are used
+      // (i.e., ss_number_layers > 1), these need to be updated per spatial
+      // layer. Cyclic refresh is only applied on base temporal layer.
+      if (svc->number_spatial_layers > 1 && tl == 0) {
+        lc->sb_index = 0;
+        lc->actual_num_seg1_blocks = 0;
+        lc->actual_num_seg2_blocks = 0;
+        lc->counter_encode_maxq_scene_change = 0;
+        aom_free(lc->map);
+        CHECK_MEM_ERROR(cm, lc->map,
+                        aom_calloc(mi_rows * mi_cols, sizeof(*lc->map)));
+      }
+    }
+    svc->downsample_filter_type[sl] = BILINEAR;
+    svc->downsample_filter_phase[sl] = 8;
+    svc->last_layer_dropped[sl] = false;
+    svc->drop_spatial_layer[sl] = false;
+  }
+  if (svc->number_spatial_layers == 3) {
+    svc->downsample_filter_type[0] = EIGHTTAP_SMOOTH;
+  }
+}
+
+bool av1_alloc_layer_context(AV1_COMP *cpi, int num_layers) {
+  SVC *const svc = &cpi->svc;
+  if (svc->layer_context == NULL || svc->num_allocated_layers < num_layers) {
+    assert(num_layers > 1);
+    aom_free(svc->layer_context);
+    svc->num_allocated_layers = 0;
+    svc->layer_context =
+        (LAYER_CONTEXT *)aom_calloc(num_layers, sizeof(*svc->layer_context));
+    if (svc->layer_context == NULL) return false;
+    svc->num_allocated_layers = num_layers;
+  }
+  return true;
+}
+
+// Update the layer context from a change_config() call.
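+// Per-layer buffer levels (starting, optimal, maximum) are scaled by
+// bitrate_alloc = layer target bandwidth / total target bandwidth, so each
+// layer's buffer model stays proportional to its share of the stream rate.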
+void av1_update_layer_context_change_config(AV1_COMP *const cpi, + const int64_t target_bandwidth) { + const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + AV1_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; + int layer = 0; + int64_t spatial_layer_target = 0; + float bitrate_alloc = 1.0; + const int mi_rows = cm->mi_params.mi_rows; + const int mi_cols = cm->mi_params.mi_cols; + for (int sl = 0; sl < svc->number_spatial_layers; ++sl) { + for (int tl = 0; tl < svc->number_temporal_layers; ++tl) { + layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + svc->layer_context[layer].target_bandwidth = lc->layer_target_bitrate; + } + spatial_layer_target = svc->layer_context[layer].target_bandwidth; + for (int tl = 0; tl < svc->number_temporal_layers; ++tl) { + LAYER_CONTEXT *const lc = + &svc->layer_context[sl * svc->number_temporal_layers + tl]; + RATE_CONTROL *const lrc = &lc->rc; + PRIMARY_RATE_CONTROL *const lp_rc = &lc->p_rc; + lc->spatial_layer_target_bandwidth = spatial_layer_target; + if (target_bandwidth != 0) { + bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; + } + lp_rc->starting_buffer_level = + (int64_t)(p_rc->starting_buffer_level * bitrate_alloc); + lp_rc->optimal_buffer_level = + (int64_t)(p_rc->optimal_buffer_level * bitrate_alloc); + lp_rc->maximum_buffer_size = + (int64_t)(p_rc->maximum_buffer_size * bitrate_alloc); + lp_rc->bits_off_target = + AOMMIN(lp_rc->bits_off_target, lp_rc->maximum_buffer_size); + lp_rc->buffer_level = + AOMMIN(lp_rc->buffer_level, lp_rc->maximum_buffer_size); + lc->framerate = cpi->framerate / lc->framerate_factor; + lrc->avg_frame_bandwidth = + (int)round(lc->target_bandwidth / lc->framerate); + lrc->max_frame_bandwidth = rc->max_frame_bandwidth; + lrc->rtc_external_ratectrl = rc->rtc_external_ratectrl; + lrc->worst_quality = av1_quantizer_to_qindex(lc->max_q); + lrc->best_quality = av1_quantizer_to_qindex(lc->min_q); + if (rc->use_external_qp_one_pass) { + lrc->worst_quality = rc->worst_quality; + lrc->best_quality = rc->best_quality; + } + // Reset the cyclic refresh parameters, if needed (map is NULL), + // or number of spatial layers has changed. + // Cyclic refresh is only applied on base temporal layer. + if (svc->number_spatial_layers > 1 && tl == 0 && + (lc->map == NULL || + svc->prev_number_spatial_layers != svc->number_spatial_layers)) { + lc->sb_index = 0; + lc->actual_num_seg1_blocks = 0; + lc->actual_num_seg2_blocks = 0; + lc->counter_encode_maxq_scene_change = 0; + aom_free(lc->map); + CHECK_MEM_ERROR(cm, lc->map, + aom_calloc(mi_rows * mi_cols, sizeof(*lc->map))); + } + } + } +} + +/*!\brief Return layer context for current layer. + * + * \ingroup rate_control + * \param[in] cpi Top level encoder structure + * + * \return LAYER_CONTEXT for current layer. 
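+ *
+ * Layers are laid out spatial-major: the context for ids (sl, tl) lives at
+ * index sl * number_temporal_layers + tl, so with e.g. 3 temporal layers,
+ * (sl = 1, tl = 2) maps to slot 5.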
+ */ +static LAYER_CONTEXT *get_layer_context(AV1_COMP *const cpi) { + return &cpi->svc.layer_context[cpi->svc.spatial_layer_id * + cpi->svc.number_temporal_layers + + cpi->svc.temporal_layer_id]; +} + +void av1_update_temporal_layer_framerate(AV1_COMP *const cpi) { + SVC *const svc = &cpi->svc; + LAYER_CONTEXT *const lc = get_layer_context(cpi); + RATE_CONTROL *const lrc = &lc->rc; + const int tl = svc->temporal_layer_id; + lc->framerate = cpi->framerate / lc->framerate_factor; + lrc->avg_frame_bandwidth = (int)round(lc->target_bandwidth / lc->framerate); + lrc->max_frame_bandwidth = cpi->rc.max_frame_bandwidth; + // Update the average layer frame size (non-cumulative per-frame-bw). + if (tl == 0) { + lc->avg_frame_size = lrc->avg_frame_bandwidth; + } else { + int prev_layer = svc->spatial_layer_id * svc->number_temporal_layers + + svc->temporal_layer_id - 1; + LAYER_CONTEXT *const lcprev = &svc->layer_context[prev_layer]; + const double prev_layer_framerate = + cpi->framerate / lcprev->framerate_factor; + const int64_t prev_layer_target_bandwidth = lcprev->layer_target_bitrate; + lc->avg_frame_size = + (int)round((lc->target_bandwidth - prev_layer_target_bandwidth) / + (lc->framerate - prev_layer_framerate)); + } +} + +static AOM_INLINE bool check_ref_is_low_spatial_res_super_frame( + int ref_frame, const SVC *svc, const RTC_REF *rtc_ref) { + int ref_frame_idx = rtc_ref->ref_idx[ref_frame - 1]; + return rtc_ref->buffer_time_index[ref_frame_idx] == svc->current_superframe && + rtc_ref->buffer_spatial_layer[ref_frame_idx] <= + svc->spatial_layer_id - 1; +} + +void av1_restore_layer_context(AV1_COMP *const cpi) { + SVC *const svc = &cpi->svc; + RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref; + const AV1_COMMON *const cm = &cpi->common; + LAYER_CONTEXT *const lc = get_layer_context(cpi); + const int old_frame_since_key = cpi->rc.frames_since_key; + const int old_frame_to_key = cpi->rc.frames_to_key; + const int max_consec_drop = cpi->rc.max_consec_drop; + // Restore layer rate control. + cpi->rc = lc->rc; + cpi->ppi->p_rc = lc->p_rc; + cpi->oxcf.rc_cfg.target_bandwidth = lc->target_bandwidth; + cpi->gf_frame_index = 0; + cpi->mv_search_params.max_mv_magnitude = lc->max_mv_magnitude; + if (cpi->mv_search_params.max_mv_magnitude == 0) + cpi->mv_search_params.max_mv_magnitude = AOMMAX(cm->width, cm->height); + // Reset the frames_since_key and frames_to_key counters to their values + // before the layer restore. Keep these defined for the stream (not layer). + cpi->rc.frames_since_key = old_frame_since_key; + cpi->rc.frames_to_key = old_frame_to_key; + // Reset to value before the layer restore. + cpi->rc.max_consec_drop = max_consec_drop; + // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers, + // for the base temporal layer. + if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && + svc->number_spatial_layers > 1 && svc->temporal_layer_id == 0) { + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + swap_ptr(&cr->map, &lc->map); + cr->sb_index = lc->sb_index; + cr->actual_num_seg1_blocks = lc->actual_num_seg1_blocks; + cr->actual_num_seg2_blocks = lc->actual_num_seg2_blocks; + cr->counter_encode_maxq_scene_change = lc->counter_encode_maxq_scene_change; + } + svc->skip_mvsearch_last = 0; + svc->skip_mvsearch_gf = 0; + svc->skip_mvsearch_altref = 0; + // For each reference (LAST/GOLDEN) set the skip_mvsearch_last/gf frame flags. 
+ // This is to skip searching mv for that reference if it was last + // refreshed (i.e., buffer slot holding that reference was refreshed) on the + // previous spatial layer(s) at the same time (current_superframe). + if (rtc_ref->set_ref_frame_config && svc->force_zero_mode_spatial_ref && + cpi->sf.rt_sf.use_nonrd_pick_mode) { + if (check_ref_is_low_spatial_res_super_frame(LAST_FRAME, svc, rtc_ref)) { + svc->skip_mvsearch_last = 1; + } + if (check_ref_is_low_spatial_res_super_frame(GOLDEN_FRAME, svc, rtc_ref)) { + svc->skip_mvsearch_gf = 1; + } + if (check_ref_is_low_spatial_res_super_frame(ALTREF_FRAME, svc, rtc_ref)) { + svc->skip_mvsearch_altref = 1; + } + } +} + +void av1_svc_update_buffer_slot_refreshed(AV1_COMP *const cpi) { + SVC *const svc = &cpi->svc; + RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref; + const unsigned int current_frame = + cpi->ppi->use_svc ? svc->current_superframe + : cpi->common.current_frame.frame_number; + // For any buffer slot that is refreshed, update it with + // the spatial_layer_id and the current_superframe. + if (cpi->common.current_frame.frame_type == KEY_FRAME) { + // All slots are refreshed on KEY. + for (unsigned int i = 0; i < REF_FRAMES; i++) { + rtc_ref->buffer_time_index[i] = current_frame; + rtc_ref->buffer_spatial_layer[i] = svc->spatial_layer_id; + } + } else if (rtc_ref->set_ref_frame_config) { + for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) { + const int ref_frame_map_idx = rtc_ref->ref_idx[i]; + if (cpi->ppi->rtc_ref.refresh[ref_frame_map_idx]) { + rtc_ref->buffer_time_index[ref_frame_map_idx] = current_frame; + rtc_ref->buffer_spatial_layer[ref_frame_map_idx] = + svc->spatial_layer_id; + } + } + } +} + +void av1_save_layer_context(AV1_COMP *const cpi) { + SVC *const svc = &cpi->svc; + const AV1_COMMON *const cm = &cpi->common; + LAYER_CONTEXT *lc = get_layer_context(cpi); + lc->rc = cpi->rc; + lc->p_rc = cpi->ppi->p_rc; + lc->target_bandwidth = (int)cpi->oxcf.rc_cfg.target_bandwidth; + lc->group_index = cpi->gf_frame_index; + lc->max_mv_magnitude = cpi->mv_search_params.max_mv_magnitude; + if (svc->spatial_layer_id == 0) svc->base_framerate = cpi->framerate; + // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers, + // for the base temporal layer. + if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && + cpi->svc.number_spatial_layers > 1 && svc->temporal_layer_id == 0) { + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + signed char *temp = lc->map; + lc->map = cr->map; + cr->map = temp; + lc->sb_index = cr->sb_index; + lc->actual_num_seg1_blocks = cr->actual_num_seg1_blocks; + lc->actual_num_seg2_blocks = cr->actual_num_seg2_blocks; + lc->counter_encode_maxq_scene_change = cr->counter_encode_maxq_scene_change; + } + av1_svc_update_buffer_slot_refreshed(cpi); + for (unsigned int i = 0; i < REF_FRAMES; i++) { + if (frame_is_intra_only(cm) || + cm->current_frame.refresh_frame_flags & (1 << i)) { + svc->spatial_layer_fb[i] = svc->spatial_layer_id; + svc->temporal_layer_fb[i] = svc->temporal_layer_id; + } + } + if (svc->spatial_layer_id == svc->number_spatial_layers - 1) { + svc->current_superframe++; + // Reset drop flag to false for next superframe. 
+ for (int sl = 0; sl < svc->number_spatial_layers; sl++) + svc->drop_spatial_layer[sl] = false; + } +} + +int av1_svc_primary_ref_frame(const AV1_COMP *const cpi) { + const SVC *const svc = &cpi->svc; + const AV1_COMMON *const cm = &cpi->common; + int fb_idx = -1; + int primary_ref_frame = PRIMARY_REF_NONE; + if (cpi->svc.number_spatial_layers > 1 || + cpi->svc.number_temporal_layers > 1) { + // Set the primary_ref_frame to LAST_FRAME if that buffer slot for LAST + // was last updated on a lower temporal layer (or base TL0) and for the + // same spatial layer. For RTC patterns this allows for continued decoding + // when set of enhancement layers are dropped (continued decoding starting + // at next base TL0), so error_resilience can be off/0 for all layers. + fb_idx = get_ref_frame_map_idx(cm, LAST_FRAME); + if (svc->spatial_layer_fb[fb_idx] == svc->spatial_layer_id && + (svc->temporal_layer_fb[fb_idx] < svc->temporal_layer_id || + svc->temporal_layer_fb[fb_idx] == 0)) { + primary_ref_frame = 0; // LAST_FRAME: ref_frame - LAST_FRAME + } + } else if (cpi->ppi->rtc_ref.set_ref_frame_config) { + const ExternalFlags *const ext_flags = &cpi->ext_flags; + int flags = ext_flags->ref_frame_flags; + if (flags & AOM_LAST_FLAG) { + primary_ref_frame = 0; // LAST_FRAME: ref_frame - LAST_FRAME + } else if (flags & AOM_GOLD_FLAG) { + primary_ref_frame = GOLDEN_FRAME - LAST_FRAME; + } else if (flags & AOM_ALT_FLAG) { + primary_ref_frame = ALTREF_FRAME - LAST_FRAME; + } + } + return primary_ref_frame; +} + +void av1_free_svc_cyclic_refresh(AV1_COMP *const cpi) { + SVC *const svc = &cpi->svc; + for (int sl = 0; sl < svc->number_spatial_layers; ++sl) { + for (int tl = 0; tl < svc->number_temporal_layers; ++tl) { + int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + aom_free(lc->map); + lc->map = NULL; + } + } +} + +void av1_svc_reset_temporal_layers(AV1_COMP *const cpi, int is_key) { + SVC *const svc = &cpi->svc; + LAYER_CONTEXT *lc = NULL; + for (int sl = 0; sl < svc->number_spatial_layers; ++sl) { + for (int tl = 0; tl < svc->number_temporal_layers; ++tl) { + lc = &cpi->svc.layer_context[sl * svc->number_temporal_layers + tl]; + if (is_key) lc->frames_from_key_frame = 0; + } + } + av1_update_temporal_layer_framerate(cpi); + av1_restore_layer_context(cpi); +} + +void av1_get_layer_resolution(const int width_org, const int height_org, + const int num, const int den, int *width_out, + int *height_out) { + int w, h; + if (width_out == NULL || height_out == NULL || den == 0) return; + if (den == 1 && num == 1) { + *width_out = width_org; + *height_out = height_org; + return; + } + w = width_org * num / den; + h = height_org * num / den; + // Make height and width even. + w += w % 2; + h += h % 2; + *width_out = w; + *height_out = h; +} + +void av1_one_pass_cbr_svc_start_layer(AV1_COMP *const cpi) { + SVC *const svc = &cpi->svc; + AV1_COMMON *const cm = &cpi->common; + LAYER_CONTEXT *lc = NULL; + int width = 0, height = 0; + lc = &svc->layer_context[svc->spatial_layer_id * svc->number_temporal_layers + + svc->temporal_layer_id]; + // Set the lower quality layer flag. 
+ svc->has_lower_quality_layer = 0;
+ if (cpi->svc.spatial_layer_id > 0) {
+ const LAYER_CONTEXT *lc_prev =
+ &svc->layer_context[(svc->spatial_layer_id - 1) *
+ svc->number_temporal_layers +
+ svc->temporal_layer_id];
+ if (lc_prev->scaling_factor_den == 1 && lc_prev->scaling_factor_num == 1)
+ svc->has_lower_quality_layer = 1;
+ }
+ av1_get_layer_resolution(cpi->oxcf.frm_dim_cfg.width,
+ cpi->oxcf.frm_dim_cfg.height, lc->scaling_factor_num,
+ lc->scaling_factor_den, &width, &height);
+ // Use EIGHTTAP_SMOOTH for low resolutions.
+ if (width * height <= 320 * 240)
+ svc->downsample_filter_type[svc->spatial_layer_id] = EIGHTTAP_SMOOTH;
+
+ cm->width = width;
+ cm->height = height;
+ alloc_mb_mode_info_buffers(cpi);
+ av1_update_frame_size(cpi);
+ if (svc->spatial_layer_id == svc->number_spatial_layers - 1) {
+ svc->mi_cols_full_resoln = cm->mi_params.mi_cols;
+ svc->mi_rows_full_resoln = cm->mi_params.mi_rows;
+ }
+}
+
+enum {
+ SVC_LAST_FRAME = 0,
+ SVC_LAST2_FRAME,
+ SVC_LAST3_FRAME,
+ SVC_GOLDEN_FRAME,
+ SVC_BWDREF_FRAME,
+ SVC_ALTREF2_FRAME,
+ SVC_ALTREF_FRAME
+};
+
+// For fixed SVC mode: the fixed reference pattern is set based on the number
+// of spatial and temporal layers, and on ksvc_fixed_mode.
+void av1_set_svc_fixed_mode(AV1_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+ RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+ int i;
+ assert(svc->use_flexible_mode == 0);
+ // Fixed SVC mode only supports at most 3 spatial or temporal layers.
+ assert(svc->number_spatial_layers >= 1 && svc->number_spatial_layers <= 3 &&
+ svc->number_temporal_layers >= 1 && svc->number_temporal_layers <= 3);
+ rtc_ref->set_ref_frame_config = 1;
+ int superframe_cnt = svc->current_superframe;
+ // Set the reference map buffer idx for the 7 references:
+ // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3),
+ // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = i;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->reference[i] = 0;
+ for (i = 0; i < REF_FRAMES; i++) rtc_ref->refresh[i] = 0;
+ // Always reference LAST, and reference GOLDEN on SL > 0.
+ // For KSVC: the GOLDEN reference will be removed on INTER_FRAMES later,
+ // when frame_type is set.
+ rtc_ref->reference[SVC_LAST_FRAME] = 1;
+ if (svc->spatial_layer_id > 0) rtc_ref->reference[SVC_GOLDEN_FRAME] = 1;
+ if (svc->temporal_layer_id == 0) {
+ // Base temporal layer.
+ if (svc->spatial_layer_id == 0) {
+ // Set all buffer_idx to 0. Update slot 0 (LAST).
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0;
+ rtc_ref->refresh[0] = 1;
+ } else if (svc->spatial_layer_id == 1) {
+ // Set buffer_idx for LAST to slot 1, GOLDEN (and all other refs) to
+ // slot 0. Update slot 1 (LAST).
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 1;
+ rtc_ref->refresh[1] = 1;
+ } else if (svc->spatial_layer_id == 2) {
+ // Set buffer_idx for LAST to slot 2, GOLDEN (and all other refs) to
+ // slot 1. Update slot 2 (LAST).
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 1;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 2;
+ rtc_ref->refresh[2] = 1;
+ }
+ } else if (svc->temporal_layer_id == 2 && (superframe_cnt - 1) % 4 == 0) {
+ // First top temporal enhancement layer.
+ if (svc->spatial_layer_id == 0) {
+ // Reference LAST (slot 0).
+ // Set GOLDEN to slot 3 and update slot 3.
+ // Set all other buffer_idx to slot 0.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0; + if (svc->spatial_layer_id < svc->number_spatial_layers - 1) { + rtc_ref->ref_idx[SVC_GOLDEN_FRAME] = 3; + rtc_ref->refresh[3] = 1; + } + } else if (svc->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1, + // GOLDEN (and all other refs) to slot 3. + // Set LAST2 to slot 4 and Update slot 4. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 3; + rtc_ref->ref_idx[SVC_LAST_FRAME] = 1; + if (svc->spatial_layer_id < svc->number_spatial_layers - 1) { + rtc_ref->ref_idx[SVC_LAST2_FRAME] = 4; + rtc_ref->refresh[4] = 1; + } + } else if (svc->spatial_layer_id == 2) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2, + // GOLDEN (and all other refs) to slot 4. + // No update. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 4; + rtc_ref->ref_idx[SVC_LAST_FRAME] = 2; + } + } else if (svc->temporal_layer_id == 1) { + // Middle temporal enhancement layer. + if (svc->spatial_layer_id == 0) { + // Reference LAST. + // Set all buffer_idx to 0. + // Set GOLDEN to slot 5 and update slot 5. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0; + if (svc->temporal_layer_id < svc->number_temporal_layers - 1 || + svc->spatial_layer_id < svc->number_spatial_layers - 1) { + rtc_ref->ref_idx[SVC_GOLDEN_FRAME] = 5; + rtc_ref->refresh[5] = 1; + } + } else if (svc->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1, + // GOLDEN (and all other refs) to slot 5. + // Set LAST3 to slot 6 and update slot 6. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 5; + rtc_ref->ref_idx[SVC_LAST_FRAME] = 1; + if (svc->temporal_layer_id < svc->number_temporal_layers - 1 || + svc->spatial_layer_id < svc->number_spatial_layers - 1) { + rtc_ref->ref_idx[SVC_LAST3_FRAME] = 6; + rtc_ref->refresh[6] = 1; + } + } else if (svc->spatial_layer_id == 2) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2, + // GOLDEN (and all other refs) to slot 6. + // Set LAST3 to slot 7 and update slot 7. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 6; + rtc_ref->ref_idx[SVC_LAST_FRAME] = 2; + if (svc->temporal_layer_id < svc->number_temporal_layers - 1) { + rtc_ref->ref_idx[SVC_LAST3_FRAME] = 7; + rtc_ref->refresh[7] = 1; + } + } + } else if (svc->temporal_layer_id == 2 && (superframe_cnt - 3) % 4 == 0) { + // Second top temporal enhancement layer. + if (svc->spatial_layer_id == 0) { + // Set LAST to slot 5 and reference LAST. + // Set GOLDEN to slot 3 and update slot 3. + // Set all other buffer_idx to 0. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0; + rtc_ref->ref_idx[SVC_LAST_FRAME] = 5; + if (svc->spatial_layer_id < svc->number_spatial_layers - 1) { + rtc_ref->ref_idx[SVC_GOLDEN_FRAME] = 3; + rtc_ref->refresh[3] = 1; + } + } else if (svc->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 6, + // GOLDEN to slot 3. Set LAST2 to slot 4 and update slot 4. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0; + rtc_ref->ref_idx[SVC_LAST_FRAME] = 6; + rtc_ref->ref_idx[SVC_GOLDEN_FRAME] = 3; + if (svc->spatial_layer_id < svc->number_spatial_layers - 1) { + rtc_ref->ref_idx[SVC_LAST2_FRAME] = 4; + rtc_ref->refresh[4] = 1; + } + } else if (svc->spatial_layer_id == 2) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 7, + // GOLDEN to slot 4. No update. 
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 7;
+ rtc_ref->ref_idx[SVC_GOLDEN_FRAME] = 4;
+ }
+ }
+}
+
+void av1_svc_check_reset_layer_rc_flag(AV1_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+ for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
+ // Check for reset based on avg_frame_bandwidth for spatial layer sl.
+ // If avg_frame_bandwidth for the top temporal layer is not set (because
+ // the enhancement layer was inactive), use the base TL0.
+ int layer = LAYER_IDS_TO_IDX(sl, svc->number_temporal_layers - 1,
+ svc->number_temporal_layers);
+ LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ RATE_CONTROL *lrc = &lc->rc;
+ int avg_frame_bandwidth = lrc->avg_frame_bandwidth;
+ int prev_avg_frame_bandwidth = lrc->prev_avg_frame_bandwidth;
+ if (avg_frame_bandwidth == 0 || prev_avg_frame_bandwidth == 0) {
+ // Use base TL0.
+ layer = LAYER_IDS_TO_IDX(sl, 0, svc->number_temporal_layers);
+ lc = &svc->layer_context[layer];
+ lrc = &lc->rc;
+ avg_frame_bandwidth = lrc->avg_frame_bandwidth;
+ prev_avg_frame_bandwidth = lrc->prev_avg_frame_bandwidth;
+ }
+ if (avg_frame_bandwidth > (3 * prev_avg_frame_bandwidth >> 1) ||
+ avg_frame_bandwidth < (prev_avg_frame_bandwidth >> 1)) {
+ // Reset for all temporal layers with spatial layer sl.
+ for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
+ int layer2 = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+ LAYER_CONTEXT *lc2 = &svc->layer_context[layer2];
+ RATE_CONTROL *lrc2 = &lc2->rc;
+ PRIMARY_RATE_CONTROL *const lp_rc2 = &lc2->p_rc;
+ lrc2->rc_1_frame = 0;
+ lrc2->rc_2_frame = 0;
+ lp_rc2->bits_off_target = lp_rc2->optimal_buffer_level;
+ lp_rc2->buffer_level = lp_rc2->optimal_buffer_level;
+ }
+ }
+ }
+}
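+
+// Editor's note: an illustrative sketch (not upstream libaom code) of the
+// reset condition used above. The shifts implement a +/-50% band around the
+// previous average frame bandwidth: the layer rate control is reset when the
+// new average is more than 1.5x, or less than 0.5x, the previous one.
+#if 0
+static bool example_needs_rc_reset(int avg_bw, int prev_avg_bw) {
+ return avg_bw > (3 * prev_avg_bw >> 1) || avg_bw < (prev_avg_bw >> 1);
+}
+#endif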
+
+void av1_svc_set_last_source(AV1_COMP *const cpi, EncodeFrameInput *frame_input,
+ YV12_BUFFER_CONFIG *prev_source) {
+ frame_input->last_source = prev_source;
+ if (!cpi->ppi->use_svc && cpi->rc.prev_frame_is_dropped &&
+ cpi->rc.frame_number_encoded > 0) {
+ frame_input->last_source = &cpi->svc.source_last_TL0;
+ } else {
+ RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+ if (cpi->svc.spatial_layer_id == 0) {
+ // For the base spatial layer: if the LAST reference (index 0) is not
+ // the previous (super)frame, set last_source to the source
+ // corresponding to the last TL0; otherwise keep it at prev_source.
+ // Always use source_last_TL0 if the previous base TL0 was dropped.
+ if (cpi->svc.current_superframe > 0) {
+ const int buffslot_last = rtc_ref->ref_idx[0];
+ // Check if the previous frame was dropped on the base TL0 layer.
+ const int layer =
+ LAYER_IDS_TO_IDX(0, 0, cpi->svc.number_temporal_layers);
+ LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
+ RATE_CONTROL *lrc = &lc->rc;
+ if (lrc->prev_frame_is_dropped ||
+ rtc_ref->buffer_time_index[buffslot_last] <
+ cpi->svc.current_superframe - 1) {
+ frame_input->last_source = &cpi->svc.source_last_TL0;
+ }
+ }
+ } else if (cpi->svc.spatial_layer_id > 0) {
+ // For spatial enhancement layers: the previous source (prev_source)
+ // corresponds to the lower spatial layer (which is the same source, so
+ // it cannot be used); always set last_source to the source of the last
+ // TL0.
+ if (cpi->svc.current_superframe > 0)
+ frame_input->last_source = &cpi->svc.source_last_TL0;
+ else
+ frame_input->last_source = NULL;
+ }
+ }
+}
+
+int av1_svc_get_min_ref_dist(const AV1_COMP *cpi) {
+ RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+ int min_dist = INT_MAX;
+ const unsigned int current_frame_num =
+ cpi->ppi->use_svc ? cpi->svc.current_superframe
+ : cpi->common.current_frame.frame_number;
+ for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) {
+ if (cpi->ppi->rtc_ref.reference[i]) {
+ const int ref_frame_map_idx = rtc_ref->ref_idx[i];
+ const int dist =
+ current_frame_num - rtc_ref->buffer_time_index[ref_frame_map_idx];
+ if (dist < min_dist) min_dist = dist;
+ }
+ }
+ return min_dist;
+}
+
+void av1_svc_set_reference_was_previous(AV1_COMP *cpi) {
+ RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+ // Check if the encoded frame had some reference that was the
+ // previous frame.
+ const unsigned int current_frame =
+ cpi->ppi->use_svc ? cpi->svc.current_superframe
+ : cpi->common.current_frame.frame_number;
+ rtc_ref->reference_was_previous_frame = true;
+ if (current_frame > 0) {
+ rtc_ref->reference_was_previous_frame = false;
+ for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) {
+ if (rtc_ref->reference[i]) {
+ const int ref_frame_map_idx = rtc_ref->ref_idx[i];
+ if (rtc_ref->buffer_time_index[ref_frame_map_idx] == current_frame - 1)
+ rtc_ref->reference_was_previous_frame = true;
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/encoder/svc_layercontext.h b/third_party/aom/av1/encoder/svc_layercontext.h
new file mode 100644
index 0000000000..93118be2d4
--- /dev/null
+++ b/third_party/aom/av1/encoder/svc_layercontext.h
@@ -0,0 +1,325 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AV1_ENCODER_SVC_LAYERCONTEXT_H_
+#define AOM_AV1_ENCODER_SVC_LAYERCONTEXT_H_
+
+#include "aom_scale/yv12config.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/ratectrl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!
+ * \brief The structure of quantities related to each spatial and temporal
+ * layer.
+ * \ingroup SVC
+ */
+typedef struct {
+ /*!\cond */
+ RATE_CONTROL rc;
+ PRIMARY_RATE_CONTROL p_rc;
+ int framerate_factor;
+ int64_t layer_target_bitrate; // In bits per second.
+ int scaling_factor_num;
+ int scaling_factor_den;
+ int64_t target_bandwidth;
+ int64_t spatial_layer_target_bandwidth;
+ double framerate;
+ int avg_frame_size;
+ int max_q;
+ int min_q;
+ int frames_from_key_frame;
+ /*!\endcond */
+
+ /*!
+ * Cyclic refresh parameters (aq-mode=3) that need to be updated per-frame.
+ */
+ int sb_index;
+ /*!
+ * Segmentation map.
+ */
+ int8_t *map;
+ /*!
+ * Number of blocks on segment 1.
+ */
+ int actual_num_seg1_blocks;
+
+ /*!
+ * Number of blocks on segment 2.
+ */
+ int actual_num_seg2_blocks;
+ /*!
+ * Counter used to detect scene change.
+ */
+ int counter_encode_maxq_scene_change;
+
+ /*!
+ * Speed settings for each layer.
+ */
+ uint8_t speed;
+ /*!
+ * GF group index.
+ */
+ unsigned char group_index;
+ /*!
+ * If current layer is key frame.
+ */
+ int is_key_frame;
+ /*!
+ * Maximum motion magnitude of previous encoded layer.
+ */
+ int max_mv_magnitude;
+} LAYER_CONTEXT;
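+
+/* Editor's illustration (not part of the upstream header): layer contexts are
+ * stored in a flat array indexed as sl * number_temporal_layers + tl, matching
+ * get_layer_context() and the LAYER_IDS_TO_IDX() usage in svc_layercontext.c.
+ * For example, with 3 temporal layers, (sl = 2, tl = 1) maps to index
+ * 2 * 3 + 1 = 7.
+ */
+#if 0
+static int example_layer_index(int sl, int tl, int number_temporal_layers) {
+ return sl * number_temporal_layers + tl;
+}
+#endif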
+
+/*!
+ * \brief The structure of SVC.
+ * \ingroup SVC
+ */
+typedef struct SVC {
+ /*!\cond */
+ int spatial_layer_id;
+ int temporal_layer_id;
+ int number_spatial_layers;
+ int number_temporal_layers;
+ int prev_number_spatial_layers;
+ int use_flexible_mode;
+ int ksvc_fixed_mode;
+ /*!\endcond */
+
+ /*!\cond */
+ double base_framerate;
+ unsigned int current_superframe;
+ int skip_mvsearch_last;
+ int skip_mvsearch_gf;
+ int skip_mvsearch_altref;
+ int spatial_layer_fb[REF_FRAMES];
+ int temporal_layer_fb[REF_FRAMES];
+ int num_encoded_top_layer;
+ int first_layer_denoise;
+ YV12_BUFFER_CONFIG source_last_TL0;
+ int mi_cols_full_resoln;
+ int mi_rows_full_resoln;
+ /*!\endcond */
+
+ /*!
+ * Layer context used for rate control in CBR mode.
+ * An array. The index for spatial layer `sl` and temporal layer `tl` is
+ * sl * number_temporal_layers + tl.
+ */
+ LAYER_CONTEXT *layer_context;
+
+ /*!
+ * Number of layers allocated for layer_context. If nonzero, must be greater
+ * than or equal to number_spatial_layers * number_temporal_layers.
+ */
+ int num_allocated_layers;
+
+ /*!
+ * EIGHTTAP_SMOOTH or BILINEAR
+ */
+ InterpFilter downsample_filter_type[AOM_MAX_SS_LAYERS];
+
+ /*!
+ * Downsample filter phase: 0 will do sub-sampling (no weighted average);
+ * 8 will center the target pixel and get a symmetric averaging filter.
+ */
+ int downsample_filter_phase[AOM_MAX_SS_LAYERS];
+
+ /*!
+ * Force zero-mv in mode search for the spatial/inter-layer reference.
+ */
+ int force_zero_mode_spatial_ref;
+
+ /*!
+ * Flag to indicate that the current spatial layer has a lower quality layer
+ * (at the same timestamp) that can be used as a reference.
+ * Lower quality layer refers to the same resolution but encoded at a
+ * different/lower bitrate.
+ */
+ int has_lower_quality_layer;
+
+ /*!
+ * Flag to indicate the frame drop mode for SVC: one of the two settings:
+ * AOM_LAYER_DROP (default) or AOM_FULL_SUPERFRAME_DROP.
+ */
+ AOM_SVC_FRAME_DROP_MODE framedrop_mode;
+
+ /*!
+ * Flag to indicate if a frame was dropped for a given spatial_layer_id on
+ * the previous superframe.
+ */
+ bool last_layer_dropped[AOM_MAX_SS_LAYERS];
+
+ /*!
+ * Flag to indicate if a previous spatial layer was dropped for the same
+ * superframe.
+ */
+ bool drop_spatial_layer[AOM_MAX_SS_LAYERS];
+} SVC;
+
+struct AV1_COMP;
+struct EncodeFrameInput;
+
+/*!\brief Initialize layer context data from init_config().
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ *
+ * \remark Nothing returned. Set cpi->svc.
+ */
+void av1_init_layer_context(struct AV1_COMP *const cpi);
+
+/*!\brief Allocate layer context data.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] num_layers Number of layers to be allocated
+ *
+ * \remark Allocates memory for cpi->svc.layer_context.
+ * \return True on success, false on allocation failure.
+ */
+bool av1_alloc_layer_context(struct AV1_COMP *cpi, int num_layers);
+
+/*!\brief Update the layer context from a change_config() call.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] target_bandwidth Total target bandwidth
+ *
+ * \remark Nothing returned. Buffer level for each layer is set.
+ */ +void av1_update_layer_context_change_config(struct AV1_COMP *const cpi, + const int64_t target_bandwidth); + +/*!\brief Prior to encoding the frame, update framerate-related quantities + for the current temporal layer. + * + * \ingroup SVC + * \callgraph + * \callergraph + * + * \param[in] cpi Top level encoder structure + * + * \remark Nothing returned. Frame related quantities for current temporal + layer are updated. + */ +void av1_update_temporal_layer_framerate(struct AV1_COMP *const cpi); + +/*!\brief Prior to encoding the frame, set the layer context, for the current + layer to be encoded, to the cpi struct. + * + * \ingroup SVC + * \callgraph + * \callergraph + * + * \param[in] cpi Top level encoder structure + * + * \remark Nothing returned. Layer context for current layer is set. + */ +void av1_restore_layer_context(struct AV1_COMP *const cpi); + +/*!\brief Save the layer context after encoding the frame. + * + * \ingroup SVC + * \callgraph + * \callergraph + * + * \param[in] cpi Top level encoder structure + */ +void av1_save_layer_context(struct AV1_COMP *const cpi); + +/*!\brief Free the memory used for cyclic refresh in layer context. + * + * \ingroup SVC + * \callgraph + * \callergraph + * + * \param[in] cpi Top level encoder structure + */ +void av1_free_svc_cyclic_refresh(struct AV1_COMP *const cpi); + +/*!\brief Reset on key frame: reset counters, references and buffer updates. + * + * \ingroup SVC + * \callgraph + * \callergraph + * + * \param[in] cpi Top level encoder structure + * \param[in] is_key Whether current layer is key frame + */ +void av1_svc_reset_temporal_layers(struct AV1_COMP *const cpi, int is_key); + +/*!\brief Before encoding, set resolutions and allocate compressor data. + * + * \ingroup SVC + * \callgraph + * \callergraph + * + * \param[in] cpi Top level encoder structure + */ +void av1_one_pass_cbr_svc_start_layer(struct AV1_COMP *const cpi); + +/*!\brief Get primary reference frame for current layer + * + * \ingroup SVC + * \callgraph + * \callergraph + * + * \param[in] cpi Top level encoder structure + * + * \return The primary reference frame for current layer. + */ +int av1_svc_primary_ref_frame(const struct AV1_COMP *const cpi); + +/*!\brief Get resolution for current layer. + * + * \ingroup SVC + * \param[in] width_org Original width, unscaled + * \param[in] height_org Original height, unscaled + * \param[in] num Numerator for the scale ratio + * \param[in] den Denominator for the scale ratio + * \param[in] width_out Output width, scaled for current layer + * \param[in] height_out Output height, scaled for current layer + * + * \remark Nothing is returned. Instead the scaled width and height are set. 
+ */
+void av1_get_layer_resolution(const int width_org, const int height_org,
+ const int num, const int den, int *width_out,
+ int *height_out);
+
+void av1_set_svc_fixed_mode(struct AV1_COMP *const cpi);
+
+void av1_svc_check_reset_layer_rc_flag(struct AV1_COMP *const cpi);
+
+void av1_svc_set_last_source(struct AV1_COMP *const cpi,
+ struct EncodeFrameInput *frame_input,
+ YV12_BUFFER_CONFIG *prev_source);
+
+void av1_svc_update_buffer_slot_refreshed(struct AV1_COMP *const cpi);
+
+int av1_svc_get_min_ref_dist(const struct AV1_COMP *cpi);
+
+void av1_svc_set_reference_was_previous(struct AV1_COMP *cpi);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_SVC_LAYERCONTEXT_H_
diff --git a/third_party/aom/av1/encoder/temporal_filter.c b/third_party/aom/av1/encoder/temporal_filter.c
new file mode 100644
index 0000000000..7d4d25de6a
--- /dev/null
+++ b/third_party/aom/av1/encoder/temporal_filter.c
@@ -0,0 +1,1520 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <float.h>
+#include <limits.h>
+#include <math.h>
+
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/mathutils.h"
+#include "aom_dsp/odintrin.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "aom_scale/aom_scale.h"
+#include "av1/common/alloccommon.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/gop_structure.h"
+#include "av1/encoder/intra_mode_search_utils.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/pass2_strategy.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/temporal_filter.h"
+
+/*!\cond */
+
+// NOTE: All `tf` in this file means `temporal filtering`.
+
+// Forward Declaration.
+static void tf_determine_block_partition(const MV block_mv, const int block_mse,
+ MV *subblock_mvs, int *subblock_mses);
+
+// This function returns the minimum and maximum log variances of the 4x4
+// sub-blocks in the current block.
+static INLINE void get_log_var_4x4sub_blk(
+ AV1_COMP *cpi, const YV12_BUFFER_CONFIG *const frame_to_filter, int mb_row,
+ int mb_col, BLOCK_SIZE block_size, double *blk_4x4_var_min,
+ double *blk_4x4_var_max, int is_hbd) {
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ int var_min = INT_MAX;
+ int var_max = 0;
+
+ // Derive the source buffer.
+ const int src_stride = frame_to_filter->y_stride;
+ const int y_offset = mb_row * mb_height * src_stride + mb_col * mb_width;
+ const uint8_t *src_buf = frame_to_filter->y_buffer + y_offset;
+
+ for (int i = 0; i < mb_height; i += MI_SIZE) {
+ for (int j = 0; j < mb_width; j += MI_SIZE) {
+ // Calculate the 4x4 sub-block variance.
+ const int var = av1_calc_normalized_variance(
+ cpi->ppi->fn_ptr[BLOCK_4X4].vf, src_buf + (i * src_stride) + j,
+ src_stride, is_hbd);
+
+ // Record min and max for the overarching block.
+ var_min = AOMMIN(var_min, var);
+ var_max = AOMMAX(var_max, var);
+ }
+ }
+
+ *blk_4x4_var_min = log1p(var_min / 16.0);
+ *blk_4x4_var_max = log1p(var_max / 16.0);
+}
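+
+// Editor's illustration (not upstream code): the log-variance bounds computed
+// above feed a gate in av1_tf_do_filtering_row() further below; motion search
+// at the 16x16 sub-block level is allowed only when the spread between the
+// bounds exceeds an empirically derived threshold of 4.0.
+#if 0
+static bool example_allow_sub_blk_me(double blk_4x4_var_min,
+ double blk_4x4_var_max) {
+ return (blk_4x4_var_max - blk_4x4_var_min) > 4.0;
+}
+#endif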
+
+/*!\endcond */
+/*!\brief Does motion search for blocks in temporal filtering. This is
+ * the first step for temporal filtering. More specifically, given a frame to
+ * be filtered and another frame as reference, this function searches the
+ * reference frame for the block most similar to the one in the frame to be
+ * filtered. The block found this way is then used for weighted averaging.
+ *
+ * NOTE: Besides doing motion search for the entire block, this function will
+ * also do motion search for each 1/4 sub-block to get more precise
+ * predictions. Then, this function will determine whether to use 4
+ * sub-blocks to replace the entire block. If we do need to split the
+ * entire block, 4 elements in `subblock_mvs` and `subblock_mses` refer to
+ * the searched motion vector and search error (MSE) w.r.t. each sub-block
+ * respectively. Otherwise, the 4 elements will be the same, all of which
+ * are assigned as the searched motion vector and search error (MSE) for
+ * the entire block.
+ *
+ * \ingroup src_frame_proc
+ * \param[in] cpi Top level encoder instance structure
+ * \param[in] mb Pointer to macroblock
+ * \param[in] frame_to_filter Pointer to the frame to be filtered
+ * \param[in] ref_frame Pointer to the reference frame
+ * \param[in] block_size Block size used for motion search
+ * \param[in] mb_row Row index of the block in the frame
+ * \param[in] mb_col Column index of the block in the frame
+ * \param[in] ref_mv Reference motion vector, which is commonly
+ * inherited from the motion search result of the
+ * previous frame.
+ * \param[in] allow_me_for_sub_blks Flag to indicate whether motion search at
+ * the 16x16 sub-block level is needed or not.
+ * \param[out] subblock_mvs Pointer to the motion vectors for
+ * 4 sub-blocks
+ * \param[out] subblock_mses Pointer to the search errors (MSE) for
+ * 4 sub-blocks
+ *
+ * \remark Nothing will be returned. Results are saved in subblock_mvs and
+ * subblock_mses
+ */
+static void tf_motion_search(AV1_COMP *cpi, MACROBLOCK *mb,
+ const YV12_BUFFER_CONFIG *frame_to_filter,
+ const YV12_BUFFER_CONFIG *ref_frame,
+ const BLOCK_SIZE block_size, const int mb_row,
+ const int mb_col, MV *ref_mv,
+ bool allow_me_for_sub_blks, MV *subblock_mvs,
+ int *subblock_mses) {
+ // Frame information
+ const int min_frame_size = AOMMIN(cpi->common.width, cpi->common.height);
+
+ // Block information (ONLY Y-plane is used for motion search).
+ const int mb_height = block_size_high[block_size]; + const int mb_width = block_size_wide[block_size]; + const int mb_pels = mb_height * mb_width; + const int y_stride = frame_to_filter->y_stride; + const int src_width = frame_to_filter->y_width; + const int ref_width = ref_frame->y_width; + assert(y_stride == ref_frame->y_stride); + assert(src_width == ref_width); + const int y_offset = mb_row * mb_height * y_stride + mb_col * mb_width; + + // Save input state. + MACROBLOCKD *const mbd = &mb->e_mbd; + const struct buf_2d ori_src_buf = mb->plane[0].src; + const struct buf_2d ori_pre_buf = mbd->plane[0].pre[0]; + + // Parameters used for motion search. + FULLPEL_MOTION_SEARCH_PARAMS full_ms_params; + SUBPEL_MOTION_SEARCH_PARAMS ms_params; + const int step_param = av1_init_search_range( + AOMMAX(frame_to_filter->y_crop_width, frame_to_filter->y_crop_height)); + const SUBPEL_SEARCH_TYPE subpel_search_type = USE_8_TAPS; + const int force_integer_mv = cpi->common.features.cur_frame_force_integer_mv; + const MV_COST_TYPE mv_cost_type = + min_frame_size >= 720 + ? MV_COST_L1_HDRES + : (min_frame_size >= 480 ? MV_COST_L1_MIDRES : MV_COST_L1_LOWRES); + + // Starting position for motion search. + FULLPEL_MV start_mv = get_fullmv_from_mv(ref_mv); + // Baseline position for motion search (used for rate distortion comparison). + const MV baseline_mv = kZeroMv; + + // Setup. + mb->plane[0].src.buf = frame_to_filter->y_buffer + y_offset; + mb->plane[0].src.stride = y_stride; + mb->plane[0].src.width = src_width; + mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset; + mbd->plane[0].pre[0].stride = y_stride; + mbd->plane[0].pre[0].width = ref_width; + + const SEARCH_METHODS search_method = NSTEP; + const search_site_config *search_site_cfg = + av1_get_search_site_config(cpi, mb, search_method); + + // Unused intermediate results for motion search. + unsigned int sse, error; + int distortion; + int cost_list[5]; + + // Do motion search. + int_mv best_mv; // Searched motion vector. + FULLPEL_MV_STATS best_mv_stats; + int block_mse = INT_MAX; + MV block_mv = kZeroMv; + const int q = av1_get_q(cpi); + + av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb, block_size, + &baseline_mv, start_mv, search_site_cfg, + search_method, + /*fine_search_interval=*/0); + full_ms_params.run_mesh_search = 1; + full_ms_params.mv_cost_params.mv_cost_type = mv_cost_type; + + if (cpi->sf.mv_sf.prune_mesh_search == PRUNE_MESH_SEARCH_LVL_1) { + // Enable prune_mesh_search based on q for PRUNE_MESH_SEARCH_LVL_1. + full_ms_params.prune_mesh_search = (q <= 20) ? 0 : 1; + full_ms_params.mesh_search_mv_diff_threshold = 2; + } + + av1_full_pixel_search(start_mv, &full_ms_params, step_param, + cond_cost_list(cpi, cost_list), &best_mv.as_fullmv, + &best_mv_stats, NULL); + + if (force_integer_mv == 1) { // Only do full search on the entire block. + const int mv_row = best_mv.as_mv.row; + const int mv_col = best_mv.as_mv.col; + best_mv.as_mv.row = GET_MV_SUBPEL(mv_row); + best_mv.as_mv.col = GET_MV_SUBPEL(mv_col); + const int mv_offset = mv_row * y_stride + mv_col; + error = cpi->ppi->fn_ptr[block_size].vf( + ref_frame->y_buffer + y_offset + mv_offset, y_stride, + frame_to_filter->y_buffer + y_offset, y_stride, &sse); + block_mse = DIVIDE_AND_ROUND(error, mb_pels); + block_mv = best_mv.as_mv; + } else { // Do fractional search on the entire block and all sub-blocks. 
+ av1_make_default_subpel_ms_params(&ms_params, cpi, mb, block_size, + &baseline_mv, cost_list); + ms_params.forced_stop = EIGHTH_PEL; + ms_params.var_params.subpel_search_type = subpel_search_type; + // Since we are merely refining the result from full pixel search, we don't + // need regularization for subpel search + ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE; + best_mv_stats.err_cost = 0; + + MV subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv); + assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv)); + error = cpi->mv_search_params.find_fractional_mv_step( + &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv, &best_mv_stats, + &best_mv.as_mv, &distortion, &sse, NULL); + block_mse = DIVIDE_AND_ROUND(error, mb_pels); + block_mv = best_mv.as_mv; + *ref_mv = best_mv.as_mv; + + if (allow_me_for_sub_blks) { + // On 4 sub-blocks. + const BLOCK_SIZE subblock_size = av1_ss_size_lookup[block_size][1][1]; + const int subblock_height = block_size_high[subblock_size]; + const int subblock_width = block_size_wide[subblock_size]; + const int subblock_pels = subblock_height * subblock_width; + start_mv = get_fullmv_from_mv(ref_mv); + + int subblock_idx = 0; + for (int i = 0; i < mb_height; i += subblock_height) { + for (int j = 0; j < mb_width; j += subblock_width) { + const int offset = i * y_stride + j; + mb->plane[0].src.buf = frame_to_filter->y_buffer + y_offset + offset; + mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset + offset; + av1_make_default_fullpel_ms_params( + &full_ms_params, cpi, mb, subblock_size, &baseline_mv, start_mv, + search_site_cfg, search_method, + /*fine_search_interval=*/0); + full_ms_params.run_mesh_search = 1; + full_ms_params.mv_cost_params.mv_cost_type = mv_cost_type; + + if (cpi->sf.mv_sf.prune_mesh_search == PRUNE_MESH_SEARCH_LVL_1) { + // Enable prune_mesh_search based on q for PRUNE_MESH_SEARCH_LVL_1. + full_ms_params.prune_mesh_search = (q <= 20) ? 0 : 1; + full_ms_params.mesh_search_mv_diff_threshold = 2; + } + av1_full_pixel_search(start_mv, &full_ms_params, step_param, + cond_cost_list(cpi, cost_list), + &best_mv.as_fullmv, &best_mv_stats, NULL); + + av1_make_default_subpel_ms_params(&ms_params, cpi, mb, subblock_size, + &baseline_mv, cost_list); + ms_params.forced_stop = EIGHTH_PEL; + ms_params.var_params.subpel_search_type = subpel_search_type; + // Since we are merely refining the result from full pixel search, we + // don't need regularization for subpel search + ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE; + best_mv_stats.err_cost = 0; + + subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv); + assert( + av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv)); + error = cpi->mv_search_params.find_fractional_mv_step( + &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv, + &best_mv_stats, &best_mv.as_mv, &distortion, &sse, NULL); + subblock_mses[subblock_idx] = DIVIDE_AND_ROUND(error, subblock_pels); + subblock_mvs[subblock_idx] = best_mv.as_mv; + ++subblock_idx; + } + } + } + } + + // Restore input state. + mb->plane[0].src = ori_src_buf; + mbd->plane[0].pre[0] = ori_pre_buf; + + // Make partition decision. + if (allow_me_for_sub_blks) { + tf_determine_block_partition(block_mv, block_mse, subblock_mvs, + subblock_mses); + } else { + // Copy 32X32 block mv and mse values to sub blocks + for (int i = 0; i < 4; ++i) { + subblock_mvs[i] = block_mv; + subblock_mses[i] = block_mse; + } + } + // Do not pass down the reference motion vector if error is too large. 
+ const int thresh = (min_frame_size >= 720) ? 12 : 3;
+ if (block_mse > (thresh << (mbd->bd - 8))) {
+ *ref_mv = kZeroMv;
+ }
+}
+/*!\cond */
+
+// Determines whether to split the entire block into 4 sub-blocks for
+// filtering. In particular, this decision is made based on the comparison
+// between the motion search error of the entire block and the errors of all
+// sub-blocks.
+// Inputs:
+// block_mv: Motion vector for the entire block (ONLY as reference).
+// block_mse: Motion search error (MSE) for the entire block (ONLY as
+// reference).
+// subblock_mvs: Pointer to the motion vectors for 4 sub-blocks (will be
+// modified based on the partition decision).
+// subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks (will
+// be modified based on the partition decision).
+// Returns:
+// Nothing will be returned. Results are saved in `subblock_mvs` and
+// `subblock_mses`.
+static void tf_determine_block_partition(const MV block_mv, const int block_mse,
+ MV *subblock_mvs, int *subblock_mses) {
+ int min_subblock_mse = INT_MAX;
+ int max_subblock_mse = INT_MIN;
+ int64_t sum_subblock_mse = 0;
+ for (int i = 0; i < 4; ++i) {
+ sum_subblock_mse += subblock_mses[i];
+ min_subblock_mse = AOMMIN(min_subblock_mse, subblock_mses[i]);
+ max_subblock_mse = AOMMAX(max_subblock_mse, subblock_mses[i]);
+ }
+
+ // TODO(any): The following magic numbers may be tuned to improve the
+ // performance OR find a way to get rid of these magic numbers.
+ if (((block_mse * 15 < sum_subblock_mse * 4) &&
+ max_subblock_mse - min_subblock_mse < 48) ||
+ ((block_mse * 14 < sum_subblock_mse * 4) &&
+ max_subblock_mse - min_subblock_mse < 24)) { // No split.
+ for (int i = 0; i < 4; ++i) {
+ subblock_mvs[i] = block_mv;
+ subblock_mses[i] = block_mse;
+ }
+ }
+}
+
+// Helper function to determine whether a frame is encoded with high bit-depth.
+static INLINE int is_frame_high_bitdepth(const YV12_BUFFER_CONFIG *frame) {
+ return (frame->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+}
+
+/*!\endcond */
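+
+// Editor's illustration (not upstream code): the no-split condition above,
+// restated as a predicate. The whole-block result is kept when the
+// whole-block MSE is well below the (scaled) sum of the four sub-block MSEs
+// and the sub-block MSEs are close to each other. For example,
+// block_mse = 20 and subblock_mses = {19, 21, 22, 18} give sum = 80;
+// 20 * 15 = 300 < 80 * 4 = 320 and max - min = 4 < 48, so no split.
+#if 0
+static bool example_no_split(int block_mse, const int subblock_mses[4]) {
+ int mn = INT_MAX, mx = INT_MIN;
+ int64_t sum = 0;
+ for (int i = 0; i < 4; ++i) {
+ sum += subblock_mses[i];
+ mn = AOMMIN(mn, subblock_mses[i]);
+ mx = AOMMAX(mx, subblock_mses[i]);
+ }
+ return ((int64_t)block_mse * 15 < sum * 4 && mx - mn < 48) ||
+ ((int64_t)block_mse * 14 < sum * 4 && mx - mn < 24);
+}
+#endif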
+/*!\brief Builds the predictor for blocks in temporal filtering. This is the
+ * second step for temporal filtering, which is to construct predictions from
+ * all reference frames INCLUDING the frame to be filtered itself. These
+ * predictors are built based on the motion search results (the motion vector
+ * is set to 0 for the frame to be filtered), and will be further used for
+ * weighted averaging.
+ *
+ * \ingroup src_frame_proc
+ * \param[in] ref_frame Pointer to the reference frame (or the frame
+ * to be filtered)
+ * \param[in] mbd Pointer to the block for filtering. Besides
+ * containing the subsampling information of all
+ * planes, this field also gives the searched
+ * motion vector for the entire block, i.e.,
+ * `mbd->mi[0]->mv[0]`. This vector should be 0
+ * if the `ref_frame` itself is the frame to be
+ * filtered.
+ * \param[in] block_size Size of the block
+ * \param[in] mb_row Row index of the block in the frame
+ * \param[in] mb_col Column index of the block in the frame
+ * \param[in] num_planes Number of planes in the frame
+ * \param[in] scale Scaling factor
+ * \param[in] subblock_mvs The motion vectors for each sub-block (row-major
+ * order)
+ * \param[out] pred Pointer to the predictor to be built
+ *
+ * \remark Nothing returned, but the contents of `pred` will be modified
+ */
+static void tf_build_predictor(const YV12_BUFFER_CONFIG *ref_frame,
+ const MACROBLOCKD *mbd,
+ const BLOCK_SIZE block_size, const int mb_row,
+ const int mb_col, const int num_planes,
+ const struct scale_factors *scale,
+ const MV *subblock_mvs, uint8_t *pred) {
+ // Information of the entire block.
+ const int mb_height = block_size_high[block_size]; // Height.
+ const int mb_width = block_size_wide[block_size]; // Width.
+ const int mb_y = mb_height * mb_row; // Y-coord (Top-left).
+ const int mb_x = mb_width * mb_col; // X-coord (Top-left).
+ const int bit_depth = mbd->bd; // Bit depth.
+ const int is_intrabc = 0; // Is intra-copied?
+ const int is_high_bitdepth = is_frame_high_bitdepth(ref_frame);
+
+ // Default interpolation filters.
+ const int_interpfilters interp_filters =
+ av1_broadcast_interp_filter(MULTITAP_SHARP2);
+
+ // Handle Y-plane, U-plane and V-plane (if needed) in sequence.
+ int plane_offset = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const int subsampling_y = mbd->plane[plane].subsampling_y;
+ const int subsampling_x = mbd->plane[plane].subsampling_x;
+ // Information of each sub-block in current plane.
+ const int plane_h = mb_height >> subsampling_y; // Plane height.
+ const int plane_w = mb_width >> subsampling_x; // Plane width.
+ const int plane_y = mb_y >> subsampling_y; // Y-coord (Top-left).
+ const int plane_x = mb_x >> subsampling_x; // X-coord (Top-left).
+ const int h = plane_h >> 1; // Sub-block height.
+ const int w = plane_w >> 1; // Sub-block width.
+ const int is_y_plane = (plane == 0); // Is Y-plane?
+
+ const struct buf_2d ref_buf = { NULL, ref_frame->buffers[plane],
+ ref_frame->widths[is_y_plane ? 0 : 1],
+ ref_frame->heights[is_y_plane ? 0 : 1],
+ ref_frame->strides[is_y_plane ? 0 : 1] };
+
+ // Handle each subblock.
+ int subblock_idx = 0;
+ for (int i = 0; i < plane_h; i += h) {
+ for (int j = 0; j < plane_w; j += w) {
+ // Choose proper motion vector.
+ const MV mv = subblock_mvs[subblock_idx++];
+ assert(mv.row >= INT16_MIN && mv.row <= INT16_MAX &&
+ mv.col >= INT16_MIN && mv.col <= INT16_MAX);
+
+ const int y = plane_y + i;
+ const int x = plane_x + j;
+
+ // Build the predictor for each sub-block on the current plane.
+ InterPredParams inter_pred_params;
+ av1_init_inter_params(&inter_pred_params, w, h, y, x, subsampling_x,
+ subsampling_y, bit_depth, is_high_bitdepth,
+ is_intrabc, scale, &ref_buf, interp_filters);
+ inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth);
+ av1_enc_build_one_inter_predictor(&pred[plane_offset + i * plane_w + j],
+ plane_w, &mv, &inter_pred_params);
+ }
+ }
+ plane_offset += plane_h * plane_w;
+ }
+}
+/*!\cond */
+
+// Computes temporal filter weights and accumulators for the frame to be
+// filtered. More concretely, the filter weights for all pixels are the same.
+// Inputs:
+// mbd: Pointer to the block for filtering, which is ONLY used to get
+// subsampling information of all planes as well as the bit-depth.
+// block_size: Size of the block.
+// num_planes: Number of planes in the frame.
+// pred: Pointer to the well-built predictors.
+// accum: Pointer to the pixel-wise accumulator for filtering.
+// count: Pointer to the pixel-wise counter for filtering.
+// Returns:
+// Nothing will be returned. But the content to which `accum` and `count`
+// point will be modified.
+void tf_apply_temporal_filter_self(const YV12_BUFFER_CONFIG *ref_frame,
+ const MACROBLOCKD *mbd,
+ const BLOCK_SIZE block_size,
+ const int mb_row, const int mb_col,
+ const int num_planes, uint32_t *accum,
+ uint16_t *count) {
+ // Block information.
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ const int is_high_bitdepth = is_cur_buf_hbd(mbd);
+
+ int plane_offset = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const int subsampling_y = mbd->plane[plane].subsampling_y;
+ const int subsampling_x = mbd->plane[plane].subsampling_x;
+ const int h = mb_height >> subsampling_y; // Plane height.
+ const int w = mb_width >> subsampling_x; // Plane width.
+
+ const int frame_stride = ref_frame->strides[plane == AOM_PLANE_Y ? 0 : 1];
+ const uint8_t *buf8 = ref_frame->buffers[plane];
+ const uint16_t *buf16 = CONVERT_TO_SHORTPTR(buf8);
+ const int frame_offset = mb_row * h * frame_stride + mb_col * w;
+
+ int pred_idx = 0;
+ int pixel_idx = 0;
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; ++j) {
+ const int idx = plane_offset + pred_idx; // Index with plane shift.
+ const int pred_value = is_high_bitdepth
+ ? buf16[frame_offset + pixel_idx]
+ : buf8[frame_offset + pixel_idx];
+ accum[idx] += TF_WEIGHT_SCALE * pred_value;
+ count[idx] += TF_WEIGHT_SCALE;
+ ++pred_idx;
+ ++pixel_idx;
+ }
+ pixel_idx += (frame_stride - w);
+ }
+ plane_offset += h * w;
+ }
+}
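+
+// Editor's note (illustration, not upstream code): the frame being filtered
+// contributes to the running average with the maximum per-pixel weight
+// TF_WEIGHT_SCALE; after all frames have been accumulated, each output pixel
+// becomes accum / count, with rounding (see tf_normalize_filtered_frame()).
+#if 0
+static void example_accumulate_pixel(uint32_t *accum, uint16_t *count,
+ int weight, int pred_value) {
+ *accum += (uint32_t)(weight * pred_value);
+ *count += (uint16_t)weight;
+}
+#endif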
+
+// Function to compute the pixel-wise squared difference between two buffers.
+// Inputs:
+// ref: Pointer to reference buffer.
+// ref_offset: Start position of reference buffer for computation.
+// ref_stride: Stride for reference buffer.
+// tgt: Pointer to target buffer.
+// tgt_offset: Start position of target buffer for computation.
+// tgt_stride: Stride for target buffer.
+// height: Height of block for computation.
+// width: Width of block for computation.
+// is_high_bitdepth: Whether the two buffers point to high bit-depth frames.
+// square_diff: Pointer to save the squared differences.
+// Returns:
+// Nothing will be returned. But the content to which `square_diff` points
+// will be modified.
+static INLINE void compute_square_diff(const uint8_t *ref, const int ref_offset,
+ const int ref_stride, const uint8_t *tgt,
+ const int tgt_offset,
+ const int tgt_stride, const int height,
+ const int width,
+ const int is_high_bitdepth,
+ uint32_t *square_diff) {
+ const uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref);
+ const uint16_t *tgt16 = CONVERT_TO_SHORTPTR(tgt);
+
+ int ref_idx = 0;
+ int tgt_idx = 0;
+ int idx = 0;
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; ++j) {
+ const uint16_t ref_value = is_high_bitdepth ? ref16[ref_offset + ref_idx]
+ : ref[ref_offset + ref_idx];
+ const uint16_t tgt_value = is_high_bitdepth ? tgt16[tgt_offset + tgt_idx]
+ : tgt[tgt_offset + tgt_idx];
+ const uint32_t diff = (ref_value > tgt_value) ? (ref_value - tgt_value)
+ : (tgt_value - ref_value);
+ square_diff[idx] = diff * diff;
+
+ ++ref_idx;
+ ++tgt_idx;
+ ++idx;
+ }
+ ref_idx += (ref_stride - width);
+ tgt_idx += (tgt_stride - width);
+ }
+}
+
+// Function to accumulate the pixel-wise squared difference from the luma
+// plane, to be consumed while filtering the chroma planes.
+// Inputs:
+// square_diff: Pointer to squared differences from the luma plane.
+// luma_sse_sum: Pointer to save the sum of luma squared differences.
+// block_height: Height of block for computation.
+// block_width: Width of block for computation.
+// ss_x_shift: Chroma subsampling shift in 'X' direction.
+// ss_y_shift: Chroma subsampling shift in 'Y' direction.
+// Returns:
+// Nothing will be returned. But the content to which `luma_sse_sum` points
+// will be modified.
+void compute_luma_sq_error_sum(uint32_t *square_diff, uint32_t *luma_sse_sum,
+ int block_height, int block_width,
+ int ss_x_shift, int ss_y_shift) {
+ for (int i = 0; i < block_height; ++i) {
+ for (int j = 0; j < block_width; ++j) {
+ for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+ for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+ const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane.
+ const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane.
+ const int ww = block_width << ss_x_shift; // Width of Y-plane.
+ luma_sse_sum[i * block_width + j] += square_diff[yy * ww + xx];
+ }
+ }
+ }
+ }
+}
+
+/*!\endcond */
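+
+// Editor's illustration (not upstream code): schematically, the per-pixel
+// filter weight computed by av1_apply_temporal_filter_c() below follows a
+// non-local-means style exponential decay (constants as used in this file):
+#if 0
+static int example_tf_weight(double window_error, double block_error,
+ double d_factor, double decay_factor) {
+ const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+ TF_SEARCH_ERROR_NORM_WEIGHT);
+ const double weight_factor = TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+ const double combined_error =
+ weight_factor * window_error + block_error * inv_factor;
+ const double scaled_error =
+ AOMMIN(combined_error * d_factor * decay_factor, 7);
+ return (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+}
+#endif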
+/*!\brief Applies temporal filtering. NOTE that there are various optimised
+ * versions of this function called where the appropriate instruction set is
+ * supported.
+ *
+ * \ingroup src_frame_proc
+ * \param[in] frame_to_filter Pointer to the frame to be filtered, which is
+ * used as reference to compute squared
+ * difference from the predictor.
+ * \param[in] mbd Pointer to the block for filtering, ONLY used
+ * to get subsampling information for the planes
+ * \param[in] block_size Size of the block
+ * \param[in] mb_row Row index of the block in the frame
+ * \param[in] mb_col Column index of the block in the frame
+ * \param[in] num_planes Number of planes in the frame
+ * \param[in] noise_levels Estimated noise levels for each plane
+ * in the frame (Y,U,V)
+ * \param[in] subblock_mvs Pointer to the motion vectors for 4 sub-blocks
+ * \param[in] subblock_mses Pointer to the search errors (MSE) for 4
+ * sub-blocks
+ * \param[in] q_factor Quantization factor. This is actually the `q`
+ * defined in libaom, converted from `qindex`
+ * \param[in] filter_strength Filtering strength. This value lies in range
+ * [0, 6] where 6 is the maximum strength.
+ * \param[in] tf_wgt_calc_lvl Controls the weight calculation method during
+ * temporal filtering
+ * \param[in] pred Pointer to the well-built predictors
+ * \param[out] accum Pointer to the pixel-wise accumulator for
+ * filtering
+ * \param[out] count Pointer to the pixel-wise counter for
+ * filtering
+ *
+ * \remark Nothing returned, but the contents of `accum` and `count` will be
+ * modified
+ */
+void av1_apply_temporal_filter_c(
+ const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+ const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+ const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+ const int *subblock_mses, const int q_factor, const int filter_strength,
+ int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
+ uint16_t *count) {
+ // Block information.
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ const int mb_pels = mb_height * mb_width;
+ const int is_high_bitdepth = is_frame_high_bitdepth(frame_to_filter);
+ const uint16_t *pred16 = CONVERT_TO_SHORTPTR(pred);
+ // Frame information.
+ const int frame_height = frame_to_filter->y_crop_height;
+ const int frame_width = frame_to_filter->y_crop_width;
+ const int min_frame_size = AOMMIN(frame_height, frame_width);
+ // Variables to simplify combined error calculation.
+ const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+ TF_SEARCH_ERROR_NORM_WEIGHT);
+ const double weight_factor =
+ (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+ // Decay factors for non-local mean approach.
+ double decay_factor[MAX_MB_PLANE] = { 0 };
+ // Adjust filtering based on q.
+ // Larger q -> stronger filtering -> larger weight.
+ // Smaller q -> weaker filtering -> smaller weight.
+ double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+ q_decay = CLIP(q_decay, 1e-5, 1);
+ if (q_factor >= TF_QINDEX_CUTOFF) {
+ // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+ // We do not need a clip here.
+ q_decay = 0.5 * pow((double)q_factor / 64, 2);
+ }
+ // Smaller strength -> smaller filtering weight.
+ double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+ s_decay = CLIP(s_decay, 1e-5, 1);
+ for (int plane = 0; plane < num_planes; plane++) {
+ // Larger noise -> larger filtering weight.
+ const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+ decay_factor[plane] = 1 / (n_decay * q_decay * s_decay);
+ }
+ double d_factor[4] = { 0 };
+ for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+ // Larger motion vector -> smaller filtering weight.
+ const MV mv = subblock_mvs[subblock_idx];
+ const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+ double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+ distance_threshold = AOMMAX(distance_threshold, 1);
+ d_factor[subblock_idx] = distance / distance_threshold;
+ d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
+ }
+
+ // Allocate memory for pixel-wise squared differences. The buffer is sized
+ // `mb_pels` regardless of the subsampling.
+ uint32_t *square_diff = aom_memalign(16, mb_pels * sizeof(uint32_t));
+ if (!square_diff) {
+ aom_internal_error(mbd->error_info, AOM_CODEC_MEM_ERROR,
+ "Error allocating temporal filter data");
+ }
+ memset(square_diff, 0, mb_pels * sizeof(square_diff[0]));
+
+ // Allocate memory for accumulated luma squared error. This value will be
+ // consumed while filtering the chroma planes.
+ uint32_t *luma_sse_sum = aom_memalign(32, mb_pels * sizeof(uint32_t));
+ if (!luma_sse_sum) {
+ aom_free(square_diff);
+ aom_internal_error(mbd->error_info, AOM_CODEC_MEM_ERROR,
+ "Error allocating temporal filter data");
+ }
+ memset(luma_sse_sum, 0, mb_pels * sizeof(luma_sse_sum[0]));
+
+ // Get window size for pixel-wise filtering.
+ assert(TF_WINDOW_LENGTH % 2 == 1);
+ const int half_window = TF_WINDOW_LENGTH >> 1;
+
+ // Handle planes in sequence.
+ int plane_offset = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ // Locate pixel on reference frame.
+ const int subsampling_y = mbd->plane[plane].subsampling_y;
+ const int subsampling_x = mbd->plane[plane].subsampling_x;
+ const int h = mb_height >> subsampling_y; // Plane height.
+ const int w = mb_width >> subsampling_x; // Plane width.
+ const int frame_stride = + frame_to_filter->strides[plane == AOM_PLANE_Y ? 0 : 1]; + const int frame_offset = mb_row * h * frame_stride + mb_col * w; + const uint8_t *ref = frame_to_filter->buffers[plane]; + const int ss_y_shift = + subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y; + const int ss_x_shift = + subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x; + const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH + + ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0); + const double inv_num_ref_pixels = 1.0 / num_ref_pixels; + + // Filter U-plane and V-plane using Y-plane. This is because motion + // search is only done on Y-plane, so the information from Y-plane will + // be more accurate. The luma sse sum is reused in both chroma planes. + if (plane == AOM_PLANE_U) + compute_luma_sq_error_sum(square_diff, luma_sse_sum, h, w, ss_x_shift, + ss_y_shift); + compute_square_diff(ref, frame_offset, frame_stride, pred, plane_offset, w, + h, w, is_high_bitdepth, square_diff); + + // Perform filtering. + int pred_idx = 0; + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + // non-local mean approach + uint64_t sum_square_diff = 0; + + for (int wi = -half_window; wi <= half_window; ++wi) { + for (int wj = -half_window; wj <= half_window; ++wj) { + const int y = CLIP(i + wi, 0, h - 1); // Y-coord on current plane. + const int x = CLIP(j + wj, 0, w - 1); // X-coord on current plane. + sum_square_diff += square_diff[y * w + x]; + } + } + + sum_square_diff += luma_sse_sum[i * w + j]; + + // Scale down the difference for high bit depth input. + if (mbd->bd > 8) sum_square_diff >>= ((mbd->bd - 8) * 2); + + // Combine window error and block error, and normalize it. + const double window_error = sum_square_diff * inv_num_ref_pixels; + const int subblock_idx = (i >= h / 2) * 2 + (j >= w / 2); + const double block_error = (double)subblock_mses[subblock_idx]; + const double combined_error = + weight_factor * window_error + block_error * inv_factor; + + // Compute filter weight. + double scaled_error = + combined_error * d_factor[subblock_idx] * decay_factor[plane]; + scaled_error = AOMMIN(scaled_error, 7); + int weight; + if (tf_wgt_calc_lvl == 0) { + weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE); + } else { + const float fweight = + approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE; + weight = iroundpf(fweight); + } + + const int idx = plane_offset + pred_idx; // Index with plane shift. + const int pred_value = is_high_bitdepth ? 
pred16[idx] : pred[idx];
+ accum[idx] += weight * pred_value;
+ count[idx] += weight;
+
+ ++pred_idx;
+ }
+ }
+ plane_offset += h * w;
+ }
+
+ aom_free(square_diff);
+ aom_free(luma_sse_sum);
+}
+#if CONFIG_AV1_HIGHBITDEPTH
+// Calls the high bit-depth temporal filter.
+void av1_highbd_apply_temporal_filter_c(
+ const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+ const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+ const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+ const int *subblock_mses, const int q_factor, const int filter_strength,
+ int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
+ uint16_t *count) {
+ av1_apply_temporal_filter_c(frame_to_filter, mbd, block_size, mb_row, mb_col,
+ num_planes, noise_levels, subblock_mvs,
+ subblock_mses, q_factor, filter_strength,
+ tf_wgt_calc_lvl, pred, accum, count);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+/*!\brief Normalizes the accumulated filtering result to produce the filtered
+ * frame
+ *
+ * \ingroup src_frame_proc
+ * \param[in] mbd Pointer to the block for filtering, which is
+ * ONLY used to get subsampling information for
+ * all the planes
+ * \param[in] block_size Size of the block
+ * \param[in] mb_row Row index of the block in the frame
+ * \param[in] mb_col Column index of the block in the frame
+ * \param[in] num_planes Number of planes in the frame
+ * \param[in] accum Pointer to the pre-computed accumulator
+ * \param[in] count Pointer to the pre-computed count
+ * \param[out] result_buffer Pointer to result buffer
+ *
+ * \remark Nothing returned, but the content to which `result_buffer` points
+ * will be modified
+ */
+static void tf_normalize_filtered_frame(
+ const MACROBLOCKD *mbd, const BLOCK_SIZE block_size, const int mb_row,
+ const int mb_col, const int num_planes, const uint32_t *accum,
+ const uint16_t *count, YV12_BUFFER_CONFIG *result_buffer) {
+ // Block information.
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ const int is_high_bitdepth = is_frame_high_bitdepth(result_buffer);
+
+ int plane_offset = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const int plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+ const int plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+ const int frame_stride = result_buffer->strides[plane == 0 ? 0 : 1];
+ const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+ uint8_t *const buf = result_buffer->buffers[plane];
+ uint16_t *const buf16 = CONVERT_TO_SHORTPTR(buf);
+
+ int plane_idx = 0; // Pixel index on current plane (block-base).
+ int frame_idx = frame_offset; // Pixel index on the entire frame.
+ for (int i = 0; i < plane_h; ++i) {
+ for (int j = 0; j < plane_w; ++j) {
+ const int idx = plane_idx + plane_offset;
+ const uint16_t rounding = count[idx] >> 1;
+ if (is_high_bitdepth) {
+ buf16[frame_idx] =
+ (uint16_t)OD_DIVU(accum[idx] + rounding, count[idx]);
+ } else {
+ buf[frame_idx] = (uint8_t)OD_DIVU(accum[idx] + rounding, count[idx]);
+ }
+ ++plane_idx;
+ ++frame_idx;
+ }
+ frame_idx += (frame_stride - plane_w);
+ }
+ plane_offset += plane_h * plane_w;
+ }
+}
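+
+// Editor's illustration (not upstream code): the normalization above is a
+// rounded integer division of the weighted sum by the total weight, i.e.
+// filtered_pixel = (accum + count / 2) / count, with OD_DIVU used as a fast
+// unsigned division.
+#if 0
+static uint8_t example_normalize_pixel(uint32_t accum, uint16_t count) {
+ return (uint8_t)((accum + (count >> 1)) / count);
+}
+#endif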
+    for (int i = 0; i < plane_h; ++i) {
+      for (int j = 0; j < plane_w; ++j) {
+        const int idx = plane_idx + plane_offset;
+        const uint16_t rounding = count[idx] >> 1;
+        if (is_high_bitdepth) {
+          buf16[frame_idx] =
+              (uint16_t)OD_DIVU(accum[idx] + rounding, count[idx]);
+        } else {
+          buf[frame_idx] = (uint8_t)OD_DIVU(accum[idx] + rounding, count[idx]);
+        }
+        ++plane_idx;
+        ++frame_idx;
+      }
+      frame_idx += (frame_stride - plane_w);
+    }
+    plane_offset += plane_h * plane_w;
+  }
+}
+
+int av1_get_q(const AV1_COMP *cpi) {
+  const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  const FRAME_TYPE frame_type = gf_group->frame_type[cpi->gf_frame_index];
+  const int q =
+      (int)av1_convert_qindex_to_q(cpi->ppi->p_rc.avg_frame_qindex[frame_type],
+                                   cpi->common.seq_params->bit_depth);
+  return q;
+}
+
+void av1_tf_do_filtering_row(AV1_COMP *cpi, ThreadData *td, int mb_row) {
+  TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
+  YV12_BUFFER_CONFIG **frames = tf_ctx->frames;
+  const int num_frames = tf_ctx->num_frames;
+  const int filter_frame_idx = tf_ctx->filter_frame_idx;
+  const int compute_frame_diff = tf_ctx->compute_frame_diff;
+  const struct scale_factors *scale = &tf_ctx->sf;
+  const double *noise_levels = tf_ctx->noise_levels;
+  const int num_pels = tf_ctx->num_pels;
+  const int q_factor = tf_ctx->q_factor;
+  const BLOCK_SIZE block_size = TF_BLOCK_SIZE;
+  const YV12_BUFFER_CONFIG *const frame_to_filter = frames[filter_frame_idx];
+  MACROBLOCK *const mb = &td->mb;
+  MACROBLOCKD *const mbd = &mb->e_mbd;
+  TemporalFilterData *const tf_data = &td->tf_data;
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int mi_h = mi_size_high_log2[block_size];
+  const int mi_w = mi_size_wide_log2[block_size];
+  const int num_planes = av1_num_planes(&cpi->common);
+  const int weight_calc_level_in_tf = cpi->sf.hl_sf.weight_calc_level_in_tf;
+  uint32_t *accum = tf_data->accum;
+  uint16_t *count = tf_data->count;
+  uint8_t *pred = tf_data->pred;
+
+  // Factor to control the filtering strength.
+  const int filter_strength = cpi->oxcf.algo_cfg.arnr_strength;
+
+  // Do filtering.
+  FRAME_DIFF *diff = &td->tf_data.diff;
+  av1_set_mv_row_limits(&cpi->common.mi_params, &mb->mv_limits,
+                        (mb_row << mi_h), (mb_height >> MI_SIZE_LOG2),
+                        cpi->oxcf.border_in_pixels);
+  for (int mb_col = 0; mb_col < tf_ctx->mb_cols; mb_col++) {
+    av1_set_mv_col_limits(&cpi->common.mi_params, &mb->mv_limits,
+                          (mb_col << mi_w), (mb_width >> MI_SIZE_LOG2),
+                          cpi->oxcf.border_in_pixels);
+    memset(accum, 0, num_pels * sizeof(accum[0]));
+    memset(count, 0, num_pels * sizeof(count[0]));
+    MV ref_mv = kZeroMv;  // Reference motion vector passed down along frames.
+    // Perform temporal filtering frame by frame.
+
+    // Decide whether to perform motion search at 16x16 sub-block level based
+    // on the source variance of 4x4 sub-blocks. Allow motion search for the
+    // split partition only if the difference between the max and min source
+    // variance of 4x4 blocks is greater than a threshold (which is derived
+    // empirically).
+    bool allow_me_for_sub_blks = true;
+    if (cpi->sf.hl_sf.allow_sub_blk_me_in_tf) {
+      const int is_hbd = is_frame_high_bitdepth(frame_to_filter);
+      // Initialize minimum variance to a large value and maximum variance to 0.
+ double blk_4x4_var_min = DBL_MAX; + double blk_4x4_var_max = 0; + get_log_var_4x4sub_blk(cpi, frame_to_filter, mb_row, mb_col, + TF_BLOCK_SIZE, &blk_4x4_var_min, &blk_4x4_var_max, + is_hbd); + // TODO(sanampudi.venkatarao@ittiam.com): Experiment and adjust the + // threshold for high bit depth. + if ((blk_4x4_var_max - blk_4x4_var_min) <= 4.0) + allow_me_for_sub_blks = false; + } + + for (int frame = 0; frame < num_frames; frame++) { + if (frames[frame] == NULL) continue; + + // Motion search. + MV subblock_mvs[4] = { kZeroMv, kZeroMv, kZeroMv, kZeroMv }; + int subblock_mses[4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX }; + if (frame == + filter_frame_idx) { // Frame to be filtered. + // Change ref_mv sign for following frames. + ref_mv.row *= -1; + ref_mv.col *= -1; + } else { // Other reference frames. + tf_motion_search(cpi, mb, frame_to_filter, frames[frame], block_size, + mb_row, mb_col, &ref_mv, allow_me_for_sub_blks, + subblock_mvs, subblock_mses); + } + + // Perform weighted averaging. + if (frame == filter_frame_idx) { // Frame to be filtered. + tf_apply_temporal_filter_self(frames[frame], mbd, block_size, mb_row, + mb_col, num_planes, accum, count); + } else { // Other reference frames. + tf_build_predictor(frames[frame], mbd, block_size, mb_row, mb_col, + num_planes, scale, subblock_mvs, pred); + + // All variants of av1_apply_temporal_filter() contain floating point + // operations. Hence, clear the system state. + + // TODO(any): avx2/sse2 version should be changed to align with C + // function before using. In particular, current avx2/sse2 function + // only supports 32x32 block size and 5x5 filtering window. + if (is_frame_high_bitdepth(frame_to_filter)) { // for high bit-depth +#if CONFIG_AV1_HIGHBITDEPTH + if (TF_BLOCK_SIZE == BLOCK_32X32 && TF_WINDOW_LENGTH == 5) { + av1_highbd_apply_temporal_filter( + frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes, + noise_levels, subblock_mvs, subblock_mses, q_factor, + filter_strength, weight_calc_level_in_tf, pred, accum, count); + } else { +#endif // CONFIG_AV1_HIGHBITDEPTH + av1_apply_temporal_filter_c( + frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes, + noise_levels, subblock_mvs, subblock_mses, q_factor, + filter_strength, weight_calc_level_in_tf, pred, accum, count); +#if CONFIG_AV1_HIGHBITDEPTH + } +#endif // CONFIG_AV1_HIGHBITDEPTH + } else { + // for 8-bit + if (TF_BLOCK_SIZE == BLOCK_32X32 && TF_WINDOW_LENGTH == 5) { + av1_apply_temporal_filter( + frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes, + noise_levels, subblock_mvs, subblock_mses, q_factor, + filter_strength, weight_calc_level_in_tf, pred, accum, count); + } else { + av1_apply_temporal_filter_c( + frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes, + noise_levels, subblock_mvs, subblock_mses, q_factor, + filter_strength, weight_calc_level_in_tf, pred, accum, count); + } + } + } + } + tf_normalize_filtered_frame(mbd, block_size, mb_row, mb_col, num_planes, + accum, count, tf_ctx->output_frame); + + if (compute_frame_diff) { + const int y_height = mb_height >> mbd->plane[0].subsampling_y; + const int y_width = mb_width >> mbd->plane[0].subsampling_x; + const int source_y_stride = frame_to_filter->y_stride; + const int filter_y_stride = tf_ctx->output_frame->y_stride; + const int source_offset = + mb_row * y_height * source_y_stride + mb_col * y_width; + const int filter_offset = + mb_row * y_height * filter_y_stride + mb_col * y_width; + unsigned int sse = 0; + cpi->ppi->fn_ptr[block_size].vf( + 
frame_to_filter->y_buffer + source_offset, source_y_stride,
+          tf_ctx->output_frame->y_buffer + filter_offset, filter_y_stride,
+          &sse);
+      diff->sum += sse;
+      diff->sse += sse * (int64_t)sse;
+    }
+  }
+}
+
+/*!\brief Does temporal filter for a given frame.
+ *
+ * \ingroup src_frame_proc
+ * \param[in]   cpi   Top level encoder instance structure
+ *
+ * \remark Nothing will be returned, but the contents of td->diff will be
+ * modified.
+ */
+static void tf_do_filtering(AV1_COMP *cpi) {
+  // Basic information.
+  ThreadData *td = &cpi->td;
+  TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
+  const struct scale_factors *scale = &tf_ctx->sf;
+  const int num_planes = av1_num_planes(&cpi->common);
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+
+  MACROBLOCKD *mbd = &td->mb.e_mbd;
+  uint8_t *input_buffer[MAX_MB_PLANE];
+  MB_MODE_INFO **input_mb_mode_info;
+  tf_save_state(mbd, &input_mb_mode_info, input_buffer, num_planes);
+  tf_setup_macroblockd(mbd, &td->tf_data, scale);
+
+  // Perform temporal filtering for each row.
+  for (int mb_row = 0; mb_row < tf_ctx->mb_rows; mb_row++)
+    av1_tf_do_filtering_row(cpi, td, mb_row);
+
+  tf_restore_state(mbd, input_mb_mode_info, input_buffer, num_planes);
+}
+
+/*!\brief Sets up the frame buffer for temporal filtering. This function
+ * determines how many frames will be used for temporal filtering and then
+ * groups them into a buffer. This function will also estimate the noise level
+ * of the to-filter frame.
+ *
+ * \ingroup src_frame_proc
+ * \param[in]   cpi                          Top level encoder instance
+ *                                           structure
+ * \param[in]   filter_frame_lookahead_idx   The index of the to-filter frame
+ *                                           in the lookahead buffer
+ *                                           cpi->lookahead
+ * \param[in]   gf_frame_index               GOP index
+ *
+ * \remark Nothing will be returned, but the fields `frames`, `num_frames`,
+ * `filter_frame_idx` and `noise_levels` will be updated in cpi->tf_ctx.
+ */
+static void tf_setup_filtering_buffer(AV1_COMP *cpi,
+                                      int filter_frame_lookahead_idx,
+                                      int gf_frame_index) {
+  const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_frame_index];
+  const FRAME_TYPE frame_type = gf_group->frame_type[gf_frame_index];
+  const int is_forward_keyframe =
+      av1_gop_check_forward_keyframe(gf_group, gf_frame_index);
+
+  TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
+  YV12_BUFFER_CONFIG **frames = tf_ctx->frames;
+  // Number of frames used for filtering. Set `arnr_max_frames` as 1 to
+  // disable temporal filtering.
+  int num_frames = AOMMAX(cpi->oxcf.algo_cfg.arnr_max_frames, 1);
+  int num_before = 0;  // Number of filtering frames before the to-filter frame.
+  int num_after = 0;   // Number of filtering frames after the to-filter frame.
+  const int lookahead_depth =
+      av1_lookahead_depth(cpi->ppi->lookahead, cpi->compressor_stage);
+
+  // Temporal filtering should not go beyond key frames.
+  const int key_to_curframe =
+      AOMMAX(cpi->rc.frames_since_key + filter_frame_lookahead_idx, 0);
+  const int curframe_to_key =
+      AOMMAX(cpi->rc.frames_to_key - filter_frame_lookahead_idx - 1, 0);
+
+  // Number of buffered frames before the to-filter frame.
+  int max_before = AOMMIN(filter_frame_lookahead_idx, key_to_curframe);
+
+  // Number of buffered frames after the to-filter frame.
+  int max_after =
+      AOMMIN(lookahead_depth - filter_frame_lookahead_idx - 1, curframe_to_key);
+
+  // Estimate the noise level of each plane.
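+  // The per-plane estimates feed the filter's per-plane decay factors: a
+  // noisier plane tolerates larger pixel differences before a neighbouring
+  // frame's contribution is down-weighted. A return value of -1.0 marks an
+  // unreliable estimate (too few smooth pixels in the plane).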
+ const struct lookahead_entry *to_filter_buf = av1_lookahead_peek( + cpi->ppi->lookahead, filter_frame_lookahead_idx, cpi->compressor_stage); + assert(to_filter_buf != NULL); + const YV12_BUFFER_CONFIG *to_filter_frame = &to_filter_buf->img; + const int num_planes = av1_num_planes(&cpi->common); + double *noise_levels = tf_ctx->noise_levels; + av1_estimate_noise_level(to_filter_frame, noise_levels, AOM_PLANE_Y, + num_planes - 1, cpi->common.seq_params->bit_depth, + NOISE_ESTIMATION_EDGE_THRESHOLD); + // Get quantization factor. + const int q = av1_get_q(cpi); + // Get correlation estimates from first-pass; + const FIRSTPASS_STATS *stats = + cpi->twopass_frame.stats_in - (cpi->rc.frames_since_key == 0); + double accu_coeff0 = 1.0, accu_coeff1 = 1.0; + for (int i = 1; i <= max_after; i++) { + if (stats + filter_frame_lookahead_idx + i >= + cpi->ppi->twopass.stats_buf_ctx->stats_in_end) { + max_after = i - 1; + break; + } + accu_coeff1 *= + AOMMAX(stats[filter_frame_lookahead_idx + i].cor_coeff, 0.001); + } + if (max_after >= 1) { + accu_coeff1 = pow(accu_coeff1, 1.0 / (double)max_after); + } + for (int i = 1; i <= max_before; i++) { + if (stats + filter_frame_lookahead_idx - i + 1 <= + cpi->ppi->twopass.stats_buf_ctx->stats_in_start) { + max_before = i - 1; + break; + } + accu_coeff0 *= + AOMMAX(stats[filter_frame_lookahead_idx - i + 1].cor_coeff, 0.001); + } + if (max_before >= 1) { + accu_coeff0 = pow(accu_coeff0, 1.0 / (double)max_before); + } + + // Adjust number of filtering frames based on quantization factor. When the + // quantization factor is small enough (lossless compression), we will not + // change the number of frames for key frame filtering, which is to avoid + // visual quality drop. + int adjust_num = 6; + const int adjust_num_frames_for_arf_filtering = + cpi->sf.hl_sf.adjust_num_frames_for_arf_filtering; + if (num_frames == 1) { // `arnr_max_frames = 1` is used to disable filtering. + adjust_num = 0; + } else if ((update_type == KF_UPDATE) && q <= 10) { + adjust_num = 0; + } else if (adjust_num_frames_for_arf_filtering > 0 && + update_type != KF_UPDATE && (cpi->rc.frames_since_key > 0)) { + // Since screen content detection happens after temporal filtering, + // 'frames_since_key' check is added to ensure the sf is disabled for the + // first alt-ref frame. + // Adjust number of frames to be considered for filtering based on noise + // level of the current frame. For low-noise frame, use more frames to + // filter such that the filtered frame can provide better predictions for + // subsequent frames and vice versa. + const uint8_t av1_adjust_num_using_noise_lvl[2][3] = { { 6, 4, 2 }, + { 4, 2, 0 } }; + const uint8_t *adjust_num_frames = + av1_adjust_num_using_noise_lvl[adjust_num_frames_for_arf_filtering - 1]; + + if (noise_levels[AOM_PLANE_Y] < 0.5) + adjust_num = adjust_num_frames[0]; + else if (noise_levels[AOM_PLANE_Y] < 1.0) + adjust_num = adjust_num_frames[1]; + else + adjust_num = adjust_num_frames[2]; + } + num_frames = AOMMIN(num_frames + adjust_num, lookahead_depth); + + if (frame_type == KEY_FRAME) { + num_before = AOMMIN(is_forward_keyframe ? num_frames / 2 : 0, max_before); + num_after = AOMMIN(num_frames - 1, max_after); + } else { + int gfu_boost = av1_calc_arf_boost(&cpi->ppi->twopass, &cpi->twopass_frame, + &cpi->ppi->p_rc, &cpi->frame_info, + filter_frame_lookahead_idx, max_before, + max_after, NULL, NULL, 0); + + num_frames = AOMMIN(num_frames, gfu_boost / 150); + num_frames += !(num_frames & 1); // Make the number odd. 
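+    // For example, gfu_boost = 500 caps the window at 500 / 150 = 3 frames
+    // (already odd), while a cap of 4 is bumped to 5 so that the window can
+    // split evenly around the to-filter frame.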
+
+    // Only use 2 neighbours for the second ARF.
+    if (update_type == INTNL_ARF_UPDATE) num_frames = AOMMIN(num_frames, 3);
+    if (AOMMIN(max_after, max_before) >= num_frames / 2) {
+      // Just use an even split.
+      num_before = num_frames / 2;
+      num_after = num_frames / 2;
+    } else {
+      if (max_after < num_frames / 2) {
+        num_after = max_after;
+        num_before = AOMMIN(num_frames - 1 - num_after, max_before);
+      } else {
+        num_before = max_before;
+        num_after = AOMMIN(num_frames - 1 - num_before, max_after);
+      }
+      // Adjust the asymmetry based on frame-level correlation.
+      if (max_after > 0 && max_before > 0) {
+        if (num_after < num_before) {
+          const int insym = (int)(0.4 / AOMMAX(1 - accu_coeff1, 0.01));
+          num_before = AOMMIN(num_before, num_after + insym);
+        } else {
+          const int insym = (int)(0.4 / AOMMAX(1 - accu_coeff0, 0.01));
+          num_after = AOMMIN(num_after, num_before + insym);
+        }
+      }
+    }
+  }
+  num_frames = num_before + 1 + num_after;
+
+  // Set up the frame buffer.
+  for (int frame = 0; frame < num_frames; ++frame) {
+    const int lookahead_idx = frame - num_before + filter_frame_lookahead_idx;
+    struct lookahead_entry *buf = av1_lookahead_peek(
+        cpi->ppi->lookahead, lookahead_idx, cpi->compressor_stage);
+    assert(buf != NULL);
+    frames[frame] = &buf->img;
+  }
+  tf_ctx->num_frames = num_frames;
+  tf_ctx->filter_frame_idx = num_before;
+  assert(frames[tf_ctx->filter_frame_idx] == to_filter_frame);
+
+  av1_setup_src_planes(&cpi->td.mb, &to_filter_buf->img, 0, 0, num_planes,
+                       cpi->common.seq_params->sb_size);
+  av1_setup_block_planes(&cpi->td.mb.e_mbd,
+                         cpi->common.seq_params->subsampling_x,
+                         cpi->common.seq_params->subsampling_y, num_planes);
+}
+
+/*!\cond */
+
+double av1_estimate_noise_from_single_plane_c(const uint8_t *src, int height,
+                                              int width, int stride,
+                                              int edge_thresh) {
+  int64_t accum = 0;
+  int count = 0;
+
+  for (int i = 1; i < height - 1; ++i) {
+    for (int j = 1; j < width - 1; ++j) {
+      // Set up a small 3x3 matrix.
+      const int center_idx = i * stride + j;
+      int mat[3][3];
+      for (int ii = -1; ii <= 1; ++ii) {
+        for (int jj = -1; jj <= 1; ++jj) {
+          const int idx = center_idx + ii * stride + jj;
+          mat[ii + 1][jj + 1] = src[idx];
+        }
+      }
+      // Compute Sobel gradients.
+      const int Gx = (mat[0][0] - mat[0][2]) + (mat[2][0] - mat[2][2]) +
+                     2 * (mat[1][0] - mat[1][2]);
+      const int Gy = (mat[0][0] - mat[2][0]) + (mat[0][2] - mat[2][2]) +
+                     2 * (mat[0][1] - mat[2][1]);
+      const int Ga = ROUND_POWER_OF_TWO(abs(Gx) + abs(Gy), 0);
+      // Accumulate Laplacian.
+      if (Ga < edge_thresh) {  // Only count smooth pixels.
+        const int v = 4 * mat[1][1] -
+                      2 * (mat[0][1] + mat[2][1] + mat[1][0] + mat[1][2]) +
+                      (mat[0][0] + mat[0][2] + mat[2][0] + mat[2][2]);
+        accum += ROUND_POWER_OF_TWO(abs(v), 0);
+        ++count;
+      }
+    }
+  }
+
+  // Return -1.0 (unreliable estimation) if there are too few smooth pixels.
+  return (count < 16) ? -1.0 : (double)accum / (6 * count) * SQRT_PI_BY_2;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+double av1_highbd_estimate_noise_from_single_plane_c(const uint16_t *src16,
+                                                     int height, int width,
+                                                     const int stride,
+                                                     int bit_depth,
+                                                     int edge_thresh) {
+  int64_t accum = 0;
+  int count = 0;
+  for (int i = 1; i < height - 1; ++i) {
+    for (int j = 1; j < width - 1; ++j) {
+      // Set up a small 3x3 matrix.
+      const int center_idx = i * stride + j;
+      int mat[3][3];
+      for (int ii = -1; ii <= 1; ++ii) {
+        for (int jj = -1; jj <= 1; ++jj) {
+          const int idx = center_idx + ii * stride + jj;
+          mat[ii + 1][jj + 1] = src16[idx];
+        }
+      }
+      // Compute Sobel gradients.
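+      // The gradients correspond to the 3x3 Sobel kernels
+      //   Gx: [ 1 0 -1 ]        Gy: [  1  2  1 ]
+      //       [ 2 0 -2 ]            [  0  0  0 ]
+      //       [ 1 0 -1 ]            [ -1 -2 -1 ]
+      // applied to mat[][], with mat[row][col] centered on the pixel.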
+      const int Gx = (mat[0][0] - mat[0][2]) + (mat[2][0] - mat[2][2]) +
+                     2 * (mat[1][0] - mat[1][2]);
+      const int Gy = (mat[0][0] - mat[2][0]) + (mat[0][2] - mat[2][2]) +
+                     2 * (mat[0][1] - mat[2][1]);
+      const int Ga = ROUND_POWER_OF_TWO(abs(Gx) + abs(Gy), bit_depth - 8);
+      // Accumulate Laplacian.
+      if (Ga < edge_thresh) {  // Only count smooth pixels.
+        const int v = 4 * mat[1][1] -
+                      2 * (mat[0][1] + mat[2][1] + mat[1][0] + mat[1][2]) +
+                      (mat[0][0] + mat[0][2] + mat[2][0] + mat[2][2]);
+        accum += ROUND_POWER_OF_TWO(abs(v), bit_depth - 8);
+        ++count;
+      }
+    }
+  }
+
+  // Return -1.0 (unreliable estimation) if there are too few smooth pixels.
+  return (count < 16) ? -1.0 : (double)accum / (6 * count) * SQRT_PI_BY_2;
+}
+#endif
+
+void av1_estimate_noise_level(const YV12_BUFFER_CONFIG *frame,
+                              double *noise_level, int plane_from, int plane_to,
+                              int bit_depth, int edge_thresh) {
+  for (int plane = plane_from; plane <= plane_to; plane++) {
+    const bool is_uv_plane = (plane != AOM_PLANE_Y);
+    const int height = frame->crop_heights[is_uv_plane];
+    const int width = frame->crop_widths[is_uv_plane];
+    const int stride = frame->strides[is_uv_plane];
+    const uint8_t *src = frame->buffers[plane];
+
+#if CONFIG_AV1_HIGHBITDEPTH
+    const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+    const int is_high_bitdepth = is_frame_high_bitdepth(frame);
+    if (is_high_bitdepth) {
+      noise_level[plane] = av1_highbd_estimate_noise_from_single_plane(
+          src16, height, width, stride, bit_depth, edge_thresh);
+    } else {
+      noise_level[plane] = av1_estimate_noise_from_single_plane(
+          src, height, width, stride, edge_thresh);
+    }
+#else
+    (void)bit_depth;
+    noise_level[plane] = av1_estimate_noise_from_single_plane(
+        src, height, width, stride, edge_thresh);
+#endif
+  }
+}
+
+// Initializes the members of TemporalFilterCtx.
+// Inputs:
+//   cpi: Top level encoder instance structure.
+//   filter_frame_lookahead_idx: The index of the frame to be filtered in the
+//                               lookahead buffer cpi->lookahead.
+//   gf_frame_index: GOP index of the frame to be filtered.
+//   compute_frame_diff: If 1, accumulate the sse/sum difference between the
+//                       source and the filtered frame.
+//   output_frame: Buffer that receives the filtered frame.
+// Returns:
+//   Nothing will be returned. But the contents of cpi->tf_ctx will be
+//   modified.
+static void init_tf_ctx(AV1_COMP *cpi, int filter_frame_lookahead_idx,
+                        int gf_frame_index, int compute_frame_diff,
+                        YV12_BUFFER_CONFIG *output_frame) {
+  TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
+  // Setup frame buffer for filtering.
+  YV12_BUFFER_CONFIG **frames = tf_ctx->frames;
+  tf_ctx->num_frames = 0;
+  tf_ctx->filter_frame_idx = -1;
+  tf_ctx->output_frame = output_frame;
+  tf_ctx->compute_frame_diff = compute_frame_diff;
+  tf_setup_filtering_buffer(cpi, filter_frame_lookahead_idx, gf_frame_index);
+  assert(tf_ctx->num_frames > 0);
+  assert(tf_ctx->filter_frame_idx < tf_ctx->num_frames);
+
+  // Set up scaling factors. Scaling on each of the arnr frames is not
+  // supported.
+  // ARF is produced at the native frame size and resized when coded.
+  struct scale_factors *sf = &tf_ctx->sf;
+  av1_setup_scale_factors_for_frame(
+      sf, frames[0]->y_crop_width, frames[0]->y_crop_height,
+      frames[0]->y_crop_width, frames[0]->y_crop_height);
+
+  // Initialize temporal filter parameters.
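+  // For 4:2:0 input with the default 32x32 TF_BLOCK_SIZE, the loop below
+  // gives num_pels = 32 * 32 + 2 * (16 * 16) = 1536 pixels per block.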
+  MACROBLOCKD *mbd = &cpi->td.mb.e_mbd;
+  const int filter_frame_idx = tf_ctx->filter_frame_idx;
+  const YV12_BUFFER_CONFIG *const frame_to_filter = frames[filter_frame_idx];
+  const BLOCK_SIZE block_size = TF_BLOCK_SIZE;
+  const int frame_height = frame_to_filter->y_crop_height;
+  const int frame_width = frame_to_filter->y_crop_width;
+  const int mb_width = block_size_wide[block_size];
+  const int mb_height = block_size_high[block_size];
+  const int mb_rows = get_num_blocks(frame_height, mb_height);
+  const int mb_cols = get_num_blocks(frame_width, mb_width);
+  const int mb_pels = mb_width * mb_height;
+  const int is_highbitdepth = is_frame_high_bitdepth(frame_to_filter);
+  const int num_planes = av1_num_planes(&cpi->common);
+  int num_pels = 0;
+  for (int i = 0; i < num_planes; i++) {
+    const int subsampling_x = mbd->plane[i].subsampling_x;
+    const int subsampling_y = mbd->plane[i].subsampling_y;
+    num_pels += mb_pels >> (subsampling_x + subsampling_y);
+  }
+  tf_ctx->num_pels = num_pels;
+  tf_ctx->mb_rows = mb_rows;
+  tf_ctx->mb_cols = mb_cols;
+  tf_ctx->is_highbitdepth = is_highbitdepth;
+  tf_ctx->q_factor = av1_get_q(cpi);
+}
+
+int av1_check_show_filtered_frame(const YV12_BUFFER_CONFIG *frame,
+                                  const FRAME_DIFF *frame_diff, int q_index,
+                                  aom_bit_depth_t bit_depth) {
+  const int frame_height = frame->y_crop_height;
+  const int frame_width = frame->y_crop_width;
+  const int block_height = block_size_high[TF_BLOCK_SIZE];
+  const int block_width = block_size_wide[TF_BLOCK_SIZE];
+  const int mb_rows = get_num_blocks(frame_height, block_height);
+  const int mb_cols = get_num_blocks(frame_width, block_width);
+  const int num_mbs = AOMMAX(1, mb_rows * mb_cols);
+  const float mean = (float)frame_diff->sum / num_mbs;
+  const float std = (float)sqrt((float)frame_diff->sse / num_mbs - mean * mean);
+
+  const int ac_q_step = av1_ac_quant_QTX(q_index, 0, bit_depth);
+  const float threshold = 0.7f * ac_q_step * ac_q_step;
+
+  if (mean < threshold && std < mean * 1.2) {
+    return 1;
+  }
+  return 0;
+}
+
+void av1_temporal_filter(AV1_COMP *cpi, const int filter_frame_lookahead_idx,
+                         int gf_frame_index, FRAME_DIFF *frame_diff,
+                         YV12_BUFFER_CONFIG *output_frame) {
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  // Basic information of the current frame.
+  TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
+  TemporalFilterData *tf_data = &cpi->td.tf_data;
+  const int compute_frame_diff = frame_diff != NULL;
+  // TODO(anyone): Currently, we enforce the filtering strength on internal
+  // ARFs except the second ARF to be zero. We should investigate in which
+  // case it is more beneficial to use non-zero strength filtering.
+  // Only parallel level 0 frames go through temporal filtering.
+  assert(cpi->ppi->gf_group.frame_parallel_level[gf_frame_index] == 0);
+
+  // Initialize temporal filter context structure.
+  init_tf_ctx(cpi, filter_frame_lookahead_idx, gf_frame_index,
+              compute_frame_diff, output_frame);
+
+  // Allocate and reset temporal filter buffers.
+  const int is_highbitdepth = tf_ctx->is_highbitdepth;
+  if (!tf_alloc_and_reset_data(tf_data, tf_ctx->num_pels, is_highbitdepth)) {
+    aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR,
+                       "Error allocating temporal filter data");
+  }
+
+  // Perform temporal filtering process.
+  if (mt_info->num_workers > 1)
+    av1_tf_do_filtering_mt(cpi);
+  else
+    tf_do_filtering(cpi);
+
+  if (compute_frame_diff) {
+    *frame_diff = tf_data->diff;
+  }
+  // Deallocate temporal filter buffers.
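+  // The accum/count/pred buffers are sized from tf_ctx->num_pels for this
+  // frame only, so they are released unconditionally once filtering (and the
+  // optional frame-diff copy) is done.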
+ tf_dealloc_data(tf_data, is_highbitdepth); +} + +int av1_is_temporal_filter_on(const AV1EncoderConfig *oxcf) { + return oxcf->algo_cfg.arnr_max_frames > 0 && oxcf->gf_cfg.lag_in_frames > 1; +} + +bool av1_tf_info_alloc(TEMPORAL_FILTER_INFO *tf_info, const AV1_COMP *cpi) { + const AV1EncoderConfig *oxcf = &cpi->oxcf; + tf_info->is_temporal_filter_on = av1_is_temporal_filter_on(oxcf); + if (tf_info->is_temporal_filter_on == 0) return true; + + const AV1_COMMON *cm = &cpi->common; + const SequenceHeader *const seq_params = cm->seq_params; + for (int i = 0; i < TF_INFO_BUF_COUNT; ++i) { + if (aom_realloc_frame_buffer( + &tf_info->tf_buf[i], oxcf->frm_dim_cfg.width, + oxcf->frm_dim_cfg.height, seq_params->subsampling_x, + seq_params->subsampling_y, seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL, + NULL, cpi->image_pyramid_levels, 0)) { + return false; + } + } + return true; +} + +void av1_tf_info_free(TEMPORAL_FILTER_INFO *tf_info) { + if (tf_info->is_temporal_filter_on == 0) return; + for (int i = 0; i < TF_INFO_BUF_COUNT; ++i) { + aom_free_frame_buffer(&tf_info->tf_buf[i]); + } + aom_free_frame_buffer(&tf_info->tf_buf_second_arf); +} + +void av1_tf_info_reset(TEMPORAL_FILTER_INFO *tf_info) { + av1_zero(tf_info->tf_buf_valid); + av1_zero(tf_info->tf_buf_gf_index); + av1_zero(tf_info->tf_buf_display_index_offset); +} + +void av1_tf_info_filtering(TEMPORAL_FILTER_INFO *tf_info, AV1_COMP *cpi, + const GF_GROUP *gf_group) { + if (tf_info->is_temporal_filter_on == 0) return; + const AV1_COMMON *const cm = &cpi->common; + for (int gf_index = 0; gf_index < gf_group->size; ++gf_index) { + int update_type = gf_group->update_type[gf_index]; + if (update_type == KF_UPDATE || update_type == ARF_UPDATE) { + int buf_idx = gf_group->frame_type[gf_index] == INTER_FRAME; + int lookahead_idx = gf_group->arf_src_offset[gf_index] + + gf_group->cur_frame_idx[gf_index]; + // This function is designed to be called multiple times after + // av1_tf_info_reset(). It will only generate the filtered frame that does + // not exist yet. + if (tf_info->tf_buf_valid[buf_idx] == 0 || + tf_info->tf_buf_display_index_offset[buf_idx] != lookahead_idx) { + YV12_BUFFER_CONFIG *out_buf = &tf_info->tf_buf[buf_idx]; + av1_temporal_filter(cpi, lookahead_idx, gf_index, + &tf_info->frame_diff[buf_idx], out_buf); + aom_extend_frame_borders(out_buf, av1_num_planes(cm)); + tf_info->tf_buf_gf_index[buf_idx] = gf_index; + tf_info->tf_buf_display_index_offset[buf_idx] = lookahead_idx; + tf_info->tf_buf_valid[buf_idx] = 1; + } + } + } +} + +YV12_BUFFER_CONFIG *av1_tf_info_get_filtered_buf(TEMPORAL_FILTER_INFO *tf_info, + int gf_index, + FRAME_DIFF *frame_diff) { + if (tf_info->is_temporal_filter_on == 0) return NULL; + YV12_BUFFER_CONFIG *out_buf = NULL; + for (int i = 0; i < TF_INFO_BUF_COUNT; ++i) { + if (tf_info->tf_buf_valid[i] && tf_info->tf_buf_gf_index[i] == gf_index) { + out_buf = &tf_info->tf_buf[i]; + *frame_diff = tf_info->frame_diff[i]; + } + } + return out_buf; +} +/*!\endcond */ diff --git a/third_party/aom/av1/encoder/temporal_filter.h b/third_party/aom/av1/encoder/temporal_filter.h new file mode 100644 index 0000000000..6504b91b66 --- /dev/null +++ b/third_party/aom/av1/encoder/temporal_filter.h @@ -0,0 +1,458 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TEMPORAL_FILTER_H_
+#define AOM_AV1_ENCODER_TEMPORAL_FILTER_H_
+
+#include <stdbool.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*!\cond */
+struct AV1_COMP;
+struct AV1EncoderConfig;
+struct ThreadData;
+// TODO(wtc): These two macros are only used in avx2, sse2, neon
+// implementations, where the block size is still hard coded to TF_BLOCK_SIZE.
+// This should be fixed to align with the c implementation.
+#define BH 32
+#define BW 32
+
+// Block size used in temporal filtering.
+#define TF_BLOCK_SIZE BLOCK_32X32
+
+// Window size for temporal filtering.
+#define TF_WINDOW_LENGTH 5
+
+// A constant number, sqrt(pi / 2), used for noise estimation.
+static const double SQRT_PI_BY_2 = 1.25331413732;
+
+// Hyper-parameters used to compute filtering weight. These hyper-parameters
+// can be tuned for better performance.
+// 0. A scale factor used in temporal filtering to raise the filter weight
+// from `double` with range [0, 1] to `int` with range [0, 1000].
+#define TF_WEIGHT_SCALE 1000
+// 1. Weight factor used to balance the weighted-average between window error
+// and block error. The weight is for window error while the weight for block
+// error is always set as 1.
+#define TF_WINDOW_BLOCK_BALANCE_WEIGHT 5
+// 2. Threshold for using q to adjust the filtering weight. Concretely, when
+// using a small q (high bitrate), we would like to reduce the filtering
+// strength such that more detailed information can be preserved. Hence, when
+// q is smaller than this threshold, we will adjust the filtering weight
+// based on the q-value.
+#define TF_Q_DECAY_THRESHOLD 20
+// 3. Normalization factor used to normalize the motion search error. Since the
+// motion search error can be large and uncontrollable, we will simply
+// normalize it before using it to compute the filtering weight.
+#define TF_SEARCH_ERROR_NORM_WEIGHT 20
+// 4. Threshold for using `arnr_strength` to adjust the filtering strength.
+// Concretely, users can use the `arnr_strength` argument to control the
+// strength of temporal filtering. When `arnr_strength` is small enough
+// (i.e., smaller than this threshold), we will adjust the filtering weight
+// based on the strength value.
+#define TF_STRENGTH_THRESHOLD 4
+// 5. Threshold for using motion search distance to adjust the filtering
+// weight. Concretely, a larger motion search vector leads to a higher
+// probability of unreliable search. Hence, we would like to reduce the
+// filtering strength when the distance is large enough. Considering that the
+// distance actually relies on the frame size, this threshold is also a
+// resolution-based threshold. Taking 720p video as an example, if this field
+// equals 0.1, then the actual threshold will be 720 * 0.1 = 72. Similarly,
+// the threshold for 360p video will be 360 * 0.1 = 36.
+#define TF_SEARCH_DISTANCE_THRESHOLD 0.1
+// 6. Threshold to identify if the q is in a relatively high range.
+// Above this cutoff q, a stronger filtering is applied.
+// For a high q, the quantization throws away more information, and thus a
+// stronger filtering is less likely to distort the encoded quality, while a
+// stronger filtering could reduce bit rates.
+// For a low q, more details are expected to be retained. Filtering is thus
+// more conservative.
+#define TF_QINDEX_CUTOFF 128
+
+#define NOISE_ESTIMATION_EDGE_THRESHOLD 50
+
+// Sum and SSE source vs filtered frame difference returned by
+// temporal filter.
+typedef struct {
+  int64_t sum;
+  int64_t sse;
+} FRAME_DIFF;
+
+/*!\endcond */
+
+/*!
+ * \brief Parameters related to temporal filtering.
+ */
+typedef struct {
+  /*!
+   * Frame buffers used for temporal filtering.
+   */
+  YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS];
+  /*!
+   * Number of frames in the frame buffer.
+   */
+  int num_frames;
+
+  /*!
+   * Output filtered frame
+   */
+  YV12_BUFFER_CONFIG *output_frame;
+
+  /*!
+   * Index of the frame to be filtered.
+   */
+  int filter_frame_idx;
+  /*!
+   * Whether to accumulate diff for show existing condition check.
+   */
+  int compute_frame_diff;
+  /*!
+   * Frame scaling factor.
+   */
+  struct scale_factors sf;
+  /*!
+   * Estimated noise levels for each plane in the frame.
+   */
+  double noise_levels[MAX_MB_PLANE];
+  /*!
+   * Number of pixels in the temporal filtering block across all planes.
+   */
+  int num_pels;
+  /*!
+   * Number of temporal filtering block rows.
+   */
+  int mb_rows;
+  /*!
+   * Number of temporal filtering block columns.
+   */
+  int mb_cols;
+  /*!
+   * Whether the frame is high-bitdepth or not.
+   */
+  int is_highbitdepth;
+  /*!
+   * Quantization factor used in temporal filtering.
+   */
+  int q_factor;
+} TemporalFilterCtx;
+
+/*!
+ * Buffer count in TEMPORAL_FILTER_INFO.
+ * Currently we only apply filtering to the KEY frame and ARF after
+ * define_gf_group(). Hence, the count is two.
+ */
+#define TF_INFO_BUF_COUNT 2
+
+/*!
+ * \brief Temporal filter info for a gop
+ */
+typedef struct TEMPORAL_FILTER_INFO {
+  /*!
+   * A flag indicating whether temporal filtering should be applied.
+   * This flag stores the result of av1_is_temporal_filter_on().
+   */
+  int is_temporal_filter_on;
+  /*!
+   * Buffers used for temporal filtering in a GOP:
+   * index 0 for the key frame and index 1 for the ARF.
+   */
+  YV12_BUFFER_CONFIG tf_buf[TF_INFO_BUF_COUNT];
+
+  /*!
+   * Buffer used for temporal filtering for INTNL_ARF_UPDATE.
+   * Check av1_gop_is_second_arf() for the detailed definition of the
+   * second ARF.
+   */
+  YV12_BUFFER_CONFIG tf_buf_second_arf;
+  /*!
+   * Sum and SSE of each filtered frame, used to decide whether the buffer
+   * can be shown directly.
+   */
+  FRAME_DIFF frame_diff[TF_INFO_BUF_COUNT];
+  /*!
+   * The corresponding gf_index for each buffer.
+   */
+  int tf_buf_gf_index[TF_INFO_BUF_COUNT];
+  /*!
+   * The display_index offset between the next show frame and the frames in
+   * the GOP.
+   */
+  int tf_buf_display_index_offset[TF_INFO_BUF_COUNT];
+  /*!
+   * Whether each buffer is valid or not.
+   */
+  int tf_buf_valid[TF_INFO_BUF_COUNT];
+} TEMPORAL_FILTER_INFO;
+
+/*!\brief Check whether we should apply temporal filter at all.
+ * \param[in]   oxcf   AV1 encoder config
+ *
+ * \return 1: temporal filter is on. 0: temporal filter is off.
+ */
+int av1_is_temporal_filter_on(const struct AV1EncoderConfig *oxcf);
+
+/*!\brief Allocate buffers for TEMPORAL_FILTER_INFO
+ * \param[in,out]   tf_info   Temporal filter info for a gop
+ * \param[in,out]   cpi       Top level encoder instance structure
+ *
+ * \return True on success, false on memory allocation failure.
+ */
+bool av1_tf_info_alloc(TEMPORAL_FILTER_INFO *tf_info,
+                       const struct AV1_COMP *cpi);
+
+/*!\brief Free buffers for TEMPORAL_FILTER_INFO
+ * \param[in,out]   tf_info   Temporal filter info for a gop
+ */
+void av1_tf_info_free(TEMPORAL_FILTER_INFO *tf_info);
+
+/*!\brief Reset validity of tf_buf in TEMPORAL_FILTER_INFO
+ * \param[in,out]   tf_info   Temporal filter info for a gop
+ */
+void av1_tf_info_reset(TEMPORAL_FILTER_INFO *tf_info);
+
+/*!\brief Apply temporal filter for key frame and ARF in a gop
+ * \param[in,out]   tf_info    Temporal filter info for a gop
+ * \param[in,out]   cpi        Top level encoder instance structure
+ * \param[in]       gf_group   GF/ARF group data structure
+ */
+void av1_tf_info_filtering(TEMPORAL_FILTER_INFO *tf_info, struct AV1_COMP *cpi,
+                           const GF_GROUP *gf_group);
+
+/*!\brief Get a filtered buffer from TEMPORAL_FILTER_INFO
+ * \param[in,out]   tf_info      Temporal filter info for a gop
+ * \param[in]       gf_index     gf_index for the target buffer
+ * \param[out]      frame_diff   Sum and SSE of the difference between the
+ *                               source and the filtered frame
+ */
+YV12_BUFFER_CONFIG *av1_tf_info_get_filtered_buf(TEMPORAL_FILTER_INFO *tf_info,
+                                                 int gf_index,
+                                                 FRAME_DIFF *frame_diff);
+
+/*!\cond */
+
+// Data related to temporal filtering.
+typedef struct {
+  // Source vs filtered frame error.
+  FRAME_DIFF diff;
+  // Pointer to temporary block info used to store state in temporal filtering
+  // process.
+  MB_MODE_INFO *tmp_mbmi;
+  // Pointer to accumulator buffer used in temporal filtering process.
+  uint32_t *accum;
+  // Pointer to count buffer used in temporal filtering process.
+  uint16_t *count;
+  // Pointer to predictor used in temporal filtering process.
+  uint8_t *pred;
+} TemporalFilterData;
+
+// Data related to temporal filter multi-thread synchronization.
+typedef struct {
+#if CONFIG_MULTITHREAD
+  // Mutex lock used for dispatching jobs.
+  pthread_mutex_t *mutex_;
+#endif  // CONFIG_MULTITHREAD
+  // Next temporal filter block row to be filtered.
+  int next_tf_row;
+  // Initialized to false, set to true by the worker thread that encounters an
+  // error in order to abort the processing of other worker threads.
+  bool tf_mt_exit;
+} AV1TemporalFilterSync;
+
+// Estimates noise level from a given frame using a single plane (Y, U, or V).
+// This is an adaptation of the method in the following paper:
+// Shen-Chuan Tai, Shih-Ming Yang, "A fast method for image noise
+// estimation using Laplacian operator and adaptive edge detection",
+// Proc. 3rd International Symposium on Communications, Control and
+// Signal Processing, 2008, St Julians, Malta.
+// Inputs:
+//   frame: Pointer to the frame to estimate noise level from.
+//   noise_level: Pointer to store the estimated noise.
+//   plane_from: Index of the starting plane used for noise estimation.
+//               Commonly, 0 for Y-plane, 1 for U-plane, and 2 for V-plane.
+//   plane_to: Index of the end plane used for noise estimation.
+//   bit_depth: Actual bit-depth instead of the encoding bit-depth of the
+//              frame.
+//   edge_thresh: Edge threshold.
+void av1_estimate_noise_level(const YV12_BUFFER_CONFIG *frame,
+                              double *noise_level, int plane_from, int plane_to,
+                              int bit_depth, int edge_thresh);
+/*!\endcond */
+
+/*!\brief Does temporal filter for a given macroblock row.
+ *
+ * \ingroup src_frame_proc
+ * \param[in]   cpi      Top level encoder instance structure
+ * \param[in]   td       Pointer to thread data
+ * \param[in]   mb_row   Macroblock row to be filtered
+ *
+ * \remark Nothing will be returned, but the contents of td->diff will be
+ * modified.
+ */
+void av1_tf_do_filtering_row(struct AV1_COMP *cpi, struct ThreadData *td,
+                             int mb_row);
+
+/*!\brief Performs temporal filtering if needed on a source frame.
+ * For example to create a filtered alternate reference frame (ARF)
+ *
+ * In this function, the lookahead index is different from the 0-based
+ * real index. For example, if we want to filter the first frame in the
+ * pre-fetched buffer `cpi->lookahead`, the lookahead index will be -1 instead
+ * of 0. More concretely, 0 indicates the first LOOKAHEAD frame, which is the
+ * second frame in the pre-fetched buffer. Another example: if we want to
+ * filter the 17-th frame, which is an ARF, the lookahead index is 15 instead
+ * of 16. Furthermore, a negative number is used for key frames in one-pass
+ * mode, where the key frame is filtered with the frames before it instead of
+ * after it. For example, -15 means to filter the 17-th frame, which is a key
+ * frame in one-pass mode.
+ *
+ * \ingroup src_frame_proc
+ * \param[in]      cpi                          Top level encoder instance
+ *                                              structure
+ * \param[in]      filter_frame_lookahead_idx   The index of the to-filter
+ *                                              frame in the lookahead buffer
+ *                                              cpi->lookahead.
+ * \param[in]      gf_frame_index               Index of GOP
+ * \param[in,out]  frame_diff                   Structure holding the sse and
+ *                                              sum of the filtered frame.
+ * \param[out]     output_frame                 Output filtered frame.
+ */
+void av1_temporal_filter(struct AV1_COMP *cpi,
+                         const int filter_frame_lookahead_idx,
+                         int gf_frame_index, FRAME_DIFF *frame_diff,
+                         YV12_BUFFER_CONFIG *output_frame);
+
+/*!\brief Check whether a filtered frame can be shown directly
+ *
+ * This function uses the filtered frame's sse and the current q index
+ * to make the decision.
+ *
+ * \ingroup src_frame_proc
+ * \param[in]   frame        Filtered frame's buffer
+ * \param[in]   frame_diff   Structure holding the sse and sum of the
+ *                           filtered frame.
+ * \param[in]   q_index      q_index used for this frame
+ * \param[in]   bit_depth    Bit depth
+ * \return 1 if this frame can be shown directly, otherwise 0
+ */
+int av1_check_show_filtered_frame(const YV12_BUFFER_CONFIG *frame,
+                                  const FRAME_DIFF *frame_diff, int q_index,
+                                  aom_bit_depth_t bit_depth);
+
+/*!\cond */
+// Helper function to get `q` used for encoding.
+int av1_get_q(const struct AV1_COMP *cpi);
+
+// Allocates memory for members of TemporalFilterData.
+// Inputs:
+//   tf_data: Pointer to the structure containing temporal filter related
+//            data.
+//   num_pels: Number of pixels in the block across all planes.
+//   is_high_bitdepth: Whether the frame is high-bitdepth or not.
+// Returns:
+//   True if allocation is successful and false otherwise.
+static AOM_INLINE bool tf_alloc_and_reset_data(TemporalFilterData *tf_data,
+                                               int num_pels,
+                                               int is_high_bitdepth) {
+  tf_data->tmp_mbmi = (MB_MODE_INFO *)aom_calloc(1, sizeof(*tf_data->tmp_mbmi));
+  tf_data->accum =
+      (uint32_t *)aom_memalign(16, num_pels * sizeof(*tf_data->accum));
+  tf_data->count =
+      (uint16_t *)aom_memalign(16, num_pels * sizeof(*tf_data->count));
+  if (is_high_bitdepth)
+    tf_data->pred = CONVERT_TO_BYTEPTR(
+        aom_memalign(32, num_pels * 2 * sizeof(*tf_data->pred)));
+  else
+    tf_data->pred =
+        (uint8_t *)aom_memalign(32, num_pels * sizeof(*tf_data->pred));
+  // In case of an allocation failure, other successfully allocated buffers
+  // will be freed by the tf_dealloc_data() call in encoder_destroy().
+ if (!(tf_data->tmp_mbmi && tf_data->accum && tf_data->count && tf_data->pred)) + return false; + memset(&tf_data->diff, 0, sizeof(tf_data->diff)); + return true; +} + +// Setup macroblockd params for temporal filtering process. +// Inputs: +// mbd: Pointer to the block for filtering. +// tf_data: Pointer to the structure containing temporal filter related data. +// scale: Scaling factor. +// Returns: +// Nothing will be returned. Contents of mbd will be modified. +static AOM_INLINE void tf_setup_macroblockd(MACROBLOCKD *mbd, + TemporalFilterData *tf_data, + const struct scale_factors *scale) { + mbd->block_ref_scale_factors[0] = scale; + mbd->block_ref_scale_factors[1] = scale; + mbd->mi = &tf_data->tmp_mbmi; + mbd->mi[0]->motion_mode = SIMPLE_TRANSLATION; +} + +// Deallocates the memory allocated for members of TemporalFilterData. +// Inputs: +// tf_data: Pointer to the structure containing temporal filter related data. +// is_high_bitdepth: Whether the frame is high-bitdepth or not. +// Returns: +// Nothing will be returned. +static AOM_INLINE void tf_dealloc_data(TemporalFilterData *tf_data, + int is_high_bitdepth) { + if (is_high_bitdepth) + tf_data->pred = (uint8_t *)CONVERT_TO_SHORTPTR(tf_data->pred); + aom_free(tf_data->tmp_mbmi); + tf_data->tmp_mbmi = NULL; + aom_free(tf_data->accum); + tf_data->accum = NULL; + aom_free(tf_data->count); + tf_data->count = NULL; + aom_free(tf_data->pred); + tf_data->pred = NULL; +} + +// Saves the state prior to temporal filter process. +// Inputs: +// mbd: Pointer to the block for filtering. +// input_mbmi: Backup block info to save input state. +// input_buffer: Backup buffer pointer to save input state. +// num_planes: Number of planes. +// Returns: +// Nothing will be returned. Contents of input_mbmi and input_buffer will be +// modified. +static INLINE void tf_save_state(MACROBLOCKD *mbd, MB_MODE_INFO ***input_mbmi, + uint8_t **input_buffer, int num_planes) { + for (int i = 0; i < num_planes; i++) { + input_buffer[i] = mbd->plane[i].pre[0].buf; + } + *input_mbmi = mbd->mi; +} + +// Restores the initial state after temporal filter process. +// Inputs: +// mbd: Pointer to the block for filtering. +// input_mbmi: Backup block info from where input state is restored. +// input_buffer: Backup buffer pointer from where input state is restored. +// num_planes: Number of planes. +// Returns: +// Nothing will be returned. Contents of mbd will be modified. +static INLINE void tf_restore_state(MACROBLOCKD *mbd, MB_MODE_INFO **input_mbmi, + uint8_t **input_buffer, int num_planes) { + for (int i = 0; i < num_planes; i++) { + mbd->plane[i].pre[0].buf = input_buffer[i]; + } + mbd->mi = input_mbmi; +} + +/*!\endcond */ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_TEMPORAL_FILTER_H_ diff --git a/third_party/aom/av1/encoder/thirdpass.c b/third_party/aom/av1/encoder/thirdpass.c new file mode 100644 index 0000000000..a25522fbc5 --- /dev/null +++ b/third_party/aom/av1/encoder/thirdpass.c @@ -0,0 +1,877 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#include "av1/encoder/thirdpass.h" + +#if CONFIG_THREE_PASS && CONFIG_AV1_DECODER +#include "aom/aom_codec.h" +#include "aom/aomdx.h" +#include "aom_dsp/psnr.h" +#include "aom_mem/aom_mem.h" +#include "av1/av1_iface_common.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/firstpass.h" +#include "av1/common/blockd.h" +#include "common/ivfdec.h" + +static void setup_two_pass_stream_input( + struct AvxInputContext **input_ctx_ptr, const char *input_file_name, + struct aom_internal_error_info *err_info) { + FILE *infile; + infile = fopen(input_file_name, "rb"); + if (!infile) { + aom_internal_error(err_info, AOM_CODEC_INVALID_PARAM, + "Failed to open input file '%s'.", input_file_name); + } + struct AvxInputContext *aom_input_ctx = aom_malloc(sizeof(*aom_input_ctx)); + if (!aom_input_ctx) { + fclose(infile); + aom_internal_error(err_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate memory for third-pass context."); + } + memset(aom_input_ctx, 0, sizeof(*aom_input_ctx)); + aom_input_ctx->filename = input_file_name; + aom_input_ctx->file = infile; + + if (file_is_ivf(aom_input_ctx)) { + aom_input_ctx->file_type = FILE_TYPE_IVF; + } else { + fclose(infile); + aom_free(aom_input_ctx); + aom_internal_error(err_info, AOM_CODEC_INVALID_PARAM, + "Unrecognized input file type."); + } + *input_ctx_ptr = aom_input_ctx; +} + +static void init_third_pass(THIRD_PASS_DEC_CTX *ctx) { + if (!ctx->input_ctx) { + if (ctx->input_file_name == NULL) { + aom_internal_error(ctx->err_info, AOM_CODEC_INVALID_PARAM, + "No third pass input specified."); + } + setup_two_pass_stream_input(&ctx->input_ctx, ctx->input_file_name, + ctx->err_info); + } + + if (!ctx->decoder.iface) { + aom_codec_iface_t *decoder_iface = &aom_codec_av1_inspect_algo; + if (aom_codec_dec_init(&ctx->decoder, decoder_iface, NULL, 0)) { + aom_internal_error(ctx->err_info, AOM_CODEC_ERROR, + "Failed to initialize decoder."); + } + } +} + +// Return 0: success +// 1: cannot read because this is end of file +// -1: failure to read the frame +static int read_frame(THIRD_PASS_DEC_CTX *ctx) { + if (!ctx->input_ctx || !ctx->decoder.iface) { + init_third_pass(ctx); + } + if (!ctx->have_frame) { + if (ivf_read_frame(ctx->input_ctx, &ctx->buf, &ctx->bytes_in_buffer, + &ctx->buffer_size, NULL) != 0) { + if (feof(ctx->input_ctx->file)) { + return 1; + } else { + return -1; + } + } + ctx->frame = ctx->buf; + ctx->end_frame = ctx->frame + ctx->bytes_in_buffer; + ctx->have_frame = 1; + } + + Av1DecodeReturn adr; + if (aom_codec_decode(&ctx->decoder, ctx->frame, + (unsigned int)ctx->bytes_in_buffer, + &adr) != AOM_CODEC_OK) { + aom_internal_error(ctx->err_info, AOM_CODEC_ERROR, + "Failed to decode frame for third pass."); + } + ctx->this_frame_bits = (int)(adr.buf - ctx->frame) << 3; + ctx->frame = adr.buf; + ctx->bytes_in_buffer = ctx->end_frame - ctx->frame; + if (ctx->frame == ctx->end_frame) ctx->have_frame = 0; + return 0; +} + +static void free_frame_info(THIRD_PASS_FRAME_INFO *frame_info) { + if (!frame_info) return; + aom_free(frame_info->mi_info); + frame_info->mi_info = NULL; +} + +// This function gets the information needed from the recently decoded frame, +// via various decoder APIs, and saves the info into ctx->frame_info. 
+// Return  0: success
+//         1: cannot read because this is end of file
+//        -1: failure to read the frame
+static int get_frame_info(THIRD_PASS_DEC_CTX *ctx) {
+  int ret = read_frame(ctx);
+  if (ret != 0) return ret;
+  int cur = ctx->frame_info_count;
+
+  // Check the slot bound before writing into it.
+  if (cur >= MAX_THIRD_PASS_BUF) {
+    aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "Third pass frame info ran out of available slots.");
+  }
+  ctx->frame_info[cur].actual_bits = ctx->this_frame_bits;
+
+  aom_codec_frame_flags_t frame_type_flags = 0;
+  if (aom_codec_control(&ctx->decoder, AOMD_GET_FRAME_FLAGS,
+                        &frame_type_flags) != AOM_CODEC_OK) {
+    aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "Failed to read frame flags.");
+  }
+  if (frame_type_flags & AOM_FRAME_IS_KEY) {
+    ctx->frame_info[cur].frame_type = KEY_FRAME;
+  } else if (frame_type_flags & AOM_FRAME_IS_INTRAONLY) {
+    ctx->frame_info[cur].frame_type = INTRA_ONLY_FRAME;
+  } else if (frame_type_flags & AOM_FRAME_IS_SWITCH) {
+    ctx->frame_info[cur].frame_type = S_FRAME;
+  } else {
+    ctx->frame_info[cur].frame_type = INTER_FRAME;
+  }
+
+  // Get frame width and height.
+  int frame_size[2];
+  if (aom_codec_control(&ctx->decoder, AV1D_GET_FRAME_SIZE, frame_size) !=
+      AOM_CODEC_OK) {
+    aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "Failed to read frame size.");
+  }
+
+  // Check if we need to re-alloc the mi fields.
+  const int mi_cols = (frame_size[0] + 3) >> 2;
+  const int mi_rows = (frame_size[1] + 3) >> 2;
+  ctx->frame_info[cur].mi_stride = mi_cols;
+  ctx->frame_info[cur].mi_rows = mi_rows;
+  ctx->frame_info[cur].mi_cols = mi_cols;
+
+  if (ctx->frame_info[cur].width != frame_size[0] ||
+      ctx->frame_info[cur].height != frame_size[1] ||
+      !ctx->frame_info[cur].mi_info) {
+    free_frame_info(&ctx->frame_info[cur]);
+
+    ctx->frame_info[cur].mi_info =
+        aom_malloc(mi_cols * mi_rows * sizeof(*ctx->frame_info[cur].mi_info));
+
+    if (!ctx->frame_info[cur].mi_info) {
+      aom_internal_error(ctx->err_info, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate mi buffer for the third pass.");
+    }
+  }
+
+  ctx->frame_info[cur].width = frame_size[0];
+  ctx->frame_info[cur].height = frame_size[1];
+
+  // Get frame base q idx.
+  if (aom_codec_control(&ctx->decoder, AOMD_GET_BASE_Q_IDX,
+                        &ctx->frame_info[cur].base_q_idx) != AOM_CODEC_OK) {
+    aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "Failed to read base q index.");
+  }
+
+  // Get show existing frame flag.
+  if (aom_codec_control(&ctx->decoder, AOMD_GET_SHOW_EXISTING_FRAME_FLAG,
+                        &ctx->frame_info[cur].is_show_existing_frame) !=
+      AOM_CODEC_OK) {
+    aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "Failed to read show existing frame flag.");
+  }
+
+  // Get show frame flag.
+  if (aom_codec_control(&ctx->decoder, AOMD_GET_SHOW_FRAME_FLAG,
+                        &ctx->frame_info[cur].is_show_frame) != AOM_CODEC_OK) {
+    aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "Failed to read show frame flag.");
+  }
+
+  // Get order hint.
+  if (aom_codec_control(&ctx->decoder, AOMD_GET_ORDER_HINT,
+                        &ctx->frame_info[cur].order_hint) != AOM_CODEC_OK) {
+    aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "Failed to read order hint.");
+  }
+
+  // Clear MI info.
+  for (int mi_row = 0; mi_row < mi_rows; mi_row++) {
+    for (int mi_col = 0; mi_col < mi_cols; mi_col++) {
+      ctx->frame_info[cur].mi_info[mi_row * mi_cols + mi_col].bsize =
+          BLOCK_INVALID;
+    }
+  }
+
+  // Get relevant information regarding each 4x4 MI.
+  MB_MODE_INFO cur_mi_info;
+  THIRD_PASS_MI_INFO *const this_mi = ctx->frame_info[cur].mi_info;
+  for (int mi_row = 0; mi_row < mi_rows; mi_row++) {
+    for (int mi_col = 0; mi_col < mi_cols; mi_col++) {
+      const int offset = mi_row * mi_cols + mi_col;
+      if (this_mi[offset].bsize != BLOCK_INVALID) {
+        continue;
+      }
+      // Get info of this MI.
+      if (aom_codec_control(&ctx->decoder, AV1D_GET_MI_INFO, mi_row, mi_col,
+                            &cur_mi_info) != AOM_CODEC_OK) {
+        aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                           "Failed to read mi info.");
+      }
+      const int blk_mi_rows = mi_size_high[cur_mi_info.bsize];
+      const int blk_mi_cols = mi_size_wide[cur_mi_info.bsize];
+
+      for (int h = 0; h < blk_mi_rows; h++) {
+        for (int w = 0; w < blk_mi_cols; w++) {
+          if (h + mi_row >= mi_rows || w + mi_col >= mi_cols) {
+            continue;
+          }
+          const int this_offset = offset + h * mi_cols + w;
+          this_mi[this_offset].bsize = cur_mi_info.bsize;
+          this_mi[this_offset].partition = cur_mi_info.partition;
+          this_mi[this_offset].mi_row_start = mi_row;
+          this_mi[this_offset].mi_col_start = mi_col;
+          this_mi[this_offset].mv[0] = cur_mi_info.mv[0];
+          this_mi[this_offset].mv[1] = cur_mi_info.mv[1];
+          this_mi[this_offset].ref_frame[0] = cur_mi_info.ref_frame[0];
+          this_mi[this_offset].ref_frame[1] = cur_mi_info.ref_frame[1];
+          this_mi[this_offset].pred_mode = cur_mi_info.mode;
+        }
+      }
+    }
+  }
+
+  ctx->frame_info_count++;
+
+  return 0;
+}
+
+#define USE_SECOND_PASS_FILE 1
+
+#if !USE_SECOND_PASS_FILE
+// Parse the frames in the gop and determine the last frame of the current
+// GOP. Decode more frames if necessary. The variable max_num is the maximum
+// static GOP length if we detect an IPPP structure, and it is expected that
+// max_num >= MAX_GF_INTERVAL.
+static void get_current_gop_end(THIRD_PASS_DEC_CTX *ctx, int max_num,
+                                int *last_idx) {
+  assert(max_num >= MAX_GF_INTERVAL);
+  *last_idx = 0;
+  int cur_idx = 0;
+  int arf_order_hint = -1;
+  int num_show_frames = 0;
+  while (num_show_frames < max_num) {
+    assert(cur_idx < MAX_THIRD_PASS_BUF);
+    // Read in from bitstream if needed.
+    if (cur_idx >= ctx->frame_info_count) {
+      int ret = get_frame_info(ctx);
+      if (ret == 1) {
+        // At the end of the file, the GOP ends in the previous frame.
+        if (arf_order_hint >= 0) {
+          aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                             "Failed to derive GOP length.");
+        }
+        *last_idx = cur_idx - 1;
+        return;
+      }
+      if (ret < 0) {
+        aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                           "Failed to read frame for third pass.");
+      }
+    }
+
+    // TODO(bohanli): verify that fwd_kf works here.
+    if (ctx->frame_info[cur_idx].frame_type == KEY_FRAME &&
+        ctx->frame_info[cur_idx].is_show_frame) {
+      if (cur_idx != 0) {
+        // If this is a key frame and is not the first kf in this kf group, we
+        // have reached the next key frame. Stop here.
+        *last_idx = cur_idx - 1;
+        return;
+      }
+    } else if (!ctx->frame_info[cur_idx].is_show_frame &&
+               arf_order_hint == -1) {
+      // If this is an arf (the first no-show frame).
+      if (num_show_frames <= 1) {
+        // This is an arf and we should end the GOP with its overlay.
+        arf_order_hint = ctx->frame_info[cur_idx].order_hint;
+      } else {
+        // There are multiple show frames before this arf, so we treat the
+        // frames preceding this arf as a GOP.
+        *last_idx = cur_idx - 1;
+        return;
+      }
+    } else if (arf_order_hint >= 0 && ctx->frame_info[cur_idx].order_hint ==
+                                          (unsigned int)arf_order_hint) {
+      // If this is the overlay/show existing of the arf.
+      assert(ctx->frame_info[cur_idx].is_show_frame);
+      *last_idx = cur_idx;
+      return;
+    } else {
+      // This frame is part of the GOP.
+ if (ctx->frame_info[cur_idx].is_show_frame) num_show_frames++; + } + cur_idx++; + } + // This is a long IPPP GOP and we will use a length of max_num here. + assert(arf_order_hint < 0); + *last_idx = max_num - 1; + return; +} +#endif + +static AOM_INLINE void read_gop_frames(THIRD_PASS_DEC_CTX *ctx) { + int cur_idx = 0; + while (cur_idx < ctx->gop_info.num_frames) { + assert(cur_idx < MAX_THIRD_PASS_BUF); + // Read in from bitstream if needed. + if (cur_idx >= ctx->frame_info_count) { + int ret = get_frame_info(ctx); + if (ret != 0) { + aom_internal_error(ctx->err_info, AOM_CODEC_ERROR, + "Failed to read frame for third pass."); + } + } + cur_idx++; + } + return; +} + +void av1_set_gop_third_pass(THIRD_PASS_DEC_CTX *ctx) { + // Read in future frames in the current GOP. + read_gop_frames(ctx); + + int gf_len = 0; + // Check the GOP length against the value read from second_pass_file + for (int i = 0; i < ctx->gop_info.num_frames; i++) { + if (ctx->frame_info[i].is_show_frame) gf_len++; + } + + if (gf_len != ctx->gop_info.gf_length) { + aom_internal_error(ctx->err_info, AOM_CODEC_ERROR, + "Mismatch in third pass GOP length!"); + } +} + +void av1_pop_third_pass_info(THIRD_PASS_DEC_CTX *ctx) { + if (ctx->frame_info_count == 0) { + aom_internal_error(ctx->err_info, AOM_CODEC_ERROR, + "No available frame info for third pass."); + } + ctx->frame_info_count--; + free_frame_info(&ctx->frame_info[0]); + for (int i = 0; i < ctx->frame_info_count; i++) { + ctx->frame_info[i] = ctx->frame_info[i + 1]; + } + ctx->frame_info[ctx->frame_info_count].mi_info = NULL; +} + +void av1_init_thirdpass_ctx(AV1_COMMON *cm, THIRD_PASS_DEC_CTX **ctx, + const char *file) { + av1_free_thirdpass_ctx(*ctx); + CHECK_MEM_ERROR(cm, *ctx, aom_calloc(1, sizeof(**ctx))); + THIRD_PASS_DEC_CTX *ctx_ptr = *ctx; + ctx_ptr->input_file_name = file; + ctx_ptr->prev_gop_end = -1; + ctx_ptr->err_info = cm->error; +} + +void av1_free_thirdpass_ctx(THIRD_PASS_DEC_CTX *ctx) { + if (ctx == NULL) return; + if (ctx->decoder.iface) { + aom_codec_destroy(&ctx->decoder); + } + if (ctx->input_ctx && ctx->input_ctx->file) fclose(ctx->input_ctx->file); + aom_free(ctx->input_ctx); + if (ctx->buf) free(ctx->buf); + for (int i = 0; i < MAX_THIRD_PASS_BUF; i++) { + free_frame_info(&ctx->frame_info[i]); + } + aom_free(ctx); +} + +void av1_write_second_pass_gop_info(AV1_COMP *cpi) { + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + + if (oxcf->pass == AOM_RC_SECOND_PASS && oxcf->second_pass_log) { + // Write the GOP length to a log file. 
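+    // The record is the raw THIRD_PASS_GOP_INFO struct (num_frames, use_arf,
+    // gf_length) written in binary; av1_read_second_pass_gop_info() reads it
+    // back with a matching fread() in the third pass.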
+ av1_open_second_pass_log(cpi, 0); + + THIRD_PASS_GOP_INFO gop_info; + + gop_info.num_frames = gf_group->size; + gop_info.use_arf = (gf_group->arf_index >= 0); + gop_info.gf_length = p_rc->baseline_gf_interval; + + size_t count = + fwrite(&gop_info, sizeof(gop_info), 1, cpi->second_pass_log_stream); + if (count < 1) { + aom_internal_error(cpi->common.error, AOM_CODEC_ERROR, + "Could not write to second pass log file!"); + } + } +} + +void av1_write_second_pass_per_frame_info(AV1_COMP *cpi, int gf_index) { + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + + if (oxcf->pass == AOM_RC_SECOND_PASS && oxcf->second_pass_log) { + // Write the target bitrate. + int bits = gf_group->bit_allocation[gf_index]; + size_t count = fwrite(&bits, sizeof(bits), 1, cpi->second_pass_log_stream); + if (count < 1) { + aom_internal_error(cpi->common.error, AOM_CODEC_ERROR, + "Could not write to second pass log file!"); + } + + // Write the SSE. + uint64_t sse = 0; + int pkt_idx = cpi->ppi->output_pkt_list->cnt - 1; + if (pkt_idx >= 0 && + cpi->ppi->output_pkt_list->pkts[pkt_idx].kind == AOM_CODEC_PSNR_PKT) { + sse = cpi->ppi->output_pkt_list->pkts[pkt_idx].data.psnr.sse[0]; +#if CONFIG_INTERNAL_STATS + } else if (cpi->ppi->b_calculate_psnr) { + sse = cpi->ppi->total_sq_error[0]; +#endif + } else { + const YV12_BUFFER_CONFIG *orig = cpi->source; + const YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf; + PSNR_STATS psnr; +#if CONFIG_AV1_HIGHBITDEPTH + const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth; + const uint32_t bit_depth = cpi->td.mb.e_mbd.bd; + aom_calc_highbd_psnr(orig, recon, &psnr, bit_depth, in_bit_depth); +#else + aom_calc_psnr(orig, recon, &psnr); +#endif + sse = psnr.sse[0]; + } + + count = fwrite(&sse, sizeof(sse), 1, cpi->second_pass_log_stream); + if (count < 1) { + aom_internal_error(cpi->common.error, AOM_CODEC_ERROR, + "Could not write to second pass log file!"); + } + + // Write the bpm_factor. + double factor = cpi->ppi->twopass.bpm_factor; + count = fwrite(&factor, sizeof(factor), 1, cpi->second_pass_log_stream); + if (count < 1) { + aom_internal_error(cpi->common.error, AOM_CODEC_ERROR, + "Could not write to second pass log file!"); + } + } +} + +void av1_open_second_pass_log(AV1_COMP *cpi, int is_read) { + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + if (oxcf->second_pass_log == NULL) { + aom_internal_error(cpi->common.error, AOM_CODEC_INVALID_PARAM, + "No second pass log file specified for the third pass!"); + } + // Open the log file for reading or writing, depending on is_read.
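+ // The stream is opened lazily on first use and stays open across GOPs; + // av1_close_second_pass_log() is responsible for closing it.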
+ if (!cpi->second_pass_log_stream) { + if (is_read) { + cpi->second_pass_log_stream = fopen(cpi->oxcf.second_pass_log, "rb"); + } else { + cpi->second_pass_log_stream = fopen(cpi->oxcf.second_pass_log, "wb"); + } + if (!cpi->second_pass_log_stream) { + aom_internal_error(cpi->common.error, AOM_CODEC_ERROR, + "Could not open second pass log file!"); + } + } +} + +void av1_close_second_pass_log(AV1_COMP *cpi) { + if (cpi->second_pass_log_stream) { + int ret = fclose(cpi->second_pass_log_stream); + if (ret != 0) { + aom_internal_error(cpi->common.error, AOM_CODEC_ERROR, + "Could not close second pass log file!"); + } + cpi->second_pass_log_stream = NULL; + } +} + +void av1_read_second_pass_gop_info(FILE *second_pass_log_stream, + THIRD_PASS_GOP_INFO *gop_info, + struct aom_internal_error_info *error) { + size_t count = fread(gop_info, sizeof(*gop_info), 1, second_pass_log_stream); + if (count < 1) { + aom_internal_error(error, AOM_CODEC_ERROR, + "Could not read from second pass log file!"); + } +} + +void av1_read_second_pass_per_frame_info( + FILE *second_pass_log_stream, THIRD_PASS_FRAME_INFO *frame_info_arr, + int frame_info_count, struct aom_internal_error_info *error) { + for (int i = 0; i < frame_info_count; i++) { + // Read the target bits. + int bits = 0; + size_t count = fread(&bits, sizeof(bits), 1, second_pass_log_stream); + if (count < 1) { + aom_internal_error(error, AOM_CODEC_ERROR, + "Could not read from second pass log file!"); + } + frame_info_arr[i].bits_allocated = bits; + + // Read the distortion (SSE). + uint64_t sse; + count = fread(&sse, sizeof(sse), 1, second_pass_log_stream); + if (count < 1) { + aom_internal_error(error, AOM_CODEC_ERROR, + "Could not read from second pass log file!"); + } + frame_info_arr[i].sse = sse; + + // Read the bpm factor. + double factor; + count = fread(&factor, sizeof(factor), 1, second_pass_log_stream); + if (count < 1) { + aom_internal_error(error, AOM_CODEC_ERROR, + "Could not read from second pass log file!"); + } + frame_info_arr[i].bpm_factor = factor; + } +} + +int av1_check_use_arf(THIRD_PASS_DEC_CTX *ctx) { + if (ctx == NULL) return -1; + int use_arf = 0; + for (int i = 0; i < ctx->gop_info.gf_length; i++) { + if (ctx->frame_info[i].order_hint != 0 && + ctx->frame_info[i].is_show_frame == 0) { + use_arf = 1; + } + } + if (use_arf != ctx->gop_info.use_arf) { + aom_internal_error(ctx->err_info, AOM_CODEC_ERROR, + "Mismatch in third pass GOP ARF usage!"); + } + return use_arf; +} + +void av1_get_third_pass_ratio(THIRD_PASS_DEC_CTX *ctx, int fidx, int fheight, + int fwidth, double *ratio_h, double *ratio_w) { + assert(ctx); + assert(fidx < ctx->frame_info_count); + const int fheight_second_pass = ctx->frame_info[fidx].height; + const int fwidth_second_pass = ctx->frame_info[fidx].width; + assert(fheight_second_pass <= fheight && fwidth_second_pass <= fwidth); + + *ratio_h = (double)fheight / fheight_second_pass; + *ratio_w = (double)fwidth / fwidth_second_pass; +} + +THIRD_PASS_MI_INFO *av1_get_third_pass_mi(THIRD_PASS_DEC_CTX *ctx, int fidx, + int mi_row, int mi_col, + double ratio_h, double ratio_w) { + assert(ctx); + assert(fidx < ctx->frame_info_count); + + const int mi_rows_second_pass = ctx->frame_info[fidx].mi_rows; + const int mi_cols_second_pass = ctx->frame_info[fidx].mi_cols; + + const int mi_row_second_pass = + clamp((int)round(mi_row / ratio_h), 0, mi_rows_second_pass - 1); + const int mi_col_second_pass = + clamp((int)round(mi_col / ratio_w), 0, mi_cols_second_pass - 1); + + const int mi_stride_second_pass = ctx->frame_info[fidx].mi_stride; +
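+ // Illustrative example (values not from the source): with ratio_h == 2.0, + // third pass mi_row 31 maps to round(31 / 2.0) == 16 in the second pass mi + // grid, clamped to [0, mi_rows_second_pass - 1].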
THIRD_PASS_MI_INFO *this_mi = ctx->frame_info[fidx].mi_info + + mi_row_second_pass * mi_stride_second_pass + + mi_col_second_pass; + return this_mi; +} + +void av1_third_pass_get_adjusted_mi(THIRD_PASS_MI_INFO *third_pass_mi, + double ratio_h, double ratio_w, int *mi_row, + int *mi_col) { + *mi_row = (int)round(third_pass_mi->mi_row_start * ratio_h); + *mi_col = (int)round(third_pass_mi->mi_col_start * ratio_w); +} + +int_mv av1_get_third_pass_adjusted_mv(THIRD_PASS_MI_INFO *this_mi, + double ratio_h, double ratio_w, + MV_REFERENCE_FRAME frame) { + assert(this_mi != NULL); + int_mv cur_mv; + cur_mv.as_int = INVALID_MV; + + if (frame < LAST_FRAME || frame > ALTREF_FRAME) return cur_mv; + + for (int r = 0; r < 2; r++) { + if (this_mi->ref_frame[r] == frame) { + cur_mv.as_mv.row = (int16_t)round(this_mi->mv[r].as_mv.row * ratio_h); + cur_mv.as_mv.col = (int16_t)round(this_mi->mv[r].as_mv.col * ratio_w); + } + } + + return cur_mv; +} + +BLOCK_SIZE av1_get_third_pass_adjusted_blk_size(THIRD_PASS_MI_INFO *this_mi, + double ratio_h, + double ratio_w) { + assert(this_mi != NULL); + BLOCK_SIZE bsize = BLOCK_INVALID; + + const BLOCK_SIZE bsize_second_pass = this_mi->bsize; + assert(bsize_second_pass != BLOCK_INVALID); + + const int w_second_pass = block_size_wide[bsize_second_pass]; + const int h_second_pass = block_size_high[bsize_second_pass]; + + int part_type; + + if (w_second_pass == h_second_pass) { + part_type = PARTITION_NONE; + } else if (w_second_pass / h_second_pass == 2) { + part_type = PARTITION_HORZ; + } else if (w_second_pass / h_second_pass == 4) { + part_type = PARTITION_HORZ_4; + } else if (h_second_pass / w_second_pass == 2) { + part_type = PARTITION_VERT; + } else if (h_second_pass / w_second_pass == 4) { + part_type = PARTITION_VERT_4; + } else { + part_type = PARTITION_INVALID; + } + assert(part_type != PARTITION_INVALID); + + const int w = (int)(round(w_second_pass * ratio_w)); + const int h = (int)(round(h_second_pass * ratio_h)); + + for (int i = 0; i < SQR_BLOCK_SIZES; i++) { + const BLOCK_SIZE this_bsize = subsize_lookup[part_type][i]; + if (this_bsize == BLOCK_INVALID) continue; + + const int this_w = block_size_wide[this_bsize]; + const int this_h = block_size_high[this_bsize]; + + if (this_w >= w && this_h >= h) { + // find the smallest block size that contains the mapped block + bsize = this_bsize; + break; + } + } + if (bsize == BLOCK_INVALID) { + // could not find a proper one, just use the largest then. 
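+ // BLOCK_128X128 is the largest block size, so it is guaranteed to cover + // the scaled dimensions.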
+ bsize = BLOCK_128X128; + } + + return bsize; +} + +PARTITION_TYPE av1_third_pass_get_sb_part_type(THIRD_PASS_DEC_CTX *ctx, + THIRD_PASS_MI_INFO *this_mi) { + int mi_stride = ctx->frame_info[0].mi_stride; + + int mi_row = this_mi->mi_row_start; + int mi_col = this_mi->mi_col_start; + + THIRD_PASS_MI_INFO *corner_mi = + &ctx->frame_info[0].mi_info[mi_row * mi_stride + mi_col]; + + return corner_mi->partition; +} + +#else // !(CONFIG_THREE_PASS && CONFIG_AV1_DECODER) +void av1_init_thirdpass_ctx(AV1_COMMON *cm, THIRD_PASS_DEC_CTX **ctx, + const char *file) { + (void)ctx; + (void)file; + aom_internal_error(cm->error, AOM_CODEC_ERROR, + "To utilize three-pass encoding, libaom must be built " + "with CONFIG_THREE_PASS=1 & CONFIG_AV1_DECODER=1."); +} + +void av1_free_thirdpass_ctx(THIRD_PASS_DEC_CTX *ctx) { (void)ctx; } + +void av1_set_gop_third_pass(THIRD_PASS_DEC_CTX *ctx) { (void)ctx; } + +void av1_pop_third_pass_info(THIRD_PASS_DEC_CTX *ctx) { (void)ctx; } + +void av1_open_second_pass_log(struct AV1_COMP *cpi, int is_read) { + (void)cpi; + (void)is_read; +} + +void av1_close_second_pass_log(struct AV1_COMP *cpi) { (void)cpi; } + +void av1_write_second_pass_gop_info(struct AV1_COMP *cpi) { (void)cpi; } + +void av1_write_second_pass_per_frame_info(struct AV1_COMP *cpi, int gf_index) { + (void)cpi; + (void)gf_index; +} + +void av1_read_second_pass_gop_info(FILE *second_pass_log_stream, + THIRD_PASS_GOP_INFO *gop_info, + struct aom_internal_error_info *error) { + (void)second_pass_log_stream; + (void)gop_info; + (void)error; +} + +void av1_read_second_pass_per_frame_info( + FILE *second_pass_log_stream, THIRD_PASS_FRAME_INFO *frame_info_arr, + int frame_info_count, struct aom_internal_error_info *error) { + (void)second_pass_log_stream; + (void)frame_info_arr; + (void)frame_info_count; + (void)error; +} + +int av1_check_use_arf(THIRD_PASS_DEC_CTX *ctx) { + (void)ctx; + return 1; +} + +void av1_get_third_pass_ratio(THIRD_PASS_DEC_CTX *ctx, int fidx, int fheight, + int fwidth, double *ratio_h, double *ratio_w) { + (void)ctx; + (void)fidx; + (void)fheight; + (void)fwidth; + (void)ratio_h; + (void)ratio_w; +} + +THIRD_PASS_MI_INFO *av1_get_third_pass_mi(THIRD_PASS_DEC_CTX *ctx, int fidx, + int mi_row, int mi_col, + double ratio_h, double ratio_w) { + (void)ctx; + (void)fidx; + (void)mi_row; + (void)mi_col; + (void)ratio_h; + (void)ratio_w; + return NULL; +} + +int_mv av1_get_third_pass_adjusted_mv(THIRD_PASS_MI_INFO *this_mi, + double ratio_h, double ratio_w, + MV_REFERENCE_FRAME frame) { + (void)this_mi; + (void)ratio_h; + (void)ratio_w; + (void)frame; + int_mv mv; + mv.as_int = INVALID_MV; + return mv; +} + +BLOCK_SIZE av1_get_third_pass_adjusted_blk_size(THIRD_PASS_MI_INFO *this_mi, + double ratio_h, + double ratio_w) { + (void)this_mi; + (void)ratio_h; + (void)ratio_w; + return BLOCK_INVALID; +} + +void av1_third_pass_get_adjusted_mi(THIRD_PASS_MI_INFO *third_pass_mi, + double ratio_h, double ratio_w, int *mi_row, + int *mi_col) { + (void)third_pass_mi; + (void)ratio_h; + (void)ratio_w; + (void)mi_row; + (void)mi_col; +} + +PARTITION_TYPE av1_third_pass_get_sb_part_type(THIRD_PASS_DEC_CTX *ctx, + THIRD_PASS_MI_INFO *this_mi) { + (void)ctx; + (void)this_mi; + return PARTITION_INVALID; +} +#endif // CONFIG_THREE_PASS && CONFIG_AV1_DECODER + +#if CONFIG_BITRATE_ACCURACY +static void fwrite_and_check(const void *ptr, size_t size, size_t nmemb, + FILE *stream, + struct aom_internal_error_info *error) { + size_t count = fwrite(ptr, size, nmemb, stream); + if (count < nmemb) { + 
aom_internal_error(error, AOM_CODEC_ERROR, "fwrite_and_check failed\n"); + } +} + +static void fread_and_check(void *ptr, size_t size, size_t nmemb, FILE *stream, + struct aom_internal_error_info *error) { + size_t count = fread(ptr, size, nmemb, stream); + if (count < nmemb) { + aom_internal_error(error, AOM_CODEC_ERROR, "fread_and_check failed\n"); + } +} + +void av1_pack_tpl_info(TPL_INFO *tpl_info, const GF_GROUP *gf_group, + const TplParams *tpl_data) { + tpl_info->tpl_ready = tpl_data->ready; + if (tpl_info->tpl_ready) { + tpl_info->gf_length = gf_group->size; + for (int i = 0; i < tpl_info->gf_length; ++i) { + tpl_info->txfm_stats_list[i] = tpl_data->txfm_stats_list[i]; + tpl_info->qstep_ratio_ls[i] = av1_tpl_get_qstep_ratio(tpl_data, i); + tpl_info->update_type_list[i] = gf_group->update_type[i]; + } + } +} + +void av1_write_tpl_info(const TPL_INFO *tpl_info, FILE *log_stream, + struct aom_internal_error_info *error) { + fwrite_and_check(&tpl_info->tpl_ready, sizeof(tpl_info->tpl_ready), 1, + log_stream, error); + if (tpl_info->tpl_ready) { + fwrite_and_check(&tpl_info->gf_length, sizeof(tpl_info->gf_length), 1, + log_stream, error); + assert(tpl_info->gf_length <= MAX_LENGTH_TPL_FRAME_STATS); + fwrite_and_check(&tpl_info->txfm_stats_list, + sizeof(tpl_info->txfm_stats_list[0]), tpl_info->gf_length, + log_stream, error); + fwrite_and_check(&tpl_info->qstep_ratio_ls, + sizeof(tpl_info->qstep_ratio_ls[0]), tpl_info->gf_length, + log_stream, error); + fwrite_and_check(&tpl_info->update_type_list, + sizeof(tpl_info->update_type_list[0]), tpl_info->gf_length, + log_stream, error); + } +} + +void av1_read_tpl_info(TPL_INFO *tpl_info, FILE *log_stream, + struct aom_internal_error_info *error) { + av1_zero(*tpl_info); + fread_and_check(&tpl_info->tpl_ready, sizeof(tpl_info->tpl_ready), 1, + log_stream, error); + if (tpl_info->tpl_ready) { + fread_and_check(&tpl_info->gf_length, sizeof(tpl_info->gf_length), 1, + log_stream, error); + assert(tpl_info->gf_length <= MAX_LENGTH_TPL_FRAME_STATS); + fread_and_check(&tpl_info->txfm_stats_list, + sizeof(tpl_info->txfm_stats_list[0]), tpl_info->gf_length, + log_stream, error); + fread_and_check(&tpl_info->qstep_ratio_ls, + sizeof(tpl_info->qstep_ratio_ls[0]), tpl_info->gf_length, + log_stream, error); + fread_and_check(&tpl_info->update_type_list, + sizeof(tpl_info->update_type_list[0]), tpl_info->gf_length, + log_stream, error); + } +} +#endif // CONFIG_BITRATE_ACCURACY diff --git a/third_party/aom/av1/encoder/thirdpass.h b/third_party/aom/av1/encoder/thirdpass.h new file mode 100644 index 0000000000..8080c06cb6 --- /dev/null +++ b/third_party/aom/av1/encoder/thirdpass.h @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_THIRDPASS_H_ +#define AOM_AV1_ENCODER_THIRDPASS_H_ + +#include "av1/common/enums.h" +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/firstpass.h" +#include "av1/encoder/ratectrl.h" +#include "av1/encoder/tpl_model.h" + +struct AV1_COMP; + +// TODO(bohanli): optimize this number +#define MAX_THIRD_PASS_BUF \ + (AOMMAX((2 * MAX_GF_INTERVAL + 1), MAX_STATIC_GF_GROUP_LENGTH)) + +// Struct to store useful information related to a GOP, in addition to what is +// available in the bitstream +typedef struct { + int gf_length; + int num_frames; + int use_arf; +} THIRD_PASS_GOP_INFO; + +#if CONFIG_BITRATE_ACCURACY +typedef struct TPL_INFO { + int gf_length; + int tpl_ready; + TplTxfmStats txfm_stats_list[MAX_LENGTH_TPL_FRAME_STATS]; + double qstep_ratio_ls[MAX_LENGTH_TPL_FRAME_STATS]; + FRAME_UPDATE_TYPE update_type_list[MAX_LENGTH_TPL_FRAME_STATS]; +} TPL_INFO; +#endif // CONFIG_BITRATE_ACCURACY + +typedef struct { + BLOCK_SIZE bsize; + PARTITION_TYPE partition; + int mi_row_start; + int mi_col_start; + int_mv mv[2]; + MV_REFERENCE_FRAME ref_frame[2]; + PREDICTION_MODE pred_mode; +} THIRD_PASS_MI_INFO; + +// Struct to store useful information about a frame for the third pass. +// The members are extracted from the decoder by function get_frame_info. +typedef struct { + int width; + int height; + int mi_stride; + int mi_rows; + int mi_cols; + int base_q_idx; + int is_show_existing_frame; + int is_show_frame; + int bits_allocated; + int actual_bits; + uint64_t sse; + double bpm_factor; + FRAME_TYPE frame_type; + unsigned int order_hint; + THIRD_PASS_MI_INFO *mi_info; +} THIRD_PASS_FRAME_INFO; + +typedef struct { + /* --- Input and decoding related members --- */ + // the input file + const char *input_file_name; +#if CONFIG_THREE_PASS + // input context + struct AvxInputContext *input_ctx; +#endif + // decoder codec context + aom_codec_ctx_t decoder; + // start of the frame in buf + const unsigned char *frame; + // end of the frame(s) in buf + const unsigned char *end_frame; + // whether we still have following frames in buf + int have_frame; + // pointer to buffer for the read frames + uint8_t *buf; + // size of data in buffer + size_t bytes_in_buffer; + // current buffer size + size_t buffer_size; + // error info pointer + struct aom_internal_error_info *err_info; + + int this_frame_bits; + + /* --- Members for third pass encoding --- */ + // Array to store info about each frame. + // frame_info[0] should point to the current frame. + THIRD_PASS_FRAME_INFO frame_info[MAX_THIRD_PASS_BUF]; + // number of frames available in frame_info + int frame_info_count; + // the end of the previous GOP (order hint) + int prev_gop_end; + THIRD_PASS_GOP_INFO gop_info; +} THIRD_PASS_DEC_CTX; + +void av1_init_thirdpass_ctx(AV1_COMMON *cm, THIRD_PASS_DEC_CTX **ctx, + const char *file); +void av1_free_thirdpass_ctx(THIRD_PASS_DEC_CTX *ctx); + +// Set the GOP structure from the twopass bitstream. +// TODO(bohanli): this is currently a skeleton and we only return the gop +// length. This function also saves all frame information in the array +// ctx->frame_info for this GOP. +void av1_set_gop_third_pass(THIRD_PASS_DEC_CTX *ctx); + +// Pop one frame out of the array ctx->frame_info. This function is used to make +// sure that frame_info[0] always corresponds to the current frame. 
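+// Note: the implementation shifts the whole frame_info array down by one +// entry; MAX_THIRD_PASS_BUF is small, so this linear cost is negligible.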
+void av1_pop_third_pass_info(THIRD_PASS_DEC_CTX *ctx); + +void av1_open_second_pass_log(struct AV1_COMP *cpi, int is_read); +void av1_close_second_pass_log(struct AV1_COMP *cpi); + +// Write the current GOP information into the second pass log file. +void av1_write_second_pass_gop_info(struct AV1_COMP *cpi); +// Write the information of the frames in this GOP into the second pass log +// file. +void av1_write_second_pass_per_frame_info(struct AV1_COMP *cpi, int gf_index); + +// Read the next GOP information from the second pass log file. +void av1_read_second_pass_gop_info(FILE *second_pass_log_stream, + THIRD_PASS_GOP_INFO *gop_info, + struct aom_internal_error_info *error); +// Read the information of the frames in the next GOP from the second pass log +// file. +void av1_read_second_pass_per_frame_info(FILE *second_pass_log_stream, + THIRD_PASS_FRAME_INFO *frame_info_arr, + int frame_info_count, + struct aom_internal_error_info *error); + +int av1_check_use_arf(THIRD_PASS_DEC_CTX *ctx); + +// Calculate the ratio of third pass frame dimensions over second pass frame +// dimensions. Return them in ratio_h and ratio_w. +void av1_get_third_pass_ratio(THIRD_PASS_DEC_CTX *ctx, int fidx, int fheight, + int fwidth, double *ratio_h, double *ratio_w); + +// Get the pointer to a second pass mi info, where mi_row and mi_col are the mi +// location in the thirdpass frame. +THIRD_PASS_MI_INFO *av1_get_third_pass_mi(THIRD_PASS_DEC_CTX *ctx, int fidx, + int mi_row, int mi_col, + double ratio_h, double ratio_w); + +// Get the adjusted MVs of this_mi, associated with the reference frame. If no +// MV is found with the reference frame, INVALID_MV is returned. +int_mv av1_get_third_pass_adjusted_mv(THIRD_PASS_MI_INFO *this_mi, + double ratio_h, double ratio_w, + MV_REFERENCE_FRAME frame); + +// Get the adjusted block size of this_mi. +BLOCK_SIZE av1_get_third_pass_adjusted_blk_size(THIRD_PASS_MI_INFO *this_mi, + double ratio_h, double ratio_w); + +// Get the adjusted mi position in the third pass frame of a given +// third_pass_mi. Location is returned in mi_row and mi_col. +void av1_third_pass_get_adjusted_mi(THIRD_PASS_MI_INFO *third_pass_mi, + double ratio_h, double ratio_w, int *mi_row, + int *mi_col); + +PARTITION_TYPE av1_third_pass_get_sb_part_type(THIRD_PASS_DEC_CTX *ctx, + THIRD_PASS_MI_INFO *this_mi); + +#if CONFIG_BITRATE_ACCURACY + +void av1_pack_tpl_info(TPL_INFO *tpl_info, const GF_GROUP *gf_group, + const TplParams *tpl_data); + +void av1_write_tpl_info(const TPL_INFO *tpl_info, FILE *log_stream, + struct aom_internal_error_info *error); + +void av1_read_tpl_info(TPL_INFO *tpl_info, FILE *log_stream, + struct aom_internal_error_info *error); + +#endif // CONFIG_BITRATE_ACCURACY +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_THIRDPASS_H_ diff --git a/third_party/aom/av1/encoder/tokenize.c b/third_party/aom/av1/encoder/tokenize.c new file mode 100644 index 0000000000..ffac886e32 --- /dev/null +++ b/third_party/aom/av1/encoder/tokenize.c @@ -0,0 +1,396 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software.
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <math.h> +#include <stdio.h> +#include <string.h> + +#include "aom_mem/aom_mem.h" + +#include "av1/common/entropy.h" +#include "av1/common/pred_common.h" +#include "av1/common/scan.h" +#include "av1/common/seg_common.h" + +#include "av1/encoder/cost.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/encodetxb.h" +#include "av1/encoder/rdopt.h" +#include "av1/encoder/tokenize.h" + +static AOM_INLINE int av1_fast_palette_color_index_context_on_edge( + const uint8_t *color_map, int stride, int r, int c, int *color_idx) { + const bool has_left = (c - 1 >= 0); + const bool has_above = (r - 1 >= 0); + assert(r > 0 || c > 0); + assert(has_above ^ has_left); + assert(color_idx); + (void)has_left; + + const uint8_t color_neighbor = has_above + ? color_map[(r - 1) * stride + (c - 0)] + : color_map[(r - 0) * stride + (c - 1)]; + // If the neighbor color has a higher index than the current color index, + // then we move up by 1. + const uint8_t current_color = *color_idx = color_map[r * stride + c]; + if (color_neighbor > current_color) { + (*color_idx)++; + } else if (color_neighbor == current_color) { + *color_idx = 0; + } + + // Get hash value of context. + // The non-diagonal neighbors get a weight of 2. + const uint8_t color_score = 2; + const uint8_t hash_multiplier = 1; + const uint8_t color_index_ctx_hash = color_score * hash_multiplier; + + // Lookup context from hash. + const int color_index_ctx = + av1_palette_color_index_context_lookup[color_index_ctx_hash]; + assert(color_index_ctx == 0); + (void)color_index_ctx; + return 0; +} + +#define SWAP(i, j) \ + do { \ + const uint8_t tmp_score = score_rank[i]; \ + const uint8_t tmp_color = color_rank[i]; \ + score_rank[i] = score_rank[j]; \ + color_rank[i] = color_rank[j]; \ + score_rank[j] = tmp_score; \ + color_rank[j] = tmp_color; \ + } while (0) +#define INVALID_COLOR_IDX (UINT8_MAX) + +// A faster version of av1_get_palette_color_index_context used by the encoder, +// exploiting the fact that the encoder does not need to maintain a color order. +static AOM_INLINE int av1_fast_palette_color_index_context( + const uint8_t *color_map, int stride, int r, int c, int *color_idx) { + assert(r > 0 || c > 0); + + const bool has_above = (r - 1 >= 0); + const bool has_left = (c - 1 >= 0); + assert(has_above || has_left); + if (has_above ^ has_left) { + return av1_fast_palette_color_index_context_on_edge(color_map, stride, r, c, + color_idx); + } + + // This goes in the order of left, top, and top-left. This has the advantage + // that unless some entries are duplicated or invalid, the list will already + // be in sorted order. Furthermore, if either of the first two is + // invalid, we know the last one is also invalid. + uint8_t color_neighbors[NUM_PALETTE_NEIGHBORS]; + color_neighbors[0] = color_map[(r - 0) * stride + (c - 1)]; + color_neighbors[1] = color_map[(r - 1) * stride + (c - 0)]; + color_neighbors[2] = color_map[(r - 1) * stride + (c - 1)]; + + // Aggregate duplicated values.
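+ // Illustrative example: if the left and top neighbors share color index 3 + // and the top-left neighbor has color index 5, the (color, score) pairs + // collapse to (3, 4) and (5, 1).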
+ // Since our array is so small, using a couple of if statements is faster + uint8_t scores[NUM_PALETTE_NEIGHBORS] = { 2, 2, 1 }; + uint8_t num_invalid_colors = 0; + if (color_neighbors[0] == color_neighbors[1]) { + scores[0] += scores[1]; + color_neighbors[1] = INVALID_COLOR_IDX; + num_invalid_colors += 1; + + if (color_neighbors[0] == color_neighbors[2]) { + scores[0] += scores[2]; + num_invalid_colors += 1; + } + } else if (color_neighbors[0] == color_neighbors[2]) { + scores[0] += scores[2]; + num_invalid_colors += 1; + } else if (color_neighbors[1] == color_neighbors[2]) { + scores[1] += scores[2]; + num_invalid_colors += 1; + } + + const uint8_t num_valid_colors = NUM_PALETTE_NEIGHBORS - num_invalid_colors; + + uint8_t *color_rank = color_neighbors; + uint8_t *score_rank = scores; + + // Sort everything + if (num_valid_colors > 1) { + if (color_neighbors[1] == INVALID_COLOR_IDX) { + scores[1] = scores[2]; + color_neighbors[1] = color_neighbors[2]; + } + + // We need to swap the first two elements if they have the same score but + // the color indices are not in the right order + if (score_rank[0] < score_rank[1] || + (score_rank[0] == score_rank[1] && color_rank[0] > color_rank[1])) { + SWAP(0, 1); + } + if (num_valid_colors > 2) { + if (score_rank[0] < score_rank[2]) { + SWAP(0, 2); + } + if (score_rank[1] < score_rank[2]) { + SWAP(1, 2); + } + } + } + + // If any of the neighbor colors has a higher index than the current color + // index, then we move up by 1, unless the current color is the same as one + // of the neighbors. + const uint8_t current_color = *color_idx = color_map[r * stride + c]; + for (int idx = 0; idx < num_valid_colors; idx++) { + if (color_rank[idx] > current_color) { + (*color_idx)++; + } else if (color_rank[idx] == current_color) { + *color_idx = idx; + break; + } + } + + // Get hash value of context. + uint8_t color_index_ctx_hash = 0; + static const uint8_t hash_multipliers[NUM_PALETTE_NEIGHBORS] = { 1, 2, 2 }; + for (int idx = 0; idx < num_valid_colors; ++idx) { + color_index_ctx_hash += score_rank[idx] * hash_multipliers[idx]; + } + assert(color_index_ctx_hash > 0); + assert(color_index_ctx_hash <= MAX_COLOR_CONTEXT_HASH); + + // Lookup context from hash.
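+ // With neighbor weights { 2, 2, 1 } and multipliers { 1, 2, 2 }, the + // reachable hash values here are 5 to 8, for which the shared lookup table + // reduces to 9 - hash; the asserts below cross-check this.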
+ const int color_index_ctx = 9 - color_index_ctx_hash; + assert(color_index_ctx == + av1_palette_color_index_context_lookup[color_index_ctx_hash]); + assert(color_index_ctx >= 0); + assert(color_index_ctx < PALETTE_COLOR_INDEX_CONTEXTS); + return color_index_ctx; +} +#undef INVALID_COLOR_IDX +#undef SWAP + +static int cost_and_tokenize_map(Av1ColorMapParam *param, TokenExtra **t, + int plane, int calc_rate, int allow_update_cdf, + FRAME_COUNTS *counts) { + const uint8_t *const color_map = param->color_map; + MapCdf map_cdf = param->map_cdf; + ColorCost color_cost = param->color_cost; + const int plane_block_width = param->plane_width; + const int rows = param->rows; + const int cols = param->cols; + const int n = param->n_colors; + const int palette_size_idx = n - PALETTE_MIN_SIZE; + int this_rate = 0; + + (void)plane; + (void)counts; + + for (int k = 1; k < rows + cols - 1; ++k) { + for (int j = AOMMIN(k, cols - 1); j >= AOMMAX(0, k - rows + 1); --j) { + int i = k - j; + int color_new_idx; + const int color_ctx = av1_fast_palette_color_index_context( + color_map, plane_block_width, i, j, &color_new_idx); + assert(color_new_idx >= 0 && color_new_idx < n); + if (calc_rate) { + this_rate += color_cost[palette_size_idx][color_ctx][color_new_idx]; + } else { + (*t)->token = color_new_idx; + (*t)->color_ctx = color_ctx; + ++(*t); + if (allow_update_cdf) + update_cdf(map_cdf[palette_size_idx][color_ctx], color_new_idx, n); +#if CONFIG_ENTROPY_STATS + if (plane) { + ++counts->palette_uv_color_index[palette_size_idx][color_ctx] + [color_new_idx]; + } else { + ++counts->palette_y_color_index[palette_size_idx][color_ctx] + [color_new_idx]; + } +#endif + } + } + } + if (calc_rate) return this_rate; + return 0; +} + +static void get_palette_params(const MACROBLOCK *const x, int plane, + BLOCK_SIZE bsize, Av1ColorMapParam *params) { + const MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + params->color_map = xd->plane[plane].color_index_map; + params->map_cdf = plane ? xd->tile_ctx->palette_uv_color_index_cdf + : xd->tile_ctx->palette_y_color_index_cdf; + params->color_cost = plane ? 
x->mode_costs.palette_uv_color_cost + : x->mode_costs.palette_y_color_cost; + params->n_colors = pmi->palette_size[plane]; + av1_get_block_dimensions(bsize, plane, xd, &params->plane_width, NULL, + &params->rows, &params->cols); +} + +// TODO(any): Remove this function +static void get_color_map_params(const MACROBLOCK *const x, int plane, + BLOCK_SIZE bsize, TX_SIZE tx_size, + COLOR_MAP_TYPE type, + Av1ColorMapParam *params) { + (void)tx_size; + memset(params, 0, sizeof(*params)); + switch (type) { + case PALETTE_MAP: get_palette_params(x, plane, bsize, params); break; + default: assert(0 && "Invalid color map type"); return; + } +} + +int av1_cost_color_map(const MACROBLOCK *const x, int plane, BLOCK_SIZE bsize, + TX_SIZE tx_size, COLOR_MAP_TYPE type) { + assert(plane == 0 || plane == 1); + Av1ColorMapParam color_map_params; + get_color_map_params(x, plane, bsize, tx_size, type, &color_map_params); + return cost_and_tokenize_map(&color_map_params, NULL, plane, 1, 0, NULL); +} + +void av1_tokenize_color_map(const MACROBLOCK *const x, int plane, + TokenExtra **t, BLOCK_SIZE bsize, TX_SIZE tx_size, + COLOR_MAP_TYPE type, int allow_update_cdf, + FRAME_COUNTS *counts) { + assert(plane == 0 || plane == 1); + Av1ColorMapParam color_map_params; + get_color_map_params(x, plane, bsize, tx_size, type, &color_map_params); + // The first color index is coded verbatim, without context or entropy + // coding. + (*t)->token = color_map_params.color_map[0]; + (*t)->color_ctx = -1; + ++(*t); + cost_and_tokenize_map(&color_map_params, t, plane, 0, allow_update_cdf, + counts); +} + +static void tokenize_vartx(ThreadData *td, TX_SIZE tx_size, + BLOCK_SIZE plane_bsize, int blk_row, int blk_col, + int block, int plane, void *arg) { + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int max_blocks_high = max_block_high(xd, plane_bsize, plane); + const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); + + if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; + + const TX_SIZE plane_tx_size = + plane ? av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x, + pd->subsampling_y) + : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row, + blk_col)]; + + if (tx_size == plane_tx_size || plane) { + plane_bsize = + get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y); + + struct tokenize_b_args *args = arg; + if (args->allow_update_cdf) + av1_update_and_record_txb_context(plane, block, blk_row, blk_col, + plane_bsize, tx_size, arg); + else + av1_record_txb_context(plane, block, blk_row, blk_col, plane_bsize, + tx_size, arg); + + } else { + // Halve the block size in transform block units.
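+ // For example, a TX_32X32 block recurses into four TX_16X16 sub-blocks + // (sub_tx_size_map[TX_32X32] is TX_16X16), which are visited in raster + // order below.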
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; + const int step = bsw * bsh; + const int row_end = + AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row); + const int col_end = + AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col); + + assert(bsw > 0 && bsh > 0); + + for (int row = 0; row < row_end; row += bsh) { + const int offsetr = blk_row + row; + for (int col = 0; col < col_end; col += bsw) { + const int offsetc = blk_col + col; + + tokenize_vartx(td, sub_txs, plane_bsize, offsetr, offsetc, block, plane, + arg); + block += step; + } + } + } +} + +void av1_tokenize_sb_vartx(const AV1_COMP *cpi, ThreadData *td, + RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate, + uint8_t allow_update_cdf) { + assert(bsize < BLOCK_SIZES_ALL); + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols) + return; + + const int num_planes = av1_num_planes(cm); + MB_MODE_INFO *const mbmi = xd->mi[0]; + struct tokenize_b_args arg = { cpi, td, 0, allow_update_cdf, dry_run }; + + if (mbmi->skip_txfm) { + av1_reset_entropy_context(xd, bsize, num_planes); + return; + } + + for (int plane = 0; plane < num_planes; ++plane) { + if (plane && !xd->is_chroma_ref) break; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y); + assert(plane_bsize < BLOCK_SIZES_ALL); + const int mi_width = mi_size_wide[plane_bsize]; + const int mi_height = mi_size_high[plane_bsize]; + const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane); + const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size]; + const int bw = mi_size_wide[txb_size]; + const int bh = mi_size_high[txb_size]; + int block = 0; + const int step = + tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size]; + + const BLOCK_SIZE max_unit_bsize = + get_plane_block_size(BLOCK_64X64, ss_x, ss_y); + int mu_blocks_wide = mi_size_wide[max_unit_bsize]; + int mu_blocks_high = mi_size_high[max_unit_bsize]; + + mu_blocks_wide = AOMMIN(mi_width, mu_blocks_wide); + mu_blocks_high = AOMMIN(mi_height, mu_blocks_high); + + for (int idy = 0; idy < mi_height; idy += mu_blocks_high) { + for (int idx = 0; idx < mi_width; idx += mu_blocks_wide) { + const int unit_height = AOMMIN(mu_blocks_high + idy, mi_height); + const int unit_width = AOMMIN(mu_blocks_wide + idx, mi_width); + for (int blk_row = idy; blk_row < unit_height; blk_row += bh) { + for (int blk_col = idx; blk_col < unit_width; blk_col += bw) { + tokenize_vartx(td, max_tx_size, plane_bsize, blk_row, blk_col, + block, plane, &arg); + block += step; + } + } + } + } + } + if (rate) *rate += arg.this_rate; +} diff --git a/third_party/aom/av1/encoder/tokenize.h b/third_party/aom/av1/encoder/tokenize.h new file mode 100644 index 0000000000..f675c489ae --- /dev/null +++ b/third_party/aom/av1/encoder/tokenize.h @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_TOKENIZE_H_ +#define AOM_AV1_ENCODER_TOKENIZE_H_ + +#include "av1/common/entropy.h" +#include "av1/encoder/block.h" +#include "aom_dsp/bitwriter.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// The token and color_ctx members of the TokenExtra structure are used +// to store the color index and the color context of each pixel in +// case of palette mode. +// 1) token can take values in the range of [0, 7] as the maximum number of +// possible colors is 8 (PALETTE_COLORS). Hence token requires 3 bits +// (unsigned). +// 2) The reserved field (1-bit) is positioned such that color_ctx occupies the +// most significant bits and token occupies the least significant bits of the +// byte. Thus accesses to token and color_ctx are optimal. If TokenExtra is +// defined as: +// typedef struct { +// int8_t color_ctx : 4; +// uint8_t token : 3; +// } TokenExtra; +// then reading color_ctx requires an extra left shift to facilitate sign +// extension, and writing token requires extra masking. +// 3) color_ctx can take 5 (PALETTE_COLOR_INDEX_CONTEXTS) valid values, i.e., +// from 0 to 4. As per the current implementation it can take values in the +// range of [-1, 4]. Here -1 corresponds to an invalid color index context and +// is used for default initialization. Hence color_ctx requires 4 bits +// (signed). +typedef struct { + uint8_t token : 3; + uint8_t reserved : 1; + int8_t color_ctx : 4; +} TokenExtra; + +typedef struct { + TokenExtra *start; + unsigned int count; +} TokenList; + +typedef struct { + // Number of tile tokens for which memory is allocated. + unsigned int tokens_allocated; + // tile_tok[i][j] is a pointer to the buffer storing palette tokens of the + // ith tile row, jth tile column. + TokenExtra *tile_tok[MAX_TILE_ROWS][MAX_TILE_COLS]; + // tplist[i][j][k] holds the start pointer of tile_tok[i][j] and the count of + // palette tokens for the kth superblock row of the ith tile row, jth tile + // column. + TokenList *tplist[MAX_TILE_ROWS][MAX_TILE_COLS]; +} TokenInfo; + +struct AV1_COMP; +struct ThreadData; +struct FRAME_COUNTS; + +enum { + OUTPUT_ENABLED = 0, + DRY_RUN_NORMAL, + DRY_RUN_COSTCOEFFS, +} UENUM1BYTE(RUN_TYPE); + +struct tokenize_b_args { + const struct AV1_COMP *cpi; + struct ThreadData *td; + int this_rate; + uint8_t allow_update_cdf; + RUN_TYPE dry_run; +}; + +// Note: in all the tokenize functions, rate (if non-NULL) is incremented +// with the coefficient token cost only if dry_run == DRY_RUN_COSTCOEFFS; +// otherwise rate is left unchanged. +void av1_tokenize_sb_vartx(const struct AV1_COMP *cpi, struct ThreadData *td, + RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate, + uint8_t allow_update_cdf); + +int av1_cost_color_map(const MACROBLOCK *const x, int plane, BLOCK_SIZE bsize, + TX_SIZE tx_size, COLOR_MAP_TYPE type); + +void av1_tokenize_color_map(const MACROBLOCK *const x, int plane, + TokenExtra **t, BLOCK_SIZE bsize, TX_SIZE tx_size, + COLOR_MAP_TYPE type, int allow_update_cdf, + struct FRAME_COUNTS *counts); + +static INLINE int av1_get_tx_eob(const struct segmentation *seg, int segment_id, + TX_SIZE tx_size) { + const int eob_max = av1_get_max_eob(tx_size); + return segfeature_active(seg, segment_id, SEG_LVL_SKIP) ?
0 : eob_max; +} + +// Token buffer is only used for palette tokens. +static INLINE unsigned int get_token_alloc(int mb_rows, int mb_cols, + int sb_size_log2, + const int num_planes) { + // Calculate the maximum number of superblocks in the image. + const int shift = sb_size_log2 - 4; + const int sb_size = 1 << sb_size_log2; + const int sb_size_square = sb_size * sb_size; + const int sb_rows = CEIL_POWER_OF_TWO(mb_rows, shift); + const int sb_cols = CEIL_POWER_OF_TWO(mb_cols, shift); + + // One palette token for each pixel. There can be palettes on two planes. + const int sb_palette_toks = AOMMIN(2, num_planes) * sb_size_square; + + return sb_rows * sb_cols * sb_palette_toks; +} + +// Allocate memory for token-related info. +static AOM_INLINE void alloc_token_info(AV1_COMMON *cm, TokenInfo *token_info, + unsigned int tokens_required) { + int sb_rows = + CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params->mib_size_log2); + token_info->tokens_allocated = tokens_required; + + CHECK_MEM_ERROR(cm, token_info->tile_tok[0][0], + (TokenExtra *)aom_calloc( + tokens_required, sizeof(*token_info->tile_tok[0][0]))); + + CHECK_MEM_ERROR( + cm, token_info->tplist[0][0], + (TokenList *)aom_calloc(sb_rows * MAX_TILE_ROWS * MAX_TILE_COLS, + sizeof(*token_info->tplist[0][0]))); +} + +// Check if memory allocation has been done for token-related info. +static AOM_INLINE bool is_token_info_allocated(const TokenInfo *token_info) { + return ((token_info->tile_tok[0][0] != NULL) && + (token_info->tplist[0][0] != NULL)); +} + +// Free memory from token-related variables. +static AOM_INLINE void free_token_info(TokenInfo *token_info) { + aom_free(token_info->tile_tok[0][0]); + token_info->tile_tok[0][0] = NULL; + + aom_free(token_info->tplist[0][0]); + token_info->tplist[0][0] = NULL; + + token_info->tokens_allocated = 0; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_TOKENIZE_H_ diff --git a/third_party/aom/av1/encoder/tpl_model.c b/third_party/aom/av1/encoder/tpl_model.c new file mode 100644 index 0000000000..ca60e4981e --- /dev/null +++ b/third_party/aom/av1/encoder/tpl_model.c @@ -0,0 +1,2511 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include <float.h> +#include <math.h> +#include <stdint.h> + +#include "av1/encoder/thirdpass.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/aom_scale_rtcd.h" + +#include "aom/aom_codec.h" + +#include "av1/common/av1_common_int.h" +#include "av1/common/enums.h" +#include "av1/common/idct.h" +#include "av1/common/reconintra.h" + +#include "av1/encoder/encoder.h" +#include "av1/encoder/ethread.h" +#include "av1/encoder/encodeframe_utils.h" +#include "av1/encoder/encode_strategy.h" +#include "av1/encoder/hybrid_fwd_txfm.h" +#include "av1/encoder/motion_search_facade.h" +#include "av1/encoder/rd.h" +#include "av1/encoder/rdopt.h" +#include "av1/encoder/reconinter_enc.h" +#include "av1/encoder/tpl_model.h" + +static INLINE double exp_bounded(double v) { + // When v > 700 or v < -700, the exp function is close to overflow or + // underflow. For details, see the "Notes" at the following link: + // https://en.cppreference.com/w/c/numeric/math/exp + if (v > 700) { + return DBL_MAX; + } else if (v < -700) { + return 0; + } + return exp(v); +} + +void av1_init_tpl_txfm_stats(TplTxfmStats *tpl_txfm_stats) { + tpl_txfm_stats->ready = 0; + tpl_txfm_stats->coeff_num = 256; + tpl_txfm_stats->txfm_block_count = 0; + memset(tpl_txfm_stats->abs_coeff_sum, 0, + sizeof(tpl_txfm_stats->abs_coeff_sum[0]) * tpl_txfm_stats->coeff_num); + memset(tpl_txfm_stats->abs_coeff_mean, 0, + sizeof(tpl_txfm_stats->abs_coeff_mean[0]) * tpl_txfm_stats->coeff_num); +} + +#if CONFIG_BITRATE_ACCURACY +void av1_accumulate_tpl_txfm_stats(const TplTxfmStats *sub_stats, + TplTxfmStats *accumulated_stats) { + accumulated_stats->txfm_block_count += sub_stats->txfm_block_count; + for (int i = 0; i < accumulated_stats->coeff_num; ++i) { + accumulated_stats->abs_coeff_sum[i] += sub_stats->abs_coeff_sum[i]; + } +} + +void av1_record_tpl_txfm_block(TplTxfmStats *tpl_txfm_stats, + const tran_low_t *coeff) { + // For transforms larger than 16x16, the scale of coeff needs to be + // adjusted; it is not LOSSLESS_Q_STEP. + assert(tpl_txfm_stats->coeff_num <= 256); + for (int i = 0; i < tpl_txfm_stats->coeff_num; ++i) { + tpl_txfm_stats->abs_coeff_sum[i] += abs(coeff[i]) / (double)LOSSLESS_Q_STEP; + } + ++tpl_txfm_stats->txfm_block_count; +} + +void av1_tpl_txfm_stats_update_abs_coeff_mean(TplTxfmStats *txfm_stats) { + if (txfm_stats->txfm_block_count > 0) { + for (int j = 0; j < txfm_stats->coeff_num; j++) { + txfm_stats->abs_coeff_mean[j] = + txfm_stats->abs_coeff_sum[j] / txfm_stats->txfm_block_count; + } + txfm_stats->ready = 1; + } else { + txfm_stats->ready = 0; + } +} + +static AOM_INLINE void av1_tpl_store_txfm_stats( + TplParams *tpl_data, const TplTxfmStats *tpl_txfm_stats, + const int frame_index) { + tpl_data->txfm_stats_list[frame_index] = *tpl_txfm_stats; +} +#endif // CONFIG_BITRATE_ACCURACY + +static AOM_INLINE void get_quantize_error(const MACROBLOCK *x, int plane, + const tran_low_t *coeff, + tran_low_t *qcoeff, + tran_low_t *dqcoeff, TX_SIZE tx_size, + uint16_t *eob, int64_t *recon_error, + int64_t *sse) { + const struct macroblock_plane *const p = &x->plane[plane]; + const MACROBLOCKD *xd = &x->e_mbd; + const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT]; + int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]; + const int shift = tx_size == TX_32X32 ?
0 : 2; + + QUANT_PARAM quant_param; + av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_FP, 0, &quant_param); + +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + av1_highbd_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, eob, + scan_order, &quant_param); + *recon_error = + av1_highbd_block_error(coeff, dqcoeff, pix_num, sse, xd->bd) >> shift; + } else { + av1_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, eob, scan_order, + &quant_param); + *recon_error = av1_block_error(coeff, dqcoeff, pix_num, sse) >> shift; + } +#else + (void)xd; + av1_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, eob, scan_order, + &quant_param); + *recon_error = av1_block_error(coeff, dqcoeff, pix_num, sse) >> shift; +#endif // CONFIG_AV1_HIGHBITDEPTH + + *recon_error = AOMMAX(*recon_error, 1); + + *sse = (*sse) >> shift; + *sse = AOMMAX(*sse, 1); +} + +static AOM_INLINE void set_tpl_stats_block_size(uint8_t *block_mis_log2, + uint8_t *tpl_bsize_1d) { + // tpl stats bsize: 2 means 16x16 + *block_mis_log2 = 2; + // Block size used in tpl motion estimation + *tpl_bsize_1d = 16; + // MIN_TPL_BSIZE_1D = 16; + assert(*tpl_bsize_1d >= 16); +} + +void av1_setup_tpl_buffers(AV1_PRIMARY *const ppi, + CommonModeInfoParams *const mi_params, int width, + int height, int byte_alignment, int lag_in_frames) { + SequenceHeader *const seq_params = &ppi->seq_params; + TplParams *const tpl_data = &ppi->tpl_data; + set_tpl_stats_block_size(&tpl_data->tpl_stats_block_mis_log2, + &tpl_data->tpl_bsize_1d); + const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2; + tpl_data->border_in_pixels = + ALIGN_POWER_OF_TWO(tpl_data->tpl_bsize_1d + 2 * AOM_INTERP_EXTEND, 5); + + const int alloc_y_plane_only = + ppi->cpi->sf.tpl_sf.use_y_only_rate_distortion ? 1 : 0; + for (int frame = 0; frame < MAX_LENGTH_TPL_FRAME_STATS; ++frame) { + const int mi_cols = + ALIGN_POWER_OF_TWO(mi_params->mi_cols, MAX_MIB_SIZE_LOG2); + const int mi_rows = + ALIGN_POWER_OF_TWO(mi_params->mi_rows, MAX_MIB_SIZE_LOG2); + TplDepFrame *tpl_frame = &tpl_data->tpl_stats_buffer[frame]; + tpl_frame->is_valid = 0; + tpl_frame->width = mi_cols >> block_mis_log2; + tpl_frame->height = mi_rows >> block_mis_log2; + tpl_frame->stride = tpl_data->tpl_stats_buffer[frame].width; + tpl_frame->mi_rows = mi_params->mi_rows; + tpl_frame->mi_cols = mi_params->mi_cols; + } + tpl_data->tpl_frame = &tpl_data->tpl_stats_buffer[REF_FRAMES + 1]; + + // If lag_in_frames <= 1, TPL module is not invoked. Hence dynamic memory + // allocations are avoided for buffers in tpl_data. 
+ if (lag_in_frames <= 1) return; + + AOM_CHECK_MEM_ERROR(&ppi->error, tpl_data->txfm_stats_list, + aom_calloc(MAX_LENGTH_TPL_FRAME_STATS, + sizeof(*tpl_data->txfm_stats_list))); + + for (int frame = 0; frame < lag_in_frames; ++frame) { + AOM_CHECK_MEM_ERROR( + &ppi->error, tpl_data->tpl_stats_pool[frame], + aom_calloc(tpl_data->tpl_stats_buffer[frame].width * + tpl_data->tpl_stats_buffer[frame].height, + sizeof(*tpl_data->tpl_stats_buffer[frame].tpl_stats_ptr))); + + if (aom_alloc_frame_buffer( + &tpl_data->tpl_rec_pool[frame], width, height, + seq_params->subsampling_x, seq_params->subsampling_y, + seq_params->use_highbitdepth, tpl_data->border_in_pixels, + byte_alignment, 0, alloc_y_plane_only)) + aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + } +} + +static AOM_INLINE int32_t tpl_get_satd_cost(BitDepthInfo bd_info, + int16_t *src_diff, int diff_stride, + const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + tran_low_t *coeff, int bw, int bh, + TX_SIZE tx_size) { + const int pix_num = bw * bh; + + av1_subtract_block(bd_info, bh, bw, src_diff, diff_stride, src, src_stride, + dst, dst_stride); + av1_quick_txfm(/*use_hadamard=*/0, tx_size, bd_info, src_diff, bw, coeff); + return aom_satd(coeff, pix_num); +} + +static int rate_estimator(const tran_low_t *qcoeff, int eob, TX_SIZE tx_size) { + const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT]; + + assert((1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]) >= eob); + int rate_cost = 1; + + for (int idx = 0; idx < eob; ++idx) { + unsigned int abs_level = abs(qcoeff[scan_order->scan[idx]]); + rate_cost += get_msb(abs_level + 1) + 1 + (abs_level > 0); + } + + return (rate_cost << AV1_PROB_COST_SHIFT); +} + +static AOM_INLINE void txfm_quant_rdcost( + const MACROBLOCK *x, int16_t *src_diff, int diff_stride, uint8_t *src, + int src_stride, uint8_t *dst, int dst_stride, tran_low_t *coeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, int bw, int bh, TX_SIZE tx_size, + int do_recon, int *rate_cost, int64_t *recon_error, int64_t *sse) { + const MACROBLOCKD *xd = &x->e_mbd; + const BitDepthInfo bd_info = get_bit_depth_info(xd); + uint16_t eob; + av1_subtract_block(bd_info, bh, bw, src_diff, diff_stride, src, src_stride, + dst, dst_stride); + av1_quick_txfm(/*use_hadamard=*/0, tx_size, bd_info, src_diff, bw, coeff); + + get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, &eob, recon_error, + sse); + + *rate_cost = rate_estimator(qcoeff, eob, tx_size); + + if (do_recon) + av1_inverse_transform_block(xd, dqcoeff, 0, DCT_DCT, tx_size, dst, + dst_stride, eob, 0); +} + +static uint32_t motion_estimation(AV1_COMP *cpi, MACROBLOCK *x, + uint8_t *cur_frame_buf, + uint8_t *ref_frame_buf, int stride, + int ref_stride, int width, int ref_width, + BLOCK_SIZE bsize, MV center_mv, + int_mv *best_mv) { + AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + TPL_SPEED_FEATURES *tpl_sf = &cpi->sf.tpl_sf; + int step_param; + uint32_t bestsme = UINT_MAX; + FULLPEL_MV_STATS best_mv_stats; + int distortion; + uint32_t sse; + int cost_list[5]; + FULLPEL_MV start_mv = get_fullmv_from_mv(&center_mv); + + // Setup frame pointers + x->plane[0].src.buf = cur_frame_buf; + x->plane[0].src.stride = stride; + x->plane[0].src.width = width; + xd->plane[0].pre[0].buf = ref_frame_buf; + xd->plane[0].pre[0].stride = ref_stride; + xd->plane[0].pre[0].width = ref_width; + + step_param = tpl_sf->reduce_first_step_size; + step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 2); +
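+ // Select the search-site configuration whose precomputed offsets were + // built for this reference stride; otherwise fall back to the lookahead + // configuration (see the stride check below).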
const search_site_config *search_site_cfg = + cpi->mv_search_params.search_site_cfg[SS_CFG_SRC]; + if (search_site_cfg->stride != ref_stride) + search_site_cfg = cpi->mv_search_params.search_site_cfg[SS_CFG_LOOKAHEAD]; + assert(search_site_cfg->stride == ref_stride); + + FULLPEL_MOTION_SEARCH_PARAMS full_ms_params; + av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &center_mv, + start_mv, search_site_cfg, + tpl_sf->search_method, + /*fine_search_interval=*/0); + + bestsme = av1_full_pixel_search(start_mv, &full_ms_params, step_param, + cond_cost_list(cpi, cost_list), + &best_mv->as_fullmv, &best_mv_stats, NULL); + + // When sub-pel motion search is skipped, populate sub-pel precision MV and + // return. + if (tpl_sf->subpel_force_stop == FULL_PEL) { + best_mv->as_mv = get_mv_from_fullmv(&best_mv->as_fullmv); + return bestsme; + } + + SUBPEL_MOTION_SEARCH_PARAMS ms_params; + av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &center_mv, + cost_list); + ms_params.forced_stop = tpl_sf->subpel_force_stop; + ms_params.var_params.subpel_search_type = USE_2_TAPS; + ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE; + best_mv_stats.err_cost = 0; + MV subpel_start_mv = get_mv_from_fullmv(&best_mv->as_fullmv); + assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv)); + bestsme = cpi->mv_search_params.find_fractional_mv_step( + xd, cm, &ms_params, subpel_start_mv, &best_mv_stats, &best_mv->as_mv, + &distortion, &sse, NULL); + + return bestsme; +} + +typedef struct { + int_mv mv; + int sad; +} center_mv_t; + +static int compare_sad(const void *a, const void *b) { + const int diff = ((center_mv_t *)a)->sad - ((center_mv_t *)b)->sad; + if (diff < 0) + return -1; + else if (diff > 0) + return 1; + return 0; +} + +static int is_alike_mv(int_mv candidate_mv, center_mv_t *center_mvs, + int center_mvs_count, int skip_alike_starting_mv) { + // MV difference threshold is in 1/8 precision. + const int mv_diff_thr[3] = { 1, (8 << 3), (16 << 3) }; + int thr = mv_diff_thr[skip_alike_starting_mv]; + int i; + + for (i = 0; i < center_mvs_count; i++) { + if (abs(center_mvs[i].mv.as_mv.col - candidate_mv.as_mv.col) < thr && + abs(center_mvs[i].mv.as_mv.row - candidate_mv.as_mv.row) < thr) + return 1; + } + + return 0; +} + +static void get_rate_distortion( + int *rate_cost, int64_t *recon_error, int64_t *pred_error, + int16_t *src_diff, tran_low_t *coeff, tran_low_t *qcoeff, + tran_low_t *dqcoeff, AV1_COMMON *cm, MACROBLOCK *x, + const YV12_BUFFER_CONFIG *ref_frame_ptr[2], uint8_t *rec_buffer_pool[3], + const int rec_stride_pool[3], TX_SIZE tx_size, PREDICTION_MODE best_mode, + int mi_row, int mi_col, int use_y_only_rate_distortion, int do_recon, + TplTxfmStats *tpl_txfm_stats) { + const SequenceHeader *seq_params = cm->seq_params; + *rate_cost = 0; + *recon_error = 1; + *pred_error = 1; + + (void)tpl_txfm_stats; + + MACROBLOCKD *xd = &x->e_mbd; + int is_compound = (best_mode == NEW_NEWMV); + int num_planes = use_y_only_rate_distortion ?
1 : MAX_MB_PLANE; + + uint8_t *src_buffer_pool[MAX_MB_PLANE] = { + xd->cur_buf->y_buffer, + xd->cur_buf->u_buffer, + xd->cur_buf->v_buffer, + }; + const int src_stride_pool[MAX_MB_PLANE] = { + xd->cur_buf->y_stride, + xd->cur_buf->uv_stride, + xd->cur_buf->uv_stride, + }; + + const int_interpfilters kernel = + av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + + for (int plane = 0; plane < num_planes; ++plane) { + struct macroblockd_plane *pd = &xd->plane[plane]; + BLOCK_SIZE bsize_plane = + av1_ss_size_lookup[txsize_to_bsize[tx_size]][pd->subsampling_x] + [pd->subsampling_y]; + + int dst_buffer_stride = rec_stride_pool[plane]; + int dst_mb_offset = + ((mi_row * MI_SIZE * dst_buffer_stride) >> pd->subsampling_y) + + ((mi_col * MI_SIZE) >> pd->subsampling_x); + uint8_t *dst_buffer = rec_buffer_pool[plane] + dst_mb_offset; + for (int ref = 0; ref < 1 + is_compound; ++ref) { + if (!is_inter_mode(best_mode)) { + av1_predict_intra_block( + xd, seq_params->sb_size, seq_params->enable_intra_edge_filter, + block_size_wide[bsize_plane], block_size_high[bsize_plane], + max_txsize_rect_lookup[bsize_plane], best_mode, 0, 0, + FILTER_INTRA_MODES, dst_buffer, dst_buffer_stride, dst_buffer, + dst_buffer_stride, 0, 0, plane); + } else { + int_mv best_mv = xd->mi[0]->mv[ref]; + uint8_t *ref_buffer_pool[MAX_MB_PLANE] = { + ref_frame_ptr[ref]->y_buffer, + ref_frame_ptr[ref]->u_buffer, + ref_frame_ptr[ref]->v_buffer, + }; + InterPredParams inter_pred_params; + struct buf_2d ref_buf = { + NULL, ref_buffer_pool[plane], + plane ? ref_frame_ptr[ref]->uv_width : ref_frame_ptr[ref]->y_width, + plane ? ref_frame_ptr[ref]->uv_height : ref_frame_ptr[ref]->y_height, + plane ? ref_frame_ptr[ref]->uv_stride : ref_frame_ptr[ref]->y_stride + }; + av1_init_inter_params(&inter_pred_params, block_size_wide[bsize_plane], + block_size_high[bsize_plane], + (mi_row * MI_SIZE) >> pd->subsampling_y, + (mi_col * MI_SIZE) >> pd->subsampling_x, + pd->subsampling_x, pd->subsampling_y, xd->bd, + is_cur_buf_hbd(xd), 0, + xd->block_ref_scale_factors[0], &ref_buf, kernel); + if (is_compound) av1_init_comp_mode(&inter_pred_params); + inter_pred_params.conv_params = get_conv_params_no_round( + ref, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd); + + av1_enc_build_one_inter_predictor(dst_buffer, dst_buffer_stride, + &best_mv.as_mv, &inter_pred_params); + } + } + + int src_stride = src_stride_pool[plane]; + int src_mb_offset = ((mi_row * MI_SIZE * src_stride) >> pd->subsampling_y) + + ((mi_col * MI_SIZE) >> pd->subsampling_x); + + int this_rate = 1; + int64_t this_recon_error = 1; + int64_t sse; + txfm_quant_rdcost( + x, src_diff, block_size_wide[bsize_plane], + src_buffer_pool[plane] + src_mb_offset, src_stride, dst_buffer, + dst_buffer_stride, coeff, qcoeff, dqcoeff, block_size_wide[bsize_plane], + block_size_high[bsize_plane], max_txsize_rect_lookup[bsize_plane], + do_recon, &this_rate, &this_recon_error, &sse); + +#if CONFIG_BITRATE_ACCURACY + if (plane == 0 && tpl_txfm_stats) { + // We only collect Y plane's transform coefficient + av1_record_tpl_txfm_block(tpl_txfm_stats, coeff); + } +#endif // CONFIG_BITRATE_ACCURACY + + *recon_error += this_recon_error; + *pred_error += sse; + *rate_cost += this_rate; + } +} + +static AOM_INLINE int32_t get_inter_cost(const AV1_COMP *cpi, MACROBLOCKD *xd, + const uint8_t *src_mb_buffer, + int src_stride, + TplBuffers *tpl_tmp_buffers, + BLOCK_SIZE bsize, TX_SIZE tx_size, + int mi_row, int mi_col, int rf_idx, + MV *rfidx_mv, int use_pred_sad) { + const BitDepthInfo bd_info = 
get_bit_depth_info(xd); + TplParams *tpl_data = &cpi->ppi->tpl_data; + const YV12_BUFFER_CONFIG *const ref_frame_ptr = + tpl_data->src_ref_frame[rf_idx]; + int16_t *src_diff = tpl_tmp_buffers->src_diff; + tran_low_t *coeff = tpl_tmp_buffers->coeff; + const int bw = 4 << mi_size_wide_log2[bsize]; + const int bh = 4 << mi_size_high_log2[bsize]; + int32_t inter_cost; + + if (cpi->sf.tpl_sf.subpel_force_stop != FULL_PEL) { + const int_interpfilters kernel = + av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + uint8_t *predictor8 = tpl_tmp_buffers->predictor8; + uint8_t *predictor = + is_cur_buf_hbd(xd) ? CONVERT_TO_BYTEPTR(predictor8) : predictor8; + struct buf_2d ref_buf = { NULL, ref_frame_ptr->y_buffer, + ref_frame_ptr->y_width, ref_frame_ptr->y_height, + ref_frame_ptr->y_stride }; + InterPredParams inter_pred_params; + av1_init_inter_params(&inter_pred_params, bw, bh, mi_row * MI_SIZE, + mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd), 0, + &tpl_data->sf, &ref_buf, kernel); + inter_pred_params.conv_params = get_conv_params(0, 0, xd->bd); + + av1_enc_build_one_inter_predictor(predictor, bw, rfidx_mv, + &inter_pred_params); + + if (use_pred_sad) { + inter_cost = (int)cpi->ppi->fn_ptr[bsize].sdf(src_mb_buffer, src_stride, + predictor, bw); + } else { + inter_cost = + tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride, + predictor, bw, coeff, bw, bh, tx_size); + } + } else { + int ref_mb_offset = + mi_row * MI_SIZE * ref_frame_ptr->y_stride + mi_col * MI_SIZE; + uint8_t *ref_mb = ref_frame_ptr->y_buffer + ref_mb_offset; + int ref_stride = ref_frame_ptr->y_stride; + const FULLPEL_MV fullmv = get_fullmv_from_mv(rfidx_mv); + // Since sub-pel motion search is not performed, use the prediction pixels + // directly from the reference block ref_mb + if (use_pred_sad) { + inter_cost = (int)cpi->ppi->fn_ptr[bsize].sdf( + src_mb_buffer, src_stride, + &ref_mb[fullmv.row * ref_stride + fullmv.col], ref_stride); + } else { + inter_cost = + tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride, + &ref_mb[fullmv.row * ref_stride + fullmv.col], + ref_stride, coeff, bw, bh, tx_size); + } + } + return inter_cost; +} + +static AOM_INLINE void mode_estimation(AV1_COMP *cpi, + TplTxfmStats *tpl_txfm_stats, + TplBuffers *tpl_tmp_buffers, + MACROBLOCK *x, int mi_row, int mi_col, + BLOCK_SIZE bsize, TX_SIZE tx_size, + TplDepStats *tpl_stats) { + AV1_COMMON *cm = &cpi->common; + const GF_GROUP *gf_group = &cpi->ppi->gf_group; + TPL_SPEED_FEATURES *tpl_sf = &cpi->sf.tpl_sf; + + (void)gf_group; + + MACROBLOCKD *xd = &x->e_mbd; + const BitDepthInfo bd_info = get_bit_depth_info(xd); + TplParams *tpl_data = &cpi->ppi->tpl_data; + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_data->frame_idx]; + const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2; + + const int bw = 4 << mi_size_wide_log2[bsize]; + const int bh = 4 << mi_size_high_log2[bsize]; + + int frame_offset = tpl_data->frame_idx - cpi->gf_frame_index; + + int32_t best_intra_cost = INT32_MAX; + int32_t intra_cost; + PREDICTION_MODE best_mode = DC_PRED; + + const int mb_y_offset = + mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; + uint8_t *src_mb_buffer = xd->cur_buf->y_buffer + mb_y_offset; + const int src_stride = xd->cur_buf->y_stride; + const int src_width = xd->cur_buf->y_width; + + int dst_mb_offset = + mi_row * MI_SIZE * tpl_frame->rec_picture->y_stride + mi_col * MI_SIZE; + uint8_t *dst_buffer = tpl_frame->rec_picture->y_buffer + dst_mb_offset; + int dst_buffer_stride = 
tpl_frame->rec_picture->y_stride; + int use_y_only_rate_distortion = tpl_sf->use_y_only_rate_distortion; + + uint8_t *rec_buffer_pool[3] = { + tpl_frame->rec_picture->y_buffer, + tpl_frame->rec_picture->u_buffer, + tpl_frame->rec_picture->v_buffer, + }; + + const int rec_stride_pool[3] = { + tpl_frame->rec_picture->y_stride, + tpl_frame->rec_picture->uv_stride, + tpl_frame->rec_picture->uv_stride, + }; + + for (int plane = 1; plane < MAX_MB_PLANE; ++plane) { + struct macroblockd_plane *pd = &xd->plane[plane]; + pd->subsampling_x = xd->cur_buf->subsampling_x; + pd->subsampling_y = xd->cur_buf->subsampling_y; + } + + uint8_t *predictor8 = tpl_tmp_buffers->predictor8; + int16_t *src_diff = tpl_tmp_buffers->src_diff; + tran_low_t *coeff = tpl_tmp_buffers->coeff; + tran_low_t *qcoeff = tpl_tmp_buffers->qcoeff; + tran_low_t *dqcoeff = tpl_tmp_buffers->dqcoeff; + uint8_t *predictor = + is_cur_buf_hbd(xd) ? CONVERT_TO_BYTEPTR(predictor8) : predictor8; + int64_t recon_error = 1; + int64_t pred_error = 1; + + memset(tpl_stats, 0, sizeof(*tpl_stats)); + tpl_stats->ref_frame_index[0] = -1; + tpl_stats->ref_frame_index[1] = -1; + + const int mi_width = mi_size_wide[bsize]; + const int mi_height = mi_size_high[bsize]; + set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd, + mi_row, mi_col); + set_mi_row_col(xd, &xd->tile, mi_row, mi_height, mi_col, mi_width, + cm->mi_params.mi_rows, cm->mi_params.mi_cols); + set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize], + av1_num_planes(cm)); + xd->mi[0]->bsize = bsize; + xd->mi[0]->motion_mode = SIMPLE_TRANSLATION; + + // Intra prediction search + xd->mi[0]->ref_frame[0] = INTRA_FRAME; + + // Pre-load the bottom left line. + if (xd->left_available && + mi_row + tx_size_high_unit[tx_size] < xd->tile.mi_row_end) { + if (is_cur_buf_hbd(xd)) { + uint16_t *dst = CONVERT_TO_SHORTPTR(dst_buffer); + for (int i = 0; i < bw; ++i) + dst[(bw + i) * dst_buffer_stride - 1] = + dst[(bw - 1) * dst_buffer_stride - 1]; + } else { + for (int i = 0; i < bw; ++i) + dst_buffer[(bw + i) * dst_buffer_stride - 1] = + dst_buffer[(bw - 1) * dst_buffer_stride - 1]; + } + } + + // if cpi->sf.tpl_sf.prune_intra_modes is on, then search only DC_PRED, + // H_PRED, and V_PRED + const PREDICTION_MODE last_intra_mode = + tpl_sf->prune_intra_modes ? D45_PRED : INTRA_MODE_END; + const SequenceHeader *seq_params = cm->seq_params; + for (PREDICTION_MODE mode = INTRA_MODE_START; mode < last_intra_mode; + ++mode) { + av1_predict_intra_block(xd, seq_params->sb_size, + seq_params->enable_intra_edge_filter, + block_size_wide[bsize], block_size_high[bsize], + tx_size, mode, 0, 0, FILTER_INTRA_MODES, dst_buffer, + dst_buffer_stride, predictor, bw, 0, 0, 0); + + if (tpl_frame->use_pred_sad) { + intra_cost = (int32_t)cpi->ppi->fn_ptr[bsize].sdf( + src_mb_buffer, src_stride, predictor, bw); + } else { + intra_cost = + tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride, + predictor, bw, coeff, bw, bh, tx_size); + } + + if (intra_cost < best_intra_cost) { + best_intra_cost = intra_cost; + best_mode = mode; + } + } + // Calculate SATD of the best intra mode if SAD was used for mode decision + // as best_intra_cost is used in ML model to skip intra mode evaluation. 
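/*
 * Sketch of the SAD-vs-SATD trade-off used above (hypothetical 4x4 helpers,
 * not the encoder's optimized fn_ptr/tpl_get_satd_cost paths). SAD sums
 * |residual| directly; SATD sums |Hadamard(residual)|, which tracks the
 * post-transform rate more closely, so the cheap SAD screen over all modes
 * is followed by a single SATD evaluation of the winner.
 */
#include <stdlib.h>

static int sad4x4(const unsigned char *src, int src_stride,
                  const unsigned char *pred, int pred_stride) {
  int sad = 0;
  for (int r = 0; r < 4; ++r)
    for (int c = 0; c < 4; ++c)
      sad += abs(src[r * src_stride + c] - pred[r * pred_stride + c]);
  return sad;
}

static int satd4x4(const unsigned char *src, int src_stride,
                   const unsigned char *pred, int pred_stride) {
  int d[16], t[16], satd = 0;
  for (int r = 0; r < 4; ++r)
    for (int c = 0; c < 4; ++c)
      d[r * 4 + c] = src[r * src_stride + c] - pred[r * pred_stride + c];
  for (int r = 0; r < 4; ++r) {  /* horizontal 4-point Hadamard butterflies */
    const int a = d[r * 4 + 0] + d[r * 4 + 1], b = d[r * 4 + 0] - d[r * 4 + 1];
    const int e = d[r * 4 + 2] + d[r * 4 + 3], f = d[r * 4 + 2] - d[r * 4 + 3];
    t[r * 4 + 0] = a + e;
    t[r * 4 + 1] = b + f;
    t[r * 4 + 2] = a - e;
    t[r * 4 + 3] = b - f;
  }
  for (int c = 0; c < 4; ++c) {  /* vertical pass, then sum magnitudes */
    const int a = t[0 * 4 + c] + t[1 * 4 + c], b = t[0 * 4 + c] - t[1 * 4 + c];
    const int e = t[2 * 4 + c] + t[3 * 4 + c], f = t[2 * 4 + c] - t[3 * 4 + c];
    satd += abs(a + e) + abs(b + f) + abs(a - e) + abs(b - f);
  }
  return satd;
}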
+ if (tpl_frame->use_pred_sad) { + av1_predict_intra_block( + xd, seq_params->sb_size, seq_params->enable_intra_edge_filter, + block_size_wide[bsize], block_size_high[bsize], tx_size, best_mode, 0, + 0, FILTER_INTRA_MODES, dst_buffer, dst_buffer_stride, predictor, bw, 0, + 0, 0); + best_intra_cost = + tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride, + predictor, bw, coeff, bw, bh, tx_size); + } + + int rate_cost = 1; + + if (cpi->use_ducky_encode) { + get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff, + qcoeff, dqcoeff, cm, x, NULL, rec_buffer_pool, + rec_stride_pool, tx_size, best_mode, mi_row, mi_col, + use_y_only_rate_distortion, 1 /*do_recon*/, NULL); + + tpl_stats->intra_dist = recon_error << TPL_DEP_COST_SCALE_LOG2; + tpl_stats->intra_sse = pred_error << TPL_DEP_COST_SCALE_LOG2; + tpl_stats->intra_rate = rate_cost; + } + + if (cpi->third_pass_ctx && + frame_offset < cpi->third_pass_ctx->frame_info_count && + tpl_data->frame_idx < gf_group->size) { + double ratio_h, ratio_w; + av1_get_third_pass_ratio(cpi->third_pass_ctx, frame_offset, cm->height, + cm->width, &ratio_h, &ratio_w); + THIRD_PASS_MI_INFO *this_mi = av1_get_third_pass_mi( + cpi->third_pass_ctx, frame_offset, mi_row, mi_col, ratio_h, ratio_w); + + PREDICTION_MODE third_pass_mode = this_mi->pred_mode; + + if (third_pass_mode >= last_intra_mode && + third_pass_mode < INTRA_MODE_END) { + av1_predict_intra_block( + xd, seq_params->sb_size, seq_params->enable_intra_edge_filter, + block_size_wide[bsize], block_size_high[bsize], tx_size, + third_pass_mode, 0, 0, FILTER_INTRA_MODES, dst_buffer, + dst_buffer_stride, predictor, bw, 0, 0, 0); + + intra_cost = + tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride, + predictor, bw, coeff, bw, bh, tx_size); + + if (intra_cost < best_intra_cost) { + best_intra_cost = intra_cost; + best_mode = third_pass_mode; + } + } + } + + // Motion compensated prediction + xd->mi[0]->ref_frame[0] = INTRA_FRAME; + xd->mi[0]->ref_frame[1] = NONE_FRAME; + xd->mi[0]->compound_idx = 1; + + int best_rf_idx = -1; + int_mv best_mv[2]; + int32_t inter_cost; + int32_t best_inter_cost = INT32_MAX; + int rf_idx; + int_mv single_mv[INTER_REFS_PER_FRAME]; + + best_mv[0].as_int = INVALID_MV; + best_mv[1].as_int = INVALID_MV; + + for (rf_idx = 0; rf_idx < INTER_REFS_PER_FRAME; ++rf_idx) { + single_mv[rf_idx].as_int = INVALID_MV; + if (tpl_data->ref_frame[rf_idx] == NULL || + tpl_data->src_ref_frame[rf_idx] == NULL) { + tpl_stats->mv[rf_idx].as_int = INVALID_MV; + continue; + } + + const YV12_BUFFER_CONFIG *ref_frame_ptr = tpl_data->src_ref_frame[rf_idx]; + const int ref_mb_offset = + mi_row * MI_SIZE * ref_frame_ptr->y_stride + mi_col * MI_SIZE; + uint8_t *ref_mb = ref_frame_ptr->y_buffer + ref_mb_offset; + const int ref_stride = ref_frame_ptr->y_stride; + const int ref_width = ref_frame_ptr->y_width; + + int_mv best_rfidx_mv = { 0 }; + uint32_t bestsme = UINT32_MAX; + + center_mv_t center_mvs[4] = { { { 0 }, INT_MAX }, + { { 0 }, INT_MAX }, + { { 0 }, INT_MAX }, + { { 0 }, INT_MAX } }; + int refmv_count = 1; + int idx; + + if (xd->up_available) { + TplDepStats *ref_tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos( + mi_row - mi_height, mi_col, tpl_frame->stride, block_mis_log2)]; + if (!is_alike_mv(ref_tpl_stats->mv[rf_idx], center_mvs, refmv_count, + tpl_sf->skip_alike_starting_mv)) { + center_mvs[refmv_count].mv.as_int = ref_tpl_stats->mv[rf_idx].as_int; + ++refmv_count; + } + } + + if (xd->left_available) { + TplDepStats *ref_tpl_stats = 
&tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos( + mi_row, mi_col - mi_width, tpl_frame->stride, block_mis_log2)]; + if (!is_alike_mv(ref_tpl_stats->mv[rf_idx], center_mvs, refmv_count, + tpl_sf->skip_alike_starting_mv)) { + center_mvs[refmv_count].mv.as_int = ref_tpl_stats->mv[rf_idx].as_int; + ++refmv_count; + } + } + + if (xd->up_available && mi_col + mi_width < xd->tile.mi_col_end) { + TplDepStats *ref_tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos( + mi_row - mi_height, mi_col + mi_width, tpl_frame->stride, + block_mis_log2)]; + if (!is_alike_mv(ref_tpl_stats->mv[rf_idx], center_mvs, refmv_count, + tpl_sf->skip_alike_starting_mv)) { + center_mvs[refmv_count].mv.as_int = ref_tpl_stats->mv[rf_idx].as_int; + ++refmv_count; + } + } + + if (cpi->third_pass_ctx && + frame_offset < cpi->third_pass_ctx->frame_info_count && + tpl_data->frame_idx < gf_group->size) { + double ratio_h, ratio_w; + av1_get_third_pass_ratio(cpi->third_pass_ctx, frame_offset, cm->height, + cm->width, &ratio_h, &ratio_w); + THIRD_PASS_MI_INFO *this_mi = av1_get_third_pass_mi( + cpi->third_pass_ctx, frame_offset, mi_row, mi_col, ratio_h, ratio_w); + + int_mv tp_mv = av1_get_third_pass_adjusted_mv(this_mi, ratio_h, ratio_w, + rf_idx + LAST_FRAME); + if (tp_mv.as_int != INVALID_MV && + !is_alike_mv(tp_mv, center_mvs + 1, refmv_count - 1, + tpl_sf->skip_alike_starting_mv)) { + center_mvs[0].mv = tp_mv; + } + } + + // Prune starting mvs + if (tpl_sf->prune_starting_mv && refmv_count > 1) { + // Get each center mv's sad. + for (idx = 0; idx < refmv_count; ++idx) { + FULLPEL_MV mv = get_fullmv_from_mv(&center_mvs[idx].mv.as_mv); + clamp_fullmv(&mv, &x->mv_limits); + center_mvs[idx].sad = (int)cpi->ppi->fn_ptr[bsize].sdf( + src_mb_buffer, src_stride, &ref_mb[mv.row * ref_stride + mv.col], + ref_stride); + } + + // Rank center_mv using sad. + qsort(center_mvs, refmv_count, sizeof(center_mvs[0]), compare_sad); + + refmv_count = AOMMIN(4 - tpl_sf->prune_starting_mv, refmv_count); + // Further reduce number of refmv based on sad difference. + if (refmv_count > 1) { + int last_sad = center_mvs[refmv_count - 1].sad; + int second_to_last_sad = center_mvs[refmv_count - 2].sad; + if ((last_sad - second_to_last_sad) * 5 > second_to_last_sad) + refmv_count--; + } + } + + for (idx = 0; idx < refmv_count; ++idx) { + int_mv this_mv; + uint32_t thissme = motion_estimation( + cpi, x, src_mb_buffer, ref_mb, src_stride, ref_stride, src_width, + ref_width, bsize, center_mvs[idx].mv.as_mv, &this_mv); + + if (thissme < bestsme) { + bestsme = thissme; + best_rfidx_mv = this_mv; + } + } + + tpl_stats->mv[rf_idx].as_int = best_rfidx_mv.as_int; + single_mv[rf_idx] = best_rfidx_mv; + + inter_cost = get_inter_cost( + cpi, xd, src_mb_buffer, src_stride, tpl_tmp_buffers, bsize, tx_size, + mi_row, mi_col, rf_idx, &best_rfidx_mv.as_mv, tpl_frame->use_pred_sad); + // Store inter cost for each ref frame. This is used to prune inter modes. + tpl_stats->pred_error[rf_idx] = AOMMAX(1, inter_cost); + + if (inter_cost < best_inter_cost) { + best_rf_idx = rf_idx; + + best_inter_cost = inter_cost; + best_mv[0].as_int = best_rfidx_mv.as_int; + } + } + // Calculate SATD of the best inter mode if SAD was used for mode decision + // as best_inter_cost is used in ML model to skip intra mode evaluation.
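/*
 * Worked example of the starting-MV pruning above, with hypothetical SADs.
 * Say prune_starting_mv == 1 and the sorted candidate SADs are
 * { 900, 1000, 1300, 1400 }:
 *   refmv_count = AOMMIN(4 - 1, 4) = 3      -> keep { 900, 1000, 1300 }
 *   (1300 - 1000) * 5 = 1500 > 1000         -> worst is >20% off, drop it
 *   refmv_count = 2                         -> full-pel search runs twice
 * The gap test in isolation:
 */
static int prune_by_sad_gap(const int *sorted_sads, int count) {
  /* Drop the worst candidate when it is more than 20% worse than the second
   * worst, mirroring (last - second_to_last) * 5 > second_to_last above. */
  if (count > 1 && (sorted_sads[count - 1] - sorted_sads[count - 2]) * 5 >
                       sorted_sads[count - 2])
    --count;
  return count;
}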
+ if (best_inter_cost < INT32_MAX && tpl_frame->use_pred_sad) { + assert(best_rf_idx != -1); + best_inter_cost = get_inter_cost( + cpi, xd, src_mb_buffer, src_stride, tpl_tmp_buffers, bsize, tx_size, + mi_row, mi_col, best_rf_idx, &best_mv[0].as_mv, 0 /* use_pred_sad */); + } + + if (best_rf_idx != -1 && best_inter_cost < best_intra_cost) { + best_mode = NEWMV; + xd->mi[0]->ref_frame[0] = best_rf_idx + LAST_FRAME; + xd->mi[0]->mv[0].as_int = best_mv[0].as_int; + } + + // Start compound prediction search. + int comp_ref_frames[3][2] = { + { 0, 4 }, + { 0, 6 }, + { 3, 6 }, + }; + + int start_rf = 0; + int end_rf = 3; + if (!tpl_sf->allow_compound_pred) end_rf = 0; + if (cpi->third_pass_ctx && + frame_offset < cpi->third_pass_ctx->frame_info_count && + tpl_data->frame_idx < gf_group->size) { + double ratio_h, ratio_w; + av1_get_third_pass_ratio(cpi->third_pass_ctx, frame_offset, cm->height, + cm->width, &ratio_h, &ratio_w); + THIRD_PASS_MI_INFO *this_mi = av1_get_third_pass_mi( + cpi->third_pass_ctx, frame_offset, mi_row, mi_col, ratio_h, ratio_w); + + if (this_mi->ref_frame[0] >= LAST_FRAME && + this_mi->ref_frame[1] >= LAST_FRAME) { + int found = 0; + for (int i = 0; i < 3; i++) { + if (comp_ref_frames[i][0] + LAST_FRAME == this_mi->ref_frame[0] && + comp_ref_frames[i][1] + LAST_FRAME == this_mi->ref_frame[1]) { + found = 1; + break; + } + } + if (!found || !tpl_sf->allow_compound_pred) { + comp_ref_frames[2][0] = this_mi->ref_frame[0] - LAST_FRAME; + comp_ref_frames[2][1] = this_mi->ref_frame[1] - LAST_FRAME; + if (!tpl_sf->allow_compound_pred) { + start_rf = 2; + end_rf = 3; + } + } + } + } + + xd->mi_row = mi_row; + xd->mi_col = mi_col; + int best_cmp_rf_idx = -1; + const int_interpfilters kernel = + av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + for (int cmp_rf_idx = start_rf; cmp_rf_idx < end_rf; ++cmp_rf_idx) { + int rf_idx0 = comp_ref_frames[cmp_rf_idx][0]; + int rf_idx1 = comp_ref_frames[cmp_rf_idx][1]; + + if (tpl_data->ref_frame[rf_idx0] == NULL || + tpl_data->src_ref_frame[rf_idx0] == NULL || + tpl_data->ref_frame[rf_idx1] == NULL || + tpl_data->src_ref_frame[rf_idx1] == NULL) { + continue; + } + + const YV12_BUFFER_CONFIG *ref_frame_ptr[2] = { + tpl_data->src_ref_frame[rf_idx0], + tpl_data->src_ref_frame[rf_idx1], + }; + + xd->mi[0]->ref_frame[0] = rf_idx0 + LAST_FRAME; + xd->mi[0]->ref_frame[1] = rf_idx1 + LAST_FRAME; + xd->mi[0]->mode = NEW_NEWMV; + const int8_t ref_frame_type = av1_ref_frame_type(xd->mi[0]->ref_frame); + // Set up ref_mv for av1_joint_motion_search().
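/*
 * For reference, assuming the standard AV1 ordering (LAST_FRAME = 1 ...
 * ALTREF_FRAME = 7), the comp_ref_frames[][] entries above are offsets from
 * LAST_FRAME, so the three candidate pairs are:
 *   { 0, 4 } -> { LAST_FRAME,   BWDREF_FRAME }
 *   { 0, 6 } -> { LAST_FRAME,   ALTREF_FRAME }
 *   { 3, 6 } -> { GOLDEN_FRAME, ALTREF_FRAME }
 * Each joint search below is seeded with the two single-reference MVs found
 * earlier; a hypothetical helper showing just that seeding:
 */
static void seed_joint_search(int_mv *tmp_mv, const int_mv *single_mv,
                              int rf_idx0, int rf_idx1) {
  tmp_mv[0] = single_mv[rf_idx0];  /* start point for the first reference */
  tmp_mv[1] = single_mv[rf_idx1];  /* start point for the second reference */
}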
+ CANDIDATE_MV *this_ref_mv_stack = x->mbmi_ext.ref_mv_stack[ref_frame_type]; + this_ref_mv_stack[xd->mi[0]->ref_mv_idx].this_mv = single_mv[rf_idx0]; + this_ref_mv_stack[xd->mi[0]->ref_mv_idx].comp_mv = single_mv[rf_idx1]; + + struct buf_2d yv12_mb[2][MAX_MB_PLANE]; + for (int i = 0; i < 2; ++i) { + av1_setup_pred_block(xd, yv12_mb[i], ref_frame_ptr[i], + xd->block_ref_scale_factors[i], + xd->block_ref_scale_factors[i], MAX_MB_PLANE); + for (int plane = 0; plane < MAX_MB_PLANE; ++plane) { + xd->plane[plane].pre[i] = yv12_mb[i][plane]; + } + } + + int_mv tmp_mv[2] = { single_mv[rf_idx0], single_mv[rf_idx1] }; + int rate_mv; + av1_joint_motion_search(cpi, x, bsize, tmp_mv, NULL, 0, &rate_mv, + !cpi->sf.mv_sf.disable_second_mv, + NUM_JOINT_ME_REFINE_ITER); + + for (int ref = 0; ref < 2; ++ref) { + struct buf_2d ref_buf = { NULL, ref_frame_ptr[ref]->y_buffer, + ref_frame_ptr[ref]->y_width, + ref_frame_ptr[ref]->y_height, + ref_frame_ptr[ref]->y_stride }; + InterPredParams inter_pred_params; + av1_init_inter_params(&inter_pred_params, bw, bh, mi_row * MI_SIZE, + mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd), + 0, &tpl_data->sf, &ref_buf, kernel); + av1_init_comp_mode(&inter_pred_params); + + inter_pred_params.conv_params = get_conv_params_no_round( + ref, 0, xd->tmp_conv_dst, MAX_SB_SIZE, 1, xd->bd); + + av1_enc_build_one_inter_predictor(predictor, bw, &tmp_mv[ref].as_mv, + &inter_pred_params); + } + inter_cost = + tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride, + predictor, bw, coeff, bw, bh, tx_size); + if (inter_cost < best_inter_cost) { + best_cmp_rf_idx = cmp_rf_idx; + best_inter_cost = inter_cost; + best_mv[0] = tmp_mv[0]; + best_mv[1] = tmp_mv[1]; + } + } + + if (best_cmp_rf_idx != -1 && best_inter_cost < best_intra_cost) { + best_mode = NEW_NEWMV; + const int best_rf_idx0 = comp_ref_frames[best_cmp_rf_idx][0]; + const int best_rf_idx1 = comp_ref_frames[best_cmp_rf_idx][1]; + xd->mi[0]->ref_frame[0] = best_rf_idx0 + LAST_FRAME; + xd->mi[0]->ref_frame[1] = best_rf_idx1 + LAST_FRAME; + } + + if (best_inter_cost < INT32_MAX && is_inter_mode(best_mode)) { + xd->mi[0]->mv[0].as_int = best_mv[0].as_int; + xd->mi[0]->mv[1].as_int = best_mv[1].as_int; + const YV12_BUFFER_CONFIG *ref_frame_ptr[2] = { + best_cmp_rf_idx >= 0 + ? tpl_data->src_ref_frame[comp_ref_frames[best_cmp_rf_idx][0]] + : tpl_data->src_ref_frame[best_rf_idx], + best_cmp_rf_idx >= 0 + ? tpl_data->src_ref_frame[comp_ref_frames[best_cmp_rf_idx][1]] + : NULL, + }; + rate_cost = 1; + get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff, + qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool, + rec_stride_pool, tx_size, best_mode, mi_row, mi_col, + use_y_only_rate_distortion, 0 /*do_recon*/, NULL); + tpl_stats->srcrf_rate = rate_cost; + } + + best_intra_cost = AOMMAX(best_intra_cost, 1); + best_inter_cost = AOMMIN(best_intra_cost, best_inter_cost); + tpl_stats->inter_cost = best_inter_cost; + tpl_stats->intra_cost = best_intra_cost; + + tpl_stats->srcrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2; + tpl_stats->srcrf_sse = pred_error << TPL_DEP_COST_SCALE_LOG2; + + // Final encode + rate_cost = 0; + const YV12_BUFFER_CONFIG *ref_frame_ptr[2]; + + ref_frame_ptr[0] = + best_mode == NEW_NEWMV + ? tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][0]] + : best_rf_idx >= 0 ? tpl_data->ref_frame[best_rf_idx] + : NULL; + ref_frame_ptr[1] = + best_mode == NEW_NEWMV + ? 
tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][1]] + : NULL; + get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff, + qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool, + rec_stride_pool, tx_size, best_mode, mi_row, mi_col, + use_y_only_rate_distortion, 1 /*do_recon*/, + tpl_txfm_stats); + + tpl_stats->recrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2; + tpl_stats->recrf_sse = pred_error << TPL_DEP_COST_SCALE_LOG2; + tpl_stats->recrf_rate = rate_cost; + + if (!is_inter_mode(best_mode)) { + tpl_stats->srcrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2; + tpl_stats->srcrf_rate = rate_cost; + tpl_stats->srcrf_sse = pred_error << TPL_DEP_COST_SCALE_LOG2; + } + + tpl_stats->recrf_dist = AOMMAX(tpl_stats->srcrf_dist, tpl_stats->recrf_dist); + tpl_stats->recrf_rate = AOMMAX(tpl_stats->srcrf_rate, tpl_stats->recrf_rate); + + if (best_mode == NEW_NEWMV) { + ref_frame_ptr[0] = tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][0]]; + ref_frame_ptr[1] = + tpl_data->src_ref_frame[comp_ref_frames[best_cmp_rf_idx][1]]; + get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff, + qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool, + rec_stride_pool, tx_size, best_mode, mi_row, mi_col, + use_y_only_rate_distortion, 1 /*do_recon*/, NULL); + tpl_stats->cmp_recrf_dist[0] = recon_error << TPL_DEP_COST_SCALE_LOG2; + tpl_stats->cmp_recrf_rate[0] = rate_cost; + + tpl_stats->cmp_recrf_dist[0] = + AOMMAX(tpl_stats->srcrf_dist, tpl_stats->cmp_recrf_dist[0]); + tpl_stats->cmp_recrf_rate[0] = + AOMMAX(tpl_stats->srcrf_rate, tpl_stats->cmp_recrf_rate[0]); + + tpl_stats->cmp_recrf_dist[0] = + AOMMIN(tpl_stats->recrf_dist, tpl_stats->cmp_recrf_dist[0]); + tpl_stats->cmp_recrf_rate[0] = + AOMMIN(tpl_stats->recrf_rate, tpl_stats->cmp_recrf_rate[0]); + + rate_cost = 0; + ref_frame_ptr[0] = + tpl_data->src_ref_frame[comp_ref_frames[best_cmp_rf_idx][0]]; + ref_frame_ptr[1] = tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][1]]; + get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff, + qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool, + rec_stride_pool, tx_size, best_mode, mi_row, mi_col, + use_y_only_rate_distortion, 1 /*do_recon*/, NULL); + tpl_stats->cmp_recrf_dist[1] = recon_error << TPL_DEP_COST_SCALE_LOG2; + tpl_stats->cmp_recrf_rate[1] = rate_cost; + + tpl_stats->cmp_recrf_dist[1] = + AOMMAX(tpl_stats->srcrf_dist, tpl_stats->cmp_recrf_dist[1]); + tpl_stats->cmp_recrf_rate[1] = + AOMMAX(tpl_stats->srcrf_rate, tpl_stats->cmp_recrf_rate[1]); + + tpl_stats->cmp_recrf_dist[1] = + AOMMIN(tpl_stats->recrf_dist, tpl_stats->cmp_recrf_dist[1]); + tpl_stats->cmp_recrf_rate[1] = + AOMMIN(tpl_stats->recrf_rate, tpl_stats->cmp_recrf_rate[1]); + } + + if (best_mode == NEWMV) { + tpl_stats->mv[best_rf_idx] = best_mv[0]; + tpl_stats->ref_frame_index[0] = best_rf_idx; + tpl_stats->ref_frame_index[1] = NONE_FRAME; + } else if (best_mode == NEW_NEWMV) { + tpl_stats->ref_frame_index[0] = comp_ref_frames[best_cmp_rf_idx][0]; + tpl_stats->ref_frame_index[1] = comp_ref_frames[best_cmp_rf_idx][1]; + tpl_stats->mv[tpl_stats->ref_frame_index[0]] = best_mv[0]; + tpl_stats->mv[tpl_stats->ref_frame_index[1]] = best_mv[1]; + } + + for (int idy = 0; idy < mi_height; ++idy) { + for (int idx = 0; idx < mi_width; ++idx) { + if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > idx && + (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > idy) { + xd->mi[idx + idy * cm->mi_params.mi_stride] = xd->mi[0]; + } + } + } +} + +static int round_floor(int 
ref_pos, int bsize_pix) { + int round; + if (ref_pos < 0) + round = -(1 + (-ref_pos - 1) / bsize_pix); + else + round = ref_pos / bsize_pix; + + return round; +} + +int av1_get_overlap_area(int row_a, int col_a, int row_b, int col_b, int width, + int height) { + int min_row = AOMMAX(row_a, row_b); + int max_row = AOMMIN(row_a + height, row_b + height); + int min_col = AOMMAX(col_a, col_b); + int max_col = AOMMIN(col_a + width, col_b + width); + if (min_row < max_row && min_col < max_col) { + return (max_row - min_row) * (max_col - min_col); + } + return 0; +} + +int av1_tpl_ptr_pos(int mi_row, int mi_col, int stride, uint8_t right_shift) { + return (mi_row >> right_shift) * stride + (mi_col >> right_shift); +} + +int64_t av1_delta_rate_cost(int64_t delta_rate, int64_t recrf_dist, + int64_t srcrf_dist, int pix_num) { + double beta = (double)srcrf_dist / recrf_dist; + int64_t rate_cost = delta_rate; + + if (srcrf_dist <= 128) return rate_cost; + + double dr = + (double)(delta_rate >> (TPL_DEP_COST_SCALE_LOG2 + AV1_PROB_COST_SHIFT)) / + pix_num; + + double log_den = log(beta) / log(2.0) + 2.0 * dr; + + if (log_den > log(10.0) / log(2.0)) { + rate_cost = (int64_t)((log(1.0 / beta) * pix_num) / log(2.0) / 2.0); + rate_cost <<= (TPL_DEP_COST_SCALE_LOG2 + AV1_PROB_COST_SHIFT); + return rate_cost; + } + + double num = pow(2.0, log_den); + double den = num * beta + (1 - beta) * beta; + + rate_cost = (int64_t)((pix_num * log(num / den)) / log(2.0) / 2.0); + + rate_cost <<= (TPL_DEP_COST_SCALE_LOG2 + AV1_PROB_COST_SHIFT); + + return rate_cost; +} + +static AOM_INLINE void tpl_model_update_b(TplParams *const tpl_data, int mi_row, + int mi_col, const BLOCK_SIZE bsize, + int frame_idx, int ref) { + TplDepFrame *tpl_frame_ptr = &tpl_data->tpl_frame[frame_idx]; + TplDepStats *tpl_ptr = tpl_frame_ptr->tpl_stats_ptr; + TplDepFrame *tpl_frame = tpl_data->tpl_frame; + const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2; + TplDepStats *tpl_stats_ptr = &tpl_ptr[av1_tpl_ptr_pos( + mi_row, mi_col, tpl_frame->stride, block_mis_log2)]; + + int is_compound = tpl_stats_ptr->ref_frame_index[1] >= 0; + + if (tpl_stats_ptr->ref_frame_index[ref] < 0) return; + const int ref_frame_index = tpl_stats_ptr->ref_frame_index[ref]; + TplDepFrame *ref_tpl_frame = + &tpl_frame[tpl_frame[frame_idx].ref_map_index[ref_frame_index]]; + TplDepStats *ref_stats_ptr = ref_tpl_frame->tpl_stats_ptr; + + if (tpl_frame[frame_idx].ref_map_index[ref_frame_index] < 0) return; + + const FULLPEL_MV full_mv = + get_fullmv_from_mv(&tpl_stats_ptr->mv[ref_frame_index].as_mv); + const int ref_pos_row = mi_row * MI_SIZE + full_mv.row; + const int ref_pos_col = mi_col * MI_SIZE + full_mv.col; + + const int bw = 4 << mi_size_wide_log2[bsize]; + const int bh = 4 << mi_size_high_log2[bsize]; + const int mi_height = mi_size_high[bsize]; + const int mi_width = mi_size_wide[bsize]; + const int pix_num = bw * bh; + + // top-left on grid block location in pixel + int grid_pos_row_base = round_floor(ref_pos_row, bh) * bh; + int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw; + int block; + + int64_t srcrf_dist = is_compound ? tpl_stats_ptr->cmp_recrf_dist[!ref] + : tpl_stats_ptr->srcrf_dist; + int64_t srcrf_rate = + is_compound + ? 
(tpl_stats_ptr->cmp_recrf_rate[!ref] << TPL_DEP_COST_SCALE_LOG2) + : (tpl_stats_ptr->srcrf_rate << TPL_DEP_COST_SCALE_LOG2); + + int64_t cur_dep_dist = tpl_stats_ptr->recrf_dist - srcrf_dist; + int64_t mc_dep_dist = + (int64_t)(tpl_stats_ptr->mc_dep_dist * + ((double)(tpl_stats_ptr->recrf_dist - srcrf_dist) / + tpl_stats_ptr->recrf_dist)); + int64_t delta_rate = + (tpl_stats_ptr->recrf_rate << TPL_DEP_COST_SCALE_LOG2) - srcrf_rate; + int64_t mc_dep_rate = + av1_delta_rate_cost(tpl_stats_ptr->mc_dep_rate, tpl_stats_ptr->recrf_dist, + srcrf_dist, pix_num); + + for (block = 0; block < 4; ++block) { + int grid_pos_row = grid_pos_row_base + bh * (block >> 1); + int grid_pos_col = grid_pos_col_base + bw * (block & 0x01); + + if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE && + grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) { + int overlap_area = av1_get_overlap_area(grid_pos_row, grid_pos_col, + ref_pos_row, ref_pos_col, bw, bh); + int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height; + int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width; + assert((1 << block_mis_log2) == mi_height); + assert((1 << block_mis_log2) == mi_width); + TplDepStats *des_stats = &ref_stats_ptr[av1_tpl_ptr_pos( + ref_mi_row, ref_mi_col, ref_tpl_frame->stride, block_mis_log2)]; + des_stats->mc_dep_dist += + ((cur_dep_dist + mc_dep_dist) * overlap_area) / pix_num; + des_stats->mc_dep_rate += + ((delta_rate + mc_dep_rate) * overlap_area) / pix_num; + } + } +} + +static AOM_INLINE void tpl_model_update(TplParams *const tpl_data, int mi_row, + int mi_col, int frame_idx) { + const BLOCK_SIZE tpl_stats_block_size = + convert_length_to_bsize(MI_SIZE << tpl_data->tpl_stats_block_mis_log2); + tpl_model_update_b(tpl_data, mi_row, mi_col, tpl_stats_block_size, frame_idx, + 0); + tpl_model_update_b(tpl_data, mi_row, mi_col, tpl_stats_block_size, frame_idx, + 1); +} + +static AOM_INLINE void tpl_model_store(TplDepStats *tpl_stats_ptr, int mi_row, + int mi_col, int stride, + const TplDepStats *src_stats, + uint8_t block_mis_log2) { + int index = av1_tpl_ptr_pos(mi_row, mi_col, stride, block_mis_log2); + TplDepStats *tpl_ptr = &tpl_stats_ptr[index]; + *tpl_ptr = *src_stats; + tpl_ptr->intra_cost = AOMMAX(1, tpl_ptr->intra_cost); + tpl_ptr->inter_cost = AOMMAX(1, tpl_ptr->inter_cost); + tpl_ptr->srcrf_dist = AOMMAX(1, tpl_ptr->srcrf_dist); + tpl_ptr->srcrf_sse = AOMMAX(1, tpl_ptr->srcrf_sse); + tpl_ptr->recrf_dist = AOMMAX(1, tpl_ptr->recrf_dist); + tpl_ptr->srcrf_rate = AOMMAX(1, tpl_ptr->srcrf_rate); + tpl_ptr->recrf_rate = AOMMAX(1, tpl_ptr->recrf_rate); + tpl_ptr->cmp_recrf_dist[0] = AOMMAX(1, tpl_ptr->cmp_recrf_dist[0]); + tpl_ptr->cmp_recrf_dist[1] = AOMMAX(1, tpl_ptr->cmp_recrf_dist[1]); + tpl_ptr->cmp_recrf_rate[0] = AOMMAX(1, tpl_ptr->cmp_recrf_rate[0]); + tpl_ptr->cmp_recrf_rate[1] = AOMMAX(1, tpl_ptr->cmp_recrf_rate[1]); +} + +// Reset the ref and source frame pointers of tpl_data. +static AOM_INLINE void tpl_reset_src_ref_frames(TplParams *tpl_data) { + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + tpl_data->ref_frame[i] = NULL; + tpl_data->src_ref_frame[i] = NULL; + } +} + +static AOM_INLINE int get_gop_length(const GF_GROUP *gf_group) { + int gop_length = AOMMIN(gf_group->size, MAX_TPL_FRAME_IDX - 1); + return gop_length; +} + +// Initialize the mc_flow parameters used in computing tpl data. 
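/*
 * Worked example of the dependency splatting in tpl_model_update_b() above,
 * with hypothetical numbers and 16x16 stats blocks (bw = bh = 16). A block
 * whose full-pel MV lands it at ref_pos = (row 27, col 41) straddles four
 * grid blocks; round_floor() floors toward negative infinity (e.g.
 * round_floor(-5, 16) == -1), so positions left of or above the frame origin
 * still resolve to the correct grid cell. The four overlaps, 35 + 45 + 77 +
 * 99, partition the full 16 * 16 = 256 pixels:
 */
#include <assert.h>

static void overlap_demo(void) {
  const int bw = 16, bh = 16;
  const int ref_pos_row = 27, ref_pos_col = 41;
  const int row_base = round_floor(ref_pos_row, bh) * bh;  /* 16 */
  const int col_base = round_floor(ref_pos_col, bw) * bw;  /* 32 */
  int total = 0;
  for (int block = 0; block < 4; ++block) {
    const int grid_row = row_base + bh * (block >> 1);
    const int grid_col = col_base + bw * (block & 1);
    total += av1_get_overlap_area(grid_row, grid_col, ref_pos_row, ref_pos_col,
                                  bw, bh);
  }
  assert(total == bw * bh);
}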
+static AOM_INLINE void init_mc_flow_dispenser(AV1_COMP *cpi, int frame_idx, + int pframe_qindex) { + TplParams *const tpl_data = &cpi->ppi->tpl_data; + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx]; + const YV12_BUFFER_CONFIG *this_frame = tpl_frame->gf_picture; + const YV12_BUFFER_CONFIG *ref_frames_ordered[INTER_REFS_PER_FRAME]; + uint32_t ref_frame_display_indices[INTER_REFS_PER_FRAME]; + const GF_GROUP *gf_group = &cpi->ppi->gf_group; + TPL_SPEED_FEATURES *tpl_sf = &cpi->sf.tpl_sf; + int ref_pruning_enabled = is_frame_eligible_for_ref_pruning( + gf_group, cpi->sf.inter_sf.selective_ref_frame, + tpl_sf->prune_ref_frames_in_tpl, frame_idx); + int gop_length = get_gop_length(gf_group); + int ref_frame_flags; + AV1_COMMON *cm = &cpi->common; + int rdmult, idx; + ThreadData *td = &cpi->td; + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + TplTxfmStats *tpl_txfm_stats = &td->tpl_txfm_stats; + tpl_data->frame_idx = frame_idx; + tpl_reset_src_ref_frames(tpl_data); + av1_tile_init(&xd->tile, cm, 0, 0); + + const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100)); + const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6); + const FRAME_TYPE frame_type = cm->current_frame.frame_type; + + // Setup scaling factor + av1_setup_scale_factors_for_frame( + &tpl_data->sf, this_frame->y_crop_width, this_frame->y_crop_height, + this_frame->y_crop_width, this_frame->y_crop_height); + + xd->cur_buf = this_frame; + + for (idx = 0; idx < INTER_REFS_PER_FRAME; ++idx) { + TplDepFrame *tpl_ref_frame = + &tpl_data->tpl_frame[tpl_frame->ref_map_index[idx]]; + tpl_data->ref_frame[idx] = tpl_ref_frame->rec_picture; + tpl_data->src_ref_frame[idx] = tpl_ref_frame->gf_picture; + ref_frame_display_indices[idx] = tpl_ref_frame->frame_display_index; + } + + // Store the reference frames based on priority order + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + ref_frames_ordered[i] = + tpl_data->ref_frame[ref_frame_priority_order[i] - 1]; + } + + // Work out which reference frame slots may be used. + ref_frame_flags = + get_ref_frame_flags(&cpi->sf, is_one_pass_rt_params(cpi), + ref_frames_ordered, cpi->ext_flags.ref_frame_flags); + + enforce_max_ref_frames(cpi, &ref_frame_flags, ref_frame_display_indices, + tpl_frame->frame_display_index); + + // Prune reference frames + for (idx = 0; idx < INTER_REFS_PER_FRAME; ++idx) { + if ((ref_frame_flags & (1 << idx)) == 0) { + tpl_data->ref_frame[idx] = NULL; + } + } + + // Skip motion estimation w.r.t. reference frames which are not + // considered in RD search, using "selective_ref_frame" speed feature. + // The reference frame pruning is not enabled for frames beyond the gop + // length, as there are fewer reference frames and the reference frames + // differ from the frames considered during RD search. + if (ref_pruning_enabled && (frame_idx < gop_length)) { + for (idx = 0; idx < INTER_REFS_PER_FRAME; ++idx) { + const MV_REFERENCE_FRAME refs[2] = { idx + 1, NONE_FRAME }; + if (prune_ref_by_selective_ref_frame(cpi, NULL, refs, + ref_frame_display_indices)) { + tpl_data->ref_frame[idx] = NULL; + } + } + } + + // Make a temporary mbmi for tpl model + MB_MODE_INFO mbmi; + memset(&mbmi, 0, sizeof(mbmi)); + MB_MODE_INFO *mbmi_ptr = &mbmi; + xd->mi = &mbmi_ptr; + + xd->block_ref_scale_factors[0] = &tpl_data->sf; + xd->block_ref_scale_factors[1] = &tpl_data->sf; + + const int base_qindex = + cpi->use_ducky_encode ? gf_group->q_val[frame_idx] : pframe_qindex; + // Get rd multiplier set up. 
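/*
 * Context for the rdmult computation below (illustrative model only):
 * av1_compute_rd_mult() turns a qindex into the Lagrangian multiplier used
 * in rd cost = distortion + lambda * rate. As a rough approximation lambda
 * grows with the square of the quantizer step size, so a toy version looks
 * like the sketch here; the real function also folds in update type, layer
 * depth, boost index and frame type.
 */
static int toy_rdmult(double qstep) {
  const double c = 0.12;           /* arbitrary constant, illustration only */
  const int rdmult = (int)(c * qstep * qstep);
  return rdmult > 0 ? rdmult : 1;  /* same clamp as the rdmult < 1 check below */
}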
+ rdmult = (int)av1_compute_rd_mult( + base_qindex, cm->seq_params->bit_depth, + cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth, + boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets, + is_stat_consumption_stage(cpi)); + + if (rdmult < 1) rdmult = 1; + av1_set_error_per_bit(&x->errorperbit, rdmult); + av1_set_sad_per_bit(cpi, &x->sadperbit, base_qindex); + + tpl_frame->is_valid = 1; + + cm->quant_params.base_qindex = base_qindex; + av1_frame_init_quantizer(cpi); + + const BitDepthInfo bd_info = get_bit_depth_info(xd); + const FRAME_UPDATE_TYPE update_type = + gf_group->update_type[cpi->gf_frame_index]; + tpl_frame->base_rdmult = av1_compute_rd_mult_based_on_qindex( + bd_info.bit_depth, update_type, base_qindex) / + 6; + + if (cpi->use_ducky_encode) + tpl_frame->base_rdmult = gf_group->rdmult_val[frame_idx]; + + av1_init_tpl_txfm_stats(tpl_txfm_stats); + + // Initialize x->mbmi_ext when compound predictions are enabled. + if (tpl_sf->allow_compound_pred) av1_zero(x->mbmi_ext); + + // Set the pointer to null since mbmi is only allocated inside this function. + assert(xd->mi == &mbmi_ptr); + xd->mi = NULL; + + // Tpl module is called before the setting of speed features at frame level. + // Thus, turning off this speed feature for key frame is done here and not + // integrated into the speed feature setting itself. + const int layer_depth_th = (tpl_sf->use_sad_for_mode_decision == 1) ? 5 : 0; + tpl_frame->use_pred_sad = + tpl_sf->use_sad_for_mode_decision && + gf_group->update_type[cpi->gf_frame_index] != KF_UPDATE && + gf_group->layer_depth[frame_idx] >= layer_depth_th; +} + +// This function stores the motion estimation dependencies of all the blocks in +// a row +void av1_mc_flow_dispenser_row(AV1_COMP *cpi, TplTxfmStats *tpl_txfm_stats, + TplBuffers *tpl_tmp_buffers, MACROBLOCK *x, + int mi_row, BLOCK_SIZE bsize, TX_SIZE tx_size) { + AV1_COMMON *const cm = &cpi->common; + MultiThreadInfo *const mt_info = &cpi->mt_info; + AV1TplRowMultiThreadInfo *const tpl_row_mt = &mt_info->tpl_row_mt; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int mi_width = mi_size_wide[bsize]; + TplParams *const tpl_data = &cpi->ppi->tpl_data; + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_data->frame_idx]; + MACROBLOCKD *xd = &x->e_mbd; + + const int tplb_cols_in_tile = + ROUND_POWER_OF_TWO(mi_params->mi_cols, mi_size_wide_log2[bsize]); + const int tplb_row = ROUND_POWER_OF_TWO(mi_row, mi_size_high_log2[bsize]); + assert(mi_size_high[bsize] == (1 << tpl_data->tpl_stats_block_mis_log2)); + assert(mi_size_wide[bsize] == (1 << tpl_data->tpl_stats_block_mis_log2)); + + for (int mi_col = 0, tplb_col_in_tile = 0; mi_col < mi_params->mi_cols; + mi_col += mi_width, tplb_col_in_tile++) { + (*tpl_row_mt->sync_read_ptr)(&tpl_data->tpl_mt_sync, tplb_row, + tplb_col_in_tile); + +#if CONFIG_MULTITHREAD + if (mt_info->num_workers > 1) { + pthread_mutex_lock(tpl_row_mt->mutex_); + const bool tpl_mt_exit = tpl_row_mt->tpl_mt_exit; + pthread_mutex_unlock(tpl_row_mt->mutex_); + // Exit in case any worker has encountered an error. 
+ if (tpl_mt_exit) return; + } +#endif + + TplDepStats tpl_stats; + + // Motion estimation column boundary + av1_set_mv_col_limits(mi_params, &x->mv_limits, mi_col, mi_width, + tpl_data->border_in_pixels); + xd->mb_to_left_edge = -GET_MV_SUBPEL(mi_col * MI_SIZE); + xd->mb_to_right_edge = + GET_MV_SUBPEL(mi_params->mi_cols - mi_width - mi_col); + mode_estimation(cpi, tpl_txfm_stats, tpl_tmp_buffers, x, mi_row, mi_col, + bsize, tx_size, &tpl_stats); + + // Motion flow dependency dispenser. + tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, tpl_frame->stride, + &tpl_stats, tpl_data->tpl_stats_block_mis_log2); + (*tpl_row_mt->sync_write_ptr)(&tpl_data->tpl_mt_sync, tplb_row, + tplb_col_in_tile, tplb_cols_in_tile); + } +} + +static AOM_INLINE void mc_flow_dispenser(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + ThreadData *td = &cpi->td; + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + const BLOCK_SIZE bsize = + convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d); + const TX_SIZE tx_size = max_txsize_lookup[bsize]; + const int mi_height = mi_size_high[bsize]; + for (int mi_row = 0; mi_row < mi_params->mi_rows; mi_row += mi_height) { + // Motion estimation row boundary + av1_set_mv_row_limits(mi_params, &x->mv_limits, mi_row, mi_height, + cpi->ppi->tpl_data.border_in_pixels); + xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE); + xd->mb_to_bottom_edge = + GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE); + av1_mc_flow_dispenser_row(cpi, &td->tpl_txfm_stats, &td->tpl_tmp_buffers, x, + mi_row, bsize, tx_size); + } +} + +static void mc_flow_synthesizer(TplParams *tpl_data, int frame_idx, int mi_rows, + int mi_cols) { + if (!frame_idx) { + return; + } + const BLOCK_SIZE bsize = convert_length_to_bsize(tpl_data->tpl_bsize_1d); + const int mi_height = mi_size_high[bsize]; + const int mi_width = mi_size_wide[bsize]; + assert(mi_height == (1 << tpl_data->tpl_stats_block_mis_log2)); + assert(mi_width == (1 << tpl_data->tpl_stats_block_mis_log2)); + + for (int mi_row = 0; mi_row < mi_rows; mi_row += mi_height) { + for (int mi_col = 0; mi_col < mi_cols; mi_col += mi_width) { + tpl_model_update(tpl_data, mi_row, mi_col, frame_idx); + } + } +} + +static AOM_INLINE void init_gop_frames_for_tpl( + AV1_COMP *cpi, const EncodeFrameParams *const init_frame_params, + GF_GROUP *gf_group, int *tpl_group_frames, int *pframe_qindex) { + AV1_COMMON *cm = &cpi->common; + assert(cpi->gf_frame_index == 0); + *pframe_qindex = 0; + + RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]; + init_ref_map_pair(cpi, ref_frame_map_pairs); + + int remapped_ref_idx[REF_FRAMES]; + + EncodeFrameParams frame_params = *init_frame_params; + TplParams *const tpl_data = &cpi->ppi->tpl_data; + + int ref_picture_map[REF_FRAMES]; + + for (int i = 0; i < REF_FRAMES; ++i) { + if (frame_params.frame_type == KEY_FRAME) { + tpl_data->tpl_frame[-i - 1].gf_picture = NULL; + tpl_data->tpl_frame[-i - 1].rec_picture = NULL; + tpl_data->tpl_frame[-i - 1].frame_display_index = 0; + } else { + tpl_data->tpl_frame[-i - 1].gf_picture = &cm->ref_frame_map[i]->buf; + tpl_data->tpl_frame[-i - 1].rec_picture = &cm->ref_frame_map[i]->buf; + tpl_data->tpl_frame[-i - 1].frame_display_index = + cm->ref_frame_map[i]->display_order_hint; + } + + ref_picture_map[i] = -i - 1; + } + + *tpl_group_frames = 0; + + int gf_index; + int process_frame_count = 0; + const int gop_length = get_gop_length(gf_group); + + for (gf_index = 0; gf_index < gop_length; 
++gf_index) { + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_index]; + FRAME_UPDATE_TYPE frame_update_type = gf_group->update_type[gf_index]; + int lookahead_index = + gf_group->cur_frame_idx[gf_index] + gf_group->arf_src_offset[gf_index]; + frame_params.show_frame = frame_update_type != ARF_UPDATE && + frame_update_type != INTNL_ARF_UPDATE; + frame_params.show_existing_frame = + frame_update_type == INTNL_OVERLAY_UPDATE || + frame_update_type == OVERLAY_UPDATE; + frame_params.frame_type = gf_group->frame_type[gf_index]; + + if (frame_update_type == LF_UPDATE) + *pframe_qindex = gf_group->q_val[gf_index]; + + const struct lookahead_entry *buf = av1_lookahead_peek( + cpi->ppi->lookahead, lookahead_index, cpi->compressor_stage); + if (buf == NULL) break; + tpl_frame->gf_picture = &buf->img; + + // Use filtered frame buffer if available. This will make tpl stats more + // precise. + FRAME_DIFF frame_diff; + const YV12_BUFFER_CONFIG *tf_buf = + av1_tf_info_get_filtered_buf(&cpi->ppi->tf_info, gf_index, &frame_diff); + if (tf_buf != NULL) { + tpl_frame->gf_picture = tf_buf; + } + + // 'cm->current_frame.frame_number' is the display number + // of the current frame. + // 'lookahead_index' is frame offset within the gf group. + // 'lookahead_index + cm->current_frame.frame_number' + // is the display index of the frame. + tpl_frame->frame_display_index = + lookahead_index + cm->current_frame.frame_number; + assert(buf->display_idx == + cpi->frame_index_set.show_frame_count + lookahead_index); + + if (frame_update_type != OVERLAY_UPDATE && + frame_update_type != INTNL_OVERLAY_UPDATE) { + tpl_frame->rec_picture = &tpl_data->tpl_rec_pool[process_frame_count]; + tpl_frame->tpl_stats_ptr = tpl_data->tpl_stats_pool[process_frame_count]; + ++process_frame_count; + } + const int true_disp = (int)(tpl_frame->frame_display_index); + + av1_get_ref_frames(ref_frame_map_pairs, true_disp, cpi, gf_index, 0, + remapped_ref_idx); + + int refresh_mask = + av1_get_refresh_frame_flags(cpi, &frame_params, frame_update_type, + gf_index, true_disp, ref_frame_map_pairs); + + // Make the frames marked as is_frame_non_ref to non-reference frames. 
+ if (cpi->ppi->gf_group.is_frame_non_ref[gf_index]) refresh_mask = 0; + + int refresh_frame_map_index = av1_get_refresh_ref_frame_map(refresh_mask); + + if (refresh_frame_map_index < REF_FRAMES && + refresh_frame_map_index != INVALID_IDX) { + ref_frame_map_pairs[refresh_frame_map_index].disp_order = + AOMMAX(0, true_disp); + ref_frame_map_pairs[refresh_frame_map_index].pyr_level = + get_true_pyr_level(gf_group->layer_depth[gf_index], true_disp, + cpi->ppi->gf_group.max_layer_depth); + } + + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) + tpl_frame->ref_map_index[i - LAST_FRAME] = + ref_picture_map[remapped_ref_idx[i - LAST_FRAME]]; + + if (refresh_mask) ref_picture_map[refresh_frame_map_index] = gf_index; + + ++*tpl_group_frames; + } + + const int tpl_extend = cpi->oxcf.gf_cfg.lag_in_frames - MAX_GF_INTERVAL; + int extend_frame_count = 0; + int extend_frame_length = AOMMIN( + tpl_extend, cpi->rc.frames_to_key - cpi->ppi->p_rc.baseline_gf_interval); + + int frame_display_index = gf_group->cur_frame_idx[gop_length - 1] + + gf_group->arf_src_offset[gop_length - 1] + 1; + + for (; + gf_index < MAX_TPL_FRAME_IDX && extend_frame_count < extend_frame_length; + ++gf_index) { + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_index]; + FRAME_UPDATE_TYPE frame_update_type = LF_UPDATE; + frame_params.show_frame = frame_update_type != ARF_UPDATE && + frame_update_type != INTNL_ARF_UPDATE; + frame_params.show_existing_frame = + frame_update_type == INTNL_OVERLAY_UPDATE; + frame_params.frame_type = INTER_FRAME; + + int lookahead_index = frame_display_index; + struct lookahead_entry *buf = av1_lookahead_peek( + cpi->ppi->lookahead, lookahead_index, cpi->compressor_stage); + + if (buf == NULL) break; + + tpl_frame->gf_picture = &buf->img; + tpl_frame->rec_picture = &tpl_data->tpl_rec_pool[process_frame_count]; + tpl_frame->tpl_stats_ptr = tpl_data->tpl_stats_pool[process_frame_count]; + // 'cm->current_frame.frame_number' is the display number + // of the current frame. + // 'frame_display_index' is frame offset within the gf group. + // 'frame_display_index + cm->current_frame.frame_number' + // is the display index of the frame. + tpl_frame->frame_display_index = + frame_display_index + cm->current_frame.frame_number; + + ++process_frame_count; + + gf_group->update_type[gf_index] = LF_UPDATE; + +#if CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS + if (cpi->oxcf.pass == AOM_RC_SECOND_PASS) { + if (cpi->oxcf.rc_cfg.mode == AOM_Q) { + *pframe_qindex = cpi->oxcf.rc_cfg.cq_level; + } else if (cpi->oxcf.rc_cfg.mode == AOM_VBR) { + // TODO(angiebird): Find a more adaptive method to decide the pframe_qindex + // used to override the pframe_qindex in the second pass when bitrate + // accuracy is on. We found that setting this pframe_qindex makes the tpl + // stats more stable.
+ *pframe_qindex = 128; + } + } +#endif // CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS + gf_group->q_val[gf_index] = *pframe_qindex; + const int true_disp = (int)(tpl_frame->frame_display_index); + av1_get_ref_frames(ref_frame_map_pairs, true_disp, cpi, gf_index, 0, + remapped_ref_idx); + int refresh_mask = + av1_get_refresh_frame_flags(cpi, &frame_params, frame_update_type, + gf_index, true_disp, ref_frame_map_pairs); + int refresh_frame_map_index = av1_get_refresh_ref_frame_map(refresh_mask); + + if (refresh_frame_map_index < REF_FRAMES && + refresh_frame_map_index != INVALID_IDX) { + ref_frame_map_pairs[refresh_frame_map_index].disp_order = + AOMMAX(0, true_disp); + ref_frame_map_pairs[refresh_frame_map_index].pyr_level = + get_true_pyr_level(gf_group->layer_depth[gf_index], true_disp, + cpi->ppi->gf_group.max_layer_depth); + } + + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) + tpl_frame->ref_map_index[i - LAST_FRAME] = + ref_picture_map[remapped_ref_idx[i - LAST_FRAME]]; + + tpl_frame->ref_map_index[ALTREF_FRAME - LAST_FRAME] = -1; + tpl_frame->ref_map_index[LAST3_FRAME - LAST_FRAME] = -1; + tpl_frame->ref_map_index[BWDREF_FRAME - LAST_FRAME] = -1; + tpl_frame->ref_map_index[ALTREF2_FRAME - LAST_FRAME] = -1; + + if (refresh_mask) ref_picture_map[refresh_frame_map_index] = gf_index; + + ++*tpl_group_frames; + ++extend_frame_count; + ++frame_display_index; + } +} + +void av1_init_tpl_stats(TplParams *const tpl_data) { + tpl_data->ready = 0; + set_tpl_stats_block_size(&tpl_data->tpl_stats_block_mis_log2, + &tpl_data->tpl_bsize_1d); + for (int frame_idx = 0; frame_idx < MAX_LENGTH_TPL_FRAME_STATS; ++frame_idx) { + TplDepFrame *tpl_frame = &tpl_data->tpl_stats_buffer[frame_idx]; + tpl_frame->is_valid = 0; + } + for (int frame_idx = 0; frame_idx < MAX_LAG_BUFFERS; ++frame_idx) { + TplDepFrame *tpl_frame = &tpl_data->tpl_stats_buffer[frame_idx]; + if (tpl_data->tpl_stats_pool[frame_idx] == NULL) continue; + memset(tpl_data->tpl_stats_pool[frame_idx], 0, + tpl_frame->height * tpl_frame->width * + sizeof(*tpl_frame->tpl_stats_ptr)); + } +} + +int av1_tpl_stats_ready(const TplParams *tpl_data, int gf_frame_index) { + if (tpl_data->ready == 0) { + return 0; + } + if (gf_frame_index >= MAX_TPL_FRAME_IDX) { + // The sub-GOP length exceeds the TPL buffer capacity. + // Hence the TPL related functions are disabled hereafter. + return 0; + } + return tpl_data->tpl_frame[gf_frame_index].is_valid; +} + +static AOM_INLINE int eval_gop_length(double *beta, int gop_eval) { + switch (gop_eval) { + case 1: + // Allow larger GOP size if the base layer ARF has higher dependency + // factor than the intermediate ARF and both ARFs have reasonably high + // dependency factors. + return (beta[0] >= beta[1] + 0.7) && beta[0] > 3.0; + case 2: + if ((beta[0] >= beta[1] + 0.4) && beta[0] > 1.6) + return 1; // Don't shorten the gf interval + else if ((beta[0] < beta[1] + 0.1) || beta[0] <= 1.4) + return 0; // Shorten the gf interval + else + return 2; // Cannot decide the gf interval, so redo the + // tpl stats calculation. + case 3: return beta[0] > 1.1; + default: return 2; + } +} + +// TODO(jingning): Restructure av1_rc_pick_q_and_bounds() to narrow down +// the scope of input arguments. 
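/*
 * Worked example for eval_gop_length() above, with hypothetical importance
 * factors. beta[0] is the base-layer ARF's tpl dependency factor and beta[1]
 * the intermediate ARF's; gop_eval == 2 asks whether the gf interval should
 * be shortened.
 */
#include <assert.h>

static void eval_gop_length_demo(void) {
  double strong[2] = { 2.3, 1.7 };  /* 2.3 >= 1.7 + 0.4 and 2.3 > 1.6 */
  double weak[2] = { 1.5, 1.45 };   /* 1.5 < 1.45 + 0.1 */
  assert(eval_gop_length(strong, 2) == 1);  /* keep the long gf interval */
  assert(eval_gop_length(weak, 2) == 0);    /* shorten the gf interval */
  /* Anything in between, e.g. { 1.9, 1.7 }, returns 2: redo the tpl stats. */
}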
+void av1_tpl_preload_rc_estimate(AV1_COMP *cpi, + const EncodeFrameParams *const frame_params) { + AV1_COMMON *cm = &cpi->common; + GF_GROUP *gf_group = &cpi->ppi->gf_group; + int bottom_index, top_index; + if (cpi->use_ducky_encode) return; + + cm->current_frame.frame_type = frame_params->frame_type; + for (int gf_index = cpi->gf_frame_index; gf_index < gf_group->size; + ++gf_index) { + cm->current_frame.frame_type = gf_group->frame_type[gf_index]; + cm->show_frame = gf_group->update_type[gf_index] != ARF_UPDATE && + gf_group->update_type[gf_index] != INTNL_ARF_UPDATE; + gf_group->q_val[gf_index] = av1_rc_pick_q_and_bounds( + cpi, cm->width, cm->height, gf_index, &bottom_index, &top_index); + } +} + +static AOM_INLINE int skip_tpl_for_frame(const GF_GROUP *gf_group, + int frame_idx, int gop_eval, + int approx_gop_eval, + int reduce_num_frames) { + // When gop_eval is set to 2, tpl stats calculation is done for ARFs from base + // layer, (base+1) layer and (base+2) layer. When gop_eval is set to 3, + // tpl stats calculation is limited to ARFs from base layer and (base+1) + // layer. + const int num_arf_layers = (gop_eval == 2) ? 3 : 2; + const int gop_length = get_gop_length(gf_group); + + if (gf_group->update_type[frame_idx] == INTNL_OVERLAY_UPDATE || + gf_group->update_type[frame_idx] == OVERLAY_UPDATE) + return 1; + + // When approx_gop_eval = 1, skip tpl stats calculation for higher layer + // frames and for frames beyond gop length. + if (approx_gop_eval && (gf_group->layer_depth[frame_idx] > num_arf_layers || + frame_idx >= gop_length)) + return 1; + + if (reduce_num_frames && gf_group->update_type[frame_idx] == LF_UPDATE && + frame_idx < gop_length) + return 1; + + return 0; +} + +int av1_tpl_setup_stats(AV1_COMP *cpi, int gop_eval, + const EncodeFrameParams *const frame_params) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, av1_tpl_setup_stats_time); +#endif + assert(cpi->gf_frame_index == 0); + AV1_COMMON *cm = &cpi->common; + MultiThreadInfo *const mt_info = &cpi->mt_info; + AV1TplRowMultiThreadInfo *const tpl_row_mt = &mt_info->tpl_row_mt; + GF_GROUP *gf_group = &cpi->ppi->gf_group; + EncodeFrameParams this_frame_params = *frame_params; + TplParams *const tpl_data = &cpi->ppi->tpl_data; + int approx_gop_eval = (gop_eval > 1); + + if (cpi->superres_mode != AOM_SUPERRES_NONE) { + assert(cpi->superres_mode != AOM_SUPERRES_AUTO); + av1_init_tpl_stats(tpl_data); + return 0; + } + + cm->current_frame.frame_type = frame_params->frame_type; + for (int gf_index = cpi->gf_frame_index; gf_index < gf_group->size; + ++gf_index) { + cm->current_frame.frame_type = gf_group->frame_type[gf_index]; + av1_configure_buffer_updates(cpi, &this_frame_params.refresh_frame, + gf_group->update_type[gf_index], + gf_group->refbuf_state[gf_index], 0); + + memcpy(&cpi->refresh_frame, &this_frame_params.refresh_frame, + sizeof(cpi->refresh_frame)); + } + + int pframe_qindex; + int tpl_gf_group_frames; + init_gop_frames_for_tpl(cpi, frame_params, gf_group, &tpl_gf_group_frames, + &pframe_qindex); + + cpi->ppi->p_rc.base_layer_qp = pframe_qindex; + + av1_init_tpl_stats(tpl_data); + + TplBuffers *tpl_tmp_buffers = &cpi->td.tpl_tmp_buffers; + if (!tpl_alloc_temp_buffers(tpl_tmp_buffers, tpl_data->tpl_bsize_1d)) { + aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR, + "Error allocating tpl data"); + } + + tpl_row_mt->sync_read_ptr = av1_tpl_row_mt_sync_read_dummy; + tpl_row_mt->sync_write_ptr = av1_tpl_row_mt_sync_write_dummy; + + av1_setup_scale_factors_for_frame(&cm->sf_identity, cm->width, 
cm->height, + cm->width, cm->height); + + if (frame_params->frame_type == KEY_FRAME) { + av1_init_mv_probs(cm); + } + av1_fill_mv_costs(&cm->fc->nmvc, cm->features.cur_frame_force_integer_mv, + cm->features.allow_high_precision_mv, cpi->td.mb.mv_costs); + + const int num_planes = + cpi->sf.tpl_sf.use_y_only_rate_distortion ? 1 : av1_num_planes(cm); + // As the tpl module is called before the setting of speed features at frame + // level, turning off this speed feature for the first GF group of the + // key-frame interval is done here. + int reduce_num_frames = + cpi->sf.tpl_sf.reduce_num_frames && + gf_group->update_type[cpi->gf_frame_index] != KF_UPDATE && + gf_group->max_layer_depth > 2; + // TPL processing is skipped for frames of type LF_UPDATE when + // 'reduce_num_frames' is 1, which affects the r0 calculation. Thus, a factor + // to adjust r0 is used. The value of 1.6 corresponds to using ~60% of the + // frames in the gf group on average. + tpl_data->r0_adjust_factor = reduce_num_frames ? 1.6 : 1.0; + + // Backward propagation from tpl_group_frames to 1. + for (int frame_idx = cpi->gf_frame_index; frame_idx < tpl_gf_group_frames; + ++frame_idx) { + if (skip_tpl_for_frame(gf_group, frame_idx, gop_eval, approx_gop_eval, + reduce_num_frames)) + continue; + + init_mc_flow_dispenser(cpi, frame_idx, pframe_qindex); + if (mt_info->num_workers > 1) { + tpl_row_mt->sync_read_ptr = av1_tpl_row_mt_sync_read; + tpl_row_mt->sync_write_ptr = av1_tpl_row_mt_sync_write; + av1_mc_flow_dispenser_mt(cpi); + } else { + mc_flow_dispenser(cpi); + } +#if CONFIG_BITRATE_ACCURACY + av1_tpl_txfm_stats_update_abs_coeff_mean(&cpi->td.tpl_txfm_stats); + av1_tpl_store_txfm_stats(tpl_data, &cpi->td.tpl_txfm_stats, frame_idx); +#endif // CONFIG_BITRATE_ACCURACY +#if CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY + if (cpi->oxcf.pass == AOM_RC_THIRD_PASS) { + int frame_coding_idx = + av1_vbr_rc_frame_coding_idx(&cpi->vbr_rc_info, frame_idx); + rc_log_frame_stats(&cpi->rc_log, frame_coding_idx, + &cpi->td.tpl_txfm_stats); + } +#endif // CONFIG_RATECTRL_LOG + + aom_extend_frame_borders(tpl_data->tpl_frame[frame_idx].rec_picture, + num_planes); + } + + for (int frame_idx = tpl_gf_group_frames - 1; + frame_idx >= cpi->gf_frame_index; --frame_idx) { + if (skip_tpl_for_frame(gf_group, frame_idx, gop_eval, approx_gop_eval, + reduce_num_frames)) + continue; + + mc_flow_synthesizer(tpl_data, frame_idx, cm->mi_params.mi_rows, + cm->mi_params.mi_cols); + } + + av1_configure_buffer_updates(cpi, &this_frame_params.refresh_frame, + gf_group->update_type[cpi->gf_frame_index], + gf_group->update_type[cpi->gf_frame_index], 0); + cm->current_frame.frame_type = frame_params->frame_type; + cm->show_frame = frame_params->show_frame; + +#if CONFIG_COLLECT_COMPONENT_TIMING + // Record the time if the function returns.
+ if (cpi->common.tiles.large_scale || gf_group->max_layer_depth_allowed == 0 || + !gop_eval) + end_timing(cpi, av1_tpl_setup_stats_time); +#endif + + tpl_dealloc_temp_buffers(tpl_tmp_buffers); + + if (!approx_gop_eval) { + tpl_data->ready = 1; + } + if (cpi->common.tiles.large_scale) return 0; + if (gf_group->max_layer_depth_allowed == 0) return 1; + if (!gop_eval) return 0; + assert(gf_group->arf_index >= 0); + + double beta[2] = { 0.0 }; + const int frame_idx_0 = gf_group->arf_index; + const int frame_idx_1 = + AOMMIN(tpl_gf_group_frames - 1, gf_group->arf_index + 1); + beta[0] = av1_tpl_get_frame_importance(tpl_data, frame_idx_0); + beta[1] = av1_tpl_get_frame_importance(tpl_data, frame_idx_1); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, av1_tpl_setup_stats_time); +#endif + return eval_gop_length(beta, gop_eval); +} + +void av1_tpl_rdmult_setup(AV1_COMP *cpi) { + const AV1_COMMON *const cm = &cpi->common; + const int tpl_idx = cpi->gf_frame_index; + + assert( + IMPLIES(cpi->ppi->gf_group.size > 0, tpl_idx < cpi->ppi->gf_group.size)); + + TplParams *const tpl_data = &cpi->ppi->tpl_data; + const TplDepFrame *const tpl_frame = &tpl_data->tpl_frame[tpl_idx]; + + if (!tpl_frame->is_valid) return; + + const TplDepStats *const tpl_stats = tpl_frame->tpl_stats_ptr; + const int tpl_stride = tpl_frame->stride; + const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); + + const int block_size = BLOCK_16X16; + const int num_mi_w = mi_size_wide[block_size]; + const int num_mi_h = mi_size_high[block_size]; + const int num_cols = (mi_cols_sr + num_mi_w - 1) / num_mi_w; + const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h; + const double c = 1.2; + const int step = 1 << tpl_data->tpl_stats_block_mis_log2; + + // Loop through each 'block_size' X 'block_size' block. + for (int row = 0; row < num_rows; row++) { + for (int col = 0; col < num_cols; col++) { + double intra_cost = 0.0, mc_dep_cost = 0.0; + // Loop through each mi block. 
+ for (int mi_row = row * num_mi_h; mi_row < (row + 1) * num_mi_h; + mi_row += step) { + for (int mi_col = col * num_mi_w; mi_col < (col + 1) * num_mi_w; + mi_col += step) { + if (mi_row >= cm->mi_params.mi_rows || mi_col >= mi_cols_sr) continue; + const TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos( + mi_row, mi_col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)]; + int64_t mc_dep_delta = + RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate, + this_stats->mc_dep_dist); + intra_cost += (double)(this_stats->recrf_dist << RDDIV_BITS); + mc_dep_cost += + (double)(this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta; + } + } + const double rk = intra_cost / mc_dep_cost; + const int index = row * num_cols + col; + cpi->tpl_rdmult_scaling_factors[index] = rk / cpi->rd.r0 + c; + } + } +} + +void av1_tpl_rdmult_setup_sb(AV1_COMP *cpi, MACROBLOCK *const x, + BLOCK_SIZE sb_size, int mi_row, int mi_col) { + AV1_COMMON *const cm = &cpi->common; + GF_GROUP *gf_group = &cpi->ppi->gf_group; + assert(IMPLIES(cpi->ppi->gf_group.size > 0, + cpi->gf_frame_index < cpi->ppi->gf_group.size)); + const int tpl_idx = cpi->gf_frame_index; + + const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100)); + const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6); + const FRAME_TYPE frame_type = cm->current_frame.frame_type; + + if (tpl_idx >= MAX_TPL_FRAME_IDX) return; + TplDepFrame *tpl_frame = &cpi->ppi->tpl_data.tpl_frame[tpl_idx]; + if (!tpl_frame->is_valid) return; + if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) return; + if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return; + + const int mi_col_sr = + coded_to_superres_mi(mi_col, cm->superres_scale_denominator); + const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); + const int sb_mi_width_sr = coded_to_superres_mi( + mi_size_wide[sb_size], cm->superres_scale_denominator); + + const int bsize_base = BLOCK_16X16; + const int num_mi_w = mi_size_wide[bsize_base]; + const int num_mi_h = mi_size_high[bsize_base]; + const int num_cols = (mi_cols_sr + num_mi_w - 1) / num_mi_w; + const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h; + const int num_bcols = (sb_mi_width_sr + num_mi_w - 1) / num_mi_w; + const int num_brows = (mi_size_high[sb_size] + num_mi_h - 1) / num_mi_h; + int row, col; + + double base_block_count = 0.0; + double log_sum = 0.0; + + for (row = mi_row / num_mi_w; + row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) { + for (col = mi_col_sr / num_mi_h; + col < num_cols && col < mi_col_sr / num_mi_h + num_bcols; ++col) { + const int index = row * num_cols + col; + log_sum += log(cpi->tpl_rdmult_scaling_factors[index]); + base_block_count += 1.0; + } + } + + const CommonQuantParams *quant_params = &cm->quant_params; + + const int orig_qindex_rdmult = + quant_params->base_qindex + quant_params->y_dc_delta_q; + const int orig_rdmult = av1_compute_rd_mult( + orig_qindex_rdmult, cm->seq_params->bit_depth, + cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth, + boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets, + is_stat_consumption_stage(cpi)); + + const int new_qindex_rdmult = quant_params->base_qindex + + x->rdmult_delta_qindex + + quant_params->y_dc_delta_q; + const int new_rdmult = av1_compute_rd_mult( + new_qindex_rdmult, cm->seq_params->bit_depth, + cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth, + boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets, + is_stat_consumption_stage(cpi)); + + const double 
scaling_factor = (double)new_rdmult / (double)orig_rdmult; + + double scale_adj = log(scaling_factor) - log_sum / base_block_count; + scale_adj = exp_bounded(scale_adj); + + for (row = mi_row / num_mi_w; + row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) { + for (col = mi_col_sr / num_mi_h; + col < num_cols && col < mi_col_sr / num_mi_h + num_bcols; ++col) { + const int index = row * num_cols + col; + cpi->ppi->tpl_sb_rdmult_scaling_factors[index] = + scale_adj * cpi->tpl_rdmult_scaling_factors[index]; + } + } +} + +double av1_exponential_entropy(double q_step, double b) { + b = AOMMAX(b, TPL_EPSILON); + double z = fmax(exp_bounded(-q_step / b), TPL_EPSILON); + return -log2(1 - z) - z * log2(z) / (1 - z); +} + +double av1_laplace_entropy(double q_step, double b, double zero_bin_ratio) { + // zero bin's size is zero_bin_ratio * q_step + // non-zero bin's size is q_step + b = AOMMAX(b, TPL_EPSILON); + double z = fmax(exp_bounded(-zero_bin_ratio / 2 * q_step / b), TPL_EPSILON); + double h = av1_exponential_entropy(q_step, b); + double r = -(1 - z) * log2(1 - z) - z * log2(z) + z * (h + 1); + return r; +} + +double av1_laplace_estimate_frame_rate(int q_index, int block_count, + const double *abs_coeff_mean, + int coeff_num) { + double zero_bin_ratio = 2; + double dc_q_step = av1_dc_quant_QTX(q_index, 0, AOM_BITS_8) / 4.; + double ac_q_step = av1_ac_quant_QTX(q_index, 0, AOM_BITS_8) / 4.; + double est_rate = 0; + // dc coeff + est_rate += av1_laplace_entropy(dc_q_step, abs_coeff_mean[0], zero_bin_ratio); + // ac coeff + for (int i = 1; i < coeff_num; ++i) { + est_rate += + av1_laplace_entropy(ac_q_step, abs_coeff_mean[i], zero_bin_ratio); + } + est_rate *= block_count; + return est_rate; +} + +double av1_estimate_coeff_entropy(double q_step, double b, + double zero_bin_ratio, int qcoeff) { + b = AOMMAX(b, TPL_EPSILON); + int abs_qcoeff = abs(qcoeff); + double z0 = fmax(exp_bounded(-zero_bin_ratio / 2 * q_step / b), TPL_EPSILON); + if (abs_qcoeff == 0) { + double r = -log2(1 - z0); + return r; + } else { + double z = fmax(exp_bounded(-q_step / b), TPL_EPSILON); + double r = 1 - log2(z0) - log2(1 - z) - (abs_qcoeff - 1) * log2(z); + return r; + } +} + +double av1_estimate_txfm_block_entropy(int q_index, + const double *abs_coeff_mean, + int *qcoeff_arr, int coeff_num) { + double zero_bin_ratio = 2; + double dc_q_step = av1_dc_quant_QTX(q_index, 0, AOM_BITS_8) / 4.; + double ac_q_step = av1_ac_quant_QTX(q_index, 0, AOM_BITS_8) / 4.; + double est_rate = 0; + // dc coeff + est_rate += av1_estimate_coeff_entropy(dc_q_step, abs_coeff_mean[0], + zero_bin_ratio, qcoeff_arr[0]); + // ac coeff + for (int i = 1; i < coeff_num; ++i) { + est_rate += av1_estimate_coeff_entropy(ac_q_step, abs_coeff_mean[i], + zero_bin_ratio, qcoeff_arr[i]); + } + return est_rate; +} + +#if CONFIG_RD_COMMAND +void av1_read_rd_command(const char *filepath, RD_COMMAND *rd_command) { + FILE *fptr = fopen(filepath, "r"); + fscanf(fptr, "%d", &rd_command->frame_count); + rd_command->frame_index = 0; + for (int i = 0; i < rd_command->frame_count; ++i) { + int option; + fscanf(fptr, "%d", &option); + rd_command->option_ls[i] = (RD_OPTION)option; + if (option == RD_OPTION_SET_Q) { + fscanf(fptr, "%d", &rd_command->q_index_ls[i]); + } else if (option == RD_OPTION_SET_Q_RDMULT) { + fscanf(fptr, "%d", &rd_command->q_index_ls[i]); + fscanf(fptr, "%d", &rd_command->rdmult_ls[i]); + } + } + fclose(fptr); +} +#endif // CONFIG_RD_COMMAND + +double av1_tpl_get_frame_importance(const TplParams *tpl_data, + int gf_frame_index) { + 
const TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_frame_index];
+  const TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+
+  const int tpl_stride = tpl_frame->stride;
+  double intra_cost_base = 0;
+  double mc_dep_cost_base = 0;
+  double cbcmp_base = 1;
+  const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
+
+  for (int row = 0; row < tpl_frame->mi_rows; row += step) {
+    for (int col = 0; col < tpl_frame->mi_cols; col += step) {
+      const TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
+          row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
+      double cbcmp = (double)this_stats->srcrf_dist;
+      const int64_t mc_dep_delta =
+          RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+                 this_stats->mc_dep_dist);
+      double dist_scaled = (double)(this_stats->recrf_dist << RDDIV_BITS);
+      dist_scaled = AOMMAX(dist_scaled, 1);
+      intra_cost_base += log(dist_scaled) * cbcmp;
+      mc_dep_cost_base += log(dist_scaled + mc_dep_delta) * cbcmp;
+      cbcmp_base += cbcmp;
+    }
+  }
+  return exp((mc_dep_cost_base - intra_cost_base) / cbcmp_base);
+}
+
+double av1_tpl_get_qstep_ratio(const TplParams *tpl_data, int gf_frame_index) {
+  if (!av1_tpl_stats_ready(tpl_data, gf_frame_index)) {
+    return 1;
+  }
+  const double frame_importance =
+      av1_tpl_get_frame_importance(tpl_data, gf_frame_index);
+  return sqrt(1 / frame_importance);
+}
+
+int av1_get_q_index_from_qstep_ratio(int leaf_qindex, double qstep_ratio,
+                                     aom_bit_depth_t bit_depth) {
+  const double leaf_qstep = av1_dc_quant_QTX(leaf_qindex, 0, bit_depth);
+  const double target_qstep = leaf_qstep * qstep_ratio;
+  int qindex = leaf_qindex;
+  if (qstep_ratio < 1.0) {
+    for (qindex = leaf_qindex; qindex > 0; --qindex) {
+      const double qstep = av1_dc_quant_QTX(qindex, 0, bit_depth);
+      if (qstep <= target_qstep) break;
+    }
+  } else {
+    for (qindex = leaf_qindex; qindex <= MAXQ; ++qindex) {
+      const double qstep = av1_dc_quant_QTX(qindex, 0, bit_depth);
+      if (qstep >= target_qstep) break;
+    }
+  }
+  return qindex;
+}
+
+int av1_tpl_get_q_index(const TplParams *tpl_data, int gf_frame_index,
+                        int leaf_qindex, aom_bit_depth_t bit_depth) {
+  const double qstep_ratio = av1_tpl_get_qstep_ratio(tpl_data, gf_frame_index);
+  return av1_get_q_index_from_qstep_ratio(leaf_qindex, qstep_ratio, bit_depth);
+}
+
+#if CONFIG_BITRATE_ACCURACY
+void av1_vbr_rc_init(VBR_RATECTRL_INFO *vbr_rc_info, double total_bit_budget,
+                     int show_frame_count) {
+  av1_zero(*vbr_rc_info);
+  vbr_rc_info->ready = 0;
+  vbr_rc_info->total_bit_budget = total_bit_budget;
+  vbr_rc_info->show_frame_count = show_frame_count;
+  const double scale_factors[FRAME_UPDATE_TYPES] = { 0.94559, 0.94559, 1,
+                                                     0.94559, 1, 1,
+                                                     0.94559 };
+
+  // TODO(angiebird): Based on the previous code, only the scale factor 0.94559
+  // will be used in most of the cases with --limit=17. Figure out if the
+  // following scale factors work better.
+ // const double scale_factors[FRAME_UPDATE_TYPES] = { 0.94559, 0.12040, 1, + // 1.10199, 1, 1, + // 0.16393 }; + + const double mv_scale_factors[FRAME_UPDATE_TYPES] = { 3, 3, 3, 3, 3, 3, 3 }; + memcpy(vbr_rc_info->scale_factors, scale_factors, + sizeof(scale_factors[0]) * FRAME_UPDATE_TYPES); + memcpy(vbr_rc_info->mv_scale_factors, mv_scale_factors, + sizeof(mv_scale_factors[0]) * FRAME_UPDATE_TYPES); + + vbr_rc_reset_gop_data(vbr_rc_info); +#if CONFIG_THREE_PASS + // TODO(angiebird): Explain why we use -1 here + vbr_rc_info->cur_gop_idx = -1; + vbr_rc_info->gop_count = 0; + vbr_rc_info->total_frame_count = 0; +#endif // CONFIG_THREE_PASS +} + +#if CONFIG_THREE_PASS +int av1_vbr_rc_frame_coding_idx(const VBR_RATECTRL_INFO *vbr_rc_info, + int gf_frame_index) { + int gop_idx = vbr_rc_info->cur_gop_idx; + int gop_start_idx = vbr_rc_info->gop_start_idx_list[gop_idx]; + return gop_start_idx + gf_frame_index; +} + +void av1_vbr_rc_append_tpl_info(VBR_RATECTRL_INFO *vbr_rc_info, + const TPL_INFO *tpl_info) { + int gop_start_idx = vbr_rc_info->total_frame_count; + vbr_rc_info->gop_start_idx_list[vbr_rc_info->gop_count] = gop_start_idx; + vbr_rc_info->gop_length_list[vbr_rc_info->gop_count] = tpl_info->gf_length; + assert(gop_start_idx + tpl_info->gf_length <= VBR_RC_INFO_MAX_FRAMES); + for (int i = 0; i < tpl_info->gf_length; ++i) { + vbr_rc_info->txfm_stats_list[gop_start_idx + i] = + tpl_info->txfm_stats_list[i]; + vbr_rc_info->qstep_ratio_list[gop_start_idx + i] = + tpl_info->qstep_ratio_ls[i]; + vbr_rc_info->update_type_list[gop_start_idx + i] = + tpl_info->update_type_list[i]; + } + vbr_rc_info->total_frame_count += tpl_info->gf_length; + vbr_rc_info->gop_count++; +} +#endif // CONFIG_THREE_PASS + +void av1_vbr_rc_set_gop_bit_budget(VBR_RATECTRL_INFO *vbr_rc_info, + int gop_showframe_count) { + vbr_rc_info->gop_showframe_count = gop_showframe_count; + vbr_rc_info->gop_bit_budget = vbr_rc_info->total_bit_budget * + gop_showframe_count / + vbr_rc_info->show_frame_count; +} + +void av1_vbr_rc_compute_q_indices(int base_q_index, int frame_count, + const double *qstep_ratio_list, + aom_bit_depth_t bit_depth, + int *q_index_list) { + for (int i = 0; i < frame_count; ++i) { + q_index_list[i] = av1_get_q_index_from_qstep_ratio( + base_q_index, qstep_ratio_list[i], bit_depth); + } +} + +double av1_vbr_rc_info_estimate_gop_bitrate( + int base_q_index, aom_bit_depth_t bit_depth, + const double *update_type_scale_factors, int frame_count, + const FRAME_UPDATE_TYPE *update_type_list, const double *qstep_ratio_list, + const TplTxfmStats *stats_list, int *q_index_list, + double *estimated_bitrate_byframe) { + av1_vbr_rc_compute_q_indices(base_q_index, frame_count, qstep_ratio_list, + bit_depth, q_index_list); + double estimated_gop_bitrate = 0; + for (int frame_index = 0; frame_index < frame_count; frame_index++) { + const TplTxfmStats *frame_stats = &stats_list[frame_index]; + double frame_bitrate = 0; + if (frame_stats->ready) { + int q_index = q_index_list[frame_index]; + + frame_bitrate = av1_laplace_estimate_frame_rate( + q_index, frame_stats->txfm_block_count, frame_stats->abs_coeff_mean, + frame_stats->coeff_num); + } + FRAME_UPDATE_TYPE update_type = update_type_list[frame_index]; + estimated_gop_bitrate += + frame_bitrate * update_type_scale_factors[update_type]; + if (estimated_bitrate_byframe != NULL) { + estimated_bitrate_byframe[frame_index] = frame_bitrate; + } + } + return estimated_gop_bitrate; +} + +int av1_vbr_rc_info_estimate_base_q( + double bit_budget, aom_bit_depth_t bit_depth, + 
const double *update_type_scale_factors, int frame_count,
+    const FRAME_UPDATE_TYPE *update_type_list, const double *qstep_ratio_list,
+    const TplTxfmStats *stats_list, int *q_index_list,
+    double *estimated_bitrate_byframe) {
+  int q_max = 255;  // Maximum q value.
+  int q_min = 0;    // Minimum q value.
+  int q = (q_max + q_min) / 2;
+
+  double q_max_estimate = av1_vbr_rc_info_estimate_gop_bitrate(
+      q_max, bit_depth, update_type_scale_factors, frame_count,
+      update_type_list, qstep_ratio_list, stats_list, q_index_list,
+      estimated_bitrate_byframe);
+
+  double q_min_estimate = av1_vbr_rc_info_estimate_gop_bitrate(
+      q_min, bit_depth, update_type_scale_factors, frame_count,
+      update_type_list, qstep_ratio_list, stats_list, q_index_list,
+      estimated_bitrate_byframe);
+  while (q_min + 1 < q_max) {
+    double estimate = av1_vbr_rc_info_estimate_gop_bitrate(
+        q, bit_depth, update_type_scale_factors, frame_count, update_type_list,
+        qstep_ratio_list, stats_list, q_index_list, estimated_bitrate_byframe);
+    if (estimate > bit_budget) {
+      q_min = q;
+      q_min_estimate = estimate;
+    } else {
+      q_max = q;
+      q_max_estimate = estimate;
+    }
+    q = (q_max + q_min) / 2;
+  }
+  // Pick the estimate that lands closest to the budget.
+  if (fabs(q_max_estimate - bit_budget) < fabs(q_min_estimate - bit_budget)) {
+    q = q_max;
+  } else {
+    q = q_min;
+  }
+  // Update q_index_list and vbr_rc_info.
+  av1_vbr_rc_info_estimate_gop_bitrate(
+      q, bit_depth, update_type_scale_factors, frame_count, update_type_list,
+      qstep_ratio_list, stats_list, q_index_list, estimated_bitrate_byframe);
+  return q;
+}
+void av1_vbr_rc_update_q_index_list(VBR_RATECTRL_INFO *vbr_rc_info,
+                                    const TplParams *tpl_data,
+                                    const GF_GROUP *gf_group,
+                                    aom_bit_depth_t bit_depth) {
+  vbr_rc_info->q_index_list_ready = 1;
+  double gop_bit_budget = vbr_rc_info->gop_bit_budget;
+
+  for (int i = 0; i < gf_group->size; i++) {
+    vbr_rc_info->qstep_ratio_list[i] = av1_tpl_get_qstep_ratio(tpl_data, i);
+  }
+
+  double mv_bits = 0;
+  for (int i = 0; i < gf_group->size; i++) {
+    double frame_mv_bits = 0;
+    if (av1_tpl_stats_ready(tpl_data, i)) {
+      TplDepFrame *tpl_frame = &tpl_data->tpl_frame[i];
+      frame_mv_bits = av1_tpl_compute_frame_mv_entropy(
+          tpl_frame, tpl_data->tpl_stats_block_mis_log2);
+      FRAME_UPDATE_TYPE update_type = gf_group->update_type[i];
+      mv_bits += frame_mv_bits * vbr_rc_info->mv_scale_factors[update_type];
+    }
+  }
+
+  mv_bits = AOMMIN(mv_bits, 0.6 * gop_bit_budget);
+  gop_bit_budget -= mv_bits;
+
+  vbr_rc_info->base_q_index = av1_vbr_rc_info_estimate_base_q(
+      gop_bit_budget, bit_depth, vbr_rc_info->scale_factors, gf_group->size,
+      gf_group->update_type, vbr_rc_info->qstep_ratio_list,
+      tpl_data->txfm_stats_list, vbr_rc_info->q_index_list, NULL);
+}
+
+#endif  // CONFIG_BITRATE_ACCURACY
+
+// Use the upper and left neighbor blocks as the reference MVs.
+// Compute the minimum difference between current MV and reference MV.
+int_mv av1_compute_mv_difference(const TplDepFrame *tpl_frame, int row, int col,
+                                 int step, int tpl_stride, int right_shift) {
+  const TplDepStats *tpl_stats =
+      &tpl_frame
+           ->tpl_stats_ptr[av1_tpl_ptr_pos(row, col, tpl_stride, right_shift)];
+  int_mv current_mv = tpl_stats->mv[tpl_stats->ref_frame_index[0]];
+  int current_mv_magnitude =
+      abs(current_mv.as_mv.row) + abs(current_mv.as_mv.col);
+
+  // Retrieve the up and left neighbors.
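+  // The intent mirrors differential MV coding: when a neighbor predicts the
+  // current MV well, the (smaller) difference is returned instead of the raw
+  // MV, which lowers the entropy later estimated over these values.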
+  int up_error = INT_MAX;
+  int_mv up_mv_diff;
+  if (row - step >= 0) {
+    tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos(
+        row - step, col, tpl_stride, right_shift)];
+    up_mv_diff = tpl_stats->mv[tpl_stats->ref_frame_index[0]];
+    up_mv_diff.as_mv.row = current_mv.as_mv.row - up_mv_diff.as_mv.row;
+    up_mv_diff.as_mv.col = current_mv.as_mv.col - up_mv_diff.as_mv.col;
+    up_error = abs(up_mv_diff.as_mv.row) + abs(up_mv_diff.as_mv.col);
+  }
+
+  int left_error = INT_MAX;
+  int_mv left_mv_diff;
+  if (col - step >= 0) {
+    tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos(
+        row, col - step, tpl_stride, right_shift)];
+    left_mv_diff = tpl_stats->mv[tpl_stats->ref_frame_index[0]];
+    left_mv_diff.as_mv.row = current_mv.as_mv.row - left_mv_diff.as_mv.row;
+    left_mv_diff.as_mv.col = current_mv.as_mv.col - left_mv_diff.as_mv.col;
+    left_error = abs(left_mv_diff.as_mv.row) + abs(left_mv_diff.as_mv.col);
+  }
+
+  // Return the MV with the minimum distance from current.
+  if (up_error < left_error && up_error < current_mv_magnitude) {
+    return up_mv_diff;
+  } else if (left_error < up_error && left_error < current_mv_magnitude) {
+    return left_mv_diff;
+  }
+  return current_mv;
+}
+
+/* Compute the entropy of motion vectors for a single frame. */
+double av1_tpl_compute_frame_mv_entropy(const TplDepFrame *tpl_frame,
+                                        uint8_t right_shift) {
+  if (!tpl_frame->is_valid) {
+    return 0;
+  }
+
+  int count_row[500] = { 0 };
+  int count_col[500] = { 0 };
+  int n = 0;  // number of MVs to process
+
+  const int tpl_stride = tpl_frame->stride;
+  const int step = 1 << right_shift;
+
+  for (int row = 0; row < tpl_frame->mi_rows; row += step) {
+    for (int col = 0; col < tpl_frame->mi_cols; col += step) {
+      int_mv mv = av1_compute_mv_difference(tpl_frame, row, col, step,
+                                            tpl_stride, right_shift);
+      count_row[clamp(mv.as_mv.row, 0, 499)] += 1;
+      count_col[clamp(mv.as_mv.col, 0, 499)] += 1;
+      n += 1;
+    }
+  }
+
+  // Estimate the bits used using the entropy formula.
+  double rate_row = 0;
+  double rate_col = 0;
+  for (int i = 0; i < 500; i++) {
+    if (count_row[i] != 0) {
+      double p = count_row[i] / (double)n;
+      rate_row += count_row[i] * -log2(p);
+    }
+    if (count_col[i] != 0) {
+      double p = count_col[i] / (double)n;
+      rate_col += count_col[i] * -log2(p);
+    }
+  }
+
+  return rate_row + rate_col;
+}
diff --git a/third_party/aom/av1/encoder/tpl_model.h b/third_party/aom/av1/encoder/tpl_model.h
new file mode 100644
index 0000000000..bcd58216c5
--- /dev/null
+++ b/third_party/aom/av1/encoder/tpl_model.h
@@ -0,0 +1,794 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#ifndef AOM_AV1_ENCODER_TPL_MODEL_H_ +#define AOM_AV1_ENCODER_TPL_MODEL_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/*!\cond */ + +struct AV1_PRIMARY; +struct AV1_COMP; +struct AV1_SEQ_CODING_TOOLS; +struct EncodeFrameParams; +struct EncodeFrameInput; +struct GF_GROUP; +struct ThreadData; +struct TPL_INFO; + +#include "config/aom_config.h" + +#include "aom_scale/yv12config.h" + +#include "av1/common/mv.h" +#include "av1/common/scale.h" +#include "av1/encoder/block.h" +#include "av1/encoder/lookahead.h" +#include "av1/encoder/ratectrl.h" + +static INLINE BLOCK_SIZE convert_length_to_bsize(int length) { + switch (length) { + case 64: return BLOCK_64X64; + case 32: return BLOCK_32X32; + case 16: return BLOCK_16X16; + case 8: return BLOCK_8X8; + case 4: return BLOCK_4X4; + default: + assert(0 && "Invalid block size for tpl model"); + return BLOCK_16X16; + } +} + +typedef struct AV1TplRowMultiThreadSync { +#if CONFIG_MULTITHREAD + // Synchronization objects for top-right dependency. + pthread_mutex_t *mutex_; + pthread_cond_t *cond_; +#endif + // Buffer to store the macroblock whose encoding is complete. + // num_finished_cols[i] stores the number of macroblocks which finished + // encoding in the ith macroblock row. + int *num_finished_cols; + // Number of extra macroblocks of the top row to be complete for encoding + // of the current macroblock to start. A value of 1 indicates top-right + // dependency. + int sync_range; + // Number of macroblock rows. + int rows; + // Number of threads processing the current tile. + int num_threads_working; +} AV1TplRowMultiThreadSync; + +typedef struct AV1TplRowMultiThreadInfo { + // Initialized to false, set to true by the worker thread that encounters an + // error in order to abort the processing of other worker threads. + bool tpl_mt_exit; +#if CONFIG_MULTITHREAD + // Mutex lock object used for error handling. + pthread_mutex_t *mutex_; +#endif + // Row synchronization related function pointers. + void (*sync_read_ptr)(AV1TplRowMultiThreadSync *tpl_mt_sync, int r, int c); + void (*sync_write_ptr)(AV1TplRowMultiThreadSync *tpl_mt_sync, int r, int c, + int cols); +} AV1TplRowMultiThreadInfo; + +// TODO(jingning): This needs to be cleaned up next. + +// TPL stats buffers are prepared for every frame in the GOP, +// including (internal) overlays and (internal) arfs. +// In addition, frames in the lookahead that are outside of the GOP +// are also used. +// Thus it should use +// (gop_length) + (# overlays) + (MAX_LAG_BUFFERS - gop_len) = +// MAX_LAG_BUFFERS + (# overlays) +// 2 * MAX_LAG_BUFFERS is therefore a safe estimate. +// TODO(bohanli): test setting it to 1.5 * MAX_LAG_BUFFER +#define MAX_TPL_FRAME_IDX (2 * MAX_LAG_BUFFERS) +// The first REF_FRAMES + 1 buffers are reserved. 
+// tpl_data->tpl_frame starts after REF_FRAMES + 1 +#define MAX_LENGTH_TPL_FRAME_STATS (MAX_TPL_FRAME_IDX + REF_FRAMES + 1) +#define TPL_DEP_COST_SCALE_LOG2 4 + +#define TPL_EPSILON 0.0000001 + +typedef struct TplTxfmStats { + int ready; // Whether abs_coeff_mean is ready + double abs_coeff_sum[256]; // Assume we are using 16x16 transform block + double abs_coeff_mean[256]; + int txfm_block_count; + int coeff_num; +} TplTxfmStats; + +typedef struct { + uint8_t *predictor8; + int16_t *src_diff; + tran_low_t *coeff; + tran_low_t *qcoeff; + tran_low_t *dqcoeff; +} TplBuffers; + +typedef struct TplDepStats { + int64_t srcrf_sse; + int64_t srcrf_dist; + int64_t recrf_sse; + int64_t recrf_dist; + int64_t intra_sse; + int64_t intra_dist; + int64_t cmp_recrf_dist[2]; + int64_t mc_dep_rate; + int64_t mc_dep_dist; + int64_t pred_error[INTER_REFS_PER_FRAME]; + int32_t intra_cost; + int32_t inter_cost; + int32_t srcrf_rate; + int32_t recrf_rate; + int32_t intra_rate; + int32_t cmp_recrf_rate[2]; + int_mv mv[INTER_REFS_PER_FRAME]; + int8_t ref_frame_index[2]; +} TplDepStats; + +typedef struct TplDepFrame { + uint8_t is_valid; + TplDepStats *tpl_stats_ptr; + const YV12_BUFFER_CONFIG *gf_picture; + YV12_BUFFER_CONFIG *rec_picture; + int ref_map_index[REF_FRAMES]; + int stride; + int width; + int height; + int mi_rows; + int mi_cols; + int base_rdmult; + uint32_t frame_display_index; + // When set, SAD metric is used for intra and inter mode decision. + int use_pred_sad; +} TplDepFrame; + +/*!\endcond */ +/*! + * \brief Params related to temporal dependency model. + */ +typedef struct TplParams { + /*! + * Whether the tpl stats is ready. + */ + int ready; + + /*! + * Block granularity of tpl score storage. + */ + uint8_t tpl_stats_block_mis_log2; + + /*! + * Tpl motion estimation block 1d size. tpl_bsize_1d >= 16. + */ + uint8_t tpl_bsize_1d; + + /*! + * Buffer to store the frame level tpl information for each frame in a gf + * group. tpl_stats_buffer[i] stores the tpl information of ith frame in a gf + * group + */ + TplDepFrame tpl_stats_buffer[MAX_LENGTH_TPL_FRAME_STATS]; + + /*! + * Buffer to store tpl stats at block granularity. + * tpl_stats_pool[i][j] stores the tpl stats of jth block of ith frame in a gf + * group. + */ + TplDepStats *tpl_stats_pool[MAX_LAG_BUFFERS]; + + /*! + * Pointer to the buffer which stores tpl transform stats per frame. + * txfm_stats_list[i] stores the TplTxfmStats of the ith frame in a gf group. + * Memory is allocated dynamically for MAX_LENGTH_TPL_FRAME_STATS frames when + * tpl is enabled. + */ + TplTxfmStats *txfm_stats_list; + + /*! + * Buffer to store tpl reconstructed frame. + * tpl_rec_pool[i] stores the reconstructed frame of ith frame in a gf group. + */ + YV12_BUFFER_CONFIG tpl_rec_pool[MAX_LAG_BUFFERS]; + + /*! + * Pointer to tpl_stats_buffer. + */ + TplDepFrame *tpl_frame; + + /*! + * Scale factors for the current frame. + */ + struct scale_factors sf; + + /*! + * GF group index of the current frame. + */ + int frame_idx; + + /*! + * Array of pointers to the frame buffers holding the source frame. + * src_ref_frame[i] stores the pointer to the source frame of the ith + * reference frame type. + */ + const YV12_BUFFER_CONFIG *src_ref_frame[INTER_REFS_PER_FRAME]; + + /*! + * Array of pointers to the frame buffers holding the tpl reconstructed frame. + * ref_frame[i] stores the pointer to the tpl reconstructed frame of the ith + * reference frame type. + */ + const YV12_BUFFER_CONFIG *ref_frame[INTER_REFS_PER_FRAME]; + + /*! 
+   * Parameters related to synchronization for top-right dependency in row
+   * based multi-threading of tpl
+   */
+  AV1TplRowMultiThreadSync tpl_mt_sync;
+
+  /*!
+   * Frame border for tpl frame.
+   */
+  int border_in_pixels;
+
+  /*!
+   * Factor to adjust r0 if TPL uses a subset of frames in the gf group.
+   */
+  double r0_adjust_factor;
+} TplParams;
+
+#if CONFIG_BITRATE_ACCURACY || CONFIG_RATECTRL_LOG
+#define VBR_RC_INFO_MAX_FRAMES 500
+#endif  // CONFIG_BITRATE_ACCURACY || CONFIG_RATECTRL_LOG
+
+#if CONFIG_BITRATE_ACCURACY
+
+/*!
+ * \brief This structure stores information needed for the bitrate accuracy
+ * experiment.
+ */
+typedef struct {
+  int ready;
+  double total_bit_budget;  // The total bit budget of the entire video
+  int show_frame_count;     // Number of show frames in the entire video
+
+  int gop_showframe_count;  // The number of show frames in the current gop
+  double gop_bit_budget;    // The bit budget for the current gop
+  double scale_factors[FRAME_UPDATE_TYPES];  // Scale factors to improve the
+                                             // budget estimation
+  double mv_scale_factors[FRAME_UPDATE_TYPES];  // Scale factors to improve
+                                                // MV entropy estimation
+
+  // === Below this line are GOP related data that will be updated per GOP ===
+  int base_q_index;  // Stores the base q index.
+  int q_index_list_ready;
+  int q_index_list[VBR_RC_INFO_MAX_FRAMES];  // q indices for the current
+                                             // GOP
+
+  // Array to store qstep_ratio for each frame in a GOP
+  double qstep_ratio_list[VBR_RC_INFO_MAX_FRAMES];
+
+#if CONFIG_THREE_PASS
+  TplTxfmStats txfm_stats_list[VBR_RC_INFO_MAX_FRAMES];
+  FRAME_UPDATE_TYPE update_type_list[VBR_RC_INFO_MAX_FRAMES];
+  int gop_start_idx_list[VBR_RC_INFO_MAX_FRAMES];
+  int gop_length_list[VBR_RC_INFO_MAX_FRAMES];
+  int cur_gop_idx;
+  int total_frame_count;
+  int gop_count;
+#endif  // CONFIG_THREE_PASS
+} VBR_RATECTRL_INFO;
+
+static INLINE void vbr_rc_reset_gop_data(VBR_RATECTRL_INFO *vbr_rc_info) {
+  vbr_rc_info->q_index_list_ready = 0;
+  av1_zero(vbr_rc_info->q_index_list);
+}
+
+void av1_vbr_rc_init(VBR_RATECTRL_INFO *vbr_rc_info, double total_bit_budget,
+                     int show_frame_count);
+
+int av1_vbr_rc_frame_coding_idx(const VBR_RATECTRL_INFO *vbr_rc_info,
+                                int gf_frame_index);
+
+void av1_vbr_rc_append_tpl_info(VBR_RATECTRL_INFO *vbr_rc_info,
+                                const struct TPL_INFO *tpl_info);
+
+void av1_vbr_rc_set_gop_bit_budget(VBR_RATECTRL_INFO *vbr_rc_info,
+                                   int gop_showframe_count);
+
+void av1_vbr_rc_compute_q_indices(int base_q_index, int frame_count,
+                                  const double *qstep_ratio_list,
+                                  aom_bit_depth_t bit_depth, int *q_index_list);
+
+/*!\brief Update q_index_list in vbr_rc_info based on tpl stats
+ *
+ * \param[out]   vbr_rc_info   Rate control info for BITRATE_ACCURACY
+ *                             experiment
+ * \param[in]    tpl_data      TPL struct
+ * \param[in]    gf_group      GOP struct
+ * \param[in]    bit_depth     bit depth
+ */
+void av1_vbr_rc_update_q_index_list(VBR_RATECTRL_INFO *vbr_rc_info,
+                                    const TplParams *tpl_data,
+                                    const struct GF_GROUP *gf_group,
+                                    aom_bit_depth_t bit_depth);
+/*
+ *!\brief Compute the number of bits needed to encode a GOP
+ *
+ * \param[in]  base_q_index               base layer q_index
+ * \param[in]  bit_depth                  bit depth
+ * \param[in]  update_type_scale_factors  array of scale factors for each
+ *                                        update_type
+ * \param[in]  frame_count                size of update_type_list,
+ *                                        qstep_ratio_list, stats_list,
+ *                                        q_index_list and
+ *                                        estimated_bitrate_byframe
+ * \param[in]  update_type_list           array of update_type, one per frame
+ * \param[in]  qstep_ratio_list           array of qstep_ratio, one per frame
+ * \param[in]  stats_list                 array of transform stats, one per
+ *                                        frame
+ * \param[out] q_index_list               array of q_index, one per frame
+ * \param[out] estimated_bitrate_byframe  array to keep track of frame
+ *                                        bitrate
+ *
+ * \return The estimated GOP bitrate.
+ *
+ */
+double av1_vbr_rc_info_estimate_gop_bitrate(
+    int base_q_index, aom_bit_depth_t bit_depth,
+    const double *update_type_scale_factors, int frame_count,
+    const FRAME_UPDATE_TYPE *update_type_list, const double *qstep_ratio_list,
+    const TplTxfmStats *stats_list, int *q_index_list,
+    double *estimated_bitrate_byframe);
+
+/*!\brief Estimate the optimal base q index for a GOP.
+ *
+ * This function uses a binary search to find the base layer q index that
+ * achieves the specified bit budget.
+ *
+ * \param[in]  bit_budget                 target bit budget
+ * \param[in]  bit_depth                  bit depth
+ * \param[in]  update_type_scale_factors  array of scale factors for each
+ *                                        update_type
+ * \param[in]  frame_count                size of update_type_list,
+ *                                        qstep_ratio_list, stats_list,
+ *                                        q_index_list and
+ *                                        estimated_bitrate_byframe
+ * \param[in]  update_type_list           array of update_type, one per frame
+ * \param[in]  qstep_ratio_list           array of qstep_ratio, one per frame
+ * \param[in]  stats_list                 array of transform stats, one per
+ *                                        frame
+ * \param[out] q_index_list               array of q_index, one per frame
+ * \param[out] estimated_bitrate_byframe  Array to keep track of frame
+ *                                        bitrate
+ *
+ * \return Returns the optimal base q index to use.
+ */
+int av1_vbr_rc_info_estimate_base_q(
+    double bit_budget, aom_bit_depth_t bit_depth,
+    const double *update_type_scale_factors, int frame_count,
+    const FRAME_UPDATE_TYPE *update_type_list, const double *qstep_ratio_list,
+    const TplTxfmStats *stats_list, int *q_index_list,
+    double *estimated_bitrate_byframe);
+
+#endif  // CONFIG_BITRATE_ACCURACY
+
+#if CONFIG_RD_COMMAND
+typedef enum {
+  RD_OPTION_NONE,
+  RD_OPTION_SET_Q,
+  RD_OPTION_SET_Q_RDMULT
+} RD_OPTION;
+
+typedef struct RD_COMMAND {
+  RD_OPTION option_ls[MAX_LENGTH_TPL_FRAME_STATS];
+  int q_index_ls[MAX_LENGTH_TPL_FRAME_STATS];
+  int rdmult_ls[MAX_LENGTH_TPL_FRAME_STATS];
+  int frame_count;
+  int frame_index;
+} RD_COMMAND;
+
+void av1_read_rd_command(const char *filepath, RD_COMMAND *rd_command);
+#endif  // CONFIG_RD_COMMAND
+
+/*!\brief Allocate buffers used by tpl model
+ *
+ * \param[in]    ppi            Top-level encode/decode structure
+ * \param[in]    lag_in_frames  number of lookahead frames
+ *
+ * \param[out]   tpl_data       tpl data structure
+ */
+
+void av1_setup_tpl_buffers(struct AV1_PRIMARY *const ppi,
+                           CommonModeInfoParams *const mi_params, int width,
+                           int height, int byte_alignment, int lag_in_frames);
+
+static AOM_INLINE void tpl_dealloc_temp_buffers(TplBuffers *tpl_tmp_buffers) {
+  aom_free(tpl_tmp_buffers->predictor8);
+  tpl_tmp_buffers->predictor8 = NULL;
+  aom_free(tpl_tmp_buffers->src_diff);
+  tpl_tmp_buffers->src_diff = NULL;
+  aom_free(tpl_tmp_buffers->coeff);
+  tpl_tmp_buffers->coeff = NULL;
+  aom_free(tpl_tmp_buffers->qcoeff);
+  tpl_tmp_buffers->qcoeff = NULL;
+  aom_free(tpl_tmp_buffers->dqcoeff);
+  tpl_tmp_buffers->dqcoeff = NULL;
+}
+
+static AOM_INLINE bool tpl_alloc_temp_buffers(TplBuffers *tpl_tmp_buffers,
+                                              uint8_t tpl_bsize_1d) {
+  // Number of pixels in a tpl block
+  const int tpl_block_pels = tpl_bsize_1d * tpl_bsize_1d;
+
+  // Allocate temporary buffers used in mode estimation.
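+  // Note: predictor8 is sized at two bytes per pixel, presumably so the same
+  // buffer can also hold 16-bit samples on the high bit-depth path.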
+  tpl_tmp_buffers->predictor8 = (uint8_t *)aom_memalign(
+      32, tpl_block_pels * 2 * sizeof(*tpl_tmp_buffers->predictor8));
+  tpl_tmp_buffers->src_diff = (int16_t *)aom_memalign(
+      32, tpl_block_pels * sizeof(*tpl_tmp_buffers->src_diff));
+  tpl_tmp_buffers->coeff = (tran_low_t *)aom_memalign(
+      32, tpl_block_pels * sizeof(*tpl_tmp_buffers->coeff));
+  tpl_tmp_buffers->qcoeff = (tran_low_t *)aom_memalign(
+      32, tpl_block_pels * sizeof(*tpl_tmp_buffers->qcoeff));
+  tpl_tmp_buffers->dqcoeff = (tran_low_t *)aom_memalign(
+      32, tpl_block_pels * sizeof(*tpl_tmp_buffers->dqcoeff));
+
+  if (!(tpl_tmp_buffers->predictor8 && tpl_tmp_buffers->src_diff &&
+        tpl_tmp_buffers->coeff && tpl_tmp_buffers->qcoeff &&
+        tpl_tmp_buffers->dqcoeff)) {
+    tpl_dealloc_temp_buffers(tpl_tmp_buffers);
+    return false;
+  }
+  return true;
+}
+
+/*!\brief Implements temporal dependency modelling for a GOP (GF/ARF
+ * group) and selects between 16 and 32 frame GOP structure.
+ *
+ *\ingroup tpl_modelling
+ *
+ * \param[in]    cpi           Top-level encoder instance structure
+ * \param[in]    gop_eval      Flag if it is in the GOP length decision stage
+ * \param[in]    frame_params  Per frame encoding parameters
+ *
+ * \return Indicates whether or not we should use a longer GOP length.
+ */
+int av1_tpl_setup_stats(struct AV1_COMP *cpi, int gop_eval,
+                        const struct EncodeFrameParams *const frame_params);
+
+/*!\cond */
+
+void av1_tpl_preload_rc_estimate(
+    struct AV1_COMP *cpi, const struct EncodeFrameParams *const frame_params);
+
+int av1_tpl_ptr_pos(int mi_row, int mi_col, int stride, uint8_t right_shift);
+
+void av1_init_tpl_stats(TplParams *const tpl_data);
+
+int av1_tpl_stats_ready(const TplParams *tpl_data, int gf_frame_index);
+
+void av1_tpl_rdmult_setup(struct AV1_COMP *cpi);
+
+void av1_tpl_rdmult_setup_sb(struct AV1_COMP *cpi, MACROBLOCK *const x,
+                             BLOCK_SIZE sb_size, int mi_row, int mi_col);
+
+void av1_mc_flow_dispenser_row(struct AV1_COMP *cpi,
+                               TplTxfmStats *tpl_txfm_stats,
+                               TplBuffers *tpl_tmp_buffers, MACROBLOCK *x,
+                               int mi_row, BLOCK_SIZE bsize, TX_SIZE tx_size);
+
+/*!\brief Compute the entropy of an exponential probability distribution
+ * function (pdf) subjected to uniform quantization.
+ *
+ * pdf(x) = (1/b) * exp(-x/b)
+ *
+ *\ingroup tpl_modelling
+ *
+ * \param[in]    q_step        quantizer step size
+ * \param[in]    b             mean of the exponential distribution
+ *
+ * \return entropy cost
+ */
+double av1_exponential_entropy(double q_step, double b);
+
+/*!\brief Compute the entropy of a Laplace probability distribution
+ * function (pdf) subjected to non-uniform quantization.
+ *
+ * pdf(x) = (1/(2*b)) * exp(-|x|/b)
+ *
+ *\ingroup tpl_modelling
+ *
+ * \param[in]    q_step          quantizer step size for non-zero bins
+ * \param[in]    b               mean absolute deviation of the Laplace
+ *                               distribution
+ * \param[in]    zero_bin_ratio  zero bin's size is zero_bin_ratio * q_step
+ *
+ * \return entropy cost
+ */
+double av1_laplace_entropy(double q_step, double b, double zero_bin_ratio);
+
+/*!\brief Compute the frame rate using transform block stats
+ *
+ * Assume each position i in the transform block follows a Laplace
+ * distribution with mean absolute deviation abs_coeff_mean[i].
+ *
+ * Then we can use av1_laplace_entropy() to compute the expected frame
+ * rate.
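+ *
+ * Sketch of what the implementation computes:
+ *   est_rate = block_count * sum_i H_laplace(q_step(i), abs_coeff_mean[i])
+ * where i = 0 uses the DC quantizer step and i >= 1 use the AC step.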
+ *
+ *\ingroup tpl_modelling
+ *
+ * \param[in]    q_index         quantizer index
+ * \param[in]    block_count     number of transform blocks
+ * \param[in]    abs_coeff_mean  array of mean absolute deviation
+ * \param[in]    coeff_num       number of coefficients per transform block
+ *
+ * \return expected frame rate
+ */
+double av1_laplace_estimate_frame_rate(int q_index, int block_count,
+                                       const double *abs_coeff_mean,
+                                       int coeff_num);
+
+/*
+ *!\brief Init TplTxfmStats
+ *
+ * \param[in]    tpl_txfm_stats  a structure for storing transform stats
+ *
+ */
+void av1_init_tpl_txfm_stats(TplTxfmStats *tpl_txfm_stats);
+
+#if CONFIG_BITRATE_ACCURACY
+/*
+ *!\brief Accumulate TplTxfmStats
+ *
+ * \param[in]  sub_stats          a structure for storing sub transform stats
+ * \param[out] accumulated_stats  a structure for storing accumulated
+ *                                transform stats
+ *
+ */
+void av1_accumulate_tpl_txfm_stats(const TplTxfmStats *sub_stats,
+                                   TplTxfmStats *accumulated_stats);
+
+/*
+ *!\brief Record a transform block into TplTxfmStats
+ *
+ * \param[in,out] tpl_txfm_stats  A structure for storing transform stats
+ * \param[in]     coeff           An array of transform coefficients. Its size
+ *                                should equal tpl_txfm_stats.coeff_num.
+ *
+ */
+void av1_record_tpl_txfm_block(TplTxfmStats *tpl_txfm_stats,
+                               const tran_low_t *coeff);
+
+/*
+ *!\brief Update abs_coeff_mean and the ready flag of txfm_stats
+ *
+ * If txfm_block_count > 0, this function will use abs_coeff_sum and
+ * txfm_block_count to compute abs_coeff_mean. Moreover, the ready flag
+ * will be set to one.
+ *
+ * \param[in]  txfm_stats  A structure for storing transform stats
+ */
+void av1_tpl_txfm_stats_update_abs_coeff_mean(TplTxfmStats *txfm_stats);
+#endif  // CONFIG_BITRATE_ACCURACY
+
+/*!\brief Estimate coefficient entropy using the Laplace distribution
+ *
+ *\ingroup tpl_modelling
+ *
+ * This function is equivalent to -log2(laplace_prob()), where laplace_prob()
+ * is defined in tpl_model_test.cc
+ *
+ * \param[in]    q_step          quantizer step size without any scaling
+ * \param[in]    b               mean absolute deviation of Laplace
+ *                               distribution
+ * \param[in]    zero_bin_ratio  zero bin's size is zero_bin_ratio * q_step
+ * \param[in]    qcoeff          quantized coefficient
+ *
+ * \return estimated coefficient entropy
+ *
+ */
+double av1_estimate_coeff_entropy(double q_step, double b,
+                                  double zero_bin_ratio, int qcoeff);
+
+/*!\brief Estimate entropy of a transform block using the Laplace distribution
+ *
+ *\ingroup tpl_modelling
+ *
+ * \param[in]    q_index         quantizer index
+ * \param[in]    abs_coeff_mean  array of mean absolute deviations
+ * \param[in]    qcoeff_arr      array of quantized coefficients
+ * \param[in]    coeff_num       number of coefficients per transform block
+ *
+ * \return estimated transform block entropy
+ *
+ */
+double av1_estimate_txfm_block_entropy(int q_index,
+                                       const double *abs_coeff_mean,
+                                       int *qcoeff_arr, int coeff_num);
+
+// TODO(angiebird): Add doxygen description here.
+int64_t av1_delta_rate_cost(int64_t delta_rate, int64_t recrf_dist,
+                            int64_t srcrf_dist, int pix_num);
+
+/*!\brief Compute the overlap area between two blocks with the same size
+ *
+ *\ingroup tpl_modelling
+ *
+ * If there is no overlap, this function should return zero.
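+ *
+ * For example, two 8x8 blocks whose (row, col) positions differ by (4, 4)
+ * overlap in a 4x4 region, so the returned area would be 16.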
+ *
+ * \param[in]    row_a   row position of the first block
+ * \param[in]    col_a   column position of the first block
+ * \param[in]    row_b   row position of the second block
+ * \param[in]    col_b   column position of the second block
+ * \param[in]    width   width shared by the two blocks
+ * \param[in]    height  height shared by the two blocks
+ *
+ * \return overlap area of the two blocks
+ */
+int av1_get_overlap_area(int row_a, int col_a, int row_b, int col_b, int width,
+                         int height);
+
+/*!\brief Get current frame's q_index from tpl stats and leaf_qindex
+ *
+ * \param[in]    tpl_data        TPL struct
+ * \param[in]    gf_frame_index  current frame index in the GOP
+ * \param[in]    leaf_qindex     q index of leaf frame
+ * \param[in]    bit_depth       bit depth
+ *
+ * \return q_index
+ */
+int av1_tpl_get_q_index(const TplParams *tpl_data, int gf_frame_index,
+                        int leaf_qindex, aom_bit_depth_t bit_depth);
+
+/*!\brief Compute the frame importance from TPL stats
+ *
+ * \param[in]    tpl_data        TPL struct
+ * \param[in]    gf_frame_index  current frame index in the GOP
+ *
+ * \return frame_importance
+ */
+double av1_tpl_get_frame_importance(const TplParams *tpl_data,
+                                    int gf_frame_index);
+
+/*!\brief Compute the ratio between arf q step and the leaf q step based on
+ * TPL stats
+ *
+ * \param[in]    tpl_data        TPL struct
+ * \param[in]    gf_frame_index  current frame index in the GOP
+ *
+ * \return qstep_ratio
+ */
+double av1_tpl_get_qstep_ratio(const TplParams *tpl_data, int gf_frame_index);
+
+/*!\brief Find a q index whose step size is near qstep_ratio * leaf_qstep
+ *
+ * \param[in]    leaf_qindex  q index of leaf frame
+ * \param[in]    qstep_ratio  step ratio between target q index and
+ *                            leaf q index
+ * \param[in]    bit_depth    bit depth
+ *
+ * \return q_index
+ */
+int av1_get_q_index_from_qstep_ratio(int leaf_qindex, double qstep_ratio,
+                                     aom_bit_depth_t bit_depth);
+
+/*!\brief Improve the motion vector estimation by taking neighbors into
+ * account.
+ *
+ * Use the upper and left neighbor block as the reference MVs.
+ * Compute the minimum difference between current MV and reference MV.
+ *
+ * \param[in]    tpl_frame    Tpl frame struct
+ * \param[in]    row          Current row
+ * \param[in]    col          Current column
+ * \param[in]    step         Step parameter for av1_tpl_ptr_pos
+ * \param[in]    tpl_stride   Stride parameter for av1_tpl_ptr_pos
+ * \param[in]    right_shift  Right shift parameter for
+ *                            av1_tpl_ptr_pos
+ *
+ * \return The neighbor MV difference if it is smaller than the current MV
+ *         magnitude, otherwise the current MV.
+ */
+int_mv av1_compute_mv_difference(const TplDepFrame *tpl_frame, int row, int col,
+                                 int step, int tpl_stride, int right_shift);
+
+/*!\brief Compute the entropy of motion vectors for a single frame.
+ *
+ * \param[in]    tpl_frame    TPL frame struct
+ * \param[in]    right_shift  right shift value for step
+ *
+ * \return Bits used by the motion vectors for one frame.
+ */ +double av1_tpl_compute_frame_mv_entropy(const TplDepFrame *tpl_frame, + uint8_t right_shift); + +#if CONFIG_RATECTRL_LOG +typedef struct { + int coding_frame_count; + int base_q_index; + + // Encode decision + int q_index_list[VBR_RC_INFO_MAX_FRAMES]; + double qstep_ratio_list[VBR_RC_INFO_MAX_FRAMES]; + FRAME_UPDATE_TYPE update_type_list[VBR_RC_INFO_MAX_FRAMES]; + + // Frame stats + TplTxfmStats txfm_stats_list[VBR_RC_INFO_MAX_FRAMES]; + + // Estimated encode results + double est_coeff_rate_list[VBR_RC_INFO_MAX_FRAMES]; + + // Actual encode results + double act_rate_list[VBR_RC_INFO_MAX_FRAMES]; + double act_coeff_rate_list[VBR_RC_INFO_MAX_FRAMES]; +} RATECTRL_LOG; + +static INLINE void rc_log_init(RATECTRL_LOG *rc_log) { av1_zero(*rc_log); } + +static INLINE void rc_log_frame_stats(RATECTRL_LOG *rc_log, int coding_index, + const TplTxfmStats *txfm_stats) { + rc_log->txfm_stats_list[coding_index] = *txfm_stats; +} + +static INLINE void rc_log_frame_encode_param(RATECTRL_LOG *rc_log, + int coding_index, + double qstep_ratio, int q_index, + FRAME_UPDATE_TYPE update_type) { + rc_log->qstep_ratio_list[coding_index] = qstep_ratio; + rc_log->q_index_list[coding_index] = q_index; + rc_log->update_type_list[coding_index] = update_type; + const TplTxfmStats *txfm_stats = &rc_log->txfm_stats_list[coding_index]; + rc_log->est_coeff_rate_list[coding_index] = 0; + if (txfm_stats->ready) { + rc_log->est_coeff_rate_list[coding_index] = av1_laplace_estimate_frame_rate( + q_index, txfm_stats->txfm_block_count, txfm_stats->abs_coeff_mean, + txfm_stats->coeff_num); + } +} + +static INLINE void rc_log_frame_entropy(RATECTRL_LOG *rc_log, int coding_index, + double act_rate, + double act_coeff_rate) { + rc_log->act_rate_list[coding_index] = act_rate; + rc_log->act_coeff_rate_list[coding_index] = act_coeff_rate; +} + +static INLINE void rc_log_record_chunk_info(RATECTRL_LOG *rc_log, + int base_q_index, + int coding_frame_count) { + rc_log->base_q_index = base_q_index; + rc_log->coding_frame_count = coding_frame_count; +} + +static INLINE void rc_log_show(const RATECTRL_LOG *rc_log) { + printf("= chunk 1\n"); + printf("coding_frame_count %d base_q_index %d\n", rc_log->coding_frame_count, + rc_log->base_q_index); + printf("= frame %d\n", rc_log->coding_frame_count); + for (int coding_idx = 0; coding_idx < rc_log->coding_frame_count; + coding_idx++) { + printf( + "coding_idx %d update_type %d q %d qstep_ratio %f est_coeff_rate %f " + "act_coeff_rate %f act_rate %f\n", + coding_idx, rc_log->update_type_list[coding_idx], + rc_log->q_index_list[coding_idx], rc_log->qstep_ratio_list[coding_idx], + rc_log->est_coeff_rate_list[coding_idx], + rc_log->act_coeff_rate_list[coding_idx], + rc_log->act_rate_list[coding_idx]); + } +} +#endif // CONFIG_RATECTRL_LOG + +/*!\endcond */ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_TPL_MODEL_H_ diff --git a/third_party/aom/av1/encoder/tune_butteraugli.c b/third_party/aom/av1/encoder/tune_butteraugli.c new file mode 100644 index 0000000000..92fc4b2a92 --- /dev/null +++ b/third_party/aom/av1/encoder/tune_butteraugli.c @@ -0,0 +1,313 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "av1/encoder/tune_butteraugli.h"
+
+#include "aom_dsp/butteraugli.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encoder_utils.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/var_based_part.h"
+
+static const int resize_factor = 2;
+
+static void set_mb_butteraugli_rdmult_scaling(AV1_COMP *cpi,
+                                              const YV12_BUFFER_CONFIG *source,
+                                              const YV12_BUFFER_CONFIG *recon,
+                                              const double K) {
+  AV1_COMMON *const cm = &cpi->common;
+  SequenceHeader *const seq_params = cm->seq_params;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const aom_color_range_t color_range =
+      seq_params->color_range != 0 ? AOM_CR_FULL_RANGE : AOM_CR_STUDIO_RANGE;
+  const int bit_depth = cpi->td.mb.e_mbd.bd;
+  const int width = source->y_crop_width;
+  const int height = source->y_crop_height;
+  const int ss_x = source->subsampling_x;
+  const int ss_y = source->subsampling_y;
+
+  float *diffmap;
+  CHECK_MEM_ERROR(cm, diffmap, aom_malloc(width * height * sizeof(*diffmap)));
+  if (!aom_calc_butteraugli(source, recon, bit_depth,
+                            seq_params->matrix_coefficients, color_range,
+                            diffmap)) {
+    aom_internal_error(cm->error, AOM_CODEC_ERROR,
+                       "Failed to calculate Butteraugli distances.");
+  }
+
+  const int num_mi_w = mi_size_wide[butteraugli_rdo_bsize] / resize_factor;
+  const int num_mi_h = mi_size_high[butteraugli_rdo_bsize] / resize_factor;
+  const int num_cols =
+      (mi_params->mi_cols / resize_factor + num_mi_w - 1) / num_mi_w;
+  const int num_rows =
+      (mi_params->mi_rows / resize_factor + num_mi_h - 1) / num_mi_h;
+  const int block_w = num_mi_w << 2;
+  const int block_h = num_mi_h << 2;
+  double log_sum = 0.0;
+  double blk_count = 0.0;
+
+  // Loop through each block.
+  for (int row = 0; row < num_rows; ++row) {
+    for (int col = 0; col < num_cols; ++col) {
+      const int index = row * num_cols + col;
+      const int y_start = row * block_h;
+      const int x_start = col * block_w;
+      float dbutteraugli = 0.0f;
+      float dmse = 0.0f;
+      float px_count = 0.0f;
+
+      // Loop through each pixel.
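+      // The 12th-power sum below acts as a soft maximum (an L12 norm) over
+      // the per-pixel Butteraugli scores in the block; the matching 1/12th
+      // root is taken after the loop. MSE is accumulated across Y, U and V so
+      // that the weight computed later is roughly MSE per unit of perceptual
+      // distortion.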
+ for (int y = y_start; y < y_start + block_h && y < height; y++) { + for (int x = x_start; x < x_start + block_w && x < width; x++) { + dbutteraugli += powf(diffmap[y * width + x], 12.0f); + float px_diff = source->y_buffer[y * source->y_stride + x] - + recon->y_buffer[y * recon->y_stride + x]; + dmse += px_diff * px_diff; + px_count += 1.0f; + } + } + const int y_end = AOMMIN((y_start >> ss_y) + (block_h >> ss_y), + (height + ss_y) >> ss_y); + for (int y = y_start >> ss_y; y < y_end; y++) { + const int x_end = AOMMIN((x_start >> ss_x) + (block_w >> ss_x), + (width + ss_x) >> ss_x); + for (int x = x_start >> ss_x; x < x_end; x++) { + const int src_px_index = y * source->uv_stride + x; + const int recon_px_index = y * recon->uv_stride + x; + const float px_diff_u = (float)(source->u_buffer[src_px_index] - + recon->u_buffer[recon_px_index]); + const float px_diff_v = (float)(source->v_buffer[src_px_index] - + recon->v_buffer[recon_px_index]); + dmse += px_diff_u * px_diff_u + px_diff_v * px_diff_v; + px_count += 2.0f; + } + } + + dbutteraugli = powf(dbutteraugli, 1.0f / 12.0f); + dmse = dmse / px_count; + const float eps = 0.01f; + double weight; + if (dbutteraugli < eps || dmse < eps) { + weight = -1.0; + } else { + blk_count += 1.0; + weight = dmse / dbutteraugli; + weight = AOMMIN(weight, 5.0); + weight += K; + log_sum += log(weight); + } + cpi->butteraugli_info.rdmult_scaling_factors[index] = weight; + } + } + // Geometric average of the weights. + log_sum = exp(log_sum / blk_count); + + for (int row = 0; row < num_rows; ++row) { + for (int col = 0; col < num_cols; ++col) { + const int index = row * num_cols + col; + double *weight = &cpi->butteraugli_info.rdmult_scaling_factors[index]; + if (*weight <= 0.0) { + *weight = 1.0; + } else { + *weight /= log_sum; + } + *weight = AOMMIN(*weight, 2.5); + *weight = AOMMAX(*weight, 0.4); + } + } + + aom_free(diffmap); +} + +void av1_set_butteraugli_rdmult(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int *rdmult) { + assert(cpi->oxcf.tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI); + if (!cpi->butteraugli_info.recon_set) { + return; + } + const AV1_COMMON *const cm = &cpi->common; + + const int num_mi_w = mi_size_wide[butteraugli_rdo_bsize]; + const int num_mi_h = mi_size_high[butteraugli_rdo_bsize]; + const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w; + const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h; + const int num_bcols = (mi_size_wide[bsize] + num_mi_w - 1) / num_mi_w; + const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h; + double num_of_mi = 0.0; + double geom_mean_of_scale = 0.0; + + for (int row = mi_row / num_mi_w; + row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) { + for (int col = mi_col / num_mi_h; + col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) { + const int index = row * num_cols + col; + geom_mean_of_scale += + log(cpi->butteraugli_info.rdmult_scaling_factors[index]); + num_of_mi += 1.0; + } + } + geom_mean_of_scale = exp(geom_mean_of_scale / num_of_mi); + + *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale + 0.5); + *rdmult = AOMMAX(*rdmult, 0); + av1_set_error_per_bit(&x->errorperbit, *rdmult); +} + +static void copy_plane(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h) { + for (int row = 0; row < h; row++) { + memcpy(dst, src, w); + src += src_stride; + dst += dst_stride; + } +} + +static void copy_img(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, + int 
width, int height) { + copy_plane(src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, width, + height); + const int width_uv = (width + src->subsampling_x) >> src->subsampling_x; + const int height_uv = (height + src->subsampling_y) >> src->subsampling_y; + copy_plane(src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + width_uv, height_uv); + copy_plane(src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + width_uv, height_uv); +} + +static void zero_plane(uint8_t *dst, int dst_stride, int h) { + for (int row = 0; row < h; row++) { + memset(dst, 0, dst_stride); + dst += dst_stride; + } +} + +static void zero_img(YV12_BUFFER_CONFIG *dst) { + zero_plane(dst->y_buffer, dst->y_stride, dst->y_height); + zero_plane(dst->u_buffer, dst->uv_stride, dst->uv_height); + zero_plane(dst->v_buffer, dst->uv_stride, dst->uv_height); +} + +void av1_setup_butteraugli_source(AV1_COMP *cpi) { + YV12_BUFFER_CONFIG *const dst = &cpi->butteraugli_info.source; + AV1_COMMON *const cm = &cpi->common; + const int width = cpi->source->y_crop_width; + const int height = cpi->source->y_crop_height; + const int bit_depth = cpi->td.mb.e_mbd.bd; + const int ss_x = cpi->source->subsampling_x; + const int ss_y = cpi->source->subsampling_y; + if (dst->buffer_alloc_sz == 0) { + aom_alloc_frame_buffer( + dst, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0); + } + av1_copy_and_extend_frame(cpi->source, dst); + + YV12_BUFFER_CONFIG *const resized_dst = &cpi->butteraugli_info.resized_source; + if (resized_dst->buffer_alloc_sz == 0) { + aom_alloc_frame_buffer( + resized_dst, width / resize_factor, height / resize_factor, ss_x, ss_y, + cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, + cm->features.byte_alignment, 0, 0); + } + if (!av1_resize_and_extend_frame_nonnormative( + cpi->source, resized_dst, bit_depth, av1_num_planes(cm))) { + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Error allocating buffers during resize"); + } + + zero_img(cpi->source); + copy_img(resized_dst, cpi->source, width / resize_factor, + height / resize_factor); +} + +void av1_setup_butteraugli_rdmult_and_restore_source(AV1_COMP *cpi, double K) { + av1_copy_and_extend_frame(&cpi->butteraugli_info.source, cpi->source); + AV1_COMMON *const cm = &cpi->common; + const int width = cpi->source->y_crop_width; + const int height = cpi->source->y_crop_height; + const int ss_x = cpi->source->subsampling_x; + const int ss_y = cpi->source->subsampling_y; + + YV12_BUFFER_CONFIG resized_recon; + memset(&resized_recon, 0, sizeof(resized_recon)); + aom_alloc_frame_buffer( + &resized_recon, width / resize_factor, height / resize_factor, ss_x, ss_y, + cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, + cm->features.byte_alignment, 0, 0); + copy_img(&cpi->common.cur_frame->buf, &resized_recon, width / resize_factor, + height / resize_factor); + + set_mb_butteraugli_rdmult_scaling(cpi, &cpi->butteraugli_info.resized_source, + &resized_recon, K); + cpi->butteraugli_info.recon_set = true; + aom_free_frame_buffer(&resized_recon); +} + +void av1_setup_butteraugli_rdmult(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + const QuantizationCfg *const q_cfg = &oxcf->q_cfg; + const int q_index = 96; + + // Setup necessary params for encoding, including frame source, etc. 
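+  // The steps below run a cheap trial encode at the fixed q_index chosen
+  // above, purely to obtain a reconstruction from which the Butteraugli
+  // rdmult scaling factors are then derived.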
+ if (cm->current_frame.frame_type == KEY_FRAME) copy_frame_prob_info(cpi); + av1_set_frame_size(cpi, cm->superres_upscaled_width, + cm->superres_upscaled_height); + + cpi->source = av1_realloc_and_scale_if_required( + cm, cpi->unscaled_source, &cpi->scaled_source, cm->features.interp_filter, + 0, false, false, cpi->oxcf.border_in_pixels, cpi->image_pyramid_levels); + if (cpi->unscaled_last_source != NULL) { + cpi->last_source = av1_realloc_and_scale_if_required( + cm, cpi->unscaled_last_source, &cpi->scaled_last_source, + cm->features.interp_filter, 0, false, false, cpi->oxcf.border_in_pixels, + cpi->image_pyramid_levels); + } + + av1_setup_butteraugli_source(cpi); + av1_setup_frame(cpi); + + if (cm->seg.enabled) { + if (!cm->seg.update_data && cm->prev_frame) { + segfeatures_copy(&cm->seg, &cm->prev_frame->seg); + cm->seg.enabled = cm->prev_frame->seg.enabled; + } else { + av1_calculate_segdata(&cm->seg); + } + } else { + memset(&cm->seg, 0, sizeof(cm->seg)); + } + segfeatures_copy(&cm->cur_frame->seg, &cm->seg); + cm->cur_frame->seg.enabled = cm->seg.enabled; + + const PARTITION_SEARCH_TYPE partition_search_type = + cpi->sf.part_sf.partition_search_type; + const BLOCK_SIZE fixed_partition_size = cpi->sf.part_sf.fixed_partition_size; + // Enable a quicker pass by uncommenting the following lines: + // cpi->sf.part_sf.partition_search_type = FIXED_PARTITION; + // cpi->sf.part_sf.fixed_partition_size = BLOCK_32X32; + + av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q_index, + q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq); + av1_set_speed_features_qindex_dependent(cpi, oxcf->speed); + av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params, + cm->seq_params->bit_depth); + + av1_set_variance_partition_thresholds(cpi, q_index, 0); + av1_encode_frame(cpi); + + av1_setup_butteraugli_rdmult_and_restore_source(cpi, 0.3); + cpi->sf.part_sf.partition_search_type = partition_search_type; + cpi->sf.part_sf.fixed_partition_size = fixed_partition_size; +} diff --git a/third_party/aom/av1/encoder/tune_butteraugli.h b/third_party/aom/av1/encoder/tune_butteraugli.h new file mode 100644 index 0000000000..bae5d2a882 --- /dev/null +++ b/third_party/aom/av1/encoder/tune_butteraugli.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_TUNE_BUTTERAUGLI_H_ +#define AOM_AV1_ENCODER_TUNE_BUTTERAUGLI_H_ + +#include "aom_scale/yv12config.h" +#include "av1/common/enums.h" +#include "av1/encoder/ratectrl.h" +#include "av1/encoder/block.h" + +typedef struct { + // Stores the scaling factors for rdmult when tuning for Butteraugli. + // rdmult_scaling_factors[row * num_cols + col] stores the scaling factors for + // 4x4 block at (row, col). 
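+  // ("4x4" above is presumably in units of 4x4 MI blocks; with
+  // butteraugli_rdo_bsize equal to BLOCK_16X16 each factor covers a 16x16
+  // luma area.)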
+ double *rdmult_scaling_factors; + YV12_BUFFER_CONFIG source, resized_source; + bool recon_set; +} TuneButteraugliInfo; + +struct AV1_COMP; +static const BLOCK_SIZE butteraugli_rdo_bsize = BLOCK_16X16; + +void av1_set_butteraugli_rdmult(const struct AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int *rdmult); + +void av1_setup_butteraugli_source(struct AV1_COMP *cpi); + +// 'K' is used to balance the rate-distortion distribution between PSNR +// and Butteraugli. +void av1_setup_butteraugli_rdmult_and_restore_source(struct AV1_COMP *cpi, + double K); + +void av1_setup_butteraugli_rdmult(struct AV1_COMP *cpi); + +#endif // AOM_AV1_ENCODER_TUNE_BUTTERAUGLI_H_ diff --git a/third_party/aom/av1/encoder/tune_vmaf.c b/third_party/aom/av1/encoder/tune_vmaf.c new file mode 100644 index 0000000000..4e5ffa387c --- /dev/null +++ b/third_party/aom/av1/encoder/tune_vmaf.c @@ -0,0 +1,1112 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/encoder/tune_vmaf.h" + +#include "aom_dsp/psnr.h" +#include "av1/encoder/extend.h" +#include "av1/encoder/rdopt.h" +#include "config/aom_scale_rtcd.h" + +static const double kBaselineVmaf = 97.42773; + +static double get_layer_value(const double *array, int layer) { + while (array[layer] < 0.0 && layer > 0) layer--; + return AOMMAX(array[layer], 0.0); +} + +static void motion_search(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *src, + const YV12_BUFFER_CONFIG *ref, + const BLOCK_SIZE block_size, const int mb_row, + const int mb_col, FULLPEL_MV *ref_mv) { + // Block information (ONLY Y-plane is used for motion search). + const int mb_height = block_size_high[block_size]; + const int mb_width = block_size_wide[block_size]; + const int y_stride = src->y_stride; + assert(y_stride == ref->y_stride); + const int y_offset = mb_row * mb_height * y_stride + mb_col * mb_width; + + // Save input state. + MACROBLOCK *const mb = &cpi->td.mb; + MACROBLOCKD *const mbd = &mb->e_mbd; + const struct buf_2d ori_src_buf = mb->plane[0].src; + const struct buf_2d ori_pre_buf = mbd->plane[0].pre[0]; + + // Parameters used for motion search. + FULLPEL_MOTION_SEARCH_PARAMS full_ms_params; + FULLPEL_MV_STATS best_mv_stats; + const SEARCH_METHODS search_method = NSTEP; + const search_site_config *search_site_cfg = + cpi->mv_search_params.search_site_cfg[SS_CFG_FPF]; + const int step_param = + av1_init_search_range(AOMMAX(src->y_crop_width, src->y_crop_height)); + + // Baseline position for motion search (used for rate distortion comparison). + const MV baseline_mv = kZeroMv; + + // Setup. + mb->plane[0].src.buf = src->y_buffer + y_offset; + mb->plane[0].src.stride = y_stride; + mbd->plane[0].pre[0].buf = ref->y_buffer + y_offset; + mbd->plane[0].pre[0].stride = y_stride; + + // Unused intermediate results for motion search. + int cost_list[5]; + + // Do motion search. + // Only do full search on the entire block. 
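+  // The zero MV is used as the cost baseline, while *ref_mv (zero-initialized
+  // by the caller, or cached from an earlier pass) seeds the NSTEP search.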
+  av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb, block_size,
+                                     &baseline_mv, *ref_mv, search_site_cfg,
+                                     search_method,
+                                     /*fine_search_interval=*/0);
+  av1_full_pixel_search(*ref_mv, &full_ms_params, step_param,
+                        cond_cost_list(cpi, cost_list), ref_mv, &best_mv_stats,
+                        NULL);
+
+  // Restore input state.
+  mb->plane[0].src = ori_src_buf;
+  mbd->plane[0].pre[0] = ori_pre_buf;
+}
+
+static unsigned int residual_variance(const AV1_COMP *cpi,
+                                      const YV12_BUFFER_CONFIG *src,
+                                      const YV12_BUFFER_CONFIG *ref,
+                                      const BLOCK_SIZE block_size,
+                                      const int mb_row, const int mb_col,
+                                      FULLPEL_MV ref_mv, unsigned int *sse) {
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int y_stride = src->y_stride;
+  assert(y_stride == ref->y_stride);
+  const int y_offset = mb_row * mb_height * y_stride + mb_col * mb_width;
+  const int mv_offset = ref_mv.row * y_stride + ref_mv.col;
+  const unsigned int var = cpi->ppi->fn_ptr[block_size].vf(
+      ref->y_buffer + y_offset + mv_offset, y_stride, src->y_buffer + y_offset,
+      y_stride, sse);
+  return var;
+}
+
+static double frame_average_variance(const AV1_COMP *const cpi,
+                                     const YV12_BUFFER_CONFIG *const frame) {
+  const MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+  const uint8_t *const y_buffer = frame->y_buffer;
+  const int y_stride = frame->y_stride;
+  const BLOCK_SIZE block_size = BLOCK_64X64;
+
+  const int block_w = mi_size_wide[block_size] * 4;
+  const int block_h = mi_size_high[block_size] * 4;
+  int row, col;
+  double var = 0.0, var_count = 0.0;
+  const int use_hbd = frame->flags & YV12_FLAG_HIGHBITDEPTH;
+
+  // Loop through each block.
+  for (row = 0; row < frame->y_height / block_h; ++row) {
+    for (col = 0; col < frame->y_width / block_w; ++col) {
+      struct buf_2d buf;
+      const int row_offset_y = row * block_h;
+      const int col_offset_y = col * block_w;
+
+      buf.buf = (uint8_t *)y_buffer + row_offset_y * y_stride + col_offset_y;
+      buf.stride = y_stride;
+
+      var += av1_get_perpixel_variance(cpi, xd, &buf, block_size, AOM_PLANE_Y,
+                                       use_hbd);
+      var_count += 1.0;
+    }
+  }
+  var /= var_count;
+  return var;
+}
+
+static double residual_frame_average_variance(AV1_COMP *cpi,
+                                              const YV12_BUFFER_CONFIG *src,
+                                              const YV12_BUFFER_CONFIG *ref,
+                                              FULLPEL_MV *mvs) {
+  if (ref == NULL) return frame_average_variance(cpi, src);
+  const BLOCK_SIZE block_size = BLOCK_16X16;
+  const int frame_height = src->y_height;
+  const int frame_width = src->y_width;
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int mb_rows = (frame_height + mb_height - 1) / mb_height;
+  const int mb_cols = (frame_width + mb_width - 1) / mb_width;
+  const int num_planes = av1_num_planes(&cpi->common);
+  const int mi_h = mi_size_high_log2[block_size];
+  const int mi_w = mi_size_wide_log2[block_size];
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+
+  // Save input state.
+  MACROBLOCK *const mb = &cpi->td.mb;
+  MACROBLOCKD *const mbd = &mb->e_mbd;
+  uint8_t *input_buffer[MAX_MB_PLANE];
+  for (int i = 0; i < num_planes; i++) {
+    input_buffer[i] = mbd->plane[i].pre[0].buf;
+  }
+  MB_MODE_INFO **input_mb_mode_info = mbd->mi;
+
+  bool do_motion_search = false;
+  if (mvs == NULL) {
+    do_motion_search = true;
+    CHECK_MEM_ERROR(&cpi->common, mvs,
+                    (FULLPEL_MV *)aom_calloc(mb_rows * mb_cols, sizeof(*mvs)));
+  }
+
+  unsigned int variance = 0;
+  // Compute the motion-compensated residual variance block by block.
+  for (int mb_row = 0; mb_row < mb_rows; mb_row++) {
+    av1_set_mv_row_limits(&cpi->common.mi_params, &mb->mv_limits,
+                          (mb_row << mi_h), (mb_height >> MI_SIZE_LOG2),
+                          cpi->oxcf.border_in_pixels);
+    for (int mb_col = 0; mb_col < mb_cols; mb_col++) {
+      av1_set_mv_col_limits(&cpi->common.mi_params, &mb->mv_limits,
+                            (mb_col << mi_w), (mb_width >> MI_SIZE_LOG2),
+                            cpi->oxcf.border_in_pixels);
+      FULLPEL_MV *ref_mv = &mvs[mb_col + mb_row * mb_cols];
+      if (do_motion_search) {
+        motion_search(cpi, src, ref, block_size, mb_row, mb_col, ref_mv);
+      }
+      unsigned int mv_sse;
+      const unsigned int blk_var = residual_variance(
+          cpi, src, ref, block_size, mb_row, mb_col, *ref_mv, &mv_sse);
+      variance += blk_var;
+    }
+  }
+
+  // Restore input state.
+  for (int i = 0; i < num_planes; i++) {
+    mbd->plane[i].pre[0].buf = input_buffer[i];
+  }
+  mbd->mi = input_mb_mode_info;
+  return (double)variance / (double)(mb_rows * mb_cols);
+}
+
+// TODO(sdeng): Add the SIMD implementation.
+static AOM_INLINE void highbd_unsharp_rect(const uint16_t *source,
+                                           int source_stride,
+                                           const uint16_t *blurred,
+                                           int blurred_stride, uint16_t *dst,
+                                           int dst_stride, int w, int h,
+                                           double amount, int bit_depth) {
+  const int max_value = (1 << bit_depth) - 1;
+  for (int i = 0; i < h; ++i) {
+    for (int j = 0; j < w; ++j) {
+      const double val =
+          (double)source[j] + amount * ((double)source[j] - (double)blurred[j]);
+      dst[j] = (uint16_t)clamp((int)(val + 0.5), 0, max_value);
+    }
+    source += source_stride;
+    blurred += blurred_stride;
+    dst += dst_stride;
+  }
+}
+
+static AOM_INLINE void unsharp_rect(const uint8_t *source, int source_stride,
+                                    const uint8_t *blurred, int blurred_stride,
+                                    uint8_t *dst, int dst_stride, int w, int h,
+                                    double amount) {
+  for (int i = 0; i < h; ++i) {
+    for (int j = 0; j < w; ++j) {
+      const double val =
+          (double)source[j] + amount * ((double)source[j] - (double)blurred[j]);
+      dst[j] = (uint8_t)clamp((int)(val + 0.5), 0, 255);
+    }
+    source += source_stride;
+    blurred += blurred_stride;
+    dst += dst_stride;
+  }
+}
+
+static AOM_INLINE void unsharp(const AV1_COMP *const cpi,
+                               const YV12_BUFFER_CONFIG *source,
+                               const YV12_BUFFER_CONFIG *blurred,
+                               const YV12_BUFFER_CONFIG *dst, double amount) {
+  const int bit_depth = cpi->td.mb.e_mbd.bd;
+  if (cpi->common.seq_params->use_highbitdepth) {
+    assert(source->flags & YV12_FLAG_HIGHBITDEPTH);
+    assert(blurred->flags & YV12_FLAG_HIGHBITDEPTH);
+    assert(dst->flags & YV12_FLAG_HIGHBITDEPTH);
+    highbd_unsharp_rect(CONVERT_TO_SHORTPTR(source->y_buffer), source->y_stride,
+                        CONVERT_TO_SHORTPTR(blurred->y_buffer),
+                        blurred->y_stride, CONVERT_TO_SHORTPTR(dst->y_buffer),
+                        dst->y_stride, source->y_width, source->y_height,
+                        amount, bit_depth);
+  } else {
+    unsharp_rect(source->y_buffer, source->y_stride, blurred->y_buffer,
+                 blurred->y_stride, dst->y_buffer, dst->y_stride,
+                 source->y_width, source->y_height, amount);
+  }
+}
+
+// 8-tap Gaussian convolution filter with sigma = 1.0, sums to 128,
+// all coefficients must be even.
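+// (Sanity check on the taps below: 8 + 30 + 52 + 30 + 8 = 128. The evenness
+// requirement presumably exists so that convolve kernels which operate on
+// halved taps stay exact.)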
+DECLARE_ALIGNED(16, static const int16_t, gauss_filter[8]) = { 0, 8, 30, 52, + 30, 8, 0, 0 }; +static AOM_INLINE void gaussian_blur(const int bit_depth, + const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dst) { + const int block_size = BLOCK_128X128; + const int block_w = mi_size_wide[block_size] * 4; + const int block_h = mi_size_high[block_size] * 4; + const int num_cols = (source->y_width + block_w - 1) / block_w; + const int num_rows = (source->y_height + block_h - 1) / block_h; + int row, col; + + ConvolveParams conv_params = get_conv_params(0, 0, bit_depth); + InterpFilterParams filter = { .filter_ptr = gauss_filter, + .taps = 8, + .interp_filter = EIGHTTAP_REGULAR }; + + for (row = 0; row < num_rows; ++row) { + for (col = 0; col < num_cols; ++col) { + const int row_offset_y = row * block_h; + const int col_offset_y = col * block_w; + + uint8_t *src_buf = + source->y_buffer + row_offset_y * source->y_stride + col_offset_y; + uint8_t *dst_buf = + dst->y_buffer + row_offset_y * dst->y_stride + col_offset_y; + + if (source->flags & YV12_FLAG_HIGHBITDEPTH) { + av1_highbd_convolve_2d_sr( + CONVERT_TO_SHORTPTR(src_buf), source->y_stride, + CONVERT_TO_SHORTPTR(dst_buf), dst->y_stride, block_w, block_h, + &filter, &filter, 0, 0, &conv_params, bit_depth); + } else { + av1_convolve_2d_sr(src_buf, source->y_stride, dst_buf, dst->y_stride, + block_w, block_h, &filter, &filter, 0, 0, + &conv_params); + } + } + } +} + +static AOM_INLINE double cal_approx_vmaf(const AV1_COMP *const cpi, + double source_variance, + YV12_BUFFER_CONFIG *const source, + YV12_BUFFER_CONFIG *const sharpened) { + const int bit_depth = cpi->td.mb.e_mbd.bd; + const bool cal_vmaf_neg = + cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN; + double new_vmaf; + + aom_calc_vmaf(cpi->vmaf_info.vmaf_model, source, sharpened, bit_depth, + cal_vmaf_neg, &new_vmaf); + + const double sharpened_var = frame_average_variance(cpi, sharpened); + return source_variance / sharpened_var * (new_vmaf - kBaselineVmaf); +} + +static double find_best_frame_unsharp_amount_loop( + const AV1_COMP *const cpi, YV12_BUFFER_CONFIG *const source, + YV12_BUFFER_CONFIG *const blurred, YV12_BUFFER_CONFIG *const sharpened, + double best_vmaf, const double baseline_variance, + const double unsharp_amount_start, const double step_size, + const int max_loop_count, const double max_amount) { + const double min_amount = 0.0; + int loop_count = 0; + double approx_vmaf = best_vmaf; + double unsharp_amount = unsharp_amount_start; + do { + best_vmaf = approx_vmaf; + unsharp_amount += step_size; + if (unsharp_amount > max_amount || unsharp_amount < min_amount) break; + unsharp(cpi, source, blurred, sharpened, unsharp_amount); + approx_vmaf = cal_approx_vmaf(cpi, baseline_variance, source, sharpened); + + loop_count++; + } while (approx_vmaf > best_vmaf && loop_count < max_loop_count); + unsharp_amount = + approx_vmaf > best_vmaf ? 
unsharp_amount : unsharp_amount - step_size; + return AOMMIN(max_amount, AOMMAX(unsharp_amount, min_amount)); +} + +static double find_best_frame_unsharp_amount(const AV1_COMP *const cpi, + YV12_BUFFER_CONFIG *const source, + YV12_BUFFER_CONFIG *const blurred, + const double unsharp_amount_start, + const double step_size, + const int max_loop_count, + const double max_filter_amount) { + const AV1_COMMON *const cm = &cpi->common; + const int width = source->y_width; + const int height = source->y_height; + YV12_BUFFER_CONFIG sharpened; + memset(&sharpened, 0, sizeof(sharpened)); + aom_alloc_frame_buffer( + &sharpened, width, height, source->subsampling_x, source->subsampling_y, + cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, + cm->features.byte_alignment, 0, 0); + + const double baseline_variance = frame_average_variance(cpi, source); + double unsharp_amount; + if (unsharp_amount_start <= step_size) { + unsharp_amount = find_best_frame_unsharp_amount_loop( + cpi, source, blurred, &sharpened, 0.0, baseline_variance, 0.0, + step_size, max_loop_count, max_filter_amount); + } else { + double a0 = unsharp_amount_start - step_size, a1 = unsharp_amount_start; + double v0, v1; + unsharp(cpi, source, blurred, &sharpened, a0); + v0 = cal_approx_vmaf(cpi, baseline_variance, source, &sharpened); + unsharp(cpi, source, blurred, &sharpened, a1); + v1 = cal_approx_vmaf(cpi, baseline_variance, source, &sharpened); + if (fabs(v0 - v1) < 0.01) { + unsharp_amount = a0; + } else if (v0 > v1) { + unsharp_amount = find_best_frame_unsharp_amount_loop( + cpi, source, blurred, &sharpened, v0, baseline_variance, a0, + -step_size, max_loop_count, max_filter_amount); + } else { + unsharp_amount = find_best_frame_unsharp_amount_loop( + cpi, source, blurred, &sharpened, v1, baseline_variance, a1, + step_size, max_loop_count, max_filter_amount); + } + } + + aom_free_frame_buffer(&sharpened); + return unsharp_amount; +} + +void av1_vmaf_neg_preprocessing(AV1_COMP *const cpi, + YV12_BUFFER_CONFIG *const source) { + const AV1_COMMON *const cm = &cpi->common; + const int bit_depth = cpi->td.mb.e_mbd.bd; + const int width = source->y_width; + const int height = source->y_height; + + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const int layer_depth = + AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1); + const double best_frame_unsharp_amount = + get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth); + + if (best_frame_unsharp_amount <= 0.0) return; + + YV12_BUFFER_CONFIG blurred; + memset(&blurred, 0, sizeof(blurred)); + aom_alloc_frame_buffer( + &blurred, width, height, source->subsampling_x, source->subsampling_y, + cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, + cm->features.byte_alignment, 0, 0); + + gaussian_blur(bit_depth, source, &blurred); + unsharp(cpi, source, &blurred, source, best_frame_unsharp_amount); + aom_free_frame_buffer(&blurred); +} + +void av1_vmaf_frame_preprocessing(AV1_COMP *const cpi, + YV12_BUFFER_CONFIG *const source) { + const AV1_COMMON *const cm = &cpi->common; + const int bit_depth = cpi->td.mb.e_mbd.bd; + const int width = source->y_width; + const int height = source->y_height; + + YV12_BUFFER_CONFIG source_extended, blurred; + memset(&source_extended, 0, sizeof(source_extended)); + memset(&blurred, 0, sizeof(blurred)); + aom_alloc_frame_buffer( + &source_extended, width, height, source->subsampling_x, + source->subsampling_y, cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, 
cm->features.byte_alignment, 0, 0); + aom_alloc_frame_buffer( + &blurred, width, height, source->subsampling_x, source->subsampling_y, + cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, + cm->features.byte_alignment, 0, 0); + + av1_copy_and_extend_frame(source, &source_extended); + gaussian_blur(bit_depth, &source_extended, &blurred); + aom_free_frame_buffer(&source_extended); + + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const int layer_depth = + AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1); + const double last_frame_unsharp_amount = + get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth); + + const double best_frame_unsharp_amount = find_best_frame_unsharp_amount( + cpi, source, &blurred, last_frame_unsharp_amount, 0.05, 20, 1.01); + + cpi->vmaf_info.last_frame_unsharp_amount[layer_depth] = + best_frame_unsharp_amount; + + unsharp(cpi, source, &blurred, source, best_frame_unsharp_amount); + aom_free_frame_buffer(&blurred); +} + +void av1_vmaf_blk_preprocessing(AV1_COMP *const cpi, + YV12_BUFFER_CONFIG *const source) { + const AV1_COMMON *const cm = &cpi->common; + const int width = source->y_width; + const int height = source->y_height; + const int bit_depth = cpi->td.mb.e_mbd.bd; + const int ss_x = source->subsampling_x; + const int ss_y = source->subsampling_y; + + YV12_BUFFER_CONFIG source_extended, blurred; + memset(&blurred, 0, sizeof(blurred)); + memset(&source_extended, 0, sizeof(source_extended)); + aom_alloc_frame_buffer( + &blurred, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0); + aom_alloc_frame_buffer(&source_extended, width, height, ss_x, ss_y, + cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment, 0, 0); + + av1_copy_and_extend_frame(source, &source_extended); + gaussian_blur(bit_depth, &source_extended, &blurred); + aom_free_frame_buffer(&source_extended); + + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const int layer_depth = + AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1); + const double last_frame_unsharp_amount = + get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth); + + const double best_frame_unsharp_amount = find_best_frame_unsharp_amount( + cpi, source, &blurred, last_frame_unsharp_amount, 0.05, 20, 1.01); + + cpi->vmaf_info.last_frame_unsharp_amount[layer_depth] = + best_frame_unsharp_amount; + + const int block_size = BLOCK_64X64; + const int block_w = mi_size_wide[block_size] * 4; + const int block_h = mi_size_high[block_size] * 4; + const int num_cols = (source->y_width + block_w - 1) / block_w; + const int num_rows = (source->y_height + block_h - 1) / block_h; + double *best_unsharp_amounts = + aom_calloc(num_cols * num_rows, sizeof(*best_unsharp_amounts)); + if (!best_unsharp_amounts) { + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Error allocating vmaf data"); + } + + YV12_BUFFER_CONFIG source_block, blurred_block; + memset(&source_block, 0, sizeof(source_block)); + memset(&blurred_block, 0, sizeof(blurred_block)); + aom_alloc_frame_buffer(&source_block, block_w, block_h, ss_x, ss_y, + cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment, 0, 0); + aom_alloc_frame_buffer(&blurred_block, block_w, block_h, ss_x, ss_y, + cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment, 0, 0); + + for (int row = 0; row < num_rows; ++row) { + 
for (int col = 0; col < num_cols; ++col) { + const int row_offset_y = row * block_h; + const int col_offset_y = col * block_w; + const int block_width = AOMMIN(width - col_offset_y, block_w); + const int block_height = AOMMIN(height - row_offset_y, block_h); + const int index = col + row * num_cols; + + if (cm->seq_params->use_highbitdepth) { + assert(source->flags & YV12_FLAG_HIGHBITDEPTH); + assert(blurred.flags & YV12_FLAG_HIGHBITDEPTH); + uint16_t *frame_src_buf = CONVERT_TO_SHORTPTR(source->y_buffer) + + row_offset_y * source->y_stride + + col_offset_y; + uint16_t *frame_blurred_buf = CONVERT_TO_SHORTPTR(blurred.y_buffer) + + row_offset_y * blurred.y_stride + + col_offset_y; + uint16_t *blurred_dst = CONVERT_TO_SHORTPTR(blurred_block.y_buffer); + uint16_t *src_dst = CONVERT_TO_SHORTPTR(source_block.y_buffer); + + // Copy block from source frame. + for (int i = 0; i < block_h; ++i) { + for (int j = 0; j < block_w; ++j) { + if (i >= block_height || j >= block_width) { + src_dst[j] = 0; + blurred_dst[j] = 0; + } else { + src_dst[j] = frame_src_buf[j]; + blurred_dst[j] = frame_blurred_buf[j]; + } + } + frame_src_buf += source->y_stride; + frame_blurred_buf += blurred.y_stride; + src_dst += source_block.y_stride; + blurred_dst += blurred_block.y_stride; + } + } else { + uint8_t *frame_src_buf = + source->y_buffer + row_offset_y * source->y_stride + col_offset_y; + uint8_t *frame_blurred_buf = + blurred.y_buffer + row_offset_y * blurred.y_stride + col_offset_y; + uint8_t *blurred_dst = blurred_block.y_buffer; + uint8_t *src_dst = source_block.y_buffer; + + // Copy block from source frame. + for (int i = 0; i < block_h; ++i) { + for (int j = 0; j < block_w; ++j) { + if (i >= block_height || j >= block_width) { + src_dst[j] = 0; + blurred_dst[j] = 0; + } else { + src_dst[j] = frame_src_buf[j]; + blurred_dst[j] = frame_blurred_buf[j]; + } + } + frame_src_buf += source->y_stride; + frame_blurred_buf += blurred.y_stride; + src_dst += source_block.y_stride; + blurred_dst += blurred_block.y_stride; + } + } + + best_unsharp_amounts[index] = find_best_frame_unsharp_amount( + cpi, &source_block, &blurred_block, best_frame_unsharp_amount, 0.1, 3, + 1.5); + } + } + + // Apply best blur amounts + for (int row = 0; row < num_rows; ++row) { + for (int col = 0; col < num_cols; ++col) { + const int row_offset_y = row * block_h; + const int col_offset_y = col * block_w; + const int block_width = AOMMIN(source->y_width - col_offset_y, block_w); + const int block_height = AOMMIN(source->y_height - row_offset_y, block_h); + const int index = col + row * num_cols; + + if (cm->seq_params->use_highbitdepth) { + assert(source->flags & YV12_FLAG_HIGHBITDEPTH); + assert(blurred.flags & YV12_FLAG_HIGHBITDEPTH); + uint16_t *src_buf = CONVERT_TO_SHORTPTR(source->y_buffer) + + row_offset_y * source->y_stride + col_offset_y; + uint16_t *blurred_buf = CONVERT_TO_SHORTPTR(blurred.y_buffer) + + row_offset_y * blurred.y_stride + col_offset_y; + highbd_unsharp_rect(src_buf, source->y_stride, blurred_buf, + blurred.y_stride, src_buf, source->y_stride, + block_width, block_height, + best_unsharp_amounts[index], bit_depth); + } else { + uint8_t *src_buf = + source->y_buffer + row_offset_y * source->y_stride + col_offset_y; + uint8_t *blurred_buf = + blurred.y_buffer + row_offset_y * blurred.y_stride + col_offset_y; + unsharp_rect(src_buf, source->y_stride, blurred_buf, blurred.y_stride, + src_buf, source->y_stride, block_width, block_height, + best_unsharp_amounts[index]); + } + } + } + + aom_free_frame_buffer(&source_block); + 
aom_free_frame_buffer(&blurred_block); + aom_free_frame_buffer(&blurred); + aom_free(best_unsharp_amounts); +} + +void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + const int y_width = cpi->source->y_width; + const int y_height = cpi->source->y_height; + const int resized_block_size = BLOCK_32X32; + const int resize_factor = 2; + const int bit_depth = cpi->td.mb.e_mbd.bd; + const int ss_x = cpi->source->subsampling_x; + const int ss_y = cpi->source->subsampling_y; + + YV12_BUFFER_CONFIG resized_source; + memset(&resized_source, 0, sizeof(resized_source)); + aom_alloc_frame_buffer( + &resized_source, y_width / resize_factor, y_height / resize_factor, ss_x, + ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, + cm->features.byte_alignment, 0, 0); + if (!av1_resize_and_extend_frame_nonnormative( + cpi->source, &resized_source, bit_depth, av1_num_planes(cm))) { + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Error allocating buffers during resize"); + } + + const int resized_y_width = resized_source.y_width; + const int resized_y_height = resized_source.y_height; + const int resized_block_w = mi_size_wide[resized_block_size] * 4; + const int resized_block_h = mi_size_high[resized_block_size] * 4; + const int num_cols = + (resized_y_width + resized_block_w - 1) / resized_block_w; + const int num_rows = + (resized_y_height + resized_block_h - 1) / resized_block_h; + + YV12_BUFFER_CONFIG blurred; + memset(&blurred, 0, sizeof(blurred)); + aom_alloc_frame_buffer(&blurred, resized_y_width, resized_y_height, ss_x, + ss_y, cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment, 0, 0); + gaussian_blur(bit_depth, &resized_source, &blurred); + + YV12_BUFFER_CONFIG recon; + memset(&recon, 0, sizeof(recon)); + aom_alloc_frame_buffer(&recon, resized_y_width, resized_y_height, ss_x, ss_y, + cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment, 0, 0); + aom_yv12_copy_frame(&resized_source, &recon, 1); + + VmafContext *vmaf_context; + const bool cal_vmaf_neg = + cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN; + aom_init_vmaf_context(&vmaf_context, cpi->vmaf_info.vmaf_model, cal_vmaf_neg); + unsigned int *sses = aom_calloc(num_rows * num_cols, sizeof(*sses)); + if (!sses) { + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Error allocating vmaf data"); + } + + // Loop through each 'block_size' block. 
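+  // For each block: record the SSE between the resized source and its
+  // blurred version, substitute only that block's blurred pixels into the
+  // recon frame, queue the (source, recon) pair for VMAF, then restore the
+  // block. The per-block VMAF drop measured this way drives the rdmult
+  // scaling factors computed in the second loop below.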
+ for (int row = 0; row < num_rows; ++row) { + for (int col = 0; col < num_cols; ++col) { + const int index = row * num_cols + col; + const int row_offset_y = row * resized_block_h; + const int col_offset_y = col * resized_block_w; + + uint8_t *const orig_buf = resized_source.y_buffer + + row_offset_y * resized_source.y_stride + + col_offset_y; + uint8_t *const blurred_buf = + blurred.y_buffer + row_offset_y * blurred.y_stride + col_offset_y; + + cpi->ppi->fn_ptr[resized_block_size].vf(orig_buf, resized_source.y_stride, + blurred_buf, blurred.y_stride, + &sses[index]); + + uint8_t *const recon_buf = + recon.y_buffer + row_offset_y * recon.y_stride + col_offset_y; + // Set recon buf + if (cpi->common.seq_params->use_highbitdepth) { + highbd_unsharp_rect(CONVERT_TO_SHORTPTR(blurred_buf), blurred.y_stride, + CONVERT_TO_SHORTPTR(blurred_buf), blurred.y_stride, + CONVERT_TO_SHORTPTR(recon_buf), recon.y_stride, + resized_block_w, resized_block_h, 0.0, bit_depth); + } else { + unsharp_rect(blurred_buf, blurred.y_stride, blurred_buf, + blurred.y_stride, recon_buf, recon.y_stride, + resized_block_w, resized_block_h, 0.0); + } + + aom_read_vmaf_image(vmaf_context, &resized_source, &recon, bit_depth, + index); + + // Restore recon buf + if (cpi->common.seq_params->use_highbitdepth) { + highbd_unsharp_rect( + CONVERT_TO_SHORTPTR(orig_buf), resized_source.y_stride, + CONVERT_TO_SHORTPTR(orig_buf), resized_source.y_stride, + CONVERT_TO_SHORTPTR(recon_buf), recon.y_stride, resized_block_w, + resized_block_h, 0.0, bit_depth); + } else { + unsharp_rect(orig_buf, resized_source.y_stride, orig_buf, + resized_source.y_stride, recon_buf, recon.y_stride, + resized_block_w, resized_block_h, 0.0); + } + } + } + aom_flush_vmaf_context(vmaf_context); + for (int row = 0; row < num_rows; ++row) { + for (int col = 0; col < num_cols; ++col) { + const int index = row * num_cols + col; + const double vmaf = aom_calc_vmaf_at_index( + vmaf_context, cpi->vmaf_info.vmaf_model, index); + const double dvmaf = kBaselineVmaf - vmaf; + + const double mse = + (double)sses[index] / (double)(resized_y_width * resized_y_height); + double weight; + const double eps = 0.01 / (num_rows * num_cols); + if (dvmaf < eps || mse < eps) { + weight = 1.0; + } else { + weight = mse / dvmaf; + } + + // Normalize it with a data fitted model. 
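+      // The fitted curve maps weight = mse / dvmaf into the range
+      // (0.8, 6.8): it evaluates to 0.8 at weight == 0 and saturates toward
+      // 6.8 as weight grows, so blocks whose distortion barely registers in
+      // VMAF receive the largest rdmult scale.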
+ weight = 6.0 * (1.0 - exp(-0.05 * weight)) + 0.8; + cpi->vmaf_info.rdmult_scaling_factors[index] = weight; + } + } + + aom_free_frame_buffer(&resized_source); + aom_free_frame_buffer(&blurred); + aom_close_vmaf_context(vmaf_context); + aom_free(sses); +} + +void av1_set_vmaf_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, + const BLOCK_SIZE bsize, const int mi_row, + const int mi_col, int *const rdmult) { + const AV1_COMMON *const cm = &cpi->common; + + const int bsize_base = BLOCK_64X64; + const int num_mi_w = mi_size_wide[bsize_base]; + const int num_mi_h = mi_size_high[bsize_base]; + const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w; + const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h; + const int num_bcols = (mi_size_wide[bsize] + num_mi_w - 1) / num_mi_w; + const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h; + int row, col; + double num_of_mi = 0.0; + double geom_mean_of_scale = 0.0; + + for (row = mi_row / num_mi_w; + row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) { + for (col = mi_col / num_mi_h; + col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) { + const int index = row * num_cols + col; + geom_mean_of_scale += log(cpi->vmaf_info.rdmult_scaling_factors[index]); + num_of_mi += 1.0; + } + } + geom_mean_of_scale = exp(geom_mean_of_scale / num_of_mi); + + *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale + 0.5); + *rdmult = AOMMAX(*rdmult, 0); + av1_set_error_per_bit(&x->errorperbit, *rdmult); +} + +// TODO(sdeng): replace them with the SIMD versions. +static AOM_INLINE double highbd_image_sad_c(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int w, int h) { + double accum = 0.0; + int i, j; + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + double img1px = src[i * src_stride + j]; + double img2px = ref[i * ref_stride + j]; + + accum += fabs(img1px - img2px); + } + } + + return accum / (double)(h * w); +} + +static AOM_INLINE double image_sad_c(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int w, + int h) { + double accum = 0.0; + int i, j; + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + double img1px = src[i * src_stride + j]; + double img2px = ref[i * ref_stride + j]; + + accum += fabs(img1px - img2px); + } + } + + return accum / (double)(h * w); +} + +static double calc_vmaf_motion_score(const AV1_COMP *const cpi, + const AV1_COMMON *const cm, + const YV12_BUFFER_CONFIG *const cur, + const YV12_BUFFER_CONFIG *const last, + const YV12_BUFFER_CONFIG *const next) { + const int y_width = cur->y_width; + const int y_height = cur->y_height; + YV12_BUFFER_CONFIG blurred_cur, blurred_last, blurred_next; + const int bit_depth = cpi->td.mb.e_mbd.bd; + const int ss_x = cur->subsampling_x; + const int ss_y = cur->subsampling_y; + + memset(&blurred_cur, 0, sizeof(blurred_cur)); + memset(&blurred_last, 0, sizeof(blurred_last)); + memset(&blurred_next, 0, sizeof(blurred_next)); + + aom_alloc_frame_buffer(&blurred_cur, y_width, y_height, ss_x, ss_y, + cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment, 0, 0); + aom_alloc_frame_buffer(&blurred_last, y_width, y_height, ss_x, ss_y, + cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment, 0, 0); + aom_alloc_frame_buffer(&blurred_next, y_width, y_height, ss_x, ss_y, + cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment, 0, 0); + + 
gaussian_blur(bit_depth, cur, &blurred_cur); + gaussian_blur(bit_depth, last, &blurred_last); + if (next) gaussian_blur(bit_depth, next, &blurred_next); + + double motion1, motion2 = 65536.0; + if (cm->seq_params->use_highbitdepth) { + assert(blurred_cur.flags & YV12_FLAG_HIGHBITDEPTH); + assert(blurred_last.flags & YV12_FLAG_HIGHBITDEPTH); + const float scale_factor = 1.0f / (float)(1 << (bit_depth - 8)); + motion1 = highbd_image_sad_c(CONVERT_TO_SHORTPTR(blurred_cur.y_buffer), + blurred_cur.y_stride, + CONVERT_TO_SHORTPTR(blurred_last.y_buffer), + blurred_last.y_stride, y_width, y_height) * + scale_factor; + if (next) { + assert(blurred_next.flags & YV12_FLAG_HIGHBITDEPTH); + motion2 = highbd_image_sad_c(CONVERT_TO_SHORTPTR(blurred_cur.y_buffer), + blurred_cur.y_stride, + CONVERT_TO_SHORTPTR(blurred_next.y_buffer), + blurred_next.y_stride, y_width, y_height) * + scale_factor; + } + } else { + motion1 = image_sad_c(blurred_cur.y_buffer, blurred_cur.y_stride, + blurred_last.y_buffer, blurred_last.y_stride, y_width, + y_height); + if (next) { + motion2 = image_sad_c(blurred_cur.y_buffer, blurred_cur.y_stride, + blurred_next.y_buffer, blurred_next.y_stride, + y_width, y_height); + } + } + + aom_free_frame_buffer(&blurred_cur); + aom_free_frame_buffer(&blurred_last); + aom_free_frame_buffer(&blurred_next); + + return AOMMIN(motion1, motion2); +} + +static AOM_INLINE void get_neighbor_frames(const AV1_COMP *const cpi, + YV12_BUFFER_CONFIG **last, + YV12_BUFFER_CONFIG **next) { + const AV1_COMMON *const cm = &cpi->common; + const GF_GROUP *gf_group = &cpi->ppi->gf_group; + const int src_index = + cm->show_frame != 0 ? 0 : gf_group->arf_src_offset[cpi->gf_frame_index]; + struct lookahead_entry *last_entry = av1_lookahead_peek( + cpi->ppi->lookahead, src_index - 1, cpi->compressor_stage); + struct lookahead_entry *next_entry = av1_lookahead_peek( + cpi->ppi->lookahead, src_index + 1, cpi->compressor_stage); + *next = &next_entry->img; + *last = cm->show_frame ? cpi->last_source : &last_entry->img; +} + +// Calculates the new qindex from the VMAF motion score. This is based on the +// observation: when the motion score becomes higher, the VMAF score of the +// same source and distorted frames would become higher. 
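+// A sketch of the model applied below: the motion score (the mean SAD
+// between the blurred current frame and its blurred neighbors, computed
+// above) predicts dvmaf = 26.11 * (1 - exp(-0.06 * motion)); that is
+// converted into an SSE delta using the previous frame's SSE-to-VMAF ratio,
+// and beta = approx_sse / (dsse + approx_sse), clamped to >= 0.5, is mapped
+// to a qindex offset by av1_get_deltaq_offset().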
+int av1_get_vmaf_base_qindex(const AV1_COMP *const cpi, int current_qindex) { + const AV1_COMMON *const cm = &cpi->common; + if (cm->current_frame.frame_number == 0 || cpi->oxcf.pass == 1) { + return current_qindex; + } + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const int layer_depth = + AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1); + const double last_frame_ysse = + get_layer_value(cpi->vmaf_info.last_frame_ysse, layer_depth); + const double last_frame_vmaf = + get_layer_value(cpi->vmaf_info.last_frame_vmaf, layer_depth); + const int bit_depth = cpi->td.mb.e_mbd.bd; + const double approx_sse = last_frame_ysse / (double)((1 << (bit_depth - 8)) * + (1 << (bit_depth - 8))); + const double approx_dvmaf = kBaselineVmaf - last_frame_vmaf; + const double sse_threshold = + 0.01 * cpi->source->y_width * cpi->source->y_height; + const double vmaf_threshold = 0.01; + if (approx_sse < sse_threshold || approx_dvmaf < vmaf_threshold) { + return current_qindex; + } + YV12_BUFFER_CONFIG *cur_buf = cpi->source; + if (cm->show_frame == 0) { + const int src_index = gf_group->arf_src_offset[cpi->gf_frame_index]; + struct lookahead_entry *cur_entry = av1_lookahead_peek( + cpi->ppi->lookahead, src_index, cpi->compressor_stage); + cur_buf = &cur_entry->img; + } + assert(cur_buf); + + YV12_BUFFER_CONFIG *next_buf, *last_buf; + get_neighbor_frames(cpi, &last_buf, &next_buf); + assert(last_buf); + + const double motion = + calc_vmaf_motion_score(cpi, cm, cur_buf, last_buf, next_buf); + + // Get dVMAF through a data fitted model. + const double dvmaf = 26.11 * (1.0 - exp(-0.06 * motion)); + const double dsse = dvmaf * approx_sse / approx_dvmaf; + + // Clamping beta to address VQ issue (aomedia:3170). + const double beta = AOMMAX(approx_sse / (dsse + approx_sse), 0.5); + const int offset = + av1_get_deltaq_offset(cm->seq_params->bit_depth, current_qindex, beta); + int qindex = current_qindex + offset; + + qindex = AOMMIN(qindex, MAXQ); + qindex = AOMMAX(qindex, MINQ); + + return qindex; +} + +static AOM_INLINE double cal_approx_score( + AV1_COMP *const cpi, double src_variance, double new_variance, + double src_score, YV12_BUFFER_CONFIG *const src, + YV12_BUFFER_CONFIG *const recon_sharpened) { + double score; + const uint32_t bit_depth = cpi->td.mb.e_mbd.bd; + const bool cal_vmaf_neg = + cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN; + aom_calc_vmaf(cpi->vmaf_info.vmaf_model, src, recon_sharpened, bit_depth, + cal_vmaf_neg, &score); + return src_variance / new_variance * (score - src_score); +} + +static double find_best_frame_unsharp_amount_loop_neg( + AV1_COMP *const cpi, double src_variance, double base_score, + YV12_BUFFER_CONFIG *const src, YV12_BUFFER_CONFIG *const recon, + YV12_BUFFER_CONFIG *const ref, YV12_BUFFER_CONFIG *const src_blurred, + YV12_BUFFER_CONFIG *const recon_blurred, + YV12_BUFFER_CONFIG *const src_sharpened, + YV12_BUFFER_CONFIG *const recon_sharpened, FULLPEL_MV *mvs, + double best_score, const double unsharp_amount_start, + const double step_size, const int max_loop_count, const double max_amount) { + const double min_amount = 0.0; + int loop_count = 0; + double approx_score = best_score; + double unsharp_amount = unsharp_amount_start; + + do { + best_score = approx_score; + unsharp_amount += step_size; + if (unsharp_amount > max_amount || unsharp_amount < min_amount) break; + unsharp(cpi, recon, recon_blurred, recon_sharpened, unsharp_amount); + unsharp(cpi, src, src_blurred, src_sharpened, unsharp_amount); + const double new_variance = 
+ residual_frame_average_variance(cpi, src_sharpened, ref, mvs); + approx_score = cal_approx_score(cpi, src_variance, new_variance, base_score, + src, recon_sharpened); + + loop_count++; + } while (approx_score > best_score && loop_count < max_loop_count); + unsharp_amount = + approx_score > best_score ? unsharp_amount : unsharp_amount - step_size; + + return AOMMIN(max_amount, AOMMAX(unsharp_amount, min_amount)); +} + +static double find_best_frame_unsharp_amount_neg( + AV1_COMP *const cpi, YV12_BUFFER_CONFIG *const src, + YV12_BUFFER_CONFIG *const recon, YV12_BUFFER_CONFIG *const ref, + double base_score, const double unsharp_amount_start, + const double step_size, const int max_loop_count, + const double max_filter_amount) { + FULLPEL_MV *mvs = NULL; + const double src_variance = + residual_frame_average_variance(cpi, src, ref, mvs); + + const AV1_COMMON *const cm = &cpi->common; + const int width = recon->y_width; + const int height = recon->y_height; + const int bit_depth = cpi->td.mb.e_mbd.bd; + const int ss_x = recon->subsampling_x; + const int ss_y = recon->subsampling_y; + + YV12_BUFFER_CONFIG src_blurred, recon_blurred, src_sharpened, recon_sharpened; + memset(&recon_sharpened, 0, sizeof(recon_sharpened)); + memset(&src_sharpened, 0, sizeof(src_sharpened)); + memset(&recon_blurred, 0, sizeof(recon_blurred)); + memset(&src_blurred, 0, sizeof(src_blurred)); + aom_alloc_frame_buffer(&recon_sharpened, width, height, ss_x, ss_y, + cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment, 0, 0); + aom_alloc_frame_buffer(&src_sharpened, width, height, ss_x, ss_y, + cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment, 0, 0); + aom_alloc_frame_buffer(&recon_blurred, width, height, ss_x, ss_y, + cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment, 0, 0); + aom_alloc_frame_buffer( + &src_blurred, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0); + + gaussian_blur(bit_depth, recon, &recon_blurred); + gaussian_blur(bit_depth, src, &src_blurred); + + unsharp(cpi, recon, &recon_blurred, &recon_sharpened, unsharp_amount_start); + unsharp(cpi, src, &src_blurred, &src_sharpened, unsharp_amount_start); + const double variance_start = + residual_frame_average_variance(cpi, &src_sharpened, ref, mvs); + const double score_start = cal_approx_score( + cpi, src_variance, variance_start, base_score, src, &recon_sharpened); + + const double unsharp_amount_next = unsharp_amount_start + step_size; + unsharp(cpi, recon, &recon_blurred, &recon_sharpened, unsharp_amount_next); + unsharp(cpi, src, &src_blurred, &src_sharpened, unsharp_amount_next); + const double variance_next = + residual_frame_average_variance(cpi, &src_sharpened, ref, mvs); + const double score_next = cal_approx_score(cpi, src_variance, variance_next, + base_score, src, &recon_sharpened); + + double unsharp_amount; + if (score_next > score_start) { + unsharp_amount = find_best_frame_unsharp_amount_loop_neg( + cpi, src_variance, base_score, src, recon, ref, &src_blurred, + &recon_blurred, &src_sharpened, &recon_sharpened, mvs, score_next, + unsharp_amount_next, step_size, max_loop_count, max_filter_amount); + } else { + unsharp_amount = find_best_frame_unsharp_amount_loop_neg( + cpi, src_variance, base_score, src, recon, ref, &src_blurred, + &recon_blurred, &src_sharpened, &recon_sharpened, mvs, score_start, + unsharp_amount_start, -step_size, 
max_loop_count, max_filter_amount);
+  }
+
+  aom_free_frame_buffer(&recon_sharpened);
+  aom_free_frame_buffer(&src_sharpened);
+  aom_free_frame_buffer(&recon_blurred);
+  aom_free_frame_buffer(&src_blurred);
+  aom_free(mvs);
+  return unsharp_amount;
+}
+
+void av1_update_vmaf_curve(AV1_COMP *cpi) {
+  YV12_BUFFER_CONFIG *source = cpi->source;
+  YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf;
+  const int bit_depth = cpi->td.mb.e_mbd.bd;
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  const int layer_depth =
+      AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1);
+  double base_score;
+  const bool cal_vmaf_neg =
+      cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN;
+  aom_calc_vmaf(cpi->vmaf_info.vmaf_model, source, recon, bit_depth,
+                cal_vmaf_neg, &base_score);
+  cpi->vmaf_info.last_frame_vmaf[layer_depth] = base_score;
+  if (cpi->common.seq_params->use_highbitdepth) {
+    assert(source->flags & YV12_FLAG_HIGHBITDEPTH);
+    assert(recon->flags & YV12_FLAG_HIGHBITDEPTH);
+    cpi->vmaf_info.last_frame_ysse[layer_depth] =
+        (double)aom_highbd_get_y_sse(source, recon);
+  } else {
+    cpi->vmaf_info.last_frame_ysse[layer_depth] =
+        (double)aom_get_y_sse(source, recon);
+  }
+
+  if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) {
+    YV12_BUFFER_CONFIG *last, *next;
+    get_neighbor_frames(cpi, &last, &next);
+    double best_unsharp_amount_start =
+        get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth);
+    const int max_loop_count = 5;
+    cpi->vmaf_info.last_frame_unsharp_amount[layer_depth] =
+        find_best_frame_unsharp_amount_neg(cpi, source, recon, last, base_score,
+                                           best_unsharp_amount_start, 0.025,
+                                           max_loop_count, 1.01);
+  }
+}
diff --git a/third_party/aom/av1/encoder/tune_vmaf.h b/third_party/aom/av1/encoder/tune_vmaf.h
new file mode 100644
index 0000000000..a04a29e6fe
--- /dev/null
+++ b/third_party/aom/av1/encoder/tune_vmaf.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TUNE_VMAF_H_
+#define AOM_AV1_ENCODER_TUNE_VMAF_H_
+
+#include "aom_dsp/vmaf.h"
+#include "aom_scale/yv12config.h"
+#include "av1/common/enums.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/block.h"
+
+typedef struct {
+  // Stores the scaling factors for rdmult when tuning for VMAF.
+  // rdmult_scaling_factors[row * num_cols + col] stores the scaling factors for
+  // 64x64 block at (row, col).
+  double *rdmult_scaling_factors;
+
+  // Stores the luma sse of the last frame.
+  double last_frame_ysse[MAX_ARF_LAYERS];
+
+  // Stores the VMAF of the last frame.
+  double last_frame_vmaf[MAX_ARF_LAYERS];
+
+  // Stores the filter strength of the last frame.
+  double last_frame_unsharp_amount[MAX_ARF_LAYERS];
+
+  // Stores the original qindex before scaling.
+  int original_qindex;
+
+  // VMAF model used in VMAF calculations.
+ VmafModel *vmaf_model; +} TuneVMAFInfo; + +struct AV1_COMP; + +void av1_vmaf_blk_preprocessing(struct AV1_COMP *cpi, + YV12_BUFFER_CONFIG *source); + +void av1_vmaf_frame_preprocessing(struct AV1_COMP *cpi, + YV12_BUFFER_CONFIG *source); + +void av1_vmaf_neg_preprocessing(struct AV1_COMP *cpi, + YV12_BUFFER_CONFIG *source); + +void av1_set_mb_vmaf_rdmult_scaling(struct AV1_COMP *cpi); + +void av1_set_vmaf_rdmult(const struct AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_row, int mi_col, int *rdmult); + +int av1_get_vmaf_base_qindex(const struct AV1_COMP *cpi, int current_qindex); + +void av1_update_vmaf_curve(struct AV1_COMP *cpi); + +#endif // AOM_AV1_ENCODER_TUNE_VMAF_H_ diff --git a/third_party/aom/av1/encoder/tx_prune_model_weights.h b/third_party/aom/av1/encoder/tx_prune_model_weights.h new file mode 100644 index 0000000000..aab5e1398d --- /dev/null +++ b/third_party/aom/av1/encoder/tx_prune_model_weights.h @@ -0,0 +1,3422 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*! \file + * Contains the details of the ML models used for pruning transform size. This + * file is only included by av1/encoder/tx_search.c. + */ +#ifndef AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_ +#define AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/ml.h" + +/***************************CONFIG_NN_V2 (New)********************************/ +#if CONFIG_NN_V2 +// Tx type model for 4x4 block. 
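+// (All models in this file share one shape: a RELU hidden layer followed by
+// a linear layer producing 4 logits, apparently one per 1D transform
+// candidate, trained with softmax cross-entropy. Only the layer sizes and
+// the trained weights differ across block sizes and directions.)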
+static float av1_tx_type_nn_4x4_hor_layer0_weights[32] = { + -1.64947f, -1.54497f, -1.62832f, -0.17774f, -2.89498f, -0.72498f, 0.72036f, + 0.17996f, 1.20000f, -0.27654f, 0.77396f, 1.21684f, -1.75909f, -0.51272f, + -1.25923f, 0.35005f, -0.04257f, -0.23389f, -0.41841f, -0.08229f, 0.09503f, + 2.73144f, -0.16875f, -0.23482f, 0.02194f, -0.26427f, 0.28049f, 0.21260f, + 1.35792f, 0.27733f, 0.88660f, -0.68304f, +}; + +static float av1_tx_type_nn_4x4_hor_layer0_bias[8] = { + 1.38742f, 0.59540f, -1.37622f, 1.92114f, + 0.00000f, -0.38998f, -0.32726f, -0.15650f, +}; + +static float av1_tx_type_nn_4x4_hor_layer1_weights[32] = { + 1.65254f, 1.00915f, -0.89318f, -2.05142f, -0.23235f, 0.96781f, -0.37145f, + -0.21056f, 1.13891f, 0.38675f, 0.87739f, -1.42697f, 0.48015f, 0.61883f, + -0.03979f, 0.11487f, 0.48042f, 0.45200f, -0.23242f, 0.75166f, 0.55458f, + 0.39452f, -0.35285f, 1.59120f, -1.49221f, -0.48349f, -0.64692f, 1.49297f, + -0.26782f, -0.65416f, -0.10648f, 0.05568f, +}; + +static float av1_tx_type_nn_4x4_hor_layer1_bias[4] = { + 4.07177f, + 3.26961f, + 0.58083f, + 1.21199f, +}; + +static float av1_tx_type_nn_4x4_hor_layer0_out[8] = { 0 }; +static float av1_tx_type_nn_4x4_hor_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_4x4_hor = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 4, // num_inputs + 8, // num_outputs + av1_tx_type_nn_4x4_hor_layer0_weights, // weights + av1_tx_type_nn_4x4_hor_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_4x4_hor_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 8, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_4x4_hor_layer1_weights, + av1_tx_type_nn_4x4_hor_layer1_bias, + NONE, + av1_tx_type_nn_4x4_hor_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_4x4_hor_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; + +static float av1_tx_type_nn_4x4_ver_layer0_weights[32] = { + -0.02032f, 2.61610f, 0.02098f, -0.30217f, 0.12637f, 0.11017f, -3.01996f, + 0.35144f, 1.93776f, -0.20463f, 1.64102f, -1.41986f, -3.66717f, -0.51655f, + 0.43910f, 0.37778f, -1.02634f, 0.85337f, -0.69753f, 1.00206f, 2.11784f, + 1.89427f, 1.92919f, 0.43201f, -1.67358f, -1.67035f, -1.54623f, 0.16714f, + -0.06589f, -0.28142f, -0.33118f, 1.72227f, +}; + +static float av1_tx_type_nn_4x4_ver_layer0_bias[8] = { + -0.33685f, 0.22025f, 0.28140f, 0.56138f, + 0.93489f, -1.77048f, 1.34989f, -0.93747f, +}; + +static float av1_tx_type_nn_4x4_ver_layer1_weights[32] = { + -1.39506f, -1.06271f, -1.10886f, -1.69719f, 0.19699f, -2.39850f, -1.26457f, + 0.75328f, -1.26005f, -0.82738f, -0.12015f, -1.02702f, 1.40828f, -2.37739f, + -0.65639f, -0.71992f, -0.90453f, -1.12510f, -2.41362f, -1.16061f, -1.85577f, + -0.99165f, -1.91366f, 0.16785f, 0.34776f, 0.58154f, -0.18217f, -0.29257f, + -0.86315f, -0.53336f, 0.30320f, -1.32331f, +}; + +static float av1_tx_type_nn_4x4_ver_layer1_bias[4] = { + -1.31519f, + -3.26321f, + 1.71794f, + -1.90778f, +}; + +static float av1_tx_type_nn_4x4_ver_layer0_out[8] = { 0 }; +static float av1_tx_type_nn_4x4_ver_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_4x4_ver = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 4, // num_inputs + 8, // num_outputs + av1_tx_type_nn_4x4_ver_layer0_weights, // weights + av1_tx_type_nn_4x4_ver_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_4x4_ver_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 8, // num_inputs (!!same as num_outputs of last layer) + 
4, + av1_tx_type_nn_4x4_ver_layer1_weights, + av1_tx_type_nn_4x4_ver_layer1_bias, + NONE, + av1_tx_type_nn_4x4_ver_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_4x4_ver_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; +/******************************************************************************/ + +// Tx type model for 4x8 block. +static float av1_tx_type_nn_4x8_hor_layer0_weights[32] = { + 0.00218f, -0.41880f, -0.61215f, -0.92588f, 0.54291f, -0.10898f, 0.70691f, + 0.46819f, -1.61598f, -0.08834f, -0.96839f, 1.18489f, -0.45171f, -0.65445f, + -0.32179f, -0.10399f, 1.04379f, 0.91895f, 0.85589f, 0.08267f, 1.35388f, + -2.03096f, 0.08168f, -0.06372f, -0.26732f, -0.48262f, -0.08682f, 2.44071f, + -1.35896f, -1.17121f, 1.68866f, 0.10357f, +}; + +static float av1_tx_type_nn_4x8_hor_layer0_bias[8] = { + 2.93391f, 0.66831f, -0.21419f, 0.00000f, + -0.72878f, 0.15127f, -1.46755f, 0.16658f, +}; + +static float av1_tx_type_nn_4x8_hor_layer1_weights[32] = { + -1.52077f, -1.06243f, 0.35319f, -0.49207f, 0.54524f, 0.44271f, 1.37117f, + -0.38957f, -1.28889f, -0.57133f, 0.04658f, 0.62278f, 0.37984f, 0.33247f, + 1.65547f, -0.56806f, -1.38645f, -0.76258f, 0.67926f, 0.08783f, -0.01443f, + 0.34950f, 1.45812f, -0.51332f, -1.41331f, -0.16453f, 0.05755f, 0.31405f, + -0.50191f, 0.18219f, 1.83664f, -0.75276f, +}; + +static float av1_tx_type_nn_4x8_hor_layer1_bias[4] = { + -1.17455f, + -2.26089f, + -1.79863f, + -2.26333f, +}; + +static float av1_tx_type_nn_4x8_hor_layer0_out[8] = { 0 }; +static float av1_tx_type_nn_4x8_hor_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_4x8_hor = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 4, // num_inputs + 8, // num_outputs + av1_tx_type_nn_4x8_hor_layer0_weights, // weights + av1_tx_type_nn_4x8_hor_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_4x8_hor_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 8, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_4x8_hor_layer1_weights, + av1_tx_type_nn_4x8_hor_layer1_bias, + NONE, + av1_tx_type_nn_4x8_hor_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_4x8_hor_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; + +static float av1_tx_type_nn_4x8_ver_layer0_weights[128] = { + -0.00952f, -0.98858f, -0.93181f, 1.39594f, 0.96559f, 0.18162f, -0.76064f, + -0.06066f, 0.07907f, -0.09365f, -0.21313f, -0.02187f, -2.61707f, -2.68702f, + -0.10982f, 0.18559f, 1.17049f, 1.11387f, 1.12697f, 1.05804f, 1.12764f, + 1.06318f, 1.12052f, 0.17406f, 1.83157f, 0.19362f, 0.46910f, 0.39608f, + 0.33342f, 0.40083f, 0.27645f, 1.06864f, -4.06645f, -0.38775f, -0.11070f, + 0.03781f, -0.09141f, 0.06185f, -0.04852f, 0.20163f, 0.16784f, 0.16641f, + -0.50941f, -0.61087f, 2.07008f, -0.82381f, -0.85558f, 0.05528f, -0.10535f, + -2.81150f, 0.67038f, 0.43643f, 0.49062f, -0.04465f, 0.90438f, 0.00977f, + 0.46272f, 1.59751f, 0.95234f, 0.35086f, 0.85624f, 0.73149f, 1.67779f, + -2.21511f, -1.24746f, -1.09014f, -0.92441f, -1.22591f, -1.06961f, -0.95897f, + -1.24956f, 0.73797f, 1.23275f, -0.60064f, -0.07851f, 0.14397f, 0.22110f, + -0.04422f, 0.14350f, 0.75926f, 0.35032f, 0.48104f, 2.81408f, 0.34662f, + 0.42090f, 0.35521f, -1.36804f, -0.14974f, -0.47696f, -0.07892f, 0.36910f, + 0.32299f, 0.23916f, 0.06032f, -0.17844f, -0.17558f, -1.42746f, -0.55828f, + -1.00418f, -0.64823f, -0.73654f, -0.85197f, -1.50989f, 1.69385f, -0.04973f, + -0.09273f, 1.04249f, 0.79235f, 
1.13229f, 0.99617f, 0.03851f, 0.56334f, + 0.90795f, 1.08296f, 0.58519f, 1.74765f, 0.63971f, 1.35951f, 0.07803f, + -0.05127f, 0.26514f, -0.84629f, -0.66343f, -2.10630f, 0.11017f, 2.18528f, + -0.21958f, 0.05970f, +}; + +static float av1_tx_type_nn_4x8_ver_layer0_bias[16] = { + 0.04205f, 0.22260f, -1.03870f, -1.19568f, 0.44283f, 0.01143f, + 0.00235f, 4.26772f, 0.44364f, -0.33199f, -0.39076f, -0.35129f, + 0.08288f, 0.18195f, -0.79890f, 0.10047f, +}; + +static float av1_tx_type_nn_4x8_ver_layer1_weights[64] = { + -0.38193f, -0.12095f, 1.57802f, 0.34932f, -0.47333f, -0.12304f, -0.01736f, + -2.52445f, 0.18983f, -0.64707f, -0.60889f, -0.53750f, 0.91666f, -0.62823f, + -0.13377f, -0.43594f, -0.38618f, -0.01328f, 0.97457f, 1.48589f, -1.03238f, + -0.33459f, -0.35108f, -2.42417f, 0.60229f, 0.06824f, -0.75495f, 0.26902f, + 0.65311f, -0.23887f, -0.44604f, -0.55800f, -0.33842f, 0.04259f, -0.59589f, + 0.49738f, -0.62301f, -0.30896f, -0.29602f, -2.57052f, 2.00943f, -0.66490f, + -0.76312f, 0.28256f, 1.06311f, -0.38364f, -0.63508f, -0.57609f, -0.88765f, + -1.04403f, -0.46531f, 0.34084f, -1.20498f, -0.68352f, -0.72251f, -2.63242f, + -0.68736f, -0.37904f, -1.32371f, 0.47288f, 1.51904f, 0.78372f, -1.01830f, + -1.01848f, +}; + +static float av1_tx_type_nn_4x8_ver_layer1_bias[4] = { + -1.45955f, + -2.08949f, + -1.24813f, + -1.55368f, +}; + +static float av1_tx_type_nn_4x8_ver_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_4x8_ver_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_4x8_ver = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_4x8_ver_layer0_weights, // weights + av1_tx_type_nn_4x8_ver_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_4x8_ver_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_4x8_ver_layer1_weights, + av1_tx_type_nn_4x8_ver_layer1_bias, + NONE, + av1_tx_type_nn_4x8_ver_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_4x8_ver_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; + +/******************************************************************************/ + +// Tx type model for 8x4 block. 
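+// Note that num_inputs follows the dimension being scanned: the horizontal
+// 8x4 model reads 8 inputs (the block width) and the vertical one reads 4
+// (the block height), mirroring the 4x8 models above.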
+static float av1_tx_type_nn_8x4_hor_layer0_weights[128] = { + -0.22492f, 0.13341f, -4.03243f, -0.64015f, 0.02783f, 0.60466f, -0.13335f, + 0.16828f, 0.12336f, 0.52904f, 1.18455f, -0.32425f, 0.13052f, 0.93810f, + -3.71165f, 0.02990f, -4.63558f, 0.05666f, 0.03524f, -0.07449f, -0.44006f, + -0.33215f, -0.33713f, 0.08097f, 0.60873f, 0.29582f, 0.21696f, -0.78729f, + -0.16757f, -0.26567f, -0.00720f, -1.11226f, 1.58189f, 1.58463f, 1.48536f, + 1.54374f, 1.60069f, 1.46125f, 1.53932f, 0.05974f, -1.82192f, 0.47043f, + 0.38090f, 0.20833f, -0.05637f, 0.05183f, 0.01323f, -0.25662f, 0.78634f, + -0.55069f, -0.02975f, -1.29294f, -0.77192f, -2.34299f, -1.28074f, 0.77894f, + -1.69740f, -1.66032f, -1.44323f, -1.55063f, -1.50845f, -1.23690f, -1.80663f, + 0.75079f, 2.32551f, 0.05878f, 0.80438f, 0.88584f, 0.69153f, 0.89060f, + 0.73660f, 0.87259f, -0.00745f, -1.30044f, -0.59430f, 2.07270f, 1.03307f, + -0.84697f, -1.19393f, 0.17549f, -0.24978f, -3.67234f, 0.20781f, -0.53946f, + -0.05068f, 0.88274f, 1.30371f, 0.10288f, 0.07585f, 0.12259f, -0.30815f, + 0.25437f, -2.82096f, -2.69482f, 0.02370f, 0.12500f, -0.21019f, -0.49220f, + 0.03638f, -0.29795f, 0.28645f, -0.48432f, -0.38584f, -0.32148f, -0.47197f, + 0.32437f, 0.32528f, -0.19437f, 0.30383f, -0.31879f, 0.26359f, -0.12164f, + -0.43647f, -0.08288f, -0.33438f, -0.63608f, -0.46647f, -0.46574f, 0.47806f, + -0.49012f, -1.51234f, -1.13502f, -1.20470f, -1.02913f, -1.09182f, -0.93921f, + -1.85523f, 0.92532f, +}; + +static float av1_tx_type_nn_8x4_hor_layer0_bias[16] = { + 0.36631f, 0.02901f, 0.64305f, 1.53074f, -1.40229f, 0.03852f, + -0.05043f, 0.89632f, -1.23312f, 0.07036f, 0.17070f, 0.56250f, + -0.28958f, -0.32869f, -0.01704f, 0.68171f, +}; + +static float av1_tx_type_nn_8x4_hor_layer1_weights[64] = { + -0.49441f, -0.31960f, -0.84946f, -0.85800f, -2.37767f, 0.81373f, -0.73172f, + -0.69337f, 0.88807f, -0.49242f, -0.44717f, -0.11436f, 0.09978f, 0.15393f, + 0.17083f, 1.44850f, -0.20582f, -0.04906f, 0.42990f, -0.61939f, -1.09692f, + -1.14885f, -1.36879f, -1.30828f, -0.59558f, -0.30903f, -0.08906f, 0.06953f, + 0.15383f, -0.04193f, -0.54858f, 1.82676f, -0.22411f, 0.05264f, -0.45848f, + -0.72985f, 0.87553f, 0.04116f, -1.29774f, -2.63018f, 1.09089f, -0.36048f, + -0.16725f, 0.11627f, 0.49918f, 0.07539f, 0.00763f, 0.73706f, 0.87800f, + 0.57049f, 0.60969f, 1.02779f, 1.53339f, -0.35915f, 0.06410f, 1.44582f, + 0.09698f, 0.71888f, 0.60594f, 0.84103f, -0.50440f, -0.38825f, 0.15626f, + -1.10654f, +}; + +static float av1_tx_type_nn_8x4_hor_layer1_bias[4] = { + -0.92861f, + -1.45151f, + -1.33588f, + -4.33853f, +}; + +static float av1_tx_type_nn_8x4_hor_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_8x4_hor_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_8x4_hor = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_8x4_hor_layer0_weights, // weights + av1_tx_type_nn_8x4_hor_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_8x4_hor_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_8x4_hor_layer1_weights, + av1_tx_type_nn_8x4_hor_layer1_bias, + NONE, + av1_tx_type_nn_8x4_hor_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_8x4_hor_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; + +static float av1_tx_type_nn_8x4_ver_layer0_weights[32] = { + -1.10946f, 1.86574f, -1.59343f, 0.27018f, -1.70676f, -0.73982f, -0.19021f, + -1.94208f, 
-2.29759f, -1.44402f, 0.28700f, -1.18340f, -1.50158f, -0.44175f, + -1.36831f, 1.00374f, 2.59312f, 0.50291f, -0.71042f, -0.12238f, -0.15901f, + -0.22807f, -0.67376f, -0.30215f, 0.54407f, -0.45538f, 1.18262f, 2.28687f, + 1.66212f, 1.70826f, 1.55182f, 0.12230f, +}; + +static float av1_tx_type_nn_8x4_ver_layer0_bias[8] = { + 0.10943f, 2.09789f, 2.16578f, 0.15766f, + -0.42461f, 0.00000f, 1.22090f, -1.28717f, +}; + +static float av1_tx_type_nn_8x4_ver_layer1_weights[32] = { + 1.20426f, -1.23237f, 2.41053f, -0.72488f, 1.25249f, 0.18018f, -0.09586f, + 2.17901f, 0.15364f, 1.21535f, -0.38263f, -0.74309f, 0.50551f, -0.54208f, + 0.59139f, 1.16095f, 0.55919f, -0.60183f, 1.18949f, 1.60787f, 0.54002f, + -0.10712f, -0.16153f, 0.16207f, -0.32338f, 2.68712f, -2.83483f, -0.27086f, + -1.15005f, -0.39311f, 1.51236f, -1.68973f, +}; + +static float av1_tx_type_nn_8x4_ver_layer1_bias[4] = { + 1.81013f, + 1.10517f, + 2.90059f, + 0.95391f, +}; + +static float av1_tx_type_nn_8x4_ver_layer0_out[8] = { 0 }; +static float av1_tx_type_nn_8x4_ver_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_8x4_ver = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 4, // num_inputs + 8, // num_outputs + av1_tx_type_nn_8x4_ver_layer0_weights, // weights + av1_tx_type_nn_8x4_ver_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_8x4_ver_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 8, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_8x4_ver_layer1_weights, + av1_tx_type_nn_8x4_ver_layer1_bias, + NONE, + av1_tx_type_nn_8x4_ver_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_8x4_ver_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; +/******************************************************************************/ + +// Tx type model for 8x8 block. 
+static float av1_tx_type_nn_8x8_hor_layer0_weights[128] = { + -0.85529f, 0.37619f, 0.12754f, 0.08622f, 0.45278f, 0.54929f, 1.60651f, + -0.62654f, -0.54929f, -0.10131f, -0.17569f, 0.13948f, 0.31695f, -0.05616f, + 0.20483f, -0.36448f, 2.27203f, -0.33087f, 0.47679f, 0.86888f, 0.39370f, + 0.46239f, 0.01113f, 1.50327f, -1.48226f, -1.69621f, -1.49777f, -1.38885f, + -1.37753f, -1.22681f, -1.70576f, 0.51329f, -1.65662f, 1.74197f, -0.13579f, + -0.13133f, -0.58396f, -0.55510f, -1.10709f, -2.34975f, 0.22445f, -0.56491f, + -0.83432f, 0.13492f, 1.32147f, 2.85285f, 0.13819f, 0.03792f, -1.30792f, + 0.04155f, -0.70644f, -0.43430f, -0.16212f, -0.86945f, -1.16976f, 1.68339f, + 0.29540f, 0.01137f, -0.25335f, -0.16856f, 0.12028f, 0.05207f, 0.39357f, + -0.01545f, -0.21980f, -1.94091f, -1.01315f, -0.68270f, -0.40590f, -0.67111f, + 2.08283f, 0.19291f, -4.81426f, -0.65044f, -0.24598f, 0.06371f, -0.10272f, + -0.14502f, -0.06821f, 0.45202f, 0.21091f, -0.80864f, 0.39255f, 1.79189f, + 1.80453f, 1.10484f, 1.17608f, 0.96901f, -0.35871f, -0.94311f, 0.63147f, + 2.95157f, 0.45917f, -0.42849f, -0.55643f, -0.06097f, 3.49299f, -0.50972f, + 0.11075f, -0.08405f, -0.09274f, -0.22694f, -0.42426f, 0.48632f, -1.61074f, + 1.82998f, 0.37623f, -1.20330f, -0.01142f, -1.33307f, -0.27492f, -2.23621f, + 1.38846f, 1.42085f, 1.42568f, 1.36152f, 1.46910f, 1.27473f, 1.34752f, + 0.12753f, -1.08197f, -1.08280f, -0.79489f, -1.12338f, -1.06795f, -0.87857f, + -0.99892f, 1.09823f, +}; + +static float av1_tx_type_nn_8x8_hor_layer0_bias[16] = { + -0.49232f, -0.29685f, -1.44020f, 1.10940f, 1.16452f, -0.34862f, + -0.38761f, -0.36243f, 0.21776f, 0.28234f, 2.34269f, -0.04104f, + -0.26319f, 2.65579f, -1.30137f, -0.01487f, +}; + +static float av1_tx_type_nn_8x8_hor_layer1_weights[64] = { + -0.38058f, -0.41295f, -1.26884f, -0.75560f, -1.57450f, 0.56072f, -1.42322f, + -0.29106f, 0.07228f, 0.04391f, 1.61388f, -0.03055f, 0.81637f, 2.06045f, + 0.27119f, -0.48328f, -0.45528f, -0.60534f, -1.61209f, -0.78157f, -1.65034f, + 0.60958f, -1.30523f, 0.25143f, 0.11398f, 0.37860f, 1.54829f, 0.02309f, + 0.67288f, 2.11447f, 0.44845f, -0.70406f, -0.67897f, -0.38759f, -1.30383f, + -1.22646f, -1.54571f, 0.60552f, -1.52565f, 0.11469f, 0.17344f, 0.08622f, + 1.57906f, -0.00909f, 0.81634f, 2.04909f, 1.26466f, -1.45741f, -0.75229f, + 0.06200f, -1.05835f, -0.66257f, -1.73766f, 0.99923f, -1.87082f, 0.14580f, + 0.49525f, 0.46839f, 1.32203f, 0.33923f, 0.97001f, 2.38584f, 1.58811f, + 0.06161f, +}; + +static float av1_tx_type_nn_8x8_hor_layer1_bias[4] = { + 1.70385f, + 1.82373f, + 1.78496f, + 1.80826f, +}; + +static float av1_tx_type_nn_8x8_hor_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_8x8_hor_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_8x8_hor = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_8x8_hor_layer0_weights, // weights + av1_tx_type_nn_8x8_hor_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_8x8_hor_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_8x8_hor_layer1_weights, + av1_tx_type_nn_8x8_hor_layer1_bias, + NONE, + av1_tx_type_nn_8x8_hor_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_8x8_hor_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; + +static float av1_tx_type_nn_8x8_ver_layer0_weights[128] = { + -0.67016f, -1.72366f, -1.86576f, -1.50962f, -1.70419f, -1.73964f, -1.84615f, + 2.09681f, 
-0.05081f, -0.61030f, 2.02541f, 0.60222f, 0.99936f, 2.02114f, + -0.53893f, -0.23757f, 0.73566f, 0.25443f, 0.00132f, -0.74036f, -0.75351f, + -0.76964f, -1.71007f, -0.15770f, 1.60982f, 2.17638f, 0.90681f, 0.64973f, + 0.85914f, 0.58786f, -1.46228f, 0.05187f, 1.18804f, 0.30850f, 0.29512f, + 0.40526f, 0.37635f, 0.32311f, 0.37471f, 1.12346f, 3.41856f, -0.36653f, + 0.42537f, -0.19240f, 0.00155f, 0.30826f, -0.02116f, -0.53435f, -0.34829f, + -0.52466f, -0.11521f, -0.29163f, -2.05689f, -2.87372f, -0.62626f, 0.09585f, + -0.75257f, 0.10057f, 1.43474f, 0.89450f, 0.75900f, 1.11147f, 1.00558f, + 0.25886f, 2.22095f, -0.17926f, 0.57161f, 0.39546f, 0.47846f, 0.40452f, + 0.54298f, 0.45814f, -3.62788f, -3.02374f, 0.03716f, -0.13937f, -0.09415f, + -0.12463f, 0.05682f, 0.03672f, 1.20746f, 1.25003f, 1.27071f, 1.31883f, + 1.27473f, 1.34943f, 1.23158f, 0.09039f, 0.19388f, 0.63420f, 2.79612f, + 0.93803f, -0.11323f, -0.02027f, 0.41286f, -0.05979f, -3.80705f, -0.52451f, + -0.77098f, -0.68132f, -0.65559f, -0.60975f, -1.26165f, 0.25582f, 0.05346f, + 0.61403f, 0.32140f, -2.39831f, -1.42355f, 1.30541f, 1.02361f, 0.12930f, + -1.61469f, -0.77036f, -0.59144f, 1.27769f, 1.52068f, 0.82137f, 1.83159f, + -0.66626f, -0.69806f, -1.00564f, -0.85995f, -0.90889f, -0.84412f, -0.85712f, + -1.29848f, 0.39308f, +}; + +static float av1_tx_type_nn_8x8_ver_layer0_bias[16] = { + -0.14868f, -0.48343f, 3.94416f, -0.78037f, -1.33789f, -0.60611f, + 0.51793f, 0.44030f, -0.71563f, 0.22561f, -1.19083f, -0.46149f, + 0.83015f, 0.06024f, 1.17180f, 0.65122f, +}; + +static float av1_tx_type_nn_8x8_ver_layer1_weights[64] = { + -1.42711f, -0.21683f, 2.12061f, 0.20489f, -0.50228f, -0.24770f, 0.23391f, + 1.03470f, -0.44847f, -0.63225f, -0.21583f, -0.06467f, -0.21892f, -0.07786f, + 1.43322f, 0.00280f, -1.53057f, -0.18912f, 1.95333f, 0.31151f, -2.07601f, + 0.06776f, 0.25529f, 0.94800f, -1.11453f, -0.20594f, -0.13281f, 0.01485f, + 0.17650f, -0.07955f, 1.43734f, -0.23193f, -2.06463f, -0.21238f, 2.13707f, + 0.30351f, 0.27594f, -0.36245f, 0.19539f, 0.91045f, -0.24068f, -0.37616f, + 0.88792f, 0.02947f, -0.16903f, -0.04932f, 1.51293f, -0.95967f, -1.62903f, + 0.05326f, 2.30703f, 0.64445f, -1.09464f, -0.16623f, 1.00240f, 0.07548f, + -0.50406f, 0.63854f, 1.02340f, 0.49833f, 0.13671f, 0.26722f, 2.09516f, + -0.41305f, +}; + +static float av1_tx_type_nn_8x8_ver_layer1_bias[4] = { + 2.14067f, + 2.76699f, + 2.04233f, + 1.34803f, +}; + +static float av1_tx_type_nn_8x8_ver_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_8x8_ver_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_8x8_ver = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_8x8_ver_layer0_weights, // weights + av1_tx_type_nn_8x8_ver_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_8x8_ver_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_8x8_ver_layer1_weights, + av1_tx_type_nn_8x8_ver_layer1_bias, + NONE, + av1_tx_type_nn_8x8_ver_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_8x8_ver_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; +/******************************************************************************/ + +// Tx type model for 8x16 block. 
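+
+/* Since the configs are tagged SOFTMAX_CROSS_ENTROPY, the four logits are
+ * meant to be read through a softmax. Below, a numerically stable softmax
+ * plus an illustrative pruning rule; the 0.05f threshold is a placeholder,
+ * not a tuned libaom constant, and real call sites may fuse or gate the
+ * scores differently. Requires <math.h> for expf(). */
+static void tx_type_softmax4_sketch(const float logits[4], float probs[4]) {
+  float max_logit = logits[0];
+  for (int i = 1; i < 4; ++i)
+    if (logits[i] > max_logit) max_logit = logits[i];
+  float sum = 0.0f;
+  for (int i = 0; i < 4; ++i) {
+    probs[i] = expf(logits[i] - max_logit);  // shift by max for stability
+    sum += probs[i];
+  }
+  for (int i = 0; i < 4; ++i) probs[i] /= sum;
+}
+
+/* Keep a 1D kernel only when its predicted probability is non-negligible. */
+static void tx_type_prune_sketch(const float probs[4], int keep[4]) {
+  const float kPruneThresh = 0.05f;  // illustrative threshold
+  for (int i = 0; i < 4; ++i) keep[i] = probs[i] >= kPruneThresh;
+}
+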
+static float av1_tx_type_nn_8x16_hor_layer0_weights[128] = { + -1.61872f, -1.58520f, -1.41236f, -1.53255f, -1.59794f, -1.25769f, -1.90043f, + 0.73431f, 1.10135f, 0.47054f, 0.43230f, -0.43009f, -0.09135f, -0.07289f, + -0.38785f, 1.23775f, -0.35312f, 0.73789f, 0.88864f, 0.75957f, 0.62579f, + 0.46974f, 0.21851f, 1.63821f, -2.27289f, -0.68522f, -0.69814f, -0.84368f, + -0.91320f, -0.63055f, -1.03296f, 0.55778f, -0.00071f, 1.27539f, 1.60068f, + 1.40975f, 0.97372f, 0.92843f, 1.90853f, 0.12626f, 1.71953f, 1.41978f, + -0.12234f, -1.27058f, 0.76207f, 0.02495f, -0.67038f, -0.05255f, 1.72923f, + 1.47630f, 1.47058f, 1.47614f, 1.49354f, 1.66131f, 1.50801f, 0.17145f, + -2.30947f, -2.10850f, -1.25636f, -0.24900f, 0.72602f, 1.26572f, 0.97865f, + -0.65466f, 1.31129f, 0.26916f, 0.12139f, -0.12761f, -0.39143f, -0.28134f, + 0.06584f, 2.24418f, 0.22516f, 0.05011f, -0.01671f, -0.29476f, -0.40326f, + 0.21138f, -0.11573f, -0.31154f, -0.36828f, 0.03694f, -0.07172f, -0.63419f, + -3.14351f, -1.23125f, 0.65311f, -0.11406f, 1.97287f, -0.10422f, 0.83896f, + 0.85033f, 0.49724f, 0.80482f, 0.51454f, 1.06447f, 0.76693f, 0.72599f, + -0.78573f, -0.53950f, 0.40894f, 0.00086f, 0.10784f, -0.70498f, 1.16395f, + 1.14597f, 1.13496f, 1.12177f, 1.02100f, -1.37574f, -2.97144f, 0.33899f, + 0.42013f, 0.86327f, 2.31983f, 2.04008f, 0.95503f, 0.15081f, 0.11530f, + -0.02574f, -4.77119f, 0.13257f, -0.01704f, -0.23087f, -0.00825f, 0.07029f, + -0.28136f, 0.42556f, +}; + +static float av1_tx_type_nn_8x16_hor_layer0_bias[16] = { + 0.93617f, -0.24000f, -1.26821f, 0.78780f, 0.13690f, -0.21948f, + -1.45162f, 0.44584f, -1.92582f, -0.23169f, 0.56004f, -1.19937f, + 1.81560f, -1.02643f, -0.81690f, 0.08302f, +}; + +static float av1_tx_type_nn_8x16_hor_layer1_weights[64] = { + 0.06696f, -0.11538f, -1.42029f, 0.32965f, 0.81046f, 0.01146f, 1.20945f, + -0.16899f, 0.53224f, -0.40232f, 0.01786f, -0.73242f, 1.29750f, 1.95185f, + 0.70143f, 1.43287f, 0.76220f, 0.79937f, -1.79011f, -1.15178f, 0.42526f, + -0.67519f, 0.77267f, -0.30697f, 2.46004f, -0.49828f, 0.02875f, 1.09972f, + 1.47662f, 0.61719f, 0.61417f, -0.12363f, 2.53048f, 0.00418f, -1.38964f, + 0.88117f, 0.39239f, -0.19347f, -2.58600f, -0.33715f, 1.09323f, -0.32127f, + 0.02456f, -0.19125f, 1.12728f, 0.66502f, 0.34296f, 1.14897f, 0.29967f, + 1.19209f, 0.22108f, -0.11975f, 1.49776f, -1.34624f, -2.58478f, -1.34632f, + 1.53207f, 0.45634f, -1.48476f, 0.17489f, 0.71790f, -2.12086f, -1.21778f, + -1.31243f, +}; + +static float av1_tx_type_nn_8x16_hor_layer1_bias[4] = { + 0.83359f, + 1.06875f, + 1.77645f, + 1.49570f, +}; + +static float av1_tx_type_nn_8x16_hor_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_8x16_hor_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_8x16_hor = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_8x16_hor_layer0_weights, // weights + av1_tx_type_nn_8x16_hor_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_8x16_hor_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_8x16_hor_layer1_weights, + av1_tx_type_nn_8x16_hor_layer1_bias, + NONE, + av1_tx_type_nn_8x16_hor_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_8x16_hor_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; + +static float av1_tx_type_nn_8x16_ver_layer0_weights[128] = { + 0.32858f, -1.28887f, 0.25632f, -0.05262f, 2.69203f, -0.07004f, 1.37337f, + -0.05725f, -0.05659f, 
0.05592f, 0.01039f, -0.29343f, 1.58628f, -0.30003f, + -3.43118f, 0.00272f, 1.70928f, -0.76348f, 0.05889f, -0.03263f, -0.07724f, + 0.03523f, -0.19890f, 1.18005f, -0.03605f, -0.20530f, -4.00733f, 0.10210f, + -0.05368f, -0.17650f, -0.15317f, 0.06499f, 0.56705f, 1.04341f, 0.62890f, + 0.73451f, -0.22199f, 0.86659f, 0.78443f, -0.61664f, -0.50606f, 0.30247f, + 0.14455f, 0.39276f, 0.49203f, 0.65019f, 0.12269f, 1.64080f, 1.68289f, + 1.42694f, 1.60825f, 1.58501f, 1.47252f, 1.62589f, 1.48218f, 0.17726f, + -0.04884f, 0.35376f, -0.04796f, 0.32589f, 0.35087f, 0.35258f, -0.46103f, + -0.31176f, -0.05203f, 0.07247f, -0.26756f, 0.22019f, 0.03412f, 0.33773f, + 0.29811f, -0.11140f, 0.12831f, -0.44673f, -0.09858f, 0.07889f, 0.15137f, + 0.00347f, -0.23394f, 0.08886f, -0.31201f, -0.79912f, -0.51092f, 0.14123f, + -1.09599f, -4.26020f, -0.68675f, -0.02842f, -1.54538f, -1.28977f, -1.30558f, + -1.21074f, -1.37142f, -1.14743f, -1.85397f, 0.82985f, -0.30681f, 0.04494f, + -0.24023f, -4.18053f, -0.16096f, -0.55492f, -0.27882f, 0.05829f, -0.41224f, + -2.52088f, -0.56162f, -1.04547f, -1.70685f, -0.28842f, -1.43673f, -0.01468f, + -3.20585f, -0.69120f, -0.43931f, -0.46270f, -0.65885f, -0.55884f, -0.75138f, + 0.36381f, -5.70858f, -0.14548f, -0.15745f, -0.11812f, -0.07605f, -0.07693f, + -0.12236f, 0.16075f, +}; + +static float av1_tx_type_nn_8x16_ver_layer0_bias[16] = { + -0.35385f, 0.30491f, -0.90011f, 0.42941f, 1.20928f, -0.88331f, + -1.48818f, -0.34785f, -0.32668f, -0.22695f, 0.89188f, 0.65521f, + 0.57598f, 0.99819f, 0.75175f, 0.17044f, +}; + +static float av1_tx_type_nn_8x16_ver_layer1_weights[64] = { + -0.62913f, -0.34304f, 0.42963f, -0.17440f, -1.44092f, 0.69142f, -1.36067f, + 0.52211f, 0.44658f, -0.26501f, -0.41657f, 0.34428f, -0.34390f, -0.58567f, + -0.84097f, -1.96311f, -0.37215f, -0.22250f, -1.23811f, -0.07247f, -0.81731f, + 0.58755f, -1.30559f, 0.39551f, 0.41743f, -0.09940f, -0.33230f, 0.14458f, + -0.25139f, -0.54517f, 0.13469f, -0.38157f, -0.39109f, -0.18205f, 0.06834f, + -0.08395f, -0.92187f, 0.56724f, 1.44381f, 0.53226f, -0.22356f, 0.12285f, + -0.29418f, -1.86749f, -0.22372f, -0.60204f, -0.87746f, -1.16936f, 0.56884f, + 0.62641f, -0.11823f, 1.00395f, 1.64794f, -0.64535f, 2.29322f, -0.23397f, + 0.17251f, -0.35927f, 0.65631f, -0.26812f, 0.80128f, 0.85748f, 0.47404f, + 2.20547f, +}; + +static float av1_tx_type_nn_8x16_ver_layer1_bias[4] = { + -0.44080f, + -1.67455f, + -1.46332f, + -6.13206f, +}; + +static float av1_tx_type_nn_8x16_ver_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_8x16_ver_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_8x16_ver = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_8x16_ver_layer0_weights, // weights + av1_tx_type_nn_8x16_ver_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_8x16_ver_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_8x16_ver_layer1_weights, + av1_tx_type_nn_8x16_ver_layer1_bias, + NONE, + av1_tx_type_nn_8x16_ver_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_8x16_ver_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; +/******************************************************************************/ + +// Tx type model for 16x8 block. 
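+
+/* A 2D AV1 transform type is a pair of 1D kernels (DCT, ADST, flip-ADST,
+ * identity) applied vertically and horizontally, which is presumably why each
+ * non-square shape carries separate _hor and _ver models: four scores per
+ * direction cover all sixteen combinations. The product fusion below treats
+ * the two directions as independent; it is a sketch, not necessarily the
+ * encoder's actual combination rule. */
+static void tx_type_score_2d_sketch(const float p_ver[4],
+                                    const float p_hor[4],
+                                    float score[4][4]) {
+  for (int v = 0; v < 4; ++v)
+    for (int h = 0; h < 4; ++h) score[v][h] = p_ver[v] * p_hor[h];
+}
+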
+static float av1_tx_type_nn_16x8_hor_layer0_weights[128] = { + 0.02600f, 0.09786f, -1.05107f, -0.35594f, -0.15658f, 2.99828f, -0.07106f, + -0.10101f, -0.14412f, -0.83790f, -0.19434f, 2.28368f, 1.91727f, -0.00956f, + -0.90640f, 0.09174f, 1.58895f, 1.38945f, 1.49431f, 1.51381f, 1.44803f, + 1.53544f, 1.44694f, 0.17753f, 1.69735f, -0.78652f, 0.31092f, -0.23736f, + 0.02231f, -0.09884f, -0.00493f, 1.21189f, -1.94382f, -0.34629f, -0.58309f, + 0.72291f, -0.30056f, 0.90660f, -0.57495f, 3.07809f, 0.73644f, 1.43050f, + 1.34356f, -0.66554f, 0.50102f, -0.64305f, 0.42044f, -1.66165f, -0.05733f, + -2.51402f, -1.01067f, -0.33390f, -0.32986f, -0.92431f, 1.86281f, -0.07290f, + -0.26290f, -0.68941f, 1.81156f, 0.66125f, -2.09974f, 0.17032f, -0.67461f, + -0.00876f, -1.50154f, 1.17153f, 1.00377f, 0.33022f, 0.74689f, 0.42878f, + 0.61725f, -0.83967f, 0.09467f, -0.39892f, 0.33863f, 0.10656f, -0.09249f, + -0.39757f, 0.48481f, -0.35162f, 1.47014f, 1.67827f, -1.84051f, 0.16291f, + -0.50135f, -2.29911f, -0.42217f, -0.13358f, 1.45899f, -0.14743f, -0.02763f, + -0.28003f, -0.01364f, 0.21014f, -0.29026f, -0.20198f, 1.38782f, 0.56731f, + 0.27489f, 0.43227f, 0.41326f, 0.42721f, 0.87720f, -1.90067f, -5.04951f, + -0.17638f, -0.58119f, -0.08954f, -0.13692f, -0.12325f, -0.38548f, 0.66462f, + -1.42377f, -1.21917f, -1.38193f, -1.36539f, -1.39378f, -1.19629f, -1.59812f, + 0.28689f, 0.32394f, 0.52128f, 0.01013f, -0.28948f, -0.26293f, -0.44331f, + -0.36570f, -0.50757f, +}; + +static float av1_tx_type_nn_16x8_hor_layer0_bias[16] = { + -0.08696f, -0.22110f, -1.43604f, -1.00451f, -1.51029f, 0.63736f, + 0.45260f, 0.16229f, 4.01393f, -0.21748f, 0.36411f, -0.08764f, + -0.12329f, 0.08986f, 1.08117f, -0.00220f, +}; + +static float av1_tx_type_nn_16x8_hor_layer1_weights[64] = { + 0.55824f, -0.14648f, 0.81947f, -0.45867f, -1.86078f, -0.17291f, 0.34849f, + 0.15153f, 1.75625f, -0.25760f, 0.72015f, -0.30059f, -0.57975f, 0.07609f, + -0.02036f, 0.07912f, 0.57080f, -0.13792f, 0.74184f, -0.87669f, -1.87572f, + -0.27270f, 0.39751f, 0.19652f, 2.03514f, -0.32944f, 0.76251f, 0.04399f, + -0.63175f, 0.37420f, 0.08309f, 0.04466f, 0.60255f, -0.12820f, 1.66065f, + -0.59496f, -1.94794f, -0.14847f, 0.39424f, 0.16273f, 1.80587f, 0.41197f, + 0.74691f, -0.21217f, -0.63173f, 0.09510f, -0.35538f, -0.04407f, 0.92847f, + 0.20141f, 1.68680f, -0.56528f, -2.26960f, 0.12978f, 0.73748f, 0.42438f, + 2.00673f, -0.40189f, 0.95423f, 0.23234f, -0.80953f, 0.65814f, 0.49444f, + -0.23347f, +}; + +static float av1_tx_type_nn_16x8_hor_layer1_bias[4] = { + 3.57175f, + 2.42612f, + 3.31259f, + 2.08287f, +}; + +static float av1_tx_type_nn_16x8_hor_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_16x8_hor_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_16x8_hor = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_16x8_hor_layer0_weights, // weights + av1_tx_type_nn_16x8_hor_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_16x8_hor_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_16x8_hor_layer1_weights, + av1_tx_type_nn_16x8_hor_layer1_bias, + NONE, + av1_tx_type_nn_16x8_hor_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_16x8_hor_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; + +static float av1_tx_type_nn_16x8_ver_layer0_weights[128] = { + 0.46633f, 1.55328f, -0.11230f, -0.29571f, 0.18814f, -1.52430f, -2.34660f, + 
0.08644f, -1.97718f, -1.29140f, -1.12262f, -1.12985f, -1.25911f, -0.96506f, + -1.57129f, 0.96021f, 1.34192f, 1.28623f, 1.21655f, 1.28758f, 1.25482f, + 1.30195f, 1.19190f, 0.09310f, 0.52072f, 0.91487f, 1.24100f, 1.61236f, + 1.72166f, 2.20750f, 1.62379f, -1.43936f, 0.50665f, 0.40213f, 0.66502f, + -1.66699f, -3.07618f, 0.05877f, 0.60987f, -0.09995f, -0.10916f, 0.48049f, + 0.23812f, 0.39847f, -0.21682f, -0.63455f, 0.33453f, -0.67939f, -4.14355f, + -0.62756f, -0.22502f, -0.17215f, 0.01062f, 0.27049f, -0.10748f, 0.30945f, + 2.72445f, -0.89181f, -0.06800f, 0.20595f, -0.73385f, 0.04071f, -1.30294f, + 1.83507f, 0.92570f, 0.69609f, 0.76285f, 0.69892f, 0.76409f, 0.63104f, + 0.73397f, 1.09575f, -0.20129f, -0.24022f, -0.24599f, -0.59107f, -0.88755f, + -0.68987f, -0.75495f, -1.31002f, -1.30237f, -0.94093f, -2.15678f, -1.49303f, + -1.17498f, -1.39952f, -0.91270f, -0.05587f, 1.02381f, -0.75580f, -0.65263f, + -0.78996f, -0.71075f, -0.71018f, -0.70350f, -1.26196f, 2.34208f, -0.53611f, + 0.19752f, -0.16842f, -0.24828f, 0.21857f, 0.08222f, -2.55894f, -1.75702f, + 0.11394f, 1.03083f, 0.79972f, -1.54112f, -1.82341f, -0.57597f, -0.02077f, + -0.39616f, -0.00995f, -0.12809f, 0.01188f, -0.25117f, 0.09202f, 0.09336f, + -0.05614f, -0.30039f, 0.25834f, 1.19944f, 1.22533f, 0.92330f, 0.75967f, + -0.81945f, -0.41647f, +}; + +static float av1_tx_type_nn_16x8_ver_layer0_bias[16] = { + 0.17841f, 0.67315f, -1.24450f, 3.13859f, 0.16203f, -0.14992f, + 0.29553f, -1.15567f, -0.71421f, 1.15977f, 1.14585f, 3.02460f, + -0.04510f, 0.48000f, -0.09354f, -0.42422f, +}; + +static float av1_tx_type_nn_16x8_ver_layer1_weights[64] = { + 0.29912f, -0.10009f, -1.11478f, 1.76812f, -0.27719f, 0.52148f, 0.17622f, + -1.17116f, 0.73397f, -0.69279f, -0.11080f, 1.53751f, -1.42003f, 0.14731f, + 0.13592f, -0.04883f, 0.39186f, -0.13655f, -0.43994f, 1.82759f, -0.25601f, + -0.15018f, 0.51920f, -1.56070f, 0.31683f, -0.79367f, -0.02904f, 1.28637f, + -1.15203f, 0.26627f, 0.42828f, -0.24258f, 0.38647f, -0.83352f, 0.32553f, + 2.09522f, -0.26822f, -0.42191f, 0.32825f, -1.30748f, 1.50551f, -0.52669f, + 0.20045f, 1.69318f, -1.47839f, 0.30802f, -0.07290f, -0.28106f, 0.68192f, + -0.15522f, 1.12579f, 2.21921f, 0.09720f, -0.50265f, 0.83165f, -1.31721f, + 0.72422f, -1.24952f, 0.61653f, 2.04117f, -1.42406f, 0.52568f, -0.46180f, + -0.00873f, +}; + +static float av1_tx_type_nn_16x8_ver_layer1_bias[4] = { + 3.34981f, + 3.74710f, + 1.38339f, + 0.45176f, +}; + +static float av1_tx_type_nn_16x8_ver_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_16x8_ver_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_16x8_ver = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_16x8_ver_layer0_weights, // weights + av1_tx_type_nn_16x8_ver_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_16x8_ver_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_16x8_ver_layer1_weights, + av1_tx_type_nn_16x8_ver_layer1_bias, + NONE, + av1_tx_type_nn_16x8_ver_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_16x8_ver_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; +/******************************************************************************/ + +// Tx type model for 16x16 block. 
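+
+/* The square 16x16 shape below has no _hor/_ver suffix: one shared network
+ * scores both directions (both lookup maps at the end of this branch point at
+ * the same config). Whatever the shape, the layers must chain: each layer's
+ * num_inputs has to equal the previous layer's num_outputs, as the
+ * "(!!same as num_outputs of last layer)" comments stress. A minimal sanity
+ * check over stand-in shape fields (the real struct has more members);
+ * requires <assert.h>. */
+struct TxTypeFcShapeSketch {
+  int num_inputs, num_outputs;
+};
+
+static void tx_type_check_chaining_sketch(const struct TxTypeFcShapeSketch *l,
+                                          int num_layers, int final_outputs) {
+  for (int i = 1; i < num_layers; ++i)
+    assert(l[i].num_inputs == l[i - 1].num_outputs);
+  assert(l[num_layers - 1].num_outputs == final_outputs);
+}
+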
+static float av1_tx_type_nn_16x16_layer0_weights[128] = { + 1.26592f, 1.36313f, 1.30956f, 1.29926f, 1.48816f, 1.68851f, 1.32000f, + 0.13321f, -0.22477f, -0.88906f, -0.19622f, 1.69605f, 1.22180f, -1.57771f, + -1.15765f, 0.05710f, -1.13355f, -0.85486f, -0.99971f, -0.91571f, -1.06031f, + -0.77952f, -1.15723f, 1.17809f, 1.35602f, -0.05243f, -0.37596f, 0.26108f, + 0.17611f, -0.10323f, 0.77279f, -0.48911f, -0.79308f, 0.55112f, 0.43918f, + 0.27872f, 0.28714f, 0.45830f, 1.05689f, 0.03705f, -2.49975f, -0.01940f, + 0.05709f, 0.07942f, -0.13290f, -0.10359f, 0.00143f, 0.37303f, 0.96470f, + 0.53293f, 1.14459f, 0.89185f, 0.43378f, 0.47764f, 0.90924f, 0.15279f, + -0.15361f, 0.02949f, 0.42240f, 0.68143f, 0.89588f, 0.73754f, 0.10974f, + 1.57755f, -0.39870f, -0.32914f, 0.35638f, 0.34991f, -0.00003f, -0.23373f, + 0.29630f, -0.76699f, -0.01356f, 0.04234f, 0.84253f, 1.92078f, 0.93160f, + 0.71993f, 0.71604f, 0.76455f, -1.59782f, 0.32332f, 1.11628f, 0.33062f, + -0.03728f, -0.05710f, 0.80447f, -0.14719f, 1.34658f, -0.05718f, 0.64015f, + 0.21926f, 0.41653f, 0.12720f, 0.54092f, 1.39411f, 1.81819f, -0.24513f, + 0.00955f, 0.38011f, -0.57787f, -0.41759f, 0.68834f, -0.31783f, -0.40607f, + -0.10107f, -0.79374f, 0.75599f, -0.16282f, -0.14490f, -0.20783f, -0.55019f, + -0.13793f, -0.22293f, 0.18305f, 0.12445f, 0.56830f, 0.24567f, 0.09278f, + 0.70803f, 0.35803f, -1.52676f, -0.89624f, 0.77665f, 0.19877f, 0.77175f, + 0.50355f, 0.08592f, +}; + +static float av1_tx_type_nn_16x16_layer0_bias[16] = { + -1.31834f, 0.14346f, -0.10062f, 0.84489f, 0.95617f, -0.06720f, + -0.68502f, -0.91442f, -0.31932f, 0.25276f, -0.15138f, -1.57661f, + -0.14062f, -0.42120f, 0.94573f, -0.09287f, +}; + +static float av1_tx_type_nn_16x16_layer1_weights[64] = { + -1.80333f, -1.06353f, 0.55139f, 0.74644f, 0.13747f, -0.93018f, -0.10286f, + 0.67133f, 0.24460f, 1.44583f, 0.02173f, 0.26037f, -0.73687f, 0.19566f, + 0.61846f, -0.58601f, -1.03196f, -0.74415f, 0.30041f, -0.41967f, 1.08740f, + 0.96224f, -0.59139f, 0.03813f, 0.05403f, 1.33427f, -0.54375f, -1.92181f, + 0.54704f, 0.13608f, 0.22151f, -0.38076f, 1.18390f, -0.77508f, -1.84283f, + 1.00894f, 0.62318f, -0.15296f, 1.27600f, 0.22822f, 0.12751f, 0.93910f, + -0.28502f, 0.53912f, -0.96889f, 0.10182f, 0.81508f, -0.43028f, 2.67386f, + 0.52204f, 0.49820f, -0.41711f, 1.05038f, 1.12192f, 0.74349f, -0.75417f, + -0.03718f, -0.35769f, 0.89651f, 0.63236f, 0.54215f, -0.07894f, 0.48274f, + 1.08829f, +}; + +static float av1_tx_type_nn_16x16_layer1_bias[4] = { + 0.81986f, + 1.26865f, + 0.11118f, + 2.48404f, +}; + +static float av1_tx_type_nn_16x16_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_16x16_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_16x16 = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_16x16_layer0_weights, // weights + av1_tx_type_nn_16x16_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_16x16_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_16x16_layer1_weights, + av1_tx_type_nn_16x16_layer1_bias, + NONE, + av1_tx_type_nn_16x16_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_16x16_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; +/******************************************************************************/ + +// Tx type model for 4x16 block. 
+static float av1_tx_type_nn_4x16_hor_layer0_weights[32] = { + 0.36539f, 0.25667f, 0.01491f, -0.21959f, 2.55105f, 0.17615f, 1.79884f, + 1.65936f, -0.44363f, 0.00706f, -0.68004f, -0.64360f, 1.75760f, 1.91906f, + 1.47682f, 0.09650f, -3.59244f, -0.35004f, 0.93295f, 0.25806f, -0.08154f, + 0.79332f, 0.79535f, 1.09467f, 1.57855f, -0.51359f, 0.90553f, -1.67744f, + -1.74563f, -0.88830f, -1.77603f, 2.15935f, +}; + +static float av1_tx_type_nn_4x16_hor_layer0_bias[8] = { + -0.36435f, -2.22731f, -0.00837f, -1.34546f, + 0.62806f, -0.20675f, 4.91940f, -0.56079f, +}; + +static float av1_tx_type_nn_4x16_hor_layer1_weights[32] = { + -0.57191f, -1.46418f, 0.67331f, -1.15027f, 0.46288f, 0.81251f, 2.51768f, + -0.27147f, 0.00761f, -2.15214f, -0.69650f, -0.50808f, 0.92832f, 0.45668f, + 2.34201f, -0.52941f, 0.51008f, -1.55496f, -0.01371f, -0.12356f, 0.66624f, + 0.88043f, 2.64862f, -1.28024f, -0.17578f, -1.80034f, -0.32217f, 0.89519f, + 1.28413f, -0.30326f, 2.45329f, -0.83335f, +}; + +static float av1_tx_type_nn_4x16_hor_layer1_bias[4] = { + 2.33198f, + 3.36245f, + 1.62603f, + 2.91056f, +}; + +static float av1_tx_type_nn_4x16_hor_layer0_out[8] = { 0 }; +static float av1_tx_type_nn_4x16_hor_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_4x16_hor = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 4, // num_inputs + 8, // num_outputs + av1_tx_type_nn_4x16_hor_layer0_weights, // weights + av1_tx_type_nn_4x16_hor_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_4x16_hor_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 8, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_4x16_hor_layer1_weights, + av1_tx_type_nn_4x16_hor_layer1_bias, + NONE, + av1_tx_type_nn_4x16_hor_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_4x16_hor_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; + +static float av1_tx_type_nn_4x16_ver_layer0_weights[128] = { + 1.61392f, 1.41239f, 1.47646f, 1.47325f, 1.46110f, 1.49208f, 1.49414f, + 0.12835f, -0.76986f, 0.07087f, -0.24572f, -0.93168f, 3.07935f, -0.18183f, + -0.09831f, -0.07703f, -0.03222f, -0.25473f, -0.06090f, 2.93713f, -0.38711f, + -0.12884f, -0.18329f, -0.06262f, -0.00327f, -0.02930f, -0.01641f, -0.00622f, + -0.03305f, -4.07069f, -2.76643f, 0.04413f, -1.03176f, -0.19217f, -0.44980f, + -2.48615f, -2.58112f, -0.87695f, 0.16187f, -0.04891f, -0.06854f, 1.08104f, + 0.75245f, 1.49302f, 0.63363f, 1.45715f, 0.92574f, 1.72029f, 0.33326f, + 3.86646f, 0.04422f, 0.41019f, 0.36212f, 0.56600f, -1.01552f, 0.05128f, + 0.40454f, -1.05100f, -0.47461f, -1.33168f, -0.46145f, -1.36870f, -0.88838f, + -1.05358f, -0.18537f, -0.34357f, -0.03698f, 0.68905f, 0.41010f, 0.31223f, + -0.43382f, -0.74715f, 2.03366f, -0.30419f, 0.45747f, 0.09526f, 0.31678f, + 0.22915f, 0.21832f, 1.26385f, -0.06814f, -0.71417f, -1.18947f, 0.03762f, + 0.10936f, 2.97396f, -0.42638f, -0.03123f, -5.49756f, -0.17029f, -0.11323f, + 0.05173f, -0.44274f, -0.15738f, 0.11311f, 0.43872f, 0.16837f, -0.52849f, + 2.90050f, -0.54735f, -0.29591f, 1.24030f, 0.21696f, -0.04443f, -1.60877f, + -1.36365f, -1.27432f, -1.52060f, -1.34397f, -1.13371f, -1.87554f, 0.80123f, + 0.42820f, -0.14157f, -2.73963f, -0.68040f, -0.35236f, 0.14490f, 2.23477f, + 0.01370f, -0.20426f, -1.51411f, -0.72293f, 0.64516f, 0.97638f, 0.32616f, + -0.27975f, -0.01149f, +}; + +static float av1_tx_type_nn_4x16_ver_layer0_bias[16] = { + -1.37863f, -0.05763f, -0.07041f, 0.15306f, 0.96026f, -1.42105f, + -0.55822f, 1.04845f, -0.17662f, 
-1.25345f, -0.11927f, 0.49845f, + -0.32530f, 0.73483f, 0.08322f, -0.23890f, +}; + +static float av1_tx_type_nn_4x16_ver_layer1_weights[64] = { + 0.27194f, 0.50607f, 0.49229f, -0.48192f, 0.15667f, -1.38891f, 0.38102f, + -0.58825f, -0.07337f, -0.52909f, 0.36975f, 0.28710f, 0.34992f, -0.73630f, + 0.30386f, -0.58822f, 0.36127f, 0.57950f, 0.55878f, -0.42796f, 0.19967f, + -1.45517f, 0.42529f, -0.54630f, -0.38169f, -0.84899f, 0.41622f, 0.46935f, + 0.39077f, -0.75448f, 0.31698f, -0.76187f, 0.97765f, 0.57052f, 0.55825f, + -0.54273f, 0.20466f, -1.46347f, 0.41813f, -0.55019f, -0.19948f, -0.57982f, + 0.41206f, 0.32373f, 0.38537f, -1.11657f, 0.32887f, -0.76911f, 1.12259f, + 0.72163f, 0.82603f, 0.37786f, 0.34976f, -1.86642f, 0.59961f, -0.16329f, + -0.36631f, -0.56814f, 0.60410f, 0.53158f, 0.56389f, -0.70508f, 0.51009f, + -0.56513f, +}; + +static float av1_tx_type_nn_4x16_ver_layer1_bias[4] = { + 4.60896f, + 4.53551f, + 4.53124f, + 4.27435f, +}; + +static float av1_tx_type_nn_4x16_ver_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_4x16_ver_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_4x16_ver = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_4x16_ver_layer0_weights, // weights + av1_tx_type_nn_4x16_ver_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_4x16_ver_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_4x16_ver_layer1_weights, + av1_tx_type_nn_4x16_ver_layer1_bias, + NONE, + av1_tx_type_nn_4x16_ver_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_4x16_ver_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; +/******************************************************************************/ + +// Tx type model for 16x4 block. 
+static float av1_tx_type_nn_16x4_hor_layer0_weights[128] = { + 1.45347f, -0.15743f, 0.44236f, 0.25808f, 0.33944f, 0.38678f, 0.24428f, + 1.67287f, 0.09539f, -0.42940f, -0.31507f, -0.00154f, -2.98755f, -2.27744f, + -0.49183f, 0.09333f, -0.99026f, -0.22157f, 0.53701f, 0.60447f, 0.15686f, + -0.04646f, 0.26341f, 2.12361f, 0.27090f, -1.14716f, -0.64146f, -0.91604f, + -0.75335f, -0.60056f, -1.25084f, 1.68473f, -3.24075f, -4.03867f, -2.07877f, + -0.02347f, 0.00333f, -0.01259f, -0.00465f, 0.02526f, 0.36286f, -0.10324f, + 2.12780f, -0.74584f, -1.05052f, 1.78467f, -0.55065f, -0.03326f, 2.46781f, + 1.18349f, 0.96015f, 1.01696f, 1.10584f, 1.07263f, 1.11531f, -1.06413f, + 0.32389f, -1.87360f, -0.14435f, 1.77926f, 1.09966f, -0.12680f, -0.61386f, + -0.09724f, -0.33095f, 1.12122f, 1.00791f, 1.52416f, 1.35004f, 1.32657f, + 0.60950f, -1.13538f, -0.38654f, 0.06473f, 2.10669f, 0.27734f, -0.38359f, + -1.91455f, -1.22676f, 0.05786f, 0.97432f, 2.19967f, 0.50457f, 0.78976f, + 0.95183f, -0.32414f, 0.49437f, -0.04506f, 0.18993f, -0.07971f, 0.23889f, + -0.09872f, -0.66036f, 0.05377f, 2.69638f, -0.08259f, -0.69210f, -1.08296f, + -1.96504f, -2.31947f, -0.80161f, -0.80456f, -1.35556f, -0.05323f, -4.42658f, + -0.30732f, -0.12043f, 0.11126f, 0.10771f, -0.14956f, -0.02218f, 0.41016f, + 1.16599f, 1.14629f, 1.12881f, 1.18676f, 1.24677f, 1.28695f, 1.11270f, + 0.08233f, 1.75440f, 0.49228f, -0.34858f, -0.17032f, 0.29288f, 0.47175f, + 0.19055f, -1.56413f, +}; + +static float av1_tx_type_nn_16x4_hor_layer0_bias[16] = { + -1.71227f, 0.47291f, -0.97536f, -0.66216f, 0.11729f, -0.21451f, + 2.75281f, 0.04318f, 2.03965f, 0.14618f, -0.70483f, -0.24517f, + 1.14048f, 0.33308f, -1.10886f, 0.41184f, +}; + +static float av1_tx_type_nn_16x4_hor_layer1_weights[64] = { + -1.17079f, 0.19096f, -1.05753f, -0.30803f, -1.21680f, -0.67255f, 1.60115f, + 0.05972f, 1.44759f, -0.04068f, -0.26331f, 0.31400f, 0.96923f, 0.33443f, + -0.77215f, -0.91316f, -1.78928f, 0.21483f, -1.24008f, -0.46190f, -0.12127f, + -0.62144f, 1.37593f, 0.08373f, 1.56215f, 0.00279f, -0.14556f, 0.38710f, + 0.96228f, 0.66433f, -0.51798f, -0.80738f, -0.18539f, 0.19377f, -1.03090f, + -1.51044f, -0.59485f, -0.62589f, 1.90742f, 0.09078f, 1.49113f, 0.00205f, + -0.15918f, 0.40827f, 1.08553f, 0.43431f, 0.33519f, -1.12669f, -1.10274f, + 0.80004f, -1.83599f, -0.53134f, 2.00515f, -0.32670f, 1.37124f, 0.51136f, + 1.62563f, 0.24787f, 0.31757f, 0.81751f, 1.57262f, 0.83214f, 1.04661f, + -0.43819f, +}; + +static float av1_tx_type_nn_16x4_hor_layer1_bias[4] = { + 2.32575f, + 2.75703f, + 1.12304f, + 2.15567f, +}; + +static float av1_tx_type_nn_16x4_hor_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_16x4_hor_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_16x4_hor = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_16x4_hor_layer0_weights, // weights + av1_tx_type_nn_16x4_hor_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_16x4_hor_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_16x4_hor_layer1_weights, + av1_tx_type_nn_16x4_hor_layer1_bias, + NONE, + av1_tx_type_nn_16x4_hor_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_16x4_hor_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; + +static float av1_tx_type_nn_16x4_ver_layer0_weights[32] = { + 0.26047f, 0.99930f, 1.16484f, -0.28196f, -2.67483f, -0.21456f, -0.16854f, + 0.46375f, 1.47951f, 
1.13735f, 1.12356f, 0.27385f, 0.50978f, 2.09967f, + -1.47386f, 0.01950f, -0.06362f, 0.26014f, 1.04544f, -0.03099f, 0.07478f, + -0.39701f, 0.05545f, 2.73633f, -0.56305f, -0.02208f, -0.44517f, -0.00897f, + -0.17967f, -0.96622f, 0.42635f, -1.04784f, +}; + +static float av1_tx_type_nn_16x4_ver_layer0_bias[8] = { + -0.52088f, 0.52844f, -1.03655f, -0.30974f, + 2.59952f, -1.93604f, 0.00000f, 2.51787f, +}; + +static float av1_tx_type_nn_16x4_ver_layer1_weights[32] = { + 0.10916f, -0.21219f, -0.51340f, 0.69161f, 1.45988f, -1.36942f, -0.40899f, + 1.05136f, -0.08486f, 0.10008f, -0.55304f, 0.88012f, 1.61177f, -1.64507f, + 0.63428f, 1.15130f, -0.17287f, -0.18592f, -0.01143f, 0.88293f, 1.73326f, + -1.63624f, 0.09359f, 1.18393f, 0.26531f, 0.22378f, 0.15170f, 1.06965f, + 1.26814f, -1.93873f, -0.00768f, 1.58309f, +}; + +static float av1_tx_type_nn_16x4_ver_layer1_bias[4] = { + 2.34713f, + 1.68667f, + 1.25488f, + 1.69812f, +}; + +static float av1_tx_type_nn_16x4_ver_layer0_out[8] = { 0 }; +static float av1_tx_type_nn_16x4_ver_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_16x4_ver = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 4, // num_inputs + 8, // num_outputs + av1_tx_type_nn_16x4_ver_layer0_weights, // weights + av1_tx_type_nn_16x4_ver_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_16x4_ver_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 8, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_16x4_ver_layer1_weights, + av1_tx_type_nn_16x4_ver_layer1_bias, + NONE, + av1_tx_type_nn_16x4_ver_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_16x4_ver_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; +/******************************************************************************/ + +// Map tx_size to its corresponding neural net model for tx type prediction. 
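+
+/* The two tables below are indexed by TX_SIZE in the order the inline
+ * comments spell out; dimensions of 32 points and above have no trained
+ * model, so their entries are NULL and a caller must treat NULL as "no model,
+ * skip the ML pruning path". A hedged usage sketch follows:
+ * tx_type_model_lookup_sketch is an invented helper, not an existing libaom
+ * call site, e.g. tx_type_model_lookup_sketch(av1_tx_type_nnconfig_map_hor,
+ * tx_size). */
+static NN_CONFIG_V2 *tx_type_model_lookup_sketch(NN_CONFIG_V2 *map[],
+                                                 TX_SIZE tx_size) {
+  return map[tx_size];  // NULL for 32-point and larger dimensions
+}
+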
+static NN_CONFIG_V2 *av1_tx_type_nnconfig_map_hor[] = { + &av1_tx_type_nnconfig_4x4_hor, // 4x4 transform + &av1_tx_type_nnconfig_8x8_hor, // 8x8 transform + &av1_tx_type_nnconfig_16x16, // 16x16 transform + NULL, // 32x32 transform + NULL, // 64x64 transform + &av1_tx_type_nnconfig_4x8_hor, // 4x8 transform + &av1_tx_type_nnconfig_8x4_hor, // 8x4 transform + &av1_tx_type_nnconfig_8x16_hor, // 8x16 transform + &av1_tx_type_nnconfig_16x8_hor, // 16x8 transform + NULL, // 16x32 transform + NULL, // 32x16 transform + NULL, // 32x64 transform + NULL, // 64x32 transform + &av1_tx_type_nnconfig_4x16_hor, // 4x16 transform + &av1_tx_type_nnconfig_16x4_hor, // 16x4 transform + NULL, // 8x32 transform + NULL, // 32x8 transform + NULL, // 16x64 transform + NULL, // 64x16 transform +}; + +static NN_CONFIG_V2 *av1_tx_type_nnconfig_map_ver[] = { + &av1_tx_type_nnconfig_4x4_ver, // 4x4 transform + &av1_tx_type_nnconfig_8x8_ver, // 8x8 transform + &av1_tx_type_nnconfig_16x16, // 16x16 transform + NULL, // 32x32 transform + NULL, // 64x64 transform + &av1_tx_type_nnconfig_4x8_ver, // 4x8 transform + &av1_tx_type_nnconfig_8x4_ver, // 8x4 transform + &av1_tx_type_nnconfig_8x16_ver, // 8x16 transform + &av1_tx_type_nnconfig_16x8_ver, // 16x8 transform + NULL, // 16x32 transform + NULL, // 32x16 transform + NULL, // 32x64 transform + NULL, // 64x32 transform + &av1_tx_type_nnconfig_4x16_ver, // 4x16 transform + &av1_tx_type_nnconfig_16x4_ver, // 16x4 transform + NULL, // 8x32 transform + NULL, // 32x8 transform + NULL, // 16x64 transform + NULL, // 64x16 transform +}; +#else +/******************************CONFIG_NN***************************************/ +// Tx type model for 4x4 block. +static const float av1_tx_type_nn_weights_4x4_hor_layer0[32] = { + -1.64947f, -1.54497f, -1.62832f, -0.17774f, -2.89498f, -0.72498f, 0.72036f, + 0.17996f, 1.20000f, -0.27654f, 0.77396f, 1.21684f, -1.75909f, -0.51272f, + -1.25923f, 0.35005f, -0.04257f, -0.23389f, -0.41841f, -0.08229f, 0.09503f, + 2.73144f, -0.16875f, -0.23482f, 0.02194f, -0.26427f, 0.28049f, 0.21260f, + 1.35792f, 0.27733f, 0.88660f, -0.68304f, +}; + +static const float av1_tx_type_nn_bias_4x4_hor_layer0[8] = { + 1.38742f, 0.59540f, -1.37622f, 1.92114f, + 0.00000f, -0.38998f, -0.32726f, -0.15650f, +}; + +static const float av1_tx_type_nn_weights_4x4_hor_layer1[32] = { + 1.65254f, 1.00915f, -0.89318f, -2.05142f, -0.23235f, 0.96781f, -0.37145f, + -0.21056f, 1.13891f, 0.38675f, 0.87739f, -1.42697f, 0.48015f, 0.61883f, + -0.03979f, 0.11487f, 0.48042f, 0.45200f, -0.23242f, 0.75166f, 0.55458f, + 0.39452f, -0.35285f, 1.59120f, -1.49221f, -0.48349f, -0.64692f, 1.49297f, + -0.26782f, -0.65416f, -0.10648f, 0.05568f, +}; + +static const float av1_tx_type_nn_bias_4x4_hor_layer1[4] = { + 4.07177f, + 3.26961f, + 0.58083f, + 1.21199f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_4x4_hor = { + 4, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_4x4_hor_layer0, + av1_tx_type_nn_weights_4x4_hor_layer1 }, + { av1_tx_type_nn_bias_4x4_hor_layer0, av1_tx_type_nn_bias_4x4_hor_layer1 } +}; + +static const float av1_tx_type_nn_weights_4x4_ver_layer0[32] = { + -0.02032f, 2.61610f, 0.02098f, -0.30217f, 0.12637f, 0.11017f, -3.01996f, + 0.35144f, 1.93776f, -0.20463f, 1.64102f, -1.41986f, -3.66717f, -0.51655f, + 0.43910f, 0.37778f, -1.02634f, 0.85337f, -0.69753f, 1.00206f, 2.11784f, + 1.89427f, 1.92919f, 0.43201f, -1.67358f, -1.67035f, -1.54623f, 0.16714f, + -0.06589f, -0.28142f, -0.33118f, 1.72227f, +}; + 
+static const float av1_tx_type_nn_bias_4x4_ver_layer0[8] = { + -0.33685f, 0.22025f, 0.28140f, 0.56138f, + 0.93489f, -1.77048f, 1.34989f, -0.93747f, +}; + +static const float av1_tx_type_nn_weights_4x4_ver_layer1[32] = { + -1.39506f, -1.06271f, -1.10886f, -1.69719f, 0.19699f, -2.39850f, -1.26457f, + 0.75328f, -1.26005f, -0.82738f, -0.12015f, -1.02702f, 1.40828f, -2.37739f, + -0.65639f, -0.71992f, -0.90453f, -1.12510f, -2.41362f, -1.16061f, -1.85577f, + -0.99165f, -1.91366f, 0.16785f, 0.34776f, 0.58154f, -0.18217f, -0.29257f, + -0.86315f, -0.53336f, 0.30320f, -1.32331f, +}; + +static const float av1_tx_type_nn_bias_4x4_ver_layer1[4] = { + -1.31519f, + -3.26321f, + 1.71794f, + -1.90778f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_4x4_ver = { + 4, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_4x4_ver_layer0, + av1_tx_type_nn_weights_4x4_ver_layer1 }, + { av1_tx_type_nn_bias_4x4_ver_layer0, av1_tx_type_nn_bias_4x4_ver_layer1 } +}; +/******************************************************************************/ + +// Tx type model for 4x8 block. +static const float av1_tx_type_nn_weights_4x8_hor_layer0[32] = { + 0.00218f, -0.41880f, -0.61215f, -0.92588f, 0.54291f, -0.10898f, 0.70691f, + 0.46819f, -1.61598f, -0.08834f, -0.96839f, 1.18489f, -0.45171f, -0.65445f, + -0.32179f, -0.10399f, 1.04379f, 0.91895f, 0.85589f, 0.08267f, 1.35388f, + -2.03096f, 0.08168f, -0.06372f, -0.26732f, -0.48262f, -0.08682f, 2.44071f, + -1.35896f, -1.17121f, 1.68866f, 0.10357f, +}; + +static const float av1_tx_type_nn_bias_4x8_hor_layer0[8] = { + 2.93391f, 0.66831f, -0.21419f, 0.00000f, + -0.72878f, 0.15127f, -1.46755f, 0.16658f, +}; + +static const float av1_tx_type_nn_weights_4x8_hor_layer1[32] = { + -1.52077f, -1.06243f, 0.35319f, -0.49207f, 0.54524f, 0.44271f, 1.37117f, + -0.38957f, -1.28889f, -0.57133f, 0.04658f, 0.62278f, 0.37984f, 0.33247f, + 1.65547f, -0.56806f, -1.38645f, -0.76258f, 0.67926f, 0.08783f, -0.01443f, + 0.34950f, 1.45812f, -0.51332f, -1.41331f, -0.16453f, 0.05755f, 0.31405f, + -0.50191f, 0.18219f, 1.83664f, -0.75276f, +}; + +static const float av1_tx_type_nn_bias_4x8_hor_layer1[4] = { + -1.17455f, + -2.26089f, + -1.79863f, + -2.26333f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_4x8_hor = { + 4, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_4x8_hor_layer0, + av1_tx_type_nn_weights_4x8_hor_layer1 }, + { av1_tx_type_nn_bias_4x8_hor_layer0, av1_tx_type_nn_bias_4x8_hor_layer1 } +}; + +static const float av1_tx_type_nn_weights_4x8_ver_layer0[128] = { + -0.00952f, -0.98858f, -0.93181f, 1.39594f, 0.96559f, 0.18162f, -0.76064f, + -0.06066f, 0.07907f, -0.09365f, -0.21313f, -0.02187f, -2.61707f, -2.68702f, + -0.10982f, 0.18559f, 1.17049f, 1.11387f, 1.12697f, 1.05804f, 1.12764f, + 1.06318f, 1.12052f, 0.17406f, 1.83157f, 0.19362f, 0.46910f, 0.39608f, + 0.33342f, 0.40083f, 0.27645f, 1.06864f, -4.06645f, -0.38775f, -0.11070f, + 0.03781f, -0.09141f, 0.06185f, -0.04852f, 0.20163f, 0.16784f, 0.16641f, + -0.50941f, -0.61087f, 2.07008f, -0.82381f, -0.85558f, 0.05528f, -0.10535f, + -2.81150f, 0.67038f, 0.43643f, 0.49062f, -0.04465f, 0.90438f, 0.00977f, + 0.46272f, 1.59751f, 0.95234f, 0.35086f, 0.85624f, 0.73149f, 1.67779f, + -2.21511f, -1.24746f, -1.09014f, -0.92441f, -1.22591f, -1.06961f, -0.95897f, + -1.24956f, 0.73797f, 1.23275f, -0.60064f, -0.07851f, 0.14397f, 0.22110f, + -0.04422f, 0.14350f, 0.75926f, 0.35032f, 0.48104f, 2.81408f, 0.34662f, + 0.42090f, 
0.35521f, -1.36804f, -0.14974f, -0.47696f, -0.07892f, 0.36910f, + 0.32299f, 0.23916f, 0.06032f, -0.17844f, -0.17558f, -1.42746f, -0.55828f, + -1.00418f, -0.64823f, -0.73654f, -0.85197f, -1.50989f, 1.69385f, -0.04973f, + -0.09273f, 1.04249f, 0.79235f, 1.13229f, 0.99617f, 0.03851f, 0.56334f, + 0.90795f, 1.08296f, 0.58519f, 1.74765f, 0.63971f, 1.35951f, 0.07803f, + -0.05127f, 0.26514f, -0.84629f, -0.66343f, -2.10630f, 0.11017f, 2.18528f, + -0.21958f, 0.05970f, +}; + +static const float av1_tx_type_nn_bias_4x8_ver_layer0[16] = { + 0.04205f, 0.22260f, -1.03870f, -1.19568f, 0.44283f, 0.01143f, + 0.00235f, 4.26772f, 0.44364f, -0.33199f, -0.39076f, -0.35129f, + 0.08288f, 0.18195f, -0.79890f, 0.10047f, +}; + +static const float av1_tx_type_nn_weights_4x8_ver_layer1[64] = { + -0.38193f, -0.12095f, 1.57802f, 0.34932f, -0.47333f, -0.12304f, -0.01736f, + -2.52445f, 0.18983f, -0.64707f, -0.60889f, -0.53750f, 0.91666f, -0.62823f, + -0.13377f, -0.43594f, -0.38618f, -0.01328f, 0.97457f, 1.48589f, -1.03238f, + -0.33459f, -0.35108f, -2.42417f, 0.60229f, 0.06824f, -0.75495f, 0.26902f, + 0.65311f, -0.23887f, -0.44604f, -0.55800f, -0.33842f, 0.04259f, -0.59589f, + 0.49738f, -0.62301f, -0.30896f, -0.29602f, -2.57052f, 2.00943f, -0.66490f, + -0.76312f, 0.28256f, 1.06311f, -0.38364f, -0.63508f, -0.57609f, -0.88765f, + -1.04403f, -0.46531f, 0.34084f, -1.20498f, -0.68352f, -0.72251f, -2.63242f, + -0.68736f, -0.37904f, -1.32371f, 0.47288f, 1.51904f, 0.78372f, -1.01830f, + -1.01848f, +}; + +static const float av1_tx_type_nn_bias_4x8_ver_layer1[4] = { + -1.45955f, + -2.08949f, + -1.24813f, + -1.55368f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_4x8_ver = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_4x8_ver_layer0, + av1_tx_type_nn_weights_4x8_ver_layer1 }, + { av1_tx_type_nn_bias_4x8_ver_layer0, av1_tx_type_nn_bias_4x8_ver_layer1 } +}; +/******************************************************************************/ + +// Tx type model for 8x4 block. 
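+
+/* The fallback NN_CONFIG variant in this branch groups weights and biases
+ * into per-layer pointer arrays and leaves the output layer linear; callers
+ * apply softmax to the four outputs themselves. The evaluator below mirrors
+ * what a generic MLP routine does with these fields; it is a sketch, not
+ * libaom's av1_nn_predict, and the out-node-major weight layout is again an
+ * assumption. */
+static void tx_type_nn_config_forward_sketch(
+    const float *input, int num_inputs, int num_outputs,
+    int num_hidden_layers, const int *num_hidden_nodes,
+    const float *const weights[], const float *const bias[], float *output) {
+  float buf[2][64];  // scratch; 64 covers the widest layer in this file (16)
+  const float *in = input;
+  int in_count = num_inputs;
+  for (int layer = 0; layer <= num_hidden_layers; ++layer) {
+    const int out_count =
+        layer == num_hidden_layers ? num_outputs : num_hidden_nodes[layer];
+    float *out = layer == num_hidden_layers ? output : buf[layer & 1];
+    for (int i = 0; i < out_count; ++i) {
+      float acc = bias[layer][i];
+      for (int j = 0; j < in_count; ++j)
+        acc += weights[layer][i * in_count + j] * in[j];
+      // RELU on hidden layers only; the final outputs stay raw logits.
+      out[i] = (layer < num_hidden_layers && acc < 0.0f) ? 0.0f : acc;
+    }
+    in = out;
+    in_count = out_count;
+  }
+}
+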
+static const float av1_tx_type_nn_weights_8x4_hor_layer0[128] = { + -0.22492f, 0.13341f, -4.03243f, -0.64015f, 0.02783f, 0.60466f, -0.13335f, + 0.16828f, 0.12336f, 0.52904f, 1.18455f, -0.32425f, 0.13052f, 0.93810f, + -3.71165f, 0.02990f, -4.63558f, 0.05666f, 0.03524f, -0.07449f, -0.44006f, + -0.33215f, -0.33713f, 0.08097f, 0.60873f, 0.29582f, 0.21696f, -0.78729f, + -0.16757f, -0.26567f, -0.00720f, -1.11226f, 1.58189f, 1.58463f, 1.48536f, + 1.54374f, 1.60069f, 1.46125f, 1.53932f, 0.05974f, -1.82192f, 0.47043f, + 0.38090f, 0.20833f, -0.05637f, 0.05183f, 0.01323f, -0.25662f, 0.78634f, + -0.55069f, -0.02975f, -1.29294f, -0.77192f, -2.34299f, -1.28074f, 0.77894f, + -1.69740f, -1.66032f, -1.44323f, -1.55063f, -1.50845f, -1.23690f, -1.80663f, + 0.75079f, 2.32551f, 0.05878f, 0.80438f, 0.88584f, 0.69153f, 0.89060f, + 0.73660f, 0.87259f, -0.00745f, -1.30044f, -0.59430f, 2.07270f, 1.03307f, + -0.84697f, -1.19393f, 0.17549f, -0.24978f, -3.67234f, 0.20781f, -0.53946f, + -0.05068f, 0.88274f, 1.30371f, 0.10288f, 0.07585f, 0.12259f, -0.30815f, + 0.25437f, -2.82096f, -2.69482f, 0.02370f, 0.12500f, -0.21019f, -0.49220f, + 0.03638f, -0.29795f, 0.28645f, -0.48432f, -0.38584f, -0.32148f, -0.47197f, + 0.32437f, 0.32528f, -0.19437f, 0.30383f, -0.31879f, 0.26359f, -0.12164f, + -0.43647f, -0.08288f, -0.33438f, -0.63608f, -0.46647f, -0.46574f, 0.47806f, + -0.49012f, -1.51234f, -1.13502f, -1.20470f, -1.02913f, -1.09182f, -0.93921f, + -1.85523f, 0.92532f, +}; + +static const float av1_tx_type_nn_bias_8x4_hor_layer0[16] = { + 0.36631f, 0.02901f, 0.64305f, 1.53074f, -1.40229f, 0.03852f, + -0.05043f, 0.89632f, -1.23312f, 0.07036f, 0.17070f, 0.56250f, + -0.28958f, -0.32869f, -0.01704f, 0.68171f, +}; + +static const float av1_tx_type_nn_weights_8x4_hor_layer1[64] = { + -0.49441f, -0.31960f, -0.84946f, -0.85800f, -2.37767f, 0.81373f, -0.73172f, + -0.69337f, 0.88807f, -0.49242f, -0.44717f, -0.11436f, 0.09978f, 0.15393f, + 0.17083f, 1.44850f, -0.20582f, -0.04906f, 0.42990f, -0.61939f, -1.09692f, + -1.14885f, -1.36879f, -1.30828f, -0.59558f, -0.30903f, -0.08906f, 0.06953f, + 0.15383f, -0.04193f, -0.54858f, 1.82676f, -0.22411f, 0.05264f, -0.45848f, + -0.72985f, 0.87553f, 0.04116f, -1.29774f, -2.63018f, 1.09089f, -0.36048f, + -0.16725f, 0.11627f, 0.49918f, 0.07539f, 0.00763f, 0.73706f, 0.87800f, + 0.57049f, 0.60969f, 1.02779f, 1.53339f, -0.35915f, 0.06410f, 1.44582f, + 0.09698f, 0.71888f, 0.60594f, 0.84103f, -0.50440f, -0.38825f, 0.15626f, + -1.10654f, +}; + +static const float av1_tx_type_nn_bias_8x4_hor_layer1[4] = { + -0.92861f, + -1.45151f, + -1.33588f, + -4.33853f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_8x4_hor = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_8x4_hor_layer0, + av1_tx_type_nn_weights_8x4_hor_layer1 }, + { av1_tx_type_nn_bias_8x4_hor_layer0, av1_tx_type_nn_bias_8x4_hor_layer1 } +}; + +static const float av1_tx_type_nn_weights_8x4_ver_layer0[32] = { + -1.10946f, 1.86574f, -1.59343f, 0.27018f, -1.70676f, -0.73982f, -0.19021f, + -1.94208f, -2.29759f, -1.44402f, 0.28700f, -1.18340f, -1.50158f, -0.44175f, + -1.36831f, 1.00374f, 2.59312f, 0.50291f, -0.71042f, -0.12238f, -0.15901f, + -0.22807f, -0.67376f, -0.30215f, 0.54407f, -0.45538f, 1.18262f, 2.28687f, + 1.66212f, 1.70826f, 1.55182f, 0.12230f, +}; + +static const float av1_tx_type_nn_bias_8x4_ver_layer0[8] = { + 0.10943f, 2.09789f, 2.16578f, 0.15766f, + -0.42461f, 0.00000f, 1.22090f, -1.28717f, +}; + +static const float av1_tx_type_nn_weights_8x4_ver_layer1[32] = { + 
1.20426f, -1.23237f, 2.41053f, -0.72488f, 1.25249f, 0.18018f, -0.09586f, + 2.17901f, 0.15364f, 1.21535f, -0.38263f, -0.74309f, 0.50551f, -0.54208f, + 0.59139f, 1.16095f, 0.55919f, -0.60183f, 1.18949f, 1.60787f, 0.54002f, + -0.10712f, -0.16153f, 0.16207f, -0.32338f, 2.68712f, -2.83483f, -0.27086f, + -1.15005f, -0.39311f, 1.51236f, -1.68973f, +}; + +static const float av1_tx_type_nn_bias_8x4_ver_layer1[4] = { + 1.81013f, + 1.10517f, + 2.90059f, + 0.95391f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_8x4_ver = { + 4, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_8x4_ver_layer0, + av1_tx_type_nn_weights_8x4_ver_layer1 }, + { av1_tx_type_nn_bias_8x4_ver_layer0, av1_tx_type_nn_bias_8x4_ver_layer1 } +}; +/******************************************************************************/ + +// Tx type model for 8x8 block. +static const float av1_tx_type_nn_weights_8x8_hor_layer0[128] = { + -0.85529f, 0.37619f, 0.12754f, 0.08622f, 0.45278f, 0.54929f, 1.60651f, + -0.62654f, -0.54929f, -0.10131f, -0.17569f, 0.13948f, 0.31695f, -0.05616f, + 0.20483f, -0.36448f, 2.27203f, -0.33087f, 0.47679f, 0.86888f, 0.39370f, + 0.46239f, 0.01113f, 1.50327f, -1.48226f, -1.69621f, -1.49777f, -1.38885f, + -1.37753f, -1.22681f, -1.70576f, 0.51329f, -1.65662f, 1.74197f, -0.13579f, + -0.13133f, -0.58396f, -0.55510f, -1.10709f, -2.34975f, 0.22445f, -0.56491f, + -0.83432f, 0.13492f, 1.32147f, 2.85285f, 0.13819f, 0.03792f, -1.30792f, + 0.04155f, -0.70644f, -0.43430f, -0.16212f, -0.86945f, -1.16976f, 1.68339f, + 0.29540f, 0.01137f, -0.25335f, -0.16856f, 0.12028f, 0.05207f, 0.39357f, + -0.01545f, -0.21980f, -1.94091f, -1.01315f, -0.68270f, -0.40590f, -0.67111f, + 2.08283f, 0.19291f, -4.81426f, -0.65044f, -0.24598f, 0.06371f, -0.10272f, + -0.14502f, -0.06821f, 0.45202f, 0.21091f, -0.80864f, 0.39255f, 1.79189f, + 1.80453f, 1.10484f, 1.17608f, 0.96901f, -0.35871f, -0.94311f, 0.63147f, + 2.95157f, 0.45917f, -0.42849f, -0.55643f, -0.06097f, 3.49299f, -0.50972f, + 0.11075f, -0.08405f, -0.09274f, -0.22694f, -0.42426f, 0.48632f, -1.61074f, + 1.82998f, 0.37623f, -1.20330f, -0.01142f, -1.33307f, -0.27492f, -2.23621f, + 1.38846f, 1.42085f, 1.42568f, 1.36152f, 1.46910f, 1.27473f, 1.34752f, + 0.12753f, -1.08197f, -1.08280f, -0.79489f, -1.12338f, -1.06795f, -0.87857f, + -0.99892f, 1.09823f, +}; + +static const float av1_tx_type_nn_bias_8x8_hor_layer0[16] = { + -0.49232f, -0.29685f, -1.44020f, 1.10940f, 1.16452f, -0.34862f, + -0.38761f, -0.36243f, 0.21776f, 0.28234f, 2.34269f, -0.04104f, + -0.26319f, 2.65579f, -1.30137f, -0.01487f, +}; + +static const float av1_tx_type_nn_weights_8x8_hor_layer1[64] = { + -0.38058f, -0.41295f, -1.26884f, -0.75560f, -1.57450f, 0.56072f, -1.42322f, + -0.29106f, 0.07228f, 0.04391f, 1.61388f, -0.03055f, 0.81637f, 2.06045f, + 0.27119f, -0.48328f, -0.45528f, -0.60534f, -1.61209f, -0.78157f, -1.65034f, + 0.60958f, -1.30523f, 0.25143f, 0.11398f, 0.37860f, 1.54829f, 0.02309f, + 0.67288f, 2.11447f, 0.44845f, -0.70406f, -0.67897f, -0.38759f, -1.30383f, + -1.22646f, -1.54571f, 0.60552f, -1.52565f, 0.11469f, 0.17344f, 0.08622f, + 1.57906f, -0.00909f, 0.81634f, 2.04909f, 1.26466f, -1.45741f, -0.75229f, + 0.06200f, -1.05835f, -0.66257f, -1.73766f, 0.99923f, -1.87082f, 0.14580f, + 0.49525f, 0.46839f, 1.32203f, 0.33923f, 0.97001f, 2.38584f, 1.58811f, + 0.06161f, +}; + +static const float av1_tx_type_nn_bias_8x8_hor_layer1[4] = { + 1.70385f, + 1.82373f, + 1.78496f, + 1.80826f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_8x8_hor = { + 8, // 
num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_8x8_hor_layer0, + av1_tx_type_nn_weights_8x8_hor_layer1 }, + { av1_tx_type_nn_bias_8x8_hor_layer0, av1_tx_type_nn_bias_8x8_hor_layer1 } +}; + +static const float av1_tx_type_nn_weights_8x8_ver_layer0[128] = { + -0.67016f, -1.72366f, -1.86576f, -1.50962f, -1.70419f, -1.73964f, -1.84615f, + 2.09681f, -0.05081f, -0.61030f, 2.02541f, 0.60222f, 0.99936f, 2.02114f, + -0.53893f, -0.23757f, 0.73566f, 0.25443f, 0.00132f, -0.74036f, -0.75351f, + -0.76964f, -1.71007f, -0.15770f, 1.60982f, 2.17638f, 0.90681f, 0.64973f, + 0.85914f, 0.58786f, -1.46228f, 0.05187f, 1.18804f, 0.30850f, 0.29512f, + 0.40526f, 0.37635f, 0.32311f, 0.37471f, 1.12346f, 3.41856f, -0.36653f, + 0.42537f, -0.19240f, 0.00155f, 0.30826f, -0.02116f, -0.53435f, -0.34829f, + -0.52466f, -0.11521f, -0.29163f, -2.05689f, -2.87372f, -0.62626f, 0.09585f, + -0.75257f, 0.10057f, 1.43474f, 0.89450f, 0.75900f, 1.11147f, 1.00558f, + 0.25886f, 2.22095f, -0.17926f, 0.57161f, 0.39546f, 0.47846f, 0.40452f, + 0.54298f, 0.45814f, -3.62788f, -3.02374f, 0.03716f, -0.13937f, -0.09415f, + -0.12463f, 0.05682f, 0.03672f, 1.20746f, 1.25003f, 1.27071f, 1.31883f, + 1.27473f, 1.34943f, 1.23158f, 0.09039f, 0.19388f, 0.63420f, 2.79612f, + 0.93803f, -0.11323f, -0.02027f, 0.41286f, -0.05979f, -3.80705f, -0.52451f, + -0.77098f, -0.68132f, -0.65559f, -0.60975f, -1.26165f, 0.25582f, 0.05346f, + 0.61403f, 0.32140f, -2.39831f, -1.42355f, 1.30541f, 1.02361f, 0.12930f, + -1.61469f, -0.77036f, -0.59144f, 1.27769f, 1.52068f, 0.82137f, 1.83159f, + -0.66626f, -0.69806f, -1.00564f, -0.85995f, -0.90889f, -0.84412f, -0.85712f, + -1.29848f, 0.39308f, +}; + +static const float av1_tx_type_nn_bias_8x8_ver_layer0[16] = { + -0.14868f, -0.48343f, 3.94416f, -0.78037f, -1.33789f, -0.60611f, + 0.51793f, 0.44030f, -0.71563f, 0.22561f, -1.19083f, -0.46149f, + 0.83015f, 0.06024f, 1.17180f, 0.65122f, +}; + +static const float av1_tx_type_nn_weights_8x8_ver_layer1[64] = { + -1.42711f, -0.21683f, 2.12061f, 0.20489f, -0.50228f, -0.24770f, 0.23391f, + 1.03470f, -0.44847f, -0.63225f, -0.21583f, -0.06467f, -0.21892f, -0.07786f, + 1.43322f, 0.00280f, -1.53057f, -0.18912f, 1.95333f, 0.31151f, -2.07601f, + 0.06776f, 0.25529f, 0.94800f, -1.11453f, -0.20594f, -0.13281f, 0.01485f, + 0.17650f, -0.07955f, 1.43734f, -0.23193f, -2.06463f, -0.21238f, 2.13707f, + 0.30351f, 0.27594f, -0.36245f, 0.19539f, 0.91045f, -0.24068f, -0.37616f, + 0.88792f, 0.02947f, -0.16903f, -0.04932f, 1.51293f, -0.95967f, -1.62903f, + 0.05326f, 2.30703f, 0.64445f, -1.09464f, -0.16623f, 1.00240f, 0.07548f, + -0.50406f, 0.63854f, 1.02340f, 0.49833f, 0.13671f, 0.26722f, 2.09516f, + -0.41305f, +}; + +static const float av1_tx_type_nn_bias_8x8_ver_layer1[4] = { + 2.14067f, + 2.76699f, + 2.04233f, + 1.34803f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_8x8_ver = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_8x8_ver_layer0, + av1_tx_type_nn_weights_8x8_ver_layer1 }, + { av1_tx_type_nn_bias_8x8_ver_layer0, av1_tx_type_nn_bias_8x8_ver_layer1 } +}; +/******************************************************************************/ + +// Tx type model for 8x16 block. 
+static const float av1_tx_type_nn_weights_8x16_hor_layer0[128] = { + -1.61872f, -1.58520f, -1.41236f, -1.53255f, -1.59794f, -1.25769f, -1.90043f, + 0.73431f, 1.10135f, 0.47054f, 0.43230f, -0.43009f, -0.09135f, -0.07289f, + -0.38785f, 1.23775f, -0.35312f, 0.73789f, 0.88864f, 0.75957f, 0.62579f, + 0.46974f, 0.21851f, 1.63821f, -2.27289f, -0.68522f, -0.69814f, -0.84368f, + -0.91320f, -0.63055f, -1.03296f, 0.55778f, -0.00071f, 1.27539f, 1.60068f, + 1.40975f, 0.97372f, 0.92843f, 1.90853f, 0.12626f, 1.71953f, 1.41978f, + -0.12234f, -1.27058f, 0.76207f, 0.02495f, -0.67038f, -0.05255f, 1.72923f, + 1.47630f, 1.47058f, 1.47614f, 1.49354f, 1.66131f, 1.50801f, 0.17145f, + -2.30947f, -2.10850f, -1.25636f, -0.24900f, 0.72602f, 1.26572f, 0.97865f, + -0.65466f, 1.31129f, 0.26916f, 0.12139f, -0.12761f, -0.39143f, -0.28134f, + 0.06584f, 2.24418f, 0.22516f, 0.05011f, -0.01671f, -0.29476f, -0.40326f, + 0.21138f, -0.11573f, -0.31154f, -0.36828f, 0.03694f, -0.07172f, -0.63419f, + -3.14351f, -1.23125f, 0.65311f, -0.11406f, 1.97287f, -0.10422f, 0.83896f, + 0.85033f, 0.49724f, 0.80482f, 0.51454f, 1.06447f, 0.76693f, 0.72599f, + -0.78573f, -0.53950f, 0.40894f, 0.00086f, 0.10784f, -0.70498f, 1.16395f, + 1.14597f, 1.13496f, 1.12177f, 1.02100f, -1.37574f, -2.97144f, 0.33899f, + 0.42013f, 0.86327f, 2.31983f, 2.04008f, 0.95503f, 0.15081f, 0.11530f, + -0.02574f, -4.77119f, 0.13257f, -0.01704f, -0.23087f, -0.00825f, 0.07029f, + -0.28136f, 0.42556f, +}; + +static const float av1_tx_type_nn_bias_8x16_hor_layer0[16] = { + 0.93617f, -0.24000f, -1.26821f, 0.78780f, 0.13690f, -0.21948f, + -1.45162f, 0.44584f, -1.92582f, -0.23169f, 0.56004f, -1.19937f, + 1.81560f, -1.02643f, -0.81690f, 0.08302f, +}; + +static const float av1_tx_type_nn_weights_8x16_hor_layer1[64] = { + 0.06696f, -0.11538f, -1.42029f, 0.32965f, 0.81046f, 0.01146f, 1.20945f, + -0.16899f, 0.53224f, -0.40232f, 0.01786f, -0.73242f, 1.29750f, 1.95185f, + 0.70143f, 1.43287f, 0.76220f, 0.79937f, -1.79011f, -1.15178f, 0.42526f, + -0.67519f, 0.77267f, -0.30697f, 2.46004f, -0.49828f, 0.02875f, 1.09972f, + 1.47662f, 0.61719f, 0.61417f, -0.12363f, 2.53048f, 0.00418f, -1.38964f, + 0.88117f, 0.39239f, -0.19347f, -2.58600f, -0.33715f, 1.09323f, -0.32127f, + 0.02456f, -0.19125f, 1.12728f, 0.66502f, 0.34296f, 1.14897f, 0.29967f, + 1.19209f, 0.22108f, -0.11975f, 1.49776f, -1.34624f, -2.58478f, -1.34632f, + 1.53207f, 0.45634f, -1.48476f, 0.17489f, 0.71790f, -2.12086f, -1.21778f, + -1.31243f, +}; + +static const float av1_tx_type_nn_bias_8x16_hor_layer1[4] = { + 0.83359f, + 1.06875f, + 1.77645f, + 1.49570f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_8x16_hor = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_8x16_hor_layer0, + av1_tx_type_nn_weights_8x16_hor_layer1 }, + { av1_tx_type_nn_bias_8x16_hor_layer0, av1_tx_type_nn_bias_8x16_hor_layer1 } +}; + +static const float av1_tx_type_nn_weights_8x16_ver_layer0[128] = { + 0.32858f, -1.28887f, 0.25632f, -0.05262f, 2.69203f, -0.07004f, 1.37337f, + -0.05725f, -0.05659f, 0.05592f, 0.01039f, -0.29343f, 1.58628f, -0.30003f, + -3.43118f, 0.00272f, 1.70928f, -0.76348f, 0.05889f, -0.03263f, -0.07724f, + 0.03523f, -0.19890f, 1.18005f, -0.03605f, -0.20530f, -4.00733f, 0.10210f, + -0.05368f, -0.17650f, -0.15317f, 0.06499f, 0.56705f, 1.04341f, 0.62890f, + 0.73451f, -0.22199f, 0.86659f, 0.78443f, -0.61664f, -0.50606f, 0.30247f, + 0.14455f, 0.39276f, 0.49203f, 0.65019f, 0.12269f, 1.64080f, 1.68289f, + 1.42694f, 1.60825f, 1.58501f, 1.47252f, 1.62589f, 1.48218f, 
0.17726f, + -0.04884f, 0.35376f, -0.04796f, 0.32589f, 0.35087f, 0.35258f, -0.46103f, + -0.31176f, -0.05203f, 0.07247f, -0.26756f, 0.22019f, 0.03412f, 0.33773f, + 0.29811f, -0.11140f, 0.12831f, -0.44673f, -0.09858f, 0.07889f, 0.15137f, + 0.00347f, -0.23394f, 0.08886f, -0.31201f, -0.79912f, -0.51092f, 0.14123f, + -1.09599f, -4.26020f, -0.68675f, -0.02842f, -1.54538f, -1.28977f, -1.30558f, + -1.21074f, -1.37142f, -1.14743f, -1.85397f, 0.82985f, -0.30681f, 0.04494f, + -0.24023f, -4.18053f, -0.16096f, -0.55492f, -0.27882f, 0.05829f, -0.41224f, + -2.52088f, -0.56162f, -1.04547f, -1.70685f, -0.28842f, -1.43673f, -0.01468f, + -3.20585f, -0.69120f, -0.43931f, -0.46270f, -0.65885f, -0.55884f, -0.75138f, + 0.36381f, -5.70858f, -0.14548f, -0.15745f, -0.11812f, -0.07605f, -0.07693f, + -0.12236f, 0.16075f, +}; + +static const float av1_tx_type_nn_bias_8x16_ver_layer0[16] = { + -0.35385f, 0.30491f, -0.90011f, 0.42941f, 1.20928f, -0.88331f, + -1.48818f, -0.34785f, -0.32668f, -0.22695f, 0.89188f, 0.65521f, + 0.57598f, 0.99819f, 0.75175f, 0.17044f, +}; + +static const float av1_tx_type_nn_weights_8x16_ver_layer1[64] = { + -0.62913f, -0.34304f, 0.42963f, -0.17440f, -1.44092f, 0.69142f, -1.36067f, + 0.52211f, 0.44658f, -0.26501f, -0.41657f, 0.34428f, -0.34390f, -0.58567f, + -0.84097f, -1.96311f, -0.37215f, -0.22250f, -1.23811f, -0.07247f, -0.81731f, + 0.58755f, -1.30559f, 0.39551f, 0.41743f, -0.09940f, -0.33230f, 0.14458f, + -0.25139f, -0.54517f, 0.13469f, -0.38157f, -0.39109f, -0.18205f, 0.06834f, + -0.08395f, -0.92187f, 0.56724f, 1.44381f, 0.53226f, -0.22356f, 0.12285f, + -0.29418f, -1.86749f, -0.22372f, -0.60204f, -0.87746f, -1.16936f, 0.56884f, + 0.62641f, -0.11823f, 1.00395f, 1.64794f, -0.64535f, 2.29322f, -0.23397f, + 0.17251f, -0.35927f, 0.65631f, -0.26812f, 0.80128f, 0.85748f, 0.47404f, + 2.20547f, +}; + +static const float av1_tx_type_nn_bias_8x16_ver_layer1[4] = { + -0.44080f, + -1.67455f, + -1.46332f, + -6.13206f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_8x16_ver = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_8x16_ver_layer0, + av1_tx_type_nn_weights_8x16_ver_layer1 }, + { av1_tx_type_nn_bias_8x16_ver_layer0, av1_tx_type_nn_bias_8x16_ver_layer1 } +}; +/******************************************************************************/ + +// Tx type model for 16x8 block. 
+static const float av1_tx_type_nn_weights_16x8_hor_layer0[128] = { + 0.02600f, 0.09786f, -1.05107f, -0.35594f, -0.15658f, 2.99828f, -0.07106f, + -0.10101f, -0.14412f, -0.83790f, -0.19434f, 2.28368f, 1.91727f, -0.00956f, + -0.90640f, 0.09174f, 1.58895f, 1.38945f, 1.49431f, 1.51381f, 1.44803f, + 1.53544f, 1.44694f, 0.17753f, 1.69735f, -0.78652f, 0.31092f, -0.23736f, + 0.02231f, -0.09884f, -0.00493f, 1.21189f, -1.94382f, -0.34629f, -0.58309f, + 0.72291f, -0.30056f, 0.90660f, -0.57495f, 3.07809f, 0.73644f, 1.43050f, + 1.34356f, -0.66554f, 0.50102f, -0.64305f, 0.42044f, -1.66165f, -0.05733f, + -2.51402f, -1.01067f, -0.33390f, -0.32986f, -0.92431f, 1.86281f, -0.07290f, + -0.26290f, -0.68941f, 1.81156f, 0.66125f, -2.09974f, 0.17032f, -0.67461f, + -0.00876f, -1.50154f, 1.17153f, 1.00377f, 0.33022f, 0.74689f, 0.42878f, + 0.61725f, -0.83967f, 0.09467f, -0.39892f, 0.33863f, 0.10656f, -0.09249f, + -0.39757f, 0.48481f, -0.35162f, 1.47014f, 1.67827f, -1.84051f, 0.16291f, + -0.50135f, -2.29911f, -0.42217f, -0.13358f, 1.45899f, -0.14743f, -0.02763f, + -0.28003f, -0.01364f, 0.21014f, -0.29026f, -0.20198f, 1.38782f, 0.56731f, + 0.27489f, 0.43227f, 0.41326f, 0.42721f, 0.87720f, -1.90067f, -5.04951f, + -0.17638f, -0.58119f, -0.08954f, -0.13692f, -0.12325f, -0.38548f, 0.66462f, + -1.42377f, -1.21917f, -1.38193f, -1.36539f, -1.39378f, -1.19629f, -1.59812f, + 0.28689f, 0.32394f, 0.52128f, 0.01013f, -0.28948f, -0.26293f, -0.44331f, + -0.36570f, -0.50757f, +}; + +static const float av1_tx_type_nn_bias_16x8_hor_layer0[16] = { + -0.08696f, -0.22110f, -1.43604f, -1.00451f, -1.51029f, 0.63736f, + 0.45260f, 0.16229f, 4.01393f, -0.21748f, 0.36411f, -0.08764f, + -0.12329f, 0.08986f, 1.08117f, -0.00220f, +}; + +static const float av1_tx_type_nn_weights_16x8_hor_layer1[64] = { + 0.55824f, -0.14648f, 0.81947f, -0.45867f, -1.86078f, -0.17291f, 0.34849f, + 0.15153f, 1.75625f, -0.25760f, 0.72015f, -0.30059f, -0.57975f, 0.07609f, + -0.02036f, 0.07912f, 0.57080f, -0.13792f, 0.74184f, -0.87669f, -1.87572f, + -0.27270f, 0.39751f, 0.19652f, 2.03514f, -0.32944f, 0.76251f, 0.04399f, + -0.63175f, 0.37420f, 0.08309f, 0.04466f, 0.60255f, -0.12820f, 1.66065f, + -0.59496f, -1.94794f, -0.14847f, 0.39424f, 0.16273f, 1.80587f, 0.41197f, + 0.74691f, -0.21217f, -0.63173f, 0.09510f, -0.35538f, -0.04407f, 0.92847f, + 0.20141f, 1.68680f, -0.56528f, -2.26960f, 0.12978f, 0.73748f, 0.42438f, + 2.00673f, -0.40189f, 0.95423f, 0.23234f, -0.80953f, 0.65814f, 0.49444f, + -0.23347f, +}; + +static const float av1_tx_type_nn_bias_16x8_hor_layer1[4] = { + 3.57175f, + 2.42612f, + 3.31259f, + 2.08287f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_16x8_hor = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_16x8_hor_layer0, + av1_tx_type_nn_weights_16x8_hor_layer1 }, + { av1_tx_type_nn_bias_16x8_hor_layer0, av1_tx_type_nn_bias_16x8_hor_layer1 } +}; + +static const float av1_tx_type_nn_weights_16x8_ver_layer0[128] = { + 0.46633f, 1.55328f, -0.11230f, -0.29571f, 0.18814f, -1.52430f, -2.34660f, + 0.08644f, -1.97718f, -1.29140f, -1.12262f, -1.12985f, -1.25911f, -0.96506f, + -1.57129f, 0.96021f, 1.34192f, 1.28623f, 1.21655f, 1.28758f, 1.25482f, + 1.30195f, 1.19190f, 0.09310f, 0.52072f, 0.91487f, 1.24100f, 1.61236f, + 1.72166f, 2.20750f, 1.62379f, -1.43936f, 0.50665f, 0.40213f, 0.66502f, + -1.66699f, -3.07618f, 0.05877f, 0.60987f, -0.09995f, -0.10916f, 0.48049f, + 0.23812f, 0.39847f, -0.21682f, -0.63455f, 0.33453f, -0.67939f, -4.14355f, + -0.62756f, -0.22502f, -0.17215f, 0.01062f, 
0.27049f, -0.10748f, 0.30945f, + 2.72445f, -0.89181f, -0.06800f, 0.20595f, -0.73385f, 0.04071f, -1.30294f, + 1.83507f, 0.92570f, 0.69609f, 0.76285f, 0.69892f, 0.76409f, 0.63104f, + 0.73397f, 1.09575f, -0.20129f, -0.24022f, -0.24599f, -0.59107f, -0.88755f, + -0.68987f, -0.75495f, -1.31002f, -1.30237f, -0.94093f, -2.15678f, -1.49303f, + -1.17498f, -1.39952f, -0.91270f, -0.05587f, 1.02381f, -0.75580f, -0.65263f, + -0.78996f, -0.71075f, -0.71018f, -0.70350f, -1.26196f, 2.34208f, -0.53611f, + 0.19752f, -0.16842f, -0.24828f, 0.21857f, 0.08222f, -2.55894f, -1.75702f, + 0.11394f, 1.03083f, 0.79972f, -1.54112f, -1.82341f, -0.57597f, -0.02077f, + -0.39616f, -0.00995f, -0.12809f, 0.01188f, -0.25117f, 0.09202f, 0.09336f, + -0.05614f, -0.30039f, 0.25834f, 1.19944f, 1.22533f, 0.92330f, 0.75967f, + -0.81945f, -0.41647f, +}; + +static const float av1_tx_type_nn_bias_16x8_ver_layer0[16] = { + 0.17841f, 0.67315f, -1.24450f, 3.13859f, 0.16203f, -0.14992f, + 0.29553f, -1.15567f, -0.71421f, 1.15977f, 1.14585f, 3.02460f, + -0.04510f, 0.48000f, -0.09354f, -0.42422f, +}; + +static const float av1_tx_type_nn_weights_16x8_ver_layer1[64] = { + 0.29912f, -0.10009f, -1.11478f, 1.76812f, -0.27719f, 0.52148f, 0.17622f, + -1.17116f, 0.73397f, -0.69279f, -0.11080f, 1.53751f, -1.42003f, 0.14731f, + 0.13592f, -0.04883f, 0.39186f, -0.13655f, -0.43994f, 1.82759f, -0.25601f, + -0.15018f, 0.51920f, -1.56070f, 0.31683f, -0.79367f, -0.02904f, 1.28637f, + -1.15203f, 0.26627f, 0.42828f, -0.24258f, 0.38647f, -0.83352f, 0.32553f, + 2.09522f, -0.26822f, -0.42191f, 0.32825f, -1.30748f, 1.50551f, -0.52669f, + 0.20045f, 1.69318f, -1.47839f, 0.30802f, -0.07290f, -0.28106f, 0.68192f, + -0.15522f, 1.12579f, 2.21921f, 0.09720f, -0.50265f, 0.83165f, -1.31721f, + 0.72422f, -1.24952f, 0.61653f, 2.04117f, -1.42406f, 0.52568f, -0.46180f, + -0.00873f, +}; + +static const float av1_tx_type_nn_bias_16x8_ver_layer1[4] = { + 3.34981f, + 3.74710f, + 1.38339f, + 0.45176f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_16x8_ver = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_16x8_ver_layer0, + av1_tx_type_nn_weights_16x8_ver_layer1 }, + { av1_tx_type_nn_bias_16x8_ver_layer0, av1_tx_type_nn_bias_16x8_ver_layer1 } +}; +/******************************************************************************/ + +// Tx type model for 16x16 block. 
+static const float av1_tx_type_nn_weights_16x16_layer0[128] = { + 1.26592f, 1.36313f, 1.30956f, 1.29926f, 1.48816f, 1.68851f, 1.32000f, + 0.13321f, -0.22477f, -0.88906f, -0.19622f, 1.69605f, 1.22180f, -1.57771f, + -1.15765f, 0.05710f, -1.13355f, -0.85486f, -0.99971f, -0.91571f, -1.06031f, + -0.77952f, -1.15723f, 1.17809f, 1.35602f, -0.05243f, -0.37596f, 0.26108f, + 0.17611f, -0.10323f, 0.77279f, -0.48911f, -0.79308f, 0.55112f, 0.43918f, + 0.27872f, 0.28714f, 0.45830f, 1.05689f, 0.03705f, -2.49975f, -0.01940f, + 0.05709f, 0.07942f, -0.13290f, -0.10359f, 0.00143f, 0.37303f, 0.96470f, + 0.53293f, 1.14459f, 0.89185f, 0.43378f, 0.47764f, 0.90924f, 0.15279f, + -0.15361f, 0.02949f, 0.42240f, 0.68143f, 0.89588f, 0.73754f, 0.10974f, + 1.57755f, -0.39870f, -0.32914f, 0.35638f, 0.34991f, -0.00003f, -0.23373f, + 0.29630f, -0.76699f, -0.01356f, 0.04234f, 0.84253f, 1.92078f, 0.93160f, + 0.71993f, 0.71604f, 0.76455f, -1.59782f, 0.32332f, 1.11628f, 0.33062f, + -0.03728f, -0.05710f, 0.80447f, -0.14719f, 1.34658f, -0.05718f, 0.64015f, + 0.21926f, 0.41653f, 0.12720f, 0.54092f, 1.39411f, 1.81819f, -0.24513f, + 0.00955f, 0.38011f, -0.57787f, -0.41759f, 0.68834f, -0.31783f, -0.40607f, + -0.10107f, -0.79374f, 0.75599f, -0.16282f, -0.14490f, -0.20783f, -0.55019f, + -0.13793f, -0.22293f, 0.18305f, 0.12445f, 0.56830f, 0.24567f, 0.09278f, + 0.70803f, 0.35803f, -1.52676f, -0.89624f, 0.77665f, 0.19877f, 0.77175f, + 0.50355f, 0.08592f, +}; + +static const float av1_tx_type_nn_bias_16x16_layer0[16] = { + -1.31834f, 0.14346f, -0.10062f, 0.84489f, 0.95617f, -0.06720f, + -0.68502f, -0.91442f, -0.31932f, 0.25276f, -0.15138f, -1.57661f, + -0.14062f, -0.42120f, 0.94573f, -0.09287f, +}; + +static const float av1_tx_type_nn_weights_16x16_layer1[64] = { + -1.80333f, -1.06353f, 0.55139f, 0.74644f, 0.13747f, -0.93018f, -0.10286f, + 0.67133f, 0.24460f, 1.44583f, 0.02173f, 0.26037f, -0.73687f, 0.19566f, + 0.61846f, -0.58601f, -1.03196f, -0.74415f, 0.30041f, -0.41967f, 1.08740f, + 0.96224f, -0.59139f, 0.03813f, 0.05403f, 1.33427f, -0.54375f, -1.92181f, + 0.54704f, 0.13608f, 0.22151f, -0.38076f, 1.18390f, -0.77508f, -1.84283f, + 1.00894f, 0.62318f, -0.15296f, 1.27600f, 0.22822f, 0.12751f, 0.93910f, + -0.28502f, 0.53912f, -0.96889f, 0.10182f, 0.81508f, -0.43028f, 2.67386f, + 0.52204f, 0.49820f, -0.41711f, 1.05038f, 1.12192f, 0.74349f, -0.75417f, + -0.03718f, -0.35769f, 0.89651f, 0.63236f, 0.54215f, -0.07894f, 0.48274f, + 1.08829f, +}; + +static const float av1_tx_type_nn_bias_16x16_layer1[4] = { + 0.81986f, + 1.26865f, + 0.11118f, + 2.48404f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_16x16 = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_type_nn_weights_16x16_layer0, + av1_tx_type_nn_weights_16x16_layer1, + }, + { + av1_tx_type_nn_bias_16x16_layer0, + av1_tx_type_nn_bias_16x16_layer1, + }, +}; +/******************************************************************************/ + +// Tx type model for 4x16 block. 
+static const float av1_tx_type_nn_weights_4x16_hor_layer0[32] = { + 0.36539f, 0.25667f, 0.01491f, -0.21959f, 2.55105f, 0.17615f, 1.79884f, + 1.65936f, -0.44363f, 0.00706f, -0.68004f, -0.64360f, 1.75760f, 1.91906f, + 1.47682f, 0.09650f, -3.59244f, -0.35004f, 0.93295f, 0.25806f, -0.08154f, + 0.79332f, 0.79535f, 1.09467f, 1.57855f, -0.51359f, 0.90553f, -1.67744f, + -1.74563f, -0.88830f, -1.77603f, 2.15935f, +}; + +static const float av1_tx_type_nn_bias_4x16_hor_layer0[8] = { + -0.36435f, -2.22731f, -0.00837f, -1.34546f, + 0.62806f, -0.20675f, 4.91940f, -0.56079f, +}; + +static const float av1_tx_type_nn_weights_4x16_hor_layer1[32] = { + -0.57191f, -1.46418f, 0.67331f, -1.15027f, 0.46288f, 0.81251f, 2.51768f, + -0.27147f, 0.00761f, -2.15214f, -0.69650f, -0.50808f, 0.92832f, 0.45668f, + 2.34201f, -0.52941f, 0.51008f, -1.55496f, -0.01371f, -0.12356f, 0.66624f, + 0.88043f, 2.64862f, -1.28024f, -0.17578f, -1.80034f, -0.32217f, 0.89519f, + 1.28413f, -0.30326f, 2.45329f, -0.83335f, +}; + +static const float av1_tx_type_nn_bias_4x16_hor_layer1[4] = { + 2.33198f, + 3.36245f, + 1.62603f, + 2.91056f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_4x16_hor = { + 4, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_4x16_hor_layer0, + av1_tx_type_nn_weights_4x16_hor_layer1 }, + { av1_tx_type_nn_bias_4x16_hor_layer0, av1_tx_type_nn_bias_4x16_hor_layer1 } +}; + +static const float av1_tx_type_nn_weights_4x16_ver_layer0[128] = { + 1.61392f, 1.41239f, 1.47646f, 1.47325f, 1.46110f, 1.49208f, 1.49414f, + 0.12835f, -0.76986f, 0.07087f, -0.24572f, -0.93168f, 3.07935f, -0.18183f, + -0.09831f, -0.07703f, -0.03222f, -0.25473f, -0.06090f, 2.93713f, -0.38711f, + -0.12884f, -0.18329f, -0.06262f, -0.00327f, -0.02930f, -0.01641f, -0.00622f, + -0.03305f, -4.07069f, -2.76643f, 0.04413f, -1.03176f, -0.19217f, -0.44980f, + -2.48615f, -2.58112f, -0.87695f, 0.16187f, -0.04891f, -0.06854f, 1.08104f, + 0.75245f, 1.49302f, 0.63363f, 1.45715f, 0.92574f, 1.72029f, 0.33326f, + 3.86646f, 0.04422f, 0.41019f, 0.36212f, 0.56600f, -1.01552f, 0.05128f, + 0.40454f, -1.05100f, -0.47461f, -1.33168f, -0.46145f, -1.36870f, -0.88838f, + -1.05358f, -0.18537f, -0.34357f, -0.03698f, 0.68905f, 0.41010f, 0.31223f, + -0.43382f, -0.74715f, 2.03366f, -0.30419f, 0.45747f, 0.09526f, 0.31678f, + 0.22915f, 0.21832f, 1.26385f, -0.06814f, -0.71417f, -1.18947f, 0.03762f, + 0.10936f, 2.97396f, -0.42638f, -0.03123f, -5.49756f, -0.17029f, -0.11323f, + 0.05173f, -0.44274f, -0.15738f, 0.11311f, 0.43872f, 0.16837f, -0.52849f, + 2.90050f, -0.54735f, -0.29591f, 1.24030f, 0.21696f, -0.04443f, -1.60877f, + -1.36365f, -1.27432f, -1.52060f, -1.34397f, -1.13371f, -1.87554f, 0.80123f, + 0.42820f, -0.14157f, -2.73963f, -0.68040f, -0.35236f, 0.14490f, 2.23477f, + 0.01370f, -0.20426f, -1.51411f, -0.72293f, 0.64516f, 0.97638f, 0.32616f, + -0.27975f, -0.01149f, +}; + +static const float av1_tx_type_nn_bias_4x16_ver_layer0[16] = { + -1.37863f, -0.05763f, -0.07041f, 0.15306f, 0.96026f, -1.42105f, + -0.55822f, 1.04845f, -0.17662f, -1.25345f, -0.11927f, 0.49845f, + -0.32530f, 0.73483f, 0.08322f, -0.23890f, +}; + +static const float av1_tx_type_nn_weights_4x16_ver_layer1[64] = { + 0.27194f, 0.50607f, 0.49229f, -0.48192f, 0.15667f, -1.38891f, 0.38102f, + -0.58825f, -0.07337f, -0.52909f, 0.36975f, 0.28710f, 0.34992f, -0.73630f, + 0.30386f, -0.58822f, 0.36127f, 0.57950f, 0.55878f, -0.42796f, 0.19967f, + -1.45517f, 0.42529f, -0.54630f, -0.38169f, -0.84899f, 0.41622f, 0.46935f, + 0.39077f, -0.75448f, 0.31698f, 
-0.76187f, 0.97765f, 0.57052f, 0.55825f, + -0.54273f, 0.20466f, -1.46347f, 0.41813f, -0.55019f, -0.19948f, -0.57982f, + 0.41206f, 0.32373f, 0.38537f, -1.11657f, 0.32887f, -0.76911f, 1.12259f, + 0.72163f, 0.82603f, 0.37786f, 0.34976f, -1.86642f, 0.59961f, -0.16329f, + -0.36631f, -0.56814f, 0.60410f, 0.53158f, 0.56389f, -0.70508f, 0.51009f, + -0.56513f, +}; + +static const float av1_tx_type_nn_bias_4x16_ver_layer1[4] = { + 4.60896f, + 4.53551f, + 4.53124f, + 4.27435f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_4x16_ver = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_4x16_ver_layer0, + av1_tx_type_nn_weights_4x16_ver_layer1 }, + { av1_tx_type_nn_bias_4x16_ver_layer0, av1_tx_type_nn_bias_4x16_ver_layer1 } +}; +/******************************************************************************/ + +// Tx type model for 16x4 block. +static const float av1_tx_type_nn_weights_16x4_hor_layer0[128] = { + 1.45347f, -0.15743f, 0.44236f, 0.25808f, 0.33944f, 0.38678f, 0.24428f, + 1.67287f, 0.09539f, -0.42940f, -0.31507f, -0.00154f, -2.98755f, -2.27744f, + -0.49183f, 0.09333f, -0.99026f, -0.22157f, 0.53701f, 0.60447f, 0.15686f, + -0.04646f, 0.26341f, 2.12361f, 0.27090f, -1.14716f, -0.64146f, -0.91604f, + -0.75335f, -0.60056f, -1.25084f, 1.68473f, -3.24075f, -4.03867f, -2.07877f, + -0.02347f, 0.00333f, -0.01259f, -0.00465f, 0.02526f, 0.36286f, -0.10324f, + 2.12780f, -0.74584f, -1.05052f, 1.78467f, -0.55065f, -0.03326f, 2.46781f, + 1.18349f, 0.96015f, 1.01696f, 1.10584f, 1.07263f, 1.11531f, -1.06413f, + 0.32389f, -1.87360f, -0.14435f, 1.77926f, 1.09966f, -0.12680f, -0.61386f, + -0.09724f, -0.33095f, 1.12122f, 1.00791f, 1.52416f, 1.35004f, 1.32657f, + 0.60950f, -1.13538f, -0.38654f, 0.06473f, 2.10669f, 0.27734f, -0.38359f, + -1.91455f, -1.22676f, 0.05786f, 0.97432f, 2.19967f, 0.50457f, 0.78976f, + 0.95183f, -0.32414f, 0.49437f, -0.04506f, 0.18993f, -0.07971f, 0.23889f, + -0.09872f, -0.66036f, 0.05377f, 2.69638f, -0.08259f, -0.69210f, -1.08296f, + -1.96504f, -2.31947f, -0.80161f, -0.80456f, -1.35556f, -0.05323f, -4.42658f, + -0.30732f, -0.12043f, 0.11126f, 0.10771f, -0.14956f, -0.02218f, 0.41016f, + 1.16599f, 1.14629f, 1.12881f, 1.18676f, 1.24677f, 1.28695f, 1.11270f, + 0.08233f, 1.75440f, 0.49228f, -0.34858f, -0.17032f, 0.29288f, 0.47175f, + 0.19055f, -1.56413f, +}; + +static const float av1_tx_type_nn_bias_16x4_hor_layer0[16] = { + -1.71227f, 0.47291f, -0.97536f, -0.66216f, 0.11729f, -0.21451f, + 2.75281f, 0.04318f, 2.03965f, 0.14618f, -0.70483f, -0.24517f, + 1.14048f, 0.33308f, -1.10886f, 0.41184f, +}; + +static const float av1_tx_type_nn_weights_16x4_hor_layer1[64] = { + -1.17079f, 0.19096f, -1.05753f, -0.30803f, -1.21680f, -0.67255f, 1.60115f, + 0.05972f, 1.44759f, -0.04068f, -0.26331f, 0.31400f, 0.96923f, 0.33443f, + -0.77215f, -0.91316f, -1.78928f, 0.21483f, -1.24008f, -0.46190f, -0.12127f, + -0.62144f, 1.37593f, 0.08373f, 1.56215f, 0.00279f, -0.14556f, 0.38710f, + 0.96228f, 0.66433f, -0.51798f, -0.80738f, -0.18539f, 0.19377f, -1.03090f, + -1.51044f, -0.59485f, -0.62589f, 1.90742f, 0.09078f, 1.49113f, 0.00205f, + -0.15918f, 0.40827f, 1.08553f, 0.43431f, 0.33519f, -1.12669f, -1.10274f, + 0.80004f, -1.83599f, -0.53134f, 2.00515f, -0.32670f, 1.37124f, 0.51136f, + 1.62563f, 0.24787f, 0.31757f, 0.81751f, 1.57262f, 0.83214f, 1.04661f, + -0.43819f, +}; + +static const float av1_tx_type_nn_bias_16x4_hor_layer1[4] = { + 2.32575f, + 2.75703f, + 1.12304f, + 2.15567f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_16x4_hor = 
{ + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_16x4_hor_layer0, + av1_tx_type_nn_weights_16x4_hor_layer1 }, + { av1_tx_type_nn_bias_16x4_hor_layer0, av1_tx_type_nn_bias_16x4_hor_layer1 } +}; + +static const float av1_tx_type_nn_weights_16x4_ver_layer0[32] = { + 0.26047f, 0.99930f, 1.16484f, -0.28196f, -2.67483f, -0.21456f, -0.16854f, + 0.46375f, 1.47951f, 1.13735f, 1.12356f, 0.27385f, 0.50978f, 2.09967f, + -1.47386f, 0.01950f, -0.06362f, 0.26014f, 1.04544f, -0.03099f, 0.07478f, + -0.39701f, 0.05545f, 2.73633f, -0.56305f, -0.02208f, -0.44517f, -0.00897f, + -0.17967f, -0.96622f, 0.42635f, -1.04784f, +}; + +static const float av1_tx_type_nn_bias_16x4_ver_layer0[8] = { + -0.52088f, 0.52844f, -1.03655f, -0.30974f, + 2.59952f, -1.93604f, 0.00000f, 2.51787f, +}; + +static const float av1_tx_type_nn_weights_16x4_ver_layer1[32] = { + 0.10916f, -0.21219f, -0.51340f, 0.69161f, 1.45988f, -1.36942f, -0.40899f, + 1.05136f, -0.08486f, 0.10008f, -0.55304f, 0.88012f, 1.61177f, -1.64507f, + 0.63428f, 1.15130f, -0.17287f, -0.18592f, -0.01143f, 0.88293f, 1.73326f, + -1.63624f, 0.09359f, 1.18393f, 0.26531f, 0.22378f, 0.15170f, 1.06965f, + 1.26814f, -1.93873f, -0.00768f, 1.58309f, +}; + +static const float av1_tx_type_nn_bias_16x4_ver_layer1[4] = { + 2.34713f, + 1.68667f, + 1.25488f, + 1.69812f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_16x4_ver = { + 4, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_16x4_ver_layer0, + av1_tx_type_nn_weights_16x4_ver_layer1 }, + { av1_tx_type_nn_bias_16x4_ver_layer0, av1_tx_type_nn_bias_16x4_ver_layer1 } +}; +/******************************************************************************/ + +// Map tx_size to its corresponding neural net model for tx type prediction. +static const NN_CONFIG *const av1_tx_type_nnconfig_map_hor[] = { + &av1_tx_type_nnconfig_4x4_hor, // 4x4 transform + &av1_tx_type_nnconfig_8x8_hor, // 8x8 transform + &av1_tx_type_nnconfig_16x16, // 16x16 transform + NULL, // 32x32 transform + NULL, // 64x64 transform + &av1_tx_type_nnconfig_4x8_hor, // 4x8 transform + &av1_tx_type_nnconfig_8x4_hor, // 8x4 transform + &av1_tx_type_nnconfig_8x16_hor, // 8x16 transform + &av1_tx_type_nnconfig_16x8_hor, // 16x8 transform + NULL, // 16x32 transform + NULL, // 32x16 transform + NULL, // 32x64 transform + NULL, // 64x32 transform + &av1_tx_type_nnconfig_4x16_hor, // 4x16 transform + &av1_tx_type_nnconfig_16x4_hor, // 16x4 transform + NULL, // 8x32 transform + NULL, // 32x8 transform + NULL, // 16x64 transform + NULL, // 64x16 transform +}; + +static const NN_CONFIG *const av1_tx_type_nnconfig_map_ver[] = { + &av1_tx_type_nnconfig_4x4_ver, // 4x4 transform + &av1_tx_type_nnconfig_8x8_ver, // 8x8 transform + &av1_tx_type_nnconfig_16x16, // 16x16 transform + NULL, // 32x32 transform + NULL, // 64x64 transform + &av1_tx_type_nnconfig_4x8_ver, // 4x8 transform + &av1_tx_type_nnconfig_8x4_ver, // 8x4 transform + &av1_tx_type_nnconfig_8x16_ver, // 8x16 transform + &av1_tx_type_nnconfig_16x8_ver, // 16x8 transform + NULL, // 16x32 transform + NULL, // 32x16 transform + NULL, // 32x64 transform + NULL, // 64x32 transform + &av1_tx_type_nnconfig_4x16_ver, // 4x16 transform + &av1_tx_type_nnconfig_16x4_ver, // 16x4 transform + NULL, // 8x32 transform + NULL, // 32x8 transform + NULL, // 16x64 transform + NULL, // 64x16 transform +}; +#endif // CONFIG_NN_V2 + +// Tx split model for 4x8 block. 
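+/******************************************************************************/
+
+// A minimal sketch of how the tx type NN_CONFIG tables above are evaluated.
+// The encoder feeds them to av1_nn_predict() (declared, like NN_CONFIG, in
+// av1/encoder/ml.h); the helper below re-derives the same fully connected
+// forward pass for the single-hidden-layer shape used throughout this file,
+// assuming a ReLU hidden activation and raw (pre-softmax) outputs. The
+// function name and the fixed hidden buffer size are assumptions made for
+// the sketch, and the output scores are typically normalized by the caller,
+// e.g. with av1_nn_softmax().
+#if 0  // Illustrative sketch only; not compiled.
+static void tx_type_nn_forward_sketch(const float *features,
+                                      const NN_CONFIG *config,
+                                      float *scores) {
+  float hidden[64];  // >= every num_hidden_nodes value in this file.
+  const int nh = config->num_hidden_nodes[0];  // Single hidden layer here.
+  // Layer 0, stored row-major per hidden node:
+  // hidden[j] = ReLU(bias0[j] + sum_i weights0[j * num_inputs + i] * x[i]).
+  for (int j = 0; j < nh; ++j) {
+    float val = config->bias[0][j];
+    for (int i = 0; i < config->num_inputs; ++i)
+      val += config->weights[0][j * config->num_inputs + i] * features[i];
+    hidden[j] = val > 0.0f ? val : 0.0f;
+  }
+  // Layer 1: raw per-class scores, one per candidate tx type group.
+  for (int k = 0; k < config->num_outputs; ++k) {
+    float val = config->bias[1][k];
+    for (int j = 0; j < nh; ++j)
+      val += config->weights[1][k * nh + j] * hidden[j];
+    scores[k] = val;
+  }
+}
+
+// Example: pick the horizontal 16x8 model from the map above and score it.
+//   const NN_CONFIG *cfg = av1_tx_type_nnconfig_map_hor[TX_16X8];
+//   if (cfg != NULL) tx_type_nn_forward_sketch(features, cfg, scores);
+#endif
+/******************************************************************************/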
+static const float av1_tx_split_nn_weights_4x8_layer0[8 * 16] = { + 0.068650f, -0.732073f, -0.040361f, 0.322550f, -0.021123f, 0.212518f, + -0.350546f, 0.435987f, -0.111756f, -0.401568f, 0.069548f, -0.313000f, + 0.073918f, -0.373805f, -0.775810f, -0.124753f, 0.181094f, -0.602641f, + -0.026219f, -0.350112f, 0.020599f, -0.311752f, -0.476482f, -0.669465f, + -0.310921f, 0.348869f, -0.115984f, 0.154250f, 0.200485f, -0.016689f, + 0.020392f, 0.413810f, 0.634064f, -0.627530f, 0.399178f, -0.012284f, + 0.472030f, 0.091087f, -0.706100f, -0.447944f, -0.274226f, 0.445656f, + 0.309339f, 0.505522f, 0.038496f, -0.152809f, 0.408684f, -0.068151f, + 0.271612f, 0.353233f, -0.150365f, 0.075212f, -0.035096f, 0.346615f, + 0.124382f, 0.477072f, 0.216288f, 0.070548f, -0.106362f, 0.681613f, + -0.145502f, -0.218631f, -0.099248f, -0.001983f, -0.196819f, -0.969045f, + 0.063009f, -0.123053f, 0.104875f, -0.137581f, -0.282933f, -0.003624f, + -0.315659f, -0.333523f, -0.503000f, -0.100063f, -0.536711f, -0.059978f, + -0.670248f, -0.353762f, 0.181109f, 0.289715f, -0.071206f, 0.261141f, + 0.052796f, -0.114554f, -0.139214f, -0.261380f, 0.075984f, -0.647925f, + -0.099528f, -0.677814f, 0.015712f, -0.389385f, -0.095622f, -0.165117f, + -0.109454f, -0.175240f, -0.393914f, 0.212330f, 0.037822f, 0.248280f, + 0.180197f, 0.110493f, -0.525727f, -0.092329f, -0.524029f, -0.407364f, + -0.542373f, -0.435626f, -0.912194f, 0.062794f, 0.160433f, 0.741485f, + -0.103659f, -0.119327f, -0.055275f, 0.334358f, 0.014713f, 0.046327f, + 0.831114f, -0.576682f, 0.354369f, -0.082088f, 0.452331f, 0.039730f, + -0.792429f, -0.385862f, +}; + +static const float av1_tx_split_nn_bias_4x8_layer0[16] = { + 0.238621f, 2.186830f, 1.383035f, -0.867139f, 1.257119f, -0.351571f, + -0.240650f, -0.971692f, 2.744843f, 1.116991f, 0.139062f, -0.165332f, + 0.262171f, -1.598153f, -1.427340f, -1.602306f, +}; + +static const float av1_tx_split_nn_weights_4x8_layer1[16] = { + -0.367134f, 1.373058f, -0.897039f, -0.326819f, -0.734030f, -0.290413f, + -0.501249f, 0.505321f, -0.537692f, -0.767893f, 0.268697f, 0.278987f, + 0.085082f, 0.614986f, 0.847904f, 0.637578f, +}; + +static const float av1_tx_split_nn_bias_4x8_layer1[1] = { + 0.20586078f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_4x8 = { + 8, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_4x8_layer0, + av1_tx_split_nn_weights_4x8_layer1, + }, + { + av1_tx_split_nn_bias_4x8_layer0, + av1_tx_split_nn_bias_4x8_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 8x8 block. 
+static const float av1_tx_split_nn_weights_8x8_layer0[144] = { + 0.177983f, -0.938386f, -0.074460f, -0.221843f, -0.073182f, -0.295155f, + -0.098202f, -0.279510f, 0.001054f, -0.119319f, -1.835282f, -0.581507f, + -1.222222f, -1.049006f, -0.807508f, -0.454252f, -0.774879f, -0.180607f, + -0.886976f, -0.231971f, -0.824677f, -0.351872f, -1.323819f, 0.235378f, + 0.015331f, -0.341818f, 0.145549f, -0.348362f, 0.147647f, -0.323400f, + 0.047558f, -0.553025f, -0.295485f, -0.330368f, -0.530605f, -0.407516f, + 0.447740f, 0.782381f, -0.179164f, -0.584675f, -0.052645f, 0.038656f, + -0.096783f, 0.038342f, -0.170762f, -0.405844f, -0.552665f, -0.509866f, + 0.757204f, -1.296465f, 0.631015f, 0.009265f, 0.646192f, 0.044523f, + 0.653161f, 0.033820f, 0.849639f, -0.068555f, -1.036085f, -0.511652f, + 0.104693f, -1.458690f, 0.286051f, -0.089800f, 0.381564f, -0.302640f, + 0.304465f, -0.268706f, 0.432603f, -0.117914f, -2.070031f, -0.565696f, + -0.073027f, -1.783570f, -0.318144f, -0.320990f, -0.343966f, -0.140996f, + -0.322977f, -0.232147f, -0.373210f, -0.158266f, -1.922305f, -0.634373f, + 0.101894f, -0.221847f, 0.018412f, -0.423887f, -0.266684f, -0.444930f, + -0.196237f, 0.106638f, -0.065834f, -0.538401f, -0.280772f, -0.620348f, + 1.089957f, -0.799928f, 0.504112f, -0.165763f, 0.578741f, -0.172653f, + 0.547316f, -0.143484f, 0.717220f, -0.297190f, -1.237854f, -0.074819f, + -0.977304f, -0.484092f, -0.646427f, -0.451443f, -0.612126f, -0.224475f, + -0.731608f, -0.257077f, -0.665857f, -0.346742f, -1.216372f, 0.227267f, + 0.231249f, -1.693073f, -0.035899f, 0.380845f, -0.058476f, 0.409405f, + -0.066679f, 0.406731f, -0.068501f, 0.396748f, 0.639462f, 0.150834f, + -0.418659f, -1.421931f, 0.101889f, 0.083573f, 0.129746f, 0.134460f, + 0.081185f, 0.127420f, 0.083664f, 0.051096f, 1.361688f, 0.386093f, +}; + +static const float av1_tx_split_nn_bias_8x8_layer0[12] = { + 4.280443f, 2.218902f, -0.256953f, 3.161431f, 2.082548f, 2.506052f, + 2.563224f, 1.421976f, -1.627813f, -1.436085f, 2.297265f, 1.500469f, +}; + +static const float av1_tx_split_nn_weights_8x8_layer1[12] = { + 1.178833f, -0.428527f, -0.078737f, 0.381434f, -0.466895f, -0.901745f, + -0.766968f, -0.356663f, 0.450146f, 0.509370f, -0.356604f, -0.443506f, +}; + +static const float av1_tx_split_nn_bias_8x8_layer1[1] = { + -0.156294f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_8x8 = { + 12, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 12, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_8x8_layer0, + av1_tx_split_nn_weights_8x8_layer1, + }, + { + av1_tx_split_nn_bias_8x8_layer0, + av1_tx_split_nn_bias_8x8_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 8x16 block. 
+static const float av1_tx_split_nn_weights_8x16_layer0[8 * 64] = { + 0.374660f, 0.218905f, -0.139779f, 0.212141f, 0.056517f, 0.051114f, + 0.042860f, -0.273258f, -0.340809f, 0.138983f, -0.216996f, -0.241519f, + -0.123244f, 0.078577f, -0.472273f, -0.194201f, 0.125056f, 0.239761f, + -0.332782f, 0.174782f, -0.211400f, -0.129795f, 0.062195f, 0.113176f, + -0.008869f, 0.140764f, 0.059833f, 0.163826f, 0.359293f, -0.109797f, + -0.022091f, -0.059536f, -0.188226f, 0.179709f, 0.031386f, 0.164790f, + 0.214364f, 0.198555f, 0.152262f, -0.242980f, 0.319367f, -0.136902f, + 0.046524f, -0.043591f, 0.342178f, -0.011757f, -0.014286f, 0.072871f, + -0.278314f, -0.345303f, -0.252103f, -0.107154f, -0.235101f, -0.106739f, + -0.120865f, -0.160042f, 0.240028f, 0.112902f, -0.141587f, -0.703012f, + -0.136591f, 0.318993f, -0.154417f, -0.054668f, 0.192870f, 0.176166f, + -0.029965f, 0.266942f, -0.178384f, 0.038680f, 0.134403f, -0.002426f, + 0.534825f, -0.070923f, 0.413281f, 0.418148f, 0.093729f, 0.016454f, + 0.305358f, -0.040512f, 0.069904f, -0.227588f, -0.362220f, -0.031604f, + -0.394901f, 0.071506f, -0.342833f, -0.142550f, -0.164005f, 0.182600f, + 0.213062f, 0.076805f, 0.278758f, 0.125613f, -0.035552f, 0.040971f, + 0.182785f, -0.227961f, -0.105413f, -0.074949f, -0.084629f, -0.254767f, + 0.114657f, 0.047121f, 0.195902f, 0.264759f, 0.017799f, 0.210230f, + 0.150749f, -0.142142f, 0.182494f, -0.142415f, -0.259782f, -0.114830f, + -0.198826f, 0.000061f, -0.375668f, -0.276656f, -0.373202f, 0.210298f, + 0.422680f, 0.066960f, 0.351106f, -0.209034f, 0.367195f, -0.110274f, + 0.115573f, -0.066642f, -0.389673f, -0.260447f, 0.056949f, -0.180425f, + 0.069922f, -0.153506f, -0.097053f, -0.111757f, 0.094069f, 0.144837f, + -0.052984f, -0.506681f, -0.034474f, 0.279057f, -0.105025f, 0.006656f, + -0.125017f, -0.114096f, 0.103153f, -0.117402f, -0.359472f, 0.072534f, + 0.110291f, 0.003088f, -0.456897f, 0.038331f, -0.322298f, 0.113942f, + -0.119916f, -0.194392f, 0.093167f, 0.193459f, 0.074671f, 0.033602f, + 0.004440f, -0.179578f, -0.036637f, -0.216172f, -0.296530f, -0.318992f, + 0.319160f, -0.066218f, 0.291246f, 0.181292f, 0.089914f, 0.025273f, + 0.303128f, 0.019063f, 0.078545f, -0.396919f, 0.014065f, -0.122121f, + 0.037107f, -0.151886f, -0.299392f, -0.172207f, -0.124571f, -0.232553f, + 0.102970f, -0.225040f, 0.061059f, -0.258188f, -0.469871f, -0.099607f, + -0.061524f, -0.213700f, 0.070237f, -0.289134f, -0.238225f, 0.256403f, + -0.119344f, 0.067782f, -0.398983f, -0.123975f, -0.200205f, -0.047038f, + 0.026569f, 0.031037f, 0.094302f, -0.101239f, 0.433307f, -0.303612f, + 0.088537f, -0.164436f, 0.202471f, -0.048592f, -0.251904f, 0.122577f, + -0.309874f, -0.263405f, -0.292503f, 0.216589f, 0.035378f, 0.136599f, + -0.145844f, -0.018211f, 0.174084f, -0.449941f, -0.001428f, 0.064134f, + 0.039652f, 0.111083f, -0.246076f, -0.204733f, 0.056559f, -0.000123f, + 0.104049f, 0.138512f, -0.128309f, 0.087855f, 0.232784f, 0.247138f, + 0.162766f, 0.154829f, 0.313605f, -0.164115f, -0.050844f, 0.156549f, + 0.185279f, -0.238962f, -0.308281f, -0.179592f, -0.193262f, 0.201670f, + -0.203399f, -0.096831f, -0.127867f, 0.310674f, -0.008181f, 0.004078f, + -0.211038f, -0.193480f, -0.185639f, -0.150202f, -0.204858f, -0.240758f, + 0.114268f, -0.032535f, -0.052403f, -0.234333f, -0.064072f, -0.208444f, + -0.352853f, -0.224001f, -0.156330f, 0.215436f, 0.171846f, 0.291849f, + 0.108832f, 0.046991f, -0.127801f, 0.032485f, 0.141493f, 0.123319f, + -0.057250f, 0.315346f, -0.061317f, -0.465086f, -0.130179f, -0.217841f, + -0.239089f, -0.073251f, -0.327718f, 0.054905f, -0.283169f, 
-0.028900f, + 0.071450f, 0.270072f, 0.248891f, 0.088052f, 0.253319f, 0.122808f, + 0.175490f, -0.147805f, 0.089169f, -0.045457f, -0.330788f, 0.099791f, + -0.137376f, -0.195977f, -0.350942f, -0.284930f, -0.559037f, 0.030504f, + 0.162554f, -0.199100f, -0.050453f, -0.131320f, -0.077863f, -0.066253f, + -0.379723f, -0.424047f, -0.081182f, -0.252261f, -0.102815f, 0.058240f, + -0.182036f, 0.176772f, -0.070823f, 0.216054f, -0.211533f, -0.232992f, + 0.279346f, 0.117984f, 0.236674f, 0.126625f, -0.046220f, 0.044919f, + 0.278492f, 0.083944f, 0.180512f, 0.217994f, 0.401170f, -0.064417f, + 0.011636f, -0.139597f, -0.050020f, -0.268438f, -0.032803f, 0.024908f, + -0.085713f, -0.012984f, -0.055192f, -0.338657f, 0.045826f, -0.312849f, + -0.023393f, -0.168800f, -0.030886f, -0.131816f, -0.253542f, -0.104812f, + -0.354389f, 0.169464f, 0.094151f, -0.217122f, -0.456397f, 0.211478f, + 0.219232f, -0.155519f, -0.353700f, -0.264759f, -0.034709f, 0.034409f, + -0.148639f, -0.132850f, -0.216791f, -0.118492f, 0.173721f, -0.144181f, + 0.335028f, 0.176439f, 0.105980f, 0.169390f, 0.155615f, -0.040618f, + -0.176029f, 0.155569f, -0.184833f, -0.171099f, -0.178663f, -0.032051f, + -0.434334f, 0.092238f, -0.263103f, 0.061804f, -0.172957f, 0.005962f, + -0.100176f, 0.125898f, 0.048092f, -0.088141f, 0.247196f, -0.221601f, + -0.114474f, -0.124410f, -0.156393f, -0.181782f, -0.083562f, 0.034937f, + 0.403401f, -0.046200f, 0.322259f, 0.219678f, 0.109850f, 0.051837f, + 0.196861f, -0.019118f, 0.248818f, -0.137567f, 0.127862f, 0.052293f, + 0.298726f, 0.275788f, 0.015344f, 0.058714f, 0.283691f, -0.053794f, + -0.123270f, -0.227761f, -0.141744f, -0.268515f, -0.007189f, -0.242117f, + -0.252396f, -0.069017f, 0.034803f, -0.003388f, -0.262577f, 0.062115f, + -0.298393f, 0.215415f, -0.153615f, 0.289902f, 0.085886f, -0.504290f, + 0.077178f, 0.150861f, -0.228848f, -0.261020f, 0.198204f, 0.162113f, + 0.346418f, -0.286950f, 0.354756f, -0.226419f, 0.024720f, 0.208037f, + 0.107286f, -0.110849f, 0.104415f, -0.207725f, 0.063932f, -0.037748f, + -0.167037f, -0.068282f, 0.320815f, -0.051884f, 0.099989f, -0.078388f, + 0.127071f, 0.046675f, -0.336571f, -0.273080f, 0.264694f, -0.007352f, + -0.093828f, 0.094773f, -0.144434f, 0.091795f, -0.031615f, 0.056914f, + 0.064673f, -0.136669f, 0.344734f, 0.225926f, 0.283451f, -0.068354f, + 0.030572f, 0.180784f, -0.378047f, -0.092962f, -0.083291f, 0.038970f, + 0.052094f, -0.017932f, 0.216302f, -0.184396f, 0.079888f, 0.210406f, + -0.020627f, 0.244744f, 0.336972f, -0.182914f, -0.220976f, -0.304225f, + -0.330974f, -0.370868f, -0.084935f, -0.136489f, -0.210082f, -0.188088f, + -0.408768f, 0.184693f, +}; + +static const float av1_tx_split_nn_bias_8x16_layer0[64] = { + -0.274107f, 0.445751f, 0.234359f, 0.291593f, 0.163298f, 0.183707f, + -0.548839f, -0.190779f, -0.163346f, -0.669028f, 0.399209f, -0.354974f, + 0.000000f, -0.254630f, 0.220149f, 0.371104f, 0.789759f, 0.270300f, + 0.195126f, -0.206958f, 0.917708f, -0.256232f, 1.131933f, 1.178944f, + 0.461270f, 0.246169f, -0.818614f, -0.111986f, 0.759355f, 0.154889f, + 0.470299f, -1.025250f, 0.678678f, 0.959346f, -0.164105f, 0.544079f, + -0.448733f, 0.649221f, -0.536672f, 0.962758f, -0.256427f, 0.808664f, + -0.118694f, 0.684873f, -0.015635f, -0.046469f, 0.075481f, 0.412647f, + 0.454456f, -0.107169f, 0.775235f, -0.261629f, -1.194849f, 0.010093f, + -0.231289f, 0.658286f, -0.769320f, 0.564545f, 0.482962f, -0.131378f, + -0.255844f, -0.078400f, 0.476752f, 0.643001f, +}; + +static const float av1_tx_split_nn_weights_8x16_layer1[64] = { + -0.145065f, -0.145101f, 0.174786f, 0.196692f, 0.102025f, 
-0.087735f, + 0.386353f, -0.660539f, -0.183940f, 0.490045f, -0.276404f, -0.145669f, + 0.209846f, -0.085574f, -0.156821f, -0.377450f, -0.950010f, 0.450709f, + -0.108545f, -0.261181f, 1.435606f, -0.176621f, -1.158548f, 2.035680f, + 0.218069f, -0.138629f, 0.305958f, -0.277194f, -0.602468f, 0.203873f, + 0.120720f, 0.216095f, -0.434502f, -0.579746f, -0.239450f, 0.755529f, + 0.545643f, 0.232091f, 0.330169f, 0.988136f, -0.070465f, -0.345584f, + -0.162455f, -0.617064f, 0.123881f, -0.201098f, 0.222756f, 0.112932f, + 0.048647f, -0.147890f, 0.394584f, -0.262148f, 0.280564f, -0.195432f, + -0.047515f, 1.133410f, 0.255415f, -0.299032f, -0.397807f, -0.153246f, + -0.256734f, 0.177370f, 0.213522f, -0.530158f, +}; + +static const float av1_tx_split_nn_bias_8x16_layer1[1] = { + 0.14910713f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_8x16 = { + 8, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 64, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_8x16_layer0, + av1_tx_split_nn_weights_8x16_layer1, + }, + { + av1_tx_split_nn_bias_8x16_layer0, + av1_tx_split_nn_bias_8x16_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 16x16 block. +static const float av1_tx_split_nn_weights_16x16_layer0[12 * 24] = { + -0.177215f, -0.297166f, 0.299924f, 0.207878f, 0.216871f, 0.173264f, + 0.295464f, 0.048395f, 0.154731f, 0.305880f, 0.056787f, -0.166617f, + 0.115653f, -0.529477f, -0.073995f, -0.211746f, -0.018169f, 0.000788f, + -0.024940f, -0.007055f, 0.001392f, 0.021678f, -1.594600f, -0.099593f, + 0.332930f, 0.103574f, 0.158249f, 0.182601f, 0.332665f, 0.226207f, + -0.139566f, 0.185531f, 0.099074f, -0.185654f, -0.203121f, -0.285678f, + -0.313453f, -0.294452f, -0.143707f, -0.031265f, -0.453030f, -0.061874f, + -0.066150f, -0.099058f, -0.458879f, 0.127544f, 0.338314f, -0.161350f, + 0.030091f, -0.075528f, 0.004320f, 0.353690f, -0.013480f, -0.420402f, + -0.004659f, -0.329401f, -0.001745f, 0.227384f, -0.055183f, 0.121405f, + 0.160340f, 0.143603f, -0.221813f, 0.079107f, -0.657639f, -0.084348f, + -0.303414f, 0.046774f, -0.367679f, 0.060005f, 0.168645f, 0.084421f, + -0.133625f, 0.301375f, 0.079412f, -0.419303f, 0.017235f, 0.068637f, + 0.018384f, -0.428325f, -0.019753f, 0.149444f, -0.474836f, -0.287162f, + 0.198083f, 0.028292f, -0.299092f, -0.005849f, -0.256245f, 0.233277f, + -0.217561f, -0.264003f, 0.269411f, 0.207032f, -0.339411f, -0.198431f, + -0.028521f, 0.158076f, 0.177116f, 0.345702f, -0.145132f, 0.064623f, + -0.090867f, 0.288816f, -0.263198f, -0.071028f, -0.044546f, 0.380017f, + -0.014100f, -0.271192f, -0.318559f, 0.129015f, -0.050314f, -0.093355f, + -0.578498f, 0.099090f, -0.133080f, -0.029975f, -0.059828f, -0.157765f, + -0.321153f, -0.343671f, -0.242959f, 0.128304f, 0.017170f, 0.072787f, + -0.475838f, -0.003806f, -0.068615f, 0.150556f, -0.159903f, -0.416513f, + 0.218794f, -0.290456f, -0.084569f, -0.170014f, -0.044414f, -0.153069f, + -0.077329f, -0.089747f, -0.096526f, 0.537952f, 0.134725f, -0.006469f, + -0.323335f, -0.168183f, -0.107163f, -0.139954f, 0.011286f, -0.021712f, + -0.513992f, 0.259135f, -0.319808f, 0.077811f, 0.104613f, 0.370571f, + 0.185244f, 0.065530f, -0.091098f, -0.573741f, 0.111934f, 0.437417f, + -0.123691f, 0.220641f, -0.024783f, -0.149460f, -0.354185f, -0.134127f, + 0.038015f, -0.380596f, 0.250980f, 0.142208f, 0.135170f, -0.131129f, + -0.357556f, -0.530945f, 0.159672f, -0.147025f, -0.377829f, -0.504508f, + -0.492870f, 0.020753f, 0.142818f, 0.025172f, 0.086140f, 0.091283f, + 0.087491f, -0.186415f, 0.177785f, 
-0.195121f, -1.191148f, -0.477102f, + 0.023371f, 0.227004f, -0.023502f, -0.242913f, -0.074398f, -0.153480f, + 0.162900f, 0.415509f, -0.162565f, -0.131709f, -0.258852f, -0.252027f, + -0.080845f, -0.330274f, 0.021874f, 0.232398f, 0.069277f, 0.220567f, + -0.024237f, -0.366771f, 0.081673f, -0.429906f, -0.302170f, 0.061045f, + 0.352777f, -0.230376f, 0.408153f, 0.064758f, 0.142051f, 0.007219f, + 0.622878f, 0.212577f, 0.036489f, 0.081150f, -0.284767f, 0.107763f, + -0.529786f, -0.072190f, -0.300421f, -0.287959f, -0.568900f, 0.011547f, + -0.131696f, -0.356854f, -0.587962f, -0.026598f, 0.405829f, 0.057565f, + 0.414265f, -0.159155f, 0.221456f, 0.146314f, 0.265776f, -0.006516f, + 0.473978f, -0.186431f, 0.288672f, -0.060437f, 0.083380f, -0.205641f, + 0.360016f, 0.222041f, 0.420011f, 0.024579f, 0.377546f, 0.250380f, + -0.069900f, 0.296743f, 0.073532f, -0.243225f, -0.374987f, -0.387288f, + -0.237255f, -0.287013f, 0.417831f, -0.252988f, -0.257652f, -0.066775f, + -0.253926f, 0.057841f, 0.346133f, -0.157797f, -0.406028f, -0.286893f, + 0.274507f, -0.452561f, 0.143381f, -0.097755f, 0.021242f, 0.034561f, + 0.044115f, 0.004065f, 0.066729f, 0.043558f, 0.102991f, -0.477574f, +}; + +static const float av1_tx_split_nn_bias_16x16_layer0[24] = { + -0.479033f, 1.467402f, -0.366291f, 0.372511f, 0.715322f, -0.605500f, + 0.176848f, 0.032318f, 0.237429f, -0.046047f, 0.452082f, 0.451805f, + -0.822845f, 0.636762f, -0.057350f, 1.163978f, 0.728287f, 0.603654f, + -0.245519f, -0.893569f, -1.428185f, 0.808870f, -0.076159f, 1.231976f, +}; + +static const float av1_tx_split_nn_weights_16x16_layer1[24] = { + -0.176161f, 1.670188f, -0.180755f, -0.321326f, 0.249728f, -0.170504f, + -0.538432f, 0.033893f, 0.149842f, 0.404140f, -0.377812f, 0.338838f, + -0.176091f, 0.249844f, -0.362533f, 1.412460f, 0.196862f, 0.278194f, + -0.140444f, 0.297746f, 0.172533f, 0.116470f, -0.151656f, -0.603250f, +}; + +static const float av1_tx_split_nn_bias_16x16_layer1[1] = { + 0.184803f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_16x16 = { + 12, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 24, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_16x16_layer0, + av1_tx_split_nn_weights_16x16_layer1, + }, + { + av1_tx_split_nn_bias_16x16_layer0, + av1_tx_split_nn_bias_16x16_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 32x32 block. 
+static const float av1_tx_split_nn_weights_32x32_layer0[12 * 32] = { + -0.439303f, 0.004813f, -0.365052f, -0.116868f, -0.356716f, -0.196537f, + -0.196770f, -0.076096f, 0.357004f, -0.044909f, -0.112910f, -0.129081f, + 0.156725f, -0.386346f, 0.038971f, 0.160696f, 0.204923f, -0.384333f, + -0.319546f, 0.028179f, -0.250524f, -0.289669f, -0.284138f, -0.258963f, + -0.180854f, -0.000807f, -0.029620f, -0.353134f, 0.212408f, 0.141414f, + 0.303016f, 0.098066f, 0.482455f, 0.036069f, -0.166279f, 0.210119f, + -0.086337f, -0.023550f, -0.250796f, -0.183945f, -0.393856f, 0.170608f, + -0.306403f, 0.026318f, -0.277296f, 0.092684f, -0.033584f, -0.018371f, + -0.025043f, -0.257659f, -0.139163f, -0.206949f, -0.190105f, 0.028053f, + 0.361851f, -0.364726f, -0.096771f, -0.184166f, -0.433228f, -0.182191f, + -0.097051f, 0.259172f, 0.016432f, 0.259358f, 0.145059f, 0.037196f, + 0.091581f, -0.219644f, 0.140384f, -0.446837f, -0.234531f, 0.149508f, + -0.083429f, 0.186189f, -0.099890f, -0.111277f, 0.495214f, 0.085053f, + -0.266613f, -0.051366f, 0.148593f, 0.111875f, 0.077787f, -0.371653f, + -0.146157f, -0.229235f, 0.076203f, 0.488975f, 0.096771f, -0.009483f, + 0.192985f, 0.246273f, -0.192671f, -0.557890f, -0.292650f, -0.088907f, + -0.106892f, -0.329659f, 0.012105f, -0.359326f, 0.170723f, -0.004357f, + 0.171593f, -0.478768f, -0.236016f, -0.035077f, 0.133731f, 0.137962f, + -0.397926f, -0.155164f, -0.276709f, -0.186602f, -0.258301f, 0.036965f, + -0.649359f, 0.127605f, 0.097930f, 0.182775f, -0.313324f, 0.053349f, + 0.204203f, -0.222948f, -0.059008f, -0.049759f, -0.056848f, 0.087497f, + -0.039987f, -0.055042f, -0.041623f, -0.078424f, -0.317291f, -0.191398f, + 0.632147f, 0.221825f, 0.268394f, -0.096357f, 0.442545f, -0.007117f, + -0.036125f, 0.000525f, 0.088092f, -0.203653f, 0.086925f, 0.439141f, + 0.329889f, -0.370050f, -0.194306f, -0.207430f, 0.132779f, -0.217614f, + -0.039444f, -0.053019f, -0.260725f, -0.116563f, -0.271048f, 0.283737f, + -0.007300f, 0.062257f, -0.347865f, -0.296767f, -0.359123f, 0.230459f, + -0.189117f, -0.087622f, -0.561091f, 0.184182f, -0.044980f, 0.012643f, + 0.241672f, 0.050272f, -0.204851f, -0.159285f, -0.064081f, -0.118666f, + -0.269471f, 0.231668f, 0.135749f, -0.131162f, 0.062760f, 0.100949f, + 0.074967f, -0.056918f, 0.251707f, 0.034098f, 0.341290f, -0.105027f, + 0.313246f, -0.092679f, -0.014632f, -0.390967f, 0.136881f, -0.241554f, + 0.097674f, 0.110832f, -0.390245f, 0.017654f, -0.506222f, 0.065252f, + 0.244834f, -0.171352f, -0.331702f, 0.111043f, 0.125217f, -0.058116f, + -0.382595f, -0.052545f, 0.114261f, -0.493617f, 0.243984f, -0.171053f, + 0.165009f, -0.063020f, 0.096502f, 0.341339f, -0.013443f, 0.056372f, + 0.339284f, 0.398376f, 0.389409f, 0.257252f, 0.517368f, 0.078856f, + 0.087716f, -0.171092f, 0.227461f, 0.125307f, -0.054423f, -0.143161f, + 0.224041f, -0.086477f, -0.092548f, 0.072392f, -0.061608f, 0.258347f, + 0.147033f, -0.478244f, -0.204869f, 0.038552f, -0.144563f, 0.224087f, + -0.296705f, 0.153889f, -0.064624f, 0.085265f, -0.103826f, 0.127971f, + 0.019965f, 0.111937f, -0.074187f, -0.029518f, -0.127305f, -0.012210f, + 0.042714f, 0.070052f, -0.202360f, 0.348144f, -0.132097f, -0.209585f, + -0.248286f, -0.065774f, -0.089482f, -0.133226f, 0.325430f, -0.013468f, + -0.406090f, -0.144936f, 0.208620f, 0.343445f, -0.059639f, 0.114857f, + -0.069431f, -0.218725f, 0.190575f, -0.368101f, 0.030030f, 0.062815f, + -0.239369f, -0.537852f, 0.022487f, 0.023038f, 0.190788f, 0.040123f, + -0.004304f, 0.060749f, -0.108929f, 0.136796f, -0.542875f, -0.227074f, + -0.182244f, 0.082559f, 0.019149f, 0.178854f, 0.120284f, 
0.009070f, + 0.068268f, -0.544822f, 0.120536f, 0.354028f, -0.119890f, -0.122055f, + -0.405335f, 0.122341f, -0.304412f, 0.062405f, -0.302568f, -0.276505f, + -0.120915f, -0.221841f, 0.282007f, -0.253971f, 0.059517f, -0.144976f, + 0.149391f, -0.047355f, -0.167742f, -0.392333f, -0.041132f, 0.342135f, + 0.017485f, 0.021038f, -0.023728f, -0.192181f, -0.103996f, 0.092873f, + -0.114365f, -0.397732f, -0.065421f, 0.053084f, 0.035201f, 0.053019f, + -0.105377f, -0.039500f, 0.131904f, -0.123911f, -0.390328f, -0.125198f, + -0.000126f, 0.014864f, -0.220187f, 0.084056f, -0.492155f, -0.164979f, + 0.133592f, 0.121519f, -0.240813f, 0.186680f, 0.118673f, 0.235006f, + -0.239894f, -0.185759f, -0.336992f, 0.209620f, -0.298845f, 0.127803f, + -0.083992f, 0.194340f, -0.245378f, 0.212308f, 0.142512f, -0.163324f, + 0.383495f, 0.291065f, 0.286620f, -0.239957f, 0.225127f, -0.174424f, + 0.297231f, -0.045434f, 0.156444f, -0.184273f, -0.204567f, 0.202551f, + 0.370019f, -0.073910f, 0.344897f, 0.063100f, 0.338547f, -0.099145f, + 0.391863f, -0.214244f, -0.241734f, -0.281851f, -0.035133f, -0.153157f, +}; + +static const float av1_tx_split_nn_bias_32x32_layer0[32] = { + 0.143343f, -0.021982f, -0.314939f, 0.170867f, -0.081248f, 0.125758f, + -0.355762f, 0.279798f, 1.027712f, -0.434660f, 1.072005f, 0.668893f, + -0.031216f, -0.528650f, 0.328349f, 0.543645f, -0.188810f, 0.221110f, + -1.638637f, 0.058045f, -1.731105f, -0.444284f, 0.513693f, 0.890025f, + 0.160288f, 0.393312f, 0.332856f, -0.080767f, 0.299822f, 0.235876f, + 0.254942f, -0.017796f, +}; + +static const float av1_tx_split_nn_weights_32x32_layer1[32] = { + -0.090326f, -0.267553f, -0.026071f, 0.100912f, 0.279137f, 0.079064f, + -0.074885f, 0.053804f, 0.736810f, -0.031693f, -0.970514f, 0.174069f, + 0.095940f, -0.065047f, 0.052911f, 0.176728f, -0.058274f, 0.148364f, + -0.162210f, 0.093875f, -0.367663f, 0.020876f, 0.137280f, -1.099116f, + 0.146854f, 0.075590f, 0.228534f, 0.141993f, 0.072143f, 0.101421f, + -0.068547f, -0.154148f, +}; + +static const float av1_tx_split_nn_bias_32x32_layer1[1] = { + 0.316622f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_32x32 = { + 12, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 32, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_32x32_layer0, + av1_tx_split_nn_weights_32x32_layer1, + }, + { + av1_tx_split_nn_bias_32x32_layer0, + av1_tx_split_nn_bias_32x32_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 64x64 block. 
+static const float av1_tx_split_nn_weights_64x64_layer0[12 * 32] = { + -0.006828f, 0.149944f, -0.017614f, -0.044599f, -0.024517f, 0.507698f, + 0.001039f, 0.037164f, 0.015091f, -0.306620f, -0.162047f, -0.369440f, + 0.396310f, 0.087121f, 0.208609f, -0.083068f, 0.493774f, 0.217682f, + 0.377393f, 0.172879f, 0.397422f, 0.078919f, 0.741350f, 0.064169f, + -0.099989f, -0.192983f, -0.278230f, -0.310048f, -0.439965f, -0.226698f, + -0.436596f, -0.007551f, -0.396721f, 0.153570f, -0.190838f, -0.071869f, + 0.048799f, -0.301301f, -0.005015f, 0.500480f, -0.030622f, -0.559095f, + -0.032634f, -0.054160f, -0.056979f, -0.456545f, 0.306536f, -0.411323f, + -0.005366f, -0.069496f, 0.019990f, 0.327931f, -0.002516f, 0.393190f, + 0.001759f, 0.035093f, -0.030302f, -0.528984f, 0.174781f, 0.241462f, + -0.415427f, -0.164502f, 0.143065f, -0.122595f, 0.082049f, -0.143346f, + 0.055642f, -0.124701f, 0.004050f, -0.216235f, -2.681730f, 0.101658f, + 0.381239f, 0.465936f, 0.331154f, 0.301708f, -0.360171f, 0.054886f, + -0.118658f, 0.287921f, 0.277859f, 0.203784f, 0.247809f, 0.656924f, + -0.354628f, 0.315081f, 0.105108f, -0.510179f, 0.059267f, 0.061386f, + 0.076423f, 0.347119f, 0.100134f, 0.028402f, -0.118621f, -0.238689f, + 0.080141f, -0.138863f, 0.009009f, -0.100526f, -0.138875f, 0.066992f, + 0.005949f, 0.564336f, 0.046994f, 0.004655f, 0.366047f, 0.014695f, + -0.146928f, -0.024665f, -0.440357f, -0.109395f, 0.527231f, -0.020925f, + -0.227236f, -0.068141f, 0.282009f, 0.040192f, -0.267100f, 0.229228f, + 0.133861f, 0.338706f, -0.030178f, -0.040919f, -0.026343f, -0.330338f, + -0.066931f, -0.110580f, -0.072056f, 0.599457f, -0.020738f, 0.169200f, + 0.836240f, -0.157548f, 0.386273f, 0.002404f, 0.329410f, -0.007020f, + 0.351705f, -0.041259f, 0.388861f, 0.003899f, 0.582627f, 0.023572f, + 0.409912f, -0.158472f, 0.536383f, 0.525093f, 0.604247f, 0.439159f, + 0.692832f, 0.046272f, 0.590367f, -0.082166f, 0.262357f, 0.478671f, + 0.031935f, 0.042675f, 0.120002f, 0.398616f, -0.078967f, 0.227986f, + -0.044679f, 0.151061f, -0.085564f, 0.220205f, -0.265606f, -0.203623f, + 0.204719f, -0.125922f, 0.038544f, -0.269379f, 0.025866f, 0.109967f, + 0.019064f, -0.237297f, -0.309746f, -0.329118f, -0.278368f, -0.063859f, + 0.278496f, 0.018620f, 0.209971f, 0.296250f, 0.142850f, 0.288689f, + 0.137084f, 0.130517f, 0.128171f, -0.155396f, -0.008449f, -0.099845f, + 0.173455f, -0.059909f, -0.147318f, 0.102851f, -0.251389f, -0.001448f, + 0.103907f, 0.297273f, -0.027846f, 0.028260f, -0.382601f, 0.346695f, + -0.601641f, 0.162366f, -0.477495f, -0.042731f, -0.387871f, -0.051791f, + -0.401498f, -0.048446f, -0.456270f, -0.062287f, 0.493919f, 0.003008f, + 0.099917f, -0.358525f, -0.094903f, -0.022811f, -0.062259f, 0.019455f, + -0.050644f, 0.020041f, -0.132912f, -0.061578f, -3.083691f, -0.014961f, + -0.129115f, -0.710559f, 0.157213f, -0.844037f, -0.121991f, -0.943386f, + -0.231269f, -0.003462f, 0.331478f, -0.132703f, -1.285993f, -0.120957f, + -0.373755f, -0.322609f, 0.309059f, -0.131523f, -0.118334f, -0.063805f, + -0.104251f, 0.012166f, -0.094699f, -0.283753f, 0.128168f, -0.526929f, + -0.050331f, 0.186153f, 0.005913f, -0.221236f, 0.036363f, 0.160909f, + -0.001342f, -0.382749f, 0.037820f, 0.281689f, -0.024275f, 0.028854f, + 0.318291f, 0.318526f, 0.035778f, 0.034031f, 0.189663f, -0.293367f, + 0.082022f, 0.127923f, 0.078866f, -0.081361f, -0.268117f, 0.246675f, + 0.248605f, -0.215479f, -0.073084f, 0.496140f, -0.067327f, 0.396237f, + -0.120739f, 0.033752f, -0.044120f, -0.218941f, -0.028078f, 0.195132f, + -0.040400f, 0.281604f, -0.100471f, 0.415207f, -0.258503f, -0.429749f, + 
0.150569f, -0.010859f, 0.136448f, 0.026589f, 0.148466f, 0.110764f, + 0.380967f, 0.009177f, 0.103075f, 0.116417f, 0.226273f, -0.327746f, + 0.169346f, 0.284553f, -0.094986f, 0.312745f, -0.147840f, 0.025062f, + -0.494482f, 0.112388f, -0.213962f, 0.107050f, -0.433371f, -0.096276f, + -0.244835f, -0.003518f, -0.459148f, -0.145080f, 0.017150f, 0.042846f, + -0.237479f, 0.104746f, 0.158677f, 0.358937f, 0.099921f, 0.277109f, + 0.012410f, -0.062897f, 0.116130f, 0.255309f, 0.341628f, 0.145002f, + -0.429344f, -0.016433f, -0.068985f, 0.285194f, -0.286719f, -0.018298f, + -0.179369f, -0.194655f, -0.165380f, 0.026071f, -0.428268f, -0.379929f, + -0.727543f, 0.179610f, -0.963979f, -0.042026f, -0.616202f, 0.133401f, + -0.784966f, 0.061205f, -0.713357f, 0.129795f, 0.120512f, -0.339545f, + 0.353557f, 0.114906f, -0.329813f, -0.209987f, 0.085410f, 0.214313f, + -0.122082f, 0.335770f, -0.020937f, 0.202456f, 0.289023f, -0.421186f, + 0.337905f, 0.407663f, 0.132771f, 0.071734f, 0.213914f, 0.128595f, + 0.302659f, -0.209501f, 0.217756f, 0.253079f, -0.089505f, -0.205614f, +}; + +static const float av1_tx_split_nn_bias_64x64_layer0[32] = { + 0.296914f, -1.826816f, 0.346130f, 0.969520f, -0.528154f, 1.175862f, + -0.075985f, -0.097323f, -0.233059f, 0.004846f, 0.401279f, -2.272435f, + 0.086257f, 0.414162f, -0.194786f, -0.233887f, -0.113215f, -2.453546f, + 0.861214f, 0.298361f, 0.267397f, -0.158557f, -0.119911f, -0.098134f, + -0.339263f, 0.385871f, -0.678123f, 0.263218f, 0.251611f, -1.155773f, + -0.365437f, 0.229255f, +}; + +static const float av1_tx_split_nn_weights_64x64_layer1[32] = { + 0.502104f, -0.708023f, 0.419648f, 1.583418f, 0.419355f, -1.462981f, + -0.439623f, 0.405691f, 0.823257f, 0.061654f, 0.750875f, 0.775031f, + -0.387909f, 0.447385f, 0.284690f, 0.353262f, -0.224347f, 0.832864f, + -1.708491f, -1.042447f, -0.272829f, 0.540640f, 0.310509f, 0.723745f, + 0.245592f, -0.218417f, -0.597987f, -0.362301f, 0.702217f, -0.692614f, + 0.207812f, 0.513560f, +}; + +static const float av1_tx_split_nn_bias_64x64_layer1[1] = { -0.2307045f }; + +static const NN_CONFIG av1_tx_split_nnconfig_64x64 = { + 12, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 32, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_64x64_layer0, + av1_tx_split_nn_weights_64x64_layer1, + }, + { + av1_tx_split_nn_bias_64x64_layer0, + av1_tx_split_nn_bias_64x64_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 4x16 block. 
+static const float av1_tx_split_nn_weights_4x16_layer0[8 * 16] = { + -1.344184f, -1.454625f, -0.703110f, -0.140570f, -0.841536f, -0.068131f, + -2.128968f, -0.655518f, 0.432180f, 0.879752f, -0.222211f, 0.061615f, + -0.230969f, 0.569496f, 1.424188f, 0.598063f, -0.436005f, -0.737606f, + -0.137875f, -0.085730f, -0.076512f, -0.583101f, -0.937377f, -0.203556f, + -0.215797f, -0.015361f, -0.124098f, -0.411917f, 0.340441f, -0.331752f, + -0.472607f, -0.097714f, -0.930572f, -1.354713f, -0.550724f, 0.176212f, + -0.636060f, 0.183271f, -0.610212f, 0.345895f, -1.100906f, -1.605713f, + 0.111888f, -0.140937f, 0.063013f, -0.013315f, -0.273472f, -0.255870f, + 1.200328f, 0.274002f, 1.005776f, 0.322392f, 1.222373f, 0.158227f, + 0.408810f, 0.145022f, 0.139842f, -1.249412f, 0.286672f, -0.635699f, + 0.312562f, -0.495606f, -1.117034f, -0.085107f, -0.097484f, -0.341521f, + -0.132199f, -0.863055f, 0.217579f, -1.161425f, -0.302087f, -1.357271f, + -0.520724f, -1.211069f, -1.048729f, -0.333087f, -1.171527f, -0.280824f, + -2.057684f, -0.228755f, 0.606278f, 0.101198f, -0.314847f, -1.303255f, + -0.294964f, 1.301923f, 0.041712f, 0.077593f, -1.152746f, 0.495315f, + -0.751566f, 0.230249f, -0.840661f, 0.100731f, 1.346269f, 0.649898f, + -1.432258f, -0.456710f, -1.018123f, -0.348559f, -1.225226f, -0.170717f, + -0.354072f, 0.068292f, -0.234168f, 0.277503f, 0.179134f, 0.907420f, + 0.354626f, -0.627210f, 0.905779f, 0.512612f, 0.161190f, -0.843177f, + 0.014953f, -0.354983f, 0.011116f, -0.429598f, -1.017138f, -0.211432f, + 0.941840f, -0.281747f, 0.957776f, -0.541914f, 1.041880f, -0.433580f, + -1.416451f, -0.166467f, +}; + +static const float av1_tx_split_nn_bias_4x16_layer0[16] = { + 3.086118f, -3.235095f, 4.830956f, -0.165706f, 0.955031f, 4.055783f, + -0.311489f, 4.660205f, -0.576277f, -0.248111f, -0.790519f, -1.686412f, + -1.191704f, -3.800073f, 4.121552f, -1.399397f, +}; + +static const float av1_tx_split_nn_weights_4x16_layer1[16] = { + -0.758677f, 0.388776f, 0.439906f, 0.011390f, -0.084319f, -0.667969f, + -0.467316f, -0.875491f, -0.160668f, 0.805292f, 0.114393f, -0.549682f, + 0.462109f, 0.343315f, 1.092593f, 0.483152f, +}; + +static const float av1_tx_split_nn_bias_4x16_layer1[1] = { + 0.8205083f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_4x16 = { + 8, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_4x16_layer0, + av1_tx_split_nn_weights_4x16_layer1, + }, + { + av1_tx_split_nn_bias_4x16_layer0, + av1_tx_split_nn_bias_4x16_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 16x32 block. 
+static const float av1_tx_split_nn_weights_16x32_layer0[8 * 32] = { + 0.180713f, 0.033211f, 0.607561f, 0.138642f, 0.637204f, -0.000940f, + 0.012630f, 0.358109f, 0.022238f, 0.190418f, 0.079088f, 0.065925f, + 0.038242f, 0.162380f, -0.122728f, 0.379382f, -0.303283f, -0.327550f, + 0.029120f, -0.284553f, 0.269588f, -0.309805f, -0.241036f, -0.161103f, + -0.304887f, 0.239843f, -0.149146f, 0.311234f, -0.073640f, -0.132718f, + 0.178901f, 0.474712f, 0.020280f, 0.063685f, -0.609170f, -0.013658f, + -0.338074f, 0.250429f, 0.082978f, -0.186315f, -0.788959f, 0.039859f, + -0.426461f, -0.001524f, -0.447211f, 0.378102f, 0.315617f, 0.017428f, + 0.745494f, -0.219024f, 0.512836f, 0.200522f, 0.680449f, 0.313686f, + -0.412569f, -0.132927f, 0.631120f, 0.042735f, 0.336153f, 0.044772f, + 0.432606f, 0.175681f, -0.634411f, -0.073509f, -0.040643f, -0.559260f, + -0.104034f, -0.570495f, -0.247365f, 0.063256f, -0.582021f, -0.492585f, + -0.194955f, -0.207934f, -0.506627f, 0.021743f, -0.416518f, 0.320876f, + 0.115889f, 0.149399f, -0.229376f, 0.095505f, 0.115191f, -0.471921f, + 0.113068f, 0.343684f, -0.036831f, 0.021240f, 0.295112f, 0.031166f, + 0.448201f, -0.132241f, 0.164032f, 0.355572f, 0.072154f, 0.017335f, + -0.046113f, 0.178719f, -0.026881f, -0.242590f, 0.055073f, -0.012958f, + 0.077904f, 0.351356f, 0.107655f, 0.260568f, -0.080052f, -0.197553f, + 0.085763f, 0.263416f, -0.327741f, 0.158855f, 0.056899f, -0.162121f, + 0.339518f, -0.571204f, 0.264966f, -0.252214f, -0.202560f, -0.134213f, + -0.330188f, 0.009470f, -0.468376f, -0.065240f, -0.307957f, 0.116479f, + -0.222238f, -0.458716f, 0.186493f, -0.391415f, 0.118649f, -0.104653f, + -0.259958f, -0.332081f, -0.403785f, -0.050147f, -0.573511f, 0.177117f, + -0.598358f, 0.164947f, -0.119694f, -0.058520f, 0.203829f, -0.267404f, + -0.048202f, -0.600006f, 0.181594f, -0.731805f, 0.146417f, -0.687148f, + -1.210525f, -0.450101f, -0.620635f, 0.208825f, -0.611357f, 0.112202f, + -0.309468f, -0.323545f, 0.357770f, 0.308061f, 0.553199f, 0.049012f, + 0.530093f, -0.208597f, 0.607882f, -0.058120f, -0.527634f, 0.018136f, + 0.060753f, 0.118894f, 0.175649f, 0.014731f, 0.428318f, -0.106465f, + -0.119077f, 0.080179f, 0.524997f, 0.368286f, 0.528286f, 0.213659f, + 0.639286f, 0.195079f, -0.049815f, -0.092008f, -0.302958f, 0.298149f, + -0.173870f, -0.145205f, -0.233589f, -0.303368f, 0.141275f, 0.325622f, + -0.115293f, 0.155188f, 0.047225f, 0.231050f, -0.167447f, 0.349754f, + 0.295544f, -0.319466f, 0.095144f, 0.174612f, -0.194652f, 0.305915f, + -0.239008f, -0.037453f, 0.280696f, 0.125850f, 0.749196f, -0.101919f, + 0.791808f, -0.236811f, 0.064157f, 0.032865f, -0.225911f, 0.350384f, + 0.723183f, -0.103992f, 0.483085f, -0.123992f, 0.602138f, 0.023895f, + -0.692601f, -0.118387f, 0.162527f, 0.145178f, -0.184702f, -0.017753f, + -0.159436f, 0.124105f, -0.131067f, 0.310275f, 0.151499f, 0.138924f, + 0.537459f, 0.263212f, 0.615896f, 0.281255f, 0.021293f, -0.473459f, + 0.210145f, -0.056682f, 0.063658f, 0.377254f, -0.314410f, -0.183487f, + 0.300384f, 0.328471f, 0.164694f, -0.159272f, -0.160942f, -0.502861f, + -0.129147f, 0.045916f, -0.606865f, -0.101378f, +}; + +static const float av1_tx_split_nn_bias_16x32_layer0[32] = { + 0.051664f, -0.212487f, -0.077596f, -0.818467f, 0.638475f, -0.759937f, + 0.157198f, 0.989640f, 1.586035f, 0.431144f, 0.041605f, 0.543085f, + 0.498379f, 0.320504f, 0.134233f, 0.670979f, -0.105562f, -1.574879f, + 1.261812f, -0.287530f, -1.610592f, 0.730899f, -0.894240f, -0.657790f, + 0.270806f, -0.181708f, 0.298578f, 0.817240f, -0.221508f, -0.201771f, + -0.294389f, 1.456413f, +}; + +static const 
float av1_tx_split_nn_weights_16x32_layer1[32] = { + 1.208914f, 0.324728f, 0.383352f, -0.874321f, 0.172565f, -0.580927f, + -0.432927f, 0.433698f, -0.801935f, 0.672028f, 0.563493f, 0.260077f, + -0.200557f, -0.121638f, 0.530735f, -0.525196f, 0.281799f, 0.624204f, + -0.662775f, -0.230887f, 0.980989f, 0.223437f, -0.790591f, 0.600724f, + -0.273445f, 0.427635f, -0.501641f, -0.878390f, 0.234731f, -0.172550f, + 0.418904f, 1.792187f, +}; + +static const float av1_tx_split_nn_bias_16x32_layer1[1] = { + -0.29233751f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_16x32 = { + 8, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 32, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_16x32_layer0, + av1_tx_split_nn_weights_16x32_layer1, + }, + { + av1_tx_split_nn_bias_16x32_layer0, + av1_tx_split_nn_bias_16x32_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 32x64 block. +static const float av1_tx_split_nn_weights_32x64_layer0[8 * 32] = { + 0.031614f, -0.110926f, 0.052418f, -0.702506f, 0.045708f, 0.238329f, + -0.021806f, -0.208128f, 0.509745f, -0.293891f, 0.277788f, 0.113937f, + 0.741576f, 0.062848f, 0.351878f, 0.212532f, 0.385842f, 0.081517f, + 0.398502f, -0.015156f, 0.242616f, 0.214619f, -0.182678f, -0.170546f, + 0.110605f, -0.236749f, -0.023831f, -0.285243f, 0.147156f, -0.257639f, + 0.341355f, -0.571641f, -0.721797f, 0.139588f, -0.518494f, -0.206526f, + -0.570560f, -0.184295f, 0.110271f, 0.210292f, -0.109132f, -0.001080f, + 0.129251f, -0.204230f, -0.396312f, -0.183024f, 0.421243f, -0.013154f, + 0.222627f, 0.169826f, 0.226037f, 0.218153f, -0.343528f, 0.274906f, + -0.156632f, 0.250261f, -0.484020f, 0.019909f, -0.349575f, -0.286643f, + -0.507396f, 0.202446f, -0.154110f, -0.292644f, 0.122666f, 0.306963f, + 0.424895f, 0.005579f, 0.494094f, -0.079551f, 0.473740f, 0.352414f, + -0.356917f, 0.264331f, -0.554487f, 0.119978f, 0.012291f, -0.141641f, + -0.254714f, -0.213723f, -0.116701f, -0.011267f, 0.190025f, -0.118501f, + 0.305151f, -0.316782f, -0.220801f, -0.308420f, -0.324285f, 0.421329f, + -0.177066f, -0.055114f, 0.229698f, -0.199523f, 0.054278f, 0.365020f, + -0.060586f, -0.300618f, 0.157563f, -0.064338f, -0.005711f, -0.176991f, + -0.424502f, -0.111914f, 0.092608f, 0.126621f, 0.078547f, 0.148008f, + 0.024221f, 0.124599f, 0.001343f, 0.059402f, 0.453753f, 0.047102f, + 0.242544f, 0.055735f, -0.067451f, -0.170061f, -0.170469f, -0.232173f, + 0.214908f, 0.248889f, 0.544348f, -0.084566f, 0.402478f, 0.298031f, + 0.099038f, -0.238019f, -0.475085f, -0.070042f, -0.754955f, -0.049095f, + -0.783801f, -0.099857f, -0.582008f, -0.055194f, -0.103655f, 0.143689f, + 0.100219f, 0.293934f, 0.099271f, -0.036320f, 0.356626f, -0.261445f, + 0.879544f, 0.000878f, 0.532920f, -0.093918f, 0.508867f, -0.040215f, + -0.789042f, -0.145380f, -0.090040f, -0.066636f, 0.015212f, 0.352989f, + -0.058831f, -0.164588f, 0.039890f, 0.122861f, 0.222508f, 0.061217f, + 0.466487f, 0.022666f, 0.423777f, -0.002200f, -0.656835f, -0.099760f, + -0.520606f, 0.303204f, -0.563620f, -0.160922f, -0.243203f, 0.313354f, + -0.336516f, -0.206764f, -0.236040f, 0.325899f, -0.418748f, 0.163205f, + -0.476242f, -0.121928f, 0.139178f, -0.157193f, -0.531766f, -0.180202f, + -0.485254f, 0.187703f, -0.440072f, 0.137854f, 0.029139f, 0.109530f, + -0.078475f, -0.360618f, -0.334672f, -0.350890f, -0.403976f, 0.180336f, + -0.304542f, 0.005123f, 0.413995f, 0.314639f, 0.342648f, -0.293264f, + 0.358135f, -0.180425f, -0.369530f, -0.048413f, 0.498366f, 0.121875f, + 0.270948f, -0.187966f, 
0.342503f, 0.174420f, -0.352105f, 0.088080f, + 0.008277f, 0.020275f, -0.002381f, 0.504389f, -0.018832f, -0.366047f, + -0.090947f, -0.168150f, 0.016184f, -0.328914f, 0.089579f, -0.017349f, + 0.005844f, -0.005010f, -1.857514f, -0.282426f, 0.010177f, -0.214727f, + -0.182529f, 0.156943f, -0.162032f, -0.472654f, 0.069432f, 0.016901f, + -0.767905f, 0.137129f, -0.411463f, 0.049056f, -0.431657f, -0.037641f, + 0.785500f, 0.046225f, 0.195831f, 0.245204f, 0.368614f, 0.212261f, + 0.440626f, -0.158048f, -0.461031f, -0.146280f, +}; + +static const float av1_tx_split_nn_bias_32x64_layer0[32] = { + 0.490777f, -1.894238f, 0.621333f, -0.076756f, 0.286298f, 0.286375f, + -0.126431f, -0.350034f, -1.017572f, 0.620125f, 0.408128f, 0.238756f, + -0.060728f, 0.210912f, 0.043124f, 0.445649f, 0.907025f, 0.360272f, + 1.083101f, -0.068952f, 1.062348f, 0.396354f, 0.280075f, 0.501732f, + 0.328422f, 0.066241f, 0.474697f, 0.126313f, 0.741206f, 0.314796f, + 0.552712f, 0.299410f, +}; + +static const float av1_tx_split_nn_weights_32x64_layer1[32] = { + 1.033823f, 0.603439f, 0.304591f, -0.279940f, -0.780909f, -0.132801f, + 0.154059f, 0.662014f, -0.718368f, 0.198733f, 0.039766f, -0.208516f, + -0.104909f, -0.394209f, 0.081617f, 0.365041f, -0.874960f, -0.063315f, + -1.189897f, 0.337225f, 0.410893f, 0.307519f, 0.221323f, 0.233895f, + 0.469536f, 0.438557f, 0.280144f, 0.422423f, -1.394513f, 0.781900f, + 0.352981f, 0.111265f, +}; + +static const float av1_tx_split_nn_bias_32x64_layer1[1] = { + -0.18160765f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_32x64 = { + 8, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 32, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_32x64_layer0, + av1_tx_split_nn_weights_32x64_layer1, + }, + { + av1_tx_split_nn_bias_32x64_layer0, + av1_tx_split_nn_bias_32x64_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 8x32 block. 
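+// Note: like the configs above, this is a small fully-connected net: 8 input
+// features, one hidden layer (24 nodes for this block size), and a single
+// output score, so the layer-0 weight array below holds 8 * 24 values.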
+static const float av1_tx_split_nn_weights_8x32_layer0[8 * 24] = { + -0.687846f, 0.121404f, -0.372905f, 0.126770f, -0.103298f, -0.101650f, + -0.148490f, -0.271740f, 0.682915f, -0.079765f, 0.634347f, -0.151503f, + 0.287692f, -0.079072f, -0.236948f, 0.065064f, 0.713383f, 0.397123f, + 0.553621f, 0.368529f, 0.767663f, -0.046601f, -0.392402f, -0.294822f, + -0.292325f, -0.010573f, -0.837945f, 0.050113f, -0.811360f, 0.199162f, + 0.150832f, 0.011602f, 0.369694f, -0.225876f, 0.234113f, -0.269808f, + 0.303805f, -0.190281f, -0.451136f, 0.209755f, -0.308894f, 0.326956f, + 0.313591f, 0.089923f, -0.095754f, 0.390981f, 0.467366f, 0.169670f, + 0.853322f, 0.054055f, 0.830319f, -0.121918f, 0.262019f, -0.093526f, + 0.385558f, 0.419174f, 0.040198f, -0.347030f, -0.450492f, -0.106764f, + 0.487502f, -0.204188f, 0.430374f, -0.116388f, 0.236407f, -0.157376f, + 0.732294f, -0.651387f, 0.347446f, 0.342575f, 0.048406f, 0.187657f, + 0.434899f, -0.447782f, 0.032728f, -0.071168f, -0.255327f, 0.104174f, + 0.095689f, -0.431743f, 0.725694f, 0.031797f, 0.523171f, 0.061801f, + 0.469804f, -0.071068f, -0.059024f, -0.211937f, 0.392134f, -0.321490f, + 0.366060f, -0.427798f, 0.166771f, 0.299652f, 0.044660f, 0.205142f, + 0.039133f, -0.051835f, -0.465475f, 0.216976f, -0.341156f, 0.095358f, + 0.230807f, 0.201674f, 0.279266f, -0.713534f, -0.091690f, -0.569708f, + -0.119001f, 0.252160f, -1.544578f, -0.284477f, 0.555348f, 0.226471f, + 0.347690f, 0.034365f, 0.770835f, -0.241859f, -0.130241f, 0.292936f, + 0.396622f, -0.417916f, 0.492224f, 0.125517f, 0.344824f, 0.232172f, + -0.432106f, -0.278745f, 0.035069f, -0.307247f, -0.120760f, 0.170950f, + 0.433601f, 0.044286f, 0.141463f, -0.041382f, 0.529346f, 0.010868f, + -0.323674f, 0.185205f, 0.623459f, 0.232842f, -0.406693f, -0.142944f, + 0.222988f, 0.343634f, 0.065401f, 0.002621f, 0.805335f, -0.426926f, + 0.279181f, 0.131364f, 0.192339f, -0.402391f, 0.544120f, -0.060618f, + 0.467780f, 0.165224f, -0.373131f, 0.002427f, 0.688064f, 0.322317f, + 0.259713f, 0.130583f, 0.185032f, -0.189111f, -0.067821f, 0.010875f, + 0.644724f, -0.179291f, 0.463222f, 0.155230f, 0.721384f, -0.046019f, + 0.438501f, 0.440027f, -0.462090f, -0.002039f, -0.468026f, -0.008890f, + -0.328530f, 0.370102f, 0.482531f, 0.043471f, -0.469732f, -0.532663f, + 0.122081f, -0.379659f, 0.037219f, -0.519913f, -0.128975f, -0.404365f, +}; + +static const float av1_tx_split_nn_bias_8x32_layer0[24] = { + -1.198965f, 0.395204f, -0.408627f, -0.021654f, -0.658355f, 0.154525f, + -0.288354f, 1.207574f, 0.411608f, 0.964678f, -1.176893f, 1.059006f, + -0.472969f, 2.087975f, 1.065536f, 0.595569f, 0.197907f, -0.349938f, + 1.013651f, -0.931093f, -0.973595f, -0.459094f, -1.253062f, 1.624782f, +}; + +static const float av1_tx_split_nn_weights_8x32_layer1[24] = { + 0.815787f, -0.393465f, -0.483427f, -0.565592f, 0.493494f, 0.430229f, + -0.507073f, -0.251379f, -0.353418f, -0.495445f, 0.820029f, 0.649146f, + -0.487383f, 1.844503f, 0.480324f, -0.982705f, -0.501446f, -0.220584f, + 0.334299f, 0.802238f, 0.805838f, -0.487848f, 0.300772f, -1.232857f, +}; + +static const float av1_tx_split_nn_bias_8x32_layer1[1] = { + 0.13435879f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_8x32 = { + 8, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 24, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_8x32_layer0, + av1_tx_split_nn_weights_8x32_layer1, + }, + { + av1_tx_split_nn_bias_8x32_layer0, + av1_tx_split_nn_bias_8x32_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 
16x64 block.
+static const float av1_tx_split_nn_weights_16x64_layer0[8 * 16] = {
+ -0.378223f, -0.124216f, -0.514089f, -0.110117f, -0.585801f, -0.094838f,
+ -0.455385f, -0.220254f, -0.504568f, -0.082351f, -0.476420f, -0.253993f,
+ -0.454709f, -0.059461f, 0.210313f, -0.155683f, 0.192968f, -0.127804f,
+ 0.471996f, 0.253377f, 0.472625f, 0.485322f, 0.150560f, 0.164868f,
+ -0.475587f, 0.447559f, -0.455759f, -0.306665f, -0.194866f, -0.283716f,
+ -0.243897f, 0.293020f, -0.308298f, -0.191904f, -0.468568f, 0.014053f,
+ -0.618848f, 0.096273f, -0.444586f, 0.347750f, -0.280643f, -0.062872f,
+ 0.118661f, 0.540099f, 0.104141f, -0.279300f, -0.098721f, -0.173427f,
+ -0.984558f, -0.424559f, -0.411928f, -0.120875f, -0.488999f, -0.050716f,
+ -0.523103f, 0.093620f, -0.930396f, -0.431997f, -1.163297f, 0.190384f,
+ -0.422581f, -0.005354f, 0.450552f, 0.369210f, 0.562484f, 0.679922f,
+ 0.282099f, -0.039075f, 0.404196f, 0.006371f, 0.069679f, -0.196160f,
+ -0.213675f, 0.275187f, -0.104235f, -0.193090f, 0.003116f, -0.252454f,
+ -0.094591f, 0.210439f, -0.137070f, 0.145043f, 0.024558f, 0.121718f,
+ 0.010138f, 0.301651f, -0.377990f, 0.444414f, 0.001845f, -0.095334f,
+ 0.550259f, 0.087603f, 0.792492f, -0.044584f, 0.641706f, -0.328458f,
+ -0.447791f, 0.135376f, 0.356385f, 0.135748f, 0.310370f, 0.293757f,
+ -0.062000f, -0.056368f, 0.343930f, 0.312039f, 0.370763f, 0.452381f,
+ -0.023630f, -0.185909f, 0.422277f, -0.006306f, 0.045166f, 0.423359f,
+ -0.157735f, -0.084901f, 0.219527f, -0.209510f, 0.575057f, 0.249276f,
+ 0.069267f, 0.233898f, -0.229392f, 0.117197f, -0.038551f, 0.293976f,
+ 0.101996f, 0.120878f,
+};
+
+static const float av1_tx_split_nn_bias_16x64_layer0[16] = {
+ 1.036995f, 0.160249f, 0.100264f, 0.694881f, 0.694677f, 0.128379f,
+ -0.843405f, -0.405515f, 0.104139f, 0.182980f, -0.025472f, 0.901067f,
+ -0.299866f, -0.103079f, -0.190352f, -0.048121f,
+};
+
+static const float av1_tx_split_nn_weights_16x64_layer1[16] = {
+ -1.778868f, 0.174690f, 0.211991f, 0.712138f, 0.589352f, 0.466652f,
+ 1.029146f, -0.490044f, 0.483015f, 0.600215f, -0.577776f, -0.755546f,
+ 0.348337f, -0.205082f, 0.347129f, -0.322277f,
+};
+
+static const float av1_tx_split_nn_bias_16x64_layer1[1] = {
+ 0.04230947f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_16x64 = {
+ 8, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_16x64_layer0,
+ av1_tx_split_nn_weights_16x64_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_16x64_layer0,
+ av1_tx_split_nn_bias_16x64_layer1,
+ },
+};
+/******************************************************************************/
+
+// Map transform size to its corresponding neural net model for tx split
+// prediction.
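+// Illustrative sketch (not part of the imported source; `features` and
+// `split_thresh` are hypothetical names): a caller selects a model by
+// transform size and thresholds its score, roughly:
+//   const NN_CONFIG *nn_config = av1_tx_split_nnconfig_map[tx_size];
+//   float score = 0.0f;
+//   av1_nn_predict(features, nn_config, 1, &score);  // 8 inputs -> 1 score
+//   const int try_split = score > split_thresh;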
+static const NN_CONFIG *const av1_tx_split_nnconfig_map[TX_SIZES_ALL] = { + NULL, // TX_4X4, + &av1_tx_split_nnconfig_8x8, // TX_8X8, + &av1_tx_split_nnconfig_16x16, // TX_16X16, + &av1_tx_split_nnconfig_32x32, // TX_32X32, + &av1_tx_split_nnconfig_64x64, // TX_64X64, + &av1_tx_split_nnconfig_4x8, // TX_4X8, + &av1_tx_split_nnconfig_4x8, // TX_8X4, + &av1_tx_split_nnconfig_8x16, // TX_8X16, + &av1_tx_split_nnconfig_8x16, // TX_16X8, + &av1_tx_split_nnconfig_16x32, // TX_16X32, + &av1_tx_split_nnconfig_16x32, // TX_32X16, + &av1_tx_split_nnconfig_32x64, // TX_32X64, + &av1_tx_split_nnconfig_32x64, // TX_64X32, + &av1_tx_split_nnconfig_4x16, // TX_4X16, + &av1_tx_split_nnconfig_4x16, // TX_16X4, + &av1_tx_split_nnconfig_8x32, // TX_8X32, + &av1_tx_split_nnconfig_8x32, // TX_32X8, + &av1_tx_split_nnconfig_16x64, // TX_16X64, + &av1_tx_split_nnconfig_16x64, // TX_64X16, +}; + +#if !CONFIG_REALTIME_ONLY +#define NUM_INTRA_TX_SPLIT_FEATURES 14 +#define NUM_INTRA_TX_SPLIT_HIDDEN_LAYERS 1 +#define NUM_INTRA_TX_SPLIT_HIDDEN_NODES 16 +// Model to prune intra transform depth for intra 8x8 block. +static const float av1_intra_tx_split_8x8_mean[NUM_INTRA_TX_SPLIT_FEATURES] = { + 0.110706f, 18.901518f, 0.250436f, 13.483487f, 0.118141f, + 14.318728f, 0.028409f, 14.257664f, 0.045839f, 15.143358f, + 9.702971f, 14.300809f, 6.018646f, 3.682534f, +}; + +static const float av1_intra_tx_split_8x8_std[NUM_INTRA_TX_SPLIT_FEATURES] = { + 13.750575f, 13.440116f, 14.334330f, 12.236641f, 18.415247f, + 12.733355f, 18.309339f, 12.858130f, 23.465142f, 13.447014f, + 8.625048f, 10.456774f, 1.185447f, 1.810423f, +}; + +static const float av1_intra_tx_split_nn_weights_8x8_layer0 + [NUM_INTRA_TX_SPLIT_FEATURES * NUM_INTRA_TX_SPLIT_HIDDEN_NODES] = { + -0.156142f, -0.753623f, 0.026883f, 0.039188f, -0.035310f, 0.106140f, + 0.051622f, 0.077838f, 0.101632f, 0.107278f, 0.232200f, 0.269083f, + 0.048966f, -1.553293f, -0.113983f, -0.151248f, -0.067369f, 0.787292f, + 0.076651f, -0.802634f, 0.266414f, 1.107563f, -0.068848f, -0.956468f, + -0.074920f, -0.192258f, 0.006207f, 0.176196f, -0.493442f, 0.152290f, + -0.208874f, -0.014658f, 0.297385f, -0.351695f, 0.246295f, -0.178519f, + -0.204191f, 0.049663f, -0.330343f, -0.299754f, 0.246215f, -0.014558f, + -0.117611f, 0.206445f, 0.045840f, -0.047563f, -0.049679f, 0.406892f, + -0.052307f, -1.513404f, 0.166166f, 0.520760f, -0.143320f, -0.593928f, + -0.010533f, 0.250752f, 0.076738f, 0.537512f, -0.082619f, -1.534031f, + 0.047109f, 0.634247f, -0.089730f, 0.545534f, -0.022742f, -0.779047f, + -0.606358f, -0.199145f, -0.051269f, 0.248784f, 0.327545f, -0.851751f, + 0.071739f, 0.035975f, 0.387781f, -0.136427f, -0.284436f, 0.578449f, + -0.198276f, 0.579950f, 0.600111f, -0.370164f, -0.215297f, 0.517342f, + 0.200061f, -2.507660f, -0.030851f, 0.227315f, -0.078289f, 0.276052f, + -0.050281f, 0.251481f, -0.139318f, 0.281175f, 0.226524f, 0.058968f, + 0.197436f, 0.517294f, -0.105914f, -1.599567f, 0.064985f, 0.043209f, + -0.280038f, 0.126874f, 0.330387f, -0.014407f, 0.031241f, 0.237801f, + 0.948959f, -0.253791f, -0.022622f, -0.061430f, 0.265852f, 0.750823f, + 0.086606f, 0.853527f, -0.180971f, -1.255744f, -0.152979f, -1.022198f, + -0.044708f, 0.506424f, -0.501968f, -0.416863f, -0.012688f, 0.193523f, + -0.093698f, 0.430875f, 0.007379f, 0.019278f, 0.080890f, 0.462755f, + -0.054326f, -0.157611f, -0.004851f, -1.275676f, -0.060528f, -0.508170f, + 0.195429f, -0.023534f, 0.355211f, 0.983561f, -0.122036f, -0.911948f, + -0.172280f, -1.135245f, -0.043211f, 0.576456f, -0.075247f, 0.429734f, + -0.246309f, -0.355575f, 
-0.048809f, 0.217113f, 0.078385f, 0.720341f, + 0.007070f, 0.144617f, -0.167642f, 0.303056f, -0.031425f, 0.123448f, + -0.320530f, 0.164070f, -0.497849f, -0.233918f, -0.032123f, 0.084983f, + 0.312216f, 0.062609f, -0.389815f, 0.237593f, 0.000157f, -0.642068f, + 0.167898f, 0.495234f, -0.083493f, -0.555971f, 0.124437f, 0.381125f, + -0.459219f, 0.047924f, -0.138222f, -2.232816f, 0.127585f, -0.102420f, + 0.131598f, 0.036837f, -0.163055f, -0.067429f, -0.078521f, -0.055666f, + 1.387057f, 0.400154f, -0.003355f, -0.073627f, -0.305098f, -0.413383f, + -0.008266f, -0.038329f, 0.209808f, 0.375777f, 0.037274f, -0.050226f, + -0.100576f, 0.237441f, 0.237854f, 0.828296f, 0.001149f, -0.093964f, + 0.214051f, -0.031486f, -0.561307f, 0.014540f, 0.169357f, 0.323202f, + -0.395334f, -0.038941f, 0.476800f, -0.213122f, -0.287521f, -0.420717f, + -0.054142f, -0.102266f, + }; + +static const float + av1_intra_tx_split_nn_bias_8x8_layer0[NUM_INTRA_TX_SPLIT_HIDDEN_NODES] = { + -1.150850f, -0.236404f, 0.184554f, -0.904162f, -0.949979f, 0.427016f, + -0.546867f, -0.611094f, -0.676570f, -0.208959f, -0.286384f, 0.562238f, + 0.434197f, -0.746518f, 0.123085f, -0.549836f, + }; + +static const float av1_intra_tx_split_nn_weights_8x8_layer1 + [NUM_INTRA_TX_SPLIT_HIDDEN_NODES] = { + 0.749814f, 0.598172f, 0.375611f, 0.751612f, 0.947538f, -0.282228f, + -1.457522f, -1.092290f, 0.738657f, 0.575779f, 0.514823f, -0.560616f, + -0.491619f, -1.482014f, 0.524625f, -0.533590f, + }; + +static const float av1_intra_tx_split_nn_bias_8x8_layer1[1] = { + -0.488888f, +}; + +static const NN_CONFIG av1_intra_tx_split_nnconfig_8x8 = { + NUM_INTRA_TX_SPLIT_FEATURES, // num_inputs + 1, // num_outputs + NUM_INTRA_TX_SPLIT_HIDDEN_LAYERS, // num_hidden_layers + { + NUM_INTRA_TX_SPLIT_HIDDEN_NODES, + }, // num_hidden_nodes + { + av1_intra_tx_split_nn_weights_8x8_layer0, + av1_intra_tx_split_nn_weights_8x8_layer1, + }, + { + av1_intra_tx_split_nn_bias_8x8_layer0, + av1_intra_tx_split_nn_bias_8x8_layer1, + }, +}; + +static const float av1_intra_tx_prune_nn_thresh_8x8[2] = { -0.405465f, + 0.405465f }; +#endif // !CONFIG_REALTIME_ONLY + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_ diff --git a/third_party/aom/av1/encoder/tx_search.c b/third_party/aom/av1/encoder/tx_search.c new file mode 100644 index 0000000000..7292c01191 --- /dev/null +++ b/third_party/aom/av1/encoder/tx_search.c @@ -0,0 +1,3830 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "av1/common/cfl.h" +#include "av1/common/reconintra.h" +#include "av1/encoder/block.h" +#include "av1/encoder/hybrid_fwd_txfm.h" +#include "av1/common/idct.h" +#include "av1/encoder/model_rd.h" +#include "av1/encoder/random.h" +#include "av1/encoder/rdopt_utils.h" +#include "av1/encoder/sorting_network.h" +#include "av1/encoder/tx_prune_model_weights.h" +#include "av1/encoder/tx_search.h" +#include "av1/encoder/txb_rdopt.h" + +#define PROB_THRESH_OFFSET_TX_TYPE 100 + +struct rdcost_block_args { + const AV1_COMP *cpi; + MACROBLOCK *x; + ENTROPY_CONTEXT t_above[MAX_MIB_SIZE]; + ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]; + RD_STATS rd_stats; + int64_t current_rd; + int64_t best_rd; + int exit_early; + int incomplete_exit; + FAST_TX_SEARCH_MODE ftxs_mode; + int skip_trellis; +}; + +typedef struct { + int64_t rd; + int txb_entropy_ctx; + TX_TYPE tx_type; +} TxCandidateInfo; + +// origin_threshold * 128 / 100 +static const uint32_t skip_pred_threshold[3][BLOCK_SIZES_ALL] = { + { + 64, 64, 64, 70, 60, 60, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 64, 64, 70, 70, 68, 68, + }, + { + 88, 88, 88, 86, 87, 87, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 88, 88, 86, 86, 68, 68, + }, + { + 90, 93, 93, 90, 93, 93, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 90, 90, 90, 90, 74, 74, + }, +}; + +// lookup table for predict_skip_txfm +// int max_tx_size = max_txsize_rect_lookup[bsize]; +// if (tx_size_high[max_tx_size] > 16 || tx_size_wide[max_tx_size] > 16) +// max_tx_size = AOMMIN(max_txsize_lookup[bsize], TX_16X16); +static const TX_SIZE max_predict_sf_tx_size[BLOCK_SIZES_ALL] = { + TX_4X4, TX_4X8, TX_8X4, TX_8X8, TX_8X16, TX_16X8, + TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16, + TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_4X16, TX_16X4, + TX_8X8, TX_8X8, TX_16X16, TX_16X16, +}; + +// look-up table for sqrt of number of pixels in a transform block +// rounded up to the nearest integer. +static const int sqrt_tx_pixels_2d[TX_SIZES_ALL] = { 4, 8, 16, 32, 32, 6, 6, + 12, 12, 23, 23, 32, 32, 8, + 8, 16, 16, 23, 23 }; + +static INLINE uint32_t get_block_residue_hash(MACROBLOCK *x, BLOCK_SIZE bsize) { + const int rows = block_size_high[bsize]; + const int cols = block_size_wide[bsize]; + const int16_t *diff = x->plane[0].src_diff; + const uint32_t hash = + av1_get_crc32c_value(&x->txfm_search_info.mb_rd_record->crc_calculator, + (uint8_t *)diff, 2 * rows * cols); + return (hash << 5) + bsize; +} + +static INLINE int32_t find_mb_rd_info(const MB_RD_RECORD *const mb_rd_record, + const int64_t ref_best_rd, + const uint32_t hash) { + int32_t match_index = -1; + if (ref_best_rd != INT64_MAX) { + for (int i = 0; i < mb_rd_record->num; ++i) { + const int index = (mb_rd_record->index_start + i) % RD_RECORD_BUFFER_LEN; + // If there is a match in the mb_rd_record, fetch the RD decision and + // terminate early. 
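+ // Worked example (illustrative values): `index` walks the record as a
+ // ring buffer, so with index_start == 6, num == 5 and a buffer length of
+ // 8, the probe order is 6, 7, 0, 1, 2.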
+ if (mb_rd_record->mb_rd_info[index].hash_value == hash) { + match_index = index; + break; + } + } + } + return match_index; +} + +static AOM_INLINE void fetch_mb_rd_info(int n4, + const MB_RD_INFO *const mb_rd_info, + RD_STATS *const rd_stats, + MACROBLOCK *const x) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + mbmi->tx_size = mb_rd_info->tx_size; + memcpy(x->txfm_search_info.blk_skip, mb_rd_info->blk_skip, + sizeof(mb_rd_info->blk_skip[0]) * n4); + av1_copy(mbmi->inter_tx_size, mb_rd_info->inter_tx_size); + av1_copy_array(xd->tx_type_map, mb_rd_info->tx_type_map, n4); + *rd_stats = mb_rd_info->rd_stats; +} + +int64_t av1_pixel_diff_dist(const MACROBLOCK *x, int plane, int blk_row, + int blk_col, const BLOCK_SIZE plane_bsize, + const BLOCK_SIZE tx_bsize, + unsigned int *block_mse_q8) { + int visible_rows, visible_cols; + const MACROBLOCKD *xd = &x->e_mbd; + get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL, + NULL, &visible_cols, &visible_rows); + const int diff_stride = block_size_wide[plane_bsize]; + const int16_t *diff = x->plane[plane].src_diff; + + diff += ((blk_row * diff_stride + blk_col) << MI_SIZE_LOG2); + uint64_t sse = + aom_sum_squares_2d_i16(diff, diff_stride, visible_cols, visible_rows); + if (block_mse_q8 != NULL) { + if (visible_cols > 0 && visible_rows > 0) + *block_mse_q8 = + (unsigned int)((256 * sse) / (visible_cols * visible_rows)); + else + *block_mse_q8 = UINT_MAX; + } + return sse; +} + +// Computes the residual block's SSE and mean on all visible 4x4s in the +// transform block +static INLINE int64_t pixel_diff_stats( + MACROBLOCK *x, int plane, int blk_row, int blk_col, + const BLOCK_SIZE plane_bsize, const BLOCK_SIZE tx_bsize, + unsigned int *block_mse_q8, int64_t *per_px_mean, uint64_t *block_var) { + int visible_rows, visible_cols; + const MACROBLOCKD *xd = &x->e_mbd; + get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL, + NULL, &visible_cols, &visible_rows); + const int diff_stride = block_size_wide[plane_bsize]; + const int16_t *diff = x->plane[plane].src_diff; + + diff += ((blk_row * diff_stride + blk_col) << MI_SIZE_LOG2); + uint64_t sse = 0; + int sum = 0; + sse = aom_sum_sse_2d_i16(diff, diff_stride, visible_cols, visible_rows, &sum); + if (visible_cols > 0 && visible_rows > 0) { + double norm_factor = 1.0 / (visible_cols * visible_rows); + int sign_sum = sum > 0 ? 1 : -1; + // Conversion to transform domain + *per_px_mean = (int64_t)(norm_factor * abs(sum)) << 7; + *per_px_mean = sign_sum * (*per_px_mean); + *block_mse_q8 = (unsigned int)(norm_factor * (256 * sse)); + *block_var = (uint64_t)(sse - (uint64_t)(norm_factor * sum * sum)); + } else { + *block_mse_q8 = UINT_MAX; + } + return sse; +} + +// Uses simple features on top of DCT coefficients to quickly predict +// whether optimal RD decision is to skip encoding the residual. +// The sse value is stored in dist. +static int predict_skip_txfm(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist, + int reduced_tx_set) { + const TxfmSearchParams *txfm_params = &x->txfm_search_params; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + const MACROBLOCKD *xd = &x->e_mbd; + const int16_t dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd); + + *dist = av1_pixel_diff_dist(x, 0, 0, 0, bsize, bsize, NULL); + + const int64_t mse = *dist / bw / bh; + // Normalized quantizer takes the transform upscaling factor (8 for tx size + // smaller than 32) into account. 
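+ // Worked example (illustrative values): with dc_q == 1024, normalized_dc_q
+ // == 1024 >> 3 == 128 and mse_thresh == 128 * 128 / 8 == 2048, so a block
+ // whose per-pixel error exceeds 2048 is predicted not to skip.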
+ const int16_t normalized_dc_q = dc_q >> 3; + const int64_t mse_thresh = (int64_t)normalized_dc_q * normalized_dc_q / 8; + // For faster early skip decision, use dist to compare against threshold so + // that quality risk is less for the skip=1 decision. Otherwise, use mse + // since the fwd_txfm coeff checks will take care of quality + // TODO(any): Use dist to return 0 when skip_txfm_level is 1 + int64_t pred_err = (txfm_params->skip_txfm_level >= 2) ? *dist : mse; + // Predict not to skip when error is larger than threshold. + if (pred_err > mse_thresh) return 0; + // Return as skip otherwise for aggressive early skip + else if (txfm_params->skip_txfm_level >= 2) + return 1; + + const int max_tx_size = max_predict_sf_tx_size[bsize]; + const int tx_h = tx_size_high[max_tx_size]; + const int tx_w = tx_size_wide[max_tx_size]; + DECLARE_ALIGNED(32, tran_low_t, coefs[32 * 32]); + TxfmParam param; + param.tx_type = DCT_DCT; + param.tx_size = max_tx_size; + param.bd = xd->bd; + param.is_hbd = is_cur_buf_hbd(xd); + param.lossless = 0; + param.tx_set_type = av1_get_ext_tx_set_type( + param.tx_size, is_inter_block(xd->mi[0]), reduced_tx_set); + const int bd_idx = (xd->bd == 8) ? 0 : ((xd->bd == 10) ? 1 : 2); + const uint32_t max_qcoef_thresh = skip_pred_threshold[bd_idx][bsize]; + const int16_t *src_diff = x->plane[0].src_diff; + const int n_coeff = tx_w * tx_h; + const int16_t ac_q = av1_ac_quant_QTX(x->qindex, 0, xd->bd); + const uint32_t dc_thresh = max_qcoef_thresh * dc_q; + const uint32_t ac_thresh = max_qcoef_thresh * ac_q; + for (int row = 0; row < bh; row += tx_h) { + for (int col = 0; col < bw; col += tx_w) { + av1_fwd_txfm(src_diff + col, coefs, bw, ¶m); + // Operating on TX domain, not pixels; we want the QTX quantizers + const uint32_t dc_coef = (((uint32_t)abs(coefs[0])) << 7); + if (dc_coef >= dc_thresh) return 0; + for (int i = 1; i < n_coeff; ++i) { + const uint32_t ac_coef = (((uint32_t)abs(coefs[i])) << 7); + if (ac_coef >= ac_thresh) return 0; + } + } + src_diff += tx_h * bw; + } + return 1; +} + +// Used to set proper context for early termination with skip = 1. +static AOM_INLINE void set_skip_txfm(MACROBLOCK *x, RD_STATS *rd_stats, + BLOCK_SIZE bsize, int64_t dist) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int n4 = bsize_to_num_blk(bsize); + const TX_SIZE tx_size = max_txsize_rect_lookup[bsize]; + memset(xd->tx_type_map, DCT_DCT, sizeof(xd->tx_type_map[0]) * n4); + memset(mbmi->inter_tx_size, tx_size, sizeof(mbmi->inter_tx_size)); + mbmi->tx_size = tx_size; + for (int i = 0; i < n4; ++i) + set_blk_skip(x->txfm_search_info.blk_skip, 0, i, 1); + rd_stats->skip_txfm = 1; + if (is_cur_buf_hbd(xd)) dist = ROUND_POWER_OF_TWO(dist, (xd->bd - 8) * 2); + rd_stats->dist = rd_stats->sse = (dist << 4); + // Though decision is to make the block as skip based on luma stats, + // it is possible that block becomes non skip after chroma rd. In addition + // intermediate non skip costs calculated by caller function will be + // incorrect, if rate is set as zero (i.e., if zero_blk_rate is not + // accounted). Hence intermediate rate is populated to code the luma tx blks + // as skip, the caller function based on final rd decision (i.e., skip vs + // non-skip) sets the final rate accordingly. Here the rate populated + // corresponds to coding all the tx blocks with zero_blk_rate (based on max tx + // size possible) in the current block. 
Eg: For 128*128 block, rate would be + // 4 * zero_blk_rate where zero_blk_rate corresponds to coding of one 64x64 tx + // block as 'all zeros' + ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE]; + ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE]; + av1_get_entropy_contexts(bsize, &xd->plane[0], ctxa, ctxl); + ENTROPY_CONTEXT *ta = ctxa; + ENTROPY_CONTEXT *tl = ctxl; + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + TXB_CTX txb_ctx; + get_txb_ctx(bsize, tx_size, 0, ta, tl, &txb_ctx); + const int zero_blk_rate = x->coeff_costs.coeff_costs[txs_ctx][PLANE_TYPE_Y] + .txb_skip_cost[txb_ctx.txb_skip_ctx][1]; + rd_stats->rate = zero_blk_rate * + (block_size_wide[bsize] >> tx_size_wide_log2[tx_size]) * + (block_size_high[bsize] >> tx_size_high_log2[tx_size]); +} + +static AOM_INLINE void save_mb_rd_info(int n4, uint32_t hash, + const MACROBLOCK *const x, + const RD_STATS *const rd_stats, + MB_RD_RECORD *mb_rd_record) { + int index; + if (mb_rd_record->num < RD_RECORD_BUFFER_LEN) { + index = + (mb_rd_record->index_start + mb_rd_record->num) % RD_RECORD_BUFFER_LEN; + ++mb_rd_record->num; + } else { + index = mb_rd_record->index_start; + mb_rd_record->index_start = + (mb_rd_record->index_start + 1) % RD_RECORD_BUFFER_LEN; + } + MB_RD_INFO *const mb_rd_info = &mb_rd_record->mb_rd_info[index]; + const MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + mb_rd_info->hash_value = hash; + mb_rd_info->tx_size = mbmi->tx_size; + memcpy(mb_rd_info->blk_skip, x->txfm_search_info.blk_skip, + sizeof(mb_rd_info->blk_skip[0]) * n4); + av1_copy(mb_rd_info->inter_tx_size, mbmi->inter_tx_size); + av1_copy_array(mb_rd_info->tx_type_map, xd->tx_type_map, n4); + mb_rd_info->rd_stats = *rd_stats; +} + +static int get_search_init_depth(int mi_width, int mi_height, int is_inter, + const SPEED_FEATURES *sf, + int tx_size_search_method) { + if (tx_size_search_method == USE_LARGESTALL) return MAX_VARTX_DEPTH; + + if (sf->tx_sf.tx_size_search_lgr_block) { + if (mi_width > mi_size_wide[BLOCK_64X64] || + mi_height > mi_size_high[BLOCK_64X64]) + return MAX_VARTX_DEPTH; + } + + if (is_inter) { + return (mi_height != mi_width) + ? sf->tx_sf.inter_tx_size_search_init_depth_rect + : sf->tx_sf.inter_tx_size_search_init_depth_sqr; + } else { + return (mi_height != mi_width) + ? 
sf->tx_sf.intra_tx_size_search_init_depth_rect + : sf->tx_sf.intra_tx_size_search_init_depth_sqr; + } +} + +static AOM_INLINE void select_tx_block( + const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block, + TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta, + ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left, + RD_STATS *rd_stats, int64_t prev_level_rd, int64_t ref_best_rd, + int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode); + +// NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values +// 0: Do not collect any RD stats +// 1: Collect RD stats for transform units +// 2: Collect RD stats for partition units +#if CONFIG_COLLECT_RD_STATS + +static AOM_INLINE void get_energy_distribution_fine( + const AV1_COMP *cpi, BLOCK_SIZE bsize, const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, int need_4th, double *hordist, + double *verdist) { + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + unsigned int esq[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + + if (bsize < BLOCK_16X16 || (bsize >= BLOCK_4X16 && bsize <= BLOCK_32X8)) { + // Special cases: calculate 'esq' values manually, as we don't have 'vf' + // functions for the 16 (very small) sub-blocks of this block. + const int w_shift = (bw == 4) ? 0 : (bw == 8) ? 1 : (bw == 16) ? 2 : 3; + const int h_shift = (bh == 4) ? 0 : (bh == 8) ? 1 : (bh == 16) ? 2 : 3; + assert(bw <= 32); + assert(bh <= 32); + assert(((bw - 1) >> w_shift) + (((bh - 1) >> h_shift) << 2) == 15); + if (cpi->common.seq_params->use_highbitdepth) { + const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); + const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); + for (int i = 0; i < bh; ++i) + for (int j = 0; j < bw; ++j) { + const int index = (j >> w_shift) + ((i >> h_shift) << 2); + esq[index] += + (src16[j + i * src_stride] - dst16[j + i * dst_stride]) * + (src16[j + i * src_stride] - dst16[j + i * dst_stride]); + } + } else { + for (int i = 0; i < bh; ++i) + for (int j = 0; j < bw; ++j) { + const int index = (j >> w_shift) + ((i >> h_shift) << 2); + esq[index] += (src[j + i * src_stride] - dst[j + i * dst_stride]) * + (src[j + i * src_stride] - dst[j + i * dst_stride]); + } + } + } else { // Calculate 'esq' values using 'vf' functions on the 16 sub-blocks. + const int f_index = + (bsize < BLOCK_SIZES) ? 
bsize - BLOCK_16X16 : bsize - BLOCK_8X16; + assert(f_index >= 0 && f_index < BLOCK_SIZES_ALL); + const BLOCK_SIZE subsize = (BLOCK_SIZE)f_index; + assert(block_size_wide[bsize] == 4 * block_size_wide[subsize]); + assert(block_size_high[bsize] == 4 * block_size_high[subsize]); + cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[0]); + cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, + dst_stride, &esq[1]); + cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, + dst_stride, &esq[2]); + cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, + dst_stride, &esq[3]); + src += bh / 4 * src_stride; + dst += bh / 4 * dst_stride; + + cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[4]); + cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, + dst_stride, &esq[5]); + cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, + dst_stride, &esq[6]); + cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, + dst_stride, &esq[7]); + src += bh / 4 * src_stride; + dst += bh / 4 * dst_stride; + + cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[8]); + cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, + dst_stride, &esq[9]); + cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, + dst_stride, &esq[10]); + cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, + dst_stride, &esq[11]); + src += bh / 4 * src_stride; + dst += bh / 4 * dst_stride; + + cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[12]); + cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, + dst_stride, &esq[13]); + cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, + dst_stride, &esq[14]); + cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, + dst_stride, &esq[15]); + } + + double total = (double)esq[0] + esq[1] + esq[2] + esq[3] + esq[4] + esq[5] + + esq[6] + esq[7] + esq[8] + esq[9] + esq[10] + esq[11] + + esq[12] + esq[13] + esq[14] + esq[15]; + if (total > 0) { + const double e_recip = 1.0 / total; + hordist[0] = ((double)esq[0] + esq[4] + esq[8] + esq[12]) * e_recip; + hordist[1] = ((double)esq[1] + esq[5] + esq[9] + esq[13]) * e_recip; + hordist[2] = ((double)esq[2] + esq[6] + esq[10] + esq[14]) * e_recip; + if (need_4th) { + hordist[3] = ((double)esq[3] + esq[7] + esq[11] + esq[15]) * e_recip; + } + verdist[0] = ((double)esq[0] + esq[1] + esq[2] + esq[3]) * e_recip; + verdist[1] = ((double)esq[4] + esq[5] + esq[6] + esq[7]) * e_recip; + verdist[2] = ((double)esq[8] + esq[9] + esq[10] + esq[11]) * e_recip; + if (need_4th) { + verdist[3] = ((double)esq[12] + esq[13] + esq[14] + esq[15]) * e_recip; + } + } else { + hordist[0] = verdist[0] = 0.25; + hordist[1] = verdist[1] = 0.25; + hordist[2] = verdist[2] = 0.25; + if (need_4th) { + hordist[3] = verdist[3] = 0.25; + } + } +} + +static double get_sse_norm(const int16_t *diff, int stride, int w, int h) { + double sum = 0.0; + for (int j = 0; j < h; ++j) { + for (int i = 0; i < w; ++i) { + const int err = diff[j * stride + i]; + sum += err * err; + } + } + assert(w > 0 && h > 0); + return sum / (w * h); +} + +static double get_sad_norm(const int16_t *diff, int stride, int w, int h) { + double sum = 0.0; + for (int j = 0; j < h; ++j) { + for (int i = 0; i < w; ++i) { + sum += abs(diff[j * stride + i]); + } + } + assert(w > 0 && h > 0); + return sum / (w * h); +} + +static AOM_INLINE void 
get_2x2_normalized_sses_and_sads( + const AV1_COMP *const cpi, BLOCK_SIZE tx_bsize, const uint8_t *const src, + int src_stride, const uint8_t *const dst, int dst_stride, + const int16_t *const src_diff, int diff_stride, double *const sse_norm_arr, + double *const sad_norm_arr) { + const BLOCK_SIZE tx_bsize_half = + get_partition_subsize(tx_bsize, PARTITION_SPLIT); + if (tx_bsize_half == BLOCK_INVALID) { // manually calculate stats + const int half_width = block_size_wide[tx_bsize] / 2; + const int half_height = block_size_high[tx_bsize] / 2; + for (int row = 0; row < 2; ++row) { + for (int col = 0; col < 2; ++col) { + const int16_t *const this_src_diff = + src_diff + row * half_height * diff_stride + col * half_width; + if (sse_norm_arr) { + sse_norm_arr[row * 2 + col] = + get_sse_norm(this_src_diff, diff_stride, half_width, half_height); + } + if (sad_norm_arr) { + sad_norm_arr[row * 2 + col] = + get_sad_norm(this_src_diff, diff_stride, half_width, half_height); + } + } + } + } else { // use function pointers to calculate stats + const int half_width = block_size_wide[tx_bsize_half]; + const int half_height = block_size_high[tx_bsize_half]; + const int num_samples_half = half_width * half_height; + for (int row = 0; row < 2; ++row) { + for (int col = 0; col < 2; ++col) { + const uint8_t *const this_src = + src + row * half_height * src_stride + col * half_width; + const uint8_t *const this_dst = + dst + row * half_height * dst_stride + col * half_width; + + if (sse_norm_arr) { + unsigned int this_sse; + cpi->ppi->fn_ptr[tx_bsize_half].vf(this_src, src_stride, this_dst, + dst_stride, &this_sse); + sse_norm_arr[row * 2 + col] = (double)this_sse / num_samples_half; + } + + if (sad_norm_arr) { + const unsigned int this_sad = cpi->ppi->fn_ptr[tx_bsize_half].sdf( + this_src, src_stride, this_dst, dst_stride); + sad_norm_arr[row * 2 + col] = (double)this_sad / num_samples_half; + } + } + } + } +} + +#if CONFIG_COLLECT_RD_STATS == 1 +static double get_mean(const int16_t *diff, int stride, int w, int h) { + double sum = 0.0; + for (int j = 0; j < h; ++j) { + for (int i = 0; i < w; ++i) { + sum += diff[j * stride + i]; + } + } + assert(w > 0 && h > 0); + return sum / (w * h); +} +static AOM_INLINE void PrintTransformUnitStats( + const AV1_COMP *const cpi, MACROBLOCK *x, const RD_STATS *const rd_stats, + int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + TX_TYPE tx_type, int64_t rd) { + if (rd_stats->rate == INT_MAX || rd_stats->dist == INT64_MAX) return; + + // Generate small sample to restrict output size. + static unsigned int seed = 21743; + if (lcg_rand16(&seed) % 256 > 0) return; + + const char output_file[] = "tu_stats.txt"; + FILE *fout = fopen(output_file, "a"); + if (!fout) return; + + const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; + const MACROBLOCKD *const xd = &x->e_mbd; + const int plane = 0; + struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int txw = tx_size_wide[tx_size]; + const int txh = tx_size_high[tx_size]; + const int dequant_shift = (is_cur_buf_hbd(xd)) ? 
xd->bd - 5 : 3; + const int q_step = p->dequant_QTX[1] >> dequant_shift; + const int num_samples = txw * txh; + + const double rate_norm = (double)rd_stats->rate / num_samples; + const double dist_norm = (double)rd_stats->dist / num_samples; + + fprintf(fout, "%g %g", rate_norm, dist_norm); + + const int src_stride = p->src.stride; + const uint8_t *const src = + &p->src.buf[(blk_row * src_stride + blk_col) << MI_SIZE_LOG2]; + const int dst_stride = pd->dst.stride; + const uint8_t *const dst = + &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2]; + unsigned int sse; + cpi->ppi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse); + const double sse_norm = (double)sse / num_samples; + + const unsigned int sad = + cpi->ppi->fn_ptr[tx_bsize].sdf(src, src_stride, dst, dst_stride); + const double sad_norm = (double)sad / num_samples; + + fprintf(fout, " %g %g", sse_norm, sad_norm); + + const int diff_stride = block_size_wide[plane_bsize]; + const int16_t *const src_diff = + &p->src_diff[(blk_row * diff_stride + blk_col) << MI_SIZE_LOG2]; + + double sse_norm_arr[4], sad_norm_arr[4]; + get_2x2_normalized_sses_and_sads(cpi, tx_bsize, src, src_stride, dst, + dst_stride, src_diff, diff_stride, + sse_norm_arr, sad_norm_arr); + for (int i = 0; i < 4; ++i) { + fprintf(fout, " %g", sse_norm_arr[i]); + } + for (int i = 0; i < 4; ++i) { + fprintf(fout, " %g", sad_norm_arr[i]); + } + + const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type]; + const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type]; + + fprintf(fout, " %d %d %d %d %d", q_step, tx_size_wide[tx_size], + tx_size_high[tx_size], tx_type_1d_row, tx_type_1d_col); + + int model_rate; + int64_t model_dist; + model_rd_sse_fn[MODELRD_CURVFIT](cpi, x, tx_bsize, plane, sse, num_samples, + &model_rate, &model_dist); + const double model_rate_norm = (double)model_rate / num_samples; + const double model_dist_norm = (double)model_dist / num_samples; + fprintf(fout, " %g %g", model_rate_norm, model_dist_norm); + + const double mean = get_mean(src_diff, diff_stride, txw, txh); + float hor_corr, vert_corr; + av1_get_horver_correlation_full(src_diff, diff_stride, txw, txh, &hor_corr, + &vert_corr); + fprintf(fout, " %g %g %g", mean, hor_corr, vert_corr); + + double hdist[4] = { 0 }, vdist[4] = { 0 }; + get_energy_distribution_fine(cpi, tx_bsize, src, src_stride, dst, dst_stride, + 1, hdist, vdist); + fprintf(fout, " %g %g %g %g %g %g %g %g", hdist[0], hdist[1], hdist[2], + hdist[3], vdist[0], vdist[1], vdist[2], vdist[3]); + + fprintf(fout, " %d %" PRId64, x->rdmult, rd); + + fprintf(fout, "\n"); + fclose(fout); +} +#endif // CONFIG_COLLECT_RD_STATS == 1 + +#if CONFIG_COLLECT_RD_STATS >= 2 +static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x) { + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + const MACROBLOCKD *xd = &x->e_mbd; + const MB_MODE_INFO *mbmi = xd->mi[0]; + int64_t total_sse = 0; + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE bs = + get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y); + unsigned int sse; + + if (plane) continue; + + cpi->ppi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride, &sse); + total_sse += sse; + } + total_sse <<= 4; + return total_sse; +} + +static int get_est_rate_dist(const TileDataEnc *tile_data, BLOCK_SIZE bsize, + int64_t sse, int *est_residue_cost, + int64_t *est_dist) { + 
const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize]; + if (md->ready) { + if (sse < md->dist_mean) { + *est_residue_cost = 0; + *est_dist = sse; + } else { + *est_dist = (int64_t)round(md->dist_mean); + const double est_ld = md->a * sse + md->b; + // Clamp estimated rate cost by INT_MAX / 2. + // TODO(angiebird@google.com): find better solution than clamping. + if (fabs(est_ld) < 1e-2) { + *est_residue_cost = INT_MAX / 2; + } else { + double est_residue_cost_dbl = ((sse - md->dist_mean) / est_ld); + if (est_residue_cost_dbl < 0) { + *est_residue_cost = 0; + } else { + *est_residue_cost = + (int)AOMMIN((int64_t)round(est_residue_cost_dbl), INT_MAX / 2); + } + } + if (*est_residue_cost <= 0) { + *est_residue_cost = 0; + *est_dist = sse; + } + } + return 1; + } + return 0; +} + +static double get_highbd_diff_mean(const uint8_t *src8, int src_stride, + const uint8_t *dst8, int dst_stride, int w, + int h) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + double sum = 0.0; + for (int j = 0; j < h; ++j) { + for (int i = 0; i < w; ++i) { + const int diff = src[j * src_stride + i] - dst[j * dst_stride + i]; + sum += diff; + } + } + assert(w > 0 && h > 0); + return sum / (w * h); +} + +static double get_diff_mean(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, int w, int h) { + double sum = 0.0; + for (int j = 0; j < h; ++j) { + for (int i = 0; i < w; ++i) { + const int diff = src[j * src_stride + i] - dst[j * dst_stride + i]; + sum += diff; + } + } + assert(w > 0 && h > 0); + return sum / (w * h); +} + +static AOM_INLINE void PrintPredictionUnitStats(const AV1_COMP *const cpi, + const TileDataEnc *tile_data, + MACROBLOCK *x, + const RD_STATS *const rd_stats, + BLOCK_SIZE plane_bsize) { + if (rd_stats->rate == INT_MAX || rd_stats->dist == INT64_MAX) return; + + if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1 && + (tile_data == NULL || + !tile_data->inter_mode_rd_models[plane_bsize].ready)) + return; + (void)tile_data; + // Generate small sample to restrict output size. + static unsigned int seed = 95014; + + if ((lcg_rand16(&seed) % (1 << (14 - num_pels_log2_lookup[plane_bsize]))) != + 1) + return; + + const char output_file[] = "pu_stats.txt"; + FILE *fout = fopen(output_file, "a"); + if (!fout) return; + + MACROBLOCKD *const xd = &x->e_mbd; + const int plane = 0; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *pd = &xd->plane[plane]; + const int diff_stride = block_size_wide[plane_bsize]; + int bw, bh; + get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw, + &bh); + const int num_samples = bw * bh; + const int dequant_shift = (is_cur_buf_hbd(xd)) ? 
xd->bd - 5 : 3; + const int q_step = p->dequant_QTX[1] >> dequant_shift; + const int shift = (xd->bd - 8); + + const double rate_norm = (double)rd_stats->rate / num_samples; + const double dist_norm = (double)rd_stats->dist / num_samples; + const double rdcost_norm = + (double)RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) / num_samples; + + fprintf(fout, "%g %g %g", rate_norm, dist_norm, rdcost_norm); + + const int src_stride = p->src.stride; + const uint8_t *const src = p->src.buf; + const int dst_stride = pd->dst.stride; + const uint8_t *const dst = pd->dst.buf; + const int16_t *const src_diff = p->src_diff; + + int64_t sse = calculate_sse(xd, p, pd, bw, bh); + const double sse_norm = (double)sse / num_samples; + + const unsigned int sad = + cpi->ppi->fn_ptr[plane_bsize].sdf(src, src_stride, dst, dst_stride); + const double sad_norm = + (double)sad / (1 << num_pels_log2_lookup[plane_bsize]); + + fprintf(fout, " %g %g", sse_norm, sad_norm); + + double sse_norm_arr[4], sad_norm_arr[4]; + get_2x2_normalized_sses_and_sads(cpi, plane_bsize, src, src_stride, dst, + dst_stride, src_diff, diff_stride, + sse_norm_arr, sad_norm_arr); + if (shift) { + for (int k = 0; k < 4; ++k) sse_norm_arr[k] /= (1 << (2 * shift)); + for (int k = 0; k < 4; ++k) sad_norm_arr[k] /= (1 << shift); + } + for (int i = 0; i < 4; ++i) { + fprintf(fout, " %g", sse_norm_arr[i]); + } + for (int i = 0; i < 4; ++i) { + fprintf(fout, " %g", sad_norm_arr[i]); + } + + fprintf(fout, " %d %d %d %d", q_step, x->rdmult, bw, bh); + + int model_rate; + int64_t model_dist; + model_rd_sse_fn[MODELRD_CURVFIT](cpi, x, plane_bsize, plane, sse, num_samples, + &model_rate, &model_dist); + const double model_rdcost_norm = + (double)RDCOST(x->rdmult, model_rate, model_dist) / num_samples; + const double model_rate_norm = (double)model_rate / num_samples; + const double model_dist_norm = (double)model_dist / num_samples; + fprintf(fout, " %g %g %g", model_rate_norm, model_dist_norm, + model_rdcost_norm); + + double mean; + if (is_cur_buf_hbd(xd)) { + mean = get_highbd_diff_mean(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride, bw, bh); + } else { + mean = get_diff_mean(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, + bw, bh); + } + mean /= (1 << shift); + float hor_corr, vert_corr; + av1_get_horver_correlation_full(src_diff, diff_stride, bw, bh, &hor_corr, + &vert_corr); + fprintf(fout, " %g %g %g", mean, hor_corr, vert_corr); + + double hdist[4] = { 0 }, vdist[4] = { 0 }; + get_energy_distribution_fine(cpi, plane_bsize, src, src_stride, dst, + dst_stride, 1, hdist, vdist); + fprintf(fout, " %g %g %g %g %g %g %g %g", hdist[0], hdist[1], hdist[2], + hdist[3], vdist[0], vdist[1], vdist[2], vdist[3]); + + if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) { + assert(tile_data->inter_mode_rd_models[plane_bsize].ready); + const int64_t overall_sse = get_sse(cpi, x); + int est_residue_cost = 0; + int64_t est_dist = 0; + get_est_rate_dist(tile_data, plane_bsize, overall_sse, &est_residue_cost, + &est_dist); + const double est_residue_cost_norm = (double)est_residue_cost / num_samples; + const double est_dist_norm = (double)est_dist / num_samples; + const double est_rdcost_norm = + (double)RDCOST(x->rdmult, est_residue_cost, est_dist) / num_samples; + fprintf(fout, " %g %g %g", est_residue_cost_norm, est_dist_norm, + est_rdcost_norm); + } + + fprintf(fout, "\n"); + fclose(fout); +} +#endif // CONFIG_COLLECT_RD_STATS >= 2 +#endif // CONFIG_COLLECT_RD_STATS + +static AOM_INLINE void inverse_transform_block_facade(MACROBLOCK 
*const x, + int plane, int block, + int blk_row, int blk_col, + int eob, + int reduced_tx_set) { + if (!eob) return; + struct macroblock_plane *const p = &x->plane[plane]; + MACROBLOCKD *const xd = &x->e_mbd; + tran_low_t *dqcoeff = p->dqcoeff + BLOCK_OFFSET(block); + const PLANE_TYPE plane_type = get_plane_type(plane); + const TX_SIZE tx_size = av1_get_tx_size(plane, xd); + const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col, + tx_size, reduced_tx_set); + + struct macroblockd_plane *const pd = &xd->plane[plane]; + const int dst_stride = pd->dst.stride; + uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2]; + av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst, + dst_stride, eob, reduced_tx_set); +} + +static INLINE void recon_intra(const AV1_COMP *cpi, MACROBLOCK *x, int plane, + int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + const TXB_CTX *const txb_ctx, int skip_trellis, + TX_TYPE best_tx_type, int do_quant, + int *rate_cost, uint16_t best_eob) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + const int is_inter = is_inter_block(mbmi); + if (!is_inter && best_eob && + (blk_row + tx_size_high_unit[tx_size] < mi_size_high[plane_bsize] || + blk_col + tx_size_wide_unit[tx_size] < mi_size_wide[plane_bsize])) { + // if the quantized coefficients are stored in the dqcoeff buffer, we don't + // need to do transform and quantization again. + if (do_quant) { + TxfmParam txfm_param_intra; + QUANT_PARAM quant_param_intra; + av1_setup_xform(cm, x, tx_size, best_tx_type, &txfm_param_intra); + av1_setup_quant(tx_size, !skip_trellis, + skip_trellis + ? (USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B + : AV1_XFORM_QUANT_FP) + : AV1_XFORM_QUANT_FP, + cpi->oxcf.q_cfg.quant_b_adapt, &quant_param_intra); + av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, best_tx_type, + &quant_param_intra); + av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, + &txfm_param_intra, &quant_param_intra); + if (quant_param_intra.use_optimize_b) { + av1_optimize_b(cpi, x, plane, block, tx_size, best_tx_type, txb_ctx, + rate_cost); + } + } + + inverse_transform_block_facade(x, plane, block, blk_row, blk_col, + x->plane[plane].eobs[block], + cm->features.reduced_tx_set_used); + + // This may happen because of hash collision. The eob stored in the hash + // table is non-zero, but the real eob is zero. We need to make sure tx_type + // is DCT_DCT in this case. 
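+ // (Background, as the editor understands it: AV1 signals no transform type
+ // for a block with zero eob and the decoder then assumes DCT_DCT, so the
+ // tx_type map must be made to agree.)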
+ if (plane == 0 && x->plane[plane].eobs[block] == 0 && + best_tx_type != DCT_DCT) { + update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT); + } + } +} + +static unsigned pixel_dist_visible_only( + const AV1_COMP *const cpi, const MACROBLOCK *x, const uint8_t *src, + const int src_stride, const uint8_t *dst, const int dst_stride, + const BLOCK_SIZE tx_bsize, int txb_rows, int txb_cols, int visible_rows, + int visible_cols) { + unsigned sse; + + if (txb_rows == visible_rows && txb_cols == visible_cols) { + cpi->ppi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse); + return sse; + } + +#if CONFIG_AV1_HIGHBITDEPTH + const MACROBLOCKD *xd = &x->e_mbd; + if (is_cur_buf_hbd(xd)) { + uint64_t sse64 = aom_highbd_sse_odd_size(src, src_stride, dst, dst_stride, + visible_cols, visible_rows); + return (unsigned int)ROUND_POWER_OF_TWO(sse64, (xd->bd - 8) * 2); + } +#else + (void)x; +#endif + sse = aom_sse_odd_size(src, src_stride, dst, dst_stride, visible_cols, + visible_rows); + return sse; +} + +// Compute the pixel domain distortion from src and dst on all visible 4x4s in +// the +// transform block. +static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x, + int plane, const uint8_t *src, const int src_stride, + const uint8_t *dst, const int dst_stride, + int blk_row, int blk_col, + const BLOCK_SIZE plane_bsize, + const BLOCK_SIZE tx_bsize) { + int txb_rows, txb_cols, visible_rows, visible_cols; + const MACROBLOCKD *xd = &x->e_mbd; + + get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, + &txb_cols, &txb_rows, &visible_cols, &visible_rows); + assert(visible_rows > 0); + assert(visible_cols > 0); + + unsigned sse = pixel_dist_visible_only(cpi, x, src, src_stride, dst, + dst_stride, tx_bsize, txb_rows, + txb_cols, visible_rows, visible_cols); + + return sse; +} + +static INLINE int64_t dist_block_px_domain(const AV1_COMP *cpi, MACROBLOCK *x, + int plane, BLOCK_SIZE plane_bsize, + int block, int blk_row, int blk_col, + TX_SIZE tx_size) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblock_plane *const p = &x->plane[plane]; + const uint16_t eob = p->eobs[block]; + const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; + const int bsw = block_size_wide[tx_bsize]; + const int bsh = block_size_high[tx_bsize]; + const int src_stride = x->plane[plane].src.stride; + const int dst_stride = xd->plane[plane].dst.stride; + // Scale the transform block index to pixel unit. 
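+ // Worked example: blk_row/blk_col count 4x4 luma units and MI_SIZE_LOG2 is
+ // 2, so blk_row == 2, blk_col == 3 with src_stride == 128 gives
+ // src_idx == (2 * 128 + 3) << 2 == 1036.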
+ const int src_idx = (blk_row * src_stride + blk_col) << MI_SIZE_LOG2; + const int dst_idx = (blk_row * dst_stride + blk_col) << MI_SIZE_LOG2; + const uint8_t *src = &x->plane[plane].src.buf[src_idx]; + const uint8_t *dst = &xd->plane[plane].dst.buf[dst_idx]; + const tran_low_t *dqcoeff = p->dqcoeff + BLOCK_OFFSET(block); + + assert(cpi != NULL); + assert(tx_size_wide_log2[0] == tx_size_high_log2[0]); + + uint8_t *recon; + DECLARE_ALIGNED(16, uint16_t, recon16[MAX_TX_SQUARE]); + +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + recon = CONVERT_TO_BYTEPTR(recon16); + aom_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride, + CONVERT_TO_SHORTPTR(recon), MAX_TX_SIZE, bsw, bsh); + } else { + recon = (uint8_t *)recon16; + aom_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, bsw, bsh); + } +#else + recon = (uint8_t *)recon16; + aom_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, bsw, bsh); +#endif + + const PLANE_TYPE plane_type = get_plane_type(plane); + TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size, + cpi->common.features.reduced_tx_set_used); + av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, recon, + MAX_TX_SIZE, eob, + cpi->common.features.reduced_tx_set_used); + + return 16 * pixel_dist(cpi, x, plane, src, src_stride, recon, MAX_TX_SIZE, + blk_row, blk_col, plane_bsize, tx_bsize); +} + +// pruning thresholds for prune_txk_type and prune_txk_type_separ +static const int prune_factors[5] = { 200, 200, 120, 80, 40 }; // scale 1000 +static const int mul_factors[5] = { 80, 80, 70, 50, 30 }; // scale 100 + +// R-D costs are sorted in ascending order. +static INLINE void sort_rd(int64_t rds[], int txk[], int len) { + int i, j, k; + + for (i = 1; i <= len - 1; ++i) { + for (j = 0; j < i; ++j) { + if (rds[j] > rds[i]) { + int64_t temprd; + int tempi; + + temprd = rds[i]; + tempi = txk[i]; + + for (k = i; k > j; k--) { + rds[k] = rds[k - 1]; + txk[k] = txk[k - 1]; + } + + rds[j] = temprd; + txk[j] = tempi; + break; + } + } + } +} + +static INLINE int64_t av1_block_error_qm(const tran_low_t *coeff, + const tran_low_t *dqcoeff, + intptr_t block_size, + const qm_val_t *qmatrix, + const int16_t *scan, int64_t *ssz) { + int i; + int64_t error = 0, sqcoeff = 0; + + for (i = 0; i < block_size; i++) { + int64_t weight = qmatrix[scan[i]]; + int64_t dd = coeff[i] - dqcoeff[i]; + dd *= weight; + int64_t cc = coeff[i]; + cc *= weight; + // The ranges of coeff and dqcoeff are + // bd8 : 18 bits (including sign) + // bd10: 20 bits (including sign) + // bd12: 22 bits (including sign) + // As AOM_QM_BITS is 5, the intermediate quantities in the calculation + // below should fit in 54 bits, thus no overflow should happen. + error += (dd * dd + (1 << (2 * AOM_QM_BITS - 1))) >> (2 * AOM_QM_BITS); + sqcoeff += (cc * cc + (1 << (2 * AOM_QM_BITS - 1))) >> (2 * AOM_QM_BITS); + } + + *ssz = sqcoeff; + return error; +} + +static INLINE void dist_block_tx_domain(MACROBLOCK *x, int plane, int block, + TX_SIZE tx_size, + const qm_val_t *qmatrix, + const int16_t *scan, int64_t *out_dist, + int64_t *out_sse) { + const struct macroblock_plane *const p = &x->plane[plane]; + // Transform domain distortion computation is more efficient as it does + // not involve an inverse transform, but it is less accurate. 
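+ // Background: av1_get_max_eob() is the coefficient count of the transform
+ // block (e.g. 256 for TX_16X16); a 64-sample dimension counts as 32 since
+ // the high-frequency half of such transforms is zeroed, so TX_64X64 yields
+ // 1024. The code below compares coeff[] with dqcoeff[] directly rather
+ // than reconstructing pixels.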
+ const int buffer_length = av1_get_max_eob(tx_size); + int64_t this_sse; + // TX-domain results need to shift down to Q2/D10 to match pixel + // domain distortion values which are in Q2^2 + int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2; + const int block_offset = BLOCK_OFFSET(block); + tran_low_t *const coeff = p->coeff + block_offset; + tran_low_t *const dqcoeff = p->dqcoeff + block_offset; +#if CONFIG_AV1_HIGHBITDEPTH + MACROBLOCKD *const xd = &x->e_mbd; + if (is_cur_buf_hbd(xd)) { + // TODO(veluca): handle use_qm_dist_metric for HBD too. + *out_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length, &this_sse, + xd->bd); + } else { +#endif + if (qmatrix == NULL || !x->txfm_search_params.use_qm_dist_metric) { + *out_dist = av1_block_error(coeff, dqcoeff, buffer_length, &this_sse); + } else { + *out_dist = av1_block_error_qm(coeff, dqcoeff, buffer_length, qmatrix, + scan, &this_sse); + } +#if CONFIG_AV1_HIGHBITDEPTH + } +#endif + + *out_dist = RIGHT_SIGNED_SHIFT(*out_dist, shift); + *out_sse = RIGHT_SIGNED_SHIFT(this_sse, shift); +} + +uint16_t prune_txk_type_separ(const AV1_COMP *cpi, MACROBLOCK *x, int plane, + int block, TX_SIZE tx_size, int blk_row, + int blk_col, BLOCK_SIZE plane_bsize, int *txk_map, + int16_t allowed_tx_mask, int prune_factor, + const TXB_CTX *const txb_ctx, + int reduced_tx_set_used, int64_t ref_best_rd, + int num_sel) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + + int idx; + + int64_t rds_v[4]; + int64_t rds_h[4]; + int idx_v[4] = { 0, 1, 2, 3 }; + int idx_h[4] = { 0, 1, 2, 3 }; + int skip_v[4] = { 0 }; + int skip_h[4] = { 0 }; + const int idx_map[16] = { + DCT_DCT, DCT_ADST, DCT_FLIPADST, V_DCT, + ADST_DCT, ADST_ADST, ADST_FLIPADST, V_ADST, + FLIPADST_DCT, FLIPADST_ADST, FLIPADST_FLIPADST, V_FLIPADST, + H_DCT, H_ADST, H_FLIPADST, IDTX + }; + + const int sel_pattern_v[16] = { + 0, 0, 1, 1, 0, 2, 1, 2, 2, 0, 3, 1, 3, 2, 3, 3 + }; + const int sel_pattern_h[16] = { + 0, 1, 0, 1, 2, 0, 2, 1, 2, 3, 0, 3, 1, 3, 2, 3 + }; + + QUANT_PARAM quant_param; + TxfmParam txfm_param; + av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param); + av1_setup_quant(tx_size, 1, AV1_XFORM_QUANT_B, cpi->oxcf.q_cfg.quant_b_adapt, + &quant_param); + int tx_type; + // to ensure we can try ones even outside of ext_tx_set of current block + // this function should only be called for size < 16 + assert(txsize_sqr_up_map[tx_size] <= TX_16X16); + txfm_param.tx_set_type = EXT_TX_SET_ALL16; + + int rate_cost = 0; + int64_t dist = 0, sse = 0; + // evaluate horizontal with vertical DCT + for (idx = 0; idx < 4; ++idx) { + tx_type = idx_map[idx]; + txfm_param.tx_type = tx_type; + + av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type, + &quant_param); + + av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param, + &quant_param); + + const SCAN_ORDER *const scan_order = + get_scan(txfm_param.tx_size, txfm_param.tx_type); + dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix, + scan_order->scan, &dist, &sse); + + rate_cost = av1_cost_coeffs_txb_laplacian(x, plane, block, tx_size, tx_type, + txb_ctx, reduced_tx_set_used, 0); + + rds_h[idx] = RDCOST(x->rdmult, rate_cost, dist); + + if ((rds_h[idx] - (rds_h[idx] >> 2)) > ref_best_rd) { + skip_h[idx] = 1; + } + } + sort_rd(rds_h, idx_h, 4); + for (idx = 1; idx < 4; idx++) { + if (rds_h[idx] > rds_h[0] * 1.2) skip_h[idx_h[idx]] = 1; + } + + if (skip_h[idx_h[0]]) return (uint16_t)0xFFFF; + + // evaluate vertical with the best horizontal chosen + rds_v[0] = rds_h[0]; + 
int start_v = 1, end_v = 4; + const int *idx_map_v = idx_map + idx_h[0]; + + for (idx = start_v; idx < end_v; ++idx) { + tx_type = idx_map_v[idx_v[idx] * 4]; + txfm_param.tx_type = tx_type; + + av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type, + &quant_param); + + av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param, + &quant_param); + + const SCAN_ORDER *const scan_order = + get_scan(txfm_param.tx_size, txfm_param.tx_type); + dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix, + scan_order->scan, &dist, &sse); + + rate_cost = av1_cost_coeffs_txb_laplacian(x, plane, block, tx_size, tx_type, + txb_ctx, reduced_tx_set_used, 0); + + rds_v[idx] = RDCOST(x->rdmult, rate_cost, dist); + + if ((rds_v[idx] - (rds_v[idx] >> 2)) > ref_best_rd) { + skip_v[idx] = 1; + } + } + sort_rd(rds_v, idx_v, 4); + for (idx = 1; idx < 4; idx++) { + if (rds_v[idx] > rds_v[0] * 1.2) skip_v[idx_v[idx]] = 1; + } + + // combine rd_h and rd_v to prune tx candidates + int i_v, i_h; + int64_t rds[16]; + int num_cand = 0, last = TX_TYPES - 1; + + for (int i = 0; i < 16; i++) { + i_v = sel_pattern_v[i]; + i_h = sel_pattern_h[i]; + tx_type = idx_map[idx_v[i_v] * 4 + idx_h[i_h]]; + if (!(allowed_tx_mask & (1 << tx_type)) || skip_h[idx_h[i_h]] || + skip_v[idx_v[i_v]]) { + txk_map[last] = tx_type; + last--; + } else { + txk_map[num_cand] = tx_type; + rds[num_cand] = rds_v[i_v] + rds_h[i_h]; + if (rds[num_cand] == 0) rds[num_cand] = 1; + num_cand++; + } + } + sort_rd(rds, txk_map, num_cand); + + uint16_t prune = (uint16_t)(~(1 << txk_map[0])); + num_sel = AOMMIN(num_sel, num_cand); + + for (int i = 1; i < num_sel; i++) { + int64_t factor = 1800 * (rds[i] - rds[0]) / (rds[0]); + if (factor < (int64_t)prune_factor) + prune &= ~(1 << txk_map[i]); + else + break; + } + return prune; +} + +uint16_t prune_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, + int block, TX_SIZE tx_size, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, int *txk_map, + uint16_t allowed_tx_mask, int prune_factor, + const TXB_CTX *const txb_ctx, int reduced_tx_set_used) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + int tx_type; + + int64_t rds[TX_TYPES]; + + int num_cand = 0; + int last = TX_TYPES - 1; + + TxfmParam txfm_param; + QUANT_PARAM quant_param; + av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param); + av1_setup_quant(tx_size, 1, AV1_XFORM_QUANT_B, cpi->oxcf.q_cfg.quant_b_adapt, + &quant_param); + + for (int idx = 0; idx < TX_TYPES; idx++) { + tx_type = idx; + int rate_cost = 0; + int64_t dist = 0, sse = 0; + if (!(allowed_tx_mask & (1 << tx_type))) { + txk_map[last] = tx_type; + last--; + continue; + } + txfm_param.tx_type = tx_type; + + av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type, + &quant_param); + + // do txfm and quantization + av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param, + &quant_param); + // estimate rate cost + rate_cost = av1_cost_coeffs_txb_laplacian(x, plane, block, tx_size, tx_type, + txb_ctx, reduced_tx_set_used, 0); + // tx domain dist + const SCAN_ORDER *const scan_order = + get_scan(txfm_param.tx_size, txfm_param.tx_type); + dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix, + scan_order->scan, &dist, &sse); + + txk_map[num_cand] = tx_type; + rds[num_cand] = RDCOST(x->rdmult, rate_cost, dist); + if (rds[num_cand] == 0) rds[num_cand] = 1; + num_cand++; + } + + if (num_cand == 0) return (uint16_t)0xFFFF; + + sort_rd(rds, txk_map, num_cand); + uint16_t prune = (uint16_t)(~(1 
<< txk_map[0])); + + // 0 < prune_factor <= 1000 controls aggressiveness + int64_t factor = 0; + for (int idx = 1; idx < num_cand; idx++) { + factor = 1000 * (rds[idx] - rds[0]) / rds[0]; + if (factor < (int64_t)prune_factor) + prune &= ~(1 << txk_map[idx]); + else + break; + } + return prune; +} + +// These thresholds were calibrated to provide a certain number of TX types +// pruned by the model on average, i.e. selecting a threshold with index i +// will lead to pruning i+1 TX types on average +static const float *prune_2D_adaptive_thresholds[] = { + // TX_4X4 + (float[]){ 0.00549f, 0.01306f, 0.02039f, 0.02747f, 0.03406f, 0.04065f, + 0.04724f, 0.05383f, 0.06067f, 0.06799f, 0.07605f, 0.08533f, + 0.09778f, 0.11780f }, + // TX_8X8 + (float[]){ 0.00037f, 0.00183f, 0.00525f, 0.01038f, 0.01697f, 0.02502f, + 0.03381f, 0.04333f, 0.05286f, 0.06287f, 0.07434f, 0.08850f, + 0.10803f, 0.14124f }, + // TX_16X16 + (float[]){ 0.01404f, 0.02000f, 0.04211f, 0.05164f, 0.05798f, 0.06335f, + 0.06897f, 0.07629f, 0.08875f, 0.11169f }, + // TX_32X32 + NULL, + // TX_64X64 + NULL, + // TX_4X8 + (float[]){ 0.00183f, 0.00745f, 0.01428f, 0.02185f, 0.02966f, 0.03723f, + 0.04456f, 0.05188f, 0.05920f, 0.06702f, 0.07605f, 0.08704f, + 0.10168f, 0.12585f }, + // TX_8X4 + (float[]){ 0.00085f, 0.00476f, 0.01135f, 0.01892f, 0.02698f, 0.03528f, + 0.04358f, 0.05164f, 0.05994f, 0.06848f, 0.07849f, 0.09021f, + 0.10583f, 0.13123f }, + // TX_8X16 + (float[]){ 0.00037f, 0.00232f, 0.00671f, 0.01257f, 0.01965f, 0.02722f, + 0.03552f, 0.04382f, 0.05237f, 0.06189f, 0.07336f, 0.08728f, + 0.10730f, 0.14221f }, + // TX_16X8 + (float[]){ 0.00061f, 0.00330f, 0.00818f, 0.01453f, 0.02185f, 0.02966f, + 0.03772f, 0.04578f, 0.05383f, 0.06262f, 0.07288f, 0.08582f, + 0.10339f, 0.13464f }, + // TX_16X32 + NULL, + // TX_32X16 + NULL, + // TX_32X64 + NULL, + // TX_64X32 + NULL, + // TX_4X16 + (float[]){ 0.00232f, 0.00671f, 0.01257f, 0.01941f, 0.02673f, 0.03430f, + 0.04211f, 0.04968f, 0.05750f, 0.06580f, 0.07507f, 0.08655f, + 0.10242f, 0.12878f }, + // TX_16X4 + (float[]){ 0.00110f, 0.00525f, 0.01208f, 0.01990f, 0.02795f, 0.03601f, + 0.04358f, 0.05115f, 0.05896f, 0.06702f, 0.07629f, 0.08752f, + 0.10217f, 0.12610f }, + // TX_8X32 + NULL, + // TX_32X8 + NULL, + // TX_16X64 + NULL, + // TX_64X16 + NULL, +}; + +static INLINE float get_adaptive_thresholds( + TX_SIZE tx_size, TxSetType tx_set_type, + TX_TYPE_PRUNE_MODE prune_2d_txfm_mode) { + const int prune_aggr_table[5][2] = { + { 4, 1 }, { 6, 3 }, { 9, 6 }, { 9, 6 }, { 12, 9 } + }; + int pruning_aggressiveness = 0; + if (tx_set_type == EXT_TX_SET_ALL16) + pruning_aggressiveness = + prune_aggr_table[prune_2d_txfm_mode - TX_TYPE_PRUNE_1][0]; + else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT) + pruning_aggressiveness = + prune_aggr_table[prune_2d_txfm_mode - TX_TYPE_PRUNE_1][1]; + + return prune_2D_adaptive_thresholds[tx_size][pruning_aggressiveness]; +} + +static AOM_INLINE void get_energy_distribution_finer(const int16_t *diff, + int stride, int bw, int bh, + float *hordist, + float *verdist) { + // First compute downscaled block energy values (esq); downscale factors + // are defined by w_shift and h_shift. + unsigned int esq[256]; + const int w_shift = bw <= 8 ? 0 : 1; + const int h_shift = bh <= 8 ? 
0 : 1; + const int esq_w = bw >> w_shift; + const int esq_h = bh >> h_shift; + const int esq_sz = esq_w * esq_h; + int i, j; + memset(esq, 0, esq_sz * sizeof(esq[0])); + if (w_shift) { + for (i = 0; i < bh; i++) { + unsigned int *cur_esq_row = esq + (i >> h_shift) * esq_w; + const int16_t *cur_diff_row = diff + i * stride; + for (j = 0; j < bw; j += 2) { + cur_esq_row[j >> 1] += (cur_diff_row[j] * cur_diff_row[j] + + cur_diff_row[j + 1] * cur_diff_row[j + 1]); + } + } + } else { + for (i = 0; i < bh; i++) { + unsigned int *cur_esq_row = esq + (i >> h_shift) * esq_w; + const int16_t *cur_diff_row = diff + i * stride; + for (j = 0; j < bw; j++) { + cur_esq_row[j] += cur_diff_row[j] * cur_diff_row[j]; + } + } + } + + uint64_t total = 0; + for (i = 0; i < esq_sz; i++) total += esq[i]; + + // Output hordist and verdist arrays are normalized 1D projections of esq + if (total == 0) { + float hor_val = 1.0f / esq_w; + for (j = 0; j < esq_w - 1; j++) hordist[j] = hor_val; + float ver_val = 1.0f / esq_h; + for (i = 0; i < esq_h - 1; i++) verdist[i] = ver_val; + return; + } + + const float e_recip = 1.0f / (float)total; + memset(hordist, 0, (esq_w - 1) * sizeof(hordist[0])); + memset(verdist, 0, (esq_h - 1) * sizeof(verdist[0])); + const unsigned int *cur_esq_row; + for (i = 0; i < esq_h - 1; i++) { + cur_esq_row = esq + i * esq_w; + for (j = 0; j < esq_w - 1; j++) { + hordist[j] += (float)cur_esq_row[j]; + verdist[i] += (float)cur_esq_row[j]; + } + verdist[i] += (float)cur_esq_row[j]; + } + cur_esq_row = esq + i * esq_w; + for (j = 0; j < esq_w - 1; j++) hordist[j] += (float)cur_esq_row[j]; + + for (j = 0; j < esq_w - 1; j++) hordist[j] *= e_recip; + for (i = 0; i < esq_h - 1; i++) verdist[i] *= e_recip; +} + +static AOM_INLINE bool check_bit_mask(uint16_t mask, int val) { + return mask & (1 << val); +} + +static AOM_INLINE void set_bit_mask(uint16_t *mask, int val) { + *mask |= (1 << val); +} + +static AOM_INLINE void unset_bit_mask(uint16_t *mask, int val) { + *mask &= ~(1 << val); +} + +static void prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size, + int blk_row, int blk_col, TxSetType tx_set_type, + TX_TYPE_PRUNE_MODE prune_2d_txfm_mode, int *txk_map, + uint16_t *allowed_tx_mask) { + // This table is used because the search order is different from the enum + // order. + static const int tx_type_table_2D[16] = { + DCT_DCT, DCT_ADST, DCT_FLIPADST, V_DCT, + ADST_DCT, ADST_ADST, ADST_FLIPADST, V_ADST, + FLIPADST_DCT, FLIPADST_ADST, FLIPADST_FLIPADST, V_FLIPADST, + H_DCT, H_ADST, H_FLIPADST, IDTX + }; + if (tx_set_type != EXT_TX_SET_ALL16 && + tx_set_type != EXT_TX_SET_DTT9_IDTX_1DDCT) + return; +#if CONFIG_NN_V2 + NN_CONFIG_V2 *nn_config_hor = av1_tx_type_nnconfig_map_hor[tx_size]; + NN_CONFIG_V2 *nn_config_ver = av1_tx_type_nnconfig_map_ver[tx_size]; +#else + const NN_CONFIG *nn_config_hor = av1_tx_type_nnconfig_map_hor[tx_size]; + const NN_CONFIG *nn_config_ver = av1_tx_type_nnconfig_map_ver[tx_size]; +#endif + if (!nn_config_hor || !nn_config_ver) return; // Model not established yet. + + float hfeatures[16], vfeatures[16]; + float hscores[4], vscores[4]; + float scores_2D_raw[16]; + const int bw = tx_size_wide[tx_size]; + const int bh = tx_size_high[tx_size]; + const int hfeatures_num = bw <= 8 ? bw : bw / 2; + const int vfeatures_num = bh <= 8 ? 
bh : bh / 2; + assert(hfeatures_num <= 16); + assert(vfeatures_num <= 16); + + const struct macroblock_plane *const p = &x->plane[0]; + const int diff_stride = block_size_wide[bsize]; + const int16_t *diff = p->src_diff + 4 * blk_row * diff_stride + 4 * blk_col; + get_energy_distribution_finer(diff, diff_stride, bw, bh, hfeatures, + vfeatures); + + av1_get_horver_correlation_full(diff, diff_stride, bw, bh, + &hfeatures[hfeatures_num - 1], + &vfeatures[vfeatures_num - 1]); + +#if CONFIG_NN_V2 + av1_nn_predict_v2(hfeatures, nn_config_hor, 0, hscores); + av1_nn_predict_v2(vfeatures, nn_config_ver, 0, vscores); +#else + av1_nn_predict(hfeatures, nn_config_hor, 1, hscores); + av1_nn_predict(vfeatures, nn_config_ver, 1, vscores); +#endif + + for (int i = 0; i < 4; i++) { + float *cur_scores_2D = scores_2D_raw + i * 4; + cur_scores_2D[0] = vscores[i] * hscores[0]; + cur_scores_2D[1] = vscores[i] * hscores[1]; + cur_scores_2D[2] = vscores[i] * hscores[2]; + cur_scores_2D[3] = vscores[i] * hscores[3]; + } + + assert(TX_TYPES == 16); + // This version of the function only works when there are at most 16 classes. + // So we will need to change the optimization or use av1_nn_softmax instead if + // this ever gets changed. + av1_nn_fast_softmax_16(scores_2D_raw, scores_2D_raw); + + const float score_thresh = + get_adaptive_thresholds(tx_size, tx_set_type, prune_2d_txfm_mode); + + // Always keep the TX type with the highest score, prune all others with + // score below score_thresh. + int max_score_i = 0; + float max_score = 0.0f; + uint16_t allow_bitmask = 0; + float sum_score = 0.0; + // Calculate sum of allowed tx type score and Populate allow bit mask based + // on score_thresh and allowed_tx_mask + int allow_count = 0; + int tx_type_allowed[16] = { TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID, + TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID, + TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID, + TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID, + TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID, + TX_TYPE_INVALID }; + float scores_2D[16] = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + }; + for (int tx_idx = 0; tx_idx < TX_TYPES; tx_idx++) { + const int allow_tx_type = + check_bit_mask(*allowed_tx_mask, tx_type_table_2D[tx_idx]); + if (!allow_tx_type) { + continue; + } + if (scores_2D_raw[tx_idx] > max_score) { + max_score = scores_2D_raw[tx_idx]; + max_score_i = tx_idx; + } + if (scores_2D_raw[tx_idx] >= score_thresh) { + // Set allow mask based on score_thresh + set_bit_mask(&allow_bitmask, tx_type_table_2D[tx_idx]); + + // Accumulate score of allowed tx type + sum_score += scores_2D_raw[tx_idx]; + + scores_2D[allow_count] = scores_2D_raw[tx_idx]; + tx_type_allowed[allow_count] = tx_type_table_2D[tx_idx]; + allow_count += 1; + } + } + if (!check_bit_mask(allow_bitmask, tx_type_table_2D[max_score_i])) { + // If even the tx_type with max score is pruned, this means that no other + // tx_type is feasible. When this happens, we force enable max_score_i and + // end the search. 
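+ // In this case txk_map is simply left in the model's fixed search order,
+ // as there are no per-type RD estimates left to sort by.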
+ set_bit_mask(&allow_bitmask, tx_type_table_2D[max_score_i]); + memcpy(txk_map, tx_type_table_2D, sizeof(tx_type_table_2D)); + *allowed_tx_mask = allow_bitmask; + return; + } + + // Sort tx type probability of all types + if (allow_count <= 8) { + av1_sort_fi32_8(scores_2D, tx_type_allowed); + } else { + av1_sort_fi32_16(scores_2D, tx_type_allowed); + } + + // Enable more pruning based on tx type probability and number of allowed tx + // types + if (prune_2d_txfm_mode >= TX_TYPE_PRUNE_4) { + float temp_score = 0.0; + float score_ratio = 0.0; + int tx_idx, tx_count = 0; + const float inv_sum_score = 100 / sum_score; + // Get allowed tx types based on sorted probability score and tx count + for (tx_idx = 0; tx_idx < allow_count; tx_idx++) { + // Skip the tx type which has more than 30% of cumulative + // probability and allowed tx type count is more than 2 + if (score_ratio > 30.0 && tx_count >= 2) break; + + assert(check_bit_mask(allow_bitmask, tx_type_allowed[tx_idx])); + // Calculate cumulative probability + temp_score += scores_2D[tx_idx]; + + // Calculate percentage of cumulative probability of allowed tx type + score_ratio = temp_score * inv_sum_score; + tx_count++; + } + // Set remaining tx types as pruned + for (; tx_idx < allow_count; tx_idx++) + unset_bit_mask(&allow_bitmask, tx_type_allowed[tx_idx]); + } + + memcpy(txk_map, tx_type_allowed, sizeof(tx_type_table_2D)); + *allowed_tx_mask = allow_bitmask; +} + +static float get_dev(float mean, double x2_sum, int num) { + const float e_x2 = (float)(x2_sum / num); + const float diff = e_x2 - mean * mean; + const float dev = (diff > 0) ? sqrtf(diff) : 0; + return dev; +} + +// Writes the features required by the ML model to predict tx split based on +// mean and standard deviation values of the block and sub-blocks. +// Returns the number of elements written to the output array which is at most +// 12 currently. Hence 'features' buffer should be able to accommodate at least +// 12 elements. +static AOM_INLINE int get_mean_dev_features(const int16_t *data, int stride, + int bw, int bh, float *features) { + const int16_t *const data_ptr = &data[0]; + const int subh = (bh >= bw) ? (bh >> 1) : bh; + const int subw = (bw >= bh) ? (bw >> 1) : bw; + const int num = bw * bh; + const int sub_num = subw * subh; + int feature_idx = 2; + int total_x_sum = 0; + int64_t total_x2_sum = 0; + int num_sub_blks = 0; + double mean2_sum = 0.0f; + float dev_sum = 0.0f; + + for (int row = 0; row < bh; row += subh) { + for (int col = 0; col < bw; col += subw) { + int x_sum; + int64_t x2_sum; + // TODO(any): Write a SIMD version. Clear registers. + aom_get_blk_sse_sum(data_ptr + row * stride + col, stride, subw, subh, + &x_sum, &x2_sum); + total_x_sum += x_sum; + total_x2_sum += x2_sum; + + const float mean = (float)x_sum / sub_num; + const float dev = get_dev(mean, (double)x2_sum, sub_num); + features[feature_idx++] = mean; + features[feature_idx++] = dev; + mean2_sum += (double)(mean * mean); + dev_sum += dev; + num_sub_blks++; + } + } + + const float lvl0_mean = (float)total_x_sum / num; + features[0] = lvl0_mean; + features[1] = get_dev(lvl0_mean, (double)total_x2_sum, num); + + // Deviation of means. + features[feature_idx++] = get_dev(lvl0_mean, mean2_sum, num_sub_blks); + // Mean of deviations. 
+ features[feature_idx++] = dev_sum / num_sub_blks; + + return feature_idx; +} + +static int ml_predict_tx_split(MACROBLOCK *x, BLOCK_SIZE bsize, int blk_row, + int blk_col, TX_SIZE tx_size) { + const NN_CONFIG *nn_config = av1_tx_split_nnconfig_map[tx_size]; + if (!nn_config) return -1; + + const int diff_stride = block_size_wide[bsize]; + const int16_t *diff = + x->plane[0].src_diff + 4 * blk_row * diff_stride + 4 * blk_col; + const int bw = tx_size_wide[tx_size]; + const int bh = tx_size_high[tx_size]; + + float features[64] = { 0.0f }; + get_mean_dev_features(diff, diff_stride, bw, bh, features); + + float score = 0.0f; + av1_nn_predict(features, nn_config, 1, &score); + + int int_score = (int)(score * 10000); + return clamp(int_score, -80000, 80000); +} + +static INLINE uint16_t +get_tx_mask(const AV1_COMP *cpi, MACROBLOCK *x, int plane, int block, + int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + const TXB_CTX *const txb_ctx, FAST_TX_SEARCH_MODE ftxs_mode, + int64_t ref_best_rd, TX_TYPE *allowed_txk_types, int *txk_map) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + const TxfmSearchParams *txfm_params = &x->txfm_search_params; + const int is_inter = is_inter_block(mbmi); + const int fast_tx_search = ftxs_mode & FTXS_DCT_AND_1D_DCT_ONLY; + // if txk_allowed = TX_TYPES, >1 tx types are allowed, else, if txk_allowed < + // TX_TYPES, only that specific tx type is allowed. + TX_TYPE txk_allowed = TX_TYPES; + + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + int use_actual_frame_probs = 1; + const int *tx_type_probs; +#if CONFIG_FPMT_TEST + use_actual_frame_probs = + (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 0 : 1; + if (!use_actual_frame_probs) { + tx_type_probs = + (int *)cpi->ppi->temp_frame_probs.tx_type_probs[update_type][tx_size]; + } +#endif + if (use_actual_frame_probs) { + tx_type_probs = cpi->ppi->frame_probs.tx_type_probs[update_type][tx_size]; + } + + if ((!is_inter && txfm_params->use_default_intra_tx_type) || + (is_inter && txfm_params->default_inter_tx_type_prob_thresh == 0)) { + txk_allowed = + get_default_tx_type(0, xd, tx_size, cpi->use_screen_content_tools); + } else if (is_inter && + txfm_params->default_inter_tx_type_prob_thresh != INT_MAX) { + if (tx_type_probs[DEFAULT_INTER_TX_TYPE] > + txfm_params->default_inter_tx_type_prob_thresh) { + txk_allowed = DEFAULT_INTER_TX_TYPE; + } else { + int force_tx_type = 0; + int max_prob = 0; + const int tx_type_prob_threshold = + txfm_params->default_inter_tx_type_prob_thresh + + PROB_THRESH_OFFSET_TX_TYPE; + for (int i = 1; i < TX_TYPES; i++) { // find maximum probability. + if (tx_type_probs[i] > max_prob) { + max_prob = tx_type_probs[i]; + force_tx_type = i; + } + } + if (max_prob > tx_type_prob_threshold) // force tx type with max prob. 
+ txk_allowed = force_tx_type; + else if (x->rd_model == LOW_TXFM_RD) { + if (plane == 0) txk_allowed = DCT_DCT; + } + } + } else if (x->rd_model == LOW_TXFM_RD) { + if (plane == 0) txk_allowed = DCT_DCT; + } + + const TxSetType tx_set_type = av1_get_ext_tx_set_type( + tx_size, is_inter, cm->features.reduced_tx_set_used); + + TX_TYPE uv_tx_type = DCT_DCT; + if (plane) { + // tx_type of PLANE_TYPE_UV should be the same as PLANE_TYPE_Y + uv_tx_type = txk_allowed = + av1_get_tx_type(xd, get_plane_type(plane), blk_row, blk_col, tx_size, + cm->features.reduced_tx_set_used); + } + PREDICTION_MODE intra_dir = + mbmi->filter_intra_mode_info.use_filter_intra + ? fimode_to_intradir[mbmi->filter_intra_mode_info.filter_intra_mode] + : mbmi->mode; + uint16_t ext_tx_used_flag = + cpi->sf.tx_sf.tx_type_search.use_reduced_intra_txset != 0 && + tx_set_type == EXT_TX_SET_DTT4_IDTX_1DDCT + ? av1_reduced_intra_tx_used_flag[intra_dir] + : av1_ext_tx_used_flag[tx_set_type]; + + if (cpi->sf.tx_sf.tx_type_search.use_reduced_intra_txset == 2) + ext_tx_used_flag &= av1_derived_intra_tx_used_flag[intra_dir]; + + if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32 || + ext_tx_used_flag == 0x0001 || + (is_inter && cpi->oxcf.txfm_cfg.use_inter_dct_only) || + (!is_inter && cpi->oxcf.txfm_cfg.use_intra_dct_only)) { + txk_allowed = DCT_DCT; + } + + if (cpi->oxcf.txfm_cfg.enable_flip_idtx == 0) + ext_tx_used_flag &= DCT_ADST_TX_MASK; + + uint16_t allowed_tx_mask = 0; // 1: allow; 0: skip. + if (txk_allowed < TX_TYPES) { + allowed_tx_mask = 1 << txk_allowed; + allowed_tx_mask &= ext_tx_used_flag; + } else if (fast_tx_search) { + allowed_tx_mask = 0x0c01; // V_DCT, H_DCT, DCT_DCT + allowed_tx_mask &= ext_tx_used_flag; + } else { + assert(plane == 0); + allowed_tx_mask = ext_tx_used_flag; + int num_allowed = 0; + int i; + + if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) { + static const int thresh_arr[2][7] = { { 10, 15, 15, 10, 15, 15, 15 }, + { 10, 17, 17, 10, 17, 17, 17 } }; + const int thresh = + thresh_arr[cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats - 1] + [update_type]; + uint16_t prune = 0; + int max_prob = -1; + int max_idx = 0; + for (i = 0; i < TX_TYPES; i++) { + if (tx_type_probs[i] > max_prob && (allowed_tx_mask & (1 << i))) { + max_prob = tx_type_probs[i]; + max_idx = i; + } + if (tx_type_probs[i] < thresh) prune |= (1 << i); + } + if ((prune >> max_idx) & 0x01) prune &= ~(1 << max_idx); + allowed_tx_mask &= (~prune); + } + for (i = 0; i < TX_TYPES; i++) { + if (allowed_tx_mask & (1 << i)) num_allowed++; + } + assert(num_allowed > 0); + + if (num_allowed > 2 && cpi->sf.tx_sf.tx_type_search.prune_tx_type_est_rd) { + int pf = prune_factors[txfm_params->prune_2d_txfm_mode]; + int mf = mul_factors[txfm_params->prune_2d_txfm_mode]; + if (num_allowed <= 7) { + const uint16_t prune = + prune_txk_type(cpi, x, plane, block, tx_size, blk_row, blk_col, + plane_bsize, txk_map, allowed_tx_mask, pf, txb_ctx, + cm->features.reduced_tx_set_used); + allowed_tx_mask &= (~prune); + } else { + const int num_sel = (num_allowed * mf + 50) / 100; + const uint16_t prune = prune_txk_type_separ( + cpi, x, plane, block, tx_size, blk_row, blk_col, plane_bsize, + txk_map, allowed_tx_mask, pf, txb_ctx, + cm->features.reduced_tx_set_used, ref_best_rd, num_sel); + + allowed_tx_mask &= (~prune); + } + } else { + assert(num_allowed > 0); + int allowed_tx_count = + (txfm_params->prune_2d_txfm_mode >= TX_TYPE_PRUNE_4) ? 
1 : 5;
+ // !fast_tx_search && txk_end != txk_start && plane == 0
+ if (txfm_params->prune_2d_txfm_mode >= TX_TYPE_PRUNE_1 && is_inter &&
+ num_allowed > allowed_tx_count) {
+ prune_tx_2D(x, plane_bsize, tx_size, blk_row, blk_col, tx_set_type,
+ txfm_params->prune_2d_txfm_mode, txk_map, &allowed_tx_mask);
+ }
+ }
+ }
+
+ // Need to have at least one transform type allowed.
+ if (allowed_tx_mask == 0) {
+ txk_allowed = (plane ? uv_tx_type : DCT_DCT);
+ allowed_tx_mask = (1 << txk_allowed);
+ }
+
+ assert(IMPLIES(txk_allowed < TX_TYPES, allowed_tx_mask == 1 << txk_allowed));
+ *allowed_txk_types = txk_allowed;
+ return allowed_tx_mask;
+}
+
+#if CONFIG_RD_DEBUG
+static INLINE void update_txb_coeff_cost(RD_STATS *rd_stats, int plane,
+ int txb_coeff_cost) {
+ rd_stats->txb_coeff_cost[plane] += txb_coeff_cost;
+}
+#endif
+
+static INLINE int cost_coeffs(MACROBLOCK *x, int plane, int block,
+ TX_SIZE tx_size, const TX_TYPE tx_type,
+ const TXB_CTX *const txb_ctx,
+ int reduced_tx_set_used) {
+#if TXCOEFF_COST_TIMER
+ struct aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+#endif
+ const int cost = av1_cost_coeffs_txb(x, plane, block, tx_size, tx_type,
+ txb_ctx, reduced_tx_set_used);
+#if TXCOEFF_COST_TIMER
+ AV1_COMMON *tmp_cm = (AV1_COMMON *)&cpi->common;
+ aom_usec_timer_mark(&timer);
+ const int64_t elapsed_time = aom_usec_timer_elapsed(&timer);
+ tmp_cm->txcoeff_cost_timer += elapsed_time;
+ ++tmp_cm->txcoeff_cost_count;
+#endif
+ return cost;
+}
+
+static int skip_trellis_opt_based_on_satd(MACROBLOCK *x,
+ QUANT_PARAM *quant_param, int plane,
+ int block, TX_SIZE tx_size,
+ int quant_b_adapt, int qstep,
+ unsigned int coeff_opt_satd_threshold,
+ int skip_trellis, int dc_only_blk) {
+ if (skip_trellis || (coeff_opt_satd_threshold == UINT_MAX))
+ return skip_trellis;
+
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const int block_offset = BLOCK_OFFSET(block);
+ tran_low_t *const coeff_ptr = p->coeff + block_offset;
+ const int n_coeffs = av1_get_max_eob(tx_size);
+ const int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size));
+ int satd = (dc_only_blk) ? abs(coeff_ptr[0]) : aom_satd(coeff_ptr, n_coeffs);
+ satd = RIGHT_SIGNED_SHIFT(satd, shift);
+ satd >>= (x->e_mbd.bd - 8);
+
+ const int skip_block_trellis =
+ ((uint64_t)satd >
+ (uint64_t)coeff_opt_satd_threshold * qstep * sqrt_tx_pixels_2d[tx_size]);
+
+ av1_setup_quant(
+ tx_size, !skip_block_trellis,
+ skip_block_trellis
+ ? (USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP)
+ : AV1_XFORM_QUANT_FP,
+ quant_b_adapt, quant_param);
+
+ return skip_block_trellis;
+}
+
+// Predict DC only blocks if the residual variance is below a qstep based
+// threshold. For such blocks, transform type search is bypassed.
+static INLINE void predict_dc_only_block(
+ MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ int block, int blk_row, int blk_col, RD_STATS *best_rd_stats,
+ int64_t *block_sse, unsigned int *block_mse_q8, int64_t *per_px_mean,
+ int *dc_only_blk) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const int dequant_shift = (is_cur_buf_hbd(xd)) ?
xd->bd - 5 : 3; + const int qstep = x->plane[plane].dequant_QTX[1] >> dequant_shift; + uint64_t block_var = UINT64_MAX; + const int dc_qstep = x->plane[plane].dequant_QTX[0] >> 3; + *block_sse = pixel_diff_stats(x, plane, blk_row, blk_col, plane_bsize, + txsize_to_bsize[tx_size], block_mse_q8, + per_px_mean, &block_var); + assert((*block_mse_q8) != UINT_MAX); + uint64_t var_threshold = (uint64_t)(1.8 * qstep * qstep); + if (is_cur_buf_hbd(xd)) + block_var = ROUND_POWER_OF_TWO(block_var, (xd->bd - 8) * 2); + + if (block_var >= var_threshold) return; + const unsigned int predict_dc_level = x->txfm_search_params.predict_dc_level; + assert(predict_dc_level != 0); + + // Prediction of skip block if residual mean and variance are less + // than qstep based threshold + if ((llabs(*per_px_mean) * dc_coeff_scale[tx_size]) < (dc_qstep << 12)) { + // If the normalized mean of residual block is less than the dc qstep and + // the normalized block variance is less than ac qstep, then the block is + // assumed to be a skip block and its rdcost is updated accordingly. + best_rd_stats->skip_txfm = 1; + + x->plane[plane].eobs[block] = 0; + + if (is_cur_buf_hbd(xd)) + *block_sse = ROUND_POWER_OF_TWO((*block_sse), (xd->bd - 8) * 2); + + best_rd_stats->dist = (*block_sse) << 4; + best_rd_stats->sse = best_rd_stats->dist; + + ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE]; + ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE]; + av1_get_entropy_contexts(plane_bsize, &xd->plane[plane], ctxa, ctxl); + ENTROPY_CONTEXT *ta = ctxa; + ENTROPY_CONTEXT *tl = ctxl; + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + TXB_CTX txb_ctx_tmp; + const PLANE_TYPE plane_type = get_plane_type(plane); + get_txb_ctx(plane_bsize, tx_size, plane, ta, tl, &txb_ctx_tmp); + const int zero_blk_rate = x->coeff_costs.coeff_costs[txs_ctx][plane_type] + .txb_skip_cost[txb_ctx_tmp.txb_skip_ctx][1]; + best_rd_stats->rate = zero_blk_rate; + + best_rd_stats->rdcost = + RDCOST(x->rdmult, best_rd_stats->rate, best_rd_stats->sse); + + x->plane[plane].txb_entropy_ctx[block] = 0; + } else if (predict_dc_level > 1) { + // Predict DC only blocks based on residual variance. + // For chroma plane, this prediction is disabled for intra blocks. + if ((plane == 0) || (plane > 0 && is_inter_block(mbmi))) *dc_only_blk = 1; + } +} + +// Search for the best transform type for a given transform block. +// This function can be used for both inter and intra, both luma and chroma. +static void search_tx_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, + int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + const TXB_CTX *const txb_ctx, + FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis, + int64_t ref_best_rd, RD_STATS *best_rd_stats) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + const TxfmSearchParams *txfm_params = &x->txfm_search_params; + int64_t best_rd = INT64_MAX; + uint16_t best_eob = 0; + TX_TYPE best_tx_type = DCT_DCT; + int rate_cost = 0; + struct macroblock_plane *const p = &x->plane[plane]; + tran_low_t *orig_dqcoeff = p->dqcoeff; + tran_low_t *best_dqcoeff = x->dqcoeff_buf; + const int tx_type_map_idx = + plane ? 0 : blk_row * xd->tx_type_map_stride + blk_col; + av1_invalid_rd_stats(best_rd_stats); + + skip_trellis |= !is_trellis_used(cpi->optimize_seg_arr[xd->mi[0]->segment_id], + DRY_RUN_NORMAL); + + uint8_t best_txb_ctx = 0; + // txk_allowed = TX_TYPES: >1 tx types are allowed + // txk_allowed < TX_TYPES: only that specific tx type is allowed. 
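+ // For example, get_tx_mask() below may return the mask 0x0001
+ // (1 << DCT_DCT) and set txk_allowed to DCT_DCT when pruning leaves only
+ // DCT_DCT as a candidate.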
+ TX_TYPE txk_allowed = TX_TYPES;
+ int txk_map[TX_TYPES] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ };
+ const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+ const int qstep = x->plane[plane].dequant_QTX[1] >> dequant_shift;
+
+ const uint8_t txw = tx_size_wide[tx_size];
+ const uint8_t txh = tx_size_high[tx_size];
+ int64_t block_sse;
+ unsigned int block_mse_q8;
+ int dc_only_blk = 0;
+ const bool predict_dc_block =
+ txfm_params->predict_dc_level >= 1 && txw != 64 && txh != 64;
+ int64_t per_px_mean = INT64_MAX;
+ if (predict_dc_block) {
+ predict_dc_only_block(x, plane, plane_bsize, tx_size, block, blk_row,
+ blk_col, best_rd_stats, &block_sse, &block_mse_q8,
+ &per_px_mean, &dc_only_blk);
+ if (best_rd_stats->skip_txfm == 1) {
+ const TX_TYPE tx_type = DCT_DCT;
+ if (plane == 0) xd->tx_type_map[tx_type_map_idx] = tx_type;
+ return;
+ }
+ } else {
+ block_sse = av1_pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize,
+ txsize_to_bsize[tx_size], &block_mse_q8);
+ assert(block_mse_q8 != UINT_MAX);
+ }
+
+ // Bit mask to indicate which transform types are allowed in the RD search.
+ uint16_t tx_mask;
+
+ // Use DCT_DCT transform for DC only block.
+ if (dc_only_blk || cpi->sf.rt_sf.dct_only_palette_nonrd == 1)
+ tx_mask = 1 << DCT_DCT;
+ else
+ tx_mask = get_tx_mask(cpi, x, plane, block, blk_row, blk_col, plane_bsize,
+ tx_size, txb_ctx, ftxs_mode, ref_best_rd,
+ &txk_allowed, txk_map);
+ const uint16_t allowed_tx_mask = tx_mask;
+
+ if (is_cur_buf_hbd(xd)) {
+ block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2);
+ block_mse_q8 = ROUND_POWER_OF_TWO(block_mse_q8, (xd->bd - 8) * 2);
+ }
+ block_sse *= 16;
+ // Use mse / qstep^2 based threshold logic to decide whether to perform R-D
+ // optimization of coeffs. For smaller residuals, coeff optimization
+ // would be helpful. For larger residuals, R-D optimization may not be
+ // effective.
+ // TODO(any): Experiment with variance and mean based thresholds
+ const int perform_block_coeff_opt =
+ ((uint64_t)block_mse_q8 <=
+ (uint64_t)txfm_params->coeff_opt_thresholds[0] * qstep * qstep);
+ skip_trellis |= !perform_block_coeff_opt;
+
+ // Flag to indicate if distortion should be calculated in transform domain or
+ // not while iterating through transform type candidates.
+ // Transform domain distortion is accurate for higher residuals.
+ // TODO(any): Experiment with variance and mean based thresholds
+ int use_transform_domain_distortion =
+ (txfm_params->use_transform_domain_distortion > 0) &&
+ (block_mse_q8 >= txfm_params->tx_domain_dist_threshold) &&
+ // Any 64-pt transform only preserves half the coefficients.
+ // Therefore transform domain distortion is not valid for these
+ // transform sizes.
+ (txsize_sqr_up_map[tx_size] != TX_64X64) &&
+ // Use pixel domain distortion for DC only blocks.
+ !dc_only_blk;
+ // Flag to indicate if an extra calculation of distortion in the pixel domain
+ // should be performed at the end, after the best transform type has been
+ // decided.
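+ // This trades accuracy inside the per-candidate loop (cheap transform
+ // domain metric) for a single accurate pixel domain evaluation of the
+ // winning transform type only.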
+ int calc_pixel_domain_distortion_final = + txfm_params->use_transform_domain_distortion == 1 && + use_transform_domain_distortion && x->rd_model != LOW_TXFM_RD; + if (calc_pixel_domain_distortion_final && + (txk_allowed < TX_TYPES || allowed_tx_mask == 0x0001)) + calc_pixel_domain_distortion_final = use_transform_domain_distortion = 0; + + const uint16_t *eobs_ptr = x->plane[plane].eobs; + + TxfmParam txfm_param; + QUANT_PARAM quant_param; + int skip_trellis_based_on_satd[TX_TYPES] = { 0 }; + av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param); + av1_setup_quant(tx_size, !skip_trellis, + skip_trellis ? (USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B + : AV1_XFORM_QUANT_FP) + : AV1_XFORM_QUANT_FP, + cpi->oxcf.q_cfg.quant_b_adapt, &quant_param); + + // Iterate through all transform type candidates. + for (int idx = 0; idx < TX_TYPES; ++idx) { + const TX_TYPE tx_type = (TX_TYPE)txk_map[idx]; + if (tx_type == TX_TYPE_INVALID || !check_bit_mask(allowed_tx_mask, tx_type)) + continue; + txfm_param.tx_type = tx_type; + if (av1_use_qmatrix(&cm->quant_params, xd, mbmi->segment_id)) { + av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type, + &quant_param); + } + if (plane == 0) xd->tx_type_map[tx_type_map_idx] = tx_type; + RD_STATS this_rd_stats; + av1_invalid_rd_stats(&this_rd_stats); + + if (!dc_only_blk) + av1_xform(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param); + else + av1_xform_dc_only(x, plane, block, &txfm_param, per_px_mean); + + skip_trellis_based_on_satd[tx_type] = skip_trellis_opt_based_on_satd( + x, &quant_param, plane, block, tx_size, cpi->oxcf.q_cfg.quant_b_adapt, + qstep, txfm_params->coeff_opt_thresholds[1], skip_trellis, dc_only_blk); + + av1_quant(x, plane, block, &txfm_param, &quant_param); + + // Calculate rate cost of quantized coefficients. + if (quant_param.use_optimize_b) { + // TODO(aomedia:3209): update Trellis quantization to take into account + // quantization matrices. + av1_optimize_b(cpi, x, plane, block, tx_size, tx_type, txb_ctx, + &rate_cost); + } else { + rate_cost = cost_coeffs(x, plane, block, tx_size, tx_type, txb_ctx, + cm->features.reduced_tx_set_used); + } + + // If rd cost based on coeff rate alone is already more than best_rd, + // terminate early. + if (RDCOST(x->rdmult, rate_cost, 0) > best_rd) continue; + + // Calculate distortion. + if (eobs_ptr[block] == 0) { + // When eob is 0, pixel domain distortion is more efficient and accurate. + this_rd_stats.dist = this_rd_stats.sse = block_sse; + } else if (dc_only_blk) { + this_rd_stats.sse = block_sse; + this_rd_stats.dist = dist_block_px_domain( + cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size); + } else if (use_transform_domain_distortion) { + const SCAN_ORDER *const scan_order = + get_scan(txfm_param.tx_size, txfm_param.tx_type); + dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix, + scan_order->scan, &this_rd_stats.dist, + &this_rd_stats.sse); + } else { + int64_t sse_diff = INT64_MAX; + // high_energy threshold assumes that every pixel within a txfm block + // has a residue energy of at least 25% of the maximum, i.e. 128 * 128 + // for 8 bit. + const int64_t high_energy_thresh = + ((int64_t)128 * 128 * tx_size_2d[tx_size]); + const int is_high_energy = (block_sse >= high_energy_thresh); + if (tx_size == TX_64X64 || is_high_energy) { + // Because 3 out 4 quadrants of transform coefficients are forced to + // zero, the inverse transform has a tendency to overflow. 
sse_diff + // is effectively the energy of those 3 quadrants, here we use it + // to decide if we should do pixel domain distortion. If the energy + // is mostly in first quadrant, then it is unlikely that we have + // overflow issue in inverse transform. + const SCAN_ORDER *const scan_order = + get_scan(txfm_param.tx_size, txfm_param.tx_type); + dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix, + scan_order->scan, &this_rd_stats.dist, + &this_rd_stats.sse); + sse_diff = block_sse - this_rd_stats.sse; + } + if (tx_size != TX_64X64 || !is_high_energy || + (sse_diff * 2) < this_rd_stats.sse) { + const int64_t tx_domain_dist = this_rd_stats.dist; + this_rd_stats.dist = dist_block_px_domain( + cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size); + // For high energy blocks, occasionally, the pixel domain distortion + // can be artificially low due to clamping at reconstruction stage + // even when inverse transform output is hugely different from the + // actual residue. + if (is_high_energy && this_rd_stats.dist < tx_domain_dist) + this_rd_stats.dist = tx_domain_dist; + } else { + assert(sse_diff < INT64_MAX); + this_rd_stats.dist += sse_diff; + } + this_rd_stats.sse = block_sse; + } + + this_rd_stats.rate = rate_cost; + + const int64_t rd = + RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist); + + if (rd < best_rd) { + best_rd = rd; + *best_rd_stats = this_rd_stats; + best_tx_type = tx_type; + best_txb_ctx = x->plane[plane].txb_entropy_ctx[block]; + best_eob = x->plane[plane].eobs[block]; + // Swap dqcoeff buffers + tran_low_t *const tmp_dqcoeff = best_dqcoeff; + best_dqcoeff = p->dqcoeff; + p->dqcoeff = tmp_dqcoeff; + } + +#if CONFIG_COLLECT_RD_STATS == 1 + if (plane == 0) { + PrintTransformUnitStats(cpi, x, &this_rd_stats, blk_row, blk_col, + plane_bsize, tx_size, tx_type, rd); + } +#endif // CONFIG_COLLECT_RD_STATS == 1 + +#if COLLECT_TX_SIZE_DATA + // Generate small sample to restrict output size. + static unsigned int seed = 21743; + if (lcg_rand16(&seed) % 200 == 0) { + FILE *fp = NULL; + + if (within_border) { + fp = fopen(av1_tx_size_data_output_file, "a"); + } + + if (fp) { + // Transform info and RD + const int txb_w = tx_size_wide[tx_size]; + const int txb_h = tx_size_high[tx_size]; + + // Residue signal. + const int diff_stride = block_size_wide[plane_bsize]; + struct macroblock_plane *const p = &x->plane[plane]; + const int16_t *src_diff = + &p->src_diff[(blk_row * diff_stride + blk_col) * 4]; + + for (int r = 0; r < txb_h; ++r) { + for (int c = 0; c < txb_w; ++c) { + fprintf(fp, "%d,", src_diff[c]); + } + src_diff += diff_stride; + } + + fprintf(fp, "%d,%d,%d,%" PRId64, txb_w, txb_h, tx_type, rd); + fprintf(fp, "\n"); + fclose(fp); + } + } +#endif // COLLECT_TX_SIZE_DATA + + // If the current best RD cost is much worse than the reference RD cost, + // terminate early. + if (cpi->sf.tx_sf.adaptive_txb_search_level) { + if ((best_rd - (best_rd >> cpi->sf.tx_sf.adaptive_txb_search_level)) > + ref_best_rd) { + break; + } + } + + // Terminate transform type search if the block has been quantized to + // all zero. 
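+ // (best_eob == 0 means even the best candidate so far produced no nonzero
+ // coefficients, so evaluating further tx types is unlikely to help.)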
+ if (cpi->sf.tx_sf.tx_type_search.skip_tx_search && !best_eob) break; + } + + assert(best_rd != INT64_MAX); + + best_rd_stats->skip_txfm = best_eob == 0; + if (plane == 0) update_txk_array(xd, blk_row, blk_col, tx_size, best_tx_type); + x->plane[plane].txb_entropy_ctx[block] = best_txb_ctx; + x->plane[plane].eobs[block] = best_eob; + skip_trellis = skip_trellis_based_on_satd[best_tx_type]; + + // Point dqcoeff to the quantized coefficients corresponding to the best + // transform type, then we can skip transform and quantization, e.g. in the + // final pixel domain distortion calculation and recon_intra(). + p->dqcoeff = best_dqcoeff; + + if (calc_pixel_domain_distortion_final && best_eob) { + best_rd_stats->dist = dist_block_px_domain( + cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size); + best_rd_stats->sse = block_sse; + } + + // Intra mode needs decoded pixels such that the next transform block + // can use them for prediction. + recon_intra(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, + txb_ctx, skip_trellis, best_tx_type, 0, &rate_cost, best_eob); + p->dqcoeff = orig_dqcoeff; +} + +// Pick transform type for a luma transform block of tx_size. Note this function +// is used only for inter-predicted blocks. +static AOM_INLINE void tx_type_rd(const AV1_COMP *cpi, MACROBLOCK *x, + TX_SIZE tx_size, int blk_row, int blk_col, + int block, int plane_bsize, TXB_CTX *txb_ctx, + RD_STATS *rd_stats, + FAST_TX_SEARCH_MODE ftxs_mode, + int64_t ref_rdcost) { + assert(is_inter_block(x->e_mbd.mi[0])); + RD_STATS this_rd_stats; + const int skip_trellis = 0; + search_tx_type(cpi, x, 0, block, blk_row, blk_col, plane_bsize, tx_size, + txb_ctx, ftxs_mode, skip_trellis, ref_rdcost, &this_rd_stats); + + av1_merge_rd_stats(rd_stats, &this_rd_stats); +} + +static AOM_INLINE void try_tx_block_no_split( + const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block, + TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, + const ENTROPY_CONTEXT *ta, const ENTROPY_CONTEXT *tl, + int txfm_partition_ctx, RD_STATS *rd_stats, int64_t ref_best_rd, + FAST_TX_SEARCH_MODE ftxs_mode, TxCandidateInfo *no_split) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + struct macroblock_plane *const p = &x->plane[0]; + const int bw = mi_size_wide[plane_bsize]; + const ENTROPY_CONTEXT *const pta = ta + blk_col; + const ENTROPY_CONTEXT *const ptl = tl + blk_row; + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, 0, pta, ptl, &txb_ctx); + const int zero_blk_rate = x->coeff_costs.coeff_costs[txs_ctx][PLANE_TYPE_Y] + .txb_skip_cost[txb_ctx.txb_skip_ctx][1]; + rd_stats->zero_rate = zero_blk_rate; + const int index = av1_get_txb_size_index(plane_bsize, blk_row, blk_col); + mbmi->inter_tx_size[index] = tx_size; + tx_type_rd(cpi, x, tx_size, blk_row, blk_col, block, plane_bsize, &txb_ctx, + rd_stats, ftxs_mode, ref_best_rd); + assert(rd_stats->rate < INT_MAX); + + const int pick_skip_txfm = + !xd->lossless[mbmi->segment_id] && + (rd_stats->skip_txfm == 1 || + RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >= + RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse)); + if (pick_skip_txfm) { +#if CONFIG_RD_DEBUG + update_txb_coeff_cost(rd_stats, 0, zero_blk_rate - rd_stats->rate); +#endif // CONFIG_RD_DEBUG + rd_stats->rate = zero_blk_rate; + rd_stats->dist = rd_stats->sse; + p->eobs[block] = 0; + update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT); + } + rd_stats->skip_txfm = pick_skip_txfm; + 
set_blk_skip(x->txfm_search_info.blk_skip, 0, blk_row * bw + blk_col, + pick_skip_txfm); + + if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) + rd_stats->rate += x->mode_costs.txfm_partition_cost[txfm_partition_ctx][0]; + + no_split->rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + no_split->txb_entropy_ctx = p->txb_entropy_ctx[block]; + no_split->tx_type = + xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col]; +} + +static AOM_INLINE void try_tx_block_split( + const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block, + TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta, + ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left, + int txfm_partition_ctx, int64_t no_split_rd, int64_t ref_best_rd, + FAST_TX_SEARCH_MODE ftxs_mode, RD_STATS *split_rd_stats) { + assert(tx_size < TX_SIZES_ALL); + MACROBLOCKD *const xd = &x->e_mbd; + const int max_blocks_high = max_block_high(xd, plane_bsize, 0); + const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0); + const int txb_width = tx_size_wide_unit[tx_size]; + const int txb_height = tx_size_high_unit[tx_size]; + // Transform size after splitting current block. + const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + const int sub_txb_width = tx_size_wide_unit[sub_txs]; + const int sub_txb_height = tx_size_high_unit[sub_txs]; + const int sub_step = sub_txb_width * sub_txb_height; + const int nblks = (txb_height / sub_txb_height) * (txb_width / sub_txb_width); + assert(nblks > 0); + av1_init_rd_stats(split_rd_stats); + split_rd_stats->rate = + x->mode_costs.txfm_partition_cost[txfm_partition_ctx][1]; + + for (int r = 0, blk_idx = 0; r < txb_height; r += sub_txb_height) { + const int offsetr = blk_row + r; + if (offsetr >= max_blocks_high) break; + for (int c = 0; c < txb_width; c += sub_txb_width, ++blk_idx) { + assert(blk_idx < 4); + const int offsetc = blk_col + c; + if (offsetc >= max_blocks_wide) continue; + + RD_STATS this_rd_stats; + int this_cost_valid = 1; + select_tx_block(cpi, x, offsetr, offsetc, block, sub_txs, depth + 1, + plane_bsize, ta, tl, tx_above, tx_left, &this_rd_stats, + no_split_rd / nblks, ref_best_rd - split_rd_stats->rdcost, + &this_cost_valid, ftxs_mode); + if (!this_cost_valid) { + split_rd_stats->rdcost = INT64_MAX; + return; + } + av1_merge_rd_stats(split_rd_stats, &this_rd_stats); + split_rd_stats->rdcost = + RDCOST(x->rdmult, split_rd_stats->rate, split_rd_stats->dist); + if (split_rd_stats->rdcost > ref_best_rd) { + split_rd_stats->rdcost = INT64_MAX; + return; + } + block += sub_step; + } + } +} + +static float get_var(float mean, double x2_sum, int num) { + const float e_x2 = (float)(x2_sum / num); + const float diff = e_x2 - mean * mean; + return diff; +} + +static AOM_INLINE void get_blk_var_dev(const int16_t *data, int stride, int bw, + int bh, float *dev_of_mean, + float *var_of_vars) { + const int16_t *const data_ptr = &data[0]; + const int subh = (bh >= bw) ? (bh >> 1) : bh; + const int subw = (bw >= bh) ? 
(bw >> 1) : bw;
+ const int num = bw * bh;
+ const int sub_num = subw * subh;
+ int total_x_sum = 0;
+ int64_t total_x2_sum = 0;
+ int blk_idx = 0;
+ float var_sum = 0.0f;
+ float mean_sum = 0.0f;
+ double var2_sum = 0.0f;
+ double mean2_sum = 0.0f;
+
+ for (int row = 0; row < bh; row += subh) {
+ for (int col = 0; col < bw; col += subw) {
+ int x_sum;
+ int64_t x2_sum;
+ aom_get_blk_sse_sum(data_ptr + row * stride + col, stride, subw, subh,
+ &x_sum, &x2_sum);
+ total_x_sum += x_sum;
+ total_x2_sum += x2_sum;
+
+ const float mean = (float)x_sum / sub_num;
+ const float var = get_var(mean, (double)x2_sum, sub_num);
+ mean_sum += mean;
+ mean2_sum += (double)(mean * mean);
+ var_sum += var;
+ var2_sum += var * var;
+ blk_idx++;
+ }
+ }
+
+ const float lvl0_mean = (float)total_x_sum / num;
+ const float block_var = get_var(lvl0_mean, (double)total_x2_sum, num);
+ mean_sum += lvl0_mean;
+ mean2_sum += (double)(lvl0_mean * lvl0_mean);
+ var_sum += block_var;
+ var2_sum += block_var * block_var;
+ const float av_mean = mean_sum / 5;
+
+ if (blk_idx > 1) {
+ // Deviation of means.
+ *dev_of_mean = get_dev(av_mean, mean2_sum, (blk_idx + 1));
+ // Variance of variances.
+ const float mean_var = var_sum / (blk_idx + 1);
+ *var_of_vars = get_var(mean_var, var2_sum, (blk_idx + 1));
+ }
+}
+
+static void prune_tx_split_no_split(MACROBLOCK *x, BLOCK_SIZE bsize,
+ int blk_row, int blk_col, TX_SIZE tx_size,
+ int *try_no_split, int *try_split,
+ int pruning_level) {
+ const int diff_stride = block_size_wide[bsize];
+ const int16_t *diff =
+ x->plane[0].src_diff + 4 * blk_row * diff_stride + 4 * blk_col;
+ const int bw = tx_size_wide[tx_size];
+ const int bh = tx_size_high[tx_size];
+ float dev_of_means = 0.0f;
+ float var_of_vars = 0.0f;
+
+ // This function calculates the deviation of means and the variance of pixel
+ // variances of the block as well as its sub-blocks.
+ get_blk_var_dev(diff, diff_stride, bw, bh, &dev_of_means, &var_of_vars);
+ const int dc_q = x->plane[0].dequant_QTX[0] >> 3;
+ const int ac_q = x->plane[0].dequant_QTX[1] >> 3;
+ const int no_split_thresh_scales[4] = { 0, 24, 8, 8 };
+ const int no_split_thresh_scale = no_split_thresh_scales[pruning_level];
+ const int split_thresh_scales[4] = { 0, 24, 10, 8 };
+ const int split_thresh_scale = split_thresh_scales[pruning_level];
+
+ if ((dev_of_means <= dc_q) &&
+ (split_thresh_scale * var_of_vars <= ac_q * ac_q)) {
+ *try_split = 0;
+ }
+ if ((dev_of_means > no_split_thresh_scale * dc_q) &&
+ (var_of_vars > no_split_thresh_scale * ac_q * ac_q)) {
+ *try_no_split = 0;
+ }
+}
+
+// Recursively search for the best transform partition and type for a given
+// inter-predicted luma block. The resulting transform selection is saved in
+// xd->mi[0]; the corresponding RD stats are saved in rd_stats.
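+// At each level, coding the block with the current tx_size is compared
+// against splitting it via sub_tx_size_map[] (see try_tx_block_no_split()
+// and try_tx_block_split() above); the lower RD cost wins, recursing at most
+// MAX_VARTX_DEPTH levels deep.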
+static AOM_INLINE void select_tx_block( + const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block, + TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta, + ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left, + RD_STATS *rd_stats, int64_t prev_level_rd, int64_t ref_best_rd, + int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode) { + assert(tx_size < TX_SIZES_ALL); + av1_init_rd_stats(rd_stats); + if (ref_best_rd < 0) { + *is_cost_valid = 0; + return; + } + + MACROBLOCKD *const xd = &x->e_mbd; + assert(blk_row < max_block_high(xd, plane_bsize, 0) && + blk_col < max_block_wide(xd, plane_bsize, 0)); + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row, + mbmi->bsize, tx_size); + struct macroblock_plane *const p = &x->plane[0]; + + int try_no_split = (cpi->oxcf.txfm_cfg.enable_tx64 || + txsize_sqr_up_map[tx_size] != TX_64X64) && + (cpi->oxcf.txfm_cfg.enable_rect_tx || + tx_size_wide[tx_size] == tx_size_high[tx_size]); + int try_split = tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH; + TxCandidateInfo no_split = { INT64_MAX, 0, TX_TYPES }; + + // Prune tx_split and no-split based on sub-block properties. + if (tx_size != TX_4X4 && try_split == 1 && try_no_split == 1 && + cpi->sf.tx_sf.prune_tx_size_level > 0) { + prune_tx_split_no_split(x, plane_bsize, blk_row, blk_col, tx_size, + &try_no_split, &try_split, + cpi->sf.tx_sf.prune_tx_size_level); + } + + if (cpi->sf.rt_sf.skip_tx_no_split_var_based_partition) { + if (x->try_merge_partition && try_split && p->eobs[block]) try_no_split = 0; + } + + // Try using current block as a single transform block without split. + if (try_no_split) { + try_tx_block_no_split(cpi, x, blk_row, blk_col, block, tx_size, depth, + plane_bsize, ta, tl, ctx, rd_stats, ref_best_rd, + ftxs_mode, &no_split); + + // Speed features for early termination. + const int search_level = cpi->sf.tx_sf.adaptive_txb_search_level; + if (search_level) { + if ((no_split.rd - (no_split.rd >> (1 + search_level))) > ref_best_rd) { + *is_cost_valid = 0; + return; + } + if (no_split.rd - (no_split.rd >> (2 + search_level)) > prev_level_rd) { + try_split = 0; + } + } + if (cpi->sf.tx_sf.txb_split_cap) { + if (p->eobs[block] == 0) try_split = 0; + } + } + + // ML based speed feature to skip searching for split transform blocks. + if (x->e_mbd.bd == 8 && try_split && + !(ref_best_rd == INT64_MAX && no_split.rd == INT64_MAX)) { + const int threshold = cpi->sf.tx_sf.tx_type_search.ml_tx_split_thresh; + if (threshold >= 0) { + const int split_score = + ml_predict_tx_split(x, plane_bsize, blk_row, blk_col, tx_size); + if (split_score < -threshold) try_split = 0; + } + } + + RD_STATS split_rd_stats; + split_rd_stats.rdcost = INT64_MAX; + // Try splitting current block into smaller transform blocks. 
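+ // Note the split branch is budgeted with AOMMIN(no_split.rd, ref_best_rd),
+ // so it is abandoned as soon as it can no longer beat the no-split
+ // candidate.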
+ if (try_split) { + try_tx_block_split(cpi, x, blk_row, blk_col, block, tx_size, depth, + plane_bsize, ta, tl, tx_above, tx_left, ctx, no_split.rd, + AOMMIN(no_split.rd, ref_best_rd), ftxs_mode, + &split_rd_stats); + } + + if (no_split.rd < split_rd_stats.rdcost) { + ENTROPY_CONTEXT *pta = ta + blk_col; + ENTROPY_CONTEXT *ptl = tl + blk_row; + p->txb_entropy_ctx[block] = no_split.txb_entropy_ctx; + av1_set_txb_context(x, 0, block, tx_size, pta, ptl); + txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size, + tx_size); + for (int idy = 0; idy < tx_size_high_unit[tx_size]; ++idy) { + for (int idx = 0; idx < tx_size_wide_unit[tx_size]; ++idx) { + const int index = + av1_get_txb_size_index(plane_bsize, blk_row + idy, blk_col + idx); + mbmi->inter_tx_size[index] = tx_size; + } + } + mbmi->tx_size = tx_size; + update_txk_array(xd, blk_row, blk_col, tx_size, no_split.tx_type); + const int bw = mi_size_wide[plane_bsize]; + set_blk_skip(x->txfm_search_info.blk_skip, 0, blk_row * bw + blk_col, + rd_stats->skip_txfm); + } else { + *rd_stats = split_rd_stats; + if (split_rd_stats.rdcost == INT64_MAX) *is_cost_valid = 0; + } +} + +static AOM_INLINE void choose_largest_tx_size(const AV1_COMP *const cpi, + MACROBLOCK *x, RD_STATS *rd_stats, + int64_t ref_best_rd, + BLOCK_SIZE bs) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const TxfmSearchParams *txfm_params = &x->txfm_search_params; + mbmi->tx_size = tx_size_from_tx_mode(bs, txfm_params->tx_mode_search_type); + + // If tx64 is not enabled, we need to go down to the next available size + if (!cpi->oxcf.txfm_cfg.enable_tx64 && cpi->oxcf.txfm_cfg.enable_rect_tx) { + static const TX_SIZE tx_size_max_32[TX_SIZES_ALL] = { + TX_4X4, // 4x4 transform + TX_8X8, // 8x8 transform + TX_16X16, // 16x16 transform + TX_32X32, // 32x32 transform + TX_32X32, // 64x64 transform + TX_4X8, // 4x8 transform + TX_8X4, // 8x4 transform + TX_8X16, // 8x16 transform + TX_16X8, // 16x8 transform + TX_16X32, // 16x32 transform + TX_32X16, // 32x16 transform + TX_32X32, // 32x64 transform + TX_32X32, // 64x32 transform + TX_4X16, // 4x16 transform + TX_16X4, // 16x4 transform + TX_8X32, // 8x32 transform + TX_32X8, // 32x8 transform + TX_16X32, // 16x64 transform + TX_32X16, // 64x16 transform + }; + mbmi->tx_size = tx_size_max_32[mbmi->tx_size]; + } else if (cpi->oxcf.txfm_cfg.enable_tx64 && + !cpi->oxcf.txfm_cfg.enable_rect_tx) { + static const TX_SIZE tx_size_max_square[TX_SIZES_ALL] = { + TX_4X4, // 4x4 transform + TX_8X8, // 8x8 transform + TX_16X16, // 16x16 transform + TX_32X32, // 32x32 transform + TX_64X64, // 64x64 transform + TX_4X4, // 4x8 transform + TX_4X4, // 8x4 transform + TX_8X8, // 8x16 transform + TX_8X8, // 16x8 transform + TX_16X16, // 16x32 transform + TX_16X16, // 32x16 transform + TX_32X32, // 32x64 transform + TX_32X32, // 64x32 transform + TX_4X4, // 4x16 transform + TX_4X4, // 16x4 transform + TX_8X8, // 8x32 transform + TX_8X8, // 32x8 transform + TX_16X16, // 16x64 transform + TX_16X16, // 64x16 transform + }; + mbmi->tx_size = tx_size_max_square[mbmi->tx_size]; + } else if (!cpi->oxcf.txfm_cfg.enable_tx64 && + !cpi->oxcf.txfm_cfg.enable_rect_tx) { + static const TX_SIZE tx_size_max_32_square[TX_SIZES_ALL] = { + TX_4X4, // 4x4 transform + TX_8X8, // 8x8 transform + TX_16X16, // 16x16 transform + TX_32X32, // 32x32 transform + TX_32X32, // 64x64 transform + TX_4X4, // 4x8 transform + TX_4X4, // 8x4 transform + TX_8X8, // 8x16 transform + TX_8X8, // 16x8 transform + TX_16X16, // 16x32 transform + 
TX_16X16,  // 32x16 transform
+      TX_32X32,  // 32x64 transform
+      TX_32X32,  // 64x32 transform
+      TX_4X4,    // 4x16 transform
+      TX_4X4,    // 16x4 transform
+      TX_8X8,    // 8x32 transform
+      TX_8X8,    // 32x8 transform
+      TX_16X16,  // 16x64 transform
+      TX_16X16,  // 64x16 transform
+    };
+
+    mbmi->tx_size = tx_size_max_32_square[mbmi->tx_size];
+  }
+
+  const int skip_ctx = av1_get_skip_txfm_context(xd);
+  const int no_skip_txfm_rate = x->mode_costs.skip_txfm_cost[skip_ctx][0];
+  const int skip_txfm_rate = x->mode_costs.skip_txfm_cost[skip_ctx][1];
+  // Skip RD cost is used only for inter blocks.
+  const int64_t skip_txfm_rd =
+      is_inter_block(mbmi) ? RDCOST(x->rdmult, skip_txfm_rate, 0) : INT64_MAX;
+  const int64_t no_skip_txfm_rd = RDCOST(x->rdmult, no_skip_txfm_rate, 0);
+  const int skip_trellis = 0;
+  av1_txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd,
+                       AOMMIN(no_skip_txfm_rd, skip_txfm_rd), AOM_PLANE_Y, bs,
+                       mbmi->tx_size, FTXS_NONE, skip_trellis);
+}
+
+static AOM_INLINE void choose_smallest_tx_size(const AV1_COMP *const cpi,
+                                               MACROBLOCK *x,
+                                               RD_STATS *rd_stats,
+                                               int64_t ref_best_rd,
+                                               BLOCK_SIZE bs) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+
+  mbmi->tx_size = TX_4X4;
+  // TODO(any): Pass this_rd based on the skip/non-skip cost.
+  const int skip_trellis = 0;
+  av1_txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, 0, bs, mbmi->tx_size,
+                       FTXS_NONE, skip_trellis);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static void ml_predict_intra_tx_depth_prune(MACROBLOCK *x, int blk_row,
+                                            int blk_col, BLOCK_SIZE bsize,
+                                            TX_SIZE tx_size) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+
+  // Disable the pruning logic using the NN model for the following cases:
+  // 1) Lossless coding, as only the 4x4 transform is evaluated in this case.
+  // 2) When the transform size and the current block size do not match, as
+  // the features are obtained over the current block.
+  // 3) When the operating bit-depth is not 8-bit, as the input features are
+  // not scaled according to bit-depth.
+  if (xd->lossless[mbmi->segment_id] || txsize_to_bsize[tx_size] != bsize ||
+      xd->bd != 8)
+    return;
+
+  // Currently, NN model based pruning is supported only when the largest
+  // transform size is 8x8.
+  if (tx_size != TX_8X8) return;
+
+  // The neural network model is a sequential neural net trained using the SGD
+  // optimizer. The model can be further improved in terms of speed/quality by
+  // considering the following experiments:
+  // 1) Generate the ML model by training with balanced data for different
+  // learning rates and optimizers.
+  // 2) Experiment with the ML model by adding features related to the
+  // statistics of the top and left pixels, to capture the accuracy of the
+  // reconstructed neighbouring pixels for the 4x4 blocks numbered 1, 2, 3 in
+  // the 8x8 block, the source variance of the 4x4 sub-blocks, etc.
+  // 3) Generate ML models for transform blocks other than 8x8.
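+  // For reference, the feature vector assembled below holds the sub-block
+  // mean/deviation statistics of the prediction residual computed by
+  // get_mean_dev_features(), followed by log(1 + source_variance) and
+  // log(1 + dc_q^2 / 256), all z-normalized with the trained mean/std values
+  // before inference.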
+  const NN_CONFIG *const nn_config = &av1_intra_tx_split_nnconfig_8x8;
+  const float *const intra_tx_prune_thresh = av1_intra_tx_prune_nn_thresh_8x8;
+
+  float features[NUM_INTRA_TX_SPLIT_FEATURES] = { 0.0f };
+  const int diff_stride = block_size_wide[bsize];
+
+  const int16_t *diff = x->plane[0].src_diff + MI_SIZE * blk_row * diff_stride +
+                        MI_SIZE * blk_col;
+  const int bw = tx_size_wide[tx_size];
+  const int bh = tx_size_high[tx_size];
+
+  int feature_idx = get_mean_dev_features(diff, diff_stride, bw, bh, features);
+
+  features[feature_idx++] = log1pf((float)x->source_variance);
+
+  const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8);
+  const float log_dc_q_square = log1pf((float)(dc_q * dc_q) / 256.0f);
+  features[feature_idx++] = log_dc_q_square;
+  assert(feature_idx == NUM_INTRA_TX_SPLIT_FEATURES);
+  for (int i = 0; i < NUM_INTRA_TX_SPLIT_FEATURES; i++) {
+    features[i] = (features[i] - av1_intra_tx_split_8x8_mean[i]) /
+                  av1_intra_tx_split_8x8_std[i];
+  }
+
+  float score;
+  av1_nn_predict(features, nn_config, 1, &score);
+
+  TxfmSearchParams *const txfm_params = &x->txfm_search_params;
+  if (score <= intra_tx_prune_thresh[0])
+    txfm_params->nn_prune_depths_for_intra_tx = TX_PRUNE_SPLIT;
+  else if (score > intra_tx_prune_thresh[1])
+    txfm_params->nn_prune_depths_for_intra_tx = TX_PRUNE_LARGEST;
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+// Search for the best uniform transform size and type for the current coding
+// block.
+static AOM_INLINE void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
+                                                   MACROBLOCK *x,
+                                                   RD_STATS *rd_stats,
+                                                   int64_t ref_best_rd,
+                                                   BLOCK_SIZE bs) {
+  av1_invalid_rd_stats(rd_stats);
+
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  TxfmSearchParams *const txfm_params = &x->txfm_search_params;
+  const TX_SIZE max_rect_tx_size = max_txsize_rect_lookup[bs];
+  const int tx_select = txfm_params->tx_mode_search_type == TX_MODE_SELECT;
+  int start_tx;
+  // The split depth can be at most MAX_TX_DEPTH, so init_depth controls how
+  // many levels of splitting are allowed during the RD search.
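+  // For example, init_depth == MAX_TX_DEPTH restricts the loop below to a
+  // single transform size, while init_depth == 0 lets it evaluate every
+  // candidate size from start_tx down to the deepest allowed depth.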
+  int init_depth;
+
+  if (tx_select) {
+    start_tx = max_rect_tx_size;
+    init_depth = get_search_init_depth(mi_size_wide[bs], mi_size_high[bs],
+                                       is_inter_block(mbmi), &cpi->sf,
+                                       txfm_params->tx_size_search_method);
+    if (init_depth == MAX_TX_DEPTH && !cpi->oxcf.txfm_cfg.enable_tx64 &&
+        txsize_sqr_up_map[start_tx] == TX_64X64) {
+      start_tx = sub_tx_size_map[start_tx];
+    }
+  } else {
+    const TX_SIZE chosen_tx_size =
+        tx_size_from_tx_mode(bs, txfm_params->tx_mode_search_type);
+    start_tx = chosen_tx_size;
+    init_depth = MAX_TX_DEPTH;
+  }
+
+  const int skip_trellis = 0;
+  uint8_t best_txk_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  TX_SIZE best_tx_size = max_rect_tx_size;
+  int64_t best_rd = INT64_MAX;
+  const int num_blks = bsize_to_num_blk(bs);
+  x->rd_model = FULL_TXFM_RD;
+  int64_t rd[MAX_TX_DEPTH + 1] = { INT64_MAX, INT64_MAX, INT64_MAX };
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+  for (int tx_size = start_tx, depth = init_depth; depth <= MAX_TX_DEPTH;
+       depth++, tx_size = sub_tx_size_map[tx_size]) {
+    if ((!cpi->oxcf.txfm_cfg.enable_tx64 &&
+         txsize_sqr_up_map[tx_size] == TX_64X64) ||
+        (!cpi->oxcf.txfm_cfg.enable_rect_tx &&
+         tx_size_wide[tx_size] != tx_size_high[tx_size])) {
+      continue;
+    }
+
+#if !CONFIG_REALTIME_ONLY
+    if (txfm_params->nn_prune_depths_for_intra_tx == TX_PRUNE_SPLIT) break;
+
+    // Set the flag to enable evaluation of the NN classifier to prune
+    // transform depths. As the features are based on the intra residual of
+    // the largest transform, evaluation of the NN model is enabled only for
+    // this case.
+    txfm_params->enable_nn_prune_intra_tx_depths =
+        (cpi->sf.tx_sf.prune_intra_tx_depths_using_nn && tx_size == start_tx);
+#endif
+
+    RD_STATS this_rd_stats;
+    // When the speed feature use_rd_based_breakout_for_intra_tx_search is
+    // enabled, use the known minimum best_rd for early termination.
+    const int64_t rd_thresh =
+        cpi->sf.tx_sf.use_rd_based_breakout_for_intra_tx_search
+            ? AOMMIN(ref_best_rd, best_rd)
+            : ref_best_rd;
+    rd[depth] = av1_uniform_txfm_yrd(cpi, x, &this_rd_stats, rd_thresh, bs,
+                                     tx_size, FTXS_NONE, skip_trellis);
+    if (rd[depth] < best_rd) {
+      av1_copy_array(best_blk_skip, txfm_info->blk_skip, num_blks);
+      av1_copy_array(best_txk_type_map, xd->tx_type_map, num_blks);
+      best_tx_size = tx_size;
+      best_rd = rd[depth];
+      *rd_stats = this_rd_stats;
+    }
+    if (tx_size == TX_4X4) break;
+    // When searching three depths, prune the smallest size depending on the
+    // RD results of the first two depths for low-contrast blocks.
+    if (depth > init_depth && depth != MAX_TX_DEPTH &&
+        x->source_variance < 256) {
+      if (rd[depth - 1] != INT64_MAX && rd[depth] > rd[depth - 1]) break;
+    }
+  }
+
+  if (rd_stats->rate != INT_MAX) {
+    mbmi->tx_size = best_tx_size;
+    av1_copy_array(xd->tx_type_map, best_txk_type_map, num_blks);
+    av1_copy_array(txfm_info->blk_skip, best_blk_skip, num_blks);
+  }
+
+#if !CONFIG_REALTIME_ONLY
+  // Reset the flags to avoid any unintentional evaluation of the NN model
+  // and consumption of prune depths.
+  txfm_params->enable_nn_prune_intra_tx_depths = false;
+  txfm_params->nn_prune_depths_for_intra_tx = TX_PRUNE_NONE;
+#endif
+}
+
+// Search for the best transform type for the given transform block in the
+// given plane/channel, and calculate the corresponding RD cost.
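+// block_rd_txfm() is invoked once per transform block by
+// av1_foreach_transformed_block_in_plane(); it accumulates the per-block rate
+// and distortion into the shared rdcost_block_args and sets exit_early once
+// the running RD cost exceeds the reference cost.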
+static AOM_INLINE void block_rd_txfm(int plane, int block, int blk_row, + int blk_col, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { + struct rdcost_block_args *args = arg; + if (args->exit_early) { + args->incomplete_exit = 1; + return; + } + + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + const int is_inter = is_inter_block(xd->mi[0]); + const AV1_COMP *cpi = args->cpi; + ENTROPY_CONTEXT *a = args->t_above + blk_col; + ENTROPY_CONTEXT *l = args->t_left + blk_row; + const AV1_COMMON *cm = &cpi->common; + RD_STATS this_rd_stats; + av1_init_rd_stats(&this_rd_stats); + + if (!is_inter) { + av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size); + av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size); +#if !CONFIG_REALTIME_ONLY + const TxfmSearchParams *const txfm_params = &x->txfm_search_params; + if (txfm_params->enable_nn_prune_intra_tx_depths) { + ml_predict_intra_tx_depth_prune(x, blk_row, blk_col, plane_bsize, + tx_size); + if (txfm_params->nn_prune_depths_for_intra_tx == TX_PRUNE_LARGEST) { + av1_invalid_rd_stats(&args->rd_stats); + args->exit_early = 1; + return; + } + } +#endif + } + + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx); + search_tx_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, + &txb_ctx, args->ftxs_mode, args->skip_trellis, + args->best_rd - args->current_rd, &this_rd_stats); + + if (plane == AOM_PLANE_Y && xd->cfl.store_y) { + assert(!is_inter || plane_bsize < BLOCK_8X8); + cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize); + } + +#if CONFIG_RD_DEBUG + update_txb_coeff_cost(&this_rd_stats, plane, this_rd_stats.rate); +#endif // CONFIG_RD_DEBUG + av1_set_txb_context(x, plane, block, tx_size, a, l); + + const int blk_idx = + blk_row * (block_size_wide[plane_bsize] >> MI_SIZE_LOG2) + blk_col; + + TxfmSearchInfo *txfm_info = &x->txfm_search_info; + if (plane == 0) + set_blk_skip(txfm_info->blk_skip, plane, blk_idx, + x->plane[plane].eobs[block] == 0); + else + set_blk_skip(txfm_info->blk_skip, plane, blk_idx, 0); + + int64_t rd; + if (is_inter) { + const int64_t no_skip_txfm_rd = + RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist); + const int64_t skip_txfm_rd = RDCOST(x->rdmult, 0, this_rd_stats.sse); + rd = AOMMIN(no_skip_txfm_rd, skip_txfm_rd); + this_rd_stats.skip_txfm &= !x->plane[plane].eobs[block]; + } else { + // Signal non-skip_txfm for Intra blocks + rd = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist); + this_rd_stats.skip_txfm = 0; + } + + av1_merge_rd_stats(&args->rd_stats, &this_rd_stats); + + args->current_rd += rd; + if (args->current_rd > args->best_rd) args->exit_early = 1; +} + +int64_t av1_estimate_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, + RD_STATS *rd_stats, int64_t ref_best_rd, + BLOCK_SIZE bs, TX_SIZE tx_size) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const TxfmSearchParams *txfm_params = &x->txfm_search_params; + const ModeCosts *mode_costs = &x->mode_costs; + const int is_inter = is_inter_block(mbmi); + const int tx_select = txfm_params->tx_mode_search_type == TX_MODE_SELECT && + block_signals_txsize(mbmi->bsize); + int tx_size_rate = 0; + if (tx_select) { + const int ctx = txfm_partition_context( + xd->above_txfm_context, xd->left_txfm_context, mbmi->bsize, tx_size); + tx_size_rate = mode_costs->txfm_partition_cost[ctx][0]; + } + const int skip_ctx = av1_get_skip_txfm_context(xd); + const int no_skip_txfm_rate = mode_costs->skip_txfm_cost[skip_ctx][0]; + const 
int skip_txfm_rate = mode_costs->skip_txfm_cost[skip_ctx][1];
+  const int64_t skip_txfm_rd = RDCOST(x->rdmult, skip_txfm_rate, 0);
+  const int64_t no_this_rd =
+      RDCOST(x->rdmult, no_skip_txfm_rate + tx_size_rate, 0);
+  mbmi->tx_size = tx_size;
+
+  const uint8_t txw_unit = tx_size_wide_unit[tx_size];
+  const uint8_t txh_unit = tx_size_high_unit[tx_size];
+  const int step = txw_unit * txh_unit;
+  const int max_blocks_wide = max_block_wide(xd, bs, 0);
+  const int max_blocks_high = max_block_high(xd, bs, 0);
+
+  struct rdcost_block_args args;
+  av1_zero(args);
+  args.x = x;
+  args.cpi = cpi;
+  args.best_rd = ref_best_rd;
+  args.current_rd = AOMMIN(no_this_rd, skip_txfm_rd);
+  av1_init_rd_stats(&args.rd_stats);
+  av1_get_entropy_contexts(bs, &xd->plane[0], args.t_above, args.t_left);
+  int i = 0;
+  for (int blk_row = 0; blk_row < max_blocks_high && !args.incomplete_exit;
+       blk_row += txh_unit) {
+    for (int blk_col = 0; blk_col < max_blocks_wide; blk_col += txw_unit) {
+      RD_STATS this_rd_stats;
+      av1_init_rd_stats(&this_rd_stats);
+
+      if (args.exit_early) {
+        args.incomplete_exit = 1;
+        break;
+      }
+
+      ENTROPY_CONTEXT *a = args.t_above + blk_col;
+      ENTROPY_CONTEXT *l = args.t_left + blk_row;
+      TXB_CTX txb_ctx;
+      get_txb_ctx(bs, tx_size, 0, a, l, &txb_ctx);
+
+      TxfmParam txfm_param;
+      QUANT_PARAM quant_param;
+      av1_setup_xform(&cpi->common, x, tx_size, DCT_DCT, &txfm_param);
+      av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_B, 0, &quant_param);
+
+      av1_xform(x, 0, i, blk_row, blk_col, bs, &txfm_param);
+      av1_quant(x, 0, i, &txfm_param, &quant_param);
+
+      this_rd_stats.rate =
+          cost_coeffs(x, 0, i, tx_size, txfm_param.tx_type, &txb_ctx, 0);
+
+      const SCAN_ORDER *const scan_order =
+          get_scan(txfm_param.tx_size, txfm_param.tx_type);
+      dist_block_tx_domain(x, 0, i, tx_size, quant_param.qmatrix,
+                           scan_order->scan, &this_rd_stats.dist,
+                           &this_rd_stats.sse);
+
+      const int64_t no_skip_txfm_rd =
+          RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist);
+      const int64_t skip_rd = RDCOST(x->rdmult, 0, this_rd_stats.sse);
+
+      this_rd_stats.skip_txfm &= !x->plane[0].eobs[i];
+
+      av1_merge_rd_stats(&args.rd_stats, &this_rd_stats);
+      args.current_rd += AOMMIN(no_skip_txfm_rd, skip_rd);
+
+      if (args.current_rd > ref_best_rd) {
+        args.exit_early = 1;
+        break;
+      }
+
+      av1_set_txb_context(x, 0, i, tx_size, a, l);
+      i += step;
+    }
+  }
+
+  if (args.incomplete_exit) av1_invalid_rd_stats(&args.rd_stats);
+
+  *rd_stats = args.rd_stats;
+  if (rd_stats->rate == INT_MAX) return INT64_MAX;
+
+  int64_t rd;
+  // rd_stats->rate should include all the rate except the skip/non-skip cost,
+  // as that is accounted for in the caller functions after the RD evaluation
+  // of all planes. The decisions here, however, should still consider the
+  // skip/non-skip header cost.
+  if (rd_stats->skip_txfm && is_inter) {
+    rd = RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse);
+  } else {
+    // Intra blocks are always signalled as non-skip.
+    rd = RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_rate + tx_size_rate,
+                rd_stats->dist);
+    rd_stats->rate += tx_size_rate;
+  }
+  // Check if forcing the block to skip transform leads to a smaller RD cost.
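+  // Forcing skip re-prices the block as RDCOST(rdmult, skip_txfm_rate, sse):
+  // the entire residual energy is counted as distortion, while only the skip
+  // flag is paid for in rate.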
+  if (is_inter && !rd_stats->skip_txfm && !xd->lossless[mbmi->segment_id]) {
+    int64_t temp_skip_txfm_rd =
+        RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse);
+    if (temp_skip_txfm_rd <= rd) {
+      rd = temp_skip_txfm_rd;
+      rd_stats->rate = 0;
+      rd_stats->dist = rd_stats->sse;
+      rd_stats->skip_txfm = 1;
+    }
+  }
+
+  return rd;
+}
+
+int64_t av1_uniform_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+                             RD_STATS *rd_stats, int64_t ref_best_rd,
+                             BLOCK_SIZE bs, TX_SIZE tx_size,
+                             FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis) {
+  assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed_bsize(bs)));
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+  const ModeCosts *mode_costs = &x->mode_costs;
+  const int is_inter = is_inter_block(mbmi);
+  const int tx_select = txfm_params->tx_mode_search_type == TX_MODE_SELECT &&
+                        block_signals_txsize(mbmi->bsize);
+  int tx_size_rate = 0;
+  if (tx_select) {
+    const int ctx = txfm_partition_context(
+        xd->above_txfm_context, xd->left_txfm_context, mbmi->bsize, tx_size);
+    tx_size_rate = is_inter ? mode_costs->txfm_partition_cost[ctx][0]
+                            : tx_size_cost(x, bs, tx_size);
+  }
+  const int skip_ctx = av1_get_skip_txfm_context(xd);
+  const int no_skip_txfm_rate = mode_costs->skip_txfm_cost[skip_ctx][0];
+  const int skip_txfm_rate = mode_costs->skip_txfm_cost[skip_ctx][1];
+  const int64_t skip_txfm_rd =
+      is_inter ? RDCOST(x->rdmult, skip_txfm_rate, 0) : INT64_MAX;
+  const int64_t no_this_rd =
+      RDCOST(x->rdmult, no_skip_txfm_rate + tx_size_rate, 0);
+
+  mbmi->tx_size = tx_size;
+  av1_txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd,
+                       AOMMIN(no_this_rd, skip_txfm_rd), AOM_PLANE_Y, bs,
+                       tx_size, ftxs_mode, skip_trellis);
+  if (rd_stats->rate == INT_MAX) return INT64_MAX;
+
+  int64_t rd;
+  // rd_stats->rate should include all the rate except the skip/non-skip cost,
+  // as that is accounted for in the caller functions after the RD evaluation
+  // of all planes. The decisions here, however, should still consider the
+  // skip/non-skip header cost.
+  if (rd_stats->skip_txfm && is_inter) {
+    rd = RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse);
+  } else {
+    // Intra blocks are always signalled as non-skip.
+    rd = RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_rate + tx_size_rate,
+                rd_stats->dist);
+    rd_stats->rate += tx_size_rate;
+  }
+  // Check if forcing the block to skip transform leads to a smaller RD cost.
+  if (is_inter && !rd_stats->skip_txfm && !xd->lossless[mbmi->segment_id]) {
+    int64_t temp_skip_txfm_rd =
+        RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse);
+    if (temp_skip_txfm_rd <= rd) {
+      rd = temp_skip_txfm_rd;
+      rd_stats->rate = 0;
+      rd_stats->dist = rd_stats->sse;
+      rd_stats->skip_txfm = 1;
+    }
+  }
+
+  return rd;
+}
+
+// Search for the best transform type for a luma inter-predicted block, given
+// the transform block partitions.
+// This function is used only when some speed features are enabled.
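+// Unlike select_tx_block(), tx_block_yrd() does not search the partitioning
+// itself: it follows the inter_tx_size[] decisions already stored in the
+// MB_MODE_INFO and recurses until the stored transform size is reached.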
+static AOM_INLINE void tx_block_yrd( + const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block, + TX_SIZE tx_size, BLOCK_SIZE plane_bsize, int depth, + ENTROPY_CONTEXT *above_ctx, ENTROPY_CONTEXT *left_ctx, + TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left, int64_t ref_best_rd, + RD_STATS *rd_stats, FAST_TX_SEARCH_MODE ftxs_mode) { + assert(tx_size < TX_SIZES_ALL); + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + assert(is_inter_block(mbmi)); + const int max_blocks_high = max_block_high(xd, plane_bsize, 0); + const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0); + + if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; + + const TX_SIZE plane_tx_size = mbmi->inter_tx_size[av1_get_txb_size_index( + plane_bsize, blk_row, blk_col)]; + const int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row, + mbmi->bsize, tx_size); + + av1_init_rd_stats(rd_stats); + if (tx_size == plane_tx_size) { + ENTROPY_CONTEXT *ta = above_ctx + blk_col; + ENTROPY_CONTEXT *tl = left_ctx + blk_row; + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, 0, ta, tl, &txb_ctx); + + const int zero_blk_rate = + x->coeff_costs.coeff_costs[txs_ctx][get_plane_type(0)] + .txb_skip_cost[txb_ctx.txb_skip_ctx][1]; + rd_stats->zero_rate = zero_blk_rate; + tx_type_rd(cpi, x, tx_size, blk_row, blk_col, block, plane_bsize, &txb_ctx, + rd_stats, ftxs_mode, ref_best_rd); + const int mi_width = mi_size_wide[plane_bsize]; + TxfmSearchInfo *txfm_info = &x->txfm_search_info; + if (RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >= + RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) || + rd_stats->skip_txfm == 1) { + rd_stats->rate = zero_blk_rate; + rd_stats->dist = rd_stats->sse; + rd_stats->skip_txfm = 1; + set_blk_skip(txfm_info->blk_skip, 0, blk_row * mi_width + blk_col, 1); + x->plane[0].eobs[block] = 0; + x->plane[0].txb_entropy_ctx[block] = 0; + update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT); + } else { + rd_stats->skip_txfm = 0; + set_blk_skip(txfm_info->blk_skip, 0, blk_row * mi_width + blk_col, 0); + } + if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) + rd_stats->rate += x->mode_costs.txfm_partition_cost[ctx][0]; + av1_set_txb_context(x, 0, block, tx_size, ta, tl); + txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size, + tx_size); + } else { + const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + const int txb_width = tx_size_wide_unit[sub_txs]; + const int txb_height = tx_size_high_unit[sub_txs]; + const int step = txb_height * txb_width; + const int row_end = + AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row); + const int col_end = + AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col); + RD_STATS pn_rd_stats; + int64_t this_rd = 0; + assert(txb_width > 0 && txb_height > 0); + + for (int row = 0; row < row_end; row += txb_height) { + const int offsetr = blk_row + row; + for (int col = 0; col < col_end; col += txb_width) { + const int offsetc = blk_col + col; + + av1_init_rd_stats(&pn_rd_stats); + tx_block_yrd(cpi, x, offsetr, offsetc, block, sub_txs, plane_bsize, + depth + 1, above_ctx, left_ctx, tx_above, tx_left, + ref_best_rd - this_rd, &pn_rd_stats, ftxs_mode); + if (pn_rd_stats.rate == INT_MAX) { + av1_invalid_rd_stats(rd_stats); + return; + } + av1_merge_rd_stats(rd_stats, &pn_rd_stats); + this_rd += RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist); + block += step; + } + } + + if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) + 
rd_stats->rate += x->mode_costs.txfm_partition_cost[ctx][1];
+  }
+}
+
+// Search for the tx type with tx sizes already decided for an inter-predicted
+// luma partition block. It's used only when some speed features are enabled.
+// Return value 0: early termination triggered, no valid rd cost available;
+//              1: rd cost values are valid.
+static int inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+                           RD_STATS *rd_stats, BLOCK_SIZE bsize,
+                           int64_t ref_best_rd, FAST_TX_SEARCH_MODE ftxs_mode) {
+  if (ref_best_rd < 0) {
+    av1_invalid_rd_stats(rd_stats);
+    return 0;
+  }
+
+  av1_init_rd_stats(rd_stats);
+
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+  const struct macroblockd_plane *const pd = &xd->plane[0];
+  const int mi_width = mi_size_wide[bsize];
+  const int mi_height = mi_size_high[bsize];
+  const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, bsize, 0);
+  const int bh = tx_size_high_unit[max_tx_size];
+  const int bw = tx_size_wide_unit[max_tx_size];
+  const int step = bw * bh;
+  const int init_depth = get_search_init_depth(
+      mi_width, mi_height, 1, &cpi->sf, txfm_params->tx_size_search_method);
+  ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
+  ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
+  TXFM_CONTEXT tx_above[MAX_MIB_SIZE];
+  TXFM_CONTEXT tx_left[MAX_MIB_SIZE];
+  av1_get_entropy_contexts(bsize, pd, ctxa, ctxl);
+  memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width);
+  memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height);
+
+  int64_t this_rd = 0;
+  for (int idy = 0, block = 0; idy < mi_height; idy += bh) {
+    for (int idx = 0; idx < mi_width; idx += bw) {
+      RD_STATS pn_rd_stats;
+      av1_init_rd_stats(&pn_rd_stats);
+      tx_block_yrd(cpi, x, idy, idx, block, max_tx_size, bsize, init_depth,
+                   ctxa, ctxl, tx_above, tx_left, ref_best_rd - this_rd,
+                   &pn_rd_stats, ftxs_mode);
+      if (pn_rd_stats.rate == INT_MAX) {
+        av1_invalid_rd_stats(rd_stats);
+        return 0;
+      }
+      av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+      this_rd +=
+          AOMMIN(RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist),
+                 RDCOST(x->rdmult, pn_rd_stats.zero_rate, pn_rd_stats.sse));
+      block += step;
+    }
+  }
+
+  const int skip_ctx = av1_get_skip_txfm_context(xd);
+  const int no_skip_txfm_rate = x->mode_costs.skip_txfm_cost[skip_ctx][0];
+  const int skip_txfm_rate = x->mode_costs.skip_txfm_cost[skip_ctx][1];
+  const int64_t skip_txfm_rd = RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse);
+  this_rd =
+      RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_rate, rd_stats->dist);
+  if (skip_txfm_rd < this_rd) {
+    this_rd = skip_txfm_rd;
+    rd_stats->rate = 0;
+    rd_stats->dist = rd_stats->sse;
+    rd_stats->skip_txfm = 1;
+  }
+
+  const int is_cost_valid = this_rd > ref_best_rd ? 0 : 1;
+  if (!is_cost_valid) {
+    // reset cost value
+    av1_invalid_rd_stats(rd_stats);
+  }
+  return is_cost_valid;
+}
+
+// Search for the best transform size and type for the current inter-predicted
+// luma block with recursive transform block partitioning. The obtained
+// transform selection will be saved in xd->mi[0], the corresponding RD stats
+// will be saved in rd_stats. The returned value is the corresponding RD cost.
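+// The search starts from the largest rectangular transform allowed for the
+// block (max_txsize_rect_lookup[bsize]) and lets select_tx_block() split each
+// transform block recursively while depth < MAX_VARTX_DEPTH.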
+static int64_t select_tx_size_and_type(const AV1_COMP *cpi, MACROBLOCK *x,
+                                       RD_STATS *rd_stats, BLOCK_SIZE bsize,
+                                       int64_t ref_best_rd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+  assert(is_inter_block(xd->mi[0]));
+  assert(bsize < BLOCK_SIZES_ALL);
+  const int fast_tx_search = txfm_params->tx_size_search_method > USE_FULL_RD;
+  int64_t rd_thresh = ref_best_rd;
+  if (rd_thresh == 0) {
+    av1_invalid_rd_stats(rd_stats);
+    return INT64_MAX;
+  }
+  if (fast_tx_search && rd_thresh < INT64_MAX) {
+    if (INT64_MAX - rd_thresh > (rd_thresh >> 3)) rd_thresh += (rd_thresh >> 3);
+  }
+  assert(rd_thresh > 0);
+  const FAST_TX_SEARCH_MODE ftxs_mode =
+      fast_tx_search ? FTXS_DCT_AND_1D_DCT_ONLY : FTXS_NONE;
+  const struct macroblockd_plane *const pd = &xd->plane[0];
+  assert(bsize < BLOCK_SIZES_ALL);
+  const int mi_width = mi_size_wide[bsize];
+  const int mi_height = mi_size_high[bsize];
+  ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
+  ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
+  TXFM_CONTEXT tx_above[MAX_MIB_SIZE];
+  TXFM_CONTEXT tx_left[MAX_MIB_SIZE];
+  av1_get_entropy_contexts(bsize, pd, ctxa, ctxl);
+  memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width);
+  memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height);
+  const int init_depth = get_search_init_depth(
+      mi_width, mi_height, 1, &cpi->sf, txfm_params->tx_size_search_method);
+  const TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize];
+  const int bh = tx_size_high_unit[max_tx_size];
+  const int bw = tx_size_wide_unit[max_tx_size];
+  const int step = bw * bh;
+  const int skip_ctx = av1_get_skip_txfm_context(xd);
+  const int no_skip_txfm_cost = x->mode_costs.skip_txfm_cost[skip_ctx][0];
+  const int skip_txfm_cost = x->mode_costs.skip_txfm_cost[skip_ctx][1];
+  int64_t skip_txfm_rd = RDCOST(x->rdmult, skip_txfm_cost, 0);
+  int64_t no_skip_txfm_rd = RDCOST(x->rdmult, no_skip_txfm_cost, 0);
+  int block = 0;
+
+  av1_init_rd_stats(rd_stats);
+  for (int idy = 0; idy < max_block_high(xd, bsize, 0); idy += bh) {
+    for (int idx = 0; idx < max_block_wide(xd, bsize, 0); idx += bw) {
+      const int64_t best_rd_sofar =
+          (rd_thresh == INT64_MAX)
+              ? INT64_MAX
+              : (rd_thresh - (AOMMIN(skip_txfm_rd, no_skip_txfm_rd)));
+      int is_cost_valid = 1;
+      RD_STATS pn_rd_stats;
+      // Search for the best transform block size and type for the sub-block.
+      select_tx_block(cpi, x, idy, idx, block, max_tx_size, init_depth, bsize,
+                      ctxa, ctxl, tx_above, tx_left, &pn_rd_stats, INT64_MAX,
+                      best_rd_sofar, &is_cost_valid, ftxs_mode);
+      if (!is_cost_valid || pn_rd_stats.rate == INT_MAX) {
+        av1_invalid_rd_stats(rd_stats);
+        return INT64_MAX;
+      }
+      av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+      skip_txfm_rd = RDCOST(x->rdmult, skip_txfm_cost, rd_stats->sse);
+      no_skip_txfm_rd =
+          RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_cost, rd_stats->dist);
+      block += step;
+    }
+  }
+
+  if (rd_stats->rate == INT_MAX) return INT64_MAX;
+
+  rd_stats->skip_txfm = (skip_txfm_rd <= no_skip_txfm_rd);
+
+  // If fast_tx_search is true, only DCT and 1D DCT were tested in the
+  // select_tx_block() calls above. Do a better search for the tx type with
+  // the tx sizes already decided.
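+  // The refinement pass below reuses inter_block_yrd() with FTXS_NONE so that
+  // the full set of transform types is evaluated for the block sizes chosen
+  // above; it is gated by the refine_fast_tx_search_results speed feature.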
+ if (fast_tx_search && cpi->sf.tx_sf.refine_fast_tx_search_results) { + if (!inter_block_yrd(cpi, x, rd_stats, bsize, ref_best_rd, FTXS_NONE)) + return INT64_MAX; + } + + int64_t final_rd; + if (rd_stats->skip_txfm) { + final_rd = RDCOST(x->rdmult, skip_txfm_cost, rd_stats->sse); + } else { + final_rd = + RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_cost, rd_stats->dist); + if (!xd->lossless[xd->mi[0]->segment_id]) { + final_rd = + AOMMIN(final_rd, RDCOST(x->rdmult, skip_txfm_cost, rd_stats->sse)); + } + } + + return final_rd; +} + +// Return 1 to terminate transform search early. The decision is made based on +// the comparison with the reference RD cost and the model-estimated RD cost. +static AOM_INLINE int model_based_tx_search_prune(const AV1_COMP *cpi, + MACROBLOCK *x, + BLOCK_SIZE bsize, + int64_t ref_best_rd) { + const int level = cpi->sf.tx_sf.model_based_prune_tx_search_level; + assert(level >= 0 && level <= 2); + int model_rate; + int64_t model_dist; + uint8_t model_skip; + MACROBLOCKD *const xd = &x->e_mbd; + model_rd_sb_fn[MODELRD_TYPE_TX_SEARCH_PRUNE]( + cpi, bsize, x, xd, 0, 0, &model_rate, &model_dist, &model_skip, NULL, + NULL, NULL, NULL); + if (model_skip) return 0; + const int64_t model_rd = RDCOST(x->rdmult, model_rate, model_dist); + // TODO(debargha, urvang): Improve the model and make the check below + // tighter. + static const int prune_factor_by8[] = { 3, 5 }; + const int factor = prune_factor_by8[level - 1]; + return ((model_rd * factor) >> 3) > ref_best_rd; +} + +void av1_pick_recursive_tx_size_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_stats, BLOCK_SIZE bsize, + int64_t ref_best_rd) { + MACROBLOCKD *const xd = &x->e_mbd; + const TxfmSearchParams *txfm_params = &x->txfm_search_params; + assert(is_inter_block(xd->mi[0])); + + av1_invalid_rd_stats(rd_stats); + + // If modeled RD cost is a lot worse than the best so far, terminate early. + if (cpi->sf.tx_sf.model_based_prune_tx_search_level && + ref_best_rd != INT64_MAX) { + if (model_based_tx_search_prune(cpi, x, bsize, ref_best_rd)) return; + } + + // Hashing based speed feature. If the hash of the prediction residue block is + // found in the hash table, use previous search results and terminate early. + uint32_t hash = 0; + MB_RD_RECORD *mb_rd_record = NULL; + const int mi_row = x->e_mbd.mi_row; + const int mi_col = x->e_mbd.mi_col; + const int within_border = + mi_row >= xd->tile.mi_row_start && + (mi_row + mi_size_high[bsize] < xd->tile.mi_row_end) && + mi_col >= xd->tile.mi_col_start && + (mi_col + mi_size_wide[bsize] < xd->tile.mi_col_end); + const int is_mb_rd_hash_enabled = + (within_border && cpi->sf.rd_sf.use_mb_rd_hash); + const int n4 = bsize_to_num_blk(bsize); + if (is_mb_rd_hash_enabled) { + hash = get_block_residue_hash(x, bsize); + mb_rd_record = x->txfm_search_info.mb_rd_record; + const int match_index = find_mb_rd_info(mb_rd_record, ref_best_rd, hash); + if (match_index != -1) { + MB_RD_INFO *mb_rd_info = &mb_rd_record->mb_rd_info[match_index]; + fetch_mb_rd_info(n4, mb_rd_info, rd_stats, x); + return; + } + } + + // If we predict that skip is the optimal RD decision - set the respective + // context and terminate early. + int64_t dist; + if (txfm_params->skip_txfm_level && + predict_skip_txfm(x, bsize, &dist, + cpi->common.features.reduced_tx_set_used)) { + set_skip_txfm(x, rd_stats, bsize, dist); + // Save the RD search results into mb_rd_record. 
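+    // The record is keyed by a hash of the prediction residue, so a later
+    // mode evaluation that produces an identical residue block can reuse
+    // these stats without re-running the transform search.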
+    if (is_mb_rd_hash_enabled)
+      save_mb_rd_info(n4, hash, x, rd_stats, mb_rd_record);
+    return;
+  }
+#if CONFIG_SPEED_STATS
+  ++x->txfm_search_info.tx_search_count;
+#endif  // CONFIG_SPEED_STATS
+
+  const int64_t rd =
+      select_tx_size_and_type(cpi, x, rd_stats, bsize, ref_best_rd);
+
+  if (rd == INT64_MAX) {
+    // We should always find at least one candidate unless ref_best_rd is less
+    // than INT64_MAX (in which case the call to select_tx_size_and_type()
+    // might have failed to find something better).
+    assert(ref_best_rd != INT64_MAX);
+    av1_invalid_rd_stats(rd_stats);
+    return;
+  }
+
+  // Save the RD search results into mb_rd_record.
+  if (is_mb_rd_hash_enabled) {
+    assert(mb_rd_record != NULL);
+    save_mb_rd_info(n4, hash, x, rd_stats, mb_rd_record);
+  }
+}
+
+void av1_pick_uniform_tx_size_type_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                       RD_STATS *rd_stats, BLOCK_SIZE bs,
+                                       int64_t ref_best_rd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const TxfmSearchParams *tx_params = &x->txfm_search_params;
+  assert(bs == mbmi->bsize);
+  const int is_inter = is_inter_block(mbmi);
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+
+  av1_init_rd_stats(rd_stats);
+
+  // Hashing based speed feature for inter blocks. If the hash of the residue
+  // block is found in the table, use previously saved search results and
+  // terminate early.
+  uint32_t hash = 0;
+  MB_RD_RECORD *mb_rd_record = NULL;
+  const int num_blks = bsize_to_num_blk(bs);
+  if (is_inter && cpi->sf.rd_sf.use_mb_rd_hash) {
+    const int within_border =
+        mi_row >= xd->tile.mi_row_start &&
+        (mi_row + mi_size_high[bs] < xd->tile.mi_row_end) &&
+        mi_col >= xd->tile.mi_col_start &&
+        (mi_col + mi_size_wide[bs] < xd->tile.mi_col_end);
+    if (within_border) {
+      hash = get_block_residue_hash(x, bs);
+      mb_rd_record = x->txfm_search_info.mb_rd_record;
+      const int match_index = find_mb_rd_info(mb_rd_record, ref_best_rd, hash);
+      if (match_index != -1) {
+        MB_RD_INFO *mb_rd_info = &mb_rd_record->mb_rd_info[match_index];
+        fetch_mb_rd_info(num_blks, mb_rd_info, rd_stats, x);
+        return;
+      }
+    }
+  }
+
+  // If we predict that skip is the optimal RD decision, set the respective
+  // context and terminate early.
+  int64_t dist;
+  if (tx_params->skip_txfm_level && is_inter &&
+      !xd->lossless[mbmi->segment_id] &&
+      predict_skip_txfm(x, bs, &dist,
+                        cpi->common.features.reduced_tx_set_used)) {
+    // Populate rd_stats as per the skip decision.
+    set_skip_txfm(x, rd_stats, bs, dist);
+    // Save the RD search results into mb_rd_record.
+    if (mb_rd_record) {
+      save_mb_rd_info(num_blks, hash, x, rd_stats, mb_rd_record);
+    }
+    return;
+  }
+
+  if (xd->lossless[mbmi->segment_id]) {
+    // Lossless mode can only pick the smallest (4x4) transform size.
+    choose_smallest_tx_size(cpi, x, rd_stats, ref_best_rd, bs);
+  } else if (tx_params->tx_size_search_method == USE_LARGESTALL) {
+    choose_largest_tx_size(cpi, x, rd_stats, ref_best_rd, bs);
+  } else {
+    choose_tx_size_type_from_rd(cpi, x, rd_stats, ref_best_rd, bs);
+  }
+
+  // Save the RD search results into mb_rd_record for possible reuse in future.
+ if (mb_rd_record) { + save_mb_rd_info(num_blks, hash, x, rd_stats, mb_rd_record); + } +} + +int av1_txfm_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats, + BLOCK_SIZE bsize, int64_t ref_best_rd) { + av1_init_rd_stats(rd_stats); + if (ref_best_rd < 0) return 0; + if (!x->e_mbd.is_chroma_ref) return 1; + + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_U]; + const int is_inter = is_inter_block(mbmi); + int64_t this_rd = 0, skip_txfm_rd = 0; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + + if (is_inter) { + for (int plane = 1; plane < MAX_MB_PLANE; ++plane) + av1_subtract_plane(x, plane_bsize, plane); + } + + const int skip_trellis = 0; + const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd); + int is_cost_valid = 1; + for (int plane = 1; plane < MAX_MB_PLANE; ++plane) { + RD_STATS this_rd_stats; + int64_t chroma_ref_best_rd = ref_best_rd; + // For inter blocks, refined ref_best_rd is used for early exit + // For intra blocks, even though current rd crosses ref_best_rd, early + // exit is not recommended as current rd is used for gating subsequent + // modes as well (say, for angular modes) + // TODO(any): Extend the early exit mechanism for intra modes as well + if (cpi->sf.inter_sf.perform_best_rd_based_gating_for_chroma && is_inter && + chroma_ref_best_rd != INT64_MAX) + chroma_ref_best_rd = ref_best_rd - AOMMIN(this_rd, skip_txfm_rd); + av1_txfm_rd_in_plane(x, cpi, &this_rd_stats, chroma_ref_best_rd, 0, plane, + plane_bsize, uv_tx_size, FTXS_NONE, skip_trellis); + if (this_rd_stats.rate == INT_MAX) { + is_cost_valid = 0; + break; + } + av1_merge_rd_stats(rd_stats, &this_rd_stats); + this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + skip_txfm_rd = RDCOST(x->rdmult, 0, rd_stats->sse); + if (AOMMIN(this_rd, skip_txfm_rd) > ref_best_rd) { + is_cost_valid = 0; + break; + } + } + + if (!is_cost_valid) { + // reset cost value + av1_invalid_rd_stats(rd_stats); + } + + return is_cost_valid; +} + +void av1_txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi, + RD_STATS *rd_stats, int64_t ref_best_rd, + int64_t current_rd, int plane, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, FAST_TX_SEARCH_MODE ftxs_mode, + int skip_trellis) { + assert(IMPLIES(plane == 0, x->e_mbd.mi[0]->tx_size == tx_size)); + + if (!cpi->oxcf.txfm_cfg.enable_tx64 && + txsize_sqr_up_map[tx_size] == TX_64X64) { + av1_invalid_rd_stats(rd_stats); + return; + } + + if (current_rd > ref_best_rd) { + av1_invalid_rd_stats(rd_stats); + return; + } + + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + struct rdcost_block_args args; + av1_zero(args); + args.x = x; + args.cpi = cpi; + args.best_rd = ref_best_rd; + args.current_rd = current_rd; + args.ftxs_mode = ftxs_mode; + args.skip_trellis = skip_trellis; + av1_init_rd_stats(&args.rd_stats); + + av1_get_entropy_contexts(plane_bsize, pd, args.t_above, args.t_left); + av1_foreach_transformed_block_in_plane(xd, plane_bsize, plane, block_rd_txfm, + &args); + + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int is_inter = is_inter_block(mbmi); + const int invalid_rd = is_inter ? 
args.incomplete_exit : args.exit_early; + + if (invalid_rd) { + av1_invalid_rd_stats(rd_stats); + } else { + *rd_stats = args.rd_stats; + } +} + +int av1_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, + RD_STATS *rd_stats, RD_STATS *rd_stats_y, + RD_STATS *rd_stats_uv, int mode_rate, int64_t ref_best_rd) { + MACROBLOCKD *const xd = &x->e_mbd; + TxfmSearchParams *txfm_params = &x->txfm_search_params; + const int skip_ctx = av1_get_skip_txfm_context(xd); + const int skip_txfm_cost[2] = { x->mode_costs.skip_txfm_cost[skip_ctx][0], + x->mode_costs.skip_txfm_cost[skip_ctx][1] }; + const int64_t min_header_rate = + mode_rate + AOMMIN(skip_txfm_cost[0], skip_txfm_cost[1]); + // Account for minimum skip and non_skip rd. + // Eventually either one of them will be added to mode_rate + const int64_t min_header_rd_possible = RDCOST(x->rdmult, min_header_rate, 0); + if (min_header_rd_possible > ref_best_rd) { + av1_invalid_rd_stats(rd_stats_y); + return 0; + } + + const AV1_COMMON *cm = &cpi->common; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int64_t mode_rd = RDCOST(x->rdmult, mode_rate, 0); + const int64_t rd_thresh = + ref_best_rd == INT64_MAX ? INT64_MAX : ref_best_rd - mode_rd; + av1_init_rd_stats(rd_stats); + av1_init_rd_stats(rd_stats_y); + rd_stats->rate = mode_rate; + + // cost and distortion + av1_subtract_plane(x, bsize, 0); + if (txfm_params->tx_mode_search_type == TX_MODE_SELECT && + !xd->lossless[mbmi->segment_id]) { + av1_pick_recursive_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, rd_thresh); +#if CONFIG_COLLECT_RD_STATS == 2 + PrintPredictionUnitStats(cpi, tile_data, x, rd_stats_y, bsize); +#endif // CONFIG_COLLECT_RD_STATS == 2 + } else { + av1_pick_uniform_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, rd_thresh); + memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size)); + for (int i = 0; i < xd->height * xd->width; ++i) + set_blk_skip(x->txfm_search_info.blk_skip, 0, i, rd_stats_y->skip_txfm); + } + + if (rd_stats_y->rate == INT_MAX) return 0; + + av1_merge_rd_stats(rd_stats, rd_stats_y); + + const int64_t non_skip_txfm_rdcosty = + RDCOST(x->rdmult, rd_stats->rate + skip_txfm_cost[0], rd_stats->dist); + const int64_t skip_txfm_rdcosty = + RDCOST(x->rdmult, mode_rate + skip_txfm_cost[1], rd_stats->sse); + const int64_t min_rdcosty = AOMMIN(non_skip_txfm_rdcosty, skip_txfm_rdcosty); + if (min_rdcosty > ref_best_rd) return 0; + + av1_init_rd_stats(rd_stats_uv); + const int num_planes = av1_num_planes(cm); + if (num_planes > 1) { + int64_t ref_best_chroma_rd = ref_best_rd; + // Calculate best rd cost possible for chroma + if (cpi->sf.inter_sf.perform_best_rd_based_gating_for_chroma && + (ref_best_chroma_rd != INT64_MAX)) { + ref_best_chroma_rd = (ref_best_chroma_rd - + AOMMIN(non_skip_txfm_rdcosty, skip_txfm_rdcosty)); + } + const int is_cost_valid_uv = + av1_txfm_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_chroma_rd); + if (!is_cost_valid_uv) return 0; + av1_merge_rd_stats(rd_stats, rd_stats_uv); + } + + int choose_skip_txfm = rd_stats->skip_txfm; + if (!choose_skip_txfm && !xd->lossless[mbmi->segment_id]) { + const int64_t rdcost_no_skip_txfm = RDCOST( + x->rdmult, rd_stats_y->rate + rd_stats_uv->rate + skip_txfm_cost[0], + rd_stats->dist); + const int64_t rdcost_skip_txfm = + RDCOST(x->rdmult, skip_txfm_cost[1], rd_stats->sse); + if (rdcost_no_skip_txfm >= rdcost_skip_txfm) choose_skip_txfm = 1; + } + if (choose_skip_txfm) { + rd_stats_y->rate = 0; + rd_stats_uv->rate = 0; + rd_stats->rate = mode_rate + skip_txfm_cost[1]; + rd_stats->dist = 
rd_stats->sse; + rd_stats_y->dist = rd_stats_y->sse; + rd_stats_uv->dist = rd_stats_uv->sse; + mbmi->skip_txfm = 1; + if (rd_stats->skip_txfm) { + const int64_t tmprd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + if (tmprd > ref_best_rd) return 0; + } + } else { + rd_stats->rate += skip_txfm_cost[0]; + mbmi->skip_txfm = 0; + } + + return 1; +} diff --git a/third_party/aom/av1/encoder/tx_search.h b/third_party/aom/av1/encoder/tx_search.h new file mode 100644 index 0000000000..ed95c1cd98 --- /dev/null +++ b/third_party/aom/av1/encoder/tx_search.h @@ -0,0 +1,226 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_TRANSFORM_SEARCH_H_ +#define AOM_AV1_ENCODER_TRANSFORM_SEARCH_H_ + +#include "av1/common/pred_common.h" +#include "av1/encoder/encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Set this macro as 1 to collect data about tx size selection. +#define COLLECT_TX_SIZE_DATA 0 + +#if COLLECT_TX_SIZE_DATA +static const char av1_tx_size_data_output_file[] = "tx_size_data.txt"; +#endif + +enum { + FTXS_NONE = 0, + FTXS_DCT_AND_1D_DCT_ONLY = 1 << 0, + FTXS_DISABLE_TRELLIS_OPT = 1 << 1, + FTXS_USE_TRANSFORM_DOMAIN = 1 << 2 +} UENUM1BYTE(FAST_TX_SEARCH_MODE); + +static AOM_INLINE int tx_size_cost(const MACROBLOCK *const x, BLOCK_SIZE bsize, + TX_SIZE tx_size) { + assert(bsize == x->e_mbd.mi[0]->bsize); + if (x->txfm_search_params.tx_mode_search_type != TX_MODE_SELECT || + !block_signals_txsize(bsize)) + return 0; + + const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize); + const int depth = tx_size_to_depth(tx_size, bsize); + const MACROBLOCKD *const xd = &x->e_mbd; + const int tx_size_ctx = get_tx_size_context(xd); + return x->mode_costs.tx_size_cost[tx_size_cat][tx_size_ctx][depth]; +} + +/*!\brief Compute the pixel domain distortion. + * + * \ingroup transform_search + * Compute the pixel domain distortion from diff on all visible 4x4s in the + * transform block. + * + * \param[in] x Pointer to structure holding the data for the + current encoding macroblock + * \param[in] plane Plane index + * \param[in] blk_row Block row index + * \param[in] blk_col Block col index + * \param[in] plane_bsize Current plane block size + * \param[in] tx_bsize Transform size + * \param[in] block_mse_q8 Block mse + * \return An int64_t value that is the block sse. + */ +int64_t av1_pixel_diff_dist(const MACROBLOCK *x, int plane, int blk_row, + int blk_col, const BLOCK_SIZE plane_bsize, + const BLOCK_SIZE tx_bsize, + unsigned int *block_mse_q8); + +int64_t av1_estimate_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, + RD_STATS *rd_stats, int64_t ref_best_rd, + BLOCK_SIZE bs, TX_SIZE tx_size); + +/*!\brief Transform type search for luma macroblock with fixed transform size. + * + * \ingroup transform_search + * Search for the best transform type and return the transform coefficients RD + * cost of current luma macroblock with the given uniform transform size. 
+ *
+ * \param[in]    x              Pointer to structure holding the data for the
                                 current encoding macroblock
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    rd_stats       Pointer to struct to keep track of the RD stats
+ * \param[in]    ref_best_rd    Best RD cost seen for this block so far
+ * \param[in]    bs             Size of the current macroblock
+ * \param[in]    tx_size        The given transform size
+ * \param[in]    ftxs_mode      Transform search mode specifying desired speed
                                 and quality tradeoff
+ * \param[in]    skip_trellis   Binary flag indicating if trellis optimization
                                 should be skipped
+ * \return       An int64_t value that is the best RD cost found.
+ */
+int64_t av1_uniform_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+                             RD_STATS *rd_stats, int64_t ref_best_rd,
+                             BLOCK_SIZE bs, TX_SIZE tx_size,
+                             FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis);
+
+/*!\brief Recursive transform size and type search.
+ *
+ * \ingroup transform_search
+ * Search for the best transform size and type for luma inter blocks. The
+ * transform block partitioning can be recursive, resulting in non-uniform
+ * transform sizes. The best transform size and type, if found, will be saved
+ * in the MB_MODE_INFO structure, and the corresponding RD stats will be saved
+ * in rd_stats.
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    x              Pointer to structure holding the data for the
                                 current encoding macroblock
+ * \param[in]    rd_stats       Pointer to struct to keep track of the RD stats
+ * \param[in]    bsize          Current macroblock size
+ * \param[in]    ref_best_rd    Best RD cost seen for this block so far
+ * \remark Nothing is returned. The selected transform size and type will
           be saved in the MB_MODE_INFO structure
+ */
+void av1_pick_recursive_tx_size_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+                                         RD_STATS *rd_stats, BLOCK_SIZE bsize,
+                                         int64_t ref_best_rd);
+
+/*!\brief Uniform transform size and type search.
+ *
+ * \ingroup transform_search
+ * Search for the best transform size and type for the current macroblock,
+ * with the assumption that all the transform blocks have a uniform size
+ * (VP9 style). The selected transform size and type will be saved in the
+ * MB_MODE_INFO structure; the corresponding RD stats will be saved in rd_stats.
+ * This function may be used for both intra and inter predicted blocks.
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    x              Pointer to structure holding the data for the
                                 current encoding macroblock
+ * \param[in]    rd_stats       Pointer to struct to keep track of the RD stats
+ * \param[in]    bs             Current macroblock size
+ * \param[in]    ref_best_rd    Best RD cost seen for this block so far
+ * \remark Nothing is returned. The selected transform size and type will
           be saved in the MB_MODE_INFO structure
+ */
+void av1_pick_uniform_tx_size_type_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                       RD_STATS *rd_stats, BLOCK_SIZE bs,
+                                       int64_t ref_best_rd);
+
+/*!\brief Chroma block transform search.
+ *
+ * \ingroup transform_search
+ * Calculate the transform coefficient RD cost for the given chroma macroblock.
+ * If the current mode is intra, then this function will compute the predictor.
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    x              Pointer to structure holding the data for the
                                 current encoding macroblock
+ * \param[in]    rd_stats       Pointer to struct to keep track of the RD stats
+ * \param[in]    bsize          Current macroblock size
+ * \param[in]    ref_best_rd    Best RD cost seen for this block so far
+ * \return       An integer value is returned. 0: early termination triggered,
+                 no valid rd cost available; 1: rd cost values are valid.
+ */
+int av1_txfm_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats,
+                  BLOCK_SIZE bsize, int64_t ref_best_rd);
+
+/*!\brief Transform type search with fixed transform size.
+ *
+ * \ingroup transform_search
+ * Search for the best transform type and calculate the transform coefficients
+ * RD cost of the current transform block with the specified (uniform) transform
+ * size and plane. The RD results will be saved in rd_stats.
+ *
+ * \param[in]    x              Pointer to structure holding the data for the
                                 current encoding macroblock
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    rd_stats       Pointer to struct to keep track of the RD stats
+ * \param[in]    ref_best_rd    Best RD cost seen for this block so far
+ * \param[in]    current_rd     Current RD cost for this block so far
+ * \param[in]    plane          Plane index
+ * \param[in]    plane_bsize    Size of the current macroblock considering
                                 sub-sampling
+ * \param[in]    tx_size        The given transform size
+ * \param[in]    ftxs_mode      Transform search mode specifying desired speed
                                 and quality tradeoff
+ * \param[in]    skip_trellis   Binary flag indicating if trellis optimization
                                 should be skipped
+ *
+ * \remark Nothing is returned. The RD results will be saved in rd_stats.
+ */
+void av1_txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
+                          RD_STATS *rd_stats, int64_t ref_best_rd,
+                          int64_t current_rd, int plane, BLOCK_SIZE plane_bsize,
+                          TX_SIZE tx_size, FAST_TX_SEARCH_MODE ftxs_mode,
+                          int skip_trellis);
+
+/*!\brief Combined luma and chroma transform search.
+ *
+ * \ingroup transform_search
+ * This function combines the y and uv planes' transform search processes
+ * together for inter-predicted blocks (including IntraBC), when the prediction
+ * is already generated. It first does subtraction to obtain the prediction
+ * error. Then it calls
+ * av1_pick_recursive_tx_size_type_yrd/av1_pick_uniform_tx_size_type_yrd and
+ * av1_txfm_uvrd sequentially and handles possible early terminations.
+ * The RD metrics are calculated and stored in rd_stats/_y/_uv.
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    x              Pointer to structure holding the data for the
                                 current encoding macroblock
+ * \param[in]    bsize          Current macroblock size
+ * \param[in]    rd_stats       Pointer to struct to keep track of the overall
                                 RD stats
+ * \param[in]    rd_stats_y     Pointer to struct to keep track of the RD
                                 stats for the luma plane
+ * \param[in]    rd_stats_uv    Pointer to struct to keep track of the RD
                                 stats for the chroma planes
+ * \param[in]    mode_rate      Rate cost to encode the prediction mode info
                                 of the current macroblock
+ * \param[in]    ref_best_rd    Best RD cost seen for this block so far
+ *
+ * \return       An integer value is returned indicating if a valid transform
                  candidate is found (1) or not (0).
+ */
+int av1_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+                    RD_STATS *rd_stats, RD_STATS *rd_stats_y,
+                    RD_STATS *rd_stats_uv, int mode_rate, int64_t ref_best_rd);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_TRANSFORM_SEARCH_H_
diff --git a/third_party/aom/av1/encoder/txb_rdopt.c b/third_party/aom/av1/encoder/txb_rdopt.c
new file mode 100644
index 0000000000..e551e8aa12
--- /dev/null
+++ b/third_party/aom/av1/encoder/txb_rdopt.c
@@ -0,0 +1,659 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media.
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/encoder/txb_rdopt.h" +#include "av1/encoder/txb_rdopt_utils.h" + +#include "av1/common/idct.h" + +static INLINE void update_coeff_general( + int *accu_rate, int64_t *accu_dist, int si, int eob, TX_SIZE tx_size, + TX_CLASS tx_class, int bhl, int width, int64_t rdmult, int shift, + int dc_sign_ctx, const int16_t *dequant, const int16_t *scan, + const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels, + const qm_val_t *iqmatrix, const qm_val_t *qmatrix) { + const int dqv = get_dqv(dequant, scan[si], iqmatrix); + const int ci = scan[si]; + const tran_low_t qc = qcoeff[ci]; + const int is_last = si == (eob - 1); + const int coeff_ctx = get_lower_levels_ctx_general( + is_last, si, bhl, width, levels, ci, tx_size, tx_class); + if (qc == 0) { + *accu_rate += txb_costs->base_cost[coeff_ctx][0]; + } else { + const int sign = (qc < 0) ? 1 : 0; + const tran_low_t abs_qc = abs(qc); + const tran_low_t tqc = tcoeff[ci]; + const tran_low_t dqc = dqcoeff[ci]; + const int64_t dist = get_coeff_dist(tqc, dqc, shift, qmatrix, ci); + const int64_t dist0 = get_coeff_dist(tqc, 0, shift, qmatrix, ci); + const int rate = + get_coeff_cost_general(is_last, ci, abs_qc, sign, coeff_ctx, + dc_sign_ctx, txb_costs, bhl, tx_class, levels); + const int64_t rd = RDCOST(rdmult, rate, dist); + + tran_low_t qc_low, dqc_low; + tran_low_t abs_qc_low; + int64_t dist_low, rd_low; + int rate_low; + if (abs_qc == 1) { + abs_qc_low = qc_low = dqc_low = 0; + dist_low = dist0; + rate_low = txb_costs->base_cost[coeff_ctx][0]; + } else { + get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low); + abs_qc_low = abs_qc - 1; + dist_low = get_coeff_dist(tqc, dqc_low, shift, qmatrix, ci); + rate_low = + get_coeff_cost_general(is_last, ci, abs_qc_low, sign, coeff_ctx, + dc_sign_ctx, txb_costs, bhl, tx_class, levels); + } + + rd_low = RDCOST(rdmult, rate_low, dist_low); + if (rd_low < rd) { + qcoeff[ci] = qc_low; + dqcoeff[ci] = dqc_low; + levels[get_padded_idx(ci, bhl)] = AOMMIN(abs_qc_low, INT8_MAX); + *accu_rate += rate_low; + *accu_dist += dist_low - dist0; + } else { + *accu_rate += rate; + *accu_dist += dist - dist0; + } + } +} + +static AOM_FORCE_INLINE void update_coeff_simple( + int *accu_rate, int si, int eob, TX_SIZE tx_size, TX_CLASS tx_class, + int bhl, int64_t rdmult, int shift, const int16_t *dequant, + const int16_t *scan, const LV_MAP_COEFF_COST *txb_costs, + const tran_low_t *tcoeff, tran_low_t *qcoeff, tran_low_t *dqcoeff, + uint8_t *levels, const qm_val_t *iqmatrix, const qm_val_t *qmatrix) { + const int dqv = get_dqv(dequant, scan[si], iqmatrix); + (void)eob; + // this simple version assumes the coeff's scan_idx is not DC (scan_idx != 0) + // and not the last (scan_idx != eob - 1) + assert(si != eob - 1); + assert(si > 0); + const int ci = scan[si]; + const tran_low_t qc = qcoeff[ci]; + const int coeff_ctx = + get_lower_levels_ctx(levels, ci, bhl, tx_size, tx_class); + if (qc == 0) { + *accu_rate += txb_costs->base_cost[coeff_ctx][0]; + } else { + const tran_low_t 
abs_qc = abs(qc); + const tran_low_t abs_tqc = abs(tcoeff[ci]); + const tran_low_t abs_dqc = abs(dqcoeff[ci]); + int rate_low = 0; + const int rate = get_two_coeff_cost_simple( + ci, abs_qc, coeff_ctx, txb_costs, bhl, tx_class, levels, &rate_low); + if (abs_dqc < abs_tqc) { + *accu_rate += rate; + return; + } + + const int64_t dist = get_coeff_dist(abs_tqc, abs_dqc, shift, qmatrix, ci); + const int64_t rd = RDCOST(rdmult, rate, dist); + + const tran_low_t abs_qc_low = abs_qc - 1; + const tran_low_t abs_dqc_low = (abs_qc_low * dqv) >> shift; + const int64_t dist_low = + get_coeff_dist(abs_tqc, abs_dqc_low, shift, qmatrix, ci); + const int64_t rd_low = RDCOST(rdmult, rate_low, dist_low); + + if (rd_low < rd) { + const int sign = (qc < 0) ? 1 : 0; + qcoeff[ci] = (-sign ^ abs_qc_low) + sign; + dqcoeff[ci] = (-sign ^ abs_dqc_low) + sign; + levels[get_padded_idx(ci, bhl)] = AOMMIN(abs_qc_low, INT8_MAX); + *accu_rate += rate_low; + } else { + *accu_rate += rate; + } + } +} + +static AOM_FORCE_INLINE void update_coeff_eob( + int *accu_rate, int64_t *accu_dist, int *eob, int *nz_num, int *nz_ci, + int si, TX_SIZE tx_size, TX_CLASS tx_class, int bhl, int width, + int dc_sign_ctx, int64_t rdmult, int shift, const int16_t *dequant, + const int16_t *scan, const LV_MAP_EOB_COST *txb_eob_costs, + const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels, int sharpness, + const qm_val_t *iqmatrix, const qm_val_t *qmatrix) { + const int dqv = get_dqv(dequant, scan[si], iqmatrix); + assert(si != *eob - 1); + const int ci = scan[si]; + const tran_low_t qc = qcoeff[ci]; + const int coeff_ctx = + get_lower_levels_ctx(levels, ci, bhl, tx_size, tx_class); + if (qc == 0) { + *accu_rate += txb_costs->base_cost[coeff_ctx][0]; + } else { + int lower_level = 0; + const tran_low_t abs_qc = abs(qc); + const tran_low_t tqc = tcoeff[ci]; + const tran_low_t dqc = dqcoeff[ci]; + const int sign = (qc < 0) ? 
1 : 0; + const int64_t dist0 = get_coeff_dist(tqc, 0, shift, qmatrix, ci); + int64_t dist = get_coeff_dist(tqc, dqc, shift, qmatrix, ci) - dist0; + int rate = + get_coeff_cost_general(0, ci, abs_qc, sign, coeff_ctx, dc_sign_ctx, + txb_costs, bhl, tx_class, levels); + int64_t rd = RDCOST(rdmult, *accu_rate + rate, *accu_dist + dist); + + tran_low_t qc_low, dqc_low; + tran_low_t abs_qc_low; + int64_t dist_low, rd_low; + int rate_low; + + if (abs_qc == 1) { + abs_qc_low = 0; + dqc_low = qc_low = 0; + dist_low = 0; + rate_low = txb_costs->base_cost[coeff_ctx][0]; + rd_low = RDCOST(rdmult, *accu_rate + rate_low, *accu_dist); + } else { + get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low); + abs_qc_low = abs_qc - 1; + dist_low = get_coeff_dist(tqc, dqc_low, shift, qmatrix, ci) - dist0; + rate_low = + get_coeff_cost_general(0, ci, abs_qc_low, sign, coeff_ctx, + dc_sign_ctx, txb_costs, bhl, tx_class, levels); + rd_low = RDCOST(rdmult, *accu_rate + rate_low, *accu_dist + dist_low); + } + + int lower_level_new_eob = 0; + const int new_eob = si + 1; + const int coeff_ctx_new_eob = get_lower_levels_ctx_eob(bhl, width, si); + const int new_eob_cost = + get_eob_cost(new_eob, txb_eob_costs, txb_costs, tx_class); + int rate_coeff_eob = + new_eob_cost + get_coeff_cost_eob(ci, abs_qc, sign, coeff_ctx_new_eob, + dc_sign_ctx, txb_costs, bhl, + tx_class); + int64_t dist_new_eob = dist; + int64_t rd_new_eob = RDCOST(rdmult, rate_coeff_eob, dist_new_eob); + + if (abs_qc_low > 0) { + const int rate_coeff_eob_low = + new_eob_cost + get_coeff_cost_eob(ci, abs_qc_low, sign, + coeff_ctx_new_eob, dc_sign_ctx, + txb_costs, bhl, tx_class); + const int64_t dist_new_eob_low = dist_low; + const int64_t rd_new_eob_low = + RDCOST(rdmult, rate_coeff_eob_low, dist_new_eob_low); + if (rd_new_eob_low < rd_new_eob) { + lower_level_new_eob = 1; + rd_new_eob = rd_new_eob_low; + rate_coeff_eob = rate_coeff_eob_low; + dist_new_eob = dist_new_eob_low; + } + } + + if (sharpness == 0 || abs_qc > 1) { + if (rd_low < rd) { + lower_level = 1; + rd = rd_low; + rate = rate_low; + dist = dist_low; + } + } + + if (sharpness == 0 && rd_new_eob < rd) { + for (int ni = 0; ni < *nz_num; ++ni) { + int last_ci = nz_ci[ni]; + levels[get_padded_idx(last_ci, bhl)] = 0; + qcoeff[last_ci] = 0; + dqcoeff[last_ci] = 0; + } + *eob = new_eob; + *nz_num = 0; + *accu_rate = rate_coeff_eob; + *accu_dist = dist_new_eob; + lower_level = lower_level_new_eob; + } else { + *accu_rate += rate; + *accu_dist += dist; + } + + if (lower_level) { + qcoeff[ci] = qc_low; + dqcoeff[ci] = dqc_low; + levels[get_padded_idx(ci, bhl)] = AOMMIN(abs_qc_low, INT8_MAX); + } + if (qcoeff[ci]) { + nz_ci[*nz_num] = ci; + ++*nz_num; + } + } +} + +static INLINE void update_skip(int *accu_rate, int64_t accu_dist, int *eob, + int nz_num, int *nz_ci, int64_t rdmult, + int skip_cost, int non_skip_cost, + tran_low_t *qcoeff, tran_low_t *dqcoeff) { + const int64_t rd = RDCOST(rdmult, *accu_rate + non_skip_cost, accu_dist); + const int64_t rd_new_eob = RDCOST(rdmult, skip_cost, 0); + if (rd_new_eob < rd) { + for (int i = 0; i < nz_num; ++i) { + const int ci = nz_ci[i]; + qcoeff[ci] = 0; + dqcoeff[ci] = 0; + // no need to set up levels because this is the last step + // levels[get_padded_idx(ci, bhl)] = 0; + } + *accu_rate = 0; + *eob = 0; + } +} + +// TODO(angiebird): use this function whenever it's possible +static int get_tx_type_cost(const MACROBLOCK *x, const MACROBLOCKD *xd, + int plane, TX_SIZE tx_size, TX_TYPE tx_type, + int reduced_tx_set_used) { + if (plane > 0) return 0; + + 
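+  // (Editorial note, added for clarity) A cost of 0 for chroma reflects that
+  // AV1 signals an explicit tx_type only for the luma plane; chroma derives
+  // its transform type, so there are no tx_type bits to count here.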
const TX_SIZE square_tx_size = txsize_sqr_map[tx_size]; + + const MB_MODE_INFO *mbmi = xd->mi[0]; + const int is_inter = is_inter_block(mbmi); + if (get_ext_tx_types(tx_size, is_inter, reduced_tx_set_used) > 1 && + !xd->lossless[xd->mi[0]->segment_id]) { + const int ext_tx_set = + get_ext_tx_set(tx_size, is_inter, reduced_tx_set_used); + if (is_inter) { + if (ext_tx_set > 0) + return x->mode_costs + .inter_tx_type_costs[ext_tx_set][square_tx_size][tx_type]; + } else { + if (ext_tx_set > 0) { + PREDICTION_MODE intra_dir; + if (mbmi->filter_intra_mode_info.use_filter_intra) + intra_dir = fimode_to_intradir[mbmi->filter_intra_mode_info + .filter_intra_mode]; + else + intra_dir = mbmi->mode; + return x->mode_costs.intra_tx_type_costs[ext_tx_set][square_tx_size] + [intra_dir][tx_type]; + } + } + } + return 0; +} + +int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane, + int block, TX_SIZE tx_size, TX_TYPE tx_type, + const TXB_CTX *const txb_ctx, int *rate_cost, + int sharpness) { + MACROBLOCKD *xd = &x->e_mbd; + const struct macroblock_plane *p = &x->plane[plane]; + const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type); + const int16_t *scan = scan_order->scan; + const int shift = av1_get_tx_scale(tx_size); + int eob = p->eobs[block]; + const int16_t *dequant = p->dequant_QTX; + const qm_val_t *iqmatrix = + av1_get_iqmatrix(&cpi->common.quant_params, xd, plane, tx_size, tx_type); + const qm_val_t *qmatrix = + cpi->oxcf.tune_cfg.dist_metric == AOM_DIST_METRIC_QM_PSNR + ? av1_get_qmatrix(&cpi->common.quant_params, xd, plane, tx_size, + tx_type) + : NULL; + const int block_offset = BLOCK_OFFSET(block); + tran_low_t *qcoeff = p->qcoeff + block_offset; + tran_low_t *dqcoeff = p->dqcoeff + block_offset; + const tran_low_t *tcoeff = p->coeff + block_offset; + const CoeffCosts *coeff_costs = &x->coeff_costs; + + // This function is not called if eob = 0. 
+  assert(eob > 0);
+
+  const AV1_COMMON *cm = &cpi->common;
+  const PLANE_TYPE plane_type = get_plane_type(plane);
+  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+  const TX_CLASS tx_class = tx_type_to_class[tx_type];
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+  const int bhl = get_txb_bhl(tx_size);
+  const int width = get_txb_wide(tx_size);
+  const int height = get_txb_high(tx_size);
+  assert(height == (1 << bhl));
+  const int is_inter = is_inter_block(mbmi);
+  const LV_MAP_COEFF_COST *txb_costs =
+      &coeff_costs->coeff_costs[txs_ctx][plane_type];
+  const int eob_multi_size = txsize_log2_minus4[tx_size];
+  const LV_MAP_EOB_COST *txb_eob_costs =
+      &coeff_costs->eob_costs[eob_multi_size][plane_type];
+
+  const int rshift = 2;
+
+  const int64_t rdmult =
+      (((int64_t)x->rdmult *
+        (plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8)))) +
+       2) >>
+      rshift;
+
+  uint8_t levels_buf[TX_PAD_2D];
+  uint8_t *const levels = set_levels(levels_buf, height);
+
+  if (eob > 1) av1_txb_init_levels(qcoeff, width, height, levels);
+
+  // TODO(angiebird): check iqmatrix
+
+  const int non_skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][0];
+  const int skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
+  const int eob_cost = get_eob_cost(eob, txb_eob_costs, txb_costs, tx_class);
+  int accu_rate = eob_cost;
+  int64_t accu_dist = 0;
+  int si = eob - 1;
+  const int ci = scan[si];
+  const tran_low_t qc = qcoeff[ci];
+  const tran_low_t abs_qc = abs(qc);
+  const int sign = qc < 0;
+  const int max_nz_num = 2;
+  int nz_num = 1;
+  int nz_ci[3] = { ci, 0, 0 };
+  if (abs_qc >= 2) {
+    update_coeff_general(&accu_rate, &accu_dist, si, eob, tx_size, tx_class,
+                         bhl, width, rdmult, shift, txb_ctx->dc_sign_ctx,
+                         dequant, scan, txb_costs, tcoeff, qcoeff, dqcoeff,
+                         levels, iqmatrix, qmatrix);
+    --si;
+  } else {
+    assert(abs_qc == 1);
+    const int coeff_ctx = get_lower_levels_ctx_eob(bhl, width, si);
+    accu_rate +=
+        get_coeff_cost_eob(ci, abs_qc, sign, coeff_ctx, txb_ctx->dc_sign_ctx,
+                           txb_costs, bhl, tx_class);
+    const tran_low_t tqc = tcoeff[ci];
+    const tran_low_t dqc = dqcoeff[ci];
+    const int64_t dist = get_coeff_dist(tqc, dqc, shift, qmatrix, ci);
+    const int64_t dist0 = get_coeff_dist(tqc, 0, shift, qmatrix, ci);
+    accu_dist += dist - dist0;
+    --si;
+  }
+
+#define UPDATE_COEFF_EOB_CASE(tx_class_literal)                            \
+  case tx_class_literal:                                                   \
+    for (; si >= 0 && nz_num <= max_nz_num; --si) {                        \
+      update_coeff_eob(&accu_rate, &accu_dist, &eob, &nz_num, nz_ci, si,   \
+                       tx_size, tx_class_literal, bhl, width,              \
+                       txb_ctx->dc_sign_ctx, rdmult, shift, dequant, scan, \
+                       txb_eob_costs, txb_costs, tcoeff, qcoeff, dqcoeff,  \
+                       levels, sharpness, iqmatrix, qmatrix);              \
+    }                                                                      \
+    break
+  switch (tx_class) {
+    UPDATE_COEFF_EOB_CASE(TX_CLASS_2D);
+    UPDATE_COEFF_EOB_CASE(TX_CLASS_HORIZ);
+    UPDATE_COEFF_EOB_CASE(TX_CLASS_VERT);
+#undef UPDATE_COEFF_EOB_CASE
+    default: assert(false);
+  }
+
+  if (si == -1 && nz_num <= max_nz_num && sharpness == 0) {
+    update_skip(&accu_rate, accu_dist, &eob, nz_num, nz_ci, rdmult, skip_cost,
+                non_skip_cost, qcoeff, dqcoeff);
+  }
+
+#define UPDATE_COEFF_SIMPLE_CASE(tx_class_literal)                             \
+  case tx_class_literal:                                                       \
+    for (; si >= 1; --si) {                                                    \
+      update_coeff_simple(&accu_rate, si, eob, tx_size, tx_class_literal, bhl, \
+                          rdmult, shift, dequant, scan, txb_costs, tcoeff,     \
+                          qcoeff, dqcoeff, levels, iqmatrix, qmatrix);         \
+    }                                                                          \
+    break
+  switch (tx_class) {
+    UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_2D);
+    UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_HORIZ);
+    UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_VERT);
+#undef
UPDATE_COEFF_SIMPLE_CASE + default: assert(false); + } + + // DC position + if (si == 0) { + // no need to update accu_dist because it's not used after this point + int64_t dummy_dist = 0; + update_coeff_general(&accu_rate, &dummy_dist, si, eob, tx_size, tx_class, + bhl, width, rdmult, shift, txb_ctx->dc_sign_ctx, + dequant, scan, txb_costs, tcoeff, qcoeff, dqcoeff, + levels, iqmatrix, qmatrix); + } + + const int tx_type_cost = get_tx_type_cost(x, xd, plane, tx_size, tx_type, + cm->features.reduced_tx_set_used); + if (eob == 0) + accu_rate += skip_cost; + else + accu_rate += non_skip_cost + tx_type_cost; + + p->eobs[block] = eob; + p->txb_entropy_ctx[block] = + av1_get_txb_entropy_context(qcoeff, scan_order, p->eobs[block]); + + *rate_cost = accu_rate; + return eob; +} + +static AOM_FORCE_INLINE int warehouse_efficients_txb( + const MACROBLOCK *x, const int plane, const int block, + const TX_SIZE tx_size, const TXB_CTX *const txb_ctx, + const struct macroblock_plane *p, const int eob, + const PLANE_TYPE plane_type, const LV_MAP_COEFF_COST *const coeff_costs, + const MACROBLOCKD *const xd, const TX_TYPE tx_type, const TX_CLASS tx_class, + int reduced_tx_set_used) { + const tran_low_t *const qcoeff = p->qcoeff + BLOCK_OFFSET(block); + const int txb_skip_ctx = txb_ctx->txb_skip_ctx; + const int bhl = get_txb_bhl(tx_size); + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); + const int16_t *const scan = scan_order->scan; + uint8_t levels_buf[TX_PAD_2D]; + uint8_t *const levels = set_levels(levels_buf, height); + DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]); + const int eob_multi_size = txsize_log2_minus4[tx_size]; + const LV_MAP_EOB_COST *const eob_costs = + &x->coeff_costs.eob_costs[eob_multi_size][plane_type]; + int cost = coeff_costs->txb_skip_cost[txb_skip_ctx][0]; + + av1_txb_init_levels(qcoeff, width, height, levels); + + cost += get_tx_type_cost(x, xd, plane, tx_size, tx_type, reduced_tx_set_used); + + cost += get_eob_cost(eob, eob_costs, coeff_costs, tx_class); + + av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts); + + const int(*lps_cost)[COEFF_BASE_RANGE + 1 + COEFF_BASE_RANGE + 1] = + coeff_costs->lps_cost; + int c = eob - 1; + { + const int pos = scan[c]; + const tran_low_t v = qcoeff[pos]; + const int sign = AOMSIGN(v); + const int level = (v ^ sign) - sign; + const int coeff_ctx = coeff_contexts[pos]; + cost += coeff_costs->base_eob_cost[coeff_ctx][AOMMIN(level, 3) - 1]; + + if (v) { + // sign bit cost + if (level > NUM_BASE_LEVELS) { + const int ctx = get_br_ctx_eob(pos, bhl, tx_class); + cost += get_br_cost(level, lps_cost[ctx]); + } + if (c) { + cost += av1_cost_literal(1); + } else { + const int sign01 = (sign ^ sign) - sign; + const int dc_sign_ctx = txb_ctx->dc_sign_ctx; + cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign01]; + return cost; + } + } + } + const int(*base_cost)[8] = coeff_costs->base_cost; + for (c = eob - 2; c >= 1; --c) { + const int pos = scan[c]; + const int coeff_ctx = coeff_contexts[pos]; + const tran_low_t v = qcoeff[pos]; + const int level = abs(v); + cost += base_cost[coeff_ctx][AOMMIN(level, 3)]; + if (v) { + // sign bit cost + cost += av1_cost_literal(1); + if (level > NUM_BASE_LEVELS) { + const int ctx = get_br_ctx(levels, pos, bhl, tx_class); + cost += get_br_cost(level, lps_cost[ctx]); + } + } + } + // c == 0 after previous loop + { + const int pos = scan[c]; + const tran_low_t v = qcoeff[pos]; + const 
int coeff_ctx = coeff_contexts[pos];
+    const int sign = AOMSIGN(v);
+    const int level = (v ^ sign) - sign;
+    cost += base_cost[coeff_ctx][AOMMIN(level, 3)];
+
+    if (v) {
+      // sign bit cost
+      const int sign01 = (sign ^ sign) - sign;
+      const int dc_sign_ctx = txb_ctx->dc_sign_ctx;
+      cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign01];
+      if (level > NUM_BASE_LEVELS) {
+        const int ctx = get_br_ctx(levels, pos, bhl, tx_class);
+        cost += get_br_cost(level, lps_cost[ctx]);
+      }
+    }
+  }
+  return cost;
+}
+
+int av1_cost_coeffs_txb_estimate(const MACROBLOCK *x, const int plane,
+                                 const int block, const TX_SIZE tx_size,
+                                 const TX_TYPE tx_type) {
+  assert(plane == 0);
+
+  int cost = 0;
+  const struct macroblock_plane *p = &x->plane[plane];
+  const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type);
+  const int16_t *scan = scan_order->scan;
+  tran_low_t *qcoeff = p->qcoeff + BLOCK_OFFSET(block);
+
+  int eob = p->eobs[block];
+
+  // coeffs
+  int c = eob - 1;
+  // eob
+  {
+    const int pos = scan[c];
+    const tran_low_t v = abs(qcoeff[pos]) - 1;
+    cost += (v << (AV1_PROB_COST_SHIFT + 2));
+  }
+  // other coeffs
+  for (c = eob - 2; c >= 0; c--) {
+    const int pos = scan[c];
+    const tran_low_t v = abs(qcoeff[pos]);
+    const int idx = AOMMIN(v, 14);
+
+    cost += costLUT[idx];
+  }
+
+  // const_term is not counted for the DC coefficient and the log2(e) term is
+  // not counted for the eob coefficient, so each applies to (eob - 1)
+  // coefficients.
+  cost += (const_term + loge_par) * (eob - 1);
+
+  return cost;
+}
+
+static AOM_FORCE_INLINE int warehouse_efficients_txb_laplacian(
+    const MACROBLOCK *x, const int plane, const int block,
+    const TX_SIZE tx_size, const TXB_CTX *const txb_ctx, const int eob,
+    const PLANE_TYPE plane_type, const LV_MAP_COEFF_COST *const coeff_costs,
+    const MACROBLOCKD *const xd, const TX_TYPE tx_type, const TX_CLASS tx_class,
+    int reduced_tx_set_used) {
+  const int txb_skip_ctx = txb_ctx->txb_skip_ctx;
+
+  const int eob_multi_size = txsize_log2_minus4[tx_size];
+  const LV_MAP_EOB_COST *const eob_costs =
+      &x->coeff_costs.eob_costs[eob_multi_size][plane_type];
+  int cost = coeff_costs->txb_skip_cost[txb_skip_ctx][0];
+
+  cost += get_tx_type_cost(x, xd, plane, tx_size, tx_type, reduced_tx_set_used);
+
+  cost += get_eob_cost(eob, eob_costs, coeff_costs, tx_class);
+
+  cost += av1_cost_coeffs_txb_estimate(x, plane, block, tx_size, tx_type);
+  return cost;
+}
+
+int av1_cost_coeffs_txb(const MACROBLOCK *x, const int plane, const int block,
+                        const TX_SIZE tx_size, const TX_TYPE tx_type,
+                        const TXB_CTX *const txb_ctx, int reduced_tx_set_used) {
+  const struct macroblock_plane *p = &x->plane[plane];
+  const int eob = p->eobs[block];
+  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+  const PLANE_TYPE plane_type = get_plane_type(plane);
+  const LV_MAP_COEFF_COST *const coeff_costs =
+      &x->coeff_costs.coeff_costs[txs_ctx][plane_type];
+  if (eob == 0) {
+    return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
+  }
+
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const TX_CLASS tx_class = tx_type_to_class[tx_type];
+
+  return warehouse_efficients_txb(x, plane, block, tx_size, txb_ctx, p, eob,
+                                  plane_type, coeff_costs, xd, tx_type,
+                                  tx_class, reduced_tx_set_used);
+}
+
+int av1_cost_coeffs_txb_laplacian(const MACROBLOCK *x, const int plane,
+                                  const int block, const TX_SIZE tx_size,
+                                  const TX_TYPE tx_type,
+                                  const TXB_CTX *const txb_ctx,
+                                  const int reduced_tx_set_used,
+                                  const int adjust_eob) {
+  const struct macroblock_plane *p = &x->plane[plane];
+  int eob = p->eobs[block];
+
+  if (adjust_eob) {
+    const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type);
+    const int16_t *scan = scan_order->scan;
+    tran_low_t *tcoeff = p->coeff + BLOCK_OFFSET(block);
+    tran_low_t *qcoeff = p->qcoeff + BLOCK_OFFSET(block);
+    tran_low_t *dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
+    update_coeff_eob_fast(&eob, av1_get_tx_scale(tx_size), p->dequant_QTX, scan,
+                          tcoeff, qcoeff, dqcoeff);
+    p->eobs[block] = eob;
+  }
+
+  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+  const PLANE_TYPE plane_type = get_plane_type(plane);
+  const LV_MAP_COEFF_COST *const coeff_costs =
+      &x->coeff_costs.coeff_costs[txs_ctx][plane_type];
+  if (eob == 0) {
+    return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
+  }
+
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const TX_CLASS tx_class = tx_type_to_class[tx_type];
+
+  return warehouse_efficients_txb_laplacian(
+      x, plane, block, tx_size, txb_ctx, eob, plane_type, coeff_costs, xd,
+      tx_type, tx_class, reduced_tx_set_used);
+}
diff --git a/third_party/aom/av1/encoder/txb_rdopt.h b/third_party/aom/av1/encoder/txb_rdopt.h
new file mode 100644
index 0000000000..70b322a2e1
--- /dev/null
+++ b/third_party/aom/av1/encoder/txb_rdopt.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TXB_RDOPT_H_
+#define AOM_AV1_ENCODER_TXB_RDOPT_H_
+
+#include "av1/common/blockd.h"
+#include "av1/common/txb_common.h"
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\brief Adjust the magnitude of quantized coefficients to achieve better
+ * rate-distortion (RD) trade-off.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function goes through each coefficient and greedily chooses whether to
+ * lower the coefficient magnitude by 1, based on the RD score.
+ *
+ * The coefficients are processed in reverse scan order.
+ *
+ * Note that the end of block position (eob) may change if the original last
+ * coefficient is lowered to zero.
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    x              Pointer to structure holding the data for the
+                                current encoding macroblock
+ * \param[in]    plane          The index of the current plane
+ * \param[in]    block          The index of the current transform block in the
+ *                              macroblock
+ * \param[in]    tx_size        The transform size
+ * \param[in]    tx_type        The transform type
+ * \param[in]    txb_ctx        Context info for entropy coding transform block
+ * skip flag (tx_skip) and the sign of DC coefficient (dc_sign).
+ * \param[out]   rate_cost      The entropy cost of coding the transform block
+ * after adjustment of coefficients.
+ * \param[in]    sharpness      When sharpness > 0, the function will be less
+ * aggressive towards lowering the magnitude of coefficients.
+ * In this way, the transform block will contain more high-frequency
+ * coefficients and therefore will preserve the sharpness of the reconstructed
+ * block.
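+ *
+ * (Editorial sketch, not upstream documentation) In essence, for each nonzero
+ * coefficient qc the optimizer compares two rate-distortion scores and keeps
+ * the cheaper option; see update_coeff_general() in txb_rdopt.c:
+ *
+ *   rd     = RDCOST(rdmult, rate(|qc|),     dist(|qc|))
+ *   rd_low = RDCOST(rdmult, rate(|qc| - 1), dist(|qc| - 1))
+ *   if (rd_low < rd) lower the magnitude of qc by 1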
+ */
+int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
+                     int block, TX_SIZE tx_size, TX_TYPE tx_type,
+                     const TXB_CTX *const txb_ctx, int *rate_cost,
+                     int sharpness);
+
+/*!\brief Compute the entropy cost of coding coefficients in a transform block.
+ *
+ * \ingroup coefficient_coding
+ *
+ * \param[in]    x                    Pointer to structure holding the data for
+                                      the current encoding macroblock.
+ * \param[in]    plane                The index of the current plane.
+ * \param[in]    block                The index of the current transform block
+                                      in the
+ * macroblock. It's defined by the number of 4x4 units that have been coded
+ * before the current transform block.
+ * \param[in]    tx_size              The transform size.
+ * \param[in]    tx_type              The transform type.
+ * \param[in]    txb_ctx              Context info for entropy coding transform
+ * block skip flag (tx_skip) and the sign of DC coefficient (dc_sign).
+ * \param[in]    reduced_tx_set_used  Whether the transform type is chosen from
+ * a reduced set.
+ */
+int av1_cost_coeffs_txb(const MACROBLOCK *x, const int plane, const int block,
+                        const TX_SIZE tx_size, const TX_TYPE tx_type,
+                        const TXB_CTX *const txb_ctx, int reduced_tx_set_used);
+
+/*!\brief Estimate the entropy cost of coding a transform block using Laplacian
+ * distribution.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function computes the entropy costs of the end of block position (eob)
+ * and the transform type (tx_type) precisely.
+ *
+ * Then \ref av1_cost_coeffs_txb_estimate is used to estimate the entropy
+ * costs of the coefficients in the transform block.
+ *
+ * In the end, the function returns the sum of entropy costs of end of block
+ * position (eob), transform type (tx_type) and coefficients.
+ *
+ * Compared to \ref av1_cost_coeffs_txb, this function is much faster but less
+ * accurate.
+ *
+ * \param[in]    x              Pointer to structure holding the data for the
+                                current encoding macroblock
+ * \param[in]    plane          The index of the current plane
+ * \param[in]    block          The index of the current transform block in the
+ * macroblock. It's defined by the number of 4x4 units that have been coded
+ * before the current transform block
+ * \param[in]    tx_size        The transform size
+ * \param[in]    tx_type        The transform type
+ * \param[in]    txb_ctx        Context info for entropy coding transform block
+ * skip flag (tx_skip) and the sign of DC coefficient (dc_sign).
+ * \param[in]    reduced_tx_set_used  Whether the transform type is chosen from
+ * a reduced set.
+ * \param[in]    adjust_eob     Whether to adjust the end of block position
+                                (eob) or not.
+ * \return       int            Estimated entropy cost of coding the transform
+                                block.
+ */
+int av1_cost_coeffs_txb_laplacian(const MACROBLOCK *x, const int plane,
+                                  const int block, const TX_SIZE tx_size,
+                                  const TX_TYPE tx_type,
+                                  const TXB_CTX *const txb_ctx,
+                                  const int reduced_tx_set_used,
+                                  const int adjust_eob);
+
+/*!\brief Estimate the entropy cost of transform coefficients using Laplacian
+ * distribution.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function assumes each transform coefficient is of its own Laplacian
+ * distribution and the coefficient is the only observation of the Laplacian
+ * distribution.
+ *
+ * Based on that, each coefficient's coding cost can be estimated by computing
+ * the entropy of the corresponding Laplacian distribution.
+ *
+ * This function then returns the sum of the estimated entropy cost for all
+ * coefficients in the transform block.
+ *
+ * Note that the entropy costs of the end of block (eob) and transform type
+ * (tx_type) are not included.
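+ *
+ * (Editorial note) For a zero-mean Laplacian density
+ * f(x) = exp(-|x| / b) / (2 * b), a single observation x gives the
+ * maximum-likelihood scale b = |x|, and the differential entropy is
+ * log2(2 * e * b) bits; this is one plausible reading of how the per-level
+ * costLUT[] table in txb_rdopt_utils.h was derived.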
+ *
+ * \param[in]    x              Pointer to structure holding the data for the
+                                current encoding macroblock
+ * \param[in]    plane          The index of the current plane
+ * \param[in]    block          The index of the current transform block in the
+ * macroblock. It's defined by the number of 4x4 units that have been coded
+ * before the current transform block
+ * \param[in]    tx_size        The transform size
+ * \param[in]    tx_type        The transform type
+ * \return       int            Estimated entropy cost of coefficients in the
+ *                              transform block.
+ */
+int av1_cost_coeffs_txb_estimate(const MACROBLOCK *x, const int plane,
+                                 const int block, const TX_SIZE tx_size,
+                                 const TX_TYPE tx_type);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // AOM_AV1_ENCODER_TXB_RDOPT_H_
diff --git a/third_party/aom/av1/encoder/txb_rdopt_utils.h b/third_party/aom/av1/encoder/txb_rdopt_utils.h
new file mode 100644
index 0000000000..b9f08aacf0
--- /dev/null
+++ b/third_party/aom/av1/encoder/txb_rdopt_utils.h
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TXB_RDOPT_UTILS_H_
+#define AOM_AV1_ENCODER_TXB_RDOPT_UTILS_H_
+
+#include "av1/encoder/encodetxb.h"
+
+static const int golomb_bits_cost[32] = {
+  0,       512,     512 * 3, 512 * 3, 512 * 5, 512 * 5, 512 * 5, 512 * 5,
+  512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7,
+  512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9,
+  512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9
+};
+
+static const int golomb_cost_diff[32] = {
+  0,       512, 512 * 2, 0, 512 * 2, 0, 0, 0, 512 * 2, 0, 0, 0, 0, 0, 0, 0,
+  512 * 2, 0,   0,       0, 0,       0, 0, 0, 0,       0, 0, 0, 0, 0, 0, 0
+};
+
+// Lookup table of the individual cost of a coefficient by its quantization
+// level, determined based on a Laplacian distribution conditioned on the
+// estimated context.
+static const int costLUT[15] = { -1143, 53,   545,  825,  1031,
+                                 1209,  1393, 1577, 1763, 1947,
+                                 2132,  2317, 2501, 2686, 2871 };
+
+static const int const_term = (1 << AV1_PROB_COST_SHIFT);
+
+static const int loge_par = ((14427 << AV1_PROB_COST_SHIFT) + 5000) / 10000;
+
+static INLINE int get_dqv(const int16_t *dequant, int coeff_idx,
+                          const qm_val_t *iqmatrix) {
+  int dqv = dequant[!!coeff_idx];
+  if (iqmatrix != NULL)
+    dqv =
+        ((iqmatrix[coeff_idx] * dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+  return dqv;
+}
+
+static INLINE int64_t get_coeff_dist(tran_low_t tcoeff, tran_low_t dqcoeff,
+                                     int shift, const qm_val_t *qmatrix,
+                                     int coeff_idx) {
+  int64_t diff = (tcoeff - dqcoeff) * (1 << shift);
+  if (qmatrix == NULL) {
+    return diff * diff;
+  }
+  // When AOM_DIST_METRIC_QM_PSNR is enabled, this mirrors the rate-distortion
+  // computation done in av1_block_error_qm, improving visual quality.
+  // The maximum value of `shift` is 2, `tcoeff` and `dqcoeff` are at most 22
+  // bits, and AOM_QM_BITS is 5, so `diff` should fit in 29-bits. The
+  // multiplication `diff * diff` then does not risk overflowing.
+ diff *= qmatrix[coeff_idx]; + const int64_t error = + (diff * diff + (1 << (2 * AOM_QM_BITS - 1))) >> (2 * AOM_QM_BITS); + return error; +} + +static int get_eob_cost(int eob, const LV_MAP_EOB_COST *txb_eob_costs, + const LV_MAP_COEFF_COST *txb_costs, TX_CLASS tx_class) { + int eob_extra; + const int eob_pt = av1_get_eob_pos_token(eob, &eob_extra); + int eob_cost = 0; + const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1; + eob_cost = txb_eob_costs->eob_cost[eob_multi_ctx][eob_pt - 1]; + + if (av1_eob_offset_bits[eob_pt] > 0) { + const int eob_ctx = eob_pt - 3; + const int eob_shift = av1_eob_offset_bits[eob_pt] - 1; + const int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0; + eob_cost += txb_costs->eob_extra_cost[eob_ctx][bit]; + const int offset_bits = av1_eob_offset_bits[eob_pt]; + if (offset_bits > 1) eob_cost += av1_cost_literal(offset_bits - 1); + } + return eob_cost; +} + +static INLINE int get_golomb_cost(int abs_qc) { + if (abs_qc >= 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) { + const int r = abs_qc - COEFF_BASE_RANGE - NUM_BASE_LEVELS; + const int length = get_msb(r) + 1; + return av1_cost_literal(2 * length - 1); + } + return 0; +} + +static INLINE int get_br_cost(tran_low_t level, const int *coeff_lps) { + const int base_range = AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE); + return coeff_lps[base_range] + get_golomb_cost(level); +} + +static INLINE int get_br_cost_with_diff(tran_low_t level, const int *coeff_lps, + int *diff) { + const int base_range = AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE); + int golomb_bits = 0; + if (level <= COEFF_BASE_RANGE + 1 + NUM_BASE_LEVELS) + *diff += coeff_lps[base_range + COEFF_BASE_RANGE + 1]; + + if (level >= COEFF_BASE_RANGE + 1 + NUM_BASE_LEVELS) { + int r = level - COEFF_BASE_RANGE - NUM_BASE_LEVELS; + if (r < 32) { + golomb_bits = golomb_bits_cost[r]; + *diff += golomb_cost_diff[r]; + } else { + golomb_bits = get_golomb_cost(level); + *diff += (r & (r - 1)) == 0 ? 
1024 : 0; + } + } + + return coeff_lps[base_range] + golomb_bits; +} + +static AOM_FORCE_INLINE int get_two_coeff_cost_simple( + int ci, tran_low_t abs_qc, int coeff_ctx, + const LV_MAP_COEFF_COST *txb_costs, int bhl, TX_CLASS tx_class, + const uint8_t *levels, int *cost_low) { + // this simple version assumes the coeff's scan_idx is not DC (scan_idx != 0) + // and not the last (scan_idx != eob - 1) + assert(ci > 0); + int cost = txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)]; + int diff = 0; + if (abs_qc <= 3) diff = txb_costs->base_cost[coeff_ctx][abs_qc + 4]; + if (abs_qc) { + cost += av1_cost_literal(1); + if (abs_qc > NUM_BASE_LEVELS) { + const int br_ctx = get_br_ctx(levels, ci, bhl, tx_class); + int brcost_diff = 0; + cost += get_br_cost_with_diff(abs_qc, txb_costs->lps_cost[br_ctx], + &brcost_diff); + diff += brcost_diff; + } + } + *cost_low = cost - diff; + + return cost; +} + +static INLINE int get_coeff_cost_eob(int ci, tran_low_t abs_qc, int sign, + int coeff_ctx, int dc_sign_ctx, + const LV_MAP_COEFF_COST *txb_costs, + int bhl, TX_CLASS tx_class) { + int cost = 0; + cost += txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1]; + if (abs_qc != 0) { + if (ci == 0) { + cost += txb_costs->dc_sign_cost[dc_sign_ctx][sign]; + } else { + cost += av1_cost_literal(1); + } + if (abs_qc > NUM_BASE_LEVELS) { + int br_ctx; + br_ctx = get_br_ctx_eob(ci, bhl, tx_class); + cost += get_br_cost(abs_qc, txb_costs->lps_cost[br_ctx]); + } + } + return cost; +} + +static INLINE int get_coeff_cost_general(int is_last, int ci, tran_low_t abs_qc, + int sign, int coeff_ctx, + int dc_sign_ctx, + const LV_MAP_COEFF_COST *txb_costs, + int bhl, TX_CLASS tx_class, + const uint8_t *levels) { + int cost = 0; + if (is_last) { + cost += txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1]; + } else { + cost += txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)]; + } + if (abs_qc != 0) { + if (ci == 0) { + cost += txb_costs->dc_sign_cost[dc_sign_ctx][sign]; + } else { + cost += av1_cost_literal(1); + } + if (abs_qc > NUM_BASE_LEVELS) { + int br_ctx; + if (is_last) + br_ctx = get_br_ctx_eob(ci, bhl, tx_class); + else + br_ctx = get_br_ctx(levels, ci, bhl, tx_class); + cost += get_br_cost(abs_qc, txb_costs->lps_cost[br_ctx]); + } + } + return cost; +} + +static INLINE void get_qc_dqc_low(tran_low_t abs_qc, int sign, int dqv, + int shift, tran_low_t *qc_low, + tran_low_t *dqc_low) { + tran_low_t abs_qc_low = abs_qc - 1; + *qc_low = (-sign ^ abs_qc_low) + sign; + assert((sign ? -abs_qc_low : abs_qc_low) == *qc_low); + tran_low_t abs_dqc_low = (abs_qc_low * dqv) >> shift; + *dqc_low = (-sign ^ abs_dqc_low) + sign; + assert((sign ? 
-abs_dqc_low : abs_dqc_low) == *dqc_low);
+}
+
+static INLINE void update_coeff_eob_fast(int *eob, int shift,
+                                         const int16_t *dequant_ptr,
+                                         const int16_t *scan,
+                                         const tran_low_t *coeff_ptr,
+                                         tran_low_t *qcoeff_ptr,
+                                         tran_low_t *dqcoeff_ptr) {
+  // TODO(sarahparker) make this work for aomqm
+  // Note: zbin[] is dequant * (1 + 70/128), i.e. about 1.55x the step size.
+  int eob_out = *eob;
+  int zbin[2] = { dequant_ptr[0] + ROUND_POWER_OF_TWO(dequant_ptr[0] * 70, 7),
+                  dequant_ptr[1] + ROUND_POWER_OF_TWO(dequant_ptr[1] * 70, 7) };
+
+  for (int i = *eob - 1; i >= 0; i--) {
+    const int rc = scan[i];
+    const int qcoeff = qcoeff_ptr[rc];
+    const int coeff = coeff_ptr[rc];
+    const int coeff_sign = AOMSIGN(coeff);
+    int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+    if (((abs_coeff << (1 + shift)) < zbin[rc != 0]) || (qcoeff == 0)) {
+      eob_out--;
+      qcoeff_ptr[rc] = 0;
+      dqcoeff_ptr[rc] = 0;
+    } else {
+      break;
+    }
+  }
+
+  *eob = eob_out;
+}
+#endif  // AOM_AV1_ENCODER_TXB_RDOPT_UTILS_H_
diff --git a/third_party/aom/av1/encoder/var_based_part.c b/third_party/aom/av1/encoder/var_based_part.c
new file mode 100644
index 0000000000..f664795153
--- /dev/null
+++ b/third_party/aom/av1/encoder/var_based_part.c
@@ -0,0 +1,1914 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdbool.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/aom_timer.h"
+
+#include "av1/common/reconinter.h"
+#include "av1/common/blockd.h"
+
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/var_based_part.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/rdopt_utils.h"
+
+// Possible values for the force_split variable while evaluating variance based
+// partitioning.
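+// (Editorial note) set_vt_partitioning() below consumes these values:
+// PART_EVAL_ONLY_NONE accepts the current block size immediately,
+// PART_EVAL_ONLY_SPLIT rejects it, and only PART_EVAL_ALL actually
+// compares the variance tree against the thresholds.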
+enum { + // Evaluate all partition types + PART_EVAL_ALL = 0, + // Force PARTITION_SPLIT + PART_EVAL_ONLY_SPLIT = 1, + // Force PARTITION_NONE + PART_EVAL_ONLY_NONE = 2 +} UENUM1BYTE(PART_EVAL_STATUS); + +typedef struct { + VPVariance *part_variances; + VPartVar *split[4]; +} variance_node; + +static AOM_INLINE void tree_to_node(void *data, BLOCK_SIZE bsize, + variance_node *node) { + node->part_variances = NULL; + switch (bsize) { + case BLOCK_128X128: { + VP128x128 *vt = (VP128x128 *)data; + node->part_variances = &vt->part_variances; + for (int split_idx = 0; split_idx < 4; split_idx++) + node->split[split_idx] = &vt->split[split_idx].part_variances.none; + break; + } + case BLOCK_64X64: { + VP64x64 *vt = (VP64x64 *)data; + node->part_variances = &vt->part_variances; + for (int split_idx = 0; split_idx < 4; split_idx++) + node->split[split_idx] = &vt->split[split_idx].part_variances.none; + break; + } + case BLOCK_32X32: { + VP32x32 *vt = (VP32x32 *)data; + node->part_variances = &vt->part_variances; + for (int split_idx = 0; split_idx < 4; split_idx++) + node->split[split_idx] = &vt->split[split_idx].part_variances.none; + break; + } + case BLOCK_16X16: { + VP16x16 *vt = (VP16x16 *)data; + node->part_variances = &vt->part_variances; + for (int split_idx = 0; split_idx < 4; split_idx++) + node->split[split_idx] = &vt->split[split_idx].part_variances.none; + break; + } + case BLOCK_8X8: { + VP8x8 *vt = (VP8x8 *)data; + node->part_variances = &vt->part_variances; + for (int split_idx = 0; split_idx < 4; split_idx++) + node->split[split_idx] = &vt->split[split_idx].part_variances.none; + break; + } + default: { + VP4x4 *vt = (VP4x4 *)data; + assert(bsize == BLOCK_4X4); + node->part_variances = &vt->part_variances; + for (int split_idx = 0; split_idx < 4; split_idx++) + node->split[split_idx] = &vt->split[split_idx]; + break; + } + } +} + +// Set variance values given sum square error, sum error, count. 
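+// (Editorial note) fill_variance() below only stores the raw accumulators;
+// get_variance() later derives
+//   variance = 256 * (sse - se * se / 2^c) / 2^c,  with c = log2_count,
+// i.e. the population variance of the 2^c samples, scaled by 256.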
+static AOM_INLINE void fill_variance(uint32_t s2, int32_t s, int c, + VPartVar *v) { + v->sum_square_error = s2; + v->sum_error = s; + v->log2_count = c; +} + +static AOM_INLINE void get_variance(VPartVar *v) { + v->variance = + (int)(256 * (v->sum_square_error - + (uint32_t)(((int64_t)v->sum_error * v->sum_error) >> + v->log2_count)) >> + v->log2_count); +} + +static AOM_INLINE void sum_2_variances(const VPartVar *a, const VPartVar *b, + VPartVar *r) { + assert(a->log2_count == b->log2_count); + fill_variance(a->sum_square_error + b->sum_square_error, + a->sum_error + b->sum_error, a->log2_count + 1, r); +} + +static AOM_INLINE void fill_variance_tree(void *data, BLOCK_SIZE bsize) { + variance_node node; + memset(&node, 0, sizeof(node)); + tree_to_node(data, bsize, &node); + sum_2_variances(node.split[0], node.split[1], &node.part_variances->horz[0]); + sum_2_variances(node.split[2], node.split[3], &node.part_variances->horz[1]); + sum_2_variances(node.split[0], node.split[2], &node.part_variances->vert[0]); + sum_2_variances(node.split[1], node.split[3], &node.part_variances->vert[1]); + sum_2_variances(&node.part_variances->vert[0], &node.part_variances->vert[1], + &node.part_variances->none); +} + +static AOM_INLINE void set_block_size(AV1_COMP *const cpi, int mi_row, + int mi_col, BLOCK_SIZE bsize) { + if (cpi->common.mi_params.mi_cols > mi_col && + cpi->common.mi_params.mi_rows > mi_row) { + CommonModeInfoParams *mi_params = &cpi->common.mi_params; + const int mi_grid_idx = get_mi_grid_idx(mi_params, mi_row, mi_col); + const int mi_alloc_idx = get_alloc_mi_idx(mi_params, mi_row, mi_col); + MB_MODE_INFO *mi = mi_params->mi_grid_base[mi_grid_idx] = + &mi_params->mi_alloc[mi_alloc_idx]; + mi->bsize = bsize; + } +} + +static int set_vt_partitioning(AV1_COMP *cpi, MACROBLOCKD *const xd, + const TileInfo *const tile, void *data, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int64_t threshold, BLOCK_SIZE bsize_min, + PART_EVAL_STATUS force_split) { + AV1_COMMON *const cm = &cpi->common; + variance_node vt; + const int block_width = mi_size_wide[bsize]; + const int block_height = mi_size_high[bsize]; + int bs_width_check = block_width; + int bs_height_check = block_height; + int bs_width_vert_check = block_width >> 1; + int bs_height_horiz_check = block_height >> 1; + // On the right and bottom boundary we only need to check + // if half the bsize fits, because boundary is extended + // up to 64. So do this check only for sb_size = 64X64. + if (cm->seq_params->sb_size == BLOCK_64X64) { + if (tile->mi_col_end == cm->mi_params.mi_cols) { + bs_width_check = (block_width >> 1) + 1; + bs_width_vert_check = (block_width >> 2) + 1; + } + if (tile->mi_row_end == cm->mi_params.mi_rows) { + bs_height_check = (block_height >> 1) + 1; + bs_height_horiz_check = (block_height >> 2) + 1; + } + } + + assert(block_height == block_width); + tree_to_node(data, bsize, &vt); + + if (mi_col + bs_width_check <= tile->mi_col_end && + mi_row + bs_height_check <= tile->mi_row_end && + force_split == PART_EVAL_ONLY_NONE) { + set_block_size(cpi, mi_row, mi_col, bsize); + return 1; + } + if (force_split == PART_EVAL_ONLY_SPLIT) return 0; + + // For bsize=bsize_min (16x16/8x8 for 8x8/4x4 downsampling), select if + // variance is below threshold, otherwise split will be selected. + // No check for vert/horiz split as too few samples for variance. + if (bsize == bsize_min) { + // Variance already computed to set the force_split. 
+ if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none); + if (mi_col + bs_width_check <= tile->mi_col_end && + mi_row + bs_height_check <= tile->mi_row_end && + vt.part_variances->none.variance < threshold) { + set_block_size(cpi, mi_row, mi_col, bsize); + return 1; + } + return 0; + } else if (bsize > bsize_min) { + // Variance already computed to set the force_split. + if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none); + // For key frame: take split for bsize above 32X32 or very high variance. + if (frame_is_intra_only(cm) && + (bsize > BLOCK_32X32 || + vt.part_variances->none.variance > (threshold << 4))) { + return 0; + } + // If variance is low, take the bsize (no split). + if (mi_col + bs_width_check <= tile->mi_col_end && + mi_row + bs_height_check <= tile->mi_row_end && + vt.part_variances->none.variance < threshold) { + set_block_size(cpi, mi_row, mi_col, bsize); + return 1; + } + // Check vertical split. + if (mi_row + bs_height_check <= tile->mi_row_end && + mi_col + bs_width_vert_check <= tile->mi_col_end) { + BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_VERT); + BLOCK_SIZE plane_bsize = + get_plane_block_size(subsize, xd->plane[AOM_PLANE_U].subsampling_x, + xd->plane[AOM_PLANE_U].subsampling_y); + get_variance(&vt.part_variances->vert[0]); + get_variance(&vt.part_variances->vert[1]); + if (vt.part_variances->vert[0].variance < threshold && + vt.part_variances->vert[1].variance < threshold && + plane_bsize < BLOCK_INVALID) { + set_block_size(cpi, mi_row, mi_col, subsize); + set_block_size(cpi, mi_row, mi_col + block_width / 2, subsize); + return 1; + } + } + // Check horizontal split. + if (mi_col + bs_width_check <= tile->mi_col_end && + mi_row + bs_height_horiz_check <= tile->mi_row_end) { + BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_HORZ); + BLOCK_SIZE plane_bsize = + get_plane_block_size(subsize, xd->plane[AOM_PLANE_U].subsampling_x, + xd->plane[AOM_PLANE_U].subsampling_y); + get_variance(&vt.part_variances->horz[0]); + get_variance(&vt.part_variances->horz[1]); + if (vt.part_variances->horz[0].variance < threshold && + vt.part_variances->horz[1].variance < threshold && + plane_bsize < BLOCK_INVALID) { + set_block_size(cpi, mi_row, mi_col, subsize); + set_block_size(cpi, mi_row + block_height / 2, mi_col, subsize); + return 1; + } + } + return 0; + } + return 0; +} + +static AOM_INLINE int all_blks_inside(int x16_idx, int y16_idx, int pixels_wide, + int pixels_high) { + int all_inside = 1; + for (int idx = 0; idx < 4; idx++) { + all_inside &= ((x16_idx + GET_BLK_IDX_X(idx, 3)) < pixels_wide); + all_inside &= ((y16_idx + GET_BLK_IDX_Y(idx, 3)) < pixels_high); + } + return all_inside; +} + +#if CONFIG_AV1_HIGHBITDEPTH +// TODO(yunqingwang): Perform average of four 8x8 blocks similar to lowbd +static AOM_INLINE void fill_variance_8x8avg_highbd( + const uint8_t *src_buf, int src_stride, const uint8_t *dst_buf, + int dst_stride, int x16_idx, int y16_idx, VP16x16 *vst, int pixels_wide, + int pixels_high) { + for (int idx = 0; idx < 4; idx++) { + const int x8_idx = x16_idx + GET_BLK_IDX_X(idx, 3); + const int y8_idx = y16_idx + GET_BLK_IDX_Y(idx, 3); + unsigned int sse = 0; + int sum = 0; + if (x8_idx < pixels_wide && y8_idx < pixels_high) { + int src_avg = aom_highbd_avg_8x8(src_buf + y8_idx * src_stride + x8_idx, + src_stride); + int dst_avg = aom_highbd_avg_8x8(dst_buf + y8_idx * dst_stride + x8_idx, + dst_stride); + + sum = src_avg - dst_avg; + sse = sum * sum; + } + fill_variance(sse, sum, 0, 
&vst->split[idx].part_variances.none); + } +} +#endif + +static AOM_INLINE void fill_variance_8x8avg_lowbd( + const uint8_t *src_buf, int src_stride, const uint8_t *dst_buf, + int dst_stride, int x16_idx, int y16_idx, VP16x16 *vst, int pixels_wide, + int pixels_high) { + unsigned int sse[4] = { 0 }; + int sum[4] = { 0 }; + + if (all_blks_inside(x16_idx, y16_idx, pixels_wide, pixels_high)) { + int src_avg[4]; + int dst_avg[4]; + aom_avg_8x8_quad(src_buf, src_stride, x16_idx, y16_idx, src_avg); + aom_avg_8x8_quad(dst_buf, dst_stride, x16_idx, y16_idx, dst_avg); + for (int idx = 0; idx < 4; idx++) { + sum[idx] = src_avg[idx] - dst_avg[idx]; + sse[idx] = sum[idx] * sum[idx]; + } + } else { + for (int idx = 0; idx < 4; idx++) { + const int x8_idx = x16_idx + GET_BLK_IDX_X(idx, 3); + const int y8_idx = y16_idx + GET_BLK_IDX_Y(idx, 3); + if (x8_idx < pixels_wide && y8_idx < pixels_high) { + int src_avg = + aom_avg_8x8(src_buf + y8_idx * src_stride + x8_idx, src_stride); + int dst_avg = + aom_avg_8x8(dst_buf + y8_idx * dst_stride + x8_idx, dst_stride); + sum[idx] = src_avg - dst_avg; + sse[idx] = sum[idx] * sum[idx]; + } + } + } + + for (int idx = 0; idx < 4; idx++) { + fill_variance(sse[idx], sum[idx], 0, &vst->split[idx].part_variances.none); + } +} + +// Obtain parameters required to calculate variance (such as sum, sse, etc,.) +// at 8x8 sub-block level for a given 16x16 block. +// The function can be called only when is_key_frame is false since sum is +// computed between source and reference frames. +static AOM_INLINE void fill_variance_8x8avg( + const uint8_t *src_buf, int src_stride, const uint8_t *dst_buf, + int dst_stride, int x16_idx, int y16_idx, VP16x16 *vst, int highbd_flag, + int pixels_wide, int pixels_high) { +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd_flag) { + fill_variance_8x8avg_highbd(src_buf, src_stride, dst_buf, dst_stride, + x16_idx, y16_idx, vst, pixels_wide, + pixels_high); + return; + } +#else + (void)highbd_flag; +#endif // CONFIG_AV1_HIGHBITDEPTH + fill_variance_8x8avg_lowbd(src_buf, src_stride, dst_buf, dst_stride, x16_idx, + y16_idx, vst, pixels_wide, pixels_high); +} + +static int compute_minmax_8x8(const uint8_t *src_buf, int src_stride, + const uint8_t *dst_buf, int dst_stride, + int x16_idx, int y16_idx, +#if CONFIG_AV1_HIGHBITDEPTH + int highbd_flag, +#endif + int pixels_wide, int pixels_high) { + int minmax_max = 0; + int minmax_min = 255; + // Loop over the 4 8x8 subblocks. + for (int idx = 0; idx < 4; idx++) { + const int x8_idx = x16_idx + GET_BLK_IDX_X(idx, 3); + const int y8_idx = y16_idx + GET_BLK_IDX_Y(idx, 3); + int min = 0; + int max = 0; + if (x8_idx < pixels_wide && y8_idx < pixels_high) { +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) { + aom_highbd_minmax_8x8( + src_buf + y8_idx * src_stride + x8_idx, src_stride, + dst_buf + y8_idx * dst_stride + x8_idx, dst_stride, &min, &max); + } else { + aom_minmax_8x8(src_buf + y8_idx * src_stride + x8_idx, src_stride, + dst_buf + y8_idx * dst_stride + x8_idx, dst_stride, &min, + &max); + } +#else + aom_minmax_8x8(src_buf + y8_idx * src_stride + x8_idx, src_stride, + dst_buf + y8_idx * dst_stride + x8_idx, dst_stride, &min, + &max); +#endif + if ((max - min) > minmax_max) minmax_max = (max - min); + if ((max - min) < minmax_min) minmax_min = (max - min); + } + } + return (minmax_max - minmax_min); +} + +// Function to compute average and variance of 4x4 sub-block. +// The function can be called only when is_key_frame is true since sum is +// computed using source frame only. 
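+// (Editorial note) In the 4x4 path below the "reference" average is the
+// constant dst_avg = 128, so on key frames the per-block sum measures the
+// deviation of the source average from a flat mid-grey predictor rather
+// than from a reference frame.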
+static AOM_INLINE void fill_variance_4x4avg(const uint8_t *src_buf, + int src_stride, int x8_idx, + int y8_idx, VP8x8 *vst, +#if CONFIG_AV1_HIGHBITDEPTH + int highbd_flag, +#endif + int pixels_wide, int pixels_high, + int border_offset_4x4) { + for (int idx = 0; idx < 4; idx++) { + const int x4_idx = x8_idx + GET_BLK_IDX_X(idx, 2); + const int y4_idx = y8_idx + GET_BLK_IDX_Y(idx, 2); + unsigned int sse = 0; + int sum = 0; + if (x4_idx < pixels_wide - border_offset_4x4 && + y4_idx < pixels_high - border_offset_4x4) { + int src_avg; + int dst_avg = 128; +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) { + src_avg = aom_highbd_avg_4x4(src_buf + y4_idx * src_stride + x4_idx, + src_stride); + } else { + src_avg = + aom_avg_4x4(src_buf + y4_idx * src_stride + x4_idx, src_stride); + } +#else + src_avg = aom_avg_4x4(src_buf + y4_idx * src_stride + x4_idx, src_stride); +#endif + + sum = src_avg - dst_avg; + sse = sum * sum; + } + fill_variance(sse, sum, 0, &vst->split[idx].part_variances.none); + } +} + +// TODO(kyslov) Bring back threshold adjustment based on content state +static int64_t scale_part_thresh_content(int64_t threshold_base, int speed, + int width, int height, + int non_reference_frame) { + (void)width; + (void)height; + int64_t threshold = threshold_base; + if (non_reference_frame) threshold = (3 * threshold) >> 1; + if (speed >= 8) { + return (5 * threshold) >> 2; + } + return threshold; +} + +// Tune thresholds less or more aggressively to prefer larger partitions +static AOM_INLINE void tune_thresh_based_on_qindex( + AV1_COMP *cpi, int64_t thresholds[], uint64_t block_sad, int current_qindex, + int num_pixels, bool is_segment_id_boosted, int source_sad_nonrd, + int lighting_change) { + double weight; + if (cpi->sf.rt_sf.prefer_large_partition_blocks >= 3) { + const int win = 20; + if (current_qindex < QINDEX_LARGE_BLOCK_THR - win) + weight = 1.0; + else if (current_qindex > QINDEX_LARGE_BLOCK_THR + win) + weight = 0.0; + else + weight = + 1.0 - (current_qindex - QINDEX_LARGE_BLOCK_THR + win) / (2 * win); + if (num_pixels > RESOLUTION_480P) { + for (int i = 0; i < 4; i++) { + thresholds[i] <<= 1; + } + } + if (num_pixels <= RESOLUTION_288P) { + thresholds[3] = INT64_MAX; + if (is_segment_id_boosted == false) { + thresholds[1] <<= 2; + thresholds[2] <<= (source_sad_nonrd <= kLowSad) ? 5 : 4; + } else { + thresholds[1] <<= 1; + thresholds[2] <<= 3; + } + // Allow for split to 8x8 for superblocks where part of it has + // moving boundary. So allow for sb with source_sad above threshold, + // and avoid very large source_sad or high source content, to avoid + // too many 8x8 within superblock. + uint64_t avg_source_sad_thresh = 25000; + uint64_t block_sad_low = 25000; + uint64_t block_sad_high = 50000; + if (cpi->svc.temporal_layer_id == 0 && + cpi->svc.number_temporal_layers > 1) { + // Increase the sad thresholds for base TL0, as reference/LAST is + // 2/4 frames behind (for 2/3 #TL). + avg_source_sad_thresh = 40000; + block_sad_high = 70000; + } + if (is_segment_id_boosted == false && + cpi->rc.avg_source_sad < avg_source_sad_thresh && + block_sad > block_sad_low && block_sad < block_sad_high && + !lighting_change) { + thresholds[2] = (3 * thresholds[2]) >> 2; + thresholds[3] = thresholds[2] << 3; + } + // Condition the increase of partition thresholds on the segment + // and the content. 
Avoid the increase for superblocks which have + // high source sad, unless the whole frame has very high motion + // (i.e, cpi->rc.avg_source_sad is very large, in which case all blocks + // have high source sad). + } else if (num_pixels > RESOLUTION_480P && is_segment_id_boosted == false && + (source_sad_nonrd != kHighSad || + cpi->rc.avg_source_sad > 50000)) { + thresholds[0] = (3 * thresholds[0]) >> 1; + thresholds[3] = INT64_MAX; + if (current_qindex > QINDEX_LARGE_BLOCK_THR) { + thresholds[1] = + (int)((1 - weight) * (thresholds[1] << 1) + weight * thresholds[1]); + thresholds[2] = + (int)((1 - weight) * (thresholds[2] << 1) + weight * thresholds[2]); + } + } else if (current_qindex > QINDEX_LARGE_BLOCK_THR && + is_segment_id_boosted == false && + (source_sad_nonrd != kHighSad || + cpi->rc.avg_source_sad > 50000)) { + thresholds[1] = + (int)((1 - weight) * (thresholds[1] << 2) + weight * thresholds[1]); + thresholds[2] = + (int)((1 - weight) * (thresholds[2] << 4) + weight * thresholds[2]); + thresholds[3] = INT64_MAX; + } + } else if (cpi->sf.rt_sf.prefer_large_partition_blocks >= 2) { + thresholds[1] <<= (source_sad_nonrd <= kLowSad) ? 2 : 0; + thresholds[2] = + (source_sad_nonrd <= kLowSad) ? (3 * thresholds[2]) : thresholds[2]; + } else if (cpi->sf.rt_sf.prefer_large_partition_blocks >= 1) { + const int fac = (source_sad_nonrd <= kLowSad) ? 2 : 1; + if (current_qindex < QINDEX_LARGE_BLOCK_THR - 45) + weight = 1.0; + else if (current_qindex > QINDEX_LARGE_BLOCK_THR + 45) + weight = 0.0; + else + weight = 1.0 - (current_qindex - QINDEX_LARGE_BLOCK_THR + 45) / (2 * 45); + thresholds[1] = + (int)((1 - weight) * (thresholds[1] << 1) + weight * thresholds[1]); + thresholds[2] = + (int)((1 - weight) * (thresholds[2] << 1) + weight * thresholds[2]); + thresholds[3] = + (int)((1 - weight) * (thresholds[3] << fac) + weight * thresholds[3]); + } + if (cpi->sf.part_sf.disable_8x8_part_based_on_qidx && (current_qindex < 128)) + thresholds[3] = INT64_MAX; +} + +static void set_vbp_thresholds_key_frame(AV1_COMP *cpi, int64_t thresholds[], + int64_t threshold_base, + int threshold_left_shift, + int num_pixels) { + if (cpi->sf.rt_sf.force_large_partition_blocks_intra) { + const int shift_steps = + threshold_left_shift - (cpi->oxcf.mode == ALLINTRA ? 7 : 8); + assert(shift_steps >= 0); + threshold_base <<= shift_steps; + } + thresholds[0] = threshold_base; + thresholds[1] = threshold_base; + if (num_pixels < RESOLUTION_720P) { + thresholds[2] = threshold_base / 3; + thresholds[3] = threshold_base >> 1; + } else { + int shift_val = 2; + if (cpi->sf.rt_sf.force_large_partition_blocks_intra) { + shift_val = 0; + } + + thresholds[2] = threshold_base >> shift_val; + thresholds[3] = threshold_base >> shift_val; + } + thresholds[4] = threshold_base << 2; +} + +static AOM_INLINE void tune_thresh_based_on_resolution( + AV1_COMP *cpi, int64_t thresholds[], int64_t threshold_base, + int current_qindex, int source_sad_rd, int num_pixels) { + if (num_pixels >= RESOLUTION_720P) thresholds[3] = thresholds[3] << 1; + if (num_pixels <= RESOLUTION_288P) { + const int qindex_thr[5][2] = { + { 200, 220 }, { 140, 170 }, { 120, 150 }, { 200, 210 }, { 170, 220 }, + }; + int th_idx = 0; + if (cpi->sf.rt_sf.var_part_based_on_qidx >= 1) + th_idx = + (source_sad_rd <= kLowSad) ? 
cpi->sf.rt_sf.var_part_based_on_qidx : 0;
+    if (cpi->sf.rt_sf.var_part_based_on_qidx >= 3)
+      th_idx = cpi->sf.rt_sf.var_part_based_on_qidx;
+    const int qindex_low_thr = qindex_thr[th_idx][0];
+    const int qindex_high_thr = qindex_thr[th_idx][1];
+    if (current_qindex >= qindex_high_thr) {
+      threshold_base = (5 * threshold_base) >> 1;
+      thresholds[1] = threshold_base >> 3;
+      thresholds[2] = threshold_base << 2;
+      thresholds[3] = threshold_base << 5;
+    } else if (current_qindex < qindex_low_thr) {
+      thresholds[1] = threshold_base >> 3;
+      thresholds[2] = threshold_base >> 1;
+      thresholds[3] = threshold_base << 3;
+    } else {
+      int64_t qi_diff_low = current_qindex - qindex_low_thr;
+      int64_t qi_diff_high = qindex_high_thr - current_qindex;
+      int64_t threshold_diff = qindex_high_thr - qindex_low_thr;
+      int64_t threshold_base_high = (5 * threshold_base) >> 1;
+
+      threshold_diff = threshold_diff > 0 ? threshold_diff : 1;
+      threshold_base =
+          (qi_diff_low * threshold_base_high + qi_diff_high * threshold_base) /
+          threshold_diff;
+      thresholds[1] = threshold_base >> 3;
+      thresholds[2] = ((qi_diff_low * threshold_base) +
+                       qi_diff_high * (threshold_base >> 1)) /
+                      threshold_diff;
+      thresholds[3] = ((qi_diff_low * (threshold_base << 5)) +
+                       qi_diff_high * (threshold_base << 3)) /
+                      threshold_diff;
+    }
+  } else if (num_pixels < RESOLUTION_720P) {
+    thresholds[2] = (5 * threshold_base) >> 2;
+  } else if (num_pixels < RESOLUTION_1080P) {
+    thresholds[2] = threshold_base << 1;
+  } else {
+    // num_pixels >= RESOLUTION_1080P
+    if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
+      if (num_pixels < RESOLUTION_1440P) {
+        thresholds[2] = (5 * threshold_base) >> 1;
+      } else {
+        thresholds[2] = (7 * threshold_base) >> 1;
+      }
+    } else {
+      if (cpi->oxcf.speed > 7) {
+        thresholds[2] = 6 * threshold_base;
+      } else {
+        thresholds[2] = 3 * threshold_base;
+      }
+    }
+  }
+}
+
+// Increase partition thresholds for noisy content. Apply it only for
+// superblocks where sumdiff is low, as we assume the sumdiff of a superblock
+// whose only change is due to noise will be low (i.e., noise will average
+// out over a large block).
+static AOM_INLINE int64_t tune_thresh_noisy_content(AV1_COMP *cpi,
+                                                    int64_t threshold_base,
+                                                    int content_lowsumdiff,
+                                                    int num_pixels) {
+  AV1_COMMON *const cm = &cpi->common;
+  int64_t updated_thresh_base = threshold_base;
+  if (cpi->noise_estimate.enabled && content_lowsumdiff &&
+      num_pixels > RESOLUTION_480P && cm->current_frame.frame_number > 60) {
+    NOISE_LEVEL noise_level =
+        av1_noise_estimate_extract_level(&cpi->noise_estimate);
+    if (noise_level == kHigh)
+      updated_thresh_base = (5 * updated_thresh_base) >> 1;
+    else if (noise_level == kMedium &&
+             !cpi->sf.rt_sf.prefer_large_partition_blocks)
+      updated_thresh_base = (5 * updated_thresh_base) >> 2;
+  }
+  // TODO(kyslov) Enable var based partition adjustment on temporal denoising
+#if 0  // CONFIG_AV1_TEMPORAL_DENOISING
+  if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
+      cpi->oxcf.speed > 5 && cpi->denoiser.denoising_level >= kDenLow)
+    updated_thresh_base =
+        av1_scale_part_thresh(updated_thresh_base, cpi->denoiser.denoising_level,
+                              content_state, cpi->svc.temporal_layer_id);
+  else
+    threshold_base =
+        scale_part_thresh_content(updated_thresh_base, cpi->oxcf.speed, cm->width,
+                                  cm->height, cpi->ppi->rtc_ref.non_reference_frame);
+#else
+  // Increase base variance threshold based on content_state/sum_diff level.
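+  // (Editorial note) The noise-based scaling above is 5/2 for kHigh and 5/4
+  // for kMedium; with the #if 0 denoiser branch disabled, only this
+  // content-based scaling is applied.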
+ updated_thresh_base = scale_part_thresh_content( + updated_thresh_base, cpi->oxcf.speed, cm->width, cm->height, + cpi->ppi->rtc_ref.non_reference_frame); +#endif + return updated_thresh_base; +} + +static AOM_INLINE void set_vbp_thresholds( + AV1_COMP *cpi, int64_t thresholds[], uint64_t blk_sad, int qindex, + int content_lowsumdiff, int source_sad_nonrd, int source_sad_rd, + bool is_segment_id_boosted, int lighting_change) { + AV1_COMMON *const cm = &cpi->common; + const int is_key_frame = frame_is_intra_only(cm); + const int threshold_multiplier = is_key_frame ? 120 : 1; + const int ac_q = av1_ac_quant_QTX(qindex, 0, cm->seq_params->bit_depth); + int64_t threshold_base = (int64_t)(threshold_multiplier * ac_q); + const int current_qindex = cm->quant_params.base_qindex; + const int threshold_left_shift = cpi->sf.rt_sf.var_part_split_threshold_shift; + const int num_pixels = cm->width * cm->height; + + if (is_key_frame) { + set_vbp_thresholds_key_frame(cpi, thresholds, threshold_base, + threshold_left_shift, num_pixels); + return; + } + + threshold_base = tune_thresh_noisy_content(cpi, threshold_base, + content_lowsumdiff, num_pixels); + thresholds[0] = threshold_base >> 1; + thresholds[1] = threshold_base; + thresholds[3] = threshold_base << threshold_left_shift; + + tune_thresh_based_on_resolution(cpi, thresholds, threshold_base, + current_qindex, source_sad_rd, num_pixels); + + tune_thresh_based_on_qindex(cpi, thresholds, blk_sad, current_qindex, + num_pixels, is_segment_id_boosted, + source_sad_nonrd, lighting_change); +} + +// Set temporal variance low flag for superblock 64x64. +// Only first 25 in the array are used in this case. +static AOM_INLINE void set_low_temp_var_flag_64x64( + CommonModeInfoParams *mi_params, PartitionSearchInfo *part_info, + MACROBLOCKD *xd, VP64x64 *vt, const int64_t thresholds[], int mi_col, + int mi_row) { + if (xd->mi[0]->bsize == BLOCK_64X64) { + if ((vt->part_variances).none.variance < (thresholds[0] >> 1)) + part_info->variance_low[0] = 1; + } else if (xd->mi[0]->bsize == BLOCK_64X32) { + for (int part_idx = 0; part_idx < 2; part_idx++) { + if (vt->part_variances.horz[part_idx].variance < (thresholds[0] >> 2)) + part_info->variance_low[part_idx + 1] = 1; + } + } else if (xd->mi[0]->bsize == BLOCK_32X64) { + for (int part_idx = 0; part_idx < 2; part_idx++) { + if (vt->part_variances.vert[part_idx].variance < (thresholds[0] >> 2)) + part_info->variance_low[part_idx + 3] = 1; + } + } else { + static const int idx[4][2] = { { 0, 0 }, { 0, 8 }, { 8, 0 }, { 8, 8 } }; + for (int lvl1_idx = 0; lvl1_idx < 4; lvl1_idx++) { + const int idx_str = mi_params->mi_stride * (mi_row + idx[lvl1_idx][0]) + + mi_col + idx[lvl1_idx][1]; + MB_MODE_INFO **this_mi = mi_params->mi_grid_base + idx_str; + + if (mi_params->mi_cols <= mi_col + idx[lvl1_idx][1] || + mi_params->mi_rows <= mi_row + idx[lvl1_idx][0]) + continue; + + if (*this_mi == NULL) continue; + + if ((*this_mi)->bsize == BLOCK_32X32) { + int64_t threshold_32x32 = (5 * thresholds[1]) >> 3; + if (vt->split[lvl1_idx].part_variances.none.variance < threshold_32x32) + part_info->variance_low[lvl1_idx + 5] = 1; + } else { + // For 32x16 and 16x32 blocks, the flag is set on each 16x16 block + // inside. 
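+        // (Editorial note) thresholds[2] >> 8 makes this 16x16 check far
+        // stricter than the (5 * thresholds[1]) >> 3 bound used for 32x32
+        // above.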
+ if ((*this_mi)->bsize == BLOCK_16X16 || + (*this_mi)->bsize == BLOCK_32X16 || + (*this_mi)->bsize == BLOCK_16X32) { + for (int lvl2_idx = 0; lvl2_idx < 4; lvl2_idx++) { + if (vt->split[lvl1_idx] + .split[lvl2_idx] + .part_variances.none.variance < (thresholds[2] >> 8)) + part_info->variance_low[(lvl1_idx << 2) + lvl2_idx + 9] = 1; + } + } + } + } + } +} + +static AOM_INLINE void set_low_temp_var_flag_128x128( + CommonModeInfoParams *mi_params, PartitionSearchInfo *part_info, + MACROBLOCKD *xd, VP128x128 *vt, const int64_t thresholds[], int mi_col, + int mi_row) { + if (xd->mi[0]->bsize == BLOCK_128X128) { + if (vt->part_variances.none.variance < (thresholds[0] >> 1)) + part_info->variance_low[0] = 1; + } else if (xd->mi[0]->bsize == BLOCK_128X64) { + for (int part_idx = 0; part_idx < 2; part_idx++) { + if (vt->part_variances.horz[part_idx].variance < (thresholds[0] >> 2)) + part_info->variance_low[part_idx + 1] = 1; + } + } else if (xd->mi[0]->bsize == BLOCK_64X128) { + for (int part_idx = 0; part_idx < 2; part_idx++) { + if (vt->part_variances.vert[part_idx].variance < (thresholds[0] >> 2)) + part_info->variance_low[part_idx + 3] = 1; + } + } else { + static const int idx64[4][2] = { + { 0, 0 }, { 0, 16 }, { 16, 0 }, { 16, 16 } + }; + static const int idx32[4][2] = { { 0, 0 }, { 0, 8 }, { 8, 0 }, { 8, 8 } }; + for (int lvl1_idx = 0; lvl1_idx < 4; lvl1_idx++) { + const int idx_str = mi_params->mi_stride * (mi_row + idx64[lvl1_idx][0]) + + mi_col + idx64[lvl1_idx][1]; + MB_MODE_INFO **mi_64 = mi_params->mi_grid_base + idx_str; + if (*mi_64 == NULL) continue; + if (mi_params->mi_cols <= mi_col + idx64[lvl1_idx][1] || + mi_params->mi_rows <= mi_row + idx64[lvl1_idx][0]) + continue; + const int64_t threshold_64x64 = (5 * thresholds[1]) >> 3; + if ((*mi_64)->bsize == BLOCK_64X64) { + if (vt->split[lvl1_idx].part_variances.none.variance < threshold_64x64) + part_info->variance_low[5 + lvl1_idx] = 1; + } else if ((*mi_64)->bsize == BLOCK_64X32) { + for (int part_idx = 0; part_idx < 2; part_idx++) + if (vt->split[lvl1_idx].part_variances.horz[part_idx].variance < + (threshold_64x64 >> 1)) + part_info->variance_low[9 + (lvl1_idx << 1) + part_idx] = 1; + } else if ((*mi_64)->bsize == BLOCK_32X64) { + for (int part_idx = 0; part_idx < 2; part_idx++) + if (vt->split[lvl1_idx].part_variances.vert[part_idx].variance < + (threshold_64x64 >> 1)) + part_info->variance_low[17 + (lvl1_idx << 1) + part_idx] = 1; + } else { + for (int lvl2_idx = 0; lvl2_idx < 4; lvl2_idx++) { + const int idx_str1 = + mi_params->mi_stride * idx32[lvl2_idx][0] + idx32[lvl2_idx][1]; + MB_MODE_INFO **mi_32 = mi_params->mi_grid_base + idx_str + idx_str1; + if (*mi_32 == NULL) continue; + + if (mi_params->mi_cols <= + mi_col + idx64[lvl1_idx][1] + idx32[lvl2_idx][1] || + mi_params->mi_rows <= + mi_row + idx64[lvl1_idx][0] + idx32[lvl2_idx][0]) + continue; + const int64_t threshold_32x32 = (5 * thresholds[2]) >> 3; + if ((*mi_32)->bsize == BLOCK_32X32) { + if (vt->split[lvl1_idx] + .split[lvl2_idx] + .part_variances.none.variance < threshold_32x32) + part_info->variance_low[25 + (lvl1_idx << 2) + lvl2_idx] = 1; + } else { + // For 32x16 and 16x32 blocks, the flag is set on each 16x16 block + // inside. 
+            if ((*mi_32)->bsize == BLOCK_16X16 ||
+                (*mi_32)->bsize == BLOCK_32X16 ||
+                (*mi_32)->bsize == BLOCK_16X32) {
+              for (int lvl3_idx = 0; lvl3_idx < 4; lvl3_idx++) {
+                VPartVar *none_var = &vt->split[lvl1_idx]
+                                          .split[lvl2_idx]
+                                          .split[lvl3_idx]
+                                          .part_variances.none;
+                if (none_var->variance < (thresholds[3] >> 8))
+                  part_info->variance_low[41 + (lvl1_idx << 4) +
+                                          (lvl2_idx << 2) + lvl3_idx] = 1;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+static AOM_INLINE void set_low_temp_var_flag(
+    AV1_COMP *cpi, PartitionSearchInfo *part_info, MACROBLOCKD *xd,
+    VP128x128 *vt, int64_t thresholds[], MV_REFERENCE_FRAME ref_frame_partition,
+    int mi_col, int mi_row, const bool is_small_sb) {
+  AV1_COMMON *const cm = &cpi->common;
+  // Check temporal variance for bsize >= 16x16, if LAST_FRAME was selected.
+  // If the temporal variance is small, set the flag variance_low for the
+  // block. The variance threshold can be adjusted; the higher it is, the
+  // more aggressive the skipping.
+  if (ref_frame_partition == LAST_FRAME) {
+    if (is_small_sb)
+      set_low_temp_var_flag_64x64(&cm->mi_params, part_info, xd,
+                                  &(vt->split[0]), thresholds, mi_col, mi_row);
+    else
+      set_low_temp_var_flag_128x128(&cm->mi_params, part_info, xd, vt,
+                                    thresholds, mi_col, mi_row);
+  }
+}
+
+static const int pos_shift_16x16[4][4] = {
+  { 9, 10, 13, 14 }, { 11, 12, 15, 16 }, { 17, 18, 21, 22 }, { 19, 20, 23, 24 }
+};
+
+int av1_get_force_skip_low_temp_var_small_sb(const uint8_t *variance_low,
+                                             int mi_row, int mi_col,
+                                             BLOCK_SIZE bsize) {
+  // Relative indices of MB inside the superblock.
+  const int mi_x = mi_row & 0xF;
+  const int mi_y = mi_col & 0xF;
+  // Relative indices of 16x16 block inside the superblock.
+  const int i = mi_x >> 2;
+  const int j = mi_y >> 2;
+  int force_skip_low_temp_var = 0;
+  // Set force_skip_low_temp_var based on the block size and block offset.
+  switch (bsize) {
+    case BLOCK_64X64: force_skip_low_temp_var = variance_low[0]; break;
+    case BLOCK_64X32:
+      if (!mi_y && !mi_x) {
+        force_skip_low_temp_var = variance_low[1];
+      } else if (!mi_y && mi_x) {
+        force_skip_low_temp_var = variance_low[2];
+      }
+      break;
+    case BLOCK_32X64:
+      if (!mi_y && !mi_x) {
+        force_skip_low_temp_var = variance_low[3];
+      } else if (mi_y && !mi_x) {
+        force_skip_low_temp_var = variance_low[4];
+      }
+      break;
+    case BLOCK_32X32:
+      if (!mi_y && !mi_x) {
+        force_skip_low_temp_var = variance_low[5];
+      } else if (mi_y && !mi_x) {
+        force_skip_low_temp_var = variance_low[6];
+      } else if (!mi_y && mi_x) {
+        force_skip_low_temp_var = variance_low[7];
+      } else if (mi_y && mi_x) {
+        force_skip_low_temp_var = variance_low[8];
+      }
+      break;
+    case BLOCK_32X16:
+    case BLOCK_16X32:
+    case BLOCK_16X16:
+      force_skip_low_temp_var = variance_low[pos_shift_16x16[i][j]];
+      break;
+    default: break;
+  }
+
+  return force_skip_low_temp_var;
+}
+
+int av1_get_force_skip_low_temp_var(const uint8_t *variance_low, int mi_row,
+                                    int mi_col, BLOCK_SIZE bsize) {
+  int force_skip_low_temp_var = 0;
+  int x, y;
+  x = (mi_col & 0x1F) >> 4;
+  // y = (mi_row & 0x1F) >> 4;
+  // const int idx64 = (y << 1) + x;
+  y = (mi_row & 0x17) >> 3;
+  const int idx64 = y + x;
+
+  x = (mi_col & 0xF) >> 3;
+  // y = (mi_row & 0xF) >> 3;
+  // const int idx32 = (y << 1) + x;
+  y = (mi_row & 0xB) >> 2;
+  const int idx32 = y + x;
+
+  x = (mi_col & 0x7) >> 2;
+  // y = (mi_row & 0x7) >> 2;
+  // const int idx16 = (y << 1) + x;
+  y = (mi_row & 0x5) >> 1;
+  const int idx16 = y + x;
+  // Set force_skip_low_temp_var based on the block size and block offset.
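+  // The masks used above (0x17, 0xB, 0x5) fold the usual (y << 1) + x
+  // quadrant index into a single add: they pre-shift the row bit so that y
+  // already carries the factor of 2 (the commented-out lines show the
+  // equivalent two-step form). E.g., for the bottom-right 64x64 block of a
+  // 128x128 superblock, (mi_row & 0x1F) == (mi_col & 0x1F) == 16, so x = 1,
+  // y = (16 & 0x17) >> 3 = 2, and idx64 = y + x = 3.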
+  switch (bsize) {
+    case BLOCK_128X128: force_skip_low_temp_var = variance_low[0]; break;
+    case BLOCK_128X64:
+      assert((mi_col & 0x1F) == 0);
+      force_skip_low_temp_var = variance_low[1 + ((mi_row & 0x1F) != 0)];
+      break;
+    case BLOCK_64X128:
+      assert((mi_row & 0x1F) == 0);
+      force_skip_low_temp_var = variance_low[3 + ((mi_col & 0x1F) != 0)];
+      break;
+    case BLOCK_64X64:
+      // Location of this 64x64 block inside the 128x128 superblock
+      force_skip_low_temp_var = variance_low[5 + idx64];
+      break;
+    case BLOCK_64X32:
+      x = (mi_col & 0x1F) >> 4;
+      y = (mi_row & 0x1F) >> 3;
+      /*
+      .---------------.---------------.
+      | x=0,y=0,idx=0 | x=1,y=0,idx=2 |
+      :---------------+---------------:
+      | x=0,y=1,idx=1 | x=1,y=1,idx=3 |
+      :---------------+---------------:
+      | x=0,y=2,idx=4 | x=1,y=2,idx=6 |
+      :---------------+---------------:
+      | x=0,y=3,idx=5 | x=1,y=3,idx=7 |
+      '---------------'---------------'
+      */
+      const int idx64x32 = (x << 1) + (y % 2) + ((y >> 1) << 2);
+      force_skip_low_temp_var = variance_low[9 + idx64x32];
+      break;
+    case BLOCK_32X64:
+      x = (mi_col & 0x1F) >> 3;
+      y = (mi_row & 0x1F) >> 4;
+      const int idx32x64 = (y << 2) + x;
+      force_skip_low_temp_var = variance_low[17 + idx32x64];
+      break;
+    case BLOCK_32X32:
+      force_skip_low_temp_var = variance_low[25 + (idx64 << 2) + idx32];
+      break;
+    case BLOCK_32X16:
+    case BLOCK_16X32:
+    case BLOCK_16X16:
+      force_skip_low_temp_var =
+          variance_low[41 + (idx64 << 4) + (idx32 << 2) + idx16];
+      break;
+    default: break;
+  }
+  return force_skip_low_temp_var;
+}
+
+void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int qindex,
+                                           int content_lowsumdiff) {
+  SPEED_FEATURES *const sf = &cpi->sf;
+  if (sf->part_sf.partition_search_type != VAR_BASED_PARTITION) {
+    return;
+  } else {
+    set_vbp_thresholds(cpi, cpi->vbp_info.thresholds, 0, qindex,
+                       content_lowsumdiff, 0, 0, 0, 0);
+    // The threshold below is not changed locally.
+    cpi->vbp_info.threshold_minmax = 15 + (qindex >> 3);
+  }
+}
+
+static AOM_INLINE void chroma_check(AV1_COMP *cpi, MACROBLOCK *x,
+                                    BLOCK_SIZE bsize, unsigned int y_sad,
+                                    unsigned int y_sad_g,
+                                    unsigned int y_sad_alt, bool is_key_frame,
+                                    bool zero_motion, unsigned int *uv_sad) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const int source_sad_nonrd = x->content_state_sb.source_sad_nonrd;
+  int shift_upper_limit = 1;
+  int shift_lower_limit = 3;
+  int fac_uv = 6;
+  if (is_key_frame || cpi->oxcf.tool_cfg.enable_monochrome) return;
+
+  // Use a lower threshold (more conservative in setting the color flag) for
+  // higher-resolution non-screen content, which tends to have more camera
+  // noise. Since this may be used to skip compound mode in nonrd pickmode,
+  // which is generally more effective for higher resolutions, it is better
+  // to be more conservative.
+  if (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN) {
+    if (cpi->common.width * cpi->common.height >= RESOLUTION_1080P)
+      fac_uv = 3;
+    else
+      fac_uv = 5;
+  }
+  if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+      cpi->rc.high_source_sad) {
+    shift_lower_limit = 7;
+  } else if (source_sad_nonrd >= kMedSad && x->source_variance > 500 &&
+             cpi->common.width * cpi->common.height >= 640 * 360) {
+    shift_upper_limit = 2;
+    shift_lower_limit = source_sad_nonrd > kMedSad ?
5 : 4; + } + + MB_MODE_INFO *mi = xd->mi[0]; + const AV1_COMMON *const cm = &cpi->common; + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME); + const YV12_BUFFER_CONFIG *yv12_g = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME); + const YV12_BUFFER_CONFIG *yv12_alt = get_ref_frame_yv12_buf(cm, ALTREF_FRAME); + const struct scale_factors *const sf = + get_ref_scale_factors_const(cm, LAST_FRAME); + struct buf_2d dst; + unsigned int uv_sad_g = 0; + unsigned int uv_sad_alt = 0; + + for (int plane = AOM_PLANE_U; plane < MAX_MB_PLANE; ++plane) { + struct macroblock_plane *p = &x->plane[plane]; + struct macroblockd_plane *pd = &xd->plane[plane]; + const BLOCK_SIZE bs = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + + if (bs != BLOCK_INVALID) { + // For last: + if (zero_motion) { + if (mi->ref_frame[0] == LAST_FRAME) { + uv_sad[plane - 1] = cpi->ppi->fn_ptr[bs].sdf( + p->src.buf, p->src.stride, pd->pre[0].buf, pd->pre[0].stride); + } else { + uint8_t *src = (plane == 1) ? yv12->u_buffer : yv12->v_buffer; + setup_pred_plane(&dst, xd->mi[0]->bsize, src, yv12->uv_crop_width, + yv12->uv_crop_height, yv12->uv_stride, xd->mi_row, + xd->mi_col, sf, xd->plane[plane].subsampling_x, + xd->plane[plane].subsampling_y); + + uv_sad[plane - 1] = cpi->ppi->fn_ptr[bs].sdf( + p->src.buf, p->src.stride, dst.buf, dst.stride); + } + } else { + uv_sad[plane - 1] = cpi->ppi->fn_ptr[bs].sdf( + p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride); + } + + // For golden: + if (y_sad_g != UINT_MAX) { + uint8_t *src = (plane == 1) ? yv12_g->u_buffer : yv12_g->v_buffer; + setup_pred_plane(&dst, xd->mi[0]->bsize, src, yv12_g->uv_crop_width, + yv12_g->uv_crop_height, yv12_g->uv_stride, xd->mi_row, + xd->mi_col, sf, xd->plane[plane].subsampling_x, + xd->plane[plane].subsampling_y); + uv_sad_g = cpi->ppi->fn_ptr[bs].sdf(p->src.buf, p->src.stride, dst.buf, + dst.stride); + } + + // For altref: + if (y_sad_alt != UINT_MAX) { + uint8_t *src = (plane == 1) ? yv12_alt->u_buffer : yv12_alt->v_buffer; + setup_pred_plane(&dst, xd->mi[0]->bsize, src, yv12_alt->uv_crop_width, + yv12_alt->uv_crop_height, yv12_alt->uv_stride, + xd->mi_row, xd->mi_col, sf, + xd->plane[plane].subsampling_x, + xd->plane[plane].subsampling_y); + uv_sad_alt = cpi->ppi->fn_ptr[bs].sdf(p->src.buf, p->src.stride, + dst.buf, dst.stride); + } + } + + if (uv_sad[plane - 1] > (y_sad >> shift_upper_limit)) + x->color_sensitivity_sb[COLOR_SENS_IDX(plane)] = 1; + else if (uv_sad[plane - 1] < (y_sad >> shift_lower_limit)) + x->color_sensitivity_sb[COLOR_SENS_IDX(plane)] = 0; + // Borderline case: to be refined at coding block level in nonrd_pickmode, + // for coding block size < sb_size. + else + x->color_sensitivity_sb[COLOR_SENS_IDX(plane)] = 2; + + x->color_sensitivity_sb_g[COLOR_SENS_IDX(plane)] = + uv_sad_g > y_sad_g / fac_uv; + x->color_sensitivity_sb_alt[COLOR_SENS_IDX(plane)] = + uv_sad_alt > y_sad_alt / fac_uv; + } +} + +static void fill_variance_tree_leaves( + AV1_COMP *cpi, MACROBLOCK *x, VP128x128 *vt, PART_EVAL_STATUS *force_split, + int avg_16x16[][4], int maxvar_16x16[][4], int minvar_16x16[][4], + int64_t *thresholds, const uint8_t *src_buf, int src_stride, + const uint8_t *dst_buf, int dst_stride, bool is_key_frame, + const bool is_small_sb) { + MACROBLOCKD *xd = &x->e_mbd; + const int num_64x64_blocks = is_small_sb ? 
1 : 4;
+  // TODO(kyslov) Bring back compute_minmax_variance with content type detection
+  const int compute_minmax_variance = 0;
+  const int segment_id = xd->mi[0]->segment_id;
+  int pixels_wide = 128, pixels_high = 128;
+  int border_offset_4x4 = 0;
+  int temporal_denoising = cpi->sf.rt_sf.use_rtc_tf;
+  // dst_buf pointer is not used for is_key_frame, so it should be NULL.
+  assert(IMPLIES(is_key_frame, dst_buf == NULL));
+  if (is_small_sb) {
+    pixels_wide = 64;
+    pixels_high = 64;
+  }
+  if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3);
+  if (xd->mb_to_bottom_edge < 0) pixels_high += (xd->mb_to_bottom_edge >> 3);
+#if CONFIG_AV1_TEMPORAL_DENOISING
+  temporal_denoising |= cpi->oxcf.noise_sensitivity;
+#endif
+  // With temporal filtering or the temporal denoiser enabled: since the
+  // source is modified, we need to avoid the 4x4 avg along the superblock
+  // boundary, since the SIMD code loads 8 pixels for the 4x4 avg and so can
+  // access source data outside the superblock (while it's being modified by
+  // the temporal filter). Temporal filtering is never done on key frames.
+  if (!is_key_frame && temporal_denoising) border_offset_4x4 = 4;
+  for (int blk64_idx = 0; blk64_idx < num_64x64_blocks; blk64_idx++) {
+    const int x64_idx = GET_BLK_IDX_X(blk64_idx, 6);
+    const int y64_idx = GET_BLK_IDX_Y(blk64_idx, 6);
+    const int blk64_scale_idx = blk64_idx << 2;
+    force_split[blk64_idx + 1] = PART_EVAL_ALL;
+
+    for (int lvl1_idx = 0; lvl1_idx < 4; lvl1_idx++) {
+      const int x32_idx = x64_idx + GET_BLK_IDX_X(lvl1_idx, 5);
+      const int y32_idx = y64_idx + GET_BLK_IDX_Y(lvl1_idx, 5);
+      const int lvl1_scale_idx = (blk64_scale_idx + lvl1_idx) << 2;
+      force_split[5 + blk64_scale_idx + lvl1_idx] = PART_EVAL_ALL;
+      avg_16x16[blk64_idx][lvl1_idx] = 0;
+      maxvar_16x16[blk64_idx][lvl1_idx] = 0;
+      minvar_16x16[blk64_idx][lvl1_idx] = INT_MAX;
+      for (int lvl2_idx = 0; lvl2_idx < 4; lvl2_idx++) {
+        const int x16_idx = x32_idx + GET_BLK_IDX_X(lvl2_idx, 4);
+        const int y16_idx = y32_idx + GET_BLK_IDX_Y(lvl2_idx, 4);
+        const int split_index = 21 + lvl1_scale_idx + lvl2_idx;
+        VP16x16 *vst = &vt->split[blk64_idx].split[lvl1_idx].split[lvl2_idx];
+        force_split[split_index] = PART_EVAL_ALL;
+        if (is_key_frame) {
+          // Go down to 4x4 down-sampling for variance.
+          for (int lvl3_idx = 0; lvl3_idx < 4; lvl3_idx++) {
+            const int x8_idx = x16_idx + GET_BLK_IDX_X(lvl3_idx, 3);
+            const int y8_idx = y16_idx + GET_BLK_IDX_Y(lvl3_idx, 3);
+            VP8x8 *vst2 = &vst->split[lvl3_idx];
+            fill_variance_4x4avg(src_buf, src_stride, x8_idx, y8_idx, vst2,
+#if CONFIG_AV1_HIGHBITDEPTH
+                                 xd->cur_buf->flags,
+#endif
+                                 pixels_wide, pixels_high, border_offset_4x4);
+          }
+        } else {
+          fill_variance_8x8avg(src_buf, src_stride, dst_buf, dst_stride,
+                               x16_idx, y16_idx, vst, is_cur_buf_hbd(xd),
+                               pixels_wide, pixels_high);
+
+          fill_variance_tree(vst, BLOCK_16X16);
+          VPartVar *none_var = &vt->split[blk64_idx]
+                                    .split[lvl1_idx]
+                                    .split[lvl2_idx]
+                                    .part_variances.none;
+          get_variance(none_var);
+          const int val_none_var = none_var->variance;
+          avg_16x16[blk64_idx][lvl1_idx] += val_none_var;
+          minvar_16x16[blk64_idx][lvl1_idx] =
+              AOMMIN(minvar_16x16[blk64_idx][lvl1_idx], val_none_var);
+          maxvar_16x16[blk64_idx][lvl1_idx] =
+              AOMMAX(maxvar_16x16[blk64_idx][lvl1_idx], val_none_var);
+          if (val_none_var > thresholds[3]) {
+            // 16X16 variance is above the threshold for split, so force split
+            // to 8x8 for this 16x16 block (this also forces splits for upper
+            // levels).
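+            // force_split[] layout: index 0 is the whole superblock, 1-4 the
+            // 64x64 blocks, 5-20 the 32x32 blocks, and 21-84 the 16x16
+            // blocks, which is why all four levels are marked here.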
+ force_split[split_index] = PART_EVAL_ONLY_SPLIT; + force_split[5 + blk64_scale_idx + lvl1_idx] = PART_EVAL_ONLY_SPLIT; + force_split[blk64_idx + 1] = PART_EVAL_ONLY_SPLIT; + force_split[0] = PART_EVAL_ONLY_SPLIT; + } else if (!cyclic_refresh_segment_id_boosted(segment_id) && + compute_minmax_variance && val_none_var > thresholds[2]) { + // We have some nominal amount of 16x16 variance (based on average), + // compute the minmax over the 8x8 sub-blocks, and if above + // threshold, force split to 8x8 block for this 16x16 block. + int minmax = compute_minmax_8x8(src_buf, src_stride, dst_buf, + dst_stride, x16_idx, y16_idx, +#if CONFIG_AV1_HIGHBITDEPTH + xd->cur_buf->flags, +#endif + pixels_wide, pixels_high); + const int thresh_minmax = (int)cpi->vbp_info.threshold_minmax; + if (minmax > thresh_minmax) { + force_split[split_index] = PART_EVAL_ONLY_SPLIT; + force_split[5 + blk64_scale_idx + lvl1_idx] = + PART_EVAL_ONLY_SPLIT; + force_split[blk64_idx + 1] = PART_EVAL_ONLY_SPLIT; + force_split[0] = PART_EVAL_ONLY_SPLIT; + } + } + } + } + } + } +} + +static AOM_INLINE void set_ref_frame_for_partition( + AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, + MV_REFERENCE_FRAME *ref_frame_partition, MB_MODE_INFO *mi, + unsigned int *y_sad, unsigned int *y_sad_g, unsigned int *y_sad_alt, + const YV12_BUFFER_CONFIG *yv12_g, const YV12_BUFFER_CONFIG *yv12_alt, + int mi_row, int mi_col, int num_planes) { + AV1_COMMON *const cm = &cpi->common; + const bool is_set_golden_ref_frame = + *y_sad_g < 0.9 * *y_sad && *y_sad_g < *y_sad_alt; + const bool is_set_altref_ref_frame = + *y_sad_alt < 0.9 * *y_sad && *y_sad_alt < *y_sad_g; + + if (is_set_golden_ref_frame) { + av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col, + get_ref_scale_factors(cm, GOLDEN_FRAME), num_planes); + mi->ref_frame[0] = GOLDEN_FRAME; + mi->mv[0].as_int = 0; + *y_sad = *y_sad_g; + *ref_frame_partition = GOLDEN_FRAME; + x->nonrd_prune_ref_frame_search = 0; + x->sb_me_partition = 0; + } else if (is_set_altref_ref_frame) { + av1_setup_pre_planes(xd, 0, yv12_alt, mi_row, mi_col, + get_ref_scale_factors(cm, ALTREF_FRAME), num_planes); + mi->ref_frame[0] = ALTREF_FRAME; + mi->mv[0].as_int = 0; + *y_sad = *y_sad_alt; + *ref_frame_partition = ALTREF_FRAME; + x->nonrd_prune_ref_frame_search = 0; + x->sb_me_partition = 0; + } else { + *ref_frame_partition = LAST_FRAME; + x->nonrd_prune_ref_frame_search = + cpi->sf.rt_sf.nonrd_prune_ref_frame_search; + } +} + +static AOM_FORCE_INLINE int mv_distance(const FULLPEL_MV *mv0, + const FULLPEL_MV *mv1) { + return abs(mv0->row - mv1->row) + abs(mv0->col - mv1->col); +} + +static AOM_INLINE void evaluate_neighbour_mvs(AV1_COMP *cpi, MACROBLOCK *x, + unsigned int *y_sad, + bool is_small_sb, + int est_motion) { + const int source_sad_nonrd = x->content_state_sb.source_sad_nonrd; + // TODO(yunqingwang@google.com): test if this condition works with other + // speeds. + if (est_motion > 2 && source_sad_nonrd > kMedSad) return; + + MACROBLOCKD *xd = &x->e_mbd; + BLOCK_SIZE bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128; + MB_MODE_INFO *mi = xd->mi[0]; + + unsigned int above_y_sad = UINT_MAX; + unsigned int left_y_sad = UINT_MAX; + FULLPEL_MV above_mv = kZeroFullMv; + FULLPEL_MV left_mv = kZeroFullMv; + SubpelMvLimits subpel_mv_limits; + const MV dummy_mv = { 0, 0 }; + av1_set_subpel_mv_search_range(&subpel_mv_limits, &x->mv_limits, &dummy_mv); + + // Current best MV + FULLPEL_MV best_mv = get_fullmv_from_mv(&mi->mv[0].as_mv); + const int multi = (est_motion > 2 && source_sad_nonrd > kLowSad) ? 
7 : 8; + + if (xd->up_available) { + const MB_MODE_INFO *above_mbmi = xd->above_mbmi; + if (above_mbmi->mode >= INTRA_MODE_END && + above_mbmi->ref_frame[0] == LAST_FRAME) { + MV temp = above_mbmi->mv[0].as_mv; + clamp_mv(&temp, &subpel_mv_limits); + above_mv = get_fullmv_from_mv(&temp); + + if (mv_distance(&best_mv, &above_mv) > 0) { + uint8_t const *ref_buf = + get_buf_from_fullmv(&xd->plane[0].pre[0], &above_mv); + above_y_sad = cpi->ppi->fn_ptr[bsize].sdf( + x->plane[0].src.buf, x->plane[0].src.stride, ref_buf, + xd->plane[0].pre[0].stride); + } + } + } + if (xd->left_available) { + const MB_MODE_INFO *left_mbmi = xd->left_mbmi; + if (left_mbmi->mode >= INTRA_MODE_END && + left_mbmi->ref_frame[0] == LAST_FRAME) { + MV temp = left_mbmi->mv[0].as_mv; + clamp_mv(&temp, &subpel_mv_limits); + left_mv = get_fullmv_from_mv(&temp); + + if (mv_distance(&best_mv, &left_mv) > 0 && + mv_distance(&above_mv, &left_mv) > 0) { + uint8_t const *ref_buf = + get_buf_from_fullmv(&xd->plane[0].pre[0], &left_mv); + left_y_sad = cpi->ppi->fn_ptr[bsize].sdf( + x->plane[0].src.buf, x->plane[0].src.stride, ref_buf, + xd->plane[0].pre[0].stride); + } + } + } + + if (above_y_sad < ((multi * *y_sad) >> 3) && above_y_sad < left_y_sad) { + *y_sad = above_y_sad; + mi->mv[0].as_mv = get_mv_from_fullmv(&above_mv); + clamp_mv(&mi->mv[0].as_mv, &subpel_mv_limits); + } + if (left_y_sad < ((multi * *y_sad) >> 3) && left_y_sad < above_y_sad) { + *y_sad = left_y_sad; + mi->mv[0].as_mv = get_mv_from_fullmv(&left_mv); + clamp_mv(&mi->mv[0].as_mv, &subpel_mv_limits); + } +} + +static void setup_planes(AV1_COMP *cpi, MACROBLOCK *x, unsigned int *y_sad, + unsigned int *y_sad_g, unsigned int *y_sad_alt, + unsigned int *y_sad_last, + MV_REFERENCE_FRAME *ref_frame_partition, + struct scale_factors *sf_no_scale, int mi_row, + int mi_col, bool is_small_sb, bool scaled_ref_last) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + const int num_planes = av1_num_planes(cm); + bool scaled_ref_golden = false; + bool scaled_ref_alt = false; + BLOCK_SIZE bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128; + MB_MODE_INFO *mi = xd->mi[0]; + const YV12_BUFFER_CONFIG *yv12 = + scaled_ref_last ? av1_get_scaled_ref_frame(cpi, LAST_FRAME) + : get_ref_frame_yv12_buf(cm, LAST_FRAME); + assert(yv12 != NULL); + const YV12_BUFFER_CONFIG *yv12_g = NULL; + const YV12_BUFFER_CONFIG *yv12_alt = NULL; + // Check if LAST is a reference. For spatial layers always use it as + // reference scaling. + int use_last_ref = (cpi->ref_frame_flags & AOM_LAST_FLAG) || + cpi->svc.number_spatial_layers > 1; + int use_golden_ref = cpi->ref_frame_flags & AOM_GOLD_FLAG; + int use_alt_ref = cpi->ppi->rtc_ref.set_ref_frame_config || + cpi->sf.rt_sf.use_nonrd_altref_frame || + (cpi->sf.rt_sf.use_comp_ref_nonrd && + cpi->sf.rt_sf.ref_frame_comp_nonrd[2] == 1); + + // For 1 spatial layer: GOLDEN is another temporal reference. + // Check if it should be used as reference for partitioning. + if (cpi->svc.number_spatial_layers == 1 && use_golden_ref && + (x->content_state_sb.source_sad_nonrd != kZeroSad || !use_last_ref)) { + yv12_g = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME); + if (yv12_g && (yv12_g->y_crop_height != cm->height || + yv12_g->y_crop_width != cm->width)) { + yv12_g = av1_get_scaled_ref_frame(cpi, GOLDEN_FRAME); + scaled_ref_golden = true; + } + if (yv12_g && yv12_g != yv12) { + av1_setup_pre_planes( + xd, 0, yv12_g, mi_row, mi_col, + scaled_ref_golden ? 
NULL : get_ref_scale_factors(cm, GOLDEN_FRAME),
+          num_planes);
+      *y_sad_g = cpi->ppi->fn_ptr[bsize].sdf(
+          x->plane[AOM_PLANE_Y].src.buf, x->plane[AOM_PLANE_Y].src.stride,
+          xd->plane[AOM_PLANE_Y].pre[0].buf,
+          xd->plane[AOM_PLANE_Y].pre[0].stride);
+    }
+  }
+
+  // For 1 spatial layer: ALTREF is another temporal reference.
+  // Check if it should be used as reference for partitioning.
+  if (cpi->svc.number_spatial_layers == 1 && use_alt_ref &&
+      (cpi->ref_frame_flags & AOM_ALT_FLAG) &&
+      (x->content_state_sb.source_sad_nonrd != kZeroSad || !use_last_ref)) {
+    yv12_alt = get_ref_frame_yv12_buf(cm, ALTREF_FRAME);
+    if (yv12_alt && (yv12_alt->y_crop_height != cm->height ||
+                     yv12_alt->y_crop_width != cm->width)) {
+      yv12_alt = av1_get_scaled_ref_frame(cpi, ALTREF_FRAME);
+      scaled_ref_alt = true;
+    }
+    if (yv12_alt && yv12_alt != yv12) {
+      av1_setup_pre_planes(
+          xd, 0, yv12_alt, mi_row, mi_col,
+          scaled_ref_alt ? NULL : get_ref_scale_factors(cm, ALTREF_FRAME),
+          num_planes);
+      *y_sad_alt = cpi->ppi->fn_ptr[bsize].sdf(
+          x->plane[AOM_PLANE_Y].src.buf, x->plane[AOM_PLANE_Y].src.stride,
+          xd->plane[AOM_PLANE_Y].pre[0].buf,
+          xd->plane[AOM_PLANE_Y].pre[0].stride);
+    }
+  }
+
+  if (use_last_ref) {
+    const int source_sad_nonrd = x->content_state_sb.source_sad_nonrd;
+    av1_setup_pre_planes(
+        xd, 0, yv12, mi_row, mi_col,
+        scaled_ref_last ? NULL : get_ref_scale_factors(cm, LAST_FRAME),
+        num_planes);
+    mi->ref_frame[0] = LAST_FRAME;
+    mi->ref_frame[1] = NONE_FRAME;
+    mi->bsize = cm->seq_params->sb_size;
+    mi->mv[0].as_int = 0;
+    mi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
+
+    int est_motion = cpi->sf.rt_sf.estimate_motion_for_var_based_partition;
+    // TODO(b/290596301): Look into adjusting this condition.
+    // There is a regression on color content when
+    // estimate_motion_for_var_based_partition = 3 and motion is high,
+    // so for now force it to 2 based on superblock sad.
+    if (est_motion > 2 && source_sad_nonrd > kMedSad) est_motion = 2;
+
+    if (est_motion == 1 || est_motion == 2) {
+      if (xd->mb_to_right_edge >= 0 && xd->mb_to_bottom_edge >= 0) {
+        // For screen content, only do int_pro_motion when the spatial
+        // variance is above a threshold and the motion level is above LowSad.
+        if (x->source_variance > 100 && source_sad_nonrd > kLowSad) {
+          int is_screen = cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN;
+          int me_search_size_col =
+              is_screen ? 96 : block_size_wide[cm->seq_params->sb_size] >> 1;
+          // For screen content, use a larger row search size to capture
+          // vertical scrolling, which can produce larger motion.
+          int me_search_size_row =
+              is_screen ? 192 : block_size_high[cm->seq_params->sb_size] >> 1;
+          unsigned int y_sad_zero;
+          *y_sad = av1_int_pro_motion_estimation(
+              cpi, x, cm->seq_params->sb_size, mi_row, mi_col, &kZeroMv,
+              &y_sad_zero, me_search_size_col, me_search_size_row);
+          // The logic below selects whether the motion estimated in
+          // int_pro_motion() will be used in nonrd_pickmode. Only do this
+          // for screen content for now.
+          if (is_screen) {
+            unsigned int thresh_sad =
+                (cm->seq_params->sb_size == BLOCK_128X128) ? 50000 : 20000;
+            if (*y_sad < (y_sad_zero >> 1) && *y_sad < thresh_sad) {
+              x->sb_me_partition = 1;
+              x->sb_me_mv.as_int = mi->mv[0].as_int;
+            } else {
+              x->sb_me_partition = 0;
+              // Fall back to using zero motion.
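+              // Reached when the estimated-MV SAD fails to come in below
+              // half the zero-MV SAD, or exceeds the absolute cap (50000 for
+              // 128x128 superblocks, 20000 for 64x64).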
+              *y_sad = y_sad_zero;
+              mi->mv[0].as_int = 0;
+            }
+          }
+        }
+      }
+    }
+
+    if (*y_sad == UINT_MAX) {
+      *y_sad = cpi->ppi->fn_ptr[bsize].sdf(
+          x->plane[AOM_PLANE_Y].src.buf, x->plane[AOM_PLANE_Y].src.stride,
+          xd->plane[AOM_PLANE_Y].pre[0].buf,
+          xd->plane[AOM_PLANE_Y].pre[0].stride);
+    }
+
+    // Evaluate if neighbours' MVs give better predictions. Zero MV is tested
+    // already, so only non-zero MVs are tested here. The neighbour blocks
+    // considered are the first blocks above and to the left of this
+    // superblock.
+    if (est_motion >= 2 && (xd->up_available || xd->left_available))
+      evaluate_neighbour_mvs(cpi, x, y_sad, is_small_sb, est_motion);
+
+    *y_sad_last = *y_sad;
+  }
+
+  // Pick the ref frame for partitioning; use the golden or altref frame only
+  // if its SAD is lower, with a 0.9 bias factor toward LAST.
+  set_ref_frame_for_partition(cpi, x, xd, ref_frame_partition, mi, y_sad,
+                              y_sad_g, y_sad_alt, yv12_g, yv12_alt, mi_row,
+                              mi_col, num_planes);
+
+  // Only calculate the predictor for non-zero MV.
+  if (mi->mv[0].as_int != 0) {
+    if (!scaled_ref_last) {
+      set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
+    } else {
+      xd->block_ref_scale_factors[0] = sf_no_scale;
+      xd->block_ref_scale_factors[1] = sf_no_scale;
+    }
+    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL,
+                                  cm->seq_params->sb_size, AOM_PLANE_Y,
+                                  num_planes - 1);
+  }
+}
+
+// Decides whether to split or merge a 16x16 partition block in variance based
+// partitioning, based on the 8x8 sub-block variances.
+static AOM_INLINE PART_EVAL_STATUS get_part_eval_based_on_sub_blk_var(
+    VP16x16 *var_16x16_info, int64_t threshold16) {
+  int max_8x8_var = 0, min_8x8_var = INT_MAX;
+  for (int split_idx = 0; split_idx < 4; split_idx++) {
+    get_variance(&var_16x16_info->split[split_idx].part_variances.none);
+    int this_8x8_var =
+        var_16x16_info->split[split_idx].part_variances.none.variance;
+    max_8x8_var = AOMMAX(this_8x8_var, max_8x8_var);
+    min_8x8_var = AOMMIN(this_8x8_var, min_8x8_var);
+  }
+  // If the difference between the maximum and minimum sub-block variances is
+  // high, then only evaluate PARTITION_SPLIT for the 16x16 block. Otherwise,
+  // evaluate only PARTITION_NONE. The shift factor for threshold16 has been
+  // derived empirically.
+  return ((max_8x8_var - min_8x8_var) > (threshold16 << 2))
+             ? PART_EVAL_ONLY_SPLIT
+             : PART_EVAL_ONLY_NONE;
+}
+
+static AOM_INLINE bool is_set_force_zeromv_skip_based_on_src_sad(
+    int set_zeromv_skip_based_on_source_sad, SOURCE_SAD source_sad_nonrd) {
+  if (set_zeromv_skip_based_on_source_sad == 0) return false;
+
+  if (set_zeromv_skip_based_on_source_sad >= 3)
+    return source_sad_nonrd <= kLowSad;
+  else if (set_zeromv_skip_based_on_source_sad >= 2)
+    return source_sad_nonrd <= kVeryLowSad;
+  else if (set_zeromv_skip_based_on_source_sad >= 1)
+    return source_sad_nonrd == kZeroSad;
+
+  return false;
+}
+
+static AOM_INLINE bool set_force_zeromv_skip_for_sb(
+    AV1_COMP *cpi, MACROBLOCK *x, const TileInfo *const tile, VP128x128 *vt,
+    unsigned int *uv_sad, int mi_row, int mi_col, unsigned int y_sad,
+    BLOCK_SIZE bsize) {
+  AV1_COMMON *const cm = &cpi->common;
+  if (!is_set_force_zeromv_skip_based_on_src_sad(
+          cpi->sf.rt_sf.set_zeromv_skip_based_on_source_sad,
+          x->content_state_sb.source_sad_nonrd))
+    return false;
+  int shift = cpi->sf.rt_sf.increase_source_sad_thresh ?
1 : 0;
+  const int block_width = mi_size_wide[cm->seq_params->sb_size];
+  const int block_height = mi_size_high[cm->seq_params->sb_size];
+  const unsigned int thresh_exit_part_y =
+      cpi->zeromv_skip_thresh_exit_part[bsize] << shift;
+  unsigned int thresh_exit_part_uv =
+      CALC_CHROMA_THRESH_FOR_ZEROMV_SKIP(thresh_exit_part_y) << shift;
+  // Be more aggressive in the UV threshold if source_sad >= VeryLowSad, to
+  // suppress visual artifacts caused by the speed feature:
+  // set_zeromv_skip_based_on_source_sad = 2. For now only for
+  // part_early_exit_zeromv = 1.
+  if (x->content_state_sb.source_sad_nonrd >= kVeryLowSad &&
+      cpi->sf.rt_sf.part_early_exit_zeromv == 1)
+    thresh_exit_part_uv = thresh_exit_part_uv >> 3;
+  if (mi_col + block_width <= tile->mi_col_end &&
+      mi_row + block_height <= tile->mi_row_end && y_sad < thresh_exit_part_y &&
+      uv_sad[0] < thresh_exit_part_uv && uv_sad[1] < thresh_exit_part_uv) {
+    set_block_size(cpi, mi_row, mi_col, bsize);
+    x->force_zeromv_skip_for_sb = 1;
+    aom_free(vt);
+    // Partition shape is set here at SB level.
+    // Exit needs to happen from av1_choose_var_based_partitioning().
+    return true;
+  } else if (x->content_state_sb.source_sad_nonrd == kZeroSad &&
+             cpi->sf.rt_sf.part_early_exit_zeromv >= 2)
+    x->force_zeromv_skip_for_sb = 2;
+  return false;
+}
+
+int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
+                                      ThreadData *td, MACROBLOCK *x, int mi_row,
+                                      int mi_col) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, choose_var_based_partitioning_time);
+#endif
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  const int64_t *const vbp_thresholds = cpi->vbp_info.thresholds;
+  PART_EVAL_STATUS force_split[85];
+  int avg_64x64;
+  int max_var_32x32[4];
+  int min_var_32x32[4];
+  int var_32x32;
+  int var_64x64;
+  int min_var_64x64 = INT_MAX;
+  int max_var_64x64 = 0;
+  int avg_16x16[4][4];
+  int maxvar_16x16[4][4];
+  int minvar_16x16[4][4];
+  const uint8_t *src_buf;
+  const uint8_t *dst_buf;
+  int dst_stride;
+  unsigned int uv_sad[MAX_MB_PLANE - 1];
+  NOISE_LEVEL noise_level = kLow;
+  bool is_zero_motion = true;
+  bool scaled_ref_last = false;
+  struct scale_factors sf_no_scale;
+  av1_setup_scale_factors_for_frame(&sf_no_scale, cm->width, cm->height,
+                                    cm->width, cm->height);
+
+  bool is_key_frame =
+      (frame_is_intra_only(cm) ||
+       (cpi->ppi->use_svc &&
+        cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame));
+
+  assert(cm->seq_params->sb_size == BLOCK_64X64 ||
+         cm->seq_params->sb_size == BLOCK_128X128);
+  const bool is_small_sb = (cm->seq_params->sb_size == BLOCK_64X64);
+  const int num_64x64_blocks = is_small_sb ? 1 : 4;
+
+  unsigned int y_sad = UINT_MAX;
+  unsigned int y_sad_g = UINT_MAX;
+  unsigned int y_sad_alt = UINT_MAX;
+  unsigned int y_sad_last = UINT_MAX;
+  BLOCK_SIZE bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128;
+
+  // Ref frame used in partitioning.
+  MV_REFERENCE_FRAME ref_frame_partition = LAST_FRAME;
+
+  int64_t thresholds[5] = { vbp_thresholds[0], vbp_thresholds[1],
+                            vbp_thresholds[2], vbp_thresholds[3],
+                            vbp_thresholds[4] };
+
+  const int segment_id = xd->mi[0]->segment_id;
+  uint64_t blk_sad = 0;
+  if (cpi->src_sad_blk_64x64 != NULL &&
+      cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) {
+    const int sb_size_by_mb = (cm->seq_params->sb_size == BLOCK_128X128)
+                                  ?
(cm->seq_params->mib_size >> 1)
+                                  : cm->seq_params->mib_size;
+    const int sb_cols =
+        (cm->mi_params.mi_cols + sb_size_by_mb - 1) / sb_size_by_mb;
+    const int sbi_col = mi_col / sb_size_by_mb;
+    const int sbi_row = mi_row / sb_size_by_mb;
+    blk_sad = cpi->src_sad_blk_64x64[sbi_col + sbi_row * sb_cols];
+  }
+
+  const bool is_segment_id_boosted =
+      cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
+      cyclic_refresh_segment_id_boosted(segment_id);
+  const int qindex =
+      is_segment_id_boosted
+          ? av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex)
+          : cm->quant_params.base_qindex;
+  set_vbp_thresholds(
+      cpi, thresholds, blk_sad, qindex, x->content_state_sb.low_sumdiff,
+      x->content_state_sb.source_sad_nonrd, x->content_state_sb.source_sad_rd,
+      is_segment_id_boosted, x->content_state_sb.lighting_change);
+
+  src_buf = x->plane[AOM_PLANE_Y].src.buf;
+  int src_stride = x->plane[AOM_PLANE_Y].src.stride;
+
+  // Index layout for force_split: 0 for the superblock root, 1-4 for the
+  // 64x64 blocks, 5-20 for the 32x32 blocks, 21-84 for the 16x16 blocks.
+  force_split[0] = PART_EVAL_ALL;
+  memset(x->part_search_info.variance_low, 0,
+         sizeof(x->part_search_info.variance_low));
+
+  // Check if the LAST frame is NULL, and if so, treat this frame
+  // as a key frame for the purpose of the superblock partitioning.
+  // LAST == NULL can happen in cases where enhancement spatial layers are
+  // enabled dynamically and the only reference is the spatial one (GOLDEN).
+  // If the LAST frame has a different resolution: set the scaled_ref_last
+  // flag and check if ref_scaled is NULL.
+  if (!frame_is_intra_only(cm)) {
+    const YV12_BUFFER_CONFIG *ref = get_ref_frame_yv12_buf(cm, LAST_FRAME);
+    if (ref == NULL) {
+      is_key_frame = true;
+    } else if (ref->y_crop_height != cm->height ||
+               ref->y_crop_width != cm->width) {
+      scaled_ref_last = true;
+      const YV12_BUFFER_CONFIG *ref_scaled =
+          av1_get_scaled_ref_frame(cpi, LAST_FRAME);
+      if (ref_scaled == NULL) is_key_frame = true;
+    }
+  }
+
+  x->source_variance = UINT_MAX;
+  // For nonrd_pickmode: compute source_variance, for now only for superblocks
+  // with some motion. This input can then be used to bias the partitioning
+  // or the chroma_check.
+  if (cpi->sf.rt_sf.use_nonrd_pick_mode &&
+      x->content_state_sb.source_sad_nonrd > kLowSad)
+    x->source_variance = av1_get_perpixel_variance_facade(
+        cpi, xd, &x->plane[0].src, cm->seq_params->sb_size, AOM_PLANE_Y);
+
+  if (!is_key_frame) {
+    setup_planes(cpi, x, &y_sad, &y_sad_g, &y_sad_alt, &y_sad_last,
+                 &ref_frame_partition, &sf_no_scale, mi_row, mi_col,
+                 is_small_sb, scaled_ref_last);
+
+    MB_MODE_INFO *mi = xd->mi[0];
+    // Use the reference SB directly for zero mv.
+    if (mi->mv[0].as_int != 0) {
+      dst_buf = xd->plane[AOM_PLANE_Y].dst.buf;
+      dst_stride = xd->plane[AOM_PLANE_Y].dst.stride;
+      is_zero_motion = false;
+    } else {
+      dst_buf = xd->plane[AOM_PLANE_Y].pre[0].buf;
+      dst_stride = xd->plane[AOM_PLANE_Y].pre[0].stride;
+    }
+  } else {
+    dst_buf = NULL;
+    dst_stride = 0;
+  }
+
+  // Check and set the color sensitivity of the sb.
+  av1_zero(uv_sad);
+  chroma_check(cpi, x, bsize, y_sad_last, y_sad_g, y_sad_alt, is_key_frame,
+               is_zero_motion, uv_sad);
+
+  x->force_zeromv_skip_for_sb = 0;
+
+  VP128x128 *vt;
+  AOM_CHECK_MEM_ERROR(xd->error_info, vt, aom_malloc(sizeof(*vt)));
+  vt->split = td->vt64x64;
+
+  // If the superblock is completely static (zero source sad) and
+  // the y_sad (relative to the LAST ref) is very small, take the sb_size
+  // partition and exit, and force zeromv_last skip mode for nonrd_pickmode.
+  // Only do this on the base segment (so the QP-boosted segment, if applied,
+  // can still continue cleaning/ramping up the quality).
+  // A condition on the color uv_sad is also added.
+  if (!is_key_frame && cpi->sf.rt_sf.part_early_exit_zeromv &&
+      cpi->rc.frames_since_key > 30 && segment_id == CR_SEGMENT_ID_BASE &&
+      ref_frame_partition == LAST_FRAME && xd->mi[0]->mv[0].as_int == 0) {
+    // Exit here if the zero-mv skip flag is set at SB level.
+    if (set_force_zeromv_skip_for_sb(cpi, x, tile, vt, uv_sad, mi_row, mi_col,
+                                     y_sad, bsize))
+      return 0;
+  }
+
+  if (cpi->noise_estimate.enabled)
+    noise_level = av1_noise_estimate_extract_level(&cpi->noise_estimate);
+
+  // Fill in the entire tree of 8x8 (for inter frames) or 4x4 (for key frames)
+  // variances for splits.
+  fill_variance_tree_leaves(cpi, x, vt, force_split, avg_16x16, maxvar_16x16,
+                            minvar_16x16, thresholds, src_buf, src_stride,
+                            dst_buf, dst_stride, is_key_frame, is_small_sb);
+
+  avg_64x64 = 0;
+  for (int blk64_idx = 0; blk64_idx < num_64x64_blocks; ++blk64_idx) {
+    max_var_32x32[blk64_idx] = 0;
+    min_var_32x32[blk64_idx] = INT_MAX;
+    const int blk64_scale_idx = blk64_idx << 2;
+    for (int lvl1_idx = 0; lvl1_idx < 4; lvl1_idx++) {
+      const int lvl1_scale_idx = (blk64_scale_idx + lvl1_idx) << 2;
+      for (int lvl2_idx = 0; lvl2_idx < 4; lvl2_idx++) {
+        if (!is_key_frame) continue;
+        VP16x16 *vtemp = &vt->split[blk64_idx].split[lvl1_idx].split[lvl2_idx];
+        for (int lvl3_idx = 0; lvl3_idx < 4; lvl3_idx++)
+          fill_variance_tree(&vtemp->split[lvl3_idx], BLOCK_8X8);
+        fill_variance_tree(vtemp, BLOCK_16X16);
+        // If the variance of this 16x16 block is above the threshold, force
+        // the block to split. This also forces a split on the upper levels.
+        get_variance(&vtemp->part_variances.none);
+        if (vtemp->part_variances.none.variance > thresholds[3]) {
+          const int split_index = 21 + lvl1_scale_idx + lvl2_idx;
+          force_split[split_index] =
+              cpi->sf.rt_sf.vbp_prune_16x16_split_using_min_max_sub_blk_var
+                  ? get_part_eval_based_on_sub_blk_var(vtemp, thresholds[3])
+                  : PART_EVAL_ONLY_SPLIT;
+          force_split[5 + blk64_scale_idx + lvl1_idx] = PART_EVAL_ONLY_SPLIT;
+          force_split[blk64_idx + 1] = PART_EVAL_ONLY_SPLIT;
+          force_split[0] = PART_EVAL_ONLY_SPLIT;
+        }
+      }
+      fill_variance_tree(&vt->split[blk64_idx].split[lvl1_idx], BLOCK_32X32);
+      // If the variance of this 32x32 block is above the threshold, or if it
+      // is above (some fraction of) the average variance over the sub-16x16
+      // blocks, then force this block to split. This also forces a split on
+      // the upper (64x64) level.
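+      // Concretely (in the block below): split when var_32x32 > thresholds[2],
+      // or, for inter frames, when var_32x32 exceeds both thresholds[2] / 2
+      // and half the sum of the four 16x16 variances (avg_16x16 accumulates
+      // the sum, so avg_16x16 >> 1 is twice the per-block average).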
+      uint64_t frame_sad_thresh = 20000;
+      const int is_360p_or_smaller = cm->width * cm->height <= RESOLUTION_360P;
+      if (cpi->svc.number_temporal_layers > 2 &&
+          cpi->svc.temporal_layer_id == 0)
+        frame_sad_thresh = frame_sad_thresh << 1;
+      if (force_split[5 + blk64_scale_idx + lvl1_idx] == PART_EVAL_ALL) {
+        get_variance(&vt->split[blk64_idx].split[lvl1_idx].part_variances.none);
+        var_32x32 =
+            vt->split[blk64_idx].split[lvl1_idx].part_variances.none.variance;
+        max_var_32x32[blk64_idx] = AOMMAX(var_32x32, max_var_32x32[blk64_idx]);
+        min_var_32x32[blk64_idx] = AOMMIN(var_32x32, min_var_32x32[blk64_idx]);
+        const int max_min_var_16X16_diff = (maxvar_16x16[blk64_idx][lvl1_idx] -
+                                            minvar_16x16[blk64_idx][lvl1_idx]);
+
+        if (var_32x32 > thresholds[2] ||
+            (!is_key_frame && var_32x32 > (thresholds[2] >> 1) &&
+             var_32x32 > (avg_16x16[blk64_idx][lvl1_idx] >> 1))) {
+          force_split[5 + blk64_scale_idx + lvl1_idx] = PART_EVAL_ONLY_SPLIT;
+          force_split[blk64_idx + 1] = PART_EVAL_ONLY_SPLIT;
+          force_split[0] = PART_EVAL_ONLY_SPLIT;
+        } else if (!is_key_frame && is_360p_or_smaller &&
+                   ((max_min_var_16X16_diff > (thresholds[2] >> 1) &&
+                     maxvar_16x16[blk64_idx][lvl1_idx] > thresholds[2]) ||
+                    (cpi->sf.rt_sf.prefer_large_partition_blocks &&
+                     x->content_state_sb.source_sad_nonrd > kLowSad &&
+                     cpi->rc.frame_source_sad < frame_sad_thresh &&
+                     maxvar_16x16[blk64_idx][lvl1_idx] > (thresholds[2] >> 4) &&
+                     maxvar_16x16[blk64_idx][lvl1_idx] >
+                         (minvar_16x16[blk64_idx][lvl1_idx] << 2)))) {
+          force_split[5 + blk64_scale_idx + lvl1_idx] = PART_EVAL_ONLY_SPLIT;
+          force_split[blk64_idx + 1] = PART_EVAL_ONLY_SPLIT;
+          force_split[0] = PART_EVAL_ONLY_SPLIT;
+        }
+      }
+    }
+    if (force_split[1 + blk64_idx] == PART_EVAL_ALL) {
+      fill_variance_tree(&vt->split[blk64_idx], BLOCK_64X64);
+      get_variance(&vt->split[blk64_idx].part_variances.none);
+      var_64x64 = vt->split[blk64_idx].part_variances.none.variance;
+      max_var_64x64 = AOMMAX(var_64x64, max_var_64x64);
+      min_var_64x64 = AOMMIN(var_64x64, min_var_64x64);
+      // If the difference of the max-min variances of the sub-blocks or the
+      // max variance of a sub-block is above some threshold, then force this
+      // block to split. Only check this when the noise level is >= medium,
+      // when the encoder is in SVC mode, or when large blocks were already
+      // preferred.
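+      // Concretely (in the block below): for inter frames, split the 64x64
+      // when (max_var_32x32 - min_var_32x32) > 3 * (thresholds[1] >> 3) and
+      // max_var_32x32 > thresholds[1] >> 1, subject to the noise/SVC gate
+      // described above.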
+ const int max_min_var_32x32_diff = + max_var_32x32[blk64_idx] - min_var_32x32[blk64_idx]; + const int check_max_var = max_var_32x32[blk64_idx] > thresholds[1] >> 1; + const bool check_noise_lvl = noise_level >= kMedium || + cpi->ppi->use_svc || + cpi->sf.rt_sf.prefer_large_partition_blocks; + const int64_t set_threshold = 3 * (thresholds[1] >> 3); + + if (!is_key_frame && max_min_var_32x32_diff > set_threshold && + check_max_var && check_noise_lvl) { + force_split[1 + blk64_idx] = PART_EVAL_ONLY_SPLIT; + force_split[0] = PART_EVAL_ONLY_SPLIT; + } + avg_64x64 += var_64x64; + } + if (is_small_sb) force_split[0] = PART_EVAL_ONLY_SPLIT; + } + + if (force_split[0] == PART_EVAL_ALL) { + fill_variance_tree(vt, BLOCK_128X128); + get_variance(&vt->part_variances.none); + const int set_avg_64x64 = (9 * avg_64x64) >> 5; + if (!is_key_frame && vt->part_variances.none.variance > set_avg_64x64) + force_split[0] = PART_EVAL_ONLY_SPLIT; + + if (!is_key_frame && + (max_var_64x64 - min_var_64x64) > 3 * (thresholds[0] >> 3) && + max_var_64x64 > thresholds[0] >> 1) + force_split[0] = PART_EVAL_ONLY_SPLIT; + } + + if (mi_col + 32 > tile->mi_col_end || mi_row + 32 > tile->mi_row_end || + !set_vt_partitioning(cpi, xd, tile, vt, BLOCK_128X128, mi_row, mi_col, + thresholds[0], BLOCK_16X16, force_split[0])) { + for (int blk64_idx = 0; blk64_idx < num_64x64_blocks; ++blk64_idx) { + const int x64_idx = GET_BLK_IDX_X(blk64_idx, 4); + const int y64_idx = GET_BLK_IDX_Y(blk64_idx, 4); + const int blk64_scale_idx = blk64_idx << 2; + + // Now go through the entire structure, splitting every block size until + // we get to one that's got a variance lower than our threshold. + if (set_vt_partitioning(cpi, xd, tile, &vt->split[blk64_idx], BLOCK_64X64, + mi_row + y64_idx, mi_col + x64_idx, thresholds[1], + BLOCK_16X16, force_split[1 + blk64_idx])) + continue; + for (int lvl1_idx = 0; lvl1_idx < 4; ++lvl1_idx) { + const int x32_idx = GET_BLK_IDX_X(lvl1_idx, 3); + const int y32_idx = GET_BLK_IDX_Y(lvl1_idx, 3); + const int lvl1_scale_idx = (blk64_scale_idx + lvl1_idx) << 2; + if (set_vt_partitioning( + cpi, xd, tile, &vt->split[blk64_idx].split[lvl1_idx], + BLOCK_32X32, (mi_row + y64_idx + y32_idx), + (mi_col + x64_idx + x32_idx), thresholds[2], BLOCK_16X16, + force_split[5 + blk64_scale_idx + lvl1_idx])) + continue; + for (int lvl2_idx = 0; lvl2_idx < 4; ++lvl2_idx) { + const int x16_idx = GET_BLK_IDX_X(lvl2_idx, 2); + const int y16_idx = GET_BLK_IDX_Y(lvl2_idx, 2); + const int split_index = 21 + lvl1_scale_idx + lvl2_idx; + VP16x16 *vtemp = + &vt->split[blk64_idx].split[lvl1_idx].split[lvl2_idx]; + if (set_vt_partitioning(cpi, xd, tile, vtemp, BLOCK_16X16, + mi_row + y64_idx + y32_idx + y16_idx, + mi_col + x64_idx + x32_idx + x16_idx, + thresholds[3], BLOCK_8X8, + force_split[split_index])) + continue; + for (int lvl3_idx = 0; lvl3_idx < 4; ++lvl3_idx) { + const int x8_idx = GET_BLK_IDX_X(lvl3_idx, 1); + const int y8_idx = GET_BLK_IDX_Y(lvl3_idx, 1); + set_block_size(cpi, (mi_row + y64_idx + y32_idx + y16_idx + y8_idx), + (mi_col + x64_idx + x32_idx + x16_idx + x8_idx), + BLOCK_8X8); + } + } + } + } + } + + if (cpi->sf.rt_sf.short_circuit_low_temp_var) { + set_low_temp_var_flag(cpi, &x->part_search_info, xd, vt, thresholds, + ref_frame_partition, mi_col, mi_row, is_small_sb); + } + + aom_free(vt); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, choose_var_based_partitioning_time); +#endif + return 0; +} diff --git a/third_party/aom/av1/encoder/var_based_part.h b/third_party/aom/av1/encoder/var_based_part.h new file 
mode 100644
index 0000000000..f912458307
--- /dev/null
+++ b/third_party/aom/av1/encoder/var_based_part.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_VAR_BASED_PART_H_
+#define AOM_AV1_ENCODER_VAR_BASED_PART_H_
+
+#include <stdio.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/encoder/encoder.h"
+
+// Calculate block index x and y from split level and index
+#define GET_BLK_IDX_X(idx, level) (((idx) & (0x01)) << (level))
+#define GET_BLK_IDX_Y(idx, level) (((idx) >> (0x01)) << (level))
+
+#ifdef __cplusplus
extern "C" {
+#endif
+
+#define QINDEX_LARGE_BLOCK_THR \
+  100  // Use increased thresholds for midres for speed 9 when qindex is above
+       // this threshold
+
+#define CALC_CHROMA_THRESH_FOR_ZEROMV_SKIP(thresh_exit_part) \
+  ((3 * (thresh_exit_part)) >> 2)
+/*!\brief Set the thresholds for variance based partition.
+ *
+ * Set the variance split thresholds for the following block sizes:
+ * 0 - threshold_128x128, 1 - threshold_64x64, 2 - threshold_32x32,
+ * 3 - vbp_threshold_16x16, 4 - vbp_threshold_8x8 (to split to 4x4 partition;
+ * currently only used on key frames). The thresholds are based on Q,
+ * resolution, noise level, and content state.
+ *
+ * \ingroup variance_partition
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]    cpi                Top level encoder structure
+ * \param[in]    q                  q index
+ * \param[in]    content_lowsumdiff Low sumdiff flag for superblock
+ *
+ * \remark Returns the set of thresholds in \c cpi->vbp_info.thresholds.
+ */
+void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int q,
+                                           int content_lowsumdiff);
+
+/*!\brief Variance based partition selection.
+ *
+ * Select the partitioning based on the variance of the residual signal, with
+ * the residual generated as the difference between the source and the
+ * prediction. The prediction is the reconstructed LAST or reconstructed
+ * GOLDEN, whichever has the lower y SAD. For LAST, an option exists (speed
+ * feature) to use motion compensation based on superblock motion via
+ * int_pro_motion_estimation. For key frames the reference is the fixed 128
+ * level, so the variance is the source variance. The variance is computed on
+ * downsampled inputs (8x8 or 4x4 downsampled), and selection is done
+ * top-down via a set of partition thresholds, defined for each block level
+ * and set based on Q, resolution, noise level, and content state.
+ *
+ * \ingroup variance_partition
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]       cpi          Top level encoder structure
+ * \param[in]       tile         Pointer to TileInfo
+ * \param[in]       td           Pointer to ThreadData
+ * \param[in]       x            Pointer to MACROBLOCK
+ * \param[in]       mi_row       Row coordinate of the superblock in a step
+ *                               size of MI_SIZE
+ * \param[in]       mi_col       Column coordinate of the superblock in a step
+ *                               size of MI_SIZE
+ *
+ * \return Returns the partition in \c xd->mi[0]->sb_type. Also sets the low
+ * temporal variance flag and the color sensitivity flag (both used in
+ * nonrd_pickmode).
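+ *
+ * The low temporal variance flags set here are read back through
+ * av1_get_force_skip_low_temp_var() and
+ * av1_get_force_skip_low_temp_var_small_sb(), declared below.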
+ */
+int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
+                                      ThreadData *td, MACROBLOCK *x, int mi_row,
+                                      int mi_col);
+
+// Read out the block's temporal variance for the 64x64 SB case.
+int av1_get_force_skip_low_temp_var_small_sb(const uint8_t *variance_low,
+                                             int mi_row, int mi_col,
+                                             BLOCK_SIZE bsize);
+// Read out the block's temporal variance for the 128x128 SB case.
+int av1_get_force_skip_low_temp_var(const uint8_t *variance_low, int mi_row,
+                                    int mi_col, BLOCK_SIZE bsize);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_VAR_BASED_PART_H_
diff --git a/third_party/aom/av1/encoder/wedge_utils.c b/third_party/aom/av1/encoder/wedge_utils.c
new file mode 100644
index 0000000000..40670178d7
--- /dev/null
+++ b/third_party/aom/av1/encoder/wedge_utils.c
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+
+#include "aom_ports/mem.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+
+#include "av1/common/reconinter.h"
+
+#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
+
+/**
+ * Computes SSE of a compound predictor constructed from 2 fundamental
+ * predictors p0 and p1 using blending with mask.
+ *
+ * r1: Residuals of p1.
+ *     (source - p1)
+ * d: Difference of p1 and p0.
+ *    (p1 - p0)
+ * m: The blending mask
+ * N: Number of pixels
+ *
+ * 'r1', 'd', and 'm' are contiguous.
+ *
+ * Computes:
+ *  Sum((MAX_MASK_VALUE*r1 + mask*d)**2), which is equivalent to:
+ *  Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2),
+ *    where r0 is (source - p0), and r1 is (source - p1), which in turn is
+ *    equivalent to:
+ *  Sum((source*MAX_MASK_VALUE - (mask*p0 + (MAX_MASK_VALUE-mask)*p1))**2),
+ *    which is the SSE of the residuals of the compound predictor scaled up by
+ *    MAX_MASK_VALUE**2.
+ *
+ * Note that we clamp the partial term in the loop to 16 bits signed. This is
+ * to facilitate equivalent SIMD implementation. It should have no effect if
+ * residuals are within 16 - WEDGE_WEIGHT_BITS (=10) signed, which always
+ * holds for 8 bit input, and on real input, it should hold practically always,
+ * as residuals are expected to be small.
+ */
+uint64_t av1_wedge_sse_from_residuals_c(const int16_t *r1, const int16_t *d,
+                                        const uint8_t *m, int N) {
+  uint64_t csse = 0;
+  int i;
+
+  for (i = 0; i < N; i++) {
+    int32_t t = MAX_MASK_VALUE * r1[i] + m[i] * d[i];
+    t = clamp(t, INT16_MIN, INT16_MAX);
+    csse += t * t;
+  }
+  return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
+}
+
+/**
+ * Choose the mask sign for a compound predictor.
+ *
+ * ds: Difference of the squares of the residuals.
+ *     r0**2 - r1**2
+ * m: The blending mask
+ * N: Number of pixels
+ * limit: Pre-computed threshold value.
+ *        MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2))
+ *
+ * 'ds' and 'm' are contiguous.
+ *
+ * Returns true if the negated mask has lower SSE compared to the positive
+ * mask.
Computation is based on: + * Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2) + * > + * Sum(((MAX_MASK_VALUE-mask)*r0 + mask*r1)**2) + * + * which can be simplified to: + * + * Sum(mask*(r0**2 - r1**2)) > MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2)) + * + * The right hand side does not depend on the mask, and needs to be passed as + * the 'limit' parameter. + * + * After pre-computing (r0**2 - r1**2), which is passed in as 'ds', the left + * hand side is simply a scalar product between an int16_t and uint8_t vector. + * + * Note that for efficiency, ds is stored on 16 bits. Real input residuals + * being small, this should not cause a noticeable issue. + */ +int8_t av1_wedge_sign_from_residuals_c(const int16_t *ds, const uint8_t *m, + int N, int64_t limit) { + int64_t acc = 0; + + do { + acc += *ds++ * *m++; + } while (--N); + + return acc > limit; +} + +/** + * Compute the element-wise difference of the squares of 2 arrays. + * + * d: Difference of the squares of the inputs: a**2 - b**2 + * a: First input array + * b: Second input array + * N: Number of elements + * + * 'd', 'a', and 'b' are contiguous. + * + * The result is saturated to signed 16 bits. + */ +void av1_wedge_compute_delta_squares_c(int16_t *d, const int16_t *a, + const int16_t *b, int N) { + int i; + + for (i = 0; i < N; i++) + d[i] = clamp(a[i] * a[i] - b[i] * b[i], INT16_MIN, INT16_MAX); +} diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c new file mode 100644 index 0000000000..494b0fdf15 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c @@ -0,0 +1,1409 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "av1/encoder/x86/av1_txfm1d_sse4.h" + +void av1_fdct32_sse4_1(__m128i *input, __m128i *output, int cos_bit, + const int stride) { + __m128i buf0[32]; + __m128i buf1[32]; + const int32_t *cospi; + + int startidx = 0 * stride; + int endidx = 31 * stride; + // stage 0 + // stage 1 + buf1[0] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[31] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[1] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[30] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[2] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[29] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[3] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[28] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[4] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[27] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[5] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[26] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[6] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[25] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[7] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[24] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[8] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[23] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[9] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[22] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[10] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[21] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[11] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[20] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[12] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[19] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[13] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[18] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[14] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[17] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[15] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[16] = _mm_sub_epi32(input[startidx], input[endidx]); + + // stage 2 + cospi = cospi_arr(cos_bit); + buf0[0] = _mm_add_epi32(buf1[0], buf1[15]); + buf0[15] = _mm_sub_epi32(buf1[0], buf1[15]); + buf0[1] = _mm_add_epi32(buf1[1], buf1[14]); + buf0[14] = _mm_sub_epi32(buf1[1], buf1[14]); + buf0[2] = _mm_add_epi32(buf1[2], buf1[13]); + buf0[13] = _mm_sub_epi32(buf1[2], buf1[13]); + buf0[3] = _mm_add_epi32(buf1[3], buf1[12]); + buf0[12] = _mm_sub_epi32(buf1[3], buf1[12]); + buf0[4] = _mm_add_epi32(buf1[4], buf1[11]); + buf0[11] = _mm_sub_epi32(buf1[4], buf1[11]); + buf0[5] = _mm_add_epi32(buf1[5], buf1[10]); + buf0[10] = _mm_sub_epi32(buf1[5], buf1[10]); + buf0[6] = _mm_add_epi32(buf1[6], buf1[9]); + buf0[9] = _mm_sub_epi32(buf1[6], buf1[9]); + buf0[7] = 
_mm_add_epi32(buf1[7], buf1[8]); + buf0[8] = _mm_sub_epi32(buf1[7], buf1[8]); + buf0[16] = buf1[16]; + buf0[17] = buf1[17]; + buf0[18] = buf1[18]; + buf0[19] = buf1[19]; + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20], + buf0[27], cos_bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22], + buf0[25], cos_bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23], + buf0[24], cos_bit); + buf0[28] = buf1[28]; + buf0[29] = buf1[29]; + buf0[30] = buf1[30]; + buf0[31] = buf1[31]; + + // stage 3 + cospi = cospi_arr(cos_bit); + buf1[0] = _mm_add_epi32(buf0[0], buf0[7]); + buf1[7] = _mm_sub_epi32(buf0[0], buf0[7]); + buf1[1] = _mm_add_epi32(buf0[1], buf0[6]); + buf1[6] = _mm_sub_epi32(buf0[1], buf0[6]); + buf1[2] = _mm_add_epi32(buf0[2], buf0[5]); + buf1[5] = _mm_sub_epi32(buf0[2], buf0[5]); + buf1[3] = _mm_add_epi32(buf0[3], buf0[4]); + buf1[4] = _mm_sub_epi32(buf0[3], buf0[4]); + buf1[8] = buf0[8]; + buf1[9] = buf0[9]; + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10], + buf1[13], cos_bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11], + buf1[12], cos_bit); + buf1[14] = buf0[14]; + buf1[15] = buf0[15]; + buf1[16] = _mm_add_epi32(buf0[16], buf0[23]); + buf1[23] = _mm_sub_epi32(buf0[16], buf0[23]); + buf1[17] = _mm_add_epi32(buf0[17], buf0[22]); + buf1[22] = _mm_sub_epi32(buf0[17], buf0[22]); + buf1[18] = _mm_add_epi32(buf0[18], buf0[21]); + buf1[21] = _mm_sub_epi32(buf0[18], buf0[21]); + buf1[19] = _mm_add_epi32(buf0[19], buf0[20]); + buf1[20] = _mm_sub_epi32(buf0[19], buf0[20]); + buf1[24] = _mm_sub_epi32(buf0[31], buf0[24]); + buf1[31] = _mm_add_epi32(buf0[31], buf0[24]); + buf1[25] = _mm_sub_epi32(buf0[30], buf0[25]); + buf1[30] = _mm_add_epi32(buf0[30], buf0[25]); + buf1[26] = _mm_sub_epi32(buf0[29], buf0[26]); + buf1[29] = _mm_add_epi32(buf0[29], buf0[26]); + buf1[27] = _mm_sub_epi32(buf0[28], buf0[27]); + buf1[28] = _mm_add_epi32(buf0[28], buf0[27]); + + // stage 4 + cospi = cospi_arr(cos_bit); + buf0[0] = _mm_add_epi32(buf1[0], buf1[3]); + buf0[3] = _mm_sub_epi32(buf1[0], buf1[3]); + buf0[1] = _mm_add_epi32(buf1[1], buf1[2]); + buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]); + buf0[4] = buf1[4]; + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6], + cos_bit); + buf0[7] = buf1[7]; + buf0[8] = _mm_add_epi32(buf1[8], buf1[11]); + buf0[11] = _mm_sub_epi32(buf1[8], buf1[11]); + buf0[9] = _mm_add_epi32(buf1[9], buf1[10]); + buf0[10] = _mm_sub_epi32(buf1[9], buf1[10]); + buf0[12] = _mm_sub_epi32(buf1[15], buf1[12]); + buf0[15] = _mm_add_epi32(buf1[15], buf1[12]); + buf0[13] = _mm_sub_epi32(buf1[14], buf1[13]); + buf0[14] = _mm_add_epi32(buf1[14], buf1[13]); + buf0[16] = buf1[16]; + buf0[17] = buf1[17]; + btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18], + buf0[29], cos_bit); + btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19], + buf0[28], cos_bit); + btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20], + buf0[27], cos_bit); + btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + buf0[22] = buf1[22]; + buf0[23] = buf1[23]; + buf0[24] = buf1[24]; + buf0[25] = buf1[25]; + buf0[30] = buf1[30]; + buf0[31] = buf1[31]; + + // stage 5 + cospi = cospi_arr(cos_bit); + btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1], + cos_bit); 
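+ /* Descriptive note (added, not in upstream): btf_32_sse4_1_type0/type1 are + * the 2-point rotation kernels used throughout. With weights w0/w1 drawn + * from cospi_arr() and a rounding right-shift by cos_bit, type0 effectively + * yields out0 = w0*in0 + w1*in1 and out1 = w1*in0 - w0*in1, while type1 + * yields the same out0 but out1 = w0*in1 - w1*in0; see + * av1/encoder/x86/av1_txfm1d_sse4.h for the exact macro definitions. */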
+ btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], buf1[3], + cos_bit); + buf1[4] = _mm_add_epi32(buf0[4], buf0[5]); + buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]); + buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]); + buf1[7] = _mm_add_epi32(buf0[7], buf0[6]); + buf1[8] = buf0[8]; + btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9], + buf1[14], cos_bit); + btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10], + buf1[13], cos_bit); + buf1[11] = buf0[11]; + buf1[12] = buf0[12]; + buf1[15] = buf0[15]; + buf1[16] = _mm_add_epi32(buf0[16], buf0[19]); + buf1[19] = _mm_sub_epi32(buf0[16], buf0[19]); + buf1[17] = _mm_add_epi32(buf0[17], buf0[18]); + buf1[18] = _mm_sub_epi32(buf0[17], buf0[18]); + buf1[20] = _mm_sub_epi32(buf0[23], buf0[20]); + buf1[23] = _mm_add_epi32(buf0[23], buf0[20]); + buf1[21] = _mm_sub_epi32(buf0[22], buf0[21]); + buf1[22] = _mm_add_epi32(buf0[22], buf0[21]); + buf1[24] = _mm_add_epi32(buf0[24], buf0[27]); + buf1[27] = _mm_sub_epi32(buf0[24], buf0[27]); + buf1[25] = _mm_add_epi32(buf0[25], buf0[26]); + buf1[26] = _mm_sub_epi32(buf0[25], buf0[26]); + buf1[28] = _mm_sub_epi32(buf0[31], buf0[28]); + buf1[31] = _mm_add_epi32(buf0[31], buf0[28]); + buf1[29] = _mm_sub_epi32(buf0[30], buf0[29]); + buf1[30] = _mm_add_epi32(buf0[30], buf0[29]); + + // stage 6 + cospi = cospi_arr(cos_bit); + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + buf0[2] = buf1[2]; + buf0[3] = buf1[3]; + btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7], + cos_bit); + btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5], buf0[6], + cos_bit); + buf0[8] = _mm_add_epi32(buf1[8], buf1[9]); + buf0[9] = _mm_sub_epi32(buf1[8], buf1[9]); + buf0[10] = _mm_sub_epi32(buf1[11], buf1[10]); + buf0[11] = _mm_add_epi32(buf1[11], buf1[10]); + buf0[12] = _mm_add_epi32(buf1[12], buf1[13]); + buf0[13] = _mm_sub_epi32(buf1[12], buf1[13]); + buf0[14] = _mm_sub_epi32(buf1[15], buf1[14]); + buf0[15] = _mm_add_epi32(buf1[15], buf1[14]); + buf0[16] = buf1[16]; + btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17], + buf0[30], cos_bit); + btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18], + buf0[29], cos_bit); + buf0[19] = buf1[19]; + buf0[20] = buf1[20]; + btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22], + buf0[25], cos_bit); + buf0[23] = buf1[23]; + buf0[24] = buf1[24]; + buf0[27] = buf1[27]; + buf0[28] = buf1[28]; + buf0[31] = buf1[31]; + + // stage 7 + cospi = cospi_arr(cos_bit); + buf1[0] = buf0[0]; + buf1[1] = buf0[1]; + buf1[2] = buf0[2]; + buf1[3] = buf0[3]; + buf1[4] = buf0[4]; + buf1[5] = buf0[5]; + buf1[6] = buf0[6]; + buf1[7] = buf0[7]; + btf_32_sse4_1_type1(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8], buf1[15], + cos_bit); + btf_32_sse4_1_type1(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9], + buf1[14], cos_bit); + btf_32_sse4_1_type1(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10], + buf1[13], cos_bit); + btf_32_sse4_1_type1(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11], + buf1[12], cos_bit); + buf1[16] = _mm_add_epi32(buf0[16], buf0[17]); + buf1[17] = _mm_sub_epi32(buf0[16], buf0[17]); + buf1[18] = _mm_sub_epi32(buf0[19], buf0[18]); + buf1[19] = _mm_add_epi32(buf0[19], buf0[18]); + buf1[20] = _mm_add_epi32(buf0[20], buf0[21]); + buf1[21] = _mm_sub_epi32(buf0[20], buf0[21]); + buf1[22] = _mm_sub_epi32(buf0[23], buf0[22]); + buf1[23] = _mm_add_epi32(buf0[23], 
buf0[22]); + buf1[24] = _mm_add_epi32(buf0[24], buf0[25]); + buf1[25] = _mm_sub_epi32(buf0[24], buf0[25]); + buf1[26] = _mm_sub_epi32(buf0[27], buf0[26]); + buf1[27] = _mm_add_epi32(buf0[27], buf0[26]); + buf1[28] = _mm_add_epi32(buf0[28], buf0[29]); + buf1[29] = _mm_sub_epi32(buf0[28], buf0[29]); + buf1[30] = _mm_sub_epi32(buf0[31], buf0[30]); + buf1[31] = _mm_add_epi32(buf0[31], buf0[30]); + + // stage 8 + cospi = cospi_arr(cos_bit); + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + buf0[2] = buf1[2]; + buf0[3] = buf1[3]; + buf0[4] = buf1[4]; + buf0[5] = buf1[5]; + buf0[6] = buf1[6]; + buf0[7] = buf1[7]; + buf0[8] = buf1[8]; + buf0[9] = buf1[9]; + buf0[10] = buf1[10]; + buf0[11] = buf1[11]; + buf0[12] = buf1[12]; + buf0[13] = buf1[13]; + buf0[14] = buf1[14]; + buf0[15] = buf1[15]; + btf_32_sse4_1_type1(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16], + buf0[31], cos_bit); + btf_32_sse4_1_type1(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17], + buf0[30], cos_bit); + btf_32_sse4_1_type1(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18], + buf0[29], cos_bit); + btf_32_sse4_1_type1(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19], + buf0[28], cos_bit); + btf_32_sse4_1_type1(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20], + buf0[27], cos_bit); + btf_32_sse4_1_type1(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + btf_32_sse4_1_type1(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22], + buf0[25], cos_bit); + btf_32_sse4_1_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23], + buf0[24], cos_bit); + + startidx = 0 * stride; + endidx = 31 * stride; + // stage 9 + output[startidx] = buf0[0]; + output[endidx] = buf0[31]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[16]; + output[endidx] = buf0[15]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[8]; + output[endidx] = buf0[23]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[24]; + output[endidx] = buf0[7]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[4]; + output[endidx] = buf0[27]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[20]; + output[endidx] = buf0[11]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[12]; + output[endidx] = buf0[19]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[28]; + output[endidx] = buf0[3]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[2]; + output[endidx] = buf0[29]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[18]; + output[endidx] = buf0[13]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[10]; + output[endidx] = buf0[21]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[26]; + output[endidx] = buf0[5]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[6]; + output[endidx] = buf0[25]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[22]; + output[endidx] = buf0[9]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[14]; + output[endidx] = buf0[17]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[30]; + output[endidx] = buf0[1]; +} + +void av1_fadst4_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range) { + const int txfm_size = 4; + const int num_per_128 = 4; + const int32_t *cospi; + __m128i buf0[4]; + __m128i buf1[4]; + int col_num = txfm_size / num_per_128; + int col; + (void)stage_range; + for (col = 0; col < col_num; col++) { + // 
stage 0; + int j; + for (j = 0; j < 4; ++j) { + buf0[j] = input[j * col_num + col]; + } + + // stage 1 + buf1[0] = buf0[3]; + buf1[1] = buf0[0]; + buf1[2] = buf0[1]; + buf1[3] = buf0[2]; + + // stage 2 + cospi = cospi_arr(cos_bit); + btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1], + cos_bit); + btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2], + buf0[3], cos_bit); + + // stage 3 + buf1[0] = _mm_add_epi32(buf0[0], buf0[2]); + buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]); + buf1[1] = _mm_add_epi32(buf0[1], buf0[3]); + buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]); + + // stage 4 + cospi = cospi_arr(cos_bit); + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], + buf0[3], cos_bit); + + // stage 5 + buf1[0] = buf0[0]; + buf1[1] = _mm_sub_epi32(_mm_setzero_si128(), buf0[2]); + buf1[2] = buf0[3]; + buf1[3] = _mm_sub_epi32(_mm_setzero_si128(), buf0[1]); + + for (j = 0; j < 4; ++j) { + output[j * col_num + col] = buf1[j]; + } + } +} + +void av1_fdct64_sse4_1(__m128i *input, __m128i *output, int8_t cos_bit, + const int instride, const int outstride) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32 = _mm_set1_epi32(-cospi[32]); + __m128i cospi_p32 = _mm_set1_epi32(cospi[32]); + __m128i cospi_m16 = _mm_set1_epi32(-cospi[16]); + __m128i cospi_p48 = _mm_set1_epi32(cospi[48]); + __m128i cospi_m48 = _mm_set1_epi32(-cospi[48]); + __m128i cospi_p16 = _mm_set1_epi32(cospi[16]); + __m128i cospi_m08 = _mm_set1_epi32(-cospi[8]); + __m128i cospi_p56 = _mm_set1_epi32(cospi[56]); + __m128i cospi_m56 = _mm_set1_epi32(-cospi[56]); + __m128i cospi_m40 = _mm_set1_epi32(-cospi[40]); + __m128i cospi_p24 = _mm_set1_epi32(cospi[24]); + __m128i cospi_m24 = _mm_set1_epi32(-cospi[24]); + __m128i cospi_p08 = _mm_set1_epi32(cospi[8]); + __m128i cospi_p40 = _mm_set1_epi32(cospi[40]); + __m128i cospi_p60 = _mm_set1_epi32(cospi[60]); + __m128i cospi_p04 = _mm_set1_epi32(cospi[4]); + __m128i cospi_p28 = _mm_set1_epi32(cospi[28]); + __m128i cospi_p36 = _mm_set1_epi32(cospi[36]); + __m128i cospi_p44 = _mm_set1_epi32(cospi[44]); + __m128i cospi_p20 = _mm_set1_epi32(cospi[20]); + __m128i cospi_p12 = _mm_set1_epi32(cospi[12]); + __m128i cospi_p52 = _mm_set1_epi32(cospi[52]); + __m128i cospi_m04 = _mm_set1_epi32(-cospi[4]); + __m128i cospi_m60 = _mm_set1_epi32(-cospi[60]); + __m128i cospi_m36 = _mm_set1_epi32(-cospi[36]); + __m128i cospi_m28 = _mm_set1_epi32(-cospi[28]); + __m128i cospi_m20 = _mm_set1_epi32(-cospi[20]); + __m128i cospi_m44 = _mm_set1_epi32(-cospi[44]); + __m128i cospi_m52 = _mm_set1_epi32(-cospi[52]); + __m128i cospi_m12 = _mm_set1_epi32(-cospi[12]); + __m128i cospi_p62 = _mm_set1_epi32(cospi[62]); + __m128i cospi_p02 = _mm_set1_epi32(cospi[2]); + __m128i cospi_p30 = _mm_set1_epi32(cospi[30]); + __m128i cospi_p34 = _mm_set1_epi32(cospi[34]); + __m128i cospi_p46 = _mm_set1_epi32(cospi[46]); + __m128i cospi_p18 = _mm_set1_epi32(cospi[18]); + __m128i cospi_p14 = _mm_set1_epi32(cospi[14]); + __m128i cospi_p50 = _mm_set1_epi32(cospi[50]); + __m128i cospi_p54 = _mm_set1_epi32(cospi[54]); + __m128i cospi_p10 = _mm_set1_epi32(cospi[10]); + __m128i cospi_p22 = _mm_set1_epi32(cospi[22]); + __m128i cospi_p42 = _mm_set1_epi32(cospi[42]); + __m128i cospi_p38 = _mm_set1_epi32(cospi[38]); + __m128i cospi_p26 = _mm_set1_epi32(cospi[26]); + __m128i cospi_p06 = _mm_set1_epi32(cospi[6]); + __m128i cospi_p58 = _mm_set1_epi32(cospi[58]); + __m128i cospi_p63 
= _mm_set1_epi32(cospi[63]); + __m128i cospi_p01 = _mm_set1_epi32(cospi[1]); + __m128i cospi_p31 = _mm_set1_epi32(cospi[31]); + __m128i cospi_p33 = _mm_set1_epi32(cospi[33]); + __m128i cospi_p47 = _mm_set1_epi32(cospi[47]); + __m128i cospi_p17 = _mm_set1_epi32(cospi[17]); + __m128i cospi_p15 = _mm_set1_epi32(cospi[15]); + __m128i cospi_p49 = _mm_set1_epi32(cospi[49]); + __m128i cospi_p55 = _mm_set1_epi32(cospi[55]); + __m128i cospi_p09 = _mm_set1_epi32(cospi[9]); + __m128i cospi_p23 = _mm_set1_epi32(cospi[23]); + __m128i cospi_p41 = _mm_set1_epi32(cospi[41]); + __m128i cospi_p39 = _mm_set1_epi32(cospi[39]); + __m128i cospi_p25 = _mm_set1_epi32(cospi[25]); + __m128i cospi_p07 = _mm_set1_epi32(cospi[7]); + __m128i cospi_p57 = _mm_set1_epi32(cospi[57]); + __m128i cospi_p59 = _mm_set1_epi32(cospi[59]); + __m128i cospi_p05 = _mm_set1_epi32(cospi[5]); + __m128i cospi_p27 = _mm_set1_epi32(cospi[27]); + __m128i cospi_p37 = _mm_set1_epi32(cospi[37]); + __m128i cospi_p43 = _mm_set1_epi32(cospi[43]); + __m128i cospi_p21 = _mm_set1_epi32(cospi[21]); + __m128i cospi_p11 = _mm_set1_epi32(cospi[11]); + __m128i cospi_p53 = _mm_set1_epi32(cospi[53]); + __m128i cospi_p51 = _mm_set1_epi32(cospi[51]); + __m128i cospi_p13 = _mm_set1_epi32(cospi[13]); + __m128i cospi_p19 = _mm_set1_epi32(cospi[19]); + __m128i cospi_p45 = _mm_set1_epi32(cospi[45]); + __m128i cospi_p35 = _mm_set1_epi32(cospi[35]); + __m128i cospi_p29 = _mm_set1_epi32(cospi[29]); + __m128i cospi_p03 = _mm_set1_epi32(cospi[3]); + __m128i cospi_p61 = _mm_set1_epi32(cospi[61]); + + int startidx = 0 * instride; + int endidx = 63 * instride; + // stage 1 + __m128i x1[64]; + x1[0] = _mm_add_epi32(input[startidx], input[endidx]); + x1[63] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[1] = _mm_add_epi32(input[startidx], input[endidx]); + x1[62] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[2] = _mm_add_epi32(input[startidx], input[endidx]); + x1[61] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[3] = _mm_add_epi32(input[startidx], input[endidx]); + x1[60] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[4] = _mm_add_epi32(input[startidx], input[endidx]); + x1[59] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[5] = _mm_add_epi32(input[startidx], input[endidx]); + x1[58] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[6] = _mm_add_epi32(input[startidx], input[endidx]); + x1[57] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[7] = _mm_add_epi32(input[startidx], input[endidx]); + x1[56] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[8] = _mm_add_epi32(input[startidx], input[endidx]); + x1[55] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[9] = _mm_add_epi32(input[startidx], input[endidx]); + x1[54] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[10] = _mm_add_epi32(input[startidx], input[endidx]); + x1[53] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[11] = _mm_add_epi32(input[startidx], input[endidx]); + x1[52] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + 
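+ /* Stage 1 keeps folding the strided input symmetrically: + * x1[i] = input[i*instride] + input[(63-i)*instride] and + * x1[63-i] = input[i*instride] - input[(63-i)*instride], for i = 0..31. */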
endidx -= instride; + x1[12] = _mm_add_epi32(input[startidx], input[endidx]); + x1[51] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[13] = _mm_add_epi32(input[startidx], input[endidx]); + x1[50] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[14] = _mm_add_epi32(input[startidx], input[endidx]); + x1[49] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[15] = _mm_add_epi32(input[startidx], input[endidx]); + x1[48] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[16] = _mm_add_epi32(input[startidx], input[endidx]); + x1[47] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[17] = _mm_add_epi32(input[startidx], input[endidx]); + x1[46] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[18] = _mm_add_epi32(input[startidx], input[endidx]); + x1[45] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[19] = _mm_add_epi32(input[startidx], input[endidx]); + x1[44] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[20] = _mm_add_epi32(input[startidx], input[endidx]); + x1[43] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[21] = _mm_add_epi32(input[startidx], input[endidx]); + x1[42] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[22] = _mm_add_epi32(input[startidx], input[endidx]); + x1[41] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[23] = _mm_add_epi32(input[startidx], input[endidx]); + x1[40] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[24] = _mm_add_epi32(input[startidx], input[endidx]); + x1[39] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[25] = _mm_add_epi32(input[startidx], input[endidx]); + x1[38] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[26] = _mm_add_epi32(input[startidx], input[endidx]); + x1[37] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[27] = _mm_add_epi32(input[startidx], input[endidx]); + x1[36] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[28] = _mm_add_epi32(input[startidx], input[endidx]); + x1[35] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[29] = _mm_add_epi32(input[startidx], input[endidx]); + x1[34] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[30] = _mm_add_epi32(input[startidx], input[endidx]); + x1[33] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[31] = _mm_add_epi32(input[startidx], input[endidx]); + x1[32] = _mm_sub_epi32(input[startidx], input[endidx]); + + // stage 2 + __m128i x2[64]; + x2[0] = _mm_add_epi32(x1[0], x1[31]); + x2[31] = _mm_sub_epi32(x1[0], x1[31]); + x2[1] = _mm_add_epi32(x1[1], x1[30]); + x2[30] = _mm_sub_epi32(x1[1], x1[30]); + x2[2] = _mm_add_epi32(x1[2], x1[29]); + x2[29] = _mm_sub_epi32(x1[2], x1[29]); + x2[3] = _mm_add_epi32(x1[3], x1[28]); + x2[28] = 
_mm_sub_epi32(x1[3], x1[28]); + x2[4] = _mm_add_epi32(x1[4], x1[27]); + x2[27] = _mm_sub_epi32(x1[4], x1[27]); + x2[5] = _mm_add_epi32(x1[5], x1[26]); + x2[26] = _mm_sub_epi32(x1[5], x1[26]); + x2[6] = _mm_add_epi32(x1[6], x1[25]); + x2[25] = _mm_sub_epi32(x1[6], x1[25]); + x2[7] = _mm_add_epi32(x1[7], x1[24]); + x2[24] = _mm_sub_epi32(x1[7], x1[24]); + x2[8] = _mm_add_epi32(x1[8], x1[23]); + x2[23] = _mm_sub_epi32(x1[8], x1[23]); + x2[9] = _mm_add_epi32(x1[9], x1[22]); + x2[22] = _mm_sub_epi32(x1[9], x1[22]); + x2[10] = _mm_add_epi32(x1[10], x1[21]); + x2[21] = _mm_sub_epi32(x1[10], x1[21]); + x2[11] = _mm_add_epi32(x1[11], x1[20]); + x2[20] = _mm_sub_epi32(x1[11], x1[20]); + x2[12] = _mm_add_epi32(x1[12], x1[19]); + x2[19] = _mm_sub_epi32(x1[12], x1[19]); + x2[13] = _mm_add_epi32(x1[13], x1[18]); + x2[18] = _mm_sub_epi32(x1[13], x1[18]); + x2[14] = _mm_add_epi32(x1[14], x1[17]); + x2[17] = _mm_sub_epi32(x1[14], x1[17]); + x2[15] = _mm_add_epi32(x1[15], x1[16]); + x2[16] = _mm_sub_epi32(x1[15], x1[16]); + x2[32] = x1[32]; + x2[33] = x1[33]; + x2[34] = x1[34]; + x2[35] = x1[35]; + x2[36] = x1[36]; + x2[37] = x1[37]; + x2[38] = x1[38]; + x2[39] = x1[39]; + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[40], x1[55], x2[40], x2[55], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[41], x1[54], x2[41], x2[54], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[42], x1[53], x2[42], x2[53], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[43], x1[52], x2[43], x2[52], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[44], x1[51], x2[44], x2[51], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[45], x1[50], x2[45], x2[50], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[46], x1[49], x2[46], x2[49], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[47], x1[48], x2[47], x2[48], + __rounding, cos_bit); + x2[56] = x1[56]; + x2[57] = x1[57]; + x2[58] = x1[58]; + x2[59] = x1[59]; + x2[60] = x1[60]; + x2[61] = x1[61]; + x2[62] = x1[62]; + x2[63] = x1[63]; + + // stage 3 + __m128i x3[64]; + x3[0] = _mm_add_epi32(x2[0], x2[15]); + x3[15] = _mm_sub_epi32(x2[0], x2[15]); + x3[1] = _mm_add_epi32(x2[1], x2[14]); + x3[14] = _mm_sub_epi32(x2[1], x2[14]); + x3[2] = _mm_add_epi32(x2[2], x2[13]); + x3[13] = _mm_sub_epi32(x2[2], x2[13]); + x3[3] = _mm_add_epi32(x2[3], x2[12]); + x3[12] = _mm_sub_epi32(x2[3], x2[12]); + x3[4] = _mm_add_epi32(x2[4], x2[11]); + x3[11] = _mm_sub_epi32(x2[4], x2[11]); + x3[5] = _mm_add_epi32(x2[5], x2[10]); + x3[10] = _mm_sub_epi32(x2[5], x2[10]); + x3[6] = _mm_add_epi32(x2[6], x2[9]); + x3[9] = _mm_sub_epi32(x2[6], x2[9]); + x3[7] = _mm_add_epi32(x2[7], x2[8]); + x3[8] = _mm_sub_epi32(x2[7], x2[8]); + x3[16] = x2[16]; + x3[17] = x2[17]; + x3[18] = x2[18]; + x3[19] = x2[19]; + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[20], x2[27], x3[20], x3[27], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[21], x2[26], x3[21], x3[26], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[22], x2[25], x3[22], x3[25], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[23], x2[24], x3[23], x3[24], + __rounding, cos_bit); + x3[28] = x2[28]; + x3[29] = x2[29]; + x3[30] = x2[30]; + x3[31] = x2[31]; + x3[32] = _mm_add_epi32(x2[32], x2[47]); + x3[47] = _mm_sub_epi32(x2[32], x2[47]); + x3[33] = _mm_add_epi32(x2[33], x2[46]); + x3[46] = 
_mm_sub_epi32(x2[33], x2[46]); + x3[34] = _mm_add_epi32(x2[34], x2[45]); + x3[45] = _mm_sub_epi32(x2[34], x2[45]); + x3[35] = _mm_add_epi32(x2[35], x2[44]); + x3[44] = _mm_sub_epi32(x2[35], x2[44]); + x3[36] = _mm_add_epi32(x2[36], x2[43]); + x3[43] = _mm_sub_epi32(x2[36], x2[43]); + x3[37] = _mm_add_epi32(x2[37], x2[42]); + x3[42] = _mm_sub_epi32(x2[37], x2[42]); + x3[38] = _mm_add_epi32(x2[38], x2[41]); + x3[41] = _mm_sub_epi32(x2[38], x2[41]); + x3[39] = _mm_add_epi32(x2[39], x2[40]); + x3[40] = _mm_sub_epi32(x2[39], x2[40]); + x3[48] = _mm_sub_epi32(x2[63], x2[48]); + x3[63] = _mm_add_epi32(x2[63], x2[48]); + x3[49] = _mm_sub_epi32(x2[62], x2[49]); + x3[62] = _mm_add_epi32(x2[62], x2[49]); + x3[50] = _mm_sub_epi32(x2[61], x2[50]); + x3[61] = _mm_add_epi32(x2[61], x2[50]); + x3[51] = _mm_sub_epi32(x2[60], x2[51]); + x3[60] = _mm_add_epi32(x2[60], x2[51]); + x3[52] = _mm_sub_epi32(x2[59], x2[52]); + x3[59] = _mm_add_epi32(x2[59], x2[52]); + x3[53] = _mm_sub_epi32(x2[58], x2[53]); + x3[58] = _mm_add_epi32(x2[58], x2[53]); + x3[54] = _mm_sub_epi32(x2[57], x2[54]); + x3[57] = _mm_add_epi32(x2[57], x2[54]); + x3[55] = _mm_sub_epi32(x2[56], x2[55]); + x3[56] = _mm_add_epi32(x2[56], x2[55]); + + // stage 4 + __m128i x4[64]; + x4[0] = _mm_add_epi32(x3[0], x3[7]); + x4[7] = _mm_sub_epi32(x3[0], x3[7]); + x4[1] = _mm_add_epi32(x3[1], x3[6]); + x4[6] = _mm_sub_epi32(x3[1], x3[6]); + x4[2] = _mm_add_epi32(x3[2], x3[5]); + x4[5] = _mm_sub_epi32(x3[2], x3[5]); + x4[3] = _mm_add_epi32(x3[3], x3[4]); + x4[4] = _mm_sub_epi32(x3[3], x3[4]); + x4[8] = x3[8]; + x4[9] = x3[9]; + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x3[10], x3[13], x4[10], x4[13], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x3[11], x3[12], x4[11], x4[12], + __rounding, cos_bit); + x4[14] = x3[14]; + x4[15] = x3[15]; + x4[16] = _mm_add_epi32(x3[16], x3[23]); + x4[23] = _mm_sub_epi32(x3[16], x3[23]); + x4[17] = _mm_add_epi32(x3[17], x3[22]); + x4[22] = _mm_sub_epi32(x3[17], x3[22]); + x4[18] = _mm_add_epi32(x3[18], x3[21]); + x4[21] = _mm_sub_epi32(x3[18], x3[21]); + x4[19] = _mm_add_epi32(x3[19], x3[20]); + x4[20] = _mm_sub_epi32(x3[19], x3[20]); + x4[24] = _mm_sub_epi32(x3[31], x3[24]); + x4[31] = _mm_add_epi32(x3[31], x3[24]); + x4[25] = _mm_sub_epi32(x3[30], x3[25]); + x4[30] = _mm_add_epi32(x3[30], x3[25]); + x4[26] = _mm_sub_epi32(x3[29], x3[26]); + x4[29] = _mm_add_epi32(x3[29], x3[26]); + x4[27] = _mm_sub_epi32(x3[28], x3[27]); + x4[28] = _mm_add_epi32(x3[28], x3[27]); + x4[32] = x3[32]; + x4[33] = x3[33]; + x4[34] = x3[34]; + x4[35] = x3[35]; + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[36], x3[59], x4[36], x4[59], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[37], x3[58], x4[37], x4[58], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[38], x3[57], x4[38], x4[57], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[39], x3[56], x4[39], x4[56], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[40], x3[55], x4[40], x4[55], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[41], x3[54], x4[41], x4[54], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[42], x3[53], x4[42], x4[53], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[43], x3[52], x4[43], x4[52], + __rounding, cos_bit); + x4[44] = x3[44]; + x4[45] = x3[45]; + x4[46] = x3[46]; + x4[47] = x3[47]; + x4[48] = x3[48]; + x4[49] = x3[49]; + x4[50] = x3[50]; + x4[51] 
= x3[51]; + x4[60] = x3[60]; + x4[61] = x3[61]; + x4[62] = x3[62]; + x4[63] = x3[63]; + + // stage 5 + __m128i x5[64]; + x5[0] = _mm_add_epi32(x4[0], x4[3]); + x5[3] = _mm_sub_epi32(x4[0], x4[3]); + x5[1] = _mm_add_epi32(x4[1], x4[2]); + x5[2] = _mm_sub_epi32(x4[1], x4[2]); + x5[4] = x4[4]; + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x4[5], x4[6], x5[5], x5[6], + __rounding, cos_bit); + x5[7] = x4[7]; + x5[8] = _mm_add_epi32(x4[8], x4[11]); + x5[11] = _mm_sub_epi32(x4[8], x4[11]); + x5[9] = _mm_add_epi32(x4[9], x4[10]); + x5[10] = _mm_sub_epi32(x4[9], x4[10]); + x5[12] = _mm_sub_epi32(x4[15], x4[12]); + x5[15] = _mm_add_epi32(x4[15], x4[12]); + x5[13] = _mm_sub_epi32(x4[14], x4[13]); + x5[14] = _mm_add_epi32(x4[14], x4[13]); + x5[16] = x4[16]; + x5[17] = x4[17]; + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x4[18], x4[29], x5[18], x5[29], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x4[19], x4[28], x5[19], x5[28], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x4[20], x4[27], x5[20], x5[27], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x4[21], x4[26], x5[21], x5[26], + __rounding, cos_bit); + x5[22] = x4[22]; + x5[23] = x4[23]; + x5[24] = x4[24]; + x5[25] = x4[25]; + x5[30] = x4[30]; + x5[31] = x4[31]; + x5[32] = _mm_add_epi32(x4[32], x4[39]); + x5[39] = _mm_sub_epi32(x4[32], x4[39]); + x5[33] = _mm_add_epi32(x4[33], x4[38]); + x5[38] = _mm_sub_epi32(x4[33], x4[38]); + x5[34] = _mm_add_epi32(x4[34], x4[37]); + x5[37] = _mm_sub_epi32(x4[34], x4[37]); + x5[35] = _mm_add_epi32(x4[35], x4[36]); + x5[36] = _mm_sub_epi32(x4[35], x4[36]); + x5[40] = _mm_sub_epi32(x4[47], x4[40]); + x5[47] = _mm_add_epi32(x4[47], x4[40]); + x5[41] = _mm_sub_epi32(x4[46], x4[41]); + x5[46] = _mm_add_epi32(x4[46], x4[41]); + x5[42] = _mm_sub_epi32(x4[45], x4[42]); + x5[45] = _mm_add_epi32(x4[45], x4[42]); + x5[43] = _mm_sub_epi32(x4[44], x4[43]); + x5[44] = _mm_add_epi32(x4[44], x4[43]); + x5[48] = _mm_add_epi32(x4[48], x4[55]); + x5[55] = _mm_sub_epi32(x4[48], x4[55]); + x5[49] = _mm_add_epi32(x4[49], x4[54]); + x5[54] = _mm_sub_epi32(x4[49], x4[54]); + x5[50] = _mm_add_epi32(x4[50], x4[53]); + x5[53] = _mm_sub_epi32(x4[50], x4[53]); + x5[51] = _mm_add_epi32(x4[51], x4[52]); + x5[52] = _mm_sub_epi32(x4[51], x4[52]); + x5[56] = _mm_sub_epi32(x4[63], x4[56]); + x5[63] = _mm_add_epi32(x4[63], x4[56]); + x5[57] = _mm_sub_epi32(x4[62], x4[57]); + x5[62] = _mm_add_epi32(x4[62], x4[57]); + x5[58] = _mm_sub_epi32(x4[61], x4[58]); + x5[61] = _mm_add_epi32(x4[61], x4[58]); + x5[59] = _mm_sub_epi32(x4[60], x4[59]); + x5[60] = _mm_add_epi32(x4[60], x4[59]); + + // stage 6 + __m128i x6[64]; + btf_32_type0_sse4_1_new(cospi_p32, cospi_p32, x5[0], x5[1], x6[0], x6[1], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p48, cospi_p16, x5[2], x5[3], x6[2], x6[3], + __rounding, cos_bit); + x6[4] = _mm_add_epi32(x5[4], x5[5]); + x6[5] = _mm_sub_epi32(x5[4], x5[5]); + x6[6] = _mm_sub_epi32(x5[7], x5[6]); + x6[7] = _mm_add_epi32(x5[7], x5[6]); + x6[8] = x5[8]; + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x5[9], x5[14], x6[9], x6[14], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x5[10], x5[13], x6[10], x6[13], + __rounding, cos_bit); + x6[11] = x5[11]; + x6[12] = x5[12]; + x6[15] = x5[15]; + x6[16] = _mm_add_epi32(x5[16], x5[19]); + x6[19] = _mm_sub_epi32(x5[16], x5[19]); + x6[17] = _mm_add_epi32(x5[17], x5[18]); + x6[18] = _mm_sub_epi32(x5[17], x5[18]); + x6[20] = _mm_sub_epi32(x5[23], x5[20]); + x6[23] = 
_mm_add_epi32(x5[23], x5[20]); + x6[21] = _mm_sub_epi32(x5[22], x5[21]); + x6[22] = _mm_add_epi32(x5[22], x5[21]); + x6[24] = _mm_add_epi32(x5[24], x5[27]); + x6[27] = _mm_sub_epi32(x5[24], x5[27]); + x6[25] = _mm_add_epi32(x5[25], x5[26]); + x6[26] = _mm_sub_epi32(x5[25], x5[26]); + x6[28] = _mm_sub_epi32(x5[31], x5[28]); + x6[31] = _mm_add_epi32(x5[31], x5[28]); + x6[29] = _mm_sub_epi32(x5[30], x5[29]); + x6[30] = _mm_add_epi32(x5[30], x5[29]); + x6[32] = x5[32]; + x6[33] = x5[33]; + btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x5[34], x5[61], x6[34], x6[61], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x5[35], x5[60], x6[35], x6[60], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x5[36], x5[59], x6[36], x6[59], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x5[37], x5[58], x6[37], x6[58], + __rounding, cos_bit); + x6[38] = x5[38]; + x6[39] = x5[39]; + x6[40] = x5[40]; + x6[41] = x5[41]; + btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x5[42], x5[53], x6[42], x6[53], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x5[43], x5[52], x6[43], x6[52], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x5[44], x5[51], x6[44], x6[51], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x5[45], x5[50], x6[45], x6[50], + __rounding, cos_bit); + x6[46] = x5[46]; + x6[47] = x5[47]; + x6[48] = x5[48]; + x6[49] = x5[49]; + x6[54] = x5[54]; + x6[55] = x5[55]; + x6[56] = x5[56]; + x6[57] = x5[57]; + x6[62] = x5[62]; + x6[63] = x5[63]; + + // stage 7 + __m128i x7[64]; + x7[0] = x6[0]; + x7[1] = x6[1]; + x7[2] = x6[2]; + x7[3] = x6[3]; + btf_32_type1_sse4_1_new(cospi_p56, cospi_p08, x6[4], x6[7], x7[4], x7[7], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p24, cospi_p40, x6[5], x6[6], x7[5], x7[6], + __rounding, cos_bit); + x7[8] = _mm_add_epi32(x6[8], x6[9]); + x7[9] = _mm_sub_epi32(x6[8], x6[9]); + x7[10] = _mm_sub_epi32(x6[11], x6[10]); + x7[11] = _mm_add_epi32(x6[11], x6[10]); + x7[12] = _mm_add_epi32(x6[12], x6[13]); + x7[13] = _mm_sub_epi32(x6[12], x6[13]); + x7[14] = _mm_sub_epi32(x6[15], x6[14]); + x7[15] = _mm_add_epi32(x6[15], x6[14]); + x7[16] = x6[16]; + btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x6[17], x6[30], x7[17], x7[30], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x6[18], x6[29], x7[18], x7[29], + __rounding, cos_bit); + x7[19] = x6[19]; + x7[20] = x6[20]; + btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x6[21], x6[26], x7[21], x7[26], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x6[22], x6[25], x7[22], x7[25], + __rounding, cos_bit); + x7[23] = x6[23]; + x7[24] = x6[24]; + x7[27] = x6[27]; + x7[28] = x6[28]; + x7[31] = x6[31]; + x7[32] = _mm_add_epi32(x6[32], x6[35]); + x7[35] = _mm_sub_epi32(x6[32], x6[35]); + x7[33] = _mm_add_epi32(x6[33], x6[34]); + x7[34] = _mm_sub_epi32(x6[33], x6[34]); + x7[36] = _mm_sub_epi32(x6[39], x6[36]); + x7[39] = _mm_add_epi32(x6[39], x6[36]); + x7[37] = _mm_sub_epi32(x6[38], x6[37]); + x7[38] = _mm_add_epi32(x6[38], x6[37]); + x7[40] = _mm_add_epi32(x6[40], x6[43]); + x7[43] = _mm_sub_epi32(x6[40], x6[43]); + x7[41] = _mm_add_epi32(x6[41], x6[42]); + x7[42] = _mm_sub_epi32(x6[41], x6[42]); + x7[44] = _mm_sub_epi32(x6[47], x6[44]); + x7[47] = _mm_add_epi32(x6[47], x6[44]); + x7[45] = _mm_sub_epi32(x6[46], x6[45]); + x7[46] = _mm_add_epi32(x6[46], x6[45]); + x7[48] = _mm_add_epi32(x6[48], x6[51]); + x7[51] = _mm_sub_epi32(x6[48], x6[51]); + x7[49] = 
_mm_add_epi32(x6[49], x6[50]); + x7[50] = _mm_sub_epi32(x6[49], x6[50]); + x7[52] = _mm_sub_epi32(x6[55], x6[52]); + x7[55] = _mm_add_epi32(x6[55], x6[52]); + x7[53] = _mm_sub_epi32(x6[54], x6[53]); + x7[54] = _mm_add_epi32(x6[54], x6[53]); + x7[56] = _mm_add_epi32(x6[56], x6[59]); + x7[59] = _mm_sub_epi32(x6[56], x6[59]); + x7[57] = _mm_add_epi32(x6[57], x6[58]); + x7[58] = _mm_sub_epi32(x6[57], x6[58]); + x7[60] = _mm_sub_epi32(x6[63], x6[60]); + x7[63] = _mm_add_epi32(x6[63], x6[60]); + x7[61] = _mm_sub_epi32(x6[62], x6[61]); + x7[62] = _mm_add_epi32(x6[62], x6[61]); + + // stage 8 + __m128i x8[64]; + x8[0] = x7[0]; + x8[1] = x7[1]; + x8[2] = x7[2]; + x8[3] = x7[3]; + x8[4] = x7[4]; + x8[5] = x7[5]; + x8[6] = x7[6]; + x8[7] = x7[7]; + btf_32_type1_sse4_1_new(cospi_p60, cospi_p04, x7[8], x7[15], x8[8], x8[15], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p28, cospi_p36, x7[9], x7[14], x8[9], x8[14], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p44, cospi_p20, x7[10], x7[13], x8[10], x8[13], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p12, cospi_p52, x7[11], x7[12], x8[11], x8[12], + __rounding, cos_bit); + x8[16] = _mm_add_epi32(x7[16], x7[17]); + x8[17] = _mm_sub_epi32(x7[16], x7[17]); + x8[18] = _mm_sub_epi32(x7[19], x7[18]); + x8[19] = _mm_add_epi32(x7[19], x7[18]); + x8[20] = _mm_add_epi32(x7[20], x7[21]); + x8[21] = _mm_sub_epi32(x7[20], x7[21]); + x8[22] = _mm_sub_epi32(x7[23], x7[22]); + x8[23] = _mm_add_epi32(x7[23], x7[22]); + x8[24] = _mm_add_epi32(x7[24], x7[25]); + x8[25] = _mm_sub_epi32(x7[24], x7[25]); + x8[26] = _mm_sub_epi32(x7[27], x7[26]); + x8[27] = _mm_add_epi32(x7[27], x7[26]); + x8[28] = _mm_add_epi32(x7[28], x7[29]); + x8[29] = _mm_sub_epi32(x7[28], x7[29]); + x8[30] = _mm_sub_epi32(x7[31], x7[30]); + x8[31] = _mm_add_epi32(x7[31], x7[30]); + x8[32] = x7[32]; + btf_32_type0_sse4_1_new(cospi_m04, cospi_p60, x7[33], x7[62], x8[33], x8[62], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m60, cospi_m04, x7[34], x7[61], x8[34], x8[61], + __rounding, cos_bit); + x8[35] = x7[35]; + x8[36] = x7[36]; + btf_32_type0_sse4_1_new(cospi_m36, cospi_p28, x7[37], x7[58], x8[37], x8[58], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m28, cospi_m36, x7[38], x7[57], x8[38], x8[57], + __rounding, cos_bit); + x8[39] = x7[39]; + x8[40] = x7[40]; + btf_32_type0_sse4_1_new(cospi_m20, cospi_p44, x7[41], x7[54], x8[41], x8[54], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m44, cospi_m20, x7[42], x7[53], x8[42], x8[53], + __rounding, cos_bit); + x8[43] = x7[43]; + x8[44] = x7[44]; + btf_32_type0_sse4_1_new(cospi_m52, cospi_p12, x7[45], x7[50], x8[45], x8[50], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m12, cospi_m52, x7[46], x7[49], x8[46], x8[49], + __rounding, cos_bit); + x8[47] = x7[47]; + x8[48] = x7[48]; + x8[51] = x7[51]; + x8[52] = x7[52]; + x8[55] = x7[55]; + x8[56] = x7[56]; + x8[59] = x7[59]; + x8[60] = x7[60]; + x8[63] = x7[63]; + + // stage 9 + __m128i x9[64]; + x9[0] = x8[0]; + x9[1] = x8[1]; + x9[2] = x8[2]; + x9[3] = x8[3]; + x9[4] = x8[4]; + x9[5] = x8[5]; + x9[6] = x8[6]; + x9[7] = x8[7]; + x9[8] = x8[8]; + x9[9] = x8[9]; + x9[10] = x8[10]; + x9[11] = x8[11]; + x9[12] = x8[12]; + x9[13] = x8[13]; + x9[14] = x8[14]; + x9[15] = x8[15]; + btf_32_type1_sse4_1_new(cospi_p62, cospi_p02, x8[16], x8[31], x9[16], x9[31], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p30, cospi_p34, x8[17], x8[30], x9[17], x9[30], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p46, cospi_p18, x8[18], x8[29], 
x9[18], x9[29], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p14, cospi_p50, x8[19], x8[28], x9[19], x9[28], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p54, cospi_p10, x8[20], x8[27], x9[20], x9[27], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p22, cospi_p42, x8[21], x8[26], x9[21], x9[26], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p38, cospi_p26, x8[22], x8[25], x9[22], x9[25], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p06, cospi_p58, x8[23], x8[24], x9[23], x9[24], + __rounding, cos_bit); + x9[32] = _mm_add_epi32(x8[32], x8[33]); + x9[33] = _mm_sub_epi32(x8[32], x8[33]); + x9[34] = _mm_sub_epi32(x8[35], x8[34]); + x9[35] = _mm_add_epi32(x8[35], x8[34]); + x9[36] = _mm_add_epi32(x8[36], x8[37]); + x9[37] = _mm_sub_epi32(x8[36], x8[37]); + x9[38] = _mm_sub_epi32(x8[39], x8[38]); + x9[39] = _mm_add_epi32(x8[39], x8[38]); + x9[40] = _mm_add_epi32(x8[40], x8[41]); + x9[41] = _mm_sub_epi32(x8[40], x8[41]); + x9[42] = _mm_sub_epi32(x8[43], x8[42]); + x9[43] = _mm_add_epi32(x8[43], x8[42]); + x9[44] = _mm_add_epi32(x8[44], x8[45]); + x9[45] = _mm_sub_epi32(x8[44], x8[45]); + x9[46] = _mm_sub_epi32(x8[47], x8[46]); + x9[47] = _mm_add_epi32(x8[47], x8[46]); + x9[48] = _mm_add_epi32(x8[48], x8[49]); + x9[49] = _mm_sub_epi32(x8[48], x8[49]); + x9[50] = _mm_sub_epi32(x8[51], x8[50]); + x9[51] = _mm_add_epi32(x8[51], x8[50]); + x9[52] = _mm_add_epi32(x8[52], x8[53]); + x9[53] = _mm_sub_epi32(x8[52], x8[53]); + x9[54] = _mm_sub_epi32(x8[55], x8[54]); + x9[55] = _mm_add_epi32(x8[55], x8[54]); + x9[56] = _mm_add_epi32(x8[56], x8[57]); + x9[57] = _mm_sub_epi32(x8[56], x8[57]); + x9[58] = _mm_sub_epi32(x8[59], x8[58]); + x9[59] = _mm_add_epi32(x8[59], x8[58]); + x9[60] = _mm_add_epi32(x8[60], x8[61]); + x9[61] = _mm_sub_epi32(x8[60], x8[61]); + x9[62] = _mm_sub_epi32(x8[63], x8[62]); + x9[63] = _mm_add_epi32(x8[63], x8[62]); + + // stage 10 + __m128i x10[64]; + x10[0] = x9[0]; + x10[1] = x9[1]; + x10[2] = x9[2]; + x10[3] = x9[3]; + x10[4] = x9[4]; + x10[5] = x9[5]; + x10[6] = x9[6]; + x10[7] = x9[7]; + x10[8] = x9[8]; + x10[9] = x9[9]; + x10[10] = x9[10]; + x10[11] = x9[11]; + x10[12] = x9[12]; + x10[13] = x9[13]; + x10[14] = x9[14]; + x10[15] = x9[15]; + x10[16] = x9[16]; + x10[17] = x9[17]; + x10[18] = x9[18]; + x10[19] = x9[19]; + x10[20] = x9[20]; + x10[21] = x9[21]; + x10[22] = x9[22]; + x10[23] = x9[23]; + x10[24] = x9[24]; + x10[25] = x9[25]; + x10[26] = x9[26]; + x10[27] = x9[27]; + x10[28] = x9[28]; + x10[29] = x9[29]; + x10[30] = x9[30]; + x10[31] = x9[31]; + btf_32_type1_sse4_1_new(cospi_p63, cospi_p01, x9[32], x9[63], x10[32], + x10[63], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p31, cospi_p33, x9[33], x9[62], x10[33], + x10[62], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p47, cospi_p17, x9[34], x9[61], x10[34], + x10[61], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p15, cospi_p49, x9[35], x9[60], x10[35], + x10[60], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p55, cospi_p09, x9[36], x9[59], x10[36], + x10[59], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p23, cospi_p41, x9[37], x9[58], x10[37], + x10[58], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p39, cospi_p25, x9[38], x9[57], x10[38], + x10[57], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p07, cospi_p57, x9[39], x9[56], x10[39], + x10[56], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p59, cospi_p05, x9[40], x9[55], x10[40], + x10[55], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p27, 
cospi_p37, x9[41], x9[54], x10[41], + x10[54], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p43, cospi_p21, x9[42], x9[53], x10[42], + x10[53], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p11, cospi_p53, x9[43], x9[52], x10[43], + x10[52], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p51, cospi_p13, x9[44], x9[51], x10[44], + x10[51], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p19, cospi_p45, x9[45], x9[50], x10[45], + x10[50], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p35, cospi_p29, x9[46], x9[49], x10[46], + x10[49], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p03, cospi_p61, x9[47], x9[48], x10[47], + x10[48], __rounding, cos_bit); + + startidx = 0 * outstride; + endidx = 63 * outstride; + // stage 11 + output[startidx] = x10[0]; + output[endidx] = x10[63]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[32]; + output[endidx] = x10[31]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[16]; + output[endidx] = x10[47]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[48]; + output[endidx] = x10[15]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[8]; + output[endidx] = x10[55]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[40]; + output[endidx] = x10[23]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[24]; + output[endidx] = x10[39]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[56]; + output[endidx] = x10[7]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[4]; + output[endidx] = x10[59]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[36]; + output[endidx] = x10[27]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[20]; + output[endidx] = x10[43]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[52]; + output[endidx] = x10[11]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[12]; + output[endidx] = x10[51]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[44]; + output[endidx] = x10[19]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[28]; + output[endidx] = x10[35]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[60]; + output[endidx] = x10[3]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[2]; + output[endidx] = x10[61]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[34]; + output[endidx] = x10[29]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[18]; + output[endidx] = x10[45]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[50]; + output[endidx] = x10[13]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[10]; + output[endidx] = x10[53]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[42]; + output[endidx] = x10[21]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[26]; + output[endidx] = x10[37]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[58]; + output[endidx] = x10[5]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[6]; + output[endidx] = x10[57]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[38]; + output[endidx] = x10[25]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[22]; 
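+ /* The stage-11 stores above and below emit the 64 coefficients in + * bit-reversed index order (x10[0], x10[32], x10[16], x10[48], ...), with + * startidx ascending from row 0 and endidx descending from row 63. */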
+ output[endidx] = x10[41]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[54]; + output[endidx] = x10[9]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[14]; + output[endidx] = x10[49]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[46]; + output[endidx] = x10[17]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[30]; + output[endidx] = x10[33]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[62]; + output[endidx] = x10[1]; +} + +void av1_idtx32_sse4_1(__m128i *input, __m128i *output, int cos_bit, + const int col_num) { + (void)cos_bit; + for (int i = 0; i < 32; i++) { + output[i * col_num] = _mm_slli_epi32(input[i * col_num], 2); + } +} diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c new file mode 100644 index 0000000000..b143df3523 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c @@ -0,0 +1,3010 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/av1_rtcd.h" + +#include "av1/common/enums.h" +#include "av1/common/av1_txfm.h" +#include "av1/encoder/x86/av1_fwd_txfm_avx2.h" +#include "av1/common/x86/av1_txfm_sse2.h" +#include "av1/encoder/av1_fwd_txfm1d_cfg.h" +#include "av1/encoder/x86/av1_txfm1d_sse4.h" +#include "av1/encoder/x86/av1_fwd_txfm_sse2.h" +#include "aom_dsp/x86/txfm_common_avx2.h" + +static INLINE void fdct16x16_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); + __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); + __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); + __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]); + __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); + __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); + __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); + __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]); + __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); + + // stage 1 + __m256i x1[16]; + btf_16_adds_subs_out_avx2(&x1[0], &x1[15], input[0], input[15]); + btf_16_adds_subs_out_avx2(&x1[1], &x1[14], input[1], 
input[14]); + btf_16_adds_subs_out_avx2(&x1[2], &x1[13], input[2], input[13]); + btf_16_adds_subs_out_avx2(&x1[3], &x1[12], input[3], input[12]); + btf_16_adds_subs_out_avx2(&x1[4], &x1[11], input[4], input[11]); + btf_16_adds_subs_out_avx2(&x1[5], &x1[10], input[5], input[10]); + btf_16_adds_subs_out_avx2(&x1[6], &x1[9], input[6], input[9]); + btf_16_adds_subs_out_avx2(&x1[7], &x1[8], input[7], input[8]); + + // stage 2 + btf_16_adds_subs_avx2(&x1[0], &x1[7]); + btf_16_adds_subs_avx2(&x1[1], &x1[6]); + btf_16_adds_subs_avx2(&x1[2], &x1[5]); + btf_16_adds_subs_avx2(&x1[3], &x1[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit); + + // stage 3 + btf_16_adds_subs_avx2(&x1[0], &x1[3]); + btf_16_adds_subs_avx2(&x1[1], &x1[2]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[11]); + btf_16_adds_subs_avx2(&x1[9], &x1[10]); + btf_16_adds_subs_avx2(&x1[15], &x1[12]); + btf_16_adds_subs_avx2(&x1[14], &x1[13]); + + // stage 4 + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit); + btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[4], &x1[5]); + btf_16_adds_subs_avx2(&x1[7], &x1[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit); + + // stage 5 + btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[9]); + btf_16_adds_subs_avx2(&x1[11], &x1[10]); + btf_16_adds_subs_avx2(&x1[12], &x1[13]); + btf_16_adds_subs_avx2(&x1[15], &x1[14]); + + // stage 6 + btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit); + btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit); + + // stage 7 + output[0] = x1[0]; + output[1] = x1[8]; + output[2] = x1[4]; + output[3] = x1[12]; + output[4] = x1[2]; + output[5] = x1[10]; + output[6] = x1[6]; + output[7] = x1[14]; + output[8] = x1[1]; + output[9] = x1[9]; + output[10] = x1[5]; + output[11] = x1[13]; + output[12] = x1[3]; + output[13] = x1[11]; + output[14] = x1[7]; + output[15] = x1[15]; +} + +static INLINE void fdct16x32_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); + __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); + __m256i cospi_m24_m40 = 
pair_set_w16_epi16(-cospi[24], -cospi[40]); + __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); + __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); + __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]); + __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); + __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); + __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); + __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]); + __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); + __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]); + __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]); + __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]); + __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]); + __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]); + __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], cospi[46]); + __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], cospi[50]); + __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]); + __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]); + __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]); + __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]); + __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]); + __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]); + __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]); + __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]); + __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]); + + // stage 1 + __m256i x1[32]; + btf_16_adds_subs_out_avx2(&x1[0], &x1[31], input[0], input[31]); + btf_16_adds_subs_out_avx2(&x1[1], &x1[30], input[1], input[30]); + btf_16_adds_subs_out_avx2(&x1[2], &x1[29], input[2], input[29]); + btf_16_adds_subs_out_avx2(&x1[3], &x1[28], input[3], input[28]); + btf_16_adds_subs_out_avx2(&x1[4], &x1[27], input[4], input[27]); + btf_16_adds_subs_out_avx2(&x1[5], &x1[26], input[5], input[26]); + btf_16_adds_subs_out_avx2(&x1[6], &x1[25], input[6], input[25]); + btf_16_adds_subs_out_avx2(&x1[7], &x1[24], input[7], input[24]); + btf_16_adds_subs_out_avx2(&x1[8], &x1[23], input[8], input[23]); + btf_16_adds_subs_out_avx2(&x1[9], &x1[22], input[9], input[22]); + btf_16_adds_subs_out_avx2(&x1[10], &x1[21], input[10], input[21]); + btf_16_adds_subs_out_avx2(&x1[11], &x1[20], input[11], input[20]); + btf_16_adds_subs_out_avx2(&x1[12], &x1[19], input[12], input[19]); + btf_16_adds_subs_out_avx2(&x1[13], &x1[18], input[13], input[18]); + btf_16_adds_subs_out_avx2(&x1[14], &x1[17], input[14], input[17]); + btf_16_adds_subs_out_avx2(&x1[15], &x1[16], input[15], input[16]); + + // stage 2 + btf_16_adds_subs_avx2(&x1[0], &x1[15]); + btf_16_adds_subs_avx2(&x1[1], &x1[14]); + btf_16_adds_subs_avx2(&x1[2], &x1[13]); + btf_16_adds_subs_avx2(&x1[3], &x1[12]); + btf_16_adds_subs_avx2(&x1[4], &x1[11]); + btf_16_adds_subs_avx2(&x1[5], &x1[10]); + btf_16_adds_subs_avx2(&x1[6], &x1[9]); + btf_16_adds_subs_avx2(&x1[7], &x1[8]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, cos_bit); + + // stage 3 + btf_16_adds_subs_avx2(&x1[0], &x1[7]); + 
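+ /* Descriptive note (added, not in upstream): btf_16_adds_subs_avx2(&a, &b) + * is the in-place 16-bit butterfly (a, b) <- (a + b, a - b) over 16 lanes, + * using saturating adds/subs; the _out variant writes the two results to + * separate destination registers instead of updating its inputs. */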
btf_16_adds_subs_avx2(&x1[1], &x1[6]); + btf_16_adds_subs_avx2(&x1[2], &x1[5]); + btf_16_adds_subs_avx2(&x1[3], &x1[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[16], &x1[23]); + btf_16_adds_subs_avx2(&x1[17], &x1[22]); + btf_16_adds_subs_avx2(&x1[18], &x1[21]); + btf_16_adds_subs_avx2(&x1[19], &x1[20]); + btf_16_adds_subs_avx2(&x1[31], &x1[24]); + btf_16_adds_subs_avx2(&x1[30], &x1[25]); + btf_16_adds_subs_avx2(&x1[29], &x1[26]); + btf_16_adds_subs_avx2(&x1[28], &x1[27]); + + // stage 4 + btf_16_adds_subs_avx2(&x1[0], &x1[3]); + btf_16_adds_subs_avx2(&x1[1], &x1[2]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[11]); + btf_16_adds_subs_avx2(&x1[9], &x1[10]); + btf_16_adds_subs_avx2(&x1[15], &x1[12]); + btf_16_adds_subs_avx2(&x1[14], &x1[13]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit); + + // stage 5 + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit); + btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[4], &x1[5]); + btf_16_adds_subs_avx2(&x1[7], &x1[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[16], &x1[19]); + btf_16_adds_subs_avx2(&x1[17], &x1[18]); + btf_16_adds_subs_avx2(&x1[23], &x1[20]); + btf_16_adds_subs_avx2(&x1[22], &x1[21]); + btf_16_adds_subs_avx2(&x1[24], &x1[27]); + btf_16_adds_subs_avx2(&x1[25], &x1[26]); + btf_16_adds_subs_avx2(&x1[31], &x1[28]); + btf_16_adds_subs_avx2(&x1[30], &x1[29]); + + // stage 6 + btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[9]); + btf_16_adds_subs_avx2(&x1[11], &x1[10]); + btf_16_adds_subs_avx2(&x1[12], &x1[13]); + btf_16_adds_subs_avx2(&x1[15], &x1[14]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit); + + // stage 7 + btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit); + btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[16], &x1[17]); + btf_16_adds_subs_avx2(&x1[19], &x1[18]); + btf_16_adds_subs_avx2(&x1[20], &x1[21]); + btf_16_adds_subs_avx2(&x1[23], &x1[22]); + btf_16_adds_subs_avx2(&x1[24], &x1[25]); + btf_16_adds_subs_avx2(&x1[27], &x1[26]); + btf_16_adds_subs_avx2(&x1[28], &x1[29]); + btf_16_adds_subs_avx2(&x1[31], &x1[30]); + + // stage 8 + btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit); + btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, 
cos_bit); + btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit); + btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit); + btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit); + + // stage 9 + output[0] = x1[0]; + output[1] = x1[16]; + output[2] = x1[8]; + output[3] = x1[24]; + output[4] = x1[4]; + output[5] = x1[20]; + output[6] = x1[12]; + output[7] = x1[28]; + output[8] = x1[2]; + output[9] = x1[18]; + output[10] = x1[10]; + output[11] = x1[26]; + output[12] = x1[6]; + output[13] = x1[22]; + output[14] = x1[14]; + output[15] = x1[30]; + output[16] = x1[1]; + output[17] = x1[17]; + output[18] = x1[9]; + output[19] = x1[25]; + output[20] = x1[5]; + output[21] = x1[21]; + output[22] = x1[13]; + output[23] = x1[29]; + output[24] = x1[3]; + output[25] = x1[19]; + output[26] = x1[11]; + output[27] = x1[27]; + output[28] = x1[7]; + output[29] = x1[23]; + output[30] = x1[15]; + output[31] = x1[31]; +} + +static INLINE void fdct16x64_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); + __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); + __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); + __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); + __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); + __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]); + __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); + __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); + __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); + __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]); + __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); + __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]); + __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]); + __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]); + __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]); + __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]); + __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]); + __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]); + __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]); + __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]); + __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], cospi[46]); + __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], 
cospi[50]); + __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]); + __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]); + __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]); + __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]); + __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]); + __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]); + __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]); + __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]); + __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]); + __m256i cospi_p63_p01 = pair_set_w16_epi16(cospi[63], cospi[1]); + __m256i cospi_m01_p63 = pair_set_w16_epi16(-cospi[1], cospi[63]); + __m256i cospi_p31_p33 = pair_set_w16_epi16(cospi[31], cospi[33]); + __m256i cospi_m33_p31 = pair_set_w16_epi16(-cospi[33], cospi[31]); + __m256i cospi_p47_p17 = pair_set_w16_epi16(cospi[47], cospi[17]); + __m256i cospi_m17_p47 = pair_set_w16_epi16(-cospi[17], cospi[47]); + __m256i cospi_p15_p49 = pair_set_w16_epi16(cospi[15], cospi[49]); + __m256i cospi_m49_p15 = pair_set_w16_epi16(-cospi[49], cospi[15]); + __m256i cospi_p55_p09 = pair_set_w16_epi16(cospi[55], cospi[9]); + __m256i cospi_m09_p55 = pair_set_w16_epi16(-cospi[9], cospi[55]); + __m256i cospi_p23_p41 = pair_set_w16_epi16(cospi[23], cospi[41]); + __m256i cospi_m41_p23 = pair_set_w16_epi16(-cospi[41], cospi[23]); + __m256i cospi_p39_p25 = pair_set_w16_epi16(cospi[39], cospi[25]); + __m256i cospi_m25_p39 = pair_set_w16_epi16(-cospi[25], cospi[39]); + __m256i cospi_p07_p57 = pair_set_w16_epi16(cospi[7], cospi[57]); + __m256i cospi_m57_p07 = pair_set_w16_epi16(-cospi[57], cospi[7]); + __m256i cospi_p59_p05 = pair_set_w16_epi16(cospi[59], cospi[5]); + __m256i cospi_m05_p59 = pair_set_w16_epi16(-cospi[5], cospi[59]); + __m256i cospi_p27_p37 = pair_set_w16_epi16(cospi[27], cospi[37]); + __m256i cospi_m37_p27 = pair_set_w16_epi16(-cospi[37], cospi[27]); + __m256i cospi_p43_p21 = pair_set_w16_epi16(cospi[43], cospi[21]); + __m256i cospi_m21_p43 = pair_set_w16_epi16(-cospi[21], cospi[43]); + __m256i cospi_p11_p53 = pair_set_w16_epi16(cospi[11], cospi[53]); + __m256i cospi_m53_p11 = pair_set_w16_epi16(-cospi[53], cospi[11]); + __m256i cospi_p51_p13 = pair_set_w16_epi16(cospi[51], cospi[13]); + __m256i cospi_m13_p51 = pair_set_w16_epi16(-cospi[13], cospi[51]); + __m256i cospi_p19_p45 = pair_set_w16_epi16(cospi[19], cospi[45]); + __m256i cospi_m45_p19 = pair_set_w16_epi16(-cospi[45], cospi[19]); + __m256i cospi_p35_p29 = pair_set_w16_epi16(cospi[35], cospi[29]); + __m256i cospi_m29_p35 = pair_set_w16_epi16(-cospi[29], cospi[35]); + __m256i cospi_p03_p61 = pair_set_w16_epi16(cospi[3], cospi[61]); + __m256i cospi_m61_p03 = pair_set_w16_epi16(-cospi[61], cospi[3]); + + // stage 1 + __m256i x1[64]; + btf_16_adds_subs_out_avx2(&x1[0], &x1[63], input[0], input[63]); + btf_16_adds_subs_out_avx2(&x1[1], &x1[62], input[1], input[62]); + btf_16_adds_subs_out_avx2(&x1[2], &x1[61], input[2], input[61]); + btf_16_adds_subs_out_avx2(&x1[3], &x1[60], input[3], input[60]); + btf_16_adds_subs_out_avx2(&x1[4], &x1[59], input[4], input[59]); + btf_16_adds_subs_out_avx2(&x1[5], &x1[58], input[5], input[58]); + btf_16_adds_subs_out_avx2(&x1[6], &x1[57], input[6], input[57]); + btf_16_adds_subs_out_avx2(&x1[7], &x1[56], input[7], input[56]); + btf_16_adds_subs_out_avx2(&x1[8], &x1[55], input[8], input[55]); + btf_16_adds_subs_out_avx2(&x1[9], &x1[54], input[9], input[54]); + btf_16_adds_subs_out_avx2(&x1[10], 
&x1[53], input[10], input[53]); + btf_16_adds_subs_out_avx2(&x1[11], &x1[52], input[11], input[52]); + btf_16_adds_subs_out_avx2(&x1[12], &x1[51], input[12], input[51]); + btf_16_adds_subs_out_avx2(&x1[13], &x1[50], input[13], input[50]); + btf_16_adds_subs_out_avx2(&x1[14], &x1[49], input[14], input[49]); + btf_16_adds_subs_out_avx2(&x1[15], &x1[48], input[15], input[48]); + btf_16_adds_subs_out_avx2(&x1[16], &x1[47], input[16], input[47]); + btf_16_adds_subs_out_avx2(&x1[17], &x1[46], input[17], input[46]); + btf_16_adds_subs_out_avx2(&x1[18], &x1[45], input[18], input[45]); + btf_16_adds_subs_out_avx2(&x1[19], &x1[44], input[19], input[44]); + btf_16_adds_subs_out_avx2(&x1[20], &x1[43], input[20], input[43]); + btf_16_adds_subs_out_avx2(&x1[21], &x1[42], input[21], input[42]); + btf_16_adds_subs_out_avx2(&x1[22], &x1[41], input[22], input[41]); + btf_16_adds_subs_out_avx2(&x1[23], &x1[40], input[23], input[40]); + btf_16_adds_subs_out_avx2(&x1[24], &x1[39], input[24], input[39]); + btf_16_adds_subs_out_avx2(&x1[25], &x1[38], input[25], input[38]); + btf_16_adds_subs_out_avx2(&x1[26], &x1[37], input[26], input[37]); + btf_16_adds_subs_out_avx2(&x1[27], &x1[36], input[27], input[36]); + btf_16_adds_subs_out_avx2(&x1[28], &x1[35], input[28], input[35]); + btf_16_adds_subs_out_avx2(&x1[29], &x1[34], input[29], input[34]); + btf_16_adds_subs_out_avx2(&x1[30], &x1[33], input[30], input[33]); + btf_16_adds_subs_out_avx2(&x1[31], &x1[32], input[31], input[32]); + + // stage 2 + btf_16_adds_subs_avx2(&x1[0], &x1[31]); + btf_16_adds_subs_avx2(&x1[1], &x1[30]); + btf_16_adds_subs_avx2(&x1[2], &x1[29]); + btf_16_adds_subs_avx2(&x1[3], &x1[28]); + btf_16_adds_subs_avx2(&x1[4], &x1[27]); + btf_16_adds_subs_avx2(&x1[5], &x1[26]); + btf_16_adds_subs_avx2(&x1[6], &x1[25]); + btf_16_adds_subs_avx2(&x1[7], &x1[24]); + btf_16_adds_subs_avx2(&x1[8], &x1[23]); + btf_16_adds_subs_avx2(&x1[9], &x1[22]); + btf_16_adds_subs_avx2(&x1[10], &x1[21]); + btf_16_adds_subs_avx2(&x1[11], &x1[20]); + btf_16_adds_subs_avx2(&x1[12], &x1[19]); + btf_16_adds_subs_avx2(&x1[13], &x1[18]); + btf_16_adds_subs_avx2(&x1[14], &x1[17]); + btf_16_adds_subs_avx2(&x1[15], &x1[16]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[40], &x1[55], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[41], &x1[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[42], &x1[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[43], &x1[52], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[44], &x1[51], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[45], &x1[50], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[46], &x1[49], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[47], &x1[48], _r, cos_bit); + + // stage 3 + btf_16_adds_subs_avx2(&x1[0], &x1[15]); + btf_16_adds_subs_avx2(&x1[1], &x1[14]); + btf_16_adds_subs_avx2(&x1[2], &x1[13]); + btf_16_adds_subs_avx2(&x1[3], &x1[12]); + btf_16_adds_subs_avx2(&x1[4], &x1[11]); + btf_16_adds_subs_avx2(&x1[5], &x1[10]); + btf_16_adds_subs_avx2(&x1[6], &x1[9]); + btf_16_adds_subs_avx2(&x1[7], &x1[8]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[32], &x1[47]); + 
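// The odd half x1[32..63] is folded with the same add/sub pattern; the mirrored calls below keep the running sums in the upper indices x1[63..48]. + 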
btf_16_adds_subs_avx2(&x1[33], &x1[46]); + btf_16_adds_subs_avx2(&x1[34], &x1[45]); + btf_16_adds_subs_avx2(&x1[35], &x1[44]); + btf_16_adds_subs_avx2(&x1[36], &x1[43]); + btf_16_adds_subs_avx2(&x1[37], &x1[42]); + btf_16_adds_subs_avx2(&x1[38], &x1[41]); + btf_16_adds_subs_avx2(&x1[39], &x1[40]); + btf_16_adds_subs_avx2(&x1[63], &x1[48]); + btf_16_adds_subs_avx2(&x1[62], &x1[49]); + btf_16_adds_subs_avx2(&x1[61], &x1[50]); + btf_16_adds_subs_avx2(&x1[60], &x1[51]); + btf_16_adds_subs_avx2(&x1[59], &x1[52]); + btf_16_adds_subs_avx2(&x1[58], &x1[53]); + btf_16_adds_subs_avx2(&x1[57], &x1[54]); + btf_16_adds_subs_avx2(&x1[56], &x1[55]); + + // stage 4 + btf_16_adds_subs_avx2(&x1[0], &x1[7]); + btf_16_adds_subs_avx2(&x1[1], &x1[6]); + btf_16_adds_subs_avx2(&x1[2], &x1[5]); + btf_16_adds_subs_avx2(&x1[3], &x1[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[16], &x1[23]); + btf_16_adds_subs_avx2(&x1[17], &x1[22]); + btf_16_adds_subs_avx2(&x1[18], &x1[21]); + btf_16_adds_subs_avx2(&x1[19], &x1[20]); + btf_16_adds_subs_avx2(&x1[31], &x1[24]); + btf_16_adds_subs_avx2(&x1[30], &x1[25]); + btf_16_adds_subs_avx2(&x1[29], &x1[26]); + btf_16_adds_subs_avx2(&x1[28], &x1[27]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[36], &x1[59], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[37], &x1[58], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[38], &x1[57], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[39], &x1[56], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[40], &x1[55], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[41], &x1[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[42], &x1[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[43], &x1[52], _r, cos_bit); + + // stage 5 + btf_16_adds_subs_avx2(&x1[0], &x1[3]); + btf_16_adds_subs_avx2(&x1[1], &x1[2]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[11]); + btf_16_adds_subs_avx2(&x1[9], &x1[10]); + btf_16_adds_subs_avx2(&x1[15], &x1[12]); + btf_16_adds_subs_avx2(&x1[14], &x1[13]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[32], &x1[39]); + btf_16_adds_subs_avx2(&x1[33], &x1[38]); + btf_16_adds_subs_avx2(&x1[34], &x1[37]); + btf_16_adds_subs_avx2(&x1[35], &x1[36]); + btf_16_adds_subs_avx2(&x1[47], &x1[40]); + btf_16_adds_subs_avx2(&x1[46], &x1[41]); + btf_16_adds_subs_avx2(&x1[45], &x1[42]); + btf_16_adds_subs_avx2(&x1[44], &x1[43]); + btf_16_adds_subs_avx2(&x1[48], &x1[55]); + btf_16_adds_subs_avx2(&x1[49], &x1[54]); + btf_16_adds_subs_avx2(&x1[50], &x1[53]); + btf_16_adds_subs_avx2(&x1[51], &x1[52]); + btf_16_adds_subs_avx2(&x1[63], &x1[56]); + btf_16_adds_subs_avx2(&x1[62], &x1[57]); + btf_16_adds_subs_avx2(&x1[61], &x1[58]); + btf_16_adds_subs_avx2(&x1[60], &x1[59]); + + // stage 6 + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit); + btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[4], &x1[5]); + 
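// After the two rotations above, x1[0..3] are final; stage 11 routes them straight to output[0], output[32], output[16] and output[48]. + 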
btf_16_adds_subs_avx2(&x1[7], &x1[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[16], &x1[19]); + btf_16_adds_subs_avx2(&x1[17], &x1[18]); + btf_16_adds_subs_avx2(&x1[23], &x1[20]); + btf_16_adds_subs_avx2(&x1[22], &x1[21]); + btf_16_adds_subs_avx2(&x1[24], &x1[27]); + btf_16_adds_subs_avx2(&x1[25], &x1[26]); + btf_16_adds_subs_avx2(&x1[31], &x1[28]); + btf_16_adds_subs_avx2(&x1[30], &x1[29]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[34], &x1[61], _r, cos_bit); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[35], &x1[60], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[36], &x1[59], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[37], &x1[58], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[42], &x1[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[43], &x1[52], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[44], &x1[51], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[45], &x1[50], _r, cos_bit); + + // stage 7 + btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[9]); + btf_16_adds_subs_avx2(&x1[11], &x1[10]); + btf_16_adds_subs_avx2(&x1[12], &x1[13]); + btf_16_adds_subs_avx2(&x1[15], &x1[14]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[32], &x1[35]); + btf_16_adds_subs_avx2(&x1[33], &x1[34]); + btf_16_adds_subs_avx2(&x1[39], &x1[36]); + btf_16_adds_subs_avx2(&x1[38], &x1[37]); + btf_16_adds_subs_avx2(&x1[40], &x1[43]); + btf_16_adds_subs_avx2(&x1[41], &x1[42]); + btf_16_adds_subs_avx2(&x1[47], &x1[44]); + btf_16_adds_subs_avx2(&x1[46], &x1[45]); + btf_16_adds_subs_avx2(&x1[48], &x1[51]); + btf_16_adds_subs_avx2(&x1[49], &x1[50]); + btf_16_adds_subs_avx2(&x1[55], &x1[52]); + btf_16_adds_subs_avx2(&x1[54], &x1[53]); + btf_16_adds_subs_avx2(&x1[56], &x1[59]); + btf_16_adds_subs_avx2(&x1[57], &x1[58]); + btf_16_adds_subs_avx2(&x1[63], &x1[60]); + btf_16_adds_subs_avx2(&x1[62], &x1[61]); + + // stage 8 + btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit); + btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[16], &x1[17]); + btf_16_adds_subs_avx2(&x1[19], &x1[18]); + btf_16_adds_subs_avx2(&x1[20], &x1[21]); + btf_16_adds_subs_avx2(&x1[23], &x1[22]); + btf_16_adds_subs_avx2(&x1[24], &x1[25]); + btf_16_adds_subs_avx2(&x1[27], &x1[26]); + btf_16_adds_subs_avx2(&x1[28], &x1[29]); + btf_16_adds_subs_avx2(&x1[31], &x1[30]); + btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x1[33], &x1[62], _r, cos_bit); + btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x1[34], &x1[61], _r, cos_bit); + btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x1[37], &x1[58], _r, cos_bit); + btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x1[38], &x1[57], _r, cos_bit); + 
btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x1[41], &x1[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x1[42], &x1[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x1[45], &x1[50], _r, cos_bit); + btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x1[46], &x1[49], _r, cos_bit); + + // stage 9 + btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit); + btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, cos_bit); + btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit); + btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit); + btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[32], &x1[33]); + btf_16_adds_subs_avx2(&x1[35], &x1[34]); + btf_16_adds_subs_avx2(&x1[36], &x1[37]); + btf_16_adds_subs_avx2(&x1[39], &x1[38]); + btf_16_adds_subs_avx2(&x1[40], &x1[41]); + btf_16_adds_subs_avx2(&x1[43], &x1[42]); + btf_16_adds_subs_avx2(&x1[44], &x1[45]); + btf_16_adds_subs_avx2(&x1[47], &x1[46]); + btf_16_adds_subs_avx2(&x1[48], &x1[49]); + btf_16_adds_subs_avx2(&x1[51], &x1[50]); + btf_16_adds_subs_avx2(&x1[52], &x1[53]); + btf_16_adds_subs_avx2(&x1[55], &x1[54]); + btf_16_adds_subs_avx2(&x1[56], &x1[57]); + btf_16_adds_subs_avx2(&x1[59], &x1[58]); + btf_16_adds_subs_avx2(&x1[60], &x1[61]); + btf_16_adds_subs_avx2(&x1[63], &x1[62]); + + // stage 10 + btf_16_w16_avx2(cospi_p63_p01, cospi_m01_p63, &x1[32], &x1[63], _r, cos_bit); + btf_16_w16_avx2(cospi_p31_p33, cospi_m33_p31, &x1[33], &x1[62], _r, cos_bit); + btf_16_w16_avx2(cospi_p47_p17, cospi_m17_p47, &x1[34], &x1[61], _r, cos_bit); + btf_16_w16_avx2(cospi_p15_p49, cospi_m49_p15, &x1[35], &x1[60], _r, cos_bit); + btf_16_w16_avx2(cospi_p55_p09, cospi_m09_p55, &x1[36], &x1[59], _r, cos_bit); + btf_16_w16_avx2(cospi_p23_p41, cospi_m41_p23, &x1[37], &x1[58], _r, cos_bit); + btf_16_w16_avx2(cospi_p39_p25, cospi_m25_p39, &x1[38], &x1[57], _r, cos_bit); + btf_16_w16_avx2(cospi_p07_p57, cospi_m57_p07, &x1[39], &x1[56], _r, cos_bit); + btf_16_w16_avx2(cospi_p59_p05, cospi_m05_p59, &x1[40], &x1[55], _r, cos_bit); + btf_16_w16_avx2(cospi_p27_p37, cospi_m37_p27, &x1[41], &x1[54], _r, cos_bit); + btf_16_w16_avx2(cospi_p43_p21, cospi_m21_p43, &x1[42], &x1[53], _r, cos_bit); + btf_16_w16_avx2(cospi_p11_p53, cospi_m53_p11, &x1[43], &x1[52], _r, cos_bit); + btf_16_w16_avx2(cospi_p51_p13, cospi_m13_p51, &x1[44], &x1[51], _r, cos_bit); + btf_16_w16_avx2(cospi_p19_p45, cospi_m45_p19, &x1[45], &x1[50], _r, cos_bit); + btf_16_w16_avx2(cospi_p35_p29, cospi_m29_p35, &x1[46], &x1[49], _r, cos_bit); + btf_16_w16_avx2(cospi_p03_p61, cospi_m61_p03, &x1[47], &x1[48], _r, cos_bit); + + // stage 11 + output[0] = x1[0]; + output[1] = x1[32]; + output[2] = x1[16]; + output[3] = x1[48]; + output[4] = x1[8]; + output[5] = x1[40]; + output[6] = x1[24]; + output[7] = x1[56]; + output[8] = x1[4]; + output[9] = x1[36]; + output[10] = x1[20]; + output[11] = x1[52]; + output[12] = x1[12]; + output[13] = x1[44]; + output[14] = x1[28]; + output[15] = x1[60]; + output[16] = x1[2]; + output[17] = x1[34]; + output[18] = x1[18]; + output[19] = x1[50]; + output[20] = x1[10]; + output[21] = x1[42]; + output[22] = x1[26]; + output[23] = x1[58]; + output[24] = x1[6]; + output[25] = 
x1[38]; + output[26] = x1[22]; + output[27] = x1[54]; + output[28] = x1[14]; + output[29] = x1[46]; + output[30] = x1[30]; + output[31] = x1[62]; + output[32] = x1[1]; + output[33] = x1[33]; + output[34] = x1[17]; + output[35] = x1[49]; + output[36] = x1[9]; + output[37] = x1[41]; + output[38] = x1[25]; + output[39] = x1[57]; + output[40] = x1[5]; + output[41] = x1[37]; + output[42] = x1[21]; + output[43] = x1[53]; + output[44] = x1[13]; + output[45] = x1[45]; + output[46] = x1[29]; + output[47] = x1[61]; + output[48] = x1[3]; + output[49] = x1[35]; + output[50] = x1[19]; + output[51] = x1[51]; + output[52] = x1[11]; + output[53] = x1[43]; + output[54] = x1[27]; + output[55] = x1[59]; + output[56] = x1[7]; + output[57] = x1[39]; + output[58] = x1[23]; + output[59] = x1[55]; + output[60] = x1[15]; + output[61] = x1[47]; + output[62] = x1[31]; + output[63] = x1[63]; +} + +static INLINE void fdct32_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + __m256i x1[32]; + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); + // stage 0 + // stage 1 + btf_32_add_sub_out_avx2(&x1[0], &x1[31], input[0], input[31]); + btf_32_add_sub_out_avx2(&x1[1], &x1[30], input[1], input[30]); + btf_32_add_sub_out_avx2(&x1[2], &x1[29], input[2], input[29]); + btf_32_add_sub_out_avx2(&x1[3], &x1[28], input[3], input[28]); + btf_32_add_sub_out_avx2(&x1[4], &x1[27], input[4], input[27]); + btf_32_add_sub_out_avx2(&x1[5], &x1[26], input[5], input[26]); + btf_32_add_sub_out_avx2(&x1[6], &x1[25], input[6], input[25]); + btf_32_add_sub_out_avx2(&x1[7], &x1[24], input[7], input[24]); + btf_32_add_sub_out_avx2(&x1[8], &x1[23], input[8], input[23]); + btf_32_add_sub_out_avx2(&x1[9], &x1[22], input[9], input[22]); + btf_32_add_sub_out_avx2(&x1[10], &x1[21], input[10], input[21]); + btf_32_add_sub_out_avx2(&x1[11], &x1[20], input[11], input[20]); + btf_32_add_sub_out_avx2(&x1[12], &x1[19], input[12], input[19]); + btf_32_add_sub_out_avx2(&x1[13], &x1[18], input[13], input[18]); + btf_32_add_sub_out_avx2(&x1[14], &x1[17], input[14], input[17]); + btf_32_add_sub_out_avx2(&x1[15], &x1[16], input[15], input[16]); + + // stage 2 + btf_32_add_sub_avx2(&x1[0], &x1[15]); + btf_32_add_sub_avx2(&x1[1], &x1[14]); + btf_32_add_sub_avx2(&x1[2], &x1[13]); + btf_32_add_sub_avx2(&x1[3], &x1[12]); + btf_32_add_sub_avx2(&x1[4], &x1[11]); + btf_32_add_sub_avx2(&x1[5], &x1[10]); + btf_32_add_sub_avx2(&x1[6], &x1[9]); + btf_32_add_sub_avx2(&x1[7], &x1[8]); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[20], &x1[27], _r, cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[21], &x1[26], _r, cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[22], &x1[25], _r, cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[23], &x1[24], _r, cos_bit); + + // stage 3 + btf_32_add_sub_avx2(&x1[0], &x1[7]); + btf_32_add_sub_avx2(&x1[1], &x1[6]); + btf_32_add_sub_avx2(&x1[2], &x1[5]); + btf_32_add_sub_avx2(&x1[3], &x1[4]); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[10], &x1[13], _r, cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[11], &x1[12], _r, cos_bit); + btf_32_add_sub_avx2(&x1[16], &x1[23]); + btf_32_add_sub_avx2(&x1[17], &x1[22]); + btf_32_add_sub_avx2(&x1[18], &x1[21]); + btf_32_add_sub_avx2(&x1[19], &x1[20]); + btf_32_add_sub_avx2(&x1[31], &x1[24]); + btf_32_add_sub_avx2(&x1[30], &x1[25]); + btf_32_add_sub_avx2(&x1[29], &x1[26]); + btf_32_add_sub_avx2(&x1[28], &x1[27]); + + // stage 4 + btf_32_add_sub_avx2(&x1[0], &x1[3]); + btf_32_add_sub_avx2(&x1[1], &x1[2]); + 
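// fdct32_avx2 repeats the flow of fdct16x32_avx2 with eight 32-bit lanes per register (the btf_32_* helpers), trading half the lane count for headroom against overflow. + 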
btf_32_avx2_type0(-cospi[32], cospi[32], &x1[5], &x1[6], _r, cos_bit); + btf_32_add_sub_avx2(&x1[8], &x1[11]); + btf_32_add_sub_avx2(&x1[9], &x1[10]); + btf_32_add_sub_avx2(&x1[15], &x1[12]); + btf_32_add_sub_avx2(&x1[14], &x1[13]); + btf_32_avx2_type0(-cospi[16], cospi[48], &x1[18], &x1[29], _r, cos_bit); + btf_32_avx2_type0(-cospi[16], cospi[48], &x1[19], &x1[28], _r, cos_bit); + btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[20], &x1[27], _r, cos_bit); + btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[21], &x1[26], _r, cos_bit); + + // stage 5 + btf_32_avx2_type0(cospi[32], cospi[32], &x1[0], &x1[1], _r, cos_bit); + btf_32_avx2_type1(cospi[48], cospi[16], &x1[2], &x1[3], _r, cos_bit); + btf_32_add_sub_avx2(&x1[4], &x1[5]); + btf_32_add_sub_avx2(&x1[7], &x1[6]); + btf_32_avx2_type0(-cospi[16], cospi[48], &x1[9], &x1[14], _r, cos_bit); + btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[10], &x1[13], _r, cos_bit); + btf_32_add_sub_avx2(&x1[16], &x1[19]); + btf_32_add_sub_avx2(&x1[17], &x1[18]); + btf_32_add_sub_avx2(&x1[23], &x1[20]); + btf_32_add_sub_avx2(&x1[22], &x1[21]); + btf_32_add_sub_avx2(&x1[24], &x1[27]); + btf_32_add_sub_avx2(&x1[25], &x1[26]); + btf_32_add_sub_avx2(&x1[31], &x1[28]); + btf_32_add_sub_avx2(&x1[30], &x1[29]); + + // stage 6 + btf_32_avx2_type1(cospi[56], cospi[8], &x1[4], &x1[7], _r, cos_bit); + btf_32_avx2_type1(cospi[24], cospi[40], &x1[5], &x1[6], _r, cos_bit); + btf_32_add_sub_avx2(&x1[8], &x1[9]); + btf_32_add_sub_avx2(&x1[11], &x1[10]); + btf_32_add_sub_avx2(&x1[12], &x1[13]); + btf_32_add_sub_avx2(&x1[15], &x1[14]); + btf_32_avx2_type0(-cospi[8], cospi[56], &x1[17], &x1[30], _r, cos_bit); + btf_32_avx2_type0(-cospi[56], -cospi[8], &x1[18], &x1[29], _r, cos_bit); + btf_32_avx2_type0(-cospi[40], cospi[24], &x1[21], &x1[26], _r, cos_bit); + btf_32_avx2_type0(-cospi[24], -cospi[40], &x1[22], &x1[25], _r, cos_bit); + + // stage 7 + btf_32_avx2_type1(cospi[60], cospi[4], &x1[8], &x1[15], _r, cos_bit); + btf_32_avx2_type1(cospi[28], cospi[36], &x1[9], &x1[14], _r, cos_bit); + btf_32_avx2_type1(cospi[44], cospi[20], &x1[10], &x1[13], _r, cos_bit); + btf_32_avx2_type1(cospi[12], cospi[52], &x1[11], &x1[12], _r, cos_bit); + btf_32_add_sub_avx2(&x1[16], &x1[17]); + btf_32_add_sub_avx2(&x1[19], &x1[18]); + btf_32_add_sub_avx2(&x1[20], &x1[21]); + btf_32_add_sub_avx2(&x1[23], &x1[22]); + btf_32_add_sub_avx2(&x1[24], &x1[25]); + btf_32_add_sub_avx2(&x1[27], &x1[26]); + btf_32_add_sub_avx2(&x1[28], &x1[29]); + btf_32_add_sub_avx2(&x1[31], &x1[30]); + + // stage 8 + btf_32_avx2_type1(cospi[62], cospi[2], &x1[16], &x1[31], _r, cos_bit); + btf_32_avx2_type1(cospi[30], cospi[34], &x1[17], &x1[30], _r, cos_bit); + btf_32_avx2_type1(cospi[46], cospi[18], &x1[18], &x1[29], _r, cos_bit); + btf_32_avx2_type1(cospi[14], cospi[50], &x1[19], &x1[28], _r, cos_bit); + btf_32_avx2_type1(cospi[54], cospi[10], &x1[20], &x1[27], _r, cos_bit); + btf_32_avx2_type1(cospi[22], cospi[42], &x1[21], &x1[26], _r, cos_bit); + btf_32_avx2_type1(cospi[38], cospi[26], &x1[22], &x1[25], _r, cos_bit); + btf_32_avx2_type1(cospi[6], cospi[58], &x1[23], &x1[24], _r, cos_bit); + + // stage 9 + output[0] = x1[0]; + output[1] = x1[16]; + output[2] = x1[8]; + output[3] = x1[24]; + output[4] = x1[4]; + output[5] = x1[20]; + output[6] = x1[12]; + output[7] = x1[28]; + output[8] = x1[2]; + output[9] = x1[18]; + output[10] = x1[10]; + output[11] = x1[26]; + output[12] = x1[6]; + output[13] = x1[22]; + output[14] = x1[14]; + output[15] = x1[30]; + output[16] = x1[1]; + output[17] = x1[17]; + output[18] = x1[9]; + 
output[19] = x1[25]; + output[20] = x1[5]; + output[21] = x1[21]; + output[22] = x1[13]; + output[23] = x1[29]; + output[24] = x1[3]; + output[25] = x1[19]; + output[26] = x1[11]; + output[27] = x1[27]; + output[28] = x1[7]; + output[29] = x1[23]; + output[30] = x1[15]; + output[31] = x1[31]; +} + +static INLINE void fdct64_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]); + __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]); + __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]); + __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]); + __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]); + __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]); + __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]); + __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]); + __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]); + __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]); + __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]); + __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]); + __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]); + __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]); + __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]); + __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]); + __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]); + __m256i cospi_p36 = _mm256_set1_epi32(cospi[36]); + __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]); + __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]); + __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]); + __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]); + __m256i cospi_m04 = _mm256_set1_epi32(-cospi[4]); + __m256i cospi_m60 = _mm256_set1_epi32(-cospi[60]); + __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]); + __m256i cospi_m28 = _mm256_set1_epi32(-cospi[28]); + __m256i cospi_m20 = _mm256_set1_epi32(-cospi[20]); + __m256i cospi_m44 = _mm256_set1_epi32(-cospi[44]); + __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]); + __m256i cospi_m12 = _mm256_set1_epi32(-cospi[12]); + __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]); + __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]); + __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]); + __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]); + __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]); + __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]); + __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]); + __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]); + __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]); + __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]); + __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]); + __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]); + __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]); + __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]); + __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]); + __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]); + __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]); + __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]); + __m256i cospi_p31 = _mm256_set1_epi32(cospi[31]); + __m256i cospi_p33 = _mm256_set1_epi32(cospi[33]); + __m256i cospi_p47 = _mm256_set1_epi32(cospi[47]); + __m256i cospi_p17 = _mm256_set1_epi32(cospi[17]); + __m256i cospi_p15 = _mm256_set1_epi32(cospi[15]); + __m256i cospi_p49 = _mm256_set1_epi32(cospi[49]); + __m256i cospi_p55 = _mm256_set1_epi32(cospi[55]); + __m256i cospi_p09 = _mm256_set1_epi32(cospi[9]); + __m256i cospi_p23 = _mm256_set1_epi32(cospi[23]); + __m256i cospi_p41 = _mm256_set1_epi32(cospi[41]); + __m256i cospi_p39 = 
_mm256_set1_epi32(cospi[39]); + __m256i cospi_p25 = _mm256_set1_epi32(cospi[25]); + __m256i cospi_p07 = _mm256_set1_epi32(cospi[7]); + __m256i cospi_p57 = _mm256_set1_epi32(cospi[57]); + __m256i cospi_p59 = _mm256_set1_epi32(cospi[59]); + __m256i cospi_p05 = _mm256_set1_epi32(cospi[5]); + __m256i cospi_p27 = _mm256_set1_epi32(cospi[27]); + __m256i cospi_p37 = _mm256_set1_epi32(cospi[37]); + __m256i cospi_p43 = _mm256_set1_epi32(cospi[43]); + __m256i cospi_p21 = _mm256_set1_epi32(cospi[21]); + __m256i cospi_p11 = _mm256_set1_epi32(cospi[11]); + __m256i cospi_p53 = _mm256_set1_epi32(cospi[53]); + __m256i cospi_p51 = _mm256_set1_epi32(cospi[51]); + __m256i cospi_p13 = _mm256_set1_epi32(cospi[13]); + __m256i cospi_p19 = _mm256_set1_epi32(cospi[19]); + __m256i cospi_p45 = _mm256_set1_epi32(cospi[45]); + __m256i cospi_p35 = _mm256_set1_epi32(cospi[35]); + __m256i cospi_p29 = _mm256_set1_epi32(cospi[29]); + __m256i cospi_p03 = _mm256_set1_epi32(cospi[3]); + __m256i cospi_p61 = _mm256_set1_epi32(cospi[61]); + + // stage 1 + __m256i x1[64]; + btf_32_add_sub_out_avx2(&x1[0], &x1[63], input[0], input[63]); + btf_32_add_sub_out_avx2(&x1[1], &x1[62], input[1], input[62]); + btf_32_add_sub_out_avx2(&x1[2], &x1[61], input[2], input[61]); + btf_32_add_sub_out_avx2(&x1[3], &x1[60], input[3], input[60]); + btf_32_add_sub_out_avx2(&x1[4], &x1[59], input[4], input[59]); + btf_32_add_sub_out_avx2(&x1[5], &x1[58], input[5], input[58]); + btf_32_add_sub_out_avx2(&x1[6], &x1[57], input[6], input[57]); + btf_32_add_sub_out_avx2(&x1[7], &x1[56], input[7], input[56]); + btf_32_add_sub_out_avx2(&x1[8], &x1[55], input[8], input[55]); + btf_32_add_sub_out_avx2(&x1[9], &x1[54], input[9], input[54]); + btf_32_add_sub_out_avx2(&x1[10], &x1[53], input[10], input[53]); + btf_32_add_sub_out_avx2(&x1[11], &x1[52], input[11], input[52]); + btf_32_add_sub_out_avx2(&x1[12], &x1[51], input[12], input[51]); + btf_32_add_sub_out_avx2(&x1[13], &x1[50], input[13], input[50]); + btf_32_add_sub_out_avx2(&x1[14], &x1[49], input[14], input[49]); + btf_32_add_sub_out_avx2(&x1[15], &x1[48], input[15], input[48]); + btf_32_add_sub_out_avx2(&x1[16], &x1[47], input[16], input[47]); + btf_32_add_sub_out_avx2(&x1[17], &x1[46], input[17], input[46]); + btf_32_add_sub_out_avx2(&x1[18], &x1[45], input[18], input[45]); + btf_32_add_sub_out_avx2(&x1[19], &x1[44], input[19], input[44]); + btf_32_add_sub_out_avx2(&x1[20], &x1[43], input[20], input[43]); + btf_32_add_sub_out_avx2(&x1[21], &x1[42], input[21], input[42]); + btf_32_add_sub_out_avx2(&x1[22], &x1[41], input[22], input[41]); + btf_32_add_sub_out_avx2(&x1[23], &x1[40], input[23], input[40]); + btf_32_add_sub_out_avx2(&x1[24], &x1[39], input[24], input[39]); + btf_32_add_sub_out_avx2(&x1[25], &x1[38], input[25], input[38]); + btf_32_add_sub_out_avx2(&x1[26], &x1[37], input[26], input[37]); + btf_32_add_sub_out_avx2(&x1[27], &x1[36], input[27], input[36]); + btf_32_add_sub_out_avx2(&x1[28], &x1[35], input[28], input[35]); + btf_32_add_sub_out_avx2(&x1[29], &x1[34], input[29], input[34]); + btf_32_add_sub_out_avx2(&x1[30], &x1[33], input[30], input[33]); + btf_32_add_sub_out_avx2(&x1[31], &x1[32], input[31], input[32]); + + // stage 2 + btf_32_add_sub_avx2(&x1[0], &x1[31]); + btf_32_add_sub_avx2(&x1[1], &x1[30]); + btf_32_add_sub_avx2(&x1[2], &x1[29]); + btf_32_add_sub_avx2(&x1[3], &x1[28]); + btf_32_add_sub_avx2(&x1[4], &x1[27]); + btf_32_add_sub_avx2(&x1[5], &x1[26]); + btf_32_add_sub_avx2(&x1[6], &x1[25]); + btf_32_add_sub_avx2(&x1[7], &x1[24]); + btf_32_add_sub_avx2(&x1[8], &x1[23]); + 
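// The *_new butterfly variants used below take the pre-broadcast cospi vectors declared above, rather than rebuilding them from scalars on every call. + 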
btf_32_add_sub_avx2(&x1[9], &x1[22]); + btf_32_add_sub_avx2(&x1[10], &x1[21]); + btf_32_add_sub_avx2(&x1[11], &x1[20]); + btf_32_add_sub_avx2(&x1[12], &x1[19]); + btf_32_add_sub_avx2(&x1[13], &x1[18]); + btf_32_add_sub_avx2(&x1[14], &x1[17]); + btf_32_add_sub_avx2(&x1[15], &x1[16]); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[40], &x1[55], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[41], &x1[54], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[42], &x1[53], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[43], &x1[52], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[44], &x1[51], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[45], &x1[50], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[46], &x1[49], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[47], &x1[48], _r, cos_bit); + + // stage 3 + btf_32_add_sub_avx2(&x1[0], &x1[15]); + btf_32_add_sub_avx2(&x1[1], &x1[14]); + btf_32_add_sub_avx2(&x1[2], &x1[13]); + btf_32_add_sub_avx2(&x1[3], &x1[12]); + btf_32_add_sub_avx2(&x1[4], &x1[11]); + btf_32_add_sub_avx2(&x1[5], &x1[10]); + btf_32_add_sub_avx2(&x1[6], &x1[9]); + btf_32_add_sub_avx2(&x1[7], &x1[8]); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[20], &x1[27], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[21], &x1[26], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[22], &x1[25], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[23], &x1[24], _r, cos_bit); + btf_32_add_sub_avx2(&x1[32], &x1[47]); + btf_32_add_sub_avx2(&x1[33], &x1[46]); + btf_32_add_sub_avx2(&x1[34], &x1[45]); + btf_32_add_sub_avx2(&x1[35], &x1[44]); + btf_32_add_sub_avx2(&x1[36], &x1[43]); + btf_32_add_sub_avx2(&x1[37], &x1[42]); + btf_32_add_sub_avx2(&x1[38], &x1[41]); + btf_32_add_sub_avx2(&x1[39], &x1[40]); + btf_32_add_sub_avx2(&x1[63], &x1[48]); + btf_32_add_sub_avx2(&x1[62], &x1[49]); + btf_32_add_sub_avx2(&x1[61], &x1[50]); + btf_32_add_sub_avx2(&x1[60], &x1[51]); + btf_32_add_sub_avx2(&x1[59], &x1[52]); + btf_32_add_sub_avx2(&x1[58], &x1[53]); + btf_32_add_sub_avx2(&x1[57], &x1[54]); + btf_32_add_sub_avx2(&x1[56], &x1[55]); + + // stage 4 + btf_32_add_sub_avx2(&x1[0], &x1[7]); + btf_32_add_sub_avx2(&x1[1], &x1[6]); + btf_32_add_sub_avx2(&x1[2], &x1[5]); + btf_32_add_sub_avx2(&x1[3], &x1[4]); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[10], &x1[13], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[11], &x1[12], _r, cos_bit); + btf_32_add_sub_avx2(&x1[16], &x1[23]); + btf_32_add_sub_avx2(&x1[17], &x1[22]); + btf_32_add_sub_avx2(&x1[18], &x1[21]); + btf_32_add_sub_avx2(&x1[19], &x1[20]); + btf_32_add_sub_avx2(&x1[31], &x1[24]); + btf_32_add_sub_avx2(&x1[30], &x1[25]); + btf_32_add_sub_avx2(&x1[29], &x1[26]); + btf_32_add_sub_avx2(&x1[28], &x1[27]); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[36], &x1[59], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[37], &x1[58], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[38], &x1[57], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[39], &x1[56], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[40], &x1[55], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[41], &x1[54], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[42], &x1[53], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[43], &x1[52], _r, cos_bit); + + // stage 5 + btf_32_add_sub_avx2(&x1[0], &x1[3]); + 
btf_32_add_sub_avx2(&x1[1], &x1[2]); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[5], &x1[6], _r, cos_bit); + btf_32_add_sub_avx2(&x1[8], &x1[11]); + btf_32_add_sub_avx2(&x1[9], &x1[10]); + btf_32_add_sub_avx2(&x1[15], &x1[12]); + btf_32_add_sub_avx2(&x1[14], &x1[13]); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[18], &x1[29], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[19], &x1[28], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[20], &x1[27], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[21], &x1[26], _r, cos_bit); + btf_32_add_sub_avx2(&x1[32], &x1[39]); + btf_32_add_sub_avx2(&x1[33], &x1[38]); + btf_32_add_sub_avx2(&x1[34], &x1[37]); + btf_32_add_sub_avx2(&x1[35], &x1[36]); + btf_32_add_sub_avx2(&x1[47], &x1[40]); + btf_32_add_sub_avx2(&x1[46], &x1[41]); + btf_32_add_sub_avx2(&x1[45], &x1[42]); + btf_32_add_sub_avx2(&x1[44], &x1[43]); + btf_32_add_sub_avx2(&x1[48], &x1[55]); + btf_32_add_sub_avx2(&x1[49], &x1[54]); + btf_32_add_sub_avx2(&x1[50], &x1[53]); + btf_32_add_sub_avx2(&x1[51], &x1[52]); + btf_32_add_sub_avx2(&x1[63], &x1[56]); + btf_32_add_sub_avx2(&x1[62], &x1[57]); + btf_32_add_sub_avx2(&x1[61], &x1[58]); + btf_32_add_sub_avx2(&x1[60], &x1[59]); + + // stage 6 + btf_32_avx2_type0_new(cospi_p32, cospi_p32, &x1[0], &x1[1], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p48, cospi_p16, &x1[2], &x1[3], _r, cos_bit); + btf_32_add_sub_avx2(&x1[4], &x1[5]); + btf_32_add_sub_avx2(&x1[7], &x1[6]); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[9], &x1[14], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[10], &x1[13], _r, cos_bit); + btf_32_add_sub_avx2(&x1[16], &x1[19]); + btf_32_add_sub_avx2(&x1[17], &x1[18]); + btf_32_add_sub_avx2(&x1[23], &x1[20]); + btf_32_add_sub_avx2(&x1[22], &x1[21]); + btf_32_add_sub_avx2(&x1[24], &x1[27]); + btf_32_add_sub_avx2(&x1[25], &x1[26]); + btf_32_add_sub_avx2(&x1[31], &x1[28]); + btf_32_add_sub_avx2(&x1[30], &x1[29]); + btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[34], &x1[61], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[35], &x1[60], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[36], &x1[59], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[37], &x1[58], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[42], &x1[53], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[43], &x1[52], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[44], &x1[51], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[45], &x1[50], _r, cos_bit); + + // stage 7 + btf_32_avx2_type1_new(cospi_p56, cospi_p08, &x1[4], &x1[7], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p24, cospi_p40, &x1[5], &x1[6], _r, cos_bit); + btf_32_add_sub_avx2(&x1[8], &x1[9]); + btf_32_add_sub_avx2(&x1[11], &x1[10]); + btf_32_add_sub_avx2(&x1[12], &x1[13]); + btf_32_add_sub_avx2(&x1[15], &x1[14]); + btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[17], &x1[30], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[18], &x1[29], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[21], &x1[26], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[22], &x1[25], _r, cos_bit); + btf_32_add_sub_avx2(&x1[32], &x1[35]); + btf_32_add_sub_avx2(&x1[33], &x1[34]); + btf_32_add_sub_avx2(&x1[39], &x1[36]); + btf_32_add_sub_avx2(&x1[38], &x1[37]); + btf_32_add_sub_avx2(&x1[40], &x1[43]); + btf_32_add_sub_avx2(&x1[41], &x1[42]); + btf_32_add_sub_avx2(&x1[47], &x1[44]); + btf_32_add_sub_avx2(&x1[46], 
&x1[45]); + btf_32_add_sub_avx2(&x1[48], &x1[51]); + btf_32_add_sub_avx2(&x1[49], &x1[50]); + btf_32_add_sub_avx2(&x1[55], &x1[52]); + btf_32_add_sub_avx2(&x1[54], &x1[53]); + btf_32_add_sub_avx2(&x1[56], &x1[59]); + btf_32_add_sub_avx2(&x1[57], &x1[58]); + btf_32_add_sub_avx2(&x1[63], &x1[60]); + btf_32_add_sub_avx2(&x1[62], &x1[61]); + + // stage 8 + btf_32_avx2_type1_new(cospi_p60, cospi_p04, &x1[8], &x1[15], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p28, cospi_p36, &x1[9], &x1[14], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p44, cospi_p20, &x1[10], &x1[13], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p12, cospi_p52, &x1[11], &x1[12], _r, cos_bit); + btf_32_add_sub_avx2(&x1[16], &x1[17]); + btf_32_add_sub_avx2(&x1[19], &x1[18]); + btf_32_add_sub_avx2(&x1[20], &x1[21]); + btf_32_add_sub_avx2(&x1[23], &x1[22]); + btf_32_add_sub_avx2(&x1[24], &x1[25]); + btf_32_add_sub_avx2(&x1[27], &x1[26]); + btf_32_add_sub_avx2(&x1[28], &x1[29]); + btf_32_add_sub_avx2(&x1[31], &x1[30]); + btf_32_avx2_type0_new(cospi_m04, cospi_p60, &x1[33], &x1[62], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m60, cospi_m04, &x1[34], &x1[61], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m36, cospi_p28, &x1[37], &x1[58], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m28, cospi_m36, &x1[38], &x1[57], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m20, cospi_p44, &x1[41], &x1[54], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m44, cospi_m20, &x1[42], &x1[53], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m52, cospi_p12, &x1[45], &x1[50], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m12, cospi_m52, &x1[46], &x1[49], _r, cos_bit); + + // stage 9 + btf_32_avx2_type1_new(cospi_p62, cospi_p02, &x1[16], &x1[31], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p30, cospi_p34, &x1[17], &x1[30], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p46, cospi_p18, &x1[18], &x1[29], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p14, cospi_p50, &x1[19], &x1[28], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p54, cospi_p10, &x1[20], &x1[27], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p22, cospi_p42, &x1[21], &x1[26], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p38, cospi_p26, &x1[22], &x1[25], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p06, cospi_p58, &x1[23], &x1[24], _r, cos_bit); + btf_32_add_sub_avx2(&x1[32], &x1[33]); + btf_32_add_sub_avx2(&x1[35], &x1[34]); + btf_32_add_sub_avx2(&x1[36], &x1[37]); + btf_32_add_sub_avx2(&x1[39], &x1[38]); + btf_32_add_sub_avx2(&x1[40], &x1[41]); + btf_32_add_sub_avx2(&x1[43], &x1[42]); + btf_32_add_sub_avx2(&x1[44], &x1[45]); + btf_32_add_sub_avx2(&x1[47], &x1[46]); + btf_32_add_sub_avx2(&x1[48], &x1[49]); + btf_32_add_sub_avx2(&x1[51], &x1[50]); + btf_32_add_sub_avx2(&x1[52], &x1[53]); + btf_32_add_sub_avx2(&x1[55], &x1[54]); + btf_32_add_sub_avx2(&x1[56], &x1[57]); + btf_32_add_sub_avx2(&x1[59], &x1[58]); + btf_32_add_sub_avx2(&x1[60], &x1[61]); + btf_32_add_sub_avx2(&x1[63], &x1[62]); + + // stage 10 + btf_32_avx2_type1_new(cospi_p63, cospi_p01, &x1[32], &x1[63], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p31, cospi_p33, &x1[33], &x1[62], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p47, cospi_p17, &x1[34], &x1[61], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p15, cospi_p49, &x1[35], &x1[60], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p55, cospi_p09, &x1[36], &x1[59], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p23, cospi_p41, &x1[37], &x1[58], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p39, cospi_p25, &x1[38], &x1[57], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p07, cospi_p57, &x1[39], &x1[56], _r, cos_bit); + 
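// Stage 11 below emits the coefficients in bit-reversed order: output[k] takes x1[r], where r is k with its six index bits reversed. + 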
btf_32_avx2_type1_new(cospi_p59, cospi_p05, &x1[40], &x1[55], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p27, cospi_p37, &x1[41], &x1[54], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p43, cospi_p21, &x1[42], &x1[53], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p11, cospi_p53, &x1[43], &x1[52], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p51, cospi_p13, &x1[44], &x1[51], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p19, cospi_p45, &x1[45], &x1[50], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p35, cospi_p29, &x1[46], &x1[49], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p03, cospi_p61, &x1[47], &x1[48], _r, cos_bit); + + // stage 11 + output[0] = x1[0]; + output[1] = x1[32]; + output[2] = x1[16]; + output[3] = x1[48]; + output[4] = x1[8]; + output[5] = x1[40]; + output[6] = x1[24]; + output[7] = x1[56]; + output[8] = x1[4]; + output[9] = x1[36]; + output[10] = x1[20]; + output[11] = x1[52]; + output[12] = x1[12]; + output[13] = x1[44]; + output[14] = x1[28]; + output[15] = x1[60]; + output[16] = x1[2]; + output[17] = x1[34]; + output[18] = x1[18]; + output[19] = x1[50]; + output[20] = x1[10]; + output[21] = x1[42]; + output[22] = x1[26]; + output[23] = x1[58]; + output[24] = x1[6]; + output[25] = x1[38]; + output[26] = x1[22]; + output[27] = x1[54]; + output[28] = x1[14]; + output[29] = x1[46]; + output[30] = x1[30]; + output[31] = x1[62]; + output[32] = x1[1]; + output[33] = x1[33]; + output[34] = x1[17]; + output[35] = x1[49]; + output[36] = x1[9]; + output[37] = x1[41]; + output[38] = x1[25]; + output[39] = x1[57]; + output[40] = x1[5]; + output[41] = x1[37]; + output[42] = x1[21]; + output[43] = x1[53]; + output[44] = x1[13]; + output[45] = x1[45]; + output[46] = x1[29]; + output[47] = x1[61]; + output[48] = x1[3]; + output[49] = x1[35]; + output[50] = x1[19]; + output[51] = x1[51]; + output[52] = x1[11]; + output[53] = x1[43]; + output[54] = x1[27]; + output[55] = x1[59]; + output[56] = x1[7]; + output[57] = x1[39]; + output[58] = x1[23]; + output[59] = x1[55]; + output[60] = x1[15]; + output[61] = x1[47]; + output[62] = x1[31]; + output[63] = x1[63]; +} + +static INLINE void fadst16x16_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i __zero = _mm256_setzero_si256(); + const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); + __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); + __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]); + __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); + __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); + __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]); + __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]); + __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]); + __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]); + __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]); + __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]); + __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]); + __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]); + __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]); + __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]); + __m256i cospi_p26_p38 = 
pair_set_w16_epi16(cospi[26], cospi[38]); + __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]); + __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]); + __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]); + __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]); + __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]); + __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]); + __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]); + __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]); + __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]); + + // stage 1 + __m256i x1[16]; + x1[0] = input[0]; + x1[1] = _mm256_subs_epi16(__zero, input[15]); + x1[2] = _mm256_subs_epi16(__zero, input[7]); + x1[3] = input[8]; + x1[4] = _mm256_subs_epi16(__zero, input[3]); + x1[5] = input[12]; + x1[6] = input[4]; + x1[7] = _mm256_subs_epi16(__zero, input[11]); + x1[8] = _mm256_subs_epi16(__zero, input[1]); + x1[9] = input[14]; + x1[10] = input[6]; + x1[11] = _mm256_subs_epi16(__zero, input[9]); + x1[12] = input[2]; + x1[13] = _mm256_subs_epi16(__zero, input[13]); + x1[14] = _mm256_subs_epi16(__zero, input[5]); + x1[15] = input[10]; + + // stage 2 + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit); + + // stage 3 + btf_16_adds_subs_avx2(&x1[0], &x1[2]); + btf_16_adds_subs_avx2(&x1[1], &x1[3]); + btf_16_adds_subs_avx2(&x1[4], &x1[6]); + btf_16_adds_subs_avx2(&x1[5], &x1[7]); + btf_16_adds_subs_avx2(&x1[8], &x1[10]); + btf_16_adds_subs_avx2(&x1[9], &x1[11]); + btf_16_adds_subs_avx2(&x1[12], &x1[14]); + btf_16_adds_subs_avx2(&x1[13], &x1[15]); + + // stage 4 + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[6], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[14], &x1[15], _r, cos_bit); + + // stage 5 + btf_16_adds_subs_avx2(&x1[0], &x1[4]); + btf_16_adds_subs_avx2(&x1[1], &x1[5]); + btf_16_adds_subs_avx2(&x1[2], &x1[6]); + btf_16_adds_subs_avx2(&x1[3], &x1[7]); + btf_16_adds_subs_avx2(&x1[8], &x1[12]); + btf_16_adds_subs_avx2(&x1[9], &x1[13]); + btf_16_adds_subs_avx2(&x1[10], &x1[14]); + btf_16_adds_subs_avx2(&x1[11], &x1[15]); + + // stage 6 + btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r, cos_bit); + btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x1[10], &x1[11], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x1[12], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x1[14], &x1[15], _r, cos_bit); + + // stage 7 + btf_16_adds_subs_avx2(&x1[0], &x1[8]); + btf_16_adds_subs_avx2(&x1[1], &x1[9]); + btf_16_adds_subs_avx2(&x1[2], &x1[10]); + btf_16_adds_subs_avx2(&x1[3], &x1[11]); + btf_16_adds_subs_avx2(&x1[4], &x1[12]); + btf_16_adds_subs_avx2(&x1[5], &x1[13]); + btf_16_adds_subs_avx2(&x1[6], &x1[14]); + btf_16_adds_subs_avx2(&x1[7], &x1[15]); + + // stage 8 + btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r, cos_bit); + btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r, cos_bit); + btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r, cos_bit); + 
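// The ADST is built from the same butterfly kernels as the DCT; the sign flips in stage 1 and the reversed write-out in stage 9 are what make this a 16-point ADST. + 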
btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r, cos_bit); + btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r, cos_bit); + btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r, cos_bit); + + // stage 9 + output[0] = x1[1]; + output[1] = x1[14]; + output[2] = x1[3]; + output[3] = x1[12]; + output[4] = x1[5]; + output[5] = x1[10]; + output[6] = x1[7]; + output[7] = x1[8]; + output[8] = x1[9]; + output[9] = x1[6]; + output[10] = x1[11]; + output[11] = x1[4]; + output[12] = x1[13]; + output[13] = x1[2]; + output[14] = x1[15]; + output[15] = x1[0]; +} + +static INLINE void fidentity16x16_new_avx2(const __m256i *input, + __m256i *output, int8_t cos_bit) { + (void)cos_bit; + const __m256i one = _mm256_set1_epi16(1); + + for (int i = 0; i < 16; ++i) { + const __m256i a_lo = _mm256_unpacklo_epi16(input[i], one); + const __m256i a_hi = _mm256_unpackhi_epi16(input[i], one); + const __m256i b_lo = scale_round_avx2(a_lo, 2 * NewSqrt2); + const __m256i b_hi = scale_round_avx2(a_hi, 2 * NewSqrt2); + output[i] = _mm256_packs_epi32(b_lo, b_hi); + } +} + +static INLINE void fidentity16x32_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)cos_bit; + for (int i = 0; i < 32; ++i) { + output[i] = _mm256_slli_epi16(input[i], 2); + } +} + +static INLINE void store_output_32bit_w16(int32_t *const out, + const __m256i *const in1, + const __m256i *const in2, + const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + _mm256_store_si256((__m256i *)(out + stride * i), in1[i]); + _mm256_store_si256((__m256i *)(out + stride * i + 8), in2[i]); + } +} + +// Store 16 16-bit values per row, sign extending them to 32-bit values. 
+static INLINE void store_buffer_16bit_to_32bit_w16_avx2(const __m256i *const in, + int32_t *out, + const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + _mm256_store_si256((__m256i *)(out), + _mm256_cvtepi16_epi32(_mm256_castsi256_si128(in[i]))); + _mm256_store_si256( + (__m256i *)(out + 8), + _mm256_cvtepi16_epi32(_mm256_extracti128_si256(in[i], 1))); + out += stride; + } +} + +static INLINE void store_rect_16bit_to_32bit_avx2(const __m256i a, + int32_t *const b) { + const __m256i one = _mm256_set1_epi16(1); + const __m256i a_reorder = _mm256_permute4x64_epi64(a, 0xd8); + const __m256i a_lo = _mm256_unpacklo_epi16(a_reorder, one); + const __m256i a_hi = _mm256_unpackhi_epi16(a_reorder, one); + const __m256i b_lo = scale_round_avx2(a_lo, NewSqrt2); + const __m256i b_hi = scale_round_avx2(a_hi, NewSqrt2); + _mm256_store_si256((__m256i *)b, b_lo); + _mm256_store_si256((__m256i *)(b + 8), b_hi); +} + +static INLINE void store_rect_buffer_16bit_to_32bit_w16_avx2( + const __m256i *const in, int32_t *const out, const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + store_rect_16bit_to_32bit_avx2(in[i], out + i * stride); + } +} + +typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output, + int8_t cos_bit); + +static const transform_1d_avx2 col_txfm16x32_arr[TX_TYPES] = { + fdct16x32_avx2, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + fidentity16x32_avx2, // IDTX + fdct16x32_avx2, // V_DCT + fidentity16x32_avx2, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +static const transform_1d_avx2 row_txfm16x32_arr[TX_TYPES] = { + fdct16x32_avx2, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + fidentity16x32_avx2, // IDTX + fidentity16x32_avx2, // V_DCT + fdct16x32_avx2, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +static const transform_1d_avx2 col_txfm16x16_arr[TX_TYPES] = { + fdct16x16_new_avx2, // DCT_DCT + fadst16x16_new_avx2, // ADST_DCT + fdct16x16_new_avx2, // DCT_ADST + fadst16x16_new_avx2, // ADST_ADST + fadst16x16_new_avx2, // FLIPADST_DCT + fdct16x16_new_avx2, // DCT_FLIPADST + fadst16x16_new_avx2, // FLIPADST_FLIPADST + fadst16x16_new_avx2, // ADST_FLIPADST + fadst16x16_new_avx2, // FLIPADST_ADST + fidentity16x16_new_avx2, // IDTX + fdct16x16_new_avx2, // V_DCT + fidentity16x16_new_avx2, // H_DCT + fadst16x16_new_avx2, // V_ADST + fidentity16x16_new_avx2, // H_ADST + fadst16x16_new_avx2, // V_FLIPADST + fidentity16x16_new_avx2 // H_FLIPADST +}; + +static const transform_1d_avx2 row_txfm16x16_arr[TX_TYPES] = { + fdct16x16_new_avx2, // DCT_DCT + fdct16x16_new_avx2, // ADST_DCT + fadst16x16_new_avx2, // DCT_ADST + fadst16x16_new_avx2, // ADST_ADST + fdct16x16_new_avx2, // FLIPADST_DCT + fadst16x16_new_avx2, // DCT_FLIPADST + fadst16x16_new_avx2, // FLIPADST_FLIPADST + fadst16x16_new_avx2, // ADST_FLIPADST + fadst16x16_new_avx2, // FLIPADST_ADST + fidentity16x16_new_avx2, // IDTX + fidentity16x16_new_avx2, // V_DCT + fdct16x16_new_avx2, // H_DCT + fidentity16x16_new_avx2, // V_ADST + fadst16x16_new_avx2, // H_ADST + fidentity16x16_new_avx2, // V_FLIPADST + fadst16x16_new_avx2 // H_FLIPADST +}; + +static const transform_1d_sse2 
col_txfm8x8_arr[TX_TYPES] = { + fdct8x8_new_sse2, // DCT_DCT + fadst8x8_new_sse2, // ADST_DCT + fdct8x8_new_sse2, // DCT_ADST + fadst8x8_new_sse2, // ADST_ADST + fadst8x8_new_sse2, // FLIPADST_DCT + fdct8x8_new_sse2, // DCT_FLIPADST + fadst8x8_new_sse2, // FLIPADST_FLIPADST + fadst8x8_new_sse2, // ADST_FLIPADST + fadst8x8_new_sse2, // FLIPADST_ADST + fidentity8x8_new_sse2, // IDTX + fdct8x8_new_sse2, // V_DCT + fidentity8x8_new_sse2, // H_DCT + fadst8x8_new_sse2, // V_ADST + fidentity8x8_new_sse2, // H_ADST + fadst8x8_new_sse2, // V_FLIPADST + fidentity8x8_new_sse2, // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm8x8_arr[TX_TYPES] = { + fdct8x8_new_sse2, // DCT_DCT + fdct8x8_new_sse2, // ADST_DCT + fadst8x8_new_sse2, // DCT_ADST + fadst8x8_new_sse2, // ADST_ADST + fdct8x8_new_sse2, // FLIPADST_DCT + fadst8x8_new_sse2, // DCT_FLIPADST + fadst8x8_new_sse2, // FLIPADST_FLIPADST + fadst8x8_new_sse2, // ADST_FLIPADST + fadst8x8_new_sse2, // FLIPADST_ADST + fidentity8x8_new_sse2, // IDTX + fidentity8x8_new_sse2, // V_DCT + fdct8x8_new_sse2, // H_DCT + fidentity8x8_new_sse2, // V_ADST + fadst8x8_new_sse2, // H_ADST + fidentity8x8_new_sse2, // V_FLIPADST + fadst8x8_new_sse2 // H_FLIPADST +}; + +static INLINE void load_buffer_and_round_shift(const int16_t *in, int stride, + __m128i *out, int bit) { + out[0] = _mm_load_si128((const __m128i *)(in + 0 * stride)); + out[1] = _mm_load_si128((const __m128i *)(in + 1 * stride)); + out[2] = _mm_load_si128((const __m128i *)(in + 2 * stride)); + out[3] = _mm_load_si128((const __m128i *)(in + 3 * stride)); + out[4] = _mm_load_si128((const __m128i *)(in + 4 * stride)); + out[5] = _mm_load_si128((const __m128i *)(in + 5 * stride)); + out[6] = _mm_load_si128((const __m128i *)(in + 6 * stride)); + out[7] = _mm_load_si128((const __m128i *)(in + 7 * stride)); + out[0] = _mm_slli_epi16(out[0], bit); + out[1] = _mm_slli_epi16(out[1], bit); + out[2] = _mm_slli_epi16(out[2], bit); + out[3] = _mm_slli_epi16(out[3], bit); + out[4] = _mm_slli_epi16(out[4], bit); + out[5] = _mm_slli_epi16(out[5], bit); + out[6] = _mm_slli_epi16(out[6], bit); + out[7] = _mm_slli_epi16(out[7], bit); +} + +static INLINE void load_buffer_and_flip_round_shift(const int16_t *in, + int stride, __m128i *out, + int bit) { + out[7] = load_16bit_to_16bit(in + 0 * stride); + out[6] = load_16bit_to_16bit(in + 1 * stride); + out[5] = load_16bit_to_16bit(in + 2 * stride); + out[4] = load_16bit_to_16bit(in + 3 * stride); + out[3] = load_16bit_to_16bit(in + 4 * stride); + out[2] = load_16bit_to_16bit(in + 5 * stride); + out[1] = load_16bit_to_16bit(in + 6 * stride); + out[0] = load_16bit_to_16bit(in + 7 * stride); + out[7] = _mm_slli_epi16(out[7], bit); + out[6] = _mm_slli_epi16(out[6], bit); + out[5] = _mm_slli_epi16(out[5], bit); + out[4] = _mm_slli_epi16(out[4], bit); + out[3] = _mm_slli_epi16(out[3], bit); + out[2] = _mm_slli_epi16(out[2], bit); + out[1] = _mm_slli_epi16(out[1], bit); + out[0] = _mm_slli_epi16(out[0], bit); +} + +#define TRANSPOSE_8X8_AVX2() \ + { \ + /* aa0: 00 10 01 11 02 12 03 13 | 40 50 41 51 42 52 43 53*/ \ + /* aa1: 04 14 05 15 06 16 07 17 | 44 54 45 55 46 56 47 57*/ \ + /* aa2: 20 30 21 31 22 32 23 33 | 60 70 61 71 62 72 63 73*/ \ + /* aa3: 24 34 25 35 26 36 27 37 | 64 74 65 75 66 76 67 77*/ \ + const __m256i aa0 = _mm256_unpacklo_epi16(b0, b1); \ + const __m256i aa1 = _mm256_unpackhi_epi16(b0, b1); \ + const __m256i aa2 = _mm256_unpacklo_epi16(b2, b3); \ + const __m256i aa3 = _mm256_unpackhi_epi16(b2, b3); \ + /* Unpack 32 bit elements resulting in: */ \ + /* bb0: 
00 10 20 30 01 11 21 31 | 40 50 60 70 41 51 61 71*/ \
+    /* bb1: 02 12 22 32 03 13 23 33 | 42 52 62 72 43 53 63 73*/ \
+    /* bb2: 04 14 24 34 05 15 25 35 | 44 54 64 74 45 55 65 75*/ \
+    /* bb3: 06 16 26 36 07 17 27 37 | 46 56 66 76 47 57 67 77*/ \
+    const __m256i bb0 = _mm256_unpacklo_epi32(aa0, aa2); \
+    const __m256i bb1 = _mm256_unpackhi_epi32(aa0, aa2); \
+    const __m256i bb2 = _mm256_unpacklo_epi32(aa1, aa3); \
+    const __m256i bb3 = _mm256_unpackhi_epi32(aa1, aa3); \
+    /* c0: 00 10 20 30 40 50 60 70 | 01 11 21 31 41 51 61 71*/ \
+    /* c1: 02 12 22 32 42 52 62 72 | 03 13 23 33 43 53 63 73*/ \
+    /* c2: 04 14 24 34 44 54 64 74 | 05 15 25 35 45 55 65 75*/ \
+    /* c3: 06 16 26 36 46 56 66 76 | 07 17 27 37 47 57 67 77*/ \
+    c0 = _mm256_permute4x64_epi64(bb0, 0xd8); \
+    c1 = _mm256_permute4x64_epi64(bb1, 0xd8); \
+    c2 = _mm256_permute4x64_epi64(bb2, 0xd8); \
+    c3 = _mm256_permute4x64_epi64(bb3, 0xd8); \
+  }
+
+static INLINE void transpose_round_shift_flip_8x8(__m128i *const in,
+                                                  __m128i *const out, int bit) {
+  __m256i c0, c1, c2, c3;
+  bit = -bit;
+  const __m256i rounding = _mm256_set1_epi16(1 << (bit - 1));
+  const __m256i s04 =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(in[0]), in[4], 0x1);
+  const __m256i s15 =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(in[1]), in[5], 0x1);
+  const __m256i s26 =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(in[2]), in[6], 0x1);
+  const __m256i s37 =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(in[3]), in[7], 0x1);
+
+  const __m256i a0 = _mm256_adds_epi16(s04, rounding);
+  const __m256i a1 = _mm256_adds_epi16(s15, rounding);
+  const __m256i a2 = _mm256_adds_epi16(s26, rounding);
+  const __m256i a3 = _mm256_adds_epi16(s37, rounding);
+
+  // b0: 00 01 02 03 04 05 06 07 | 40 41 42 43 44 45 46 47
+  // b1: 10 11 12 13 14 15 16 17 | 50 51 52 53 54 55 56 57
+  // b2: 20 21 22 23 24 25 26 27 | 60 61 62 63 64 65 66 67
+  // b3: 30 31 32 33 34 35 36 37 | 70 71 72 73 74 75 76 77
+  const __m256i b0 = _mm256_srai_epi16(a0, bit);
+  const __m256i b1 = _mm256_srai_epi16(a1, bit);
+  const __m256i b2 = _mm256_srai_epi16(a2, bit);
+  const __m256i b3 = _mm256_srai_epi16(a3, bit);
+
+  TRANSPOSE_8X8_AVX2()
+
+  // Unpack 64 bit elements resulting in:
+  // out[7]: 00 10 20 30 40 50 60 70
+  // out[6]: 01 11 21 31 41 51 61 71
+  // out[5]: 02 12 22 32 42 52 62 72
+  // out[4]: 03 13 23 33 43 53 63 73
+  // out[3]: 04 14 24 34 44 54 64 74
+  // out[2]: 05 15 25 35 45 55 65 75
+  // out[1]: 06 16 26 36 46 56 66 76
+  // out[0]: 07 17 27 37 47 57 67 77
+  out[7] = _mm256_castsi256_si128(c0);
+  out[6] = _mm256_extractf128_si256(c0, 1);
+  out[5] = _mm256_castsi256_si128(c1);
+  out[4] = _mm256_extractf128_si256(c1, 1);
+  out[3] = _mm256_castsi256_si128(c2);
+  out[2] = _mm256_extractf128_si256(c2, 1);
+  out[1] = _mm256_castsi256_si128(c3);
+  out[0] = _mm256_extractf128_si256(c3, 1);
+}
+
+static INLINE void transpose_round_shift_8x8(__m128i *const in,
+                                             __m128i *const out, int bit) {
+  __m256i c0, c1, c2, c3;
+  bit = -bit;
+  const __m256i rounding = _mm256_set1_epi16(1 << (bit - 1));
+  const __m256i s04 =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(in[0]), in[4], 0x1);
+  const __m256i s15 =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(in[1]), in[5], 0x1);
+  const __m256i s26 =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(in[2]), in[6], 0x1);
+  const __m256i s37 =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(in[3]), in[7], 0x1);
+
+  const __m256i a0 = _mm256_adds_epi16(s04, rounding);
+  const __m256i a1 = _mm256_adds_epi16(s15, rounding);
+  const __m256i a2 =
_mm256_adds_epi16(s26, rounding); + const __m256i a3 = _mm256_adds_epi16(s37, rounding); + + // b0: 00 01 02 03 04 05 06 07 | 40 41 42 43 44 45 46 47 + // b1: 10 11 12 13 14 15 16 17 | 50 51 52 53 54 55 56 57 + // b2: 20 21 22 23 24 25 26 27 | 60 61 62 63 64 65 66 67 + // b3: 30 31 32 33 34 35 36 37 | 70 71 72 73 74 75 76 77 + const __m256i b0 = _mm256_srai_epi16(a0, bit); + const __m256i b1 = _mm256_srai_epi16(a1, bit); + const __m256i b2 = _mm256_srai_epi16(a2, bit); + const __m256i b3 = _mm256_srai_epi16(a3, bit); + + TRANSPOSE_8X8_AVX2() + // Unpack 64 bit elements resulting in: + // out[7]: 00 10 20 30 40 50 60 70 + // out[6]: 01 11 21 31 41 51 61 71 + // out[5]: 02 12 22 32 42 52 62 72 + // out[4]: 03 13 23 33 43 53 63 73 + // out[3]: 04 14 24 34 44 54 64 74 + // out[2]: 05 15 25 35 45 55 65 75 + // out[1]: 06 16 26 36 46 56 66 76 + // out[0]: 07 17 27 37 47 57 67 77 + out[0] = _mm256_castsi256_si128(c0); + out[1] = _mm256_extractf128_si256(c0, 1); + out[2] = _mm256_castsi256_si128(c1); + out[3] = _mm256_extractf128_si256(c1, 1); + out[4] = _mm256_castsi256_si128(c2); + out[5] = _mm256_extractf128_si256(c2, 1); + out[6] = _mm256_castsi256_si128(c3); + out[7] = _mm256_extractf128_si256(c3, 1); +} + +static INLINE void store_buffer_16bit_to_32bit_w8_avx2(const __m128i *const in, + int32_t *const out, + const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + _mm256_store_si256((__m256i *)(out + i * stride), + _mm256_cvtepi16_epi32(in[i])); + } +} + +static void av1_lowbd_fwd_txfm2d_8x8_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[8], buf1[8], *buf; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8]; + const int txw_idx = get_txw_idx(TX_8X8); + const int txh_idx = get_txh_idx(TX_8X8); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + // Condition to check shift bit is avoided while round shifting, by assuming + // that shift[0] will always be positive. + assert(shift[0] > 0); + if (ud_flip) + load_buffer_and_flip_round_shift(input, stride, buf0, shift[0]); + else + load_buffer_and_round_shift(input, stride, buf0, shift[0]); + + col_txfm(buf0, buf0, cos_bit_col); + // Condition to check shift bit is avoided while round shifting, by assuming + // that shift[1] will always be negative. + assert(shift[1] < 0); + + if (lr_flip) { + transpose_round_shift_flip_8x8(buf0, buf1, shift[1]); + } else { + transpose_round_shift_8x8(buf0, buf1, shift[1]); + } + + buf = buf1; + row_txfm(buf, buf, cos_bit_row); + + // Round and shift operation is avoided here as the shift bit is assumed to be + // zero always. 
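+  // With shift[2] == 0 the 16-bit row outputs only need to be sign extended
+  // to 32 bits on store; no further rounding pass is required.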
+ assert(shift[2] == 0); + store_buffer_16bit_to_32bit_w8_avx2(buf, output, 8, 8); +} + +static void lowbd_fwd_txfm2d_16x16_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + const TX_SIZE tx_size = TX_16X16; + __m256i buf0[16], buf1[16]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + const int32_t i = 0; + if (ud_flip) { + load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + } + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + transpose_16bit_16x16_avx2(buf0, buf1 + 0 * width + 16 * i); + + __m256i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_avx2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_w16_avx2(buf, width, shift[2]); + store_buffer_16bit_to_32bit_w16_avx2(buf, output + i * 16, height, width); +} + +static void lowbd_fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + const TX_SIZE tx_size = TX_32X32; + __m256i buf0[32], buf1[128]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0, + height); + } else { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + } + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + transpose_16bit_16x16_avx2(buf0 + 0 * 16, buf1 + 0 * width + 16 * i); + transpose_16bit_16x16_avx2(buf0 + 1 * 16, buf1 + 1 * width + 16 * i); + } + + for (int i = 0; i < 2; i++) { + __m256i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_avx2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_w16_avx2(buf, width, shift[2]); + store_buffer_16bit_to_32bit_w16_avx2(buf, output + i * 16, height, width); + } +} + +static void lowbd_fwd_txfm2d_64x64_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_64X64; + __m256i buf0[64], buf1[256]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = 
get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = fdct16x64_new_avx2; + const int width_div16 = (width >> 4); + const int height_div16 = (height >> 4); + + for (int i = 0; i < width_div16; i++) { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + for (int j = 0; j < AOMMIN(2, height_div16); ++j) { + transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i); + } + } + + for (int i = 0; i < AOMMIN(2, height_div16); i++) { + __m256i bufA[64]; + __m256i bufB[64]; + __m128i *buf = (__m128i *)(buf1 + width * i); + for (int j = 0; j < width; ++j) { + bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]); + bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]); + } + fdct64_new_avx2(bufA, bufA, cos_bit_row); + fdct64_new_avx2(bufB, bufB, cos_bit_row); + round_shift_array_32_avx2(bufA, bufA, 32, -shift[2]); + round_shift_array_32_avx2(bufB, bufB, 32, -shift[2]); + store_output_32bit_w16(output + i * 16, bufA, bufB, 32, 32); + } +} + +static void lowbd_fwd_txfm2d_16x32_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + const TX_SIZE tx_size = TX_16X32; + __m256i buf0[32], buf1[32]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + if (ud_flip) { + load_buffer_16bit_to_16bit_flip_avx2(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit_avx2(input, stride, buf0, height); + } + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + transpose_16bit_16x16_avx2(buf0, buf1); + transpose_16bit_16x16_avx2(buf0 + 16, buf1 + 16); + + for (int i = 0; i < 2; i++) { + __m256i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_avx2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_w16_avx2(buf, width, shift[2]); + store_rect_buffer_16bit_to_32bit_w16_avx2(buf, output + i * 16, height, + width); + } +} + +static void lowbd_fwd_txfm2d_32x16_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m256i buf0[32], buf1[64]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16]; + const int txw_idx = get_txw_idx(TX_32X16); + const int txh_idx = get_txh_idx(TX_32X16); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 32; + const int height = 16; + const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; 
i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0, + height); + } else { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + } + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + transpose_16bit_16x16_avx2(buf0, buf1 + 0 * width + 16 * i); + } + + __m256i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_avx2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_w16_avx2(buf, width, shift[2]); + store_rect_buffer_16bit_to_32bit_w16_avx2(buf, output, height, width); +} + +static void lowbd_fwd_txfm2d_64x32_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + const TX_SIZE tx_size = TX_64X32; + __m256i buf0[64], buf1[256]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type]; + const int width_div16 = (width >> 4); + const int height_div16 = (height >> 4); + + for (int i = 0; i < width_div16; i++) { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + for (int j = 0; j < AOMMIN(4, height_div16); ++j) { + transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i); + } + } + assert(tx_type == DCT_DCT); + for (int i = 0; i < AOMMIN(2, height_div16); i++) { + __m256i bufA[64]; + __m256i bufB[64]; + __m128i *buf = (__m128i *)(buf1 + width * i); + for (int j = 0; j < width; ++j) { + bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]); + bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]); + } + fdct64_new_avx2(bufA, bufA, cos_bit_row); + fdct64_new_avx2(bufB, bufB, cos_bit_row); + round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2], NewSqrt2); + round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2], NewSqrt2); + + store_output_32bit_w16(output + i * 16, bufA, bufB, 32, 32); + } +} + +static void lowbd_fwd_txfm2d_32x64_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_32X64; + __m256i buf0[64], buf1[256]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = fdct16x64_new_avx2; + const int width_div16 = (width >> 4); + const int height_div16 = (height >> 4); + + for (int i = 0; i < width_div16; i++) { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + for (int j = 0; j < AOMMIN(2, height_div16); ++j) { + transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i); + } + } + + 
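+  // Row pass: widen each 16-bit row to 32 bits, run the 32-point DCT, then
+  // round-shift by shift[2] and scale by sqrt(2) in Q12 fixed point
+  // (NewSqrt2 / 2^NewSqrt2Bits = 5793 / 4096) to compensate for the 2:1
+  // rectangular transform size.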
for (int i = 0; i < AOMMIN(2, height_div16); i++) { + __m256i bufA[32]; + __m256i bufB[32]; + __m128i *buf = (__m128i *)(buf1 + width * i); + for (int j = 0; j < width; ++j) { + bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]); + bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]); + } + fdct32_avx2(bufA, bufA, cos_bit_row); + fdct32_avx2(bufB, bufB, cos_bit_row); + round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2], NewSqrt2); + round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2], NewSqrt2); + + store_output_32bit_w16(output + i * 16, bufA, bufB, 32, 32); + } +} + +static void lowbd_fwd_txfm2d_16x64_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_16X64; + __m256i buf0[64], buf1[64]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = fdct16x64_new_avx2; + const transform_1d_avx2 row_txfm = fdct16x16_new_avx2; + const int width_div16 = (width >> 4); + const int height_div16 = (height >> 4); + + for (int i = 0; i < width_div16; i++) { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + for (int j = 0; j < height_div16; ++j) { + transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i); + } + } + + for (int i = 0; i < AOMMIN(2, height_div16); i++) { + __m256i *buf = buf1 + width * i; + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_w16_avx2(buf, width, shift[2]); + store_buffer_16bit_to_32bit_w16_avx2(buf, output + width * i, 32, width); + } +} + +static void lowbd_fwd_txfm2d_64x16_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_64X16; + __m256i buf0[64], buf1[64]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = fdct16x16_new_avx2; + const transform_1d_avx2 row_txfm = fdct16x64_new_avx2; + const int width_div16 = (width >> 4); + const int height_div16 = (height >> 4); + + for (int i = 0; i < width_div16; i++) { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + for (int j = 0; j < height_div16; ++j) { + transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i); + } + } + + for (int i = 0; i < height_div16; i++) { + __m256i *buf = buf1 + width * i; + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_w16_avx2(buf, width, shift[2]); + store_buffer_16bit_to_32bit_w16_avx2(buf, output + 16 * i, 16, 32); + } + // Zero out the bottom 16x32 area. 
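+  // A forward 64-point transform keeps only its 32 lowest-frequency outputs,
+  // so the stores above filled just half of the 64x16 coefficient buffer.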
+ memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output)); +} + +static INLINE void btf_16_avx2(__m256i *w0, __m256i *w1, __m256i *in0, + __m256i *in1, __m128i *out0, __m128i *out1, + __m128i *out2, __m128i *out3, + const __m256i *__rounding, int8_t *cos_bit) { + __m256i t0 = _mm256_unpacklo_epi16(*in0, *in1); + __m256i t1 = _mm256_unpackhi_epi16(*in0, *in1); + __m256i u0 = _mm256_madd_epi16(t0, *w0); + __m256i u1 = _mm256_madd_epi16(t1, *w0); + __m256i v0 = _mm256_madd_epi16(t0, *w1); + __m256i v1 = _mm256_madd_epi16(t1, *w1); + + __m256i a0 = _mm256_add_epi32(u0, *__rounding); + __m256i a1 = _mm256_add_epi32(u1, *__rounding); + __m256i b0 = _mm256_add_epi32(v0, *__rounding); + __m256i b1 = _mm256_add_epi32(v1, *__rounding); + + __m256i c0 = _mm256_srai_epi32(a0, *cos_bit); + __m256i c1 = _mm256_srai_epi32(a1, *cos_bit); + __m256i d0 = _mm256_srai_epi32(b0, *cos_bit); + __m256i d1 = _mm256_srai_epi32(b1, *cos_bit); + + __m256i temp0 = _mm256_packs_epi32(c0, c1); + __m256i temp1 = _mm256_packs_epi32(d0, d1); + + *out0 = _mm256_castsi256_si128(temp0); + *out1 = _mm256_castsi256_si128(temp1); + *out2 = _mm256_extracti128_si256(temp0, 0x01); + *out3 = _mm256_extracti128_si256(temp1, 0x01); +} + +static INLINE void fdct8x8_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); + __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + + // stage 1 + __m256i x1[8]; + x1[0] = _mm256_adds_epi16(input[0], input[7]); + x1[7] = _mm256_subs_epi16(input[0], input[7]); + x1[1] = _mm256_adds_epi16(input[1], input[6]); + x1[6] = _mm256_subs_epi16(input[1], input[6]); + x1[2] = _mm256_adds_epi16(input[2], input[5]); + x1[5] = _mm256_subs_epi16(input[2], input[5]); + x1[3] = _mm256_adds_epi16(input[3], input[4]); + x1[4] = _mm256_subs_epi16(input[3], input[4]); + + // stage 2 + __m256i x2[8]; + x2[0] = _mm256_adds_epi16(x1[0], x1[3]); + x2[3] = _mm256_subs_epi16(x1[0], x1[3]); + x2[1] = _mm256_adds_epi16(x1[1], x1[2]); + x2[2] = _mm256_subs_epi16(x1[1], x1[2]); + x2[4] = x1[4]; + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], __rounding, + cos_bit); + x2[5] = x1[5]; + x2[6] = x1[6]; + x2[7] = x1[7]; + + // stage 3 + __m256i x3[8]; + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x2[0], &x2[1], __rounding, + cos_bit); + x3[0] = x2[0]; + x3[1] = x2[1]; + btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x2[2], &x2[3], __rounding, + cos_bit); + x3[2] = x2[2]; + x3[3] = x2[3]; + x3[4] = _mm256_adds_epi16(x2[4], x2[5]); + x3[5] = _mm256_subs_epi16(x2[4], x2[5]); + x3[6] = _mm256_subs_epi16(x2[7], x2[6]); + x3[7] = _mm256_adds_epi16(x2[7], x2[6]); + + // stage 4 + __m256i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x3[4], &x3[7], __rounding, + cos_bit); + x4[4] = x3[4]; + x4[7] = x3[7]; + btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x3[5], &x3[6], 
__rounding, + cos_bit); + x4[5] = x3[5]; + x4[6] = x3[6]; + // stage 5 + output[0] = x4[0]; + output[1] = x4[4]; + output[2] = x4[2]; + output[3] = x4[6]; + output[4] = x4[1]; + output[5] = x4[5]; + output[6] = x4[3]; + output[7] = x4[7]; +} + +static INLINE void fadst8x8_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i __zero = _mm256_setzero_si256(); + const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); + __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); + __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]); + __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]); + __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]); + __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]); + __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]); + __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]); + __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]); + __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]); + __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]); + + // stage 1 + __m256i x1[8]; + x1[0] = input[0]; + x1[1] = _mm256_subs_epi16(__zero, input[7]); + x1[2] = _mm256_subs_epi16(__zero, input[3]); + x1[3] = input[4]; + x1[4] = _mm256_subs_epi16(__zero, input[1]); + x1[5] = input[6]; + x1[6] = input[2]; + x1[7] = _mm256_subs_epi16(__zero, input[5]); + + // stage 2 + __m256i x2[8]; + x2[0] = x1[0]; + x2[1] = x1[1]; + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], __rounding, + cos_bit); + x2[2] = x1[2]; + x2[3] = x1[3]; + x2[4] = x1[4]; + x2[5] = x1[5]; + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], __rounding, + cos_bit); + x2[6] = x1[6]; + x2[7] = x1[7]; + + // stage 3 + __m256i x3[8]; + x3[0] = _mm256_adds_epi16(x2[0], x2[2]); + x3[2] = _mm256_subs_epi16(x2[0], x2[2]); + x3[1] = _mm256_adds_epi16(x2[1], x2[3]); + x3[3] = _mm256_subs_epi16(x2[1], x2[3]); + x3[4] = _mm256_adds_epi16(x2[4], x2[6]); + x3[6] = _mm256_subs_epi16(x2[4], x2[6]); + x3[5] = _mm256_adds_epi16(x2[5], x2[7]); + x3[7] = _mm256_subs_epi16(x2[5], x2[7]); + + // stage 4 + __m256i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x3[4], &x3[5], __rounding, + cos_bit); + x4[4] = x3[4]; + x4[5] = x3[5]; + btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x3[6], &x3[7], __rounding, + cos_bit); + x4[6] = x3[6]; + x4[7] = x3[7]; + + // stage 5 + __m256i x5[8]; + x5[0] = _mm256_adds_epi16(x4[0], x4[4]); + x5[4] = _mm256_subs_epi16(x4[0], x4[4]); + x5[1] = _mm256_adds_epi16(x4[1], x4[5]); + x5[5] = _mm256_subs_epi16(x4[1], x4[5]); + x5[2] = _mm256_adds_epi16(x4[2], x4[6]); + x5[6] = _mm256_subs_epi16(x4[2], x4[6]); + x5[3] = _mm256_adds_epi16(x4[3], x4[7]); + x5[7] = _mm256_subs_epi16(x4[3], x4[7]); + + // stage 6 + __m256i x6[8]; + btf_16_w16_avx2(cospi_p04_p60, cospi_p60_m04, &x5[0], &x5[1], __rounding, + cos_bit); + x6[0] = x5[0]; + x6[1] = x5[1]; + btf_16_w16_avx2(cospi_p20_p44, cospi_p44_m20, &x5[2], &x5[3], __rounding, + cos_bit); + x6[2] = x5[2]; + x6[3] = x5[3]; + btf_16_w16_avx2(cospi_p36_p28, cospi_p28_m36, &x5[4], &x5[5], __rounding, + cos_bit); + x6[4] = x5[4]; + x6[5] = x5[5]; + 
btf_16_w16_avx2(cospi_p52_p12, cospi_p12_m52, &x5[6], &x5[7], __rounding, + cos_bit); + x6[6] = x5[6]; + x6[7] = x5[7]; + + // stage 7 + output[0] = x6[1]; + output[1] = x6[6]; + output[2] = x6[3]; + output[3] = x6[4]; + output[4] = x6[5]; + output[5] = x6[2]; + output[6] = x6[7]; + output[7] = x6[0]; +} + +static INLINE void fidentity8x8_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)cos_bit; + + output[0] = _mm256_adds_epi16(input[0], input[0]); + output[1] = _mm256_adds_epi16(input[1], input[1]); + output[2] = _mm256_adds_epi16(input[2], input[2]); + output[3] = _mm256_adds_epi16(input[3], input[3]); + output[4] = _mm256_adds_epi16(input[4], input[4]); + output[5] = _mm256_adds_epi16(input[5], input[5]); + output[6] = _mm256_adds_epi16(input[6], input[6]); + output[7] = _mm256_adds_epi16(input[7], input[7]); +} + +static INLINE void fdct8x16_new_avx2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i __rounding_256 = _mm256_set1_epi32(1 << (cos_bit - 1)); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + __m128i temp0, temp1, temp2, temp3; + __m256i in0, in1; + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); + __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); + __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]); + __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); + __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); + __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); + __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]); + __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); + + __m256i cospi_arr[12]; + + cospi_arr[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m32_p32), + cospi_m32_p32, 0x1); + cospi_arr[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32), + cospi_p32_p32, 0x1); + cospi_arr[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32), + cospi_p48_p16, 0x1); + cospi_arr[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32), + cospi_m16_p48, 0x1); + cospi_arr[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m16_p48), + cospi_m48_m16, 0x1); + cospi_arr[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_p16), + cospi_m16_p48, 0x1); + cospi_arr[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p56_p08), + cospi_p24_p40, 0x1); + cospi_arr[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m08_p56), + cospi_m40_p24, 0x1); + cospi_arr[8] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p60_p04), + cospi_p28_p36, 0x1); + cospi_arr[9] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m04_p60), + cospi_m36_p28, 0x1); + cospi_arr[10] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p44_p20), + cospi_p12_p52, 0x1); + cospi_arr[11] = 
_mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m20_p44), + cospi_m52_p12, 0x1); + + __m256i x[8]; + x[0] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[0]), input[1], 0x1); + x[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[15]), input[14], + 0x1); + x[2] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[2]), input[3], 0x1); + x[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[13]), input[12], + 0x1); + x[4] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[5]), input[4], 0x1); + x[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[10]), input[11], + 0x1); + x[6] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[7]), input[6], 0x1); + x[7] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[8]), input[9], 0x1); + + // stage 1 + __m256i x1[8]; + x1[0] = _mm256_adds_epi16(x[0], x[1]); + x1[7] = _mm256_subs_epi16(x[0], x[1]); + x1[1] = _mm256_adds_epi16(x[2], x[3]); + x1[6] = _mm256_subs_epi16(x[2], x[3]); + x1[2] = _mm256_adds_epi16(x[4], x[5]); + x1[5] = _mm256_subs_epi16(x[4], x[5]); + x1[3] = _mm256_adds_epi16(x[6], x[7]); + x1[4] = _mm256_subs_epi16(x[6], x[7]); + + // stage 2 + __m256i x2[8]; + x2[0] = _mm256_adds_epi16(x1[0], x1[3]); + x2[7] = _mm256_subs_epi16(x1[0], x1[3]); + x2[1] = _mm256_adds_epi16(x1[1], x1[2]); + x2[6] = _mm256_subs_epi16(x1[1], x1[2]); + x2[2] = x1[4]; + x2[3] = x1[7]; + btf_16_avx2(&cospi_arr[0], &cospi_arr[1], &x1[5], &x1[6], &temp0, &temp1, + &temp2, &temp3, &__rounding_256, &cos_bit); + x2[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp0, 0x1); + x2[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp3), temp1, 0x1); + + // stage 3 + __m256i x3[8]; + x2[1] = _mm256_permute4x64_epi64(x2[1], 0x4e); + x3[0] = _mm256_adds_epi16(x2[0], x2[1]); + x3[1] = _mm256_subs_epi16(x2[0], x2[1]); + x3[2] = _mm256_blend_epi32(x2[7], x2[6], 0xf0); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, _mm256_castsi256_si128(x2[6]), + _mm256_extractf128_si256(x2[7], 0x01), temp0, temp1); + x3[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp1), temp0, 0x1); + x3[3] = _mm256_adds_epi16(x2[2], x2[4]); + x3[4] = _mm256_subs_epi16(x2[2], x2[4]); + x3[5] = _mm256_adds_epi16(x2[3], x2[5]); + x3[6] = _mm256_subs_epi16(x2[3], x2[5]); + + // stage 4 + __m256i x4[8]; + x4[0] = _mm256_blend_epi32(x3[0], x3[1], 0xf0); + x4[1] = _mm256_permute2f128_si256(x3[0], x3[1], 0x21); + btf_16_avx2(&cospi_arr[2], &cospi_arr[3], &x4[0], &x4[1], &output[0], + &output[8], &output[4], &output[12], &__rounding_256, &cos_bit); + x4[2] = _mm256_adds_epi16(x3[2], x3[7]); + x4[3] = _mm256_subs_epi16(x3[2], x3[7]); + x4[4] = _mm256_permute2f128_si256(x3[3], x3[4], 0x20); + x4[5] = _mm256_permute2f128_si256(x3[6], x3[5], 0x20); + in0 = _mm256_permute2f128_si256(x3[3], x3[4], 0x31); + in1 = _mm256_permute2f128_si256(x3[5], x3[6], 0x31); + btf_16_avx2(&cospi_arr[4], &cospi_arr[5], &in0, &in1, &temp0, &temp1, &temp2, + &temp3, &__rounding_256, &cos_bit); + + x4[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp2, 0x1); + x4[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp3), temp1, 0x1); + + // stage 5 + __m256i x5[4]; + in0 = _mm256_permute2f128_si256(x4[2], x4[3], 0x31); + in1 = _mm256_permute2f128_si256(x4[2], x4[3], 0x20); + btf_16_avx2(&cospi_arr[6], &cospi_arr[7], &in0, &in1, &output[2], &output[14], + &output[10], &output[6], &__rounding_256, &cos_bit); + x5[0] = _mm256_adds_epi16(x4[4], x4[6]); + x5[1] = _mm256_subs_epi16(x4[4], x4[6]); + x5[2] = _mm256_adds_epi16(x4[5], x4[7]); + x5[3] = 
_mm256_subs_epi16(x4[5], x4[7]); + + // stage 6 + in0 = _mm256_permute2f128_si256(x5[0], x5[1], 0x20); + in1 = _mm256_permute2f128_si256(x5[2], x5[3], 0x31); + btf_16_avx2(&cospi_arr[8], &cospi_arr[9], &in0, &in1, &output[1], &output[15], + &output[9], &output[7], &__rounding_256, &cos_bit); + in0 = _mm256_permute2f128_si256(x5[1], x5[0], 0x31); + in1 = _mm256_permute2f128_si256(x5[3], x5[2], 0x20); + btf_16_avx2(&cospi_arr[10], &cospi_arr[11], &in0, &in1, &output[5], + &output[11], &output[13], &output[3], &__rounding_256, &cos_bit); +} + +static INLINE void fadst8x16_new_avx2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i __zero = _mm256_setzero_si256(); + const __m256i __rounding_256 = _mm256_set1_epi32(1 << (cos_bit - 1)); + __m256i in0, in1; + __m128i temp0, temp1, temp2, temp3; + + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]); + __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]); + __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]); + __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]); + __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]); + __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]); + __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]); + __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]); + __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]); + __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]); + __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]); + __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]); + __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]); + __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]); + __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]); + __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]); + __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]); + __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]); + + __m256i cospi_arr[20]; + + cospi_arr[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32), + cospi_p32_p32, 0x1); + cospi_arr[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32), + cospi_p32_m32, 0x1); + cospi_arr[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32), + cospi_p32_p32, 0x1); + cospi_arr[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32), + cospi_p32_m32, 0x1); + cospi_arr[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p16_p48), + cospi_m48_p16, 0x1); + cospi_arr[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_m16), + cospi_p16_p48, 0x1); + cospi_arr[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p16_p48), + cospi_m48_p16, 0x1); + cospi_arr[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_m16), + cospi_p16_p48, 0x1); + cospi_arr[8] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p08_p56), + 
cospi_p40_p24, 0x1); + cospi_arr[9] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p56_m08), + cospi_p24_m40, 0x1); + cospi_arr[10] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m56_p08), + cospi_m24_p40, 0x1); + cospi_arr[11] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p08_p56), + cospi_p40_p24, 0x1); + cospi_arr[12] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p02_p62), + cospi_p10_p54, 0x1); + cospi_arr[13] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p62_m02), + cospi_p54_m10, 0x1); + cospi_arr[14] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p18_p46), + cospi_p26_p38, 0x1); + cospi_arr[15] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p46_m18), + cospi_p38_m26, 0x1); + cospi_arr[16] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p34_p30), + cospi_p42_p22, 0x1); + cospi_arr[17] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p30_m34), + cospi_p22_m42, 0x1); + cospi_arr[18] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p50_p14), + cospi_p58_p06, 0x1); + cospi_arr[19] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p14_m50), + cospi_p06_m58, 0x1); + + __m256i x[8]; + x[0] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[0]), input[4], 0x1); + x[1] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[2]), input[6], 0x1); + x[2] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[8]), input[12], 0x1); + x[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[10]), input[14], + 0x1); + x[4] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[1]), input[9], 0x1); + x[5] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[3]), input[11], 0x1); + x[6] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[5]), input[13], 0x1); + x[7] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[7]), input[15], 0x1); + + // stage 1 + __m256i x1[8]; + x1[0] = x[0]; + x1[1] = _mm256_subs_epi16(__zero, x[7]); + x1[2] = x[2]; + x1[3] = _mm256_subs_epi16(__zero, x[5]); + x1[4] = _mm256_subs_epi16(__zero, x[4]); + x1[5] = x[3]; + x1[6] = _mm256_subs_epi16(__zero, x[6]); + x1[7] = x[1]; + + // stage 2 + __m256i x2[8]; + x2[0] = _mm256_blend_epi32(x1[0], x1[1], 0xf0); + x2[3] = _mm256_blend_epi32(x1[3], x1[2], 0xf0); + x2[4] = _mm256_blend_epi32(x1[4], x1[5], 0xf0); + x2[7] = _mm256_blend_epi32(x1[7], x1[6], 0xf0); + in0 = _mm256_blend_epi32(x1[1], x1[0], 0xf0); + in1 = _mm256_blend_epi32(x1[2], x1[3], 0xf0); + btf_16_avx2(&cospi_arr[0], &cospi_arr[1], &in0, &in1, &temp0, &temp1, &temp2, + &temp3, &__rounding_256, &cos_bit); + x2[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1); + x2[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1); + in0 = _mm256_permute2f128_si256(x1[7], x1[6], 0x21); + in1 = _mm256_permute2f128_si256(x1[4], x1[5], 0x21); + btf_16_avx2(&cospi_arr[2], &cospi_arr[3], &in0, &in1, &temp0, &temp1, &temp2, + &temp3, &__rounding_256, &cos_bit); + x2[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1); + x2[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1); + + // stage 3 + __m256i x3[8]; + x3[0] = _mm256_adds_epi16(x2[0], x2[1]); + x3[1] = _mm256_subs_epi16(x2[0], x2[1]); + x3[2] = _mm256_adds_epi16(x2[3], x2[2]); + x3[3] = _mm256_subs_epi16(x2[3], x2[2]); + x3[4] = _mm256_adds_epi16(x2[4], x2[5]); + x3[5] = _mm256_subs_epi16(x2[4], x2[5]); + x3[6] = _mm256_adds_epi16(x2[7], x2[6]); + x3[7] = _mm256_subs_epi16(x2[7], x2[6]); + + // stage 4 + __m256i x4[8]; + x4[0] = x3[0]; + x4[1] 
= x3[1]; + x4[4] = x3[4]; + x4[5] = x3[5]; + in0 = _mm256_permute2f128_si256(x3[2], x3[3], 0x20); + in1 = _mm256_permute2f128_si256(x3[2], x3[3], 0x31); + btf_16_avx2(&cospi_arr[4], &cospi_arr[5], &in0, &in1, &temp0, &temp1, &temp2, + &temp3, &__rounding_256, &cos_bit); + x4[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1); + x4[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1); + in0 = _mm256_permute2f128_si256(x3[6], x3[7], 0x20); + in1 = _mm256_permute2f128_si256(x3[6], x3[7], 0x31); + btf_16_avx2(&cospi_arr[6], &cospi_arr[7], &in0, &in1, &temp0, &temp1, &temp2, + &temp3, &__rounding_256, &cos_bit); + x4[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1); + x4[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1); + + // stage 5 + __m256i x5[8]; + x5[0] = _mm256_adds_epi16(x4[0], x4[2]); + x5[1] = _mm256_subs_epi16(x4[0], x4[2]); + x5[2] = _mm256_adds_epi16(x4[1], x4[3]); + x5[3] = _mm256_subs_epi16(x4[1], x4[3]); + x5[4] = _mm256_adds_epi16(x4[4], x4[6]); + x5[5] = _mm256_subs_epi16(x4[4], x4[6]); + x5[6] = _mm256_adds_epi16(x4[5], x4[7]); + x5[7] = _mm256_subs_epi16(x4[5], x4[7]); + + // stage 6 + __m256i x6[8]; + x6[0] = x5[0]; + x6[1] = x5[2]; + x6[2] = x5[1]; + x6[3] = x5[3]; + in0 = _mm256_permute2f128_si256(x5[4], x5[6], 0x20); + in1 = _mm256_permute2f128_si256(x5[4], x5[6], 0x31); + btf_16_avx2(&cospi_arr[8], &cospi_arr[9], &in0, &in1, &temp0, &temp1, &temp2, + &temp3, &__rounding_256, &cos_bit); + x6[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1); + x6[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1); + in0 = _mm256_permute2f128_si256(x5[5], x5[7], 0x20); + in1 = _mm256_permute2f128_si256(x5[5], x5[7], 0x31); + btf_16_avx2(&cospi_arr[10], &cospi_arr[11], &in0, &in1, &temp0, &temp1, + &temp2, &temp3, &__rounding_256, &cos_bit); + x6[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1); + x6[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1); + + // stage 7 + __m256i x7[8]; + x7[0] = _mm256_adds_epi16(x6[0], x6[4]); + x7[1] = _mm256_subs_epi16(x6[0], x6[4]); + x7[2] = _mm256_adds_epi16(x6[1], x6[5]); + x7[3] = _mm256_subs_epi16(x6[1], x6[5]); + x7[4] = _mm256_adds_epi16(x6[2], x6[6]); + x7[5] = _mm256_subs_epi16(x6[2], x6[6]); + x7[6] = _mm256_adds_epi16(x6[3], x6[7]); + x7[7] = _mm256_subs_epi16(x6[3], x6[7]); + + // stage 8 + in0 = _mm256_permute2f128_si256(x7[0], x7[2], 0x20); + in1 = _mm256_permute2f128_si256(x7[0], x7[2], 0x31); + btf_16_avx2(&cospi_arr[12], &cospi_arr[13], &in0, &in1, &output[15], + &output[0], &output[13], &output[2], &__rounding_256, &cos_bit); + in0 = _mm256_permute2f128_si256(x7[4], x7[6], 0x20); + in1 = _mm256_permute2f128_si256(x7[4], x7[6], 0x31); + btf_16_avx2(&cospi_arr[14], &cospi_arr[15], &in0, &in1, &output[11], + &output[4], &output[9], &output[6], &__rounding_256, &cos_bit); + in0 = _mm256_permute2f128_si256(x7[1], x7[3], 0x20); + in1 = _mm256_permute2f128_si256(x7[1], x7[3], 0x31); + btf_16_avx2(&cospi_arr[16], &cospi_arr[17], &in0, &in1, &output[7], + &output[8], &output[5], &output[10], &__rounding_256, &cos_bit); + in0 = _mm256_permute2f128_si256(x7[5], x7[7], 0x20); + in1 = _mm256_permute2f128_si256(x7[5], x7[7], 0x31); + btf_16_avx2(&cospi_arr[18], &cospi_arr[19], &in0, &in1, &output[3], + &output[12], &output[1], &output[14], &__rounding_256, &cos_bit); +} + +static INLINE void fidentity8x16_new_avx2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + 
(void)cos_bit; + const __m256i one = _mm256_set1_epi16(1); + __m256i temp; + for (int i = 0; i < 16; i += 2) { + temp = _mm256_insertf128_si256(_mm256_castsi128_si256(input[i]), + input[i + 1], 0x1); + const __m256i a_lo = _mm256_unpacklo_epi16(temp, one); + const __m256i a_hi = _mm256_unpackhi_epi16(temp, one); + const __m256i b_lo = scale_round_avx2(a_lo, 2 * NewSqrt2); + const __m256i b_hi = scale_round_avx2(a_hi, 2 * NewSqrt2); + temp = _mm256_packs_epi32(b_lo, b_hi); + output[i] = _mm256_castsi256_si128(temp); + output[i + 1] = _mm256_extractf128_si256(temp, 0x1); + } +} + +static const transform_1d_avx2 row_txfm8x16_arr[TX_TYPES] = { + fdct8x8_new_avx2, // DCT_DCT + fdct8x8_new_avx2, // ADST_DCT + fadst8x8_new_avx2, // DCT_ADST + fadst8x8_new_avx2, // ADST_ADST + fdct8x8_new_avx2, // FLIPADST_DCT + fadst8x8_new_avx2, // DCT_FLIPADST + fadst8x8_new_avx2, // FLIPADST_FLIPADST + fadst8x8_new_avx2, // ADST_FLIPADST + fadst8x8_new_avx2, // FLIPADST_ADST + fidentity8x8_new_avx2, // IDTX + fidentity8x8_new_avx2, // V_DCT + fdct8x8_new_avx2, // H_DCT + fidentity8x8_new_avx2, // V_ADST + fadst8x8_new_avx2, // H_ADST + fidentity8x8_new_avx2, // V_FLIPADST + fadst8x8_new_avx2 // H_FLIPADST +}; + +static const transform_1d_sse2 col_txfm8x16_arr[TX_TYPES] = { + fdct8x16_new_avx2, // DCT_DCT + fadst8x16_new_avx2, // ADST_DCT + fdct8x16_new_avx2, // DCT_ADST + fadst8x16_new_avx2, // ADST_ADST + fadst8x16_new_avx2, // FLIPADST_DCT + fdct8x16_new_avx2, // DCT_FLIPADST + fadst8x16_new_avx2, // FLIPADST_FLIPADST + fadst8x16_new_avx2, // ADST_FLIPADST + fadst8x16_new_avx2, // FLIPADST_ADST + fidentity8x16_new_avx2, // IDTX + fdct8x16_new_avx2, // V_DCT + fidentity8x16_new_avx2, // H_DCT + fadst8x16_new_avx2, // V_ADST + fidentity8x16_new_avx2, // H_ADST + fadst8x16_new_avx2, // V_FLIPADST + fidentity8x16_new_avx2 // H_FLIPADST +}; + +static const transform_1d_avx2 col_txfm16x8_arr[TX_TYPES] = { + fdct8x8_new_avx2, // DCT_DCT + fadst8x8_new_avx2, // ADST_DCT + fdct8x8_new_avx2, // DCT_ADST + fadst8x8_new_avx2, // ADST_ADST + fadst8x8_new_avx2, // FLIPADST_DCT + fdct8x8_new_avx2, // DCT_FLIPADST + fadst8x8_new_avx2, // FLIPADST_FLIPADST + fadst8x8_new_avx2, // ADST_FLIPADST + fadst8x8_new_avx2, // FLIPADST_ADST + fidentity8x8_new_avx2, // IDTX + fdct8x8_new_avx2, // V_DCT + fidentity8x8_new_avx2, // H_DCT + fadst8x8_new_avx2, // V_ADST + fidentity8x8_new_avx2, // H_ADST + fadst8x8_new_avx2, // V_FLIPADST + fidentity8x8_new_avx2, // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm16x8_arr[TX_TYPES] = { + fdct8x16_new_avx2, // DCT_DCT + fdct8x16_new_avx2, // ADST_DCT + fadst8x16_new_avx2, // DCT_ADST + fadst8x16_new_avx2, // ADST_ADST + fdct8x16_new_avx2, // FLIPADST_DCT + fadst8x16_new_avx2, // DCT_FLIPADST + fadst8x16_new_avx2, // FLIPADST_FLIPADST + fadst8x16_new_avx2, // ADST_FLIPADST + fadst8x16_new_avx2, // FLIPADST_ADST + fidentity8x16_new_avx2, // IDTX + fidentity8x16_new_avx2, // V_DCT + fdct8x16_new_avx2, // H_DCT + fidentity8x16_new_avx2, // V_ADST + fadst8x16_new_avx2, // H_ADST + fidentity8x16_new_avx2, // V_FLIPADST + fadst8x16_new_avx2 // H_FLIPADST +}; + +static void lowbd_fwd_txfm2d_8x16_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[16]; + __m256i buf2[8]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16]; + const int txw_idx = get_txw_idx(TX_8X16); + const int txh_idx = get_txh_idx(TX_8X16); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = 
av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 8; + const int height = 16; + const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_txfm8x16_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1); + transpose_16bit_8x8(buf0 + 8, buf1 + 8); + + __m128i *bufl, *bufu; + if (lr_flip) { + bufl = buf0; + bufu = buf0 + 8; + flip_buf_sse2(buf1 + width * 0, bufl, width); + flip_buf_sse2(buf1 + width * 1, bufu, width); + } else { + bufl = buf1 + width * 0; + bufu = buf1 + width * 1; + } + pack_reg(bufl, bufu, buf2); + row_txfm(buf2, buf2, cos_bit_row); + round_shift_16bit_w16_avx2(buf2, width, shift[2]); + store_rect_buffer_16bit_to_32bit_w16_avx2(buf2, output, height, width); +} + +static void lowbd_fwd_txfm2d_16x8_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[16]; + __m256i buf2[8]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8]; + const int txw_idx = get_txw_idx(TX_16X8); + const int txh_idx = get_txh_idx(TX_16X8); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 16; + const int height = 8; + const transform_1d_avx2 col_txfm = col_txfm16x8_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm16x8_arr[tx_type]; + __m128i *buf; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * 0, stride, buf0, height); + load_buffer_16bit_to_16bit_flip(input + 8 * 1, stride, &buf0[8], height); + } else { + load_buffer_16bit_to_16bit(input + 8 * 0, stride, buf0, height); + load_buffer_16bit_to_16bit(input + 8 * 1, stride, &buf0[8], height); + } + pack_reg(buf0, &buf0[8], buf2); + round_shift_16bit_w16_avx2(buf2, height, shift[0]); + col_txfm(buf2, buf2, cos_bit_col); + round_shift_16bit_w16_avx2(buf2, height, shift[1]); + transpose_16bit_16x8_avx2(buf2, buf2); + extract_reg(buf2, buf1); + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + store_rect_buffer_16bit_to_32bit_w8(buf, output, height, width); +} + +static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = { + av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform + av1_lowbd_fwd_txfm2d_8x8_avx2, // 8x8 transform + lowbd_fwd_txfm2d_16x16_avx2, // 16x16 transform + lowbd_fwd_txfm2d_32x32_avx2, // 32x32 transform + lowbd_fwd_txfm2d_64x64_avx2, // 64x64 transform + av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform + av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform + lowbd_fwd_txfm2d_8x16_avx2, // 8x16 transform + lowbd_fwd_txfm2d_16x8_avx2, // 16x8 transform + lowbd_fwd_txfm2d_16x32_avx2, // 16x32 transform + lowbd_fwd_txfm2d_32x16_avx2, // 32x16 transform + lowbd_fwd_txfm2d_32x64_avx2, // 32x64 transform + lowbd_fwd_txfm2d_64x32_avx2, // 64x32 transform + av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform + av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform + av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform + av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform + 
lowbd_fwd_txfm2d_16x64_avx2, // 16x64 transform + lowbd_fwd_txfm2d_64x16_avx2, // 64x16 transform +}; + +void av1_lowbd_fwd_txfm_avx2(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size]; + if (txfm_param->lossless && txfm_param->tx_size == TX_4X4) { + av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param); + } else { + fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); + } +} diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c new file mode 100644 index 0000000000..825da8d7b4 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c @@ -0,0 +1,336 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/av1_rtcd.h" + +#include "av1/common/enums.h" +#include "av1/common/av1_txfm.h" +#include "av1/common/x86/av1_txfm_sse2.h" +#include "av1/common/x86/highbd_txfm_utility_sse4.h" +#include "av1/encoder/av1_fwd_txfm1d_cfg.h" +#include "av1/encoder/x86/av1_txfm1d_sse4.h" +#include "av1/encoder/x86/av1_fwd_txfm_sse2.h" + +static INLINE void int16_array_with_stride_to_int32_array_without_stride( + const int16_t *input, int stride, int32_t *output, int txfm1d_size) { + int r, c; + for (r = 0; r < txfm1d_size; r++) { + for (c = 0; c < txfm1d_size; c++) { + output[r * txfm1d_size + c] = (int32_t)input[r * stride + c]; + } + } +} + +static INLINE void store_output_32bit_w8(int32_t *const out, + const __m128i *const in1, + const __m128i *const in2, + const int stride, const int out_size) { + for (int i = 0; i < out_size; ++i) { + _mm_store_si128((__m128i *)(out + stride * i), in1[i]); + _mm_store_si128((__m128i *)(out + stride * i + 4), in2[i]); + } +} + +typedef void (*TxfmFuncSSE2)(__m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); + +static void fdct32_sse4_1(__m128i *input, __m128i *output, const int8_t cos_bit, + const int8_t *stage_range) { + const int txfm_size = 32; + const int num_per_128 = 4; + int col_num = txfm_size / num_per_128; + int col; + (void)stage_range; + for (col = 0; col < col_num; col++) { + av1_fdct32_sse4_1((input + col), (output + col), cos_bit, col_num); + } +} + +static void fdct64_new_sse4_1(__m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range) { + const int txfm_size = 64; + const int num_per_128 = 4; + int col_num = txfm_size / num_per_128; + (void)stage_range; + for (int col = 0; col < col_num; col++) { + av1_fdct64_sse4_1((input + col), (output + col), cos_bit, col_num, col_num); + } +} +static void idtx32x32_sse4_1(__m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range) { + (void)stage_range; + + for (int i = 0; i < 8; i++) { + av1_idtx32_sse4_1(&input[i * 32], &output[i * 32], cos_bit, 1); + } +} + +static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) { + switch (txfm_type) { + case TXFM_TYPE_DCT32: return fdct32_sse4_1; + case 
TXFM_TYPE_DCT64: return fdct64_new_sse4_1; + case TXFM_TYPE_IDENTITY32: return idtx32x32_sse4_1; + default: assert(0); + } + return NULL; +} + +static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output, + const int stride, + const TXFM_2D_FLIP_CFG *cfg, + int32_t *txfm_buf) { + // TODO(sarahparker) This does not currently support rectangular transforms + // and will break without splitting txfm_size out into row and col size. + // Rectangular transforms use c code only, so it should be ok for now. + // It will be corrected when there are sse implementations for rectangular + // transforms. + assert(cfg->tx_size < TX_SIZES); + const int txfm_size = tx_size_wide[cfg->tx_size]; + const int8_t *shift = cfg->shift; + const int8_t *stage_range_col = cfg->stage_range_col; + const int8_t *stage_range_row = cfg->stage_range_row; + const int8_t cos_bit_col = cfg->cos_bit_col; + const int8_t cos_bit_row = cfg->cos_bit_row; + const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col); + const TxfmFuncSSE2 txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row); + + __m128i *buf_128 = (__m128i *)txfm_buf; + __m128i *out_128 = (__m128i *)output; + int num_per_128 = 4; + int txfm2d_size_128 = txfm_size * txfm_size / num_per_128; + + int16_array_with_stride_to_int32_array_without_stride(input, stride, txfm_buf, + txfm_size); + av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[0]); + txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col); + av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]); + transpose_32(txfm_size, out_128, buf_128); + txfm_func_row(buf_128, out_128, cos_bit_row, stage_range_row); + av1_round_shift_array_32_sse4_1(out_128, out_128, txfm2d_size_128, -shift[2]); +} + +static INLINE void fwd_txfm2d_64x64_sse4_1(const int16_t *input, + int32_t *output, const int stride, + const TXFM_2D_FLIP_CFG *cfg, + int32_t *txfm_buf) { + assert(cfg->tx_size < TX_SIZES); + const int txfm_size = tx_size_wide[cfg->tx_size]; + const int8_t *shift = cfg->shift; + const int8_t *stage_range_col = cfg->stage_range_col; + const int8_t cos_bit_col = cfg->cos_bit_col; + const int8_t cos_bit_row = cfg->cos_bit_row; + const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col); + __m128i *buf_128 = (__m128i *)txfm_buf; + __m128i *out_128 = (__m128i *)output; + + const int num_per_128 = 4; + int txfm2d_size_128 = txfm_size * txfm_size / num_per_128; + int col_num = txfm_size / num_per_128; + + int16_array_with_stride_to_int32_array_without_stride(input, stride, output, + txfm_size); + /*col wise transform*/ + txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col); + av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]); + transpose_32(txfm_size, out_128, buf_128); + + /*row wise transform*/ + for (int col = 0; col < (col_num >> 1); col++) { + av1_fdct64_sse4_1((buf_128 + col), (out_128 + col), cos_bit_row, col_num, + (col_num >> 1)); + } + + txfm2d_size_128 = (col_num >> 1) * (txfm_size >> 1); + av1_round_shift_array_32_sse4_1(out_128, out_128, txfm2d_size_128, -shift[2]); +} + +void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(16, int32_t, txfm_buf[1024]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_32X32, &cfg); + (void)bd; + fwd_txfm2d_sse4_1(input, output, stride, &cfg, txfm_buf); +} + +void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output, + int stride, TX_TYPE 
tx_type, int bd) { + DECLARE_ALIGNED(16, int32_t, txfm_buf[4096]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_64X64, &cfg); + (void)bd; + fwd_txfm2d_64x64_sse4_1(input, output, stride, &cfg, txfm_buf); +} + +static void lowbd_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_64X64; + __m128i buf0[64], buf1[512]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_sse2 col_txfm = av1_fdct8x64_new_sse2; + const int width_div8 = (width >> 3); + const int height_div8 = (height >> 3); + + for (int i = 0; i < width_div8; i++) { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + for (int j = 0; j < AOMMIN(4, height_div8); ++j) { + transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); + } + } + for (int i = 0; i < AOMMIN(4, height_div8); i++) { + __m128i bufA[64]; + __m128i bufB[64]; + __m128i *buf = buf1 + width * i; + for (int j = 0; j < width; ++j) { + bufA[j] = _mm_cvtepi16_epi32(buf[j]); + bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j])); + } + av1_fdct64_sse4_1(bufA, bufA, cos_bit_row, 1, 1); + av1_fdct64_sse4_1(bufB, bufB, cos_bit_row, 1, 1); + av1_round_shift_array_32_sse4_1(bufA, bufA, 32, -shift[2]); + av1_round_shift_array_32_sse4_1(bufB, bufB, 32, -shift[2]); + + store_output_32bit_w8(output + i * 8, bufA, bufB, 32, 32); + } +} + +static void lowbd_fwd_txfm2d_64x32_sse4_1(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + const TX_SIZE tx_size = TX_64X32; + __m128i buf0[64], buf1[256]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type]; + const int width_div8 = (width >> 3); + const int height_div8 = (height >> 3); + + for (int i = 0; i < width_div8; i++) { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + for (int j = 0; j < AOMMIN(4, height_div8); ++j) { + transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); + } + } + assert(tx_type == DCT_DCT); + for (int i = 0; i < AOMMIN(4, height_div8); i++) { + __m128i bufA[64]; + __m128i bufB[64]; + __m128i *buf = buf1 + width * i; + for (int j = 0; j < width; ++j) { + bufA[j] = _mm_cvtepi16_epi32(buf[j]); + bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j])); + } + av1_fdct64_sse4_1(bufA, bufA, cos_bit_row, 1, 1); + av1_fdct64_sse4_1(bufB, bufB, cos_bit_row, 1, 1); + av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2); + av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2); + + 
store_output_32bit_w8(output + i * 8, bufA, bufB, 32, 32); + } +} + +static void lowbd_fwd_txfm2d_32x64_sse4_1(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_32X64; + __m128i buf0[64], buf1[256]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_sse2 col_txfm = av1_fdct8x64_new_sse2; + const int width_div8 = (width >> 3); + const int height_div8 = (height >> 3); + + for (int i = 0; i < width_div8; i++) { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + for (int j = 0; j < AOMMIN(4, height_div8); ++j) { + transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); + } + } + + for (int i = 0; i < AOMMIN(4, height_div8); i++) { + __m128i bufA[32]; + __m128i bufB[32]; + __m128i *buf = buf1 + width * i; + for (int j = 0; j < width; ++j) { + bufA[j] = _mm_cvtepi16_epi32(buf[j]); + bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j])); + } + av1_fdct32_sse4_1(bufA, bufA, cos_bit_row, 1); + av1_fdct32_sse4_1(bufB, bufB, cos_bit_row, 1); + av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2); + av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2); + + store_output_32bit_w8(output + i * 8, bufA, bufB, 32, 32); + } +} + +static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = { + av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform + av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform + av1_lowbd_fwd_txfm2d_16x16_sse2, // 16x16 transform + av1_lowbd_fwd_txfm2d_32x32_sse2, // 32x32 transform + lowbd_fwd_txfm2d_64x64_sse4_1, // 64x64 transform + av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform + av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform + av1_lowbd_fwd_txfm2d_8x16_sse2, // 8x16 transform + av1_lowbd_fwd_txfm2d_16x8_sse2, // 16x8 transform + av1_lowbd_fwd_txfm2d_16x32_sse2, // 16x32 transform + av1_lowbd_fwd_txfm2d_32x16_sse2, // 32x16 transform + lowbd_fwd_txfm2d_32x64_sse4_1, // 32x64 transform + lowbd_fwd_txfm2d_64x32_sse4_1, // 64x32 transform + av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform + av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform + av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform + av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform + av1_lowbd_fwd_txfm2d_16x64_sse2, // 16x64 transform + av1_lowbd_fwd_txfm2d_64x16_sse2, // 64x16 transform +}; + +void av1_lowbd_fwd_txfm_sse4_1(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size]; + if (txfm_param->lossless && txfm_param->tx_size == TX_4X4) { + av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param); + } else { + fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); + } +} diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h new file mode 100644 index 0000000000..aaad76e5ae --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2018, 
Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_ +#define AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_ +#include <immintrin.h> + +// out0 = in0*w0 + in1*w1 +// out1 = -in1*w0 + in0*w1 +static INLINE void btf_32_avx2_type0(const int32_t w0, const int32_t w1, + __m256i *in0, __m256i *in1, + const __m256i _r, const int32_t cos_bit) { + __m256i _in0 = *in0; + __m256i _in1 = *in1; + const __m256i ww0 = _mm256_set1_epi32(w0); + const __m256i ww1 = _mm256_set1_epi32(w1); + const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0); + const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1); + __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1); + temp0 = _mm256_add_epi32(temp0, _r); + *in0 = _mm256_srai_epi32(temp0, cos_bit); + const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1); + const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0); + __m256i temp1 = _mm256_sub_epi32(in0_w1, in1_w0); + temp1 = _mm256_add_epi32(temp1, _r); + *in1 = _mm256_srai_epi32(temp1, cos_bit); +} + +// out0 = in0*w0 + in1*w1 +// out1 = in1*w0 - in0*w1 +static INLINE void btf_32_avx2_type1(const int32_t w0, const int32_t w1, + __m256i *in0, __m256i *in1, + const __m256i _r, const int32_t cos_bit) { + __m256i _in0 = *in0; + __m256i _in1 = *in1; + const __m256i ww0 = _mm256_set1_epi32(w0); + const __m256i ww1 = _mm256_set1_epi32(w1); + const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0); + const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1); + __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1); + temp0 = _mm256_add_epi32(temp0, _r); + *in0 = _mm256_srai_epi32(temp0, cos_bit); + const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1); + const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0); + __m256i temp1 = _mm256_sub_epi32(in1_w0, in0_w1); + temp1 = _mm256_add_epi32(temp1, _r); + *in1 = _mm256_srai_epi32(temp1, cos_bit); +} + +// out0 = in0*w0 + in1*w1 +// out1 = -in1*w0 + in0*w1 +static INLINE void btf_32_avx2_type0_new(const __m256i ww0, const __m256i ww1, + __m256i *in0, __m256i *in1, + const __m256i _r, + const int32_t cos_bit) { + __m256i _in0 = *in0; + __m256i _in1 = *in1; + const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0); + const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1); + __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1); + temp0 = _mm256_add_epi32(temp0, _r); + *in0 = _mm256_srai_epi32(temp0, cos_bit); + const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1); + const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0); + __m256i temp1 = _mm256_sub_epi32(in0_w1, in1_w0); + temp1 = _mm256_add_epi32(temp1, _r); + *in1 = _mm256_srai_epi32(temp1, cos_bit); +} + +// out0 = in0*w0 + in1*w1 +// out1 = in1*w0 - in0*w1 +static INLINE void btf_32_avx2_type1_new(const __m256i ww0, const __m256i ww1, + __m256i *in0, __m256i *in1, + const __m256i _r, + const int32_t cos_bit) { + __m256i _in0 = *in0; + __m256i _in1 = *in1; + const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0); + const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1); + __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1); + temp0 = _mm256_add_epi32(temp0, _r); + *in0 = _mm256_srai_epi32(temp0, cos_bit); + const __m256i in0_w1 = 
_mm256_mullo_epi32(_in0, ww1); + const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0); + __m256i temp1 = _mm256_sub_epi32(in1_w0, in0_w1); + temp1 = _mm256_add_epi32(temp1, _r); + *in1 = _mm256_srai_epi32(temp1, cos_bit); +} + +#endif // AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_ diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c new file mode 100644 index 0000000000..a4def754b0 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c @@ -0,0 +1,2673 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/x86/av1_txfm_sse2.h" +#include "av1/encoder/av1_fwd_txfm1d_cfg.h" +#include "av1/encoder/x86/av1_fwd_txfm_sse2.h" + +// TODO(linfengz): refine fdct4x8 and fadst4x8 optimization (if possible). + +static void fdct4x4_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + __m128i u[4], v[4]; + + u[0] = _mm_unpacklo_epi16(input[0], input[1]); + u[1] = _mm_unpacklo_epi16(input[3], input[2]); + + v[0] = _mm_add_epi16(u[0], u[1]); + v[1] = _mm_sub_epi16(u[0], u[1]); + + u[0] = _mm_madd_epi16(v[0], cospi_p32_p32); // 0 + u[1] = _mm_madd_epi16(v[0], cospi_p32_m32); // 2 + u[2] = _mm_madd_epi16(v[1], cospi_p16_p48); // 1 + u[3] = _mm_madd_epi16(v[1], cospi_p48_m16); // 3 + + v[0] = _mm_add_epi32(u[0], __rounding); + v[1] = _mm_add_epi32(u[1], __rounding); + v[2] = _mm_add_epi32(u[2], __rounding); + v[3] = _mm_add_epi32(u[3], __rounding); + u[0] = _mm_srai_epi32(v[0], cos_bit); + u[1] = _mm_srai_epi32(v[1], cos_bit); + u[2] = _mm_srai_epi32(v[2], cos_bit); + u[3] = _mm_srai_epi32(v[3], cos_bit); + + output[0] = _mm_packs_epi32(u[0], u[1]); + output[1] = _mm_packs_epi32(u[2], u[3]); + output[2] = _mm_srli_si128(output[0], 8); + output[3] = _mm_srli_si128(output[1], 8); +} + +static void fdct8x4_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + + // stage 1 + __m128i x1[4]; + x1[0] = _mm_adds_epi16(input[0], input[3]); + x1[3] = _mm_subs_epi16(input[0], input[3]); + x1[1] = _mm_adds_epi16(input[1], input[2]); + x1[2] = _mm_subs_epi16(input[1], input[2]); + + // stage 2 + __m128i x2[4]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[0], x1[1], x2[0], x2[1]); + btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x1[2], x1[3], x2[2], x2[3]); + + // 
stage 3 + output[0] = x2[0]; + output[1] = x2[2]; + output[2] = x2[1]; + output[3] = x2[3]; +} + +static void fdct4x8_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + + // stage 1 + __m128i x1[8]; + x1[0] = _mm_adds_epi16(input[0], input[7]); + x1[7] = _mm_subs_epi16(input[0], input[7]); + x1[1] = _mm_adds_epi16(input[1], input[6]); + x1[6] = _mm_subs_epi16(input[1], input[6]); + x1[2] = _mm_adds_epi16(input[2], input[5]); + x1[5] = _mm_subs_epi16(input[2], input[5]); + x1[3] = _mm_adds_epi16(input[3], input[4]); + x1[4] = _mm_subs_epi16(input[3], input[4]); + + // stage 2 + __m128i x2[8]; + x2[0] = _mm_adds_epi16(x1[0], x1[3]); + x2[3] = _mm_subs_epi16(x1[0], x1[3]); + x2[1] = _mm_adds_epi16(x1[1], x1[2]); + x2[2] = _mm_subs_epi16(x1[1], x1[2]); + x2[4] = x1[4]; + btf_16_w4_sse2(&cospi_m32_p32, &cospi_p32_p32, __rounding, cos_bit, &x1[5], + &x1[6], &x2[5], &x2[6]); + x2[7] = x1[7]; + + // stage 3 + __m128i x3[8]; + btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x2[0], + &x2[1], &x3[0], &x3[1]); + btf_16_w4_sse2(&cospi_p48_p16, &cospi_m16_p48, __rounding, cos_bit, &x2[2], + &x2[3], &x3[2], &x3[3]); + x3[4] = _mm_adds_epi16(x2[4], x2[5]); + x3[5] = _mm_subs_epi16(x2[4], x2[5]); + x3[6] = _mm_subs_epi16(x2[7], x2[6]); + x3[7] = _mm_adds_epi16(x2[7], x2[6]); + + // stage 4 + __m128i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_w4_sse2(&cospi_p56_p08, &cospi_m08_p56, __rounding, cos_bit, &x3[4], + &x3[7], &x4[4], &x4[7]); + btf_16_w4_sse2(&cospi_p24_p40, &cospi_m40_p24, __rounding, cos_bit, &x3[5], + &x3[6], &x4[5], &x4[6]); + + // stage 5 + output[0] = x4[0]; + output[1] = x4[4]; + output[2] = x4[2]; + output[3] = x4[6]; + output[4] = x4[1]; + output[5] = x4[5]; + output[6] = x4[3]; + output[7] = x4[7]; +} + +static void fdct8x16_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); + __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); + __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]); + 
__m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); + __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); + __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); + __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]); + __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); + + // stage 1 + __m128i x1[16]; + x1[0] = _mm_adds_epi16(input[0], input[15]); + x1[15] = _mm_subs_epi16(input[0], input[15]); + x1[1] = _mm_adds_epi16(input[1], input[14]); + x1[14] = _mm_subs_epi16(input[1], input[14]); + x1[2] = _mm_adds_epi16(input[2], input[13]); + x1[13] = _mm_subs_epi16(input[2], input[13]); + x1[3] = _mm_adds_epi16(input[3], input[12]); + x1[12] = _mm_subs_epi16(input[3], input[12]); + x1[4] = _mm_adds_epi16(input[4], input[11]); + x1[11] = _mm_subs_epi16(input[4], input[11]); + x1[5] = _mm_adds_epi16(input[5], input[10]); + x1[10] = _mm_subs_epi16(input[5], input[10]); + x1[6] = _mm_adds_epi16(input[6], input[9]); + x1[9] = _mm_subs_epi16(input[6], input[9]); + x1[7] = _mm_adds_epi16(input[7], input[8]); + x1[8] = _mm_subs_epi16(input[7], input[8]); + + // stage 2 + __m128i x2[16]; + x2[0] = _mm_adds_epi16(x1[0], x1[7]); + x2[7] = _mm_subs_epi16(x1[0], x1[7]); + x2[1] = _mm_adds_epi16(x1[1], x1[6]); + x2[6] = _mm_subs_epi16(x1[1], x1[6]); + x2[2] = _mm_adds_epi16(x1[2], x1[5]); + x2[5] = _mm_subs_epi16(x1[2], x1[5]); + x2[3] = _mm_adds_epi16(x1[3], x1[4]); + x2[4] = _mm_subs_epi16(x1[3], x1[4]); + x2[8] = x1[8]; + x2[9] = x1[9]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[10], x1[13], x2[10], x2[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[11], x1[12], x2[11], x2[12]); + x2[14] = x1[14]; + x2[15] = x1[15]; + + // stage 3 + __m128i x3[16]; + x3[0] = _mm_adds_epi16(x2[0], x2[3]); + x3[3] = _mm_subs_epi16(x2[0], x2[3]); + x3[1] = _mm_adds_epi16(x2[1], x2[2]); + x3[2] = _mm_subs_epi16(x2[1], x2[2]); + x3[4] = x2[4]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[5], x2[6], x3[5], x3[6]); + x3[7] = x2[7]; + x3[8] = _mm_adds_epi16(x2[8], x2[11]); + x3[11] = _mm_subs_epi16(x2[8], x2[11]); + x3[9] = _mm_adds_epi16(x2[9], x2[10]); + x3[10] = _mm_subs_epi16(x2[9], x2[10]); + x3[12] = _mm_subs_epi16(x2[15], x2[12]); + x3[15] = _mm_adds_epi16(x2[15], x2[12]); + x3[13] = _mm_subs_epi16(x2[14], x2[13]); + x3[14] = _mm_adds_epi16(x2[14], x2[13]); + + // stage 4 + __m128i x4[16]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x3[0], x3[1], x4[0], x4[1]); + btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x3[2], x3[3], x4[2], x4[3]); + x4[4] = _mm_adds_epi16(x3[4], x3[5]); + x4[5] = _mm_subs_epi16(x3[4], x3[5]); + x4[6] = _mm_subs_epi16(x3[7], x3[6]); + x4[7] = _mm_adds_epi16(x3[7], x3[6]); + x4[8] = x3[8]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[9], x3[14], x4[9], x4[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[10], x3[13], x4[10], x4[13]); + x4[11] = x3[11]; + x4[12] = x3[12]; + x4[15] = x3[15]; + + // stage 5 + __m128i x5[16]; + x5[0] = x4[0]; + x5[1] = x4[1]; + x5[2] = x4[2]; + x5[3] = x4[3]; + btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x4[4], x4[7], x5[4], x5[7]); + btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x4[5], x4[6], x5[5], x5[6]); + x5[8] = _mm_adds_epi16(x4[8], x4[9]); + x5[9] = _mm_subs_epi16(x4[8], x4[9]); + x5[10] = _mm_subs_epi16(x4[11], x4[10]); + x5[11] = _mm_adds_epi16(x4[11], x4[10]); + x5[12] = _mm_adds_epi16(x4[12], x4[13]); + x5[13] = _mm_subs_epi16(x4[12], x4[13]); + x5[14] = _mm_subs_epi16(x4[15], x4[14]); + x5[15] = _mm_adds_epi16(x4[15], x4[14]); + + // stage 6 + __m128i x6[16]; + x6[0] = x5[0]; + x6[1] = x5[1]; + x6[2] = 
x5[2]; + x6[3] = x5[3]; + x6[4] = x5[4]; + x6[5] = x5[5]; + x6[6] = x5[6]; + x6[7] = x5[7]; + btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x5[8], x5[15], x6[8], x6[15]); + btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x5[9], x5[14], x6[9], x6[14]); + btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x5[10], x5[13], x6[10], x6[13]); + btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x5[11], x5[12], x6[11], x6[12]); + + // stage 7 + output[0] = x6[0]; + output[1] = x6[8]; + output[2] = x6[4]; + output[3] = x6[12]; + output[4] = x6[2]; + output[5] = x6[10]; + output[6] = x6[6]; + output[7] = x6[14]; + output[8] = x6[1]; + output[9] = x6[9]; + output[10] = x6[5]; + output[11] = x6[13]; + output[12] = x6[3]; + output[13] = x6[11]; + output[14] = x6[7]; + output[15] = x6[15]; +} + +void av1_fdct8x32_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]); + __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); + __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); + __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); + __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]); + __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); + __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); + __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); + __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]); + __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); + __m128i cospi_p62_p02 = pair_set_epi16(cospi[62], cospi[2]); + __m128i cospi_m02_p62 = pair_set_epi16(-cospi[2], cospi[62]); + __m128i cospi_p30_p34 = pair_set_epi16(cospi[30], cospi[34]); + __m128i cospi_m34_p30 = pair_set_epi16(-cospi[34], cospi[30]); + __m128i cospi_p46_p18 = pair_set_epi16(cospi[46], cospi[18]); + __m128i cospi_m18_p46 = pair_set_epi16(-cospi[18], cospi[46]); + __m128i cospi_p14_p50 = pair_set_epi16(cospi[14], cospi[50]); + __m128i cospi_m50_p14 = pair_set_epi16(-cospi[50], cospi[14]); + __m128i cospi_p54_p10 = pair_set_epi16(cospi[54], cospi[10]); + __m128i cospi_m10_p54 = pair_set_epi16(-cospi[10], cospi[54]); + __m128i cospi_p22_p42 = pair_set_epi16(cospi[22], cospi[42]); + __m128i cospi_m42_p22 = pair_set_epi16(-cospi[42], cospi[22]); + __m128i cospi_p38_p26 = pair_set_epi16(cospi[38], cospi[26]); + __m128i cospi_m26_p38 = pair_set_epi16(-cospi[26], cospi[38]); + __m128i cospi_p06_p58 = pair_set_epi16(cospi[6], cospi[58]); + __m128i cospi_m58_p06 = pair_set_epi16(-cospi[58], cospi[6]); + + // stage 1 + __m128i x1[32]; + x1[0] = _mm_adds_epi16(input[0], input[31]); + x1[31] = _mm_subs_epi16(input[0], input[31]); + x1[1] = _mm_adds_epi16(input[1], input[30]); + x1[30] = _mm_subs_epi16(input[1], input[30]); + x1[2] = _mm_adds_epi16(input[2], input[29]); + 
x1[29] = _mm_subs_epi16(input[2], input[29]); + x1[3] = _mm_adds_epi16(input[3], input[28]); + x1[28] = _mm_subs_epi16(input[3], input[28]); + x1[4] = _mm_adds_epi16(input[4], input[27]); + x1[27] = _mm_subs_epi16(input[4], input[27]); + x1[5] = _mm_adds_epi16(input[5], input[26]); + x1[26] = _mm_subs_epi16(input[5], input[26]); + x1[6] = _mm_adds_epi16(input[6], input[25]); + x1[25] = _mm_subs_epi16(input[6], input[25]); + x1[7] = _mm_adds_epi16(input[7], input[24]); + x1[24] = _mm_subs_epi16(input[7], input[24]); + x1[8] = _mm_adds_epi16(input[8], input[23]); + x1[23] = _mm_subs_epi16(input[8], input[23]); + x1[9] = _mm_adds_epi16(input[9], input[22]); + x1[22] = _mm_subs_epi16(input[9], input[22]); + x1[10] = _mm_adds_epi16(input[10], input[21]); + x1[21] = _mm_subs_epi16(input[10], input[21]); + x1[11] = _mm_adds_epi16(input[11], input[20]); + x1[20] = _mm_subs_epi16(input[11], input[20]); + x1[12] = _mm_adds_epi16(input[12], input[19]); + x1[19] = _mm_subs_epi16(input[12], input[19]); + x1[13] = _mm_adds_epi16(input[13], input[18]); + x1[18] = _mm_subs_epi16(input[13], input[18]); + x1[14] = _mm_adds_epi16(input[14], input[17]); + x1[17] = _mm_subs_epi16(input[14], input[17]); + x1[15] = _mm_adds_epi16(input[15], input[16]); + x1[16] = _mm_subs_epi16(input[15], input[16]); + + // stage 2 + __m128i x2[32]; + x2[0] = _mm_adds_epi16(x1[0], x1[15]); + x2[15] = _mm_subs_epi16(x1[0], x1[15]); + x2[1] = _mm_adds_epi16(x1[1], x1[14]); + x2[14] = _mm_subs_epi16(x1[1], x1[14]); + x2[2] = _mm_adds_epi16(x1[2], x1[13]); + x2[13] = _mm_subs_epi16(x1[2], x1[13]); + x2[3] = _mm_adds_epi16(x1[3], x1[12]); + x2[12] = _mm_subs_epi16(x1[3], x1[12]); + x2[4] = _mm_adds_epi16(x1[4], x1[11]); + x2[11] = _mm_subs_epi16(x1[4], x1[11]); + x2[5] = _mm_adds_epi16(x1[5], x1[10]); + x2[10] = _mm_subs_epi16(x1[5], x1[10]); + x2[6] = _mm_adds_epi16(x1[6], x1[9]); + x2[9] = _mm_subs_epi16(x1[6], x1[9]); + x2[7] = _mm_adds_epi16(x1[7], x1[8]); + x2[8] = _mm_subs_epi16(x1[7], x1[8]); + x2[16] = x1[16]; + x2[17] = x1[17]; + x2[18] = x1[18]; + x2[19] = x1[19]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[20], x1[27], x2[20], x2[27]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[21], x1[26], x2[21], x2[26]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[22], x1[25], x2[22], x2[25]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[23], x1[24], x2[23], x2[24]); + x2[28] = x1[28]; + x2[29] = x1[29]; + x2[30] = x1[30]; + x2[31] = x1[31]; + + // stage 3 + __m128i x3[32]; + x3[0] = _mm_adds_epi16(x2[0], x2[7]); + x3[7] = _mm_subs_epi16(x2[0], x2[7]); + x3[1] = _mm_adds_epi16(x2[1], x2[6]); + x3[6] = _mm_subs_epi16(x2[1], x2[6]); + x3[2] = _mm_adds_epi16(x2[2], x2[5]); + x3[5] = _mm_subs_epi16(x2[2], x2[5]); + x3[3] = _mm_adds_epi16(x2[3], x2[4]); + x3[4] = _mm_subs_epi16(x2[3], x2[4]); + x3[8] = x2[8]; + x3[9] = x2[9]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[10], x2[13], x3[10], x3[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[11], x2[12], x3[11], x3[12]); + x3[14] = x2[14]; + x3[15] = x2[15]; + x3[16] = _mm_adds_epi16(x2[16], x2[23]); + x3[23] = _mm_subs_epi16(x2[16], x2[23]); + x3[17] = _mm_adds_epi16(x2[17], x2[22]); + x3[22] = _mm_subs_epi16(x2[17], x2[22]); + x3[18] = _mm_adds_epi16(x2[18], x2[21]); + x3[21] = _mm_subs_epi16(x2[18], x2[21]); + x3[19] = _mm_adds_epi16(x2[19], x2[20]); + x3[20] = _mm_subs_epi16(x2[19], x2[20]); + x3[24] = _mm_subs_epi16(x2[31], x2[24]); + x3[31] = _mm_adds_epi16(x2[31], x2[24]); + x3[25] = _mm_subs_epi16(x2[30], x2[25]); + x3[30] = _mm_adds_epi16(x2[30], x2[25]); + x3[26] 
= _mm_subs_epi16(x2[29], x2[26]); + x3[29] = _mm_adds_epi16(x2[29], x2[26]); + x3[27] = _mm_subs_epi16(x2[28], x2[27]); + x3[28] = _mm_adds_epi16(x2[28], x2[27]); + + // stage 4 + __m128i x4[32]; + x4[0] = _mm_adds_epi16(x3[0], x3[3]); + x4[3] = _mm_subs_epi16(x3[0], x3[3]); + x4[1] = _mm_adds_epi16(x3[1], x3[2]); + x4[2] = _mm_subs_epi16(x3[1], x3[2]); + x4[4] = x3[4]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[5], x3[6], x4[5], x4[6]); + x4[7] = x3[7]; + x4[8] = _mm_adds_epi16(x3[8], x3[11]); + x4[11] = _mm_subs_epi16(x3[8], x3[11]); + x4[9] = _mm_adds_epi16(x3[9], x3[10]); + x4[10] = _mm_subs_epi16(x3[9], x3[10]); + x4[12] = _mm_subs_epi16(x3[15], x3[12]); + x4[15] = _mm_adds_epi16(x3[15], x3[12]); + x4[13] = _mm_subs_epi16(x3[14], x3[13]); + x4[14] = _mm_adds_epi16(x3[14], x3[13]); + x4[16] = x3[16]; + x4[17] = x3[17]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[18], x3[29], x4[18], x4[29]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[19], x3[28], x4[19], x4[28]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[20], x3[27], x4[20], x4[27]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[21], x3[26], x4[21], x4[26]); + x4[22] = x3[22]; + x4[23] = x3[23]; + x4[24] = x3[24]; + x4[25] = x3[25]; + x4[30] = x3[30]; + x4[31] = x3[31]; + + // stage 5 + __m128i x5[32]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x4[0], x4[1], x5[0], x5[1]); + btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x4[2], x4[3], x5[2], x5[3]); + x5[4] = _mm_adds_epi16(x4[4], x4[5]); + x5[5] = _mm_subs_epi16(x4[4], x4[5]); + x5[6] = _mm_subs_epi16(x4[7], x4[6]); + x5[7] = _mm_adds_epi16(x4[7], x4[6]); + x5[8] = x4[8]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[9], x4[14], x5[9], x5[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[10], x4[13], x5[10], x5[13]); + x5[11] = x4[11]; + x5[12] = x4[12]; + x5[15] = x4[15]; + x5[16] = _mm_adds_epi16(x4[16], x4[19]); + x5[19] = _mm_subs_epi16(x4[16], x4[19]); + x5[17] = _mm_adds_epi16(x4[17], x4[18]); + x5[18] = _mm_subs_epi16(x4[17], x4[18]); + x5[20] = _mm_subs_epi16(x4[23], x4[20]); + x5[23] = _mm_adds_epi16(x4[23], x4[20]); + x5[21] = _mm_subs_epi16(x4[22], x4[21]); + x5[22] = _mm_adds_epi16(x4[22], x4[21]); + x5[24] = _mm_adds_epi16(x4[24], x4[27]); + x5[27] = _mm_subs_epi16(x4[24], x4[27]); + x5[25] = _mm_adds_epi16(x4[25], x4[26]); + x5[26] = _mm_subs_epi16(x4[25], x4[26]); + x5[28] = _mm_subs_epi16(x4[31], x4[28]); + x5[31] = _mm_adds_epi16(x4[31], x4[28]); + x5[29] = _mm_subs_epi16(x4[30], x4[29]); + x5[30] = _mm_adds_epi16(x4[30], x4[29]); + + // stage 6 + __m128i x6[32]; + x6[0] = x5[0]; + x6[1] = x5[1]; + x6[2] = x5[2]; + x6[3] = x5[3]; + btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x5[4], x5[7], x6[4], x6[7]); + btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x5[5], x5[6], x6[5], x6[6]); + x6[8] = _mm_adds_epi16(x5[8], x5[9]); + x6[9] = _mm_subs_epi16(x5[8], x5[9]); + x6[10] = _mm_subs_epi16(x5[11], x5[10]); + x6[11] = _mm_adds_epi16(x5[11], x5[10]); + x6[12] = _mm_adds_epi16(x5[12], x5[13]); + x6[13] = _mm_subs_epi16(x5[12], x5[13]); + x6[14] = _mm_subs_epi16(x5[15], x5[14]); + x6[15] = _mm_adds_epi16(x5[15], x5[14]); + x6[16] = x5[16]; + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[17], x5[30], x6[17], x6[30]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[18], x5[29], x6[18], x6[29]); + x6[19] = x5[19]; + x6[20] = x5[20]; + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[21], x5[26], x6[21], x6[26]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[22], x5[25], x6[22], x6[25]); + x6[23] = x5[23]; + x6[24] = x5[24]; + x6[27] = x5[27]; + x6[28] = x5[28]; + x6[31] = 
x5[31]; + + // stage 7 + __m128i x7[32]; + x7[0] = x6[0]; + x7[1] = x6[1]; + x7[2] = x6[2]; + x7[3] = x6[3]; + x7[4] = x6[4]; + x7[5] = x6[5]; + x7[6] = x6[6]; + x7[7] = x6[7]; + btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x6[8], x6[15], x7[8], x7[15]); + btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x6[9], x6[14], x7[9], x7[14]); + btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x6[10], x6[13], x7[10], x7[13]); + btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x6[11], x6[12], x7[11], x7[12]); + x7[16] = _mm_adds_epi16(x6[16], x6[17]); + x7[17] = _mm_subs_epi16(x6[16], x6[17]); + x7[18] = _mm_subs_epi16(x6[19], x6[18]); + x7[19] = _mm_adds_epi16(x6[19], x6[18]); + x7[20] = _mm_adds_epi16(x6[20], x6[21]); + x7[21] = _mm_subs_epi16(x6[20], x6[21]); + x7[22] = _mm_subs_epi16(x6[23], x6[22]); + x7[23] = _mm_adds_epi16(x6[23], x6[22]); + x7[24] = _mm_adds_epi16(x6[24], x6[25]); + x7[25] = _mm_subs_epi16(x6[24], x6[25]); + x7[26] = _mm_subs_epi16(x6[27], x6[26]); + x7[27] = _mm_adds_epi16(x6[27], x6[26]); + x7[28] = _mm_adds_epi16(x6[28], x6[29]); + x7[29] = _mm_subs_epi16(x6[28], x6[29]); + x7[30] = _mm_subs_epi16(x6[31], x6[30]); + x7[31] = _mm_adds_epi16(x6[31], x6[30]); + + // stage 8 + __m128i x8[32]; + x8[0] = x7[0]; + x8[1] = x7[1]; + x8[2] = x7[2]; + x8[3] = x7[3]; + x8[4] = x7[4]; + x8[5] = x7[5]; + x8[6] = x7[6]; + x8[7] = x7[7]; + x8[8] = x7[8]; + x8[9] = x7[9]; + x8[10] = x7[10]; + x8[11] = x7[11]; + x8[12] = x7[12]; + x8[13] = x7[13]; + x8[14] = x7[14]; + x8[15] = x7[15]; + btf_16_sse2(cospi_p62_p02, cospi_m02_p62, x7[16], x7[31], x8[16], x8[31]); + btf_16_sse2(cospi_p30_p34, cospi_m34_p30, x7[17], x7[30], x8[17], x8[30]); + btf_16_sse2(cospi_p46_p18, cospi_m18_p46, x7[18], x7[29], x8[18], x8[29]); + btf_16_sse2(cospi_p14_p50, cospi_m50_p14, x7[19], x7[28], x8[19], x8[28]); + btf_16_sse2(cospi_p54_p10, cospi_m10_p54, x7[20], x7[27], x8[20], x8[27]); + btf_16_sse2(cospi_p22_p42, cospi_m42_p22, x7[21], x7[26], x8[21], x8[26]); + btf_16_sse2(cospi_p38_p26, cospi_m26_p38, x7[22], x7[25], x8[22], x8[25]); + btf_16_sse2(cospi_p06_p58, cospi_m58_p06, x7[23], x7[24], x8[23], x8[24]); + + // stage 9 + output[0] = x8[0]; + output[1] = x8[16]; + output[2] = x8[8]; + output[3] = x8[24]; + output[4] = x8[4]; + output[5] = x8[20]; + output[6] = x8[12]; + output[7] = x8[28]; + output[8] = x8[2]; + output[9] = x8[18]; + output[10] = x8[10]; + output[11] = x8[26]; + output[12] = x8[6]; + output[13] = x8[22]; + output[14] = x8[14]; + output[15] = x8[30]; + output[16] = x8[1]; + output[17] = x8[17]; + output[18] = x8[9]; + output[19] = x8[25]; + output[20] = x8[5]; + output[21] = x8[21]; + output[22] = x8[13]; + output[23] = x8[29]; + output[24] = x8[3]; + output[25] = x8[19]; + output[26] = x8[11]; + output[27] = x8[27]; + output[28] = x8[7]; + output[29] = x8[23]; + output[30] = x8[15]; + output[31] = x8[31]; +} + +void av1_fdct8x64_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + 
__m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]); + __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); + __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); + __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); + __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]); + __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); + __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); + __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); + __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]); + __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); + __m128i cospi_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]); + __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]); + __m128i cospi_m44_m20 = pair_set_epi16(-cospi[44], -cospi[20]); + __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]); + __m128i cospi_p62_p02 = pair_set_epi16(cospi[62], cospi[2]); + __m128i cospi_m02_p62 = pair_set_epi16(-cospi[2], cospi[62]); + __m128i cospi_p30_p34 = pair_set_epi16(cospi[30], cospi[34]); + __m128i cospi_m34_p30 = pair_set_epi16(-cospi[34], cospi[30]); + __m128i cospi_p46_p18 = pair_set_epi16(cospi[46], cospi[18]); + __m128i cospi_m18_p46 = pair_set_epi16(-cospi[18], cospi[46]); + __m128i cospi_p14_p50 = pair_set_epi16(cospi[14], cospi[50]); + __m128i cospi_m50_p14 = pair_set_epi16(-cospi[50], cospi[14]); + __m128i cospi_p54_p10 = pair_set_epi16(cospi[54], cospi[10]); + __m128i cospi_m10_p54 = pair_set_epi16(-cospi[10], cospi[54]); + __m128i cospi_p22_p42 = pair_set_epi16(cospi[22], cospi[42]); + __m128i cospi_m42_p22 = pair_set_epi16(-cospi[42], cospi[22]); + __m128i cospi_p38_p26 = pair_set_epi16(cospi[38], cospi[26]); + __m128i cospi_m26_p38 = pair_set_epi16(-cospi[26], cospi[38]); + __m128i cospi_p06_p58 = pair_set_epi16(cospi[6], cospi[58]); + __m128i cospi_m58_p06 = pair_set_epi16(-cospi[58], cospi[6]); + __m128i cospi_p63_p01 = pair_set_epi16(cospi[63], cospi[1]); + __m128i cospi_m01_p63 = pair_set_epi16(-cospi[1], cospi[63]); + __m128i cospi_p31_p33 = pair_set_epi16(cospi[31], cospi[33]); + __m128i cospi_m33_p31 = pair_set_epi16(-cospi[33], cospi[31]); + __m128i cospi_p47_p17 = pair_set_epi16(cospi[47], cospi[17]); + __m128i cospi_m17_p47 = pair_set_epi16(-cospi[17], cospi[47]); + __m128i cospi_p15_p49 = pair_set_epi16(cospi[15], cospi[49]); + __m128i cospi_m49_p15 = pair_set_epi16(-cospi[49], cospi[15]); + __m128i cospi_p55_p09 = pair_set_epi16(cospi[55], cospi[9]); + __m128i cospi_m09_p55 = pair_set_epi16(-cospi[9], cospi[55]); + __m128i cospi_p23_p41 = pair_set_epi16(cospi[23], cospi[41]); + __m128i cospi_m41_p23 = pair_set_epi16(-cospi[41], cospi[23]); + __m128i cospi_p39_p25 = pair_set_epi16(cospi[39], cospi[25]); + __m128i cospi_m25_p39 = pair_set_epi16(-cospi[25], cospi[39]); + __m128i cospi_p07_p57 = pair_set_epi16(cospi[7], cospi[57]); + __m128i cospi_m57_p07 = pair_set_epi16(-cospi[57], cospi[7]); + __m128i cospi_p59_p05 = pair_set_epi16(cospi[59], cospi[5]); + __m128i cospi_m05_p59 = pair_set_epi16(-cospi[5], cospi[59]); + __m128i cospi_p27_p37 = pair_set_epi16(cospi[27], cospi[37]); + __m128i cospi_m37_p27 = pair_set_epi16(-cospi[37], cospi[27]); + __m128i cospi_p43_p21 = pair_set_epi16(cospi[43], cospi[21]); + __m128i cospi_m21_p43 = pair_set_epi16(-cospi[21], cospi[43]); + __m128i cospi_p11_p53 = pair_set_epi16(cospi[11], cospi[53]); + __m128i 
cospi_m53_p11 = pair_set_epi16(-cospi[53], cospi[11]); + __m128i cospi_p51_p13 = pair_set_epi16(cospi[51], cospi[13]); + __m128i cospi_m13_p51 = pair_set_epi16(-cospi[13], cospi[51]); + __m128i cospi_p19_p45 = pair_set_epi16(cospi[19], cospi[45]); + __m128i cospi_m45_p19 = pair_set_epi16(-cospi[45], cospi[19]); + __m128i cospi_p35_p29 = pair_set_epi16(cospi[35], cospi[29]); + __m128i cospi_m29_p35 = pair_set_epi16(-cospi[29], cospi[35]); + __m128i cospi_p03_p61 = pair_set_epi16(cospi[3], cospi[61]); + __m128i cospi_m61_p03 = pair_set_epi16(-cospi[61], cospi[3]); + + // stage 1 + __m128i x1[64]; + x1[0] = _mm_adds_epi16(input[0], input[63]); + x1[63] = _mm_subs_epi16(input[0], input[63]); + x1[1] = _mm_adds_epi16(input[1], input[62]); + x1[62] = _mm_subs_epi16(input[1], input[62]); + x1[2] = _mm_adds_epi16(input[2], input[61]); + x1[61] = _mm_subs_epi16(input[2], input[61]); + x1[3] = _mm_adds_epi16(input[3], input[60]); + x1[60] = _mm_subs_epi16(input[3], input[60]); + x1[4] = _mm_adds_epi16(input[4], input[59]); + x1[59] = _mm_subs_epi16(input[4], input[59]); + x1[5] = _mm_adds_epi16(input[5], input[58]); + x1[58] = _mm_subs_epi16(input[5], input[58]); + x1[6] = _mm_adds_epi16(input[6], input[57]); + x1[57] = _mm_subs_epi16(input[6], input[57]); + x1[7] = _mm_adds_epi16(input[7], input[56]); + x1[56] = _mm_subs_epi16(input[7], input[56]); + x1[8] = _mm_adds_epi16(input[8], input[55]); + x1[55] = _mm_subs_epi16(input[8], input[55]); + x1[9] = _mm_adds_epi16(input[9], input[54]); + x1[54] = _mm_subs_epi16(input[9], input[54]); + x1[10] = _mm_adds_epi16(input[10], input[53]); + x1[53] = _mm_subs_epi16(input[10], input[53]); + x1[11] = _mm_adds_epi16(input[11], input[52]); + x1[52] = _mm_subs_epi16(input[11], input[52]); + x1[12] = _mm_adds_epi16(input[12], input[51]); + x1[51] = _mm_subs_epi16(input[12], input[51]); + x1[13] = _mm_adds_epi16(input[13], input[50]); + x1[50] = _mm_subs_epi16(input[13], input[50]); + x1[14] = _mm_adds_epi16(input[14], input[49]); + x1[49] = _mm_subs_epi16(input[14], input[49]); + x1[15] = _mm_adds_epi16(input[15], input[48]); + x1[48] = _mm_subs_epi16(input[15], input[48]); + x1[16] = _mm_adds_epi16(input[16], input[47]); + x1[47] = _mm_subs_epi16(input[16], input[47]); + x1[17] = _mm_adds_epi16(input[17], input[46]); + x1[46] = _mm_subs_epi16(input[17], input[46]); + x1[18] = _mm_adds_epi16(input[18], input[45]); + x1[45] = _mm_subs_epi16(input[18], input[45]); + x1[19] = _mm_adds_epi16(input[19], input[44]); + x1[44] = _mm_subs_epi16(input[19], input[44]); + x1[20] = _mm_adds_epi16(input[20], input[43]); + x1[43] = _mm_subs_epi16(input[20], input[43]); + x1[21] = _mm_adds_epi16(input[21], input[42]); + x1[42] = _mm_subs_epi16(input[21], input[42]); + x1[22] = _mm_adds_epi16(input[22], input[41]); + x1[41] = _mm_subs_epi16(input[22], input[41]); + x1[23] = _mm_adds_epi16(input[23], input[40]); + x1[40] = _mm_subs_epi16(input[23], input[40]); + x1[24] = _mm_adds_epi16(input[24], input[39]); + x1[39] = _mm_subs_epi16(input[24], input[39]); + x1[25] = _mm_adds_epi16(input[25], input[38]); + x1[38] = _mm_subs_epi16(input[25], input[38]); + x1[26] = _mm_adds_epi16(input[26], input[37]); + x1[37] = _mm_subs_epi16(input[26], input[37]); + x1[27] = _mm_adds_epi16(input[27], input[36]); + x1[36] = _mm_subs_epi16(input[27], input[36]); + x1[28] = _mm_adds_epi16(input[28], input[35]); + x1[35] = _mm_subs_epi16(input[28], input[35]); + x1[29] = _mm_adds_epi16(input[29], input[34]); + x1[34] = _mm_subs_epi16(input[29], input[34]); + x1[30] = _mm_adds_epi16(input[30], 
input[33]); + x1[33] = _mm_subs_epi16(input[30], input[33]); + x1[31] = _mm_adds_epi16(input[31], input[32]); + x1[32] = _mm_subs_epi16(input[31], input[32]); + + // stage 2 + __m128i x2[64]; + x2[0] = _mm_adds_epi16(x1[0], x1[31]); + x2[31] = _mm_subs_epi16(x1[0], x1[31]); + x2[1] = _mm_adds_epi16(x1[1], x1[30]); + x2[30] = _mm_subs_epi16(x1[1], x1[30]); + x2[2] = _mm_adds_epi16(x1[2], x1[29]); + x2[29] = _mm_subs_epi16(x1[2], x1[29]); + x2[3] = _mm_adds_epi16(x1[3], x1[28]); + x2[28] = _mm_subs_epi16(x1[3], x1[28]); + x2[4] = _mm_adds_epi16(x1[4], x1[27]); + x2[27] = _mm_subs_epi16(x1[4], x1[27]); + x2[5] = _mm_adds_epi16(x1[5], x1[26]); + x2[26] = _mm_subs_epi16(x1[5], x1[26]); + x2[6] = _mm_adds_epi16(x1[6], x1[25]); + x2[25] = _mm_subs_epi16(x1[6], x1[25]); + x2[7] = _mm_adds_epi16(x1[7], x1[24]); + x2[24] = _mm_subs_epi16(x1[7], x1[24]); + x2[8] = _mm_adds_epi16(x1[8], x1[23]); + x2[23] = _mm_subs_epi16(x1[8], x1[23]); + x2[9] = _mm_adds_epi16(x1[9], x1[22]); + x2[22] = _mm_subs_epi16(x1[9], x1[22]); + x2[10] = _mm_adds_epi16(x1[10], x1[21]); + x2[21] = _mm_subs_epi16(x1[10], x1[21]); + x2[11] = _mm_adds_epi16(x1[11], x1[20]); + x2[20] = _mm_subs_epi16(x1[11], x1[20]); + x2[12] = _mm_adds_epi16(x1[12], x1[19]); + x2[19] = _mm_subs_epi16(x1[12], x1[19]); + x2[13] = _mm_adds_epi16(x1[13], x1[18]); + x2[18] = _mm_subs_epi16(x1[13], x1[18]); + x2[14] = _mm_adds_epi16(x1[14], x1[17]); + x2[17] = _mm_subs_epi16(x1[14], x1[17]); + x2[15] = _mm_adds_epi16(x1[15], x1[16]); + x2[16] = _mm_subs_epi16(x1[15], x1[16]); + x2[32] = x1[32]; + x2[33] = x1[33]; + x2[34] = x1[34]; + x2[35] = x1[35]; + x2[36] = x1[36]; + x2[37] = x1[37]; + x2[38] = x1[38]; + x2[39] = x1[39]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[40], x1[55], x2[40], x2[55]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[41], x1[54], x2[41], x2[54]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[42], x1[53], x2[42], x2[53]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[43], x1[52], x2[43], x2[52]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[44], x1[51], x2[44], x2[51]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[45], x1[50], x2[45], x2[50]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[46], x1[49], x2[46], x2[49]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[47], x1[48], x2[47], x2[48]); + x2[56] = x1[56]; + x2[57] = x1[57]; + x2[58] = x1[58]; + x2[59] = x1[59]; + x2[60] = x1[60]; + x2[61] = x1[61]; + x2[62] = x1[62]; + x2[63] = x1[63]; + + // stage 3 + __m128i x3[64]; + x3[0] = _mm_adds_epi16(x2[0], x2[15]); + x3[15] = _mm_subs_epi16(x2[0], x2[15]); + x3[1] = _mm_adds_epi16(x2[1], x2[14]); + x3[14] = _mm_subs_epi16(x2[1], x2[14]); + x3[2] = _mm_adds_epi16(x2[2], x2[13]); + x3[13] = _mm_subs_epi16(x2[2], x2[13]); + x3[3] = _mm_adds_epi16(x2[3], x2[12]); + x3[12] = _mm_subs_epi16(x2[3], x2[12]); + x3[4] = _mm_adds_epi16(x2[4], x2[11]); + x3[11] = _mm_subs_epi16(x2[4], x2[11]); + x3[5] = _mm_adds_epi16(x2[5], x2[10]); + x3[10] = _mm_subs_epi16(x2[5], x2[10]); + x3[6] = _mm_adds_epi16(x2[6], x2[9]); + x3[9] = _mm_subs_epi16(x2[6], x2[9]); + x3[7] = _mm_adds_epi16(x2[7], x2[8]); + x3[8] = _mm_subs_epi16(x2[7], x2[8]); + x3[16] = x2[16]; + x3[17] = x2[17]; + x3[18] = x2[18]; + x3[19] = x2[19]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[20], x2[27], x3[20], x3[27]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[21], x2[26], x3[21], x3[26]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[22], x2[25], x3[22], x3[25]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[23], x2[24], x3[23], x3[24]); + x3[28] = x2[28]; + 
x3[29] = x2[29]; + x3[30] = x2[30]; + x3[31] = x2[31]; + x3[32] = _mm_adds_epi16(x2[32], x2[47]); + x3[47] = _mm_subs_epi16(x2[32], x2[47]); + x3[33] = _mm_adds_epi16(x2[33], x2[46]); + x3[46] = _mm_subs_epi16(x2[33], x2[46]); + x3[34] = _mm_adds_epi16(x2[34], x2[45]); + x3[45] = _mm_subs_epi16(x2[34], x2[45]); + x3[35] = _mm_adds_epi16(x2[35], x2[44]); + x3[44] = _mm_subs_epi16(x2[35], x2[44]); + x3[36] = _mm_adds_epi16(x2[36], x2[43]); + x3[43] = _mm_subs_epi16(x2[36], x2[43]); + x3[37] = _mm_adds_epi16(x2[37], x2[42]); + x3[42] = _mm_subs_epi16(x2[37], x2[42]); + x3[38] = _mm_adds_epi16(x2[38], x2[41]); + x3[41] = _mm_subs_epi16(x2[38], x2[41]); + x3[39] = _mm_adds_epi16(x2[39], x2[40]); + x3[40] = _mm_subs_epi16(x2[39], x2[40]); + x3[48] = _mm_subs_epi16(x2[63], x2[48]); + x3[63] = _mm_adds_epi16(x2[63], x2[48]); + x3[49] = _mm_subs_epi16(x2[62], x2[49]); + x3[62] = _mm_adds_epi16(x2[62], x2[49]); + x3[50] = _mm_subs_epi16(x2[61], x2[50]); + x3[61] = _mm_adds_epi16(x2[61], x2[50]); + x3[51] = _mm_subs_epi16(x2[60], x2[51]); + x3[60] = _mm_adds_epi16(x2[60], x2[51]); + x3[52] = _mm_subs_epi16(x2[59], x2[52]); + x3[59] = _mm_adds_epi16(x2[59], x2[52]); + x3[53] = _mm_subs_epi16(x2[58], x2[53]); + x3[58] = _mm_adds_epi16(x2[58], x2[53]); + x3[54] = _mm_subs_epi16(x2[57], x2[54]); + x3[57] = _mm_adds_epi16(x2[57], x2[54]); + x3[55] = _mm_subs_epi16(x2[56], x2[55]); + x3[56] = _mm_adds_epi16(x2[56], x2[55]); + + // stage 4 + __m128i x4[64]; + x4[0] = _mm_adds_epi16(x3[0], x3[7]); + x4[7] = _mm_subs_epi16(x3[0], x3[7]); + x4[1] = _mm_adds_epi16(x3[1], x3[6]); + x4[6] = _mm_subs_epi16(x3[1], x3[6]); + x4[2] = _mm_adds_epi16(x3[2], x3[5]); + x4[5] = _mm_subs_epi16(x3[2], x3[5]); + x4[3] = _mm_adds_epi16(x3[3], x3[4]); + x4[4] = _mm_subs_epi16(x3[3], x3[4]); + x4[8] = x3[8]; + x4[9] = x3[9]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[10], x3[13], x4[10], x4[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[11], x3[12], x4[11], x4[12]); + x4[14] = x3[14]; + x4[15] = x3[15]; + x4[16] = _mm_adds_epi16(x3[16], x3[23]); + x4[23] = _mm_subs_epi16(x3[16], x3[23]); + x4[17] = _mm_adds_epi16(x3[17], x3[22]); + x4[22] = _mm_subs_epi16(x3[17], x3[22]); + x4[18] = _mm_adds_epi16(x3[18], x3[21]); + x4[21] = _mm_subs_epi16(x3[18], x3[21]); + x4[19] = _mm_adds_epi16(x3[19], x3[20]); + x4[20] = _mm_subs_epi16(x3[19], x3[20]); + x4[24] = _mm_subs_epi16(x3[31], x3[24]); + x4[31] = _mm_adds_epi16(x3[31], x3[24]); + x4[25] = _mm_subs_epi16(x3[30], x3[25]); + x4[30] = _mm_adds_epi16(x3[30], x3[25]); + x4[26] = _mm_subs_epi16(x3[29], x3[26]); + x4[29] = _mm_adds_epi16(x3[29], x3[26]); + x4[27] = _mm_subs_epi16(x3[28], x3[27]); + x4[28] = _mm_adds_epi16(x3[28], x3[27]); + x4[32] = x3[32]; + x4[33] = x3[33]; + x4[34] = x3[34]; + x4[35] = x3[35]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[36], x3[59], x4[36], x4[59]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[37], x3[58], x4[37], x4[58]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[38], x3[57], x4[38], x4[57]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[39], x3[56], x4[39], x4[56]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[40], x3[55], x4[40], x4[55]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[41], x3[54], x4[41], x4[54]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[42], x3[53], x4[42], x4[53]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[43], x3[52], x4[43], x4[52]); + x4[44] = x3[44]; + x4[45] = x3[45]; + x4[46] = x3[46]; + x4[47] = x3[47]; + x4[48] = x3[48]; + x4[49] = x3[49]; + x4[50] = x3[50]; + x4[51] = x3[51]; + x4[60] = 
x3[60]; + x4[61] = x3[61]; + x4[62] = x3[62]; + x4[63] = x3[63]; + + // stage 5 + __m128i x5[64]; + x5[0] = _mm_adds_epi16(x4[0], x4[3]); + x5[3] = _mm_subs_epi16(x4[0], x4[3]); + x5[1] = _mm_adds_epi16(x4[1], x4[2]); + x5[2] = _mm_subs_epi16(x4[1], x4[2]); + x5[4] = x4[4]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x4[5], x4[6], x5[5], x5[6]); + x5[7] = x4[7]; + x5[8] = _mm_adds_epi16(x4[8], x4[11]); + x5[11] = _mm_subs_epi16(x4[8], x4[11]); + x5[9] = _mm_adds_epi16(x4[9], x4[10]); + x5[10] = _mm_subs_epi16(x4[9], x4[10]); + x5[12] = _mm_subs_epi16(x4[15], x4[12]); + x5[15] = _mm_adds_epi16(x4[15], x4[12]); + x5[13] = _mm_subs_epi16(x4[14], x4[13]); + x5[14] = _mm_adds_epi16(x4[14], x4[13]); + x5[16] = x4[16]; + x5[17] = x4[17]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[18], x4[29], x5[18], x5[29]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[19], x4[28], x5[19], x5[28]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[20], x4[27], x5[20], x5[27]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[21], x4[26], x5[21], x5[26]); + x5[22] = x4[22]; + x5[23] = x4[23]; + x5[24] = x4[24]; + x5[25] = x4[25]; + x5[30] = x4[30]; + x5[31] = x4[31]; + x5[32] = _mm_adds_epi16(x4[32], x4[39]); + x5[39] = _mm_subs_epi16(x4[32], x4[39]); + x5[33] = _mm_adds_epi16(x4[33], x4[38]); + x5[38] = _mm_subs_epi16(x4[33], x4[38]); + x5[34] = _mm_adds_epi16(x4[34], x4[37]); + x5[37] = _mm_subs_epi16(x4[34], x4[37]); + x5[35] = _mm_adds_epi16(x4[35], x4[36]); + x5[36] = _mm_subs_epi16(x4[35], x4[36]); + x5[40] = _mm_subs_epi16(x4[47], x4[40]); + x5[47] = _mm_adds_epi16(x4[47], x4[40]); + x5[41] = _mm_subs_epi16(x4[46], x4[41]); + x5[46] = _mm_adds_epi16(x4[46], x4[41]); + x5[42] = _mm_subs_epi16(x4[45], x4[42]); + x5[45] = _mm_adds_epi16(x4[45], x4[42]); + x5[43] = _mm_subs_epi16(x4[44], x4[43]); + x5[44] = _mm_adds_epi16(x4[44], x4[43]); + x5[48] = _mm_adds_epi16(x4[48], x4[55]); + x5[55] = _mm_subs_epi16(x4[48], x4[55]); + x5[49] = _mm_adds_epi16(x4[49], x4[54]); + x5[54] = _mm_subs_epi16(x4[49], x4[54]); + x5[50] = _mm_adds_epi16(x4[50], x4[53]); + x5[53] = _mm_subs_epi16(x4[50], x4[53]); + x5[51] = _mm_adds_epi16(x4[51], x4[52]); + x5[52] = _mm_subs_epi16(x4[51], x4[52]); + x5[56] = _mm_subs_epi16(x4[63], x4[56]); + x5[63] = _mm_adds_epi16(x4[63], x4[56]); + x5[57] = _mm_subs_epi16(x4[62], x4[57]); + x5[62] = _mm_adds_epi16(x4[62], x4[57]); + x5[58] = _mm_subs_epi16(x4[61], x4[58]); + x5[61] = _mm_adds_epi16(x4[61], x4[58]); + x5[59] = _mm_subs_epi16(x4[60], x4[59]); + x5[60] = _mm_adds_epi16(x4[60], x4[59]); + + // stage 6 + __m128i x6[64]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x5[0], x5[1], x6[0], x6[1]); + btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x5[2], x5[3], x6[2], x6[3]); + x6[4] = _mm_adds_epi16(x5[4], x5[5]); + x6[5] = _mm_subs_epi16(x5[4], x5[5]); + x6[6] = _mm_subs_epi16(x5[7], x5[6]); + x6[7] = _mm_adds_epi16(x5[7], x5[6]); + x6[8] = x5[8]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x5[9], x5[14], x6[9], x6[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x5[10], x5[13], x6[10], x6[13]); + x6[11] = x5[11]; + x6[12] = x5[12]; + x6[15] = x5[15]; + x6[16] = _mm_adds_epi16(x5[16], x5[19]); + x6[19] = _mm_subs_epi16(x5[16], x5[19]); + x6[17] = _mm_adds_epi16(x5[17], x5[18]); + x6[18] = _mm_subs_epi16(x5[17], x5[18]); + x6[20] = _mm_subs_epi16(x5[23], x5[20]); + x6[23] = _mm_adds_epi16(x5[23], x5[20]); + x6[21] = _mm_subs_epi16(x5[22], x5[21]); + x6[22] = _mm_adds_epi16(x5[22], x5[21]); + x6[24] = _mm_adds_epi16(x5[24], x5[27]); + x6[27] = _mm_subs_epi16(x5[24], x5[27]); + x6[25] = 
_mm_adds_epi16(x5[25], x5[26]); + x6[26] = _mm_subs_epi16(x5[25], x5[26]); + x6[28] = _mm_subs_epi16(x5[31], x5[28]); + x6[31] = _mm_adds_epi16(x5[31], x5[28]); + x6[29] = _mm_subs_epi16(x5[30], x5[29]); + x6[30] = _mm_adds_epi16(x5[30], x5[29]); + x6[32] = x5[32]; + x6[33] = x5[33]; + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[34], x5[61], x6[34], x6[61]); + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[35], x5[60], x6[35], x6[60]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[36], x5[59], x6[36], x6[59]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[37], x5[58], x6[37], x6[58]); + x6[38] = x5[38]; + x6[39] = x5[39]; + x6[40] = x5[40]; + x6[41] = x5[41]; + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[42], x5[53], x6[42], x6[53]); + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[43], x5[52], x6[43], x6[52]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[44], x5[51], x6[44], x6[51]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[45], x5[50], x6[45], x6[50]); + x6[46] = x5[46]; + x6[47] = x5[47]; + x6[48] = x5[48]; + x6[49] = x5[49]; + x6[54] = x5[54]; + x6[55] = x5[55]; + x6[56] = x5[56]; + x6[57] = x5[57]; + x6[62] = x5[62]; + x6[63] = x5[63]; + + // stage 7 + __m128i x7[64]; + x7[0] = x6[0]; + x7[1] = x6[1]; + x7[2] = x6[2]; + x7[3] = x6[3]; + btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x6[4], x6[7], x7[4], x7[7]); + btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x6[5], x6[6], x7[5], x7[6]); + x7[8] = _mm_adds_epi16(x6[8], x6[9]); + x7[9] = _mm_subs_epi16(x6[8], x6[9]); + x7[10] = _mm_subs_epi16(x6[11], x6[10]); + x7[11] = _mm_adds_epi16(x6[11], x6[10]); + x7[12] = _mm_adds_epi16(x6[12], x6[13]); + x7[13] = _mm_subs_epi16(x6[12], x6[13]); + x7[14] = _mm_subs_epi16(x6[15], x6[14]); + x7[15] = _mm_adds_epi16(x6[15], x6[14]); + x7[16] = x6[16]; + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x6[17], x6[30], x7[17], x7[30]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x6[18], x6[29], x7[18], x7[29]); + x7[19] = x6[19]; + x7[20] = x6[20]; + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x6[21], x6[26], x7[21], x7[26]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x6[22], x6[25], x7[22], x7[25]); + x7[23] = x6[23]; + x7[24] = x6[24]; + x7[27] = x6[27]; + x7[28] = x6[28]; + x7[31] = x6[31]; + x7[32] = _mm_adds_epi16(x6[32], x6[35]); + x7[35] = _mm_subs_epi16(x6[32], x6[35]); + x7[33] = _mm_adds_epi16(x6[33], x6[34]); + x7[34] = _mm_subs_epi16(x6[33], x6[34]); + x7[36] = _mm_subs_epi16(x6[39], x6[36]); + x7[39] = _mm_adds_epi16(x6[39], x6[36]); + x7[37] = _mm_subs_epi16(x6[38], x6[37]); + x7[38] = _mm_adds_epi16(x6[38], x6[37]); + x7[40] = _mm_adds_epi16(x6[40], x6[43]); + x7[43] = _mm_subs_epi16(x6[40], x6[43]); + x7[41] = _mm_adds_epi16(x6[41], x6[42]); + x7[42] = _mm_subs_epi16(x6[41], x6[42]); + x7[44] = _mm_subs_epi16(x6[47], x6[44]); + x7[47] = _mm_adds_epi16(x6[47], x6[44]); + x7[45] = _mm_subs_epi16(x6[46], x6[45]); + x7[46] = _mm_adds_epi16(x6[46], x6[45]); + x7[48] = _mm_adds_epi16(x6[48], x6[51]); + x7[51] = _mm_subs_epi16(x6[48], x6[51]); + x7[49] = _mm_adds_epi16(x6[49], x6[50]); + x7[50] = _mm_subs_epi16(x6[49], x6[50]); + x7[52] = _mm_subs_epi16(x6[55], x6[52]); + x7[55] = _mm_adds_epi16(x6[55], x6[52]); + x7[53] = _mm_subs_epi16(x6[54], x6[53]); + x7[54] = _mm_adds_epi16(x6[54], x6[53]); + x7[56] = _mm_adds_epi16(x6[56], x6[59]); + x7[59] = _mm_subs_epi16(x6[56], x6[59]); + x7[57] = _mm_adds_epi16(x6[57], x6[58]); + x7[58] = _mm_subs_epi16(x6[57], x6[58]); + x7[60] = _mm_subs_epi16(x6[63], x6[60]); + x7[63] = _mm_adds_epi16(x6[63], x6[60]); + x7[61] = _mm_subs_epi16(x6[62], x6[61]); + x7[62] 
= _mm_adds_epi16(x6[62], x6[61]); + + // stage 8 + __m128i x8[64]; + x8[0] = x7[0]; + x8[1] = x7[1]; + x8[2] = x7[2]; + x8[3] = x7[3]; + x8[4] = x7[4]; + x8[5] = x7[5]; + x8[6] = x7[6]; + x8[7] = x7[7]; + btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x7[8], x7[15], x8[8], x8[15]); + btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x7[9], x7[14], x8[9], x8[14]); + btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x7[10], x7[13], x8[10], x8[13]); + btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x7[11], x7[12], x8[11], x8[12]); + x8[16] = _mm_adds_epi16(x7[16], x7[17]); + x8[17] = _mm_subs_epi16(x7[16], x7[17]); + x8[18] = _mm_subs_epi16(x7[19], x7[18]); + x8[19] = _mm_adds_epi16(x7[19], x7[18]); + x8[20] = _mm_adds_epi16(x7[20], x7[21]); + x8[21] = _mm_subs_epi16(x7[20], x7[21]); + x8[22] = _mm_subs_epi16(x7[23], x7[22]); + x8[23] = _mm_adds_epi16(x7[23], x7[22]); + x8[24] = _mm_adds_epi16(x7[24], x7[25]); + x8[25] = _mm_subs_epi16(x7[24], x7[25]); + x8[26] = _mm_subs_epi16(x7[27], x7[26]); + x8[27] = _mm_adds_epi16(x7[27], x7[26]); + x8[28] = _mm_adds_epi16(x7[28], x7[29]); + x8[29] = _mm_subs_epi16(x7[28], x7[29]); + x8[30] = _mm_subs_epi16(x7[31], x7[30]); + x8[31] = _mm_adds_epi16(x7[31], x7[30]); + x8[32] = x7[32]; + btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x7[33], x7[62], x8[33], x8[62]); + btf_16_sse2(cospi_m60_m04, cospi_m04_p60, x7[34], x7[61], x8[34], x8[61]); + x8[35] = x7[35]; + x8[36] = x7[36]; + btf_16_sse2(cospi_m36_p28, cospi_p28_p36, x7[37], x7[58], x8[37], x8[58]); + btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x7[38], x7[57], x8[38], x8[57]); + x8[39] = x7[39]; + x8[40] = x7[40]; + btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x7[41], x7[54], x8[41], x8[54]); + btf_16_sse2(cospi_m44_m20, cospi_m20_p44, x7[42], x7[53], x8[42], x8[53]); + x8[43] = x7[43]; + x8[44] = x7[44]; + btf_16_sse2(cospi_m52_p12, cospi_p12_p52, x7[45], x7[50], x8[45], x8[50]); + btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x7[46], x7[49], x8[46], x8[49]); + x8[47] = x7[47]; + x8[48] = x7[48]; + x8[51] = x7[51]; + x8[52] = x7[52]; + x8[55] = x7[55]; + x8[56] = x7[56]; + x8[59] = x7[59]; + x8[60] = x7[60]; + x8[63] = x7[63]; + + // stage 9 + __m128i x9[64]; + x9[0] = x8[0]; + x9[1] = x8[1]; + x9[2] = x8[2]; + x9[3] = x8[3]; + x9[4] = x8[4]; + x9[5] = x8[5]; + x9[6] = x8[6]; + x9[7] = x8[7]; + x9[8] = x8[8]; + x9[9] = x8[9]; + x9[10] = x8[10]; + x9[11] = x8[11]; + x9[12] = x8[12]; + x9[13] = x8[13]; + x9[14] = x8[14]; + x9[15] = x8[15]; + btf_16_sse2(cospi_p62_p02, cospi_m02_p62, x8[16], x8[31], x9[16], x9[31]); + btf_16_sse2(cospi_p30_p34, cospi_m34_p30, x8[17], x8[30], x9[17], x9[30]); + btf_16_sse2(cospi_p46_p18, cospi_m18_p46, x8[18], x8[29], x9[18], x9[29]); + btf_16_sse2(cospi_p14_p50, cospi_m50_p14, x8[19], x8[28], x9[19], x9[28]); + btf_16_sse2(cospi_p54_p10, cospi_m10_p54, x8[20], x8[27], x9[20], x9[27]); + btf_16_sse2(cospi_p22_p42, cospi_m42_p22, x8[21], x8[26], x9[21], x9[26]); + btf_16_sse2(cospi_p38_p26, cospi_m26_p38, x8[22], x8[25], x9[22], x9[25]); + btf_16_sse2(cospi_p06_p58, cospi_m58_p06, x8[23], x8[24], x9[23], x9[24]); + x9[32] = _mm_adds_epi16(x8[32], x8[33]); + x9[33] = _mm_subs_epi16(x8[32], x8[33]); + x9[34] = _mm_subs_epi16(x8[35], x8[34]); + x9[35] = _mm_adds_epi16(x8[35], x8[34]); + x9[36] = _mm_adds_epi16(x8[36], x8[37]); + x9[37] = _mm_subs_epi16(x8[36], x8[37]); + x9[38] = _mm_subs_epi16(x8[39], x8[38]); + x9[39] = _mm_adds_epi16(x8[39], x8[38]); + x9[40] = _mm_adds_epi16(x8[40], x8[41]); + x9[41] = _mm_subs_epi16(x8[40], x8[41]); + x9[42] = _mm_subs_epi16(x8[43], x8[42]); + x9[43] = _mm_adds_epi16(x8[43], 
x8[42]); + x9[44] = _mm_adds_epi16(x8[44], x8[45]); + x9[45] = _mm_subs_epi16(x8[44], x8[45]); + x9[46] = _mm_subs_epi16(x8[47], x8[46]); + x9[47] = _mm_adds_epi16(x8[47], x8[46]); + x9[48] = _mm_adds_epi16(x8[48], x8[49]); + x9[49] = _mm_subs_epi16(x8[48], x8[49]); + x9[50] = _mm_subs_epi16(x8[51], x8[50]); + x9[51] = _mm_adds_epi16(x8[51], x8[50]); + x9[52] = _mm_adds_epi16(x8[52], x8[53]); + x9[53] = _mm_subs_epi16(x8[52], x8[53]); + x9[54] = _mm_subs_epi16(x8[55], x8[54]); + x9[55] = _mm_adds_epi16(x8[55], x8[54]); + x9[56] = _mm_adds_epi16(x8[56], x8[57]); + x9[57] = _mm_subs_epi16(x8[56], x8[57]); + x9[58] = _mm_subs_epi16(x8[59], x8[58]); + x9[59] = _mm_adds_epi16(x8[59], x8[58]); + x9[60] = _mm_adds_epi16(x8[60], x8[61]); + x9[61] = _mm_subs_epi16(x8[60], x8[61]); + x9[62] = _mm_subs_epi16(x8[63], x8[62]); + x9[63] = _mm_adds_epi16(x8[63], x8[62]); + + // stage 10 + __m128i x10[64]; + x10[0] = x9[0]; + x10[1] = x9[1]; + x10[2] = x9[2]; + x10[3] = x9[3]; + x10[4] = x9[4]; + x10[5] = x9[5]; + x10[6] = x9[6]; + x10[7] = x9[7]; + x10[8] = x9[8]; + x10[9] = x9[9]; + x10[10] = x9[10]; + x10[11] = x9[11]; + x10[12] = x9[12]; + x10[13] = x9[13]; + x10[14] = x9[14]; + x10[15] = x9[15]; + x10[16] = x9[16]; + x10[17] = x9[17]; + x10[18] = x9[18]; + x10[19] = x9[19]; + x10[20] = x9[20]; + x10[21] = x9[21]; + x10[22] = x9[22]; + x10[23] = x9[23]; + x10[24] = x9[24]; + x10[25] = x9[25]; + x10[26] = x9[26]; + x10[27] = x9[27]; + x10[28] = x9[28]; + x10[29] = x9[29]; + x10[30] = x9[30]; + x10[31] = x9[31]; + btf_16_sse2(cospi_p63_p01, cospi_m01_p63, x9[32], x9[63], x10[32], x10[63]); + btf_16_sse2(cospi_p31_p33, cospi_m33_p31, x9[33], x9[62], x10[33], x10[62]); + btf_16_sse2(cospi_p47_p17, cospi_m17_p47, x9[34], x9[61], x10[34], x10[61]); + btf_16_sse2(cospi_p15_p49, cospi_m49_p15, x9[35], x9[60], x10[35], x10[60]); + btf_16_sse2(cospi_p55_p09, cospi_m09_p55, x9[36], x9[59], x10[36], x10[59]); + btf_16_sse2(cospi_p23_p41, cospi_m41_p23, x9[37], x9[58], x10[37], x10[58]); + btf_16_sse2(cospi_p39_p25, cospi_m25_p39, x9[38], x9[57], x10[38], x10[57]); + btf_16_sse2(cospi_p07_p57, cospi_m57_p07, x9[39], x9[56], x10[39], x10[56]); + btf_16_sse2(cospi_p59_p05, cospi_m05_p59, x9[40], x9[55], x10[40], x10[55]); + btf_16_sse2(cospi_p27_p37, cospi_m37_p27, x9[41], x9[54], x10[41], x10[54]); + btf_16_sse2(cospi_p43_p21, cospi_m21_p43, x9[42], x9[53], x10[42], x10[53]); + btf_16_sse2(cospi_p11_p53, cospi_m53_p11, x9[43], x9[52], x10[43], x10[52]); + btf_16_sse2(cospi_p51_p13, cospi_m13_p51, x9[44], x9[51], x10[44], x10[51]); + btf_16_sse2(cospi_p19_p45, cospi_m45_p19, x9[45], x9[50], x10[45], x10[50]); + btf_16_sse2(cospi_p35_p29, cospi_m29_p35, x9[46], x9[49], x10[46], x10[49]); + btf_16_sse2(cospi_p03_p61, cospi_m61_p03, x9[47], x9[48], x10[47], x10[48]); + + // stage 11 + output[0] = x10[0]; + output[1] = x10[32]; + output[2] = x10[16]; + output[3] = x10[48]; + output[4] = x10[8]; + output[5] = x10[40]; + output[6] = x10[24]; + output[7] = x10[56]; + output[8] = x10[4]; + output[9] = x10[36]; + output[10] = x10[20]; + output[11] = x10[52]; + output[12] = x10[12]; + output[13] = x10[44]; + output[14] = x10[28]; + output[15] = x10[60]; + output[16] = x10[2]; + output[17] = x10[34]; + output[18] = x10[18]; + output[19] = x10[50]; + output[20] = x10[10]; + output[21] = x10[42]; + output[22] = x10[26]; + output[23] = x10[58]; + output[24] = x10[6]; + output[25] = x10[38]; + output[26] = x10[22]; + output[27] = x10[54]; + output[28] = x10[14]; + output[29] = x10[46]; + output[30] = x10[30]; + output[31] = 
x10[62]; + output[32] = x10[1]; + output[33] = x10[33]; + output[34] = x10[17]; + output[35] = x10[49]; + output[36] = x10[9]; + output[37] = x10[41]; + output[38] = x10[25]; + output[39] = x10[57]; + output[40] = x10[5]; + output[41] = x10[37]; + output[42] = x10[21]; + output[43] = x10[53]; + output[44] = x10[13]; + output[45] = x10[45]; + output[46] = x10[29]; + output[47] = x10[61]; + output[48] = x10[3]; + output[49] = x10[35]; + output[50] = x10[19]; + output[51] = x10[51]; + output[52] = x10[11]; + output[53] = x10[43]; + output[54] = x10[27]; + output[55] = x10[59]; + output[56] = x10[7]; + output[57] = x10[39]; + output[58] = x10[23]; + output[59] = x10[55]; + output[60] = x10[15]; + output[61] = x10[47]; + output[62] = x10[31]; + output[63] = x10[63]; +} + +static void fadst4x4_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *sinpi = sinpi_arr(cos_bit); + const __m128i sinpi_p01_p02 = pair_set_epi16(sinpi[1], sinpi[2]); + const __m128i sinpi_p04_m01 = pair_set_epi16(sinpi[4], -sinpi[1]); + const __m128i sinpi_p03_p04 = pair_set_epi16(sinpi[3], sinpi[4]); + const __m128i sinpi_m03_p02 = pair_set_epi16(-sinpi[3], sinpi[2]); + const __m128i sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi[3]); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + const __m128i in7 = _mm_add_epi16(input[0], input[1]); + __m128i u[8], v[8]; + + u[0] = _mm_unpacklo_epi16(input[0], input[1]); + u[1] = _mm_unpacklo_epi16(input[2], input[3]); + u[2] = _mm_unpacklo_epi16(in7, __zero); + u[3] = _mm_unpacklo_epi16(input[2], __zero); + u[4] = _mm_unpacklo_epi16(input[3], __zero); + + v[0] = _mm_madd_epi16(u[0], sinpi_p01_p02); // s0 + s2 + v[1] = _mm_madd_epi16(u[1], sinpi_p03_p04); // s4 + s5 + v[2] = _mm_madd_epi16(u[2], sinpi_p03_p03); // x1 + v[3] = _mm_madd_epi16(u[0], sinpi_p04_m01); // s1 - s3 + v[4] = _mm_madd_epi16(u[1], sinpi_m03_p02); // -s4 + s6 + v[5] = _mm_madd_epi16(u[3], sinpi_p03_p03); // s4 + v[6] = _mm_madd_epi16(u[4], sinpi_p03_p03); + + u[0] = _mm_add_epi32(v[0], v[1]); + u[1] = _mm_sub_epi32(v[2], v[6]); + u[2] = _mm_add_epi32(v[3], v[4]); + u[3] = _mm_sub_epi32(u[2], u[0]); + u[4] = _mm_slli_epi32(v[5], 2); + u[5] = _mm_sub_epi32(u[4], v[5]); + u[6] = _mm_add_epi32(u[3], u[5]); + + v[0] = _mm_add_epi32(u[0], __rounding); + v[1] = _mm_add_epi32(u[1], __rounding); + v[2] = _mm_add_epi32(u[2], __rounding); + v[3] = _mm_add_epi32(u[6], __rounding); + + u[0] = _mm_srai_epi32(v[0], cos_bit); + u[1] = _mm_srai_epi32(v[1], cos_bit); + u[2] = _mm_srai_epi32(v[2], cos_bit); + u[3] = _mm_srai_epi32(v[3], cos_bit); + + output[0] = _mm_packs_epi32(u[0], u[2]); + output[1] = _mm_packs_epi32(u[1], u[3]); + output[2] = _mm_srli_si128(output[0], 8); + output[3] = _mm_srli_si128(output[1], 8); +} + +static void fadst4x8_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); + __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); + __m128i cospi_p20_p44 = 
pair_set_epi16(cospi[20], cospi[44]); + __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); + __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); + __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); + __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); + __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); + + // stage 1 + __m128i x1[8]; + x1[0] = input[0]; + x1[1] = _mm_subs_epi16(__zero, input[7]); + x1[2] = _mm_subs_epi16(__zero, input[3]); + x1[3] = input[4]; + x1[4] = _mm_subs_epi16(__zero, input[1]); + x1[5] = input[6]; + x1[6] = input[2]; + x1[7] = _mm_subs_epi16(__zero, input[5]); + + // stage 2 + __m128i x2[8]; + x2[0] = x1[0]; + x2[1] = x1[1]; + btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x1[2], + &x1[3], &x2[2], &x2[3]); + x2[4] = x1[4]; + x2[5] = x1[5]; + btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x1[6], + &x1[7], &x2[6], &x2[7]); + + // stage 3 + __m128i x3[8]; + x3[0] = _mm_adds_epi16(x2[0], x2[2]); + x3[2] = _mm_subs_epi16(x2[0], x2[2]); + x3[1] = _mm_adds_epi16(x2[1], x2[3]); + x3[3] = _mm_subs_epi16(x2[1], x2[3]); + x3[4] = _mm_adds_epi16(x2[4], x2[6]); + x3[6] = _mm_subs_epi16(x2[4], x2[6]); + x3[5] = _mm_adds_epi16(x2[5], x2[7]); + x3[7] = _mm_subs_epi16(x2[5], x2[7]); + + // stage 4 + __m128i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_w4_sse2(&cospi_p16_p48, &cospi_p48_m16, __rounding, cos_bit, &x3[4], + &x3[5], &x4[4], &x4[5]); + btf_16_w4_sse2(&cospi_m48_p16, &cospi_p16_p48, __rounding, cos_bit, &x3[6], + &x3[7], &x4[6], &x4[7]); + + // stage 5 + __m128i x5[8]; + x5[0] = _mm_adds_epi16(x4[0], x4[4]); + x5[4] = _mm_subs_epi16(x4[0], x4[4]); + x5[1] = _mm_adds_epi16(x4[1], x4[5]); + x5[5] = _mm_subs_epi16(x4[1], x4[5]); + x5[2] = _mm_adds_epi16(x4[2], x4[6]); + x5[6] = _mm_subs_epi16(x4[2], x4[6]); + x5[3] = _mm_adds_epi16(x4[3], x4[7]); + x5[7] = _mm_subs_epi16(x4[3], x4[7]); + + // stage 6 + __m128i x6[8]; + btf_16_w4_sse2(&cospi_p04_p60, &cospi_p60_m04, __rounding, cos_bit, &x5[0], + &x5[1], &x6[0], &x6[1]); + btf_16_w4_sse2(&cospi_p20_p44, &cospi_p44_m20, __rounding, cos_bit, &x5[2], + &x5[3], &x6[2], &x6[3]); + btf_16_w4_sse2(&cospi_p36_p28, &cospi_p28_m36, __rounding, cos_bit, &x5[4], + &x5[5], &x6[4], &x6[5]); + btf_16_w4_sse2(&cospi_p52_p12, &cospi_p12_m52, __rounding, cos_bit, &x5[6], + &x5[7], &x6[6], &x6[7]); + + // stage 7 + output[0] = x6[1]; + output[1] = x6[6]; + output[2] = x6[3]; + output[3] = x6[4]; + output[4] = x6[5]; + output[5] = x6[2]; + output[6] = x6[7]; + output[7] = x6[0]; +} + +static void fadst8x4_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *sinpi = sinpi_arr(cos_bit); + const __m128i sinpi_p01_p02 = pair_set_epi16(sinpi[1], sinpi[2]); + const __m128i sinpi_p04_m01 = pair_set_epi16(sinpi[4], -sinpi[1]); + const __m128i sinpi_p03_p04 = pair_set_epi16(sinpi[3], sinpi[4]); + const __m128i sinpi_m03_p02 = pair_set_epi16(-sinpi[3], sinpi[2]); + const __m128i sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi[3]); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + const __m128i in7 = _mm_add_epi16(input[0], input[1]); + __m128i u_lo[8], u_hi[8], v_lo[8], v_hi[8]; + + u_lo[0] = _mm_unpacklo_epi16(input[0], input[1]); + u_hi[0] = _mm_unpackhi_epi16(input[0], input[1]); + u_lo[1] = _mm_unpacklo_epi16(input[2], input[3]); + u_hi[1] = _mm_unpackhi_epi16(input[2], input[3]); + u_lo[2] = _mm_unpacklo_epi16(in7, 
__zero); + u_hi[2] = _mm_unpackhi_epi16(in7, __zero); + u_lo[3] = _mm_unpacklo_epi16(input[2], __zero); + u_hi[3] = _mm_unpackhi_epi16(input[2], __zero); + u_lo[4] = _mm_unpacklo_epi16(input[3], __zero); + u_hi[4] = _mm_unpackhi_epi16(input[3], __zero); + + v_lo[0] = _mm_madd_epi16(u_lo[0], sinpi_p01_p02); // s0 + s2 + v_hi[0] = _mm_madd_epi16(u_hi[0], sinpi_p01_p02); // s0 + s2 + v_lo[1] = _mm_madd_epi16(u_lo[1], sinpi_p03_p04); // s4 + s5 + v_hi[1] = _mm_madd_epi16(u_hi[1], sinpi_p03_p04); // s4 + s5 + v_lo[2] = _mm_madd_epi16(u_lo[2], sinpi_p03_p03); // x1 + v_hi[2] = _mm_madd_epi16(u_hi[2], sinpi_p03_p03); // x1 + v_lo[3] = _mm_madd_epi16(u_lo[0], sinpi_p04_m01); // s1 - s3 + v_hi[3] = _mm_madd_epi16(u_hi[0], sinpi_p04_m01); // s1 - s3 + v_lo[4] = _mm_madd_epi16(u_lo[1], sinpi_m03_p02); // -s4 + s6 + v_hi[4] = _mm_madd_epi16(u_hi[1], sinpi_m03_p02); // -s4 + s6 + v_lo[5] = _mm_madd_epi16(u_lo[3], sinpi_p03_p03); // s4 + v_hi[5] = _mm_madd_epi16(u_hi[3], sinpi_p03_p03); // s4 + v_lo[6] = _mm_madd_epi16(u_lo[4], sinpi_p03_p03); + v_hi[6] = _mm_madd_epi16(u_hi[4], sinpi_p03_p03); + + u_lo[0] = _mm_add_epi32(v_lo[0], v_lo[1]); + u_hi[0] = _mm_add_epi32(v_hi[0], v_hi[1]); + u_lo[1] = _mm_sub_epi32(v_lo[2], v_lo[6]); + u_hi[1] = _mm_sub_epi32(v_hi[2], v_hi[6]); + u_lo[2] = _mm_add_epi32(v_lo[3], v_lo[4]); + u_hi[2] = _mm_add_epi32(v_hi[3], v_hi[4]); + u_lo[3] = _mm_sub_epi32(u_lo[2], u_lo[0]); + u_hi[3] = _mm_sub_epi32(u_hi[2], u_hi[0]); + u_lo[4] = _mm_slli_epi32(v_lo[5], 2); + u_hi[4] = _mm_slli_epi32(v_hi[5], 2); + u_lo[5] = _mm_sub_epi32(u_lo[4], v_lo[5]); + u_hi[5] = _mm_sub_epi32(u_hi[4], v_hi[5]); + u_lo[6] = _mm_add_epi32(u_lo[3], u_lo[5]); + u_hi[6] = _mm_add_epi32(u_hi[3], u_hi[5]); + + v_lo[0] = _mm_add_epi32(u_lo[0], __rounding); + v_hi[0] = _mm_add_epi32(u_hi[0], __rounding); + v_lo[1] = _mm_add_epi32(u_lo[1], __rounding); + v_hi[1] = _mm_add_epi32(u_hi[1], __rounding); + v_lo[2] = _mm_add_epi32(u_lo[2], __rounding); + v_hi[2] = _mm_add_epi32(u_hi[2], __rounding); + v_lo[3] = _mm_add_epi32(u_lo[6], __rounding); + v_hi[3] = _mm_add_epi32(u_hi[6], __rounding); + + u_lo[0] = _mm_srai_epi32(v_lo[0], cos_bit); + u_hi[0] = _mm_srai_epi32(v_hi[0], cos_bit); + u_lo[1] = _mm_srai_epi32(v_lo[1], cos_bit); + u_hi[1] = _mm_srai_epi32(v_hi[1], cos_bit); + u_lo[2] = _mm_srai_epi32(v_lo[2], cos_bit); + u_hi[2] = _mm_srai_epi32(v_hi[2], cos_bit); + u_lo[3] = _mm_srai_epi32(v_lo[3], cos_bit); + u_hi[3] = _mm_srai_epi32(v_hi[3], cos_bit); + + output[0] = _mm_packs_epi32(u_lo[0], u_hi[0]); + output[1] = _mm_packs_epi32(u_lo[1], u_hi[1]); + output[2] = _mm_packs_epi32(u_lo[2], u_hi[2]); + output[3] = _mm_packs_epi32(u_lo[3], u_hi[3]); +} + +static void fadst8x16_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + __m128i cospi_m56_p08 = 
pair_set_epi16(-cospi[56], cospi[8]); + __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]); + __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]); + __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]); + __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]); + __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]); + __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]); + __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]); + __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]); + __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]); + __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]); + __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]); + __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]); + __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]); + __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]); + __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]); + __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]); + __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]); + + // stage 1 + __m128i x1[16]; + x1[0] = input[0]; + x1[1] = _mm_subs_epi16(__zero, input[15]); + x1[2] = _mm_subs_epi16(__zero, input[7]); + x1[3] = input[8]; + x1[4] = _mm_subs_epi16(__zero, input[3]); + x1[5] = input[12]; + x1[6] = input[4]; + x1[7] = _mm_subs_epi16(__zero, input[11]); + x1[8] = _mm_subs_epi16(__zero, input[1]); + x1[9] = input[14]; + x1[10] = input[6]; + x1[11] = _mm_subs_epi16(__zero, input[9]); + x1[12] = input[2]; + x1[13] = _mm_subs_epi16(__zero, input[13]); + x1[14] = _mm_subs_epi16(__zero, input[5]); + x1[15] = input[10]; + + // stage 2 + __m128i x2[16]; + x2[0] = x1[0]; + x2[1] = x1[1]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x2[2], x2[3]); + x2[4] = x1[4]; + x2[5] = x1[5]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x2[6], x2[7]); + x2[8] = x1[8]; + x2[9] = x1[9]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[10], x1[11], x2[10], x2[11]); + x2[12] = x1[12]; + x2[13] = x1[13]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[14], x1[15], x2[14], x2[15]); + + // stage 3 + __m128i x3[16]; + x3[0] = _mm_adds_epi16(x2[0], x2[2]); + x3[2] = _mm_subs_epi16(x2[0], x2[2]); + x3[1] = _mm_adds_epi16(x2[1], x2[3]); + x3[3] = _mm_subs_epi16(x2[1], x2[3]); + x3[4] = _mm_adds_epi16(x2[4], x2[6]); + x3[6] = _mm_subs_epi16(x2[4], x2[6]); + x3[5] = _mm_adds_epi16(x2[5], x2[7]); + x3[7] = _mm_subs_epi16(x2[5], x2[7]); + x3[8] = _mm_adds_epi16(x2[8], x2[10]); + x3[10] = _mm_subs_epi16(x2[8], x2[10]); + x3[9] = _mm_adds_epi16(x2[9], x2[11]); + x3[11] = _mm_subs_epi16(x2[9], x2[11]); + x3[12] = _mm_adds_epi16(x2[12], x2[14]); + x3[14] = _mm_subs_epi16(x2[12], x2[14]); + x3[13] = _mm_adds_epi16(x2[13], x2[15]); + x3[15] = _mm_subs_epi16(x2[13], x2[15]); + + // stage 4 + __m128i x4[16]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[4], x3[5], x4[4], x4[5]); + btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[6], x3[7], x4[6], x4[7]); + x4[8] = x3[8]; + x4[9] = x3[9]; + x4[10] = x3[10]; + x4[11] = x3[11]; + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[12], x3[13], x4[12], x4[13]); + btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[14], x3[15], x4[14], x4[15]); + + // stage 5 + __m128i x5[16]; + x5[0] = _mm_adds_epi16(x4[0], x4[4]); + x5[4] = _mm_subs_epi16(x4[0], x4[4]); + x5[1] = _mm_adds_epi16(x4[1], x4[5]); + x5[5] = _mm_subs_epi16(x4[1], x4[5]); + x5[2] = _mm_adds_epi16(x4[2], 
x4[6]); + x5[6] = _mm_subs_epi16(x4[2], x4[6]); + x5[3] = _mm_adds_epi16(x4[3], x4[7]); + x5[7] = _mm_subs_epi16(x4[3], x4[7]); + x5[8] = _mm_adds_epi16(x4[8], x4[12]); + x5[12] = _mm_subs_epi16(x4[8], x4[12]); + x5[9] = _mm_adds_epi16(x4[9], x4[13]); + x5[13] = _mm_subs_epi16(x4[9], x4[13]); + x5[10] = _mm_adds_epi16(x4[10], x4[14]); + x5[14] = _mm_subs_epi16(x4[10], x4[14]); + x5[11] = _mm_adds_epi16(x4[11], x4[15]); + x5[15] = _mm_subs_epi16(x4[11], x4[15]); + + // stage 6 + __m128i x6[16]; + x6[0] = x5[0]; + x6[1] = x5[1]; + x6[2] = x5[2]; + x6[3] = x5[3]; + x6[4] = x5[4]; + x6[5] = x5[5]; + x6[6] = x5[6]; + x6[7] = x5[7]; + btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x5[8], x5[9], x6[8], x6[9]); + btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x5[10], x5[11], x6[10], x6[11]); + btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x5[12], x5[13], x6[12], x6[13]); + btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x5[14], x5[15], x6[14], x6[15]); + + // stage 7 + __m128i x7[16]; + x7[0] = _mm_adds_epi16(x6[0], x6[8]); + x7[8] = _mm_subs_epi16(x6[0], x6[8]); + x7[1] = _mm_adds_epi16(x6[1], x6[9]); + x7[9] = _mm_subs_epi16(x6[1], x6[9]); + x7[2] = _mm_adds_epi16(x6[2], x6[10]); + x7[10] = _mm_subs_epi16(x6[2], x6[10]); + x7[3] = _mm_adds_epi16(x6[3], x6[11]); + x7[11] = _mm_subs_epi16(x6[3], x6[11]); + x7[4] = _mm_adds_epi16(x6[4], x6[12]); + x7[12] = _mm_subs_epi16(x6[4], x6[12]); + x7[5] = _mm_adds_epi16(x6[5], x6[13]); + x7[13] = _mm_subs_epi16(x6[5], x6[13]); + x7[6] = _mm_adds_epi16(x6[6], x6[14]); + x7[14] = _mm_subs_epi16(x6[6], x6[14]); + x7[7] = _mm_adds_epi16(x6[7], x6[15]); + x7[15] = _mm_subs_epi16(x6[7], x6[15]); + + // stage 8 + __m128i x8[16]; + btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x7[0], x7[1], x8[0], x8[1]); + btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x7[2], x7[3], x8[2], x8[3]); + btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x7[4], x7[5], x8[4], x8[5]); + btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x7[6], x7[7], x8[6], x8[7]); + btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x7[8], x7[9], x8[8], x8[9]); + btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x7[10], x7[11], x8[10], x8[11]); + btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x7[12], x7[13], x8[12], x8[13]); + btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x7[14], x7[15], x8[14], x8[15]); + + // stage 9 + output[0] = x8[1]; + output[1] = x8[14]; + output[2] = x8[3]; + output[3] = x8[12]; + output[4] = x8[5]; + output[5] = x8[10]; + output[6] = x8[7]; + output[7] = x8[8]; + output[8] = x8[9]; + output[9] = x8[6]; + output[10] = x8[11]; + output[11] = x8[4]; + output[12] = x8[13]; + output[13] = x8[2]; + output[14] = x8[15]; + output[15] = x8[0]; +} + +static const transform_1d_sse2 col_txfm4x4_arr[TX_TYPES] = { + fdct4x4_new_sse2, // DCT_DCT + fadst4x4_new_sse2, // ADST_DCT + fdct4x4_new_sse2, // DCT_ADST + fadst4x4_new_sse2, // ADST_ADST + fadst4x4_new_sse2, // FLIPADST_DCT + fdct4x4_new_sse2, // DCT_FLIPADST + fadst4x4_new_sse2, // FLIPADST_FLIPADST + fadst4x4_new_sse2, // ADST_FLIPADST + fadst4x4_new_sse2, // FLIPADST_ADST + fidentity4x4_new_sse2, // IDTX + fdct4x4_new_sse2, // V_DCT + fidentity4x4_new_sse2, // H_DCT + fadst4x4_new_sse2, // V_ADST + fidentity4x4_new_sse2, // H_ADST + fadst4x4_new_sse2, // V_FLIPADST + fidentity4x4_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm4x4_arr[TX_TYPES] = { + fdct4x4_new_sse2, // DCT_DCT + fdct4x4_new_sse2, // ADST_DCT + fadst4x4_new_sse2, // DCT_ADST + fadst4x4_new_sse2, // ADST_ADST + fdct4x4_new_sse2, // FLIPADST_DCT + fadst4x4_new_sse2, // DCT_FLIPADST + fadst4x4_new_sse2, // 
FLIPADST_FLIPADST + fadst4x4_new_sse2, // ADST_FLIPADST + fadst4x4_new_sse2, // FLIPADST_ADST + fidentity4x4_new_sse2, // IDTX + fidentity4x4_new_sse2, // V_DCT + fdct4x4_new_sse2, // H_DCT + fidentity4x4_new_sse2, // V_ADST + fadst4x4_new_sse2, // H_ADST + fidentity4x4_new_sse2, // V_FLIPADST + fadst4x4_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 col_txfm4x8_arr[TX_TYPES] = { + fdct4x8_new_sse2, // DCT_DCT + fadst4x8_new_sse2, // ADST_DCT + fdct4x8_new_sse2, // DCT_ADST + fadst4x8_new_sse2, // ADST_ADST + fadst4x8_new_sse2, // FLIPADST_DCT + fdct4x8_new_sse2, // DCT_FLIPADST + fadst4x8_new_sse2, // FLIPADST_FLIPADST + fadst4x8_new_sse2, // ADST_FLIPADST + fadst4x8_new_sse2, // FLIPADST_ADST + fidentity8x8_new_sse2, // IDTX + fdct4x8_new_sse2, // V_DCT + fidentity8x8_new_sse2, // H_DCT + fadst4x8_new_sse2, // V_ADST + fidentity8x8_new_sse2, // H_ADST + fadst4x8_new_sse2, // V_FLIPADST + fidentity8x8_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm8x4_arr[TX_TYPES] = { + fdct8x4_new_sse2, // DCT_DCT + fdct8x4_new_sse2, // ADST_DCT + fadst8x4_new_sse2, // DCT_ADST + fadst8x4_new_sse2, // ADST_ADST + fdct8x4_new_sse2, // FLIPADST_DCT + fadst8x4_new_sse2, // DCT_FLIPADST + fadst8x4_new_sse2, // FLIPADST_FLIPADST + fadst8x4_new_sse2, // ADST_FLIPADST + fadst8x4_new_sse2, // FLIPADST_ADST + fidentity8x4_new_sse2, // IDTX + fidentity8x4_new_sse2, // V_DCT + fdct8x4_new_sse2, // H_DCT + fidentity8x4_new_sse2, // V_ADST + fadst8x4_new_sse2, // H_ADST + fidentity8x4_new_sse2, // V_FLIPADST + fadst8x4_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 col_txfm8x4_arr[TX_TYPES] = { + fdct8x4_new_sse2, // DCT_DCT + fadst8x4_new_sse2, // ADST_DCT + fdct8x4_new_sse2, // DCT_ADST + fadst8x4_new_sse2, // ADST_ADST + fadst8x4_new_sse2, // FLIPADST_DCT + fdct8x4_new_sse2, // DCT_FLIPADST + fadst8x4_new_sse2, // FLIPADST_FLIPADST + fadst8x4_new_sse2, // ADST_FLIPADST + fadst8x4_new_sse2, // FLIPADST_ADST + fidentity8x4_new_sse2, // IDTX + fdct8x4_new_sse2, // V_DCT + fidentity8x4_new_sse2, // H_DCT + fadst8x4_new_sse2, // V_ADST + fidentity8x4_new_sse2, // H_ADST + fadst8x4_new_sse2, // V_FLIPADST + fidentity8x4_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm4x8_arr[TX_TYPES] = { + fdct4x8_new_sse2, // DCT_DCT + fdct4x8_new_sse2, // ADST_DCT + fadst4x8_new_sse2, // DCT_ADST + fadst4x8_new_sse2, // ADST_ADST + fdct4x8_new_sse2, // FLIPADST_DCT + fadst4x8_new_sse2, // DCT_FLIPADST + fadst4x8_new_sse2, // FLIPADST_FLIPADST + fadst4x8_new_sse2, // ADST_FLIPADST + fadst4x8_new_sse2, // FLIPADST_ADST + fidentity8x8_new_sse2, // IDTX + fidentity8x8_new_sse2, // V_DCT + fdct4x8_new_sse2, // H_DCT + fidentity8x8_new_sse2, // V_ADST + fadst4x8_new_sse2, // H_ADST + fidentity8x8_new_sse2, // V_FLIPADST + fadst4x8_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 col_txfm8x8_arr[TX_TYPES] = { + fdct8x8_new_sse2, // DCT_DCT + fadst8x8_new_sse2, // ADST_DCT + fdct8x8_new_sse2, // DCT_ADST + fadst8x8_new_sse2, // ADST_ADST + fadst8x8_new_sse2, // FLIPADST_DCT + fdct8x8_new_sse2, // DCT_FLIPADST + fadst8x8_new_sse2, // FLIPADST_FLIPADST + fadst8x8_new_sse2, // ADST_FLIPADST + fadst8x8_new_sse2, // FLIPADST_ADST + fidentity8x8_new_sse2, // IDTX + fdct8x8_new_sse2, // V_DCT + fidentity8x8_new_sse2, // H_DCT + fadst8x8_new_sse2, // V_ADST + fidentity8x8_new_sse2, // H_ADST + fadst8x8_new_sse2, // V_FLIPADST + fidentity8x8_new_sse2, // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm8x8_arr[TX_TYPES] = { + fdct8x8_new_sse2, // DCT_DCT + 
fdct8x8_new_sse2, // ADST_DCT + fadst8x8_new_sse2, // DCT_ADST + fadst8x8_new_sse2, // ADST_ADST + fdct8x8_new_sse2, // FLIPADST_DCT + fadst8x8_new_sse2, // DCT_FLIPADST + fadst8x8_new_sse2, // FLIPADST_FLIPADST + fadst8x8_new_sse2, // ADST_FLIPADST + fadst8x8_new_sse2, // FLIPADST_ADST + fidentity8x8_new_sse2, // IDTX + fidentity8x8_new_sse2, // V_DCT + fdct8x8_new_sse2, // H_DCT + fidentity8x8_new_sse2, // V_ADST + fadst8x8_new_sse2, // H_ADST + fidentity8x8_new_sse2, // V_FLIPADST + fadst8x8_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 col_txfm8x16_arr[TX_TYPES] = { + fdct8x16_new_sse2, // DCT_DCT + fadst8x16_new_sse2, // ADST_DCT + fdct8x16_new_sse2, // DCT_ADST + fadst8x16_new_sse2, // ADST_ADST + fadst8x16_new_sse2, // FLIPADST_DCT + fdct8x16_new_sse2, // DCT_FLIPADST + fadst8x16_new_sse2, // FLIPADST_FLIPADST + fadst8x16_new_sse2, // ADST_FLIPADST + fadst8x16_new_sse2, // FLIPADST_ADST + fidentity8x16_new_sse2, // IDTX + fdct8x16_new_sse2, // V_DCT + fidentity8x16_new_sse2, // H_DCT + fadst8x16_new_sse2, // V_ADST + fidentity8x16_new_sse2, // H_ADST + fadst8x16_new_sse2, // V_FLIPADST + fidentity8x16_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm8x16_arr[TX_TYPES] = { + fdct8x16_new_sse2, // DCT_DCT + fdct8x16_new_sse2, // ADST_DCT + fadst8x16_new_sse2, // DCT_ADST + fadst8x16_new_sse2, // ADST_ADST + fdct8x16_new_sse2, // FLIPADST_DCT + fadst8x16_new_sse2, // DCT_FLIPADST + fadst8x16_new_sse2, // FLIPADST_FLIPADST + fadst8x16_new_sse2, // ADST_FLIPADST + fadst8x16_new_sse2, // FLIPADST_ADST + fidentity8x16_new_sse2, // IDTX + fidentity8x16_new_sse2, // V_DCT + fdct8x16_new_sse2, // H_DCT + fidentity8x16_new_sse2, // V_ADST + fadst8x16_new_sse2, // H_ADST + fidentity8x16_new_sse2, // V_FLIPADST + fadst8x16_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm8x32_arr[TX_TYPES] = { + av1_fdct8x32_new_sse2, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + fidentity8x32_new_sse2, // IDTX + fidentity8x32_new_sse2, // V_DCT + av1_fdct8x32_new_sse2, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[4], buf1[4], *buf; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X4]; + const int txw_idx = get_txw_idx(TX_4X4); + const int txh_idx = get_txh_idx(TX_4X4); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 4; + const int height = 4; + const transform_1d_sse2 col_txfm = col_txfm4x4_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm4x4_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit_w4(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_4x4(buf0, buf1); + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + store_buffer_16bit_to_32bit_w4(buf, output, height, width); +} + +void 
av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)stride; + (void)bd; + __m128i buf0[8], buf1[8], *buf; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X8]; + const int txw_idx = get_txw_idx(TX_4X8); + const int txh_idx = get_txh_idx(TX_4X8); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 4; + const int height = 8; + const transform_1d_sse2 col_txfm = col_txfm4x8_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x4_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit_w4(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_4x8(buf0, buf1); + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + store_rect_buffer_16bit_to_32bit_w8(buf, output, height, width); +} + +void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[16]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X16]; + const int txw_idx = get_txw_idx(TX_4X16); + const int txh_idx = get_txh_idx(TX_4X16); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 4; + const int height = 16; + const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x4_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit_w4(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_4x8(buf0, buf1); + transpose_16bit_4x8(buf0 + 8, buf1 + 8); + + for (int i = 0; i < 2; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + 8 * i, buf, width); + } else { + buf = buf1 + 8 * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width); + } +} + +void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[8], buf1[8], *buf; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X4]; + const int txw_idx = get_txw_idx(TX_8X4); + const int txh_idx = get_txh_idx(TX_8X4); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 8; + const int height = 4; + const transform_1d_sse2 col_txfm = col_txfm8x4_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm4x8_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) + load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); + else + load_buffer_16bit_to_16bit(input, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, 
height, shift[1]); + transpose_16bit_8x8(buf0, buf1); + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + store_rect_buffer_16bit_to_32bit_w4(buf, output, height, width); +} + +void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[8], buf1[8], *buf; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8]; + const int txw_idx = get_txw_idx(TX_8X8); + const int txh_idx = get_txh_idx(TX_8X8); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 8; + const int height = 8; + const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) + load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); + else + load_buffer_16bit_to_16bit(input, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1); + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + store_buffer_16bit_to_32bit_w8(buf, output, height, width); +} + +void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[16]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16]; + const int txw_idx = get_txw_idx(TX_8X16); + const int txh_idx = get_txh_idx(TX_8X16); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 8; + const int height = 16; + const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1); + transpose_16bit_8x8(buf0 + 8, buf1 + 8); + + for (int i = 0; i < 2; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width); + } +} + +void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[32], buf1[32]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X32]; + const int txw_idx = get_txw_idx(TX_8X32); + const int txh_idx = get_txh_idx(TX_8X32); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 8; + const int height = 32; + const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, 
&ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1); + transpose_16bit_8x8(buf0 + 8, buf1 + 8); + transpose_16bit_8x8(buf0 + 16, buf1 + 16); + transpose_16bit_8x8(buf0 + 24, buf1 + 24); + + for (int i = 0; i < 4; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width); + } +} + +void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[16]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X4]; + const int txw_idx = get_txw_idx(TX_16X4); + const int txh_idx = get_txh_idx(TX_16X4); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 16; + const int height = 4; + const transform_1d_sse2 col_txfm = col_txfm8x4_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type]; + __m128i *buf; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x4(buf0, buf1 + 8 * i); + } + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + store_buffer_16bit_to_32bit_w4(buf, output, height, width); +} + +void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[16]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8]; + const int txw_idx = get_txw_idx(TX_16X8); + const int txh_idx = get_txh_idx(TX_16X8); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 16; + const int height = 8; + const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type]; + __m128i *buf; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1 + 8 * i); + } + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + store_rect_buffer_16bit_to_32bit_w8(buf, output, height, width); +} + +void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + 
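// The 16x16 case runs in two 8-lane passes per direction: each 8-column
+ // half is column-transformed and transposed into buf1, then each 8-row
+ // half of buf1 is row-transformed and stored.
+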
(void)bd; + __m128i buf0[16], buf1[32]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X16]; + const int txw_idx = get_txw_idx(TX_16X16); + const int txh_idx = get_txh_idx(TX_16X16); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 16; + const int height = 16; + const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i); + transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i); + } + + for (int i = 0; i < 2; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width); + } +} + +void av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[32], buf1[64]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X32]; + const int txw_idx = get_txw_idx(TX_16X32); + const int txh_idx = get_txh_idx(TX_16X32); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 16; + const int height = 32; + const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type]; + + if (col_txfm != NULL && row_txfm != NULL) { + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0 + 0 * 8, buf1 + 0 * width + 8 * i); + transpose_16bit_8x8(buf0 + 1 * 8, buf1 + 1 * width + 8 * i); + transpose_16bit_8x8(buf0 + 2 * 8, buf1 + 2 * width + 8 * i); + transpose_16bit_8x8(buf0 + 3 * 8, buf1 + 3 * width + 8 * i); + } + + for (int i = 0; i < 4; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width); + } + } else { + av1_fwd_txfm2d_16x32_c(input, output, stride, tx_type, bd); + } +} + +void av1_lowbd_fwd_txfm2d_32x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[32], buf1[32]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X8]; + const int txw_idx = get_txw_idx(TX_32X8); + const int txh_idx = get_txh_idx(TX_32X8); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 32; + const int height = 8; + const 
transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type];
+
+ if (col_txfm != NULL && row_txfm != NULL) {
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < 4; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
+ }
+
+ for (int i = 0; i < 1; i++) {
+ __m128i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width);
+ }
+ } else {
+ av1_fwd_txfm2d_32x8_c(input, output, stride, tx_type, bd);
+ }
+}
+
+void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[32], buf1[64];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16];
+ const int txw_idx = get_txw_idx(TX_32X16);
+ const int txh_idx = get_txh_idx(TX_32X16);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 32;
+ const int height = 16;
+ const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type];
+
+ if (col_txfm != NULL && row_txfm != NULL) {
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < 4; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
+ transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i);
+ }
+
+ for (int i = 0; i < 2; i++) {
+ __m128i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width);
+ }
+ } else {
+ av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd);
+ }
+}
+
+void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[32], buf1[128];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X32];
+ const int txw_idx = get_txw_idx(TX_32X32);
+ const int txh_idx = get_txh_idx(TX_32X32);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 32;
+ const int height = 32;
+ const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type];
+
+ if (col_txfm != NULL && row_txfm != NULL) {
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < 4; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+
} + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0 + 0 * 8, buf1 + 0 * width + 8 * i); + transpose_16bit_8x8(buf0 + 1 * 8, buf1 + 1 * width + 8 * i); + transpose_16bit_8x8(buf0 + 2 * 8, buf1 + 2 * width + 8 * i); + transpose_16bit_8x8(buf0 + 3 * 8, buf1 + 3 * width + 8 * i); + } + + for (int i = 0; i < 4; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width); + } + } else { + av1_fwd_txfm2d_32x32_c(input, output, stride, tx_type, bd); + } +} + +void av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_64X16; + __m128i buf0[64], buf1[128]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_sse2 col_txfm = fdct8x16_new_sse2; + const transform_1d_sse2 row_txfm = av1_fdct8x64_new_sse2; + const int width_div8 = (width >> 3); + const int height_div8 = (height >> 3); + + for (int i = 0; i < width_div8; i++) { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + for (int j = 0; j < height_div8; ++j) { + transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); + } + } + + for (int i = 0; i < height_div8; i++) { + __m128i *buf = buf1 + width * i; + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, 16, 32); + } + // Zero out the bottom 16x32 area. 
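+ // Only the lowest 32 of the 64 row-transform outputs are stored above, as
+ // AV1 codes only those coefficients, so the valid data is the top 16x32
+ // region of the output buffer.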
+ memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output)); +} + +void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_16X64; + __m128i buf0[64], buf1[128]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_sse2 col_txfm = av1_fdct8x64_new_sse2; + const transform_1d_sse2 row_txfm = fdct8x16_new_sse2; + const int width_div8 = (width >> 3); + const int height_div8 = (height >> 3); + + for (int i = 0; i < width_div8; i++) { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + for (int j = 0; j < height_div8; ++j) { + transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); + } + } + + for (int i = 0; i < AOMMIN(4, height_div8); i++) { + __m128i *buf = buf1 + width * i; + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, 32, 16); + } +} + +static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = { + av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform + av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform + av1_lowbd_fwd_txfm2d_16x16_sse2, // 16x16 transform + av1_lowbd_fwd_txfm2d_32x32_sse2, // 32x32 transform + NULL, // 64x64 transform + av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform + av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform + av1_lowbd_fwd_txfm2d_8x16_sse2, // 8x16 transform + av1_lowbd_fwd_txfm2d_16x8_sse2, // 16x8 transform + av1_lowbd_fwd_txfm2d_16x32_sse2, // 16x32 transform + av1_lowbd_fwd_txfm2d_32x16_sse2, // 32x16 transform + NULL, // 32x64 transform + NULL, // 64x32 transform + av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform + av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform + av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform + av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform + av1_lowbd_fwd_txfm2d_16x64_sse2, // 16x64 transform + av1_lowbd_fwd_txfm2d_64x16_sse2, // 64x16 transform +}; + +void av1_lowbd_fwd_txfm_sse2(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size]; + + if ((fwd_txfm2d_func == NULL) || + (txfm_param->lossless && txfm_param->tx_size == TX_4X4)) + av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param); + else + fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h new file mode 100644 index 0000000000..3cb869a8fe --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_
+#define AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_fdct8x32_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit);
+void av1_fdct8x64_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit);
+
+static INLINE void fidentity4x4_new_sse2(const __m128i *const input,
+ __m128i *const output,
+ const int8_t cos_bit) {
+ (void)cos_bit;
+ const __m128i one = _mm_set1_epi16(1);
+
+ for (int i = 0; i < 4; ++i) {
+ const __m128i a = _mm_unpacklo_epi16(input[i], one);
+ const __m128i b = scale_round_sse2(a, NewSqrt2);
+ output[i] = _mm_packs_epi32(b, b);
+ }
+}
+
+static INLINE void fidentity8x4_new_sse2(const __m128i *const input,
+ __m128i *const output,
+ const int8_t cos_bit) {
+ (void)cos_bit;
+ const __m128i one = _mm_set1_epi16(1);
+
+ for (int i = 0; i < 4; ++i) {
+ const __m128i a_lo = _mm_unpacklo_epi16(input[i], one);
+ const __m128i a_hi = _mm_unpackhi_epi16(input[i], one);
+ const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2);
+ const __m128i b_hi = scale_round_sse2(a_hi, NewSqrt2);
+ output[i] = _mm_packs_epi32(b_lo, b_hi);
+ }
+}
+
+static INLINE void fidentity8x8_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+
+ output[0] = _mm_adds_epi16(input[0], input[0]);
+ output[1] = _mm_adds_epi16(input[1], input[1]);
+ output[2] = _mm_adds_epi16(input[2], input[2]);
+ output[3] = _mm_adds_epi16(input[3], input[3]);
+ output[4] = _mm_adds_epi16(input[4], input[4]);
+ output[5] = _mm_adds_epi16(input[5], input[5]);
+ output[6] = _mm_adds_epi16(input[6], input[6]);
+ output[7] = _mm_adds_epi16(input[7], input[7]);
+}
+
+static INLINE void fdct8x8_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+
+ // stage 1
+ __m128i x1[8];
+ x1[0] = _mm_adds_epi16(input[0], input[7]);
+ x1[7] = _mm_subs_epi16(input[0], input[7]);
+ x1[1] = _mm_adds_epi16(input[1], input[6]);
+ x1[6] = _mm_subs_epi16(input[1], input[6]);
+ x1[2] = _mm_adds_epi16(input[2], input[5]);
+ x1[5] = _mm_subs_epi16(input[2], input[5]);
+ x1[3] = _mm_adds_epi16(input[3], input[4]);
+ x1[4] = _mm_subs_epi16(input[3], input[4]);
+
+ // stage 2
+ __m128i x2[8];
+ x2[0] = _mm_adds_epi16(x1[0], x1[3]);
+ x2[3] = _mm_subs_epi16(x1[0], x1[3]);
+ x2[1] = _mm_adds_epi16(x1[1], x1[2]);
+ x2[2] = _mm_subs_epi16(x1[1], x1[2]);
+ x2[4] = x1[4];
+ btf_16_sse2(cospi_m32_p32,
cospi_p32_p32, x1[5], x1[6], x2[5], x2[6]); + x2[7] = x1[7]; + + // stage 3 + __m128i x3[8]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x2[0], x2[1], x3[0], x3[1]); + btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x2[2], x2[3], x3[2], x3[3]); + x3[4] = _mm_adds_epi16(x2[4], x2[5]); + x3[5] = _mm_subs_epi16(x2[4], x2[5]); + x3[6] = _mm_subs_epi16(x2[7], x2[6]); + x3[7] = _mm_adds_epi16(x2[7], x2[6]); + + // stage 4 and 5 + output[0] = x3[0]; + output[4] = x3[1]; + output[2] = x3[2]; + output[6] = x3[3]; + btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x3[4], x3[7], output[1], output[7]); + btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x3[5], x3[6], output[5], output[3]); +} + +static INLINE void fadst8x8_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); + const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); + const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); + const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); + const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); + const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); + const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); + const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); + + // stage 1 + __m128i x1[8]; + x1[0] = input[0]; + x1[1] = _mm_subs_epi16(__zero, input[7]); + x1[2] = _mm_subs_epi16(__zero, input[3]); + x1[3] = input[4]; + x1[4] = _mm_subs_epi16(__zero, input[1]); + x1[5] = input[6]; + x1[6] = input[2]; + x1[7] = _mm_subs_epi16(__zero, input[5]); + + // stage 2 + __m128i x2[8]; + x2[0] = x1[0]; + x2[1] = x1[1]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x2[2], x2[3]); + x2[4] = x1[4]; + x2[5] = x1[5]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x2[6], x2[7]); + + // stage 3 + __m128i x3[8]; + x3[0] = _mm_adds_epi16(x2[0], x2[2]); + x3[2] = _mm_subs_epi16(x2[0], x2[2]); + x3[1] = _mm_adds_epi16(x2[1], x2[3]); + x3[3] = _mm_subs_epi16(x2[1], x2[3]); + x3[4] = _mm_adds_epi16(x2[4], x2[6]); + x3[6] = _mm_subs_epi16(x2[4], x2[6]); + x3[5] = _mm_adds_epi16(x2[5], x2[7]); + x3[7] = _mm_subs_epi16(x2[5], x2[7]); + + // stage 4 + __m128i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[4], x3[5], x4[4], x4[5]); + btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[6], x3[7], x4[6], x4[7]); + + // stage 5, 6 and 7 + output[7] = _mm_adds_epi16(x4[0], x4[4]); + output[3] = _mm_subs_epi16(x4[0], x4[4]); + output[0] = _mm_adds_epi16(x4[1], x4[5]); + output[4] = _mm_subs_epi16(x4[1], x4[5]); + output[5] = _mm_adds_epi16(x4[2], x4[6]); + output[1] = _mm_subs_epi16(x4[2], x4[6]); + output[2] = _mm_adds_epi16(x4[3], x4[7]); + output[6] = _mm_subs_epi16(x4[3], x4[7]); + + btf_16_sse2(cospi_p04_p60, cospi_p60_m04, output[7], output[0], output[7], + output[0]); + btf_16_sse2(cospi_p20_p44, cospi_p44_m20, output[5], output[2], output[5], + output[2]); + 
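+  // Illustrative note: btf_16_sse2(w0, w1, in0, in1, out0, out1) is the
+  // 16-bit butterfly macro these transforms build on. Assuming w0 packs the
+  // pair (a, b) and w1 the pair (c, d) via pair_set_epi16(), each lane is,
+  // in scalar form:
+  //
+  //   out0 = (in0 * a + in1 * b + (1 << (cos_bit - 1))) >> cos_bit;
+  //   out1 = (in0 * c + in1 * d + (1 << (cos_bit - 1))) >> cos_bit;
+  //
+  // i.e. a fixed-point rotation by the cospi constants, which is why
+  // __rounding is materialized above.
+  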
btf_16_sse2(cospi_p36_p28, cospi_p28_m36, output[3], output[4], output[3], + output[4]); + btf_16_sse2(cospi_p52_p12, cospi_p12_m52, output[1], output[6], output[1], + output[6]); +} + +static INLINE void fidentity8x16_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const __m128i one = _mm_set1_epi16(1); + + for (int i = 0; i < 16; ++i) { + const __m128i a_lo = _mm_unpacklo_epi16(input[i], one); + const __m128i a_hi = _mm_unpackhi_epi16(input[i], one); + const __m128i b_lo = scale_round_sse2(a_lo, 2 * NewSqrt2); + const __m128i b_hi = scale_round_sse2(a_hi, 2 * NewSqrt2); + output[i] = _mm_packs_epi32(b_lo, b_hi); + } +} + +static INLINE void fidentity8x32_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + for (int i = 0; i < 32; ++i) { + output[i] = _mm_slli_epi16(input[i], 2); + } +} + +static const transform_1d_sse2 col_txfm8x32_arr[TX_TYPES] = { + av1_fdct8x32_new_sse2, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + fidentity8x32_new_sse2, // IDTX + av1_fdct8x32_new_sse2, // V_DCT + fidentity8x32_new_sse2, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +#ifdef __cplusplus +} +#endif + +#endif // AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_ diff --git a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c new file mode 100644 index 0000000000..b58911fcb2 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i dc = _mm_unpacklo_epi16(*p, zero);
+  const __m128i ac = _mm_unpackhi_epi16(*p, zero);
+  *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1);
+}
+
+static INLINE void update_qp(__m256i *qp) {
+  qp[0] = _mm256_permute2x128_si256(qp[0], qp[0], 0x11);
+  qp[1] = _mm256_permute2x128_si256(qp[1], qp[1], 0x11);
+  qp[2] = _mm256_permute2x128_si256(qp[2], qp[2], 0x11);
+}
+
+static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr,
+                           const int16_t *dequant_ptr, int log_scale,
+                           __m256i *qp) {
+  __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
+  if (log_scale) {
+    const __m128i round_scale = _mm_set1_epi16(1 << (15 - log_scale));
+    round = _mm_mulhrs_epi16(round, round_scale);
+  }
+  const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
+  const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
+
+  init_one_qp(&round, &qp[0]);
+  init_one_qp(&quant, &qp[1]);
+  init_one_qp(&dequant, &qp[2]);
+}
+
+static INLINE void quantize(const __m256i *qp, __m256i *c,
+                            const int16_t *iscan_ptr, int log_scale,
+                            tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                            __m256i *eob) {
+  const __m256i abs_coeff = _mm256_abs_epi32(*c);
+  __m256i q = _mm256_add_epi32(abs_coeff, qp[0]);
+
+  __m256i q_lo = _mm256_mul_epi32(q, qp[1]);
+  __m256i q_hi = _mm256_srli_epi64(q, 32);
+  const __m256i qp_hi = _mm256_srli_epi64(qp[1], 32);
+  q_hi = _mm256_mul_epi32(q_hi, qp_hi);
+  q_lo = _mm256_srli_epi64(q_lo, 16 - log_scale);
+  q_hi = _mm256_srli_epi64(q_hi, 16 - log_scale);
+  q_hi = _mm256_slli_epi64(q_hi, 32);
+  q = _mm256_or_si256(q_lo, q_hi);
+  const __m256i abs_s = _mm256_slli_epi32(abs_coeff, 1 + log_scale);
+  const __m256i mask = _mm256_cmpgt_epi32(qp[2], abs_s);
+  q = _mm256_andnot_si256(mask, q);
+
+  __m256i dq = _mm256_mullo_epi32(q, qp[2]);
+  dq = _mm256_srai_epi32(dq, log_scale);
+  q = _mm256_sign_epi32(q, *c);
+  dq = _mm256_sign_epi32(dq, *c);
+
+  _mm256_storeu_si256((__m256i *)qcoeff, q);
+  _mm256_storeu_si256((__m256i *)dqcoeff, dq);
+
+  const __m128i isc = _mm_loadu_si128((const __m128i *)iscan_ptr);
+  const __m128i zr = _mm_setzero_si128();
+  const __m128i lo = _mm_unpacklo_epi16(isc, zr);
+  const __m128i hi = _mm_unpackhi_epi16(isc, zr);
+  const __m256i iscan =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
+
+  const __m256i zero = _mm256_setzero_si256();
+  const __m256i zc = _mm256_cmpeq_epi32(dq, zero);
+  const __m256i nz = _mm256_cmpeq_epi32(zc, zero);
+  __m256i cur_eob = _mm256_sub_epi32(iscan, nz);
+  cur_eob = _mm256_and_si256(cur_eob, nz);
+  *eob = _mm256_max_epi32(cur_eob, *eob);
+}
+
+void av1_highbd_quantize_fp_avx2(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, int log_scale) {
+  (void)scan;
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  const unsigned int step = 8;
+  __m256i qp[3], coeff;
+
+  init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, qp);
+  coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+
+  __m256i eob = _mm256_setzero_si256();
+  quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+  coeff_ptr += step;
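+  // Illustrative note: per coefficient, quantize() above is the scalar
+  // recipe below (variable names are expository, not libaom API):
+  //
+  //   int32_t abs_c = abs(coeff);
+  //   int32_t q = (int32_t)(((int64_t)(abs_c + round) * quant) >>
+  //                         (16 - log_scale));
+  //   if ((abs_c << (1 + log_scale)) < dequant) q = 0;  // dead zone
+  //   int32_t dq = (q * dequant) >> log_scale;
+  //   qcoeff = coeff < 0 ? -q : q;  dqcoeff = coeff < 0 ? -dq : dq;
+  //
+  // The 64-bit product is emulated with _mm256_mul_epi32 on even/odd lanes,
+  // which is why qp[1] is shifted down by 32 and recombined above.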
+  qcoeff_ptr += step;
+  dqcoeff_ptr += step;
+  iscan += step;
+  n_coeffs -= step;
+
+  update_qp(qp);
+  while (n_coeffs > 0) {
+    coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+    quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+    coeff_ptr += step;
+    qcoeff_ptr += step;
+    dqcoeff_ptr += step;
+    iscan += step;
+    n_coeffs -= step;
+  }
+  {
+    __m256i eob_s;
+    eob_s = _mm256_shuffle_epi32(eob, 0xe);
+    eob = _mm256_max_epi16(eob, eob_s);
+    eob_s = _mm256_shufflelo_epi16(eob, 0xe);
+    eob = _mm256_max_epi16(eob, eob_s);
+    eob_s = _mm256_shufflelo_epi16(eob, 1);
+    eob = _mm256_max_epi16(eob, eob_s);
+    const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob),
+                                            _mm256_extractf128_si256(eob, 1));
+    *eob_ptr = _mm_extract_epi16(final_eob, 0);
+  }
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c
new file mode 100644
index 0000000000..40b3b460b6
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+#include <stdint.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/synonyms.h"
+
+// Coefficient quantization phase 1
+// param[0-2] : rounding/quan/dequan constants
+static INLINE void quantize_coeff_phase1(__m128i *coeff, const __m128i *param,
+                                         const int shift, const int scale,
+                                         __m128i *qcoeff, __m128i *dquan,
+                                         __m128i *sign) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi32(1);
+
+  *sign = _mm_cmplt_epi32(*coeff, zero);
+  *sign = _mm_or_si128(*sign, one);
+  *coeff = _mm_abs_epi32(*coeff);
+
+  qcoeff[0] = _mm_add_epi32(*coeff, param[0]);
+  qcoeff[1] = _mm_unpackhi_epi32(qcoeff[0], zero);
+  qcoeff[0] = _mm_unpacklo_epi32(qcoeff[0], zero);
+
+  qcoeff[0] = _mm_mul_epi32(qcoeff[0], param[1]);
+  qcoeff[0] = _mm_srli_epi64(qcoeff[0], shift);
+  dquan[0] = _mm_mul_epi32(qcoeff[0], param[2]);
+  dquan[0] = _mm_srli_epi64(dquan[0], scale);
+  const __m128i abs_s = _mm_slli_epi32(*coeff, 1 + scale);
+  qcoeff[2] = _mm_cmplt_epi32(abs_s, param[3]);
+}
+
+// Coefficient quantization phase 2
+static INLINE void quantize_coeff_phase2(__m128i *qcoeff, __m128i *dquan,
+                                         const __m128i *sign,
+                                         const __m128i *param, const int shift,
+                                         const int scale, tran_low_t *qAddr,
+                                         tran_low_t *dqAddr) {
+  __m128i mask0L = _mm_set_epi32(-1, -1, 0, 0);
+  __m128i mask0H = _mm_set_epi32(0, 0, -1, -1);
+
+  qcoeff[1] = _mm_mul_epi32(qcoeff[1], param[1]);
+  qcoeff[1] = _mm_srli_epi64(qcoeff[1], shift);
+  dquan[1] = _mm_mul_epi32(qcoeff[1], param[2]);
+  dquan[1] = _mm_srli_epi64(dquan[1], scale);
+
+  // combine L&H
+  qcoeff[0] = _mm_shuffle_epi32(qcoeff[0], 0xd8);
+  qcoeff[1] = _mm_shuffle_epi32(qcoeff[1], 0x8d);
+
+  qcoeff[0] = _mm_and_si128(qcoeff[0], mask0H);
+  qcoeff[1] = _mm_and_si128(qcoeff[1], mask0L);
+
+  dquan[0] = _mm_shuffle_epi32(dquan[0], 0xd8);
+  dquan[1] = _mm_shuffle_epi32(dquan[1], 0x8d);
+
+  dquan[0] = _mm_and_si128(dquan[0], mask0H);
+  
dquan[1] = _mm_and_si128(dquan[1], mask0L); + + qcoeff[0] = _mm_or_si128(qcoeff[0], qcoeff[1]); + dquan[0] = _mm_or_si128(dquan[0], dquan[1]); + + qcoeff[0] = _mm_sign_epi32(qcoeff[0], *sign); + dquan[0] = _mm_sign_epi32(dquan[0], *sign); + qcoeff[0] = _mm_andnot_si128(qcoeff[2], qcoeff[0]); + dquan[0] = _mm_andnot_si128(qcoeff[2], dquan[0]); + _mm_storeu_si128((__m128i *)qAddr, qcoeff[0]); + _mm_storeu_si128((__m128i *)dqAddr, dquan[0]); +} + +static INLINE void find_eob(tran_low_t *qcoeff_ptr, const int16_t *iscan, + __m128i *eob) { + const __m128i zero = _mm_setzero_si128(); + __m128i mask, iscanIdx; + const __m128i q0 = _mm_loadu_si128((__m128i const *)qcoeff_ptr); + const __m128i q1 = _mm_loadu_si128((__m128i const *)(qcoeff_ptr + 4)); + __m128i nz_flag0 = _mm_cmpeq_epi32(q0, zero); + __m128i nz_flag1 = _mm_cmpeq_epi32(q1, zero); + + nz_flag0 = _mm_cmpeq_epi32(nz_flag0, zero); + nz_flag1 = _mm_cmpeq_epi32(nz_flag1, zero); + + mask = _mm_packs_epi32(nz_flag0, nz_flag1); + iscanIdx = _mm_loadu_si128((__m128i const *)iscan); + iscanIdx = _mm_sub_epi16(iscanIdx, mask); + iscanIdx = _mm_and_si128(iscanIdx, mask); + *eob = _mm_max_epi16(*eob, iscanIdx); +} + +static INLINE uint16_t get_accumulated_eob(__m128i *eob) { + __m128i eob_shuffled; + uint16_t eobValue; + eob_shuffled = _mm_shuffle_epi32(*eob, 0xe); + *eob = _mm_max_epi16(*eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(*eob, 0xe); + *eob = _mm_max_epi16(*eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(*eob, 0x1); + *eob = _mm_max_epi16(*eob, eob_shuffled); + eobValue = _mm_extract_epi16(*eob, 0); + return eobValue; +} + +void av1_highbd_quantize_fp_sse4_1( + const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, int log_scale) { + __m128i coeff[2], qcoeff[3], dequant[2], qparam[4], coeff_sign; + __m128i eob = _mm_setzero_si128(); + const tran_low_t *src = coeff_ptr; + tran_low_t *quanAddr = qcoeff_ptr; + tran_low_t *dquanAddr = dqcoeff_ptr; + const int shift = 16 - log_scale; + const int coeff_stride = 4; + const int quan_stride = coeff_stride; + (void)zbin_ptr; + (void)quant_shift_ptr; + (void)scan; + + memset(quanAddr, 0, count * sizeof(quanAddr[0])); + memset(dquanAddr, 0, count * sizeof(dquanAddr[0])); + + coeff[0] = _mm_loadu_si128((__m128i const *)src); + const int round1 = ROUND_POWER_OF_TWO(round_ptr[1], log_scale); + const int round0 = ROUND_POWER_OF_TWO(round_ptr[0], log_scale); + + qparam[0] = _mm_set_epi32(round1, round1, round1, round0); + qparam[1] = xx_set_64_from_32i(quant_ptr[1], quant_ptr[0]); + qparam[2] = xx_set_64_from_32i(dequant_ptr[1], dequant_ptr[0]); + qparam[3] = _mm_set_epi32(dequant_ptr[1], dequant_ptr[1], dequant_ptr[1], + dequant_ptr[0]); + + // DC and first 3 AC + quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant, + &coeff_sign); + + // update round/quan/dquan for AC + qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]); + qparam[1] = xx_set1_64_from_32i(quant_ptr[1]); + qparam[2] = xx_set1_64_from_32i(dequant_ptr[1]); + qparam[3] = _mm_set1_epi32(dequant_ptr[1]); + quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale, + quanAddr, dquanAddr); + + // next 4 AC + coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride)); + quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, 
qcoeff, dequant,
+                        &coeff_sign);
+  quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale,
+                        quanAddr + quan_stride, dquanAddr + quan_stride);
+
+  find_eob(quanAddr, iscan, &eob);
+
+  count -= 8;
+
+  // loop for the rest of AC
+  while (count > 0) {
+    src += coeff_stride << 1;
+    quanAddr += quan_stride << 1;
+    dquanAddr += quan_stride << 1;
+    iscan += quan_stride << 1;
+
+    coeff[0] = _mm_loadu_si128((__m128i const *)src);
+    coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
+
+    quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant,
+                          &coeff_sign);
+    quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
+                          log_scale, quanAddr, dquanAddr);
+
+    quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant,
+                          &coeff_sign);
+    quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
+                          log_scale, quanAddr + quan_stride,
+                          dquanAddr + quan_stride);
+
+    find_eob(quanAddr, iscan, &eob);
+
+    count -= 8;
+  }
+  *eob_ptr = get_accumulated_eob(&eob);
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_k_means_avx2.c b/third_party/aom/av1/encoder/x86/av1_k_means_avx2.c
new file mode 100644
index 0000000000..52ddc66437
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_k_means_avx2.c
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>  // AVX2
+
+#include "config/av1_rtcd.h"
+#include "aom_dsp/x86/synonyms.h"
+
+static int64_t k_means_horizontal_sum_avx2(__m256i a) {
+  const __m128i low = _mm256_castsi256_si128(a);
+  const __m128i high = _mm256_extracti128_si256(a, 1);
+  const __m128i sum = _mm_add_epi64(low, high);
+  const __m128i sum_high = _mm_unpackhi_epi64(sum, sum);
+  int64_t res;
+  _mm_storel_epi64((__m128i *)&res, _mm_add_epi64(sum, sum_high));
+  return res;
+}
+
+void av1_calc_indices_dim1_avx2(const int16_t *data, const int16_t *centroids,
+                                uint8_t *indices, int64_t *total_dist, int n,
+                                int k) {
+  const __m256i v_zero = _mm256_setzero_si256();
+  __m256i sum = _mm256_setzero_si256();
+  __m256i cents[PALETTE_MAX_SIZE];
+  for (int j = 0; j < k; ++j) {
+    cents[j] = _mm256_set1_epi16(centroids[j]);
+  }
+
+  for (int i = 0; i < n; i += 16) {
+    const __m256i in = _mm256_loadu_si256((__m256i *)data);
+    __m256i ind = _mm256_setzero_si256();
+    // Compute the distance to the first centroid.
+    __m256i d1 = _mm256_sub_epi16(in, cents[0]);
+    __m256i dist_min = _mm256_abs_epi16(d1);
+
+    for (int j = 1; j < k; ++j) {
+      // Compute the distance to the centroid.
+      d1 = _mm256_sub_epi16(in, cents[j]);
+      const __m256i dist = _mm256_abs_epi16(d1);
+      // Compare to the minimal one.
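+      // Illustrative note: the andnot/and/or trio below is a branchless
+      // per-lane argmin update; in scalar form (expository names only):
+      //
+      //   if (dist < dist_min) { dist_min = dist; ind = j; }
+      //
+      // cmp flags lanes where centroid j is strictly closer, and
+      // ind = (ind & ~cmp) | (j & cmp) keeps the previous index elsewhere.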
+ const __m256i cmp = _mm256_cmpgt_epi16(dist_min, dist); + dist_min = _mm256_min_epi16(dist_min, dist); + const __m256i ind1 = _mm256_set1_epi16(j); + ind = _mm256_or_si256(_mm256_andnot_si256(cmp, ind), + _mm256_and_si256(cmp, ind1)); + } + + const __m256i p1 = _mm256_packus_epi16(ind, v_zero); + const __m256i px = _mm256_permute4x64_epi64(p1, 0x58); + const __m128i d2 = _mm256_extracti128_si256(px, 0); + + _mm_storeu_si128((__m128i *)indices, d2); + + if (total_dist) { + // Square, convert to 32 bit and add together. + dist_min = _mm256_madd_epi16(dist_min, dist_min); + // Convert to 64 bit and add to sum. + const __m256i dist1 = _mm256_unpacklo_epi32(dist_min, v_zero); + const __m256i dist2 = _mm256_unpackhi_epi32(dist_min, v_zero); + sum = _mm256_add_epi64(sum, dist1); + sum = _mm256_add_epi64(sum, dist2); + } + + indices += 16; + data += 16; + } + if (total_dist) { + *total_dist = k_means_horizontal_sum_avx2(sum); + } +} + +void av1_calc_indices_dim2_avx2(const int16_t *data, const int16_t *centroids, + uint8_t *indices, int64_t *total_dist, int n, + int k) { + const __m256i v_zero = _mm256_setzero_si256(); + const __m256i permute = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0); + __m256i sum = _mm256_setzero_si256(); + __m256i ind[2]; + __m256i cents[PALETTE_MAX_SIZE]; + for (int j = 0; j < k; ++j) { + const int16_t cx = centroids[2 * j], cy = centroids[2 * j + 1]; + cents[j] = _mm256_set_epi16(cy, cx, cy, cx, cy, cx, cy, cx, cy, cx, cy, cx, + cy, cx, cy, cx); + } + + for (int i = 0; i < n; i += 16) { + for (int l = 0; l < 2; ++l) { + const __m256i in = _mm256_loadu_si256((__m256i *)data); + ind[l] = _mm256_setzero_si256(); + // Compute the distance to the first centroid. + __m256i d1 = _mm256_sub_epi16(in, cents[0]); + __m256i dist_min = _mm256_madd_epi16(d1, d1); + + for (int j = 1; j < k; ++j) { + // Compute the distance to the centroid. + d1 = _mm256_sub_epi16(in, cents[j]); + const __m256i dist = _mm256_madd_epi16(d1, d1); + // Compare to the minimal one. + const __m256i cmp = _mm256_cmpgt_epi32(dist_min, dist); + dist_min = _mm256_min_epi32(dist_min, dist); + const __m256i ind1 = _mm256_set1_epi32(j); + ind[l] = _mm256_or_si256(_mm256_andnot_si256(cmp, ind[l]), + _mm256_and_si256(cmp, ind1)); + } + if (total_dist) { + // Convert to 64 bit and add to sum. + const __m256i dist1 = _mm256_unpacklo_epi32(dist_min, v_zero); + const __m256i dist2 = _mm256_unpackhi_epi32(dist_min, v_zero); + sum = _mm256_add_epi64(sum, dist1); + sum = _mm256_add_epi64(sum, dist2); + } + data += 16; + } + // Cast to 8 bit and store. + const __m256i d2 = _mm256_packus_epi32(ind[0], ind[1]); + const __m256i d3 = _mm256_packus_epi16(d2, v_zero); + const __m256i d4 = _mm256_permutevar8x32_epi32(d3, permute); + const __m128i d5 = _mm256_extracti128_si256(d4, 0); + _mm_storeu_si128((__m128i *)indices, d5); + indices += 16; + } + if (total_dist) { + *total_dist = k_means_horizontal_sum_avx2(sum); + } +} diff --git a/third_party/aom/av1/encoder/x86/av1_k_means_sse2.c b/third_party/aom/av1/encoder/x86/av1_k_means_sse2.c new file mode 100644 index 0000000000..6c75822350 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_k_means_sse2.c @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>  // SSE2
+
+#include "config/av1_rtcd.h"
+#include "aom_dsp/x86/synonyms.h"
+
+static int64_t k_means_horizontal_sum_sse2(__m128i a) {
+  const __m128i sum1 = _mm_unpackhi_epi64(a, a);
+  const __m128i sum2 = _mm_add_epi64(a, sum1);
+  int64_t res;
+  _mm_storel_epi64((__m128i *)&res, sum2);
+  return res;
+}
+
+void av1_calc_indices_dim1_sse2(const int16_t *data, const int16_t *centroids,
+                                uint8_t *indices, int64_t *total_dist, int n,
+                                int k) {
+  const __m128i v_zero = _mm_setzero_si128();
+  __m128i sum = _mm_setzero_si128();
+  __m128i cents[PALETTE_MAX_SIZE];
+  for (int j = 0; j < k; ++j) {
+    cents[j] = _mm_set1_epi16(centroids[j]);
+  }
+
+  for (int i = 0; i < n; i += 8) {
+    const __m128i in = _mm_loadu_si128((__m128i *)data);
+    __m128i ind = _mm_setzero_si128();
+    // Compute the distance to the first centroid.
+    __m128i d1 = _mm_sub_epi16(in, cents[0]);
+    __m128i d2 = _mm_sub_epi16(cents[0], in);
+    __m128i dist_min = _mm_max_epi16(d1, d2);
+
+    for (int j = 1; j < k; ++j) {
+      // Compute the distance to the centroid.
+      d1 = _mm_sub_epi16(in, cents[j]);
+      d2 = _mm_sub_epi16(cents[j], in);
+      const __m128i dist = _mm_max_epi16(d1, d2);
+      // Compare to the minimal one.
+      const __m128i cmp = _mm_cmpgt_epi16(dist_min, dist);
+      dist_min = _mm_min_epi16(dist_min, dist);
+      const __m128i ind1 = _mm_set1_epi16(j);
+      ind = _mm_or_si128(_mm_andnot_si128(cmp, ind), _mm_and_si128(cmp, ind1));
+    }
+    if (total_dist) {
+      // Square, convert to 32 bit and add together.
+      dist_min = _mm_madd_epi16(dist_min, dist_min);
+      // Convert to 64 bit and add to sum.
+      const __m128i dist1 = _mm_unpacklo_epi32(dist_min, v_zero);
+      const __m128i dist2 = _mm_unpackhi_epi32(dist_min, v_zero);
+      sum = _mm_add_epi64(sum, dist1);
+      sum = _mm_add_epi64(sum, dist2);
+    }
+    __m128i p2 = _mm_packus_epi16(ind, v_zero);
+    _mm_storel_epi64((__m128i *)indices, p2);
+    indices += 8;
+    data += 8;
+  }
+  if (total_dist) {
+    *total_dist = k_means_horizontal_sum_sse2(sum);
+  }
+}
+
+void av1_calc_indices_dim2_sse2(const int16_t *data, const int16_t *centroids,
+                                uint8_t *indices, int64_t *total_dist, int n,
+                                int k) {
+  const __m128i v_zero = _mm_setzero_si128();
+  __m128i sum = _mm_setzero_si128();
+  __m128i ind[2];
+  __m128i cents[PALETTE_MAX_SIZE];
+  for (int j = 0; j < k; ++j) {
+    const int16_t cx = centroids[2 * j], cy = centroids[2 * j + 1];
+    cents[j] = _mm_set_epi16(cy, cx, cy, cx, cy, cx, cy, cx);
+  }
+
+  for (int i = 0; i < n; i += 8) {
+    for (int l = 0; l < 2; ++l) {
+      const __m128i in = _mm_loadu_si128((__m128i *)data);
+      ind[l] = _mm_setzero_si128();
+      // Compute the distance to the first centroid.
+      __m128i d1 = _mm_sub_epi16(in, cents[0]);
+      __m128i dist_min = _mm_madd_epi16(d1, d1);
+
+      for (int j = 1; j < k; ++j) {
+        // Compute the distance to the centroid.
+        d1 = _mm_sub_epi16(in, cents[j]);
+        const __m128i dist = _mm_madd_epi16(d1, d1);
+        // Compare to the minimal one.
+        const __m128i cmp = _mm_cmpgt_epi32(dist_min, dist);
+        const __m128i dist1 = _mm_andnot_si128(cmp, dist_min);
+        const __m128i dist2 = _mm_and_si128(cmp, dist);
+        dist_min = _mm_or_si128(dist1, dist2);
+        const __m128i ind1 = _mm_set1_epi32(j);
+        ind[l] = _mm_or_si128(_mm_andnot_si128(cmp, ind[l]),
+                              _mm_and_si128(cmp, ind1));
+      }
+      if (total_dist) {
+        // Convert to 64 bit and add to sum.
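+        // Illustrative note: after _mm_madd_epi16, each 32-bit lane of
+        // dist_min already holds dx*dx + dy*dy for one pixel. Unpacking
+        // against zero widens those lanes to 64 bits before accumulation so
+        // the running total cannot overflow 32 bits over a large block; in
+        // scalar form: sum += (int64_t)dist_min[lane];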
+        const __m128i dist1 = _mm_unpacklo_epi32(dist_min, v_zero);
+        const __m128i dist2 = _mm_unpackhi_epi32(dist_min, v_zero);
+        sum = _mm_add_epi64(sum, dist1);
+        sum = _mm_add_epi64(sum, dist2);
+      }
+      data += 8;
+    }
+    // Cast to 8 bit and store.
+    const __m128i d2 = _mm_packus_epi16(ind[0], ind[1]);
+    const __m128i d3 = _mm_packus_epi16(d2, v_zero);
+    _mm_storel_epi64((__m128i *)indices, d3);
+    indices += 8;
+  }
+  if (total_dist) {
+    *total_dist = k_means_horizontal_sum_sse2(sum);
+  }
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c b/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c
new file mode 100644
index 0000000000..75c5172f85
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c
@@ -0,0 +1,414 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+static INLINE void write_zero(tran_low_t *qcoeff) {
+  const __m256i zero = _mm256_setzero_si256();
+  _mm256_storeu_si256((__m256i *)qcoeff, zero);
+  _mm256_storeu_si256((__m256i *)qcoeff + 1, zero);
+}
+
+static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
+  const __m128i ac = _mm_unpackhi_epi64(*p, *p);
+  *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(*p), ac, 1);
+}
+
+static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr,
+                           const int16_t *dequant_ptr, int log_scale,
+                           __m256i *thr, __m256i *qp) {
+  __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
+  const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
+  const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
+
+  if (log_scale > 0) {
+    const __m128i rnd = _mm_set1_epi16((int16_t)1 << (log_scale - 1));
+    round = _mm_add_epi16(round, rnd);
+    round = _mm_srai_epi16(round, log_scale);
+  }
+
+  init_one_qp(&round, &qp[0]);
+  init_one_qp(&quant, &qp[1]);
+
+  if (log_scale == 1) {
+    qp[1] = _mm256_slli_epi16(qp[1], log_scale);
+  }
+
+  init_one_qp(&dequant, &qp[2]);
+  *thr = _mm256_srai_epi16(qp[2], 1 + log_scale);
+  // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when
+  // calculating the zbin mask.
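+  // Illustrative note: with the subtraction below,
+  //   thr = (dequant >> (1 + log_scale)) - 1,
+  // so the later  abs_coeff > thr  comparison is the scalar dead-zone check
+  //   abs_coeff >= dequant >> (1 + log_scale)
+  // done without a separate equality comparison.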
+ *thr = _mm256_sub_epi16(*thr, _mm256_set1_epi16(1)); +} + +static INLINE void update_qp(__m256i *thr, __m256i *qp) { + qp[0] = _mm256_permute2x128_si256(qp[0], qp[0], 0x11); + qp[1] = _mm256_permute2x128_si256(qp[1], qp[1], 0x11); + qp[2] = _mm256_permute2x128_si256(qp[2], qp[2], 0x11); + *thr = _mm256_permute2x128_si256(*thr, *thr, 0x11); +} + +static INLINE __m256i load_coefficients_avx2(const tran_low_t *coeff_ptr) { + const __m256i coeff1 = _mm256_load_si256((__m256i *)coeff_ptr); + const __m256i coeff2 = _mm256_load_si256((__m256i *)(coeff_ptr + 8)); + return _mm256_packs_epi32(coeff1, coeff2); +} + +static INLINE void store_coefficients_avx2(__m256i coeff_vals, + tran_low_t *coeff_ptr) { + __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15); + __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign); + __m256i coeff_vals_hi = _mm256_unpackhi_epi16(coeff_vals, coeff_sign); + _mm256_store_si256((__m256i *)coeff_ptr, coeff_vals_lo); + _mm256_store_si256((__m256i *)(coeff_ptr + 8), coeff_vals_hi); +} + +static INLINE uint16_t quant_gather_eob(__m256i eob) { + const __m128i eob_lo = _mm256_castsi256_si128(eob); + const __m128i eob_hi = _mm256_extractf128_si256(eob, 1); + __m128i eob_s = _mm_max_epi16(eob_lo, eob_hi); + eob_s = _mm_subs_epu16(_mm_set1_epi16(INT16_MAX), eob_s); + eob_s = _mm_minpos_epu16(eob_s); + return INT16_MAX - _mm_extract_epi16(eob_s, 0); +} + +static INLINE int16_t accumulate_eob256(__m256i eob256) { + const __m128i eob_lo = _mm256_castsi256_si128(eob256); + const __m128i eob_hi = _mm256_extractf128_si256(eob256, 1); + __m128i eob = _mm_max_epi16(eob_lo, eob_hi); + __m128i eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + return _mm_extract_epi16(eob, 1); +} + +static AOM_FORCE_INLINE void quantize_lp_16_first( + const int16_t *coeff_ptr, const int16_t *iscan_ptr, int16_t *qcoeff_ptr, + int16_t *dqcoeff_ptr, __m256i *round256, __m256i *quant256, + __m256i *dequant256, __m256i *eob) { + const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi16(coeff); + const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round256); + const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant256); + const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff); + const __m256i dqcoeff = _mm256_mullo_epi16(qcoeff, *dequant256); + const __m256i nz_mask = + _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256()); + + _mm256_storeu_si256((__m256i *)qcoeff_ptr, qcoeff); + _mm256_storeu_si256((__m256i *)dqcoeff_ptr, dqcoeff); + + const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr); + const __m256i iscan_plus1 = _mm256_sub_epi16(iscan, nz_mask); + const __m256i nz_iscan = _mm256_and_si256(iscan_plus1, nz_mask); + *eob = _mm256_max_epi16(*eob, nz_iscan); +} + +static AOM_FORCE_INLINE void quantize_lp_16( + const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *iscan_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, __m256i *round256, + __m256i *quant256, __m256i *dequant256, __m256i *eob) { + const __m256i coeff = + _mm256_loadu_si256((const __m256i *)(coeff_ptr + n_coeffs)); + const __m256i abs_coeff = _mm256_abs_epi16(coeff); + const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round256); + const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant256); + const __m256i 
qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff); + const __m256i dqcoeff = _mm256_mullo_epi16(qcoeff, *dequant256); + const __m256i nz_mask = + _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256()); + + _mm256_storeu_si256((__m256i *)(qcoeff_ptr + n_coeffs), qcoeff); + _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + n_coeffs), dqcoeff); + + const __m256i iscan = + _mm256_loadu_si256((const __m256i *)(iscan_ptr + n_coeffs)); + const __m256i iscan_plus1 = _mm256_sub_epi16(iscan, nz_mask); + const __m256i nz_iscan = _mm256_and_si256(iscan_plus1, nz_mask); + *eob = _mm256_max_epi16(*eob, nz_iscan); +} + +void av1_quantize_lp_avx2(const int16_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *round_ptr, const int16_t *quant_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + (void)scan; + __m256i eob256 = _mm256_setzero_si256(); + + // Setup global values. + __m256i round256 = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr)); + __m256i quant256 = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr)); + __m256i dequant256 = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr)); + + // Populate upper AC values. + round256 = _mm256_permute4x64_epi64(round256, 0x54); + quant256 = _mm256_permute4x64_epi64(quant256, 0x54); + dequant256 = _mm256_permute4x64_epi64(dequant256, 0x54); + + // Process DC and the first 15 AC coeffs. + quantize_lp_16_first(coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &round256, + &quant256, &dequant256, &eob256); + + if (n_coeffs > 16) { + // Overwrite the DC constants with AC constants + dequant256 = _mm256_permute2x128_si256(dequant256, dequant256, 0x31); + quant256 = _mm256_permute2x128_si256(quant256, quant256, 0x31); + round256 = _mm256_permute2x128_si256(round256, round256, 0x31); + + // AC only loop. 
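+    // Illustrative note: each quantize_lp_16() step is, per coefficient
+    // (expository scalar form, not libaom API):
+    //
+    //   int16_t q = (int16_t)(((abs(c) + round) * quant) >> 16);
+    //   qcoeff = copysign(q, c);  dqcoeff = qcoeff * dequant;
+    //
+    // and eob tracks the largest iscan index whose q is nonzero.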
+ for (int idx = 16; idx < n_coeffs; idx += 16) { + quantize_lp_16(coeff_ptr, idx, iscan, qcoeff_ptr, dqcoeff_ptr, &round256, + &quant256, &dequant256, &eob256); + } + } + + *eob_ptr = accumulate_eob256(eob256); +} + +static AOM_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan, + __m256i v_eobmax, + __m256i v_mask) { + const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan); + const __m256i v_iscan_perm = _mm256_permute4x64_epi64(v_iscan, 0xD8); + const __m256i v_iscan_plus1 = _mm256_sub_epi16(v_iscan_perm, v_mask); + const __m256i v_nz_iscan = _mm256_and_si256(v_iscan_plus1, v_mask); + return _mm256_max_epi16(v_eobmax, v_nz_iscan); +} + +static AOM_FORCE_INLINE void quantize_fp_16( + const __m256i *thr, const __m256i *qp, const tran_low_t *coeff_ptr, + const int16_t *iscan_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + __m256i *eob) { + const __m256i coeff = load_coefficients_avx2(coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi16(coeff); + const __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr); + const int nzflag = _mm256_movemask_epi8(mask); + + if (nzflag) { + const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, qp[0]); + const __m256i abs_q = _mm256_mulhi_epi16(tmp_rnd, qp[1]); + const __m256i q = _mm256_sign_epi16(abs_q, coeff); + const __m256i dq = _mm256_mullo_epi16(q, qp[2]); + const __m256i nz_mask = _mm256_cmpgt_epi16(abs_q, _mm256_setzero_si256()); + + store_coefficients_avx2(q, qcoeff_ptr); + store_coefficients_avx2(dq, dqcoeff_ptr); + + *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask); + } else { + write_zero(qcoeff_ptr); + write_zero(dqcoeff_ptr); + } +} + +void av1_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + (void)scan_ptr; + (void)zbin_ptr; + (void)quant_shift_ptr; + + const int log_scale = 0; + const int step = 16; + __m256i qp[3], thr; + __m256i eob = _mm256_setzero_si256(); + + init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp); + + quantize_fp_16(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan_ptr += step; + n_coeffs -= step; + + update_qp(&thr, qp); + + while (n_coeffs > 0) { + quantize_fp_16(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, + &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan_ptr += step; + n_coeffs -= step; + } + *eob_ptr = quant_gather_eob(eob); +} + +static AOM_FORCE_INLINE void quantize_fp_32x32( + const __m256i *thr, const __m256i *qp, const tran_low_t *coeff_ptr, + const int16_t *iscan_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + __m256i *eob) { + const __m256i coeff = load_coefficients_avx2(coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi16(coeff); + const __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr); + const int nzflag = _mm256_movemask_epi8(mask); + + if (nzflag) { + const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, qp[0]); + const __m256i abs_q = _mm256_mulhi_epu16(tmp_rnd, qp[1]); + const __m256i q = _mm256_sign_epi16(abs_q, coeff); + const __m256i abs_dq = + _mm256_srli_epi16(_mm256_mullo_epi16(abs_q, qp[2]), 1); + const __m256i nz_mask = _mm256_cmpgt_epi16(abs_q, _mm256_setzero_si256()); + const __m256i dq = _mm256_sign_epi16(abs_dq, coeff); + + 
store_coefficients_avx2(q, qcoeff_ptr); + store_coefficients_avx2(dq, dqcoeff_ptr); + + *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask); + } else { + write_zero(qcoeff_ptr); + write_zero(dqcoeff_ptr); + } +} + +void av1_quantize_fp_32x32_avx2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + (void)scan_ptr; + (void)zbin_ptr; + (void)quant_shift_ptr; + + const int log_scale = 1; + const unsigned int step = 16; + __m256i qp[3], thr; + __m256i eob = _mm256_setzero_si256(); + + init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp); + + quantize_fp_32x32(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, + &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan_ptr += step; + n_coeffs -= step; + + update_qp(&thr, qp); + + while (n_coeffs > 0) { + quantize_fp_32x32(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, + &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan_ptr += step; + n_coeffs -= step; + } + *eob_ptr = quant_gather_eob(eob); +} + +static INLINE void quantize_fp_64x64(const __m256i *thr, const __m256i *qp, + const tran_low_t *coeff_ptr, + const int16_t *iscan_ptr, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, __m256i *eob) { + const __m256i coeff = load_coefficients_avx2(coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi16(coeff); + const __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr); + const int nzflag = _mm256_movemask_epi8(mask); + + if (nzflag) { + const __m256i tmp_rnd = + _mm256_and_si256(_mm256_adds_epi16(abs_coeff, qp[0]), mask); + const __m256i qh = _mm256_slli_epi16(_mm256_mulhi_epi16(tmp_rnd, qp[1]), 2); + const __m256i ql = + _mm256_srli_epi16(_mm256_mullo_epi16(tmp_rnd, qp[1]), 14); + const __m256i abs_q = _mm256_or_si256(qh, ql); + const __m256i dqh = _mm256_slli_epi16(_mm256_mulhi_epi16(abs_q, qp[2]), 14); + const __m256i dql = _mm256_srli_epi16(_mm256_mullo_epi16(abs_q, qp[2]), 2); + const __m256i abs_dq = _mm256_or_si256(dqh, dql); + const __m256i q = _mm256_sign_epi16(abs_q, coeff); + const __m256i dq = _mm256_sign_epi16(abs_dq, coeff); + // Check the signed q/dq value here instead of the absolute value. When + // dequant equals 4, the dequant threshold (*thr) becomes 0 after being + // scaled down by (1 + log_scale). See init_qp(). When *thr is 0 and the + // abs_coeff is 0, the nzflag will be set. As a result, the eob will be + // incorrectly calculated. The psign instruction corrects the error by + // zeroing out q/dq if coeff is zero. 
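+    // Illustrative note: with log_scale == 2 the 16-bit mulhi/mullo halves
+    // above are spliced back together; in scalar form:
+    //
+    //   abs_q  = (tmp_rnd * quant) >> 14;   // (hi << 2)  | (lo >> 14)
+    //   abs_dq = (abs_q * dequant) >> 2;    // (hi << 14) | (lo >> 2)
+    //
+    // which keeps the 64x64 quantizer entirely in 16-bit vector arithmetic.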
+ const __m256i z_mask = _mm256_cmpeq_epi16(dq, _mm256_setzero_si256()); + const __m256i nz_mask = _mm256_cmpeq_epi16(z_mask, _mm256_setzero_si256()); + + store_coefficients_avx2(q, qcoeff_ptr); + store_coefficients_avx2(dq, dqcoeff_ptr); + + *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask); + } else { + write_zero(qcoeff_ptr); + write_zero(dqcoeff_ptr); + } +} + +void av1_quantize_fp_64x64_avx2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + (void)scan_ptr; + (void)zbin_ptr; + (void)quant_shift_ptr; + + const int log_scale = 2; + const unsigned int step = 16; + __m256i qp[3], thr; + __m256i eob = _mm256_setzero_si256(); + + init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp); + + quantize_fp_64x64(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, + &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan_ptr += step; + n_coeffs -= step; + + update_qp(&thr, qp); + + while (n_coeffs > 0) { + quantize_fp_64x64(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, + &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan_ptr += step; + n_coeffs -= step; + } + *eob_ptr = quant_gather_eob(eob); +} diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c b/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c new file mode 100644 index 0000000000..b533894015 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c @@ -0,0 +1,289 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset,
+                              __m128i *c0, __m128i *c1) {
+  const tran_low_t *addr = coeff + offset;
+  if (sizeof(tran_low_t) == 4) {
+    const __m128i x0 = _mm_load_si128((const __m128i *)addr);
+    const __m128i x1 = _mm_load_si128((const __m128i *)addr + 1);
+    const __m128i x2 = _mm_load_si128((const __m128i *)addr + 2);
+    const __m128i x3 = _mm_load_si128((const __m128i *)addr + 3);
+    *c0 = _mm_packs_epi32(x0, x1);
+    *c1 = _mm_packs_epi32(x2, x3);
+  } else {
+    *c0 = _mm_load_si128((const __m128i *)addr);
+    *c1 = _mm_load_si128((const __m128i *)addr + 1);
+  }
+}
+
+static INLINE void write_qcoeff(const __m128i *qc0, const __m128i *qc1,
+                                tran_low_t *qcoeff, intptr_t offset) {
+  tran_low_t *addr = qcoeff + offset;
+  if (sizeof(tran_low_t) == 4) {
+    const __m128i zero = _mm_setzero_si128();
+    __m128i sign_bits = _mm_cmplt_epi16(*qc0, zero);
+    __m128i y0 = _mm_unpacklo_epi16(*qc0, sign_bits);
+    __m128i y1 = _mm_unpackhi_epi16(*qc0, sign_bits);
+    _mm_store_si128((__m128i *)addr, y0);
+    _mm_store_si128((__m128i *)addr + 1, y1);
+
+    sign_bits = _mm_cmplt_epi16(*qc1, zero);
+    y0 = _mm_unpacklo_epi16(*qc1, sign_bits);
+    y1 = _mm_unpackhi_epi16(*qc1, sign_bits);
+    _mm_store_si128((__m128i *)addr + 2, y0);
+    _mm_store_si128((__m128i *)addr + 3, y1);
+  } else {
+    _mm_store_si128((__m128i *)addr, *qc0);
+    _mm_store_si128((__m128i *)addr + 1, *qc1);
+  }
+}
+
+static INLINE void write_zero(tran_low_t *qcoeff, intptr_t offset) {
+  const __m128i zero = _mm_setzero_si128();
+  tran_low_t *addr = qcoeff + offset;
+  if (sizeof(tran_low_t) == 4) {
+    _mm_store_si128((__m128i *)addr, zero);
+    _mm_store_si128((__m128i *)addr + 1, zero);
+    _mm_store_si128((__m128i *)addr + 2, zero);
+    _mm_store_si128((__m128i *)addr + 3, zero);
+  } else {
+    _mm_store_si128((__m128i *)addr, zero);
+    _mm_store_si128((__m128i *)addr + 1, zero);
+  }
+}
+
+static INLINE void quantize(const int16_t *iscan_ptr,
+                            const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const __m128i *round0, const __m128i *round1,
+                            const __m128i *quant0, const __m128i *quant1,
+                            const __m128i *dequant0, const __m128i *dequant1,
+                            const __m128i *thr0, const __m128i *thr1,
+                            __m128i *eob) {
+  __m128i coeff0, coeff1;
+  // Do DC and first 15 AC
+  read_coeff(coeff_ptr, n_coeffs, &coeff0, &coeff1);
+
+  // Poor man's sign extract
+  const __m128i coeff0_sign = _mm_srai_epi16(coeff0, 15);
+  const __m128i coeff1_sign = _mm_srai_epi16(coeff1, 15);
+  __m128i qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+  __m128i qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+  qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+  qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+  const __m128i mask0 = _mm_or_si128(_mm_cmpgt_epi16(qcoeff0, *thr0),
+                                     _mm_cmpeq_epi16(qcoeff0, *thr0));
+  const __m128i mask1 = _mm_or_si128(_mm_cmpgt_epi16(qcoeff1, *thr1),
+                                     _mm_cmpeq_epi16(qcoeff1, *thr1));
+  const int nzflag = _mm_movemask_epi8(mask0) | _mm_movemask_epi8(mask1);
+
+  if (nzflag) {
+    qcoeff0 = _mm_adds_epi16(qcoeff0, *round0);
+    qcoeff1 = _mm_adds_epi16(qcoeff1, *round1);
+    const __m128i qtmp0 = _mm_mulhi_epi16(qcoeff0, *quant0);
+    const __m128i qtmp1 = _mm_mulhi_epi16(qcoeff1, *quant1);
+
+    // Reinsert signs
+    qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+    qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+    qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+    qcoeff1 =
_mm_sub_epi16(qcoeff1, coeff1_sign); + + write_qcoeff(&qcoeff0, &qcoeff1, qcoeff_ptr, n_coeffs); + + coeff0 = _mm_mullo_epi16(qcoeff0, *dequant0); + coeff1 = _mm_mullo_epi16(qcoeff1, *dequant1); + + write_qcoeff(&coeff0, &coeff1, dqcoeff_ptr, n_coeffs); + + const __m128i zero = _mm_setzero_si128(); + // Scan for eob + const __m128i zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); + const __m128i zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); + const __m128i nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); + const __m128i nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); + const __m128i iscan0 = + _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + const __m128i iscan1 = + _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + // Add one to convert from indices to counts + const __m128i iscan0_nz = _mm_sub_epi16(iscan0, nzero_coeff0); + const __m128i iscan1_nz = _mm_sub_epi16(iscan1, nzero_coeff1); + const __m128i eob0 = _mm_and_si128(iscan0_nz, nzero_coeff0); + const __m128i eob1 = _mm_and_si128(iscan1_nz, nzero_coeff1); + const __m128i eob2 = _mm_max_epi16(eob0, eob1); + *eob = _mm_max_epi16(*eob, eob2); + } else { + write_zero(qcoeff_ptr, n_coeffs); + write_zero(dqcoeff_ptr, n_coeffs); + } +} + +void av1_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + (void)scan_ptr; + (void)zbin_ptr; + (void)quant_shift_ptr; + + coeff_ptr += n_coeffs; + iscan_ptr += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + + const __m128i round0 = _mm_load_si128((const __m128i *)round_ptr); + const __m128i round1 = _mm_unpackhi_epi64(round0, round0); + const __m128i quant0 = _mm_load_si128((const __m128i *)quant_ptr); + const __m128i quant1 = _mm_unpackhi_epi64(quant0, quant0); + const __m128i dequant0 = _mm_load_si128((const __m128i *)dequant_ptr); + const __m128i dequant1 = _mm_unpackhi_epi64(dequant0, dequant0); + const __m128i thr0 = _mm_srai_epi16(dequant0, 1); + const __m128i thr1 = _mm_srai_epi16(dequant1, 1); + __m128i eob = _mm_setzero_si128(); + + quantize(iscan_ptr, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round0, + &round1, &quant0, &quant1, &dequant0, &dequant1, &thr0, &thr1, &eob); + + n_coeffs += 8 * 2; + + // AC only loop + while (n_coeffs < 0) { + quantize(iscan_ptr, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round1, + &round1, &quant1, &quant1, &dequant1, &dequant1, &thr1, &thr1, + &eob); + n_coeffs += 8 * 2; + } + + // Accumulate EOB + { + __m128i eob_shuffled; + eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + *eob_ptr = _mm_extract_epi16(eob, 1); + } +} + +static INLINE void quantize_lp(const int16_t *iscan_ptr, + const int16_t *coeff_ptr, intptr_t n_coeffs, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const __m128i *round0, const __m128i *round1, + const __m128i *quant0, const __m128i *quant1, + const __m128i *dequant0, const __m128i *dequant1, + __m128i *eob) { + const int16_t *read = coeff_ptr + n_coeffs; + __m128i coeff0 = _mm_load_si128((const __m128i *)read); + __m128i coeff1 = _mm_load_si128((const __m128i *)read + 1); + + 
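+  // Illustrative note: the "poor man's sign extract" below emulates abs()
+  // and sign restoration without SSSE3 (_mm_abs_epi16 / _mm_sign_epi16); in
+  // scalar form (expository):
+  //
+  //   int16_t s = c >> 15;          // 0 or -1
+  //   int16_t abs_c = (c ^ s) - s;  // |c|
+  //   int16_t q = quantize(abs_c);
+  //   q = (q ^ s) - s;              // restore the sign of c
+  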
// Poor man's sign extract + const __m128i coeff0_sign = _mm_srai_epi16(coeff0, 15); + const __m128i coeff1_sign = _mm_srai_epi16(coeff1, 15); + __m128i qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); + __m128i qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_adds_epi16(qcoeff0, *round0); + qcoeff1 = _mm_adds_epi16(qcoeff1, *round1); + const __m128i qtmp0 = _mm_mulhi_epi16(qcoeff0, *quant0); + const __m128i qtmp1 = _mm_mulhi_epi16(qcoeff1, *quant1); + + // Reinsert signs + qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); + qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + int16_t *addr = qcoeff_ptr + n_coeffs; + _mm_store_si128((__m128i *)addr, qcoeff0); + _mm_store_si128((__m128i *)addr + 1, qcoeff1); + + coeff0 = _mm_mullo_epi16(qcoeff0, *dequant0); + coeff1 = _mm_mullo_epi16(qcoeff1, *dequant1); + + addr = dqcoeff_ptr + n_coeffs; + _mm_store_si128((__m128i *)addr, coeff0); + _mm_store_si128((__m128i *)addr + 1, coeff1); + + const __m128i zero = _mm_setzero_si128(); + // Scan for eob + const __m128i zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); + const __m128i zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); + const __m128i nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); + const __m128i nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); + + const __m128i iscan0 = + _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + const __m128i iscan1 = + _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + + // Add one to convert from indices to counts + const __m128i iscan0_nz = _mm_sub_epi16(iscan0, nzero_coeff0); + const __m128i iscan1_nz = _mm_sub_epi16(iscan1, nzero_coeff1); + const __m128i eob0 = _mm_and_si128(iscan0_nz, nzero_coeff0); + const __m128i eob1 = _mm_and_si128(iscan1_nz, nzero_coeff1); + const __m128i eob2 = _mm_max_epi16(eob0, eob1); + *eob = _mm_max_epi16(*eob, eob2); +} + +void av1_quantize_lp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *round_ptr, const int16_t *quant_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + (void)scan; + coeff_ptr += n_coeffs; + iscan += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + + // Setup global values + const __m128i round0 = _mm_load_si128((const __m128i *)round_ptr); + const __m128i round1 = _mm_unpackhi_epi64(round0, round0); + const __m128i quant0 = _mm_load_si128((const __m128i *)quant_ptr); + const __m128i quant1 = _mm_unpackhi_epi64(quant0, quant0); + const __m128i dequant0 = _mm_load_si128((const __m128i *)dequant_ptr); + const __m128i dequant1 = _mm_unpackhi_epi64(dequant0, dequant0); + __m128i eob = _mm_setzero_si128(); + + // DC and first 15 AC + quantize_lp(iscan, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round0, + &round1, &quant0, &quant1, &dequant0, &dequant1, &eob); + n_coeffs += 8 * 2; + + // AC only loop + while (n_coeffs < 0) { + quantize_lp(iscan, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round1, + &round1, &quant1, &quant1, &dequant1, &dequant1, &eob); + n_coeffs += 8 * 2; + } + + // Accumulate EOB + *eob_ptr = accumulate_eob(eob); +} diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm b/third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm new file mode 100644 index 0000000000..ad4ae274e2 --- /dev/null +++ 
b/third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm @@ -0,0 +1,204 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%define private_prefix av1 + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_1: times 8 dw 1 + +SECTION .text + +%macro QUANTIZE_FP 2 +cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ + shift, qcoeff, dqcoeff, dequant, \ + eob, scan, iscan + cmp dword skipm, 0 + jne .blank + + ; actual quantize loop - setup pointers, rounders, etc. + movifnidn coeffq, coeffmp + movifnidn ncoeffq, ncoeffmp + mov r2, dequantmp + movifnidn zbinq, zbinmp + movifnidn roundq, roundmp + movifnidn quantq, quantmp + mova m1, [roundq] ; m1 = round + mova m2, [quantq] ; m2 = quant +%ifidn %1, fp_32x32 + pcmpeqw m5, m5 + psrlw m5, 15 + paddw m1, m5 + psrlw m1, 1 ; m1 = (m1 + 1) / 2 +%endif + mova m3, [r2q] ; m3 = dequant + mov r3, qcoeffmp + mov r4, dqcoeffmp + mov r5, iscanmp +%ifidn %1, fp_32x32 + psllw m2, 1 +%endif + pxor m5, m5 ; m5 = dedicated zero + + lea coeffq, [ coeffq+ncoeffq*2] + lea r5q, [ r5q+ncoeffq*2] + lea r3q, [ r3q+ncoeffq*2] + lea r4q, [r4q+ncoeffq*2] + neg ncoeffq + + ; get DC and first 15 AC coeffs + mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] + mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) + pcmpeqw m7, m7 + + paddsw m6, m1 ; m6 += round + punpckhqdq m1, m1 + paddsw m11, m1 ; m11 += round + pmulhw m8, m6, m2 ; m8 = m6*q>>16 + punpckhqdq m2, m2 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + psignw m8, m9 ; m8 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + mova [r3q+ncoeffq*2+ 0], m8 + mova [r3q+ncoeffq*2+16], m13 +%ifidn %1, fp_32x32 + pabsw m8, m8 + pabsw m13, m13 +%endif + pmullw m8, m3 ; r4[i] = r3[i] * q + punpckhqdq m3, m3 + pmullw m13, m3 ; r4[i] = r3[i] * q +%ifidn %1, fp_32x32 + psrlw m8, 1 + psrlw m13, 1 + psignw m8, m9 + psignw m13, m10 + psrlw m0, m3, 2 +%else + psrlw m0, m3, 1 +%endif + mova [r4q+ncoeffq*2+ 0], m8 + mova [r4q+ncoeffq*2+16], m13 + pcmpeqw m8, m5 ; m8 = c[i] == 0 + pcmpeqw m13, m5 ; m13 = c[i] == 0 + mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] + mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i] + psubw m6, m7 ; m6 = scan[i] + 1 + psubw m11, m7 ; m11 = scan[i] + 1 + pandn m8, m6 ; m8 = max(eob) + pandn m13, m11 ; m13 = max(eob) + pmaxsw m8, m13 + add ncoeffq, mmsize + jz .accumulate_eob + +.ac_only_loop: + mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] + mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) + + pcmpgtw m7, m6, m0 + pcmpgtw m12, m11, m0 + pmovmskb r6d, m7 + pmovmskb r2d, m12 + + or r6, r2 + jz .skip_iter + + pcmpeqw m7, m7 + + paddsw m6, m1 ; m6 += round + paddsw m11, m1 ; m11 += round + pmulhw m14, m6, m2 ; m14 = m6*q>>16 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + psignw m14, m9 ; m14 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + mova [r3q+ncoeffq*2+ 0], m14 + mova [r3q+ncoeffq*2+16], m13 +%ifidn %1, fp_32x32 + pabsw m14, m14 + pabsw m13, m13 +%endif + pmullw m14, m3 ; r4[i] = 
r3[i] * q + pmullw m13, m3 ; r4[i] = r3[i] * q +%ifidn %1, fp_32x32 + psrlw m14, 1 + psrlw m13, 1 + psignw m14, m9 + psignw m13, m10 +%endif + mova [r4q+ncoeffq*2+ 0], m14 + mova [r4q+ncoeffq*2+16], m13 + pcmpeqw m14, m5 ; m14 = c[i] == 0 + pcmpeqw m13, m5 ; m13 = c[i] == 0 + mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] + mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i] + psubw m6, m7 ; m6 = scan[i] + 1 + psubw m11, m7 ; m11 = scan[i] + 1 + pandn m14, m6 ; m14 = max(eob) + pandn m13, m11 ; m13 = max(eob) + pmaxsw m8, m14 + pmaxsw m8, m13 + add ncoeffq, mmsize + jl .ac_only_loop + + jmp .accumulate_eob +.skip_iter: + mova [r3q+ncoeffq*2+ 0], m5 + mova [r3q+ncoeffq*2+16], m5 + mova [r4q+ncoeffq*2+ 0], m5 + mova [r4q+ncoeffq*2+16], m5 + add ncoeffq, mmsize + jl .ac_only_loop + +.accumulate_eob: + ; horizontally accumulate/max eobs and write into [eob] memory pointer + mov r2, eobmp + pshufd m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0x1 + pmaxsw m8, m7 + pextrw r6, m8, 0 + mov [r2], r6 + RET + + ; skip-block, i.e. just write all zeroes +.blank: + mov r0, dqcoeffmp + movifnidn ncoeffq, ncoeffmp + mov r2, qcoeffmp + mov r3, eobmp + + lea r0q, [r0q+ncoeffq*2] + lea r2q, [r2q+ncoeffq*2] + neg ncoeffq + pxor m7, m7 +.blank_loop: + mova [r0q+ncoeffq*2+ 0], m7 + mova [r0q+ncoeffq*2+16], m7 + mova [r2q+ncoeffq*2+ 0], m7 + mova [r2q+ncoeffq*2+16], m7 + add ncoeffq, mmsize + jl .blank_loop + mov word [r3q], 0 + RET +%endmacro + +INIT_XMM ssse3 +QUANTIZE_FP fp, 7 +QUANTIZE_FP fp_32x32, 7 diff --git a/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm b/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm new file mode 100644 index 0000000000..618758105a --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm @@ -0,0 +1,222 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+;
+
+;
+
+%include "aom_ports/x86_abi_support.asm"
+
+; tabulate_ssim - sums sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr
+%macro TABULATE_SSIM 0
+    paddusw     xmm15, xmm3  ; sum_s
+    paddusw     xmm14, xmm4  ; sum_r
+    movdqa      xmm1, xmm3
+    pmaddwd     xmm1, xmm1
+    paddd       xmm13, xmm1  ; sum_sq_s
+    movdqa      xmm2, xmm4
+    pmaddwd     xmm2, xmm2
+    paddd       xmm12, xmm2  ; sum_sq_r
+    pmaddwd     xmm3, xmm4
+    paddd       xmm11, xmm3  ; sum_sxr
+%endmacro
+
+; Sum across the register %1 starting with q words
+%macro SUM_ACROSS_Q 1
+    movdqa      xmm2, %1
+    punpckldq   %1, xmm0
+    punpckhdq   xmm2, xmm0
+    paddq       %1, xmm2
+    movdqa      xmm2, %1
+    punpcklqdq  %1, xmm0
+    punpckhqdq  xmm2, xmm0
+    paddq       %1, xmm2
+%endmacro
+
+; Sum across the register %1 starting with w words
+%macro SUM_ACROSS_W 1
+    movdqa      xmm1, %1
+    punpcklwd   %1, xmm0
+    punpckhwd   xmm1, xmm0
+    paddd       %1, xmm1
+    SUM_ACROSS_Q %1
+%endmacro
+
+SECTION .text
+
+;void av1_ssim_parms_16x16_sse2(
+;    unsigned char *s,
+;    int sp,
+;    unsigned char *r,
+;    int rp,
+;    unsigned long *sum_s,
+;    unsigned long *sum_r,
+;    unsigned long *sum_sq_s,
+;    unsigned long *sum_sq_r,
+;    unsigned long *sum_sxr);
+;
+; TODO: Use parm passing through structure, probably don't need the pxors
+; ( calling app will initialize to 0 ) could easily fit everything in sse2
+; without too much hassle, and can probably do better estimates with psadw
+; or pavgb. At this point this is just meant to be a first pass for
+; calculating all the parms needed for 16x16 ssim so we can play with dssim
+; as distortion in mode selection code.
+globalsym(av1_ssim_parms_16x16_sse2)
+sym(av1_ssim_parms_16x16_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    SAVE_XMM 15
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov         rsi, arg(0)  ;s
+    mov         rcx, arg(1)  ;sp
+    mov         rdi, arg(2)  ;r
+    mov         rax, arg(3)  ;rp
+
+    pxor        xmm0, xmm0
+    pxor        xmm15, xmm15  ;sum_s
+    pxor        xmm14, xmm14  ;sum_r
+    pxor        xmm13, xmm13  ;sum_sq_s
+    pxor        xmm12, xmm12  ;sum_sq_r
+    pxor        xmm11, xmm11  ;sum_sxr
+
+    mov         rdx, 16  ;row counter
+.NextRow:
+
+    ;grab source and reference pixels
+    movdqu      xmm5, [rsi]
+    movdqu      xmm6, [rdi]
+    movdqa      xmm3, xmm5
+    movdqa      xmm4, xmm6
+    punpckhbw   xmm3, xmm0  ; high_s
+    punpckhbw   xmm4, xmm0  ; high_r
+
+    TABULATE_SSIM
+
+    movdqa      xmm3, xmm5
+    movdqa      xmm4, xmm6
+    punpcklbw   xmm3, xmm0  ; low_s
+    punpcklbw   xmm4, xmm0  ; low_r
+
+    TABULATE_SSIM
+
+    add         rsi, rcx  ; next s row
+    add         rdi, rax  ; next r row
+
+    dec         rdx  ; counter
+    jnz .NextRow
+
+    SUM_ACROSS_W xmm15
+    SUM_ACROSS_W xmm14
+    SUM_ACROSS_Q xmm13
+    SUM_ACROSS_Q xmm12
+    SUM_ACROSS_Q xmm11
+
+    mov         rdi, arg(4)
+    movd        [rdi], xmm15
+    mov         rdi, arg(5)
+    movd        [rdi], xmm14
+    mov         rdi, arg(6)
+    movd        [rdi], xmm13
+    mov         rdi, arg(7)
+    movd        [rdi], xmm12
+    mov         rdi, arg(8)
+    movd        [rdi], xmm11
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void av1_ssim_parms_8x8_sse2(
+;    unsigned char *s,
+;    int sp,
+;    unsigned char *r,
+;    int rp,
+;    unsigned long *sum_s,
+;    unsigned long *sum_r,
+;    unsigned long *sum_sq_s,
+;    unsigned long *sum_sq_r,
+;    unsigned long *sum_sxr);
+;
+; TODO: Use parm passing through structure, probably don't need the pxors
+; ( calling app will initialize to 0 ) could easily fit everything in sse2
+; without too much hassle, and can probably do better estimates with psadw
+; or pavgb. At this point this is just meant to be a first pass for
+; calculating all the parms needed for 8x8 ssim so we can play with dssim
+; as distortion in mode selection code.
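+;
+; Note: the five accumulators hold the first- and second-order moments of
+; the block; for source pixels s and reference pixels r they collect
+; sum_s = sum(s), sum_r = sum(r), sum_sq_s = sum(s*s), sum_sq_r = sum(r*r)
+; and sum_sxr = sum(s*r), from which the caller derives the means,
+; variances and covariance used by the SSIM formula.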
+globalsym(av1_ssim_parms_8x8_sse2)
+sym(av1_ssim_parms_8x8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    SAVE_XMM 15
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov         rsi, arg(0)  ;s
+    mov         rcx, arg(1)  ;sp
+    mov         rdi, arg(2)  ;r
+    mov         rax, arg(3)  ;rp
+
+    pxor        xmm0, xmm0
+    pxor        xmm15, xmm15  ;sum_s
+    pxor        xmm14, xmm14  ;sum_r
+    pxor        xmm13, xmm13  ;sum_sq_s
+    pxor        xmm12, xmm12  ;sum_sq_r
+    pxor        xmm11, xmm11  ;sum_sxr
+
+    mov         rdx, 8  ;row counter
+.NextRow:
+
+    ;grab source and reference pixels
+    movq        xmm3, [rsi]
+    movq        xmm4, [rdi]
+    punpcklbw   xmm3, xmm0  ; low_s
+    punpcklbw   xmm4, xmm0  ; low_r
+
+    TABULATE_SSIM
+
+    add         rsi, rcx  ; next s row
+    add         rdi, rax  ; next r row
+
+    dec         rdx  ; counter
+    jnz .NextRow
+
+    SUM_ACROSS_W xmm15
+    SUM_ACROSS_W xmm14
+    SUM_ACROSS_Q xmm13
+    SUM_ACROSS_Q xmm12
+    SUM_ACROSS_Q xmm11
+
+    mov         rdi, arg(4)
+    movd        [rdi], xmm15
+    mov         rdi, arg(5)
+    movd        [rdi], xmm14
+    mov         rdi, arg(6)
+    movd        [rdi], xmm13
+    mov         rdi, arg(7)
+    movd        [rdi], xmm12
+    mov         rdi, arg(8)
+    movd        [rdi], xmm11
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/third_party/aom/av1/encoder/x86/av1_temporal_denoiser_sse2.c b/third_party/aom/av1/encoder/x86/av1_temporal_denoiser_sse2.c
new file mode 100644
index 0000000000..830f40ecb0
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_temporal_denoiser_sse2.c
@@ -0,0 +1,328 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/mem_sse2.h"
+
+#include "av1/common/reconinter.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/av1_temporal_denoiser.h"
+
+// Compute the sum of all pixel differences of this MB.
+static INLINE int sum_diff_16x1(__m128i acc_diff) {
+  const __m128i k_1 = _mm_set1_epi16(1);
+  const __m128i acc_diff_lo =
+      _mm_srai_epi16(_mm_unpacklo_epi8(acc_diff, acc_diff), 8);
+  const __m128i acc_diff_hi =
+      _mm_srai_epi16(_mm_unpackhi_epi8(acc_diff, acc_diff), 8);
+  const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi);
+  const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1);
+  const __m128i hgfe_dcba =
+      _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8));
+  const __m128i hgfedcba =
+      _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4));
+  return _mm_cvtsi128_si32(hgfedcba);
+}
+
+// Denoise a 16x1 vector.
+static INLINE __m128i av1_denoiser_16x1_sse2(
+    const uint8_t *sig, const uint8_t *mc_running_avg_y,
+    uint8_t *running_avg_y, const __m128i *k_0, const __m128i *k_4,
+    const __m128i *k_8, const __m128i *k_16, const __m128i *l3,
+    const __m128i *l32, const __m128i *l21, __m128i acc_diff) {
+  // Calculate differences
+  const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
+  const __m128i v_mc_running_avg_y =
+      _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
+  __m128i v_running_avg_y;
+  const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
+  const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
+  // Obtain the sign. FF if diff is negative.
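+  // (The saturating subtraction makes pdiff all-zero exactly where
+  // mc_running_avg_y <= sig, so the compare against zero below flags the
+  // non-positive differences; for zero differences the adjustment applied
+  // later is zero anyway.)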
+ const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, *k_0); + // Clamp absolute difference to 16 to be used to get mask. Doing this + // allows us to use _mm_cmpgt_epi8, which operates on signed byte. + const __m128i clamped_absdiff = + _mm_min_epu8(_mm_or_si128(pdiff, ndiff), *k_16); + // Get masks for l2 l1 and l0 adjustments. + const __m128i mask2 = _mm_cmpgt_epi8(*k_16, clamped_absdiff); + const __m128i mask1 = _mm_cmpgt_epi8(*k_8, clamped_absdiff); + const __m128i mask0 = _mm_cmpgt_epi8(*k_4, clamped_absdiff); + // Get adjustments for l2, l1, and l0. + __m128i adj2 = _mm_and_si128(mask2, *l32); + const __m128i adj1 = _mm_and_si128(mask1, *l21); + const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff); + __m128i adj, padj, nadj; + + // Combine the adjustments and get absolute adjustments. + adj2 = _mm_add_epi8(adj2, adj1); + adj = _mm_sub_epi8(*l3, adj2); + adj = _mm_andnot_si128(mask0, adj); + adj = _mm_or_si128(adj, adj0); + + // Restore the sign and get positive and negative adjustments. + padj = _mm_andnot_si128(diff_sign, adj); + nadj = _mm_and_si128(diff_sign, adj); + + // Calculate filtered value. + v_running_avg_y = _mm_adds_epu8(v_sig, padj); + v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj); + _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y); + + // Adjustments <=7, and each element in acc_diff can fit in signed + // char. + acc_diff = _mm_adds_epi8(acc_diff, padj); + acc_diff = _mm_subs_epi8(acc_diff, nadj); + return acc_diff; +} + +// Denoise a 16x1 vector with a weaker filter. +static INLINE __m128i av1_denoiser_adj_16x1_sse2( + const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y, + const __m128i k_0, const __m128i k_delta, __m128i acc_diff) { + __m128i v_running_avg_y = _mm_loadu_si128((__m128i *)(&running_avg_y[0])); + // Calculate differences. + const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0])); + const __m128i v_mc_running_avg_y = + _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0])); + const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig); + const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y); + // Obtain the sign. FF if diff is negative. + const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0); + // Clamp absolute difference to delta to get the adjustment. + const __m128i adj = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta); + // Restore the sign and get positive and negative adjustments. + __m128i padj, nadj; + padj = _mm_andnot_si128(diff_sign, adj); + nadj = _mm_and_si128(diff_sign, adj); + // Calculate filtered value. + v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj); + v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj); + _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y); + + // Accumulate the adjustments. + acc_diff = _mm_subs_epi8(acc_diff, padj); + acc_diff = _mm_adds_epi8(acc_diff, nadj); + return acc_diff; +} + +// Denoise 8x8 and 8x16 blocks. +static int av1_denoiser_NxM_sse2_small(const uint8_t *sig, int sig_stride, + const uint8_t *mc_running_avg_y, + int mc_avg_y_stride, + uint8_t *running_avg_y, int avg_y_stride, + int increase_denoising, BLOCK_SIZE bs, + int motion_magnitude, int width) { + int sum_diff_thresh, r, sum_diff = 0; + const int shift_inc = + (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) + ? 
1 + : 0; + uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16]; + __m128i acc_diff = _mm_setzero_si128(); + const __m128i k_0 = _mm_setzero_si128(); + const __m128i k_4 = _mm_set1_epi8(4 + shift_inc); + const __m128i k_8 = _mm_set1_epi8(8); + const __m128i k_16 = _mm_set1_epi8(16); + // Modify each level's adjustment according to motion_magnitude. + const __m128i l3 = _mm_set1_epi8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6); + // Difference between level 3 and level 2 is 2. + const __m128i l32 = _mm_set1_epi8(2); + // Difference between level 2 and level 1 is 1. + const __m128i l21 = _mm_set1_epi8(1); + const int b_height = block_size_high[bs] >> 1; + + for (r = 0; r < b_height; ++r) { + memcpy(sig_buffer[r], sig, width); + memcpy(sig_buffer[r] + width, sig + sig_stride, width); + memcpy(mc_running_buffer[r], mc_running_avg_y, width); + memcpy(mc_running_buffer[r] + width, mc_running_avg_y + mc_avg_y_stride, + width); + memcpy(running_buffer[r], running_avg_y, width); + memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width); + acc_diff = av1_denoiser_16x1_sse2(sig_buffer[r], mc_running_buffer[r], + running_buffer[r], &k_0, &k_4, &k_8, + &k_16, &l3, &l32, &l21, acc_diff); + memcpy(running_avg_y, running_buffer[r], width); + memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width, width); + // Update pointers for next iteration. + sig += (sig_stride << 1); + mc_running_avg_y += (mc_avg_y_stride << 1); + running_avg_y += (avg_y_stride << 1); + } + + { + sum_diff = sum_diff_16x1(acc_diff); + sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising); + if (abs(sum_diff) > sum_diff_thresh) { + // Before returning to copy the block (i.e., apply no denoising), + // check if we can still apply some (weaker) temporal filtering to + // this block, that would otherwise not be denoised at all. Simplest + // is to apply an additional adjustment to running_avg_y to bring it + // closer to sig. The adjustment is capped by a maximum delta, and + // chosen such that in most cases the resulting sum_diff will be + // within the acceptable range given by sum_diff_thresh. + + // The delta is set by the excess of absolute pixel diff over the + // threshold. + const int delta = + ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1; + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + const __m128i k_delta = _mm_set1_epi8(delta); + running_avg_y -= avg_y_stride * (b_height << 1); + for (r = 0; r < b_height; ++r) { + acc_diff = av1_denoiser_adj_16x1_sse2( + sig_buffer[r], mc_running_buffer[r], running_buffer[r], k_0, + k_delta, acc_diff); + memcpy(running_avg_y, running_buffer[r], width); + memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width, + width); + // Update pointers for next iteration. + running_avg_y += (avg_y_stride << 1); + } + sum_diff = sum_diff_16x1(acc_diff); + if (abs(sum_diff) > sum_diff_thresh) { + return COPY_BLOCK; + } + } else { + return COPY_BLOCK; + } + } + } + return FILTER_BLOCK; +} + +// Denoise 16x16 to 128x128 blocks. +static int av1_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride, + const uint8_t *mc_running_avg_y, + int mc_avg_y_stride, + uint8_t *running_avg_y, int avg_y_stride, + int increase_denoising, BLOCK_SIZE bs, + int motion_magnitude) { + int sum_diff_thresh, r, c, sum_diff = 0; + const int shift_inc = + (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) + ? 
1 + : 0; + __m128i acc_diff[8][8]; + const __m128i k_0 = _mm_setzero_si128(); + const __m128i k_4 = _mm_set1_epi8(4 + shift_inc); + const __m128i k_8 = _mm_set1_epi8(8); + const __m128i k_16 = _mm_set1_epi8(16); + // Modify each level's adjustment according to motion_magnitude. + const __m128i l3 = _mm_set1_epi8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6); + // Difference between level 3 and level 2 is 2. + const __m128i l32 = _mm_set1_epi8(2); + // Difference between level 2 and level 1 is 1. + const __m128i l21 = _mm_set1_epi8(1); + const int b_width = block_size_wide[bs]; + const int b_height = block_size_high[bs]; + const int b_width_shift4 = b_width >> 4; + + for (r = 0; r < 8; ++r) { + for (c = 0; c < b_width_shift4; ++c) { + acc_diff[c][r] = _mm_setzero_si128(); + } + } + + for (r = 0; r < b_height; ++r) { + for (c = 0; c < b_width_shift4; ++c) { + acc_diff[c][r >> 4] = av1_denoiser_16x1_sse2( + sig, mc_running_avg_y, running_avg_y, &k_0, &k_4, &k_8, &k_16, &l3, + &l32, &l21, acc_diff[c][r >> 4]); + // Update pointers for next iteration. + sig += 16; + mc_running_avg_y += 16; + running_avg_y += 16; + } + + if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) { + for (c = 0; c < b_width_shift4; ++c) { + sum_diff += sum_diff_16x1(acc_diff[c][r >> 4]); + } + } + + // Update pointers for next iteration. + sig = sig - b_width + sig_stride; + mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride; + running_avg_y = running_avg_y - b_width + avg_y_stride; + } + + { + sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising); + if (abs(sum_diff) > sum_diff_thresh) { + const int delta = + ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1; + + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + const __m128i k_delta = _mm_set1_epi8(delta); + sig -= sig_stride * b_height; + mc_running_avg_y -= mc_avg_y_stride * b_height; + running_avg_y -= avg_y_stride * b_height; + sum_diff = 0; + for (r = 0; r < b_height; ++r) { + for (c = 0; c < b_width_shift4; ++c) { + acc_diff[c][r >> 4] = + av1_denoiser_adj_16x1_sse2(sig, mc_running_avg_y, running_avg_y, + k_0, k_delta, acc_diff[c][r >> 4]); + // Update pointers for next iteration. + sig += 16; + mc_running_avg_y += 16; + running_avg_y += 16; + } + + if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) { + for (c = 0; c < b_width_shift4; ++c) { + sum_diff += sum_diff_16x1(acc_diff[c][r >> 4]); + } + } + sig = sig - b_width + sig_stride; + mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride; + running_avg_y = running_avg_y - b_width + avg_y_stride; + } + if (abs(sum_diff) > sum_diff_thresh) { + return COPY_BLOCK; + } + } else { + return COPY_BLOCK; + } + } + } + return FILTER_BLOCK; +} + +int av1_denoiser_filter_sse2(const uint8_t *sig, int sig_stride, + const uint8_t *mc_avg, int mc_avg_stride, + uint8_t *avg, int avg_stride, + int increase_denoising, BLOCK_SIZE bs, + int motion_magnitude) { + // Rank by frequency of the block type to have an early termination. 
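+  // (Blocks that are at least 16 pixels wide go through the _big path,
+  // which filters full 16x1 rows in place; 8-pixel-wide blocks go through
+  // the _small path, which first packs two 8-pixel rows into one 16-byte
+  // buffer. Any other size is left to the caller, which copies the block.)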
+  if (bs == BLOCK_16X16 || bs == BLOCK_32X32 || bs == BLOCK_64X64 ||
+      bs == BLOCK_128X128 || bs == BLOCK_128X64 || bs == BLOCK_64X128 ||
+      bs == BLOCK_16X32 || bs == BLOCK_16X8 || bs == BLOCK_32X16 ||
+      bs == BLOCK_32X64 || bs == BLOCK_64X32) {
+    return av1_denoiser_NxM_sse2_big(sig, sig_stride, mc_avg, mc_avg_stride,
+                                     avg, avg_stride, increase_denoising, bs,
+                                     motion_magnitude);
+  } else if (bs == BLOCK_8X8 || bs == BLOCK_8X16) {
+    return av1_denoiser_NxM_sse2_small(sig, sig_stride, mc_avg, mc_avg_stride,
+                                       avg, avg_stride, increase_denoising,
+                                       bs, motion_magnitude, 8);
+  } else {
+    return COPY_BLOCK;
+  }
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h b/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h
new file mode 100644
index 0000000000..7a0f32898b
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
+#define AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
+
+#include <smmintrin.h>
+#include "av1/common/av1_txfm.h"
+#include "av1/common/x86/av1_txfm_sse4.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_fdct4_sse4_1(const __m128i *input, __m128i *output,
+                      const int8_t cos_bit, const int8_t *stage_range);
+void av1_fdct8_sse4_1(const __m128i *input, __m128i *output,
+                      const int8_t cos_bit, const int8_t *stage_range);
+void av1_fdct16_sse4_1(const __m128i *input, __m128i *output,
+                       const int8_t cos_bit, const int8_t *stage_range);
+void av1_fdct32_sse4_1(__m128i *input, __m128i *output, int cos_bit,
+                       const int stride);
+void av1_fdct64_sse4_1(__m128i *input, __m128i *output, int8_t cos_bit,
+                       const int instride, const int outstride);
+void av1_fadst4_sse4_1(const __m128i *input, __m128i *output,
+                       const int8_t cos_bit, const int8_t *stage_range);
+void av1_fadst8_sse4_1(const __m128i *input, __m128i *output,
+                       const int8_t cos_bit, const int8_t *stage_range);
+void av1_fadst16_sse4_1(const __m128i *input, __m128i *output,
+                        const int8_t cos_bit, const int8_t *stage_range);
+
+void av1_idct4_sse4_1(const __m128i *input, __m128i *output,
+                      const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct8_sse4_1(const __m128i *input, __m128i *output,
+                      const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct16_sse4_1(const __m128i *input, __m128i *output,
+                       const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct32_sse4_1(const __m128i *input, __m128i *output,
+                       const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct64_sse4_1(const __m128i *input, __m128i *output,
+                       const int8_t cos_bit, const int8_t *stage_range);
+
+void av1_iadst4_sse4_1(const __m128i *input, __m128i *output,
+                       const int8_t cos_bit, const int8_t *stage_range);
+void av1_iadst8_sse4_1(const __m128i *input, __m128i *output,
+                       const int8_t cos_bit, const int8_t *stage_range);
+void av1_iadst16_sse4_1(const __m128i *input, __m128i *output,
+                        const int8_t cos_bit, const int8_t *stage_range);
+
+void av1_idtx32_sse4_1(__m128i *input, __m128i *output, int cos_bit,
+                       const int
col_num); + +static INLINE void transpose_32_4x4(int stride, const __m128i *input, + __m128i *output) { + __m128i temp0 = _mm_unpacklo_epi32(input[0 * stride], input[2 * stride]); + __m128i temp1 = _mm_unpackhi_epi32(input[0 * stride], input[2 * stride]); + __m128i temp2 = _mm_unpacklo_epi32(input[1 * stride], input[3 * stride]); + __m128i temp3 = _mm_unpackhi_epi32(input[1 * stride], input[3 * stride]); + + output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2); + output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2); + output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3); + output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3); +} + +// the entire input block can be represent by a grid of 4x4 blocks +// each 4x4 blocks can be represent by 4 vertical __m128i +// we first transpose each 4x4 block internally +// then transpose the grid +static INLINE void transpose_32(int txfm_size, const __m128i *input, + __m128i *output) { + const int num_per_128 = 4; + const int row_size = txfm_size; + const int col_size = txfm_size / num_per_128; + int r, c; + + // transpose each 4x4 block internally + for (r = 0; r < row_size; r += 4) { + for (c = 0; c < col_size; c++) { + transpose_32_4x4(col_size, &input[r * col_size + c], + &output[c * 4 * col_size + r / 4]); + } + } +} + +// out0 = in0*w0 + in1*w1 +// out1 = -in1*w0 + in0*w1 +#define btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit) \ + do { \ + const __m128i ww0 = _mm_set1_epi32(w0); \ + const __m128i ww1 = _mm_set1_epi32(w1); \ + const __m128i in0_w0 = _mm_mullo_epi32(in0, ww0); \ + const __m128i in1_w1 = _mm_mullo_epi32(in1, ww1); \ + out0 = _mm_add_epi32(in0_w0, in1_w1); \ + out0 = av1_round_shift_32_sse4_1(out0, bit); \ + const __m128i in0_w1 = _mm_mullo_epi32(in0, ww1); \ + const __m128i in1_w0 = _mm_mullo_epi32(in1, ww0); \ + out1 = _mm_sub_epi32(in0_w1, in1_w0); \ + out1 = av1_round_shift_32_sse4_1(out1, bit); \ + } while (0) + +// out0 = in0*w0 + in1*w1 +// out1 = in1*w0 - in0*w1 +#define btf_32_sse4_1_type1(w0, w1, in0, in1, out0, out1, bit) \ + do { \ + btf_32_sse4_1_type0(w1, w0, in1, in0, out0, out1, bit); \ + } while (0) + +// out0 = in0*w0 + in1*w1 +// out1 = -in1*w0 + in0*w1 +#define btf_32_type0_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \ + do { \ + const __m128i in0_w0 = _mm_mullo_epi32(in0, ww0); \ + const __m128i in1_w1 = _mm_mullo_epi32(in1, ww1); \ + out0 = _mm_add_epi32(in0_w0, in1_w1); \ + out0 = _mm_add_epi32(out0, r); \ + out0 = _mm_srai_epi32(out0, bit); \ + const __m128i in0_w1 = _mm_mullo_epi32(in0, ww1); \ + const __m128i in1_w0 = _mm_mullo_epi32(in1, ww0); \ + out1 = _mm_sub_epi32(in0_w1, in1_w0); \ + out1 = _mm_add_epi32(out1, r); \ + out1 = _mm_srai_epi32(out1, bit); \ + } while (0) + +// out0 = in0*w0 + in1*w1 +// out1 = in1*w0 - in0*w1 +#define btf_32_type1_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \ + do { \ + btf_32_type0_sse4_1_new(ww1, ww0, in1, in0, out0, out1, r, bit); \ + } while (0) + +#ifdef __cplusplus +} +#endif + +#endif // AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_ diff --git a/third_party/aom/av1/encoder/x86/cnn_avx2.c b/third_party/aom/av1/encoder/x86/cnn_avx2.c new file mode 100644 index 0000000000..ee93b3d5a0 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/cnn_avx2.c @@ -0,0 +1,532 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+#include <math.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/encoder/cnn.h"
+
+// This mask rearranges source pixels in the order shown below.
+// shuffle_src_layer0[0][8]: applied on source pixels 0 to 7.
+// shuffle_src_layer0[1][8]: applied on source pixels 7 to 14.
+// This shuffling is needed to process 3 5x5 blocks which need
+// source pixels in the following order.
+// 1st 5x5 block: source pixels needed are 0 to 4,
+// 2nd 5x5 block: source pixels needed are 4 to 8,
+// 3rd 5x5 block: source pixels needed are 8 to 12.
+// Source pixels are loaded as shown below.
+// load_src0 : 0, 1, 2, 3, 4, 5, 6, 7
+// load_src1 : 7, 8, 9, 10, 11, 12, 13, 14
+// After applying masks, source bytes will be in the order:
+// load_src0 : 0, 1, 2, 3, 4, 4, 5, 6
+//             consists of the 5 pixels needed for the 1st 5x5 block and
+//             the first 3 pixels needed for the 2nd 5x5 block.
+// load_src1 : 7, 8, 8, 9, 10, 11, 12, x
+//             consists of the last 2 pixels needed for the 2nd 5x5 block and
+//             the 5 pixels needed for the 3rd 5x5 block.
+DECLARE_ALIGNED(32, static const uint32_t,
+                shuffle_src_layer0[2][8]) = { { 0, 1, 2, 3, 4, 4, 5, 6 },
+                                              { 0, 1, 1, 2, 3, 4, 5, 0 } };
+
+// This mask rearranges the weights to match the shuffled source pixel order.
+DECLARE_ALIGNED(32, static const uint32_t,
+                shuffle_weight_layer0[2][8]) = { { 0, 1, 2, 3, 4, 0, 1, 2 },
+                                                 { 3, 4, 0, 1, 2, 3, 4, 0 } };
+
+// Shuffle mask used to rearrange weights corresponding to layer 1 and layer 2.
+// For layer 1 and layer 2, convolution happens at 2x2 as filter_width and
+// filter_height are equal to 2. So the weights are rearranged in the order
+// shown below to match the source pixels. Basically this mask replicates
+// the weights across the width of 2.
+DECLARE_ALIGNED(32, static const uint32_t,
+                shuffle_weight_layer_1_and_2[2][8]) = {
+  { 0, 1, 0, 1, 0, 1, 0, 1 }, { 2, 3, 2, 3, 2, 3, 2, 3 }
+};
+
+// After the stages of multiplication and accumulation, the output values
+// in the register will be jumbled. In order to store the register into the
+// output buffer in the proper order, the following mask is applied to the
+// output register.
+DECLARE_ALIGNED(32, static const uint32_t,
+                shuffle_output_layer_1_and_2[8]) = { 0, 1, 4, 5, 2, 3, 6, 7 };
+
+// Load weights needed for layer 0 (for 5x5 block processing),
+// and fill the registers appropriately to match source pixel mapping.
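+// For one weight row {w0, w1, w2, w3, w4, 0, 0, 0}, the two permutes in
+// this function produce {w0 w1 w2 w3 w4 w0 w1 w2} and
+// {w3 w4 w0 w1 w2 w3 w4 w0}, mirroring the source orders produced by
+// shuffle_src_layer0 so that one lane-wise multiply per row covers all
+// three 5x5 blocks.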
+static INLINE void prepare_weights_for_5x5_convolve( + const float *layer_config_weights, int off, float weight[5][8], + const int cstep, __m256 *shuffle_weight, const __m256i weight_mask_0, + const __m256i weight_mask_1) { + for (int row = 0; row < 5; ++row) { + for (int col = 0; col < 5; ++col) { + weight[row][col] = layer_config_weights[off]; + off += cstep; + } + } + shuffle_weight[0] = _mm256_loadu_ps(weight[0]); + shuffle_weight[1] = _mm256_loadu_ps(weight[1]); + shuffle_weight[2] = _mm256_loadu_ps(weight[2]); + shuffle_weight[3] = _mm256_loadu_ps(weight[3]); + shuffle_weight[4] = _mm256_loadu_ps(weight[4]); + + shuffle_weight[0] = + _mm256_permutevar8x32_ps(shuffle_weight[0], weight_mask_0); + shuffle_weight[1] = + _mm256_permutevar8x32_ps(shuffle_weight[1], weight_mask_0); + shuffle_weight[2] = + _mm256_permutevar8x32_ps(shuffle_weight[2], weight_mask_0); + shuffle_weight[3] = + _mm256_permutevar8x32_ps(shuffle_weight[3], weight_mask_0); + shuffle_weight[4] = + _mm256_permutevar8x32_ps(shuffle_weight[4], weight_mask_0); + shuffle_weight[5] = + _mm256_permutevar8x32_ps(shuffle_weight[0], weight_mask_1); + shuffle_weight[6] = + _mm256_permutevar8x32_ps(shuffle_weight[1], weight_mask_1); + shuffle_weight[7] = + _mm256_permutevar8x32_ps(shuffle_weight[2], weight_mask_1); + shuffle_weight[8] = + _mm256_permutevar8x32_ps(shuffle_weight[3], weight_mask_1); + shuffle_weight[9] = + _mm256_permutevar8x32_ps(shuffle_weight[4], weight_mask_1); +} + +// For each row, loads source pixels 0 to 7(load_src_0), 7 to 14(load_src_1) and +// arranges them appropriately to process 3 blocks. +#define PERFORM_CONVOLVE_FOR_3_5X5_BLOCKS() \ + do { \ + for (int row = 0; row < 5; row++) { \ + load_src_0 = _mm256_loadu_ps(input_ptr); \ + load_src_1 = _mm256_loadu_ps(input_ptr + 7); \ + load_src_0 = _mm256_permutevar8x32_ps(load_src_0, block0_1); \ + load_src_1 = _mm256_permutevar8x32_ps(load_src_1, block1_2); \ + load_src_0 = _mm256_mul_ps(load_src_0, shuffle_weight[0 + row]); \ + load_src_1 = _mm256_mul_ps(load_src_1, shuffle_weight[5 + row]); \ + accum_src_0 = _mm256_add_ps(load_src_0, accum_src_0); \ + accum_src_1 = _mm256_add_ps(load_src_1, accum_src_1); \ + input_ptr += in_stride; \ + } \ + } while (0) + +// Load masks needed for shuffling of output and weights. +static INLINE void load_shuffle_masks_for_2x2_convolve(__m256i *output_mask, + __m256i *weight_mask) { + // Load shuffle buffer needed to sort the output. + *output_mask = + _mm256_load_si256((const __m256i *)shuffle_output_layer_1_and_2); + + // Load shuffle buffers needed for weight. + weight_mask[0] = + _mm256_load_si256((const __m256i *)shuffle_weight_layer_1_and_2[0]); + weight_mask[1] = + _mm256_load_si256((const __m256i *)shuffle_weight_layer_1_and_2[1]); +} + +// Load weights needed for layer 1 and 2 (for 2x2 block processing), +// and fill the registers appropriately to match source pixel mapping. +static INLINE void prepare_weights_for_2x2_convolve( + const float *layer_config_weights, int off, const int cstep, + __m256 *shuffle_weight, __m256i *weight_mask) { + // Weights needed for 2x2 block. + float weight[4] = { 0 }; + for (int i = 0; i < 4; ++i) { + weight[i] = layer_config_weights[off]; + off += cstep; + } + + const __m256 weight_vec = _mm256_castps128_ps256(_mm_loadu_ps(weight)); + shuffle_weight[0] = _mm256_permutevar8x32_ps(weight_vec, weight_mask[0]); + shuffle_weight[1] = _mm256_permutevar8x32_ps(weight_vec, weight_mask[1]); +} + +// Do convolution of one 5x5 block. 
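+// Only the first four columns of each row go through the 4-wide SIMD
+// multiplies; the fifth column of every row is folded in through the
+// scalar last_column_sum accumulator.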
+#define PERFORM_CONVOLVE_FOR_1_5X5_BLOCK(w, accum0, in_stride) \ + do { \ + __m128 load_src[5]; \ + load_src[0] = _mm_loadu_ps(input_ptr); \ + last_column_sum += input_ptr[4] * weight[0][4]; \ + input_ptr += in_stride; \ + load_src[1] = _mm_loadu_ps(input_ptr); \ + last_column_sum += input_ptr[4] * weight[1][4]; \ + input_ptr += in_stride; \ + load_src[2] = _mm_loadu_ps(input_ptr); \ + last_column_sum += input_ptr[4] * weight[2][4]; \ + input_ptr += in_stride; \ + load_src[3] = _mm_loadu_ps(input_ptr); \ + last_column_sum += input_ptr[4] * weight[3][4]; \ + input_ptr += in_stride; \ + load_src[4] = _mm_loadu_ps(input_ptr); \ + last_column_sum += input_ptr[4] * weight[4][4]; \ + \ + load_src[0] = _mm_mul_ps(load_src[0], _mm256_castps256_ps128(w[0])); \ + load_src[1] = _mm_mul_ps(load_src[1], _mm256_castps256_ps128(w[1])); \ + load_src[2] = _mm_mul_ps(load_src[2], _mm256_castps256_ps128(w[2])); \ + load_src[3] = _mm_mul_ps(load_src[3], _mm256_castps256_ps128(w[3])); \ + load_src[4] = _mm_mul_ps(load_src[4], _mm256_castps256_ps128(w[4])); \ + \ + accum0 = _mm_add_ps(load_src[0], accum0); \ + load_src[1] = _mm_add_ps(load_src[1], load_src[2]); \ + load_src[3] = _mm_add_ps(load_src[3], load_src[4]); \ + load_src[1] = _mm_add_ps(load_src[1], load_src[3]); \ + accum0 = _mm_add_ps(accum0, load_src[1]); \ + } while (0) + +// Do convolution on 8 horizontal 2x2 blocks. +static INLINE void perform_convolve_for_8h_2x2_blocks( + const float *input_ptr, int in_stride, __m256 *weight, __m256 *out_accum, + __m256i shuffle_output_mask) { + __m256 load_src[4]; + // Load input into source registers. + load_src[0] = _mm256_loadu_ps(input_ptr); + load_src[1] = _mm256_loadu_ps(input_ptr + 8); + load_src[2] = _mm256_loadu_ps(input_ptr + in_stride); + load_src[3] = _mm256_loadu_ps(input_ptr + in_stride + 8); + + // Multiply the loaded input with corresponding weights. + load_src[0] = _mm256_mul_ps(load_src[0], weight[0]); + load_src[1] = _mm256_mul_ps(load_src[1], weight[0]); + load_src[2] = _mm256_mul_ps(load_src[2], weight[1]); + load_src[3] = _mm256_mul_ps(load_src[3], weight[1]); + + // Accumulate across 2x2 blocks. + load_src[0] = _mm256_add_ps(load_src[0], load_src[2]); + load_src[1] = _mm256_add_ps(load_src[1], load_src[3]); + load_src[0] = _mm256_hadd_ps(load_src[0], load_src[1]); + + // Sort the output in order to store into output buffer. + load_src[0] = _mm256_permutevar8x32_ps(load_src[0], shuffle_output_mask); + *out_accum = _mm256_add_ps(*out_accum, load_src[0]); +} + +// Do convolution on 8 (4 horizontal x 2 vertical) 2x2 blocks. +static INLINE void perform_convolve_for_4hx2v_2x2_blocks( + const float *input_ptr, int in_stride, __m256 *weight, __m256 *out_accum, + __m256i shuffle_output_mask) { + __m256 load_src[4]; + // Load input into source registers. + load_src[0] = _mm256_loadu_ps(input_ptr); + load_src[1] = _mm256_loadu_ps(input_ptr + in_stride); + load_src[2] = _mm256_loadu_ps(input_ptr + (in_stride * 2)); + load_src[3] = _mm256_loadu_ps(input_ptr + (in_stride * 3)); + + // Multiply the loaded input with corresponding weights. + load_src[0] = _mm256_mul_ps(load_src[0], weight[0]); + load_src[1] = _mm256_mul_ps(load_src[1], weight[1]); + load_src[2] = _mm256_mul_ps(load_src[2], weight[0]); + load_src[3] = _mm256_mul_ps(load_src[3], weight[1]); + + // Accumulate across 2x2 blocks. 
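+  // (The two adds below combine the top and bottom rows of each 2x2 block;
+  // the hadd then sums horizontal pairs, leaving the eight block sums in an
+  // interleaved order that shuffle_output_mask restores.)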
+ load_src[0] = _mm256_add_ps(load_src[0], load_src[1]); + load_src[2] = _mm256_add_ps(load_src[2], load_src[3]); + load_src[0] = _mm256_hadd_ps(load_src[0], load_src[2]); + + // Sort the output in order to store into output buffer. + load_src[0] = _mm256_permutevar8x32_ps(load_src[0], shuffle_output_mask); + *out_accum = _mm256_add_ps(*out_accum, load_src[0]); +} + +// AVX2 variant of av1_cnn_convolve_no_maxpool_padding_valid_c(), when +// filter_width and filter_height are equal to 5. +// CNN convolve parsing is based on av1_intra_mode_cnn_partition_cnn_config. +// Based on the configuration set for each layer, the current encoder +// always chooses the case of no_maxpool_padding_valid. +// And also for layer 0 convolution happens at 5x5 level as the +// filter_width and filter_height are set as 5. +static void cnn_convolve_no_maxpool_padding_valid_5x5_avx2( + const float **input, int in_width, int in_height, int in_stride, + const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride, + int start_idx, const int cstep, const int channel_step) { + const int kFilterWidth = 5; + const int kFilterHeight = 5; + const int kSkipWidth = 4; + const int kSkipHeight = 4; + assert(layer_config->filter_width == kFilterWidth && + layer_config->filter_height == kFilterHeight); + assert(layer_config->skip_width == kSkipWidth && + layer_config->skip_height == kSkipHeight); + + // Load shuffle buffers needed for source. + const __m256i block0_1 = + _mm256_load_si256((const __m256i *)shuffle_src_layer0[0]); + const __m256i block1_2 = + _mm256_load_si256((const __m256i *)shuffle_src_layer0[1]); + + // Load shuffle buffers needed for weight. + const __m256i weight_mask_0 = + _mm256_load_si256((const __m256i *)shuffle_weight_layer0[0]); + const __m256i weight_mask_1 = + _mm256_load_si256((const __m256i *)shuffle_weight_layer0[1]); + + // Width needs to be moved to go to next iteration of processing 3 5x5 blocks. + const int kSkipWidthForNextIter = kSkipWidth * 3; + + // Minimum width required to process 3 5x5 blocks at a time. + // min width (for processing 3 5x5 block) = 2*skip_width + filter_width + // Here, skip_width specifies how much width we should move while processing + // next block convolution and filter_width specifies for how many pixels + // filter needs to be applied. + const int kMinWidthFor3_5x5Blocks = (kSkipWidth * 2) + kFilterWidth; + for (int i = start_idx; i < layer_config->out_channels; i += channel_step) { + const float out_ch_bias = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + __m256 shuffle_weight[10]; + + // Weights needed are 5x5, for SIMD purpose made this array as 5x8. + float weight[5][8] = { { 0 } }; + int off = k * layer_config->out_channels + i; + + // In layer 0, the convolution process happens at 5x5. + // The weights needed for 5x5 block are same across the in-channels, + // which is why the load of weights happens once for each in-channel. + prepare_weights_for_5x5_convolve(layer_config->weights, off, weight, + cstep, shuffle_weight, weight_mask_0, + weight_mask_1); + + for (int h = 0, u = 0; h < in_height - kFilterHeight + 1; + h += kSkipHeight, ++u) { + const int out_h = u * out_stride; + int v = 0; + int w = 0; + int rem_width = in_width; + // Processing 3 5x5 blocks at a time, if sufficient width is present. 
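+        // (With skip_width = 4 and filter_width = 5, this requires
+        // 2 * 4 + 5 = 13 input pixels per pass of the loop below.)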
+ while (rem_width >= kMinWidthFor3_5x5Blocks) { + __m256 load_src_0, load_src_1; + __m256 accum_src_0 = _mm256_setzero_ps(); + __m256 accum_src_1 = _mm256_setzero_ps(); + const float *input_ptr = &input[k][h * in_stride + w]; + PERFORM_CONVOLVE_FOR_3_5X5_BLOCKS(); + + // Accumulate across column. + __m256 accum = _mm256_hadd_ps(accum_src_0, accum_src_1); + __m128 tmp_reg_0 = _mm256_extractf128_ps(accum_src_0, 1); + __m128 tmp_reg_1 = _mm256_extractf128_ps(accum_src_1, 1); + + __m128 accum_l = _mm256_castps256_ps128(accum); + __m128 accum_h = _mm256_extractf128_ps(accum, 1); + + __m128 tmp_reg_2 = _mm_add_ps(accum_l, tmp_reg_0); + __m128 tmp_reg_3 = _mm_add_ps(tmp_reg_0, accum_h); + __m128 tmp_reg_4 = _mm_add_ps(tmp_reg_1, accum_h); + + // 1st 5x5 block output. + output[i][out_h + v] = + out_ch_bias + _mm_cvtss_f32(tmp_reg_2) + + _mm_cvtss_f32(_mm_shuffle_ps(accum_l, accum_l, 1)); + + // 2nd 5x5 block output. + output[i][out_h + v + 1] = + out_ch_bias + + _mm_cvtss_f32(_mm_shuffle_ps(tmp_reg_3, tmp_reg_3, 1)) + + _mm_cvtss_f32(_mm_shuffle_ps(accum_l, accum_l, 2)); + + // 3rd 5x5 block output. + output[i][out_h + v + 2] = + out_ch_bias + + _mm_cvtss_f32(_mm_shuffle_ps(tmp_reg_4, tmp_reg_4, 2)) + + _mm_cvtss_f32(_mm_shuffle_ps(accum_l, accum_l, 3)); + + v += 3; + w += kSkipWidthForNextIter; + rem_width -= kSkipWidthForNextIter; + } + + // Process remaining blocks as single 5x5 block at a time. + while (rem_width >= kFilterWidth) { + float last_column_sum = 0; + __m128 accum = _mm_setzero_ps(); + const float *input_ptr = &input[k][h * in_stride + w]; + PERFORM_CONVOLVE_FOR_1_5X5_BLOCK(shuffle_weight, accum, in_stride); + + // Accumulate across column. + accum = _mm_hadd_ps(accum, accum); + output[i][out_h + v] = out_ch_bias + last_column_sum + + _mm_cvtss_f32(accum) + + _mm_cvtss_f32(_mm_shuffle_ps(accum, accum, 1)); + + v += 1; + w += kSkipWidth; + rem_width -= kSkipWidth; + } + } + } + } +} + +// AVX2 implementation for layer 1. +static INLINE void cnn_convolve_no_maxpool_padding_valid_layer1_avx2( + const float **input, int in_stride, + const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride, + int start_idx, const int cstep, const int channel_step) { + __m256i weight_mask[2]; + __m256i shuffle_output_mask; + load_shuffle_masks_for_2x2_convolve(&shuffle_output_mask, weight_mask); + + const int kInHeight = 16; + const int kFilterHeight = 2; + const int kSkipHeight = 2; + for (int i = start_idx; i < layer_config->out_channels; i += channel_step) { + __m256 bias_reg = _mm256_set1_ps(layer_config->bias[i]); + // out_accum registers are used to store the 2x2 convolve outputs + // (calculated over input block size), which are accumulated across the + // in_channels. As per the design, each iteration of for loop processes 8 + // (horizontal) 2x2 blocks and stores in corresponding out_accum register + // (as input size is 16x16, a total of 64 2x2 blocks are present and 8 + // out_accum registers are enough to store the outputs). + // Hence for loops corresponding to 'j' and 'h', below, run over the number + // of out_accum registers. + __m256 out_accum[8]; + for (int j = 0; j < 8; ++j) out_accum[j] = bias_reg; + for (int k = 0; k < layer_config->in_channels; ++k) { + __m256 shuffle_weight[2]; + int off = k * layer_config->out_channels + i; + // In layer 1, the convolution process happens at 2x2. + // The weights needed for 2x2 block are same across the in-channels, + // which is why the load of weights happens once for each in-channel. 
+ prepare_weights_for_2x2_convolve(layer_config->weights, off, cstep, + shuffle_weight, weight_mask); + + for (int h = 0, u = 0; h < kInHeight - kFilterHeight + 1; + h += kSkipHeight, ++u) { + const float *input_ptr = &input[k][h * in_stride]; + perform_convolve_for_8h_2x2_blocks(input_ptr, in_stride, shuffle_weight, + &out_accum[u], shuffle_output_mask); + } + } + // Store output of layer 1. + for (int j = 0; j < 8; ++j) { + _mm256_storeu_ps(&output[i][j * out_stride], out_accum[j]); + } + } +} + +// AVX2 implementation for layer 2. +static INLINE void cnn_convolve_no_maxpool_padding_valid_layer2_avx2( + const float **input, int in_stride, + const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride, + int start_idx, const int cstep, const int channel_step) { + __m256i weight_mask[2]; + __m256i shuffle_output_mask; + load_shuffle_masks_for_2x2_convolve(&shuffle_output_mask, weight_mask); + + const int kInHeight = 8; + const int kFilterHeight = 2; + const int kSkipHeight = 2; + for (int i = start_idx; i < layer_config->out_channels; i += channel_step) { + __m256 bias_reg = _mm256_set1_ps(layer_config->bias[i]); + // out_accum registers are used to store the 2x2 convolve outputs + // (calculated over input block size), which are accumulated across the + // in_channels. As per the design, each iteration of for loop processes 8 + // (4 horizontal x 2 vertical) 2x2 blocks and stores in corresponding + // out_accum register (as input size is 8x8, a total of 16 2x2 blocks are + // present and 2 out_accum registers are enough to store the outputs). + // Hence for loops corresponding to 'j' and 'h', below, run over the number + // of out_accum registers. + __m256 out_accum[2]; + + // Height needs to be moved to go to next iteration of processing + // while processing 2 2x2 blocks vertically. + const int kSkipHeightForNextIter = kSkipHeight * 2; + for (int j = 0; j < 2; ++j) out_accum[j] = bias_reg; + for (int k = 0; k < layer_config->in_channels; ++k) { + __m256 shuffle_weight[2]; + int off = k * layer_config->out_channels + i; + // In layer 2, the convolution process happens at 2x2. + // The weights needed for 2x2 block are same across the in-channels, + // which is why the load of weights happens once for each in-channel. + prepare_weights_for_2x2_convolve(layer_config->weights, off, cstep, + shuffle_weight, weight_mask); + + for (int h = 0, u = 0; h < kInHeight - kFilterHeight + 1; + h += kSkipHeightForNextIter, ++u) { + const float *input_ptr = &input[k][h * in_stride]; + perform_convolve_for_4hx2v_2x2_blocks(input_ptr, in_stride, + shuffle_weight, &out_accum[u], + shuffle_output_mask); + } + } + // Store output of layer 2. + for (int j = 0; j < 2; ++j) { + _mm256_storeu_ps(&output[i][j * out_stride * 2], out_accum[j]); + } + } +} + +// AVX2 variant of av1_cnn_convolve_no_maxpool_padding_valid_c(), when +// filter_width and filter_height are equal to 2. +// As per the layer config set by av1_intra_mode_cnn_partition_cnn_config, +// the filter_width and filter_height are equal to 2 for layer >= 1. So +// convolution happens at 2x2 for layer >= 1. 
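+// For reference, each output sample computed here is, in scalar form,
+//   out[i][u][v] = bias[i] +
+//       sum over k, di < 2, dj < 2 of in[k][2u+di][2v+dj] * w[k][i][di][dj]
+// where k runs over in_channels and the four taps for a given (k, i) pair
+// are read cstep apart from layer_config->weights, starting at offset
+// k * out_channels + i (see prepare_weights_for_2x2_convolve above).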
+void cnn_convolve_no_maxpool_padding_valid_2x2_avx2( + const float **input, int in_width, int in_height, int in_stride, + const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride, + int start_idx, const int cstep, const int channel_step) { + assert(layer_config->filter_width == 2 && layer_config->filter_height == 2); + assert(layer_config->skip_width == 2 && layer_config->skip_height == 2); + + if (in_width == 16 && in_height == 16) { + // This case of in_width and in_height equal to 16 corresponds to layer 1. + // The output size of this layer is 8x8. + cnn_convolve_no_maxpool_padding_valid_layer1_avx2( + input, in_stride, layer_config, output, out_stride, start_idx, cstep, + channel_step); + } else if (in_width == 8 && in_height == 8) { + // This case of in_width and in_height equal to 8 corresponds to layer 2. + // The output size of this layer is 4x4. + cnn_convolve_no_maxpool_padding_valid_layer2_avx2( + input, in_stride, layer_config, output, out_stride, start_idx, cstep, + channel_step); + } else { + // For layer equal to 3 and 4, the input is of size 4x4 and 2x2 + // respectively. Implementing SIMD for these cases might not be optimal, + // which is why we call C path for layer >= 3. + av1_cnn_convolve_no_maxpool_padding_valid_c( + input, in_width, in_height, in_stride, layer_config, output, out_stride, + start_idx, cstep, channel_step); + } +} + +// AVX2 variant of av1_cnn_convolve_no_maxpool_padding_valid_c(). +// As per the current encoder, av1_cnn_convolve function gets called for +// block size equal to 64x64. av1_cnn_convolve() uses layer config values +// set by av1_intra_mode_cnn_partition_cnn_config. The following are a few +// details related to each layer's config parameters. +// Layer_Number in_size out_size filter_wd filter_ht skip_wd skip_ht +// 0 64x64 16x16 5 5 4 4 +// 1 16x16 8x8 2 2 2 2 +// 2 8x8 4x4 2 2 2 2 +// 3 4x4 2x2 2 2 2 2 +// 4 2x2 1x1 2 2 2 2 +// Here, +// filter_wd = filter_width and filter_ht = filter_height, +// skip_wd = skip_width and skip_ht = skip_height. +void av1_cnn_convolve_no_maxpool_padding_valid_avx2( + const float **input, int in_width, int in_height, int in_stride, + const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride, + int start_idx, int cstep, int channel_step) { + if (layer_config->filter_width == 5 && layer_config->filter_height == 5 && + layer_config->skip_width == 4 && layer_config->skip_height == 4) { + cnn_convolve_no_maxpool_padding_valid_5x5_avx2( + input, in_width, in_height, in_stride, layer_config, output, out_stride, + start_idx, cstep, channel_step); + } else if (layer_config->filter_width == 2 && + layer_config->filter_height == 2 && + layer_config->skip_width == 2 && layer_config->skip_height == 2) { + cnn_convolve_no_maxpool_padding_valid_2x2_avx2( + input, in_width, in_height, in_stride, layer_config, output, out_stride, + start_idx, cstep, channel_step); + } else { + av1_cnn_convolve_no_maxpool_padding_valid_c( + input, in_width, in_height, in_stride, layer_config, output, out_stride, + start_idx, cstep, channel_step); + } +} diff --git a/third_party/aom/av1/encoder/x86/dct_sse2.asm b/third_party/aom/av1/encoder/x86/dct_sse2.asm new file mode 100644 index 0000000000..b185548184 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/dct_sse2.asm @@ -0,0 +1,82 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +%define private_prefix av1 + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro TRANSFORM_COLS 0 + paddw m0, m1 + movq m4, m0 + psubw m3, m2 + psubw m4, m3 + psraw m4, 1 + movq m5, m4 + psubw m5, m1 ;b1 + psubw m4, m2 ;c1 + psubw m0, m4 + paddw m3, m5 + ; m0 a0 + SWAP 1, 4 ; m1 c1 + SWAP 2, 3 ; m2 d1 + SWAP 3, 5 ; m3 b1 +%endmacro + +%macro TRANSPOSE_4X4 0 + ; 00 01 02 03 + ; 10 11 12 13 + ; 20 21 22 23 + ; 30 31 32 33 + punpcklwd m0, m1 ; 00 10 01 11 02 12 03 13 + punpcklwd m2, m3 ; 20 30 21 31 22 32 23 33 + mova m1, m0 + punpckldq m0, m2 ; 00 10 20 30 01 11 21 31 + punpckhdq m1, m2 ; 02 12 22 32 03 13 23 33 +%endmacro + +INIT_XMM sse2 +cglobal fwht4x4, 3, 4, 8, input, output, stride + lea r3q, [inputq + strideq*4] + movq m0, [inputq] ;a1 + movq m1, [inputq + strideq*2] ;b1 + movq m2, [r3q] ;c1 + movq m3, [r3q + strideq*2] ;d1 + + TRANSFORM_COLS + TRANSPOSE_4X4 + SWAP 1, 2 + psrldq m1, m0, 8 + psrldq m3, m2, 8 + TRANSFORM_COLS + TRANSPOSE_4X4 + + psllw m0, 2 + psllw m1, 2 + + ; sign extension + mova m2, m0 + mova m3, m1 + punpcklwd m0, m0 + punpcklwd m1, m1 + punpckhwd m2, m2 + punpckhwd m3, m3 + psrad m0, 16 + psrad m1, 16 + psrad m2, 16 + psrad m3, 16 + mova [outputq], m0 + mova [outputq + 16], m2 + mova [outputq + 32], m1 + mova [outputq + 48], m3 + + RET diff --git a/third_party/aom/av1/encoder/x86/encodetxb_avx2.c b/third_party/aom/av1/encoder/x86/encodetxb_avx2.c new file mode 100644 index 0000000000..9627f75930 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/encodetxb_avx2.c @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+#include <smmintrin.h>  /* SSE4.1 */
+#include <immintrin.h>  /* AVX2 */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/txb_common.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+void av1_txb_init_levels_avx2(const tran_low_t *const coeff, const int width,
+                              const int height, uint8_t *const levels) {
+  const int stride = height + TX_PAD_HOR;
+  const __m256i y_zeros = _mm256_setzero_si256();
+
+  const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride);
+  uint8_t *bottom_buf_end = levels + (width + TX_PAD_BOTTOM) * stride;
+  uint8_t *bottom_buf = bottom_buf_end - ((bottom_len + 31) & (~31));
+
+  do {
+    yy_storeu_256(bottom_buf, y_zeros);
+    bottom_buf += 32;
+  } while (bottom_buf < bottom_buf_end);
+
+  int i = 0;
+  uint8_t *ls = levels;
+  const tran_low_t *cf = coeff;
+  if (height == 4) {
+    do {
+      const __m256i c0 = yy_loadu_256(cf);
+      const __m256i c1 = yy_loadu_256(cf + 8);
+      const __m256i abs01 = _mm256_abs_epi16(_mm256_packs_epi32(c0, c1));
+      const __m256i abs01_8 = _mm256_packs_epi16(abs01, y_zeros);
+      const __m256i res_ = _mm256_shuffle_epi32(abs01_8, 0xd8);
+      const __m256i res = _mm256_permute4x64_epi64(res_, 0xd8);
+      yy_storeu_256(ls, res);
+      ls += 32;
+      cf += 16;
+      i += 4;
+    } while (i < width);
+  } else if (height == 8) {
+    do {
+      const __m256i coeffA = yy_loadu_256(cf);
+      const __m256i coeffB = yy_loadu_256(cf + 8);
+      const __m256i coeffC = yy_loadu_256(cf + 16);
+      const __m256i coeffD = yy_loadu_256(cf + 24);
+      const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
+      const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
+      const __m256i absAB = _mm256_abs_epi16(coeffAB);
+      const __m256i absCD = _mm256_abs_epi16(coeffCD);
+      const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
+      const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
+      const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
+      const __m128i res0 = _mm256_castsi256_si128(res);
+      const __m128i res1 = _mm256_extracti128_si256(res, 1);
+      xx_storel_64(ls, res0);
+      *(int32_t *)(ls + height) = 0;
+      xx_storel_64(ls + stride, _mm_srli_si128(res0, 8));
+      *(int32_t *)(ls + height + stride) = 0;
+      xx_storel_64(ls + stride * 2, res1);
+      *(int32_t *)(ls + height + stride * 2) = 0;
+      xx_storel_64(ls + stride * 3, _mm_srli_si128(res1, 8));
+      *(int32_t *)(ls + height + stride * 3) = 0;
+      cf += 32;
+      ls += stride << 2;
+      i += 4;
+    } while (i < width);
+  } else if (height == 16) {
+    do {
+      const __m256i coeffA = yy_loadu_256(cf);
+      const __m256i coeffB = yy_loadu_256(cf + 8);
+      const __m256i coeffC = yy_loadu_256(cf + 16);
+      const __m256i coeffD = yy_loadu_256(cf + 24);
+      const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
+      const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
+      const __m256i absAB = _mm256_abs_epi16(coeffAB);
+      const __m256i absCD = _mm256_abs_epi16(coeffCD);
+      const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
+      const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
+      const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
+      xx_storeu_128(ls, _mm256_castsi256_si128(res));
+      xx_storeu_128(ls + stride, _mm256_extracti128_si256(res, 1));
+      cf += 32;
+      *(int32_t *)(ls + height) = 0;
+      *(int32_t *)(ls + stride + height) = 0;
+      ls += stride << 1;
+      i += 2;
+    } while (i < width);
+  } else {
+    do {
+      const __m256i coeffA = yy_loadu_256(cf);
+      const __m256i coeffB = yy_loadu_256(cf + 8);
+      const __m256i coeffC = yy_loadu_256(cf + 16);
+      const __m256i coeffD = yy_loadu_256(cf + 24);
+      const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
+      const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
+      const __m256i absAB = _mm256_abs_epi16(coeffAB);
+      const __m256i absCD = _mm256_abs_epi16(coeffCD);
+      const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
+      const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
+      const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
+      yy_storeu_256(ls, res);
+      cf += 32;
+      *(int32_t *)(ls + height) = 0;
+      ls += stride;
+      i += 1;
+    } while (i < width);
+  }
+}
diff --git a/third_party/aom/av1/encoder/x86/encodetxb_sse2.c b/third_party/aom/av1/encoder/x86/encodetxb_sse2.c
new file mode 100644
index 0000000000..d23a688747
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/encodetxb_sse2.c
@@ -0,0 +1,505 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/txb_common.h"
+
+static INLINE void load_levels_4x4x5_sse2(const uint8_t *const src,
+                                          const int stride,
+                                          const ptrdiff_t *const offsets,
+                                          __m128i *const level) {
+  level[0] = load_8bit_4x4_to_1_reg_sse2(src + 1, stride);
+  level[1] = load_8bit_4x4_to_1_reg_sse2(src + stride, stride);
+  level[2] = load_8bit_4x4_to_1_reg_sse2(src + offsets[0], stride);
+  level[3] = load_8bit_4x4_to_1_reg_sse2(src + offsets[1], stride);
+  level[4] = load_8bit_4x4_to_1_reg_sse2(src + offsets[2], stride);
+}
+
+static INLINE void load_levels_8x2x5_sse2(const uint8_t *const src,
+                                          const int stride,
+                                          const ptrdiff_t *const offsets,
+                                          __m128i *const level) {
+  level[0] = load_8bit_8x2_to_1_reg_sse2(src + 1, stride);
+  level[1] = load_8bit_8x2_to_1_reg_sse2(src + stride, stride);
+  level[2] = load_8bit_8x2_to_1_reg_sse2(src + offsets[0], stride);
+  level[3] = load_8bit_8x2_to_1_reg_sse2(src + offsets[1], stride);
+  level[4] = load_8bit_8x2_to_1_reg_sse2(src + offsets[2], stride);
+}
+
+static INLINE void load_levels_16x1x5_sse2(const uint8_t *const src,
+                                           const int stride,
+                                           const ptrdiff_t *const offsets,
+                                           __m128i *const level) {
+  level[0] = _mm_loadu_si128((__m128i *)(src + 1));
+  level[1] = _mm_loadu_si128((__m128i *)(src + stride));
+  level[2] = _mm_loadu_si128((__m128i *)(src + offsets[0]));
+  level[3] = _mm_loadu_si128((__m128i *)(src + offsets[1]));
+  level[4] = _mm_loadu_si128((__m128i *)(src + offsets[2]));
+}
+
+static INLINE __m128i get_coeff_contexts_kernel_sse2(__m128i *const level) {
+  const __m128i const_3 = _mm_set1_epi8(3);
+  const __m128i const_4 = _mm_set1_epi8(4);
+  __m128i count;
+
+  count = _mm_min_epu8(level[0], const_3);
+  level[1] = _mm_min_epu8(level[1], const_3);
+  level[2] = _mm_min_epu8(level[2], const_3);
+  level[3] = _mm_min_epu8(level[3], const_3);
+  level[4] = _mm_min_epu8(level[4], const_3);
+  count = _mm_add_epi8(count, level[1]);
+  count = _mm_add_epi8(count, level[2]);
+  count = _mm_add_epi8(count, level[3]);
+  count = _mm_add_epi8(count, level[4]);
+  count = _mm_avg_epu8(count,
_mm_setzero_si128()); + count = _mm_min_epu8(count, const_4); + return count; +} + +static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels, + const int width, + const ptrdiff_t *const offsets, + int8_t *const coeff_contexts) { + const int stride = 4 + TX_PAD_HOR; + const __m128i pos_to_offset_large = _mm_set1_epi8(21); + __m128i pos_to_offset = + (width == 4) + ? _mm_setr_epi8(0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21) + : _mm_setr_epi8(0, 16, 16, 16, 16, 16, 16, 16, 6, 6, 21, 21, 6, 21, + 21, 21); + __m128i count; + __m128i level[5]; + int8_t *cc = coeff_contexts; + int col = width; + + assert(!(width % 4)); + + do { + load_levels_4x4x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)cc, count); + pos_to_offset = pos_to_offset_large; + levels += 4 * stride; + cc += 16; + col -= 4; + } while (col); + + coeff_contexts[0] = 0; +} + +static INLINE void get_4_nz_map_contexts_ver(const uint8_t *levels, + const int width, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = 4 + TX_PAD_HOR; + const __m128i pos_to_offset = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10); + __m128i count; + __m128i level[5]; + int col = width; + + assert(!(width % 4)); + + do { + load_levels_4x4x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)coeff_contexts, count); + levels += 4 * stride; + coeff_contexts += 16; + col -= 4; + } while (col); +} + +static INLINE void get_4_nz_map_contexts_hor(const uint8_t *levels, + const int width, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = 4 + TX_PAD_HOR; + const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10); + __m128i pos_to_offset = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10); + __m128i count; + __m128i level[5]; + int col = width; + + assert(!(width % 4)); + + do { + load_levels_4x4x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)coeff_contexts, count); + pos_to_offset = pos_to_offset_large; + levels += 4 * stride; + coeff_contexts += 16; + col -= 4; + } while (col); +} + +static INLINE void get_8_coeff_contexts_2d(const uint8_t *levels, + const int width, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = 8 + TX_PAD_HOR; + int8_t *cc = coeff_contexts; + int col = width; + __m128i count; + __m128i level[5]; + __m128i pos_to_offset[3]; + + assert(!(width % 2)); + + if (width == 8) { + 
pos_to_offset[0] = + _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21); + pos_to_offset[1] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, + 21, 21, 21, 21, 21); + } else if (width < 8) { + pos_to_offset[0] = _mm_setr_epi8(0, 11, 6, 6, 21, 21, 21, 21, 11, 11, 6, 21, + 21, 21, 21, 21); + pos_to_offset[1] = _mm_setr_epi8(11, 11, 21, 21, 21, 21, 21, 21, 11, 11, 21, + 21, 21, 21, 21, 21); + } else { + pos_to_offset[0] = _mm_setr_epi8(0, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16); + pos_to_offset[1] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, + 21, 21, 21, 21, 21); + } + pos_to_offset[2] = _mm_set1_epi8(21); + + do { + load_levels_8x2x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset[0]); + _mm_store_si128((__m128i *)cc, count); + pos_to_offset[0] = pos_to_offset[1]; + pos_to_offset[1] = pos_to_offset[2]; + levels += 2 * stride; + cc += 16; + col -= 2; + } while (col); + + coeff_contexts[0] = 0; +} + +static INLINE void get_8_coeff_contexts_ver(const uint8_t *levels, + const int width, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = 8 + TX_PAD_HOR; + const __m128i pos_to_offset = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10); + int col = width; + __m128i count; + __m128i level[5]; + + assert(!(width % 2)); + + do { + load_levels_8x2x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)coeff_contexts, count); + levels += 2 * stride; + coeff_contexts += 16; + col -= 2; + } while (col); +} + +static INLINE void get_8_coeff_contexts_hor(const uint8_t *levels, + const int width, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = 8 + TX_PAD_HOR; + const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10); + __m128i pos_to_offset = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5); + int col = width; + __m128i count; + __m128i level[5]; + + assert(!(width % 2)); + + do { + load_levels_8x2x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)coeff_contexts, count); + pos_to_offset = pos_to_offset_large; + levels += 2 * stride; + coeff_contexts += 16; + col -= 2; + } while (col); +} + +static INLINE void get_16n_coeff_contexts_2d(const uint8_t *levels, + const int real_width, + const int real_height, + const int width, const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = height + TX_PAD_HOR; + int8_t *cc = 
coeff_contexts; + int col = width; + __m128i pos_to_offset[5]; + __m128i pos_to_offset_large[3]; + __m128i count; + __m128i level[5]; + + assert(!(height % 16)); + + pos_to_offset_large[2] = _mm_set1_epi8(21); + if (real_width == real_height) { + pos_to_offset[0] = _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21); + pos_to_offset[1] = _mm_setr_epi8(1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[2] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[3] = _mm_setr_epi8(6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[4] = pos_to_offset_large[0] = pos_to_offset_large[1] = + pos_to_offset_large[2]; + } else if (real_width < real_height) { + pos_to_offset[0] = _mm_setr_epi8(0, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[1] = _mm_setr_epi8(11, 11, 6, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[2] = pos_to_offset[3] = pos_to_offset[4] = _mm_setr_epi8( + 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21); + pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2]; + } else { // real_width > real_height + pos_to_offset[0] = pos_to_offset[1] = _mm_setr_epi8( + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16); + pos_to_offset[2] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[3] = _mm_setr_epi8(6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[4] = pos_to_offset_large[2]; + pos_to_offset_large[0] = pos_to_offset_large[1] = _mm_set1_epi8(16); + } + + do { + int h = height; + + do { + load_levels_16x1x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset[0]); + _mm_store_si128((__m128i *)cc, count); + levels += 16; + cc += 16; + h -= 16; + pos_to_offset[0] = pos_to_offset_large[0]; + } while (h); + + pos_to_offset[0] = pos_to_offset[1]; + pos_to_offset[1] = pos_to_offset[2]; + pos_to_offset[2] = pos_to_offset[3]; + pos_to_offset[3] = pos_to_offset[4]; + pos_to_offset_large[0] = pos_to_offset_large[1]; + pos_to_offset_large[1] = pos_to_offset_large[2]; + levels += TX_PAD_HOR; + } while (--col); + + coeff_contexts[0] = 0; +} + +static INLINE void get_16n_coeff_contexts_ver(const uint8_t *levels, + const int width, const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = height + TX_PAD_HOR; + const __m128i pos_to_offset_large = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10); + __m128i count; + __m128i level[5]; + int col = width; + + assert(!(height % 16)); + + do { + __m128i pos_to_offset = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 
10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10); + int h = height; + + do { + load_levels_16x1x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)coeff_contexts, count); + pos_to_offset = pos_to_offset_large; + levels += 16; + coeff_contexts += 16; + h -= 16; + } while (h); + + levels += TX_PAD_HOR; + } while (--col); +} + +static INLINE void get_16n_coeff_contexts_hor(const uint8_t *levels, + const int width, const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = height + TX_PAD_HOR; + __m128i pos_to_offset[3]; + __m128i count; + __m128i level[5]; + int col = width; + + assert(!(height % 16)); + + pos_to_offset[0] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 0); + pos_to_offset[1] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 5); + pos_to_offset[2] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10); + + do { + int h = height; + + do { + load_levels_16x1x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset[0]); + _mm_store_si128((__m128i *)coeff_contexts, count); + levels += 16; + coeff_contexts += 16; + h -= 16; + } while (h); + + pos_to_offset[0] = pos_to_offset[1]; + pos_to_offset[1] = pos_to_offset[2]; + levels += TX_PAD_HOR; + } while (--col); +} + +// Note: levels[] must be in the range [0, 127], inclusive. +void av1_get_nz_map_contexts_sse2(const uint8_t *const levels, + const int16_t *const scan, const uint16_t eob, + const TX_SIZE tx_size, + const TX_CLASS tx_class, + int8_t *const coeff_contexts) { + const int last_idx = eob - 1; + if (!last_idx) { + coeff_contexts[0] = 0; + return; + } + + const int real_width = tx_size_wide[tx_size]; + const int real_height = tx_size_high[tx_size]; + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); + const int stride = height + TX_PAD_HOR; + ptrdiff_t offsets[3]; + + /* coeff_contexts must be 16 byte aligned. 
*/ + assert(!((intptr_t)coeff_contexts & 0xf)); + + if (tx_class == TX_CLASS_2D) { + offsets[0] = 0 * stride + 2; + offsets[1] = 1 * stride + 1; + offsets[2] = 2 * stride + 0; + + if (height == 4) { + get_4_nz_map_contexts_2d(levels, width, offsets, coeff_contexts); + } else if (height == 8) { + get_8_coeff_contexts_2d(levels, width, offsets, coeff_contexts); + } else if (height == 16) { + get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height, + offsets, coeff_contexts); + } else { + get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height, + offsets, coeff_contexts); + } + } else if (tx_class == TX_CLASS_HORIZ) { + offsets[0] = 2 * stride; + offsets[1] = 3 * stride; + offsets[2] = 4 * stride; + if (height == 4) { + get_4_nz_map_contexts_hor(levels, width, offsets, coeff_contexts); + } else if (height == 8) { + get_8_coeff_contexts_hor(levels, width, offsets, coeff_contexts); + } else { + get_16n_coeff_contexts_hor(levels, width, height, offsets, + coeff_contexts); + } + } else { // TX_CLASS_VERT + offsets[0] = 2; + offsets[1] = 3; + offsets[2] = 4; + if (height == 4) { + get_4_nz_map_contexts_ver(levels, width, offsets, coeff_contexts); + } else if (height == 8) { + get_8_coeff_contexts_ver(levels, width, offsets, coeff_contexts); + } else { + get_16n_coeff_contexts_ver(levels, width, height, offsets, + coeff_contexts); + } + } + + const int bhl = get_txb_bhl(tx_size); + const int pos = scan[last_idx]; + if (last_idx <= (width << bhl) / 8) + coeff_contexts[pos] = 1; + else if (last_idx <= (width << bhl) / 4) + coeff_contexts[pos] = 2; + else + coeff_contexts[pos] = 3; +} diff --git a/third_party/aom/av1/encoder/x86/encodetxb_sse4.c b/third_party/aom/av1/encoder/x86/encodetxb_sse4.c new file mode 100644 index 0000000000..72bd8e3411 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/encodetxb_sse4.c @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+#include <smmintrin.h>  /* SSE4.1 */
+
+#include "aom/aom_integer.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/txb_common.h"
+#include "aom_dsp/x86/synonyms.h"
+
+void av1_txb_init_levels_sse4_1(const tran_low_t *const coeff, const int width,
+                                const int height, uint8_t *const levels) {
+  const int stride = height + TX_PAD_HOR;
+  const __m128i zeros = _mm_setzero_si128();
+
+  const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride);
+  uint8_t *bottom_buf = levels + stride * width;
+  uint8_t *bottom_buf_end = bottom_buf + bottom_len;
+  do {
+    _mm_storeu_si128((__m128i *)(bottom_buf), zeros);
+    bottom_buf += 16;
+  } while (bottom_buf < bottom_buf_end);
+
+  int i = 0;
+  uint8_t *ls = levels;
+  const tran_low_t *cf = coeff;
+  if (height == 4) {
+    do {
+      const __m128i coeffA = xx_loadu_128(cf);
+      const __m128i coeffB = xx_loadu_128(cf + 4);
+      const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
+      const __m128i absAB = _mm_abs_epi16(coeffAB);
+      const __m128i absAB8 = _mm_packs_epi16(absAB, zeros);
+      const __m128i lsAB = _mm_unpacklo_epi32(absAB8, zeros);
+      xx_storeu_128(ls, lsAB);
+      ls += (stride << 1);
+      cf += (height << 1);
+      i += 2;
+    } while (i < width);
+  } else if (height == 8) {
+    do {
+      const __m128i coeffA = xx_loadu_128(cf);
+      const __m128i coeffB = xx_loadu_128(cf + 4);
+      const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
+      const __m128i absAB = _mm_abs_epi16(coeffAB);
+      const __m128i absAB8 = _mm_packs_epi16(absAB, zeros);
+      xx_storeu_128(ls, absAB8);
+      ls += stride;
+      cf += height;
+      i += 1;
+    } while (i < width);
+  } else {
+    do {
+      int j = 0;
+      do {
+        const __m128i coeffA = xx_loadu_128(cf);
+        const __m128i coeffB = xx_loadu_128(cf + 4);
+        const __m128i coeffC = xx_loadu_128(cf + 8);
+        const __m128i coeffD = xx_loadu_128(cf + 12);
+        const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
+        const __m128i coeffCD = _mm_packs_epi32(coeffC, coeffD);
+        const __m128i absAB = _mm_abs_epi16(coeffAB);
+        const __m128i absCD = _mm_abs_epi16(coeffCD);
+        const __m128i absABCD = _mm_packs_epi16(absAB, absCD);
+        xx_storeu_128(ls + j, absABCD);
+        j += 16;
+        cf += 16;
+      } while (j < height);
+      *(int32_t *)(ls + height) = 0;
+      ls += stride;
+      i += 1;
+    } while (i < width);
+  }
+}
diff --git a/third_party/aom/av1/encoder/x86/error_intrin_avx2.c b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c
new file mode 100644
index 0000000000..57725d1795
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>  // AVX2
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+
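+// Loads 16 coefficients into 16-bit lanes. When tran_low_t is 32 bits (as in
+// libaom), two 256-bit loads are saturating-packed; _mm256_packs_epi32
+// interleaves the 128-bit halves, so the 0xD8 permute restores coefficient
+// order.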
+static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset,
+                              __m256i *c) {
+  const tran_low_t *addr = coeff + offset;
+
+  if (sizeof(tran_low_t) == 4) {
+    const __m256i x0 = _mm256_loadu_si256((const __m256i *)addr);
+    const __m256i x1 = _mm256_loadu_si256((const __m256i *)addr + 1);
+    const __m256i y = _mm256_packs_epi32(x0, x1);
+    *c = _mm256_permute4x64_epi64(y, 0xD8);
+  } else {
+    *c = _mm256_loadu_si256((const __m256i *)addr);
+  }
+}
+
+static INLINE void av1_block_error_num_coeff16_avx2(const int16_t *coeff,
+                                                    const int16_t *dqcoeff,
+                                                    __m256i *sse_256) {
+  const __m256i _coeff = _mm256_loadu_si256((const __m256i *)coeff);
+  const __m256i _dqcoeff = _mm256_loadu_si256((const __m256i *)dqcoeff);
+  // d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 d10 d11 d12 d13 d14 d15
+  const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff);
+  // r0 r1 r2 r3 r4 r5 r6 r7
+  const __m256i error = _mm256_madd_epi16(diff, diff);
+  // r0+r1 r2+r3 | r0+r1 r2+r3 | r4+r5 r6+r7 | r4+r5 r6+r7
+  const __m256i error_hi = _mm256_hadd_epi32(error, error);
+  // r0+r1 | r2+r3 | r4+r5 | r6+r7
+  *sse_256 = _mm256_unpacklo_epi32(error_hi, _mm256_setzero_si256());
+}
+
+static INLINE void av1_block_error_num_coeff32_avx2(const int16_t *coeff,
+                                                    const int16_t *dqcoeff,
+                                                    __m256i *sse_256) {
+  const __m256i zero = _mm256_setzero_si256();
+  const __m256i _coeff_0 = _mm256_loadu_si256((const __m256i *)coeff);
+  const __m256i _dqcoeff_0 = _mm256_loadu_si256((const __m256i *)dqcoeff);
+  const __m256i _coeff_1 = _mm256_loadu_si256((const __m256i *)(coeff + 16));
+  const __m256i _dqcoeff_1 =
+      _mm256_loadu_si256((const __m256i *)(dqcoeff + 16));
+
+  // d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 d10 d11 d12 d13 d14 d15
+  const __m256i diff_0 = _mm256_sub_epi16(_dqcoeff_0, _coeff_0);
+  const __m256i diff_1 = _mm256_sub_epi16(_dqcoeff_1, _coeff_1);
+
+  // r0 r1 r2 r3 r4 r5 r6 r7
+  const __m256i error_0 = _mm256_madd_epi16(diff_0, diff_0);
+  const __m256i error_1 = _mm256_madd_epi16(diff_1, diff_1);
+  const __m256i err_final_0 = _mm256_add_epi32(error_0, error_1);
+
+  // For extreme input values, the accumulation needs to happen in 64 bit
+  // precision to avoid any overflow.
+  const __m256i exp0_error_lo = _mm256_unpacklo_epi32(err_final_0, zero);
+  const __m256i exp0_error_hi = _mm256_unpackhi_epi32(err_final_0, zero);
+  const __m256i sum_temp_0 = _mm256_add_epi64(exp0_error_hi, exp0_error_lo);
+  *sse_256 = _mm256_add_epi64(*sse_256, sum_temp_0);
+}
+
+static INLINE void av1_block_error_num_coeff64_avx2(const int16_t *coeff,
+                                                    const int16_t *dqcoeff,
+                                                    __m256i *sse_256,
+                                                    intptr_t num_coeff) {
+  const __m256i zero = _mm256_setzero_si256();
+  for (int i = 0; i < num_coeff; i += 64) {
+    // Load 64 elements for coeff and dqcoeff.
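+    // Four unaligned 256-bit loads per array cover the 64 int16 values.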
+ const __m256i _coeff_0 = _mm256_loadu_si256((const __m256i *)coeff); + const __m256i _dqcoeff_0 = _mm256_loadu_si256((const __m256i *)dqcoeff); + const __m256i _coeff_1 = _mm256_loadu_si256((const __m256i *)(coeff + 16)); + const __m256i _dqcoeff_1 = + _mm256_loadu_si256((const __m256i *)(dqcoeff + 16)); + const __m256i _coeff_2 = _mm256_loadu_si256((const __m256i *)(coeff + 32)); + const __m256i _dqcoeff_2 = + _mm256_loadu_si256((const __m256i *)(dqcoeff + 32)); + const __m256i _coeff_3 = _mm256_loadu_si256((const __m256i *)(coeff + 48)); + const __m256i _dqcoeff_3 = + _mm256_loadu_si256((const __m256i *)(dqcoeff + 48)); + + // d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 d10 d11 d12 d13 d14 d15 + const __m256i diff_0 = _mm256_sub_epi16(_dqcoeff_0, _coeff_0); + const __m256i diff_1 = _mm256_sub_epi16(_dqcoeff_1, _coeff_1); + const __m256i diff_2 = _mm256_sub_epi16(_dqcoeff_2, _coeff_2); + const __m256i diff_3 = _mm256_sub_epi16(_dqcoeff_3, _coeff_3); + + // r0 r1 r2 r3 r4 r5 r6 r7 + const __m256i error_0 = _mm256_madd_epi16(diff_0, diff_0); + const __m256i error_1 = _mm256_madd_epi16(diff_1, diff_1); + const __m256i error_2 = _mm256_madd_epi16(diff_2, diff_2); + const __m256i error_3 = _mm256_madd_epi16(diff_3, diff_3); + // r00 r01 r02 r03 r04 r05 r06 r07 + const __m256i err_final_0 = _mm256_add_epi32(error_0, error_1); + // r10 r11 r12 r13 r14 r15 r16 r17 + const __m256i err_final_1 = _mm256_add_epi32(error_2, error_3); + + // For extreme input values, the accumulation needs to happen in 64 bit + // precision to avoid any overflow. r00 r01 r04 r05 + const __m256i exp0_error_lo = _mm256_unpacklo_epi32(err_final_0, zero); + // r02 r03 r06 r07 + const __m256i exp0_error_hi = _mm256_unpackhi_epi32(err_final_0, zero); + // r10 r11 r14 r15 + const __m256i exp1_error_lo = _mm256_unpacklo_epi32(err_final_1, zero); + // r12 r13 r16 r17 + const __m256i exp1_error_hi = _mm256_unpackhi_epi32(err_final_1, zero); + + const __m256i sum_temp_0 = _mm256_add_epi64(exp0_error_hi, exp0_error_lo); + const __m256i sum_temp_1 = _mm256_add_epi64(exp1_error_hi, exp1_error_lo); + const __m256i sse_256_temp = _mm256_add_epi64(sum_temp_1, sum_temp_0); + *sse_256 = _mm256_add_epi64(*sse_256, sse_256_temp); + coeff += 64; + dqcoeff += 64; + } +} + +int64_t av1_block_error_lp_avx2(const int16_t *coeff, const int16_t *dqcoeff, + intptr_t num_coeff) { + assert(num_coeff % 16 == 0); + __m256i sse_256 = _mm256_setzero_si256(); + int64_t sse; + + if (num_coeff == 16) + av1_block_error_num_coeff16_avx2(coeff, dqcoeff, &sse_256); + else if (num_coeff == 32) + av1_block_error_num_coeff32_avx2(coeff, dqcoeff, &sse_256); + else + av1_block_error_num_coeff64_avx2(coeff, dqcoeff, &sse_256, num_coeff); + + // Save the higher 64 bit of each 128 bit lane. + const __m256i sse_hi = _mm256_srli_si256(sse_256, 8); + // Add the higher 64 bit to the low 64 bit. + sse_256 = _mm256_add_epi64(sse_256, sse_hi); + // Accumulate the sse_256 register to get final sse + const __m128i sse_128 = _mm_add_epi64(_mm256_castsi256_si128(sse_256), + _mm256_extractf128_si256(sse_256, 1)); + + // Store the results. 
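+  // Only the low 64 bits of sse_128 hold the final sum.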
+  _mm_storel_epi64((__m128i *)&sse, sse_128);
+  return sse;
+}
+
+int64_t av1_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+                             intptr_t block_size, int64_t *ssz) {
+  __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg;
+  __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi;
+  __m256i sse_reg_64hi, ssz_reg_64hi;
+  __m128i sse_reg128, ssz_reg128;
+  int64_t sse;
+  int i;
+  const __m256i zero_reg = _mm256_setzero_si256();
+
+  // init sse and ssz registers to zero
+  sse_reg = _mm256_setzero_si256();
+  ssz_reg = _mm256_setzero_si256();
+
+  for (i = 0; i < block_size; i += 16) {
+    // load 32 bytes from coeff and dqcoeff
+    read_coeff(coeff, i, &coeff_reg);
+    read_coeff(dqcoeff, i, &dqcoeff_reg);
+    // dqcoeff - coeff
+    dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg);
+    // madd (dqcoeff - coeff)
+    dqcoeff_reg = _mm256_madd_epi16(dqcoeff_reg, dqcoeff_reg);
+    // madd coeff
+    coeff_reg = _mm256_madd_epi16(coeff_reg, coeff_reg);
+    // expand each double word of madd (dqcoeff - coeff) to quad word
+    exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_reg, zero_reg);
+    exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_reg, zero_reg);
+    // expand each double word of madd (coeff) to quad word
+    exp_coeff_lo = _mm256_unpacklo_epi32(coeff_reg, zero_reg);
+    exp_coeff_hi = _mm256_unpackhi_epi32(coeff_reg, zero_reg);
+    // add each quad word of madd (dqcoeff - coeff) and madd (coeff)
+    sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_lo);
+    ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_lo);
+    sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_hi);
+    ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_hi);
+  }
+  // save the higher 64 bit of each 128 bit lane
+  sse_reg_64hi = _mm256_srli_si256(sse_reg, 8);
+  ssz_reg_64hi = _mm256_srli_si256(ssz_reg, 8);
+  // add the higher 64 bit to the low 64 bit
+  sse_reg = _mm256_add_epi64(sse_reg, sse_reg_64hi);
+  ssz_reg = _mm256_add_epi64(ssz_reg, ssz_reg_64hi);
+
+  // add each 64 bit from each of the 128 bit lane of the 256 bit
+  sse_reg128 = _mm_add_epi64(_mm256_castsi256_si128(sse_reg),
+                             _mm256_extractf128_si256(sse_reg, 1));
+
+  ssz_reg128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_reg),
+                             _mm256_extractf128_si256(ssz_reg, 1));
+
+  // store the results
+  _mm_storel_epi64((__m128i *)(&sse), sse_reg128);
+
+  _mm_storel_epi64((__m128i *)(ssz), ssz_reg128);
+  _mm256_zeroupper();
+  return sse;
+}
diff --git a/third_party/aom/av1/encoder/x86/error_intrin_sse2.c b/third_party/aom/av1/encoder/x86/error_intrin_sse2.c
new file mode 100644
index 0000000000..61f65c623f
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/error_intrin_sse2.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>  // SSE2
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+
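+// Adds the upper 64-bit half of `reg` into the lower half; the result is
+// read from the low 64 bits.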
+static AOM_INLINE __m128i reduce_sum_epi64(__m128i reg) {
+  __m128i reg_hi = _mm_srli_si128(reg, 8);
+  reg = _mm_add_epi64(reg, reg_hi);
+
+  return reg;
+}
+
+int64_t av1_block_error_lp_sse2(const int16_t *coeff, const int16_t *dqcoeff,
+                                intptr_t block_size) {
+  assert(block_size % 16 == 0);
+  assert(block_size >= 16);
+
+  const __m128i zero = _mm_setzero_si128();
+  __m128i accum_0 = zero;
+  __m128i accum_1 = zero;
+
+  for (int i = 0; i < block_size; i += 16) {
+    // Load 8 elements for coeff and dqcoeff.
+    const __m128i _coeff_0 = _mm_loadu_si128((const __m128i *)coeff);
+    const __m128i _coeff_1 = _mm_loadu_si128((const __m128i *)(coeff + 8));
+    const __m128i _dqcoeff_0 = _mm_loadu_si128((const __m128i *)dqcoeff);
+    const __m128i _dqcoeff_1 = _mm_loadu_si128((const __m128i *)(dqcoeff + 8));
+    // Compute the diff
+    const __m128i diff_0 = _mm_sub_epi16(_dqcoeff_0, _coeff_0);
+    const __m128i diff_1 = _mm_sub_epi16(_dqcoeff_1, _coeff_1);
+    // Compute the error
+    const __m128i error_0 = _mm_madd_epi16(diff_0, diff_0);
+    const __m128i error_1 = _mm_madd_epi16(diff_1, diff_1);
+
+    const __m128i error_lo_0 = _mm_unpacklo_epi32(error_0, zero);
+    const __m128i error_lo_1 = _mm_unpacklo_epi32(error_1, zero);
+    const __m128i error_hi_0 = _mm_unpackhi_epi32(error_0, zero);
+    const __m128i error_hi_1 = _mm_unpackhi_epi32(error_1, zero);
+
+    // Accumulate
+    accum_0 = _mm_add_epi64(accum_0, error_lo_0);
+    accum_1 = _mm_add_epi64(accum_1, error_lo_1);
+    accum_0 = _mm_add_epi64(accum_0, error_hi_0);
+    accum_1 = _mm_add_epi64(accum_1, error_hi_1);
+
+    // Advance
+    coeff += 16;
+    dqcoeff += 16;
+  }
+
+  __m128i accum = _mm_add_epi64(accum_0, accum_1);
+  // Reduce sum the register
+  accum = reduce_sum_epi64(accum);
+
+  // Store the results.
+#if AOM_ARCH_X86_64
+  return _mm_cvtsi128_si64(accum);
+#else
+  int64_t result;
+  _mm_storel_epi64((__m128i *)&result, accum);
+  return result;
+#endif  // AOM_ARCH_X86_64
+}
diff --git a/third_party/aom/av1/encoder/x86/error_sse2.asm b/third_party/aom/av1/encoder/x86/error_sse2.asm
new file mode 100644
index 0000000000..6407c106ab
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/error_sse2.asm
@@ -0,0 +1,88 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+; Increment %1 by sizeof() tran_low_t * %2.
+%macro INCREMENT_ELEMENTS_TRAN_LOW 2
+  lea %1, [%1 + %2 * 4]
+%endmacro
+
+; Load %2 + %3 into m%1.
+; %3 is the offset in elements, not bytes.
+; If tran_low_t is 16 bits (low bit depth configuration) then load the value
+; directly. If tran_low_t is 32 bits (high bit depth configuration) then pack
+; the values down to 16 bits.
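+; (libaom defines tran_low_t as a 32-bit type, so the packing path below is
+; the one exercised.)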
+%macro LOAD_TRAN_LOW 3 + mova m%1, [%2 + (%3) * 4] + packssdw m%1, [%2 + (%3) * 4 + 16] +%endmacro + +%define private_prefix av1 + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; int64_t av1_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, +; int64_t *ssz) + +INIT_XMM sse2 +cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz + pxor m4, m4 ; sse accumulator + pxor m6, m6 ; ssz accumulator + pxor m5, m5 ; dedicated zero register +.loop: + LOAD_TRAN_LOW 2, uqcq, 0 + LOAD_TRAN_LOW 0, dqcq, 0 + LOAD_TRAN_LOW 3, uqcq, 8 + LOAD_TRAN_LOW 1, dqcq, 8 + INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16 + INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16 + sub sizeq, 16 + psubw m0, m2 + psubw m1, m3 + ; individual errors are max. 15bit+sign, so squares are 30bit, and + ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit) + pmaddwd m0, m0 + pmaddwd m1, m1 + pmaddwd m2, m2 + pmaddwd m3, m3 + ; the sum of 2 31bit integers will fit in a 32bit unsigned integer + paddd m0, m1 + paddd m2, m3 + ; accumulate in 64bit + punpckldq m7, m0, m5 + punpckhdq m0, m5 + paddq m4, m7 + punpckldq m7, m2, m5 + paddq m4, m0 + punpckhdq m2, m5 + paddq m6, m7 + paddq m6, m2 + jg .loop + + ; accumulate horizontally and store in return value + movhlps m5, m4 + movhlps m7, m6 + paddq m4, m5 + paddq m6, m7 +%if AOM_ARCH_X86_64 + movq rax, m4 + movq [sszq], m6 +%else + mov eax, sszm + pshufd m5, m4, 0x1 + movq [eax], m6 + movd eax, m4 + movd edx, m5 +%endif + RET diff --git a/third_party/aom/av1/encoder/x86/hash_sse42.c b/third_party/aom/av1/encoder/x86/hash_sse42.c new file mode 100644 index 0000000000..ebe75310e9 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/hash_sse42.c @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <stdint.h>
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+// Byte-boundary alignment issues
+#define ALIGN_SIZE 8
+#define ALIGN_MASK (ALIGN_SIZE - 1)
+
+#define CALC_CRC(op, crc, type, buf, len) \
+  while ((len) >= sizeof(type)) {         \
+    (crc) = op((crc), *(type *)(buf));    \
+    (len) -= sizeof(type);                \
+    buf += sizeof(type);                  \
+  }
+
+/**
+ * Calculates 32-bit CRC for the input buffer
+ * polynomial is 0x11EDC6F41
+ * @return A 32-bit unsigned integer representing the CRC
+ */
+uint32_t av1_get_crc32c_value_sse4_2(void *crc_calculator, uint8_t *p,
+                                     size_t len) {
+  (void)crc_calculator;
+  const uint8_t *buf = p;
+  uint32_t crc = 0xFFFFFFFF;
+
+  // Align the input to the word boundary
+  for (; (len > 0) && ((intptr_t)buf & ALIGN_MASK); len--, buf++) {
+    crc = _mm_crc32_u8(crc, *buf);
+  }
+
+#ifdef __x86_64__
+  uint64_t crc64 = crc;
+  CALC_CRC(_mm_crc32_u64, crc64, uint64_t, buf, len)
+  crc = (uint32_t)crc64;
+#endif
+  CALC_CRC(_mm_crc32_u32, crc, uint32_t, buf, len)
+  CALC_CRC(_mm_crc32_u16, crc, uint16_t, buf, len)
+  CALC_CRC(_mm_crc32_u8, crc, uint8_t, buf, len)
+  return (crc ^ 0xFFFFFFFF);
+}
diff --git a/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_avx2.c b/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_avx2.c
new file mode 100644
index 0000000000..340307cb3e
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_avx2.c
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+#include "aom/aom_integer.h"
+#include "av1/common/common.h"
+#include "config/av1_rtcd.h"
+
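+// Computes the squared error between `coeff` and `dqcoeff` along with the
+// sum of squared coefficients (returned via `ssz`); both sums are
+// accumulated in 64 bits and scaled back by 2 * (bps - 8) with rounding for
+// high bit depths.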
+int64_t av1_highbd_block_error_avx2(const tran_low_t *coeff,
+                                    const tran_low_t *dqcoeff,
+                                    intptr_t block_size, int64_t *ssz,
+                                    int bps) {
+  int i;
+  int64_t temp1[8];
+  int64_t error = 0, sqcoeff = 0;
+  const int shift = 2 * (bps - 8);
+  const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+
+  for (i = 0; i < block_size; i += 16) {
+    __m256i mm256_coeff = _mm256_loadu_si256((__m256i *)(coeff + i));
+    __m256i mm256_coeff2 = _mm256_loadu_si256((__m256i *)(coeff + i + 8));
+    __m256i mm256_dqcoeff = _mm256_loadu_si256((__m256i *)(dqcoeff + i));
+    __m256i mm256_dqcoeff2 = _mm256_loadu_si256((__m256i *)(dqcoeff + i + 8));
+
+    __m256i diff1 = _mm256_sub_epi32(mm256_coeff, mm256_dqcoeff);
+    __m256i diff2 = _mm256_sub_epi32(mm256_coeff2, mm256_dqcoeff2);
+    __m256i diff1h = _mm256_srli_epi64(diff1, 32);
+    __m256i diff2h = _mm256_srli_epi64(diff2, 32);
+    __m256i res = _mm256_mul_epi32(diff1, diff1);
+    __m256i res1 = _mm256_mul_epi32(diff1h, diff1h);
+    __m256i res2 = _mm256_mul_epi32(diff2, diff2);
+    __m256i res3 = _mm256_mul_epi32(diff2h, diff2h);
+    __m256i res_diff = _mm256_add_epi64(_mm256_add_epi64(res, res1),
+                                        _mm256_add_epi64(res2, res3));
+    __m256i mm256_coeffh = _mm256_srli_epi64(mm256_coeff, 32);
+    __m256i mm256_coeffh2 = _mm256_srli_epi64(mm256_coeff2, 32);
+    res = _mm256_mul_epi32(mm256_coeff, mm256_coeff);
+    res1 = _mm256_mul_epi32(mm256_coeffh, mm256_coeffh);
+    res2 = _mm256_mul_epi32(mm256_coeff2, mm256_coeff2);
+    res3 = _mm256_mul_epi32(mm256_coeffh2, mm256_coeffh2);
+    __m256i res_sqcoeff = _mm256_add_epi64(_mm256_add_epi64(res, res1),
+                                           _mm256_add_epi64(res2, res3));
+    _mm256_storeu_si256((__m256i *)temp1, res_diff);
+    _mm256_storeu_si256((__m256i *)temp1 + 1, res_sqcoeff);
+
+    error += temp1[0] + temp1[1] + temp1[2] + temp1[3];
+    sqcoeff += temp1[4] + temp1[5] + temp1[6] + temp1[7];
+  }
+  assert(error >= 0 && sqcoeff >= 0);
+  error = (error + rounding) >> shift;
+  sqcoeff = (sqcoeff + rounding) >> shift;
+
+  *ssz = sqcoeff;
+  return error;
+}
diff --git a/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c b/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c
new file mode 100644
index 0000000000..b0b2757568
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include "av1/common/common.h"
+#include "config/av1_rtcd.h"
+
+int64_t av1_highbd_block_error_sse2(const tran_low_t *coeff,
+                                    const tran_low_t *dqcoeff,
+                                    intptr_t block_size, int64_t *ssz,
+                                    int bps) {
+  int i, j, test;
+  uint32_t temp[4];
+  __m128i max, min, cmp0, cmp1, cmp2, cmp3;
+  int64_t error = 0, sqcoeff = 0;
+  const int shift = 2 * (bps - 8);
+  const int rounding = shift > 0 ? 
1 << (shift - 1) : 0; + + for (i = 0; i < block_size; i += 8) { + // Load the data into xmm registers + __m128i mm_coeff = _mm_load_si128((__m128i *)(coeff + i)); + __m128i mm_coeff2 = _mm_load_si128((__m128i *)(coeff + i + 4)); + __m128i mm_dqcoeff = _mm_load_si128((__m128i *)(dqcoeff + i)); + __m128i mm_dqcoeff2 = _mm_load_si128((__m128i *)(dqcoeff + i + 4)); + // Check if any values require more than 15 bit + max = _mm_set1_epi32(0x3fff); + min = _mm_set1_epi32((int)0xffffc000); + cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max), + _mm_cmplt_epi32(mm_coeff, min)); + cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max), + _mm_cmplt_epi32(mm_coeff2, min)); + cmp2 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff, max), + _mm_cmplt_epi32(mm_dqcoeff, min)); + cmp3 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max), + _mm_cmplt_epi32(mm_dqcoeff2, min)); + test = _mm_movemask_epi8( + _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3))); + + if (!test) { + __m128i mm_diff, error_sse2, sqcoeff_sse2; + mm_coeff = _mm_packs_epi32(mm_coeff, mm_coeff2); + mm_dqcoeff = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2); + mm_diff = _mm_sub_epi16(mm_coeff, mm_dqcoeff); + error_sse2 = _mm_madd_epi16(mm_diff, mm_diff); + sqcoeff_sse2 = _mm_madd_epi16(mm_coeff, mm_coeff); + _mm_storeu_si128((__m128i *)temp, error_sse2); + error = error + temp[0] + temp[1] + temp[2] + temp[3]; + _mm_storeu_si128((__m128i *)temp, sqcoeff_sse2); + sqcoeff += temp[0] + temp[1] + temp[2] + temp[3]; + } else { + for (j = 0; j < 8; j++) { + const int64_t diff = coeff[i + j] - dqcoeff[i + j]; + error += diff * diff; + sqcoeff += (int64_t)coeff[i + j] * (int64_t)coeff[i + j]; + } + } + } + assert(error >= 0 && sqcoeff >= 0); + error = (error + rounding) >> shift; + sqcoeff = (sqcoeff + rounding) >> shift; + + *ssz = sqcoeff; + return error; +} diff --git a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_avx2.c b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_avx2.c new file mode 100644 index 0000000000..9cdf21fc7c --- /dev/null +++ b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_avx2.c @@ -0,0 +1,3132 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+#include <assert.h>
+#include <immintrin.h> /*AVX2*/
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "aom_dsp/x86/txfm_common_avx2.h"
+
+static INLINE void load_buffer_8x8_avx2(const int16_t *input, __m256i *out,
+                                        int stride, int flipud, int fliplr,
+                                        int shift) {
+  __m128i out1[8];
+  if (!flipud) {
+    out1[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+    out1[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+    out1[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+    out1[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+    out1[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+    out1[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+    out1[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+    out1[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+
+  } else {
+    out1[7] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+    out1[6] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+    out1[5] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+    out1[4] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+    out1[3] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+    out1[2] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+    out1[1] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+    out1[0] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+  }
+  if (!fliplr) {
+    out[0] = _mm256_cvtepi16_epi32(out1[0]);
+    out[1] = _mm256_cvtepi16_epi32(out1[1]);
+    out[2] = _mm256_cvtepi16_epi32(out1[2]);
+    out[3] = _mm256_cvtepi16_epi32(out1[3]);
+    out[4] = _mm256_cvtepi16_epi32(out1[4]);
+    out[5] = _mm256_cvtepi16_epi32(out1[5]);
+    out[6] = _mm256_cvtepi16_epi32(out1[6]);
+    out[7] = _mm256_cvtepi16_epi32(out1[7]);
+
+  } else {
+    out[0] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[0]));
+    out[1] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[1]));
+    out[2] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[2]));
+    out[3] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[3]));
+    out[4] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[4]));
+    out[5] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[5]));
+    out[6] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[6]));
+    out[7] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[7]));
+  }
+  out[0] = _mm256_slli_epi32(out[0], shift);
+  out[1] = _mm256_slli_epi32(out[1], shift);
+  out[2] = _mm256_slli_epi32(out[2], shift);
+  out[3] = _mm256_slli_epi32(out[3], shift);
+  out[4] = _mm256_slli_epi32(out[4], shift);
+  out[5] = _mm256_slli_epi32(out[5], shift);
+  out[6] = _mm256_slli_epi32(out[6], shift);
+  out[7] = _mm256_slli_epi32(out[7], shift);
+}
+static INLINE void col_txfm_8x8_rounding(__m256i *in, int shift) {
+  const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
+
+  in[0] = _mm256_add_epi32(in[0], rounding);
+  in[1] = _mm256_add_epi32(in[1], rounding);
+  in[2] = _mm256_add_epi32(in[2], rounding);
+  in[3] = _mm256_add_epi32(in[3], rounding);
+  in[4] = _mm256_add_epi32(in[4], rounding);
+  in[5] = _mm256_add_epi32(in[5], rounding);
+  in[6] = _mm256_add_epi32(in[6], rounding);
+  in[7] = _mm256_add_epi32(in[7], rounding);
+
+  in[0] = _mm256_srai_epi32(in[0], shift);
+  in[1] = _mm256_srai_epi32(in[1], shift);
+  in[2] = _mm256_srai_epi32(in[2], shift);
+  in[3] = _mm256_srai_epi32(in[3], shift);
+  in[4] = _mm256_srai_epi32(in[4], shift);
+  in[5] = 
_mm256_srai_epi32(in[5], shift); + in[6] = _mm256_srai_epi32(in[6], shift); + in[7] = _mm256_srai_epi32(in[7], shift); +} +static INLINE void load_buffer_8x16_avx2(const int16_t *input, __m256i *out, + int stride, int flipud, int fliplr, + int shift) { + const int16_t *topL = input; + const int16_t *botL = input + 8 * stride; + + const int16_t *tmp; + + if (flipud) { + tmp = topL; + topL = botL; + botL = tmp; + } + load_buffer_8x8_avx2(topL, out, stride, flipud, fliplr, shift); + load_buffer_8x8_avx2(botL, out + 8, stride, flipud, fliplr, shift); +} +static INLINE void load_buffer_16xn_avx2(const int16_t *input, __m256i *out, + int stride, int height, int outstride, + int flipud, int fliplr) { + __m256i out1[64]; + if (!flipud) { + for (int i = 0; i < height; i++) { + out1[i] = _mm256_loadu_si256((const __m256i *)(input + i * stride)); + } + } else { + for (int i = 0; i < height; i++) { + out1[(height - 1) - i] = + _mm256_loadu_si256((const __m256i *)(input + i * stride)); + } + } + if (!fliplr) { + for (int i = 0; i < height; i++) { + out[i * outstride] = + _mm256_cvtepi16_epi32(_mm256_castsi256_si128(out1[i])); + out[i * outstride + 1] = + _mm256_cvtepi16_epi32(_mm256_extractf128_si256(out1[i], 1)); + } + } else { + for (int i = 0; i < height; i++) { + out[i * outstride + 1] = _mm256_cvtepi16_epi32( + mm_reverse_epi16(_mm256_castsi256_si128(out1[i]))); + out[i * outstride + 0] = _mm256_cvtepi16_epi32( + mm_reverse_epi16(_mm256_extractf128_si256(out1[i], 1))); + } + } +} + +static void fwd_txfm_transpose_8x8_avx2(const __m256i *in, __m256i *out, + const int instride, + const int outstride) { + __m256i u0, u1, u2, u3, u4, u5, u6, u7; + __m256i x0, x1; + + u0 = _mm256_unpacklo_epi32(in[0 * instride], in[1 * instride]); + u1 = _mm256_unpackhi_epi32(in[0 * instride], in[1 * instride]); + + u2 = _mm256_unpacklo_epi32(in[2 * instride], in[3 * instride]); + u3 = _mm256_unpackhi_epi32(in[2 * instride], in[3 * instride]); + + u4 = _mm256_unpacklo_epi32(in[4 * instride], in[5 * instride]); + u5 = _mm256_unpackhi_epi32(in[4 * instride], in[5 * instride]); + + u6 = _mm256_unpacklo_epi32(in[6 * instride], in[7 * instride]); + u7 = _mm256_unpackhi_epi32(in[6 * instride], in[7 * instride]); + + x0 = _mm256_unpacklo_epi64(u0, u2); + x1 = _mm256_unpacklo_epi64(u4, u6); + out[0 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[4 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpackhi_epi64(u0, u2); + x1 = _mm256_unpackhi_epi64(u4, u6); + out[1 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[5 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpacklo_epi64(u1, u3); + x1 = _mm256_unpacklo_epi64(u5, u7); + out[2 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[6 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpackhi_epi64(u1, u3); + x1 = _mm256_unpackhi_epi64(u5, u7); + out[3 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[7 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31); +} +static INLINE void round_shift_32_8xn_avx2(__m256i *in, int size, int bit, + int stride) { + if (bit < 0) { + bit = -bit; + __m256i round = _mm256_set1_epi32(1 << (bit - 1)); + for (int i = 0; i < size; ++i) { + in[stride * i] = _mm256_add_epi32(in[stride * i], round); + in[stride * i] = _mm256_srai_epi32(in[stride * i], bit); + } + } else if (bit > 0) { + for (int i = 0; i < size; ++i) { + in[stride * i] = _mm256_slli_epi32(in[stride * i], bit); + } + } +} +static INLINE void store_buffer_avx2(const 
__m256i *const in, int32_t *out, + const int stride, const int out_size) { + for (int i = 0; i < out_size; ++i) { + _mm256_store_si256((__m256i *)(out), in[i]); + out += stride; + } +} +static INLINE void fwd_txfm_transpose_16x16_avx2(const __m256i *in, + __m256i *out) { + fwd_txfm_transpose_8x8_avx2(&in[0], &out[0], 2, 2); + fwd_txfm_transpose_8x8_avx2(&in[1], &out[16], 2, 2); + fwd_txfm_transpose_8x8_avx2(&in[16], &out[1], 2, 2); + fwd_txfm_transpose_8x8_avx2(&in[17], &out[17], 2, 2); +} + +static INLINE __m256i av1_half_btf_avx2(const __m256i *w0, const __m256i *n0, + const __m256i *w1, const __m256i *n1, + const __m256i *rounding, int bit) { + __m256i x, y; + + x = _mm256_mullo_epi32(*w0, *n0); + y = _mm256_mullo_epi32(*w1, *n1); + x = _mm256_add_epi32(x, y); + x = _mm256_add_epi32(x, *rounding); + x = _mm256_srai_epi32(x, bit); + return x; +} +#define btf_32_avx2_type0(w0, w1, in0, in1, out0, out1, bit) \ + do { \ + const __m256i ww0 = _mm256_set1_epi32(w0); \ + const __m256i ww1 = _mm256_set1_epi32(w1); \ + const __m256i in0_w0 = _mm256_mullo_epi32(in0, ww0); \ + const __m256i in1_w1 = _mm256_mullo_epi32(in1, ww1); \ + out0 = _mm256_add_epi32(in0_w0, in1_w1); \ + round_shift_32_8xn_avx2(&out0, 1, -bit, 1); \ + const __m256i in0_w1 = _mm256_mullo_epi32(in0, ww1); \ + const __m256i in1_w0 = _mm256_mullo_epi32(in1, ww0); \ + out1 = _mm256_sub_epi32(in0_w1, in1_w0); \ + round_shift_32_8xn_avx2(&out1, 1, -bit, 1); \ + } while (0) + +#define btf_32_type0_avx2_new(ww0, ww1, in0, in1, out0, out1, r, bit) \ + do { \ + const __m256i in0_w0 = _mm256_mullo_epi32(in0, ww0); \ + const __m256i in1_w1 = _mm256_mullo_epi32(in1, ww1); \ + out0 = _mm256_add_epi32(in0_w0, in1_w1); \ + out0 = _mm256_add_epi32(out0, r); \ + out0 = _mm256_srai_epi32(out0, bit); \ + const __m256i in0_w1 = _mm256_mullo_epi32(in0, ww1); \ + const __m256i in1_w0 = _mm256_mullo_epi32(in1, ww0); \ + out1 = _mm256_sub_epi32(in0_w1, in1_w0); \ + out1 = _mm256_add_epi32(out1, r); \ + out1 = _mm256_srai_epi32(out1, bit); \ + } while (0) + +typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out, + const int8_t cos_bit, int instride, + int outstride); +static void fdct8_avx2(__m256i *in, __m256i *out, const int8_t bit, + const int col_num, const int outstride) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + __m256i u[8], v[8]; + for (int col = 0; col < col_num; ++col) { + u[0] = _mm256_add_epi32(in[0 * col_num + col], in[7 * col_num + col]); + v[7] = _mm256_sub_epi32(in[0 * col_num + col], in[7 * col_num + col]); + u[1] = _mm256_add_epi32(in[1 * col_num + col], in[6 * col_num + col]); + u[6] = _mm256_sub_epi32(in[1 * col_num + col], in[6 * col_num + col]); + u[2] = _mm256_add_epi32(in[2 * col_num + col], in[5 * col_num + col]); + u[5] = _mm256_sub_epi32(in[2 * col_num + col], in[5 * col_num + col]); + u[3] = _mm256_add_epi32(in[3 * col_num + col], in[4 * col_num + col]); + v[4] = _mm256_sub_epi32(in[3 * col_num + col], in[4 * col_num + col]); + v[0] = _mm256_add_epi32(u[0], u[3]); + v[3] = _mm256_sub_epi32(u[0], u[3]); + v[1] = 
_mm256_add_epi32(u[1], u[2]); + v[2] = _mm256_sub_epi32(u[1], u[2]); + + v[5] = _mm256_mullo_epi32(u[5], cospim32); + v[6] = _mm256_mullo_epi32(u[6], cospi32); + v[5] = _mm256_add_epi32(v[5], v[6]); + v[5] = _mm256_add_epi32(v[5], rnding); + v[5] = _mm256_srai_epi32(v[5], bit); + + u[0] = _mm256_mullo_epi32(u[5], cospi32); + v[6] = _mm256_mullo_epi32(u[6], cospim32); + v[6] = _mm256_sub_epi32(u[0], v[6]); + v[6] = _mm256_add_epi32(v[6], rnding); + v[6] = _mm256_srai_epi32(v[6], bit); + + // stage 3 + // type 0 + v[0] = _mm256_mullo_epi32(v[0], cospi32); + v[1] = _mm256_mullo_epi32(v[1], cospi32); + u[0] = _mm256_add_epi32(v[0], v[1]); + u[0] = _mm256_add_epi32(u[0], rnding); + u[0] = _mm256_srai_epi32(u[0], bit); + + u[1] = _mm256_sub_epi32(v[0], v[1]); + u[1] = _mm256_add_epi32(u[1], rnding); + u[1] = _mm256_srai_epi32(u[1], bit); + + // type 1 + v[0] = _mm256_mullo_epi32(v[2], cospi48); + v[1] = _mm256_mullo_epi32(v[3], cospi16); + u[2] = _mm256_add_epi32(v[0], v[1]); + u[2] = _mm256_add_epi32(u[2], rnding); + u[2] = _mm256_srai_epi32(u[2], bit); + + v[0] = _mm256_mullo_epi32(v[2], cospi16); + v[1] = _mm256_mullo_epi32(v[3], cospi48); + u[3] = _mm256_sub_epi32(v[1], v[0]); + u[3] = _mm256_add_epi32(u[3], rnding); + u[3] = _mm256_srai_epi32(u[3], bit); + + u[4] = _mm256_add_epi32(v[4], v[5]); + u[5] = _mm256_sub_epi32(v[4], v[5]); + u[6] = _mm256_sub_epi32(v[7], v[6]); + u[7] = _mm256_add_epi32(v[7], v[6]); + + // stage 4 + // stage 5 + v[0] = _mm256_mullo_epi32(u[4], cospi56); + v[1] = _mm256_mullo_epi32(u[7], cospi8); + v[0] = _mm256_add_epi32(v[0], v[1]); + v[0] = _mm256_add_epi32(v[0], rnding); + out[1 * outstride + col] = _mm256_srai_epi32(v[0], bit); // buf0[4] + + v[0] = _mm256_mullo_epi32(u[4], cospi8); + v[1] = _mm256_mullo_epi32(u[7], cospi56); + v[0] = _mm256_sub_epi32(v[1], v[0]); + v[0] = _mm256_add_epi32(v[0], rnding); + out[7 * outstride + col] = _mm256_srai_epi32(v[0], bit); // buf0[7] + + v[0] = _mm256_mullo_epi32(u[5], cospi24); + v[1] = _mm256_mullo_epi32(u[6], cospi40); + v[0] = _mm256_add_epi32(v[0], v[1]); + v[0] = _mm256_add_epi32(v[0], rnding); + out[5 * outstride + col] = _mm256_srai_epi32(v[0], bit); // buf0[5] + + v[0] = _mm256_mullo_epi32(u[5], cospi40); + v[1] = _mm256_mullo_epi32(u[6], cospi24); + v[0] = _mm256_sub_epi32(v[1], v[0]); + v[0] = _mm256_add_epi32(v[0], rnding); + out[3 * outstride + col] = _mm256_srai_epi32(v[0], bit); // buf0[6] + + out[0 * outstride + col] = u[0]; // buf0[0] + out[4 * outstride + col] = u[1]; // buf0[1] + out[2 * outstride + col] = u[2]; // buf0[2] + out[6 * outstride + col] = u[3]; // buf0[3] + } +} +static void fadst8_avx2(__m256i *in, __m256i *out, const int8_t bit, + const int col_num, const int outstirde) { + (void)col_num; + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi36 = _mm256_set1_epi32(cospi[36]); + const __m256i cospim36 = 
_mm256_set1_epi32(-cospi[36]); + const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const __m256i zero = _mm256_setzero_si256(); + __m256i u0, u1, u2, u3, u4, u5, u6, u7; + __m256i v0, v1, v2, v3, v4, v5, v6, v7; + __m256i x, y; + for (int col = 0; col < col_num; ++col) { + u0 = in[0 * col_num + col]; + u1 = _mm256_sub_epi32(zero, in[7 * col_num + col]); + u2 = _mm256_sub_epi32(zero, in[3 * col_num + col]); + u3 = in[4 * col_num + col]; + u4 = _mm256_sub_epi32(zero, in[1 * col_num + col]); + u5 = in[6 * col_num + col]; + u6 = in[2 * col_num + col]; + u7 = _mm256_sub_epi32(zero, in[5 * col_num + col]); + + // stage 2 + v0 = u0; + v1 = u1; + + x = _mm256_mullo_epi32(u2, cospi32); + y = _mm256_mullo_epi32(u3, cospi32); + v2 = _mm256_add_epi32(x, y); + v2 = _mm256_add_epi32(v2, rnding); + v2 = _mm256_srai_epi32(v2, bit); + + v3 = _mm256_sub_epi32(x, y); + v3 = _mm256_add_epi32(v3, rnding); + v3 = _mm256_srai_epi32(v3, bit); + + v4 = u4; + v5 = u5; + + x = _mm256_mullo_epi32(u6, cospi32); + y = _mm256_mullo_epi32(u7, cospi32); + v6 = _mm256_add_epi32(x, y); + v6 = _mm256_add_epi32(v6, rnding); + v6 = _mm256_srai_epi32(v6, bit); + + v7 = _mm256_sub_epi32(x, y); + v7 = _mm256_add_epi32(v7, rnding); + v7 = _mm256_srai_epi32(v7, bit); + + // stage 3 + u0 = _mm256_add_epi32(v0, v2); + u1 = _mm256_add_epi32(v1, v3); + u2 = _mm256_sub_epi32(v0, v2); + u3 = _mm256_sub_epi32(v1, v3); + u4 = _mm256_add_epi32(v4, v6); + u5 = _mm256_add_epi32(v5, v7); + u6 = _mm256_sub_epi32(v4, v6); + u7 = _mm256_sub_epi32(v5, v7); + + // stage 4 + v0 = u0; + v1 = u1; + v2 = u2; + v3 = u3; + + x = _mm256_mullo_epi32(u4, cospi16); + y = _mm256_mullo_epi32(u5, cospi48); + v4 = _mm256_add_epi32(x, y); + v4 = _mm256_add_epi32(v4, rnding); + v4 = _mm256_srai_epi32(v4, bit); + + x = _mm256_mullo_epi32(u4, cospi48); + y = _mm256_mullo_epi32(u5, cospim16); + v5 = _mm256_add_epi32(x, y); + v5 = _mm256_add_epi32(v5, rnding); + v5 = _mm256_srai_epi32(v5, bit); + + x = _mm256_mullo_epi32(u6, cospim48); + y = _mm256_mullo_epi32(u7, cospi16); + v6 = _mm256_add_epi32(x, y); + v6 = _mm256_add_epi32(v6, rnding); + v6 = _mm256_srai_epi32(v6, bit); + + x = _mm256_mullo_epi32(u6, cospi16); + y = _mm256_mullo_epi32(u7, cospi48); + v7 = _mm256_add_epi32(x, y); + v7 = _mm256_add_epi32(v7, rnding); + v7 = _mm256_srai_epi32(v7, bit); + + // stage 5 + u0 = _mm256_add_epi32(v0, v4); + u1 = _mm256_add_epi32(v1, v5); + u2 = _mm256_add_epi32(v2, v6); + u3 = _mm256_add_epi32(v3, v7); + u4 = _mm256_sub_epi32(v0, v4); + u5 = _mm256_sub_epi32(v1, v5); + u6 = _mm256_sub_epi32(v2, v6); + u7 = _mm256_sub_epi32(v3, v7); + + // stage 6 + x = _mm256_mullo_epi32(u0, cospi4); + y = _mm256_mullo_epi32(u1, cospi60); + v0 = _mm256_add_epi32(x, y); + v0 = _mm256_add_epi32(v0, rnding); + v0 = _mm256_srai_epi32(v0, bit); + + x = _mm256_mullo_epi32(u0, cospi60); + y = _mm256_mullo_epi32(u1, cospim4); + v1 = _mm256_add_epi32(x, y); + v1 = _mm256_add_epi32(v1, rnding); + v1 = _mm256_srai_epi32(v1, bit); + + x = _mm256_mullo_epi32(u2, cospi20); + y = _mm256_mullo_epi32(u3, cospi44); + v2 = _mm256_add_epi32(x, y); + v2 = _mm256_add_epi32(v2, rnding); + v2 = _mm256_srai_epi32(v2, bit); + + x = _mm256_mullo_epi32(u2, cospi44); + y = _mm256_mullo_epi32(u3, cospim20); + v3 = _mm256_add_epi32(x, y); + v3 = _mm256_add_epi32(v3, rnding); + v3 = _mm256_srai_epi32(v3, bit); + + x = _mm256_mullo_epi32(u4, 
cospi36); + y = _mm256_mullo_epi32(u5, cospi28); + v4 = _mm256_add_epi32(x, y); + v4 = _mm256_add_epi32(v4, rnding); + v4 = _mm256_srai_epi32(v4, bit); + + x = _mm256_mullo_epi32(u4, cospi28); + y = _mm256_mullo_epi32(u5, cospim36); + v5 = _mm256_add_epi32(x, y); + v5 = _mm256_add_epi32(v5, rnding); + v5 = _mm256_srai_epi32(v5, bit); + + x = _mm256_mullo_epi32(u6, cospi52); + y = _mm256_mullo_epi32(u7, cospi12); + v6 = _mm256_add_epi32(x, y); + v6 = _mm256_add_epi32(v6, rnding); + v6 = _mm256_srai_epi32(v6, bit); + + x = _mm256_mullo_epi32(u6, cospi12); + y = _mm256_mullo_epi32(u7, cospim52); + v7 = _mm256_add_epi32(x, y); + v7 = _mm256_add_epi32(v7, rnding); + v7 = _mm256_srai_epi32(v7, bit); + + // stage 7 + out[0 * outstride + col] = v1; + out[1 * outstride + col] = v6; + out[2 * outstride + col] = v3; + out[3 * outstride + col] = v4; + out[4 * outstride + col] = v5; + out[5 * outstride + col] = v2; + out[6 * outstride + col] = v7; + out[7 * outstride + col] = v0; + } +} +static void idtx8_avx2(__m256i *in, __m256i *out, const int8_t bit, int col_num, + int outstride) { + (void)bit; + (void)outstride; + int num_iters = 8 * col_num; + for (int i = 0; i < num_iters; i += 8) { + out[i] = _mm256_add_epi32(in[i], in[i]); + out[i + 1] = _mm256_add_epi32(in[i + 1], in[i + 1]); + out[i + 2] = _mm256_add_epi32(in[i + 2], in[i + 2]); + out[i + 3] = _mm256_add_epi32(in[i + 3], in[i + 3]); + out[i + 4] = _mm256_add_epi32(in[i + 4], in[i + 4]); + out[i + 5] = _mm256_add_epi32(in[i + 5], in[i + 5]); + out[i + 6] = _mm256_add_epi32(in[i + 6], in[i + 6]); + out[i + 7] = _mm256_add_epi32(in[i + 7], in[i + 7]); + } +} +void av1_fwd_txfm2d_8x8_avx2(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + __m256i in[8], out[8]; + const TX_SIZE tx_size = TX_8X8; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int width = tx_size_wide[tx_size]; + const int width_div8 = (width >> 3); + + switch (tx_type) { + case DCT_DCT: + load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); + fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fdct8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 8); + break; + case ADST_DCT: + load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); + fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fdct8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 8); + break; + case DCT_ADST: + load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); + fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 8); + break; + case ADST_ADST: + load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); + fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fadst8_avx2(in,
out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 8); + break; + case FLIPADST_DCT: + load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]); + fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fdct8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 8); + break; + case DCT_FLIPADST: + load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]); + fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 8); + break; + case FLIPADST_FLIPADST: + load_buffer_8x8_avx2(input, in, stride, 1, 1, shift[0]); + fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 8); + break; + case ADST_FLIPADST: + load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]); + fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 8); + break; + case FLIPADST_ADST: + load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]); + fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 8); + break; + case IDTX: + load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); + idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 8); + break; + case V_DCT: + load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); + fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 8); + break; + case H_DCT: + load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); + idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 8); + break; + case V_ADST: + load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); + fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + 
width_div8); +
col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 8); + break; + case H_ADST: + load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); + idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 8); + break; + case V_FLIPADST: + load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]); + fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 8); + break; + case H_FLIPADST: + load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]); + idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 8); + break; + default: assert(0); + } + (void)bd; +} + +static void fdct16_avx2(__m256i *in, __m256i *out, const int8_t bit, + const int col_num, const int outstride) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi36 = _mm256_set1_epi32(cospi[36]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + __m256i u[16], v[16], x; + int col; + + // Calculate the column 0, 1, 2, 3 + for (col = 0; col < col_num; ++col) { + // stage 0 + // stage 1 + u[0] = _mm256_add_epi32(in[0 * col_num + col], in[15 * col_num + col]); + u[15] = _mm256_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]); + u[1] = _mm256_add_epi32(in[1 * col_num + col], in[14 * col_num + col]); + u[14] = _mm256_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]); + u[2] = _mm256_add_epi32(in[2 * col_num + col], in[13 * col_num + col]); + u[13] = _mm256_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]); + u[3] = _mm256_add_epi32(in[3 * col_num + col], in[12 * col_num + col]); + u[12] = _mm256_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]); + u[4] = _mm256_add_epi32(in[4 * col_num + col], in[11 * col_num + col]); + u[11] = 
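/* [Editor's aside: sketch only, not upstream code.] av1_fwd_txfm2d_8x8_avx2
   above, like every 2-D kernel in this file, follows one recipe: pre-shift
   the residual, run a 1-D transform down the columns, round, transpose so
   rows become columns, run the second 1-D transform, and store. In scalar
   pseudo-C (all names here are illustrative):

     void fwd_txfm2d(const int16_t *src, int32_t *dst, int stride) {
       int32_t a[N * N], b[N * N];
       load_shift(src, a, stride, shift[0]);  // shift[0]: input pre-scale
       col_txfm(a, b, cos_bit_col);           // vertical 1-D pass
       round_shift(b, shift[1]);              // mid-pass rounding
       transpose(b, a);
       row_txfm(a, b, cos_bit_row);           // horizontal 1-D pass
       store(b, dst);  // rectangular sizes also apply shift[2] and the
     }                 // NewSqrt2 rescale before storing */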
_mm256_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]); + u[5] = _mm256_add_epi32(in[5 * col_num + col], in[10 * col_num + col]); + u[10] = _mm256_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]); + u[6] = _mm256_add_epi32(in[6 * col_num + col], in[9 * col_num + col]); + u[9] = _mm256_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]); + u[7] = _mm256_add_epi32(in[7 * col_num + col], in[8 * col_num + col]); + u[8] = _mm256_sub_epi32(in[7 * col_num + col], in[8 * col_num + col]); + + // stage 2 + v[0] = _mm256_add_epi32(u[0], u[7]); + v[7] = _mm256_sub_epi32(u[0], u[7]); + v[1] = _mm256_add_epi32(u[1], u[6]); + v[6] = _mm256_sub_epi32(u[1], u[6]); + v[2] = _mm256_add_epi32(u[2], u[5]); + v[5] = _mm256_sub_epi32(u[2], u[5]); + v[3] = _mm256_add_epi32(u[3], u[4]); + v[4] = _mm256_sub_epi32(u[3], u[4]); + v[8] = u[8]; + v[9] = u[9]; + + v[10] = _mm256_mullo_epi32(u[10], cospim32); + x = _mm256_mullo_epi32(u[13], cospi32); + v[10] = _mm256_add_epi32(v[10], x); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[13] = _mm256_mullo_epi32(u[10], cospi32); + x = _mm256_mullo_epi32(u[13], cospim32); + v[13] = _mm256_sub_epi32(v[13], x); + v[13] = _mm256_add_epi32(v[13], rnding); + v[13] = _mm256_srai_epi32(v[13], bit); + + v[11] = _mm256_mullo_epi32(u[11], cospim32); + x = _mm256_mullo_epi32(u[12], cospi32); + v[11] = _mm256_add_epi32(v[11], x); + v[11] = _mm256_add_epi32(v[11], rnding); + v[11] = _mm256_srai_epi32(v[11], bit); + + v[12] = _mm256_mullo_epi32(u[11], cospi32); + x = _mm256_mullo_epi32(u[12], cospim32); + v[12] = _mm256_sub_epi32(v[12], x); + v[12] = _mm256_add_epi32(v[12], rnding); + v[12] = _mm256_srai_epi32(v[12], bit); + v[14] = u[14]; + v[15] = u[15]; + + // stage 3 + u[0] = _mm256_add_epi32(v[0], v[3]); + u[3] = _mm256_sub_epi32(v[0], v[3]); + u[1] = _mm256_add_epi32(v[1], v[2]); + u[2] = _mm256_sub_epi32(v[1], v[2]); + u[4] = v[4]; + + u[5] = _mm256_mullo_epi32(v[5], cospim32); + x = _mm256_mullo_epi32(v[6], cospi32); + u[5] = _mm256_add_epi32(u[5], x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + u[6] = _mm256_mullo_epi32(v[5], cospi32); + x = _mm256_mullo_epi32(v[6], cospim32); + u[6] = _mm256_sub_epi32(u[6], x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + u[7] = v[7]; + u[8] = _mm256_add_epi32(v[8], v[11]); + u[11] = _mm256_sub_epi32(v[8], v[11]); + u[9] = _mm256_add_epi32(v[9], v[10]); + u[10] = _mm256_sub_epi32(v[9], v[10]); + u[12] = _mm256_sub_epi32(v[15], v[12]); + u[15] = _mm256_add_epi32(v[15], v[12]); + u[13] = _mm256_sub_epi32(v[14], v[13]); + u[14] = _mm256_add_epi32(v[14], v[13]); + + // stage 4 + u[0] = _mm256_mullo_epi32(u[0], cospi32); + u[1] = _mm256_mullo_epi32(u[1], cospi32); + v[0] = _mm256_add_epi32(u[0], u[1]); + v[0] = _mm256_add_epi32(v[0], rnding); + v[0] = _mm256_srai_epi32(v[0], bit); + + v[1] = _mm256_sub_epi32(u[0], u[1]); + v[1] = _mm256_add_epi32(v[1], rnding); + v[1] = _mm256_srai_epi32(v[1], bit); + + v[2] = _mm256_mullo_epi32(u[2], cospi48); + x = _mm256_mullo_epi32(u[3], cospi16); + v[2] = _mm256_add_epi32(v[2], x); + v[2] = _mm256_add_epi32(v[2], rnding); + v[2] = _mm256_srai_epi32(v[2], bit); + + v[3] = _mm256_mullo_epi32(u[2], cospi16); + x = _mm256_mullo_epi32(u[3], cospi48); + v[3] = _mm256_sub_epi32(x, v[3]); + v[3] = _mm256_add_epi32(v[3], rnding); + v[3] = _mm256_srai_epi32(v[3], bit); + + v[4] = _mm256_add_epi32(u[4], u[5]); + v[5] = _mm256_sub_epi32(u[4], u[5]); + v[6] = _mm256_sub_epi32(u[7], 
u[6]); + v[7] = _mm256_add_epi32(u[7], u[6]); + v[8] = u[8]; + + v[9] = _mm256_mullo_epi32(u[9], cospim16); + x = _mm256_mullo_epi32(u[14], cospi48); + v[9] = _mm256_add_epi32(v[9], x); + v[9] = _mm256_add_epi32(v[9], rnding); + v[9] = _mm256_srai_epi32(v[9], bit); + + v[14] = _mm256_mullo_epi32(u[9], cospi48); + x = _mm256_mullo_epi32(u[14], cospim16); + v[14] = _mm256_sub_epi32(v[14], x); + v[14] = _mm256_add_epi32(v[14], rnding); + v[14] = _mm256_srai_epi32(v[14], bit); + + v[10] = _mm256_mullo_epi32(u[10], cospim48); + x = _mm256_mullo_epi32(u[13], cospim16); + v[10] = _mm256_add_epi32(v[10], x); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[13] = _mm256_mullo_epi32(u[10], cospim16); + x = _mm256_mullo_epi32(u[13], cospim48); + v[13] = _mm256_sub_epi32(v[13], x); + v[13] = _mm256_add_epi32(v[13], rnding); + v[13] = _mm256_srai_epi32(v[13], bit); + + v[11] = u[11]; + v[12] = u[12]; + v[15] = u[15]; + + // stage 5 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = _mm256_mullo_epi32(v[4], cospi56); + x = _mm256_mullo_epi32(v[7], cospi8); + u[4] = _mm256_add_epi32(u[4], x); + u[4] = _mm256_add_epi32(u[4], rnding); + u[4] = _mm256_srai_epi32(u[4], bit); + + u[7] = _mm256_mullo_epi32(v[4], cospi8); + x = _mm256_mullo_epi32(v[7], cospi56); + u[7] = _mm256_sub_epi32(x, u[7]); + u[7] = _mm256_add_epi32(u[7], rnding); + u[7] = _mm256_srai_epi32(u[7], bit); + + u[5] = _mm256_mullo_epi32(v[5], cospi24); + x = _mm256_mullo_epi32(v[6], cospi40); + u[5] = _mm256_add_epi32(u[5], x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + u[6] = _mm256_mullo_epi32(v[5], cospi40); + x = _mm256_mullo_epi32(v[6], cospi24); + u[6] = _mm256_sub_epi32(x, u[6]); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + u[8] = _mm256_add_epi32(v[8], v[9]); + u[9] = _mm256_sub_epi32(v[8], v[9]); + u[10] = _mm256_sub_epi32(v[11], v[10]); + u[11] = _mm256_add_epi32(v[11], v[10]); + u[12] = _mm256_add_epi32(v[12], v[13]); + u[13] = _mm256_sub_epi32(v[12], v[13]); + u[14] = _mm256_sub_epi32(v[15], v[14]); + u[15] = _mm256_add_epi32(v[15], v[14]); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + + v[8] = _mm256_mullo_epi32(u[8], cospi60); + x = _mm256_mullo_epi32(u[15], cospi4); + v[8] = _mm256_add_epi32(v[8], x); + v[8] = _mm256_add_epi32(v[8], rnding); + v[8] = _mm256_srai_epi32(v[8], bit); + + v[15] = _mm256_mullo_epi32(u[8], cospi4); + x = _mm256_mullo_epi32(u[15], cospi60); + v[15] = _mm256_sub_epi32(x, v[15]); + v[15] = _mm256_add_epi32(v[15], rnding); + v[15] = _mm256_srai_epi32(v[15], bit); + + v[9] = _mm256_mullo_epi32(u[9], cospi28); + x = _mm256_mullo_epi32(u[14], cospi36); + v[9] = _mm256_add_epi32(v[9], x); + v[9] = _mm256_add_epi32(v[9], rnding); + v[9] = _mm256_srai_epi32(v[9], bit); + + v[14] = _mm256_mullo_epi32(u[9], cospi36); + x = _mm256_mullo_epi32(u[14], cospi28); + v[14] = _mm256_sub_epi32(x, v[14]); + v[14] = _mm256_add_epi32(v[14], rnding); + v[14] = _mm256_srai_epi32(v[14], bit); + + v[10] = _mm256_mullo_epi32(u[10], cospi44); + x = _mm256_mullo_epi32(u[13], cospi20); + v[10] = _mm256_add_epi32(v[10], x); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[13] = _mm256_mullo_epi32(u[10], cospi20); + x = _mm256_mullo_epi32(u[13], cospi44); + v[13] = _mm256_sub_epi32(x, v[13]); + v[13] = _mm256_add_epi32(v[13], rnding); + v[13] = 
_mm256_srai_epi32(v[13], bit); + + v[11] = _mm256_mullo_epi32(u[11], cospi12); + x = _mm256_mullo_epi32(u[12], cospi52); + v[11] = _mm256_add_epi32(v[11], x); + v[11] = _mm256_add_epi32(v[11], rnding); + v[11] = _mm256_srai_epi32(v[11], bit); + + v[12] = _mm256_mullo_epi32(u[11], cospi52); + x = _mm256_mullo_epi32(u[12], cospi12); + v[12] = _mm256_sub_epi32(x, v[12]); + v[12] = _mm256_add_epi32(v[12], rnding); + v[12] = _mm256_srai_epi32(v[12], bit); + + out[0 * outstride + col] = v[0]; + out[1 * outstride + col] = v[8]; + out[2 * outstride + col] = v[4]; + out[3 * outstride + col] = v[12]; + out[4 * outstride + col] = v[2]; + out[5 * outstride + col] = v[10]; + out[6 * outstride + col] = v[6]; + out[7 * outstride + col] = v[14]; + out[8 * outstride + col] = v[1]; + out[9 * outstride + col] = v[9]; + out[10 * outstride + col] = v[5]; + out[11 * outstride + col] = v[13]; + out[12 * outstride + col] = v[3]; + out[13 * outstride + col] = v[11]; + out[14 * outstride + col] = v[7]; + out[15 * outstride + col] = v[15]; + } +} +static void fadst16_avx2(__m256i *in, __m256i *out, const int8_t bit, + const int num_cols, const int outstride) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]); + const __m256i cospi18 = _mm256_set1_epi32(cospi[18]); + const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); + const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]); + const __m256i cospi26 = _mm256_set1_epi32(cospi[26]); + const __m256i cospi38 = _mm256_set1_epi32(cospi[38]); + const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]); + const __m256i cospi34 = _mm256_set1_epi32(cospi[34]); + const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); + const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]); + const __m256i cospi42 = _mm256_set1_epi32(cospi[42]); + const __m256i cospi22 = _mm256_set1_epi32(cospi[22]); + const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]); + const __m256i cospi50 = _mm256_set1_epi32(cospi[50]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); + const __m256i cospi58 = _mm256_set1_epi32(cospi[58]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const __m256i zero = _mm256_setzero_si256(); + + __m256i u[16], v[16], x, y; + int col; + + for (col = 0; col < num_cols; ++col) { + // stage 0 + // stage 1 + u[0] = in[0 * num_cols + col]; + u[1] = 
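/* [Editor's aside: not upstream code.] fdct16's final stores above write
   output row r from v[bitrev4(r)]: the source order 0, 8, 4, 12, 2, 10, 6,
   14, 1, 9, 5, 13, 3, 11, 7, 15 is the 4-bit bit-reversal sequence, and
   fdct32's stage 9 later in the file follows the same rule with 5 bits.
   A small model (the helper name bitrev is hypothetical):

     static int bitrev(int i, int nbits) {
       int r = 0;
       for (int b = 0; b < nbits; ++b) r = (r << 1) | ((i >> b) & 1);
       return r;  // bitrev(1, 4) == 8, bitrev(3, 5) == 24
     }
     // fdct16's stores are equivalent to:
     //   for (int r = 0; r < 16; ++r)
     //     out[r * outstride + col] = v[bitrev(r, 4)];
*/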
_mm256_sub_epi32(zero, in[15 * num_cols + col]); + u[2] = _mm256_sub_epi32(zero, in[7 * num_cols + col]); + u[3] = in[8 * num_cols + col]; + u[4] = _mm256_sub_epi32(zero, in[3 * num_cols + col]); + u[5] = in[12 * num_cols + col]; + u[6] = in[4 * num_cols + col]; + u[7] = _mm256_sub_epi32(zero, in[11 * num_cols + col]); + u[8] = _mm256_sub_epi32(zero, in[1 * num_cols + col]); + u[9] = in[14 * num_cols + col]; + u[10] = in[6 * num_cols + col]; + u[11] = _mm256_sub_epi32(zero, in[9 * num_cols + col]); + u[12] = in[2 * num_cols + col]; + u[13] = _mm256_sub_epi32(zero, in[13 * num_cols + col]); + u[14] = _mm256_sub_epi32(zero, in[5 * num_cols + col]); + u[15] = in[10 * num_cols + col]; + + // stage 2 + v[0] = u[0]; + v[1] = u[1]; + + x = _mm256_mullo_epi32(u[2], cospi32); + y = _mm256_mullo_epi32(u[3], cospi32); + v[2] = _mm256_add_epi32(x, y); + v[2] = _mm256_add_epi32(v[2], rnding); + v[2] = _mm256_srai_epi32(v[2], bit); + + v[3] = _mm256_sub_epi32(x, y); + v[3] = _mm256_add_epi32(v[3], rnding); + v[3] = _mm256_srai_epi32(v[3], bit); + + v[4] = u[4]; + v[5] = u[5]; + + x = _mm256_mullo_epi32(u[6], cospi32); + y = _mm256_mullo_epi32(u[7], cospi32); + v[6] = _mm256_add_epi32(x, y); + v[6] = _mm256_add_epi32(v[6], rnding); + v[6] = _mm256_srai_epi32(v[6], bit); + + v[7] = _mm256_sub_epi32(x, y); + v[7] = _mm256_add_epi32(v[7], rnding); + v[7] = _mm256_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + + x = _mm256_mullo_epi32(u[10], cospi32); + y = _mm256_mullo_epi32(u[11], cospi32); + v[10] = _mm256_add_epi32(x, y); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[11] = _mm256_sub_epi32(x, y); + v[11] = _mm256_add_epi32(v[11], rnding); + v[11] = _mm256_srai_epi32(v[11], bit); + + v[12] = u[12]; + v[13] = u[13]; + + x = _mm256_mullo_epi32(u[14], cospi32); + y = _mm256_mullo_epi32(u[15], cospi32); + v[14] = _mm256_add_epi32(x, y); + v[14] = _mm256_add_epi32(v[14], rnding); + v[14] = _mm256_srai_epi32(v[14], bit); + + v[15] = _mm256_sub_epi32(x, y); + v[15] = _mm256_add_epi32(v[15], rnding); + v[15] = _mm256_srai_epi32(v[15], bit); + + // stage 3 + u[0] = _mm256_add_epi32(v[0], v[2]); + u[1] = _mm256_add_epi32(v[1], v[3]); + u[2] = _mm256_sub_epi32(v[0], v[2]); + u[3] = _mm256_sub_epi32(v[1], v[3]); + u[4] = _mm256_add_epi32(v[4], v[6]); + u[5] = _mm256_add_epi32(v[5], v[7]); + u[6] = _mm256_sub_epi32(v[4], v[6]); + u[7] = _mm256_sub_epi32(v[5], v[7]); + u[8] = _mm256_add_epi32(v[8], v[10]); + u[9] = _mm256_add_epi32(v[9], v[11]); + u[10] = _mm256_sub_epi32(v[8], v[10]); + u[11] = _mm256_sub_epi32(v[9], v[11]); + u[12] = _mm256_add_epi32(v[12], v[14]); + u[13] = _mm256_add_epi32(v[13], v[15]); + u[14] = _mm256_sub_epi32(v[12], v[14]); + u[15] = _mm256_sub_epi32(v[13], v[15]); + + // stage 4 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = av1_half_btf_avx2(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit); + v[5] = av1_half_btf_avx2(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit); + v[6] = av1_half_btf_avx2(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit); + v[7] = av1_half_btf_avx2(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit); + v[8] = u[8]; + v[9] = u[9]; + v[10] = u[10]; + v[11] = u[11]; + v[12] = av1_half_btf_avx2(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit); + v[13] = + av1_half_btf_avx2(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit); + v[14] = + av1_half_btf_avx2(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit); + v[15] = av1_half_btf_avx2(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit); + + // stage 5 + u[0] = 
_mm256_add_epi32(v[0], v[4]); + u[1] = _mm256_add_epi32(v[1], v[5]); + u[2] = _mm256_add_epi32(v[2], v[6]); + u[3] = _mm256_add_epi32(v[3], v[7]); + u[4] = _mm256_sub_epi32(v[0], v[4]); + u[5] = _mm256_sub_epi32(v[1], v[5]); + u[6] = _mm256_sub_epi32(v[2], v[6]); + u[7] = _mm256_sub_epi32(v[3], v[7]); + u[8] = _mm256_add_epi32(v[8], v[12]); + u[9] = _mm256_add_epi32(v[9], v[13]); + u[10] = _mm256_add_epi32(v[10], v[14]); + u[11] = _mm256_add_epi32(v[11], v[15]); + u[12] = _mm256_sub_epi32(v[8], v[12]); + u[13] = _mm256_sub_epi32(v[9], v[13]); + u[14] = _mm256_sub_epi32(v[10], v[14]); + u[15] = _mm256_sub_epi32(v[11], v[15]); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + v[8] = av1_half_btf_avx2(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit); + v[9] = av1_half_btf_avx2(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit); + v[10] = av1_half_btf_avx2(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit); + v[11] = + av1_half_btf_avx2(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit); + v[12] = av1_half_btf_avx2(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit); + v[13] = av1_half_btf_avx2(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit); + v[14] = + av1_half_btf_avx2(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit); + v[15] = av1_half_btf_avx2(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit); + + // stage 7 + u[0] = _mm256_add_epi32(v[0], v[8]); + u[1] = _mm256_add_epi32(v[1], v[9]); + u[2] = _mm256_add_epi32(v[2], v[10]); + u[3] = _mm256_add_epi32(v[3], v[11]); + u[4] = _mm256_add_epi32(v[4], v[12]); + u[5] = _mm256_add_epi32(v[5], v[13]); + u[6] = _mm256_add_epi32(v[6], v[14]); + u[7] = _mm256_add_epi32(v[7], v[15]); + u[8] = _mm256_sub_epi32(v[0], v[8]); + u[9] = _mm256_sub_epi32(v[1], v[9]); + u[10] = _mm256_sub_epi32(v[2], v[10]); + u[11] = _mm256_sub_epi32(v[3], v[11]); + u[12] = _mm256_sub_epi32(v[4], v[12]); + u[13] = _mm256_sub_epi32(v[5], v[13]); + u[14] = _mm256_sub_epi32(v[6], v[14]); + u[15] = _mm256_sub_epi32(v[7], v[15]); + + // stage 8 + v[0] = av1_half_btf_avx2(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit); + v[1] = av1_half_btf_avx2(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit); + v[2] = av1_half_btf_avx2(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit); + v[3] = av1_half_btf_avx2(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit); + v[4] = av1_half_btf_avx2(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit); + v[5] = av1_half_btf_avx2(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit); + v[6] = av1_half_btf_avx2(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit); + v[7] = av1_half_btf_avx2(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit); + v[8] = av1_half_btf_avx2(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit); + v[9] = av1_half_btf_avx2(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit); + v[10] = av1_half_btf_avx2(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit); + v[11] = + av1_half_btf_avx2(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit); + v[12] = av1_half_btf_avx2(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit); + v[13] = + av1_half_btf_avx2(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit); + v[14] = av1_half_btf_avx2(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit); + v[15] = av1_half_btf_avx2(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit); + + // stage 9 + out[0 * outstride + col] = v[1]; + out[1 * outstride + col] = v[14]; + out[2 * outstride + col] = v[3]; + out[3 * outstride + col] = v[12]; + out[4 * outstride + col] = v[5]; + out[5 * outstride + col] = v[10]; + out[6 * 
outstride + col] = v[7]; + out[7 * outstride + col] = v[8]; + out[8 * outstride + col] = v[9]; + out[9 * outstride + col] = v[6]; + out[10 * outstride + col] = v[11]; + out[11 * outstride + col] = v[4]; + out[12 * outstride + col] = v[13]; + out[13 * outstride + col] = v[2]; + out[14 * outstride + col] = v[15]; + out[15 * outstride + col] = v[0]; + } +} +static void idtx16_avx2(__m256i *in, __m256i *out, const int8_t bit, + int col_num, const int outstride) { + (void)bit; + (void)outstride; + __m256i fact = _mm256_set1_epi32(2 * NewSqrt2); + __m256i offset = _mm256_set1_epi32(1 << (NewSqrt2Bits - 1)); + __m256i a_low; + + int num_iters = 16 * col_num; + for (int i = 0; i < num_iters; i++) { + a_low = _mm256_mullo_epi32(in[i], fact); + a_low = _mm256_add_epi32(a_low, offset); + out[i] = _mm256_srai_epi32(a_low, NewSqrt2Bits); + } +} +static const transform_1d_avx2 col_highbd_txfm8x16_arr[TX_TYPES] = { + fdct16_avx2, // DCT_DCT + fadst16_avx2, // ADST_DCT + fdct16_avx2, // DCT_ADST + fadst16_avx2, // ADST_ADST + fadst16_avx2, // FLIPADST_DCT + fdct16_avx2, // DCT_FLIPADST + fadst16_avx2, // FLIPADST_FLIPADST + fadst16_avx2, // ADST_FLIPADST + fadst16_avx2, // FLIPADST_ADST + idtx16_avx2, // IDTX + fdct16_avx2, // V_DCT + idtx16_avx2, // H_DCT + fadst16_avx2, // V_ADST + idtx16_avx2, // H_ADST + fadst16_avx2, // V_FLIPADST + idtx16_avx2 // H_FLIPADST +}; +static const transform_1d_avx2 row_highbd_txfm8x8_arr[TX_TYPES] = { + fdct8_avx2, // DCT_DCT + fdct8_avx2, // ADST_DCT + fadst8_avx2, // DCT_ADST + fadst8_avx2, // ADST_ADST + fdct8_avx2, // FLIPADST_DCT + fadst8_avx2, // DCT_FLIPADST + fadst8_avx2, // FLIPADST_FLIPADST + fadst8_avx2, // ADST_FLIPADST + fadst8_avx2, // FLIPADST_ADST + idtx8_avx2, // IDTX + idtx8_avx2, // V_DCT + fdct8_avx2, // H_DCT + idtx8_avx2, // V_ADST + fadst8_avx2, // H_ADST + idtx8_avx2, // V_FLIPADST + fadst8_avx2 // H_FLIPADST +}; +void av1_fwd_txfm2d_8x16_avx2(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + __m256i in[16], out[16]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16]; + const int txw_idx = get_txw_idx(TX_8X16); + const int txh_idx = get_txh_idx(TX_8X16); + const transform_1d_avx2 col_txfm = col_highbd_txfm8x16_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_highbd_txfm8x8_arr[tx_type]; + const int8_t bit = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + load_buffer_8x16_avx2(input, in, stride, ud_flip, lr_flip, shift[0]); + col_txfm(in, out, bit, 1, 1); + col_txfm_8x8_rounding(out, -shift[1]); + col_txfm_8x8_rounding(&out[8], -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, 1, 2); + fwd_txfm_transpose_8x8_avx2(&out[8], &in[1], 1, 2); + row_txfm(in, out, bit, 2, 2); + round_shift_rect_array_32_avx2(out, in, 16, -shift[2], NewSqrt2); + store_buffer_avx2(in, coeff, 8, 16); + (void)bd; +} +static const transform_1d_avx2 col_highbd_txfm8x8_arr[TX_TYPES] = { + fdct8_avx2, // DCT_DCT + fadst8_avx2, // ADST_DCT + fdct8_avx2, // DCT_ADST + fadst8_avx2, // ADST_ADST + fadst8_avx2, // FLIPADST_DCT + fdct8_avx2, // DCT_FLIPADST + fadst8_avx2, // FLIPADST_FLIPADST + fadst8_avx2, // ADST_FLIPADST + fadst8_avx2, // FLIPADST_ADST + idtx8_avx2, // IDTX + fdct8_avx2, // V_DCT + idtx8_avx2, // H_DCT + fadst8_avx2, // V_ADST + idtx8_avx2, // H_ADST + fadst8_avx2, // V_FLIPADST + idtx8_avx2 // H_FLIPADST +}; +static const transform_1d_avx2 row_highbd_txfm8x16_arr[TX_TYPES] = { + fdct16_avx2, // DCT_DCT + fdct16_avx2, // ADST_DCT + fadst16_avx2, // DCT_ADST + 
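/* [Editor's aside: sketch only.] The identity transforms scale instead of
   mixing: idtx8_avx2 doubles each lane, idtx32x32_avx2 (later in the file)
   shifts left by 2, and idtx16_avx2 above multiplies by 2 * sqrt(2) in
   fixed point. NewSqrt2 = 5793 is round(sqrt(2) * 2^12) with
   NewSqrt2Bits = 12, so per lane idtx16 computes:

     static int32_t idtx16_scale(int32_t x) {
       const int32_t kNewSqrt2 = 5793, kBits = 12;  // sqrt(2) in Q12
       int64_t y = (int64_t)x * (2 * kNewSqrt2) + (1 << (kBits - 1));
       return (int32_t)(y >> kBits);  // ~= x * 2 * sqrt(2), rounded
     }
*/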
fadst16_avx2, // ADST_ADST + fdct16_avx2, // FLIPADST_DCT + fadst16_avx2, // DCT_FLIPADST + fadst16_avx2, // FLIPADST_FLIPADST + fadst16_avx2, // ADST_FLIPADST + fadst16_avx2, // FLIPADST_ADST + idtx16_avx2, // IDTX + idtx16_avx2, // V_DCT + fdct16_avx2, // H_DCT + idtx16_avx2, // V_ADST + fadst16_avx2, // H_ADST + idtx16_avx2, // V_FLIPADST + fadst16_avx2 // H_FLIPADST +}; +void av1_fwd_txfm2d_16x8_avx2(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + __m256i in[16], out[16]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8]; + const int txw_idx = get_txw_idx(TX_16X8); + const int txh_idx = get_txh_idx(TX_16X8); + const transform_1d_avx2 col_txfm = col_highbd_txfm8x8_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_highbd_txfm8x16_arr[tx_type]; + const int8_t bit = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + load_buffer_16xn_avx2(input, in, stride, 8, 2, ud_flip, lr_flip); + round_shift_32_8xn_avx2(in, 16, shift[0], 1); + col_txfm(in, out, bit, 2, 2); + round_shift_32_8xn_avx2(out, 16, shift[1], 1); + fwd_txfm_transpose_8x8_avx2(out, in, 2, 1); + fwd_txfm_transpose_8x8_avx2(&out[1], &in[8], 2, 1); + row_txfm(in, out, bit, 1, 1); + round_shift_rect_array_32_avx2(out, out, 16, -shift[2], NewSqrt2); + store_buffer_avx2(out, coeff, 8, 16); + (void)bd; +} +void av1_fwd_txfm2d_16x16_avx2(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + __m256i in[32], out[32]; + const TX_SIZE tx_size = TX_16X16; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const int width_div8 = (width >> 3); + const int width_div16 = (width >> 4); + const int size = (height << 1); + switch (tx_type) { + case DCT_DCT: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 32); + break; + case ADST_DCT: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 32); + break; + case DCT_ADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 32); + break; + case ADST_ADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fadst16_avx2(in, out, 
av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 32); + break; + case FLIPADST_DCT: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 32); + break; + case DCT_FLIPADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 32); + break; + case FLIPADST_FLIPADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 1); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 32); + break; + case ADST_FLIPADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 32); + break; + case FLIPADST_ADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 32); + break; + case IDTX: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + idtx16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 32); + break; + case V_DCT: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + idtx16_avx2(in, 
out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 32); + break; + case H_DCT: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 32); + break; + case V_ADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + idtx16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 32); + break; + case H_ADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 32); + break; + case V_FLIPADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + idtx16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 32); + break; + case H_FLIPADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 32); + break; + default: assert(0); + } + (void)bd; +} +static INLINE void fdct32_avx2(__m256i *input, __m256i *output, + const int8_t cos_bit, const int instride, + const int outstride) { + __m256i buf0[32]; + __m256i buf1[32]; + const int32_t *cospi; + int startidx = 0 * instride; + int endidx = 31 * instride; + // stage 0 + // stage 1 + buf1[0] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[31] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[1] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[30] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[2] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[29] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[3] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[28] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[4] = 
_mm256_add_epi32(input[startidx], input[endidx]); + buf1[27] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[5] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[26] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[6] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[25] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[7] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[24] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[8] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[23] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[9] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[22] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[10] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[21] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[11] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[20] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[12] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[19] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[13] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[18] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[14] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[17] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[15] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[16] = _mm256_sub_epi32(input[startidx], input[endidx]); + + // stage 2 + cospi = cospi_arr(cos_bit); + buf0[0] = _mm256_add_epi32(buf1[0], buf1[15]); + buf0[15] = _mm256_sub_epi32(buf1[0], buf1[15]); + buf0[1] = _mm256_add_epi32(buf1[1], buf1[14]); + buf0[14] = _mm256_sub_epi32(buf1[1], buf1[14]); + buf0[2] = _mm256_add_epi32(buf1[2], buf1[13]); + buf0[13] = _mm256_sub_epi32(buf1[2], buf1[13]); + buf0[3] = _mm256_add_epi32(buf1[3], buf1[12]); + buf0[12] = _mm256_sub_epi32(buf1[3], buf1[12]); + buf0[4] = _mm256_add_epi32(buf1[4], buf1[11]); + buf0[11] = _mm256_sub_epi32(buf1[4], buf1[11]); + buf0[5] = _mm256_add_epi32(buf1[5], buf1[10]); + buf0[10] = _mm256_sub_epi32(buf1[5], buf1[10]); + buf0[6] = _mm256_add_epi32(buf1[6], buf1[9]); + buf0[9] = _mm256_sub_epi32(buf1[6], buf1[9]); + buf0[7] = _mm256_add_epi32(buf1[7], buf1[8]); + buf0[8] = _mm256_sub_epi32(buf1[7], buf1[8]); + buf0[16] = buf1[16]; + buf0[17] = buf1[17]; + buf0[18] = buf1[18]; + buf0[19] = buf1[19]; + btf_32_avx2_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20], + buf0[27], cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22], + buf0[25], cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23], + buf0[24], cos_bit); + buf0[28] = buf1[28]; + buf0[29] = buf1[29]; + buf0[30] = buf1[30]; + buf0[31] = buf1[31]; + + // stage 3 + cospi = cospi_arr(cos_bit); + buf1[0] = _mm256_add_epi32(buf0[0], buf0[7]); + buf1[7] = _mm256_sub_epi32(buf0[0], buf0[7]); + buf1[1] = 
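/* [Editor's aside: equivalent scalar form, not upstream code.] The unrolled
   startidx/endidx walk in fdct32's stage 1 above is the mirrored butterfly:

     for (int i = 0; i < 16; ++i) {
       buf1[i]      = input[i * instride] + input[(31 - i) * instride];
       buf1[31 - i] = input[i * instride] - input[(31 - i) * instride];
     }

   with vector adds/subs in place of the scalar ones. */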
_mm256_add_epi32(buf0[1], buf0[6]); + buf1[6] = _mm256_sub_epi32(buf0[1], buf0[6]); + buf1[2] = _mm256_add_epi32(buf0[2], buf0[5]); + buf1[5] = _mm256_sub_epi32(buf0[2], buf0[5]); + buf1[3] = _mm256_add_epi32(buf0[3], buf0[4]); + buf1[4] = _mm256_sub_epi32(buf0[3], buf0[4]); + buf1[8] = buf0[8]; + buf1[9] = buf0[9]; + btf_32_avx2_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10], + buf1[13], cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11], + buf1[12], cos_bit); + buf1[14] = buf0[14]; + buf1[15] = buf0[15]; + buf1[16] = _mm256_add_epi32(buf0[16], buf0[23]); + buf1[23] = _mm256_sub_epi32(buf0[16], buf0[23]); + buf1[17] = _mm256_add_epi32(buf0[17], buf0[22]); + buf1[22] = _mm256_sub_epi32(buf0[17], buf0[22]); + buf1[18] = _mm256_add_epi32(buf0[18], buf0[21]); + buf1[21] = _mm256_sub_epi32(buf0[18], buf0[21]); + buf1[19] = _mm256_add_epi32(buf0[19], buf0[20]); + buf1[20] = _mm256_sub_epi32(buf0[19], buf0[20]); + buf1[24] = _mm256_sub_epi32(buf0[31], buf0[24]); + buf1[31] = _mm256_add_epi32(buf0[31], buf0[24]); + buf1[25] = _mm256_sub_epi32(buf0[30], buf0[25]); + buf1[30] = _mm256_add_epi32(buf0[30], buf0[25]); + buf1[26] = _mm256_sub_epi32(buf0[29], buf0[26]); + buf1[29] = _mm256_add_epi32(buf0[29], buf0[26]); + buf1[27] = _mm256_sub_epi32(buf0[28], buf0[27]); + buf1[28] = _mm256_add_epi32(buf0[28], buf0[27]); + + // stage 4 + cospi = cospi_arr(cos_bit); + buf0[0] = _mm256_add_epi32(buf1[0], buf1[3]); + buf0[3] = _mm256_sub_epi32(buf1[0], buf1[3]); + buf0[1] = _mm256_add_epi32(buf1[1], buf1[2]); + buf0[2] = _mm256_sub_epi32(buf1[1], buf1[2]); + buf0[4] = buf1[4]; + btf_32_avx2_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6], + cos_bit); + buf0[7] = buf1[7]; + buf0[8] = _mm256_add_epi32(buf1[8], buf1[11]); + buf0[11] = _mm256_sub_epi32(buf1[8], buf1[11]); + buf0[9] = _mm256_add_epi32(buf1[9], buf1[10]); + buf0[10] = _mm256_sub_epi32(buf1[9], buf1[10]); + buf0[12] = _mm256_sub_epi32(buf1[15], buf1[12]); + buf0[15] = _mm256_add_epi32(buf1[15], buf1[12]); + buf0[13] = _mm256_sub_epi32(buf1[14], buf1[13]); + buf0[14] = _mm256_add_epi32(buf1[14], buf1[13]); + buf0[16] = buf1[16]; + buf0[17] = buf1[17]; + btf_32_avx2_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18], + buf0[29], cos_bit); + btf_32_avx2_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19], + buf0[28], cos_bit); + btf_32_avx2_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20], + buf0[27], cos_bit); + btf_32_avx2_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + buf0[22] = buf1[22]; + buf0[23] = buf1[23]; + buf0[24] = buf1[24]; + buf0[25] = buf1[25]; + buf0[30] = buf1[30]; + buf0[31] = buf1[31]; + + // stage 5 + cospi = cospi_arr(cos_bit); + btf_32_avx2_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1], + cos_bit); + btf_32_avx2_type0(cospi[16], cospi[48], buf0[3], buf0[2], buf1[2], buf1[3], + cos_bit); + buf1[4] = _mm256_add_epi32(buf0[4], buf0[5]); + buf1[5] = _mm256_sub_epi32(buf0[4], buf0[5]); + buf1[6] = _mm256_sub_epi32(buf0[7], buf0[6]); + buf1[7] = _mm256_add_epi32(buf0[7], buf0[6]); + buf1[8] = buf0[8]; + btf_32_avx2_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9], buf1[14], + cos_bit); + btf_32_avx2_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10], + buf1[13], cos_bit); + buf1[11] = buf0[11]; + buf1[12] = buf0[12]; + buf1[15] = buf0[15]; + buf1[16] = _mm256_add_epi32(buf0[16], buf0[19]); + buf1[19] = _mm256_sub_epi32(buf0[16], buf0[19]); + buf1[17] = _mm256_add_epi32(buf0[17], 
buf0[18]); + buf1[18] = _mm256_sub_epi32(buf0[17], buf0[18]); + buf1[20] = _mm256_sub_epi32(buf0[23], buf0[20]); + buf1[23] = _mm256_add_epi32(buf0[23], buf0[20]); + buf1[21] = _mm256_sub_epi32(buf0[22], buf0[21]); + buf1[22] = _mm256_add_epi32(buf0[22], buf0[21]); + buf1[24] = _mm256_add_epi32(buf0[24], buf0[27]); + buf1[27] = _mm256_sub_epi32(buf0[24], buf0[27]); + buf1[25] = _mm256_add_epi32(buf0[25], buf0[26]); + buf1[26] = _mm256_sub_epi32(buf0[25], buf0[26]); + buf1[28] = _mm256_sub_epi32(buf0[31], buf0[28]); + buf1[31] = _mm256_add_epi32(buf0[31], buf0[28]); + buf1[29] = _mm256_sub_epi32(buf0[30], buf0[29]); + buf1[30] = _mm256_add_epi32(buf0[30], buf0[29]); + + // stage 6 + cospi = cospi_arr(cos_bit); + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + buf0[2] = buf1[2]; + buf0[3] = buf1[3]; + btf_32_avx2_type0(cospi[8], cospi[56], buf1[7], buf1[4], buf0[4], buf0[7], + cos_bit); + btf_32_avx2_type0(cospi[40], cospi[24], buf1[6], buf1[5], buf0[5], buf0[6], + cos_bit); + buf0[8] = _mm256_add_epi32(buf1[8], buf1[9]); + buf0[9] = _mm256_sub_epi32(buf1[8], buf1[9]); + buf0[10] = _mm256_sub_epi32(buf1[11], buf1[10]); + buf0[11] = _mm256_add_epi32(buf1[11], buf1[10]); + buf0[12] = _mm256_add_epi32(buf1[12], buf1[13]); + buf0[13] = _mm256_sub_epi32(buf1[12], buf1[13]); + buf0[14] = _mm256_sub_epi32(buf1[15], buf1[14]); + buf0[15] = _mm256_add_epi32(buf1[15], buf1[14]); + buf0[16] = buf1[16]; + btf_32_avx2_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17], + buf0[30], cos_bit); + btf_32_avx2_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18], + buf0[29], cos_bit); + buf0[19] = buf1[19]; + buf0[20] = buf1[20]; + btf_32_avx2_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + btf_32_avx2_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22], + buf0[25], cos_bit); + buf0[23] = buf1[23]; + buf0[24] = buf1[24]; + buf0[27] = buf1[27]; + buf0[28] = buf1[28]; + buf0[31] = buf1[31]; + + // stage 7 + cospi = cospi_arr(cos_bit); + buf1[0] = buf0[0]; + buf1[1] = buf0[1]; + buf1[2] = buf0[2]; + buf1[3] = buf0[3]; + buf1[4] = buf0[4]; + buf1[5] = buf0[5]; + buf1[6] = buf0[6]; + buf1[7] = buf0[7]; + btf_32_avx2_type0(cospi[4], cospi[60], buf0[15], buf0[8], buf1[8], buf1[15], + cos_bit); + btf_32_avx2_type0(cospi[36], cospi[28], buf0[14], buf0[9], buf1[9], buf1[14], + cos_bit); + btf_32_avx2_type0(cospi[20], cospi[44], buf0[13], buf0[10], buf1[10], + buf1[13], cos_bit); + btf_32_avx2_type0(cospi[52], cospi[12], buf0[12], buf0[11], buf1[11], + buf1[12], cos_bit); + buf1[16] = _mm256_add_epi32(buf0[16], buf0[17]); + buf1[17] = _mm256_sub_epi32(buf0[16], buf0[17]); + buf1[18] = _mm256_sub_epi32(buf0[19], buf0[18]); + buf1[19] = _mm256_add_epi32(buf0[19], buf0[18]); + buf1[20] = _mm256_add_epi32(buf0[20], buf0[21]); + buf1[21] = _mm256_sub_epi32(buf0[20], buf0[21]); + buf1[22] = _mm256_sub_epi32(buf0[23], buf0[22]); + buf1[23] = _mm256_add_epi32(buf0[23], buf0[22]); + buf1[24] = _mm256_add_epi32(buf0[24], buf0[25]); + buf1[25] = _mm256_sub_epi32(buf0[24], buf0[25]); + buf1[26] = _mm256_sub_epi32(buf0[27], buf0[26]); + buf1[27] = _mm256_add_epi32(buf0[27], buf0[26]); + buf1[28] = _mm256_add_epi32(buf0[28], buf0[29]); + buf1[29] = _mm256_sub_epi32(buf0[28], buf0[29]); + buf1[30] = _mm256_sub_epi32(buf0[31], buf0[30]); + buf1[31] = _mm256_add_epi32(buf0[31], buf0[30]); + + // stage 8 + cospi = cospi_arr(cos_bit); + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + buf0[2] = buf1[2]; + buf0[3] = buf1[3]; + buf0[4] = buf1[4]; + buf0[5] = buf1[5]; + buf0[6] = buf1[6]; + buf0[7] = 
buf1[7]; + buf0[8] = buf1[8]; + buf0[9] = buf1[9]; + buf0[10] = buf1[10]; + buf0[11] = buf1[11]; + buf0[12] = buf1[12]; + buf0[13] = buf1[13]; + buf0[14] = buf1[14]; + buf0[15] = buf1[15]; + btf_32_avx2_type0(cospi[2], cospi[62], buf1[31], buf1[16], buf0[16], buf0[31], + cos_bit); + btf_32_avx2_type0(cospi[34], cospi[30], buf1[30], buf1[17], buf0[17], + buf0[30], cos_bit); + btf_32_avx2_type0(cospi[18], cospi[46], buf1[29], buf1[18], buf0[18], + buf0[29], cos_bit); + btf_32_avx2_type0(cospi[50], cospi[14], buf1[28], buf1[19], buf0[19], + buf0[28], cos_bit); + btf_32_avx2_type0(cospi[10], cospi[54], buf1[27], buf1[20], buf0[20], + buf0[27], cos_bit); + btf_32_avx2_type0(cospi[42], cospi[22], buf1[26], buf1[21], buf0[21], + buf0[26], cos_bit); + btf_32_avx2_type0(cospi[26], cospi[38], buf1[25], buf1[22], buf0[22], + buf0[25], cos_bit); + btf_32_avx2_type0(cospi[58], cospi[6], buf1[24], buf1[23], buf0[23], buf0[24], + cos_bit); + + startidx = 0 * outstride; + endidx = 31 * outstride; + // stage 9 + output[startidx] = buf0[0]; + output[endidx] = buf0[31]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[16]; + output[endidx] = buf0[15]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[8]; + output[endidx] = buf0[23]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[24]; + output[endidx] = buf0[7]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[4]; + output[endidx] = buf0[27]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[20]; + output[endidx] = buf0[11]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[12]; + output[endidx] = buf0[19]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[28]; + output[endidx] = buf0[3]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[2]; + output[endidx] = buf0[29]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[18]; + output[endidx] = buf0[13]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[10]; + output[endidx] = buf0[21]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[26]; + output[endidx] = buf0[5]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[6]; + output[endidx] = buf0[25]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[22]; + output[endidx] = buf0[9]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[14]; + output[endidx] = buf0[17]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[30]; + output[endidx] = buf0[1]; +} +static INLINE void idtx32x32_avx2(__m256i *input, __m256i *output, + const int8_t cos_bit, int instride, + int outstride) { + (void)cos_bit; + for (int i = 0; i < 32; i += 8) { + output[i * outstride] = _mm256_slli_epi32(input[i * instride], 2); + output[(i + 1) * outstride] = + _mm256_slli_epi32(input[(i + 1) * instride], 2); + output[(i + 2) * outstride] = + _mm256_slli_epi32(input[(i + 2) * instride], 2); + output[(i + 3) * outstride] = + _mm256_slli_epi32(input[(i + 3) * instride], 2); + output[(i + 4) * outstride] = + _mm256_slli_epi32(input[(i + 4) * instride], 2); + output[(i + 5) * outstride] = + _mm256_slli_epi32(input[(i + 5) * instride], 2); + output[(i + 6) * outstride] = + _mm256_slli_epi32(input[(i + 6) * instride], 2); + output[(i + 7) * outstride] = + _mm256_slli_epi32(input[(i + 7) * instride], 2); + } +} +static const transform_1d_avx2 
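/* [Editor's aside.] The col/row 8x32 dispatch tables that follow hold NULL
   for every entry except DCT_DCT and IDTX, the only transform types AV1
   defines at 32x32, so av1_fwd_txfm2d_32x32_avx2 can call through them
   without a guard. A defensive caller-side check (illustrative only):

     assert(col_txfm8x32_arr[tx_type] != NULL &&
            row_txfm8x32_arr[tx_type] != NULL);
*/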
col_txfm8x32_arr[TX_TYPES] = { + fdct32_avx2, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + idtx32x32_avx2, // IDTX + NULL, // V_DCT + NULL, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; +static const transform_1d_avx2 row_txfm8x32_arr[TX_TYPES] = { + fdct32_avx2, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + idtx32x32_avx2, // IDTX + NULL, // V_DCT + NULL, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; +void av1_fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m256i buf0[128], buf1[128]; + const int tx_size = TX_32X32; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = col_txfm8x32_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_txfm8x32_arr[tx_type]; + int r, c; + const int width_div16 = (width >> 4); + const int width_div8 = (width >> 3); + + for (int i = 0; i < width_div16; i++) { + load_buffer_16xn_avx2(input + (i << 4), &buf0[(i << 1)], stride, height, + width_div8, 0, 0); + round_shift_32_8xn_avx2(&buf0[(i << 1)], height, shift[0], width_div8); + round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[0], width_div8); + col_txfm(&buf0[(i << 1)], &buf0[(i << 1)], cos_bit_col, width_div8, + width_div8); + col_txfm(&buf0[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_col, width_div8, + width_div8); + round_shift_32_8xn_avx2(&buf0[(i << 1)], height, shift[1], width_div8); + round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[1], width_div8); + } + + for (r = 0; r < height; r += 8) { + for (c = 0; c < width_div8; c++) { + fwd_txfm_transpose_8x8_avx2(&buf0[r * width_div8 + c], + &buf1[c * 8 * width_div8 + (r >> 3)], + width_div8, width_div8); + } + } + + for (int i = 0; i < width_div16; i++) { + row_txfm(&buf1[(i << 1)], &buf1[(i << 1)], cos_bit_row, width_div8, + width_div8); + row_txfm(&buf1[(i << 1) + 1], &buf1[(i << 1) + 1], cos_bit_row, width_div8, + width_div8); + round_shift_32_8xn_avx2(&buf1[(i << 1)], height, shift[2], width_div8); + round_shift_32_8xn_avx2(&buf1[(i << 1) + 1], height, shift[2], width_div8); + } + + store_buffer_avx2(buf1, output, 8, 128); +} +static INLINE void fdct64_stage2_avx2(__m256i *x1, __m256i *x2, + __m256i *cospi_m32, __m256i *cospi_p32, + const __m256i *__rounding, + int8_t cos_bit) { + x2[0] = _mm256_add_epi32(x1[0], x1[31]); + x2[31] = _mm256_sub_epi32(x1[0], x1[31]); + x2[1] = _mm256_add_epi32(x1[1], x1[30]); + x2[30] = _mm256_sub_epi32(x1[1], x1[30]); + x2[2] = _mm256_add_epi32(x1[2], x1[29]); + x2[29] = _mm256_sub_epi32(x1[2], x1[29]); + x2[3] = _mm256_add_epi32(x1[3], x1[28]); + x2[28] = _mm256_sub_epi32(x1[3], x1[28]); + x2[4] = _mm256_add_epi32(x1[4], x1[27]); + x2[27] = _mm256_sub_epi32(x1[4], x1[27]); + x2[5] = _mm256_add_epi32(x1[5], x1[26]); + x2[26] = _mm256_sub_epi32(x1[5], x1[26]); + x2[6] = _mm256_add_epi32(x1[6], 
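/* [Editor's aside: layout note with a small sketch, not upstream code.] In
   av1_fwd_txfm2d_32x32_avx2 above, buf0/buf1 hold the 32x32 block as 128
   __m256i values: each vector carries 8 horizontally adjacent int32 lanes,
   so a 32-coefficient row spans width_div8 (= 4) consecutive vectors:

     // element (row, col) sits in lane (col & 7) of:
     __m256i v = buf0[row * width_div8 + (col >> 3)];

   which is why the transpose loop steps c over width_div8 and r in strides
   of 8. */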
x1[25]); + x2[25] = _mm256_sub_epi32(x1[6], x1[25]); + x2[7] = _mm256_add_epi32(x1[7], x1[24]); + x2[24] = _mm256_sub_epi32(x1[7], x1[24]); + x2[8] = _mm256_add_epi32(x1[8], x1[23]); + x2[23] = _mm256_sub_epi32(x1[8], x1[23]); + x2[9] = _mm256_add_epi32(x1[9], x1[22]); + x2[22] = _mm256_sub_epi32(x1[9], x1[22]); + x2[10] = _mm256_add_epi32(x1[10], x1[21]); + x2[21] = _mm256_sub_epi32(x1[10], x1[21]); + x2[11] = _mm256_add_epi32(x1[11], x1[20]); + x2[20] = _mm256_sub_epi32(x1[11], x1[20]); + x2[12] = _mm256_add_epi32(x1[12], x1[19]); + x2[19] = _mm256_sub_epi32(x1[12], x1[19]); + x2[13] = _mm256_add_epi32(x1[13], x1[18]); + x2[18] = _mm256_sub_epi32(x1[13], x1[18]); + x2[14] = _mm256_add_epi32(x1[14], x1[17]); + x2[17] = _mm256_sub_epi32(x1[14], x1[17]); + x2[15] = _mm256_add_epi32(x1[15], x1[16]); + x2[16] = _mm256_sub_epi32(x1[15], x1[16]); + x2[32] = x1[32]; + x2[33] = x1[33]; + x2[34] = x1[34]; + x2[35] = x1[35]; + x2[36] = x1[36]; + x2[37] = x1[37]; + x2[38] = x1[38]; + x2[39] = x1[39]; + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[40], x1[55], x2[40], x2[55], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[41], x1[54], x2[41], x2[54], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[42], x1[53], x2[42], x2[53], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[43], x1[52], x2[43], x2[52], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[44], x1[51], x2[44], x2[51], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[45], x1[50], x2[45], x2[50], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[46], x1[49], x2[46], x2[49], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[47], x1[48], x2[47], x2[48], + *__rounding, cos_bit); + x2[56] = x1[56]; + x2[57] = x1[57]; + x2[58] = x1[58]; + x2[59] = x1[59]; + x2[60] = x1[60]; + x2[61] = x1[61]; + x2[62] = x1[62]; + x2[63] = x1[63]; +} +static INLINE void fdct64_stage3_avx2(__m256i *x2, __m256i *x3, + __m256i *cospi_m32, __m256i *cospi_p32, + const __m256i *__rounding, + int8_t cos_bit) { + x3[0] = _mm256_add_epi32(x2[0], x2[15]); + x3[15] = _mm256_sub_epi32(x2[0], x2[15]); + x3[1] = _mm256_add_epi32(x2[1], x2[14]); + x3[14] = _mm256_sub_epi32(x2[1], x2[14]); + x3[2] = _mm256_add_epi32(x2[2], x2[13]); + x3[13] = _mm256_sub_epi32(x2[2], x2[13]); + x3[3] = _mm256_add_epi32(x2[3], x2[12]); + x3[12] = _mm256_sub_epi32(x2[3], x2[12]); + x3[4] = _mm256_add_epi32(x2[4], x2[11]); + x3[11] = _mm256_sub_epi32(x2[4], x2[11]); + x3[5] = _mm256_add_epi32(x2[5], x2[10]); + x3[10] = _mm256_sub_epi32(x2[5], x2[10]); + x3[6] = _mm256_add_epi32(x2[6], x2[9]); + x3[9] = _mm256_sub_epi32(x2[6], x2[9]); + x3[7] = _mm256_add_epi32(x2[7], x2[8]); + x3[8] = _mm256_sub_epi32(x2[7], x2[8]); + x3[16] = x2[16]; + x3[17] = x2[17]; + x3[18] = x2[18]; + x3[19] = x2[19]; + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[20], x2[27], x3[20], x3[27], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[21], x2[26], x3[21], x3[26], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[22], x2[25], x3[22], x3[25], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[23], x2[24], x3[23], x3[24], + *__rounding, cos_bit); + x3[28] = x2[28]; + x3[29] = x2[29]; + x3[30] = x2[30]; + x3[31] = x2[31]; + x3[32] = _mm256_add_epi32(x2[32], x2[47]); + x3[47] = _mm256_sub_epi32(x2[32], x2[47]); + x3[33] = _mm256_add_epi32(x2[33], x2[46]); 
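Each btf_32_type0_avx2_new() call in these stage helpers is a fixed-point plane rotation applied to eight 32-bit lanes at once. A scalar model of one lane, as I read the macro (the helper names below are mine, and the per-call sign pattern comes from passing negated constants such as cospi_m32, so treat this as a sketch rather than the exact macro):

#include <stdint.h>

/* Round v to "bit" fractional bits, matching the (v + 2^(bit-1)) >> bit
   idiom used throughout this file. */
static int32_t round_shift(int64_t v, int bit) {
  return (int32_t)((v + ((int64_t)1 << (bit - 1))) >> bit);
}

/* One rotate-and-round butterfly: out0/out1 are a cos/sin mix of in0/in1. */
static void butterfly_type0(int32_t w0, int32_t w1, int32_t in0, int32_t in1,
                            int32_t *out0, int32_t *out1, int bit) {
  *out0 = round_shift((int64_t)w0 * in0 + (int64_t)w1 * in1, bit);
  *out1 = round_shift((int64_t)w0 * in1 - (int64_t)w1 * in0, bit);
}

(The AVX2 macro keeps the products in 32 bits via _mm256_mullo_epi32; the 64-bit intermediate above is only for clarity.)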
+ x3[46] = _mm256_sub_epi32(x2[33], x2[46]); + x3[34] = _mm256_add_epi32(x2[34], x2[45]); + x3[45] = _mm256_sub_epi32(x2[34], x2[45]); + x3[35] = _mm256_add_epi32(x2[35], x2[44]); + x3[44] = _mm256_sub_epi32(x2[35], x2[44]); + x3[36] = _mm256_add_epi32(x2[36], x2[43]); + x3[43] = _mm256_sub_epi32(x2[36], x2[43]); + x3[37] = _mm256_add_epi32(x2[37], x2[42]); + x3[42] = _mm256_sub_epi32(x2[37], x2[42]); + x3[38] = _mm256_add_epi32(x2[38], x2[41]); + x3[41] = _mm256_sub_epi32(x2[38], x2[41]); + x3[39] = _mm256_add_epi32(x2[39], x2[40]); + x3[40] = _mm256_sub_epi32(x2[39], x2[40]); + x3[48] = _mm256_sub_epi32(x2[63], x2[48]); + x3[63] = _mm256_add_epi32(x2[63], x2[48]); + x3[49] = _mm256_sub_epi32(x2[62], x2[49]); + x3[62] = _mm256_add_epi32(x2[62], x2[49]); + x3[50] = _mm256_sub_epi32(x2[61], x2[50]); + x3[61] = _mm256_add_epi32(x2[61], x2[50]); + x3[51] = _mm256_sub_epi32(x2[60], x2[51]); + x3[60] = _mm256_add_epi32(x2[60], x2[51]); + x3[52] = _mm256_sub_epi32(x2[59], x2[52]); + x3[59] = _mm256_add_epi32(x2[59], x2[52]); + x3[53] = _mm256_sub_epi32(x2[58], x2[53]); + x3[58] = _mm256_add_epi32(x2[58], x2[53]); + x3[54] = _mm256_sub_epi32(x2[57], x2[54]); + x3[57] = _mm256_add_epi32(x2[57], x2[54]); + x3[55] = _mm256_sub_epi32(x2[56], x2[55]); + x3[56] = _mm256_add_epi32(x2[56], x2[55]); +} +static INLINE void fdct64_stage4_avx2(__m256i *x3, __m256i *x4, + __m256i *cospi_m32, __m256i *cospi_p32, + __m256i *cospi_m16, __m256i *cospi_p48, + __m256i *cospi_m48, + const __m256i *__rounding, + int8_t cos_bit) { + x4[0] = _mm256_add_epi32(x3[0], x3[7]); + x4[7] = _mm256_sub_epi32(x3[0], x3[7]); + x4[1] = _mm256_add_epi32(x3[1], x3[6]); + x4[6] = _mm256_sub_epi32(x3[1], x3[6]); + x4[2] = _mm256_add_epi32(x3[2], x3[5]); + x4[5] = _mm256_sub_epi32(x3[2], x3[5]); + x4[3] = _mm256_add_epi32(x3[3], x3[4]); + x4[4] = _mm256_sub_epi32(x3[3], x3[4]); + x4[8] = x3[8]; + x4[9] = x3[9]; + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x3[10], x3[13], x4[10], x4[13], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x3[11], x3[12], x4[11], x4[12], + *__rounding, cos_bit); + x4[14] = x3[14]; + x4[15] = x3[15]; + x4[16] = _mm256_add_epi32(x3[16], x3[23]); + x4[23] = _mm256_sub_epi32(x3[16], x3[23]); + x4[17] = _mm256_add_epi32(x3[17], x3[22]); + x4[22] = _mm256_sub_epi32(x3[17], x3[22]); + x4[18] = _mm256_add_epi32(x3[18], x3[21]); + x4[21] = _mm256_sub_epi32(x3[18], x3[21]); + x4[19] = _mm256_add_epi32(x3[19], x3[20]); + x4[20] = _mm256_sub_epi32(x3[19], x3[20]); + x4[24] = _mm256_sub_epi32(x3[31], x3[24]); + x4[31] = _mm256_add_epi32(x3[31], x3[24]); + x4[25] = _mm256_sub_epi32(x3[30], x3[25]); + x4[30] = _mm256_add_epi32(x3[30], x3[25]); + x4[26] = _mm256_sub_epi32(x3[29], x3[26]); + x4[29] = _mm256_add_epi32(x3[29], x3[26]); + x4[27] = _mm256_sub_epi32(x3[28], x3[27]); + x4[28] = _mm256_add_epi32(x3[28], x3[27]); + x4[32] = x3[32]; + x4[33] = x3[33]; + x4[34] = x3[34]; + x4[35] = x3[35]; + btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[36], x3[59], x4[36], x4[59], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[37], x3[58], x4[37], x4[58], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[38], x3[57], x4[38], x4[57], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[39], x3[56], x4[39], x4[56], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[40], x3[55], x4[40], x4[55], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[41], x3[54], x4[41], x4[54], + 
*__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[42], x3[53], x4[42], x4[53], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[43], x3[52], x4[43], x4[52], + *__rounding, cos_bit); + x4[44] = x3[44]; + x4[45] = x3[45]; + x4[46] = x3[46]; + x4[47] = x3[47]; + x4[48] = x3[48]; + x4[49] = x3[49]; + x4[50] = x3[50]; + x4[51] = x3[51]; + x4[60] = x3[60]; + x4[61] = x3[61]; + x4[62] = x3[62]; + x4[63] = x3[63]; +} +static INLINE void fdct64_stage5_avx2(__m256i *x4, __m256i *x5, + __m256i *cospi_m32, __m256i *cospi_p32, + __m256i *cospi_m16, __m256i *cospi_p48, + __m256i *cospi_m48, + const __m256i *__rounding, + int8_t cos_bit) { + x5[0] = _mm256_add_epi32(x4[0], x4[3]); + x5[3] = _mm256_sub_epi32(x4[0], x4[3]); + x5[1] = _mm256_add_epi32(x4[1], x4[2]); + x5[2] = _mm256_sub_epi32(x4[1], x4[2]); + x5[4] = x4[4]; + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x4[5], x4[6], x5[5], x5[6], + *__rounding, cos_bit); + x5[7] = x4[7]; + x5[8] = _mm256_add_epi32(x4[8], x4[11]); + x5[11] = _mm256_sub_epi32(x4[8], x4[11]); + x5[9] = _mm256_add_epi32(x4[9], x4[10]); + x5[10] = _mm256_sub_epi32(x4[9], x4[10]); + x5[12] = _mm256_sub_epi32(x4[15], x4[12]); + x5[15] = _mm256_add_epi32(x4[15], x4[12]); + x5[13] = _mm256_sub_epi32(x4[14], x4[13]); + x5[14] = _mm256_add_epi32(x4[14], x4[13]); + x5[16] = x4[16]; + x5[17] = x4[17]; + btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x4[18], x4[29], x5[18], x5[29], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x4[19], x4[28], x5[19], x5[28], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x4[20], x4[27], x5[20], x5[27], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x4[21], x4[26], x5[21], x5[26], + *__rounding, cos_bit); + x5[22] = x4[22]; + x5[23] = x4[23]; + x5[24] = x4[24]; + x5[25] = x4[25]; + x5[30] = x4[30]; + x5[31] = x4[31]; + x5[32] = _mm256_add_epi32(x4[32], x4[39]); + x5[39] = _mm256_sub_epi32(x4[32], x4[39]); + x5[33] = _mm256_add_epi32(x4[33], x4[38]); + x5[38] = _mm256_sub_epi32(x4[33], x4[38]); + x5[34] = _mm256_add_epi32(x4[34], x4[37]); + x5[37] = _mm256_sub_epi32(x4[34], x4[37]); + x5[35] = _mm256_add_epi32(x4[35], x4[36]); + x5[36] = _mm256_sub_epi32(x4[35], x4[36]); + x5[40] = _mm256_sub_epi32(x4[47], x4[40]); + x5[47] = _mm256_add_epi32(x4[47], x4[40]); + x5[41] = _mm256_sub_epi32(x4[46], x4[41]); + x5[46] = _mm256_add_epi32(x4[46], x4[41]); + x5[42] = _mm256_sub_epi32(x4[45], x4[42]); + x5[45] = _mm256_add_epi32(x4[45], x4[42]); + x5[43] = _mm256_sub_epi32(x4[44], x4[43]); + x5[44] = _mm256_add_epi32(x4[44], x4[43]); + x5[48] = _mm256_add_epi32(x4[48], x4[55]); + x5[55] = _mm256_sub_epi32(x4[48], x4[55]); + x5[49] = _mm256_add_epi32(x4[49], x4[54]); + x5[54] = _mm256_sub_epi32(x4[49], x4[54]); + x5[50] = _mm256_add_epi32(x4[50], x4[53]); + x5[53] = _mm256_sub_epi32(x4[50], x4[53]); + x5[51] = _mm256_add_epi32(x4[51], x4[52]); + x5[52] = _mm256_sub_epi32(x4[51], x4[52]); + x5[56] = _mm256_sub_epi32(x4[63], x4[56]); + x5[63] = _mm256_add_epi32(x4[63], x4[56]); + x5[57] = _mm256_sub_epi32(x4[62], x4[57]); + x5[62] = _mm256_add_epi32(x4[62], x4[57]); + x5[58] = _mm256_sub_epi32(x4[61], x4[58]); + x5[61] = _mm256_add_epi32(x4[61], x4[58]); + x5[59] = _mm256_sub_epi32(x4[60], x4[59]); + x5[60] = _mm256_add_epi32(x4[60], x4[59]); +} +static INLINE void fdct64_stage6_avx2( + __m256i *x5, __m256i *x6, __m256i *cospi_p16, __m256i *cospi_p32, + __m256i *cospi_m16, __m256i *cospi_p48, __m256i *cospi_m48, + __m256i *cospi_m08, 
__m256i *cospi_p56, __m256i *cospi_m56, + __m256i *cospi_m40, __m256i *cospi_p24, __m256i *cospi_m24, + const __m256i *__rounding, int8_t cos_bit) { + btf_32_type0_avx2_new(*cospi_p32, *cospi_p32, x5[0], x5[1], x6[0], x6[1], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_p16, *cospi_p48, x5[3], x5[2], x6[2], x6[3], + *__rounding, cos_bit); + x6[4] = _mm256_add_epi32(x5[4], x5[5]); + x6[5] = _mm256_sub_epi32(x5[4], x5[5]); + x6[6] = _mm256_sub_epi32(x5[7], x5[6]); + x6[7] = _mm256_add_epi32(x5[7], x5[6]); + x6[8] = x5[8]; + btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x5[9], x5[14], x6[9], x6[14], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x5[10], x5[13], x6[10], x6[13], + *__rounding, cos_bit); + x6[11] = x5[11]; + x6[12] = x5[12]; + x6[15] = x5[15]; + x6[16] = _mm256_add_epi32(x5[16], x5[19]); + x6[19] = _mm256_sub_epi32(x5[16], x5[19]); + x6[17] = _mm256_add_epi32(x5[17], x5[18]); + x6[18] = _mm256_sub_epi32(x5[17], x5[18]); + x6[20] = _mm256_sub_epi32(x5[23], x5[20]); + x6[23] = _mm256_add_epi32(x5[23], x5[20]); + x6[21] = _mm256_sub_epi32(x5[22], x5[21]); + x6[22] = _mm256_add_epi32(x5[22], x5[21]); + x6[24] = _mm256_add_epi32(x5[24], x5[27]); + x6[27] = _mm256_sub_epi32(x5[24], x5[27]); + x6[25] = _mm256_add_epi32(x5[25], x5[26]); + x6[26] = _mm256_sub_epi32(x5[25], x5[26]); + x6[28] = _mm256_sub_epi32(x5[31], x5[28]); + x6[31] = _mm256_add_epi32(x5[31], x5[28]); + x6[29] = _mm256_sub_epi32(x5[30], x5[29]); + x6[30] = _mm256_add_epi32(x5[30], x5[29]); + x6[32] = x5[32]; + x6[33] = x5[33]; + btf_32_type0_avx2_new(*cospi_m08, *cospi_p56, x5[34], x5[61], x6[34], x6[61], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m08, *cospi_p56, x5[35], x5[60], x6[35], x6[60], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m56, *cospi_m08, x5[36], x5[59], x6[36], x6[59], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m56, *cospi_m08, x5[37], x5[58], x6[37], x6[58], + *__rounding, cos_bit); + x6[38] = x5[38]; + x6[39] = x5[39]; + x6[40] = x5[40]; + x6[41] = x5[41]; + btf_32_type0_avx2_new(*cospi_m40, *cospi_p24, x5[42], x5[53], x6[42], x6[53], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m40, *cospi_p24, x5[43], x5[52], x6[43], x6[52], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m24, *cospi_m40, x5[44], x5[51], x6[44], x6[51], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m24, *cospi_m40, x5[45], x5[50], x6[45], x6[50], + *__rounding, cos_bit); + x6[46] = x5[46]; + x6[47] = x5[47]; + x6[48] = x5[48]; + x6[49] = x5[49]; + x6[54] = x5[54]; + x6[55] = x5[55]; + x6[56] = x5[56]; + x6[57] = x5[57]; + x6[62] = x5[62]; + x6[63] = x5[63]; +} +static INLINE void fdct64_stage7_avx2(__m256i *x6, __m256i *x7, + __m256i *cospi_p08, __m256i *cospi_p56, + __m256i *cospi_p40, __m256i *cospi_p24, + __m256i *cospi_m08, __m256i *cospi_m56, + __m256i *cospi_m40, __m256i *cospi_m24, + const __m256i *__rounding, + int8_t cos_bit) { + x7[0] = x6[0]; + x7[1] = x6[1]; + x7[2] = x6[2]; + x7[3] = x6[3]; + btf_32_type0_avx2_new(*cospi_p08, *cospi_p56, x6[7], x6[4], x7[4], x7[7], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_p40, *cospi_p24, x6[6], x6[5], x7[5], x7[6], + *__rounding, cos_bit); + x7[8] = _mm256_add_epi32(x6[8], x6[9]); + x7[9] = _mm256_sub_epi32(x6[8], x6[9]); + x7[10] = _mm256_sub_epi32(x6[11], x6[10]); + x7[11] = _mm256_add_epi32(x6[11], x6[10]); + x7[12] = _mm256_add_epi32(x6[12], x6[13]); + x7[13] = _mm256_sub_epi32(x6[12], x6[13]); + x7[14] = _mm256_sub_epi32(x6[15], x6[14]); + x7[15] = 
_mm256_add_epi32(x6[15], x6[14]); + x7[16] = x6[16]; + btf_32_type0_avx2_new(*cospi_m08, *cospi_p56, x6[17], x6[30], x7[17], x7[30], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m56, *cospi_m08, x6[18], x6[29], x7[18], x7[29], + *__rounding, cos_bit); + x7[19] = x6[19]; + x7[20] = x6[20]; + btf_32_type0_avx2_new(*cospi_m40, *cospi_p24, x6[21], x6[26], x7[21], x7[26], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m24, *cospi_m40, x6[22], x6[25], x7[22], x7[25], + *__rounding, cos_bit); + x7[23] = x6[23]; + x7[24] = x6[24]; + x7[27] = x6[27]; + x7[28] = x6[28]; + x7[31] = x6[31]; + x7[32] = _mm256_add_epi32(x6[32], x6[35]); + x7[35] = _mm256_sub_epi32(x6[32], x6[35]); + x7[33] = _mm256_add_epi32(x6[33], x6[34]); + x7[34] = _mm256_sub_epi32(x6[33], x6[34]); + x7[36] = _mm256_sub_epi32(x6[39], x6[36]); + x7[39] = _mm256_add_epi32(x6[39], x6[36]); + x7[37] = _mm256_sub_epi32(x6[38], x6[37]); + x7[38] = _mm256_add_epi32(x6[38], x6[37]); + x7[40] = _mm256_add_epi32(x6[40], x6[43]); + x7[43] = _mm256_sub_epi32(x6[40], x6[43]); + x7[41] = _mm256_add_epi32(x6[41], x6[42]); + x7[42] = _mm256_sub_epi32(x6[41], x6[42]); + x7[44] = _mm256_sub_epi32(x6[47], x6[44]); + x7[47] = _mm256_add_epi32(x6[47], x6[44]); + x7[45] = _mm256_sub_epi32(x6[46], x6[45]); + x7[46] = _mm256_add_epi32(x6[46], x6[45]); + x7[48] = _mm256_add_epi32(x6[48], x6[51]); + x7[51] = _mm256_sub_epi32(x6[48], x6[51]); + x7[49] = _mm256_add_epi32(x6[49], x6[50]); + x7[50] = _mm256_sub_epi32(x6[49], x6[50]); + x7[52] = _mm256_sub_epi32(x6[55], x6[52]); + x7[55] = _mm256_add_epi32(x6[55], x6[52]); + x7[53] = _mm256_sub_epi32(x6[54], x6[53]); + x7[54] = _mm256_add_epi32(x6[54], x6[53]); + x7[56] = _mm256_add_epi32(x6[56], x6[59]); + x7[59] = _mm256_sub_epi32(x6[56], x6[59]); + x7[57] = _mm256_add_epi32(x6[57], x6[58]); + x7[58] = _mm256_sub_epi32(x6[57], x6[58]); + x7[60] = _mm256_sub_epi32(x6[63], x6[60]); + x7[63] = _mm256_add_epi32(x6[63], x6[60]); + x7[61] = _mm256_sub_epi32(x6[62], x6[61]); + x7[62] = _mm256_add_epi32(x6[62], x6[61]); +} +static INLINE void fdct64_stage8_avx2(__m256i *x7, __m256i *x8, + const int32_t *cospi, + const __m256i *__rounding, + int8_t cos_bit) { + __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]); + __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]); + __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]); + __m256i cospi_p36 = _mm256_set1_epi32(cospi[36]); + __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]); + __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]); + __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]); + __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]); + __m256i cospi_m04 = _mm256_set1_epi32(-cospi[4]); + __m256i cospi_m60 = _mm256_set1_epi32(-cospi[60]); + __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]); + __m256i cospi_m28 = _mm256_set1_epi32(-cospi[28]); + __m256i cospi_m20 = _mm256_set1_epi32(-cospi[20]); + __m256i cospi_m44 = _mm256_set1_epi32(-cospi[44]); + __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]); + __m256i cospi_m12 = _mm256_set1_epi32(-cospi[12]); + + x8[0] = x7[0]; + x8[1] = x7[1]; + x8[2] = x7[2]; + x8[3] = x7[3]; + x8[4] = x7[4]; + x8[5] = x7[5]; + x8[6] = x7[6]; + x8[7] = x7[7]; + + btf_32_type0_avx2_new(cospi_p04, cospi_p60, x7[15], x7[8], x8[8], x8[15], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p36, cospi_p28, x7[14], x7[9], x8[9], x8[14], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p20, cospi_p44, x7[13], x7[10], x8[10], x8[13], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p52, cospi_p12, x7[12], x7[11], x8[11], x8[12], + 
*__rounding, cos_bit); + x8[16] = _mm256_add_epi32(x7[16], x7[17]); + x8[17] = _mm256_sub_epi32(x7[16], x7[17]); + x8[18] = _mm256_sub_epi32(x7[19], x7[18]); + x8[19] = _mm256_add_epi32(x7[19], x7[18]); + x8[20] = _mm256_add_epi32(x7[20], x7[21]); + x8[21] = _mm256_sub_epi32(x7[20], x7[21]); + x8[22] = _mm256_sub_epi32(x7[23], x7[22]); + x8[23] = _mm256_add_epi32(x7[23], x7[22]); + x8[24] = _mm256_add_epi32(x7[24], x7[25]); + x8[25] = _mm256_sub_epi32(x7[24], x7[25]); + x8[26] = _mm256_sub_epi32(x7[27], x7[26]); + x8[27] = _mm256_add_epi32(x7[27], x7[26]); + x8[28] = _mm256_add_epi32(x7[28], x7[29]); + x8[29] = _mm256_sub_epi32(x7[28], x7[29]); + x8[30] = _mm256_sub_epi32(x7[31], x7[30]); + x8[31] = _mm256_add_epi32(x7[31], x7[30]); + x8[32] = x7[32]; + btf_32_type0_avx2_new(cospi_m04, cospi_p60, x7[33], x7[62], x8[33], x8[62], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_m60, cospi_m04, x7[34], x7[61], x8[34], x8[61], + *__rounding, cos_bit); + x8[35] = x7[35]; + x8[36] = x7[36]; + btf_32_type0_avx2_new(cospi_m36, cospi_p28, x7[37], x7[58], x8[37], x8[58], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_m28, cospi_m36, x7[38], x7[57], x8[38], x8[57], + *__rounding, cos_bit); + x8[39] = x7[39]; + x8[40] = x7[40]; + btf_32_type0_avx2_new(cospi_m20, cospi_p44, x7[41], x7[54], x8[41], x8[54], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_m44, cospi_m20, x7[42], x7[53], x8[42], x8[53], + *__rounding, cos_bit); + x8[43] = x7[43]; + x8[44] = x7[44]; + btf_32_type0_avx2_new(cospi_m52, cospi_p12, x7[45], x7[50], x8[45], x8[50], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_m12, cospi_m52, x7[46], x7[49], x8[46], x8[49], + *__rounding, cos_bit); + x8[47] = x7[47]; + x8[48] = x7[48]; + x8[51] = x7[51]; + x8[52] = x7[52]; + x8[55] = x7[55]; + x8[56] = x7[56]; + x8[59] = x7[59]; + x8[60] = x7[60]; + x8[63] = x7[63]; +} +static INLINE void fdct64_stage9_avx2(__m256i *x8, __m256i *x9, + const int32_t *cospi, + const __m256i *__rounding, + int8_t cos_bit) { + __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]); + __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]); + __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]); + __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]); + __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]); + __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]); + __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]); + __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]); + __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]); + __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]); + __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]); + __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]); + __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]); + __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]); + __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]); + __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]); + + x9[0] = x8[0]; + x9[1] = x8[1]; + x9[2] = x8[2]; + x9[3] = x8[3]; + x9[4] = x8[4]; + x9[5] = x8[5]; + x9[6] = x8[6]; + x9[7] = x8[7]; + x9[8] = x8[8]; + x9[9] = x8[9]; + x9[10] = x8[10]; + x9[11] = x8[11]; + x9[12] = x8[12]; + x9[13] = x8[13]; + x9[14] = x8[14]; + x9[15] = x8[15]; + btf_32_type0_avx2_new(cospi_p02, cospi_p62, x8[31], x8[16], x9[16], x9[31], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p34, cospi_p30, x8[30], x8[17], x9[17], x9[30], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p18, cospi_p46, x8[29], x8[18], x9[18], x9[29], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p50, cospi_p14, x8[28], x8[19], x9[19], x9[28], + *__rounding, cos_bit); + 
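The cospi_pNN/cospi_mNN vectors broadcast entries of the fixed-point cosine table returned by cospi_arr(cos_bit). Assuming the usual libaom definition, cospi[i] = round(cos(i*pi/128) * 2^cos_bit) for i = 0..63, the table can be regenerated with a standalone sketch like this (link with -lm; M_PI assumes a POSIX math.h):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int cos_bit = 12; /* the encoder uses cos_bit values around 10..13 */
  int32_t cospi[64];
  for (int i = 0; i < 64; i++)
    cospi[i] = (int32_t)round(cos(i * M_PI / 128) * (1 << cos_bit));
  /* cos(pi/4) * 4096 = 2896.3..., so cospi[32] should print 2896 */
  printf("cospi[32] = %d\n", cospi[32]);
  return 0;
}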
btf_32_type0_avx2_new(cospi_p10, cospi_p54, x8[27], x8[20], x9[20], x9[27], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p42, cospi_p22, x8[26], x8[21], x9[21], x9[26], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p26, cospi_p38, x8[25], x8[22], x9[22], x9[25], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p58, cospi_p06, x8[24], x8[23], x9[23], x9[24], + *__rounding, cos_bit); + x9[32] = _mm256_add_epi32(x8[32], x8[33]); + x9[33] = _mm256_sub_epi32(x8[32], x8[33]); + x9[34] = _mm256_sub_epi32(x8[35], x8[34]); + x9[35] = _mm256_add_epi32(x8[35], x8[34]); + x9[36] = _mm256_add_epi32(x8[36], x8[37]); + x9[37] = _mm256_sub_epi32(x8[36], x8[37]); + x9[38] = _mm256_sub_epi32(x8[39], x8[38]); + x9[39] = _mm256_add_epi32(x8[39], x8[38]); + x9[40] = _mm256_add_epi32(x8[40], x8[41]); + x9[41] = _mm256_sub_epi32(x8[40], x8[41]); + x9[42] = _mm256_sub_epi32(x8[43], x8[42]); + x9[43] = _mm256_add_epi32(x8[43], x8[42]); + x9[44] = _mm256_add_epi32(x8[44], x8[45]); + x9[45] = _mm256_sub_epi32(x8[44], x8[45]); + x9[46] = _mm256_sub_epi32(x8[47], x8[46]); + x9[47] = _mm256_add_epi32(x8[47], x8[46]); + x9[48] = _mm256_add_epi32(x8[48], x8[49]); + x9[49] = _mm256_sub_epi32(x8[48], x8[49]); + x9[50] = _mm256_sub_epi32(x8[51], x8[50]); + x9[51] = _mm256_add_epi32(x8[51], x8[50]); + x9[52] = _mm256_add_epi32(x8[52], x8[53]); + x9[53] = _mm256_sub_epi32(x8[52], x8[53]); + x9[54] = _mm256_sub_epi32(x8[55], x8[54]); + x9[55] = _mm256_add_epi32(x8[55], x8[54]); + x9[56] = _mm256_add_epi32(x8[56], x8[57]); + x9[57] = _mm256_sub_epi32(x8[56], x8[57]); + x9[58] = _mm256_sub_epi32(x8[59], x8[58]); + x9[59] = _mm256_add_epi32(x8[59], x8[58]); + x9[60] = _mm256_add_epi32(x8[60], x8[61]); + x9[61] = _mm256_sub_epi32(x8[60], x8[61]); + x9[62] = _mm256_sub_epi32(x8[63], x8[62]); + x9[63] = _mm256_add_epi32(x8[63], x8[62]); +} +static INLINE void fdct64_stage10_avx2(__m256i *x9, __m256i *x10, + const int32_t *cospi, + const __m256i *__rounding, + int8_t cos_bit) { + __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]); + __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]); + __m256i cospi_p31 = _mm256_set1_epi32(cospi[31]); + __m256i cospi_p33 = _mm256_set1_epi32(cospi[33]); + __m256i cospi_p47 = _mm256_set1_epi32(cospi[47]); + __m256i cospi_p17 = _mm256_set1_epi32(cospi[17]); + __m256i cospi_p15 = _mm256_set1_epi32(cospi[15]); + __m256i cospi_p49 = _mm256_set1_epi32(cospi[49]); + __m256i cospi_p55 = _mm256_set1_epi32(cospi[55]); + __m256i cospi_p09 = _mm256_set1_epi32(cospi[9]); + __m256i cospi_p23 = _mm256_set1_epi32(cospi[23]); + __m256i cospi_p41 = _mm256_set1_epi32(cospi[41]); + __m256i cospi_p39 = _mm256_set1_epi32(cospi[39]); + __m256i cospi_p25 = _mm256_set1_epi32(cospi[25]); + __m256i cospi_p07 = _mm256_set1_epi32(cospi[7]); + __m256i cospi_p57 = _mm256_set1_epi32(cospi[57]); + __m256i cospi_p59 = _mm256_set1_epi32(cospi[59]); + __m256i cospi_p05 = _mm256_set1_epi32(cospi[5]); + __m256i cospi_p27 = _mm256_set1_epi32(cospi[27]); + __m256i cospi_p37 = _mm256_set1_epi32(cospi[37]); + __m256i cospi_p43 = _mm256_set1_epi32(cospi[43]); + __m256i cospi_p21 = _mm256_set1_epi32(cospi[21]); + __m256i cospi_p11 = _mm256_set1_epi32(cospi[11]); + __m256i cospi_p53 = _mm256_set1_epi32(cospi[53]); + __m256i cospi_p51 = _mm256_set1_epi32(cospi[51]); + __m256i cospi_p13 = _mm256_set1_epi32(cospi[13]); + __m256i cospi_p19 = _mm256_set1_epi32(cospi[19]); + __m256i cospi_p45 = _mm256_set1_epi32(cospi[45]); + __m256i cospi_p35 = _mm256_set1_epi32(cospi[35]); + __m256i cospi_p29 = _mm256_set1_epi32(cospi[29]); + 
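A detail worth noting about the stage-9 and stage-10 constants: they are consumed in pairs whose indices sum to 64 (p02 with p62, p01 with p63, p33 with p31, and so on), because cos((64 - k)*pi/128) = sin(k*pi/128), so each pair supplies the cos/sin of a single rotation angle. A quick self-check of that identity in the fixed-point domain (my own sketch, not part of the patch; link with -lm):

#include <math.h>
#include <stdio.h>

int main(void) {
  const int cos_bit = 12;
  for (int k = 1; k < 64; k += 8) {
    long c = lround(cos(k * M_PI / 128) * (1 << cos_bit));
    long s = lround(cos((64 - k) * M_PI / 128) * (1 << cos_bit));
    /* c^2 + s^2 should be ~2^(2*cos_bit) if (c, s) are cos/sin of one angle */
    printf("k=%2d: (c*c + s*s) / 2^24 = %.4f\n", k,
           (double)(c * c + s * s) / (1L << (2 * cos_bit)));
  }
  return 0;
}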
__m256i cospi_p03 = _mm256_set1_epi32(cospi[3]); + __m256i cospi_p61 = _mm256_set1_epi32(cospi[61]); + + x10[0] = x9[0]; + x10[1] = x9[1]; + x10[2] = x9[2]; + x10[3] = x9[3]; + x10[4] = x9[4]; + x10[5] = x9[5]; + x10[6] = x9[6]; + x10[7] = x9[7]; + x10[8] = x9[8]; + x10[9] = x9[9]; + x10[10] = x9[10]; + x10[11] = x9[11]; + x10[12] = x9[12]; + x10[13] = x9[13]; + x10[14] = x9[14]; + x10[15] = x9[15]; + x10[16] = x9[16]; + x10[17] = x9[17]; + x10[18] = x9[18]; + x10[19] = x9[19]; + x10[20] = x9[20]; + x10[21] = x9[21]; + x10[22] = x9[22]; + x10[23] = x9[23]; + x10[24] = x9[24]; + x10[25] = x9[25]; + x10[26] = x9[26]; + x10[27] = x9[27]; + x10[28] = x9[28]; + x10[29] = x9[29]; + x10[30] = x9[30]; + x10[31] = x9[31]; + btf_32_type0_avx2_new(cospi_p01, cospi_p63, x9[63], x9[32], x10[32], x10[63], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p33, cospi_p31, x9[62], x9[33], x10[33], x10[62], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p17, cospi_p47, x9[61], x9[34], x10[34], x10[61], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p49, cospi_p15, x9[60], x9[35], x10[35], x10[60], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p09, cospi_p55, x9[59], x9[36], x10[36], x10[59], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p41, cospi_p23, x9[58], x9[37], x10[37], x10[58], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p25, cospi_p39, x9[57], x9[38], x10[38], x10[57], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p57, cospi_p07, x9[56], x9[39], x10[39], x10[56], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p05, cospi_p59, x9[55], x9[40], x10[40], x10[55], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p37, cospi_p27, x9[54], x9[41], x10[41], x10[54], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p21, cospi_p43, x9[53], x9[42], x10[42], x10[53], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p53, cospi_p11, x9[52], x9[43], x10[43], x10[52], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p13, cospi_p51, x9[51], x9[44], x10[44], x10[51], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p45, cospi_p19, x9[50], x9[45], x10[45], x10[50], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p29, cospi_p35, x9[49], x9[46], x10[46], x10[49], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p61, cospi_p03, x9[48], x9[47], x10[47], x10[48], + *__rounding, cos_bit); +} +static void fdct64_avx2(__m256i *input, __m256i *output, int8_t cos_bit, + const int instride, const int outstride) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1)); + __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]); + __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]); + __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]); + __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]); + __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]); + __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]); + __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]); + __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]); + __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]); + __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]); + __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]); + __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]); + __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]); + __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]); + + int startidx = 0 * instride; + int endidx = 63 * instride; + // stage 1 + __m256i x1[64]; + x1[0] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[63] = 
_mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[1] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[62] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[2] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[61] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[3] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[60] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[4] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[59] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[5] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[58] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[6] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[57] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[7] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[56] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[8] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[55] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[9] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[54] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[10] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[53] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[11] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[52] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[12] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[51] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[13] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[50] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[14] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[49] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[15] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[48] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[16] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[47] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[17] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[46] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[18] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[45] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[19] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[44] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[20] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[43] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[21] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[42] = _mm256_sub_epi32(input[startidx], 
input[endidx]); + startidx += instride; + endidx -= instride; + x1[22] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[41] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[23] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[40] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[24] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[39] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[25] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[38] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[26] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[37] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[27] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[36] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[28] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[35] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[29] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[34] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[30] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[33] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[31] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[32] = _mm256_sub_epi32(input[startidx], input[endidx]); + + // stage 2 + __m256i x2[64]; + fdct64_stage2_avx2(x1, x2, &cospi_m32, &cospi_p32, &__rounding, cos_bit); + // stage 3 + fdct64_stage3_avx2(x2, x1, &cospi_m32, &cospi_p32, &__rounding, cos_bit); + // stage 4 + fdct64_stage4_avx2(x1, x2, &cospi_m32, &cospi_p32, &cospi_m16, &cospi_p48, + &cospi_m48, &__rounding, cos_bit); + // stage 5 + fdct64_stage5_avx2(x2, x1, &cospi_m32, &cospi_p32, &cospi_m16, &cospi_p48, + &cospi_m48, &__rounding, cos_bit); + // stage 6 + fdct64_stage6_avx2(x1, x2, &cospi_p16, &cospi_p32, &cospi_m16, &cospi_p48, + &cospi_m48, &cospi_m08, &cospi_p56, &cospi_m56, &cospi_m40, + &cospi_p24, &cospi_m24, &__rounding, cos_bit); + // stage 7 + fdct64_stage7_avx2(x2, x1, &cospi_p08, &cospi_p56, &cospi_p40, &cospi_p24, + &cospi_m08, &cospi_m56, &cospi_m40, &cospi_m24, + &__rounding, cos_bit); + // stage 8 + fdct64_stage8_avx2(x1, x2, cospi, &__rounding, cos_bit); + // stage 9 + fdct64_stage9_avx2(x2, x1, cospi, &__rounding, cos_bit); + // stage 10 + fdct64_stage10_avx2(x1, x2, cospi, &__rounding, cos_bit); + + startidx = 0 * outstride; + endidx = 63 * outstride; + + // stage 11 + output[startidx] = x2[0]; + output[endidx] = x2[63]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[32]; + output[endidx] = x2[31]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[16]; + output[endidx] = x2[47]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[48]; + output[endidx] = x2[15]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[8]; + output[endidx] = x2[55]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[40]; + output[endidx] = x2[23]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[24]; + output[endidx] = x2[39]; + startidx += outstride; + endidx -= outstride; + output[startidx] = 
x2[56]; + output[endidx] = x2[7]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[4]; + output[endidx] = x2[59]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[36]; + output[endidx] = x2[27]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[20]; + output[endidx] = x2[43]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[52]; + output[endidx] = x2[11]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[12]; + output[endidx] = x2[51]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[44]; + output[endidx] = x2[19]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[28]; + output[endidx] = x2[35]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[60]; + output[endidx] = x2[3]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[2]; + output[endidx] = x2[61]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[34]; + output[endidx] = x2[29]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[18]; + output[endidx] = x2[45]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[50]; + output[endidx] = x2[13]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[10]; + output[endidx] = x2[53]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[42]; + output[endidx] = x2[21]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[26]; + output[endidx] = x2[37]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[58]; + output[endidx] = x2[5]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[6]; + output[endidx] = x2[57]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[38]; + output[endidx] = x2[25]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[22]; + output[endidx] = x2[41]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[54]; + output[endidx] = x2[9]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[14]; + output[endidx] = x2[49]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[46]; + output[endidx] = x2[17]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[30]; + output[endidx] = x2[33]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[62]; + output[endidx] = x2[1]; +} +void av1_fwd_txfm2d_64x64_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_64X64; + __m256i buf0[512], buf1[512]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = fdct64_avx2; + const transform_1d_avx2 row_txfm = fdct64_avx2; + const int width_div16 = (width >> 4); + const int width_div8 = (width >> 3); + int r, c; + for (int i = 0; i < width_div16; i++) { + load_buffer_16xn_avx2(input + (i << 4), &buf0[i << 1], stride, height, + width_div8, 0, 0); + round_shift_32_8xn_avx2(&buf0[i << 1], height, shift[0], width_div8); + 
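Stepping back from the intrinsics: every av1_fwd_txfm2d_* wrapper in this patch follows the same separable recipe -- scale by shift[0], run the column transform, scale by shift[1], transpose, run the row transform, scale by shift[2]. The interleaved startidx/endidx stores above scatter the 1-D butterfly outputs, which emerge in bit-reversed index order from the decimation structure, back to natural frequency order, and for the 64-point transform only the top-left 32x32 coefficients are retained (hence store_buffer_avx2(buf0, output, 8, 128)). A plain-C model of the recipe, with illustrative names and none of the SIMD blocking or transpose-related layout:

#include <stdint.h>

typedef void (*txfm_1d_fn)(const int32_t *in, int32_t *out, int n);

/* shift[i] > 0 scales up; shift[i] < 0 rounds and scales down */
static int32_t round_shift_signed(int64_t v, int s) {
  return (int32_t)(s >= 0 ? v << s : (v + ((int64_t)1 << (-s - 1))) >> -s);
}

/* Separable 2-D forward transform model, n <= 64, output row-major. */
static void fwd_txfm2d_model(const int16_t *input, int stride, int32_t *output,
                             int n, txfm_1d_fn col_txfm, txfm_1d_fn row_txfm,
                             const int8_t *shift) {
  int32_t buf[64 * 64], v[64], t[64];
  for (int c = 0; c < n; ++c) { /* column pass */
    for (int r = 0; r < n; ++r)
      v[r] = round_shift_signed(input[r * stride + c], shift[0]);
    col_txfm(v, t, n);
    for (int r = 0; r < n; ++r)
      buf[r * n + c] = round_shift_signed(t[r], shift[1]);
  }
  for (int r = 0; r < n; ++r) { /* row pass */
    row_txfm(&buf[r * n], t, n);
    for (int c = 0; c < n; ++c)
      output[r * n + c] = round_shift_signed(t[c], shift[2]);
  }
}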
round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[0], width_div8); + col_txfm(&buf0[i << 1], &buf0[i << 1], cos_bit_col, width_div8, width_div8); + col_txfm(&buf0[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_col, width_div8, + width_div8); + round_shift_32_8xn_avx2(&buf0[i << 1], height, shift[1], width_div8); + round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[1], width_div8); + } + + for (r = 0; r < height; r += 8) { + for (c = 0; c < width_div8; c++) { + fwd_txfm_transpose_8x8_avx2(&buf0[r * width_div8 + c], + &buf1[c * 8 * width_div8 + (r >> 3)], + width_div8, width_div8); + } + } + + for (int i = 0; i < 2; i++) { + row_txfm(&buf1[i << 1], &buf0[i << 1], cos_bit_row, width_div8, + width_div16); + row_txfm(&buf1[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_row, width_div8, + width_div16); + round_shift_32_8xn_avx2(&buf0[i << 1], (height >> 1), shift[2], + width_div16); + round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], (height >> 1), shift[2], + width_div16); + } + + store_buffer_avx2(buf0, output, 8, 128); +} diff --git a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c new file mode 100644 index 0000000000..158b4ae439 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c @@ -0,0 +1,2629 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include <assert.h> +#include <smmintrin.h> /* SSE4.1 */ + +#include "aom_dsp/txfm_common.h" +#include "aom_dsp/x86/transpose_sse2.h" +#include "aom_dsp/x86/txfm_common_sse2.h" +#include "aom_ports/mem.h" +#include "av1/common/av1_txfm.h" +#include "av1/common/x86/highbd_txfm_utility_sse4.h" +#include "av1/encoder/av1_fwd_txfm1d_cfg.h" +#include "av1/encoder/x86/av1_txfm1d_sse4.h" +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +static INLINE void store_output_w4(int32_t *const out, const __m128i *const in, + const int stride, const int out_size) { + for (int i = 0; i < out_size; ++i) { + _mm_store_si128((__m128i *)(out + i * stride), in[i]); + } +} + +void av1_fwht4x4_sse4_1(const int16_t *input, tran_low_t *output, int stride) { + __m128i in[4]; + in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); + in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); + in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); + in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); + + // Convert to int32_t.
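For reference, the lifting sequence that follows (and repeats after the 4x4 transpose) computes a 4-point Walsh-Hadamard transform. A scalar transcription of one pass, taken directly from the comments in the SIMD code below:

#include <stdint.h>

static void fwht4(int32_t v[4]) {
  int32_t a1 = v[0], b1 = v[1], c1 = v[2], d1 = v[3];
  a1 += b1;                    /* a1 += b1 */
  d1 -= c1;                    /* d1 = d1 - c1 */
  int32_t e1 = (a1 - d1) >> 1; /* e1 = (a1 - d1) >> 1 */
  b1 = e1 - b1;                /* b1 = e1 - b1 */
  c1 = e1 - c1;                /* c1 = e1 - c1 */
  a1 -= c1;                    /* a1 -= c1 */
  d1 += b1;                    /* d1 += b1 */
  v[0] = a1;                   /* note the (a1, c1, d1, b1) output order */
  v[1] = c1;
  v[2] = d1;
  v[3] = b1;
}

The SIMD routine runs this once per column, transposes, runs it again, then scales everything by UNIT_QUANT_SHIFT before storing.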
+ __m128i op[4]; + op[0] = _mm_cvtepi16_epi32(in[0]); + op[1] = _mm_cvtepi16_epi32(in[1]); + op[2] = _mm_cvtepi16_epi32(in[2]); + op[3] = _mm_cvtepi16_epi32(in[3]); + + for (int i = 0; i < 2; ++i) { + __m128i a1 = op[0]; + __m128i b1 = op[1]; + __m128i c1 = op[2]; + __m128i d1 = op[3]; + __m128i e1; + + a1 = _mm_add_epi32(a1, b1); // a1 += b1 + d1 = _mm_sub_epi32(d1, c1); // d1 = d1 - c1 + e1 = _mm_sub_epi32(a1, d1); // e1 = (a1 - d1) >> 1 + e1 = _mm_srai_epi32(e1, 1); + b1 = _mm_sub_epi32(e1, b1); // b1 = e1 - b1 + c1 = _mm_sub_epi32(e1, c1); // c1 = e1 - c1 + a1 = _mm_sub_epi32(a1, c1); // a1 -= c1 + d1 = _mm_add_epi32(d1, b1); // d1 += b1 + + op[0] = a1; + op[1] = c1; + op[2] = d1; + op[3] = b1; + + if (i == 0) { + transpose_32bit_4x4(op, op); + } + } + + op[0] = _mm_slli_epi32(op[0], UNIT_QUANT_SHIFT); + op[1] = _mm_slli_epi32(op[1], UNIT_QUANT_SHIFT); + op[2] = _mm_slli_epi32(op[2], UNIT_QUANT_SHIFT); + op[3] = _mm_slli_epi32(op[3], UNIT_QUANT_SHIFT); + + _mm_storeu_si128((__m128i *)(output + 0), op[0]); + _mm_storeu_si128((__m128i *)(output + 4), op[1]); + _mm_storeu_si128((__m128i *)(output + 8), op[2]); + _mm_storeu_si128((__m128i *)(output + 12), op[3]); +} + +static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in, + int stride, int flipud, int fliplr, + int shift) { + if (!flipud) { + in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); + in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); + in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); + in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); + } else { + in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); + in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); + in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); + in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); + } + + if (fliplr) { + in[0] = _mm_shufflelo_epi16(in[0], 0x1b); + in[1] = _mm_shufflelo_epi16(in[1], 0x1b); + in[2] = _mm_shufflelo_epi16(in[2], 0x1b); + in[3] = _mm_shufflelo_epi16(in[3], 0x1b); + } + + in[0] = _mm_cvtepi16_epi32(in[0]); + in[1] = _mm_cvtepi16_epi32(in[1]); + in[2] = _mm_cvtepi16_epi32(in[2]); + in[3] = _mm_cvtepi16_epi32(in[3]); + + in[0] = _mm_slli_epi32(in[0], shift); + in[1] = _mm_slli_epi32(in[1], shift); + in[2] = _mm_slli_epi32(in[2], shift); + in[3] = _mm_slli_epi32(in[3], shift); +} + +// We only use stage-2 bit; +// shift[0] is used in load_buffer_4x4() +// shift[1] is used in txfm_func_col() +// shift[2] is used in txfm_func_row() +static void fdct4x4_sse4_1(__m128i *in, __m128i *out, int bit, + const int num_col) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + __m128i s0, s1, s2, s3; + __m128i u0, u1, u2, u3; + __m128i v0, v1, v2, v3; + + int endidx = 3 * num_col; + s0 = _mm_add_epi32(in[0], in[endidx]); + s3 = _mm_sub_epi32(in[0], in[endidx]); + endidx -= num_col; + s1 = _mm_add_epi32(in[num_col], in[endidx]); + s2 = _mm_sub_epi32(in[num_col], in[endidx]); + + // btf_32_sse4_1_type0(cospi32, cospi32, s[01], u[02], bit); + u0 = _mm_mullo_epi32(s0, cospi32); + u1 = _mm_mullo_epi32(s1, cospi32); + u2 = _mm_add_epi32(u0, u1); + v0 = _mm_sub_epi32(u0, u1); + + u3 = _mm_add_epi32(u2, rnding); + v1 = _mm_add_epi32(v0, rnding); + + u0 = _mm_srai_epi32(u3, bit); + u2 = _mm_srai_epi32(v1, bit); + + // 
btf_32_sse4_1_type1(cospi48, cospi16, s[23], u[13], bit); + v0 = _mm_mullo_epi32(s2, cospi48); + v1 = _mm_mullo_epi32(s3, cospi16); + v2 = _mm_add_epi32(v0, v1); + + v3 = _mm_add_epi32(v2, rnding); + u1 = _mm_srai_epi32(v3, bit); + + v0 = _mm_mullo_epi32(s2, cospi16); + v1 = _mm_mullo_epi32(s3, cospi48); + v2 = _mm_sub_epi32(v1, v0); + + v3 = _mm_add_epi32(v2, rnding); + u3 = _mm_srai_epi32(v3, bit); + + // Note: shift[1] and shift[2] are zeros + + out[0] = u0; + out[1] = u1; + out[2] = u2; + out[3] = u3; +} + +static INLINE void write_buffer_4x4(__m128i *res, int32_t *output) { + _mm_store_si128((__m128i *)(output + 0 * 4), res[0]); + _mm_store_si128((__m128i *)(output + 1 * 4), res[1]); + _mm_store_si128((__m128i *)(output + 2 * 4), res[2]); + _mm_store_si128((__m128i *)(output + 3 * 4), res[3]); +} + +static void fadst4x4_sse4_1(__m128i *in, __m128i *out, int bit, + const int num_col) { + const int32_t *sinpi = sinpi_arr(bit); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]); + const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]); + const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]); + const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]); + __m128i t; + __m128i s0, s1, s2, s3, s4, s5, s6, s7; + __m128i x0, x1, x2, x3; + __m128i u0, u1, u2, u3; + + int idx = 0 * num_col; + s0 = _mm_mullo_epi32(in[idx], sinpi1); + s1 = _mm_mullo_epi32(in[idx], sinpi4); + t = _mm_add_epi32(in[idx], in[idx + num_col]); + idx += num_col; + s2 = _mm_mullo_epi32(in[idx], sinpi2); + s3 = _mm_mullo_epi32(in[idx], sinpi1); + idx += num_col; + s4 = _mm_mullo_epi32(in[idx], sinpi3); + idx += num_col; + s5 = _mm_mullo_epi32(in[idx], sinpi4); + s6 = _mm_mullo_epi32(in[idx], sinpi2); + s7 = _mm_sub_epi32(t, in[idx]); + + t = _mm_add_epi32(s0, s2); + x0 = _mm_add_epi32(t, s5); + x1 = _mm_mullo_epi32(s7, sinpi3); + t = _mm_sub_epi32(s1, s3); + x2 = _mm_add_epi32(t, s6); + x3 = s4; + + s0 = _mm_add_epi32(x0, x3); + s1 = x1; + s2 = _mm_sub_epi32(x2, x3); + t = _mm_sub_epi32(x2, x0); + s3 = _mm_add_epi32(t, x3); + + u0 = _mm_add_epi32(s0, rnding); + u0 = _mm_srai_epi32(u0, bit); + + u1 = _mm_add_epi32(s1, rnding); + u1 = _mm_srai_epi32(u1, bit); + + u2 = _mm_add_epi32(s2, rnding); + u2 = _mm_srai_epi32(u2, bit); + + u3 = _mm_add_epi32(s3, rnding); + u3 = _mm_srai_epi32(u3, bit); + + out[0] = u0; + out[1] = u1; + out[2] = u2; + out[3] = u3; +} +static void idtx4x4_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) { + (void)bit; + __m128i fact = _mm_set1_epi32(NewSqrt2); + __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1)); + __m128i a_low; + + for (int i = 0; i < 4; i++) { + a_low = _mm_mullo_epi32(in[i * col_num], fact); + a_low = _mm_add_epi32(a_low, offset); + out[i] = _mm_srai_epi32(a_low, NewSqrt2Bits); + } +} +void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff, + int input_stride, TX_TYPE tx_type, int bd) { + __m128i in[4]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X4]; + const int txw_idx = get_txw_idx(TX_4X4); + const int txh_idx = get_txh_idx(TX_4X4); + + switch (tx_type) { + case DCT_DCT: + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + transpose_32bit_4x4(in, in); + fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case ADST_DCT: + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + 
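An aside on the surrounding switch: every case is the same load / column transform / transpose / row transform / store pipeline, and only the 1-D kernels plus the load-time flips differ (a FLIPADST is just an ADST on flipped input). The full mapping, in the same TX_TYPE order as the kernel arrays earlier in this patch, written as a table-driven sketch with illustrative names:

/* kernel kinds used by this file: forward DCT, forward ADST, identity */
enum { K_DCT, K_ADST, K_IDTX };

typedef struct {
  int vert, horz;     /* 1-D kernel applied to columns / rows */
  int flipud, fliplr; /* flips applied by load_buffer_4x4() */
} txfm2d_cfg;

static const txfm2d_cfg kTxfm2dCfg4x4[16] = {
  { K_DCT, K_DCT, 0, 0 },   /* DCT_DCT */
  { K_ADST, K_DCT, 0, 0 },  /* ADST_DCT */
  { K_DCT, K_ADST, 0, 0 },  /* DCT_ADST */
  { K_ADST, K_ADST, 0, 0 }, /* ADST_ADST */
  { K_ADST, K_DCT, 1, 0 },  /* FLIPADST_DCT */
  { K_DCT, K_ADST, 0, 1 },  /* DCT_FLIPADST */
  { K_ADST, K_ADST, 1, 1 }, /* FLIPADST_FLIPADST */
  { K_ADST, K_ADST, 0, 1 }, /* ADST_FLIPADST */
  { K_ADST, K_ADST, 1, 0 }, /* FLIPADST_ADST */
  { K_IDTX, K_IDTX, 0, 0 }, /* IDTX */
  { K_DCT, K_IDTX, 0, 0 },  /* V_DCT */
  { K_IDTX, K_DCT, 0, 0 },  /* H_DCT */
  { K_ADST, K_IDTX, 0, 0 }, /* V_ADST */
  { K_IDTX, K_ADST, 0, 0 }, /* H_ADST */
  { K_ADST, K_IDTX, 1, 0 }, /* V_FLIPADST */
  { K_IDTX, K_ADST, 0, 1 }, /* H_FLIPADST */
};

Note that the identity kernel idtx4x4_sse4_1 is not a pure copy: it scales by NewSqrt2 / 2^NewSqrt2Bits (about sqrt(2)) so the identity path carries the same gain as the trigonometric kernels.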
transpose_32bit_4x4(in, in); + fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case DCT_ADST: + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + transpose_32bit_4x4(in, in); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case ADST_ADST: + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + transpose_32bit_4x4(in, in); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case FLIPADST_DCT: + load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + transpose_32bit_4x4(in, in); + fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case DCT_FLIPADST: + load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]); + fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + transpose_32bit_4x4(in, in); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case FLIPADST_FLIPADST: + load_buffer_4x4(input, in, input_stride, 1, 1, shift[0]); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + transpose_32bit_4x4(in, in); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case ADST_FLIPADST: + load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + transpose_32bit_4x4(in, in); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case FLIPADST_ADST: + load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + transpose_32bit_4x4(in, in); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case IDTX: + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + transpose_32bit_4x4(in, in); + idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case V_DCT: + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + transpose_32bit_4x4(in, in); + idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case H_DCT: + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + transpose_32bit_4x4(in, in); + fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case V_ADST: + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + transpose_32bit_4x4(in, in); + idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case H_ADST: + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + transpose_32bit_4x4(in, in); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + write_buffer_4x4(in, 
coeff); + break; + case V_FLIPADST: + load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + transpose_32bit_4x4(in, in); + idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case H_FLIPADST: + load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]); + idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + transpose_32bit_4x4(in, in); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + default: assert(0); + } + (void)bd; +} + +static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in, + int stride, int flipud, int fliplr, + int shift) { + __m128i u; + if (!flipud) { + in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); + in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride)); + in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); + in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); + in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride)); + in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride)); + in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride)); + in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride)); + } else { + in[0] = _mm_load_si128((const __m128i *)(input + 7 * stride)); + in[1] = _mm_load_si128((const __m128i *)(input + 6 * stride)); + in[2] = _mm_load_si128((const __m128i *)(input + 5 * stride)); + in[3] = _mm_load_si128((const __m128i *)(input + 4 * stride)); + in[4] = _mm_load_si128((const __m128i *)(input + 3 * stride)); + in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride)); + in[6] = _mm_load_si128((const __m128i *)(input + 1 * stride)); + in[7] = _mm_load_si128((const __m128i *)(input + 0 * stride)); + } + + if (fliplr) { + in[0] = mm_reverse_epi16(in[0]); + in[1] = mm_reverse_epi16(in[1]); + in[2] = mm_reverse_epi16(in[2]); + in[3] = mm_reverse_epi16(in[3]); + in[4] = mm_reverse_epi16(in[4]); + in[5] = mm_reverse_epi16(in[5]); + in[6] = mm_reverse_epi16(in[6]); + in[7] = mm_reverse_epi16(in[7]); + } + + u = _mm_unpackhi_epi64(in[4], in[4]); + in[8] = _mm_cvtepi16_epi32(in[4]); + in[9] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[5], in[5]); + in[10] = _mm_cvtepi16_epi32(in[5]); + in[11] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[6], in[6]); + in[12] = _mm_cvtepi16_epi32(in[6]); + in[13] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[7], in[7]); + in[14] = _mm_cvtepi16_epi32(in[7]); + in[15] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[3], in[3]); + in[6] = _mm_cvtepi16_epi32(in[3]); + in[7] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[2], in[2]); + in[4] = _mm_cvtepi16_epi32(in[2]); + in[5] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[1], in[1]); + in[2] = _mm_cvtepi16_epi32(in[1]); + in[3] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[0], in[0]); + in[0] = _mm_cvtepi16_epi32(in[0]); + in[1] = _mm_cvtepi16_epi32(u); + + in[0] = _mm_slli_epi32(in[0], shift); + in[1] = _mm_slli_epi32(in[1], shift); + in[2] = _mm_slli_epi32(in[2], shift); + in[3] = _mm_slli_epi32(in[3], shift); + in[4] = _mm_slli_epi32(in[4], shift); + in[5] = _mm_slli_epi32(in[5], shift); + in[6] = _mm_slli_epi32(in[6], shift); + in[7] = _mm_slli_epi32(in[7], shift); + + in[8] = _mm_slli_epi32(in[8], shift); + in[9] = _mm_slli_epi32(in[9], shift); + in[10] = _mm_slli_epi32(in[10], shift); + in[11] = _mm_slli_epi32(in[11], shift); + in[12] = 
_mm_slli_epi32(in[12], shift); + in[13] = _mm_slli_epi32(in[13], shift); + in[14] = _mm_slli_epi32(in[14], shift); + in[15] = _mm_slli_epi32(in[15], shift); +} + +static INLINE void col_txfm_8x8_rounding(__m128i *in, int shift) { + const __m128i rounding = _mm_set1_epi32(1 << (shift - 1)); + + in[0] = _mm_add_epi32(in[0], rounding); + in[1] = _mm_add_epi32(in[1], rounding); + in[2] = _mm_add_epi32(in[2], rounding); + in[3] = _mm_add_epi32(in[3], rounding); + in[4] = _mm_add_epi32(in[4], rounding); + in[5] = _mm_add_epi32(in[5], rounding); + in[6] = _mm_add_epi32(in[6], rounding); + in[7] = _mm_add_epi32(in[7], rounding); + in[8] = _mm_add_epi32(in[8], rounding); + in[9] = _mm_add_epi32(in[9], rounding); + in[10] = _mm_add_epi32(in[10], rounding); + in[11] = _mm_add_epi32(in[11], rounding); + in[12] = _mm_add_epi32(in[12], rounding); + in[13] = _mm_add_epi32(in[13], rounding); + in[14] = _mm_add_epi32(in[14], rounding); + in[15] = _mm_add_epi32(in[15], rounding); + + in[0] = _mm_srai_epi32(in[0], shift); + in[1] = _mm_srai_epi32(in[1], shift); + in[2] = _mm_srai_epi32(in[2], shift); + in[3] = _mm_srai_epi32(in[3], shift); + in[4] = _mm_srai_epi32(in[4], shift); + in[5] = _mm_srai_epi32(in[5], shift); + in[6] = _mm_srai_epi32(in[6], shift); + in[7] = _mm_srai_epi32(in[7], shift); + in[8] = _mm_srai_epi32(in[8], shift); + in[9] = _mm_srai_epi32(in[9], shift); + in[10] = _mm_srai_epi32(in[10], shift); + in[11] = _mm_srai_epi32(in[11], shift); + in[12] = _mm_srai_epi32(in[12], shift); + in[13] = _mm_srai_epi32(in[13], shift); + in[14] = _mm_srai_epi32(in[14], shift); + in[15] = _mm_srai_epi32(in[15], shift); +} + +static INLINE void col_txfm_4x8_rounding(__m128i *in, int shift) { + const __m128i rounding = _mm_set1_epi32(1 << (shift - 1)); + + in[0] = _mm_add_epi32(in[0], rounding); + in[1] = _mm_add_epi32(in[1], rounding); + in[2] = _mm_add_epi32(in[2], rounding); + in[3] = _mm_add_epi32(in[3], rounding); + in[4] = _mm_add_epi32(in[4], rounding); + in[5] = _mm_add_epi32(in[5], rounding); + in[6] = _mm_add_epi32(in[6], rounding); + in[7] = _mm_add_epi32(in[7], rounding); + + in[0] = _mm_srai_epi32(in[0], shift); + in[1] = _mm_srai_epi32(in[1], shift); + in[2] = _mm_srai_epi32(in[2], shift); + in[3] = _mm_srai_epi32(in[3], shift); + in[4] = _mm_srai_epi32(in[4], shift); + in[5] = _mm_srai_epi32(in[5], shift); + in[6] = _mm_srai_epi32(in[6], shift); + in[7] = _mm_srai_epi32(in[7], shift); +} + +static INLINE void write_buffer_8x8(const __m128i *res, int32_t *output) { + _mm_store_si128((__m128i *)(output + 0 * 4), res[0]); + _mm_store_si128((__m128i *)(output + 1 * 4), res[1]); + _mm_store_si128((__m128i *)(output + 2 * 4), res[2]); + _mm_store_si128((__m128i *)(output + 3 * 4), res[3]); + + _mm_store_si128((__m128i *)(output + 4 * 4), res[4]); + _mm_store_si128((__m128i *)(output + 5 * 4), res[5]); + _mm_store_si128((__m128i *)(output + 6 * 4), res[6]); + _mm_store_si128((__m128i *)(output + 7 * 4), res[7]); + + _mm_store_si128((__m128i *)(output + 8 * 4), res[8]); + _mm_store_si128((__m128i *)(output + 9 * 4), res[9]); + _mm_store_si128((__m128i *)(output + 10 * 4), res[10]); + _mm_store_si128((__m128i *)(output + 11 * 4), res[11]); + + _mm_store_si128((__m128i *)(output + 12 * 4), res[12]); + _mm_store_si128((__m128i *)(output + 13 * 4), res[13]); + _mm_store_si128((__m128i *)(output + 14 * 4), res[14]); + _mm_store_si128((__m128i *)(output + 15 * 4), res[15]); +} + +static INLINE void write_buffer_16x8(const __m128i *res, int32_t *output, + const int stride) { + _mm_storeu_si128((__m128i 
*)(output), res[0]); + _mm_storeu_si128((__m128i *)(output + 4), res[1]); + _mm_storeu_si128((__m128i *)(output + stride), res[2]); + _mm_storeu_si128((__m128i *)(output + stride + 4), res[3]); + + _mm_storeu_si128((__m128i *)(output + (stride * 2)), res[4]); + _mm_storeu_si128((__m128i *)(output + (stride * 2) + 4), res[5]); + _mm_storeu_si128((__m128i *)(output + (stride * 3)), res[6]); + _mm_storeu_si128((__m128i *)(output + (stride * 3) + 4), res[7]); + + _mm_storeu_si128((__m128i *)(output + (stride * 4)), res[8]); + _mm_storeu_si128((__m128i *)(output + (stride * 4) + 4), res[9]); + _mm_storeu_si128((__m128i *)(output + (stride * 5)), res[10]); + _mm_storeu_si128((__m128i *)(output + (stride * 5) + 4), res[11]); + + _mm_storeu_si128((__m128i *)(output + (stride * 6)), res[12]); + _mm_storeu_si128((__m128i *)(output + (stride * 6) + 4), res[13]); + _mm_storeu_si128((__m128i *)(output + (stride * 7)), res[14]); + _mm_storeu_si128((__m128i *)(output + (stride * 7) + 4), res[15]); +} + +static void fdct4x8_sse4_1(__m128i *in, __m128i *out, int bit, + const int col_num) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + __m128i u[8], v[8]; + + int startidx = 0 * col_num; + int endidx = 7 * col_num; + // Even 8 points 0, 2, ..., 14 + // stage 0 + // stage 1 + u[0] = _mm_add_epi32(in[startidx], in[endidx]); + v[7] = _mm_sub_epi32(in[startidx], in[endidx]); // v[7] + startidx += col_num; + endidx -= col_num; + u[1] = _mm_add_epi32(in[startidx], in[endidx]); + u[6] = _mm_sub_epi32(in[startidx], in[endidx]); + startidx += col_num; + endidx -= col_num; + u[2] = _mm_add_epi32(in[startidx], in[endidx]); + u[5] = _mm_sub_epi32(in[startidx], in[endidx]); + startidx += col_num; + endidx -= col_num; + u[3] = _mm_add_epi32(in[startidx], in[endidx]); + v[4] = _mm_sub_epi32(in[startidx], in[endidx]); // v[4] + + // stage 2 + v[0] = _mm_add_epi32(u[0], u[3]); + v[3] = _mm_sub_epi32(u[0], u[3]); + v[1] = _mm_add_epi32(u[1], u[2]); + v[2] = _mm_sub_epi32(u[1], u[2]); + + v[5] = _mm_mullo_epi32(u[5], cospim32); + v[6] = _mm_mullo_epi32(u[6], cospi32); + v[5] = _mm_add_epi32(v[5], v[6]); + v[5] = _mm_add_epi32(v[5], rnding); + v[5] = _mm_srai_epi32(v[5], bit); + + u[0] = _mm_mullo_epi32(u[5], cospi32); + v[6] = _mm_mullo_epi32(u[6], cospim32); + v[6] = _mm_sub_epi32(u[0], v[6]); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + // stage 3 + // type 0 + v[0] = _mm_mullo_epi32(v[0], cospi32); + v[1] = _mm_mullo_epi32(v[1], cospi32); + u[0] = _mm_add_epi32(v[0], v[1]); + u[0] = _mm_add_epi32(u[0], rnding); + u[0] = _mm_srai_epi32(u[0], bit); + + u[1] = _mm_sub_epi32(v[0], v[1]); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); + + // type 1 + v[0] = _mm_mullo_epi32(v[2], cospi48); + v[1] = _mm_mullo_epi32(v[3], cospi16); + u[2] = _mm_add_epi32(v[0], v[1]); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + v[0] = _mm_mullo_epi32(v[2], cospi16); + v[1] = _mm_mullo_epi32(v[3], cospi48); + u[3] = _mm_sub_epi32(v[1], v[0]); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = 
_mm_srai_epi32(u[3], bit); + + u[4] = _mm_add_epi32(v[4], v[5]); + u[5] = _mm_sub_epi32(v[4], v[5]); + u[6] = _mm_sub_epi32(v[7], v[6]); + u[7] = _mm_add_epi32(v[7], v[6]); + + // stage 4 + // stage 5 + v[0] = _mm_mullo_epi32(u[4], cospi56); + v[1] = _mm_mullo_epi32(u[7], cospi8); + v[0] = _mm_add_epi32(v[0], v[1]); + v[0] = _mm_add_epi32(v[0], rnding); + out[1 * col_num] = _mm_srai_epi32(v[0], bit); // buf0[4] + + v[0] = _mm_mullo_epi32(u[4], cospi8); + v[1] = _mm_mullo_epi32(u[7], cospi56); + v[0] = _mm_sub_epi32(v[1], v[0]); + v[0] = _mm_add_epi32(v[0], rnding); + out[7 * col_num] = _mm_srai_epi32(v[0], bit); // buf0[7] + + v[0] = _mm_mullo_epi32(u[5], cospi24); + v[1] = _mm_mullo_epi32(u[6], cospi40); + v[0] = _mm_add_epi32(v[0], v[1]); + v[0] = _mm_add_epi32(v[0], rnding); + out[5 * col_num] = _mm_srai_epi32(v[0], bit); // buf0[5] + + v[0] = _mm_mullo_epi32(u[5], cospi40); + v[1] = _mm_mullo_epi32(u[6], cospi24); + v[0] = _mm_sub_epi32(v[1], v[0]); + v[0] = _mm_add_epi32(v[0], rnding); + out[3 * col_num] = _mm_srai_epi32(v[0], bit); // buf0[6] + + out[0 * col_num] = u[0]; // buf0[0] + out[4 * col_num] = u[1]; // buf0[1] + out[2 * col_num] = u[2]; // buf0[2] + out[6 * col_num] = u[3]; // buf0[3] +} + +static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit, + const int col_num) { + fdct4x8_sse4_1(in, out, bit, col_num); + fdct4x8_sse4_1(in + 1, out + 1, bit, col_num); +} + +static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, + const int col_num) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i zero = _mm_setzero_si128(); + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + __m128i x, y; + int col; + + // Note: + // Even column: 0, 2, ..., 14 + // Odd column: 1, 3, ..., 15 + // one even column plus one odd column constructs one row (8 coeffs) + // total we have 8 rows (8x8). 
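+ // Reference note (not part of the transform math itself): each cospi + // rotation below uses the same fixed-point idiom, roughly the scalar form + // t = x * cospi[a] + y * cospi[b]; t = (t + (1 << (bit - 1))) >> bit; + // i.e. a multiply-accumulate followed by a round-to-nearest shift, done + // lane-wise with _mm_mullo_epi32, _mm_add_epi32 and _mm_srai_epi32.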
+ for (col = 0; col < col_num; ++col) { + // stage 0 + // stage 1 + u0 = in[col_num * 0 + col]; + u1 = _mm_sub_epi32(zero, in[col_num * 7 + col]); + u2 = _mm_sub_epi32(zero, in[col_num * 3 + col]); + u3 = in[col_num * 4 + col]; + u4 = _mm_sub_epi32(zero, in[col_num * 1 + col]); + u5 = in[col_num * 6 + col]; + u6 = in[col_num * 2 + col]; + u7 = _mm_sub_epi32(zero, in[col_num * 5 + col]); + + // stage 2 + v0 = u0; + v1 = u1; + + x = _mm_mullo_epi32(u2, cospi32); + y = _mm_mullo_epi32(u3, cospi32); + v2 = _mm_add_epi32(x, y); + v2 = _mm_add_epi32(v2, rnding); + v2 = _mm_srai_epi32(v2, bit); + + v3 = _mm_sub_epi32(x, y); + v3 = _mm_add_epi32(v3, rnding); + v3 = _mm_srai_epi32(v3, bit); + + v4 = u4; + v5 = u5; + + x = _mm_mullo_epi32(u6, cospi32); + y = _mm_mullo_epi32(u7, cospi32); + v6 = _mm_add_epi32(x, y); + v6 = _mm_add_epi32(v6, rnding); + v6 = _mm_srai_epi32(v6, bit); + + v7 = _mm_sub_epi32(x, y); + v7 = _mm_add_epi32(v7, rnding); + v7 = _mm_srai_epi32(v7, bit); + + // stage 3 + u0 = _mm_add_epi32(v0, v2); + u1 = _mm_add_epi32(v1, v3); + u2 = _mm_sub_epi32(v0, v2); + u3 = _mm_sub_epi32(v1, v3); + u4 = _mm_add_epi32(v4, v6); + u5 = _mm_add_epi32(v5, v7); + u6 = _mm_sub_epi32(v4, v6); + u7 = _mm_sub_epi32(v5, v7); + + // stage 4 + v0 = u0; + v1 = u1; + v2 = u2; + v3 = u3; + + x = _mm_mullo_epi32(u4, cospi16); + y = _mm_mullo_epi32(u5, cospi48); + v4 = _mm_add_epi32(x, y); + v4 = _mm_add_epi32(v4, rnding); + v4 = _mm_srai_epi32(v4, bit); + + x = _mm_mullo_epi32(u4, cospi48); + y = _mm_mullo_epi32(u5, cospim16); + v5 = _mm_add_epi32(x, y); + v5 = _mm_add_epi32(v5, rnding); + v5 = _mm_srai_epi32(v5, bit); + + x = _mm_mullo_epi32(u6, cospim48); + y = _mm_mullo_epi32(u7, cospi16); + v6 = _mm_add_epi32(x, y); + v6 = _mm_add_epi32(v6, rnding); + v6 = _mm_srai_epi32(v6, bit); + + x = _mm_mullo_epi32(u6, cospi16); + y = _mm_mullo_epi32(u7, cospi48); + v7 = _mm_add_epi32(x, y); + v7 = _mm_add_epi32(v7, rnding); + v7 = _mm_srai_epi32(v7, bit); + + // stage 5 + u0 = _mm_add_epi32(v0, v4); + u1 = _mm_add_epi32(v1, v5); + u2 = _mm_add_epi32(v2, v6); + u3 = _mm_add_epi32(v3, v7); + u4 = _mm_sub_epi32(v0, v4); + u5 = _mm_sub_epi32(v1, v5); + u6 = _mm_sub_epi32(v2, v6); + u7 = _mm_sub_epi32(v3, v7); + + // stage 6 + x = _mm_mullo_epi32(u0, cospi4); + y = _mm_mullo_epi32(u1, cospi60); + v0 = _mm_add_epi32(x, y); + v0 = _mm_add_epi32(v0, rnding); + v0 = _mm_srai_epi32(v0, bit); + + x = _mm_mullo_epi32(u0, cospi60); + y = _mm_mullo_epi32(u1, cospim4); + v1 = _mm_add_epi32(x, y); + v1 = _mm_add_epi32(v1, rnding); + v1 = _mm_srai_epi32(v1, bit); + + x = _mm_mullo_epi32(u2, cospi20); + y = _mm_mullo_epi32(u3, cospi44); + v2 = _mm_add_epi32(x, y); + v2 = _mm_add_epi32(v2, rnding); + v2 = _mm_srai_epi32(v2, bit); + + x = _mm_mullo_epi32(u2, cospi44); + y = _mm_mullo_epi32(u3, cospim20); + v3 = _mm_add_epi32(x, y); + v3 = _mm_add_epi32(v3, rnding); + v3 = _mm_srai_epi32(v3, bit); + + x = _mm_mullo_epi32(u4, cospi36); + y = _mm_mullo_epi32(u5, cospi28); + v4 = _mm_add_epi32(x, y); + v4 = _mm_add_epi32(v4, rnding); + v4 = _mm_srai_epi32(v4, bit); + + x = _mm_mullo_epi32(u4, cospi28); + y = _mm_mullo_epi32(u5, cospim36); + v5 = _mm_add_epi32(x, y); + v5 = _mm_add_epi32(v5, rnding); + v5 = _mm_srai_epi32(v5, bit); + + x = _mm_mullo_epi32(u6, cospi52); + y = _mm_mullo_epi32(u7, cospi12); + v6 = _mm_add_epi32(x, y); + v6 = _mm_add_epi32(v6, rnding); + v6 = _mm_srai_epi32(v6, bit); + + x = _mm_mullo_epi32(u6, cospi12); + y = _mm_mullo_epi32(u7, cospim52); + v7 = _mm_add_epi32(x, y); + v7 = _mm_add_epi32(v7, rnding); + v7 
= _mm_srai_epi32(v7, bit); + + // stage 7 + out[col_num * 0 + col] = v1; + out[col_num * 1 + col] = v6; + out[col_num * 2 + col] = v3; + out[col_num * 3 + col] = v4; + out[col_num * 4 + col] = v5; + out[col_num * 5 + col] = v2; + out[col_num * 6 + col] = v7; + out[col_num * 7 + col] = v0; + } +} +static void idtx8x8_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) { + (void)bit; + + for (int i = 0; i < col_num; i += 1) { + out[0 + 8 * i] = _mm_add_epi32(in[0 + 8 * i], in[0 + 8 * i]); + out[1 + 8 * i] = _mm_add_epi32(in[1 + 8 * i], in[1 + 8 * i]); + out[2 + 8 * i] = _mm_add_epi32(in[2 + 8 * i], in[2 + 8 * i]); + out[3 + 8 * i] = _mm_add_epi32(in[3 + 8 * i], in[3 + 8 * i]); + out[4 + 8 * i] = _mm_add_epi32(in[4 + 8 * i], in[4 + 8 * i]); + out[5 + 8 * i] = _mm_add_epi32(in[5 + 8 * i], in[5 + 8 * i]); + out[6 + 8 * i] = _mm_add_epi32(in[6 + 8 * i], in[6 + 8 * i]); + out[7 + 8 * i] = _mm_add_epi32(in[7 + 8 * i], in[7 + 8 * i]); + } +} +#if !CONFIG_REALTIME_ONLY +static void idtx32x8_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) { + (void)bit; + (void)col_num; + for (int j = 0; j < 2; j++) { + out[j + 8 * 0] = _mm_add_epi32(in[j + 8 * 0], in[j + 8 * 0]); + out[j + 8 * 1] = _mm_add_epi32(in[j + 8 * 1], in[j + 8 * 1]); + out[j + 8 * 2] = _mm_add_epi32(in[j + 8 * 2], in[j + 8 * 2]); + out[j + 8 * 3] = _mm_add_epi32(in[j + 8 * 3], in[j + 8 * 3]); + out[j + 8 * 4] = _mm_add_epi32(in[j + 8 * 4], in[j + 8 * 4]); + out[j + 8 * 5] = _mm_add_epi32(in[j + 8 * 5], in[j + 8 * 5]); + out[j + 8 * 6] = _mm_add_epi32(in[j + 8 * 6], in[j + 8 * 6]); + out[j + 8 * 7] = _mm_add_epi32(in[j + 8 * 7], in[j + 8 * 7]); + } +} +#endif +void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + __m128i in[16], out[16]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8]; + const int txw_idx = get_txw_idx(TX_8X8); + const int txh_idx = get_txh_idx(TX_8X8); + + switch (tx_type) { + case DCT_DCT: + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); + write_buffer_8x8(out, coeff); + break; + case ADST_DCT: + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); + write_buffer_8x8(out, coeff); + break; + case DCT_ADST: + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); + write_buffer_8x8(out, coeff); + break; + case ADST_ADST: + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); + write_buffer_8x8(out, coeff); + break; + case FLIPADST_DCT: + load_buffer_8x8(input, in, stride, 1, 0, shift[0]); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); + write_buffer_8x8(out, coeff); + break; + case 
DCT_FLIPADST: + load_buffer_8x8(input, in, stride, 0, 1, shift[0]); + fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); + write_buffer_8x8(out, coeff); + break; + case FLIPADST_FLIPADST: + load_buffer_8x8(input, in, stride, 1, 1, shift[0]); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); + write_buffer_8x8(out, coeff); + break; + case ADST_FLIPADST: + load_buffer_8x8(input, in, stride, 0, 1, shift[0]); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); + write_buffer_8x8(out, coeff); + break; + case FLIPADST_ADST: + load_buffer_8x8(input, in, stride, 1, 0, shift[0]); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); + write_buffer_8x8(out, coeff); + break; + case IDTX: + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + write_buffer_8x8(out, coeff); + break; + case V_DCT: + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + write_buffer_8x8(out, coeff); + break; + case H_DCT: + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + write_buffer_8x8(out, coeff); + break; + case V_ADST: + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + write_buffer_8x8(out, coeff); + break; + case H_ADST: + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + write_buffer_8x8(out, coeff); + break; + case V_FLIPADST: + load_buffer_8x8(input, in, stride, 1, 0, shift[0]); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + write_buffer_8x8(out, coeff); + break; + case H_FLIPADST: + load_buffer_8x8(input, in, stride, 0, 1, shift[0]); + idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + write_buffer_8x8(out, coeff); + break; + default: assert(0); + } + (void)bd; +} + +// 
Hybrid Transform 16x16 + +static INLINE void convert_8x8_to_16x16(const __m128i *in, __m128i *out) { + int row_index = 0; + int dst_index = 0; + int src_index = 0; + + // row 0, 1, .., 7 + do { + out[dst_index] = in[src_index]; + out[dst_index + 1] = in[src_index + 1]; + out[dst_index + 2] = in[src_index + 16]; + out[dst_index + 3] = in[src_index + 17]; + dst_index += 4; + src_index += 2; + row_index += 1; + } while (row_index < 8); + + // row 8, 9, ..., 15 + src_index += 16; + do { + out[dst_index] = in[src_index]; + out[dst_index + 1] = in[src_index + 1]; + out[dst_index + 2] = in[src_index + 16]; + out[dst_index + 3] = in[src_index + 17]; + dst_index += 4; + src_index += 2; + row_index += 1; + } while (row_index < 16); +} + +static INLINE void load_buffer_16x16(const int16_t *input, __m128i *out, + int stride, int flipud, int fliplr, + int shift) { + __m128i in[64]; + // Load 4 8x8 blocks + const int16_t *topL = input; + const int16_t *topR = input + 8; + const int16_t *botL = input + 8 * stride; + const int16_t *botR = input + 8 * stride + 8; + + const int16_t *tmp; + + if (flipud) { + // Swap left columns + tmp = topL; + topL = botL; + botL = tmp; + // Swap right columns + tmp = topR; + topR = botR; + botR = tmp; + } + + if (fliplr) { + // Swap top rows + tmp = topL; + topL = topR; + topR = tmp; + // Swap bottom rows + tmp = botL; + botL = botR; + botR = tmp; + } + + // load first 8 columns + load_buffer_8x8(topL, &in[0], stride, flipud, fliplr, shift); + load_buffer_8x8(botL, &in[32], stride, flipud, fliplr, shift); + + // load second 8 columns + load_buffer_8x8(topR, &in[16], stride, flipud, fliplr, shift); + load_buffer_8x8(botR, &in[48], stride, flipud, fliplr, shift); + + convert_8x8_to_16x16(in, out); +} + +static INLINE void load_buffer_8x16(const int16_t *input, __m128i *out, + int stride, int flipud, int fliplr, + int shift) { + const int16_t *topL = input; + const int16_t *botL = input + 8 * stride; + + const int16_t *tmp; + + if (flipud) { + tmp = topL; + topL = botL; + botL = tmp; + } + + load_buffer_8x8(topL, out, stride, flipud, fliplr, shift); + load_buffer_8x8(botL, out + 16, stride, flipud, fliplr, shift); +} + +static INLINE void load_buffer_8x4(const int16_t *input, __m128i *out, + int stride, int flipud, int fliplr, + int shift) { + const int16_t *topL = input; + const int16_t *topR = input + 4; + + const int16_t *tmp; + + if (fliplr) { + tmp = topL; + topL = topR; + topR = tmp; + } + + load_buffer_4x4(topL, out, stride, flipud, fliplr, shift); + load_buffer_4x4(topR, out + 4, stride, flipud, fliplr, shift); +} + +static INLINE void load_buffer_16x4(const int16_t *input, __m128i *out, + int stride, int flipud, int fliplr, + int shift) { + const int16_t *topL = input; + const int16_t *topR = input + 8; + + const int16_t *tmp; + + if (fliplr) { + tmp = topL; + topL = topR; + topR = tmp; + } + + load_buffer_8x4(topL, out, stride, flipud, fliplr, shift); + load_buffer_8x4(topR, out + 8, stride, flipud, fliplr, shift); +} + +static INLINE void load_buffer_4x8(const int16_t *input, __m128i *out, + int stride, int flipud, int fliplr, + int shift) { + const int16_t *topL = input; + const int16_t *botL = input + 4 * stride; + + const int16_t *tmp; + + if (flipud) { + tmp = topL; + topL = botL; + botL = tmp; + } + + load_buffer_4x4(topL, out, stride, flipud, fliplr, shift); + load_buffer_4x4(botL, out + 4, stride, flipud, fliplr, shift); +} + +#if !CONFIG_REALTIME_ONLY +static INLINE void load_buffer_4x16(const int16_t *input, __m128i *out, + const int stride, const int 
flipud, + const int fliplr, const int shift) { + const int16_t *topL = input; + const int16_t *botL = input + 8 * stride; + + const int16_t *tmp; + + if (flipud) { + tmp = topL; + topL = botL; + botL = tmp; + } + load_buffer_4x8(topL, out, stride, flipud, fliplr, shift); + load_buffer_4x8(botL, out + 8, stride, flipud, fliplr, shift); +} +#endif + +static INLINE void load_buffer_32x8n(const int16_t *input, __m128i *out, + int stride, int flipud, int fliplr, + int shift, const int height) { + const int16_t *in = input; + __m128i *output = out; + for (int col = 0; col < height; col++) { + in = input + col * stride; + output = out + col * 8; + load_buffer_4x4(in, output, 4, flipud, fliplr, shift); + load_buffer_4x4((in + 16), (output + 4), 4, flipud, fliplr, shift); + } +} + +static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit, + const int col_num) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + __m128i u[16], v[16], x; + int col; + + // Calculate the column 0, 1, 2, 3 + for (col = 0; col < col_num; ++col) { + // stage 0 + // stage 1 + u[0] = _mm_add_epi32(in[0 * col_num + col], in[15 * col_num + col]); + u[15] = _mm_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]); + u[1] = _mm_add_epi32(in[1 * col_num + col], in[14 * col_num + col]); + u[14] = _mm_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]); + u[2] = _mm_add_epi32(in[2 * col_num + col], in[13 * col_num + col]); + u[13] = _mm_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]); + u[3] = _mm_add_epi32(in[3 * col_num + col], in[12 * col_num + col]); + u[12] = _mm_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]); + u[4] = _mm_add_epi32(in[4 * col_num + col], in[11 * col_num + col]); + u[11] = _mm_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]); + u[5] = _mm_add_epi32(in[5 * col_num + col], in[10 * col_num + col]); + u[10] = _mm_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]); + u[6] = _mm_add_epi32(in[6 * col_num + col], in[9 * col_num + col]); + u[9] = _mm_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]); + u[7] = _mm_add_epi32(in[7 * col_num + col], in[8 * col_num + col]); + u[8] = _mm_sub_epi32(in[7 * col_num + col], in[8 * col_num + col]); + + // stage 2 + v[0] = _mm_add_epi32(u[0], u[7]); + v[7] = _mm_sub_epi32(u[0], u[7]); + v[1] = _mm_add_epi32(u[1], u[6]); + v[6] = _mm_sub_epi32(u[1], u[6]); + v[2] = _mm_add_epi32(u[2], u[5]); + v[5] = _mm_sub_epi32(u[2], u[5]); + v[3] = _mm_add_epi32(u[3], u[4]); + v[4] = _mm_sub_epi32(u[3], u[4]); + v[8] = u[8]; + v[9] = u[9]; + + v[10] = 
_mm_mullo_epi32(u[10], cospim32); + x = _mm_mullo_epi32(u[13], cospi32); + v[10] = _mm_add_epi32(v[10], x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[13] = _mm_mullo_epi32(u[10], cospi32); + x = _mm_mullo_epi32(u[13], cospim32); + v[13] = _mm_sub_epi32(v[13], x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[11] = _mm_mullo_epi32(u[11], cospim32); + x = _mm_mullo_epi32(u[12], cospi32); + v[11] = _mm_add_epi32(v[11], x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = _mm_mullo_epi32(u[11], cospi32); + x = _mm_mullo_epi32(u[12], cospim32); + v[12] = _mm_sub_epi32(v[12], x); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + v[14] = u[14]; + v[15] = u[15]; + + // stage 3 + u[0] = _mm_add_epi32(v[0], v[3]); + u[3] = _mm_sub_epi32(v[0], v[3]); + u[1] = _mm_add_epi32(v[1], v[2]); + u[2] = _mm_sub_epi32(v[1], v[2]); + u[4] = v[4]; + + u[5] = _mm_mullo_epi32(v[5], cospim32); + x = _mm_mullo_epi32(v[6], cospi32); + u[5] = _mm_add_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + u[6] = _mm_mullo_epi32(v[5], cospi32); + x = _mm_mullo_epi32(v[6], cospim32); + u[6] = _mm_sub_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = v[7]; + u[8] = _mm_add_epi32(v[8], v[11]); + u[11] = _mm_sub_epi32(v[8], v[11]); + u[9] = _mm_add_epi32(v[9], v[10]); + u[10] = _mm_sub_epi32(v[9], v[10]); + u[12] = _mm_sub_epi32(v[15], v[12]); + u[15] = _mm_add_epi32(v[15], v[12]); + u[13] = _mm_sub_epi32(v[14], v[13]); + u[14] = _mm_add_epi32(v[14], v[13]); + + // stage 4 + u[0] = _mm_mullo_epi32(u[0], cospi32); + u[1] = _mm_mullo_epi32(u[1], cospi32); + v[0] = _mm_add_epi32(u[0], u[1]); + v[0] = _mm_add_epi32(v[0], rnding); + v[0] = _mm_srai_epi32(v[0], bit); + + v[1] = _mm_sub_epi32(u[0], u[1]); + v[1] = _mm_add_epi32(v[1], rnding); + v[1] = _mm_srai_epi32(v[1], bit); + + v[2] = _mm_mullo_epi32(u[2], cospi48); + x = _mm_mullo_epi32(u[3], cospi16); + v[2] = _mm_add_epi32(v[2], x); + v[2] = _mm_add_epi32(v[2], rnding); + v[2] = _mm_srai_epi32(v[2], bit); + + v[3] = _mm_mullo_epi32(u[2], cospi16); + x = _mm_mullo_epi32(u[3], cospi48); + v[3] = _mm_sub_epi32(x, v[3]); + v[3] = _mm_add_epi32(v[3], rnding); + v[3] = _mm_srai_epi32(v[3], bit); + + v[4] = _mm_add_epi32(u[4], u[5]); + v[5] = _mm_sub_epi32(u[4], u[5]); + v[6] = _mm_sub_epi32(u[7], u[6]); + v[7] = _mm_add_epi32(u[7], u[6]); + v[8] = u[8]; + + v[9] = _mm_mullo_epi32(u[9], cospim16); + x = _mm_mullo_epi32(u[14], cospi48); + v[9] = _mm_add_epi32(v[9], x); + v[9] = _mm_add_epi32(v[9], rnding); + v[9] = _mm_srai_epi32(v[9], bit); + + v[14] = _mm_mullo_epi32(u[9], cospi48); + x = _mm_mullo_epi32(u[14], cospim16); + v[14] = _mm_sub_epi32(v[14], x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[10] = _mm_mullo_epi32(u[10], cospim48); + x = _mm_mullo_epi32(u[13], cospim16); + v[10] = _mm_add_epi32(v[10], x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[13] = _mm_mullo_epi32(u[10], cospim16); + x = _mm_mullo_epi32(u[13], cospim48); + v[13] = _mm_sub_epi32(v[13], x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[11] = u[11]; + v[12] = u[12]; + v[15] = u[15]; + + // stage 5 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = _mm_mullo_epi32(v[4], cospi56); + x = _mm_mullo_epi32(v[7], cospi8); + u[4] = 
_mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[7] = _mm_mullo_epi32(v[4], cospi8); + x = _mm_mullo_epi32(v[7], cospi56); + u[7] = _mm_sub_epi32(x, u[7]); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + u[5] = _mm_mullo_epi32(v[5], cospi24); + x = _mm_mullo_epi32(v[6], cospi40); + u[5] = _mm_add_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + u[6] = _mm_mullo_epi32(v[5], cospi40); + x = _mm_mullo_epi32(v[6], cospi24); + u[6] = _mm_sub_epi32(x, u[6]); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[8] = _mm_add_epi32(v[8], v[9]); + u[9] = _mm_sub_epi32(v[8], v[9]); + u[10] = _mm_sub_epi32(v[11], v[10]); + u[11] = _mm_add_epi32(v[11], v[10]); + u[12] = _mm_add_epi32(v[12], v[13]); + u[13] = _mm_sub_epi32(v[12], v[13]); + u[14] = _mm_sub_epi32(v[15], v[14]); + u[15] = _mm_add_epi32(v[15], v[14]); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + + v[8] = _mm_mullo_epi32(u[8], cospi60); + x = _mm_mullo_epi32(u[15], cospi4); + v[8] = _mm_add_epi32(v[8], x); + v[8] = _mm_add_epi32(v[8], rnding); + v[8] = _mm_srai_epi32(v[8], bit); + + v[15] = _mm_mullo_epi32(u[8], cospi4); + x = _mm_mullo_epi32(u[15], cospi60); + v[15] = _mm_sub_epi32(x, v[15]); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + v[9] = _mm_mullo_epi32(u[9], cospi28); + x = _mm_mullo_epi32(u[14], cospi36); + v[9] = _mm_add_epi32(v[9], x); + v[9] = _mm_add_epi32(v[9], rnding); + v[9] = _mm_srai_epi32(v[9], bit); + + v[14] = _mm_mullo_epi32(u[9], cospi36); + x = _mm_mullo_epi32(u[14], cospi28); + v[14] = _mm_sub_epi32(x, v[14]); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[10] = _mm_mullo_epi32(u[10], cospi44); + x = _mm_mullo_epi32(u[13], cospi20); + v[10] = _mm_add_epi32(v[10], x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[13] = _mm_mullo_epi32(u[10], cospi20); + x = _mm_mullo_epi32(u[13], cospi44); + v[13] = _mm_sub_epi32(x, v[13]); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[11] = _mm_mullo_epi32(u[11], cospi12); + x = _mm_mullo_epi32(u[12], cospi52); + v[11] = _mm_add_epi32(v[11], x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = _mm_mullo_epi32(u[11], cospi52); + x = _mm_mullo_epi32(u[12], cospi12); + v[12] = _mm_sub_epi32(x, v[12]); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + out[0 * col_num + col] = v[0]; + out[1 * col_num + col] = v[8]; + out[2 * col_num + col] = v[4]; + out[3 * col_num + col] = v[12]; + out[4 * col_num + col] = v[2]; + out[5 * col_num + col] = v[10]; + out[6 * col_num + col] = v[6]; + out[7 * col_num + col] = v[14]; + out[8 * col_num + col] = v[1]; + out[9 * col_num + col] = v[9]; + out[10 * col_num + col] = v[5]; + out[11 * col_num + col] = v[13]; + out[12 * col_num + col] = v[3]; + out[13 * col_num + col] = v[11]; + out[14 * col_num + col] = v[7]; + out[15 * col_num + col] = v[15]; + } +} + +static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, + const int num_cols) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = 
_mm_set1_epi32(-cospi[16]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospim2 = _mm_set1_epi32(-cospi[2]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospim10 = _mm_set1_epi32(-cospi[10]); + const __m128i cospi18 = _mm_set1_epi32(cospi[18]); + const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospim18 = _mm_set1_epi32(-cospi[18]); + const __m128i cospi26 = _mm_set1_epi32(cospi[26]); + const __m128i cospi38 = _mm_set1_epi32(cospi[38]); + const __m128i cospim26 = _mm_set1_epi32(-cospi[26]); + const __m128i cospi34 = _mm_set1_epi32(cospi[34]); + const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospim34 = _mm_set1_epi32(-cospi[34]); + const __m128i cospi42 = _mm_set1_epi32(cospi[42]); + const __m128i cospi22 = _mm_set1_epi32(cospi[22]); + const __m128i cospim42 = _mm_set1_epi32(-cospi[42]); + const __m128i cospi50 = _mm_set1_epi32(cospi[50]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); + const __m128i cospi58 = _mm_set1_epi32(cospi[58]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i zero = _mm_setzero_si128(); + + __m128i u[16], v[16], x, y; + int col; + + for (col = 0; col < num_cols; ++col) { + // stage 0 + // stage 1 + u[0] = in[0 * num_cols + col]; + u[1] = _mm_sub_epi32(zero, in[15 * num_cols + col]); + u[2] = _mm_sub_epi32(zero, in[7 * num_cols + col]); + u[3] = in[8 * num_cols + col]; + u[4] = _mm_sub_epi32(zero, in[3 * num_cols + col]); + u[5] = in[12 * num_cols + col]; + u[6] = in[4 * num_cols + col]; + u[7] = _mm_sub_epi32(zero, in[11 * num_cols + col]); + u[8] = _mm_sub_epi32(zero, in[1 * num_cols + col]); + u[9] = in[14 * num_cols + col]; + u[10] = in[6 * num_cols + col]; + u[11] = _mm_sub_epi32(zero, in[9 * num_cols + col]); + u[12] = in[2 * num_cols + col]; + u[13] = _mm_sub_epi32(zero, in[13 * num_cols + col]); + u[14] = _mm_sub_epi32(zero, in[5 * num_cols + col]); + u[15] = in[10 * num_cols + col]; + + // stage 2 + v[0] = u[0]; + v[1] = u[1]; + + x = _mm_mullo_epi32(u[2], cospi32); + y = _mm_mullo_epi32(u[3], cospi32); + v[2] = _mm_add_epi32(x, y); + v[2] = _mm_add_epi32(v[2], rnding); + v[2] = _mm_srai_epi32(v[2], bit); + + v[3] = _mm_sub_epi32(x, y); + v[3] = _mm_add_epi32(v[3], rnding); + v[3] = _mm_srai_epi32(v[3], bit); + + v[4] = u[4]; + v[5] = u[5]; + + x = _mm_mullo_epi32(u[6], cospi32); + y = _mm_mullo_epi32(u[7], cospi32); + v[6] = _mm_add_epi32(x, y); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_sub_epi32(x, y); + v[7] = _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + + x = _mm_mullo_epi32(u[10], cospi32); + y = _mm_mullo_epi32(u[11], cospi32); + v[10] = _mm_add_epi32(x, y); + v[10] = _mm_add_epi32(v[10], rnding); + 
v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = _mm_sub_epi32(x, y); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = u[12]; + v[13] = u[13]; + + x = _mm_mullo_epi32(u[14], cospi32); + y = _mm_mullo_epi32(u[15], cospi32); + v[14] = _mm_add_epi32(x, y); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_sub_epi32(x, y); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + // stage 3 + u[0] = _mm_add_epi32(v[0], v[2]); + u[1] = _mm_add_epi32(v[1], v[3]); + u[2] = _mm_sub_epi32(v[0], v[2]); + u[3] = _mm_sub_epi32(v[1], v[3]); + u[4] = _mm_add_epi32(v[4], v[6]); + u[5] = _mm_add_epi32(v[5], v[7]); + u[6] = _mm_sub_epi32(v[4], v[6]); + u[7] = _mm_sub_epi32(v[5], v[7]); + u[8] = _mm_add_epi32(v[8], v[10]); + u[9] = _mm_add_epi32(v[9], v[11]); + u[10] = _mm_sub_epi32(v[8], v[10]); + u[11] = _mm_sub_epi32(v[9], v[11]); + u[12] = _mm_add_epi32(v[12], v[14]); + u[13] = _mm_add_epi32(v[13], v[15]); + u[14] = _mm_sub_epi32(v[12], v[14]); + u[15] = _mm_sub_epi32(v[13], v[15]); + + // stage 4 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = half_btf_sse4_1(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit); + v[5] = half_btf_sse4_1(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit); + v[6] = half_btf_sse4_1(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit); + v[7] = half_btf_sse4_1(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit); + v[8] = u[8]; + v[9] = u[9]; + v[10] = u[10]; + v[11] = u[11]; + v[12] = half_btf_sse4_1(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit); + + // stage 5 + u[0] = _mm_add_epi32(v[0], v[4]); + u[1] = _mm_add_epi32(v[1], v[5]); + u[2] = _mm_add_epi32(v[2], v[6]); + u[3] = _mm_add_epi32(v[3], v[7]); + u[4] = _mm_sub_epi32(v[0], v[4]); + u[5] = _mm_sub_epi32(v[1], v[5]); + u[6] = _mm_sub_epi32(v[2], v[6]); + u[7] = _mm_sub_epi32(v[3], v[7]); + u[8] = _mm_add_epi32(v[8], v[12]); + u[9] = _mm_add_epi32(v[9], v[13]); + u[10] = _mm_add_epi32(v[10], v[14]); + u[11] = _mm_add_epi32(v[11], v[15]); + u[12] = _mm_sub_epi32(v[8], v[12]); + u[13] = _mm_sub_epi32(v[9], v[13]); + u[14] = _mm_sub_epi32(v[10], v[14]); + u[15] = _mm_sub_epi32(v[11], v[15]); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + v[8] = half_btf_sse4_1(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit); + v[9] = half_btf_sse4_1(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit); + v[10] = half_btf_sse4_1(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit); + v[11] = half_btf_sse4_1(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit); + v[12] = half_btf_sse4_1(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit); + + // stage 7 + u[0] = _mm_add_epi32(v[0], v[8]); + u[1] = _mm_add_epi32(v[1], v[9]); + u[2] = _mm_add_epi32(v[2], v[10]); + u[3] = _mm_add_epi32(v[3], v[11]); + u[4] = _mm_add_epi32(v[4], v[12]); + u[5] = _mm_add_epi32(v[5], v[13]); + u[6] = _mm_add_epi32(v[6], v[14]); + u[7] = _mm_add_epi32(v[7], v[15]); + u[8] = _mm_sub_epi32(v[0], v[8]); + u[9] = 
_mm_sub_epi32(v[1], v[9]); + u[10] = _mm_sub_epi32(v[2], v[10]); + u[11] = _mm_sub_epi32(v[3], v[11]); + u[12] = _mm_sub_epi32(v[4], v[12]); + u[13] = _mm_sub_epi32(v[5], v[13]); + u[14] = _mm_sub_epi32(v[6], v[14]); + u[15] = _mm_sub_epi32(v[7], v[15]); + + // stage 8 + v[0] = half_btf_sse4_1(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit); + v[1] = half_btf_sse4_1(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit); + v[2] = half_btf_sse4_1(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit); + v[3] = half_btf_sse4_1(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit); + v[4] = half_btf_sse4_1(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit); + v[5] = half_btf_sse4_1(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit); + v[6] = half_btf_sse4_1(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit); + v[7] = half_btf_sse4_1(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit); + v[8] = half_btf_sse4_1(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit); + v[9] = half_btf_sse4_1(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit); + v[10] = half_btf_sse4_1(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit); + v[11] = half_btf_sse4_1(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit); + v[12] = half_btf_sse4_1(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit); + + // stage 9 + out[0 * num_cols + col] = v[1]; + out[1 * num_cols + col] = v[14]; + out[2 * num_cols + col] = v[3]; + out[3 * num_cols + col] = v[12]; + out[4 * num_cols + col] = v[5]; + out[5 * num_cols + col] = v[10]; + out[6 * num_cols + col] = v[7]; + out[7 * num_cols + col] = v[8]; + out[8 * num_cols + col] = v[9]; + out[9 * num_cols + col] = v[6]; + out[10 * num_cols + col] = v[11]; + out[11 * num_cols + col] = v[4]; + out[12 * num_cols + col] = v[13]; + out[13 * num_cols + col] = v[2]; + out[14 * num_cols + col] = v[15]; + out[15 * num_cols + col] = v[0]; + } +} + +static void col_txfm_16x16_rounding(__m128i *in, int shift) { + // Note: + // We split 16x16 rounding into 4 sections of 8x8 rounding, + // instead of 4 columns + col_txfm_8x8_rounding(&in[0], shift); + col_txfm_8x8_rounding(&in[16], shift); + col_txfm_8x8_rounding(&in[32], shift); + col_txfm_8x8_rounding(&in[48], shift); +} + +static void col_txfm_8x16_rounding(__m128i *in, int shift) { + col_txfm_8x8_rounding(&in[0], shift); + col_txfm_8x8_rounding(&in[16], shift); +} + +static void write_buffer_16x16(const __m128i *in, int32_t *output) { + const int size_8x8 = 16 * 4; + write_buffer_8x8(&in[0], output); + output += size_8x8; + write_buffer_8x8(&in[16], output); + output += size_8x8; + write_buffer_8x8(&in[32], output); + output += size_8x8; + write_buffer_8x8(&in[48], output); +} +static void idtx16x16_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) { + (void)bit; + __m128i fact = _mm_set1_epi32(2 * NewSqrt2); + __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1)); + __m128i a_low; + + int num_iters = 16 * col_num; + for (int i = 0; i < num_iters; i++) { + a_low = _mm_mullo_epi32(in[i], fact); + a_low = _mm_add_epi32(a_low, offset); + out[i] = _mm_srai_epi32(a_low, NewSqrt2Bits); + } +} +void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[64], out[64]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X16]; + const int txw_idx = get_txw_idx(TX_16X16); + const int txh_idx = 
get_txh_idx(TX_16X16); + const int col_num = 4; + switch (tx_type) { + case DCT_DCT: + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); + write_buffer_16x16(out, coeff); + break; + case ADST_DCT: + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], + col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); + write_buffer_16x16(out, coeff); + break; + case DCT_ADST: + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], + col_num); + write_buffer_16x16(out, coeff); + break; + case ADST_ADST: + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], + col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], + col_num); + write_buffer_16x16(out, coeff); + break; + case FLIPADST_DCT: + load_buffer_16x16(input, in, stride, 1, 0, shift[0]); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], + col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); + write_buffer_16x16(out, coeff); + break; + case DCT_FLIPADST: + load_buffer_16x16(input, in, stride, 0, 1, shift[0]); + fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], + col_num); + write_buffer_16x16(out, coeff); + break; + case FLIPADST_FLIPADST: + load_buffer_16x16(input, in, stride, 1, 1, shift[0]); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], + col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], + col_num); + write_buffer_16x16(out, coeff); + break; + case ADST_FLIPADST: + load_buffer_16x16(input, in, stride, 0, 1, shift[0]); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], + col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], + col_num); + write_buffer_16x16(out, coeff); + break; + case FLIPADST_ADST: + load_buffer_16x16(input, in, stride, 1, 0, shift[0]); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], + col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], + col_num); + write_buffer_16x16(out, coeff); + break; + case IDTX: + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); + write_buffer_16x16(out, coeff); + break; + case V_DCT: + 
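// V_* cases: 1-D transform on the columns, identity on the rows. The + // H_* cases below mirror this, applying the identity on the columns and + // the 1-D transform on the rows after the transpose. +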
load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); + write_buffer_16x16(out, coeff); + break; + case H_DCT: + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); + write_buffer_16x16(out, coeff); + break; + case V_ADST: + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], + col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); + write_buffer_16x16(out, coeff); + break; + case H_ADST: + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], + col_num); + write_buffer_16x16(out, coeff); + break; + case V_FLIPADST: + load_buffer_16x16(input, in, stride, 1, 0, shift[0]); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], + col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); + write_buffer_16x16(out, coeff); + break; + case H_FLIPADST: + load_buffer_16x16(input, in, stride, 0, 1, shift[0]); + idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], + col_num); + write_buffer_16x16(out, coeff); + break; + default: assert(0); + } + (void)bd; +} + +static INLINE void flip_buf_sse4_1(__m128i *in, __m128i *out, int size) { + for (int i = 0; i < size; i += 2) in[30 - i] = out[i]; + for (int i = 1; i < size; i += 2) in[size - i] = out[i]; +} + +static const fwd_transform_1d_sse4_1 col_highbd_txfm8x8_arr[TX_TYPES] = { + fdct8x8_sse4_1, // DCT_DCT + fadst8x8_sse4_1, // ADST_DCT + fdct8x8_sse4_1, // DCT_ADST + fadst8x8_sse4_1, // ADST_ADST + fadst8x8_sse4_1, // FLIPADST_DCT + fdct8x8_sse4_1, // DCT_FLIPADST + fadst8x8_sse4_1, // FLIPADST_FLIPADST + fadst8x8_sse4_1, // ADST_FLIPADST + fadst8x8_sse4_1, // FLIPADST_ADST + idtx8x8_sse4_1, // IDTX + fdct8x8_sse4_1, // V_DCT + idtx8x8_sse4_1, // H_DCT + fadst8x8_sse4_1, // V_ADST + idtx8x8_sse4_1, // H_ADST + fadst8x8_sse4_1, // V_FLIPADST + idtx8x8_sse4_1 // H_FLIPADST +}; +#if !CONFIG_REALTIME_ONLY +static const fwd_transform_1d_sse4_1 row_highbd_txfm32x8_arr[TX_TYPES] = { + fdct8x8_sse4_1, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + idtx32x8_sse4_1, // IDTX + NULL, // V_DCT + NULL, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL, // H_FLIPADST +}; +#endif +static const fwd_transform_1d_sse4_1 col_highbd_txfm4x8_arr[TX_TYPES] = { + fdct4x8_sse4_1, // DCT_DCT + fadst8x8_sse4_1, // ADST_DCT + fdct4x8_sse4_1, // DCT_ADST + fadst8x8_sse4_1, // ADST_ADST + fadst8x8_sse4_1, // 
FLIPADST_DCT + fdct4x8_sse4_1, // DCT_FLIPADST + fadst8x8_sse4_1, // FLIPADST_FLIPADST + fadst8x8_sse4_1, // ADST_FLIPADST + fadst8x8_sse4_1, // FLIPADST_ADST + idtx8x8_sse4_1, // IDTX + fdct4x8_sse4_1, // V_DCT + idtx8x8_sse4_1, // H_DCT + fadst8x8_sse4_1, // V_ADST + idtx8x8_sse4_1, // H_ADST + fadst8x8_sse4_1, // V_FLIPADST + idtx8x8_sse4_1 // H_FLIPADST +}; + +static const fwd_transform_1d_sse4_1 row_highbd_txfm8x16_arr[TX_TYPES] = { + fdct16x16_sse4_1, // DCT_DCT + fdct16x16_sse4_1, // ADST_DCT + fadst16x16_sse4_1, // DCT_ADST + fadst16x16_sse4_1, // ADST_ADST + fdct16x16_sse4_1, // FLIPADST_DCT + fadst16x16_sse4_1, // DCT_FLIPADST + fadst16x16_sse4_1, // FLIPADST_FLIPADST + fadst16x16_sse4_1, // ADST_FLIPADST + fadst16x16_sse4_1, // FLIPADST_ADST + idtx16x16_sse4_1, // IDTX + idtx16x16_sse4_1, // V_DCT + fdct16x16_sse4_1, // H_DCT + idtx16x16_sse4_1, // V_ADST + fadst16x16_sse4_1, // H_ADST + idtx16x16_sse4_1, // V_FLIPADST + fadst16x16_sse4_1 // H_FLIPADST +}; + +static const fwd_transform_1d_sse4_1 col_highbd_txfm8x16_arr[TX_TYPES] = { + fdct16x16_sse4_1, // DCT_DCT + fadst16x16_sse4_1, // ADST_DCT + fdct16x16_sse4_1, // DCT_ADST + fadst16x16_sse4_1, // ADST_ADST + fadst16x16_sse4_1, // FLIPADST_DCT + fdct16x16_sse4_1, // DCT_FLIPADST + fadst16x16_sse4_1, // FLIPADST_FLIPADST + fadst16x16_sse4_1, // ADST_FLIPADST + fadst16x16_sse4_1, // FLIPADST_ADST + idtx16x16_sse4_1, // IDTX + fdct16x16_sse4_1, // V_DCT + idtx16x16_sse4_1, // H_DCT + fadst16x16_sse4_1, // V_ADST + idtx16x16_sse4_1, // H_ADST + fadst16x16_sse4_1, // V_FLIPADST + idtx16x16_sse4_1 // H_FLIPADST +}; +static const fwd_transform_1d_sse4_1 row_highbd_txfm8x8_arr[TX_TYPES] = { + fdct8x8_sse4_1, // DCT_DCT + fdct8x8_sse4_1, // ADST_DCT + fadst8x8_sse4_1, // DCT_ADST + fadst8x8_sse4_1, // ADST_ADST + fdct8x8_sse4_1, // FLIPADST_DCT + fadst8x8_sse4_1, // DCT_FLIPADST + fadst8x8_sse4_1, // FLIPADST_FLIPADST + fadst8x8_sse4_1, // ADST_FLIPADST + fadst8x8_sse4_1, // FLIPADST_ADST + idtx8x8_sse4_1, // IDTX + idtx8x8_sse4_1, // V_DCT + fdct8x8_sse4_1, // H_DCT + idtx8x8_sse4_1, // V_ADST + fadst8x8_sse4_1, // H_ADST + idtx8x8_sse4_1, // V_FLIPADST + fadst8x8_sse4_1 // H_FLIPADST +}; + +static const fwd_transform_1d_sse4_1 row_highbd_txfm4x8_arr[TX_TYPES] = { + fdct4x8_sse4_1, // DCT_DCT + fdct4x8_sse4_1, // ADST_DCT + fadst8x8_sse4_1, // DCT_ADST + fadst8x8_sse4_1, // ADST_ADST + fdct4x8_sse4_1, // FLIPADST_DCT + fadst8x8_sse4_1, // DCT_FLIPADST + fadst8x8_sse4_1, // FLIPADST_FLIPADST + fadst8x8_sse4_1, // ADST_FLIPADST + fadst8x8_sse4_1, // FLIPADST_ADST + idtx8x8_sse4_1, // IDTX + idtx8x8_sse4_1, // V_DCT + fdct4x8_sse4_1, // H_DCT + idtx8x8_sse4_1, // V_ADST + fadst8x8_sse4_1, // H_ADST + idtx8x8_sse4_1, // V_FLIPADST + fadst8x8_sse4_1 // H_FLIPADST +}; + +static const fwd_transform_1d_sse4_1 row_highbd_txfm4x4_arr[TX_TYPES] = { + fdct4x4_sse4_1, // DCT_DCT + fdct4x4_sse4_1, // ADST_DCT + fadst4x4_sse4_1, // DCT_ADST + fadst4x4_sse4_1, // ADST_ADST + fdct4x4_sse4_1, // FLIPADST_DCT + fadst4x4_sse4_1, // DCT_FLIPADST + fadst4x4_sse4_1, // FLIPADST_FLIPADST + fadst4x4_sse4_1, // ADST_FLIPADST + fadst4x4_sse4_1, // FLIPADST_ADST + idtx4x4_sse4_1, // IDTX + idtx4x4_sse4_1, // V_DCT + fdct4x4_sse4_1, // H_DCT + idtx4x4_sse4_1, // V_ADST + fadst4x4_sse4_1, // H_ADST + idtx4x4_sse4_1, // V_FLIPADST + fadst4x4_sse4_1 // H_FLIPADST +}; + +static const fwd_transform_1d_sse4_1 col_highbd_txfm4x4_arr[TX_TYPES] = { + fdct4x4_sse4_1, // DCT_DCT + fadst4x4_sse4_1, // ADST_DCT + fdct4x4_sse4_1, // DCT_ADST + fadst4x4_sse4_1, // ADST_ADST + 
fadst4x4_sse4_1, // FLIPADST_DCT + fdct4x4_sse4_1, // DCT_FLIPADST + fadst4x4_sse4_1, // FLIPADST_FLIPADST + fadst4x4_sse4_1, // ADST_FLIPADST + fadst4x4_sse4_1, // FLIPADST_ADST + idtx4x4_sse4_1, // IDTX + fdct4x4_sse4_1, // V_DCT + idtx4x4_sse4_1, // H_DCT + fadst4x4_sse4_1, // V_ADST + idtx4x4_sse4_1, // H_ADST + fadst4x4_sse4_1, // V_FLIPADST + idtx4x4_sse4_1 // H_FLIPADST +}; + +static const fwd_transform_1d_sse4_1 col_highbd_txfm8x32_arr[TX_TYPES] = { + av1_fdct32_sse4_1, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + av1_idtx32_sse4_1, // IDTX + NULL, // V_DCT + NULL, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +static const fwd_transform_1d_sse4_1 row_highbd_txfm8x32_arr[TX_TYPES] = { + fdct16x16_sse4_1, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + idtx16x16_sse4_1, // IDTX + NULL, // V_DCT + NULL, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +void av1_fwd_txfm2d_16x8_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[32], out[32]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8]; + const int txw_idx = get_txw_idx(TX_16X8); + const int txh_idx = get_txh_idx(TX_16X8); + const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x8_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x16_arr[tx_type]; + int bit = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < 2; i++) { + load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]); + col_txfm(in, in, bit, 2); + col_txfm_8x8_rounding(in, -shift[1]); + transpose_8x8(in, out + i * 16); + } + + if (lr_flip) { + flip_buf_sse4_1(in, out, 32); + row_txfm(in, out, bit, 2); + } else { + row_txfm(out, out, bit, 2); + } + + for (int i = 0; i < 2; i++) { + av1_round_shift_rect_array_32_sse4_1(out + i * 16, in, 16, -shift[2], + NewSqrt2); + write_buffer_8x8(in, coeff + i * 64); + } + (void)bd; +} + +void av1_fwd_txfm2d_8x16_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[32], out[32]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16]; + const int txw_idx = get_txw_idx(TX_8X16); + const int txh_idx = get_txh_idx(TX_8X16); + const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x16_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x8_arr[tx_type]; + int bit = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]); + col_txfm(in, in, bit, 2); + col_txfm_8x16_rounding(in, -shift[1]); + transpose_8x8(in, out); + transpose_8x8(in + 16, out + 16); + + for (int i = 0; i < 2; i++) { + row_txfm(out + i * 16, out, bit, 2); + av1_round_shift_rect_array_32_sse4_1(out, out, 16, -shift[2], NewSqrt2); + write_buffer_16x8(out, coeff + i * 8, 16); + } + (void)bd; +} + +#if !CONFIG_REALTIME_ONLY +void av1_fwd_txfm2d_4x16_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[16]; + __m128i *outcoeff128 = (__m128i *)coeff; + const int8_t *shift = 
av1_fwd_txfm_shift_ls[TX_4X16]; + const int txw_idx = get_txw_idx(TX_4X16); + const int txh_idx = get_txh_idx(TX_4X16); + const int txfm_size_col = tx_size_wide[TX_4X16]; + const int txfm_size_row = tx_size_high[TX_4X16]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x16_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x4_arr[tx_type]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + // col transform + load_buffer_4x16(input, in, stride, ud_flip, lr_flip, shift[0]); + col_txfm(in, outcoeff128, bitcol, 1); + col_txfm_8x8_rounding(outcoeff128, -shift[1]); + transpose_8nx8n(outcoeff128, in, txfm_size_col, txfm_size_row); + + // row transform + for (int i = 0; i < 4; i++) { + __m128i tmp[4]; + row_txfm(in + i, tmp, bitrow, txfm_size_row >> 2); + store_output_w4(coeff + i * 4, tmp, txfm_size_row, txfm_size_col); + } + (void)bd; +} +#endif + +void av1_fwd_txfm2d_16x4_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[16]; + __m128i *outcoeff128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X4]; + const int txw_idx = get_txw_idx(TX_16X4); + const int txh_idx = get_txh_idx(TX_16X4); + const int txfm_size_col = tx_size_wide[TX_16X4]; + const int txfm_size_row = tx_size_high[TX_16X4]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm4x4_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x16_arr[tx_type]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // col transform + load_buffer_16x4(input, in, stride, ud_flip, lr_flip, shift[0]); + + for (int i = 0; i < (txfm_size_col >> 2); i++) { + __m128i *cur_in = &in[i * txfm_size_row]; + col_txfm(cur_in, cur_in, bitcol, 1); + transpose_32bit_4x4(cur_in, cur_in); + } + col_txfm_8x8_rounding(in, -shift[1]); + + // row transform + row_txfm(in, outcoeff128, bitrow, 1); + (void)bd; +} + +void av1_fwd_txfm2d_16x32_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[128]; + __m128i *outcoef128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X32]; + const int txw_idx = get_txw_idx(TX_16X32); + const int txh_idx = get_txh_idx(TX_16X32); + const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x32_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x32_arr[tx_type]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + + // column transform + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + load_buffer_16x16(input + 16 * stride, in + 64, stride, 0, 0, shift[0]); + + for (int i = 0; i < 4; i++) { + col_txfm((in + i), (in + i), bitcol, 4); + } + col_txfm_16x16_rounding(&in[0], -shift[1]); + col_txfm_16x16_rounding(&in[64], -shift[1]); + transpose_8nx8n(in, outcoef128, 16, 32); + + // row transform + row_txfm(outcoef128, in, bitrow, 8); + av1_round_shift_rect_array_32_sse4_1(in, outcoef128, 128, -shift[2], + NewSqrt2); + (void)bd; +} + +void av1_fwd_txfm2d_32x64_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + (void)tx_type; + __m128i in[512]; + __m128i *outcoef128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X64]; + const int txw_idx = 
get_txw_idx(TX_32X64); + const int txh_idx = get_txh_idx(TX_32X64); + const int txfm_size_col = tx_size_wide[TX_32X64]; + const int txfm_size_row = tx_size_high[TX_32X64]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int num_row = txfm_size_row >> 2; + const int num_col = txfm_size_col >> 2; + + // column transform + load_buffer_32x8n(input, in, stride, 0, 0, shift[0], txfm_size_row); + for (int i = 0; i < num_col; i++) { + av1_fdct64_sse4_1((in + i), (in + i), bitcol, num_col, num_col); + } + for (int i = 0; i < num_col; i++) { + col_txfm_16x16_rounding((in + i * txfm_size_row), -shift[1]); + } + transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row); + + // row transform + for (int i = 0; i < num_row; i++) { + av1_fdct32_sse4_1((outcoef128 + i), (in + i), bitrow, num_row); + } + for (int i = 0; i < txfm_size_col; i++) { + av1_round_shift_rect_array_32_sse4_1(in + i * 16, outcoef128 + i * 8, 8, + -shift[2], NewSqrt2); + } + (void)bd; +} + +void av1_fwd_txfm2d_64x32_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + (void)tx_type; + __m128i in[512]; + __m128i *outcoef128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_64X32]; + const int txw_idx = get_txw_idx(TX_64X32); + const int txh_idx = get_txh_idx(TX_64X32); + const int txfm_size_col = tx_size_wide[TX_64X32]; + const int txfm_size_row = tx_size_high[TX_64X32]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int num_row = txfm_size_row >> 2; + const int num_col = txfm_size_col >> 2; + + // column transform + for (int i = 0; i < 32; i++) { + load_buffer_4x4(input + 0 + i * stride, in + 0 + i * 16, 4, 0, 0, shift[0]); + load_buffer_4x4(input + 16 + i * stride, in + 4 + i * 16, 4, 0, 0, + shift[0]); + load_buffer_4x4(input + 32 + i * stride, in + 8 + i * 16, 4, 0, 0, + shift[0]); + load_buffer_4x4(input + 48 + i * stride, in + 12 + i * 16, 4, 0, 0, + shift[0]); + } + + for (int i = 0; i < num_col; i++) { + av1_fdct32_sse4_1((in + i), (in + i), bitcol, num_col); + } + + for (int i = 0; i < num_row; i++) { + col_txfm_16x16_rounding((in + i * txfm_size_col), -shift[1]); + } + transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row); + + // row transform + for (int i = 0; i < num_row; i++) { + av1_fdct64_sse4_1((outcoef128 + i), (in + i), bitrow, num_row, num_row); + } + av1_round_shift_rect_array_32_sse4_1(in, outcoef128, 512, -shift[2], + NewSqrt2); + (void)bd; +} + +void av1_fwd_txfm2d_32x16_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[128]; + __m128i *outcoef128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16]; + const int txw_idx = get_txw_idx(TX_32X16); + const int txh_idx = get_txh_idx(TX_32X16); + const fwd_transform_1d_sse4_1 col_txfm = row_highbd_txfm8x32_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = col_highbd_txfm8x32_arr[tx_type]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + + // column transform + load_buffer_32x8n(input, in, stride, 0, 0, shift[0], 16); + col_txfm(in, in, bitcol, 8); + col_txfm_16x16_rounding(&in[0], -shift[1]); + col_txfm_16x16_rounding(&in[64], -shift[1]); + transpose_8nx8n(in, outcoef128, 32, 16); + + // row transform + for (int i = 0; i < 4; i++) { + row_txfm((outcoef128 + i), (in + i), bitrow, 4); + } + av1_round_shift_rect_array_32_sse4_1(in, 
outcoef128, 128, -shift[2], + NewSqrt2); + (void)bd; +} + +#if !CONFIG_REALTIME_ONLY +void av1_fwd_txfm2d_8x32_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[64]; + __m128i *outcoef128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X32]; + const int txw_idx = get_txw_idx(TX_8X32); + const int txh_idx = get_txh_idx(TX_8X32); + const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x32_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm32x8_arr[tx_type]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + + const int txfm_size_col = tx_size_wide[TX_8X32]; + const int txfm_size_row = tx_size_high[TX_8X32]; + const int num_col = txfm_size_col >> 2; + + // column transform + load_buffer_8x16(input, in, stride, 0, 0, shift[0]); + load_buffer_8x16(input + (txfm_size_row >> 1) * stride, in + txfm_size_row, + stride, 0, 0, shift[0]); + + for (int i = 0; i < num_col; i++) { + col_txfm((in + i), (in + i), bitcol, num_col); + } + col_txfm_16x16_rounding(in, -shift[1]); + transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row); + + // row transform + for (int i = 0; i < txfm_size_col; i += 2) { + row_txfm((outcoef128 + i), (outcoef128 + i), bitrow, txfm_size_col); + } + (void)bd; +} + +void av1_fwd_txfm2d_32x8_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[64]; + __m128i *outcoef128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X8]; + const int txw_idx = get_txw_idx(TX_32X8); + const int txh_idx = get_txh_idx(TX_32X8); + const fwd_transform_1d_sse4_1 col_txfm = row_highbd_txfm32x8_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = col_highbd_txfm8x32_arr[tx_type]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + + const int txfm_size_col = tx_size_wide[TX_32X8]; + const int txfm_size_row = tx_size_high[TX_32X8]; + const int num_col = txfm_size_row >> 2; + + // column transform + load_buffer_32x8n(input, in, stride, 0, 0, shift[0], 8); + for (int i = 0; i < txfm_size_row; i += 2) { + col_txfm((in + i), (in + i), bitcol, txfm_size_row); + } + + col_txfm_16x16_rounding(&in[0], -shift[1]); + transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row); + + // row transform + for (int i = 0; i < num_col; i++) { + row_txfm((outcoef128 + i), (outcoef128 + i), bitrow, num_col); + } + (void)bd; +} +#endif + +void av1_fwd_txfm2d_4x8_sse4_1(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + __m128i in[8]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X8]; + const int txw_idx = get_txw_idx(TX_4X8); + const int txh_idx = get_txh_idx(TX_4X8); + const int txfm_size_col = tx_size_wide[TX_4X8]; + const int txfm_size_row = tx_size_high[TX_4X8]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm4x8_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x4_arr[tx_type]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + load_buffer_4x8(input, in, stride, ud_flip, lr_flip, shift[0]); + col_txfm(in, in, bitcol, 1); + col_txfm_4x8_rounding(in, -shift[1]); + + for (int i = 0; i < 2; i++) { + __m128i *cur_in = &in[i * 4]; + transpose_32bit_4x4(cur_in, cur_in); + row_txfm(cur_in, cur_in, bitrow, 1); + 
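// The shift stages can only rescale by powers of two; a 4x8 block also
+ // carries a leftover sqrt(2) normalization factor because log2(4 * 8) is
+ // odd. The call below folds it in as a Q12 fixed-point multiply
+ // (NewSqrt2 = 5793 ~= sqrt(2) * 2^12, rounded back down by
+ // NewSqrt2Bits = 12). Roughly, per coefficient (a sketch):
+ //   out = round_shift(in * NewSqrt2, NewSqrt2Bits), on top of -shift[2].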
+ av1_round_shift_rect_array_32_sse4_1(cur_in, cur_in, txfm_size_col,
+ -shift[2], NewSqrt2);
+ store_output_w4(coeff + i * 4, cur_in, txfm_size_row, 4);
+ }
+ (void)bd;
+}
+
+void av1_fwd_txfm2d_8x4_sse4_1(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ __m128i in[8];
+ __m128i *outcoeff128 = (__m128i *)coeff;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X4];
+ const int txw_idx = get_txw_idx(TX_8X4);
+ const int txh_idx = get_txh_idx(TX_8X4);
+ const int txfm_size_col = tx_size_wide[TX_8X4];
+ const int txfm_size_row = tx_size_high[TX_8X4];
+ int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm4x4_arr[tx_type];
+ const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x8_arr[tx_type];
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ // col transform
+ load_buffer_8x4(input, in, stride, ud_flip, lr_flip, shift[0]);
+ for (int i = 0; i < 2; i++) {
+ __m128i *cur_in = &in[i * txfm_size_row];
+ col_txfm(cur_in, cur_in, bitcol, 1);
+ transpose_32bit_4x4(cur_in, cur_in);
+ }
+ col_txfm_4x8_rounding(in, -shift[1]);
+
+ // row transform
+ row_txfm(in, outcoeff128, bitrow, 1);
+ av1_round_shift_rect_array_32_sse4_1(outcoeff128, outcoeff128, txfm_size_col,
+ -shift[2], NewSqrt2);
+ (void)bd;
+}
+
+#if !CONFIG_REALTIME_ONLY
+void av1_fwd_txfm2d_16x64_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[256];
+ __m128i *outcoeff128 = (__m128i *)coeff;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X64];
+ const int txw_idx = get_txw_idx(TX_16X64);
+ const int txh_idx = get_txh_idx(TX_16X64);
+ const int txfm_size_col = tx_size_wide[TX_16X64];
+ const int txfm_size_row = tx_size_high[TX_16X64];
+ int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ const int num_col = txfm_size_col >> 2;
+ // col transform
+ for (int i = 0; i < txfm_size_row; i += num_col) {
+ load_buffer_4x4(input + (i + 0) * stride, in + (i + 0) * num_col, num_col,
+ ud_flip, lr_flip, shift[0]);
+ load_buffer_4x4(input + (i + 1) * stride, in + (i + 1) * num_col, num_col,
+ ud_flip, lr_flip, shift[0]);
+ load_buffer_4x4(input + (i + 2) * stride, in + (i + 2) * num_col, num_col,
+ ud_flip, lr_flip, shift[0]);
+ load_buffer_4x4(input + (i + 3) * stride, in + (i + 3) * num_col, num_col,
+ ud_flip, lr_flip, shift[0]);
+ }
+
+ for (int i = 0; i < num_col; i++) {
+ av1_fdct64_sse4_1(in + i, outcoeff128 + i, bitcol, num_col, num_col);
+ }
+
+ col_txfm_16x16_rounding(outcoeff128, -shift[1]);
+ col_txfm_16x16_rounding(outcoeff128 + 64, -shift[1]);
+ col_txfm_16x16_rounding(outcoeff128 + 128, -shift[1]);
+ col_txfm_16x16_rounding(outcoeff128 + 192, -shift[1]);
+
+ transpose_8nx8n(outcoeff128, in, txfm_size_col, 32);
+ fdct16x16_sse4_1(in, outcoeff128, bitrow, 8);
+ (void)bd;
+}
+
+void av1_fwd_txfm2d_64x16_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[256];
+ __m128i *outcoeff128 = (__m128i *)coeff;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_64X16];
+ const int txw_idx = get_txw_idx(TX_64X16);
+ const int txh_idx = get_txh_idx(TX_64X16);
+ const int txfm_size_col = tx_size_wide[TX_64X16];
+ const int txfm_size_row = tx_size_high[TX_64X16];
+ int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
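+ // Note: AV1 keeps only the lowest 32 of the 64 row-transform outputs for
+ // 64-point transforms; the memset at the end of this function zeroes the
+ // discarded upper half of the coefficient buffer.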
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ // col transform
+ for (int i = 0; i < txfm_size_row; i++) {
+ load_buffer_4x4(input + 0 + i * stride, in + 0 + i * txfm_size_row, 4,
+ ud_flip, lr_flip, shift[0]);
+ load_buffer_4x4(input + 16 + i * stride, in + 4 + i * txfm_size_row, 4,
+ ud_flip, lr_flip, shift[0]);
+ load_buffer_4x4(input + 32 + i * stride, in + 8 + i * txfm_size_row, 4,
+ ud_flip, lr_flip, shift[0]);
+ load_buffer_4x4(input + 48 + i * stride, in + 12 + i * txfm_size_row, 4,
+ ud_flip, lr_flip, shift[0]);
+ }
+
+ fdct16x16_sse4_1(in, outcoeff128, bitcol, txfm_size_row);
+ col_txfm_16x16_rounding(outcoeff128, -shift[1]);
+ col_txfm_16x16_rounding(outcoeff128 + 64, -shift[1]);
+ col_txfm_16x16_rounding(outcoeff128 + 128, -shift[1]);
+ col_txfm_16x16_rounding(outcoeff128 + 192, -shift[1]);
+
+ transpose_8nx8n(outcoeff128, in, txfm_size_col, txfm_size_row);
+ for (int i = 0; i < 4; i++) {
+ av1_fdct64_sse4_1(in + i, outcoeff128 + i, bitrow, 4, 4);
+ }
+ memset(coeff + txfm_size_row * 32, 0, txfm_size_row * 32 * sizeof(*coeff));
+ (void)bd;
+}
+#endif
diff --git a/third_party/aom/av1/encoder/x86/highbd_temporal_filter_avx2.c b/third_party/aom/av1/encoder/x86/highbd_temporal_filter_avx2.c
new file mode 100644
index 0000000000..ca448ca37b
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/highbd_temporal_filter_avx2.c
@@ -0,0 +1,466 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+#include "aom_dsp/mathutils.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+
+#define SSE_STRIDE (BW + 4)
+
+DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask[4][8]) = {
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0, 0 },
+ { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0 },
+ { 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0 },
+ { 0, 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }
+};
+
+static AOM_FORCE_INLINE void get_squared_error_16x16_avx2(
+ const uint16_t *frame1, const unsigned int stride, const uint16_t *frame2,
+ const unsigned int stride2, const int block_width, const int block_height,
+ uint32_t *frame_sse, const unsigned int sse_stride) {
+ (void)block_width;
+ const uint16_t *src1 = frame1;
+ const uint16_t *src2 = frame2;
+ uint32_t *dst = frame_sse + 2;
+ for (int i = 0; i < block_height; i++) {
+ __m256i v_src1 = _mm256_loadu_si256((__m256i *)src1);
+ __m256i v_src2 = _mm256_loadu_si256((__m256i *)src2);
+ __m256i v_diff = _mm256_sub_epi16(v_src1, v_src2);
+ __m256i v_mullo = _mm256_mullo_epi16(v_diff, v_diff);
+ __m256i v_mulhi = _mm256_mulhi_epi16(v_diff, v_diff);
+
+ __m256i v_lo = _mm256_unpacklo_epi16(v_mullo, v_mulhi);
+ __m256i v_hi = _mm256_unpackhi_epi16(v_mullo, v_mulhi);
+ __m256i diff_lo =
+ _mm256_inserti128_si256(v_lo, _mm256_extracti128_si256(v_hi, 0), 1);
+ __m256i diff_hi =
+ _mm256_inserti128_si256(v_hi, _mm256_extracti128_si256(v_lo, 1), 0);
+
+ _mm256_storeu_si256((__m256i *)dst, diff_lo);
+ dst += 8;
+ _mm256_storeu_si256((__m256i *)dst, diff_hi);
+
+ src1 += stride, src2 += stride2;
+ dst += sse_stride - 8;
+ }
+}
+
+static AOM_FORCE_INLINE void get_squared_error_32x32_avx2(
+ const uint16_t *frame1, const unsigned int stride, const uint16_t *frame2,
+ const unsigned int stride2, const int block_width, const int block_height,
+ uint32_t *frame_sse, const unsigned int sse_stride) {
+ (void)block_width;
+ const uint16_t *src1 = frame1;
+ const uint16_t *src2 = frame2;
+ uint32_t *dst = frame_sse + 2;
+ for (int i = 0; i < block_height; i++) {
+ __m256i v_src1 = _mm256_loadu_si256((__m256i *)src1);
+ __m256i v_src2 = _mm256_loadu_si256((__m256i *)src2);
+ __m256i v_diff = _mm256_sub_epi16(v_src1, v_src2);
+ __m256i v_mullo = _mm256_mullo_epi16(v_diff, v_diff);
+ __m256i v_mulhi = _mm256_mulhi_epi16(v_diff, v_diff);
+
+ __m256i v_lo = _mm256_unpacklo_epi16(v_mullo, v_mulhi);
+ __m256i v_hi = _mm256_unpackhi_epi16(v_mullo, v_mulhi);
+ __m256i diff_lo =
+ _mm256_inserti128_si256(v_lo, _mm256_extracti128_si256(v_hi, 0), 1);
+ __m256i diff_hi =
+ _mm256_inserti128_si256(v_hi, _mm256_extracti128_si256(v_lo, 1), 0);
+
+ _mm256_storeu_si256((__m256i *)dst, diff_lo);
+ _mm256_storeu_si256((__m256i *)(dst + 8), diff_hi);
+
+ v_src1 = _mm256_loadu_si256((__m256i *)(src1 + 16));
+ v_src2 = _mm256_loadu_si256((__m256i *)(src2 + 16));
+ v_diff = _mm256_sub_epi16(v_src1, v_src2);
+ v_mullo = _mm256_mullo_epi16(v_diff, v_diff);
+ v_mulhi = _mm256_mulhi_epi16(v_diff, v_diff);
+
+ v_lo = _mm256_unpacklo_epi16(v_mullo, v_mulhi);
+ v_hi = _mm256_unpackhi_epi16(v_mullo, v_mulhi);
+ diff_lo =
+ _mm256_inserti128_si256(v_lo, _mm256_extracti128_si256(v_hi, 0), 1);
+ diff_hi =
+ _mm256_inserti128_si256(v_hi, _mm256_extracti128_si256(v_lo, 1), 0);
+
+ _mm256_storeu_si256((__m256i *)(dst + 16), diff_lo);
+ _mm256_storeu_si256((__m256i *)(dst + 24), diff_hi);
+
+ src1 += stride;
+ src2 += stride2;
+
dst += sse_stride; + } +} + +static AOM_FORCE_INLINE void xx_load_and_pad_left(uint32_t *src, + __m256i *v256tmp) { + *v256tmp = _mm256_loadu_si256((__m256i *)src); + // For the first column, replicate the first element twice to the left + __m256i v256tmp1 = _mm256_shuffle_epi32(*v256tmp, 0xEA); + *v256tmp = _mm256_inserti128_si256(*v256tmp, + _mm256_extracti128_si256(v256tmp1, 0), 0); +} + +static AOM_FORCE_INLINE void xx_load_and_pad_right(uint32_t *src, + __m256i *v256tmp) { + *v256tmp = _mm256_loadu_si256((__m256i *)src); + // For the last column, replicate the last element twice to the right + __m256i v256tmp1 = _mm256_shuffle_epi32(*v256tmp, 0x54); + *v256tmp = _mm256_inserti128_si256(*v256tmp, + _mm256_extracti128_si256(v256tmp1, 1), 1); +} + +static AOM_FORCE_INLINE int32_t xx_mask_and_hadd(__m256i vsum, int i) { + // Mask the required 5 values inside the vector + __m256i vtmp = _mm256_and_si256(vsum, *(__m256i *)sse_bytemask[i]); + __m128i v128a, v128b; + // Extract 256b as two 128b registers A and B + v128a = _mm256_castsi256_si128(vtmp); + v128b = _mm256_extracti128_si256(vtmp, 1); + // A = [A0+B0, A1+B1, A2+B2, A3+B3] + v128a = _mm_add_epi32(v128a, v128b); + // B = [A2+B2, A3+B3, 0, 0] + v128b = _mm_srli_si128(v128a, 8); + // A = [A0+B0+A2+B2, A1+B1+A3+B3, X, X] + v128a = _mm_add_epi32(v128a, v128b); + // B = [A1+B1+A3+B3, 0, 0, 0] + v128b = _mm_srli_si128(v128a, 4); + // A = [A0+B0+A2+B2+A1+B1+A3+B3, X, X, X] + v128a = _mm_add_epi32(v128a, v128b); + return _mm_extract_epi32(v128a, 0); +} + +static void highbd_apply_temporal_filter( + const uint16_t *frame1, const unsigned int stride, const uint16_t *frame2, + const unsigned int stride2, const int block_width, const int block_height, + const int *subblock_mses, unsigned int *accumulator, uint16_t *count, + uint32_t *frame_sse, uint32_t *luma_sse_sum, int bd, + const double inv_num_ref_pixels, const double decay_factor, + const double inv_factor, const double weight_factor, double *d_factor, + int tf_wgt_calc_lvl) { + assert(((block_width == 16) || (block_width == 32)) && + ((block_height == 16) || (block_height == 32))); + + uint32_t acc_5x5_sse[BH][BW]; + + if (block_width == 32) { + get_squared_error_32x32_avx2(frame1, stride, frame2, stride2, block_width, + block_height, frame_sse, SSE_STRIDE); + } else { + get_squared_error_16x16_avx2(frame1, stride, frame2, stride2, block_width, + block_height, frame_sse, SSE_STRIDE); + } + + __m256i vsrc[5]; + + // Traverse 4 columns at a time + // First and last columns will require padding + int col; + uint32_t *src = frame_sse; + for (int i = 2; i < 5; i++) { + xx_load_and_pad_left(src, &vsrc[i]); + src += SSE_STRIDE; + } + + // Copy first row to first 2 vectors + vsrc[0] = vsrc[2]; + vsrc[1] = vsrc[2]; + + for (int row = 0; row < block_height - 3; row++) { + __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]); + __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]); + __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2); + __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]); + + for (int i = 0; i < 4; i++) { + vsrc[i] = vsrc[i + 1]; + } + + xx_load_and_pad_left(src, &vsrc[4]); + src += SSE_STRIDE; + + acc_5x5_sse[row][0] = xx_mask_and_hadd(vsum, 0); + acc_5x5_sse[row][1] = xx_mask_and_hadd(vsum, 1); + acc_5x5_sse[row][2] = xx_mask_and_hadd(vsum, 2); + acc_5x5_sse[row][3] = xx_mask_and_hadd(vsum, 3); + } + for (int row = block_height - 3; row < block_height; row++) { + __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]); + __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]); + __m256i vsum3 = 
_mm256_add_epi32(vsum1, vsum2); + __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]); + + for (int i = 0; i < 4; i++) { + vsrc[i] = vsrc[i + 1]; + } + + acc_5x5_sse[row][0] = xx_mask_and_hadd(vsum, 0); + acc_5x5_sse[row][1] = xx_mask_and_hadd(vsum, 1); + acc_5x5_sse[row][2] = xx_mask_and_hadd(vsum, 2); + acc_5x5_sse[row][3] = xx_mask_and_hadd(vsum, 3); + } + for (col = 4; col < block_width - 4; col += 4) { + src = frame_sse + col; + + // Load and pad(for first and last col) 3 rows from the top + for (int i = 2; i < 5; i++) { + vsrc[i] = _mm256_loadu_si256((__m256i *)src); + src += SSE_STRIDE; + } + + // Copy first row to first 2 vectors + vsrc[0] = vsrc[2]; + vsrc[1] = vsrc[2]; + + for (int row = 0; row < block_height - 3; row++) { + __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]); + __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]); + __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2); + __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]); + + for (int i = 0; i < 4; i++) { + vsrc[i] = vsrc[i + 1]; + } + + vsrc[4] = _mm256_loadu_si256((__m256i *)src); + + src += SSE_STRIDE; + + acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum, 0); + acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum, 1); + acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum, 2); + acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum, 3); + } + for (int row = block_height - 3; row < block_height; row++) { + __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]); + __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]); + __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2); + __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]); + + for (int i = 0; i < 4; i++) { + vsrc[i] = vsrc[i + 1]; + } + + acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum, 0); + acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum, 1); + acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum, 2); + acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum, 3); + } + } + + src = frame_sse + col; + + // Load and pad(for first and last col) 3 rows from the top + for (int i = 2; i < 5; i++) { + xx_load_and_pad_right(src, &vsrc[i]); + src += SSE_STRIDE; + } + + // Copy first row to first 2 vectors + vsrc[0] = vsrc[2]; + vsrc[1] = vsrc[2]; + + for (int row = 0; row < block_height - 3; row++) { + __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]); + __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]); + __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2); + __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]); + + for (int i = 0; i < 4; i++) { + vsrc[i] = vsrc[i + 1]; + } + + xx_load_and_pad_right(src, &vsrc[4]); + src += SSE_STRIDE; + + acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum, 0); + acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum, 1); + acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum, 2); + acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum, 3); + } + for (int row = block_height - 3; row < block_height; row++) { + __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]); + __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]); + __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2); + __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]); + + for (int i = 0; i < 4; i++) { + vsrc[i] = vsrc[i + 1]; + } + + acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum, 0); + acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum, 1); + acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum, 2); + acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum, 3); + } + + double subblock_mses_scaled[4]; + double d_factor_decayed[4]; + for (int idx = 0; idx < 4; idx++) { + subblock_mses_scaled[idx] = subblock_mses[idx] * inv_factor; + d_factor_decayed[idx] = d_factor[idx] * 
decay_factor;
+ }
+ if (tf_wgt_calc_lvl == 0) {
+ for (int i = 0, k = 0; i < block_height; i++) {
+ const int y_blk_raster_offset = (i >= block_height / 2) * 2;
+ for (int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame2[i * stride2 + j];
+ uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
+
+ // Scale down the difference for high bit depth input.
+ diff_sse >>= ((bd - 8) * 2);
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
+
+ const double combined_error =
+ weight_factor * window_error + subblock_mses_scaled[subblock_idx];
+
+ double scaled_error = combined_error * d_factor_decayed[subblock_idx];
+ scaled_error = AOMMIN(scaled_error, 7);
+ const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+
+ count[k] += weight;
+ accumulator[k] += weight * pixel_value;
+ }
+ }
+ } else {
+ for (int i = 0, k = 0; i < block_height; i++) {
+ const int y_blk_raster_offset = (i >= block_height / 2) * 2;
+ for (int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame2[i * stride2 + j];
+ uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
+
+ // Scale down the difference for high bit depth input.
+ diff_sse >>= ((bd - 8) * 2);
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
+
+ const double combined_error =
+ weight_factor * window_error + subblock_mses_scaled[subblock_idx];
+
+ double scaled_error = combined_error * d_factor_decayed[subblock_idx];
+ scaled_error = AOMMIN(scaled_error, 7);
+ const float fweight =
+ approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE;
+ const int weight = iroundpf(fweight);
+
+ count[k] += weight;
+ accumulator[k] += weight * pixel_value;
+ }
+ }
+ }
+}
+
+void av1_highbd_apply_temporal_filter_avx2(
+ const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+ const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+ const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+ const int *subblock_mses, const int q_factor, const int filter_strength,
+ int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
+ uint16_t *count) {
+ const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
+ assert(block_size == BLOCK_32X32 && "Only support 32x32 block with avx2!");
+ assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with avx2!");
+ assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+ (void)is_high_bitdepth;
+
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ const int frame_height = frame_to_filter->y_crop_height;
+ const int frame_width = frame_to_filter->y_crop_width;
+ const int min_frame_size = AOMMIN(frame_height, frame_width);
+ // Variables to simplify combined error calculation.
+ const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+ TF_SEARCH_ERROR_NORM_WEIGHT);
+ const double weight_factor =
+ (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+ // Adjust filtering based on q.
+ // Larger q -> stronger filtering -> larger weight.
+ // Smaller q -> weaker filtering -> smaller weight.
+ double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+ q_decay = CLIP(q_decay, 1e-5, 1);
+ if (q_factor >= TF_QINDEX_CUTOFF) {
+ // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+ // We do not need a clip here.
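+ // For example, the maximum q_factor of 255 gives
+ // q_decay = 0.5 * (255 / 64)^2 ~= 7.9, just under that bound.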
+ q_decay = 0.5 * pow((double)q_factor / 64, 2); + } + // Smaller strength -> smaller filtering weight. + double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2); + s_decay = CLIP(s_decay, 1e-5, 1); + double d_factor[4] = { 0 }; + uint32_t frame_sse[SSE_STRIDE * BH] = { 0 }; + uint32_t luma_sse_sum[BW * BH] = { 0 }; + uint16_t *pred1 = CONVERT_TO_SHORTPTR(pred); + + for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) { + // Larger motion vector -> smaller filtering weight. + const MV mv = subblock_mvs[subblock_idx]; + const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2)); + double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD; + distance_threshold = AOMMAX(distance_threshold, 1); + d_factor[subblock_idx] = distance / distance_threshold; + d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1); + } + + // Handle planes in sequence. + int plane_offset = 0; + for (int plane = 0; plane < num_planes; ++plane) { + const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y; + const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x; + const uint32_t frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1]; + const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w; + + const uint16_t *ref = + CONVERT_TO_SHORTPTR(frame_to_filter->buffers[plane]) + frame_offset; + const int ss_x_shift = + mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x; + const int ss_y_shift = + mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y; + const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH + + ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0); + const double inv_num_ref_pixels = 1.0 / num_ref_pixels; + // Larger noise -> larger filtering weight. + const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0); + // Decay factors for non-local mean approach. + const double decay_factor = 1 / (n_decay * q_decay * s_decay); + + // Filter U-plane and V-plane using Y-plane. This is because motion + // search is only done on Y-plane, so the information from Y-plane + // will be more accurate. The luma sse sum is reused in both chroma + // planes. + if (plane == AOM_PLANE_U) { + for (unsigned int i = 0, k = 0; i < plane_h; i++) { + for (unsigned int j = 0; j < plane_w; j++, k++) { + for (int ii = 0; ii < (1 << ss_y_shift); ++ii) { + for (int jj = 0; jj < (1 << ss_x_shift); ++jj) { + const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane. + const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane. + luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx + 2]; + } + } + } + } + } + + highbd_apply_temporal_filter( + ref, frame_stride, pred1 + plane_offset, plane_w, plane_w, plane_h, + subblock_mses, accum + plane_offset, count + plane_offset, frame_sse, + luma_sse_sum, mbd->bd, inv_num_ref_pixels, decay_factor, inv_factor, + weight_factor, d_factor, tf_wgt_calc_lvl); + plane_offset += plane_h * plane_w; + } +} diff --git a/third_party/aom/av1/encoder/x86/highbd_temporal_filter_sse2.c b/third_party/aom/av1/encoder/x86/highbd_temporal_filter_sse2.c new file mode 100644 index 0000000000..2032847083 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/highbd_temporal_filter_sse2.c @@ -0,0 +1,341 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include "config/av1_rtcd.h"
+#include "aom_dsp/mathutils.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+
+// For the squared error buffer, keep a padding of 4 samples.
+#define SSE_STRIDE (BW + 4)
+
+DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask_2x4[4][2][4]) = {
+ { { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 } },
+ { { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 } },
+ { { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 } },
+ { { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF },
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF } }
+};
+
+static void get_squared_error(const uint16_t *frame1, const unsigned int stride,
+ const uint16_t *frame2,
+ const unsigned int stride2, const int block_width,
+ const int block_height, uint32_t *frame_sse,
+ const unsigned int dst_stride) {
+ const uint16_t *src1 = frame1;
+ const uint16_t *src2 = frame2;
+ uint32_t *dst = frame_sse;
+
+ for (int i = 0; i < block_height; i++) {
+ for (int j = 0; j < block_width; j += 8) {
+ __m128i vsrc1 = _mm_loadu_si128((__m128i *)(src1 + j));
+ __m128i vsrc2 = _mm_loadu_si128((__m128i *)(src2 + j));
+
+ __m128i vdiff = _mm_sub_epi16(vsrc1, vsrc2);
+ __m128i vmullo = _mm_mullo_epi16(vdiff, vdiff);
+ __m128i vmullh = _mm_mulhi_epi16(vdiff, vdiff);
+
+ __m128i vres1 = _mm_unpacklo_epi16(vmullo, vmullh);
+ __m128i vres2 = _mm_unpackhi_epi16(vmullo, vmullh);
+
+ _mm_storeu_si128((__m128i *)(dst + j + 2), vres1);
+ _mm_storeu_si128((__m128i *)(dst + j + 6), vres2);
+ }
+
+ src1 += stride;
+ src2 += stride2;
+ dst += dst_stride;
+ }
+}
+
+static void xx_load_and_pad(uint32_t *src, __m128i *dstvec, int col,
+ int block_width) {
+ __m128i vtmp1 = _mm_loadu_si128((__m128i *)src);
+ __m128i vtmp2 = _mm_loadu_si128((__m128i *)(src + 4));
+ // For the first column, replicate the first element twice to the left
+ dstvec[0] = (col) ? vtmp1 : _mm_shuffle_epi32(vtmp1, 0xEA);
+ // For the last column, replicate the last element twice to the right
+ dstvec[1] = (col < block_width - 4) ?
vtmp2 : _mm_shuffle_epi32(vtmp2, 0x54); +} + +static int32_t xx_mask_and_hadd(__m128i vsum1, __m128i vsum2, int i) { + __m128i veca, vecb; + // Mask and obtain the required 5 values inside the vector + veca = _mm_and_si128(vsum1, *(__m128i *)sse_bytemask_2x4[i][0]); + vecb = _mm_and_si128(vsum2, *(__m128i *)sse_bytemask_2x4[i][1]); + // A = [A0+B0, A1+B1, A2+B2, A3+B3] + veca = _mm_add_epi32(veca, vecb); + // B = [A2+B2, A3+B3, 0, 0] + vecb = _mm_srli_si128(veca, 8); + // A = [A0+B0+A2+B2, A1+B1+A3+B3, X, X] + veca = _mm_add_epi32(veca, vecb); + // B = [A1+B1+A3+B3, 0, 0, 0] + vecb = _mm_srli_si128(veca, 4); + // A = [A0+B0+A2+B2+A1+B1+A3+B3, X, X, X] + veca = _mm_add_epi32(veca, vecb); + return _mm_cvtsi128_si32(veca); +} + +static void highbd_apply_temporal_filter( + const uint16_t *frame1, const unsigned int stride, const uint16_t *frame2, + const unsigned int stride2, const int block_width, const int block_height, + const int *subblock_mses, unsigned int *accumulator, uint16_t *count, + uint32_t *frame_sse, uint32_t *luma_sse_sum, int bd, + const double inv_num_ref_pixels, const double decay_factor, + const double inv_factor, const double weight_factor, double *d_factor, + int tf_wgt_calc_lvl) { + assert(((block_width == 16) || (block_width == 32)) && + ((block_height == 16) || (block_height == 32))); + + uint32_t acc_5x5_sse[BH][BW]; + + get_squared_error(frame1, stride, frame2, stride2, block_width, block_height, + frame_sse, SSE_STRIDE); + + __m128i vsrc[5][2]; + + // Traverse 4 columns at a time + // First and last columns will require padding + for (int col = 0; col < block_width; col += 4) { + uint32_t *src = frame_sse + col; + + // Load and pad(for first and last col) 3 rows from the top + for (int i = 2; i < 5; i++) { + xx_load_and_pad(src, vsrc[i], col, block_width); + src += SSE_STRIDE; + } + + // Padding for top 2 rows + vsrc[0][0] = vsrc[2][0]; + vsrc[0][1] = vsrc[2][1]; + vsrc[1][0] = vsrc[2][0]; + vsrc[1][1] = vsrc[2][1]; + + for (int row = 0; row < block_height - 3; row++) { + __m128i vsum11 = _mm_add_epi32(vsrc[0][0], vsrc[1][0]); + __m128i vsum12 = _mm_add_epi32(vsrc[2][0], vsrc[3][0]); + __m128i vsum13 = _mm_add_epi32(vsum11, vsum12); + __m128i vsum1 = _mm_add_epi32(vsum13, vsrc[4][0]); + + __m128i vsum21 = _mm_add_epi32(vsrc[0][1], vsrc[1][1]); + __m128i vsum22 = _mm_add_epi32(vsrc[2][1], vsrc[3][1]); + __m128i vsum23 = _mm_add_epi32(vsum21, vsum22); + __m128i vsum2 = _mm_add_epi32(vsum23, vsrc[4][1]); + + vsrc[0][0] = vsrc[1][0]; + vsrc[0][1] = vsrc[1][1]; + vsrc[1][0] = vsrc[2][0]; + vsrc[1][1] = vsrc[2][1]; + vsrc[2][0] = vsrc[3][0]; + vsrc[2][1] = vsrc[3][1]; + vsrc[3][0] = vsrc[4][0]; + vsrc[3][1] = vsrc[4][1]; + + // Load next row + xx_load_and_pad(src, vsrc[4], col, block_width); + src += SSE_STRIDE; + + acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum1, vsum2, 0); + acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum1, vsum2, 1); + acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum1, vsum2, 2); + acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum1, vsum2, 3); + } + for (int row = block_height - 3; row < block_height; row++) { + __m128i vsum11 = _mm_add_epi32(vsrc[0][0], vsrc[1][0]); + __m128i vsum12 = _mm_add_epi32(vsrc[2][0], vsrc[3][0]); + __m128i vsum13 = _mm_add_epi32(vsum11, vsum12); + __m128i vsum1 = _mm_add_epi32(vsum13, vsrc[4][0]); + + __m128i vsum21 = _mm_add_epi32(vsrc[0][1], vsrc[1][1]); + __m128i vsum22 = _mm_add_epi32(vsrc[2][1], vsrc[3][1]); + __m128i vsum23 = _mm_add_epi32(vsum21, vsum22); + __m128i vsum2 = _mm_add_epi32(vsum23, vsrc[4][1]); + + 
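// vsum1/vsum2 now hold the vertical sum of the five rows in the window;
+ // xx_mask_and_hadd() below folds in the five horizontal neighbours,
+ // completing the 5x5 (TF_WINDOW_LENGTH) window SSE for four pixels. Near
+ // the bottom edge no new row is loaded, so the last row is repeated:
+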
vsrc[0][0] = vsrc[1][0]; + vsrc[0][1] = vsrc[1][1]; + vsrc[1][0] = vsrc[2][0]; + vsrc[1][1] = vsrc[2][1]; + vsrc[2][0] = vsrc[3][0]; + vsrc[2][1] = vsrc[3][1]; + vsrc[3][0] = vsrc[4][0]; + vsrc[3][1] = vsrc[4][1]; + + acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum1, vsum2, 0); + acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum1, vsum2, 1); + acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum1, vsum2, 2); + acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum1, vsum2, 3); + } + } + + double subblock_mses_scaled[4]; + double d_factor_decayed[4]; + for (int idx = 0; idx < 4; idx++) { + subblock_mses_scaled[idx] = subblock_mses[idx] * inv_factor; + d_factor_decayed[idx] = d_factor[idx] * decay_factor; + } + if (tf_wgt_calc_lvl == 0) { + for (int i = 0, k = 0; i < block_height; i++) { + const int y_blk_raster_offset = (i >= block_height / 2) * 2; + for (int j = 0; j < block_width; j++, k++) { + const int pixel_value = frame2[i * stride2 + j]; + uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j]; + + // Scale down the difference for high bit depth input. + diff_sse >>= ((bd - 8) * 2); + + const double window_error = diff_sse * inv_num_ref_pixels; + const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2); + + const double combined_error = + weight_factor * window_error + subblock_mses_scaled[subblock_idx]; + + double scaled_error = combined_error * d_factor_decayed[subblock_idx]; + scaled_error = AOMMIN(scaled_error, 7); + const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE); + + count[k] += weight; + accumulator[k] += weight * pixel_value; + } + } + } else { + for (int i = 0, k = 0; i < block_height; i++) { + const int y_blk_raster_offset = (i >= block_height / 2) * 2; + for (int j = 0; j < block_width; j++, k++) { + const int pixel_value = frame2[i * stride2 + j]; + uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j]; + + // Scale down the difference for high bit depth input. 
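+ // Squared errors scale by 2^(2 * (bd - 8)) relative to 8-bit input; for
+ // example, 10-bit input is shifted right by 4, dividing the SSE by 16.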
+ diff_sse >>= ((bd - 8) * 2); + + const double window_error = diff_sse * inv_num_ref_pixels; + const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2); + + const double combined_error = + weight_factor * window_error + subblock_mses_scaled[subblock_idx]; + + double scaled_error = combined_error * d_factor_decayed[subblock_idx]; + scaled_error = AOMMIN(scaled_error, 7); + const float fweight = + approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE; + const int weight = iroundpf(fweight); + + count[k] += weight; + accumulator[k] += weight * pixel_value; + } + } + } +} + +void av1_highbd_apply_temporal_filter_sse2( + const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd, + const BLOCK_SIZE block_size, const int mb_row, const int mb_col, + const int num_planes, const double *noise_levels, const MV *subblock_mvs, + const int *subblock_mses, const int q_factor, const int filter_strength, + int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, + uint16_t *count) { + const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH; + assert(block_size == BLOCK_32X32 && "Only support 32x32 block with sse2!"); + assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with sse2!"); + assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE); + (void)is_high_bitdepth; + + const int mb_height = block_size_high[block_size]; + const int mb_width = block_size_wide[block_size]; + const int frame_height = frame_to_filter->y_crop_height; + const int frame_width = frame_to_filter->y_crop_width; + const int min_frame_size = AOMMIN(frame_height, frame_width); + // Variables to simplify combined error calculation. + const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) * + TF_SEARCH_ERROR_NORM_WEIGHT); + const double weight_factor = + (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor; + // Adjust filtering based on q. + // Larger q -> stronger filtering -> larger weight. + // Smaller q -> weaker filtering -> smaller weight. + double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2); + q_decay = CLIP(q_decay, 1e-5, 1); + if (q_factor >= TF_QINDEX_CUTOFF) { + // Max q_factor is 255, therefore the upper bound of q_decay is 8. + // We do not need a clip here. + q_decay = 0.5 * pow((double)q_factor / 64, 2); + } + // Smaller strength -> smaller filtering weight. + double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2); + s_decay = CLIP(s_decay, 1e-5, 1); + double d_factor[4] = { 0 }; + uint32_t frame_sse[SSE_STRIDE * BH] = { 0 }; + uint32_t luma_sse_sum[BW * BH] = { 0 }; + uint16_t *pred1 = CONVERT_TO_SHORTPTR(pred); + + for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) { + // Larger motion vector -> smaller filtering weight. + const MV mv = subblock_mvs[subblock_idx]; + const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2)); + double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD; + distance_threshold = AOMMAX(distance_threshold, 1); + d_factor[subblock_idx] = distance / distance_threshold; + d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1); + } + + // Handle planes in sequence. + int plane_offset = 0; + for (int plane = 0; plane < num_planes; ++plane) { + const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y; + const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x; + const uint32_t frame_stride = frame_to_filter->strides[plane == 0 ? 
0 : 1];
+ const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+
+ const uint16_t *ref =
+ CONVERT_TO_SHORTPTR(frame_to_filter->buffers[plane]) + frame_offset;
+ const int ss_x_shift =
+ mbd->plane[plane].subsampling_x - mbd->plane[0].subsampling_x;
+ const int ss_y_shift =
+ mbd->plane[plane].subsampling_y - mbd->plane[0].subsampling_y;
+ const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+ ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
+ const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+ // Larger noise -> larger filtering weight.
+ const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+ // Decay factors for non-local mean approach.
+ const double decay_factor = 1 / (n_decay * q_decay * s_decay);
+
+ // Filter U-plane and V-plane using Y-plane. This is because motion
+ // search is only done on Y-plane, so the information from Y-plane
+ // will be more accurate. The luma sse sum is reused in both chroma
+ // planes.
+ if (plane == AOM_PLANE_U) {
+ for (unsigned int i = 0, k = 0; i < plane_h; i++) {
+ for (unsigned int j = 0; j < plane_w; j++, k++) {
+ for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+ for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+ const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane.
+ const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane.
+ luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx + 2];
+ }
+ }
+ }
+ }
+ }
+
+ highbd_apply_temporal_filter(
+ ref, frame_stride, pred1 + plane_offset, plane_w, plane_w, plane_h,
+ subblock_mses, accum + plane_offset, count + plane_offset, frame_sse,
+ luma_sse_sum, mbd->bd, inv_num_ref_pixels, decay_factor, inv_factor,
+ weight_factor, d_factor, tf_wgt_calc_lvl);
+ plane_offset += plane_h * plane_w;
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/ml_avx2.c b/third_party/aom/av1/encoder/x86/ml_avx2.c
new file mode 100644
index 0000000000..6432708416
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/ml_avx2.c
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+#include <stdbool.h>
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/ml.h"
+#include "av1/encoder/x86/ml_sse3.h"
+
+#define CALC_OUTPUT_FOR_2ROWS \
+ const int index = weight_idx + (2 * i * tot_num_inputs); \
+ const __m256 weight0 = _mm256_loadu_ps(&weights[index]); \
+ const __m256 weight1 = _mm256_loadu_ps(&weights[index + tot_num_inputs]); \
+ const __m256 mul0 = _mm256_mul_ps(inputs256, weight0); \
+ const __m256 mul1 = _mm256_mul_ps(inputs256, weight1); \
+ hadd[i] = _mm256_hadd_ps(mul0, mul1);
+
+static INLINE void nn_propagate_8to1(
+ const float *const inputs, const float *const weights,
+ const float *const bias, int num_inputs_to_process, int tot_num_inputs,
+ int num_outputs, float *const output_nodes, int is_clip_required) {
+ // Process one output row at a time.
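+ // A scalar sketch of what each output computes (same names as below):
+ //   float acc = bias[out];
+ //   for (int in = 0; in < num_inputs_to_process; in++)
+ //     acc += inputs[in] * weights[out * tot_num_inputs + in];
+ //   output_nodes[out] = is_clip_required ? AOMMAX(acc, 0) : acc;
+ // The AVX2 loop below accumulates eight products per step and reduces the
+ // eight lanes to a single scalar at the end.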
+ for (int out = 0; out < num_outputs; out++) {
+ __m256 in_result = _mm256_setzero_ps();
+ float bias_val = bias[out];
+ for (int in = 0; in < num_inputs_to_process; in += 8) {
+ const __m256 inputs256 = _mm256_loadu_ps(&inputs[in]);
+ const int weight_idx = in + (out * tot_num_inputs);
+ const __m256 weight0 = _mm256_loadu_ps(&weights[weight_idx]);
+ const __m256 mul0 = _mm256_mul_ps(inputs256, weight0);
+ in_result = _mm256_add_ps(in_result, mul0);
+ }
+ const __m128 low_128 = _mm256_castps256_ps128(in_result);
+ const __m128 high_128 = _mm256_extractf128_ps(in_result, 1);
+ const __m128 sum_par_0 = _mm_add_ps(low_128, high_128);
+ const __m128 sum_par_1 = _mm_hadd_ps(sum_par_0, sum_par_0);
+ const __m128 sum_tot =
+ _mm_add_ps(_mm_shuffle_ps(sum_par_1, sum_par_1, 0x99), sum_par_1);
+
+ bias_val += _mm_cvtss_f32(sum_tot);
+ if (is_clip_required) bias_val = AOMMAX(bias_val, 0);
+ output_nodes[out] = bias_val;
+ }
+}
+
+static INLINE void nn_propagate_8to4(
+ const float *const inputs, const float *const weights,
+ const float *const bias, int num_inputs_to_process, int tot_num_inputs,
+ int num_outputs, float *const output_nodes, int is_clip_required) {
+ __m256 hadd[2];
+ for (int out = 0; out < num_outputs; out += 4) {
+ __m128 bias_reg = _mm_loadu_ps(&bias[out]);
+ __m128 in_result = _mm_setzero_ps();
+ for (int in = 0; in < num_inputs_to_process; in += 8) {
+ const __m256 inputs256 = _mm256_loadu_ps(&inputs[in]);
+ const int weight_idx = in + (out * tot_num_inputs);
+ // Process two output rows at a time.
+ for (int i = 0; i < 2; i++) {
+ CALC_OUTPUT_FOR_2ROWS
+ }
+
+ const __m256 sum_par = _mm256_hadd_ps(hadd[0], hadd[1]);
+ const __m128 low_128 = _mm256_castps256_ps128(sum_par);
+ const __m128 high_128 = _mm256_extractf128_ps(sum_par, 1);
+ const __m128 result = _mm_add_ps(low_128, high_128);
+
+ in_result = _mm_add_ps(in_result, result);
+ }
+
+ in_result = _mm_add_ps(in_result, bias_reg);
+ if (is_clip_required) in_result = _mm_max_ps(in_result, _mm_setzero_ps());
+ _mm_storeu_ps(&output_nodes[out], in_result);
+ }
+}
+
+static INLINE void nn_propagate_8to8(
+ const float *const inputs, const float *const weights,
+ const float *const bias, int num_inputs_to_process, int tot_num_inputs,
+ int num_outputs, float *const output_nodes, int is_clip_required) {
+ __m256 hadd[4];
+ for (int out = 0; out < num_outputs; out += 8) {
+ __m256 bias_reg = _mm256_loadu_ps(&bias[out]);
+ __m256 in_result = _mm256_setzero_ps();
+ for (int in = 0; in < num_inputs_to_process; in += 8) {
+ const __m256 inputs256 = _mm256_loadu_ps(&inputs[in]);
+ const int weight_idx = in + (out * tot_num_inputs);
+ // Process two output rows at a time.
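+ // Each CALC_OUTPUT_FOR_2ROWS step fills hadd[i] with interleaved pair
+ // sums of the products for output rows 2*i and 2*i+1; the hadd/permute
+ // sequence below then folds the four vectors into one vector holding all
+ // eight dot products.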
+ for (int i = 0; i < 4; i++) {
+ CALC_OUTPUT_FOR_2ROWS
+ }
+ const __m256 hh0 = _mm256_hadd_ps(hadd[0], hadd[1]);
+ const __m256 hh1 = _mm256_hadd_ps(hadd[2], hadd[3]);
+
+ __m256 ht_0 = _mm256_permute2f128_ps(hh0, hh1, 0x20);
+ __m256 ht_1 = _mm256_permute2f128_ps(hh0, hh1, 0x31);
+
+ __m256 result = _mm256_add_ps(ht_0, ht_1);
+ in_result = _mm256_add_ps(in_result, result);
+ }
+ in_result = _mm256_add_ps(in_result, bias_reg);
+ if (is_clip_required)
+ in_result = _mm256_max_ps(in_result, _mm256_setzero_ps());
+ _mm256_storeu_ps(&output_nodes[out], in_result);
+ }
+}
+
+static INLINE void nn_propagate_input_multiple_of_8(
+ const float *const inputs, const float *const weights,
+ const float *const bias, int num_inputs_to_process, int tot_num_inputs,
+ bool is_output_layer, int num_outputs, float *const output_nodes) {
+ // Clip (ReLU) the output only for hidden layers, and only once the full
+ // set of inputs for the layer has been accumulated.
+ const int is_clip_required =
+ !is_output_layer && num_inputs_to_process == tot_num_inputs;
+ if (num_outputs % 8 == 0) {
+ nn_propagate_8to8(inputs, weights, bias, num_inputs_to_process,
+ tot_num_inputs, num_outputs, output_nodes,
+ is_clip_required);
+ } else if (num_outputs % 4 == 0) {
+ nn_propagate_8to4(inputs, weights, bias, num_inputs_to_process,
+ tot_num_inputs, num_outputs, output_nodes,
+ is_clip_required);
+ } else {
+ nn_propagate_8to1(inputs, weights, bias, num_inputs_to_process,
+ tot_num_inputs, num_outputs, output_nodes,
+ is_clip_required);
+ }
+}
+
+void av1_nn_predict_avx2(const float *input_nodes,
+ const NN_CONFIG *const nn_config, int reduce_prec,
+ float *const output) {
+ float buf[2][NN_MAX_NODES_PER_LAYER];
+ int buf_index = 0;
+ int num_inputs = nn_config->num_inputs;
+ assert(num_inputs > 0 && num_inputs <= NN_MAX_NODES_PER_LAYER);
+
+ for (int layer = 0; layer <= nn_config->num_hidden_layers; layer++) {
+ const float *layer_weights = nn_config->weights[layer];
+ const float *layer_bias = nn_config->bias[layer];
+ bool is_output_layer = layer == nn_config->num_hidden_layers;
+ float *const output_nodes = is_output_layer ? output : &buf[buf_index][0];
+ const int num_outputs = is_output_layer
+ ? nn_config->num_outputs
+ : nn_config->num_hidden_nodes[layer];
+ assert(num_outputs > 0 && num_outputs <= NN_MAX_NODES_PER_LAYER);
+
+ // Process inputs in multiples of 8 using AVX2 intrinsics.
+ if (num_inputs % 8 == 0) {
+ nn_propagate_input_multiple_of_8(input_nodes, layer_weights, layer_bias,
+ num_inputs, num_inputs, is_output_layer,
+ num_outputs, output_nodes);
+ } else {
+ // When the number of inputs is not a multiple of 8, use a hybrid of the
+ // AVX2 and SSE3 paths as needed.
+ const int in_mul_8 = num_inputs / 8;
+ const int num_inputs_to_process = in_mul_8 * 8;
+ int bias_is_considered = 0;
+ if (in_mul_8) {
+ nn_propagate_input_multiple_of_8(
+ input_nodes, layer_weights, layer_bias, num_inputs_to_process,
+ num_inputs, is_output_layer, num_outputs, output_nodes);
+ bias_is_considered = 1;
+ }
+
+ const float *out_temp = bias_is_considered ?
output_nodes : layer_bias;
+ const int input_remaining = num_inputs % 8;
+ if (input_remaining % 4 == 0 && num_outputs % 8 == 0) {
+ for (int out = 0; out < num_outputs; out += 8) {
+ __m128 out_h = _mm_loadu_ps(&out_temp[out + 4]);
+ __m128 out_l = _mm_loadu_ps(&out_temp[out]);
+ for (int in = in_mul_8 * 8; in < num_inputs; in += 4) {
+ av1_nn_propagate_4to8_sse3(&input_nodes[in],
+ &layer_weights[out * num_inputs + in],
+ &out_h, &out_l, num_inputs);
+ }
+ if (!is_output_layer) {
+ const __m128 zero = _mm_setzero_ps();
+ out_h = _mm_max_ps(out_h, zero);
+ out_l = _mm_max_ps(out_l, zero);
+ }
+ _mm_storeu_ps(&output_nodes[out + 4], out_h);
+ _mm_storeu_ps(&output_nodes[out], out_l);
+ }
+ } else if (input_remaining % 4 == 0 && num_outputs % 4 == 0) {
+ for (int out = 0; out < num_outputs; out += 4) {
+ __m128 outputs = _mm_loadu_ps(&out_temp[out]);
+ for (int in = in_mul_8 * 8; in < num_inputs; in += 4) {
+ av1_nn_propagate_4to4_sse3(&input_nodes[in],
+ &layer_weights[out * num_inputs + in],
+ &outputs, num_inputs);
+ }
+ if (!is_output_layer) outputs = _mm_max_ps(outputs, _mm_setzero_ps());
+ _mm_storeu_ps(&output_nodes[out], outputs);
+ }
+ } else if (input_remaining % 4 == 0) {
+ for (int out = 0; out < num_outputs; out++) {
+ __m128 outputs = _mm_load1_ps(&out_temp[out]);
+ for (int in = in_mul_8 * 8; in < num_inputs; in += 4) {
+ av1_nn_propagate_4to1_sse3(&input_nodes[in],
+ &layer_weights[out * num_inputs + in],
+ &outputs);
+ }
+ if (!is_output_layer) outputs = _mm_max_ps(outputs, _mm_setzero_ps());
+ output_nodes[out] = _mm_cvtss_f32(outputs);
+ }
+ } else {
+ // Use SSE instructions for scalar operations to avoid the latency
+ // of swapping between SIMD and FPU modes.
+ for (int out = 0; out < num_outputs; out++) {
+ __m128 outputs = _mm_load1_ps(&out_temp[out]);
+ for (int in_node = in_mul_8 * 8; in_node < num_inputs; in_node++) {
+ __m128 input = _mm_load1_ps(&input_nodes[in_node]);
+ __m128 weight =
+ _mm_load1_ps(&layer_weights[num_inputs * out + in_node]);
+ outputs = _mm_add_ps(outputs, _mm_mul_ps(input, weight));
+ }
+ if (!is_output_layer) outputs = _mm_max_ps(outputs, _mm_setzero_ps());
+ output_nodes[out] = _mm_cvtss_f32(outputs);
+ }
+ }
+ }
+ // Before processing the next layer, treat the output of current layer as
+ // input to next layer.
+ input_nodes = output_nodes;
+ num_inputs = num_outputs;
+ buf_index = 1 - buf_index;
+ }
+ if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs);
+}
diff --git a/third_party/aom/av1/encoder/x86/ml_sse3.c b/third_party/aom/av1/encoder/x86/ml_sse3.c
new file mode 100644
index 0000000000..4748a68d38
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/ml_sse3.c
@@ -0,0 +1,336 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <pmmintrin.h>
+#include <stdbool.h>
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/ml.h"
+#include "av1/encoder/x86/ml_sse3.h"
+
+// In order to avoid the high latency of swapping between FPU and SIMD
+// operations, we keep the result in a 128-bit register even though we only
+// care about a single value.
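+// For example, nn_propagate_8to1() below reduces an 8-wide dot product with
+// one vertical add of the upper and lower halves followed by two horizontal
+// adds; the scalar is only extracted by the caller via _mm_cvtss_f32().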
+static void nn_propagate_8to1(const float *const inputs, + const float *const weights, + __m128 *const output) { + const __m128 inputs_h = _mm_loadu_ps(&inputs[4]); + const __m128 inputs_l = _mm_loadu_ps(inputs); + + const __m128 weights_h = _mm_loadu_ps(&weights[4]); + const __m128 weights_l = _mm_loadu_ps(weights); + + const __m128 mul_h = _mm_mul_ps(inputs_h, weights_h); + const __m128 mul_l = _mm_mul_ps(inputs_l, weights_l); + // [7 6 5 4] [3 2 1 0] (weight and input indices) + + const __m128 vadd = _mm_add_ps(mul_l, mul_h); + // [7+3 6+2 5+1 4+0] + const __m128 hadd1 = _mm_hadd_ps(vadd, vadd); + // [7+6+3+2 5+4+1+0 7+6+3+2 5+4+1+0] + const __m128 hadd2 = _mm_hadd_ps(hadd1, hadd1); + // [7+6+5+4+3+2+1+0 7+6+5+4+3+2+1+0 7+6+5+4+3+2+1+0 7+6+5+4+3+2+1+0] + *output = _mm_add_ps(*output, hadd2); +} + +void av1_nn_propagate_4to1_sse3(const float *const inputs, + const float *const weights, + __m128 *const output) { + const __m128 inputs128 = _mm_loadu_ps(inputs); + + const __m128 weights128 = _mm_loadu_ps(weights); + + const __m128 mul = _mm_mul_ps(inputs128, weights128); + // [3 2 1 0] (weight and input indices) + + const __m128 hadd1 = _mm_hadd_ps(mul, mul); + // [3+2 1+0 3+2 1+0] + const __m128 hadd2 = _mm_hadd_ps(hadd1, hadd1); + // [3+2+1+0 3+2+1+0 3+2+1+0 3+2+1+0] + *output = _mm_add_ps(*output, hadd2); +} + +void av1_nn_propagate_4to4_sse3(const float *const inputs, + const float *const weights, + __m128 *const outputs, const int num_inputs) { + const __m128 inputs128 = _mm_loadu_ps(inputs); + + __m128 hadd[2]; + for (int i = 0; i < 2; i++) { // For each pair of outputs + const __m128 weight0 = _mm_loadu_ps(&weights[2 * i * num_inputs]); + const __m128 mul0 = _mm_mul_ps(weight0, inputs128); + const __m128 weight1 = _mm_loadu_ps(&weights[(2 * i + 1) * num_inputs]); + const __m128 mul1 = _mm_mul_ps(weight1, inputs128); + hadd[i] = _mm_hadd_ps(mul0, mul1); + } + // hadd[0] = [7+6 5+4 3+2 1+0] (weight indices) + // hadd[1] = [15+14 13+12 11+10 9+8] + + const __m128 hh = _mm_hadd_ps(hadd[0], hadd[1]); + // [15+14+13+12 11+10+9+8 7+6+5+4 3+2+1+0] + + *outputs = _mm_add_ps(*outputs, hh); +} + +void av1_nn_propagate_4to8_sse3(const float *const inputs, + const float *const weights, __m128 *const out_h, + __m128 *const out_l, const int num_inputs) { + const __m128 inputs128 = _mm_loadu_ps(inputs); + + __m128 hadd[4]; + for (int i = 0; i < 4; i++) { // For each pair of outputs + const __m128 weight0 = _mm_loadu_ps(&weights[2 * i * num_inputs]); + const __m128 weight1 = _mm_loadu_ps(&weights[(2 * i + 1) * num_inputs]); + const __m128 mul0 = _mm_mul_ps(inputs128, weight0); + const __m128 mul1 = _mm_mul_ps(inputs128, weight1); + hadd[i] = _mm_hadd_ps(mul0, mul1); + } + // hadd[0] = [7+6 5+4 3+2 1+0] (weight indices) + // hadd[1] = [15+14 13+12 11+10 9+8] + // hadd[2] = [23+22 21+20 19+18 17+16] + // hadd[3] = [31+30 29+28 27+26 25+24] + + const __m128 hh0 = _mm_hadd_ps(hadd[0], hadd[1]); + // [15+14+13+12 11+10+9+8 7+6+5+4 3+2+1+0] + const __m128 hh1 = _mm_hadd_ps(hadd[2], hadd[3]); + // [31+30+29+28 27+26+25+24 23+22+21+20 19+18+17+16] + + *out_h = _mm_add_ps(*out_h, hh1); + *out_l = _mm_add_ps(*out_l, hh0); +} + +static void nn_propagate_8to4(const float *const inputs, + const float *const weights, __m128 *const outputs, + const int num_inputs) { + const __m128 inputs_h = _mm_loadu_ps(inputs + 4); + const __m128 inputs_l = _mm_loadu_ps(inputs); + // [7 6 5 4] [3 2 1 0] (input indices) + + __m128 add[4]; + for (int i = 0; i < 4; i++) { // For each output: + const __m128 weight_h = 
_mm_loadu_ps(&weights[i * num_inputs + 4]); + const __m128 weight_l = _mm_loadu_ps(&weights[i * num_inputs]); + const __m128 mul_h = _mm_mul_ps(inputs_h, weight_h); + const __m128 mul_l = _mm_mul_ps(inputs_l, weight_l); + add[i] = _mm_add_ps(mul_l, mul_h); + } + // add[0] = [7+3 6+2 5+1 4+0] + // add[1] = [15+11 14+10 13+9 12+8] + // add[2] = [23+19 22+18 21+17 20+16] + // add[3] = [31+27 30+26 29+25 28+24] + + const __m128 hadd_h = _mm_hadd_ps(add[2], add[3]); + // [31+30+27+26 29+28+25+24 23+22+19+18 21+20+17+16] + const __m128 hadd_l = _mm_hadd_ps(add[0], add[1]); + // [15+14+11+10 13+12+9+8 7+6+3+2 5+4+1+0] + + const __m128 haddhadd = _mm_hadd_ps(hadd_l, hadd_h); + // [31+30+29+28+27+26+25+24 23+22+21+20+19+18+17+16 + // 15+14+13+12+11+10+9+8 7+6+5+4+3+2+1+0] + + *outputs = _mm_add_ps(*outputs, haddhadd); +} + +static void nn_activate8(__m128 *out_h, __m128 *out_l) { + const __m128 zero = _mm_setzero_ps(); + *out_h = _mm_max_ps(*out_h, zero); + *out_l = _mm_max_ps(*out_l, zero); +} + +static void nn_activate4(__m128 *x) { *x = _mm_max_ps(*x, _mm_setzero_ps()); } + +// Calculate prediction based on the given input features and neural net config. +// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden +// layer. +void av1_nn_predict_sse3(const float *input_nodes, + const NN_CONFIG *const nn_config, int reduce_prec, + float *const output) { + float buf[2][NN_MAX_NODES_PER_LAYER]; + int buf_index = 0; + int num_inputs = nn_config->num_inputs; + + // Process the hidden layers; the final loop iteration handles the output + // layer. + for (int layer = 0; layer <= nn_config->num_hidden_layers; layer++) { + const float *layer_weights = nn_config->weights[layer]; + const float *layer_bias = nn_config->bias[layer]; + bool output_layer = (layer == nn_config->num_hidden_layers); + float *const output_nodes = output_layer ? output : &buf[buf_index][0]; + const int num_outputs = output_layer ?
nn_config->num_outputs + : nn_config->num_hidden_nodes[layer]; + + if (num_inputs % 4 == 0 && num_outputs % 8 == 0) { + for (int out = 0; out < num_outputs; out += 8) { + __m128 out_h = _mm_loadu_ps(&layer_bias[out + 4]); + __m128 out_l = _mm_loadu_ps(&layer_bias[out]); + for (int in = 0; in < num_inputs; in += 4) { + av1_nn_propagate_4to8_sse3(&input_nodes[in], + &layer_weights[out * num_inputs + in], + &out_h, &out_l, num_inputs); + } + if (!output_layer) nn_activate8(&out_h, &out_l); + _mm_storeu_ps(&output_nodes[out + 4], out_h); + _mm_storeu_ps(&output_nodes[out], out_l); + } + } else if (num_inputs % 8 == 0 && num_outputs % 4 == 0) { + for (int out = 0; out < num_outputs; out += 4) { + __m128 outputs = _mm_loadu_ps(&layer_bias[out]); + for (int in = 0; in < num_inputs; in += 8) { + nn_propagate_8to4(&input_nodes[in], + &layer_weights[out * num_inputs + in], &outputs, + num_inputs); + } + if (!output_layer) nn_activate4(&outputs); + _mm_storeu_ps(&output_nodes[out], outputs); + } + } else if (num_inputs % 4 == 0 && num_outputs % 4 == 0) { + for (int out = 0; out < num_outputs; out += 4) { + __m128 outputs = _mm_loadu_ps(&layer_bias[out]); + for (int in = 0; in < num_inputs; in += 4) { + av1_nn_propagate_4to4_sse3(&input_nodes[in], + &layer_weights[out * num_inputs + in], + &outputs, num_inputs); + } + if (!output_layer) nn_activate4(&outputs); + _mm_storeu_ps(&output_nodes[out], outputs); + } + } else if (num_inputs % 8 == 0) { + for (int out = 0; out < num_outputs; out++) { + __m128 total = _mm_load1_ps(&layer_bias[out]); + for (int in = 0; in < num_inputs; in += 8) { + nn_propagate_8to1(&input_nodes[in], + &layer_weights[out * num_inputs + in], &total); + } + if (!output_layer) nn_activate4(&total); + output_nodes[out] = _mm_cvtss_f32(total); + } + } else if (num_inputs % 4 == 0) { + for (int out = 0; out < num_outputs; out++) { + __m128 total = _mm_load1_ps(&layer_bias[out]); + for (int in = 0; in < num_inputs; in += 4) { + av1_nn_propagate_4to1_sse3( + &input_nodes[in], &layer_weights[out * num_inputs + in], &total); + } + if (!output_layer) nn_activate4(&total); + output_nodes[out] = _mm_cvtss_f32(total); + } + } else { + // Use SSE instructions for scalar operations to avoid the latency of + // swapping between SIMD and FPU modes. + for (int out = 0; out < num_outputs; out++) { + __m128 total = _mm_load1_ps(&layer_bias[out]); + for (int in_node = 0; in_node < num_inputs; in_node++) { + __m128 input = _mm_load1_ps(&input_nodes[in_node]); + __m128 weight = + _mm_load1_ps(&layer_weights[num_inputs * out + in_node]); + total = _mm_add_ps(total, _mm_mul_ps(input, weight)); + } + if (!output_layer) nn_activate4(&total); + output_nodes[out] = _mm_cvtss_f32(total); + } + } + input_nodes = output_nodes; + num_inputs = num_outputs; + buf_index = 1 - buf_index; + } + if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs); +} + +// Based on N. N. Schraudolph. A Fast, Compact Approximation of the Exponential +// Function. Neural Computation, 11(4):853–862, 1999. +static AOM_INLINE __m128 approx_exp(__m128 y) { +#define A ((1 << 23) / 0.69314718056f) // (1 << 23) / ln(2) +#define B \ + 127 // Offset for the exponent according to IEEE floating point standard. 
+#define C 60801 // Magic number controls the accuracy of approximation + const __m128 multiplier = _mm_set1_ps(A); + const __m128i offset = _mm_set1_epi32(B * (1 << 23) - C); + + y = _mm_mul_ps(y, multiplier); + y = _mm_castsi128_ps(_mm_add_epi32(_mm_cvtps_epi32(y), offset)); + return y; +#undef A +#undef B +#undef C +} + +static AOM_INLINE __m128 reduce_max(__m128 reg) { + __m128 tmp_reg; + + tmp_reg = _mm_shuffle_ps(reg, reg, 0x4e); // 01 00 11 10 + reg = _mm_max_ps(reg, tmp_reg); + + tmp_reg = _mm_shuffle_ps(reg, reg, 0xb1); // 10 11 00 01 + reg = _mm_max_ps(reg, tmp_reg); + + return reg; +} + +static AOM_INLINE __m128 reduce_sum(__m128 reg) { + __m128 tmp_reg; + + tmp_reg = _mm_shuffle_ps(reg, reg, 0x4e); // 01 00 11 10 + reg = _mm_add_ps(reg, tmp_reg); + + tmp_reg = _mm_shuffle_ps(reg, reg, 0xb1); // 10 11 00 01 + reg = _mm_add_ps(reg, tmp_reg); + + return reg; +} + +void av1_nn_fast_softmax_16_sse3(const float *input, float *output) { + // Clips at -10 to avoid underflowing + const __m128 clipper = _mm_set1_ps(-10.0f); + + // Load in 16 values + __m128 in_0 = _mm_loadu_ps(&input[0]); + __m128 in_1 = _mm_loadu_ps(&input[4]); + __m128 in_2 = _mm_loadu_ps(&input[8]); + __m128 in_3 = _mm_loadu_ps(&input[12]); + + // Get the max + __m128 max_0 = _mm_max_ps(in_0, in_1); + __m128 max_1 = _mm_max_ps(in_2, in_3); + + max_0 = _mm_max_ps(max_0, max_1); + max_0 = reduce_max(max_0); + + // Subtract the max off and clip + in_0 = _mm_sub_ps(in_0, max_0); + in_1 = _mm_sub_ps(in_1, max_0); + in_2 = _mm_sub_ps(in_2, max_0); + in_3 = _mm_sub_ps(in_3, max_0); + + in_0 = _mm_max_ps(in_0, clipper); + in_1 = _mm_max_ps(in_1, clipper); + in_2 = _mm_max_ps(in_2, clipper); + in_3 = _mm_max_ps(in_3, clipper); + + // Exponentiate and compute the denominator + __m128 sum = in_0 = approx_exp(in_0); + in_1 = approx_exp(in_1); + sum = _mm_add_ps(sum, in_1); + in_2 = approx_exp(in_2); + sum = _mm_add_ps(sum, in_2); + in_3 = approx_exp(in_3); + sum = _mm_add_ps(sum, in_3); + sum = reduce_sum(sum); + + // Divide to get the probability + in_0 = _mm_div_ps(in_0, sum); + in_1 = _mm_div_ps(in_1, sum); + in_2 = _mm_div_ps(in_2, sum); + in_3 = _mm_div_ps(in_3, sum); + + _mm_storeu_ps(&output[0], in_0); + _mm_storeu_ps(&output[4], in_1); + _mm_storeu_ps(&output[8], in_2); + _mm_storeu_ps(&output[12], in_3); +} diff --git a/third_party/aom/av1/encoder/x86/ml_sse3.h b/third_party/aom/av1/encoder/x86/ml_sse3.h new file mode 100644 index 0000000000..f41a2474af --- /dev/null +++ b/third_party/aom/av1/encoder/x86/ml_sse3.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_X86_ML_SSE3_H_ +#define AOM_AV1_ENCODER_X86_ML_SSE3_H_ + +#include <xmmintrin.h> + +void av1_nn_propagate_4to1_sse3(const float *const inputs, + const float *const weights, + __m128 *const output); + +void av1_nn_propagate_4to4_sse3(const float *const inputs, + const float *const weights, + __m128 *const outputs, const int num_inputs); + +void av1_nn_propagate_4to8_sse3(const float *const inputs, + const float *const weights, __m128 *const out_h, + __m128 *const out_l, const int num_inputs); + +#endif // AOM_AV1_ENCODER_X86_ML_SSE3_H_ diff --git a/third_party/aom/av1/encoder/x86/pickrst_avx2.c b/third_party/aom/av1/encoder/x86/pickrst_avx2.c new file mode 100644 index 0000000000..6658ed39a8 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/pickrst_avx2.c @@ -0,0 +1,2348 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> // AVX2 +#include "aom_dsp/x86/mem_sse2.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" +#include "aom_dsp/x86/transpose_sse2.h" + +#include "config/av1_rtcd.h" +#include "av1/common/restoration.h" +#include "av1/encoder/pickrst.h" + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void acc_stat_highbd_avx2(int64_t *dst, const uint16_t *dgd, + const __m256i *shuffle, + const __m256i *dgd_ijkl) { + // Load two 128-bit chunks from dgd + const __m256i s0 = _mm256_inserti128_si256( + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)dgd)), + _mm_loadu_si128((__m128i *)(dgd + 4)), 1); + // s0 = [11 10 9 8 7 6 5 4] [7 6 5 4 3 2 1 0] as u16 (values are dgd indices) + // The weird order is so the shuffle stays within 128-bit lanes + + // Shuffle 16x u16 values within lanes according to the mask: + // [0 1 1 2 2 3 3 4] [0 1 1 2 2 3 3 4] + // (Actually we shuffle u8 values as there's no 16-bit shuffle) + const __m256i s1 = _mm256_shuffle_epi8(s0, *shuffle); + // s1 = [8 7 7 6 6 5 5 4] [4 3 3 2 2 1 1 0] as u16 (values are dgd indices) + + // Multiply 16x 16-bit integers in dgd_ijkl and s1, resulting in 16x 32-bit + // integers then horizontally add pairs of these integers resulting in 8x + // 32-bit integers + const __m256i d0 = _mm256_madd_epi16(*dgd_ijkl, s1); + // d0 = [a b c d] [e f g h] as u32 + + // Take the lower-half of d0, extend to u64, add it on to dst (H) + const __m256i d0l = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(d0, 0)); + // d0l = [a b] [c d] as u64 + const __m256i dst0 = yy_load_256(dst); + yy_store_256(dst, _mm256_add_epi64(d0l, dst0)); + + // Take the upper-half of d0, extend to u64, add it on to dst (H) + const __m256i d0h = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(d0, 1)); + // d0h = [e f] [g h] as u64 + const __m256i dst1 = yy_load_256(dst + 4); + yy_store_256(dst + 4, _mm256_add_epi64(d0h, dst1)); +} + +static INLINE void acc_stat_highbd_win7_one_line_avx2( + const uint16_t *dgd, const uint16_t *src, int h_start, int h_end, + int dgd_stride, const __m256i *shuffle, int32_t *sumX, + int32_t sumY[WIENER_WIN][WIENER_WIN], int64_t M_int[WIENER_WIN][WIENER_WIN], + int64_t H_int[WIENER_WIN2][WIENER_WIN * 8]) { + int
j, k, l; + const int wiener_win = WIENER_WIN; + // Main loop handles two pixels at a time + // We can assume that h_start is even, since it will always be aligned to + // a tile edge + some number of restoration units, and both of those will + // be 64-pixel aligned. + // However, at the edge of the image, h_end may be odd, so we need to handle + // that case correctly. + assert(h_start % 2 == 0); + const int h_end_even = h_end & ~1; + const int has_odd_pixel = h_end & 1; + for (j = h_start; j < h_end_even; j += 2) { + const uint16_t X1 = src[j]; + const uint16_t X2 = src[j + 1]; + *sumX += X1 + X2; + const uint16_t *dgd_ij = dgd + j; + for (k = 0; k < wiener_win; k++) { + const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int64_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint16_t D1 = dgd_ijk[l]; + const uint16_t D2 = dgd_ijk[l + 1]; + sumY[k][l] += D1 + D2; + M_int[k][l] += D1 * X1 + D2 * X2; + + // Load two u16 values from dgd_ijkl combined as a u32, + // then broadcast to 8x u32 slots of a 256 + const __m256i dgd_ijkl = _mm256_set1_epi32(loadu_int32(dgd_ijk + l)); + // dgd_ijkl = [y x y x y x y x] [y x y x y x y x] where each is a u16 + + acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, + &dgd_ijkl); + } + } + } + // If the width is odd, add in the final pixel + if (has_odd_pixel) { + const uint16_t X1 = src[j]; + *sumX += X1; + const uint16_t *dgd_ij = dgd + j; + for (k = 0; k < wiener_win; k++) { + const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int64_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint16_t D1 = dgd_ijk[l]; + sumY[k][l] += D1; + M_int[k][l] += D1 * X1; + + // The `acc_stat_highbd_avx2` function wants its input to have + // interleaved copies of two pixels, but we only have one. However, the + // pixels are (effectively) used as inputs to a multiply-accumulate. So + // if we set the extra pixel slot to 0, then it is effectively ignored. 
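+        // Concretely: D1 is a uint16_t, so (int)D1 has zeros in its upper 16
+        // bits, and _mm256_set1_epi32 broadcasts [D1, 0] pairs of u16 lanes.
+        // The zero lane multiplies to zero inside the _mm256_madd_epi16 in
+        // acc_stat_highbd_avx2 and therefore adds nothing to the sums.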
+ const __m256i dgd_ijkl = _mm256_set1_epi32((int)D1); + + acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, + &dgd_ijkl); + } + } + } +} + +static INLINE void compute_stats_highbd_win7_opt_avx2( + const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, + int64_t *H, aom_bit_depth_t bit_depth) { + int i, j, k, l, m, n; + const int wiener_win = WIENER_WIN; + const int pixel_count = (h_end - h_start) * (v_end - v_start); + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8); + const uint16_t avg = + find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + int64_t M_int[WIENER_WIN][WIENER_WIN] = { { 0 } }; + DECLARE_ALIGNED(32, int64_t, H_int[WIENER_WIN2][WIENER_WIN * 8]) = { { 0 } }; + int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int32_t sumX = 0; + const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; + + const __m256i shuffle = yy_loadu_256(g_shuffle_stats_highbd_data); + for (j = v_start; j < v_end; j += 64) { + const int vert_end = AOMMIN(64, v_end - j) + j; + for (i = j; i < vert_end; i++) { + acc_stat_highbd_win7_one_line_avx2( + dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, + dgd_stride, &shuffle, &sumX, sumY, M_int, H_int); + } + } + + uint8_t bit_depth_divider = 1; + if (bit_depth == AOM_BITS_12) + bit_depth_divider = 16; + else if (bit_depth == AOM_BITS_10) + bit_depth_divider = 4; + + const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count; + for (k = 0; k < wiener_win; k++) { + for (l = 0; l < wiener_win; l++) { + const int32_t idx0 = l * wiener_win + k; + M[idx0] = (M_int[k][l] + + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]))) / + bit_depth_divider; + int64_t *H_ = H + idx0 * wiener_win2; + int64_t *H_int_ = &H_int[idx0][0]; + for (m = 0; m < wiener_win; m++) { + for (n = 0; n < wiener_win; n++) { + H_[m * wiener_win + n] = + (H_int_[n * 8 + m] + + (avg_square_sum - (int64_t)avg * (sumY[k][l] + sumY[n][m]))) / + bit_depth_divider; + } + } + } + } +} + +static INLINE void acc_stat_highbd_win5_one_line_avx2( + const uint16_t *dgd, const uint16_t *src, int h_start, int h_end, + int dgd_stride, const __m256i *shuffle, int32_t *sumX, + int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], + int64_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], + int64_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) { + int j, k, l; + const int wiener_win = WIENER_WIN_CHROMA; + // Main loop handles two pixels at a time + // We can assume that h_start is even, since it will always be aligned to + // a tile edge + some number of restoration units, and both of those will + // be 64-pixel aligned. + // However, at the edge of the image, h_end may be odd, so we need to handle + // that case correctly. 
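+  // For example, with h_start == 0 and h_end == 57, h_end_even below is 56:
+  // the paired loop covers columns 0..55 and the has_odd_pixel path handles
+  // the final column 56.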
+ assert(h_start % 2 == 0); + const int h_end_even = h_end & ~1; + const int has_odd_pixel = h_end & 1; + for (j = h_start; j < h_end_even; j += 2) { + const uint16_t X1 = src[j]; + const uint16_t X2 = src[j + 1]; + *sumX += X1 + X2; + const uint16_t *dgd_ij = dgd + j; + for (k = 0; k < wiener_win; k++) { + const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int64_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint16_t D1 = dgd_ijk[l]; + const uint16_t D2 = dgd_ijk[l + 1]; + sumY[k][l] += D1 + D2; + M_int[k][l] += D1 * X1 + D2 * X2; + + // Load two u16 values from dgd_ijkl combined as a u32, + // then broadcast to 8x u32 slots of a 256 + const __m256i dgd_ijkl = _mm256_set1_epi32(loadu_int32(dgd_ijk + l)); + // dgd_ijkl = [x y x y x y x y] [x y x y x y x y] where each is a u16 + + acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, + &dgd_ijkl); + } + } + } + // If the width is odd, add in the final pixel + if (has_odd_pixel) { + const uint16_t X1 = src[j]; + *sumX += X1; + const uint16_t *dgd_ij = dgd + j; + for (k = 0; k < wiener_win; k++) { + const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int64_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint16_t D1 = dgd_ijk[l]; + sumY[k][l] += D1; + M_int[k][l] += D1 * X1; + + // The `acc_stat_highbd_avx2` function wants its input to have + // interleaved copies of two pixels, but we only have one. However, the + // pixels are (effectively) used as inputs to a multiply-accumulate. So + // if we set the extra pixel slot to 0, then it is effectively ignored. 
+ const __m256i dgd_ijkl = _mm256_set1_epi32((int)D1); + + acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, + &dgd_ijkl); + } + } + } +} + +static INLINE void compute_stats_highbd_win5_opt_avx2( + const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, + int64_t *H, aom_bit_depth_t bit_depth) { + int i, j, k, l, m, n; + const int wiener_win = WIENER_WIN_CHROMA; + const int pixel_count = (h_end - h_start) * (v_end - v_start); + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8); + const uint16_t avg = + find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + DECLARE_ALIGNED( + 32, int64_t, + H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) = { { 0 } }; + int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int32_t sumX = 0; + const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; + + const __m256i shuffle = yy_loadu_256(g_shuffle_stats_highbd_data); + for (j = v_start; j < v_end; j += 64) { + const int vert_end = AOMMIN(64, v_end - j) + j; + for (i = j; i < vert_end; i++) { + acc_stat_highbd_win5_one_line_avx2( + dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, + dgd_stride, &shuffle, &sumX, sumY, M_int64, H_int64); + } + } + + uint8_t bit_depth_divider = 1; + if (bit_depth == AOM_BITS_12) + bit_depth_divider = 16; + else if (bit_depth == AOM_BITS_10) + bit_depth_divider = 4; + + const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count; + for (k = 0; k < wiener_win; k++) { + for (l = 0; l < wiener_win; l++) { + const int32_t idx0 = l * wiener_win + k; + M[idx0] = (M_int64[k][l] + + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]))) / + bit_depth_divider; + int64_t *H_ = H + idx0 * wiener_win2; + int64_t *H_int_ = &H_int64[idx0][0]; + for (m = 0; m < wiener_win; m++) { + for (n = 0; n < wiener_win; n++) { + H_[m * wiener_win + n] = + (H_int_[n * 8 + m] + + (avg_square_sum - (int64_t)avg * (sumY[k][l] + sumY[n][m]))) / + bit_depth_divider; + } + } + } + } +} + +void av1_compute_stats_highbd_avx2(int wiener_win, const uint8_t *dgd8, + const uint8_t *src8, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, + int src_stride, int64_t *M, int64_t *H, + aom_bit_depth_t bit_depth) { + if (wiener_win == WIENER_WIN) { + compute_stats_highbd_win7_opt_avx2(dgd8, src8, h_start, h_end, v_start, + v_end, dgd_stride, src_stride, M, H, + bit_depth); + } else if (wiener_win == WIENER_WIN_CHROMA) { + compute_stats_highbd_win5_opt_avx2(dgd8, src8, h_start, h_end, v_start, + v_end, dgd_stride, src_stride, M, H, + bit_depth); + } else { + av1_compute_stats_highbd_c(wiener_win, dgd8, src8, h_start, h_end, v_start, + v_end, dgd_stride, src_stride, M, H, bit_depth); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static INLINE void madd_and_accum_avx2(__m256i src, __m256i dgd, __m256i *sum) { + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(src, dgd)); +} + +static INLINE __m256i 
convert_and_add_avx2(__m256i src) { + const __m256i s0 = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(src)); + const __m256i s1 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(src, 1)); + return _mm256_add_epi64(s0, s1); +} + +static INLINE __m256i hadd_four_32_to_64_avx2(__m256i src0, __m256i src1, + __m256i *src2, __m256i *src3) { + // 00 01 10 11 02 03 12 13 + const __m256i s_0 = _mm256_hadd_epi32(src0, src1); + // 20 21 30 31 22 23 32 33 + const __m256i s_1 = _mm256_hadd_epi32(*src2, *src3); + // 00+01 10+11 20+21 30+31 02+03 12+13 22+23 32+33 + const __m256i s_2 = _mm256_hadd_epi32(s_0, s_1); + return convert_and_add_avx2(s_2); +} + +static INLINE __m128i add_64bit_lvl_avx2(__m256i src0, __m256i src1) { + // 00 10 02 12 + const __m256i t0 = _mm256_unpacklo_epi64(src0, src1); + // 01 11 03 13 + const __m256i t1 = _mm256_unpackhi_epi64(src0, src1); + // 00+01 10+11 02+03 12+13 + const __m256i sum = _mm256_add_epi64(t0, t1); + // 00+01 10+11 + const __m128i sum0 = _mm256_castsi256_si128(sum); + // 02+03 12+13 + const __m128i sum1 = _mm256_extracti128_si256(sum, 1); + // 00+01+02+03 10+11+12+13 + return _mm_add_epi64(sum0, sum1); +} + +static INLINE __m128i convert_32_to_64_add_avx2(__m256i src0, __m256i src1) { + // 00 01 02 03 + const __m256i s0 = convert_and_add_avx2(src0); + // 10 11 12 13 + const __m256i s1 = convert_and_add_avx2(src1); + return add_64bit_lvl_avx2(s0, s1); +} + +static INLINE int32_t calc_sum_of_register(__m256i src) { + const __m128i src_l = _mm256_castsi256_si128(src); + const __m128i src_h = _mm256_extracti128_si256(src, 1); + const __m128i sum = _mm_add_epi32(src_l, src_h); + const __m128i dst0 = _mm_add_epi32(sum, _mm_srli_si128(sum, 8)); + const __m128i dst1 = _mm_add_epi32(dst0, _mm_srli_si128(dst0, 4)); + return _mm_cvtsi128_si32(dst1); +} + +static INLINE void transpose_64bit_4x4_avx2(const __m256i *const src, + __m256i *const dst) { + // Unpack 64 bit elements. Goes from: + // src[0]: 00 01 02 03 + // src[1]: 10 11 12 13 + // src[2]: 20 21 22 23 + // src[3]: 30 31 32 33 + // to: + // reg0: 00 10 02 12 + // reg1: 20 30 22 32 + // reg2: 01 11 03 13 + // reg3: 21 31 23 33 + const __m256i reg0 = _mm256_unpacklo_epi64(src[0], src[1]); + const __m256i reg1 = _mm256_unpacklo_epi64(src[2], src[3]); + const __m256i reg2 = _mm256_unpackhi_epi64(src[0], src[1]); + const __m256i reg3 = _mm256_unpackhi_epi64(src[2], src[3]); + + // Unpack 64 bit elements resulting in: + // dst[0]: 00 10 20 30 + // dst[1]: 01 11 21 31 + // dst[2]: 02 12 22 32 + // dst[3]: 03 13 23 33 + dst[0] = _mm256_inserti128_si256(reg0, _mm256_castsi256_si128(reg1), 1); + dst[1] = _mm256_inserti128_si256(reg2, _mm256_castsi256_si128(reg3), 1); + dst[2] = _mm256_inserti128_si256(reg1, _mm256_extracti128_si256(reg0, 1), 0); + dst[3] = _mm256_inserti128_si256(reg3, _mm256_extracti128_si256(reg2, 1), 0); +} + +// When we load 32 values of int8_t type and need less than 32 values for +// processing, the below mask is used to make the extra values zero. +static const int8_t mask_8bit[32] = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 16 bytes + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 16 bytes +}; + +// When we load 16 values of int16_t type and need less than 16 values for +// processing, the below mask is used to make the extra values zero. 
+static const int16_t mask_16bit[32] = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 16 elements + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 16 elements +}; + +static INLINE uint8_t calc_dgd_buf_avg_avx2(const uint8_t *src, int32_t h_start, + int32_t h_end, int32_t v_start, + int32_t v_end, int32_t stride) { + const uint8_t *src_temp = src + v_start * stride + h_start; + const __m256i zero = _mm256_setzero_si256(); + const int32_t width = h_end - h_start; + const int32_t height = v_end - v_start; + const int32_t wd_beyond_mul32 = width & 31; + const int32_t wd_mul32 = width - wd_beyond_mul32; + __m128i mask_low, mask_high; + __m256i ss = zero; + + // When the width is not a multiple of 32, we still load 32 bytes and use + // the mask below to zero out the extra (beyond required) data. + if (wd_beyond_mul32 >= 16) { + mask_low = _mm_set1_epi8(-1); + mask_high = _mm_loadu_si128((__m128i *)(&mask_8bit[32 - wd_beyond_mul32])); + } else { + mask_low = _mm_loadu_si128((__m128i *)(&mask_8bit[16 - wd_beyond_mul32])); + mask_high = _mm_setzero_si128(); + } + const __m256i mask = + _mm256_inserti128_si256(_mm256_castsi128_si256(mask_low), mask_high, 1); + + int32_t proc_ht = 0; + do { + // Process width in multiple of 32. + int32_t proc_wd = 0; + while (proc_wd < wd_mul32) { + const __m256i s_0 = _mm256_loadu_si256((__m256i *)(src_temp + proc_wd)); + const __m256i sad_0 = _mm256_sad_epu8(s_0, zero); + ss = _mm256_add_epi32(ss, sad_0); + proc_wd += 32; + } + + // Process the remaining width. + if (wd_beyond_mul32) { + const __m256i s_0 = _mm256_loadu_si256((__m256i *)(src_temp + proc_wd)); + const __m256i s_m_0 = _mm256_and_si256(s_0, mask); + const __m256i sad_0 = _mm256_sad_epu8(s_m_0, zero); + ss = _mm256_add_epi32(ss, sad_0); + } + src_temp += stride; + proc_ht++; + } while (proc_ht < height); + + const uint32_t sum = calc_sum_of_register(ss); + const uint8_t avg = sum / (width * height); + return avg; +} + +// Fill (src-avg) or (dgd-avg) buffers. Note that when n = (width % 16) is not +// 0, it writes (16 - n) more values than required. +static INLINE void sub_avg_block_avx2(const uint8_t *src, int32_t src_stride, + uint8_t avg, int32_t width, + int32_t height, int16_t *dst, + int32_t dst_stride, + int use_downsampled_wiener_stats) { + const __m256i avg_reg = _mm256_set1_epi16(avg); + + int32_t proc_ht = 0; + do { + int ds_factor = + use_downsampled_wiener_stats ?
WIENER_STATS_DOWNSAMPLE_FACTOR : 1; + if (use_downsampled_wiener_stats && + (height - proc_ht < WIENER_STATS_DOWNSAMPLE_FACTOR)) { + ds_factor = height - proc_ht; + } + + int32_t proc_wd = 0; + while (proc_wd < width) { + const __m128i s = _mm_loadu_si128((__m128i *)(src + proc_wd)); + const __m256i ss = _mm256_cvtepu8_epi16(s); + const __m256i d = _mm256_sub_epi16(ss, avg_reg); + _mm256_storeu_si256((__m256i *)(dst + proc_wd), d); + proc_wd += 16; + } + + src += ds_factor * src_stride; + dst += ds_factor * dst_stride; + proc_ht += ds_factor; + } while (proc_ht < height); +} + +// Fills the lower-triangle elements of the H buffer from the upper-triangle +// elements of the same buffer. +static INLINE void fill_lower_triag_elements_avx2(const int32_t wiener_win2, + int64_t *const H) { + for (int32_t i = 0; i < wiener_win2 - 1; i += 4) { + __m256i in[4], out[4]; + + in[0] = _mm256_loadu_si256((__m256i *)(H + (i + 0) * wiener_win2 + i + 1)); + in[1] = _mm256_loadu_si256((__m256i *)(H + (i + 1) * wiener_win2 + i + 1)); + in[2] = _mm256_loadu_si256((__m256i *)(H + (i + 2) * wiener_win2 + i + 1)); + in[3] = _mm256_loadu_si256((__m256i *)(H + (i + 3) * wiener_win2 + i + 1)); + + transpose_64bit_4x4_avx2(in, out); + + _mm_storel_epi64((__m128i *)(H + (i + 1) * wiener_win2 + i), + _mm256_castsi256_si128(out[0])); + _mm_storeu_si128((__m128i *)(H + (i + 2) * wiener_win2 + i), + _mm256_castsi256_si128(out[1])); + _mm256_storeu_si256((__m256i *)(H + (i + 3) * wiener_win2 + i), out[2]); + _mm256_storeu_si256((__m256i *)(H + (i + 4) * wiener_win2 + i), out[3]); + + for (int32_t j = i + 5; j < wiener_win2; j += 4) { + in[0] = _mm256_loadu_si256((__m256i *)(H + (i + 0) * wiener_win2 + j)); + in[1] = _mm256_loadu_si256((__m256i *)(H + (i + 1) * wiener_win2 + j)); + in[2] = _mm256_loadu_si256((__m256i *)(H + (i + 2) * wiener_win2 + j)); + in[3] = _mm256_loadu_si256((__m256i *)(H + (i + 3) * wiener_win2 + j)); + + transpose_64bit_4x4_avx2(in, out); + + _mm256_storeu_si256((__m256i *)(H + (j + 0) * wiener_win2 + i), out[0]); + _mm256_storeu_si256((__m256i *)(H + (j + 1) * wiener_win2 + i), out[1]); + _mm256_storeu_si256((__m256i *)(H + (j + 2) * wiener_win2 + i), out[2]); + _mm256_storeu_si256((__m256i *)(H + (j + 3) * wiener_win2 + i), out[3]); + } + } +} + +// Fill H buffer based on loop_count. +#define INIT_H_VALUES(d, loop_count) \ + for (int g = 0; g < (loop_count); g++) { \ + const __m256i dgd0 = \ + _mm256_loadu_si256((__m256i *)((d) + (g * d_stride))); \ + madd_and_accum_avx2(dgd_mul_df, dgd0, &sum_h[g]); \ + } + +// Fill M & H buffer. +#define INIT_MH_VALUES(d) \ + for (int g = 0; g < wiener_win; g++) { \ + const __m256i dgds_0 = \ + _mm256_loadu_si256((__m256i *)((d) + (g * d_stride))); \ + madd_and_accum_avx2(src_mul_df, dgds_0, &sum_m[g]); \ + madd_and_accum_avx2(dgd_mul_df, dgds_0, &sum_h[g]); \ + } + +// Update the dgd pointers appropriately. +#define INITIALIZATION(wiener_window_sz) \ + j = i / (wiener_window_sz); \ + const int16_t *d_window = d + j; \ + const int16_t *d_current_row = \ + d + j + ((i % (wiener_window_sz)) * d_stride); \ + int proc_ht = v_start; \ + downsample_factor = \ + use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; \ + __m256i sum_h[wiener_window_sz]; \ + memset(sum_h, 0, sizeof(sum_h)); + +// Update the downsample factor appropriately.
+#define UPDATE_DOWNSAMPLE_FACTOR \ + int proc_wd = 0; \ + if (use_downsampled_wiener_stats && \ + ((v_end - proc_ht) < WIENER_STATS_DOWNSAMPLE_FACTOR)) { \ + downsample_factor = v_end - proc_ht; \ + } \ + const __m256i df_reg = _mm256_set1_epi16(downsample_factor); + +#define CALCULATE_REMAINING_H_WIN5 \ + while (j < wiener_win) { \ + d_window = d; \ + d_current_row = d + (i / wiener_win) + ((i % wiener_win) * d_stride); \ + const __m256i zero = _mm256_setzero_si256(); \ + sum_h[0] = zero; \ + sum_h[1] = zero; \ + sum_h[2] = zero; \ + sum_h[3] = zero; \ + sum_h[4] = zero; \ + \ + proc_ht = v_start; \ + downsample_factor = \ + use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; \ + do { \ + UPDATE_DOWNSAMPLE_FACTOR; \ + \ + /* Process the amount of width multiple of 16.*/ \ + while (proc_wd < wd_mul16) { \ + const __m256i dgd = \ + _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); \ + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); \ + INIT_H_VALUES(d_window + j + proc_wd, 5) \ + \ + proc_wd += 16; \ + }; \ + \ + /* Process the remaining width here. */ \ + if (wd_beyond_mul16) { \ + const __m256i dgd = \ + _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); \ + const __m256i dgd_mask = _mm256_and_si256(dgd, mask); \ + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); \ + INIT_H_VALUES(d_window + j + proc_wd, 5) \ + } \ + proc_ht += downsample_factor; \ + d_window += downsample_factor * d_stride; \ + d_current_row += downsample_factor * d_stride; \ + } while (proc_ht < v_end); \ + const __m256i s_h0 = \ + hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); \ + _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + (wiener_win * j)), \ + s_h0); \ + const __m256i s_m_h = convert_and_add_avx2(sum_h[4]); \ + const __m128i s_m_h0 = add_64bit_lvl_avx2(s_m_h, s_m_h); \ + _mm_storel_epi64( \ + (__m128i *)(H + (i * wiener_win2) + (wiener_win * j) + 4), s_m_h0); \ + j++; \ + } + +#define CALCULATE_REMAINING_H_WIN7 \ + while (j < wiener_win) { \ + d_window = d; \ + d_current_row = d + (i / wiener_win) + ((i % wiener_win) * d_stride); \ + const __m256i zero = _mm256_setzero_si256(); \ + sum_h[0] = zero; \ + sum_h[1] = zero; \ + sum_h[2] = zero; \ + sum_h[3] = zero; \ + sum_h[4] = zero; \ + sum_h[5] = zero; \ + sum_h[6] = zero; \ + \ + proc_ht = v_start; \ + downsample_factor = \ + use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; \ + do { \ + UPDATE_DOWNSAMPLE_FACTOR; \ + \ + /* Process the amount of width multiple of 16.*/ \ + while (proc_wd < wd_mul16) { \ + const __m256i dgd = \ + _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); \ + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); \ + INIT_H_VALUES(d_window + j + proc_wd, 7) \ + \ + proc_wd += 16; \ + }; \ + \ + /* Process the remaining width here. 
*/ \ + if (wd_beyond_mul16) { \ + const __m256i dgd = \ + _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); \ + const __m256i dgd_mask = _mm256_and_si256(dgd, mask); \ + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); \ + INIT_H_VALUES(d_window + j + proc_wd, 7) \ + } \ + proc_ht += downsample_factor; \ + d_window += downsample_factor * d_stride; \ + d_current_row += downsample_factor * d_stride; \ + } while (proc_ht < v_end); \ + const __m256i s_h1 = \ + hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); \ + _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + (wiener_win * j)), \ + s_h1); \ + const __m256i s_h2 = \ + hadd_four_32_to_64_avx2(sum_h[4], sum_h[5], &sum_h[6], &sum_h[6]); \ + _mm256_storeu_si256( \ + (__m256i *)(H + (i * wiener_win2) + (wiener_win * j) + 4), s_h2); \ + j++; \ + } + +// The buffers H (auto-covariance) and M (cross-correlation) are used to +// estimate the filter tap values required for Wiener filtering. Here, the +// buffer H is of size ((wiener_window_size^2) * (wiener_window_size^2)) and M +// is of size (wiener_window_size * wiener_window_size). H is a symmetric +// matrix where the values above the diagonal (upper triangle) are equal to the +// values below the diagonal (lower triangle). The elements/stats of H (upper +// triangle) and M are calculated in steps as described below, where each step +// fills specific values of H and M. +// Once the upper-triangle elements of the H matrix are derived, they are +// copied to the lower triangle using fill_lower_triag_elements_avx2(). +// Example: Wiener window size = WIENER_WIN_CHROMA (5) +// M buffer = [M0 M1 M2 ---- M23 M24] +// H buffer = Hxy (x-row, y-column) +// [H00 H01 H02 ---- H023 H024] +// [H10 H11 H12 ---- H123 H124] +// [H20 H21 H22 ---- H223 H224] +// [H30 H31 H32 ---- H323 H324] +// [H40 H41 H42 ---- H423 H424] +// [H50 H51 H52 ---- H523 H524] +// [H60 H61 H62 ---- H623 H624] +// || +// || +// [H230 H231 H232 ---- H2323 H2324] +// [H240 H241 H242 ---- H2423 H2424] +// In Step 1, the whole M buffer (i.e., M0 to M24) and the first row of H +// (i.e., H00 to H024) are filled. The remaining rows of the H buffer are +// filled through steps 2 to 6. +static void compute_stats_win5_avx2(const int16_t *const d, int32_t d_stride, + const int16_t *const s, int32_t s_stride, + int32_t width, int v_start, int v_end, + int64_t *const M, int64_t *const H, + int use_downsampled_wiener_stats) { + const int32_t wiener_win = WIENER_WIN_CHROMA; + const int32_t wiener_win2 = wiener_win * wiener_win; + // Amount of width which is beyond multiple of 16. This case is handled + // appropriately to process only the required width towards the end. + const int32_t wd_mul16 = width & ~15; + const int32_t wd_beyond_mul16 = width - wd_mul16; + const __m256i mask = + _mm256_loadu_si256((__m256i *)(&mask_16bit[16 - wd_beyond_mul16])); + int downsample_factor; + + // Step 1: The full M (i.e., M0 to M24) and the first row of H (i.e., H00 to + // H024) are filled here. The loop over 'j' is executed for values 0 to 4 + // (wiener_win-1). When the loop executes for a specific 'j', 5 values of M + // and H are filled as shown below. + // j=0: M0-M4 and H00-H04; j=1: M5-M9 and H05-H09; etc. + int j = 0; + do { + const int16_t *s_t = s; + const int16_t *d_t = d; + __m256i sum_m[WIENER_WIN_CHROMA] = { _mm256_setzero_si256() }; + __m256i sum_h[WIENER_WIN_CHROMA] = { _mm256_setzero_si256() }; + downsample_factor = + use_downsampled_wiener_stats ?
WIENER_STATS_DOWNSAMPLE_FACTOR : 1; + int proc_ht = v_start; + do { + UPDATE_DOWNSAMPLE_FACTOR + + // Process the amount of width multiple of 16. + while (proc_wd < wd_mul16) { + const __m256i src = _mm256_loadu_si256((__m256i *)(s_t + proc_wd)); + const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_t + proc_wd)); + const __m256i src_mul_df = _mm256_mullo_epi16(src, df_reg); + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); + INIT_MH_VALUES(d_t + j + proc_wd) + + proc_wd += 16; + } + + // Process the remaining width here. + if (wd_beyond_mul16) { + const __m256i src = _mm256_loadu_si256((__m256i *)(s_t + proc_wd)); + const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_t + proc_wd)); + const __m256i src_mask = _mm256_and_si256(src, mask); + const __m256i dgd_mask = _mm256_and_si256(dgd, mask); + const __m256i src_mul_df = _mm256_mullo_epi16(src_mask, df_reg); + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); + INIT_MH_VALUES(d_t + j + proc_wd) + } + proc_ht += downsample_factor; + s_t += downsample_factor * s_stride; + d_t += downsample_factor * d_stride; + } while (proc_ht < v_end); + + const __m256i s_m = + hadd_four_32_to_64_avx2(sum_m[0], sum_m[1], &sum_m[2], &sum_m[3]); + const __m128i s_m_h = convert_32_to_64_add_avx2(sum_m[4], sum_h[4]); + _mm256_storeu_si256((__m256i *)(M + wiener_win * j), s_m); + _mm_storel_epi64((__m128i *)&M[wiener_win * j + 4], s_m_h); + + const __m256i s_h = + hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); + _mm256_storeu_si256((__m256i *)(H + wiener_win * j), s_h); + _mm_storeh_epi64((__m128i *)&H[wiener_win * j + 4], s_m_h); + } while (++j < wiener_win); + + // The steps below are designed to fill the remaining rows of the H buffer. + // Here, the aim is to fill only the upper-triangle elements corresponding + // to each row; the lower-triangle elements are copied from the + // upper-triangle elements. Also, as mentioned in Step 1, the core function + // is designed to fill 5 elements/stats/values of the H buffer. + // + // Step 2: Here, the rows 1, 6, 11, 16 and 21 are filled. As we need to fill + // only upper-triangle elements, H10 from row1, H60-H64 and H65 from row6, + // etc., need not be filled. As the core function processes 5 values, in the + // first iteration of 'j' only 4 values are filled, i.e., H11-H14 from row1, + // H66-H69 from row6, etc. + for (int i = 1; i < wiener_win2; i += wiener_win) { + // Update the dgd pointers appropriately and also derive the 'j'th iteration + // from where the H buffer filling needs to be started. + INITIALIZATION(WIENER_WIN_CHROMA) + + do { + UPDATE_DOWNSAMPLE_FACTOR + + // Process the amount of width multiple of 16. + while (proc_wd < wd_mul16) { + const __m256i dgd = + _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); + INIT_H_VALUES(d_window + proc_wd + (1 * d_stride), 4) + + proc_wd += 16; + } + + // Process the remaining width here.
+ if (wd_beyond_mul16) { + const __m256i dgd = + _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); + const __m256i dgd_mask = _mm256_and_si256(dgd, mask); + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); + INIT_H_VALUES(d_window + proc_wd + (1 * d_stride), 4) + } + proc_ht += downsample_factor; + d_window += downsample_factor * d_stride; + d_current_row += downsample_factor * d_stride; + } while (proc_ht < v_end); + const __m256i s_h = + hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); + _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h); + + // process the remaining 'j' iterations. + j++; + CALCULATE_REMAINING_H_WIN5 + } + + // Step 3: Here, the rows 2, 7, 12, 17 and 22 are filled. As we need to fill + // only upper-triangle elements, H20-H21 from row2, H70-H74 and H75-H76 from + // row7, etc., need not be filled. As the core function processes 5 values, + // in the first iteration of 'j' only 3 values are filled, i.e., H22-H24 + // from row2, H77-H79 from row7, etc. + for (int i = 2; i < wiener_win2; i += wiener_win) { + // Update the dgd pointers appropriately and also derive the 'j'th iteration + // from where the H buffer filling needs to be started. + INITIALIZATION(WIENER_WIN_CHROMA) + + do { + UPDATE_DOWNSAMPLE_FACTOR + + // Process the amount of width multiple of 16. + while (proc_wd < wd_mul16) { + const __m256i dgd = + _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); + INIT_H_VALUES(d_window + proc_wd + (2 * d_stride), 3) + + proc_wd += 16; + } + + // Process the remaining width here. + if (wd_beyond_mul16) { + const __m256i dgd = + _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); + const __m256i dgd_mask = _mm256_and_si256(dgd, mask); + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); + INIT_H_VALUES(d_window + proc_wd + (2 * d_stride), 3) + } + proc_ht += downsample_factor; + d_window += downsample_factor * d_stride; + d_current_row += downsample_factor * d_stride; + } while (proc_ht < v_end); + const __m256i s_h = + hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); + _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h); + + // process the remaining 'j' iterations. + j++; + CALCULATE_REMAINING_H_WIN5 + } + + // Step 4: Here, the rows 3, 8, 13, 18 and 23 are filled. As we need to fill + // only upper-triangle elements, H30-H32 from row3, H80-H84 and H85-H87 from + // row8, etc., need not be filled. As the core function processes 5 values, + // in the first iteration of 'j' only 2 values are filled, i.e., H33-H34 + // from row3, H88-H89 from row8, etc. + for (int i = 3; i < wiener_win2; i += wiener_win) { + // Update the dgd pointers appropriately and also derive the 'j'th iteration + // from where the H buffer filling needs to be started. + INITIALIZATION(WIENER_WIN_CHROMA) + + do { + UPDATE_DOWNSAMPLE_FACTOR + + // Process the amount of width multiple of 16. + while (proc_wd < wd_mul16) { + const __m256i dgd = + _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); + INIT_H_VALUES(d_window + proc_wd + (3 * d_stride), 2) + + proc_wd += 16; + } + + // Process the remaining width here.
+ if (wd_beyond_mul16) { + const __m256i dgd = + _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); + const __m256i dgd_mask = _mm256_and_si256(dgd, mask); + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); + INIT_H_VALUES(d_window + proc_wd + (3 * d_stride), 2) + } + proc_ht += downsample_factor; + d_window += downsample_factor * d_stride; + d_current_row += downsample_factor * d_stride; + } while (proc_ht < v_end); + const __m128i s_h = convert_32_to_64_add_avx2(sum_h[0], sum_h[1]); + _mm_storeu_si128((__m128i *)(H + (i * wiener_win2) + i), s_h); + + // process the remaining 'j' iterations. + j++; + CALCULATE_REMAINING_H_WIN5 + } + + // Step 5: Here, the rows 4, 9, 14, 19 and 24 are filled. As we need to fill + // only upper-triangle elements, H40-H43 from row4, H90-H94 and H95-H98 from + // row9, etc., need not be filled. As the core function processes 5 values, + // in the first iteration of 'j' only 1 value is filled, i.e., H44 from + // row4, H99 from row9, etc. + for (int i = 4; i < wiener_win2; i += wiener_win) { + // Update the dgd pointers appropriately and also derive the 'j'th iteration + // from where the H buffer filling needs to be started. + INITIALIZATION(WIENER_WIN_CHROMA) + do { + UPDATE_DOWNSAMPLE_FACTOR + + // Process the amount of width multiple of 16. + while (proc_wd < wd_mul16) { + const __m256i dgd = + _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); + INIT_H_VALUES(d_window + proc_wd + (4 * d_stride), 1) + + proc_wd += 16; + } + + // Process the remaining width here. + if (wd_beyond_mul16) { + const __m256i dgd = + _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); + const __m256i dgd_mask = _mm256_and_si256(dgd, mask); + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); + INIT_H_VALUES(d_window + proc_wd + (4 * d_stride), 1) + } + proc_ht += downsample_factor; + d_window += downsample_factor * d_stride; + d_current_row += downsample_factor * d_stride; + } while (proc_ht < v_end); + const __m128i s_h = convert_32_to_64_add_avx2(sum_h[0], sum_h[1]); + _mm_storeu_si128((__m128i *)(H + (i * wiener_win2) + i), s_h); + + // process the remaining 'j' iterations. + j++; + CALCULATE_REMAINING_H_WIN5 + } + + // Step 6: Here, the rows 5, 10, 15 and 20 are filled. As we need to fill + // only upper-triangle elements, H50-H54 from row5, H100-H104 and H105-H109 + // from row10, etc., need not be filled. The first iteration of 'j' fills + // H55-H59 from row5 and H1010-H1014 from row10, etc. + for (int i = 5; i < wiener_win2; i += wiener_win) { + // Derive j'th iteration from where the H buffer filling needs to be + // started. + j = i / wiener_win; + int shift = 0; + do { + // Update the dgd pointers appropriately. + int proc_ht = v_start; + const int16_t *d_window = d + (i / wiener_win); + const int16_t *d_current_row = + d + (i / wiener_win) + ((i % wiener_win) * d_stride); + downsample_factor = + use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; + __m256i sum_h[WIENER_WIN_CHROMA] = { _mm256_setzero_si256() }; + do { + UPDATE_DOWNSAMPLE_FACTOR + + // Process the amount of width multiple of 16. + while (proc_wd < wd_mul16) { + const __m256i dgd = + _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); + INIT_H_VALUES(d_window + shift + proc_wd, 5) + + proc_wd += 16; + } + + // Process the remaining width here.
+ if (wd_beyond_mul16) { + const __m256i dgd = + _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); + const __m256i dgd_mask = _mm256_and_si256(dgd, mask); + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); + INIT_H_VALUES(d_window + shift + proc_wd, 5) + } + proc_ht += downsample_factor; + d_window += downsample_factor * d_stride; + d_current_row += downsample_factor * d_stride; + } while (proc_ht < v_end); + + const __m256i s_h = + hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); + _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + (wiener_win * j)), + s_h); + const __m256i s_m_h = convert_and_add_avx2(sum_h[4]); + const __m128i s_m_h0 = add_64bit_lvl_avx2(s_m_h, s_m_h); + _mm_storel_epi64( + (__m128i *)(H + (i * wiener_win2) + (wiener_win * j) + 4), s_m_h0); + shift++; + } while (++j < wiener_win); + } + + fill_lower_triag_elements_avx2(wiener_win2, H); +} + +// The buffers H (auto-covariance) and M (cross-correlation) are used to +// estimate the filter tap values required for Wiener filtering. Here, the +// buffer H is of size ((wiener_window_size^2) * (wiener_window_size^2)) and M +// is of size (wiener_window_size * wiener_window_size). H is a symmetric +// matrix where the values above the diagonal (upper triangle) are equal to the +// values below the diagonal (lower triangle). The elements/stats of H (upper +// triangle) and M are calculated in steps as described below, where each step +// fills specific values of H and M. +// Example: +// Wiener window size = WIENER_WIN (7) +// M buffer = [M0 M1 M2 ---- M47 M48] +// H buffer = Hxy (x-row, y-column) +// [H00 H01 H02 ---- H047 H048] +// [H10 H11 H12 ---- H147 H148] +// [H20 H21 H22 ---- H247 H248] +// [H30 H31 H32 ---- H347 H348] +// [H40 H41 H42 ---- H447 H448] +// [H50 H51 H52 ---- H547 H548] +// [H60 H61 H62 ---- H647 H648] +// || +// || +// [H470 H471 H472 ---- H4747 H4748] +// [H480 H481 H482 ---- H4847 H4848] +// In Step 1, the whole M buffer (i.e., M0 to M48) and the first row of H +// (i.e., H00 to H048) are filled. The remaining rows of the H buffer are +// filled through steps 2 to 8. +static void compute_stats_win7_avx2(const int16_t *const d, int32_t d_stride, + const int16_t *const s, int32_t s_stride, + int32_t width, int v_start, int v_end, + int64_t *const M, int64_t *const H, + int use_downsampled_wiener_stats) { + const int32_t wiener_win = WIENER_WIN; + const int32_t wiener_win2 = wiener_win * wiener_win; + // Amount of width which is beyond multiple of 16. This case is handled + // appropriately to process only the required width towards the end. + const int32_t wd_mul16 = width & ~15; + const int32_t wd_beyond_mul16 = width - wd_mul16; + const __m256i mask = + _mm256_loadu_si256((__m256i *)(&mask_16bit[16 - wd_beyond_mul16])); + int downsample_factor; + + // Step 1: The full M (i.e., M0 to M48) and the first row of H (i.e., H00 to + // H048) are filled here. The loop over 'j' is executed for values 0 to 6. + // When the loop executes for a specific 'j', 7 values of M and H are filled + // as shown below. + // j=0: M0-M6 and H00-H06; j=1: M7-M13 and H07-H013; etc. + int j = 0; + do { + const int16_t *s_t = s; + const int16_t *d_t = d; + __m256i sum_m[WIENER_WIN] = { _mm256_setzero_si256() }; + __m256i sum_h[WIENER_WIN] = { _mm256_setzero_si256() }; + downsample_factor = + use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; + int proc_ht = v_start; + do { + UPDATE_DOWNSAMPLE_FACTOR + + // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) { + const __m256i src = _mm256_loadu_si256((__m256i *)(s_t + proc_wd)); + const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_t + proc_wd)); + const __m256i src_mul_df = _mm256_mullo_epi16(src, df_reg); + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); + INIT_MH_VALUES(d_t + j + proc_wd) + + proc_wd += 16; + } + + if (wd_beyond_mul16) { + const __m256i src = _mm256_loadu_si256((__m256i *)(s_t + proc_wd)); + const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_t + proc_wd)); + const __m256i src_mask = _mm256_and_si256(src, mask); + const __m256i dgd_mask = _mm256_and_si256(dgd, mask); + const __m256i src_mul_df = _mm256_mullo_epi16(src_mask, df_reg); + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); + INIT_MH_VALUES(d_t + j + proc_wd) + } + proc_ht += downsample_factor; + s_t += downsample_factor * s_stride; + d_t += downsample_factor * d_stride; + } while (proc_ht < v_end); + + const __m256i s_m0 = + hadd_four_32_to_64_avx2(sum_m[0], sum_m[1], &sum_m[2], &sum_m[3]); + const __m256i s_m1 = + hadd_four_32_to_64_avx2(sum_m[4], sum_m[5], &sum_m[6], &sum_m[6]); + _mm256_storeu_si256((__m256i *)(M + wiener_win * j + 0), s_m0); + _mm_storeu_si128((__m128i *)(M + wiener_win * j + 4), + _mm256_castsi256_si128(s_m1)); + _mm_storel_epi64((__m128i *)&M[wiener_win * j + 6], + _mm256_extracti128_si256(s_m1, 1)); + + const __m256i sh_0 = + hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); + const __m256i sh_1 = + hadd_four_32_to_64_avx2(sum_h[4], sum_h[5], &sum_h[6], &sum_h[6]); + _mm256_storeu_si256((__m256i *)(H + wiener_win * j + 0), sh_0); + _mm_storeu_si128((__m128i *)(H + wiener_win * j + 4), + _mm256_castsi256_si128(sh_1)); + _mm_storel_epi64((__m128i *)&H[wiener_win * j + 6], + _mm256_extracti128_si256(sh_1, 1)); + } while (++j < wiener_win); + + // The steps below are designed to fill the remaining rows of the H buffer. + // Here, the aim is to fill only the upper-triangle elements corresponding + // to each row; the lower-triangle elements are copied from the + // upper-triangle elements. Also, as mentioned in Step 1, the core function + // is designed to fill 7 elements/stats/values of the H buffer. + // + // Step 2: Here, the rows 1, 8, 15, 22, 29, 36 and 43 are filled. As we need + // to fill only upper-triangle elements, H10 from row1, H80-H86 and H87 from + // row8, etc., need not be filled. As the core function processes 7 values, + // in the first iteration of 'j' only 6 values are filled, i.e., H11-H16 + // from row1 and H88-H813 from row8, etc. + for (int i = 1; i < wiener_win2; i += wiener_win) { + // Update the dgd pointers appropriately and also derive the 'j'th iteration + // from where the H buffer filling needs to be started. + INITIALIZATION(WIENER_WIN) + + do { + UPDATE_DOWNSAMPLE_FACTOR + + // Process the amount of width multiple of 16. + while (proc_wd < wd_mul16) { + const __m256i dgd = + _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); + INIT_H_VALUES(d_window + proc_wd + (1 * d_stride), 6) + + proc_wd += 16; + } + + // Process the remaining width here.
+ if (wd_beyond_mul16) { + const __m256i dgd = + _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); + const __m256i dgd_mask = _mm256_and_si256(dgd, mask); + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); + INIT_H_VALUES(d_window + proc_wd + (1 * d_stride), 6) + } + proc_ht += downsample_factor; + d_window += downsample_factor * d_stride; + d_current_row += downsample_factor * d_stride; + } while (proc_ht < v_end); + const __m256i s_h = + hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); + _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h); + const __m128i s_h0 = convert_32_to_64_add_avx2(sum_h[4], sum_h[5]); + _mm_storeu_si128((__m128i *)(H + (i * wiener_win2) + i + 4), s_h0); + + // process the remaining 'j' iterations. + j++; + CALCULATE_REMAINING_H_WIN7 + } + + // Step 3: Here, the rows 2, 9, 16, 23, 30, 37 and 44 are filled. As we need + // to fill only upper-triangle elements, H20-H21 from row2, H90-H96 and + // H97-H98 from row9, etc., need not be filled. As the core function + // processes 7 values, in the first iteration of 'j' only 5 values are + // filled, i.e., H22-H26 from row2 and H99-H913 from row9, etc. + for (int i = 2; i < wiener_win2; i += wiener_win) { + // Update the dgd pointers appropriately and also derive the 'j'th iteration + // from where the H buffer filling needs to be started. + INITIALIZATION(WIENER_WIN) + do { + UPDATE_DOWNSAMPLE_FACTOR + + // Process the amount of width multiple of 16. + while (proc_wd < wd_mul16) { + const __m256i dgd = + _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); + INIT_H_VALUES(d_window + proc_wd + (2 * d_stride), 5) + + proc_wd += 16; + } + + // Process the remaining width here. + if (wd_beyond_mul16) { + const __m256i dgd = + _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); + const __m256i dgd_mask = _mm256_and_si256(dgd, mask); + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); + INIT_H_VALUES(d_window + proc_wd + (2 * d_stride), 5) + } + proc_ht += downsample_factor; + d_window += downsample_factor * d_stride; + d_current_row += downsample_factor * d_stride; + } while (proc_ht < v_end); + const __m256i s_h = + hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); + _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h); + const __m256i s_m_h = convert_and_add_avx2(sum_h[4]); + const __m128i s_m_h0 = add_64bit_lvl_avx2(s_m_h, s_m_h); + _mm_storel_epi64((__m128i *)(H + (i * wiener_win2) + i + 4), s_m_h0); + + // process the remaining 'j' iterations. + j++; + CALCULATE_REMAINING_H_WIN7 + } + + // Step 4: Here, the rows 3, 10, 17, 24, 31, 38 and 45 are filled. As we + // need to fill only upper-triangle elements, H30-H32 from row3, H100-H106 + // and H107-H109 from row10, etc., need not be filled. As the core function + // processes 7 values, in the first iteration of 'j' only 4 values are + // filled, i.e., H33-H36 from row3 and H1010-H1013 from row10, etc. + for (int i = 3; i < wiener_win2; i += wiener_win) { + // Update the dgd pointers appropriately and also derive the 'j'th iteration + // from where the H buffer filling needs to be started. + INITIALIZATION(WIENER_WIN) + + do { + UPDATE_DOWNSAMPLE_FACTOR + + // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (3 * d_stride), 4)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (3 * d_stride), 4)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+ const __m256i s_h =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h);
+
+ // Process the remaining 'j' iterations.
+ j++;
+ CALCULATE_REMAINING_H_WIN7
+ }
+
+ // Step 5: Here, the rows 4, 11, 18, 25, 32, 39 and 46 are filled. As only
+ // upper-triangle elements need to be filled, H40-H43 from row4, H110-H116
+ // and H117-H1110 from row11, etc. need not be filled. As the core function
+ // processes 7 values, in the first iteration of 'j' only 3 values need to
+ // be filled, i.e., H44-H46 from row4 and H1111-H1113 from row11, etc.
+ for (int i = 4; i < wiener_win2; i += wiener_win) {
+ // Update the dgd pointers appropriately and also derive the 'j'th iteration
+ // from where the H buffer filling needs to be started.
+ INITIALIZATION(WIENER_WIN)
+
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the width in multiples of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (4 * d_stride), 3)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (4 * d_stride), 3)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+ const __m256i s_h =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h);
+
+ // Process the remaining 'j' iterations.
+ j++;
+ CALCULATE_REMAINING_H_WIN7
+ }
+
+ // Step 6: Here, the rows 5, 12, 19, 26, 33, 40 and 47 are filled. As only
+ // upper-triangle elements need to be filled, H50-H54 from row5, H120-H126
+ // and H127-H1211 from row12, etc. need not be filled. As the core function
+ // processes 7 values, in the first iteration of 'j' only 2 values need to
+ // be filled, i.e., H55-H56 from row5 and H1212-H1213 from row12, etc.
+ for (int i = 5; i < wiener_win2; i += wiener_win) {
+ // Update the dgd pointers appropriately and also derive the 'j'th iteration
+ // from where the H buffer filling needs to be started.
+ INITIALIZATION(WIENER_WIN)
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the width in multiples of 16.
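+ // Assumed expansion of the UPDATE_DOWNSAMPLE_FACTOR macro invoked above
+ // (a sketch mirroring the SSE4.1 variant later in this patch):
+ //   if (use_downsampled_wiener_stats &&
+ //       (v_end - proc_ht < WIENER_STATS_DOWNSAMPLE_FACTOR))
+ //     downsample_factor = v_end - proc_ht;  // clamp at the bottom edge
+ //   df_reg = _mm256_set1_epi16((int16_t)downsample_factor);
+ // so each visited row is weighted by the number of rows it stands in for.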
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (5 * d_stride), 2)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (5 * d_stride), 2)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+ const __m256i s_h =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h);
+
+ // Process the remaining 'j' iterations.
+ j++;
+ CALCULATE_REMAINING_H_WIN7
+ }
+
+ // Step 7: Here, the rows 6, 13, 20, 27, 34, 41 and 48 are filled. As only
+ // upper-triangle elements need to be filled, H60-H65 from row6, H130-H136
+ // and H137-H1312 from row13, etc. need not be filled. As the core function
+ // processes 7 values, in the first iteration of 'j' only 1 value needs to
+ // be filled, i.e., H66 from row6 and H1313 from row13, etc.
+ for (int i = 6; i < wiener_win2; i += wiener_win) {
+ // Update the dgd pointers appropriately and also derive the 'j'th iteration
+ // from where the H buffer filling needs to be started.
+ INITIALIZATION(WIENER_WIN)
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the width in multiples of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (6 * d_stride), 1)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (6 * d_stride), 1)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+ const __m256i s_h =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ xx_storel_64(&H[(i * wiener_win2) + i], _mm256_castsi256_si128(s_h));
+
+ // Process the remaining 'j' iterations.
+ j++;
+ CALCULATE_REMAINING_H_WIN7
+ }
+
+ // Step 8: Here, the rows 7, 14, 21, 28, 35 and 42 are filled. As only
+ // upper-triangle elements need to be filled, H70-H76 from row7, H140-H146
+ // and H147-H1413 from row14, etc. need not be filled. The first iteration
+ // of 'j' fills H77-H713 from row7 and H1414-H1420 from row14, etc.
+ for (int i = 7; i < wiener_win2; i += wiener_win) {
+ // Derive j'th iteration from where the H buffer filling needs to be
+ // started.
+ j = i / wiener_win;
+ int shift = 0;
+ do {
+ // Update the dgd pointers appropriately.
+ int proc_ht = v_start;
+ const int16_t *d_window = d + (i / WIENER_WIN);
+ const int16_t *d_current_row =
+ d + (i / WIENER_WIN) + ((i % WIENER_WIN) * d_stride);
+ downsample_factor =
+ use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
+ __m256i sum_h[WIENER_WIN] = { _mm256_setzero_si256() };
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the width in multiples of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + shift + proc_wd, 7)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + shift + proc_wd, 7)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+
+ const __m256i sh_0 =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ const __m256i sh_1 =
+ hadd_four_32_to_64_avx2(sum_h[4], sum_h[5], &sum_h[6], &sum_h[6]);
+ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + (wiener_win * j)),
+ sh_0);
+ _mm_storeu_si128(
+ (__m128i *)(H + (i * wiener_win2) + (wiener_win * j) + 4),
+ _mm256_castsi256_si128(sh_1));
+ _mm_storel_epi64((__m128i *)&H[(i * wiener_win2) + (wiener_win * j) + 6],
+ _mm256_extracti128_si256(sh_1, 1));
+ shift++;
+ } while (++j < wiener_win);
+ }
+
+ fill_lower_triag_elements_avx2(wiener_win2, H);
+}
+
+void av1_compute_stats_avx2(int wiener_win, const uint8_t *dgd,
+ const uint8_t *src, int16_t *dgd_avg,
+ int16_t *src_avg, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride,
+ int src_stride, int64_t *M, int64_t *H,
+ int use_downsampled_wiener_stats) {
+ if (wiener_win != WIENER_WIN && wiener_win != WIENER_WIN_CHROMA) {
+ // Currently, libaom supports Wiener filter processing only for the window
+ // sizes WIENER_WIN_CHROMA(5) and WIENER_WIN(7); SIMD support is not
+ // available for any other window size, so invoke the C function instead.
+ av1_compute_stats_c(wiener_win, dgd, src, dgd_avg, src_avg, h_start, h_end,
+ v_start, v_end, dgd_stride, src_stride, M, H,
+ use_downsampled_wiener_stats);
+ return;
+ }
+
+ const int32_t wiener_halfwin = wiener_win >> 1;
+ const uint8_t avg =
+ calc_dgd_buf_avg_avx2(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+ const int32_t width = h_end - h_start;
+ const int32_t height = v_end - v_start;
+ const int32_t d_stride = (width + 2 * wiener_halfwin + 15) & ~15;
+ const int32_t s_stride = (width + 15) & ~15;
+
+ // Compute the (src-avg) buffer here, which is used to fill the M buffer.
+ // Based on the sf 'use_downsampled_wiener_stats', it is computed either
+ // only for the rows selected by the downsample factor or for every row.
+ sub_avg_block_avx2(src + v_start * src_stride + h_start, src_stride, avg,
+ width, height, src_avg, s_stride,
+ use_downsampled_wiener_stats);
+
+ // Compute the (dgd-avg) buffer here, which is used to fill the H buffer.
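+ // (For reference: the stride rounding above keeps each row of these scratch
+ // buffers a whole number of 16-lane vectors, so the core loops need no
+ // per-row realignment. E.g. width = 53 with wiener_halfwin = 3 gives
+ // d_stride = (53 + 6 + 15) & ~15 = 64 and s_stride = (53 + 15) & ~15 = 64.)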
+ sub_avg_block_avx2( + dgd + (v_start - wiener_halfwin) * dgd_stride + h_start - wiener_halfwin, + dgd_stride, avg, width + 2 * wiener_halfwin, height + 2 * wiener_halfwin, + dgd_avg, d_stride, 0); + if (wiener_win == WIENER_WIN) { + compute_stats_win7_avx2(dgd_avg, d_stride, src_avg, s_stride, width, + v_start, v_end, M, H, use_downsampled_wiener_stats); + } else if (wiener_win == WIENER_WIN_CHROMA) { + compute_stats_win5_avx2(dgd_avg, d_stride, src_avg, s_stride, width, + v_start, v_end, M, H, use_downsampled_wiener_stats); + } +} + +static INLINE __m256i pair_set_epi16(int a, int b) { + return _mm256_set1_epi32( + (int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16))); +} + +int64_t av1_lowbd_pixel_proj_error_avx2( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) { + int i, j, k; + const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS; + const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1)); + __m256i sum64 = _mm256_setzero_si256(); + const uint8_t *src = src8; + const uint8_t *dat = dat8; + int64_t err = 0; + if (params->r[0] > 0 && params->r[1] > 0) { + __m256i xq_coeff = pair_set_epi16(xq[0], xq[1]); + for (i = 0; i < height; ++i) { + __m256i sum32 = _mm256_setzero_si256(); + for (j = 0; j <= width - 16; j += 16) { + const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j)); + const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j)); + const __m256i flt0_16b = _mm256_permute4x64_epi64( + _mm256_packs_epi32(yy_loadu_256(flt0 + j), + yy_loadu_256(flt0 + j + 8)), + 0xd8); + const __m256i flt1_16b = _mm256_permute4x64_epi64( + _mm256_packs_epi32(yy_loadu_256(flt1 + j), + yy_loadu_256(flt1 + j + 8)), + 0xd8); + const __m256i u0 = _mm256_slli_epi16(d0, SGRPROJ_RST_BITS); + const __m256i flt0_0_sub_u = _mm256_sub_epi16(flt0_16b, u0); + const __m256i flt1_0_sub_u = _mm256_sub_epi16(flt1_16b, u0); + const __m256i v0 = _mm256_madd_epi16( + xq_coeff, _mm256_unpacklo_epi16(flt0_0_sub_u, flt1_0_sub_u)); + const __m256i v1 = _mm256_madd_epi16( + xq_coeff, _mm256_unpackhi_epi16(flt0_0_sub_u, flt1_0_sub_u)); + const __m256i vr0 = + _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift); + const __m256i vr1 = + _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift); + const __m256i e0 = _mm256_sub_epi16( + _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0); + const __m256i err0 = _mm256_madd_epi16(e0, e0); + sum32 = _mm256_add_epi32(sum32, err0); + } + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt0 += flt0_stride; + flt1 += flt1_stride; + const __m256i sum64_0 = + _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32)); + const __m256i sum64_1 = + _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1)); + sum64 = _mm256_add_epi64(sum64, sum64_0); + sum64 = _mm256_add_epi64(sum64, sum64_1); + } + } else if (params->r[0] > 0 || params->r[1] > 0) { + const int xq_active = (params->r[0] > 0) ? xq[0] : xq[1]; + const __m256i xq_coeff = + pair_set_epi16(xq_active, -xq_active * (1 << SGRPROJ_RST_BITS)); + const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1; + const int flt_stride = (params->r[0] > 0) ? 
flt0_stride : flt1_stride; + for (i = 0; i < height; ++i) { + __m256i sum32 = _mm256_setzero_si256(); + for (j = 0; j <= width - 16; j += 16) { + const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j)); + const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j)); + const __m256i flt_16b = _mm256_permute4x64_epi64( + _mm256_packs_epi32(yy_loadu_256(flt + j), + yy_loadu_256(flt + j + 8)), + 0xd8); + const __m256i v0 = + _mm256_madd_epi16(xq_coeff, _mm256_unpacklo_epi16(flt_16b, d0)); + const __m256i v1 = + _mm256_madd_epi16(xq_coeff, _mm256_unpackhi_epi16(flt_16b, d0)); + const __m256i vr0 = + _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift); + const __m256i vr1 = + _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift); + const __m256i e0 = _mm256_sub_epi16( + _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0); + const __m256i err0 = _mm256_madd_epi16(e0, e0); + sum32 = _mm256_add_epi32(sum32, err0); + } + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq_active * (flt[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt += flt_stride; + const __m256i sum64_0 = + _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32)); + const __m256i sum64_1 = + _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1)); + sum64 = _mm256_add_epi64(sum64, sum64_0); + sum64 = _mm256_add_epi64(sum64, sum64_1); + } + } else { + __m256i sum32 = _mm256_setzero_si256(); + for (i = 0; i < height; ++i) { + for (j = 0; j <= width - 16; j += 16) { + const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j)); + const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j)); + const __m256i diff0 = _mm256_sub_epi16(d0, s0); + const __m256i err0 = _mm256_madd_epi16(diff0, diff0); + sum32 = _mm256_add_epi32(sum32, err0); + } + for (k = j; k < width; ++k) { + const int32_t e = (int32_t)(dat[k]) - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + } + const __m256i sum64_0 = + _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32)); + const __m256i sum64_1 = + _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1)); + sum64 = _mm256_add_epi64(sum64_0, sum64_1); + } + int64_t sum[4]; + yy_storeu_256(sum, sum64); + err += sum[0] + sum[1] + sum[2] + sum[3]; + return err; +} + +// When params->r[0] > 0 and params->r[1] > 0. In this case all elements of +// C and H need to be computed. 
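+// For orientation, a reference formulation consistent with
+// av1_calc_proj_params_c: with u = dat << SGRPROJ_RST_BITS,
+// s = (src << SGRPROJ_RST_BITS) - u and f_p = flt_p - u, the routine below
+// accumulates
+//   H[p][q] = (sum over all pixels of f_p * f_q) / size
+//   C[p]    = (sum over all pixels of f_p * s) / size
+// The even/odd _mm256_mul_epi32 pairs exist because mul_epi32 only multiplies
+// the even 32-bit lanes into 64-bit results; shifting each 64-bit lane right
+// by 32 moves the odd elements into even positions so both halves of every
+// register are covered.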
+static AOM_INLINE void calc_proj_params_r0_r1_avx2( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint8_t *src = src8; + const uint8_t *dat = dat8; + __m256i h00, h01, h11, c0, c1; + const __m256i zero = _mm256_setzero_si256(); + h01 = h11 = c0 = c1 = h00 = zero; + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; j += 8) { + const __m256i u_load = _mm256_cvtepu8_epi32( + _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j))); + const __m256i s_load = _mm256_cvtepu8_epi32( + _mm_loadl_epi64((__m128i *)(src + i * src_stride + j))); + __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j)); + __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j)); + __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS); + __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS); + s = _mm256_sub_epi32(s, d); + f1 = _mm256_sub_epi32(f1, d); + f2 = _mm256_sub_epi32(f2, d); + + const __m256i h00_even = _mm256_mul_epi32(f1, f1); + const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), + _mm256_srli_epi64(f1, 32)); + h00 = _mm256_add_epi64(h00, h00_even); + h00 = _mm256_add_epi64(h00, h00_odd); + + const __m256i h01_even = _mm256_mul_epi32(f1, f2); + const __m256i h01_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), + _mm256_srli_epi64(f2, 32)); + h01 = _mm256_add_epi64(h01, h01_even); + h01 = _mm256_add_epi64(h01, h01_odd); + + const __m256i h11_even = _mm256_mul_epi32(f2, f2); + const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), + _mm256_srli_epi64(f2, 32)); + h11 = _mm256_add_epi64(h11, h11_even); + h11 = _mm256_add_epi64(h11, h11_odd); + + const __m256i c0_even = _mm256_mul_epi32(f1, s); + const __m256i c0_odd = + _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32)); + c0 = _mm256_add_epi64(c0, c0_even); + c0 = _mm256_add_epi64(c0, c0_odd); + + const __m256i c1_even = _mm256_mul_epi32(f2, s); + const __m256i c1_odd = + _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32)); + c1 = _mm256_add_epi64(c1, c1_even); + c1 = _mm256_add_epi64(c1, c1_odd); + } + } + + __m256i c_low = _mm256_unpacklo_epi64(c0, c1); + const __m256i c_high = _mm256_unpackhi_epi64(c0, c1); + c_low = _mm256_add_epi64(c_low, c_high); + const __m128i c_128bit = _mm_add_epi64(_mm256_extracti128_si256(c_low, 1), + _mm256_castsi256_si128(c_low)); + + __m256i h0x_low = _mm256_unpacklo_epi64(h00, h01); + const __m256i h0x_high = _mm256_unpackhi_epi64(h00, h01); + h0x_low = _mm256_add_epi64(h0x_low, h0x_high); + const __m128i h0x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h0x_low, 1), + _mm256_castsi256_si128(h0x_low)); + + // Using the symmetric properties of H, calculations of H[1][0] are not + // needed. + __m256i h1x_low = _mm256_unpacklo_epi64(zero, h11); + const __m256i h1x_high = _mm256_unpackhi_epi64(zero, h11); + h1x_low = _mm256_add_epi64(h1x_low, h1x_high); + const __m128i h1x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h1x_low, 1), + _mm256_castsi256_si128(h1x_low)); + + xx_storeu_128(C, c_128bit); + xx_storeu_128(H[0], h0x_128bit); + xx_storeu_128(H[1], h1x_128bit); + + H[0][0] /= size; + H[0][1] /= size; + H[1][1] /= size; + + // Since H is a symmetric matrix + H[1][0] = H[0][1]; + C[0] /= size; + C[1] /= size; +} + +// When only params->r[0] > 0. 
In this case only H[0][0] and C[0] are +// non-zero and need to be computed. +static AOM_INLINE void calc_proj_params_r0_avx2(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, + int dat_stride, int32_t *flt0, + int flt0_stride, + int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint8_t *src = src8; + const uint8_t *dat = dat8; + __m256i h00, c0; + const __m256i zero = _mm256_setzero_si256(); + c0 = h00 = zero; + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; j += 8) { + const __m256i u_load = _mm256_cvtepu8_epi32( + _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j))); + const __m256i s_load = _mm256_cvtepu8_epi32( + _mm_loadl_epi64((__m128i *)(src + i * src_stride + j))); + __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j)); + __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS); + __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS); + s = _mm256_sub_epi32(s, d); + f1 = _mm256_sub_epi32(f1, d); + + const __m256i h00_even = _mm256_mul_epi32(f1, f1); + const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), + _mm256_srli_epi64(f1, 32)); + h00 = _mm256_add_epi64(h00, h00_even); + h00 = _mm256_add_epi64(h00, h00_odd); + + const __m256i c0_even = _mm256_mul_epi32(f1, s); + const __m256i c0_odd = + _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32)); + c0 = _mm256_add_epi64(c0, c0_even); + c0 = _mm256_add_epi64(c0, c0_odd); + } + } + const __m128i h00_128bit = _mm_add_epi64(_mm256_extracti128_si256(h00, 1), + _mm256_castsi256_si128(h00)); + const __m128i h00_val = + _mm_add_epi64(h00_128bit, _mm_srli_si128(h00_128bit, 8)); + + const __m128i c0_128bit = _mm_add_epi64(_mm256_extracti128_si256(c0, 1), + _mm256_castsi256_si128(c0)); + const __m128i c0_val = _mm_add_epi64(c0_128bit, _mm_srli_si128(c0_128bit, 8)); + + const __m128i c = _mm_unpacklo_epi64(c0_val, _mm256_castsi256_si128(zero)); + const __m128i h0x = _mm_unpacklo_epi64(h00_val, _mm256_castsi256_si128(zero)); + + xx_storeu_128(C, c); + xx_storeu_128(H[0], h0x); + + H[0][0] /= size; + C[0] /= size; +} + +// When only params->r[1] > 0. In this case only H[1][1] and C[1] are +// non-zero and need to be computed. 
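+// In the single-filter cases the 2x2 system degenerates to one scalar
+// equation; the unpacklo-with-zero stores write the unused slot as 0, so
+// here H[1] = { 0, h11 } and C = { 0, c1 } before the division by size.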
+static AOM_INLINE void calc_proj_params_r1_avx2(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, + int dat_stride, int32_t *flt1, + int flt1_stride, + int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint8_t *src = src8; + const uint8_t *dat = dat8; + __m256i h11, c1; + const __m256i zero = _mm256_setzero_si256(); + c1 = h11 = zero; + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; j += 8) { + const __m256i u_load = _mm256_cvtepu8_epi32( + _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j))); + const __m256i s_load = _mm256_cvtepu8_epi32( + _mm_loadl_epi64((__m128i *)(src + i * src_stride + j))); + __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j)); + __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS); + __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS); + s = _mm256_sub_epi32(s, d); + f2 = _mm256_sub_epi32(f2, d); + + const __m256i h11_even = _mm256_mul_epi32(f2, f2); + const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), + _mm256_srli_epi64(f2, 32)); + h11 = _mm256_add_epi64(h11, h11_even); + h11 = _mm256_add_epi64(h11, h11_odd); + + const __m256i c1_even = _mm256_mul_epi32(f2, s); + const __m256i c1_odd = + _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32)); + c1 = _mm256_add_epi64(c1, c1_even); + c1 = _mm256_add_epi64(c1, c1_odd); + } + } + + const __m128i h11_128bit = _mm_add_epi64(_mm256_extracti128_si256(h11, 1), + _mm256_castsi256_si128(h11)); + const __m128i h11_val = + _mm_add_epi64(h11_128bit, _mm_srli_si128(h11_128bit, 8)); + + const __m128i c1_128bit = _mm_add_epi64(_mm256_extracti128_si256(c1, 1), + _mm256_castsi256_si128(c1)); + const __m128i c1_val = _mm_add_epi64(c1_128bit, _mm_srli_si128(c1_128bit, 8)); + + const __m128i c = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), c1_val); + const __m128i h1x = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), h11_val); + + xx_storeu_128(C, c); + xx_storeu_128(H[1], h1x); + + H[1][1] /= size; + C[1] /= size; +} + +// AVX2 variant of av1_calc_proj_params_c. 
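+// Typical call shape (illustrative sketch only; the caller-side variable
+// names and zero-initialisation are assumptions, not part of this patch):
+//   int64_t H[2][2] = { { 0 } }, C[2] = { 0 };
+//   av1_calc_proj_params_avx2(src8, width, height, src_stride, dat8,
+//                             dat_stride, flt0, flt0_stride, flt1,
+//                             flt1_stride, H, C, params);
+//   // ...then solve H * xq = C for the two projection coefficients.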
+void av1_calc_proj_params_avx2(const uint8_t *src8, int width, int height, + int src_stride, const uint8_t *dat8, + int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int64_t H[2][2], + int64_t C[2], const sgr_params_type *params) { + if ((params->r[0] > 0) && (params->r[1] > 0)) { + calc_proj_params_r0_r1_avx2(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, flt1, + flt1_stride, H, C); + } else if (params->r[0] > 0) { + calc_proj_params_r0_avx2(src8, width, height, src_stride, dat8, dat_stride, + flt0, flt0_stride, H, C); + } else if (params->r[1] > 0) { + calc_proj_params_r1_avx2(src8, width, height, src_stride, dat8, dat_stride, + flt1, flt1_stride, H, C); + } +} + +static AOM_INLINE void calc_proj_params_r0_r1_high_bd_avx2( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + __m256i h00, h01, h11, c0, c1; + const __m256i zero = _mm256_setzero_si256(); + h01 = h11 = c0 = c1 = h00 = zero; + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; j += 8) { + const __m256i u_load = _mm256_cvtepu16_epi32( + _mm_load_si128((__m128i *)(dat + i * dat_stride + j))); + const __m256i s_load = _mm256_cvtepu16_epi32( + _mm_load_si128((__m128i *)(src + i * src_stride + j))); + __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j)); + __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j)); + __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS); + __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS); + s = _mm256_sub_epi32(s, d); + f1 = _mm256_sub_epi32(f1, d); + f2 = _mm256_sub_epi32(f2, d); + + const __m256i h00_even = _mm256_mul_epi32(f1, f1); + const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), + _mm256_srli_epi64(f1, 32)); + h00 = _mm256_add_epi64(h00, h00_even); + h00 = _mm256_add_epi64(h00, h00_odd); + + const __m256i h01_even = _mm256_mul_epi32(f1, f2); + const __m256i h01_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), + _mm256_srli_epi64(f2, 32)); + h01 = _mm256_add_epi64(h01, h01_even); + h01 = _mm256_add_epi64(h01, h01_odd); + + const __m256i h11_even = _mm256_mul_epi32(f2, f2); + const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), + _mm256_srli_epi64(f2, 32)); + h11 = _mm256_add_epi64(h11, h11_even); + h11 = _mm256_add_epi64(h11, h11_odd); + + const __m256i c0_even = _mm256_mul_epi32(f1, s); + const __m256i c0_odd = + _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32)); + c0 = _mm256_add_epi64(c0, c0_even); + c0 = _mm256_add_epi64(c0, c0_odd); + + const __m256i c1_even = _mm256_mul_epi32(f2, s); + const __m256i c1_odd = + _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32)); + c1 = _mm256_add_epi64(c1, c1_even); + c1 = _mm256_add_epi64(c1, c1_odd); + } + } + + __m256i c_low = _mm256_unpacklo_epi64(c0, c1); + const __m256i c_high = _mm256_unpackhi_epi64(c0, c1); + c_low = _mm256_add_epi64(c_low, c_high); + const __m128i c_128bit = _mm_add_epi64(_mm256_extracti128_si256(c_low, 1), + _mm256_castsi256_si128(c_low)); + + __m256i h0x_low = _mm256_unpacklo_epi64(h00, h01); + const __m256i h0x_high = _mm256_unpackhi_epi64(h00, h01); + h0x_low = _mm256_add_epi64(h0x_low, h0x_high); + const __m128i h0x_128bit = 
_mm_add_epi64(_mm256_extracti128_si256(h0x_low, 1), + _mm256_castsi256_si128(h0x_low)); + + // Using the symmetric properties of H, calculations of H[1][0] are not + // needed. + __m256i h1x_low = _mm256_unpacklo_epi64(zero, h11); + const __m256i h1x_high = _mm256_unpackhi_epi64(zero, h11); + h1x_low = _mm256_add_epi64(h1x_low, h1x_high); + const __m128i h1x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h1x_low, 1), + _mm256_castsi256_si128(h1x_low)); + + xx_storeu_128(C, c_128bit); + xx_storeu_128(H[0], h0x_128bit); + xx_storeu_128(H[1], h1x_128bit); + + H[0][0] /= size; + H[0][1] /= size; + H[1][1] /= size; + + // Since H is a symmetric matrix + H[1][0] = H[0][1]; + C[0] /= size; + C[1] /= size; +} + +static AOM_INLINE void calc_proj_params_r0_high_bd_avx2( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + __m256i h00, c0; + const __m256i zero = _mm256_setzero_si256(); + c0 = h00 = zero; + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; j += 8) { + const __m256i u_load = _mm256_cvtepu16_epi32( + _mm_load_si128((__m128i *)(dat + i * dat_stride + j))); + const __m256i s_load = _mm256_cvtepu16_epi32( + _mm_load_si128((__m128i *)(src + i * src_stride + j))); + __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j)); + __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS); + __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS); + s = _mm256_sub_epi32(s, d); + f1 = _mm256_sub_epi32(f1, d); + + const __m256i h00_even = _mm256_mul_epi32(f1, f1); + const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), + _mm256_srli_epi64(f1, 32)); + h00 = _mm256_add_epi64(h00, h00_even); + h00 = _mm256_add_epi64(h00, h00_odd); + + const __m256i c0_even = _mm256_mul_epi32(f1, s); + const __m256i c0_odd = + _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32)); + c0 = _mm256_add_epi64(c0, c0_even); + c0 = _mm256_add_epi64(c0, c0_odd); + } + } + const __m128i h00_128bit = _mm_add_epi64(_mm256_extracti128_si256(h00, 1), + _mm256_castsi256_si128(h00)); + const __m128i h00_val = + _mm_add_epi64(h00_128bit, _mm_srli_si128(h00_128bit, 8)); + + const __m128i c0_128bit = _mm_add_epi64(_mm256_extracti128_si256(c0, 1), + _mm256_castsi256_si128(c0)); + const __m128i c0_val = _mm_add_epi64(c0_128bit, _mm_srli_si128(c0_128bit, 8)); + + const __m128i c = _mm_unpacklo_epi64(c0_val, _mm256_castsi256_si128(zero)); + const __m128i h0x = _mm_unpacklo_epi64(h00_val, _mm256_castsi256_si128(zero)); + + xx_storeu_128(C, c); + xx_storeu_128(H[0], h0x); + + H[0][0] /= size; + C[0] /= size; +} + +static AOM_INLINE void calc_proj_params_r1_high_bd_avx2( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride, + int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + __m256i h11, c1; + const __m256i zero = _mm256_setzero_si256(); + c1 = h11 = zero; + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; j += 8) { + const __m256i u_load = _mm256_cvtepu16_epi32( + _mm_load_si128((__m128i *)(dat + i * dat_stride + j))); + const __m256i s_load = _mm256_cvtepu16_epi32( + _mm_load_si128((__m128i *)(src + i * src_stride + 
j))); + __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j)); + __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS); + __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS); + s = _mm256_sub_epi32(s, d); + f2 = _mm256_sub_epi32(f2, d); + + const __m256i h11_even = _mm256_mul_epi32(f2, f2); + const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), + _mm256_srli_epi64(f2, 32)); + h11 = _mm256_add_epi64(h11, h11_even); + h11 = _mm256_add_epi64(h11, h11_odd); + + const __m256i c1_even = _mm256_mul_epi32(f2, s); + const __m256i c1_odd = + _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32)); + c1 = _mm256_add_epi64(c1, c1_even); + c1 = _mm256_add_epi64(c1, c1_odd); + } + } + + const __m128i h11_128bit = _mm_add_epi64(_mm256_extracti128_si256(h11, 1), + _mm256_castsi256_si128(h11)); + const __m128i h11_val = + _mm_add_epi64(h11_128bit, _mm_srli_si128(h11_128bit, 8)); + + const __m128i c1_128bit = _mm_add_epi64(_mm256_extracti128_si256(c1, 1), + _mm256_castsi256_si128(c1)); + const __m128i c1_val = _mm_add_epi64(c1_128bit, _mm_srli_si128(c1_128bit, 8)); + + const __m128i c = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), c1_val); + const __m128i h1x = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), h11_val); + + xx_storeu_128(C, c); + xx_storeu_128(H[1], h1x); + + H[1][1] /= size; + C[1] /= size; +} + +// AVX2 variant of av1_calc_proj_params_high_bd_c. +void av1_calc_proj_params_high_bd_avx2(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, int dat_stride, + int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, + int64_t H[2][2], int64_t C[2], + const sgr_params_type *params) { + if ((params->r[0] > 0) && (params->r[1] > 0)) { + calc_proj_params_r0_r1_high_bd_avx2(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, flt1, + flt1_stride, H, C); + } else if (params->r[0] > 0) { + calc_proj_params_r0_high_bd_avx2(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, H, C); + } else if (params->r[1] > 0) { + calc_proj_params_r1_high_bd_avx2(src8, width, height, src_stride, dat8, + dat_stride, flt1, flt1_stride, H, C); + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +int64_t av1_highbd_pixel_proj_error_avx2( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) { + int i, j, k; + const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS; + const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1)); + __m256i sum64 = _mm256_setzero_si256(); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + int64_t err = 0; + if (params->r[0] > 0 && params->r[1] > 0) { // Both filters are enabled + const __m256i xq0 = _mm256_set1_epi32(xq[0]); + const __m256i xq1 = _mm256_set1_epi32(xq[1]); + for (i = 0; i < height; ++i) { + __m256i sum32 = _mm256_setzero_si256(); + for (j = 0; j <= width - 16; j += 16) { // Process 16 pixels at a time + // Load 16 pixels each from source image and corrupted image + const __m256i s0 = yy_loadu_256(src + j); + const __m256i d0 = yy_loadu_256(dat + j); + // s0 = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as u16 (indices) + + // Shift-up each pixel to match filtered image scaling + const __m256i u0 = _mm256_slli_epi16(d0, SGRPROJ_RST_BITS); + + // Split u0 into two halves and pad each from u16 to i32 + const __m256i u0l = 
_mm256_cvtepu16_epi32(_mm256_castsi256_si128(u0)); + const __m256i u0h = + _mm256_cvtepu16_epi32(_mm256_extracti128_si256(u0, 1)); + // u0h, u0l = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as u32 + + // Load 16 pixels from each filtered image + const __m256i flt0l = yy_loadu_256(flt0 + j); + const __m256i flt0h = yy_loadu_256(flt0 + j + 8); + const __m256i flt1l = yy_loadu_256(flt1 + j); + const __m256i flt1h = yy_loadu_256(flt1 + j + 8); + // flt?l, flt?h = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as u32 + + // Subtract shifted corrupt image from each filtered image + const __m256i flt0l_subu = _mm256_sub_epi32(flt0l, u0l); + const __m256i flt0h_subu = _mm256_sub_epi32(flt0h, u0h); + const __m256i flt1l_subu = _mm256_sub_epi32(flt1l, u0l); + const __m256i flt1h_subu = _mm256_sub_epi32(flt1h, u0h); + + // Multiply basis vectors by appropriate coefficients + const __m256i v0l = _mm256_mullo_epi32(flt0l_subu, xq0); + const __m256i v0h = _mm256_mullo_epi32(flt0h_subu, xq0); + const __m256i v1l = _mm256_mullo_epi32(flt1l_subu, xq1); + const __m256i v1h = _mm256_mullo_epi32(flt1h_subu, xq1); + + // Add together the contributions from the two basis vectors + const __m256i vl = _mm256_add_epi32(v0l, v1l); + const __m256i vh = _mm256_add_epi32(v0h, v1h); + + // Right-shift v with appropriate rounding + const __m256i vrl = + _mm256_srai_epi32(_mm256_add_epi32(vl, rounding), shift); + const __m256i vrh = + _mm256_srai_epi32(_mm256_add_epi32(vh, rounding), shift); + // vrh, vrl = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] + + // Saturate each i32 to an i16 then combine both halves + // The permute (control=[3 1 2 0]) fixes weird ordering from AVX lanes + const __m256i vr = + _mm256_permute4x64_epi64(_mm256_packs_epi32(vrl, vrh), 0xd8); + // intermediate = [15 14 13 12 7 6 5 4] [11 10 9 8 3 2 1 0] + // vr = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] + + // Add twin-subspace-sgr-filter to corrupt image then subtract source + const __m256i e0 = _mm256_sub_epi16(_mm256_add_epi16(vr, d0), s0); + + // Calculate squared error and add adjacent values + const __m256i err0 = _mm256_madd_epi16(e0, e0); + + sum32 = _mm256_add_epi32(sum32, err0); + } + + const __m256i sum32l = + _mm256_cvtepu32_epi64(_mm256_castsi256_si128(sum32)); + sum64 = _mm256_add_epi64(sum64, sum32l); + const __m256i sum32h = + _mm256_cvtepu32_epi64(_mm256_extracti128_si256(sum32, 1)); + sum64 = _mm256_add_epi64(sum64, sum32h); + + // Process remaining pixels in this row (modulo 16) + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt0 += flt0_stride; + flt1 += flt1_stride; + } + } else if (params->r[0] > 0 || params->r[1] > 0) { // Only one filter enabled + const int32_t xq_on = (params->r[0] > 0) ? xq[0] : xq[1]; + const __m256i xq_active = _mm256_set1_epi32(xq_on); + const __m256i xq_inactive = + _mm256_set1_epi32(-xq_on * (1 << SGRPROJ_RST_BITS)); + const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1; + const int flt_stride = (params->r[0] > 0) ? 
flt0_stride : flt1_stride; + for (i = 0; i < height; ++i) { + __m256i sum32 = _mm256_setzero_si256(); + for (j = 0; j <= width - 16; j += 16) { + // Load 16 pixels from source image + const __m256i s0 = yy_loadu_256(src + j); + // s0 = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as u16 + + // Load 16 pixels from corrupted image and pad each u16 to i32 + const __m256i d0 = yy_loadu_256(dat + j); + const __m256i d0h = + _mm256_cvtepu16_epi32(_mm256_extracti128_si256(d0, 1)); + const __m256i d0l = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(d0)); + // d0 = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as u16 + // d0h, d0l = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as i32 + + // Load 16 pixels from the filtered image + const __m256i flth = yy_loadu_256(flt + j + 8); + const __m256i fltl = yy_loadu_256(flt + j); + // flth, fltl = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as i32 + + const __m256i flth_xq = _mm256_mullo_epi32(flth, xq_active); + const __m256i fltl_xq = _mm256_mullo_epi32(fltl, xq_active); + const __m256i d0h_xq = _mm256_mullo_epi32(d0h, xq_inactive); + const __m256i d0l_xq = _mm256_mullo_epi32(d0l, xq_inactive); + + const __m256i vh = _mm256_add_epi32(flth_xq, d0h_xq); + const __m256i vl = _mm256_add_epi32(fltl_xq, d0l_xq); + + // Shift this down with appropriate rounding + const __m256i vrh = + _mm256_srai_epi32(_mm256_add_epi32(vh, rounding), shift); + const __m256i vrl = + _mm256_srai_epi32(_mm256_add_epi32(vl, rounding), shift); + // vrh, vrl = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as i32 + + // Saturate each i32 to an i16 then combine both halves + // The permute (control=[3 1 2 0]) fixes weird ordering from AVX lanes + const __m256i vr = + _mm256_permute4x64_epi64(_mm256_packs_epi32(vrl, vrh), 0xd8); + // intermediate = [15 14 13 12 7 6 5 4] [11 10 9 8 3 2 1 0] as u16 + // vr = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as u16 + + // Subtract twin-subspace-sgr filtered from source image to get error + const __m256i e0 = _mm256_sub_epi16(_mm256_add_epi16(vr, d0), s0); + + // Calculate squared error and add adjacent values + const __m256i err0 = _mm256_madd_epi16(e0, e0); + + sum32 = _mm256_add_epi32(sum32, err0); + } + + const __m256i sum32l = + _mm256_cvtepu32_epi64(_mm256_castsi256_si128(sum32)); + sum64 = _mm256_add_epi64(sum64, sum32l); + const __m256i sum32h = + _mm256_cvtepu32_epi64(_mm256_extracti128_si256(sum32, 1)); + sum64 = _mm256_add_epi64(sum64, sum32h); + + // Process remaining pixels in this row (modulo 16) + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq_on * (flt[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt += flt_stride; + } + } else { // Neither filter is enabled + for (i = 0; i < height; ++i) { + __m256i sum32 = _mm256_setzero_si256(); + for (j = 0; j <= width - 32; j += 32) { + // Load 2x16 u16 from source image + const __m256i s0l = yy_loadu_256(src + j); + const __m256i s0h = yy_loadu_256(src + j + 16); + + // Load 2x16 u16 from corrupted image + const __m256i d0l = yy_loadu_256(dat + j); + const __m256i d0h = yy_loadu_256(dat + j + 16); + + // Subtract corrupted image from source image + const __m256i diffl = _mm256_sub_epi16(d0l, s0l); + const __m256i diffh = _mm256_sub_epi16(d0h, s0h); + + // Square error and add adjacent values + const __m256i err0l = _mm256_madd_epi16(diffl, diffl); + const __m256i err0h = _mm256_madd_epi16(diffh, diffh); + + sum32 = 
_mm256_add_epi32(sum32, err0l);
+ sum32 = _mm256_add_epi32(sum32, err0h);
+ }
+
+ const __m256i sum32l =
+ _mm256_cvtepu32_epi64(_mm256_castsi256_si128(sum32));
+ sum64 = _mm256_add_epi64(sum64, sum32l);
+ const __m256i sum32h =
+ _mm256_cvtepu32_epi64(_mm256_extracti128_si256(sum32, 1));
+ sum64 = _mm256_add_epi64(sum64, sum32h);
+
+ // Process remaining pixels in this row (modulo 32)
+ for (k = j; k < width; ++k) {
+ const int32_t e = (int32_t)(dat[k]) - src[k];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ }
+ }
+
+ // Sum the 4 lanes of sum64 into err
+ int64_t sum[4];
+ yy_storeu_256(sum, sum64);
+ err += sum[0] + sum[1] + sum[2] + sum[3];
+ return err;
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/av1/encoder/x86/pickrst_sse4.c b/third_party/aom/av1/encoder/x86/pickrst_sse4.c
new file mode 100644
index 0000000000..50db305802
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/pickrst_sse4.c
@@ -0,0 +1,1483 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include "aom_dsp/x86/mem_sse2.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#include "config/av1_rtcd.h"
+#include "av1/common/restoration.h"
+#include "av1/encoder/pickrst.h"
+
+static INLINE void acc_stat_sse41(int32_t *dst, const uint8_t *src,
+ const __m128i *shuffle, const __m128i *kl) {
+ const __m128i s = _mm_shuffle_epi8(xx_loadu_128(src), *shuffle);
+ const __m128i d0 = _mm_madd_epi16(*kl, _mm_cvtepu8_epi16(s));
+ const __m128i d1 =
+ _mm_madd_epi16(*kl, _mm_cvtepu8_epi16(_mm_srli_si128(s, 8)));
+ const __m128i dst0 = xx_loadu_128(dst);
+ const __m128i dst1 = xx_loadu_128(dst + 4);
+ const __m128i r0 = _mm_add_epi32(dst0, d0);
+ const __m128i r1 = _mm_add_epi32(dst1, d1);
+ xx_storeu_128(dst, r0);
+ xx_storeu_128(dst + 4, r1);
+}
+
+static INLINE void acc_stat_win7_one_line_sse4_1(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
+ int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+ int32_t sumY[WIENER_WIN][WIENER_WIN], int32_t M_int[WIENER_WIN][WIENER_WIN],
+ int32_t H_int[WIENER_WIN2][WIENER_WIN * 8]) {
+ const int wiener_win = 7;
+ int j, k, l;
+ // Main loop handles two pixels at a time
+ // We can assume that h_start is even, since it will always be aligned to
+ // a tile edge + some number of restoration units, and both of those will
+ // be 64-pixel aligned.
+ // However, at the edge of the image, h_end may be odd, so we need to handle
+ // that case correctly.
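+ // Scalar sketch of one acc_stat_sse41 call below (comment only; assumes
+ // the shuffle control interleaves overlapping pixel pairs row[t], row[t+1]):
+ //   for (int t = 0; t < 8; ++t)
+ //     dst[t] += D1 * row[t] + D2 * row[t + 1];
+ // where (D1, D2) is the pixel pair broadcast in 'kl' and 'row' is the
+ // window line passed as 'src'.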
+ assert(h_start % 2 == 0); + const int h_end_even = h_end & ~1; + const int has_odd_pixel = h_end & 1; + for (j = h_start; j < h_end_even; j += 2) { + const uint8_t *dgd_ij = dgd + j; + const uint8_t X1 = src[j]; + const uint8_t X2 = src[j + 1]; + *sumX += X1 + X2; + for (k = 0; k < wiener_win; k++) { + const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int32_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint8_t D1 = dgd_ijk[l]; + const uint8_t D2 = dgd_ijk[l + 1]; + sumY[k][l] += D1 + D2; + M_int[k][l] += D1 * X1 + D2 * X2; + + const __m128i kl = + _mm_cvtepu8_epi16(_mm_set1_epi16(loadu_int16(dgd_ijk + l))); + acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl); + } + } + } + // If the width is odd, add in the final pixel + if (has_odd_pixel) { + const uint8_t *dgd_ij = dgd + j; + const uint8_t X1 = src[j]; + *sumX += X1; + for (k = 0; k < wiener_win; k++) { + const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int32_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint8_t D1 = dgd_ijk[l]; + sumY[k][l] += D1; + M_int[k][l] += D1 * X1; + + // The `acc_stat_sse41` function wants its input to have interleaved + // copies of two pixels, but we only have one. However, the pixels + // are (effectively) used as inputs to a multiply-accumulate. + // So if we set the extra pixel slot to 0, then it is effectively + // ignored. + const __m128i kl = _mm_cvtepu8_epi16(_mm_set1_epi16((int16_t)D1)); + acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl); + } + } + } +} + +static INLINE void compute_stats_win7_opt_sse4_1( + const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start, + int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, + int use_downsampled_wiener_stats) { + int i, j, k, l, m, n; + const int wiener_win = WIENER_WIN; + const int pixel_count = (h_end - h_start) * (v_end - v_start); + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + const uint8_t avg = + find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + int32_t M_int32[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int32_t M_int32_row[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int64_t M_int64[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int32_t H_int32[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } }; + int32_t H_int32_row[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } }; + int64_t H_int64[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } }; + int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int32_t sumX = 0; + const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; + int downsample_factor = + use_downsampled_wiener_stats ? 
WIENER_STATS_DOWNSAMPLE_FACTOR : 1; + int32_t sumX_row = 0; + int32_t sumY_row[WIENER_WIN][WIENER_WIN] = { { 0 } }; + + const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data); + for (j = v_start; j < v_end; j += 64) { + const int vert_end = AOMMIN(64, v_end - j) + j; + for (i = j; i < vert_end; i = i + downsample_factor) { + if (use_downsampled_wiener_stats && + (vert_end - i < WIENER_STATS_DOWNSAMPLE_FACTOR)) { + downsample_factor = vert_end - i; + } + sumX_row = 0; + memset(sumY_row, 0, sizeof(int32_t) * WIENER_WIN * WIENER_WIN); + memset(M_int32_row, 0, sizeof(int32_t) * WIENER_WIN * WIENER_WIN); + memset(H_int32_row, 0, sizeof(int32_t) * WIENER_WIN2 * (WIENER_WIN * 8)); + acc_stat_win7_one_line_sse4_1( + dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, + dgd_stride, &shuffle, &sumX_row, sumY_row, M_int32_row, H_int32_row); + sumX += sumX_row * downsample_factor; + // Scale M matrix based on the downsampling factor + for (k = 0; k < wiener_win; ++k) { + for (l = 0; l < wiener_win; ++l) { + sumY[k][l] += (sumY_row[k][l] * downsample_factor); + M_int32[k][l] += (M_int32_row[k][l] * downsample_factor); + } + } + // Scale H matrix based on the downsampling factor + for (k = 0; k < WIENER_WIN2; ++k) { + for (l = 0; l < WIENER_WIN * 8; ++l) { + H_int32[k][l] += (H_int32_row[k][l] * downsample_factor); + } + } + } + for (k = 0; k < wiener_win; ++k) { + for (l = 0; l < wiener_win; ++l) { + M_int64[k][l] += M_int32[k][l]; + M_int32[k][l] = 0; + } + } + for (k = 0; k < WIENER_WIN2; ++k) { + for (l = 0; l < WIENER_WIN * 8; ++l) { + H_int64[k][l] += H_int32[k][l]; + H_int32[k][l] = 0; + } + } + } + + const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count; + for (k = 0; k < wiener_win; k++) { + for (l = 0; l < wiener_win; l++) { + const int32_t idx0 = l * wiener_win + k; + M[idx0] = + M_int64[k][l] + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l])); + int64_t *H_ = H + idx0 * wiener_win2; + int64_t *H_int_ = &H_int64[idx0][0]; + for (m = 0; m < wiener_win; m++) { + for (n = 0; n < wiener_win; n++) { + H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum - + (int64_t)avg * (sumY[k][l] + sumY[n][m]); + } + } + } + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void acc_stat_highbd_sse41(int64_t *dst, const uint16_t *dgd, + const __m128i *shuffle, + const __m128i *dgd_ijkl) { + // Load 256 bits from dgd in two chunks + const __m128i s0l = xx_loadu_128(dgd); + const __m128i s0h = xx_loadu_128(dgd + 4); + // s0l = [7 6 5 4 3 2 1 0] as u16 values (dgd indices) + // s0h = [11 10 9 8 7 6 5 4] as u16 values (dgd indices) + // (Slightly strange order so we can apply the same shuffle to both halves) + + // Shuffle the u16 values in each half (actually using 8-bit shuffle mask) + const __m128i s1l = _mm_shuffle_epi8(s0l, *shuffle); + const __m128i s1h = _mm_shuffle_epi8(s0h, *shuffle); + // s1l = [4 3 3 2 2 1 1 0] as u16 values (dgd indices) + // s1h = [8 7 7 6 6 5 5 4] as u16 values (dgd indices) + + // Multiply s1 by dgd_ijkl resulting in 8x u32 values + // Horizontally add pairs of u32 resulting in 4x u32 + const __m128i dl = _mm_madd_epi16(*dgd_ijkl, s1l); + const __m128i dh = _mm_madd_epi16(*dgd_ijkl, s1h); + // dl = [d c b a] as u32 values + // dh = [h g f e] as u32 values + + // Add these 8x u32 results on to dst in four parts + const __m128i dll = _mm_cvtepu32_epi64(dl); + const __m128i dlh = _mm_cvtepu32_epi64(_mm_srli_si128(dl, 8)); + const __m128i dhl = _mm_cvtepu32_epi64(dh); + const __m128i dhh = _mm_cvtepu32_epi64(_mm_srli_si128(dh, 8)); + // 
dll = [b a] as u64 values, etc. + + const __m128i rll = _mm_add_epi64(xx_loadu_128(dst), dll); + xx_storeu_128(dst, rll); + const __m128i rlh = _mm_add_epi64(xx_loadu_128(dst + 2), dlh); + xx_storeu_128(dst + 2, rlh); + const __m128i rhl = _mm_add_epi64(xx_loadu_128(dst + 4), dhl); + xx_storeu_128(dst + 4, rhl); + const __m128i rhh = _mm_add_epi64(xx_loadu_128(dst + 6), dhh); + xx_storeu_128(dst + 6, rhh); +} + +static INLINE void acc_stat_highbd_win7_one_line_sse4_1( + const uint16_t *dgd, const uint16_t *src, int h_start, int h_end, + int dgd_stride, const __m128i *shuffle, int32_t *sumX, + int32_t sumY[WIENER_WIN][WIENER_WIN], int64_t M_int[WIENER_WIN][WIENER_WIN], + int64_t H_int[WIENER_WIN2][WIENER_WIN * 8]) { + int j, k, l; + const int wiener_win = WIENER_WIN; + // Main loop handles two pixels at a time + // We can assume that h_start is even, since it will always be aligned to + // a tile edge + some number of restoration units, and both of those will + // be 64-pixel aligned. + // However, at the edge of the image, h_end may be odd, so we need to handle + // that case correctly. + assert(h_start % 2 == 0); + const int h_end_even = h_end & ~1; + const int has_odd_pixel = h_end & 1; + for (j = h_start; j < h_end_even; j += 2) { + const uint16_t X1 = src[j]; + const uint16_t X2 = src[j + 1]; + *sumX += X1 + X2; + const uint16_t *dgd_ij = dgd + j; + for (k = 0; k < wiener_win; k++) { + const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int64_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint16_t D1 = dgd_ijk[l]; + const uint16_t D2 = dgd_ijk[l + 1]; + sumY[k][l] += D1 + D2; + M_int[k][l] += D1 * X1 + D2 * X2; + + // Load two u16 values from dgd as a single u32 + // Then broadcast to 4x u32 slots of a 128 + const __m128i dgd_ijkl = _mm_set1_epi32(loadu_int32(dgd_ijk + l)); + // dgd_ijkl = [y x y x y x y x] as u16 + + acc_stat_highbd_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, + &dgd_ijkl); + } + } + } + // If the width is odd, add in the final pixel + if (has_odd_pixel) { + const uint16_t X1 = src[j]; + *sumX += X1; + const uint16_t *dgd_ij = dgd + j; + for (k = 0; k < wiener_win; k++) { + const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int64_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint16_t D1 = dgd_ijk[l]; + sumY[k][l] += D1; + M_int[k][l] += D1 * X1; + + // The `acc_stat_highbd_sse41` function wants its input to have + // interleaved copies of two pixels, but we only have one. However, the + // pixels are (effectively) used as inputs to a multiply-accumulate. So + // if we set the extra pixel slot to 0, then it is effectively ignored. 
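+ // Why _mm_set1_epi32((int)D1) implements that: D1 lands in the low u16 of
+ // each 32-bit lane and the high u16 is 0, so the madd_epi16 inside
+ // acc_stat_highbd_sse41 evaluates D1 * p + 0 * q per lane.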
+ const __m128i dgd_ijkl = _mm_set1_epi32((int)D1); + + acc_stat_highbd_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, + &dgd_ijkl); + } + } + } +} + +static INLINE void compute_stats_highbd_win7_opt_sse4_1( + const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, + int64_t *H, aom_bit_depth_t bit_depth) { + int i, j, k, l, m, n; + const int wiener_win = WIENER_WIN; + const int pixel_count = (h_end - h_start) * (v_end - v_start); + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8); + const uint16_t avg = + find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + int64_t M_int[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int64_t H_int[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } }; + int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int32_t sumX = 0; + const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; + + // Load just half of the 256-bit shuffle control used for the AVX2 version + const __m128i shuffle = xx_loadu_128(g_shuffle_stats_highbd_data); + for (j = v_start; j < v_end; j += 64) { + const int vert_end = AOMMIN(64, v_end - j) + j; + for (i = j; i < vert_end; i++) { + acc_stat_highbd_win7_one_line_sse4_1( + dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, + dgd_stride, &shuffle, &sumX, sumY, M_int, H_int); + } + } + + uint8_t bit_depth_divider = 1; + if (bit_depth == AOM_BITS_12) + bit_depth_divider = 16; + else if (bit_depth == AOM_BITS_10) + bit_depth_divider = 4; + + const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count; + for (k = 0; k < wiener_win; k++) { + for (l = 0; l < wiener_win; l++) { + const int32_t idx0 = l * wiener_win + k; + M[idx0] = (M_int[k][l] + + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]))) / + bit_depth_divider; + int64_t *H_ = H + idx0 * wiener_win2; + int64_t *H_int_ = &H_int[idx0][0]; + for (m = 0; m < wiener_win; m++) { + for (n = 0; n < wiener_win; n++) { + H_[m * wiener_win + n] = + (H_int_[n * 8 + m] + + (avg_square_sum - (int64_t)avg * (sumY[k][l] + sumY[n][m]))) / + bit_depth_divider; + } + } + } + } +} + +static INLINE void acc_stat_highbd_win5_one_line_sse4_1( + const uint16_t *dgd, const uint16_t *src, int h_start, int h_end, + int dgd_stride, const __m128i *shuffle, int32_t *sumX, + int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], + int64_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], + int64_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) { + int j, k, l; + const int wiener_win = WIENER_WIN_CHROMA; + // Main loop handles two pixels at a time + // We can assume that h_start is even, since it will always be aligned to + // a tile edge + some number of restoration units, and both of those will + // be 64-pixel aligned. + // However, at the edge of the image, h_end may be odd, so we need to handle + // that case correctly. 
+ assert(h_start % 2 == 0); + const int h_end_even = h_end & ~1; + const int has_odd_pixel = h_end & 1; + for (j = h_start; j < h_end_even; j += 2) { + const uint16_t X1 = src[j]; + const uint16_t X2 = src[j + 1]; + *sumX += X1 + X2; + const uint16_t *dgd_ij = dgd + j; + for (k = 0; k < wiener_win; k++) { + const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int64_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint16_t D1 = dgd_ijk[l]; + const uint16_t D2 = dgd_ijk[l + 1]; + sumY[k][l] += D1 + D2; + M_int[k][l] += D1 * X1 + D2 * X2; + + // Load two u16 values from dgd as a single u32 + // then broadcast to 4x u32 slots of a 128 + const __m128i dgd_ijkl = _mm_set1_epi32(loadu_int32(dgd_ijk + l)); + // dgd_ijkl = [y x y x y x y x] as u16 + + acc_stat_highbd_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, + &dgd_ijkl); + } + } + } + // If the width is odd, add in the final pixel + if (has_odd_pixel) { + const uint16_t X1 = src[j]; + *sumX += X1; + const uint16_t *dgd_ij = dgd + j; + for (k = 0; k < wiener_win; k++) { + const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int64_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint16_t D1 = dgd_ijk[l]; + sumY[k][l] += D1; + M_int[k][l] += D1 * X1; + + // The `acc_stat_highbd_sse41` function wants its input to have + // interleaved copies of two pixels, but we only have one. However, the + // pixels are (effectively) used as inputs to a multiply-accumulate. So + // if we set the extra pixel slot to 0, then it is effectively ignored. 
+ const __m128i dgd_ijkl = _mm_set1_epi32((int)D1); + + acc_stat_highbd_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, + &dgd_ijkl); + } + } + } +} + +static INLINE void compute_stats_highbd_win5_opt_sse4_1( + const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, + int64_t *H, aom_bit_depth_t bit_depth) { + int i, j, k, l, m, n; + const int wiener_win = WIENER_WIN_CHROMA; + const int pixel_count = (h_end - h_start) * (v_end - v_start); + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8); + const uint16_t avg = + find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + int64_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int64_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } }; + int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int32_t sumX = 0; + const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; + + // Load just half of the 256-bit shuffle control used for the AVX2 version + const __m128i shuffle = xx_loadu_128(g_shuffle_stats_highbd_data); + for (j = v_start; j < v_end; j += 64) { + const int vert_end = AOMMIN(64, v_end - j) + j; + for (i = j; i < vert_end; i++) { + acc_stat_highbd_win5_one_line_sse4_1( + dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, + dgd_stride, &shuffle, &sumX, sumY, M_int, H_int); + } + } + + uint8_t bit_depth_divider = 1; + if (bit_depth == AOM_BITS_12) + bit_depth_divider = 16; + else if (bit_depth == AOM_BITS_10) + bit_depth_divider = 4; + + const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count; + for (k = 0; k < wiener_win; k++) { + for (l = 0; l < wiener_win; l++) { + const int32_t idx0 = l * wiener_win + k; + M[idx0] = (M_int[k][l] + + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]))) / + bit_depth_divider; + int64_t *H_ = H + idx0 * wiener_win2; + int64_t *H_int_ = &H_int[idx0][0]; + for (m = 0; m < wiener_win; m++) { + for (n = 0; n < wiener_win; n++) { + H_[m * wiener_win + n] = + (H_int_[n * 8 + m] + + (avg_square_sum - (int64_t)avg * (sumY[k][l] + sumY[n][m]))) / + bit_depth_divider; + } + } + } + } +} + +void av1_compute_stats_highbd_sse4_1(int wiener_win, const uint8_t *dgd8, + const uint8_t *src8, int h_start, + int h_end, int v_start, int v_end, + int dgd_stride, int src_stride, int64_t *M, + int64_t *H, aom_bit_depth_t bit_depth) { + if (wiener_win == WIENER_WIN) { + compute_stats_highbd_win7_opt_sse4_1(dgd8, src8, h_start, h_end, v_start, + v_end, dgd_stride, src_stride, M, H, + bit_depth); + } else if (wiener_win == WIENER_WIN_CHROMA) { + compute_stats_highbd_win5_opt_sse4_1(dgd8, src8, h_start, h_end, v_start, + v_end, dgd_stride, src_stride, M, H, + bit_depth); + } else { + av1_compute_stats_highbd_c(wiener_win, dgd8, src8, h_start, h_end, v_start, + v_end, dgd_stride, src_stride, M, H, bit_depth); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static INLINE void acc_stat_win5_one_line_sse4_1( + const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, + 
int dgd_stride, const __m128i *shuffle, int32_t *sumX, + int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], + int32_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], + int32_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) { + const int wiener_win = WIENER_WIN_CHROMA; + int j, k, l; + // Main loop handles two pixels at a time + // We can assume that h_start is even, since it will always be aligned to + // a tile edge + some number of restoration units, and both of those will + // be 64-pixel aligned. + // However, at the edge of the image, h_end may be odd, so we need to handle + // that case correctly. + assert(h_start % 2 == 0); + const int h_end_even = h_end & ~1; + const int has_odd_pixel = h_end & 1; + for (j = h_start; j < h_end_even; j += 2) { + const uint8_t *dgd_ij = dgd + j; + const uint8_t X1 = src[j]; + const uint8_t X2 = src[j + 1]; + *sumX += X1 + X2; + for (k = 0; k < wiener_win; k++) { + const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int32_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint8_t D1 = dgd_ijk[l]; + const uint8_t D2 = dgd_ijk[l + 1]; + sumY[k][l] += D1 + D2; + M_int[k][l] += D1 * X1 + D2 * X2; + + const __m128i kl = + _mm_cvtepu8_epi16(_mm_set1_epi16(loadu_int16(dgd_ijk + l))); + acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl); + } + } + } + // If the width is odd, add in the final pixel + if (has_odd_pixel) { + const uint8_t *dgd_ij = dgd + j; + const uint8_t X1 = src[j]; + *sumX += X1; + for (k = 0; k < wiener_win; k++) { + const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int32_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint8_t D1 = dgd_ijk[l]; + sumY[k][l] += D1; + M_int[k][l] += D1 * X1; + + // The `acc_stat_sse41` function wants its input to have interleaved + // copies of two pixels, but we only have one. However, the pixels + // are (effectively) used as inputs to a multiply-accumulate. + // So if we set the extra pixel slot to 0, then it is effectively + // ignored. 
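+        // Layout sketch: broadcasting the u8 value D1 as an i16 and then
+        // zero-extending each byte gives kl = [0 D1 0 D1 0 D1 0 D1] as i16
+        // (e.g. D1 = 7 -> [0 7 0 7 0 7 0 7]), so the zero lanes play the
+        // part of the missing second pixel.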
+ const __m128i kl = _mm_cvtepu8_epi16(_mm_set1_epi16((int16_t)D1)); + acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl); + } + } + } +} + +static INLINE void compute_stats_win5_opt_sse4_1( + const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start, + int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, + int use_downsampled_wiener_stats) { + int i, j, k, l, m, n; + const int wiener_win = WIENER_WIN_CHROMA; + const int pixel_count = (h_end - h_start) * (v_end - v_start); + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + const uint8_t avg = + find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + int32_t M_int32[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int32_t M_int32_row[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int32_t H_int32[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } }; + int32_t H_int32_row[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } }; + int64_t H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } }; + int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int32_t sumX = 0; + const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; + int downsample_factor = + use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; + int32_t sumX_row = 0; + int32_t sumY_row[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + + const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data); + for (j = v_start; j < v_end; j += 64) { + const int vert_end = AOMMIN(64, v_end - j) + j; + for (i = j; i < vert_end; i = i + downsample_factor) { + if (use_downsampled_wiener_stats && + (vert_end - i < WIENER_STATS_DOWNSAMPLE_FACTOR)) { + downsample_factor = vert_end - i; + } + sumX_row = 0; + memset(sumY_row, 0, + sizeof(int32_t) * WIENER_WIN_CHROMA * WIENER_WIN_CHROMA); + memset(M_int32_row, 0, + sizeof(int32_t) * WIENER_WIN_CHROMA * WIENER_WIN_CHROMA); + memset(H_int32_row, 0, + sizeof(int32_t) * WIENER_WIN2_CHROMA * (WIENER_WIN_CHROMA * 8)); + acc_stat_win5_one_line_sse4_1( + dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, + dgd_stride, &shuffle, &sumX_row, sumY_row, M_int32_row, H_int32_row); + sumX += sumX_row * downsample_factor; + // Scale M matrix based on the downsampling factor + for (k = 0; k < wiener_win; ++k) { + for (l = 0; l < wiener_win; ++l) { + sumY[k][l] += (sumY_row[k][l] * downsample_factor); + M_int32[k][l] += (M_int32_row[k][l] * downsample_factor); + } + } + // Scale H matrix based on the downsampling factor + for (k = 0; k < WIENER_WIN_CHROMA * WIENER_WIN_CHROMA; ++k) { + for (l = 0; l < WIENER_WIN_CHROMA * 8; ++l) { + H_int32[k][l] += (H_int32_row[k][l] * downsample_factor); + } + } + } + for (k = 0; k < wiener_win; ++k) { + for (l = 0; l < wiener_win; ++l) { + M_int64[k][l] += M_int32[k][l]; + M_int32[k][l] = 0; + } + } + for (k = 0; k < WIENER_WIN_CHROMA * WIENER_WIN_CHROMA; ++k) { + for (l = 0; l < WIENER_WIN_CHROMA * 8; ++l) { + H_int64[k][l] += H_int32[k][l]; + H_int32[k][l] = 0; + } + } + } + + const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count; + for (k = 0; k < wiener_win; k++) { + for (l = 0; l < 
wiener_win; l++) { + const int32_t idx0 = l * wiener_win + k; + M[idx0] = + M_int64[k][l] + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l])); + int64_t *H_ = H + idx0 * wiener_win2; + int64_t *H_int_ = &H_int64[idx0][0]; + for (m = 0; m < wiener_win; m++) { + for (n = 0; n < wiener_win; n++) { + H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum - + (int64_t)avg * (sumY[k][l] + sumY[n][m]); + } + } + } + } +} +void av1_compute_stats_sse4_1(int wiener_win, const uint8_t *dgd, + const uint8_t *src, int16_t *dgd_avg, + int16_t *src_avg, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, + int src_stride, int64_t *M, int64_t *H, + int use_downsampled_wiener_stats) { + if (wiener_win == WIENER_WIN) { + compute_stats_win7_opt_sse4_1(dgd, src, h_start, h_end, v_start, v_end, + dgd_stride, src_stride, M, H, + use_downsampled_wiener_stats); + } else if (wiener_win == WIENER_WIN_CHROMA) { + compute_stats_win5_opt_sse4_1(dgd, src, h_start, h_end, v_start, v_end, + dgd_stride, src_stride, M, H, + use_downsampled_wiener_stats); + } else { + av1_compute_stats_c(wiener_win, dgd, src, dgd_avg, src_avg, h_start, h_end, + v_start, v_end, dgd_stride, src_stride, M, H, + use_downsampled_wiener_stats); + } +} + +static INLINE __m128i pair_set_epi16(int a, int b) { + return _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16))); +} + +int64_t av1_lowbd_pixel_proj_error_sse4_1( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) { + int i, j, k; + const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS; + const __m128i rounding = _mm_set1_epi32(1 << (shift - 1)); + __m128i sum64 = _mm_setzero_si128(); + const uint8_t *src = src8; + const uint8_t *dat = dat8; + int64_t err = 0; + if (params->r[0] > 0 && params->r[1] > 0) { + __m128i xq_coeff = pair_set_epi16(xq[0], xq[1]); + for (i = 0; i < height; ++i) { + __m128i sum32 = _mm_setzero_si128(); + for (j = 0; j <= width - 8; j += 8) { + const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j)); + const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j)); + const __m128i flt0_16b = + _mm_packs_epi32(xx_loadu_128(flt0 + j), xx_loadu_128(flt0 + j + 4)); + const __m128i flt1_16b = + _mm_packs_epi32(xx_loadu_128(flt1 + j), xx_loadu_128(flt1 + j + 4)); + const __m128i u0 = _mm_slli_epi16(d0, SGRPROJ_RST_BITS); + const __m128i flt0_0_sub_u = _mm_sub_epi16(flt0_16b, u0); + const __m128i flt1_0_sub_u = _mm_sub_epi16(flt1_16b, u0); + const __m128i v0 = _mm_madd_epi16( + xq_coeff, _mm_unpacklo_epi16(flt0_0_sub_u, flt1_0_sub_u)); + const __m128i v1 = _mm_madd_epi16( + xq_coeff, _mm_unpackhi_epi16(flt0_0_sub_u, flt1_0_sub_u)); + const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift); + const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift); + const __m128i e0 = + _mm_sub_epi16(_mm_add_epi16(_mm_packs_epi32(vr0, vr1), d0), s0); + const __m128i err0 = _mm_madd_epi16(e0, e0); + sum32 = _mm_add_epi32(sum32, err0); + } + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt0 += flt0_stride; + flt1 += flt1_stride; + const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32); + const __m128i sum64_1 = 
_mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8)); + sum64 = _mm_add_epi64(sum64, sum64_0); + sum64 = _mm_add_epi64(sum64, sum64_1); + } + } else if (params->r[0] > 0 || params->r[1] > 0) { + const int xq_active = (params->r[0] > 0) ? xq[0] : xq[1]; + const __m128i xq_coeff = + pair_set_epi16(xq_active, -xq_active * (1 << SGRPROJ_RST_BITS)); + const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1; + const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride; + for (i = 0; i < height; ++i) { + __m128i sum32 = _mm_setzero_si128(); + for (j = 0; j <= width - 8; j += 8) { + const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j)); + const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j)); + const __m128i flt_16b = + _mm_packs_epi32(xx_loadu_128(flt + j), xx_loadu_128(flt + j + 4)); + const __m128i v0 = + _mm_madd_epi16(xq_coeff, _mm_unpacklo_epi16(flt_16b, d0)); + const __m128i v1 = + _mm_madd_epi16(xq_coeff, _mm_unpackhi_epi16(flt_16b, d0)); + const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift); + const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift); + const __m128i e0 = + _mm_sub_epi16(_mm_add_epi16(_mm_packs_epi32(vr0, vr1), d0), s0); + const __m128i err0 = _mm_madd_epi16(e0, e0); + sum32 = _mm_add_epi32(sum32, err0); + } + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq_active * (flt[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt += flt_stride; + const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32); + const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8)); + sum64 = _mm_add_epi64(sum64, sum64_0); + sum64 = _mm_add_epi64(sum64, sum64_1); + } + } else { + __m128i sum32 = _mm_setzero_si128(); + for (i = 0; i < height; ++i) { + for (j = 0; j <= width - 16; j += 16) { + const __m128i d = xx_loadu_128(dat + j); + const __m128i s = xx_loadu_128(src + j); + const __m128i d0 = _mm_cvtepu8_epi16(d); + const __m128i d1 = _mm_cvtepu8_epi16(_mm_srli_si128(d, 8)); + const __m128i s0 = _mm_cvtepu8_epi16(s); + const __m128i s1 = _mm_cvtepu8_epi16(_mm_srli_si128(s, 8)); + const __m128i diff0 = _mm_sub_epi16(d0, s0); + const __m128i diff1 = _mm_sub_epi16(d1, s1); + const __m128i err0 = _mm_madd_epi16(diff0, diff0); + const __m128i err1 = _mm_madd_epi16(diff1, diff1); + sum32 = _mm_add_epi32(sum32, err0); + sum32 = _mm_add_epi32(sum32, err1); + } + for (k = j; k < width; ++k) { + const int32_t e = (int32_t)(dat[k]) - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + } + const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32); + const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8)); + sum64 = _mm_add_epi64(sum64_0, sum64_1); + } + int64_t sum[2]; + xx_storeu_128(sum, sum64); + err += sum[0] + sum[1]; + return err; +} + +// When params->r[0] > 0 and params->r[1] > 0. In this case all elements of +// C and H need to be computed. 
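+// As a sketch of the maths implemented below: writing u for the degraded
+// image shifted up by SGRPROJ_RST_BITS and s for the equally shifted source,
+// the accumulators compute
+//   H[i][j] = sum((flt_i - u) * (flt_j - u)) / size
+//   C[i]    = sum((flt_i - u) * (s - u)) / size
+// i.e. the normal equations of a least-squares fit; the projection
+// coefficients are then obtained by the caller by solving H * x = C.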
+static AOM_INLINE void calc_proj_params_r0_r1_sse4_1( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint8_t *src = src8; + const uint8_t *dat = dat8; + __m128i h00, h01, h11, c0, c1; + const __m128i zero = _mm_setzero_si128(); + h01 = h11 = c0 = c1 = h00 = zero; + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; j += 4) { + const __m128i u_load = _mm_cvtepu8_epi32( + _mm_cvtsi32_si128(*((int *)(dat + i * dat_stride + j)))); + const __m128i s_load = _mm_cvtepu8_epi32( + _mm_cvtsi32_si128(*((int *)(src + i * src_stride + j)))); + __m128i f1 = _mm_loadu_si128((__m128i *)(flt0 + i * flt0_stride + j)); + __m128i f2 = _mm_loadu_si128((__m128i *)(flt1 + i * flt1_stride + j)); + __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS); + __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS); + s = _mm_sub_epi32(s, d); + f1 = _mm_sub_epi32(f1, d); + f2 = _mm_sub_epi32(f2, d); + + const __m128i h00_even = _mm_mul_epi32(f1, f1); + const __m128i h00_odd = + _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f1, 32)); + h00 = _mm_add_epi64(h00, h00_even); + h00 = _mm_add_epi64(h00, h00_odd); + + const __m128i h01_even = _mm_mul_epi32(f1, f2); + const __m128i h01_odd = + _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f2, 32)); + h01 = _mm_add_epi64(h01, h01_even); + h01 = _mm_add_epi64(h01, h01_odd); + + const __m128i h11_even = _mm_mul_epi32(f2, f2); + const __m128i h11_odd = + _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(f2, 32)); + h11 = _mm_add_epi64(h11, h11_even); + h11 = _mm_add_epi64(h11, h11_odd); + + const __m128i c0_even = _mm_mul_epi32(f1, s); + const __m128i c0_odd = + _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(s, 32)); + c0 = _mm_add_epi64(c0, c0_even); + c0 = _mm_add_epi64(c0, c0_odd); + + const __m128i c1_even = _mm_mul_epi32(f2, s); + const __m128i c1_odd = + _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(s, 32)); + c1 = _mm_add_epi64(c1, c1_even); + c1 = _mm_add_epi64(c1, c1_odd); + } + } + + __m128i c_low = _mm_unpacklo_epi64(c0, c1); + const __m128i c_high = _mm_unpackhi_epi64(c0, c1); + c_low = _mm_add_epi64(c_low, c_high); + + __m128i h0x_low = _mm_unpacklo_epi64(h00, h01); + const __m128i h0x_high = _mm_unpackhi_epi64(h00, h01); + h0x_low = _mm_add_epi64(h0x_low, h0x_high); + + // Using the symmetric properties of H, calculations of H[1][0] are not + // needed. + __m128i h1x_low = _mm_unpacklo_epi64(zero, h11); + const __m128i h1x_high = _mm_unpackhi_epi64(zero, h11); + h1x_low = _mm_add_epi64(h1x_low, h1x_high); + + xx_storeu_128(C, c_low); + xx_storeu_128(H[0], h0x_low); + xx_storeu_128(H[1], h1x_low); + + H[0][0] /= size; + H[0][1] /= size; + H[1][1] /= size; + + // Since H is a symmetric matrix + H[1][0] = H[0][1]; + C[0] /= size; + C[1] /= size; +} + +// When only params->r[0] > 0. In this case only H[0][0] and C[0] are +// non-zero and need to be computed. 
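+// The least-squares system above then collapses to the single scalar
+// equation H[0][0] * x0 = C[0].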
+static AOM_INLINE void calc_proj_params_r0_sse4_1( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint8_t *src = src8; + const uint8_t *dat = dat8; + __m128i h00, c0; + const __m128i zero = _mm_setzero_si128(); + c0 = h00 = zero; + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; j += 4) { + const __m128i u_load = _mm_cvtepu8_epi32( + _mm_cvtsi32_si128(*((int *)(dat + i * dat_stride + j)))); + const __m128i s_load = _mm_cvtepu8_epi32( + _mm_cvtsi32_si128(*((int *)(src + i * src_stride + j)))); + __m128i f1 = _mm_loadu_si128((__m128i *)(flt0 + i * flt0_stride + j)); + __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS); + __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS); + s = _mm_sub_epi32(s, d); + f1 = _mm_sub_epi32(f1, d); + + const __m128i h00_even = _mm_mul_epi32(f1, f1); + const __m128i h00_odd = + _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f1, 32)); + h00 = _mm_add_epi64(h00, h00_even); + h00 = _mm_add_epi64(h00, h00_odd); + + const __m128i c0_even = _mm_mul_epi32(f1, s); + const __m128i c0_odd = + _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(s, 32)); + c0 = _mm_add_epi64(c0, c0_even); + c0 = _mm_add_epi64(c0, c0_odd); + } + } + const __m128i h00_val = _mm_add_epi64(h00, _mm_srli_si128(h00, 8)); + + const __m128i c0_val = _mm_add_epi64(c0, _mm_srli_si128(c0, 8)); + + const __m128i c = _mm_unpacklo_epi64(c0_val, zero); + const __m128i h0x = _mm_unpacklo_epi64(h00_val, zero); + + xx_storeu_128(C, c); + xx_storeu_128(H[0], h0x); + + H[0][0] /= size; + C[0] /= size; +} + +// When only params->r[1] > 0. In this case only H[1][1] and C[1] are +// non-zero and need to be computed. 
+static AOM_INLINE void calc_proj_params_r1_sse4_1( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride, + int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint8_t *src = src8; + const uint8_t *dat = dat8; + __m128i h11, c1; + const __m128i zero = _mm_setzero_si128(); + c1 = h11 = zero; + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; j += 4) { + const __m128i u_load = _mm_cvtepu8_epi32( + _mm_cvtsi32_si128(*((int *)(dat + i * dat_stride + j)))); + const __m128i s_load = _mm_cvtepu8_epi32( + _mm_cvtsi32_si128(*((int *)(src + i * src_stride + j)))); + __m128i f2 = _mm_loadu_si128((__m128i *)(flt1 + i * flt1_stride + j)); + __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS); + __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS); + s = _mm_sub_epi32(s, d); + f2 = _mm_sub_epi32(f2, d); + + const __m128i h11_even = _mm_mul_epi32(f2, f2); + const __m128i h11_odd = + _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(f2, 32)); + h11 = _mm_add_epi64(h11, h11_even); + h11 = _mm_add_epi64(h11, h11_odd); + + const __m128i c1_even = _mm_mul_epi32(f2, s); + const __m128i c1_odd = + _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(s, 32)); + c1 = _mm_add_epi64(c1, c1_even); + c1 = _mm_add_epi64(c1, c1_odd); + } + } + + const __m128i h11_val = _mm_add_epi64(h11, _mm_srli_si128(h11, 8)); + + const __m128i c1_val = _mm_add_epi64(c1, _mm_srli_si128(c1, 8)); + + const __m128i c = _mm_unpacklo_epi64(zero, c1_val); + const __m128i h1x = _mm_unpacklo_epi64(zero, h11_val); + + xx_storeu_128(C, c); + xx_storeu_128(H[1], h1x); + + H[1][1] /= size; + C[1] /= size; +} + +// SSE4.1 variant of av1_calc_proj_params_c. +void av1_calc_proj_params_sse4_1(const uint8_t *src8, int width, int height, + int src_stride, const uint8_t *dat8, + int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, + int64_t H[2][2], int64_t C[2], + const sgr_params_type *params) { + if ((params->r[0] > 0) && (params->r[1] > 0)) { + calc_proj_params_r0_r1_sse4_1(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, flt1, + flt1_stride, H, C); + } else if (params->r[0] > 0) { + calc_proj_params_r0_sse4_1(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, H, C); + } else if (params->r[1] > 0) { + calc_proj_params_r1_sse4_1(src8, width, height, src_stride, dat8, + dat_stride, flt1, flt1_stride, H, C); + } +} + +static AOM_INLINE void calc_proj_params_r0_r1_high_bd_sse4_1( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + __m128i h00, h01, h11, c0, c1; + const __m128i zero = _mm_setzero_si128(); + h01 = h11 = c0 = c1 = h00 = zero; + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; j += 4) { + const __m128i u_load = _mm_cvtepu16_epi32( + _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j))); + const __m128i s_load = _mm_cvtepu16_epi32( + _mm_loadl_epi64((__m128i *)(src + i * src_stride + j))); + __m128i f1 = _mm_loadu_si128((__m128i *)(flt0 + i * flt0_stride + j)); + __m128i f2 = _mm_loadu_si128((__m128i *)(flt1 + i * flt1_stride + j)); + __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS); + __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS); + s 
= _mm_sub_epi32(s, d); + f1 = _mm_sub_epi32(f1, d); + f2 = _mm_sub_epi32(f2, d); + + const __m128i h00_even = _mm_mul_epi32(f1, f1); + const __m128i h00_odd = + _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f1, 32)); + h00 = _mm_add_epi64(h00, h00_even); + h00 = _mm_add_epi64(h00, h00_odd); + + const __m128i h01_even = _mm_mul_epi32(f1, f2); + const __m128i h01_odd = + _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f2, 32)); + h01 = _mm_add_epi64(h01, h01_even); + h01 = _mm_add_epi64(h01, h01_odd); + + const __m128i h11_even = _mm_mul_epi32(f2, f2); + const __m128i h11_odd = + _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(f2, 32)); + h11 = _mm_add_epi64(h11, h11_even); + h11 = _mm_add_epi64(h11, h11_odd); + + const __m128i c0_even = _mm_mul_epi32(f1, s); + const __m128i c0_odd = + _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(s, 32)); + c0 = _mm_add_epi64(c0, c0_even); + c0 = _mm_add_epi64(c0, c0_odd); + + const __m128i c1_even = _mm_mul_epi32(f2, s); + const __m128i c1_odd = + _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(s, 32)); + c1 = _mm_add_epi64(c1, c1_even); + c1 = _mm_add_epi64(c1, c1_odd); + } + } + + __m128i c_low = _mm_unpacklo_epi64(c0, c1); + const __m128i c_high = _mm_unpackhi_epi64(c0, c1); + c_low = _mm_add_epi64(c_low, c_high); + + __m128i h0x_low = _mm_unpacklo_epi64(h00, h01); + const __m128i h0x_high = _mm_unpackhi_epi64(h00, h01); + h0x_low = _mm_add_epi64(h0x_low, h0x_high); + + // Using the symmetric properties of H, calculations of H[1][0] are not + // needed. + __m128i h1x_low = _mm_unpacklo_epi64(zero, h11); + const __m128i h1x_high = _mm_unpackhi_epi64(zero, h11); + h1x_low = _mm_add_epi64(h1x_low, h1x_high); + + xx_storeu_128(C, c_low); + xx_storeu_128(H[0], h0x_low); + xx_storeu_128(H[1], h1x_low); + + H[0][0] /= size; + H[0][1] /= size; + H[1][1] /= size; + + // Since H is a symmetric matrix + H[1][0] = H[0][1]; + C[0] /= size; + C[1] /= size; +} + +// When only params->r[0] > 0. In this case only H[0][0] and C[0] are +// non-zero and need to be computed. 
+static AOM_INLINE void calc_proj_params_r0_high_bd_sse4_1( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + __m128i h00, c0; + const __m128i zero = _mm_setzero_si128(); + c0 = h00 = zero; + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; j += 4) { + const __m128i u_load = _mm_cvtepu16_epi32( + _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j))); + const __m128i s_load = _mm_cvtepu16_epi32( + _mm_loadl_epi64((__m128i *)(src + i * src_stride + j))); + __m128i f1 = _mm_loadu_si128((__m128i *)(flt0 + i * flt0_stride + j)); + __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS); + __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS); + s = _mm_sub_epi32(s, d); + f1 = _mm_sub_epi32(f1, d); + + const __m128i h00_even = _mm_mul_epi32(f1, f1); + const __m128i h00_odd = + _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f1, 32)); + h00 = _mm_add_epi64(h00, h00_even); + h00 = _mm_add_epi64(h00, h00_odd); + + const __m128i c0_even = _mm_mul_epi32(f1, s); + const __m128i c0_odd = + _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(s, 32)); + c0 = _mm_add_epi64(c0, c0_even); + c0 = _mm_add_epi64(c0, c0_odd); + } + } + const __m128i h00_val = _mm_add_epi64(h00, _mm_srli_si128(h00, 8)); + + const __m128i c0_val = _mm_add_epi64(c0, _mm_srli_si128(c0, 8)); + + const __m128i c = _mm_unpacklo_epi64(c0_val, zero); + const __m128i h0x = _mm_unpacklo_epi64(h00_val, zero); + + xx_storeu_128(C, c); + xx_storeu_128(H[0], h0x); + + H[0][0] /= size; + C[0] /= size; +} + +// When only params->r[1] > 0. In this case only H[1][1] and C[1] are +// non-zero and need to be computed. 
+static AOM_INLINE void calc_proj_params_r1_high_bd_sse4_1( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride, + int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + __m128i h11, c1; + const __m128i zero = _mm_setzero_si128(); + c1 = h11 = zero; + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; j += 4) { + const __m128i u_load = _mm_cvtepu16_epi32( + _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j))); + const __m128i s_load = _mm_cvtepu16_epi32( + _mm_loadl_epi64((__m128i *)(src + i * src_stride + j))); + __m128i f2 = _mm_loadu_si128((__m128i *)(flt1 + i * flt1_stride + j)); + __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS); + __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS); + s = _mm_sub_epi32(s, d); + f2 = _mm_sub_epi32(f2, d); + + const __m128i h11_even = _mm_mul_epi32(f2, f2); + const __m128i h11_odd = + _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(f2, 32)); + h11 = _mm_add_epi64(h11, h11_even); + h11 = _mm_add_epi64(h11, h11_odd); + + const __m128i c1_even = _mm_mul_epi32(f2, s); + const __m128i c1_odd = + _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(s, 32)); + c1 = _mm_add_epi64(c1, c1_even); + c1 = _mm_add_epi64(c1, c1_odd); + } + } + + const __m128i h11_val = _mm_add_epi64(h11, _mm_srli_si128(h11, 8)); + + const __m128i c1_val = _mm_add_epi64(c1, _mm_srli_si128(c1, 8)); + + const __m128i c = _mm_unpacklo_epi64(zero, c1_val); + const __m128i h1x = _mm_unpacklo_epi64(zero, h11_val); + + xx_storeu_128(C, c); + xx_storeu_128(H[1], h1x); + + H[1][1] /= size; + C[1] /= size; +} + +// SSE4.1 variant of av1_calc_proj_params_high_bd_c. 
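+// As in the low-bitdepth dispatcher above, H and C are left untouched when
+// both radii are zero.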
+void av1_calc_proj_params_high_bd_sse4_1(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, int dat_stride, + int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, + int64_t H[2][2], int64_t C[2], + const sgr_params_type *params) { + if ((params->r[0] > 0) && (params->r[1] > 0)) { + calc_proj_params_r0_r1_high_bd_sse4_1(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, flt1, + flt1_stride, H, C); + } else if (params->r[0] > 0) { + calc_proj_params_r0_high_bd_sse4_1(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, H, C); + } else if (params->r[1] > 0) { + calc_proj_params_r1_high_bd_sse4_1(src8, width, height, src_stride, dat8, + dat_stride, flt1, flt1_stride, H, C); + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +int64_t av1_highbd_pixel_proj_error_sse4_1( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) { + int i, j, k; + const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS; + const __m128i rounding = _mm_set1_epi32(1 << (shift - 1)); + __m128i sum64 = _mm_setzero_si128(); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + int64_t err = 0; + if (params->r[0] > 0 && params->r[1] > 0) { // Both filters are enabled + const __m128i xq0 = _mm_set1_epi32(xq[0]); + const __m128i xq1 = _mm_set1_epi32(xq[1]); + + for (i = 0; i < height; ++i) { + __m128i sum32 = _mm_setzero_si128(); + for (j = 0; j <= width - 8; j += 8) { + // Load 8x pixels from source image + const __m128i s0 = xx_loadu_128(src + j); + // s0 = [7 6 5 4 3 2 1 0] as i16 (indices of src[]) + + // Load 8x pixels from corrupted image + const __m128i d0 = xx_loadu_128(dat + j); + // d0 = [7 6 5 4 3 2 1 0] as i16 (indices of dat[]) + + // Shift each pixel value up by SGRPROJ_RST_BITS + const __m128i u0 = _mm_slli_epi16(d0, SGRPROJ_RST_BITS); + + // Split u0 into two halves and pad each from u16 to i32 + const __m128i u0l = _mm_cvtepu16_epi32(u0); + const __m128i u0h = _mm_cvtepu16_epi32(_mm_srli_si128(u0, 8)); + // u0h = [7 6 5 4] as i32, u0l = [3 2 1 0] as i32, all dat[] indices + + // Load 8 pixels from first and second filtered images + const __m128i flt0l = xx_loadu_128(flt0 + j); + const __m128i flt0h = xx_loadu_128(flt0 + j + 4); + const __m128i flt1l = xx_loadu_128(flt1 + j); + const __m128i flt1h = xx_loadu_128(flt1 + j + 4); + // flt0 = [7 6 5 4] [3 2 1 0] as i32 (indices of flt0+j) + // flt1 = [7 6 5 4] [3 2 1 0] as i32 (indices of flt1+j) + + // Subtract shifted corrupt image from each filtered image + // This gives our two basis vectors for the projection + const __m128i flt0l_subu = _mm_sub_epi32(flt0l, u0l); + const __m128i flt0h_subu = _mm_sub_epi32(flt0h, u0h); + const __m128i flt1l_subu = _mm_sub_epi32(flt1l, u0l); + const __m128i flt1h_subu = _mm_sub_epi32(flt1h, u0h); + // flt?h_subu = [ f[7]-u[7] f[6]-u[6] f[5]-u[5] f[4]-u[4] ] as i32 + // flt?l_subu = [ f[3]-u[3] f[2]-u[2] f[1]-u[1] f[0]-u[0] ] as i32 + + // Multiply each basis vector by the corresponding coefficient + const __m128i v0l = _mm_mullo_epi32(flt0l_subu, xq0); + const __m128i v0h = _mm_mullo_epi32(flt0h_subu, xq0); + const __m128i v1l = _mm_mullo_epi32(flt1l_subu, xq1); + const __m128i v1h = _mm_mullo_epi32(flt1h_subu, xq1); + + // Add together the contribution from each scaled basis vector + const __m128i vl = _mm_add_epi32(v0l, v1l); + const __m128i vh = 
_mm_add_epi32(v0h, v1h); + + // Right-shift v with appropriate rounding + const __m128i vrl = _mm_srai_epi32(_mm_add_epi32(vl, rounding), shift); + const __m128i vrh = _mm_srai_epi32(_mm_add_epi32(vh, rounding), shift); + + // Saturate each i32 value to i16 and combine lower and upper halves + const __m128i vr = _mm_packs_epi32(vrl, vrh); + + // Add twin-subspace-sgr-filter to corrupt image then subtract source + const __m128i e0 = _mm_sub_epi16(_mm_add_epi16(vr, d0), s0); + + // Calculate squared error and add adjacent values + const __m128i err0 = _mm_madd_epi16(e0, e0); + + sum32 = _mm_add_epi32(sum32, err0); + } + + const __m128i sum32l = _mm_cvtepu32_epi64(sum32); + sum64 = _mm_add_epi64(sum64, sum32l); + const __m128i sum32h = _mm_cvtepu32_epi64(_mm_srli_si128(sum32, 8)); + sum64 = _mm_add_epi64(sum64, sum32h); + + // Process remaining pixels in this row (modulo 8) + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt0 += flt0_stride; + flt1 += flt1_stride; + } + } else if (params->r[0] > 0 || params->r[1] > 0) { // Only one filter enabled + const int32_t xq_on = (params->r[0] > 0) ? xq[0] : xq[1]; + const __m128i xq_active = _mm_set1_epi32(xq_on); + const __m128i xq_inactive = + _mm_set1_epi32(-xq_on * (1 << SGRPROJ_RST_BITS)); + const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1; + const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride; + for (i = 0; i < height; ++i) { + __m128i sum32 = _mm_setzero_si128(); + for (j = 0; j <= width - 8; j += 8) { + // Load 8x pixels from source image + const __m128i s0 = xx_loadu_128(src + j); + // s0 = [7 6 5 4 3 2 1 0] as u16 (indices of src[]) + + // Load 8x pixels from corrupted image and pad each u16 to i32 + const __m128i d0 = xx_loadu_128(dat + j); + const __m128i d0h = _mm_cvtepu16_epi32(_mm_srli_si128(d0, 8)); + const __m128i d0l = _mm_cvtepu16_epi32(d0); + // d0h, d0l = [7 6 5 4], [3 2 1 0] as u32 (indices of dat[]) + + // Load 8 pixels from the filtered image + const __m128i flth = xx_loadu_128(flt + j + 4); + const __m128i fltl = xx_loadu_128(flt + j); + // flth, fltl = [7 6 5 4], [3 2 1 0] as i32 (indices of flt+j) + + const __m128i flth_xq = _mm_mullo_epi32(flth, xq_active); + const __m128i fltl_xq = _mm_mullo_epi32(fltl, xq_active); + const __m128i d0h_xq = _mm_mullo_epi32(d0h, xq_inactive); + const __m128i d0l_xq = _mm_mullo_epi32(d0l, xq_inactive); + + const __m128i vh = _mm_add_epi32(flth_xq, d0h_xq); + const __m128i vl = _mm_add_epi32(fltl_xq, d0l_xq); + // vh = [ xq0(f[7]-d[7]) xq0(f[6]-d[6]) xq0(f[5]-d[5]) xq0(f[4]-d[4]) ] + // vl = [ xq0(f[3]-d[3]) xq0(f[2]-d[2]) xq0(f[1]-d[1]) xq0(f[0]-d[0]) ] + + // Shift this down with appropriate rounding + const __m128i vrh = _mm_srai_epi32(_mm_add_epi32(vh, rounding), shift); + const __m128i vrl = _mm_srai_epi32(_mm_add_epi32(vl, rounding), shift); + + // Saturate vr0 and vr1 from i32 to i16 then pack together + const __m128i vr = _mm_packs_epi32(vrl, vrh); + + // Subtract twin-subspace-sgr filtered from source image to get error + const __m128i e0 = _mm_sub_epi16(_mm_add_epi16(vr, d0), s0); + + // Calculate squared error and add adjacent values + const __m128i err0 = _mm_madd_epi16(e0, e0); + + sum32 = _mm_add_epi32(sum32, err0); + } + + const __m128i sum32l = _mm_cvtepu32_epi64(sum32); + sum64 = _mm_add_epi64(sum64, sum32l); + 
const __m128i sum32h = _mm_cvtepu32_epi64(_mm_srli_si128(sum32, 8));
+      sum64 = _mm_add_epi64(sum64, sum32h);
+
+      // Process remaining pixels in this row (modulo 8)
+      for (k = j; k < width; ++k) {
+        const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+        int32_t v = xq_on * (flt[k] - u);
+        const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+        err += ((int64_t)e * e);
+      }
+      dat += dat_stride;
+      src += src_stride;
+      flt += flt_stride;
+    }
+  } else {  // Neither filter is enabled
+    for (i = 0; i < height; ++i) {
+      __m128i sum32 = _mm_setzero_si128();
+      for (j = 0; j <= width - 16; j += 16) {
+        // Load 2x8 u16 from source image
+        const __m128i s0 = xx_loadu_128(src + j);
+        const __m128i s1 = xx_loadu_128(src + j + 8);
+        // Load 2x8 u16 from corrupted image
+        const __m128i d0 = xx_loadu_128(dat + j);
+        const __m128i d1 = xx_loadu_128(dat + j + 8);
+
+        // Subtract corrupted image from source image
+        const __m128i diff0 = _mm_sub_epi16(d0, s0);
+        const __m128i diff1 = _mm_sub_epi16(d1, s1);
+
+        // Square error and add adjacent values
+        const __m128i err0 = _mm_madd_epi16(diff0, diff0);
+        const __m128i err1 = _mm_madd_epi16(diff1, diff1);
+
+        sum32 = _mm_add_epi32(sum32, err0);
+        sum32 = _mm_add_epi32(sum32, err1);
+      }
+
+      const __m128i sum32l = _mm_cvtepu32_epi64(sum32);
+      sum64 = _mm_add_epi64(sum64, sum32l);
+      const __m128i sum32h = _mm_cvtepu32_epi64(_mm_srli_si128(sum32, 8));
+      sum64 = _mm_add_epi64(sum64, sum32h);
+
+      // Process remaining pixels (modulo 8)
+      for (k = j; k < width; ++k) {
+        const int32_t e = (int32_t)(dat[k]) - src[k];
+        err += ((int64_t)e * e);
+      }
+      dat += dat_stride;
+      src += src_stride;
+    }
+  }
+
+  // Sum the two 64-bit lanes of sum64 into err
+  int64_t sum[2];
+  xx_storeu_128(sum, sum64);
+  err += sum[0] + sum[1];
+  return err;
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/av1/encoder/x86/rdopt_avx2.c b/third_party/aom/av1/encoder/x86/rdopt_avx2.c
new file mode 100644
index 0000000000..a0ab3940c0
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/rdopt_avx2.c
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+#include "aom_dsp/x86/mem_sse2.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/rdopt.h"
+
+// Process horizontal and vertical correlations in a 4x4 block of pixels.
+// We actually use the 4x4 pixels to calculate correlations corresponding to
+// the top-left 3x3 pixels, so this function must be called with 1x1 overlap,
+// moving the window along/down by 3 pixels at a time.
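+// For example, with width = 8 the loops below place windows at j = 0 and
+// j = 3, covering correlations for columns 0..5; the leftover one or two
+// rows/columns (width and height are powers of two, so each remainder
+// modulo 3 is 1 or 2) are handled by the scalar tail code further down.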
+INLINE static void horver_correlation_4x4(const int16_t *diff, int stride,
+                                          __m256i *xy_sum_32,
+                                          __m256i *xz_sum_32, __m256i *x_sum_32,
+                                          __m256i *x2_sum_32) {
+  // Pixels in this 4x4   [ a b c d ]
+  // are referred to as:  [ e f g h ]
+  //                      [ i j k l ]
+  //                      [ m n o p ]
+
+  const __m256i pixels = _mm256_set_epi64x(
+      loadu_int64(&diff[0 * stride]), loadu_int64(&diff[1 * stride]),
+      loadu_int64(&diff[2 * stride]), loadu_int64(&diff[3 * stride]));
+  // pixels = [d c b a h g f e] [l k j i p o n m] as i16
+
+  const __m256i slli = _mm256_slli_epi64(pixels, 16);
+  // slli = [c b a 0 g f e 0] [k j i 0 o n m 0] as i16
+
+  const __m256i madd_xy = _mm256_madd_epi16(pixels, slli);
+  // madd_xy = [bc+cd ab fg+gh ef] [jk+kl ij no+op mn] as i32
+  *xy_sum_32 = _mm256_add_epi32(*xy_sum_32, madd_xy);
+
+  // Permute control [3 2] [1 0] => [2 1] [0 0], 0b10010000 = 0x90
+  const __m256i perm = _mm256_permute4x64_epi64(slli, 0x90);
+  // perm = [g f e 0 k j i 0] [o n m 0 o n m 0] as i16
+
+  const __m256i madd_xz = _mm256_madd_epi16(slli, perm);
+  // madd_xz = [cg+bf ae gk+fj ei] [ko+jn im oo+nn mm] as i32
+  *xz_sum_32 = _mm256_add_epi32(*xz_sum_32, madd_xz);
+
+  // Sum every element in slli (and then also their squares)
+  const __m256i madd1_slli = _mm256_madd_epi16(slli, _mm256_set1_epi16(1));
+  // madd1_slli = [c+b a g+f e] [k+j i o+n m] as i32
+  *x_sum_32 = _mm256_add_epi32(*x_sum_32, madd1_slli);
+
+  const __m256i madd_slli = _mm256_madd_epi16(slli, slli);
+  // madd_slli = [cc+bb aa gg+ff ee] [kk+jj ii oo+nn mm] as i32
+  *x2_sum_32 = _mm256_add_epi32(*x2_sum_32, madd_slli);
+}
+
+void av1_get_horver_correlation_full_avx2(const int16_t *diff, int stride,
+                                          int width, int height, float *hcorr,
+                                          float *vcorr) {
+  // The following notation is used:
+  // x - current pixel
+  // y - right neighbour pixel
+  // z - below neighbour pixel
+  // w - down-right neighbour pixel
+  int64_t xy_sum = 0, xz_sum = 0;
+  int64_t x_sum = 0, x2_sum = 0;
+
+  // Process horizontal and vertical correlations through the body in 4x4
+  // blocks. This excludes the final row and column and possibly one extra
+  // column, depending on how 3 divides into width and height.
+  int32_t xy_xz_tmp[8] = { 0 }, x_x2_tmp[8] = { 0 };
+  __m256i xy_sum_32 = _mm256_setzero_si256();
+  __m256i xz_sum_32 = _mm256_setzero_si256();
+  __m256i x_sum_32 = _mm256_setzero_si256();
+  __m256i x2_sum_32 = _mm256_setzero_si256();
+  for (int i = 0; i <= height - 4; i += 3) {
+    for (int j = 0; j <= width - 4; j += 3) {
+      horver_correlation_4x4(&diff[i * stride + j], stride, &xy_sum_32,
+                             &xz_sum_32, &x_sum_32, &x2_sum_32);
+    }
+    const __m256i hadd_xy_xz = _mm256_hadd_epi32(xy_sum_32, xz_sum_32);
+    // hadd_xy_xz = [ae+bf+cg ei+fj+gk ab+bc+cd ef+fg+gh]
+    //              [im+jn+ko mm+nn+oo ij+jk+kl mn+no+op] as i32
+    yy_storeu_256(xy_xz_tmp, hadd_xy_xz);
+    xy_sum += (int64_t)xy_xz_tmp[5] + xy_xz_tmp[4] + xy_xz_tmp[1];
+    xz_sum += (int64_t)xy_xz_tmp[7] + xy_xz_tmp[6] + xy_xz_tmp[3];
+
+    const __m256i hadd_x_x2 = _mm256_hadd_epi32(x_sum_32, x2_sum_32);
+    // hadd_x_x2 = [aa+bb+cc ee+ff+gg a+b+c e+f+g]
+    //             [ii+jj+kk mm+nn+oo i+j+k m+n+o] as i32
+    yy_storeu_256(x_x2_tmp, hadd_x_x2);
+    x_sum += (int64_t)x_x2_tmp[5] + x_x2_tmp[4] + x_x2_tmp[1];
+    x2_sum += (int64_t)x_x2_tmp[7] + x_x2_tmp[6] + x_x2_tmp[3];
+
+    xy_sum_32 = _mm256_setzero_si256();
+    xz_sum_32 = _mm256_setzero_si256();
+    x_sum_32 = _mm256_setzero_si256();
+    x2_sum_32 = _mm256_setzero_si256();
+  }
+
+  // x_sum now covers every pixel except the final 1-2 rows and 1-2 cols
+  int64_t x_finalrow = 0, x_finalcol = 0, x2_finalrow = 0, x2_finalcol = 0;
+
+  // Do we have 2 rows remaining or just the one? Note that width and height
+  // are powers of 2, so each modulo 3 must be 1 or 2.
+  if (height % 3 == 1) {  // Just horiz corrs on the final row
+    const int16_t x0 = diff[(height - 1) * stride];
+    x_sum += x0;
+    x_finalrow += x0;
+    x2_sum += x0 * x0;
+    x2_finalrow += x0 * x0;
+    for (int j = 0; j < width - 1; ++j) {
+      const int16_t x = diff[(height - 1) * stride + j];
+      const int16_t y = diff[(height - 1) * stride + j + 1];
+      xy_sum += x * y;
+      x_sum += y;
+      x2_sum += y * y;
+      x_finalrow += y;
+      x2_finalrow += y * y;
+    }
+  } else {  // Two rows remaining to do
+    const int16_t x0 = diff[(height - 2) * stride];
+    const int16_t z0 = diff[(height - 1) * stride];
+    x_sum += x0 + z0;
+    x2_sum += x0 * x0 + z0 * z0;
+    x_finalrow += z0;
+    x2_finalrow += z0 * z0;
+    for (int j = 0; j < width - 1; ++j) {
+      const int16_t x = diff[(height - 2) * stride + j];
+      const int16_t y = diff[(height - 2) * stride + j + 1];
+      const int16_t z = diff[(height - 1) * stride + j];
+      const int16_t w = diff[(height - 1) * stride + j + 1];
+
+      // Horizontal and vertical correlations for the penultimate row:
+      xy_sum += x * y;
+      xz_sum += x * z;
+
+      // Now just horizontal correlations for the final row:
+      xy_sum += z * w;
+
+      x_sum += y + w;
+      x2_sum += y * y + w * w;
+      x_finalrow += w;
+      x2_finalrow += w * w;
+    }
+  }
+
+  // Do we have 2 columns remaining or just the one?
+  if (width % 3 == 1) {  // Just vert corrs on the final col
+    const int16_t x0 = diff[width - 1];
+    x_sum += x0;
+    x_finalcol += x0;
+    x2_sum += x0 * x0;
+    x2_finalcol += x0 * x0;
+    for (int i = 0; i < height - 1; ++i) {
+      const int16_t x = diff[i * stride + width - 1];
+      const int16_t z = diff[(i + 1) * stride + width - 1];
+      xz_sum += x * z;
+      x_finalcol += z;
+      x2_finalcol += z * z;
+      // So the bottom-right elements don't get counted twice:
+      if (i < height - (height % 3 == 1 ? 2 : 3)) {
+        x_sum += z;
+        x2_sum += z * z;
+      }
+    }
+  } else {  // Two cols remaining
+    const int16_t x0 = diff[width - 2];
+    const int16_t y0 = diff[width - 1];
+    x_sum += x0 + y0;
+    x2_sum += x0 * x0 + y0 * y0;
+    x_finalcol += y0;
+    x2_finalcol += y0 * y0;
+    for (int i = 0; i < height - 1; ++i) {
+      const int16_t x = diff[i * stride + width - 2];
+      const int16_t y = diff[i * stride + width - 1];
+      const int16_t z = diff[(i + 1) * stride + width - 2];
+      const int16_t w = diff[(i + 1) * stride + width - 1];
+
+      // Horizontal and vertical correlations for the penultimate col:
+      // Skip these on the last iteration of this loop if we also had two
+      // rows remaining, otherwise the final horizontal and vertical
+      // correlations get erroneously processed twice
+      if (i < height - 2 || height % 3 == 1) {
+        xy_sum += x * y;
+        xz_sum += x * z;
+      }
+
+      x_finalcol += w;
+      x2_finalcol += w * w;
+      // So the bottom-right elements don't get counted twice:
+      if (i < height - (height % 3 == 1 ? 2 : 3)) {
+        x_sum += z + w;
+        x2_sum += z * z + w * w;
+      }
+
+      // Now just vertical correlations for the final column:
+      xz_sum += y * w;
+    }
+  }
+
+  // Calculate the simple sums and squared-sums
+  int64_t x_firstrow = 0, x_firstcol = 0;
+  int64_t x2_firstrow = 0, x2_firstcol = 0;
+
+  for (int j = 0; j < width; ++j) {
+    x_firstrow += diff[j];
+    x2_firstrow += diff[j] * diff[j];
+  }
+  for (int i = 0; i < height; ++i) {
+    x_firstcol += diff[i * stride];
+    x2_firstcol += diff[i * stride] * diff[i * stride];
+  }
+
+  int64_t xhor_sum = x_sum - x_finalcol;
+  int64_t xver_sum = x_sum - x_finalrow;
+  int64_t y_sum = x_sum - x_firstcol;
+  int64_t z_sum = x_sum - x_firstrow;
+  int64_t x2hor_sum = x2_sum - x2_finalcol;
+  int64_t x2ver_sum = x2_sum - x2_finalrow;
+  int64_t y2_sum = x2_sum - x2_firstcol;
+  int64_t z2_sum = x2_sum - x2_firstrow;
+
+  const float num_hor = (float)(height * (width - 1));
+  const float num_ver = (float)((height - 1) * width);
+
+  const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor;
+  const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver;
+
+  const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor;
+  const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver;
+
+  const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor;
+  const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver;
+
+  if (xhor_var_n > 0 && y_var_n > 0) {
+    *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n);
+    *hcorr = *hcorr < 0 ? 0 : *hcorr;
+  } else {
+    *hcorr = 1.0;
+  }
+  if (xver_var_n > 0 && z_var_n > 0) {
+    *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n);
+    *vcorr = *vcorr < 0 ? 0 : *vcorr;
+  } else {
+    *vcorr = 1.0;
+  }
+}
diff --git a/third_party/aom/av1/encoder/x86/rdopt_sse4.c b/third_party/aom/av1/encoder/x86/rdopt_sse4.c
new file mode 100644
index 0000000000..12ac146195
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/rdopt_sse4.c
@@ -0,0 +1,272 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+#include "aom_dsp/x86/synonyms.h"
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/rdopt.h"
+
+// Process horizontal and vertical correlations in a 4x4 block of pixels.
+// We actually use the 4x4 pixels to calculate correlations corresponding to
+// the top-left 3x3 pixels, so this function must be called with 1x1 overlap,
+// moving the window along/down by 3 pixels at a time.
+INLINE static void horver_correlation_4x4(const int16_t *diff, int stride,
+                                          __m128i *xy_sum_32,
+                                          __m128i *xz_sum_32, __m128i *x_sum_32,
+                                          __m128i *x2_sum_32) {
+  // Pixels in this 4x4   [ a b c d ]
+  // are referred to as:  [ e f g h ]
+  //                      [ i j k l ]
+  //                      [ m n o p ]
+
+  const __m128i pixelsa = _mm_set_epi64x(*(int64_t *)&diff[0 * stride],
+                                         *(int64_t *)&diff[2 * stride]);
+  const __m128i pixelsb = _mm_set_epi64x(*(int64_t *)&diff[1 * stride],
+                                         *(int64_t *)&diff[3 * stride]);
+  // pixelsa = [d c b a l k j i] as i16
+  // pixelsb = [h g f e p o n m] as i16
+
+  const __m128i slli_a = _mm_slli_epi64(pixelsa, 16);
+  const __m128i slli_b = _mm_slli_epi64(pixelsb, 16);
+  // slli_a = [c b a 0 k j i 0] as i16
+  // slli_b = [g f e 0 o n m 0] as i16
+
+  const __m128i xy_madd_a = _mm_madd_epi16(pixelsa, slli_a);
+  const __m128i xy_madd_b = _mm_madd_epi16(pixelsb, slli_b);
+  // xy_madd_a = [bc+cd ab jk+kl ij] as i32
+  // xy_madd_b = [fg+gh ef no+op mn] as i32
+
+  const __m128i xy32 = _mm_hadd_epi32(xy_madd_b, xy_madd_a);
+  // xy32 = [ab+bc+cd ij+jk+kl ef+fg+gh mn+no+op] as i32
+  *xy_sum_32 = _mm_add_epi32(*xy_sum_32, xy32);
+
+  const __m128i xz_madd_a = _mm_madd_epi16(slli_a, slli_b);
+  // xz_madd_a = [bf+cg ae jn+ko im] as i32
+
+  const __m128i swap_b = _mm_srli_si128(slli_b, 8);
+  // swap_b = [0 0 0 0 g f e 0] as i16
+  const __m128i xz_madd_b = _mm_madd_epi16(slli_a, swap_b);
+  // xz_madd_b = [0 0 gk+fj ei] as i32
+
+  const __m128i xz32 = _mm_hadd_epi32(xz_madd_b, xz_madd_a);
+  // xz32 = [ae+bf+cg im+jn+ko 0 ei+fj+gk] as i32
+  *xz_sum_32 = _mm_add_epi32(*xz_sum_32, xz32);
+
+  // Now calculate the straight sums, x_sum += a+b+c+e+f+g+i+j+k
+  // (sum up every element in slli_a and swap_b)
+  const __m128i sum_slli_a = _mm_hadd_epi16(slli_a, slli_a);
+  const __m128i sum_slli_a32 = _mm_cvtepi16_epi32(sum_slli_a);
+  // sum_slli_a32 = [c+b a k+j i] as i32
+  const __m128i swap_b32 = _mm_cvtepi16_epi32(swap_b);
+  // swap_b32 = [g f e 0] as i32
+  *x_sum_32 = _mm_add_epi32(*x_sum_32, sum_slli_a32);
+  *x_sum_32 = _mm_add_epi32(*x_sum_32, swap_b32);
+  // sum = [c+b+g a+f k+j+e i] as i32
+
+  // Also sum their squares
+  const __m128i slli_a_2 = _mm_madd_epi16(slli_a, slli_a);
+  const __m128i swap_b_2 = _mm_madd_epi16(swap_b, swap_b);
+  // slli_a_2 = [c2+b2 a2 k2+j2 i2]
+  // swap_b_2 = [0 0 g2+f2 e2]
+  const __m128i sum2 = _mm_hadd_epi32(slli_a_2, swap_b_2);
+  // sum2 = [0 g2+f2+e2 c2+b2+a2 k2+j2+i2]
+  *x2_sum_32 = _mm_add_epi32(*x2_sum_32, sum2);
+}
+
+void av1_get_horver_correlation_full_sse4_1(const int16_t *diff, int stride,
+                                            int width, int height, float *hcorr,
+                                            float *vcorr) {
+  // The following notation is used:
+  // x - current pixel
+  // y - right neighbour pixel
+  // z - below neighbour pixel
+  // w - down-right neighbour pixel
+  int64_t xy_sum = 0, xz_sum = 0;
+  int64_t x_sum = 0, x2_sum = 0;
+
+  // Process horizontal and vertical correlations through the body in 4x4
+  // blocks. This excludes the final row and column and possibly one extra
+  // column, depending on how 3 divides into width and height.
+  int32_t xy_tmp[4] = { 0 }, xz_tmp[4] = { 0 };
+  int32_t x_tmp[4] = { 0 }, x2_tmp[4] = { 0 };
+  __m128i xy_sum_32 = _mm_setzero_si128();
+  __m128i xz_sum_32 = _mm_setzero_si128();
+  __m128i x_sum_32 = _mm_setzero_si128();
+  __m128i x2_sum_32 = _mm_setzero_si128();
+  for (int i = 0; i <= height - 4; i += 3) {
+    for (int j = 0; j <= width - 4; j += 3) {
+      horver_correlation_4x4(&diff[i * stride + j], stride, &xy_sum_32,
+                             &xz_sum_32, &x_sum_32, &x2_sum_32);
+    }
+    xx_storeu_128(xy_tmp, xy_sum_32);
+    xx_storeu_128(xz_tmp, xz_sum_32);
+    xx_storeu_128(x_tmp, x_sum_32);
+    xx_storeu_128(x2_tmp, x2_sum_32);
+    xy_sum += (int64_t)xy_tmp[3] + xy_tmp[2] + xy_tmp[1];
+    xz_sum += (int64_t)xz_tmp[3] + xz_tmp[2] + xz_tmp[0];
+    x_sum += (int64_t)x_tmp[3] + x_tmp[2] + x_tmp[1] + x_tmp[0];
+    x2_sum += (int64_t)x2_tmp[2] + x2_tmp[1] + x2_tmp[0];
+    xy_sum_32 = _mm_setzero_si128();
+    xz_sum_32 = _mm_setzero_si128();
+    x_sum_32 = _mm_setzero_si128();
+    x2_sum_32 = _mm_setzero_si128();
+  }
+
+  // x_sum now covers every pixel except the final 1-2 rows and 1-2 cols
+  int64_t x_finalrow = 0, x_finalcol = 0, x2_finalrow = 0, x2_finalcol = 0;
+
+  // Do we have 2 rows remaining or just the one? Note that width and height
+  // are powers of 2, so each modulo 3 must be 1 or 2.
+  if (height % 3 == 1) {  // Just horiz corrs on the final row
+    const int16_t x0 = diff[(height - 1) * stride];
+    x_sum += x0;
+    x_finalrow += x0;
+    x2_sum += x0 * x0;
+    x2_finalrow += x0 * x0;
+    for (int j = 0; j < width - 1; ++j) {
+      const int16_t x = diff[(height - 1) * stride + j];
+      const int16_t y = diff[(height - 1) * stride + j + 1];
+      xy_sum += x * y;
+      x_sum += y;
+      x2_sum += y * y;
+      x_finalrow += y;
+      x2_finalrow += y * y;
+    }
+  } else {  // Two rows remaining to do
+    const int16_t x0 = diff[(height - 2) * stride];
+    const int16_t z0 = diff[(height - 1) * stride];
+    x_sum += x0 + z0;
+    x2_sum += x0 * x0 + z0 * z0;
+    x_finalrow += z0;
+    x2_finalrow += z0 * z0;
+    for (int j = 0; j < width - 1; ++j) {
+      const int16_t x = diff[(height - 2) * stride + j];
+      const int16_t y = diff[(height - 2) * stride + j + 1];
+      const int16_t z = diff[(height - 1) * stride + j];
+      const int16_t w = diff[(height - 1) * stride + j + 1];
+
+      // Horizontal and vertical correlations for the penultimate row:
+      xy_sum += x * y;
+      xz_sum += x * z;
+
+      // Now just horizontal correlations for the final row:
+      xy_sum += z * w;
+
+      x_sum += y + w;
+      x2_sum += y * y + w * w;
+      x_finalrow += w;
+      x2_finalrow += w * w;
+    }
+  }
+
+  // Do we have 2 columns remaining or just the one?
+  if (width % 3 == 1) {  // Just vert corrs on the final col
+    const int16_t x0 = diff[width - 1];
+    x_sum += x0;
+    x_finalcol += x0;
+    x2_sum += x0 * x0;
+    x2_finalcol += x0 * x0;
+    for (int i = 0; i < height - 1; ++i) {
+      const int16_t x = diff[i * stride + width - 1];
+      const int16_t z = diff[(i + 1) * stride + width - 1];
+      xz_sum += x * z;
+      x_finalcol += z;
+      x2_finalcol += z * z;
+      // So the bottom-right elements don't get counted twice:
+      if (i < height - (height % 3 == 1 ? 2 : 3)) {
+        x_sum += z;
+        x2_sum += z * z;
+      }
+    }
+  } else {  // Two cols remaining
+    const int16_t x0 = diff[width - 2];
+    const int16_t y0 = diff[width - 1];
+    x_sum += x0 + y0;
+    x2_sum += x0 * x0 + y0 * y0;
+    x_finalcol += y0;
+    x2_finalcol += y0 * y0;
+    for (int i = 0; i < height - 1; ++i) {
+      const int16_t x = diff[i * stride + width - 2];
+      const int16_t y = diff[i * stride + width - 1];
+      const int16_t z = diff[(i + 1) * stride + width - 2];
+      const int16_t w = diff[(i + 1) * stride + width - 1];
+
+      // Horizontal and vertical correlations for the penultimate col:
+      // Skip these on the last iteration of this loop if we also had two
+      // rows remaining, otherwise the final horizontal and vertical
+      // correlations get erroneously processed twice
+      if (i < height - 2 || height % 3 == 1) {
+        xy_sum += x * y;
+        xz_sum += x * z;
+      }
+
+      x_finalcol += w;
+      x2_finalcol += w * w;
+      // So the bottom-right elements don't get counted twice:
+      if (i < height - (height % 3 == 1 ? 2 : 3)) {
+        x_sum += z + w;
+        x2_sum += z * z + w * w;
+      }
+
+      // Now just vertical correlations for the final column:
+      xz_sum += y * w;
+    }
+  }
+
+  // Calculate the simple sums and squared-sums
+  int64_t x_firstrow = 0, x_firstcol = 0;
+  int64_t x2_firstrow = 0, x2_firstcol = 0;
+
+  for (int j = 0; j < width; ++j) {
+    x_firstrow += diff[j];
+    x2_firstrow += diff[j] * diff[j];
+  }
+  for (int i = 0; i < height; ++i) {
+    x_firstcol += diff[i * stride];
+    x2_firstcol += diff[i * stride] * diff[i * stride];
+  }
+
+  int64_t xhor_sum = x_sum - x_finalcol;
+  int64_t xver_sum = x_sum - x_finalrow;
+  int64_t y_sum = x_sum - x_firstcol;
+  int64_t z_sum = x_sum - x_firstrow;
+  int64_t x2hor_sum = x2_sum - x2_finalcol;
+  int64_t x2ver_sum = x2_sum - x2_finalrow;
+  int64_t y2_sum = x2_sum - x2_firstcol;
+  int64_t z2_sum = x2_sum - x2_firstrow;
+
+  const float num_hor = (float)(height * (width - 1));
+  const float num_ver = (float)((height - 1) * width);
+
+  const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor;
+  const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver;
+
+  const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor;
+  const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver;
+
+  const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor;
+  const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver;
+
+  if (xhor_var_n > 0 && y_var_n > 0) {
+    *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n);
+    *hcorr = *hcorr < 0 ? 0 : *hcorr;
+  } else {
+    *hcorr = 1.0;
+  }
+  if (xver_var_n > 0 && z_var_n > 0) {
+    *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n);
+    *vcorr = *vcorr < 0 ? 0 : *vcorr;
+  } else {
+    *vcorr = 1.0;
+  }
+}
diff --git a/third_party/aom/av1/encoder/x86/reconinter_enc_sse2.c b/third_party/aom/av1/encoder/x86/reconinter_enc_sse2.c
new file mode 100644
index 0000000000..a492483721
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/reconinter_enc_sse2.c
@@ -0,0 +1,347 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include <assert.h> +#include <emmintrin.h> // SSE2 + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/aom_scale_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/blend.h" +#include "aom_dsp/x86/mem_sse2.h" +#include "aom_dsp/x86/synonyms.h" + +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" +#include "av1/common/mvref_common.h" +#include "av1/common/obmc.h" +#include "av1/common/reconinter.h" +#include "av1/common/reconintra.h" +#include "av1/encoder/reconinter_enc.h" + +void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm, + int mi_row, int mi_col, const MV *const mv, + uint8_t *comp_pred, int width, int height, + int subpel_x_q3, int subpel_y_q3, + const uint8_t *ref, int ref_stride, + int subpel_search) { + // expect xd == NULL only in tests + if (xd != NULL) { + const MB_MODE_INFO *mi = xd->mi[0]; + const int ref_num = 0; + const int is_intrabc = is_intrabc_block(mi); + const struct scale_factors *const sf = + is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num]; + const int is_scaled = av1_is_scaled(sf); + + if (is_scaled) { + int plane = 0; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const struct buf_2d *const dst_buf = &pd->dst; + const struct buf_2d *const pre_buf = + is_intrabc ? dst_buf : &pd->pre[ref_num]; + + InterPredParams inter_pred_params; + inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); + const int_interpfilters filters = + av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + av1_init_inter_params( + &inter_pred_params, width, height, mi_y >> pd->subsampling_y, + mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y, + xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters); + av1_enc_build_one_inter_predictor(comp_pred, width, mv, + &inter_pred_params); + return; + } + } + + const InterpFilterParams *filter = av1_get_filter(subpel_search); + // (TODO:yunqing) 2-tap case uses 4-tap functions since there is no SIMD for + // 2-tap yet. + int filter_taps = (subpel_search <= USE_4_TAPS) ? 
4 : SUBPEL_TAPS; + + if (!subpel_x_q3 && !subpel_y_q3) { + if (width >= 16) { + int i; + assert(!(width & 15)); + /*Read 16 pixels one row at a time.*/ + for (i = 0; i < height; i++) { + int j; + for (j = 0; j < width; j += 16) { + xx_storeu_128(comp_pred, xx_loadu_128(ref)); + comp_pred += 16; + ref += 16; + } + ref += ref_stride - width; + } + } else if (width >= 8) { + int i; + assert(!(width & 7)); + assert(!(height & 1)); + /*Read 8 pixels two rows at a time.*/ + for (i = 0; i < height; i += 2) { + __m128i s0 = xx_loadl_64(ref + 0 * ref_stride); + __m128i s1 = xx_loadl_64(ref + 1 * ref_stride); + xx_storeu_128(comp_pred, _mm_unpacklo_epi64(s0, s1)); + comp_pred += 16; + ref += 2 * ref_stride; + } + } else { + int i; + assert(!(width & 3)); + assert(!(height & 3)); + /*Read 4 pixels four rows at a time.*/ + for (i = 0; i < height; i += 4) { + const __m128i row0 = xx_loadl_64(ref + 0 * ref_stride); + const __m128i row1 = xx_loadl_64(ref + 1 * ref_stride); + const __m128i row2 = xx_loadl_64(ref + 2 * ref_stride); + const __m128i row3 = xx_loadl_64(ref + 3 * ref_stride); + const __m128i reg = _mm_unpacklo_epi64(_mm_unpacklo_epi32(row0, row1), + _mm_unpacklo_epi32(row2, row3)); + xx_storeu_128(comp_pred, reg); + comp_pred += 16; + ref += 4 * ref_stride; + } + } + } else if (!subpel_y_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, -1, + width, height); + } else if (!subpel_x_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16, + width, height); + } else { + DECLARE_ALIGNED(16, uint8_t, + temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]); + const int16_t *const kernel_x = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + const int16_t *const kernel_y = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + const uint8_t *ref_start = ref - ref_stride * ((filter_taps >> 1) - 1); + uint8_t *temp_start_horiz = (subpel_search <= USE_4_TAPS) + ? 
temp + (filter_taps >> 1) * MAX_SB_SIZE + : temp; + uint8_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1); + int intermediate_height = + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps; + assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); + aom_convolve8_horiz(ref_start, ref_stride, temp_start_horiz, MAX_SB_SIZE, + kernel_x, 16, NULL, -1, width, intermediate_height); + aom_convolve8_vert(temp_start_vert, MAX_SB_SIZE, comp_pred, width, NULL, -1, + kernel_y, 16, width, height); + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void highbd_compute_dist_wtd_comp_avg(__m128i *p0, __m128i *p1, + const __m128i *w0, + const __m128i *w1, + const __m128i *r, + void *const result) { + assert(DIST_PRECISION_BITS <= 4); + __m128i mult0 = _mm_mullo_epi16(*p0, *w0); + __m128i mult1 = _mm_mullo_epi16(*p1, *w1); + __m128i sum = _mm_adds_epu16(mult0, mult1); + __m128i round = _mm_adds_epu16(sum, *r); + __m128i shift = _mm_srli_epi16(round, DIST_PRECISION_BITS); + + xx_storeu_128(result, shift); +} + +void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, + const struct AV1Common *const cm, + int mi_row, int mi_col, const MV *const mv, + uint8_t *comp_pred8, int width, int height, + int subpel_x_q3, int subpel_y_q3, + const uint8_t *ref8, int ref_stride, int bd, + int subpel_search) { + // expect xd == NULL only in tests + if (xd != NULL) { + const MB_MODE_INFO *mi = xd->mi[0]; + const int ref_num = 0; + const int is_intrabc = is_intrabc_block(mi); + const struct scale_factors *const sf = + is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num]; + const int is_scaled = av1_is_scaled(sf); + + if (is_scaled) { + int plane = 0; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const struct buf_2d *const dst_buf = &pd->dst; + const struct buf_2d *const pre_buf = + is_intrabc ? dst_buf : &pd->pre[ref_num]; + + InterPredParams inter_pred_params; + inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); + const int_interpfilters filters = + av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + av1_init_inter_params( + &inter_pred_params, width, height, mi_y >> pd->subsampling_y, + mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y, + xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters); + av1_enc_build_one_inter_predictor(comp_pred8, width, mv, + &inter_pred_params); + return; + } + } + + const InterpFilterParams *filter = av1_get_filter(subpel_search); + int filter_taps = (subpel_search <= USE_4_TAPS) ? 
4 : SUBPEL_TAPS; + if (!subpel_x_q3 && !subpel_y_q3) { + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + if (width >= 8) { + int i; + assert(!(width & 7)); + /*Read 8 pixels one row at a time.*/ + for (i = 0; i < height; i++) { + int j; + for (j = 0; j < width; j += 8) { + __m128i s0 = _mm_loadu_si128((const __m128i *)ref); + _mm_storeu_si128((__m128i *)comp_pred, s0); + comp_pred += 8; + ref += 8; + } + ref += ref_stride - width; + } + } else { + int i; + assert(!(width & 3)); + /*Read 4 pixels two rows at a time.*/ + for (i = 0; i < height; i += 2) { + __m128i s0 = _mm_loadl_epi64((const __m128i *)ref); + __m128i s1 = _mm_loadl_epi64((const __m128i *)(ref + ref_stride)); + __m128i t0 = _mm_unpacklo_epi64(s0, s1); + _mm_storeu_si128((__m128i *)comp_pred, t0); + comp_pred += 8; + ref += 2 * ref_stride; + } + } + } else if (!subpel_y_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + aom_highbd_convolve8_horiz(ref8, ref_stride, comp_pred8, width, kernel, 16, + NULL, -1, width, height, bd); + } else if (!subpel_x_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + aom_highbd_convolve8_vert(ref8, ref_stride, comp_pred8, width, NULL, -1, + kernel, 16, width, height, bd); + } else { + DECLARE_ALIGNED(16, uint16_t, + temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]); + const int16_t *const kernel_x = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + const int16_t *const kernel_y = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + const uint8_t *ref_start = ref8 - ref_stride * ((filter_taps >> 1) - 1); + uint16_t *temp_start_horiz = (subpel_search <= USE_4_TAPS) + ? temp + (filter_taps >> 1) * MAX_SB_SIZE + : temp; + uint16_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1); + const int intermediate_height = + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps; + assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); + aom_highbd_convolve8_horiz( + ref_start, ref_stride, CONVERT_TO_BYTEPTR(temp_start_horiz), + MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, intermediate_height, bd); + aom_highbd_convolve8_vert(CONVERT_TO_BYTEPTR(temp_start_vert), MAX_SB_SIZE, + comp_pred8, width, NULL, -1, kernel_y, 16, width, + height, bd); + } +} + +void aom_highbd_comp_avg_upsampled_pred_sse2( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, int bd, int subpel_search) { + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, + height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, + bd, subpel_search); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8); + /*The total number of pixels must be a multiple of 8 (e.g., 4x4).*/ + assert(!(width * height & 7)); + int n = width * height >> 3; + for (int i = 0; i < n; i++) { + __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred16); + __m128i p0 = _mm_loadu_si128((const __m128i *)pred); + _mm_storeu_si128((__m128i *)comp_pred16, _mm_avg_epu16(s0, p0)); + comp_pred16 += 8; + pred += 8; + } +} + +void aom_highbd_dist_wtd_comp_avg_upsampled_pred_sse2( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + 
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param, + int subpel_search) { + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + int n; + int i; + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, + height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, + bd, subpel_search); + assert(!(width * height & 7)); + n = width * height >> 3; + + const int16_t wt0 = (int16_t)jcp_param->fwd_offset; + const int16_t wt1 = (int16_t)jcp_param->bck_offset; + const __m128i w0 = _mm_set1_epi16(wt0); + const __m128i w1 = _mm_set1_epi16(wt1); + const int16_t round = (int16_t)((1 << DIST_PRECISION_BITS) >> 1); + const __m128i r = _mm_set1_epi16(round); + + uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8); + for (i = 0; i < n; i++) { + __m128i p0 = xx_loadu_128(comp_pred16); + __m128i p1 = xx_loadu_128(pred); + + highbd_compute_dist_wtd_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred16); + + comp_pred16 += 8; + pred += 8; + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +void aom_comp_avg_upsampled_pred_sse2( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, int subpel_search) { + int n; + int i; + aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search); + /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/ + assert(!(width * height & 15)); + n = width * height >> 4; + for (i = 0; i < n; i++) { + __m128i s0 = xx_loadu_128(comp_pred); + __m128i p0 = xx_loadu_128(pred); + xx_storeu_128(comp_pred, _mm_avg_epu8(s0, p0)); + comp_pred += 16; + pred += 16; + } +} diff --git a/third_party/aom/av1/encoder/x86/reconinter_enc_ssse3.c b/third_party/aom/av1/encoder/x86/reconinter_enc_ssse3.c new file mode 100644 index 0000000000..df7aa95855 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/reconinter_enc_ssse3.c @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include <emmintrin.h> // SSE2 +#include <tmmintrin.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/x86/synonyms.h" + +static INLINE void compute_dist_wtd_avg(__m128i *p0, __m128i *p1, + const __m128i *w, const __m128i *r, + void *const result) { + __m128i p_lo = _mm_unpacklo_epi8(*p0, *p1); + __m128i mult_lo = _mm_maddubs_epi16(p_lo, *w); + __m128i round_lo = _mm_add_epi16(mult_lo, *r); + __m128i shift_lo = _mm_srai_epi16(round_lo, DIST_PRECISION_BITS); + + __m128i p_hi = _mm_unpackhi_epi8(*p0, *p1); + __m128i mult_hi = _mm_maddubs_epi16(p_hi, *w); + __m128i round_hi = _mm_add_epi16(mult_hi, *r); + __m128i shift_hi = _mm_srai_epi16(round_hi, DIST_PRECISION_BITS); + + xx_storeu_128(result, _mm_packus_epi16(shift_lo, shift_hi)); +} + +void aom_dist_wtd_comp_avg_upsampled_pred_ssse3( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) { + int n; + int i; + aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search); + /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/ + assert(!(width * height & 15)); + n = width * height >> 4; + + const int8_t w0 = (int8_t)jcp_param->fwd_offset; + const int8_t w1 = (int8_t)jcp_param->bck_offset; + const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, + w1, w0, w1, w0); + const int16_t round = (int16_t)((1 << DIST_PRECISION_BITS) >> 1); + const __m128i r = _mm_set1_epi16(round); + + for (i = 0; i < n; i++) { + __m128i p0 = xx_loadu_128(comp_pred); + __m128i p1 = xx_loadu_128(pred); + + compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred); + + comp_pred += 16; + pred += 16; + } +} diff --git a/third_party/aom/av1/encoder/x86/temporal_filter_avx2.c b/third_party/aom/av1/encoder/x86/temporal_filter_avx2.c new file mode 100644 index 0000000000..752d6f3f0b --- /dev/null +++ b/third_party/aom/av1/encoder/x86/temporal_filter_avx2.c @@ -0,0 +1,647 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include <immintrin.h> + +#include "config/av1_rtcd.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/temporal_filter.h" + +#define SSE_STRIDE (BW + 2) + +DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask[4][8]) = { + { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0, 0 }, + { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0 }, + { 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0 }, + { 0, 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF } +}; + +DECLARE_ALIGNED(32, static const uint8_t, shufflemask_16b[2][16]) = { + { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 10, 11, 10, 11 } +}; + +#define CALC_X_GRADIENT(AC, GI, DF, out) \ + out = _mm256_abs_epi16( \ + _mm256_add_epi16(_mm256_add_epi16(AC, GI), _mm256_slli_epi16(DF, 1))); + +#define CALC_Y_GRADIENT(AC, GI, BH, out) \ + out = _mm256_abs_epi16( \ + _mm256_add_epi16(_mm256_sub_epi16(AC, GI), _mm256_slli_epi16(BH, 1))); + +double av1_estimate_noise_from_single_plane_avx2(const uint8_t *src, int height, + int width, int stride, + int edge_thresh) { + int count = 0; + int64_t accum = 0; + // w32 stores width multiple of 32. + const int w32 = (width - 1) & ~0x1f; + const __m256i zero = _mm256_setzero_si256(); + const __m256i edge_threshold = _mm256_set1_epi16(edge_thresh); + __m256i num_accumulator = zero; + __m256i sum_accumulator = zero; + + // A | B | C + // D | E | F + // G | H | I + // g_x = (A - C) + (G - I) + 2*(D - F) + // g_y = (A + C) - (G + I) + 2*(B - H) + // v = 4*E - 2*(D+F+B+H) + (A+C+G+I) + + // Process the width multiple of 32 here. + for (int w = 1; w < w32; w += 32) { + int h = 1; + const int start_idx = h * stride + w; + const int stride_0 = start_idx - stride; + + __m256i num_accum_row_lvl = zero; + const __m256i A = _mm256_loadu_si256((__m256i *)(&src[stride_0 - 1])); + const __m256i C = _mm256_loadu_si256((__m256i *)(&src[stride_0 + 1])); + const __m256i D = _mm256_loadu_si256((__m256i *)(&src[start_idx - 1])); + const __m256i F = _mm256_loadu_si256((__m256i *)(&src[start_idx + 1])); + __m256i B = _mm256_loadu_si256((__m256i *)(&src[stride_0])); + __m256i E = _mm256_loadu_si256((__m256i *)(&src[start_idx])); + + const __m256i A_lo = _mm256_unpacklo_epi8(A, zero); + const __m256i A_hi = _mm256_unpackhi_epi8(A, zero); + const __m256i C_lo = _mm256_unpacklo_epi8(C, zero); + const __m256i C_hi = _mm256_unpackhi_epi8(C, zero); + const __m256i D_lo = _mm256_unpacklo_epi8(D, zero); + const __m256i D_hi = _mm256_unpackhi_epi8(D, zero); + const __m256i F_lo = _mm256_unpacklo_epi8(F, zero); + const __m256i F_hi = _mm256_unpackhi_epi8(F, zero); + + __m256i sub_AC_lo = _mm256_sub_epi16(A_lo, C_lo); + __m256i sub_AC_hi = _mm256_sub_epi16(A_hi, C_hi); + __m256i sum_AC_lo = _mm256_add_epi16(A_lo, C_lo); + __m256i sum_AC_hi = _mm256_add_epi16(A_hi, C_hi); + __m256i sub_DF_lo = _mm256_sub_epi16(D_lo, F_lo); + __m256i sub_DF_hi = _mm256_sub_epi16(D_hi, F_hi); + __m256i sum_DF_lo = _mm256_add_epi16(D_lo, F_lo); + __m256i sum_DF_hi = _mm256_add_epi16(D_hi, F_hi); + + for (; h < height - 1; h++) { + __m256i sum_GI_lo, sub_GI_lo, sum_GI_hi, sub_GI_hi, gx_lo, gy_lo, gx_hi, + gy_hi; + const int k = h * stride + w; + const __m256i G = _mm256_loadu_si256((__m256i *)(&src[k + stride - 1])); + const __m256i H = _mm256_loadu_si256((__m256i *)(&src[k + stride])); + const __m256i I = _mm256_loadu_si256((__m256i *)(&src[k + stride + 1])); + + const __m256i B_lo = _mm256_unpacklo_epi8(B, zero); + const __m256i 
B_hi = _mm256_unpackhi_epi8(B, zero); + const __m256i G_lo = _mm256_unpacklo_epi8(G, zero); + const __m256i G_hi = _mm256_unpackhi_epi8(G, zero); + const __m256i I_lo = _mm256_unpacklo_epi8(I, zero); + const __m256i I_hi = _mm256_unpackhi_epi8(I, zero); + const __m256i H_lo = _mm256_unpacklo_epi8(H, zero); + const __m256i H_hi = _mm256_unpackhi_epi8(H, zero); + + sub_GI_lo = _mm256_sub_epi16(G_lo, I_lo); + sub_GI_hi = _mm256_sub_epi16(G_hi, I_hi); + sum_GI_lo = _mm256_add_epi16(G_lo, I_lo); + sum_GI_hi = _mm256_add_epi16(G_hi, I_hi); + const __m256i sub_BH_lo = _mm256_sub_epi16(B_lo, H_lo); + const __m256i sub_BH_hi = _mm256_sub_epi16(B_hi, H_hi); + + CALC_X_GRADIENT(sub_AC_lo, sub_GI_lo, sub_DF_lo, gx_lo) + CALC_Y_GRADIENT(sum_AC_lo, sum_GI_lo, sub_BH_lo, gy_lo) + + const __m256i ga_lo = _mm256_add_epi16(gx_lo, gy_lo); + + CALC_X_GRADIENT(sub_AC_hi, sub_GI_hi, sub_DF_hi, gx_hi) + CALC_Y_GRADIENT(sum_AC_hi, sum_GI_hi, sub_BH_hi, gy_hi) + + const __m256i ga_hi = _mm256_add_epi16(gx_hi, gy_hi); + + __m256i cmp_lo = _mm256_cmpgt_epi16(edge_threshold, ga_lo); + __m256i cmp_hi = _mm256_cmpgt_epi16(edge_threshold, ga_hi); + const __m256i comp_reg = _mm256_add_epi16(cmp_lo, cmp_hi); + + // v = 4*E -2*(D+F+B+H) + (A+C+G+I) + if (_mm256_movemask_epi8(comp_reg) != 0) { + const __m256i sum_BH_lo = _mm256_add_epi16(B_lo, H_lo); + const __m256i sum_BH_hi = _mm256_add_epi16(B_hi, H_hi); + + // 2*(D+F+B+H) + const __m256i sum_DFBH_lo = + _mm256_slli_epi16(_mm256_add_epi16(sum_DF_lo, sum_BH_lo), 1); + // (A+C+G+I) + const __m256i sum_ACGI_lo = _mm256_add_epi16(sum_AC_lo, sum_GI_lo); + const __m256i sum_DFBH_hi = + _mm256_slli_epi16(_mm256_add_epi16(sum_DF_hi, sum_BH_hi), 1); + const __m256i sum_ACGI_hi = _mm256_add_epi16(sum_AC_hi, sum_GI_hi); + + // Convert E register values from 8bit to 16bit + const __m256i E_lo = _mm256_unpacklo_epi8(E, zero); + const __m256i E_hi = _mm256_unpackhi_epi8(E, zero); + + // 4*E - 2*(D+F+B+H)+ (A+C+G+I) + const __m256i var_lo_0 = _mm256_abs_epi16(_mm256_add_epi16( + _mm256_sub_epi16(_mm256_slli_epi16(E_lo, 2), sum_DFBH_lo), + sum_ACGI_lo)); + const __m256i var_hi_0 = _mm256_abs_epi16(_mm256_add_epi16( + _mm256_sub_epi16(_mm256_slli_epi16(E_hi, 2), sum_DFBH_hi), + sum_ACGI_hi)); + cmp_lo = _mm256_srli_epi16(cmp_lo, 15); + cmp_hi = _mm256_srli_epi16(cmp_hi, 15); + const __m256i var_lo = _mm256_mullo_epi16(var_lo_0, cmp_lo); + const __m256i var_hi = _mm256_mullo_epi16(var_hi_0, cmp_hi); + + num_accum_row_lvl = _mm256_add_epi16(num_accum_row_lvl, cmp_lo); + num_accum_row_lvl = _mm256_add_epi16(num_accum_row_lvl, cmp_hi); + + sum_accumulator = _mm256_add_epi32(sum_accumulator, + _mm256_unpacklo_epi16(var_lo, zero)); + sum_accumulator = _mm256_add_epi32(sum_accumulator, + _mm256_unpackhi_epi16(var_lo, zero)); + sum_accumulator = _mm256_add_epi32(sum_accumulator, + _mm256_unpacklo_epi16(var_hi, zero)); + sum_accumulator = _mm256_add_epi32(sum_accumulator, + _mm256_unpackhi_epi16(var_hi, zero)); + } + sub_AC_lo = sub_DF_lo; + sub_AC_hi = sub_DF_hi; + sub_DF_lo = sub_GI_lo; + sub_DF_hi = sub_GI_hi; + sum_AC_lo = sum_DF_lo; + sum_AC_hi = sum_DF_hi; + sum_DF_lo = sum_GI_lo; + sum_DF_hi = sum_GI_hi; + B = E; + E = H; + } + const __m256i num_0 = _mm256_unpacklo_epi16(num_accum_row_lvl, zero); + const __m256i num_1 = _mm256_unpackhi_epi16(num_accum_row_lvl, zero); + num_accumulator = + _mm256_add_epi32(num_accumulator, _mm256_add_epi32(num_0, num_1)); + } + + // Process the remaining width here. 
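+ // Columns [w32 + 1, width - 1) did not fill a 32-lane vector, so the
+ // scalar loop below applies the same 3x3 Sobel masks as the vector path,
+ // g_x = [1 0 -1; 2 0 -2; 1 0 -1] and g_y = [1 2 1; 0 0 0; -1 -2 -1],
+ // and accumulates the absolute Laplacian only for pixels whose gradient
+ // magnitude ga falls below edge_thresh (i.e. smooth, non-edge pixels).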
+ for (int h = 1; h < height - 1; ++h) { + for (int w = w32 + 1; w < width - 1; ++w) { + const int k = h * stride + w; + + // Compute sobel gradients + const int g_x = (src[k - stride - 1] - src[k - stride + 1]) + + (src[k + stride - 1] - src[k + stride + 1]) + + 2 * (src[k - 1] - src[k + 1]); + const int g_y = (src[k - stride - 1] - src[k + stride - 1]) + + (src[k - stride + 1] - src[k + stride + 1]) + + 2 * (src[k - stride] - src[k + stride]); + const int ga = abs(g_x) + abs(g_y); + + if (ga < edge_thresh) { + // Find Laplacian + const int v = + 4 * src[k] - + 2 * (src[k - 1] + src[k + 1] + src[k - stride] + src[k + stride]) + + (src[k - stride - 1] + src[k - stride + 1] + src[k + stride - 1] + + src[k + stride + 1]); + accum += abs(v); + ++count; + } + } + } + + // s0 s1 n0 n1 s2 s3 n2 n3 + __m256i sum_avx = _mm256_hadd_epi32(sum_accumulator, num_accumulator); + __m128i sum_avx_lo = _mm256_castsi256_si128(sum_avx); + __m128i sum_avx_hi = _mm256_extractf128_si256(sum_avx, 1); + // s0+s2 s1+s3 n0+n2 n1+n3 + __m128i sum_avx_1 = _mm_add_epi32(sum_avx_lo, sum_avx_hi); + // s0+s2+s1+s3 n0+n2+n1+n3 + __m128i result = _mm_add_epi32(_mm_srli_si128(sum_avx_1, 4), sum_avx_1); + + accum += _mm_cvtsi128_si32(result); + count += _mm_extract_epi32(result, 2); + + // If very few smooth pels, return -1 since the estimate is unreliable. + return (count < 16) ? -1.0 : (double)accum / (6 * count) * SQRT_PI_BY_2; +} + +static AOM_FORCE_INLINE void get_squared_error_16x16_avx2( + const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2, + const unsigned int stride2, const int block_width, const int block_height, + uint16_t *frame_sse, const unsigned int sse_stride) { + (void)block_width; + const uint8_t *src1 = frame1; + const uint8_t *src2 = frame2; + uint16_t *dst = frame_sse; + for (int i = 0; i < block_height; i++) { + __m128i vf1_128, vf2_128; + __m256i vf1, vf2, vdiff1, vsqdiff1; + + vf1_128 = _mm_loadu_si128((__m128i *)(src1)); + vf2_128 = _mm_loadu_si128((__m128i *)(src2)); + vf1 = _mm256_cvtepu8_epi16(vf1_128); + vf2 = _mm256_cvtepu8_epi16(vf2_128); + vdiff1 = _mm256_sub_epi16(vf1, vf2); + vsqdiff1 = _mm256_mullo_epi16(vdiff1, vdiff1); + + _mm256_storeu_si256((__m256i *)(dst), vsqdiff1); + // Set zero to uninitialized memory to avoid uninitialized loads later + *(int *)(dst + 16) = _mm_cvtsi128_si32(_mm_setzero_si128()); + + src1 += stride, src2 += stride2; + dst += sse_stride; + } +} + +static AOM_FORCE_INLINE void get_squared_error_32x32_avx2( + const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2, + const unsigned int stride2, const int block_width, const int block_height, + uint16_t *frame_sse, const unsigned int sse_stride) { + (void)block_width; + const uint8_t *src1 = frame1; + const uint8_t *src2 = frame2; + uint16_t *dst = frame_sse; + for (int i = 0; i < block_height; i++) { + __m256i vsrc1, vsrc2, vmin, vmax, vdiff, vdiff1, vdiff2, vres1, vres2; + + vsrc1 = _mm256_loadu_si256((__m256i *)src1); + vsrc2 = _mm256_loadu_si256((__m256i *)src2); + vmax = _mm256_max_epu8(vsrc1, vsrc2); + vmin = _mm256_min_epu8(vsrc1, vsrc2); + vdiff = _mm256_subs_epu8(vmax, vmin); + + __m128i vtmp1 = _mm256_castsi256_si128(vdiff); + __m128i vtmp2 = _mm256_extracti128_si256(vdiff, 1); + vdiff1 = _mm256_cvtepu8_epi16(vtmp1); + vdiff2 = _mm256_cvtepu8_epi16(vtmp2); + + vres1 = _mm256_mullo_epi16(vdiff1, vdiff1); + vres2 = _mm256_mullo_epi16(vdiff2, vdiff2); + _mm256_storeu_si256((__m256i *)(dst), vres1); + _mm256_storeu_si256((__m256i *)(dst + 16), vres2); + // Set zero to 
uninitialized memory to avoid uninitialized loads later + *(int *)(dst + 32) = _mm_cvtsi128_si32(_mm_setzero_si128()); + + src1 += stride; + src2 += stride2; + dst += sse_stride; + } +} + +static AOM_FORCE_INLINE __m256i xx_load_and_pad(uint16_t *src, int col, + int block_width) { + __m128i v128tmp = _mm_loadu_si128((__m128i *)(src)); + if (col == 0) { + // For the first column, replicate the first element twice to the left + v128tmp = _mm_shuffle_epi8(v128tmp, *(__m128i *)shufflemask_16b[0]); + } + if (col == block_width - 4) { + // For the last column, replicate the last element twice to the right + v128tmp = _mm_shuffle_epi8(v128tmp, *(__m128i *)shufflemask_16b[1]); + } + return _mm256_cvtepu16_epi32(v128tmp); +} + +static AOM_FORCE_INLINE int32_t xx_mask_and_hadd(__m256i vsum, int i) { + // Mask the required 5 values inside the vector + __m256i vtmp = _mm256_and_si256(vsum, *(__m256i *)sse_bytemask[i]); + __m128i v128a, v128b; + // Extract 256b as two 128b registers A and B + v128a = _mm256_castsi256_si128(vtmp); + v128b = _mm256_extracti128_si256(vtmp, 1); + // A = [A0+B0, A1+B1, A2+B2, A3+B3] + v128a = _mm_add_epi32(v128a, v128b); + // B = [A2+B2, A3+B3, 0, 0] + v128b = _mm_srli_si128(v128a, 8); + // A = [A0+B0+A2+B2, A1+B1+A3+B3, X, X] + v128a = _mm_add_epi32(v128a, v128b); + // B = [A1+B1+A3+B3, 0, 0, 0] + v128b = _mm_srli_si128(v128a, 4); + // A = [A0+B0+A2+B2+A1+B1+A3+B3, X, X, X] + v128a = _mm_add_epi32(v128a, v128b); + return _mm_extract_epi32(v128a, 0); +} + +// AVX2 implementation of approx_exp() +static AOM_INLINE __m256 approx_exp_avx2(__m256 y) { +#define A ((1 << 23) / 0.69314718056f) // (1 << 23) / ln(2) +#define B \ + 127 // Offset for the exponent according to IEEE floating point standard. +#define C 60801 // Magic number controls the accuracy of approximation + const __m256 multiplier = _mm256_set1_ps(A); + const __m256i offset = _mm256_set1_epi32(B * (1 << 23) - C); + + y = _mm256_mul_ps(y, multiplier); + y = _mm256_castsi256_ps(_mm256_add_epi32(_mm256_cvttps_epi32(y), offset)); + return y; +#undef A +#undef B +#undef C +} + +static void apply_temporal_filter( + const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2, + const unsigned int stride2, const int block_width, const int block_height, + const int *subblock_mses, unsigned int *accumulator, uint16_t *count, + uint16_t *frame_sse, uint32_t *luma_sse_sum, + const double inv_num_ref_pixels, const double decay_factor, + const double inv_factor, const double weight_factor, double *d_factor, + int tf_wgt_calc_lvl) { + assert(((block_width == 16) || (block_width == 32)) && + ((block_height == 16) || (block_height == 32))); + + uint32_t acc_5x5_sse[BH][BW]; + + if (block_width == 32) { + get_squared_error_32x32_avx2(frame1, stride, frame2, stride2, block_width, + block_height, frame_sse, SSE_STRIDE); + } else { + get_squared_error_16x16_avx2(frame1, stride, frame2, stride2, block_width, + block_height, frame_sse, SSE_STRIDE); + } + + __m256i vsrc[5]; + + // Traverse 4 columns at a time + // First and last columns will require padding + for (int col = 0; col < block_width; col += 4) { + uint16_t *src = (col) ? 
frame_sse + col - 2 : frame_sse; + + // Load and pad(for first and last col) 3 rows from the top + for (int i = 2; i < 5; i++) { + vsrc[i] = xx_load_and_pad(src, col, block_width); + src += SSE_STRIDE; + } + + // Copy first row to first 2 vectors + vsrc[0] = vsrc[2]; + vsrc[1] = vsrc[2]; + + for (int row = 0; row < block_height; row++) { + __m256i vsum = _mm256_setzero_si256(); + + // Add 5 consecutive rows + for (int i = 0; i < 5; i++) { + vsum = _mm256_add_epi32(vsum, vsrc[i]); + } + + // Push all elements by one element to the top + for (int i = 0; i < 4; i++) { + vsrc[i] = vsrc[i + 1]; + } + + // Load next row to the last element + if (row <= block_height - 4) { + vsrc[4] = xx_load_and_pad(src, col, block_width); + src += SSE_STRIDE; + } else { + vsrc[4] = vsrc[3]; + } + + // Accumulate the sum horizontally + for (int i = 0; i < 4; i++) { + acc_5x5_sse[row][col + i] = xx_mask_and_hadd(vsum, i); + } + } + } + + double subblock_mses_scaled[4]; + double d_factor_decayed[4]; + for (int idx = 0; idx < 4; idx++) { + subblock_mses_scaled[idx] = subblock_mses[idx] * inv_factor; + d_factor_decayed[idx] = d_factor[idx] * decay_factor; + } + if (tf_wgt_calc_lvl == 0) { + for (int i = 0, k = 0; i < block_height; i++) { + const int y_blk_raster_offset = (i >= block_height / 2) * 2; + for (int j = 0; j < block_width; j++, k++) { + const int pixel_value = frame2[i * stride2 + j]; + uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j]; + + const double window_error = diff_sse * inv_num_ref_pixels; + const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2); + const double combined_error = + weight_factor * window_error + subblock_mses_scaled[subblock_idx]; + + double scaled_error = combined_error * d_factor_decayed[subblock_idx]; + scaled_error = AOMMIN(scaled_error, 7); + const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE); + + count[k] += weight; + accumulator[k] += weight * pixel_value; + } + } + } else { + __m256d subblock_mses_reg[4]; + __m256d d_factor_mul_n_decay_qr_invs[4]; + const __m256 zero = _mm256_set1_ps(0.0f); + const __m256 point_five = _mm256_set1_ps(0.5f); + const __m256 seven = _mm256_set1_ps(7.0f); + const __m256d inv_num_ref_pixel_256bit = _mm256_set1_pd(inv_num_ref_pixels); + const __m256d weight_factor_256bit = _mm256_set1_pd(weight_factor); + const __m256 tf_weight_scale = _mm256_set1_ps((float)TF_WEIGHT_SCALE); + // Maintain registers to hold mse and d_factor at subblock level. 
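+ // These per-subblock terms are constant across the whole 32x32 block
+ // (they vary only with the 2x2 subblock index), so broadcast them into
+ // registers once here instead of reloading them for every 8-pixel group
+ // in the loop below.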
+ subblock_mses_reg[0] = _mm256_set1_pd(subblock_mses_scaled[0]); + subblock_mses_reg[1] = _mm256_set1_pd(subblock_mses_scaled[1]); + subblock_mses_reg[2] = _mm256_set1_pd(subblock_mses_scaled[2]); + subblock_mses_reg[3] = _mm256_set1_pd(subblock_mses_scaled[3]); + d_factor_mul_n_decay_qr_invs[0] = _mm256_set1_pd(d_factor_decayed[0]); + d_factor_mul_n_decay_qr_invs[1] = _mm256_set1_pd(d_factor_decayed[1]); + d_factor_mul_n_decay_qr_invs[2] = _mm256_set1_pd(d_factor_decayed[2]); + d_factor_mul_n_decay_qr_invs[3] = _mm256_set1_pd(d_factor_decayed[3]); + + for (int i = 0; i < block_height; i++) { + const int y_blk_raster_offset = (i >= block_height / 2) * 2; + uint32_t *luma_sse_sum_temp = luma_sse_sum + i * BW; + for (int j = 0; j < block_width; j += 8) { + const __m256i acc_sse = + _mm256_lddqu_si256((__m256i *)(acc_5x5_sse[i] + j)); + const __m256i luma_sse = + _mm256_lddqu_si256((__m256i *)((luma_sse_sum_temp + j))); + + // uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j]; + const __m256i diff_sse = _mm256_add_epi32(acc_sse, luma_sse); + + const __m256d diff_sse_pd_1 = + _mm256_cvtepi32_pd(_mm256_castsi256_si128(diff_sse)); + const __m256d diff_sse_pd_2 = + _mm256_cvtepi32_pd(_mm256_extracti128_si256(diff_sse, 1)); + + // const double window_error = diff_sse * inv_num_ref_pixels; + const __m256d window_error_1 = + _mm256_mul_pd(diff_sse_pd_1, inv_num_ref_pixel_256bit); + const __m256d window_error_2 = + _mm256_mul_pd(diff_sse_pd_2, inv_num_ref_pixel_256bit); + + // const int subblock_idx = y_blk_raster_offset + (j >= block_width / + // 2); + const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2); + const __m256d blk_error = subblock_mses_reg[subblock_idx]; + + // const double combined_error = + // weight_factor *window_error + subblock_mses_scaled[subblock_idx]; + const __m256d combined_error_1 = _mm256_add_pd( + _mm256_mul_pd(window_error_1, weight_factor_256bit), blk_error); + + const __m256d combined_error_2 = _mm256_add_pd( + _mm256_mul_pd(window_error_2, weight_factor_256bit), blk_error); + + // d_factor_decayed[subblock_idx] + const __m256d d_fact_mul_n_decay = + d_factor_mul_n_decay_qr_invs[subblock_idx]; + + // double scaled_error = combined_error * + // d_factor_decayed[subblock_idx]; + const __m256d scaled_error_1 = + _mm256_mul_pd(combined_error_1, d_fact_mul_n_decay); + const __m256d scaled_error_2 = + _mm256_mul_pd(combined_error_2, d_fact_mul_n_decay); + + const __m128 scaled_error_ps_1 = _mm256_cvtpd_ps(scaled_error_1); + const __m128 scaled_error_ps_2 = _mm256_cvtpd_ps(scaled_error_2); + + const __m256 scaled_error_ps = _mm256_insertf128_ps( + _mm256_castps128_ps256(scaled_error_ps_1), scaled_error_ps_2, 0x1); + + // scaled_error = AOMMIN(scaled_error, 7); + const __m256 scaled_diff_ps = _mm256_min_ps(scaled_error_ps, seven); + const __m256 minus_scaled_diff_ps = _mm256_sub_ps(zero, scaled_diff_ps); + // const int weight = + //(int)(approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE + 0.5f); + const __m256 exp_result = approx_exp_avx2(minus_scaled_diff_ps); + const __m256 scale_weight_exp_result = + _mm256_mul_ps(exp_result, tf_weight_scale); + const __m256 round_result = + _mm256_add_ps(scale_weight_exp_result, point_five); + __m256i weights_in_32bit = _mm256_cvttps_epi32(round_result); + + __m128i weights_in_16bit = + _mm_packus_epi32(_mm256_castsi256_si128(weights_in_32bit), + _mm256_extractf128_si256(weights_in_32bit, 0x1)); + + // count[k] += weight; + // accumulator[k] += weight * pixel_value; + const int stride_idx = i * stride2 + j; + 
const __m128i count_array = + _mm_loadu_si128((__m128i *)(count + stride_idx)); + _mm_storeu_si128((__m128i *)(count + stride_idx), + _mm_add_epi16(count_array, weights_in_16bit)); + + const __m256i accumulator_array = + _mm256_loadu_si256((__m256i *)(accumulator + stride_idx)); + const __m128i pred_values = + _mm_loadl_epi64((__m128i *)(frame2 + stride_idx)); + + const __m256i pred_values_u32 = _mm256_cvtepu8_epi32(pred_values); + const __m256i mull_frame2_weight_u32 = + _mm256_mullo_epi32(pred_values_u32, weights_in_32bit); + _mm256_storeu_si256( + (__m256i *)(accumulator + stride_idx), + _mm256_add_epi32(accumulator_array, mull_frame2_weight_u32)); + } + } + } +} + +void av1_apply_temporal_filter_avx2( + const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd, + const BLOCK_SIZE block_size, const int mb_row, const int mb_col, + const int num_planes, const double *noise_levels, const MV *subblock_mvs, + const int *subblock_mses, const int q_factor, const int filter_strength, + int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, + uint16_t *count) { + const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH; + assert(block_size == BLOCK_32X32 && "Only support 32x32 block with avx2!"); + assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with avx2!"); + assert(!is_high_bitdepth && "Only support low bit-depth with avx2!"); + assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE); + (void)is_high_bitdepth; + + const int mb_height = block_size_high[block_size]; + const int mb_width = block_size_wide[block_size]; + const int frame_height = frame_to_filter->y_crop_height; + const int frame_width = frame_to_filter->y_crop_width; + const int min_frame_size = AOMMIN(frame_height, frame_width); + // Variables to simplify combined error calculation. + const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) * + TF_SEARCH_ERROR_NORM_WEIGHT); + const double weight_factor = + (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor; + // Adjust filtering based on q. + // Larger q -> stronger filtering -> larger weight. + // Smaller q -> weaker filtering -> smaller weight. + double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2); + q_decay = CLIP(q_decay, 1e-5, 1); + if (q_factor >= TF_QINDEX_CUTOFF) { + // Max q_factor is 255, therefore the upper bound of q_decay is 8. + // We do not need a clip here. + q_decay = 0.5 * pow((double)q_factor / 64, 2); + } + // Smaller strength -> smaller filtering weight. + double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2); + s_decay = CLIP(s_decay, 1e-5, 1); + double d_factor[4] = { 0 }; + uint16_t frame_sse[SSE_STRIDE * BH] = { 0 }; + uint32_t luma_sse_sum[BW * BH] = { 0 }; + + for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) { + // Larger motion vector -> smaller filtering weight. + const MV mv = subblock_mvs[subblock_idx]; + const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2)); + double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD; + distance_threshold = AOMMAX(distance_threshold, 1); + d_factor[subblock_idx] = distance / distance_threshold; + d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1); + } + + // Handle planes in sequence. + int plane_offset = 0; + for (int plane = 0; plane < num_planes; ++plane) { + const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y; + const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x; + const uint32_t frame_stride = frame_to_filter->strides[plane == 0 ? 
0 : 1]; + const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w; + + const uint8_t *ref = frame_to_filter->buffers[plane] + frame_offset; + const int ss_x_shift = + mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x; + const int ss_y_shift = + mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y; + const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH + + ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0); + const double inv_num_ref_pixels = 1.0 / num_ref_pixels; + // Larger noise -> larger filtering weight. + const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0); + // Decay factors for non-local mean approach. + const double decay_factor = 1 / (n_decay * q_decay * s_decay); + + // Filter U-plane and V-plane using Y-plane. This is because motion + // search is only done on Y-plane, so the information from Y-plane + // will be more accurate. The luma sse sum is reused in both chroma + // planes. + if (plane == AOM_PLANE_U) { + for (unsigned int i = 0, k = 0; i < plane_h; i++) { + for (unsigned int j = 0; j < plane_w; j++, k++) { + for (int ii = 0; ii < (1 << ss_y_shift); ++ii) { + for (int jj = 0; jj < (1 << ss_x_shift); ++jj) { + const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane. + const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane. + luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx]; + } + } + } + } + } + + apply_temporal_filter(ref, frame_stride, pred + plane_offset, plane_w, + plane_w, plane_h, subblock_mses, accum + plane_offset, + count + plane_offset, frame_sse, luma_sse_sum, + inv_num_ref_pixels, decay_factor, inv_factor, + weight_factor, d_factor, tf_wgt_calc_lvl); + plane_offset += plane_h * plane_w; + } +} diff --git a/third_party/aom/av1/encoder/x86/temporal_filter_sse2.c b/third_party/aom/av1/encoder/x86/temporal_filter_sse2.c new file mode 100644 index 0000000000..842d3b13c8 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/temporal_filter_sse2.c @@ -0,0 +1,320 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include <emmintrin.h> + +#include "config/av1_rtcd.h" +#include "aom_dsp/mathutils.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/temporal_filter.h" + +// For the squared error buffer, keep a padding for 4 samples +#define SSE_STRIDE (BW + 4) + +DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask_2x4[4][2][4]) = { + { { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 } }, + { { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 } }, + { { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 } }, + { { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF } } +}; + +static void get_squared_error(const uint8_t *frame1, const unsigned int stride, + const uint8_t *frame2, const unsigned int stride2, + const int block_width, const int block_height, + uint16_t *frame_sse, + const unsigned int dst_stride) { + const uint8_t *src1 = frame1; + const uint8_t *src2 = frame2; + uint16_t *dst = frame_sse; + + for (int i = 0; i < block_height; i++) { + for (int j = 0; j < block_width; j += 16) { + // Set zero to uninitialized memory to avoid uninitialized loads later + *(int *)(dst) = _mm_cvtsi128_si32(_mm_setzero_si128()); + + __m128i vsrc1 = _mm_loadu_si128((__m128i *)(src1 + j)); + __m128i vsrc2 = _mm_loadu_si128((__m128i *)(src2 + j)); + + __m128i vmax = _mm_max_epu8(vsrc1, vsrc2); + __m128i vmin = _mm_min_epu8(vsrc1, vsrc2); + __m128i vdiff = _mm_subs_epu8(vmax, vmin); + + __m128i vzero = _mm_setzero_si128(); + __m128i vdiff1 = _mm_unpacklo_epi8(vdiff, vzero); + __m128i vdiff2 = _mm_unpackhi_epi8(vdiff, vzero); + + __m128i vres1 = _mm_mullo_epi16(vdiff1, vdiff1); + __m128i vres2 = _mm_mullo_epi16(vdiff2, vdiff2); + + _mm_storeu_si128((__m128i *)(dst + j + 2), vres1); + _mm_storeu_si128((__m128i *)(dst + j + 10), vres2); + } + + // Set zero to uninitialized memory to avoid uninitialized loads later + *(int *)(dst + block_width + 2) = _mm_cvtsi128_si32(_mm_setzero_si128()); + + src1 += stride; + src2 += stride2; + dst += dst_stride; + } +} + +static void xx_load_and_pad(uint16_t *src, __m128i *dstvec, int col, + int block_width) { + __m128i vtmp = _mm_loadu_si128((__m128i *)src); + __m128i vzero = _mm_setzero_si128(); + __m128i vtmp1 = _mm_unpacklo_epi16(vtmp, vzero); + __m128i vtmp2 = _mm_unpackhi_epi16(vtmp, vzero); + // For the first column, replicate the first element twice to the left + dstvec[0] = (col) ? vtmp1 : _mm_shuffle_epi32(vtmp1, 0xEA); + // For the last column, replicate the last element twice to the right + dstvec[1] = (col < block_width - 4) ? 
vtmp2 : _mm_shuffle_epi32(vtmp2, 0x54); +} + +static int32_t xx_mask_and_hadd(__m128i vsum1, __m128i vsum2, int i) { + __m128i veca, vecb; + // Mask and obtain the required 5 values inside the vector + veca = _mm_and_si128(vsum1, *(__m128i *)sse_bytemask_2x4[i][0]); + vecb = _mm_and_si128(vsum2, *(__m128i *)sse_bytemask_2x4[i][1]); + // A = [A0+B0, A1+B1, A2+B2, A3+B3] + veca = _mm_add_epi32(veca, vecb); + // B = [A2+B2, A3+B3, 0, 0] + vecb = _mm_srli_si128(veca, 8); + // A = [A0+B0+A2+B2, A1+B1+A3+B3, X, X] + veca = _mm_add_epi32(veca, vecb); + // B = [A1+B1+A3+B3, 0, 0, 0] + vecb = _mm_srli_si128(veca, 4); + // A = [A0+B0+A2+B2+A1+B1+A3+B3, X, X, X] + veca = _mm_add_epi32(veca, vecb); + return _mm_cvtsi128_si32(veca); +} + +static void apply_temporal_filter( + const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2, + const unsigned int stride2, const int block_width, const int block_height, + const int *subblock_mses, unsigned int *accumulator, uint16_t *count, + uint16_t *frame_sse, uint32_t *luma_sse_sum, + const double inv_num_ref_pixels, const double decay_factor, + const double inv_factor, const double weight_factor, double *d_factor, + int tf_wgt_calc_lvl) { + assert(((block_width == 16) || (block_width == 32)) && + ((block_height == 16) || (block_height == 32))); + + uint32_t acc_5x5_sse[BH][BW]; + + get_squared_error(frame1, stride, frame2, stride2, block_width, block_height, + frame_sse, SSE_STRIDE); + + __m128i vsrc[5][2]; + + // Traverse 4 columns at a time + // First and last columns will require padding + for (int col = 0; col < block_width; col += 4) { + uint16_t *src = frame_sse + col; + + // Load and pad(for first and last col) 3 rows from the top + for (int i = 2; i < 5; i++) { + xx_load_and_pad(src, vsrc[i], col, block_width); + src += SSE_STRIDE; + } + + // Padding for top 2 rows + vsrc[0][0] = vsrc[2][0]; + vsrc[0][1] = vsrc[2][1]; + vsrc[1][0] = vsrc[2][0]; + vsrc[1][1] = vsrc[2][1]; + + for (int row = 0; row < block_height; row++) { + __m128i vsum1 = _mm_setzero_si128(); + __m128i vsum2 = _mm_setzero_si128(); + + // Add 5 consecutive rows + for (int i = 0; i < 5; i++) { + vsum1 = _mm_add_epi32(vsrc[i][0], vsum1); + vsum2 = _mm_add_epi32(vsrc[i][1], vsum2); + } + + // Push all elements by one element to the top + for (int i = 0; i < 4; i++) { + vsrc[i][0] = vsrc[i + 1][0]; + vsrc[i][1] = vsrc[i + 1][1]; + } + + if (row <= block_height - 4) { + // Load next row + xx_load_and_pad(src, vsrc[4], col, block_width); + src += SSE_STRIDE; + } else { + // Padding for bottom 2 rows + vsrc[4][0] = vsrc[3][0]; + vsrc[4][1] = vsrc[3][1]; + } + + // Accumulate the sum horizontally + for (int i = 0; i < 4; i++) { + acc_5x5_sse[row][col + i] = xx_mask_and_hadd(vsum1, vsum2, i); + } + } + } + + double subblock_mses_scaled[4]; + double d_factor_decayed[4]; + for (int idx = 0; idx < 4; idx++) { + subblock_mses_scaled[idx] = subblock_mses[idx] * inv_factor; + d_factor_decayed[idx] = d_factor[idx] * decay_factor; + } + if (tf_wgt_calc_lvl == 0) { + for (int i = 0, k = 0; i < block_height; i++) { + const int y_blk_raster_offset = (i >= block_height / 2) * 2; + for (int j = 0; j < block_width; j++, k++) { + const int pixel_value = frame2[i * stride2 + j]; + uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j]; + + const double window_error = diff_sse * inv_num_ref_pixels; + const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2); + const double combined_error = + weight_factor * window_error + subblock_mses_scaled[subblock_idx]; + + double 
scaled_error = combined_error * d_factor_decayed[subblock_idx]; + scaled_error = AOMMIN(scaled_error, 7); + const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE); + + count[k] += weight; + accumulator[k] += weight * pixel_value; + } + } + } else { + for (int i = 0, k = 0; i < block_height; i++) { + const int y_blk_raster_offset = (i >= block_height / 2) * 2; + for (int j = 0; j < block_width; j++, k++) { + const int pixel_value = frame2[i * stride2 + j]; + uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j]; + + const double window_error = diff_sse * inv_num_ref_pixels; + const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2); + const double combined_error = + weight_factor * window_error + subblock_mses_scaled[subblock_idx]; + + double scaled_error = combined_error * d_factor_decayed[subblock_idx]; + scaled_error = AOMMIN(scaled_error, 7); + const float fweight = + approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE; + const int weight = iroundpf(fweight); + count[k] += weight; + accumulator[k] += weight * pixel_value; + } + } + } +} + +void av1_apply_temporal_filter_sse2( + const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd, + const BLOCK_SIZE block_size, const int mb_row, const int mb_col, + const int num_planes, const double *noise_levels, const MV *subblock_mvs, + const int *subblock_mses, const int q_factor, const int filter_strength, + int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, + uint16_t *count) { + const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH; + assert(block_size == BLOCK_32X32 && "Only support 32x32 block with sse2!"); + assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with sse2!"); + assert(!is_high_bitdepth && "Only support low bit-depth with sse2!"); + assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE); + (void)is_high_bitdepth; + + const int mb_height = block_size_high[block_size]; + const int mb_width = block_size_wide[block_size]; + const int frame_height = frame_to_filter->y_crop_height; + const int frame_width = frame_to_filter->y_crop_width; + const int min_frame_size = AOMMIN(frame_height, frame_width); + // Variables to simplify combined error calculation. + const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) * + TF_SEARCH_ERROR_NORM_WEIGHT); + const double weight_factor = + (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor; + // Adjust filtering based on q. + // Larger q -> stronger filtering -> larger weight. + // Smaller q -> weaker filtering -> smaller weight. + double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2); + q_decay = CLIP(q_decay, 1e-5, 1); + if (q_factor >= TF_QINDEX_CUTOFF) { + // Max q_factor is 255, therefore the upper bound of q_decay is 8. + // We do not need a clip here. + q_decay = 0.5 * pow((double)q_factor / 64, 2); + } + // Smaller strength -> smaller filtering weight. + double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2); + s_decay = CLIP(s_decay, 1e-5, 1); + double d_factor[4] = { 0 }; + uint16_t frame_sse[SSE_STRIDE * BH] = { 0 }; + uint32_t luma_sse_sum[BW * BH] = { 0 }; + + for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) { + // Larger motion vector -> smaller filtering weight. 
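+ // d_factor is the subblock motion distance normalized by the frame size
+ // and clamped to >= 1, so it can only shrink the final weight: the weight
+ // decays as exp(-scaled_error), and scaled_error is scaled by d_factor.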
+ const MV mv = subblock_mvs[subblock_idx]; + const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2)); + double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD; + distance_threshold = AOMMAX(distance_threshold, 1); + d_factor[subblock_idx] = distance / distance_threshold; + d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1); + } + + // Handle planes in sequence. + int plane_offset = 0; + for (int plane = 0; plane < num_planes; ++plane) { + const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y; + const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x; + const uint32_t frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1]; + const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w; + + const uint8_t *ref = frame_to_filter->buffers[plane] + frame_offset; + const int ss_x_shift = + mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x; + const int ss_y_shift = + mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y; + const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH + + ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0); + const double inv_num_ref_pixels = 1.0 / num_ref_pixels; + // Larger noise -> larger filtering weight. + const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0); + // Decay factors for non-local mean approach. + const double decay_factor = 1 / (n_decay * q_decay * s_decay); + + // Filter U-plane and V-plane using Y-plane. This is because motion + // search is only done on Y-plane, so the information from Y-plane + // will be more accurate. The luma sse sum is reused in both chroma + // planes. + if (plane == AOM_PLANE_U) { + for (unsigned int i = 0, k = 0; i < plane_h; i++) { + for (unsigned int j = 0; j < plane_w; j++, k++) { + for (int ii = 0; ii < (1 << ss_y_shift); ++ii) { + for (int jj = 0; jj < (1 << ss_x_shift); ++jj) { + const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane. + const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane. + luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx + 2]; + } + } + } + } + } + + apply_temporal_filter(ref, frame_stride, pred + plane_offset, plane_w, + plane_w, plane_h, subblock_mses, accum + plane_offset, + count + plane_offset, frame_sse, luma_sse_sum, + inv_num_ref_pixels, decay_factor, inv_factor, + weight_factor, d_factor, tf_wgt_calc_lvl); + plane_offset += plane_h * plane_w; + } +} diff --git a/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c b/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c new file mode 100644 index 0000000000..9cde860534 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+#include <smmintrin.h>
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom/aom_integer.h"
+
+#include "av1/common/reconinter.h"
+
+#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
+
+/**
+ * See av1_wedge_sse_from_residuals_c
+ */
+uint64_t av1_wedge_sse_from_residuals_avx2(const int16_t *r1, const int16_t *d,
+                                           const uint8_t *m, int N) {
+  int n = -N;
+
+  uint64_t csse;
+
+  const __m256i v_mask_max_w = _mm256_set1_epi16(MAX_MASK_VALUE);
+  const __m256i v_zext_q = yy_set1_64_from_32i(~0);
+
+  __m256i v_acc0_q = _mm256_setzero_si256();
+
+  assert(N % 64 == 0);
+
+  r1 += N;
+  d += N;
+  m += N;
+
+  do {
+    const __m256i v_r0_w = _mm256_lddqu_si256((__m256i *)(r1 + n));
+    const __m256i v_d0_w = _mm256_lddqu_si256((__m256i *)(d + n));
+    const __m128i v_m01_b = _mm_lddqu_si128((__m128i *)(m + n));
+
+    const __m256i v_rd0l_w = _mm256_unpacklo_epi16(v_d0_w, v_r0_w);
+    const __m256i v_rd0h_w = _mm256_unpackhi_epi16(v_d0_w, v_r0_w);
+    const __m256i v_m0_w = _mm256_cvtepu8_epi16(v_m01_b);
+
+    const __m256i v_m0l_w = _mm256_unpacklo_epi16(v_m0_w, v_mask_max_w);
+    const __m256i v_m0h_w = _mm256_unpackhi_epi16(v_m0_w, v_mask_max_w);
+
+    const __m256i v_t0l_d = _mm256_madd_epi16(v_rd0l_w, v_m0l_w);
+    const __m256i v_t0h_d = _mm256_madd_epi16(v_rd0h_w, v_m0h_w);
+
+    const __m256i v_t0_w = _mm256_packs_epi32(v_t0l_d, v_t0h_d);
+
+    const __m256i v_sq0_d = _mm256_madd_epi16(v_t0_w, v_t0_w);
+
+    const __m256i v_sum0_q = _mm256_add_epi64(
+        _mm256_and_si256(v_sq0_d, v_zext_q), _mm256_srli_epi64(v_sq0_d, 32));
+
+    v_acc0_q = _mm256_add_epi64(v_acc0_q, v_sum0_q);
+
+    n += 16;
+  } while (n);
+
+  v_acc0_q = _mm256_add_epi64(v_acc0_q, _mm256_srli_si256(v_acc0_q, 8));
+  __m128i v_acc_q_0 = _mm256_castsi256_si128(v_acc0_q);
+  __m128i v_acc_q_1 = _mm256_extracti128_si256(v_acc0_q, 1);
+  v_acc_q_0 = _mm_add_epi64(v_acc_q_0, v_acc_q_1);
+#if AOM_ARCH_X86_64
+  csse = (uint64_t)_mm_extract_epi64(v_acc_q_0, 0);
+#else
+  xx_storel_64(&csse, v_acc_q_0);
+#endif
+
+  return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
+}
+
+/**
+ * See av1_wedge_sign_from_residuals_c
+ */
+int8_t av1_wedge_sign_from_residuals_avx2(const int16_t *ds, const uint8_t *m,
+                                          int N, int64_t limit) {
+  int64_t acc;
+  __m256i v_acc0_d = _mm256_setzero_si256();
+
+  // Input size limited to 8192 by the use of 32 bit accumulators and m
+  // being between [0, 64]. Overflow might happen at larger sizes,
+  // though it is practically impossible on real video input.
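+  // A worked bound for the assertions below (an illustrative sketch, not in
+  // the upstream file, assuming MAX_MASK_VALUE == 64): each madd product is
+  // at most 2^15 * 2^6 = 2^21, every 32-bit lane of v_p0123_d gathers 8 such
+  // products (<= 2^24) per 64-sample iteration, and N < 8192 allows at most
+  // 127 iterations, so |lane| <= 127 * 2^24 < 2^31 and no lane can wrap.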
+ assert(N < 8192); + assert(N % 64 == 0); + + do { + const __m256i v_m01_b = _mm256_lddqu_si256((__m256i *)(m)); + const __m256i v_m23_b = _mm256_lddqu_si256((__m256i *)(m + 32)); + + const __m256i v_d0_w = _mm256_lddqu_si256((__m256i *)(ds)); + const __m256i v_d1_w = _mm256_lddqu_si256((__m256i *)(ds + 16)); + const __m256i v_d2_w = _mm256_lddqu_si256((__m256i *)(ds + 32)); + const __m256i v_d3_w = _mm256_lddqu_si256((__m256i *)(ds + 48)); + + const __m256i v_m0_w = + _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_m01_b)); + const __m256i v_m1_w = + _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_m01_b, 1)); + const __m256i v_m2_w = + _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_m23_b)); + const __m256i v_m3_w = + _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_m23_b, 1)); + + const __m256i v_p0_d = _mm256_madd_epi16(v_d0_w, v_m0_w); + const __m256i v_p1_d = _mm256_madd_epi16(v_d1_w, v_m1_w); + const __m256i v_p2_d = _mm256_madd_epi16(v_d2_w, v_m2_w); + const __m256i v_p3_d = _mm256_madd_epi16(v_d3_w, v_m3_w); + + const __m256i v_p01_d = _mm256_add_epi32(v_p0_d, v_p1_d); + const __m256i v_p23_d = _mm256_add_epi32(v_p2_d, v_p3_d); + + const __m256i v_p0123_d = _mm256_add_epi32(v_p01_d, v_p23_d); + + v_acc0_d = _mm256_add_epi32(v_acc0_d, v_p0123_d); + + ds += 64; + m += 64; + + N -= 64; + } while (N); + + __m256i v_sign_d = _mm256_srai_epi32(v_acc0_d, 31); + v_acc0_d = _mm256_add_epi64(_mm256_unpacklo_epi32(v_acc0_d, v_sign_d), + _mm256_unpackhi_epi32(v_acc0_d, v_sign_d)); + + __m256i v_acc_q = _mm256_add_epi64(v_acc0_d, _mm256_srli_si256(v_acc0_d, 8)); + + __m128i v_acc_q_0 = _mm256_castsi256_si128(v_acc_q); + __m128i v_acc_q_1 = _mm256_extracti128_si256(v_acc_q, 1); + v_acc_q_0 = _mm_add_epi64(v_acc_q_0, v_acc_q_1); + +#if AOM_ARCH_X86_64 + acc = _mm_extract_epi64(v_acc_q_0, 0); +#else + xx_storel_64(&acc, v_acc_q_0); +#endif + + return acc > limit; +} + +/** + * av1_wedge_compute_delta_squares_c + */ +void av1_wedge_compute_delta_squares_avx2(int16_t *d, const int16_t *a, + const int16_t *b, int N) { + const __m256i v_neg_w = _mm256_set1_epi32((int)0xffff0001); + + assert(N % 64 == 0); + + do { + const __m256i v_a0_w = _mm256_lddqu_si256((__m256i *)(a)); + const __m256i v_b0_w = _mm256_lddqu_si256((__m256i *)(b)); + const __m256i v_a1_w = _mm256_lddqu_si256((__m256i *)(a + 16)); + const __m256i v_b1_w = _mm256_lddqu_si256((__m256i *)(b + 16)); + const __m256i v_a2_w = _mm256_lddqu_si256((__m256i *)(a + 32)); + const __m256i v_b2_w = _mm256_lddqu_si256((__m256i *)(b + 32)); + const __m256i v_a3_w = _mm256_lddqu_si256((__m256i *)(a + 48)); + const __m256i v_b3_w = _mm256_lddqu_si256((__m256i *)(b + 48)); + + const __m256i v_ab0l_w = _mm256_unpacklo_epi16(v_a0_w, v_b0_w); + const __m256i v_ab0h_w = _mm256_unpackhi_epi16(v_a0_w, v_b0_w); + const __m256i v_ab1l_w = _mm256_unpacklo_epi16(v_a1_w, v_b1_w); + const __m256i v_ab1h_w = _mm256_unpackhi_epi16(v_a1_w, v_b1_w); + const __m256i v_ab2l_w = _mm256_unpacklo_epi16(v_a2_w, v_b2_w); + const __m256i v_ab2h_w = _mm256_unpackhi_epi16(v_a2_w, v_b2_w); + const __m256i v_ab3l_w = _mm256_unpacklo_epi16(v_a3_w, v_b3_w); + const __m256i v_ab3h_w = _mm256_unpackhi_epi16(v_a3_w, v_b3_w); + + // Negate top word of pairs + const __m256i v_abl0n_w = _mm256_sign_epi16(v_ab0l_w, v_neg_w); + const __m256i v_abh0n_w = _mm256_sign_epi16(v_ab0h_w, v_neg_w); + const __m256i v_abl1n_w = _mm256_sign_epi16(v_ab1l_w, v_neg_w); + const __m256i v_abh1n_w = _mm256_sign_epi16(v_ab1h_w, v_neg_w); + const __m256i v_abl2n_w = _mm256_sign_epi16(v_ab2l_w, v_neg_w); + const 
__m256i v_abh2n_w = _mm256_sign_epi16(v_ab2h_w, v_neg_w); + const __m256i v_abl3n_w = _mm256_sign_epi16(v_ab3l_w, v_neg_w); + const __m256i v_abh3n_w = _mm256_sign_epi16(v_ab3h_w, v_neg_w); + + const __m256i v_r0l_w = _mm256_madd_epi16(v_ab0l_w, v_abl0n_w); + const __m256i v_r0h_w = _mm256_madd_epi16(v_ab0h_w, v_abh0n_w); + const __m256i v_r1l_w = _mm256_madd_epi16(v_ab1l_w, v_abl1n_w); + const __m256i v_r1h_w = _mm256_madd_epi16(v_ab1h_w, v_abh1n_w); + const __m256i v_r2l_w = _mm256_madd_epi16(v_ab2l_w, v_abl2n_w); + const __m256i v_r2h_w = _mm256_madd_epi16(v_ab2h_w, v_abh2n_w); + const __m256i v_r3l_w = _mm256_madd_epi16(v_ab3l_w, v_abl3n_w); + const __m256i v_r3h_w = _mm256_madd_epi16(v_ab3h_w, v_abh3n_w); + + const __m256i v_r0_w = _mm256_packs_epi32(v_r0l_w, v_r0h_w); + const __m256i v_r1_w = _mm256_packs_epi32(v_r1l_w, v_r1h_w); + const __m256i v_r2_w = _mm256_packs_epi32(v_r2l_w, v_r2h_w); + const __m256i v_r3_w = _mm256_packs_epi32(v_r3l_w, v_r3h_w); + + _mm256_store_si256((__m256i *)(d), v_r0_w); + _mm256_store_si256((__m256i *)(d + 16), v_r1_w); + _mm256_store_si256((__m256i *)(d + 32), v_r2_w); + _mm256_store_si256((__m256i *)(d + 48), v_r3_w); + + a += 64; + b += 64; + d += 64; + N -= 64; + } while (N); +} diff --git a/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c b/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c new file mode 100644 index 0000000000..d7ac2223f2 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include "aom_dsp/x86/synonyms.h"
+
+#include "aom/aom_integer.h"
+
+#include "av1/common/reconinter.h"
+
+#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
+
+/**
+ * See av1_wedge_sse_from_residuals_c
+ */
+uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d,
+                                           const uint8_t *m, int N) {
+  int n = -N;
+  int n8 = n + 8;
+
+  uint64_t csse;
+
+  const __m128i v_mask_max_w = _mm_set1_epi16(MAX_MASK_VALUE);
+  const __m128i v_zext_q = xx_set1_64_from_32i(~0);
+
+  __m128i v_acc0_q = _mm_setzero_si128();
+
+  assert(N % 64 == 0);
+
+  r1 += N;
+  d += N;
+  m += N;
+
+  do {
+    const __m128i v_r0_w = xx_load_128(r1 + n);
+    const __m128i v_r1_w = xx_load_128(r1 + n8);
+    const __m128i v_d0_w = xx_load_128(d + n);
+    const __m128i v_d1_w = xx_load_128(d + n8);
+    const __m128i v_m01_b = xx_load_128(m + n);
+
+    const __m128i v_rd0l_w = _mm_unpacklo_epi16(v_d0_w, v_r0_w);
+    const __m128i v_rd0h_w = _mm_unpackhi_epi16(v_d0_w, v_r0_w);
+    const __m128i v_rd1l_w = _mm_unpacklo_epi16(v_d1_w, v_r1_w);
+    const __m128i v_rd1h_w = _mm_unpackhi_epi16(v_d1_w, v_r1_w);
+    const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
+    const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());
+
+    const __m128i v_m0l_w = _mm_unpacklo_epi16(v_m0_w, v_mask_max_w);
+    const __m128i v_m0h_w = _mm_unpackhi_epi16(v_m0_w, v_mask_max_w);
+    const __m128i v_m1l_w = _mm_unpacklo_epi16(v_m1_w, v_mask_max_w);
+    const __m128i v_m1h_w = _mm_unpackhi_epi16(v_m1_w, v_mask_max_w);
+
+    const __m128i v_t0l_d = _mm_madd_epi16(v_rd0l_w, v_m0l_w);
+    const __m128i v_t0h_d = _mm_madd_epi16(v_rd0h_w, v_m0h_w);
+    const __m128i v_t1l_d = _mm_madd_epi16(v_rd1l_w, v_m1l_w);
+    const __m128i v_t1h_d = _mm_madd_epi16(v_rd1h_w, v_m1h_w);
+
+    const __m128i v_t0_w = _mm_packs_epi32(v_t0l_d, v_t0h_d);
+    const __m128i v_t1_w = _mm_packs_epi32(v_t1l_d, v_t1h_d);
+
+    const __m128i v_sq0_d = _mm_madd_epi16(v_t0_w, v_t0_w);
+    const __m128i v_sq1_d = _mm_madd_epi16(v_t1_w, v_t1_w);
+
+    const __m128i v_sum0_q = _mm_add_epi64(_mm_and_si128(v_sq0_d, v_zext_q),
+                                           _mm_srli_epi64(v_sq0_d, 32));
+    const __m128i v_sum1_q = _mm_add_epi64(_mm_and_si128(v_sq1_d, v_zext_q),
+                                           _mm_srli_epi64(v_sq1_d, 32));
+
+    v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum0_q);
+    v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum1_q);
+
+    n8 += 16;
+    n += 16;
+  } while (n);
+
+  v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));
+
+#if AOM_ARCH_X86_64
+  csse = (uint64_t)_mm_cvtsi128_si64(v_acc0_q);
+#else
+  xx_storel_64(&csse, v_acc0_q);
+#endif
+
+  return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
+}
+
+/**
+ * See av1_wedge_sign_from_residuals_c
+ */
+int8_t av1_wedge_sign_from_residuals_sse2(const int16_t *ds, const uint8_t *m,
+                                          int N, int64_t limit) {
+  int64_t acc;
+
+  __m128i v_sign_d;
+  __m128i v_acc0_d = _mm_setzero_si128();
+  __m128i v_acc1_d = _mm_setzero_si128();
+  __m128i v_acc_q;
+
+  // Input size limited to 8192 by the use of 32 bit accumulators and m
+  // being between [0, 64]. Overflow might happen at larger sizes,
+  // though it is practically impossible on real video input.
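+  // Context (an illustrative note, not in the upstream file): ds is expected
+  // to hold per-pixel delta squares such as those produced by
+  // av1_wedge_compute_delta_squares below, and the mask-weighted sum
+  // acc = sum(m[i] * ds[i]) is compared against the caller-supplied limit
+  // to choose the wedge sign.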
+ assert(N < 8192); + assert(N % 64 == 0); + + do { + const __m128i v_m01_b = xx_load_128(m); + const __m128i v_m23_b = xx_load_128(m + 16); + const __m128i v_m45_b = xx_load_128(m + 32); + const __m128i v_m67_b = xx_load_128(m + 48); + + const __m128i v_d0_w = xx_load_128(ds); + const __m128i v_d1_w = xx_load_128(ds + 8); + const __m128i v_d2_w = xx_load_128(ds + 16); + const __m128i v_d3_w = xx_load_128(ds + 24); + const __m128i v_d4_w = xx_load_128(ds + 32); + const __m128i v_d5_w = xx_load_128(ds + 40); + const __m128i v_d6_w = xx_load_128(ds + 48); + const __m128i v_d7_w = xx_load_128(ds + 56); + + const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128()); + const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128()); + const __m128i v_m2_w = _mm_unpacklo_epi8(v_m23_b, _mm_setzero_si128()); + const __m128i v_m3_w = _mm_unpackhi_epi8(v_m23_b, _mm_setzero_si128()); + const __m128i v_m4_w = _mm_unpacklo_epi8(v_m45_b, _mm_setzero_si128()); + const __m128i v_m5_w = _mm_unpackhi_epi8(v_m45_b, _mm_setzero_si128()); + const __m128i v_m6_w = _mm_unpacklo_epi8(v_m67_b, _mm_setzero_si128()); + const __m128i v_m7_w = _mm_unpackhi_epi8(v_m67_b, _mm_setzero_si128()); + + const __m128i v_p0_d = _mm_madd_epi16(v_d0_w, v_m0_w); + const __m128i v_p1_d = _mm_madd_epi16(v_d1_w, v_m1_w); + const __m128i v_p2_d = _mm_madd_epi16(v_d2_w, v_m2_w); + const __m128i v_p3_d = _mm_madd_epi16(v_d3_w, v_m3_w); + const __m128i v_p4_d = _mm_madd_epi16(v_d4_w, v_m4_w); + const __m128i v_p5_d = _mm_madd_epi16(v_d5_w, v_m5_w); + const __m128i v_p6_d = _mm_madd_epi16(v_d6_w, v_m6_w); + const __m128i v_p7_d = _mm_madd_epi16(v_d7_w, v_m7_w); + + const __m128i v_p01_d = _mm_add_epi32(v_p0_d, v_p1_d); + const __m128i v_p23_d = _mm_add_epi32(v_p2_d, v_p3_d); + const __m128i v_p45_d = _mm_add_epi32(v_p4_d, v_p5_d); + const __m128i v_p67_d = _mm_add_epi32(v_p6_d, v_p7_d); + + const __m128i v_p0123_d = _mm_add_epi32(v_p01_d, v_p23_d); + const __m128i v_p4567_d = _mm_add_epi32(v_p45_d, v_p67_d); + + v_acc0_d = _mm_add_epi32(v_acc0_d, v_p0123_d); + v_acc1_d = _mm_add_epi32(v_acc1_d, v_p4567_d); + + ds += 64; + m += 64; + + N -= 64; + } while (N); + + v_sign_d = _mm_cmplt_epi32(v_acc0_d, _mm_setzero_si128()); + v_acc0_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc0_d, v_sign_d), + _mm_unpackhi_epi32(v_acc0_d, v_sign_d)); + + v_sign_d = _mm_cmplt_epi32(v_acc1_d, _mm_setzero_si128()); + v_acc1_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc1_d, v_sign_d), + _mm_unpackhi_epi32(v_acc1_d, v_sign_d)); + + v_acc_q = _mm_add_epi64(v_acc0_d, v_acc1_d); + + v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8)); + +#if AOM_ARCH_X86_64 + acc = _mm_cvtsi128_si64(v_acc_q); +#else + xx_storel_64(&acc, v_acc_q); +#endif + + return acc > limit; +} + +// Negate under mask +static INLINE __m128i negm_epi16(__m128i v_v_w, __m128i v_mask_w) { + return _mm_sub_epi16(_mm_xor_si128(v_v_w, v_mask_w), v_mask_w); +} + +/** + * av1_wedge_compute_delta_squares_c + */ +void av1_wedge_compute_delta_squares_sse2(int16_t *d, const int16_t *a, + const int16_t *b, int N) { + const __m128i v_neg_w = _mm_set_epi16((short)0xffff, 0, (short)0xffff, 0, + (short)0xffff, 0, (short)0xffff, 0); + + assert(N % 64 == 0); + + do { + const __m128i v_a0_w = xx_load_128(a); + const __m128i v_b0_w = xx_load_128(b); + const __m128i v_a1_w = xx_load_128(a + 8); + const __m128i v_b1_w = xx_load_128(b + 8); + const __m128i v_a2_w = xx_load_128(a + 16); + const __m128i v_b2_w = xx_load_128(b + 16); + const __m128i v_a3_w = xx_load_128(a + 24); + const __m128i v_b3_w 
= xx_load_128(b + 24); + + const __m128i v_ab0l_w = _mm_unpacklo_epi16(v_a0_w, v_b0_w); + const __m128i v_ab0h_w = _mm_unpackhi_epi16(v_a0_w, v_b0_w); + const __m128i v_ab1l_w = _mm_unpacklo_epi16(v_a1_w, v_b1_w); + const __m128i v_ab1h_w = _mm_unpackhi_epi16(v_a1_w, v_b1_w); + const __m128i v_ab2l_w = _mm_unpacklo_epi16(v_a2_w, v_b2_w); + const __m128i v_ab2h_w = _mm_unpackhi_epi16(v_a2_w, v_b2_w); + const __m128i v_ab3l_w = _mm_unpacklo_epi16(v_a3_w, v_b3_w); + const __m128i v_ab3h_w = _mm_unpackhi_epi16(v_a3_w, v_b3_w); + + // Negate top word of pairs + const __m128i v_abl0n_w = negm_epi16(v_ab0l_w, v_neg_w); + const __m128i v_abh0n_w = negm_epi16(v_ab0h_w, v_neg_w); + const __m128i v_abl1n_w = negm_epi16(v_ab1l_w, v_neg_w); + const __m128i v_abh1n_w = negm_epi16(v_ab1h_w, v_neg_w); + const __m128i v_abl2n_w = negm_epi16(v_ab2l_w, v_neg_w); + const __m128i v_abh2n_w = negm_epi16(v_ab2h_w, v_neg_w); + const __m128i v_abl3n_w = negm_epi16(v_ab3l_w, v_neg_w); + const __m128i v_abh3n_w = negm_epi16(v_ab3h_w, v_neg_w); + + const __m128i v_r0l_w = _mm_madd_epi16(v_ab0l_w, v_abl0n_w); + const __m128i v_r0h_w = _mm_madd_epi16(v_ab0h_w, v_abh0n_w); + const __m128i v_r1l_w = _mm_madd_epi16(v_ab1l_w, v_abl1n_w); + const __m128i v_r1h_w = _mm_madd_epi16(v_ab1h_w, v_abh1n_w); + const __m128i v_r2l_w = _mm_madd_epi16(v_ab2l_w, v_abl2n_w); + const __m128i v_r2h_w = _mm_madd_epi16(v_ab2h_w, v_abh2n_w); + const __m128i v_r3l_w = _mm_madd_epi16(v_ab3l_w, v_abl3n_w); + const __m128i v_r3h_w = _mm_madd_epi16(v_ab3h_w, v_abh3n_w); + + const __m128i v_r0_w = _mm_packs_epi32(v_r0l_w, v_r0h_w); + const __m128i v_r1_w = _mm_packs_epi32(v_r1l_w, v_r1h_w); + const __m128i v_r2_w = _mm_packs_epi32(v_r2l_w, v_r2h_w); + const __m128i v_r3_w = _mm_packs_epi32(v_r3l_w, v_r3h_w); + + xx_store_128(d, v_r0_w); + xx_store_128(d + 8, v_r1_w); + xx_store_128(d + 16, v_r2_w); + xx_store_128(d + 24, v_r3_w); + + a += 32; + b += 32; + d += 32; + N -= 32; + } while (N); +} -- cgit v1.2.3